{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500, "global_step": 7518, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0003990422984836393, "grad_norm": 1.0935096740722656, "learning_rate": 1.3297872340425533e-07, "loss": 0.5485, "num_tokens": 131072.0, "step": 1 }, { "epoch": 0.0007980845969672786, "grad_norm": 1.2665417194366455, "learning_rate": 2.6595744680851066e-07, "loss": 0.5535, "num_tokens": 262144.0, "step": 2 }, { "epoch": 0.0011971268954509178, "grad_norm": 1.089948296546936, "learning_rate": 3.9893617021276597e-07, "loss": 0.5714, "num_tokens": 393216.0, "step": 3 }, { "epoch": 0.0015961691939345571, "grad_norm": 1.231433629989624, "learning_rate": 5.319148936170213e-07, "loss": 0.6074, "num_tokens": 524288.0, "step": 4 }, { "epoch": 0.0019952114924181963, "grad_norm": 1.2506704330444336, "learning_rate": 6.648936170212766e-07, "loss": 0.5528, "num_tokens": 655360.0, "step": 5 }, { "epoch": 0.0023942537909018356, "grad_norm": 1.2356318235397339, "learning_rate": 7.978723404255319e-07, "loss": 0.5661, "num_tokens": 786432.0, "step": 6 }, { "epoch": 0.002793296089385475, "grad_norm": 1.1928491592407227, "learning_rate": 9.308510638297872e-07, "loss": 0.6217, "num_tokens": 917504.0, "step": 7 }, { "epoch": 0.0031923383878691143, "grad_norm": 1.3019524812698364, "learning_rate": 1.0638297872340427e-06, "loss": 0.5429, "num_tokens": 1048576.0, "step": 8 }, { "epoch": 0.003591380686352753, "grad_norm": 1.2138481140136719, "learning_rate": 1.1968085106382979e-06, "loss": 0.5542, "num_tokens": 1179648.0, "step": 9 }, { "epoch": 0.0039904229848363925, "grad_norm": 1.3049747943878174, "learning_rate": 1.3297872340425533e-06, "loss": 0.5796, "num_tokens": 1310720.0, "step": 10 }, { "epoch": 0.004389465283320032, "grad_norm": 1.1071012020111084, "learning_rate": 1.4627659574468087e-06, "loss": 0.5457, "num_tokens": 1441792.0, "step": 11 }, { "epoch": 0.004788507581803671, "grad_norm": 1.2530375719070435, "learning_rate": 1.5957446808510639e-06, "loss": 0.5832, "num_tokens": 1572864.0, "step": 12 }, { "epoch": 0.0051875498802873106, "grad_norm": 1.153944492340088, "learning_rate": 1.7287234042553193e-06, "loss": 0.6149, "num_tokens": 1703936.0, "step": 13 }, { "epoch": 0.00558659217877095, "grad_norm": 1.2241123914718628, "learning_rate": 1.8617021276595745e-06, "loss": 0.596, "num_tokens": 1835008.0, "step": 14 }, { "epoch": 0.005985634477254589, "grad_norm": 1.2541342973709106, "learning_rate": 1.99468085106383e-06, "loss": 0.5566, "num_tokens": 1966080.0, "step": 15 }, { "epoch": 0.006384676775738229, "grad_norm": 1.1250274181365967, "learning_rate": 2.1276595744680853e-06, "loss": 0.5394, "num_tokens": 2097152.0, "step": 16 }, { "epoch": 0.006783719074221868, "grad_norm": 1.0519423484802246, "learning_rate": 2.2606382978723405e-06, "loss": 0.5279, "num_tokens": 2228224.0, "step": 17 }, { "epoch": 0.007182761372705506, "grad_norm": 1.068806767463684, "learning_rate": 2.3936170212765957e-06, "loss": 0.5545, "num_tokens": 2359296.0, "step": 18 }, { "epoch": 0.007581803671189146, "grad_norm": 38.802284240722656, "learning_rate": 2.526595744680851e-06, "loss": 1.776, "num_tokens": 2490368.0, "step": 19 }, { "epoch": 0.007980845969672785, "grad_norm": 0.9665082693099976, "learning_rate": 2.6595744680851065e-06, "loss": 0.5804, "num_tokens": 2621440.0, "step": 20 }, { "epoch": 0.008379888268156424, "grad_norm": 0.8970898389816284, "learning_rate": 2.7925531914893617e-06, "loss": 0.5428, "num_tokens": 2752512.0, "step": 21 }, { "epoch": 0.008778930566640064, "grad_norm": 0.9247125387191772, "learning_rate": 2.9255319148936174e-06, "loss": 0.5549, "num_tokens": 2883584.0, "step": 22 }, { "epoch": 0.009177972865123703, "grad_norm": 0.8996503949165344, "learning_rate": 3.0585106382978726e-06, "loss": 0.5423, "num_tokens": 3014656.0, "step": 23 }, { "epoch": 0.009577015163607342, "grad_norm": 0.7233424782752991, "learning_rate": 3.1914893617021277e-06, "loss": 0.5287, "num_tokens": 3145728.0, "step": 24 }, { "epoch": 0.009976057462090982, "grad_norm": 0.6564655303955078, "learning_rate": 3.324468085106383e-06, "loss": 0.5208, "num_tokens": 3276800.0, "step": 25 }, { "epoch": 0.010375099760574621, "grad_norm": 0.5814170837402344, "learning_rate": 3.4574468085106386e-06, "loss": 0.5154, "num_tokens": 3407872.0, "step": 26 }, { "epoch": 0.01077414205905826, "grad_norm": 0.634429931640625, "learning_rate": 3.590425531914894e-06, "loss": 0.5727, "num_tokens": 3538944.0, "step": 27 }, { "epoch": 0.0111731843575419, "grad_norm": 0.5540726184844971, "learning_rate": 3.723404255319149e-06, "loss": 0.5143, "num_tokens": 3670016.0, "step": 28 }, { "epoch": 0.01157222665602554, "grad_norm": 0.5412524938583374, "learning_rate": 3.856382978723404e-06, "loss": 0.4994, "num_tokens": 3801088.0, "step": 29 }, { "epoch": 0.011971268954509178, "grad_norm": 0.5491828322410583, "learning_rate": 3.98936170212766e-06, "loss": 0.5457, "num_tokens": 3932160.0, "step": 30 }, { "epoch": 0.012370311252992818, "grad_norm": 0.477096825838089, "learning_rate": 4.1223404255319146e-06, "loss": 0.4842, "num_tokens": 4063232.0, "step": 31 }, { "epoch": 0.012769353551476457, "grad_norm": 0.46781811118125916, "learning_rate": 4.255319148936171e-06, "loss": 0.4861, "num_tokens": 4194304.0, "step": 32 }, { "epoch": 0.013168395849960097, "grad_norm": 0.4309675991535187, "learning_rate": 4.388297872340426e-06, "loss": 0.5366, "num_tokens": 4323011.0, "step": 33 }, { "epoch": 0.013567438148443736, "grad_norm": 0.36655765771865845, "learning_rate": 4.521276595744681e-06, "loss": 0.5059, "num_tokens": 4454083.0, "step": 34 }, { "epoch": 0.013966480446927373, "grad_norm": 0.4116179645061493, "learning_rate": 4.654255319148936e-06, "loss": 0.5256, "num_tokens": 4585155.0, "step": 35 }, { "epoch": 0.014365522745411013, "grad_norm": 0.470704585313797, "learning_rate": 4.787234042553191e-06, "loss": 0.585, "num_tokens": 4716227.0, "step": 36 }, { "epoch": 0.014764565043894652, "grad_norm": 0.49466609954833984, "learning_rate": 4.9202127659574475e-06, "loss": 0.5232, "num_tokens": 4847299.0, "step": 37 }, { "epoch": 0.015163607342378291, "grad_norm": 0.5204284191131592, "learning_rate": 5.053191489361702e-06, "loss": 0.5182, "num_tokens": 4978371.0, "step": 38 }, { "epoch": 0.01556264964086193, "grad_norm": 0.4733237326145172, "learning_rate": 5.186170212765958e-06, "loss": 0.4878, "num_tokens": 5109443.0, "step": 39 }, { "epoch": 0.01596169193934557, "grad_norm": 0.4837912321090698, "learning_rate": 5.319148936170213e-06, "loss": 0.5171, "num_tokens": 5240515.0, "step": 40 }, { "epoch": 0.01636073423782921, "grad_norm": 0.5055720210075378, "learning_rate": 5.452127659574468e-06, "loss": 0.5533, "num_tokens": 5371587.0, "step": 41 }, { "epoch": 0.01675977653631285, "grad_norm": 0.47883081436157227, "learning_rate": 5.5851063829787235e-06, "loss": 0.5046, "num_tokens": 5502659.0, "step": 42 }, { "epoch": 0.017158818834796488, "grad_norm": 0.43938198685646057, "learning_rate": 5.718085106382979e-06, "loss": 0.527, "num_tokens": 5633731.0, "step": 43 }, { "epoch": 0.017557861133280128, "grad_norm": 0.39899495244026184, "learning_rate": 5.851063829787235e-06, "loss": 0.5167, "num_tokens": 5761548.0, "step": 44 }, { "epoch": 0.017956903431763767, "grad_norm": 0.3626687824726105, "learning_rate": 5.98404255319149e-06, "loss": 0.4935, "num_tokens": 5892620.0, "step": 45 }, { "epoch": 0.018355945730247406, "grad_norm": 0.3482113778591156, "learning_rate": 6.117021276595745e-06, "loss": 0.4976, "num_tokens": 6023692.0, "step": 46 }, { "epoch": 0.018754988028731046, "grad_norm": 0.3229896128177643, "learning_rate": 6.25e-06, "loss": 0.4783, "num_tokens": 6154764.0, "step": 47 }, { "epoch": 0.019154030327214685, "grad_norm": 0.3258231282234192, "learning_rate": 6.3829787234042555e-06, "loss": 0.493, "num_tokens": 6285836.0, "step": 48 }, { "epoch": 0.019553072625698324, "grad_norm": 0.31160402297973633, "learning_rate": 6.5159574468085115e-06, "loss": 0.4979, "num_tokens": 6416908.0, "step": 49 }, { "epoch": 0.019952114924181964, "grad_norm": 0.30258819460868835, "learning_rate": 6.648936170212766e-06, "loss": 0.477, "num_tokens": 6547980.0, "step": 50 }, { "epoch": 0.020351157222665603, "grad_norm": 0.38437142968177795, "learning_rate": 6.781914893617021e-06, "loss": 0.5657, "num_tokens": 6679052.0, "step": 51 }, { "epoch": 0.020750199521149242, "grad_norm": 0.3073842525482178, "learning_rate": 6.914893617021277e-06, "loss": 0.4516, "num_tokens": 6810124.0, "step": 52 }, { "epoch": 0.02114924181963288, "grad_norm": 0.37123072147369385, "learning_rate": 7.047872340425532e-06, "loss": 0.4605, "num_tokens": 6941196.0, "step": 53 }, { "epoch": 0.02154828411811652, "grad_norm": 0.296469509601593, "learning_rate": 7.180851063829788e-06, "loss": 0.475, "num_tokens": 7072268.0, "step": 54 }, { "epoch": 0.02194732641660016, "grad_norm": 0.32677656412124634, "learning_rate": 7.313829787234043e-06, "loss": 0.4945, "num_tokens": 7203340.0, "step": 55 }, { "epoch": 0.0223463687150838, "grad_norm": 0.2882196307182312, "learning_rate": 7.446808510638298e-06, "loss": 0.4539, "num_tokens": 7334412.0, "step": 56 }, { "epoch": 0.02274541101356744, "grad_norm": 0.29415184259414673, "learning_rate": 7.579787234042554e-06, "loss": 0.4998, "num_tokens": 7465484.0, "step": 57 }, { "epoch": 0.02314445331205108, "grad_norm": 0.2819296419620514, "learning_rate": 7.712765957446808e-06, "loss": 0.4413, "num_tokens": 7596556.0, "step": 58 }, { "epoch": 0.023543495610534718, "grad_norm": 0.31600189208984375, "learning_rate": 7.845744680851064e-06, "loss": 0.4954, "num_tokens": 7727628.0, "step": 59 }, { "epoch": 0.023942537909018357, "grad_norm": 36.098873138427734, "learning_rate": 7.97872340425532e-06, "loss": 1.7925, "num_tokens": 7858700.0, "step": 60 }, { "epoch": 0.024341580207501996, "grad_norm": 0.31701424717903137, "learning_rate": 8.111702127659574e-06, "loss": 0.5357, "num_tokens": 7989772.0, "step": 61 }, { "epoch": 0.024740622505985636, "grad_norm": 0.2819848358631134, "learning_rate": 8.244680851063829e-06, "loss": 0.4869, "num_tokens": 8120844.0, "step": 62 }, { "epoch": 0.025139664804469275, "grad_norm": 0.28542155027389526, "learning_rate": 8.377659574468086e-06, "loss": 0.5159, "num_tokens": 8251916.0, "step": 63 }, { "epoch": 0.025538707102952914, "grad_norm": 0.26655930280685425, "learning_rate": 8.510638297872341e-06, "loss": 0.4825, "num_tokens": 8382988.0, "step": 64 }, { "epoch": 0.025937749401436554, "grad_norm": 0.27099233865737915, "learning_rate": 8.643617021276596e-06, "loss": 0.5037, "num_tokens": 8514060.0, "step": 65 }, { "epoch": 0.026336791699920193, "grad_norm": 0.25782421231269836, "learning_rate": 8.776595744680852e-06, "loss": 0.4591, "num_tokens": 8645132.0, "step": 66 }, { "epoch": 0.026735833998403832, "grad_norm": 0.27061837911605835, "learning_rate": 8.909574468085107e-06, "loss": 0.4827, "num_tokens": 8776204.0, "step": 67 }, { "epoch": 0.02713487629688747, "grad_norm": 0.27931374311447144, "learning_rate": 9.042553191489362e-06, "loss": 0.4943, "num_tokens": 8907276.0, "step": 68 }, { "epoch": 0.02753391859537111, "grad_norm": 0.7191590666770935, "learning_rate": 9.175531914893617e-06, "loss": 0.4777, "num_tokens": 9038348.0, "step": 69 }, { "epoch": 0.027932960893854747, "grad_norm": 0.2600814402103424, "learning_rate": 9.308510638297872e-06, "loss": 0.4624, "num_tokens": 9169420.0, "step": 70 }, { "epoch": 0.028332003192338386, "grad_norm": 0.25648680329322815, "learning_rate": 9.44148936170213e-06, "loss": 0.4447, "num_tokens": 9300492.0, "step": 71 }, { "epoch": 0.028731045490822026, "grad_norm": 0.2686166763305664, "learning_rate": 9.574468085106383e-06, "loss": 0.4953, "num_tokens": 9431564.0, "step": 72 }, { "epoch": 0.029130087789305665, "grad_norm": 0.26078474521636963, "learning_rate": 9.707446808510638e-06, "loss": 0.4574, "num_tokens": 9562636.0, "step": 73 }, { "epoch": 0.029529130087789304, "grad_norm": 0.2991742789745331, "learning_rate": 9.840425531914895e-06, "loss": 0.5725, "num_tokens": 9693708.0, "step": 74 }, { "epoch": 0.029928172386272944, "grad_norm": 0.25177672505378723, "learning_rate": 9.973404255319148e-06, "loss": 0.4495, "num_tokens": 9824780.0, "step": 75 }, { "epoch": 0.030327214684756583, "grad_norm": 0.3217047452926636, "learning_rate": 1.0106382978723404e-05, "loss": 0.5148, "num_tokens": 9955852.0, "step": 76 }, { "epoch": 0.030726256983240222, "grad_norm": 0.26048654317855835, "learning_rate": 1.023936170212766e-05, "loss": 0.4465, "num_tokens": 10086924.0, "step": 77 }, { "epoch": 0.03112529928172386, "grad_norm": 0.24433547258377075, "learning_rate": 1.0372340425531916e-05, "loss": 0.4602, "num_tokens": 10217996.0, "step": 78 }, { "epoch": 0.031524341580207504, "grad_norm": 0.23410926759243011, "learning_rate": 1.0505319148936171e-05, "loss": 0.4212, "num_tokens": 10349068.0, "step": 79 }, { "epoch": 0.03192338387869114, "grad_norm": 0.24921119213104248, "learning_rate": 1.0638297872340426e-05, "loss": 0.4268, "num_tokens": 10480140.0, "step": 80 }, { "epoch": 0.03232242617717478, "grad_norm": 0.27502119541168213, "learning_rate": 1.0771276595744681e-05, "loss": 0.5114, "num_tokens": 10611212.0, "step": 81 }, { "epoch": 0.03272146847565842, "grad_norm": 0.24561260640621185, "learning_rate": 1.0904255319148937e-05, "loss": 0.4773, "num_tokens": 10742284.0, "step": 82 }, { "epoch": 0.03312051077414206, "grad_norm": 0.27630165219306946, "learning_rate": 1.1037234042553192e-05, "loss": 0.4561, "num_tokens": 10873356.0, "step": 83 }, { "epoch": 0.0335195530726257, "grad_norm": 0.26567786931991577, "learning_rate": 1.1170212765957447e-05, "loss": 0.4916, "num_tokens": 11004428.0, "step": 84 }, { "epoch": 0.03391859537110934, "grad_norm": 0.24625694751739502, "learning_rate": 1.1303191489361704e-05, "loss": 0.4873, "num_tokens": 11135500.0, "step": 85 }, { "epoch": 0.034317637669592976, "grad_norm": 0.2793661057949066, "learning_rate": 1.1436170212765957e-05, "loss": 0.574, "num_tokens": 11266572.0, "step": 86 }, { "epoch": 0.03471667996807662, "grad_norm": 0.24853239953517914, "learning_rate": 1.1569148936170213e-05, "loss": 0.4539, "num_tokens": 11397644.0, "step": 87 }, { "epoch": 0.035115722266560255, "grad_norm": 0.25712040066719055, "learning_rate": 1.170212765957447e-05, "loss": 0.4838, "num_tokens": 11528716.0, "step": 88 }, { "epoch": 0.0355147645650439, "grad_norm": 0.3058083653450012, "learning_rate": 1.1835106382978723e-05, "loss": 0.5677, "num_tokens": 11659788.0, "step": 89 }, { "epoch": 0.035913806863527534, "grad_norm": 0.26118725538253784, "learning_rate": 1.196808510638298e-05, "loss": 0.4863, "num_tokens": 11790860.0, "step": 90 }, { "epoch": 0.036312849162011177, "grad_norm": 0.24740946292877197, "learning_rate": 1.2101063829787235e-05, "loss": 0.458, "num_tokens": 11921932.0, "step": 91 }, { "epoch": 0.03671189146049481, "grad_norm": 0.30554142594337463, "learning_rate": 1.223404255319149e-05, "loss": 0.5462, "num_tokens": 12037521.0, "step": 92 }, { "epoch": 0.03711093375897845, "grad_norm": 0.24950599670410156, "learning_rate": 1.2367021276595745e-05, "loss": 0.4618, "num_tokens": 12168593.0, "step": 93 }, { "epoch": 0.03750997605746209, "grad_norm": 0.266081303358078, "learning_rate": 1.25e-05, "loss": 0.4989, "num_tokens": 12299665.0, "step": 94 }, { "epoch": 0.03790901835594573, "grad_norm": 0.26219046115875244, "learning_rate": 1.2632978723404257e-05, "loss": 0.4861, "num_tokens": 12430737.0, "step": 95 }, { "epoch": 0.03830806065442937, "grad_norm": 0.24024677276611328, "learning_rate": 1.2765957446808511e-05, "loss": 0.4658, "num_tokens": 12561809.0, "step": 96 }, { "epoch": 0.038707102952913006, "grad_norm": 0.25809815526008606, "learning_rate": 1.2898936170212766e-05, "loss": 0.5103, "num_tokens": 12692881.0, "step": 97 }, { "epoch": 0.03910614525139665, "grad_norm": 0.25482848286628723, "learning_rate": 1.3031914893617023e-05, "loss": 0.4898, "num_tokens": 12823953.0, "step": 98 }, { "epoch": 0.039505187549880284, "grad_norm": 0.26262062788009644, "learning_rate": 1.3164893617021277e-05, "loss": 0.4664, "num_tokens": 12955025.0, "step": 99 }, { "epoch": 0.03990422984836393, "grad_norm": 0.23817437887191772, "learning_rate": 1.3297872340425532e-05, "loss": 0.4508, "num_tokens": 13086097.0, "step": 100 }, { "epoch": 0.04030327214684756, "grad_norm": 0.23811402916908264, "learning_rate": 1.3430851063829789e-05, "loss": 0.4641, "num_tokens": 13217169.0, "step": 101 }, { "epoch": 0.040702314445331206, "grad_norm": 0.24493294954299927, "learning_rate": 1.3563829787234042e-05, "loss": 0.465, "num_tokens": 13348241.0, "step": 102 }, { "epoch": 0.04110135674381484, "grad_norm": 0.25624793767929077, "learning_rate": 1.3696808510638297e-05, "loss": 0.498, "num_tokens": 13479313.0, "step": 103 }, { "epoch": 0.041500399042298484, "grad_norm": 0.24715401232242584, "learning_rate": 1.3829787234042554e-05, "loss": 0.4793, "num_tokens": 13610385.0, "step": 104 }, { "epoch": 0.04189944134078212, "grad_norm": 0.24800682067871094, "learning_rate": 1.3962765957446808e-05, "loss": 0.474, "num_tokens": 13741457.0, "step": 105 }, { "epoch": 0.04229848363926576, "grad_norm": 0.24537457525730133, "learning_rate": 1.4095744680851065e-05, "loss": 0.4681, "num_tokens": 13872529.0, "step": 106 }, { "epoch": 0.0426975259377494, "grad_norm": 0.23058868944644928, "learning_rate": 1.422872340425532e-05, "loss": 0.4223, "num_tokens": 14003601.0, "step": 107 }, { "epoch": 0.04309656823623304, "grad_norm": 0.257001131772995, "learning_rate": 1.4361702127659577e-05, "loss": 0.4849, "num_tokens": 14134673.0, "step": 108 }, { "epoch": 0.04349561053471668, "grad_norm": 0.26177752017974854, "learning_rate": 1.449468085106383e-05, "loss": 0.5017, "num_tokens": 14265745.0, "step": 109 }, { "epoch": 0.04389465283320032, "grad_norm": 0.2310706079006195, "learning_rate": 1.4627659574468085e-05, "loss": 0.4088, "num_tokens": 14396817.0, "step": 110 }, { "epoch": 0.044293695131683956, "grad_norm": 0.25261884927749634, "learning_rate": 1.4760638297872342e-05, "loss": 0.4661, "num_tokens": 14527889.0, "step": 111 }, { "epoch": 0.0446927374301676, "grad_norm": 0.24328429996967316, "learning_rate": 1.4893617021276596e-05, "loss": 0.4458, "num_tokens": 14658961.0, "step": 112 }, { "epoch": 0.045091779728651235, "grad_norm": 0.24224220216274261, "learning_rate": 1.5026595744680853e-05, "loss": 0.4765, "num_tokens": 14790033.0, "step": 113 }, { "epoch": 0.04549082202713488, "grad_norm": 0.24765361845493317, "learning_rate": 1.5159574468085108e-05, "loss": 0.4699, "num_tokens": 14921105.0, "step": 114 }, { "epoch": 0.045889864325618514, "grad_norm": 0.24814373254776, "learning_rate": 1.5292553191489363e-05, "loss": 0.4498, "num_tokens": 15052177.0, "step": 115 }, { "epoch": 0.04628890662410216, "grad_norm": 0.2610844671726227, "learning_rate": 1.5425531914893617e-05, "loss": 0.4783, "num_tokens": 15183249.0, "step": 116 }, { "epoch": 0.04668794892258579, "grad_norm": 0.2505512833595276, "learning_rate": 1.5558510638297874e-05, "loss": 0.4828, "num_tokens": 15314321.0, "step": 117 }, { "epoch": 0.047086991221069435, "grad_norm": 0.25578680634498596, "learning_rate": 1.5691489361702127e-05, "loss": 0.5172, "num_tokens": 15445393.0, "step": 118 }, { "epoch": 0.04748603351955307, "grad_norm": 0.23545147478580475, "learning_rate": 1.5824468085106384e-05, "loss": 0.4398, "num_tokens": 15576465.0, "step": 119 }, { "epoch": 0.047885075818036714, "grad_norm": 0.6977869272232056, "learning_rate": 1.595744680851064e-05, "loss": 0.4694, "num_tokens": 15707537.0, "step": 120 }, { "epoch": 0.04828411811652035, "grad_norm": 0.2487770915031433, "learning_rate": 1.6090425531914894e-05, "loss": 0.4709, "num_tokens": 15838609.0, "step": 121 }, { "epoch": 0.04868316041500399, "grad_norm": 32.42752456665039, "learning_rate": 1.6223404255319148e-05, "loss": 1.1507, "num_tokens": 15969681.0, "step": 122 }, { "epoch": 0.04908220271348763, "grad_norm": 0.2593122124671936, "learning_rate": 1.6356382978723405e-05, "loss": 0.4476, "num_tokens": 16100753.0, "step": 123 }, { "epoch": 0.04948124501197127, "grad_norm": 0.2787291705608368, "learning_rate": 1.6489361702127658e-05, "loss": 0.4999, "num_tokens": 16231825.0, "step": 124 }, { "epoch": 0.04988028731045491, "grad_norm": 0.2492211014032364, "learning_rate": 1.6622340425531915e-05, "loss": 0.4669, "num_tokens": 16362897.0, "step": 125 }, { "epoch": 0.05027932960893855, "grad_norm": 0.2568272054195404, "learning_rate": 1.6755319148936172e-05, "loss": 0.4838, "num_tokens": 16493969.0, "step": 126 }, { "epoch": 0.050678371907422186, "grad_norm": 0.235215961933136, "learning_rate": 1.6888297872340426e-05, "loss": 0.4533, "num_tokens": 16625041.0, "step": 127 }, { "epoch": 0.05107741420590583, "grad_norm": 0.2518365681171417, "learning_rate": 1.7021276595744682e-05, "loss": 0.4544, "num_tokens": 16756113.0, "step": 128 }, { "epoch": 0.051476456504389465, "grad_norm": 0.26128679513931274, "learning_rate": 1.7154255319148936e-05, "loss": 0.4889, "num_tokens": 16887185.0, "step": 129 }, { "epoch": 0.05187549880287311, "grad_norm": 0.2868451178073883, "learning_rate": 1.7287234042553193e-05, "loss": 0.528, "num_tokens": 17018257.0, "step": 130 }, { "epoch": 0.05227454110135674, "grad_norm": 0.24189393222332, "learning_rate": 1.7420212765957446e-05, "loss": 0.4553, "num_tokens": 17149329.0, "step": 131 }, { "epoch": 0.052673583399840386, "grad_norm": 0.257959246635437, "learning_rate": 1.7553191489361703e-05, "loss": 0.4815, "num_tokens": 17280401.0, "step": 132 }, { "epoch": 0.05307262569832402, "grad_norm": 0.25147518515586853, "learning_rate": 1.768617021276596e-05, "loss": 0.4475, "num_tokens": 17411473.0, "step": 133 }, { "epoch": 0.053471667996807665, "grad_norm": 0.2601257860660553, "learning_rate": 1.7819148936170214e-05, "loss": 0.4422, "num_tokens": 17542545.0, "step": 134 }, { "epoch": 0.0538707102952913, "grad_norm": 0.2669246196746826, "learning_rate": 1.795212765957447e-05, "loss": 0.476, "num_tokens": 17673617.0, "step": 135 }, { "epoch": 0.05426975259377494, "grad_norm": 0.2442956417798996, "learning_rate": 1.8085106382978724e-05, "loss": 0.4467, "num_tokens": 17804689.0, "step": 136 }, { "epoch": 0.05466879489225858, "grad_norm": 0.24844925105571747, "learning_rate": 1.8218085106382978e-05, "loss": 0.4413, "num_tokens": 17935761.0, "step": 137 }, { "epoch": 0.05506783719074222, "grad_norm": 0.2801011800765991, "learning_rate": 1.8351063829787234e-05, "loss": 0.4913, "num_tokens": 18066833.0, "step": 138 }, { "epoch": 0.05546687948922586, "grad_norm": 0.2574571371078491, "learning_rate": 1.848404255319149e-05, "loss": 0.426, "num_tokens": 18197905.0, "step": 139 }, { "epoch": 0.055865921787709494, "grad_norm": 0.25964704155921936, "learning_rate": 1.8617021276595745e-05, "loss": 0.4928, "num_tokens": 18328977.0, "step": 140 }, { "epoch": 0.05626496408619314, "grad_norm": 0.2493567317724228, "learning_rate": 1.8750000000000002e-05, "loss": 0.4657, "num_tokens": 18460049.0, "step": 141 }, { "epoch": 0.05666400638467677, "grad_norm": 0.2620711326599121, "learning_rate": 1.888297872340426e-05, "loss": 0.4871, "num_tokens": 18591121.0, "step": 142 }, { "epoch": 0.057063048683160415, "grad_norm": 0.2543109059333801, "learning_rate": 1.9015957446808512e-05, "loss": 0.4917, "num_tokens": 18722193.0, "step": 143 }, { "epoch": 0.05746209098164405, "grad_norm": 0.24712114036083221, "learning_rate": 1.9148936170212766e-05, "loss": 0.4265, "num_tokens": 18853265.0, "step": 144 }, { "epoch": 0.057861133280127694, "grad_norm": 0.2541089951992035, "learning_rate": 1.9281914893617023e-05, "loss": 0.4379, "num_tokens": 18984337.0, "step": 145 }, { "epoch": 0.05826017557861133, "grad_norm": 0.2720959186553955, "learning_rate": 1.9414893617021276e-05, "loss": 0.5022, "num_tokens": 19115409.0, "step": 146 }, { "epoch": 0.05865921787709497, "grad_norm": 0.2585955262184143, "learning_rate": 1.9547872340425533e-05, "loss": 0.4695, "num_tokens": 19246481.0, "step": 147 }, { "epoch": 0.05905826017557861, "grad_norm": 0.24683062732219696, "learning_rate": 1.968085106382979e-05, "loss": 0.4617, "num_tokens": 19377553.0, "step": 148 }, { "epoch": 0.05945730247406225, "grad_norm": 0.2597990036010742, "learning_rate": 1.9813829787234043e-05, "loss": 0.4599, "num_tokens": 19500941.0, "step": 149 }, { "epoch": 0.05985634477254589, "grad_norm": 0.27069970965385437, "learning_rate": 1.9946808510638297e-05, "loss": 0.4988, "num_tokens": 19632013.0, "step": 150 }, { "epoch": 0.06025538707102953, "grad_norm": 0.2548081874847412, "learning_rate": 2.0079787234042554e-05, "loss": 0.4421, "num_tokens": 19763085.0, "step": 151 }, { "epoch": 0.060654429369513166, "grad_norm": 0.8707162141799927, "learning_rate": 2.0212765957446807e-05, "loss": 0.4274, "num_tokens": 19894157.0, "step": 152 }, { "epoch": 0.06105347166799681, "grad_norm": 0.24314740300178528, "learning_rate": 2.0345744680851064e-05, "loss": 0.4734, "num_tokens": 20025229.0, "step": 153 }, { "epoch": 0.061452513966480445, "grad_norm": 0.24286741018295288, "learning_rate": 2.047872340425532e-05, "loss": 0.4406, "num_tokens": 20156301.0, "step": 154 }, { "epoch": 0.06185155626496409, "grad_norm": 0.25544828176498413, "learning_rate": 2.0611702127659578e-05, "loss": 0.469, "num_tokens": 20287373.0, "step": 155 }, { "epoch": 0.06225059856344772, "grad_norm": 0.27329081296920776, "learning_rate": 2.074468085106383e-05, "loss": 0.4513, "num_tokens": 20418445.0, "step": 156 }, { "epoch": 0.06264964086193137, "grad_norm": 0.250164657831192, "learning_rate": 2.0877659574468085e-05, "loss": 0.4349, "num_tokens": 20549517.0, "step": 157 }, { "epoch": 0.06304868316041501, "grad_norm": 0.26311561465263367, "learning_rate": 2.1010638297872342e-05, "loss": 0.5108, "num_tokens": 20680589.0, "step": 158 }, { "epoch": 0.06344772545889864, "grad_norm": 0.26545658707618713, "learning_rate": 2.1143617021276595e-05, "loss": 0.4684, "num_tokens": 20811661.0, "step": 159 }, { "epoch": 0.06384676775738228, "grad_norm": 0.2473437339067459, "learning_rate": 2.1276595744680852e-05, "loss": 0.4439, "num_tokens": 20942733.0, "step": 160 }, { "epoch": 0.06424581005586592, "grad_norm": 0.25923508405685425, "learning_rate": 2.140957446808511e-05, "loss": 0.4845, "num_tokens": 21073805.0, "step": 161 }, { "epoch": 0.06464485235434957, "grad_norm": 0.2570391893386841, "learning_rate": 2.1542553191489363e-05, "loss": 0.4354, "num_tokens": 21204877.0, "step": 162 }, { "epoch": 0.0650438946528332, "grad_norm": 0.25096023082733154, "learning_rate": 2.167553191489362e-05, "loss": 0.4442, "num_tokens": 21335949.0, "step": 163 }, { "epoch": 0.06544293695131684, "grad_norm": 0.2513369917869568, "learning_rate": 2.1808510638297873e-05, "loss": 0.4525, "num_tokens": 21467021.0, "step": 164 }, { "epoch": 0.06584197924980048, "grad_norm": 0.25537368655204773, "learning_rate": 2.1941489361702127e-05, "loss": 0.4478, "num_tokens": 21598093.0, "step": 165 }, { "epoch": 0.06624102154828412, "grad_norm": 0.25839585065841675, "learning_rate": 2.2074468085106383e-05, "loss": 0.4767, "num_tokens": 21729165.0, "step": 166 }, { "epoch": 0.06664006384676775, "grad_norm": 0.26516008377075195, "learning_rate": 2.220744680851064e-05, "loss": 0.4527, "num_tokens": 21860237.0, "step": 167 }, { "epoch": 0.0670391061452514, "grad_norm": 0.2508988380432129, "learning_rate": 2.2340425531914894e-05, "loss": 0.4269, "num_tokens": 21991309.0, "step": 168 }, { "epoch": 0.06743814844373504, "grad_norm": 0.24344177544116974, "learning_rate": 2.247340425531915e-05, "loss": 0.4442, "num_tokens": 22122381.0, "step": 169 }, { "epoch": 0.06783719074221868, "grad_norm": 0.25720179080963135, "learning_rate": 2.2606382978723408e-05, "loss": 0.5041, "num_tokens": 22253453.0, "step": 170 }, { "epoch": 0.06823623304070231, "grad_norm": 0.25548672676086426, "learning_rate": 2.273936170212766e-05, "loss": 0.4409, "num_tokens": 22384525.0, "step": 171 }, { "epoch": 0.06863527533918595, "grad_norm": 0.2882319986820221, "learning_rate": 2.2872340425531915e-05, "loss": 0.4793, "num_tokens": 22515597.0, "step": 172 }, { "epoch": 0.0690343176376696, "grad_norm": 0.23093490302562714, "learning_rate": 2.300531914893617e-05, "loss": 0.4106, "num_tokens": 22646669.0, "step": 173 }, { "epoch": 0.06943335993615324, "grad_norm": 0.22624988853931427, "learning_rate": 2.3138297872340425e-05, "loss": 0.3542, "num_tokens": 22777741.0, "step": 174 }, { "epoch": 0.06983240223463687, "grad_norm": 0.2723310887813568, "learning_rate": 2.3271276595744682e-05, "loss": 0.4684, "num_tokens": 22908813.0, "step": 175 }, { "epoch": 0.07023144453312051, "grad_norm": 0.2723351716995239, "learning_rate": 2.340425531914894e-05, "loss": 0.4642, "num_tokens": 23039885.0, "step": 176 }, { "epoch": 0.07063048683160415, "grad_norm": 0.25692564249038696, "learning_rate": 2.3537234042553192e-05, "loss": 0.4389, "num_tokens": 23170957.0, "step": 177 }, { "epoch": 0.0710295291300878, "grad_norm": 0.2798815965652466, "learning_rate": 2.3670212765957446e-05, "loss": 0.477, "num_tokens": 23302029.0, "step": 178 }, { "epoch": 0.07142857142857142, "grad_norm": 0.27503278851509094, "learning_rate": 2.3803191489361703e-05, "loss": 0.4758, "num_tokens": 23433101.0, "step": 179 }, { "epoch": 0.07182761372705507, "grad_norm": 0.26907023787498474, "learning_rate": 2.393617021276596e-05, "loss": 0.4266, "num_tokens": 23564173.0, "step": 180 }, { "epoch": 0.07222665602553871, "grad_norm": 0.2460441142320633, "learning_rate": 2.4069148936170213e-05, "loss": 0.412, "num_tokens": 23695245.0, "step": 181 }, { "epoch": 0.07262569832402235, "grad_norm": 0.24602942168712616, "learning_rate": 2.420212765957447e-05, "loss": 0.4252, "num_tokens": 23826317.0, "step": 182 }, { "epoch": 0.07302474062250598, "grad_norm": 0.26344195008277893, "learning_rate": 2.4335106382978727e-05, "loss": 0.4541, "num_tokens": 23957389.0, "step": 183 }, { "epoch": 0.07342378292098962, "grad_norm": 0.25653916597366333, "learning_rate": 2.446808510638298e-05, "loss": 0.4672, "num_tokens": 24088461.0, "step": 184 }, { "epoch": 0.07382282521947327, "grad_norm": 0.22038860619068146, "learning_rate": 2.4601063829787234e-05, "loss": 0.3853, "num_tokens": 24219533.0, "step": 185 }, { "epoch": 0.0742218675179569, "grad_norm": 0.25787702202796936, "learning_rate": 2.473404255319149e-05, "loss": 0.4514, "num_tokens": 24350605.0, "step": 186 }, { "epoch": 0.07462090981644054, "grad_norm": 0.2464601993560791, "learning_rate": 2.4867021276595744e-05, "loss": 0.4529, "num_tokens": 24481677.0, "step": 187 }, { "epoch": 0.07501995211492418, "grad_norm": 0.25278592109680176, "learning_rate": 2.5e-05, "loss": 0.4372, "num_tokens": 24601733.0, "step": 188 }, { "epoch": 0.07541899441340782, "grad_norm": 0.23631125688552856, "learning_rate": 2.5132978723404255e-05, "loss": 0.4025, "num_tokens": 24732805.0, "step": 189 }, { "epoch": 0.07581803671189145, "grad_norm": 0.27361273765563965, "learning_rate": 2.5265957446808515e-05, "loss": 0.5305, "num_tokens": 24863877.0, "step": 190 }, { "epoch": 0.0762170790103751, "grad_norm": 0.22882258892059326, "learning_rate": 2.539893617021277e-05, "loss": 0.3577, "num_tokens": 24994949.0, "step": 191 }, { "epoch": 0.07661612130885874, "grad_norm": 0.24907666444778442, "learning_rate": 2.5531914893617022e-05, "loss": 0.4606, "num_tokens": 25126021.0, "step": 192 }, { "epoch": 0.07701516360734238, "grad_norm": 0.2512986361980438, "learning_rate": 2.566489361702128e-05, "loss": 0.4395, "num_tokens": 25257093.0, "step": 193 }, { "epoch": 0.07741420590582601, "grad_norm": 0.24734695255756378, "learning_rate": 2.5797872340425532e-05, "loss": 0.4508, "num_tokens": 25388165.0, "step": 194 }, { "epoch": 0.07781324820430965, "grad_norm": 0.24441903829574585, "learning_rate": 2.5930851063829786e-05, "loss": 0.4239, "num_tokens": 25519237.0, "step": 195 }, { "epoch": 0.0782122905027933, "grad_norm": 0.24900777637958527, "learning_rate": 2.6063829787234046e-05, "loss": 0.4733, "num_tokens": 25650309.0, "step": 196 }, { "epoch": 0.07861133280127694, "grad_norm": 0.28946638107299805, "learning_rate": 2.61968085106383e-05, "loss": 0.5206, "num_tokens": 25781381.0, "step": 197 }, { "epoch": 0.07901037509976057, "grad_norm": 0.24393562972545624, "learning_rate": 2.6329787234042553e-05, "loss": 0.4119, "num_tokens": 25912453.0, "step": 198 }, { "epoch": 0.07940941739824421, "grad_norm": 0.29705384373664856, "learning_rate": 2.646276595744681e-05, "loss": 0.5352, "num_tokens": 26043525.0, "step": 199 }, { "epoch": 0.07980845969672785, "grad_norm": 0.26417240500450134, "learning_rate": 2.6595744680851064e-05, "loss": 0.4464, "num_tokens": 26174597.0, "step": 200 }, { "epoch": 0.0802075019952115, "grad_norm": 0.2660817503929138, "learning_rate": 2.6728723404255317e-05, "loss": 0.4907, "num_tokens": 26305669.0, "step": 201 }, { "epoch": 0.08060654429369513, "grad_norm": 0.2478124052286148, "learning_rate": 2.6861702127659577e-05, "loss": 0.428, "num_tokens": 26436741.0, "step": 202 }, { "epoch": 0.08100558659217877, "grad_norm": 0.24880246818065643, "learning_rate": 2.699468085106383e-05, "loss": 0.4463, "num_tokens": 26567813.0, "step": 203 }, { "epoch": 0.08140462889066241, "grad_norm": 0.25435900688171387, "learning_rate": 2.7127659574468084e-05, "loss": 0.4682, "num_tokens": 26698885.0, "step": 204 }, { "epoch": 0.08180367118914605, "grad_norm": 0.25726500153541565, "learning_rate": 2.726063829787234e-05, "loss": 0.4461, "num_tokens": 26829957.0, "step": 205 }, { "epoch": 0.08220271348762968, "grad_norm": 0.2626053988933563, "learning_rate": 2.7393617021276595e-05, "loss": 0.4435, "num_tokens": 26961029.0, "step": 206 }, { "epoch": 0.08260175578611333, "grad_norm": 0.25883400440216064, "learning_rate": 2.7526595744680848e-05, "loss": 0.4652, "num_tokens": 27092101.0, "step": 207 }, { "epoch": 0.08300079808459697, "grad_norm": 0.26769956946372986, "learning_rate": 2.765957446808511e-05, "loss": 0.4728, "num_tokens": 27223173.0, "step": 208 }, { "epoch": 0.08339984038308061, "grad_norm": 0.2781299650669098, "learning_rate": 2.7792553191489362e-05, "loss": 0.442, "num_tokens": 27354245.0, "step": 209 }, { "epoch": 0.08379888268156424, "grad_norm": 0.2844361960887909, "learning_rate": 2.7925531914893616e-05, "loss": 0.4956, "num_tokens": 27485317.0, "step": 210 }, { "epoch": 0.08419792498004788, "grad_norm": 0.24751682579517365, "learning_rate": 2.8058510638297876e-05, "loss": 0.4572, "num_tokens": 27616389.0, "step": 211 }, { "epoch": 0.08459696727853153, "grad_norm": 0.2573261559009552, "learning_rate": 2.819148936170213e-05, "loss": 0.4535, "num_tokens": 27747461.0, "step": 212 }, { "epoch": 0.08499600957701517, "grad_norm": 0.23923224210739136, "learning_rate": 2.8324468085106386e-05, "loss": 0.3999, "num_tokens": 27878533.0, "step": 213 }, { "epoch": 0.0853950518754988, "grad_norm": 0.2774321734905243, "learning_rate": 2.845744680851064e-05, "loss": 0.4682, "num_tokens": 28009605.0, "step": 214 }, { "epoch": 0.08579409417398244, "grad_norm": 0.25129252672195435, "learning_rate": 2.8590425531914893e-05, "loss": 0.439, "num_tokens": 28140677.0, "step": 215 }, { "epoch": 0.08619313647246608, "grad_norm": 0.28384676575660706, "learning_rate": 2.8723404255319154e-05, "loss": 0.4839, "num_tokens": 28271749.0, "step": 216 }, { "epoch": 0.08659217877094973, "grad_norm": 0.27506160736083984, "learning_rate": 2.8856382978723407e-05, "loss": 0.4417, "num_tokens": 28402821.0, "step": 217 }, { "epoch": 0.08699122106943336, "grad_norm": 0.30230918526649475, "learning_rate": 2.898936170212766e-05, "loss": 0.5337, "num_tokens": 28533893.0, "step": 218 }, { "epoch": 0.087390263367917, "grad_norm": 0.2329840362071991, "learning_rate": 2.9122340425531917e-05, "loss": 0.3854, "num_tokens": 28664965.0, "step": 219 }, { "epoch": 0.08778930566640064, "grad_norm": 0.28985080122947693, "learning_rate": 2.925531914893617e-05, "loss": 0.4954, "num_tokens": 28796037.0, "step": 220 }, { "epoch": 0.08818834796488428, "grad_norm": 0.24004895985126495, "learning_rate": 2.9388297872340424e-05, "loss": 0.3922, "num_tokens": 28927109.0, "step": 221 }, { "epoch": 0.08858739026336791, "grad_norm": 0.2976018488407135, "learning_rate": 2.9521276595744685e-05, "loss": 0.4756, "num_tokens": 29058181.0, "step": 222 }, { "epoch": 0.08898643256185156, "grad_norm": 0.38856253027915955, "learning_rate": 2.9654255319148938e-05, "loss": 0.3924, "num_tokens": 29189253.0, "step": 223 }, { "epoch": 0.0893854748603352, "grad_norm": 0.26142188906669617, "learning_rate": 2.9787234042553192e-05, "loss": 0.4643, "num_tokens": 29320325.0, "step": 224 }, { "epoch": 0.08978451715881884, "grad_norm": 0.26540541648864746, "learning_rate": 2.9920212765957452e-05, "loss": 0.4745, "num_tokens": 29451397.0, "step": 225 }, { "epoch": 0.09018355945730247, "grad_norm": 0.2765464782714844, "learning_rate": 3.0053191489361706e-05, "loss": 0.48, "num_tokens": 29582469.0, "step": 226 }, { "epoch": 0.09058260175578611, "grad_norm": 0.25111889839172363, "learning_rate": 3.0186170212765956e-05, "loss": 0.4102, "num_tokens": 29713541.0, "step": 227 }, { "epoch": 0.09098164405426976, "grad_norm": 0.25406211614608765, "learning_rate": 3.0319148936170216e-05, "loss": 0.4486, "num_tokens": 29844613.0, "step": 228 }, { "epoch": 0.0913806863527534, "grad_norm": 0.32896336913108826, "learning_rate": 3.045212765957447e-05, "loss": 0.52, "num_tokens": 29975685.0, "step": 229 }, { "epoch": 0.09177972865123703, "grad_norm": 0.2737705409526825, "learning_rate": 3.0585106382978726e-05, "loss": 0.452, "num_tokens": 30106757.0, "step": 230 }, { "epoch": 0.09217877094972067, "grad_norm": 0.243442103266716, "learning_rate": 3.071808510638298e-05, "loss": 0.4463, "num_tokens": 30237829.0, "step": 231 }, { "epoch": 0.09257781324820431, "grad_norm": 0.2626839876174927, "learning_rate": 3.085106382978723e-05, "loss": 0.4269, "num_tokens": 30368901.0, "step": 232 }, { "epoch": 0.09297685554668794, "grad_norm": 0.25030234456062317, "learning_rate": 3.098404255319149e-05, "loss": 0.4393, "num_tokens": 30499973.0, "step": 233 }, { "epoch": 0.09337589784517158, "grad_norm": 0.25290098786354065, "learning_rate": 3.111702127659575e-05, "loss": 0.4195, "num_tokens": 30631045.0, "step": 234 }, { "epoch": 0.09377494014365523, "grad_norm": 0.2726176977157593, "learning_rate": 3.125e-05, "loss": 0.4558, "num_tokens": 30762117.0, "step": 235 }, { "epoch": 0.09417398244213887, "grad_norm": 0.2819133698940277, "learning_rate": 3.1382978723404254e-05, "loss": 0.4236, "num_tokens": 30893189.0, "step": 236 }, { "epoch": 0.0945730247406225, "grad_norm": 0.26953041553497314, "learning_rate": 3.1515957446808514e-05, "loss": 0.4556, "num_tokens": 31024261.0, "step": 237 }, { "epoch": 0.09497206703910614, "grad_norm": 0.26708343625068665, "learning_rate": 3.164893617021277e-05, "loss": 0.4354, "num_tokens": 31155333.0, "step": 238 }, { "epoch": 0.09537110933758979, "grad_norm": 0.269732803106308, "learning_rate": 3.178191489361702e-05, "loss": 0.4815, "num_tokens": 31286405.0, "step": 239 }, { "epoch": 0.09577015163607343, "grad_norm": 0.25556299090385437, "learning_rate": 3.191489361702128e-05, "loss": 0.4406, "num_tokens": 31417477.0, "step": 240 }, { "epoch": 0.09616919393455706, "grad_norm": 0.2531879246234894, "learning_rate": 3.2047872340425535e-05, "loss": 0.4486, "num_tokens": 31548549.0, "step": 241 }, { "epoch": 0.0965682362330407, "grad_norm": 0.25574174523353577, "learning_rate": 3.218085106382979e-05, "loss": 0.4543, "num_tokens": 31679621.0, "step": 242 }, { "epoch": 0.09696727853152434, "grad_norm": 0.2639654278755188, "learning_rate": 3.231382978723405e-05, "loss": 0.3734, "num_tokens": 31810693.0, "step": 243 }, { "epoch": 0.09736632083000799, "grad_norm": 0.2553044855594635, "learning_rate": 3.2446808510638296e-05, "loss": 0.4182, "num_tokens": 31941765.0, "step": 244 }, { "epoch": 0.09776536312849161, "grad_norm": 0.24754759669303894, "learning_rate": 3.257978723404255e-05, "loss": 0.4068, "num_tokens": 32072837.0, "step": 245 }, { "epoch": 0.09816440542697526, "grad_norm": 0.26426205039024353, "learning_rate": 3.271276595744681e-05, "loss": 0.4803, "num_tokens": 32203909.0, "step": 246 }, { "epoch": 0.0985634477254589, "grad_norm": 0.24866656959056854, "learning_rate": 3.284574468085106e-05, "loss": 0.4395, "num_tokens": 32334981.0, "step": 247 }, { "epoch": 0.09896249002394254, "grad_norm": 0.24908918142318726, "learning_rate": 3.2978723404255317e-05, "loss": 0.4492, "num_tokens": 32466053.0, "step": 248 }, { "epoch": 0.09936153232242617, "grad_norm": 0.23171289265155792, "learning_rate": 3.311170212765958e-05, "loss": 0.4092, "num_tokens": 32597125.0, "step": 249 }, { "epoch": 0.09976057462090981, "grad_norm": 0.24817681312561035, "learning_rate": 3.324468085106383e-05, "loss": 0.474, "num_tokens": 32728197.0, "step": 250 }, { "epoch": 0.10015961691939346, "grad_norm": 0.2394656538963318, "learning_rate": 3.3377659574468084e-05, "loss": 0.4492, "num_tokens": 32859269.0, "step": 251 }, { "epoch": 0.1005586592178771, "grad_norm": 0.23572421073913574, "learning_rate": 3.3510638297872344e-05, "loss": 0.3912, "num_tokens": 32990341.0, "step": 252 }, { "epoch": 0.10095770151636073, "grad_norm": 0.25200140476226807, "learning_rate": 3.36436170212766e-05, "loss": 0.4258, "num_tokens": 33121413.0, "step": 253 }, { "epoch": 0.10135674381484437, "grad_norm": 0.29105764627456665, "learning_rate": 3.377659574468085e-05, "loss": 0.5254, "num_tokens": 33252485.0, "step": 254 }, { "epoch": 0.10175578611332801, "grad_norm": 0.25955867767333984, "learning_rate": 3.390957446808511e-05, "loss": 0.4493, "num_tokens": 33383557.0, "step": 255 }, { "epoch": 0.10215482841181166, "grad_norm": 0.2748526334762573, "learning_rate": 3.4042553191489365e-05, "loss": 0.4559, "num_tokens": 33514629.0, "step": 256 }, { "epoch": 0.10255387071029529, "grad_norm": 0.24738065898418427, "learning_rate": 3.417553191489362e-05, "loss": 0.4248, "num_tokens": 33645701.0, "step": 257 }, { "epoch": 0.10295291300877893, "grad_norm": 0.2569233775138855, "learning_rate": 3.430851063829787e-05, "loss": 0.4638, "num_tokens": 33776773.0, "step": 258 }, { "epoch": 0.10335195530726257, "grad_norm": 0.2603776454925537, "learning_rate": 3.4441489361702125e-05, "loss": 0.4083, "num_tokens": 33893358.0, "step": 259 }, { "epoch": 0.10375099760574621, "grad_norm": 0.2682965099811554, "learning_rate": 3.4574468085106386e-05, "loss": 0.4642, "num_tokens": 34024430.0, "step": 260 }, { "epoch": 0.10415003990422984, "grad_norm": 0.24753111600875854, "learning_rate": 3.470744680851064e-05, "loss": 0.4158, "num_tokens": 34155502.0, "step": 261 }, { "epoch": 0.10454908220271349, "grad_norm": 0.269619345664978, "learning_rate": 3.484042553191489e-05, "loss": 0.4616, "num_tokens": 34286574.0, "step": 262 }, { "epoch": 0.10494812450119713, "grad_norm": 0.2668876051902771, "learning_rate": 3.497340425531915e-05, "loss": 0.4206, "num_tokens": 34417646.0, "step": 263 }, { "epoch": 0.10534716679968077, "grad_norm": 0.2894762456417084, "learning_rate": 3.5106382978723407e-05, "loss": 0.481, "num_tokens": 34548718.0, "step": 264 }, { "epoch": 0.1057462090981644, "grad_norm": 0.2637569010257721, "learning_rate": 3.523936170212766e-05, "loss": 0.4142, "num_tokens": 34665995.0, "step": 265 }, { "epoch": 0.10614525139664804, "grad_norm": 0.26821374893188477, "learning_rate": 3.537234042553192e-05, "loss": 0.4332, "num_tokens": 34797067.0, "step": 266 }, { "epoch": 0.10654429369513169, "grad_norm": 0.25334012508392334, "learning_rate": 3.5505319148936174e-05, "loss": 0.3959, "num_tokens": 34928139.0, "step": 267 }, { "epoch": 0.10694333599361533, "grad_norm": 0.26649630069732666, "learning_rate": 3.563829787234043e-05, "loss": 0.3919, "num_tokens": 35059211.0, "step": 268 }, { "epoch": 0.10734237829209896, "grad_norm": 0.2545076310634613, "learning_rate": 3.577127659574469e-05, "loss": 0.4382, "num_tokens": 35190283.0, "step": 269 }, { "epoch": 0.1077414205905826, "grad_norm": 0.26825466752052307, "learning_rate": 3.590425531914894e-05, "loss": 0.4659, "num_tokens": 35321355.0, "step": 270 }, { "epoch": 0.10814046288906624, "grad_norm": 0.29352736473083496, "learning_rate": 3.6037234042553195e-05, "loss": 0.447, "num_tokens": 35452427.0, "step": 271 }, { "epoch": 0.10853950518754989, "grad_norm": 0.2223019301891327, "learning_rate": 3.617021276595745e-05, "loss": 0.3593, "num_tokens": 35583499.0, "step": 272 }, { "epoch": 0.10893854748603352, "grad_norm": 0.23453085124492645, "learning_rate": 3.63031914893617e-05, "loss": 0.4136, "num_tokens": 35714571.0, "step": 273 }, { "epoch": 0.10933758978451716, "grad_norm": 0.2663435637950897, "learning_rate": 3.6436170212765955e-05, "loss": 0.4805, "num_tokens": 35845643.0, "step": 274 }, { "epoch": 0.1097366320830008, "grad_norm": 0.25560593605041504, "learning_rate": 3.6569148936170215e-05, "loss": 0.4177, "num_tokens": 35976715.0, "step": 275 }, { "epoch": 0.11013567438148444, "grad_norm": 0.25389614701271057, "learning_rate": 3.670212765957447e-05, "loss": 0.454, "num_tokens": 36107787.0, "step": 276 }, { "epoch": 0.11053471667996807, "grad_norm": 0.2697886824607849, "learning_rate": 3.683510638297872e-05, "loss": 0.4744, "num_tokens": 36238859.0, "step": 277 }, { "epoch": 0.11093375897845172, "grad_norm": 0.24942241609096527, "learning_rate": 3.696808510638298e-05, "loss": 0.4221, "num_tokens": 36369931.0, "step": 278 }, { "epoch": 0.11133280127693536, "grad_norm": 0.24209865927696228, "learning_rate": 3.7101063829787236e-05, "loss": 0.4243, "num_tokens": 36501003.0, "step": 279 }, { "epoch": 0.11173184357541899, "grad_norm": 0.283399373292923, "learning_rate": 3.723404255319149e-05, "loss": 0.4303, "num_tokens": 36632075.0, "step": 280 }, { "epoch": 0.11213088587390263, "grad_norm": 0.2620205581188202, "learning_rate": 3.736702127659575e-05, "loss": 0.4415, "num_tokens": 36763147.0, "step": 281 }, { "epoch": 0.11252992817238627, "grad_norm": 0.26432865858078003, "learning_rate": 3.7500000000000003e-05, "loss": 0.4274, "num_tokens": 36894219.0, "step": 282 }, { "epoch": 0.11292897047086992, "grad_norm": 0.26603326201438904, "learning_rate": 3.763297872340426e-05, "loss": 0.4586, "num_tokens": 37025291.0, "step": 283 }, { "epoch": 0.11332801276935354, "grad_norm": 0.2697400748729706, "learning_rate": 3.776595744680852e-05, "loss": 0.4892, "num_tokens": 37156363.0, "step": 284 }, { "epoch": 0.11372705506783719, "grad_norm": 0.26621922850608826, "learning_rate": 3.789893617021277e-05, "loss": 0.4656, "num_tokens": 37287435.0, "step": 285 }, { "epoch": 0.11412609736632083, "grad_norm": 0.2813340425491333, "learning_rate": 3.8031914893617024e-05, "loss": 0.4682, "num_tokens": 37418507.0, "step": 286 }, { "epoch": 0.11452513966480447, "grad_norm": 0.27705639600753784, "learning_rate": 3.816489361702128e-05, "loss": 0.4085, "num_tokens": 37549579.0, "step": 287 }, { "epoch": 0.1149241819632881, "grad_norm": 0.275735080242157, "learning_rate": 3.829787234042553e-05, "loss": 0.4695, "num_tokens": 37680651.0, "step": 288 }, { "epoch": 0.11532322426177175, "grad_norm": 0.2951165735721588, "learning_rate": 3.8430851063829785e-05, "loss": 0.4889, "num_tokens": 37811723.0, "step": 289 }, { "epoch": 0.11572226656025539, "grad_norm": 0.2487882375717163, "learning_rate": 3.8563829787234045e-05, "loss": 0.449, "num_tokens": 37942795.0, "step": 290 }, { "epoch": 0.11612130885873903, "grad_norm": 0.3169589340686798, "learning_rate": 3.86968085106383e-05, "loss": 0.5122, "num_tokens": 38073867.0, "step": 291 }, { "epoch": 0.11652035115722266, "grad_norm": 0.2647607624530792, "learning_rate": 3.882978723404255e-05, "loss": 0.3695, "num_tokens": 38204939.0, "step": 292 }, { "epoch": 0.1169193934557063, "grad_norm": 0.2594250440597534, "learning_rate": 3.896276595744681e-05, "loss": 0.4252, "num_tokens": 38336011.0, "step": 293 }, { "epoch": 0.11731843575418995, "grad_norm": 0.2651838958263397, "learning_rate": 3.9095744680851066e-05, "loss": 0.4655, "num_tokens": 38467083.0, "step": 294 }, { "epoch": 0.11771747805267359, "grad_norm": 0.27622339129447937, "learning_rate": 3.922872340425532e-05, "loss": 0.4499, "num_tokens": 38598155.0, "step": 295 }, { "epoch": 0.11811652035115722, "grad_norm": 0.2986569106578827, "learning_rate": 3.936170212765958e-05, "loss": 0.4578, "num_tokens": 38729227.0, "step": 296 }, { "epoch": 0.11851556264964086, "grad_norm": 0.2824555039405823, "learning_rate": 3.949468085106383e-05, "loss": 0.4787, "num_tokens": 38860299.0, "step": 297 }, { "epoch": 0.1189146049481245, "grad_norm": 0.26430743932724, "learning_rate": 3.962765957446809e-05, "loss": 0.4529, "num_tokens": 38991371.0, "step": 298 }, { "epoch": 0.11931364724660815, "grad_norm": 0.2673518657684326, "learning_rate": 3.976063829787234e-05, "loss": 0.4582, "num_tokens": 39122443.0, "step": 299 }, { "epoch": 0.11971268954509177, "grad_norm": 0.25658971071243286, "learning_rate": 3.9893617021276594e-05, "loss": 0.4365, "num_tokens": 39253515.0, "step": 300 }, { "epoch": 0.12011173184357542, "grad_norm": 0.2811719477176666, "learning_rate": 4.002659574468085e-05, "loss": 0.4676, "num_tokens": 39374511.0, "step": 301 }, { "epoch": 0.12051077414205906, "grad_norm": 0.2956825792789459, "learning_rate": 4.015957446808511e-05, "loss": 0.486, "num_tokens": 39505583.0, "step": 302 }, { "epoch": 0.1209098164405427, "grad_norm": 0.249615877866745, "learning_rate": 4.029255319148936e-05, "loss": 0.4235, "num_tokens": 39636655.0, "step": 303 }, { "epoch": 0.12130885873902633, "grad_norm": 0.2759823501110077, "learning_rate": 4.0425531914893614e-05, "loss": 0.4714, "num_tokens": 39767727.0, "step": 304 }, { "epoch": 0.12170790103750997, "grad_norm": 0.2765004634857178, "learning_rate": 4.0558510638297875e-05, "loss": 0.4637, "num_tokens": 39898799.0, "step": 305 }, { "epoch": 0.12210694333599362, "grad_norm": 0.29050320386886597, "learning_rate": 4.069148936170213e-05, "loss": 0.5341, "num_tokens": 40029871.0, "step": 306 }, { "epoch": 0.12250598563447726, "grad_norm": 0.2688447833061218, "learning_rate": 4.082446808510639e-05, "loss": 0.4759, "num_tokens": 40160943.0, "step": 307 }, { "epoch": 0.12290502793296089, "grad_norm": 0.22201116383075714, "learning_rate": 4.095744680851064e-05, "loss": 0.3876, "num_tokens": 40292015.0, "step": 308 }, { "epoch": 0.12330407023144453, "grad_norm": 0.26729416847229004, "learning_rate": 4.1090425531914896e-05, "loss": 0.4473, "num_tokens": 40423087.0, "step": 309 }, { "epoch": 0.12370311252992817, "grad_norm": 0.25963929295539856, "learning_rate": 4.1223404255319156e-05, "loss": 0.4454, "num_tokens": 40554159.0, "step": 310 }, { "epoch": 0.12410215482841182, "grad_norm": 0.2458062469959259, "learning_rate": 4.135638297872341e-05, "loss": 0.4106, "num_tokens": 40685231.0, "step": 311 }, { "epoch": 0.12450119712689545, "grad_norm": 0.2868601381778717, "learning_rate": 4.148936170212766e-05, "loss": 0.5142, "num_tokens": 40816303.0, "step": 312 }, { "epoch": 0.12490023942537909, "grad_norm": 0.2443508505821228, "learning_rate": 4.1622340425531916e-05, "loss": 0.4495, "num_tokens": 40947375.0, "step": 313 }, { "epoch": 0.12529928172386273, "grad_norm": 0.2702677547931671, "learning_rate": 4.175531914893617e-05, "loss": 0.4573, "num_tokens": 41078447.0, "step": 314 }, { "epoch": 0.12569832402234637, "grad_norm": 0.24030081927776337, "learning_rate": 4.188829787234042e-05, "loss": 0.4388, "num_tokens": 41209519.0, "step": 315 }, { "epoch": 0.12609736632083002, "grad_norm": 0.2729737162590027, "learning_rate": 4.2021276595744684e-05, "loss": 0.4515, "num_tokens": 41340591.0, "step": 316 }, { "epoch": 0.12649640861931366, "grad_norm": 0.2628028988838196, "learning_rate": 4.215425531914894e-05, "loss": 0.4501, "num_tokens": 41471663.0, "step": 317 }, { "epoch": 0.12689545091779728, "grad_norm": 0.2883978486061096, "learning_rate": 4.228723404255319e-05, "loss": 0.5045, "num_tokens": 41602735.0, "step": 318 }, { "epoch": 0.12729449321628092, "grad_norm": 0.23867759108543396, "learning_rate": 4.242021276595745e-05, "loss": 0.4074, "num_tokens": 41733807.0, "step": 319 }, { "epoch": 0.12769353551476456, "grad_norm": 0.2362176775932312, "learning_rate": 4.2553191489361704e-05, "loss": 0.3959, "num_tokens": 41864879.0, "step": 320 }, { "epoch": 0.1280925778132482, "grad_norm": 0.267561137676239, "learning_rate": 4.268617021276596e-05, "loss": 0.4085, "num_tokens": 41995951.0, "step": 321 }, { "epoch": 0.12849162011173185, "grad_norm": 0.2556478679180145, "learning_rate": 4.281914893617022e-05, "loss": 0.4102, "num_tokens": 42127023.0, "step": 322 }, { "epoch": 0.1288906624102155, "grad_norm": 0.23905949294567108, "learning_rate": 4.295212765957447e-05, "loss": 0.4006, "num_tokens": 42258095.0, "step": 323 }, { "epoch": 0.12928970470869913, "grad_norm": 0.26740846037864685, "learning_rate": 4.3085106382978725e-05, "loss": 0.4714, "num_tokens": 42389167.0, "step": 324 }, { "epoch": 0.12968874700718275, "grad_norm": 0.2509155571460724, "learning_rate": 4.3218085106382986e-05, "loss": 0.4336, "num_tokens": 42520239.0, "step": 325 }, { "epoch": 0.1300877893056664, "grad_norm": 0.23864826560020447, "learning_rate": 4.335106382978724e-05, "loss": 0.4153, "num_tokens": 42651311.0, "step": 326 }, { "epoch": 0.13048683160415003, "grad_norm": 0.2588057816028595, "learning_rate": 4.348404255319149e-05, "loss": 0.4233, "num_tokens": 42782383.0, "step": 327 }, { "epoch": 0.13088587390263368, "grad_norm": 0.24003636837005615, "learning_rate": 4.3617021276595746e-05, "loss": 0.4043, "num_tokens": 42907775.0, "step": 328 }, { "epoch": 0.13128491620111732, "grad_norm": 0.2445879876613617, "learning_rate": 4.375e-05, "loss": 0.4573, "num_tokens": 43038847.0, "step": 329 }, { "epoch": 0.13168395849960096, "grad_norm": 0.284863144159317, "learning_rate": 4.388297872340425e-05, "loss": 0.4748, "num_tokens": 43169919.0, "step": 330 }, { "epoch": 0.1320830007980846, "grad_norm": 0.23044171929359436, "learning_rate": 4.401595744680851e-05, "loss": 0.4076, "num_tokens": 43300991.0, "step": 331 }, { "epoch": 0.13248204309656825, "grad_norm": 0.26539987325668335, "learning_rate": 4.414893617021277e-05, "loss": 0.4646, "num_tokens": 43432063.0, "step": 332 }, { "epoch": 0.13288108539505186, "grad_norm": 0.2509593367576599, "learning_rate": 4.428191489361702e-05, "loss": 0.4148, "num_tokens": 43563135.0, "step": 333 }, { "epoch": 0.1332801276935355, "grad_norm": 0.26571938395500183, "learning_rate": 4.441489361702128e-05, "loss": 0.4763, "num_tokens": 43694207.0, "step": 334 }, { "epoch": 0.13367916999201915, "grad_norm": 0.259259968996048, "learning_rate": 4.4547872340425534e-05, "loss": 0.4368, "num_tokens": 43825279.0, "step": 335 }, { "epoch": 0.1340782122905028, "grad_norm": 0.2563018202781677, "learning_rate": 4.468085106382979e-05, "loss": 0.471, "num_tokens": 43956351.0, "step": 336 }, { "epoch": 0.13447725458898643, "grad_norm": 0.28118348121643066, "learning_rate": 4.481382978723405e-05, "loss": 0.4572, "num_tokens": 44087423.0, "step": 337 }, { "epoch": 0.13487629688747008, "grad_norm": 0.26723238825798035, "learning_rate": 4.49468085106383e-05, "loss": 0.4747, "num_tokens": 44218495.0, "step": 338 }, { "epoch": 0.13527533918595372, "grad_norm": 0.2541075646877289, "learning_rate": 4.5079787234042555e-05, "loss": 0.4242, "num_tokens": 44349567.0, "step": 339 }, { "epoch": 0.13567438148443736, "grad_norm": 0.3039921820163727, "learning_rate": 4.5212765957446815e-05, "loss": 0.515, "num_tokens": 44480639.0, "step": 340 }, { "epoch": 0.13607342378292098, "grad_norm": 0.2583296000957489, "learning_rate": 4.534574468085107e-05, "loss": 0.4542, "num_tokens": 44611711.0, "step": 341 }, { "epoch": 0.13647246608140462, "grad_norm": 0.23683951795101166, "learning_rate": 4.547872340425532e-05, "loss": 0.4384, "num_tokens": 44742783.0, "step": 342 }, { "epoch": 0.13687150837988826, "grad_norm": 0.3144960105419159, "learning_rate": 4.5611702127659576e-05, "loss": 0.5697, "num_tokens": 44873855.0, "step": 343 }, { "epoch": 0.1372705506783719, "grad_norm": 0.23996654152870178, "learning_rate": 4.574468085106383e-05, "loss": 0.4397, "num_tokens": 45004927.0, "step": 344 }, { "epoch": 0.13766959297685555, "grad_norm": 0.24481624364852905, "learning_rate": 4.587765957446808e-05, "loss": 0.3968, "num_tokens": 45135999.0, "step": 345 }, { "epoch": 0.1380686352753392, "grad_norm": 0.2536270022392273, "learning_rate": 4.601063829787234e-05, "loss": 0.4022, "num_tokens": 45267071.0, "step": 346 }, { "epoch": 0.13846767757382283, "grad_norm": 0.24861963093280792, "learning_rate": 4.6143617021276597e-05, "loss": 0.3979, "num_tokens": 45398143.0, "step": 347 }, { "epoch": 0.13886671987230648, "grad_norm": 0.30415603518486023, "learning_rate": 4.627659574468085e-05, "loss": 0.4833, "num_tokens": 45514730.0, "step": 348 }, { "epoch": 0.1392657621707901, "grad_norm": 0.25523510575294495, "learning_rate": 4.640957446808511e-05, "loss": 0.4294, "num_tokens": 45645802.0, "step": 349 }, { "epoch": 0.13966480446927373, "grad_norm": 0.28670480847358704, "learning_rate": 4.6542553191489364e-05, "loss": 0.4955, "num_tokens": 45776874.0, "step": 350 }, { "epoch": 0.14006384676775738, "grad_norm": 0.2498353272676468, "learning_rate": 4.667553191489362e-05, "loss": 0.3862, "num_tokens": 45907946.0, "step": 351 }, { "epoch": 0.14046288906624102, "grad_norm": 0.31287431716918945, "learning_rate": 4.680851063829788e-05, "loss": 0.5251, "num_tokens": 46035770.0, "step": 352 }, { "epoch": 0.14086193136472466, "grad_norm": 0.23651902377605438, "learning_rate": 4.694148936170213e-05, "loss": 0.4289, "num_tokens": 46166842.0, "step": 353 }, { "epoch": 0.1412609736632083, "grad_norm": 0.2838471829891205, "learning_rate": 4.7074468085106385e-05, "loss": 0.4817, "num_tokens": 46297914.0, "step": 354 }, { "epoch": 0.14166001596169195, "grad_norm": 0.26003915071487427, "learning_rate": 4.720744680851064e-05, "loss": 0.4327, "num_tokens": 46428986.0, "step": 355 }, { "epoch": 0.1420590582601756, "grad_norm": 0.24603037536144257, "learning_rate": 4.734042553191489e-05, "loss": 0.4251, "num_tokens": 46560058.0, "step": 356 }, { "epoch": 0.1424581005586592, "grad_norm": 0.24688659608364105, "learning_rate": 4.747340425531915e-05, "loss": 0.4196, "num_tokens": 46691130.0, "step": 357 }, { "epoch": 0.14285714285714285, "grad_norm": 0.26346006989479065, "learning_rate": 4.7606382978723405e-05, "loss": 0.46, "num_tokens": 46822202.0, "step": 358 }, { "epoch": 0.1432561851556265, "grad_norm": 0.30917975306510925, "learning_rate": 4.773936170212766e-05, "loss": 0.5142, "num_tokens": 46953274.0, "step": 359 }, { "epoch": 0.14365522745411013, "grad_norm": 0.3257162868976593, "learning_rate": 4.787234042553192e-05, "loss": 0.4828, "num_tokens": 47084346.0, "step": 360 }, { "epoch": 0.14405426975259378, "grad_norm": 0.2820822298526764, "learning_rate": 4.800531914893617e-05, "loss": 0.4848, "num_tokens": 47215418.0, "step": 361 }, { "epoch": 0.14445331205107742, "grad_norm": 0.4765893816947937, "learning_rate": 4.8138297872340426e-05, "loss": 0.466, "num_tokens": 47346490.0, "step": 362 }, { "epoch": 0.14485235434956106, "grad_norm": 0.2625393867492676, "learning_rate": 4.8271276595744686e-05, "loss": 0.4546, "num_tokens": 47477562.0, "step": 363 }, { "epoch": 0.1452513966480447, "grad_norm": 0.2586839497089386, "learning_rate": 4.840425531914894e-05, "loss": 0.4525, "num_tokens": 47608634.0, "step": 364 }, { "epoch": 0.14565043894652832, "grad_norm": 0.25251245498657227, "learning_rate": 4.8537234042553194e-05, "loss": 0.4083, "num_tokens": 47739706.0, "step": 365 }, { "epoch": 0.14604948124501196, "grad_norm": 0.27271008491516113, "learning_rate": 4.8670212765957454e-05, "loss": 0.4783, "num_tokens": 47870778.0, "step": 366 }, { "epoch": 0.1464485235434956, "grad_norm": 0.26495373249053955, "learning_rate": 4.880319148936171e-05, "loss": 0.4987, "num_tokens": 48001850.0, "step": 367 }, { "epoch": 0.14684756584197925, "grad_norm": 0.26305249333381653, "learning_rate": 4.893617021276596e-05, "loss": 0.4186, "num_tokens": 48132922.0, "step": 368 }, { "epoch": 0.1472466081404629, "grad_norm": 0.2491915076971054, "learning_rate": 4.9069148936170214e-05, "loss": 0.4364, "num_tokens": 48263994.0, "step": 369 }, { "epoch": 0.14764565043894654, "grad_norm": 0.2774709463119507, "learning_rate": 4.920212765957447e-05, "loss": 0.4419, "num_tokens": 48395066.0, "step": 370 }, { "epoch": 0.14804469273743018, "grad_norm": 0.25710538029670715, "learning_rate": 4.933510638297872e-05, "loss": 0.4702, "num_tokens": 48526138.0, "step": 371 }, { "epoch": 0.1484437350359138, "grad_norm": 0.2623225450515747, "learning_rate": 4.946808510638298e-05, "loss": 0.4643, "num_tokens": 48657210.0, "step": 372 }, { "epoch": 0.14884277733439744, "grad_norm": 0.29057955741882324, "learning_rate": 4.9601063829787235e-05, "loss": 0.4732, "num_tokens": 48788282.0, "step": 373 }, { "epoch": 0.14924181963288108, "grad_norm": 0.2733030319213867, "learning_rate": 4.973404255319149e-05, "loss": 0.4295, "num_tokens": 48919354.0, "step": 374 }, { "epoch": 0.14964086193136472, "grad_norm": 0.25393885374069214, "learning_rate": 4.986702127659575e-05, "loss": 0.4224, "num_tokens": 49050426.0, "step": 375 }, { "epoch": 0.15003990422984836, "grad_norm": 0.2650797367095947, "learning_rate": 5e-05, "loss": 0.4542, "num_tokens": 49176932.0, "step": 376 }, { "epoch": 0.150438946528332, "grad_norm": 0.27149730920791626, "learning_rate": 4.999999782322987e-05, "loss": 0.4507, "num_tokens": 49308004.0, "step": 377 }, { "epoch": 0.15083798882681565, "grad_norm": 0.24070300161838531, "learning_rate": 4.999999129291991e-05, "loss": 0.4344, "num_tokens": 49439076.0, "step": 378 }, { "epoch": 0.1512370311252993, "grad_norm": 0.2832837998867035, "learning_rate": 4.999998040907137e-05, "loss": 0.4522, "num_tokens": 49570148.0, "step": 379 }, { "epoch": 0.1516360734237829, "grad_norm": 0.23114994168281555, "learning_rate": 4.999996517168636e-05, "loss": 0.3903, "num_tokens": 49701220.0, "step": 380 }, { "epoch": 0.15203511572226655, "grad_norm": 0.2606949806213379, "learning_rate": 4.999994558076784e-05, "loss": 0.4567, "num_tokens": 49829395.0, "step": 381 }, { "epoch": 0.1524341580207502, "grad_norm": 0.2790696918964386, "learning_rate": 4.999992163631958e-05, "loss": 0.4339, "num_tokens": 49960467.0, "step": 382 }, { "epoch": 0.15283320031923384, "grad_norm": 0.3148126006126404, "learning_rate": 4.999989333834624e-05, "loss": 0.4979, "num_tokens": 50091539.0, "step": 383 }, { "epoch": 0.15323224261771748, "grad_norm": 0.2409343272447586, "learning_rate": 4.999986068685327e-05, "loss": 0.4227, "num_tokens": 50222611.0, "step": 384 }, { "epoch": 0.15363128491620112, "grad_norm": 0.2819683849811554, "learning_rate": 4.9999823681847e-05, "loss": 0.4923, "num_tokens": 50353683.0, "step": 385 }, { "epoch": 0.15403032721468476, "grad_norm": 0.2697558104991913, "learning_rate": 4.99997823233346e-05, "loss": 0.401, "num_tokens": 50484755.0, "step": 386 }, { "epoch": 0.1544293695131684, "grad_norm": 0.2588367164134979, "learning_rate": 4.999973661132404e-05, "loss": 0.4399, "num_tokens": 50615827.0, "step": 387 }, { "epoch": 0.15482841181165202, "grad_norm": 0.25009846687316895, "learning_rate": 4.9999686545824206e-05, "loss": 0.4175, "num_tokens": 50744584.0, "step": 388 }, { "epoch": 0.15522745411013567, "grad_norm": 0.3039231300354004, "learning_rate": 4.999963212684475e-05, "loss": 0.5031, "num_tokens": 50875656.0, "step": 389 }, { "epoch": 0.1556264964086193, "grad_norm": 0.24797455966472626, "learning_rate": 4.999957335439622e-05, "loss": 0.4399, "num_tokens": 51006728.0, "step": 390 }, { "epoch": 0.15602553870710295, "grad_norm": 0.25278258323669434, "learning_rate": 4.999951022848998e-05, "loss": 0.4525, "num_tokens": 51137800.0, "step": 391 }, { "epoch": 0.1564245810055866, "grad_norm": 0.28019872307777405, "learning_rate": 4.999944274913826e-05, "loss": 0.482, "num_tokens": 51268872.0, "step": 392 }, { "epoch": 0.15682362330407024, "grad_norm": 0.2749232351779938, "learning_rate": 4.99993709163541e-05, "loss": 0.463, "num_tokens": 51399944.0, "step": 393 }, { "epoch": 0.15722266560255388, "grad_norm": 0.2897484600543976, "learning_rate": 4.9999294730151404e-05, "loss": 0.4435, "num_tokens": 51531016.0, "step": 394 }, { "epoch": 0.15762170790103752, "grad_norm": 0.29252028465270996, "learning_rate": 4.999921419054492e-05, "loss": 0.4622, "num_tokens": 51662088.0, "step": 395 }, { "epoch": 0.15802075019952114, "grad_norm": 0.2565585672855377, "learning_rate": 4.999912929755021e-05, "loss": 0.3954, "num_tokens": 51793160.0, "step": 396 }, { "epoch": 0.15841979249800478, "grad_norm": 0.2494918704032898, "learning_rate": 4.9999040051183724e-05, "loss": 0.4358, "num_tokens": 51924232.0, "step": 397 }, { "epoch": 0.15881883479648842, "grad_norm": 0.26754865050315857, "learning_rate": 4.999894645146272e-05, "loss": 0.4787, "num_tokens": 52055304.0, "step": 398 }, { "epoch": 0.15921787709497207, "grad_norm": 0.27321484684944153, "learning_rate": 4.999884849840531e-05, "loss": 0.4408, "num_tokens": 52186376.0, "step": 399 }, { "epoch": 0.1596169193934557, "grad_norm": 0.2539835274219513, "learning_rate": 4.999874619203045e-05, "loss": 0.4291, "num_tokens": 52317448.0, "step": 400 }, { "epoch": 0.16001596169193935, "grad_norm": 0.24174118041992188, "learning_rate": 4.999863953235793e-05, "loss": 0.4296, "num_tokens": 52448520.0, "step": 401 }, { "epoch": 0.160415003990423, "grad_norm": 0.2513832151889801, "learning_rate": 4.999852851940839e-05, "loss": 0.4075, "num_tokens": 52579592.0, "step": 402 }, { "epoch": 0.16081404628890664, "grad_norm": 0.25203490257263184, "learning_rate": 4.999841315320331e-05, "loss": 0.4575, "num_tokens": 52710664.0, "step": 403 }, { "epoch": 0.16121308858739025, "grad_norm": 0.27181363105773926, "learning_rate": 4.9998293433765016e-05, "loss": 0.486, "num_tokens": 52841736.0, "step": 404 }, { "epoch": 0.1616121308858739, "grad_norm": 0.27551335096359253, "learning_rate": 4.999816936111666e-05, "loss": 0.4941, "num_tokens": 52972808.0, "step": 405 }, { "epoch": 0.16201117318435754, "grad_norm": 0.3021710515022278, "learning_rate": 4.999804093528226e-05, "loss": 0.3864, "num_tokens": 53103880.0, "step": 406 }, { "epoch": 0.16241021548284118, "grad_norm": 0.28887584805488586, "learning_rate": 4.999790815628668e-05, "loss": 0.4976, "num_tokens": 53234952.0, "step": 407 }, { "epoch": 0.16280925778132482, "grad_norm": 0.2791736125946045, "learning_rate": 4.9997771024155575e-05, "loss": 0.4455, "num_tokens": 53366024.0, "step": 408 }, { "epoch": 0.16320830007980847, "grad_norm": 0.3035256564617157, "learning_rate": 4.99976295389155e-05, "loss": 0.5071, "num_tokens": 53497096.0, "step": 409 }, { "epoch": 0.1636073423782921, "grad_norm": 0.26846498250961304, "learning_rate": 4.9997483700593836e-05, "loss": 0.4396, "num_tokens": 53628168.0, "step": 410 }, { "epoch": 0.16400638467677575, "grad_norm": 0.284834086894989, "learning_rate": 4.99973335092188e-05, "loss": 0.476, "num_tokens": 53759240.0, "step": 411 }, { "epoch": 0.16440542697525937, "grad_norm": 0.25212979316711426, "learning_rate": 4.9997178964819434e-05, "loss": 0.4121, "num_tokens": 53890312.0, "step": 412 }, { "epoch": 0.164804469273743, "grad_norm": 0.2699526250362396, "learning_rate": 4.999702006742566e-05, "loss": 0.4472, "num_tokens": 54021384.0, "step": 413 }, { "epoch": 0.16520351157222665, "grad_norm": 0.24566955864429474, "learning_rate": 4.9996856817068224e-05, "loss": 0.4517, "num_tokens": 54152456.0, "step": 414 }, { "epoch": 0.1656025538707103, "grad_norm": 0.23138391971588135, "learning_rate": 4.99966892137787e-05, "loss": 0.4134, "num_tokens": 54283528.0, "step": 415 }, { "epoch": 0.16600159616919394, "grad_norm": 0.26867836713790894, "learning_rate": 4.9996517257589524e-05, "loss": 0.4301, "num_tokens": 54414600.0, "step": 416 }, { "epoch": 0.16640063846767758, "grad_norm": 0.2700470983982086, "learning_rate": 4.999634094853397e-05, "loss": 0.4542, "num_tokens": 54545672.0, "step": 417 }, { "epoch": 0.16679968076616122, "grad_norm": 0.23979073762893677, "learning_rate": 4.999616028664615e-05, "loss": 0.3899, "num_tokens": 54676744.0, "step": 418 }, { "epoch": 0.16719872306464484, "grad_norm": 0.23637248575687408, "learning_rate": 4.999597527196103e-05, "loss": 0.4006, "num_tokens": 54807816.0, "step": 419 }, { "epoch": 0.16759776536312848, "grad_norm": 0.23879370093345642, "learning_rate": 4.9995785904514384e-05, "loss": 0.3556, "num_tokens": 54938888.0, "step": 420 }, { "epoch": 0.16799680766161212, "grad_norm": 0.271435409784317, "learning_rate": 4.999559218434288e-05, "loss": 0.4151, "num_tokens": 55069960.0, "step": 421 }, { "epoch": 0.16839584996009577, "grad_norm": 0.28393521904945374, "learning_rate": 4.999539411148399e-05, "loss": 0.4533, "num_tokens": 55201032.0, "step": 422 }, { "epoch": 0.1687948922585794, "grad_norm": 0.3027734160423279, "learning_rate": 4.999519168597604e-05, "loss": 0.5072, "num_tokens": 55332104.0, "step": 423 }, { "epoch": 0.16919393455706305, "grad_norm": 0.30562442541122437, "learning_rate": 4.999498490785818e-05, "loss": 0.5557, "num_tokens": 55463176.0, "step": 424 }, { "epoch": 0.1695929768555467, "grad_norm": 0.3004646599292755, "learning_rate": 4.999477377717045e-05, "loss": 0.4619, "num_tokens": 55574978.0, "step": 425 }, { "epoch": 0.16999201915403034, "grad_norm": 0.2633204460144043, "learning_rate": 4.999455829395368e-05, "loss": 0.3882, "num_tokens": 55696038.0, "step": 426 }, { "epoch": 0.17039106145251395, "grad_norm": 0.23509864509105682, "learning_rate": 4.999433845824958e-05, "loss": 0.3819, "num_tokens": 55827110.0, "step": 427 }, { "epoch": 0.1707901037509976, "grad_norm": 0.27235203981399536, "learning_rate": 4.9994114270100675e-05, "loss": 0.4495, "num_tokens": 55958182.0, "step": 428 }, { "epoch": 0.17118914604948124, "grad_norm": 0.2775338590145111, "learning_rate": 4.9993885729550346e-05, "loss": 0.4738, "num_tokens": 56089254.0, "step": 429 }, { "epoch": 0.17158818834796488, "grad_norm": 0.28202539682388306, "learning_rate": 4.999365283664281e-05, "loss": 0.4373, "num_tokens": 56220326.0, "step": 430 }, { "epoch": 0.17198723064644852, "grad_norm": 0.2551959753036499, "learning_rate": 4.999341559142314e-05, "loss": 0.4107, "num_tokens": 56351398.0, "step": 431 }, { "epoch": 0.17238627294493217, "grad_norm": 0.25013741850852966, "learning_rate": 4.999317399393723e-05, "loss": 0.4097, "num_tokens": 56482470.0, "step": 432 }, { "epoch": 0.1727853152434158, "grad_norm": 0.2491718977689743, "learning_rate": 4.9992928044231834e-05, "loss": 0.4315, "num_tokens": 56612765.0, "step": 433 }, { "epoch": 0.17318435754189945, "grad_norm": 0.2800091803073883, "learning_rate": 4.9992677742354526e-05, "loss": 0.4952, "num_tokens": 56730338.0, "step": 434 }, { "epoch": 0.17358339984038307, "grad_norm": 0.24700672924518585, "learning_rate": 4.9992423088353765e-05, "loss": 0.3996, "num_tokens": 56861410.0, "step": 435 }, { "epoch": 0.1739824421388667, "grad_norm": 0.25681599974632263, "learning_rate": 4.9992164082278795e-05, "loss": 0.435, "num_tokens": 56992482.0, "step": 436 }, { "epoch": 0.17438148443735035, "grad_norm": 0.2631327509880066, "learning_rate": 4.9991900724179754e-05, "loss": 0.447, "num_tokens": 57123554.0, "step": 437 }, { "epoch": 0.174780526735834, "grad_norm": 0.2561827003955841, "learning_rate": 4.9991633014107586e-05, "loss": 0.4543, "num_tokens": 57254626.0, "step": 438 }, { "epoch": 0.17517956903431764, "grad_norm": 0.2409096211194992, "learning_rate": 4.9991360952114094e-05, "loss": 0.4108, "num_tokens": 57385698.0, "step": 439 }, { "epoch": 0.17557861133280128, "grad_norm": 0.2924349308013916, "learning_rate": 4.999108453825192e-05, "loss": 0.4734, "num_tokens": 57516770.0, "step": 440 }, { "epoch": 0.17597765363128492, "grad_norm": 0.25724342465400696, "learning_rate": 4.999080377257455e-05, "loss": 0.45, "num_tokens": 57647842.0, "step": 441 }, { "epoch": 0.17637669592976857, "grad_norm": 0.254417359828949, "learning_rate": 4.99905186551363e-05, "loss": 0.456, "num_tokens": 57778914.0, "step": 442 }, { "epoch": 0.17677573822825218, "grad_norm": 0.2497948557138443, "learning_rate": 4.999022918599235e-05, "loss": 0.4192, "num_tokens": 57909986.0, "step": 443 }, { "epoch": 0.17717478052673583, "grad_norm": 0.26035815477371216, "learning_rate": 4.99899353651987e-05, "loss": 0.4546, "num_tokens": 58033075.0, "step": 444 }, { "epoch": 0.17757382282521947, "grad_norm": 0.2906597852706909, "learning_rate": 4.998963719281221e-05, "loss": 0.4926, "num_tokens": 58164147.0, "step": 445 }, { "epoch": 0.1779728651237031, "grad_norm": 0.280853271484375, "learning_rate": 4.998933466889056e-05, "loss": 0.4732, "num_tokens": 58295219.0, "step": 446 }, { "epoch": 0.17837190742218675, "grad_norm": 0.23415671288967133, "learning_rate": 4.998902779349231e-05, "loss": 0.3903, "num_tokens": 58426291.0, "step": 447 }, { "epoch": 0.1787709497206704, "grad_norm": 0.2667790651321411, "learning_rate": 4.998871656667681e-05, "loss": 0.4461, "num_tokens": 58557363.0, "step": 448 }, { "epoch": 0.17916999201915404, "grad_norm": 0.2932549715042114, "learning_rate": 4.998840098850429e-05, "loss": 0.5386, "num_tokens": 58688435.0, "step": 449 }, { "epoch": 0.17956903431763768, "grad_norm": 0.23040582239627838, "learning_rate": 4.9988081059035816e-05, "loss": 0.4188, "num_tokens": 58819507.0, "step": 450 }, { "epoch": 0.1799680766161213, "grad_norm": 0.2794885039329529, "learning_rate": 4.998775677833329e-05, "loss": 0.4659, "num_tokens": 58950579.0, "step": 451 }, { "epoch": 0.18036711891460494, "grad_norm": 0.25659504532814026, "learning_rate": 4.9987428146459456e-05, "loss": 0.4391, "num_tokens": 59081651.0, "step": 452 }, { "epoch": 0.18076616121308858, "grad_norm": 0.26623666286468506, "learning_rate": 4.9987095163477895e-05, "loss": 0.4667, "num_tokens": 59212723.0, "step": 453 }, { "epoch": 0.18116520351157223, "grad_norm": 0.288239985704422, "learning_rate": 4.998675782945305e-05, "loss": 0.465, "num_tokens": 59333175.0, "step": 454 }, { "epoch": 0.18156424581005587, "grad_norm": 0.26842084527015686, "learning_rate": 4.9986416144450183e-05, "loss": 0.4358, "num_tokens": 59464247.0, "step": 455 }, { "epoch": 0.1819632881085395, "grad_norm": 0.26442405581474304, "learning_rate": 4.9986070108535404e-05, "loss": 0.4335, "num_tokens": 59595319.0, "step": 456 }, { "epoch": 0.18236233040702315, "grad_norm": 0.26274845004081726, "learning_rate": 4.998571972177567e-05, "loss": 0.436, "num_tokens": 59726391.0, "step": 457 }, { "epoch": 0.1827613727055068, "grad_norm": 0.29849115014076233, "learning_rate": 4.998536498423878e-05, "loss": 0.5, "num_tokens": 59857463.0, "step": 458 }, { "epoch": 0.1831604150039904, "grad_norm": 0.2702873647212982, "learning_rate": 4.998500589599337e-05, "loss": 0.4448, "num_tokens": 59988535.0, "step": 459 }, { "epoch": 0.18355945730247406, "grad_norm": 0.2555505931377411, "learning_rate": 4.998464245710893e-05, "loss": 0.4416, "num_tokens": 60119607.0, "step": 460 }, { "epoch": 0.1839584996009577, "grad_norm": 0.2451276183128357, "learning_rate": 4.998427466765577e-05, "loss": 0.3941, "num_tokens": 60250679.0, "step": 461 }, { "epoch": 0.18435754189944134, "grad_norm": 0.2701883018016815, "learning_rate": 4.998390252770505e-05, "loss": 0.4856, "num_tokens": 60381751.0, "step": 462 }, { "epoch": 0.18475658419792498, "grad_norm": 0.23190012574195862, "learning_rate": 4.9983526037328794e-05, "loss": 0.3954, "num_tokens": 60512823.0, "step": 463 }, { "epoch": 0.18515562649640863, "grad_norm": 0.25768423080444336, "learning_rate": 4.998314519659983e-05, "loss": 0.44, "num_tokens": 60643895.0, "step": 464 }, { "epoch": 0.18555466879489227, "grad_norm": 0.2577480375766754, "learning_rate": 4.998276000559186e-05, "loss": 0.4378, "num_tokens": 60774967.0, "step": 465 }, { "epoch": 0.18595371109337588, "grad_norm": 0.2735345959663391, "learning_rate": 4.9982370464379405e-05, "loss": 0.4523, "num_tokens": 60906039.0, "step": 466 }, { "epoch": 0.18635275339185953, "grad_norm": 0.25512996315956116, "learning_rate": 4.998197657303785e-05, "loss": 0.4449, "num_tokens": 61037111.0, "step": 467 }, { "epoch": 0.18675179569034317, "grad_norm": 0.23123431205749512, "learning_rate": 4.99815783316434e-05, "loss": 0.4089, "num_tokens": 61168183.0, "step": 468 }, { "epoch": 0.1871508379888268, "grad_norm": 0.26184120774269104, "learning_rate": 4.998117574027312e-05, "loss": 0.3928, "num_tokens": 61299255.0, "step": 469 }, { "epoch": 0.18754988028731046, "grad_norm": 0.258802592754364, "learning_rate": 4.99807687990049e-05, "loss": 0.4316, "num_tokens": 61430327.0, "step": 470 }, { "epoch": 0.1879489225857941, "grad_norm": 0.26134973764419556, "learning_rate": 4.998035750791747e-05, "loss": 0.4028, "num_tokens": 61561399.0, "step": 471 }, { "epoch": 0.18834796488427774, "grad_norm": 0.23801590502262115, "learning_rate": 4.997994186709042e-05, "loss": 0.4197, "num_tokens": 61692471.0, "step": 472 }, { "epoch": 0.18874700718276138, "grad_norm": 0.2697362005710602, "learning_rate": 4.9979521876604184e-05, "loss": 0.492, "num_tokens": 61823543.0, "step": 473 }, { "epoch": 0.189146049481245, "grad_norm": 0.23105572164058685, "learning_rate": 4.997909753654001e-05, "loss": 0.3681, "num_tokens": 61954615.0, "step": 474 }, { "epoch": 0.18954509177972864, "grad_norm": 0.2856653928756714, "learning_rate": 4.997866884698002e-05, "loss": 0.4926, "num_tokens": 62085687.0, "step": 475 }, { "epoch": 0.18994413407821228, "grad_norm": 0.22928033769130707, "learning_rate": 4.997823580800715e-05, "loss": 0.4088, "num_tokens": 62216759.0, "step": 476 }, { "epoch": 0.19034317637669593, "grad_norm": 0.2629873752593994, "learning_rate": 4.997779841970518e-05, "loss": 0.4296, "num_tokens": 62347831.0, "step": 477 }, { "epoch": 0.19074221867517957, "grad_norm": 0.2952132523059845, "learning_rate": 4.9977356682158756e-05, "loss": 0.473, "num_tokens": 62478903.0, "step": 478 }, { "epoch": 0.1911412609736632, "grad_norm": 0.28333592414855957, "learning_rate": 4.997691059545335e-05, "loss": 0.4699, "num_tokens": 62609975.0, "step": 479 }, { "epoch": 0.19154030327214686, "grad_norm": 0.25120973587036133, "learning_rate": 4.997646015967526e-05, "loss": 0.4168, "num_tokens": 62741047.0, "step": 480 }, { "epoch": 0.1919393455706305, "grad_norm": 0.2596670389175415, "learning_rate": 4.997600537491166e-05, "loss": 0.4237, "num_tokens": 62872119.0, "step": 481 }, { "epoch": 0.1923383878691141, "grad_norm": 0.2593408524990082, "learning_rate": 4.9975546241250544e-05, "loss": 0.388, "num_tokens": 63001680.0, "step": 482 }, { "epoch": 0.19273743016759776, "grad_norm": 0.2444639503955841, "learning_rate": 4.997508275878073e-05, "loss": 0.406, "num_tokens": 63132752.0, "step": 483 }, { "epoch": 0.1931364724660814, "grad_norm": 0.3025197982788086, "learning_rate": 4.9974614927591926e-05, "loss": 0.507, "num_tokens": 63263824.0, "step": 484 }, { "epoch": 0.19353551476456504, "grad_norm": 0.2941150367259979, "learning_rate": 4.997414274777463e-05, "loss": 0.5075, "num_tokens": 63394896.0, "step": 485 }, { "epoch": 0.19393455706304868, "grad_norm": 0.24978499114513397, "learning_rate": 4.997366621942022e-05, "loss": 0.3955, "num_tokens": 63525968.0, "step": 486 }, { "epoch": 0.19433359936153233, "grad_norm": 0.21768110990524292, "learning_rate": 4.997318534262089e-05, "loss": 0.3717, "num_tokens": 63657040.0, "step": 487 }, { "epoch": 0.19473264166001597, "grad_norm": 0.44832953810691833, "learning_rate": 4.997270011746969e-05, "loss": 0.4742, "num_tokens": 63788112.0, "step": 488 }, { "epoch": 0.1951316839584996, "grad_norm": 0.28012949228286743, "learning_rate": 4.99722105440605e-05, "loss": 0.4573, "num_tokens": 63919184.0, "step": 489 }, { "epoch": 0.19553072625698323, "grad_norm": 0.22794799506664276, "learning_rate": 4.9971716622488055e-05, "loss": 0.3949, "num_tokens": 64050256.0, "step": 490 }, { "epoch": 0.19592976855546687, "grad_norm": 0.2351038157939911, "learning_rate": 4.997121835284793e-05, "loss": 0.4087, "num_tokens": 64181328.0, "step": 491 }, { "epoch": 0.19632881085395051, "grad_norm": 0.2684168815612793, "learning_rate": 4.997071573523652e-05, "loss": 0.4305, "num_tokens": 64298503.0, "step": 492 }, { "epoch": 0.19672785315243416, "grad_norm": 0.225712388753891, "learning_rate": 4.9970208769751084e-05, "loss": 0.3876, "num_tokens": 64429575.0, "step": 493 }, { "epoch": 0.1971268954509178, "grad_norm": 0.29777592420578003, "learning_rate": 4.996969745648972e-05, "loss": 0.4748, "num_tokens": 64560647.0, "step": 494 }, { "epoch": 0.19752593774940144, "grad_norm": 0.2690163552761078, "learning_rate": 4.996918179555136e-05, "loss": 0.4839, "num_tokens": 64691719.0, "step": 495 }, { "epoch": 0.19792498004788509, "grad_norm": 0.2740970551967621, "learning_rate": 4.9968661787035776e-05, "loss": 0.5108, "num_tokens": 64822791.0, "step": 496 }, { "epoch": 0.19832402234636873, "grad_norm": 0.24251413345336914, "learning_rate": 4.9968137431043576e-05, "loss": 0.4254, "num_tokens": 64953863.0, "step": 497 }, { "epoch": 0.19872306464485234, "grad_norm": 0.24215763807296753, "learning_rate": 4.996760872767624e-05, "loss": 0.4565, "num_tokens": 65084935.0, "step": 498 }, { "epoch": 0.19912210694333599, "grad_norm": 0.2724033296108246, "learning_rate": 4.9967075677036053e-05, "loss": 0.3721, "num_tokens": 65216007.0, "step": 499 }, { "epoch": 0.19952114924181963, "grad_norm": 0.2473989874124527, "learning_rate": 4.996653827922615e-05, "loss": 0.4573, "num_tokens": 65347079.0, "step": 500 }, { "epoch": 0.19992019154030327, "grad_norm": 0.23202256858348846, "learning_rate": 4.9965996534350536e-05, "loss": 0.3956, "num_tokens": 65478151.0, "step": 501 }, { "epoch": 0.20031923383878691, "grad_norm": 0.2541813254356384, "learning_rate": 4.996545044251401e-05, "loss": 0.4649, "num_tokens": 65609223.0, "step": 502 }, { "epoch": 0.20071827613727056, "grad_norm": 0.24876455962657928, "learning_rate": 4.9964900003822246e-05, "loss": 0.442, "num_tokens": 65740295.0, "step": 503 }, { "epoch": 0.2011173184357542, "grad_norm": 0.24685554206371307, "learning_rate": 4.996434521838174e-05, "loss": 0.4162, "num_tokens": 65871367.0, "step": 504 }, { "epoch": 0.20151636073423784, "grad_norm": 0.22878623008728027, "learning_rate": 4.996378608629985e-05, "loss": 0.4109, "num_tokens": 66002439.0, "step": 505 }, { "epoch": 0.20191540303272146, "grad_norm": 0.305128276348114, "learning_rate": 4.996322260768476e-05, "loss": 0.4972, "num_tokens": 66133511.0, "step": 506 }, { "epoch": 0.2023144453312051, "grad_norm": 0.27900081872940063, "learning_rate": 4.9962654782645476e-05, "loss": 0.4529, "num_tokens": 66264583.0, "step": 507 }, { "epoch": 0.20271348762968874, "grad_norm": 0.24850887060165405, "learning_rate": 4.9962082611291906e-05, "loss": 0.4091, "num_tokens": 66395655.0, "step": 508 }, { "epoch": 0.2031125299281724, "grad_norm": 0.2399938702583313, "learning_rate": 4.9961506093734735e-05, "loss": 0.3931, "num_tokens": 66526727.0, "step": 509 }, { "epoch": 0.20351157222665603, "grad_norm": 0.2568218410015106, "learning_rate": 4.9960925230085516e-05, "loss": 0.4344, "num_tokens": 66657799.0, "step": 510 }, { "epoch": 0.20391061452513967, "grad_norm": 0.25881174206733704, "learning_rate": 4.9960340020456645e-05, "loss": 0.4212, "num_tokens": 66788871.0, "step": 511 }, { "epoch": 0.20430965682362331, "grad_norm": 0.25023555755615234, "learning_rate": 4.995975046496135e-05, "loss": 0.4461, "num_tokens": 66919943.0, "step": 512 }, { "epoch": 0.20470869912210693, "grad_norm": 0.2602289915084839, "learning_rate": 4.99591565637137e-05, "loss": 0.4583, "num_tokens": 67051015.0, "step": 513 }, { "epoch": 0.20510774142059057, "grad_norm": 0.24715006351470947, "learning_rate": 4.995855831682863e-05, "loss": 0.42, "num_tokens": 67182087.0, "step": 514 }, { "epoch": 0.20550678371907422, "grad_norm": 0.24369561672210693, "learning_rate": 4.995795572442188e-05, "loss": 0.4482, "num_tokens": 67313159.0, "step": 515 }, { "epoch": 0.20590582601755786, "grad_norm": 0.2419108897447586, "learning_rate": 4.995734878661004e-05, "loss": 0.4478, "num_tokens": 67444231.0, "step": 516 }, { "epoch": 0.2063048683160415, "grad_norm": 0.2520694434642792, "learning_rate": 4.995673750351056e-05, "loss": 0.4607, "num_tokens": 67575303.0, "step": 517 }, { "epoch": 0.20670391061452514, "grad_norm": 0.2106887400150299, "learning_rate": 4.995612187524171e-05, "loss": 0.3674, "num_tokens": 67706375.0, "step": 518 }, { "epoch": 0.2071029529130088, "grad_norm": 0.23085249960422516, "learning_rate": 4.9955501901922616e-05, "loss": 0.4027, "num_tokens": 67837447.0, "step": 519 }, { "epoch": 0.20750199521149243, "grad_norm": 0.24637196958065033, "learning_rate": 4.995487758367322e-05, "loss": 0.4382, "num_tokens": 67968519.0, "step": 520 }, { "epoch": 0.20790103750997604, "grad_norm": 0.2649473249912262, "learning_rate": 4.995424892061434e-05, "loss": 0.4834, "num_tokens": 68099591.0, "step": 521 }, { "epoch": 0.2083000798084597, "grad_norm": 0.22532224655151367, "learning_rate": 4.995361591286761e-05, "loss": 0.389, "num_tokens": 68230663.0, "step": 522 }, { "epoch": 0.20869912210694333, "grad_norm": 0.34016185998916626, "learning_rate": 4.9952978560555516e-05, "loss": 0.4373, "num_tokens": 68361735.0, "step": 523 }, { "epoch": 0.20909816440542697, "grad_norm": 0.26485538482666016, "learning_rate": 4.995233686380137e-05, "loss": 0.4983, "num_tokens": 68492807.0, "step": 524 }, { "epoch": 0.20949720670391062, "grad_norm": 0.26189035177230835, "learning_rate": 4.995169082272933e-05, "loss": 0.4519, "num_tokens": 68623879.0, "step": 525 }, { "epoch": 0.20989624900239426, "grad_norm": 0.21425561606884003, "learning_rate": 4.995104043746442e-05, "loss": 0.3553, "num_tokens": 68754951.0, "step": 526 }, { "epoch": 0.2102952913008779, "grad_norm": 0.25376951694488525, "learning_rate": 4.995038570813247e-05, "loss": 0.4459, "num_tokens": 68886023.0, "step": 527 }, { "epoch": 0.21069433359936154, "grad_norm": 0.2531341314315796, "learning_rate": 4.9949726634860164e-05, "loss": 0.4191, "num_tokens": 69000916.0, "step": 528 }, { "epoch": 0.21109337589784516, "grad_norm": 0.2593301832675934, "learning_rate": 4.994906321777502e-05, "loss": 0.4769, "num_tokens": 69131988.0, "step": 529 }, { "epoch": 0.2114924181963288, "grad_norm": 0.274198055267334, "learning_rate": 4.994839545700542e-05, "loss": 0.4652, "num_tokens": 69263060.0, "step": 530 }, { "epoch": 0.21189146049481244, "grad_norm": 0.25352203845977783, "learning_rate": 4.994772335268056e-05, "loss": 0.4833, "num_tokens": 69394132.0, "step": 531 }, { "epoch": 0.2122905027932961, "grad_norm": 0.24415770173072815, "learning_rate": 4.994704690493048e-05, "loss": 0.4317, "num_tokens": 69525204.0, "step": 532 }, { "epoch": 0.21268954509177973, "grad_norm": 0.2471856325864792, "learning_rate": 4.994636611388608e-05, "loss": 0.3823, "num_tokens": 69656276.0, "step": 533 }, { "epoch": 0.21308858739026337, "grad_norm": 0.2568964958190918, "learning_rate": 4.9945680979679076e-05, "loss": 0.4371, "num_tokens": 69787348.0, "step": 534 }, { "epoch": 0.21348762968874702, "grad_norm": 0.23976805806159973, "learning_rate": 4.994499150244204e-05, "loss": 0.4167, "num_tokens": 69918420.0, "step": 535 }, { "epoch": 0.21388667198723066, "grad_norm": 0.3388320505619049, "learning_rate": 4.994429768230837e-05, "loss": 0.4378, "num_tokens": 70042516.0, "step": 536 }, { "epoch": 0.21428571428571427, "grad_norm": 0.2803105413913727, "learning_rate": 4.994359951941233e-05, "loss": 0.4868, "num_tokens": 70173588.0, "step": 537 }, { "epoch": 0.21468475658419792, "grad_norm": 0.22051875293254852, "learning_rate": 4.9942897013889e-05, "loss": 0.3756, "num_tokens": 70304660.0, "step": 538 }, { "epoch": 0.21508379888268156, "grad_norm": 0.24435210227966309, "learning_rate": 4.99421901658743e-05, "loss": 0.4558, "num_tokens": 70435732.0, "step": 539 }, { "epoch": 0.2154828411811652, "grad_norm": 0.2516486942768097, "learning_rate": 4.994147897550501e-05, "loss": 0.4325, "num_tokens": 70566804.0, "step": 540 }, { "epoch": 0.21588188347964885, "grad_norm": 0.2379952073097229, "learning_rate": 4.994076344291874e-05, "loss": 0.3907, "num_tokens": 70697876.0, "step": 541 }, { "epoch": 0.2162809257781325, "grad_norm": 0.2428758293390274, "learning_rate": 4.994004356825394e-05, "loss": 0.4615, "num_tokens": 70828948.0, "step": 542 }, { "epoch": 0.21667996807661613, "grad_norm": 0.24184100329875946, "learning_rate": 4.993931935164988e-05, "loss": 0.4346, "num_tokens": 70960020.0, "step": 543 }, { "epoch": 0.21707901037509977, "grad_norm": 0.23506422340869904, "learning_rate": 4.9938590793246697e-05, "loss": 0.4108, "num_tokens": 71091092.0, "step": 544 }, { "epoch": 0.2174780526735834, "grad_norm": 0.24807824194431305, "learning_rate": 4.993785789318538e-05, "loss": 0.4511, "num_tokens": 71222164.0, "step": 545 }, { "epoch": 0.21787709497206703, "grad_norm": 0.35716137290000916, "learning_rate": 4.9937120651607716e-05, "loss": 0.4487, "num_tokens": 71353236.0, "step": 546 }, { "epoch": 0.21827613727055067, "grad_norm": 0.2519386410713196, "learning_rate": 4.9936379068656365e-05, "loss": 0.4216, "num_tokens": 71484308.0, "step": 547 }, { "epoch": 0.21867517956903432, "grad_norm": 0.23951876163482666, "learning_rate": 4.9935633144474805e-05, "loss": 0.4011, "num_tokens": 71615380.0, "step": 548 }, { "epoch": 0.21907422186751796, "grad_norm": 0.27666041254997253, "learning_rate": 4.9934882879207385e-05, "loss": 0.4921, "num_tokens": 71746452.0, "step": 549 }, { "epoch": 0.2194732641660016, "grad_norm": 0.2705152928829193, "learning_rate": 4.9934128272999265e-05, "loss": 0.4581, "num_tokens": 71877524.0, "step": 550 }, { "epoch": 0.21987230646448525, "grad_norm": 0.24738825857639313, "learning_rate": 4.993336932599644e-05, "loss": 0.429, "num_tokens": 72008596.0, "step": 551 }, { "epoch": 0.2202713487629689, "grad_norm": 0.23472794890403748, "learning_rate": 4.993260603834578e-05, "loss": 0.39, "num_tokens": 72139668.0, "step": 552 }, { "epoch": 0.2206703910614525, "grad_norm": 0.24607022106647491, "learning_rate": 4.993183841019496e-05, "loss": 0.4639, "num_tokens": 72270740.0, "step": 553 }, { "epoch": 0.22106943335993615, "grad_norm": 0.2605130672454834, "learning_rate": 4.993106644169252e-05, "loss": 0.4817, "num_tokens": 72401812.0, "step": 554 }, { "epoch": 0.2214684756584198, "grad_norm": 0.26144495606422424, "learning_rate": 4.993029013298782e-05, "loss": 0.4414, "num_tokens": 72532884.0, "step": 555 }, { "epoch": 0.22186751795690343, "grad_norm": 0.2606964409351349, "learning_rate": 4.992950948423107e-05, "loss": 0.4089, "num_tokens": 72663956.0, "step": 556 }, { "epoch": 0.22226656025538707, "grad_norm": 0.25421565771102905, "learning_rate": 4.992872449557333e-05, "loss": 0.4847, "num_tokens": 72795028.0, "step": 557 }, { "epoch": 0.22266560255387072, "grad_norm": 0.24727213382720947, "learning_rate": 4.992793516716647e-05, "loss": 0.4163, "num_tokens": 72926100.0, "step": 558 }, { "epoch": 0.22306464485235436, "grad_norm": 0.27104952931404114, "learning_rate": 4.9927141499163235e-05, "loss": 0.4487, "num_tokens": 73041443.0, "step": 559 }, { "epoch": 0.22346368715083798, "grad_norm": 0.23431868851184845, "learning_rate": 4.992634349171716e-05, "loss": 0.4032, "num_tokens": 73172515.0, "step": 560 }, { "epoch": 0.22386272944932162, "grad_norm": 0.24305523931980133, "learning_rate": 4.99255411449827e-05, "loss": 0.4158, "num_tokens": 73303587.0, "step": 561 }, { "epoch": 0.22426177174780526, "grad_norm": 0.23998276889324188, "learning_rate": 4.9924734459115066e-05, "loss": 0.4178, "num_tokens": 73434659.0, "step": 562 }, { "epoch": 0.2246608140462889, "grad_norm": 0.24107813835144043, "learning_rate": 4.992392343427036e-05, "loss": 0.4398, "num_tokens": 73565731.0, "step": 563 }, { "epoch": 0.22505985634477255, "grad_norm": 0.23626796901226044, "learning_rate": 4.992310807060549e-05, "loss": 0.448, "num_tokens": 73696803.0, "step": 564 }, { "epoch": 0.2254588986432562, "grad_norm": 0.2603203058242798, "learning_rate": 4.9922288368278245e-05, "loss": 0.454, "num_tokens": 73827875.0, "step": 565 }, { "epoch": 0.22585794094173983, "grad_norm": 0.23479564487934113, "learning_rate": 4.992146432744722e-05, "loss": 0.4246, "num_tokens": 73958947.0, "step": 566 }, { "epoch": 0.22625698324022347, "grad_norm": 0.27729126811027527, "learning_rate": 4.992063594827185e-05, "loss": 0.4516, "num_tokens": 74085606.0, "step": 567 }, { "epoch": 0.2266560255387071, "grad_norm": 0.25926920771598816, "learning_rate": 4.9919803230912434e-05, "loss": 0.4189, "num_tokens": 74216678.0, "step": 568 }, { "epoch": 0.22705506783719073, "grad_norm": 0.25573134422302246, "learning_rate": 4.9918966175530077e-05, "loss": 0.4225, "num_tokens": 74347750.0, "step": 569 }, { "epoch": 0.22745411013567438, "grad_norm": 0.24370598793029785, "learning_rate": 4.991812478228676e-05, "loss": 0.4231, "num_tokens": 74478822.0, "step": 570 }, { "epoch": 0.22785315243415802, "grad_norm": 0.29395875334739685, "learning_rate": 4.991727905134528e-05, "loss": 0.4456, "num_tokens": 74609894.0, "step": 571 }, { "epoch": 0.22825219473264166, "grad_norm": 0.2645760178565979, "learning_rate": 4.991642898286927e-05, "loss": 0.443, "num_tokens": 74740966.0, "step": 572 }, { "epoch": 0.2286512370311253, "grad_norm": 0.2555461525917053, "learning_rate": 4.9915574577023214e-05, "loss": 0.4223, "num_tokens": 74872038.0, "step": 573 }, { "epoch": 0.22905027932960895, "grad_norm": 0.2840244174003601, "learning_rate": 4.991471583397244e-05, "loss": 0.4932, "num_tokens": 75003110.0, "step": 574 }, { "epoch": 0.2294493216280926, "grad_norm": 0.24279838800430298, "learning_rate": 4.9913852753883086e-05, "loss": 0.4324, "num_tokens": 75134182.0, "step": 575 }, { "epoch": 0.2298483639265762, "grad_norm": 0.25891178846359253, "learning_rate": 4.9912985336922175e-05, "loss": 0.4075, "num_tokens": 75265254.0, "step": 576 }, { "epoch": 0.23024740622505985, "grad_norm": 0.2694702446460724, "learning_rate": 4.9912113583257524e-05, "loss": 0.4157, "num_tokens": 75396326.0, "step": 577 }, { "epoch": 0.2306464485235435, "grad_norm": 0.3153887093067169, "learning_rate": 4.991123749305783e-05, "loss": 0.5697, "num_tokens": 75527398.0, "step": 578 }, { "epoch": 0.23104549082202713, "grad_norm": 0.2523496150970459, "learning_rate": 4.9910357066492586e-05, "loss": 0.4255, "num_tokens": 75658470.0, "step": 579 }, { "epoch": 0.23144453312051078, "grad_norm": 0.25979509949684143, "learning_rate": 4.9909472303732155e-05, "loss": 0.4092, "num_tokens": 75789542.0, "step": 580 }, { "epoch": 0.23184357541899442, "grad_norm": 0.2616458237171173, "learning_rate": 4.990858320494773e-05, "loss": 0.4508, "num_tokens": 75920614.0, "step": 581 }, { "epoch": 0.23224261771747806, "grad_norm": 0.27647072076797485, "learning_rate": 4.990768977031135e-05, "loss": 0.439, "num_tokens": 76051686.0, "step": 582 }, { "epoch": 0.2326416600159617, "grad_norm": 7.049631118774414, "learning_rate": 4.990679199999589e-05, "loss": 0.4052, "num_tokens": 76182758.0, "step": 583 }, { "epoch": 0.23304070231444532, "grad_norm": 0.271066814661026, "learning_rate": 4.990588989417504e-05, "loss": 0.4165, "num_tokens": 76313830.0, "step": 584 }, { "epoch": 0.23343974461292896, "grad_norm": 0.300197958946228, "learning_rate": 4.9904983453023365e-05, "loss": 0.4826, "num_tokens": 76444902.0, "step": 585 }, { "epoch": 0.2338387869114126, "grad_norm": 0.2539636194705963, "learning_rate": 4.9904072676716243e-05, "loss": 0.4497, "num_tokens": 76575974.0, "step": 586 }, { "epoch": 0.23423782920989625, "grad_norm": 0.24150718748569489, "learning_rate": 4.9903157565429914e-05, "loss": 0.4294, "num_tokens": 76707046.0, "step": 587 }, { "epoch": 0.2346368715083799, "grad_norm": 0.2787134051322937, "learning_rate": 4.9902238119341435e-05, "loss": 0.4283, "num_tokens": 76838118.0, "step": 588 }, { "epoch": 0.23503591380686353, "grad_norm": 0.7874109148979187, "learning_rate": 4.99013143386287e-05, "loss": 0.3667, "num_tokens": 76969190.0, "step": 589 }, { "epoch": 0.23543495610534718, "grad_norm": 0.2529389262199402, "learning_rate": 4.9900386223470476e-05, "loss": 0.4288, "num_tokens": 77100262.0, "step": 590 }, { "epoch": 0.23583399840383082, "grad_norm": 0.25409242510795593, "learning_rate": 4.989945377404633e-05, "loss": 0.4082, "num_tokens": 77231334.0, "step": 591 }, { "epoch": 0.23623304070231443, "grad_norm": 0.2672390639781952, "learning_rate": 4.989851699053667e-05, "loss": 0.4548, "num_tokens": 77362406.0, "step": 592 }, { "epoch": 0.23663208300079808, "grad_norm": 0.2557407319545746, "learning_rate": 4.989757587312279e-05, "loss": 0.4235, "num_tokens": 77493478.0, "step": 593 }, { "epoch": 0.23703112529928172, "grad_norm": 0.33171364665031433, "learning_rate": 4.989663042198675e-05, "loss": 0.3653, "num_tokens": 77624550.0, "step": 594 }, { "epoch": 0.23743016759776536, "grad_norm": 0.22790780663490295, "learning_rate": 4.98956806373115e-05, "loss": 0.3929, "num_tokens": 77755622.0, "step": 595 }, { "epoch": 0.237829209896249, "grad_norm": 0.24202878773212433, "learning_rate": 4.9894726519280826e-05, "loss": 0.4096, "num_tokens": 77886694.0, "step": 596 }, { "epoch": 0.23822825219473265, "grad_norm": 0.23607738316059113, "learning_rate": 4.989376806807933e-05, "loss": 0.3959, "num_tokens": 78017766.0, "step": 597 }, { "epoch": 0.2386272944932163, "grad_norm": 0.25637388229370117, "learning_rate": 4.989280528389246e-05, "loss": 0.4288, "num_tokens": 78148838.0, "step": 598 }, { "epoch": 0.23902633679169993, "grad_norm": 0.25497084856033325, "learning_rate": 4.989183816690652e-05, "loss": 0.4433, "num_tokens": 78279910.0, "step": 599 }, { "epoch": 0.23942537909018355, "grad_norm": 0.24506323039531708, "learning_rate": 4.9890866717308626e-05, "loss": 0.4126, "num_tokens": 78410982.0, "step": 600 }, { "epoch": 0.2398244213886672, "grad_norm": 0.24874094128608704, "learning_rate": 4.9889890935286744e-05, "loss": 0.4564, "num_tokens": 78542054.0, "step": 601 }, { "epoch": 0.24022346368715083, "grad_norm": 0.2456931173801422, "learning_rate": 4.9888910821029676e-05, "loss": 0.4113, "num_tokens": 78673126.0, "step": 602 }, { "epoch": 0.24062250598563448, "grad_norm": 0.23858867585659027, "learning_rate": 4.9887926374727076e-05, "loss": 0.4362, "num_tokens": 78804198.0, "step": 603 }, { "epoch": 0.24102154828411812, "grad_norm": 0.2491639405488968, "learning_rate": 4.9886937596569424e-05, "loss": 0.4429, "num_tokens": 78935270.0, "step": 604 }, { "epoch": 0.24142059058260176, "grad_norm": 0.2673707604408264, "learning_rate": 4.988594448674803e-05, "loss": 0.4295, "num_tokens": 79066342.0, "step": 605 }, { "epoch": 0.2418196328810854, "grad_norm": 0.25368624925613403, "learning_rate": 4.9884947045455064e-05, "loss": 0.4139, "num_tokens": 79197414.0, "step": 606 }, { "epoch": 0.24221867517956902, "grad_norm": 0.245109423995018, "learning_rate": 4.988394527288351e-05, "loss": 0.4405, "num_tokens": 79328486.0, "step": 607 }, { "epoch": 0.24261771747805266, "grad_norm": 0.23759326338768005, "learning_rate": 4.9882939169227205e-05, "loss": 0.4229, "num_tokens": 79459558.0, "step": 608 }, { "epoch": 0.2430167597765363, "grad_norm": 0.26277580857276917, "learning_rate": 4.988192873468082e-05, "loss": 0.4498, "num_tokens": 79590630.0, "step": 609 }, { "epoch": 0.24341580207501995, "grad_norm": 0.2689858078956604, "learning_rate": 4.988091396943987e-05, "loss": 0.4715, "num_tokens": 79721702.0, "step": 610 }, { "epoch": 0.2438148443735036, "grad_norm": 0.2631058692932129, "learning_rate": 4.98798948737007e-05, "loss": 0.4844, "num_tokens": 79852774.0, "step": 611 }, { "epoch": 0.24421388667198723, "grad_norm": 0.25213149189949036, "learning_rate": 4.98788714476605e-05, "loss": 0.4234, "num_tokens": 79983846.0, "step": 612 }, { "epoch": 0.24461292897047088, "grad_norm": 0.2433376908302307, "learning_rate": 4.9877843691517274e-05, "loss": 0.4166, "num_tokens": 80114918.0, "step": 613 }, { "epoch": 0.24501197126895452, "grad_norm": 0.23458191752433777, "learning_rate": 4.9876811605469905e-05, "loss": 0.4298, "num_tokens": 80245990.0, "step": 614 }, { "epoch": 0.24541101356743814, "grad_norm": 0.23565778136253357, "learning_rate": 4.987577518971808e-05, "loss": 0.4362, "num_tokens": 80377062.0, "step": 615 }, { "epoch": 0.24581005586592178, "grad_norm": 0.2606508433818817, "learning_rate": 4.987473444446234e-05, "loss": 0.4545, "num_tokens": 80508134.0, "step": 616 }, { "epoch": 0.24620909816440542, "grad_norm": 0.26645591855049133, "learning_rate": 4.987368936990407e-05, "loss": 0.4822, "num_tokens": 80639206.0, "step": 617 }, { "epoch": 0.24660814046288906, "grad_norm": 0.254816472530365, "learning_rate": 4.987263996624547e-05, "loss": 0.4616, "num_tokens": 80763252.0, "step": 618 }, { "epoch": 0.2470071827613727, "grad_norm": 0.25671201944351196, "learning_rate": 4.987158623368958e-05, "loss": 0.4227, "num_tokens": 80894324.0, "step": 619 }, { "epoch": 0.24740622505985635, "grad_norm": 0.2509811222553253, "learning_rate": 4.987052817244031e-05, "loss": 0.4295, "num_tokens": 81025396.0, "step": 620 }, { "epoch": 0.24780526735834, "grad_norm": 0.26034924387931824, "learning_rate": 4.986946578270237e-05, "loss": 0.4423, "num_tokens": 81156468.0, "step": 621 }, { "epoch": 0.24820430965682364, "grad_norm": 3.206785202026367, "learning_rate": 4.986839906468133e-05, "loss": 0.4199, "num_tokens": 81287540.0, "step": 622 }, { "epoch": 0.24860335195530725, "grad_norm": 0.2627192735671997, "learning_rate": 4.9867328018583584e-05, "loss": 0.4399, "num_tokens": 81418612.0, "step": 623 }, { "epoch": 0.2490023942537909, "grad_norm": 0.23992876708507538, "learning_rate": 4.9866252644616375e-05, "loss": 0.3841, "num_tokens": 81549684.0, "step": 624 }, { "epoch": 0.24940143655227454, "grad_norm": 0.2506602108478546, "learning_rate": 4.986517294298777e-05, "loss": 0.4147, "num_tokens": 81680756.0, "step": 625 }, { "epoch": 0.24980047885075818, "grad_norm": 0.2346167415380478, "learning_rate": 4.9864088913906676e-05, "loss": 0.4166, "num_tokens": 81811828.0, "step": 626 }, { "epoch": 0.2501995211492418, "grad_norm": 0.2520923614501953, "learning_rate": 4.986300055758287e-05, "loss": 0.4214, "num_tokens": 81942900.0, "step": 627 }, { "epoch": 0.25059856344772546, "grad_norm": 0.25172141194343567, "learning_rate": 4.9861907874226917e-05, "loss": 0.4051, "num_tokens": 82073972.0, "step": 628 }, { "epoch": 0.2509976057462091, "grad_norm": 0.27998337149620056, "learning_rate": 4.986081086405024e-05, "loss": 0.453, "num_tokens": 82205044.0, "step": 629 }, { "epoch": 0.25139664804469275, "grad_norm": 0.2301217019557953, "learning_rate": 4.9859709527265105e-05, "loss": 0.4143, "num_tokens": 82336116.0, "step": 630 }, { "epoch": 0.2517956903431764, "grad_norm": 0.27993473410606384, "learning_rate": 4.985860386408462e-05, "loss": 0.4812, "num_tokens": 82467188.0, "step": 631 }, { "epoch": 0.25219473264166004, "grad_norm": 0.26262423396110535, "learning_rate": 4.985749387472271e-05, "loss": 0.4135, "num_tokens": 82598260.0, "step": 632 }, { "epoch": 0.2525937749401437, "grad_norm": 0.26559361815452576, "learning_rate": 4.9856379559394147e-05, "loss": 0.413, "num_tokens": 82729332.0, "step": 633 }, { "epoch": 0.2529928172386273, "grad_norm": 0.23342731595039368, "learning_rate": 4.985526091831453e-05, "loss": 0.4171, "num_tokens": 82860404.0, "step": 634 }, { "epoch": 0.2533918595371109, "grad_norm": 0.22139155864715576, "learning_rate": 4.9854137951700336e-05, "loss": 0.39, "num_tokens": 82991476.0, "step": 635 }, { "epoch": 0.25379090183559455, "grad_norm": 0.28390833735466003, "learning_rate": 4.985301065976883e-05, "loss": 0.4644, "num_tokens": 83106309.0, "step": 636 }, { "epoch": 0.2541899441340782, "grad_norm": 0.2363303154706955, "learning_rate": 4.9851879042738124e-05, "loss": 0.4242, "num_tokens": 83237381.0, "step": 637 }, { "epoch": 0.25458898643256184, "grad_norm": 0.25368475914001465, "learning_rate": 4.985074310082719e-05, "loss": 0.4098, "num_tokens": 83368453.0, "step": 638 }, { "epoch": 0.2549880287310455, "grad_norm": 0.24841450154781342, "learning_rate": 4.984960283425582e-05, "loss": 0.4633, "num_tokens": 83499525.0, "step": 639 }, { "epoch": 0.2553870710295291, "grad_norm": 0.23667560517787933, "learning_rate": 4.9848458243244634e-05, "loss": 0.4277, "num_tokens": 83630597.0, "step": 640 }, { "epoch": 0.25578611332801277, "grad_norm": 0.235762819647789, "learning_rate": 4.984730932801511e-05, "loss": 0.4077, "num_tokens": 83761669.0, "step": 641 }, { "epoch": 0.2561851556264964, "grad_norm": 0.24718095362186432, "learning_rate": 4.9846156088789555e-05, "loss": 0.4367, "num_tokens": 83892741.0, "step": 642 }, { "epoch": 0.25658419792498005, "grad_norm": 0.23002247512340546, "learning_rate": 4.9844998525791103e-05, "loss": 0.4033, "num_tokens": 84023813.0, "step": 643 }, { "epoch": 0.2569832402234637, "grad_norm": 0.2458215355873108, "learning_rate": 4.984383663924373e-05, "loss": 0.4514, "num_tokens": 84154885.0, "step": 644 }, { "epoch": 0.25738228252194734, "grad_norm": 0.2390502691268921, "learning_rate": 4.984267042937226e-05, "loss": 0.4141, "num_tokens": 84285957.0, "step": 645 }, { "epoch": 0.257781324820431, "grad_norm": 0.24778248369693756, "learning_rate": 4.984149989640233e-05, "loss": 0.429, "num_tokens": 84417029.0, "step": 646 }, { "epoch": 0.2581803671189146, "grad_norm": 0.733531653881073, "learning_rate": 4.984032504056044e-05, "loss": 0.5047, "num_tokens": 84542077.0, "step": 647 }, { "epoch": 0.25857940941739826, "grad_norm": 0.22281363606452942, "learning_rate": 4.9839145862073906e-05, "loss": 0.3729, "num_tokens": 84673149.0, "step": 648 }, { "epoch": 0.2589784517158819, "grad_norm": 0.24588650465011597, "learning_rate": 4.983796236117089e-05, "loss": 0.43, "num_tokens": 84804221.0, "step": 649 }, { "epoch": 0.2593774940143655, "grad_norm": 0.23386865854263306, "learning_rate": 4.983677453808039e-05, "loss": 0.3892, "num_tokens": 84935293.0, "step": 650 }, { "epoch": 0.25977653631284914, "grad_norm": 0.25513142347335815, "learning_rate": 4.9835582393032244e-05, "loss": 0.4367, "num_tokens": 85066365.0, "step": 651 }, { "epoch": 0.2601755786113328, "grad_norm": 0.240444153547287, "learning_rate": 4.98343859262571e-05, "loss": 0.4293, "num_tokens": 85197437.0, "step": 652 }, { "epoch": 0.2605746209098164, "grad_norm": 0.23105552792549133, "learning_rate": 4.9833185137986487e-05, "loss": 0.352, "num_tokens": 85328509.0, "step": 653 }, { "epoch": 0.26097366320830007, "grad_norm": 0.2782333493232727, "learning_rate": 4.983198002845274e-05, "loss": 0.4737, "num_tokens": 85459581.0, "step": 654 }, { "epoch": 0.2613727055067837, "grad_norm": 0.24804966151714325, "learning_rate": 4.983077059788903e-05, "loss": 0.44, "num_tokens": 85590653.0, "step": 655 }, { "epoch": 0.26177174780526735, "grad_norm": 0.24170050024986267, "learning_rate": 4.982955684652938e-05, "loss": 0.442, "num_tokens": 85721725.0, "step": 656 }, { "epoch": 0.262170790103751, "grad_norm": 0.2462780773639679, "learning_rate": 4.9828338774608624e-05, "loss": 0.419, "num_tokens": 85852797.0, "step": 657 }, { "epoch": 0.26256983240223464, "grad_norm": 0.26182126998901367, "learning_rate": 4.982711638236246e-05, "loss": 0.4581, "num_tokens": 85983869.0, "step": 658 }, { "epoch": 0.2629688747007183, "grad_norm": 0.2625906765460968, "learning_rate": 4.982588967002741e-05, "loss": 0.4693, "num_tokens": 86114941.0, "step": 659 }, { "epoch": 0.2633679169992019, "grad_norm": 0.2872505784034729, "learning_rate": 4.982465863784082e-05, "loss": 0.4965, "num_tokens": 86246013.0, "step": 660 }, { "epoch": 0.26376695929768557, "grad_norm": 0.24668443202972412, "learning_rate": 4.98234232860409e-05, "loss": 0.4618, "num_tokens": 86377085.0, "step": 661 }, { "epoch": 0.2641660015961692, "grad_norm": 0.25990378856658936, "learning_rate": 4.982218361486667e-05, "loss": 0.4666, "num_tokens": 86508157.0, "step": 662 }, { "epoch": 0.26456504389465285, "grad_norm": 0.2262910157442093, "learning_rate": 4.982093962455799e-05, "loss": 0.3908, "num_tokens": 86639229.0, "step": 663 }, { "epoch": 0.2649640861931365, "grad_norm": 0.2733113169670105, "learning_rate": 4.981969131535558e-05, "loss": 0.4699, "num_tokens": 86770301.0, "step": 664 }, { "epoch": 0.26536312849162014, "grad_norm": 0.24766094982624054, "learning_rate": 4.981843868750095e-05, "loss": 0.4589, "num_tokens": 86901373.0, "step": 665 }, { "epoch": 0.2657621707901037, "grad_norm": 0.22898848354816437, "learning_rate": 4.981718174123648e-05, "loss": 0.392, "num_tokens": 87032445.0, "step": 666 }, { "epoch": 0.26616121308858737, "grad_norm": 0.2550484240055084, "learning_rate": 4.981592047680539e-05, "loss": 0.4023, "num_tokens": 87163517.0, "step": 667 }, { "epoch": 0.266560255387071, "grad_norm": 4.506929874420166, "learning_rate": 4.981465489445171e-05, "loss": 0.3578, "num_tokens": 87294589.0, "step": 668 }, { "epoch": 0.26695929768555465, "grad_norm": 0.20993667840957642, "learning_rate": 4.981338499442032e-05, "loss": 0.3198, "num_tokens": 87425661.0, "step": 669 }, { "epoch": 0.2673583399840383, "grad_norm": 0.25803640484809875, "learning_rate": 4.981211077695694e-05, "loss": 0.4229, "num_tokens": 87556733.0, "step": 670 }, { "epoch": 0.26775738228252194, "grad_norm": 0.2332911491394043, "learning_rate": 4.9810832242308115e-05, "loss": 0.3871, "num_tokens": 87687805.0, "step": 671 }, { "epoch": 0.2681564245810056, "grad_norm": 0.30330580472946167, "learning_rate": 4.980954939072123e-05, "loss": 0.4566, "num_tokens": 87818877.0, "step": 672 }, { "epoch": 0.2685554668794892, "grad_norm": 0.2389308363199234, "learning_rate": 4.9808262222444504e-05, "loss": 0.42, "num_tokens": 87949949.0, "step": 673 }, { "epoch": 0.26895450917797287, "grad_norm": 0.24217922985553741, "learning_rate": 4.9806970737726995e-05, "loss": 0.4163, "num_tokens": 88081021.0, "step": 674 }, { "epoch": 0.2693535514764565, "grad_norm": 0.2553924322128296, "learning_rate": 4.980567493681859e-05, "loss": 0.4698, "num_tokens": 88212093.0, "step": 675 }, { "epoch": 0.26975259377494015, "grad_norm": 0.25024542212486267, "learning_rate": 4.980437481997001e-05, "loss": 0.4765, "num_tokens": 88343165.0, "step": 676 }, { "epoch": 0.2701516360734238, "grad_norm": 0.23243595659732819, "learning_rate": 4.9803070387432825e-05, "loss": 0.4112, "num_tokens": 88474237.0, "step": 677 }, { "epoch": 0.27055067837190744, "grad_norm": 0.2511288821697235, "learning_rate": 4.980176163945943e-05, "loss": 0.459, "num_tokens": 88605309.0, "step": 678 }, { "epoch": 0.2709497206703911, "grad_norm": 0.23971295356750488, "learning_rate": 4.980044857630305e-05, "loss": 0.3995, "num_tokens": 88736381.0, "step": 679 }, { "epoch": 0.2713487629688747, "grad_norm": 0.20899567008018494, "learning_rate": 4.979913119821775e-05, "loss": 0.3731, "num_tokens": 88867453.0, "step": 680 }, { "epoch": 0.27174780526735837, "grad_norm": 0.23797500133514404, "learning_rate": 4.979780950545844e-05, "loss": 0.4142, "num_tokens": 88998525.0, "step": 681 }, { "epoch": 0.27214684756584195, "grad_norm": 0.2611469030380249, "learning_rate": 4.979648349828085e-05, "loss": 0.4536, "num_tokens": 89129597.0, "step": 682 }, { "epoch": 0.2725458898643256, "grad_norm": 0.26083892583847046, "learning_rate": 4.979515317694154e-05, "loss": 0.449, "num_tokens": 89260669.0, "step": 683 }, { "epoch": 0.27294493216280924, "grad_norm": 0.2164057046175003, "learning_rate": 4.9793818541697926e-05, "loss": 0.3945, "num_tokens": 89391741.0, "step": 684 }, { "epoch": 0.2733439744612929, "grad_norm": 0.2542794644832611, "learning_rate": 4.9792479592808237e-05, "loss": 0.4325, "num_tokens": 89522813.0, "step": 685 }, { "epoch": 0.2737430167597765, "grad_norm": 0.2705951929092407, "learning_rate": 4.9791136330531565e-05, "loss": 0.4419, "num_tokens": 89653885.0, "step": 686 }, { "epoch": 0.27414205905826017, "grad_norm": 0.22378921508789062, "learning_rate": 4.97897887551278e-05, "loss": 0.4061, "num_tokens": 89784957.0, "step": 687 }, { "epoch": 0.2745411013567438, "grad_norm": 0.3113923966884613, "learning_rate": 4.9788436866857704e-05, "loss": 0.3787, "num_tokens": 89916029.0, "step": 688 }, { "epoch": 0.27494014365522745, "grad_norm": 0.25009721517562866, "learning_rate": 4.978708066598284e-05, "loss": 0.4673, "num_tokens": 90047101.0, "step": 689 }, { "epoch": 0.2753391859537111, "grad_norm": 0.23031248152256012, "learning_rate": 4.9785720152765626e-05, "loss": 0.4138, "num_tokens": 90178173.0, "step": 690 }, { "epoch": 0.27573822825219474, "grad_norm": 0.23529981076717377, "learning_rate": 4.97843553274693e-05, "loss": 0.4281, "num_tokens": 90309245.0, "step": 691 }, { "epoch": 0.2761372705506784, "grad_norm": 0.23995593190193176, "learning_rate": 4.978298619035795e-05, "loss": 0.4226, "num_tokens": 90440317.0, "step": 692 }, { "epoch": 0.276536312849162, "grad_norm": 0.25109678506851196, "learning_rate": 4.9781612741696496e-05, "loss": 0.4683, "num_tokens": 90571389.0, "step": 693 }, { "epoch": 0.27693535514764567, "grad_norm": 0.23616504669189453, "learning_rate": 4.978023498175069e-05, "loss": 0.3868, "num_tokens": 90702461.0, "step": 694 }, { "epoch": 0.2773343974461293, "grad_norm": 0.32809001207351685, "learning_rate": 4.97788529107871e-05, "loss": 0.488, "num_tokens": 90833533.0, "step": 695 }, { "epoch": 0.27773343974461295, "grad_norm": 0.2647342085838318, "learning_rate": 4.977746652907315e-05, "loss": 0.4537, "num_tokens": 90964605.0, "step": 696 }, { "epoch": 0.27813248204309654, "grad_norm": 0.25335341691970825, "learning_rate": 4.97760758368771e-05, "loss": 0.4272, "num_tokens": 91095677.0, "step": 697 }, { "epoch": 0.2785315243415802, "grad_norm": 6.242318153381348, "learning_rate": 4.9774680834468035e-05, "loss": 0.7206, "num_tokens": 91226749.0, "step": 698 }, { "epoch": 0.2789305666400638, "grad_norm": 0.27399635314941406, "learning_rate": 4.977328152211586e-05, "loss": 0.4733, "num_tokens": 91357821.0, "step": 699 }, { "epoch": 0.27932960893854747, "grad_norm": 0.26999548077583313, "learning_rate": 4.977187790009135e-05, "loss": 0.4539, "num_tokens": 91488893.0, "step": 700 }, { "epoch": 0.2797286512370311, "grad_norm": 0.21414203941822052, "learning_rate": 4.977046996866608e-05, "loss": 0.3515, "num_tokens": 91619965.0, "step": 701 }, { "epoch": 0.28012769353551475, "grad_norm": 0.2714909017086029, "learning_rate": 4.976905772811247e-05, "loss": 0.457, "num_tokens": 91751037.0, "step": 702 }, { "epoch": 0.2805267358339984, "grad_norm": 0.23164594173431396, "learning_rate": 4.976764117870378e-05, "loss": 0.3976, "num_tokens": 91882109.0, "step": 703 }, { "epoch": 0.28092577813248204, "grad_norm": 0.2575211226940155, "learning_rate": 4.976622032071409e-05, "loss": 0.4158, "num_tokens": 92013181.0, "step": 704 }, { "epoch": 0.2813248204309657, "grad_norm": 0.25666138529777527, "learning_rate": 4.976479515441834e-05, "loss": 0.4311, "num_tokens": 92135435.0, "step": 705 }, { "epoch": 0.2817238627294493, "grad_norm": 0.2535516619682312, "learning_rate": 4.976336568009228e-05, "loss": 0.4353, "num_tokens": 92266507.0, "step": 706 }, { "epoch": 0.28212290502793297, "grad_norm": 0.26945844292640686, "learning_rate": 4.976193189801249e-05, "loss": 0.4739, "num_tokens": 92397579.0, "step": 707 }, { "epoch": 0.2825219473264166, "grad_norm": 0.26970863342285156, "learning_rate": 4.9760493808456404e-05, "loss": 0.4644, "num_tokens": 92528651.0, "step": 708 }, { "epoch": 0.28292098962490025, "grad_norm": 0.23881573975086212, "learning_rate": 4.975905141170228e-05, "loss": 0.3954, "num_tokens": 92659723.0, "step": 709 }, { "epoch": 0.2833200319233839, "grad_norm": 0.23768256604671478, "learning_rate": 4.97576047080292e-05, "loss": 0.4174, "num_tokens": 92790795.0, "step": 710 }, { "epoch": 0.28371907422186754, "grad_norm": 0.252530038356781, "learning_rate": 4.975615369771709e-05, "loss": 0.4452, "num_tokens": 92921867.0, "step": 711 }, { "epoch": 0.2841181165203512, "grad_norm": 0.2342309057712555, "learning_rate": 4.975469838104671e-05, "loss": 0.4177, "num_tokens": 93052939.0, "step": 712 }, { "epoch": 0.28451715881883477, "grad_norm": 0.21564508974552155, "learning_rate": 4.975323875829965e-05, "loss": 0.3675, "num_tokens": 93184011.0, "step": 713 }, { "epoch": 0.2849162011173184, "grad_norm": 0.255493700504303, "learning_rate": 4.9751774829758336e-05, "loss": 0.4707, "num_tokens": 93315083.0, "step": 714 }, { "epoch": 0.28531524341580206, "grad_norm": 0.25850212574005127, "learning_rate": 4.9750306595706014e-05, "loss": 0.4672, "num_tokens": 93442143.0, "step": 715 }, { "epoch": 0.2857142857142857, "grad_norm": 0.22695669531822205, "learning_rate": 4.974883405642679e-05, "loss": 0.4036, "num_tokens": 93573215.0, "step": 716 }, { "epoch": 0.28611332801276934, "grad_norm": 0.2575581967830658, "learning_rate": 4.9747357212205565e-05, "loss": 0.4316, "num_tokens": 93704287.0, "step": 717 }, { "epoch": 0.286512370311253, "grad_norm": 0.23539146780967712, "learning_rate": 4.974587606332811e-05, "loss": 0.404, "num_tokens": 93835359.0, "step": 718 }, { "epoch": 0.2869114126097366, "grad_norm": 0.23997950553894043, "learning_rate": 4.974439061008101e-05, "loss": 0.393, "num_tokens": 93966431.0, "step": 719 }, { "epoch": 0.28731045490822027, "grad_norm": 0.25808557868003845, "learning_rate": 4.9742900852751695e-05, "loss": 0.4529, "num_tokens": 94097503.0, "step": 720 }, { "epoch": 0.2877094972067039, "grad_norm": 0.2592238485813141, "learning_rate": 4.97414067916284e-05, "loss": 0.4182, "num_tokens": 94228575.0, "step": 721 }, { "epoch": 0.28810853950518756, "grad_norm": 0.21040964126586914, "learning_rate": 4.973990842700023e-05, "loss": 0.3764, "num_tokens": 94359647.0, "step": 722 }, { "epoch": 0.2885075818036712, "grad_norm": 0.22673258185386658, "learning_rate": 4.973840575915709e-05, "loss": 0.3872, "num_tokens": 94490719.0, "step": 723 }, { "epoch": 0.28890662410215484, "grad_norm": 0.23921562731266022, "learning_rate": 4.973689878838975e-05, "loss": 0.3986, "num_tokens": 94621791.0, "step": 724 }, { "epoch": 0.2893056664006385, "grad_norm": 0.24567312002182007, "learning_rate": 4.973538751498977e-05, "loss": 0.4237, "num_tokens": 94752863.0, "step": 725 }, { "epoch": 0.2897047086991221, "grad_norm": 0.23558613657951355, "learning_rate": 4.9733871939249596e-05, "loss": 0.4164, "num_tokens": 94883935.0, "step": 726 }, { "epoch": 0.29010375099760577, "grad_norm": 0.22688953578472137, "learning_rate": 4.973235206146245e-05, "loss": 0.3822, "num_tokens": 95015007.0, "step": 727 }, { "epoch": 0.2905027932960894, "grad_norm": 0.23782722651958466, "learning_rate": 4.973082788192244e-05, "loss": 0.3934, "num_tokens": 95146079.0, "step": 728 }, { "epoch": 0.290901835594573, "grad_norm": 0.27870145440101624, "learning_rate": 4.9729299400924454e-05, "loss": 0.4519, "num_tokens": 95277151.0, "step": 729 }, { "epoch": 0.29130087789305664, "grad_norm": 0.24753695726394653, "learning_rate": 4.9727766618764265e-05, "loss": 0.4552, "num_tokens": 95408223.0, "step": 730 }, { "epoch": 0.2916999201915403, "grad_norm": 0.6590440273284912, "learning_rate": 4.972622953573844e-05, "loss": 0.4473, "num_tokens": 95539295.0, "step": 731 }, { "epoch": 0.29209896249002393, "grad_norm": 0.2743186354637146, "learning_rate": 4.9724688152144386e-05, "loss": 0.4544, "num_tokens": 95670367.0, "step": 732 }, { "epoch": 0.29249800478850757, "grad_norm": 0.23368071019649506, "learning_rate": 4.972314246828035e-05, "loss": 0.3774, "num_tokens": 95801439.0, "step": 733 }, { "epoch": 0.2928970470869912, "grad_norm": 0.2704026401042938, "learning_rate": 4.97215924844454e-05, "loss": 0.4645, "num_tokens": 95932511.0, "step": 734 }, { "epoch": 0.29329608938547486, "grad_norm": 0.2462460845708847, "learning_rate": 4.972003820093946e-05, "loss": 0.4025, "num_tokens": 96063583.0, "step": 735 }, { "epoch": 0.2936951316839585, "grad_norm": 0.2504191994667053, "learning_rate": 4.971847961806326e-05, "loss": 0.4408, "num_tokens": 96194655.0, "step": 736 }, { "epoch": 0.29409417398244214, "grad_norm": 0.2652498483657837, "learning_rate": 4.971691673611838e-05, "loss": 0.4765, "num_tokens": 96325727.0, "step": 737 }, { "epoch": 0.2944932162809258, "grad_norm": 0.22850385308265686, "learning_rate": 4.9715349555407204e-05, "loss": 0.4027, "num_tokens": 96456799.0, "step": 738 }, { "epoch": 0.2948922585794094, "grad_norm": 0.2325052171945572, "learning_rate": 4.9713778076232984e-05, "loss": 0.4188, "num_tokens": 96587871.0, "step": 739 }, { "epoch": 0.29529130087789307, "grad_norm": 0.25101912021636963, "learning_rate": 4.9712202298899774e-05, "loss": 0.4305, "num_tokens": 96718943.0, "step": 740 }, { "epoch": 0.2956903431763767, "grad_norm": 0.25313031673431396, "learning_rate": 4.971062222371248e-05, "loss": 0.4231, "num_tokens": 96850015.0, "step": 741 }, { "epoch": 0.29608938547486036, "grad_norm": 0.26533496379852295, "learning_rate": 4.970903785097683e-05, "loss": 0.4927, "num_tokens": 96981087.0, "step": 742 }, { "epoch": 0.296488427773344, "grad_norm": 0.23974275588989258, "learning_rate": 4.970744918099939e-05, "loss": 0.4074, "num_tokens": 97112159.0, "step": 743 }, { "epoch": 0.2968874700718276, "grad_norm": 0.24310094118118286, "learning_rate": 4.970585621408755e-05, "loss": 0.4384, "num_tokens": 97243231.0, "step": 744 }, { "epoch": 0.29728651237031123, "grad_norm": 0.2308449149131775, "learning_rate": 4.970425895054953e-05, "loss": 0.3601, "num_tokens": 97374303.0, "step": 745 }, { "epoch": 0.29768555466879487, "grad_norm": 0.28165489435195923, "learning_rate": 4.970265739069439e-05, "loss": 0.4805, "num_tokens": 97505375.0, "step": 746 }, { "epoch": 0.2980845969672785, "grad_norm": 0.23442092537879944, "learning_rate": 4.9701051534832016e-05, "loss": 0.4108, "num_tokens": 97636447.0, "step": 747 }, { "epoch": 0.29848363926576216, "grad_norm": 0.23787830770015717, "learning_rate": 4.969944138327312e-05, "loss": 0.4137, "num_tokens": 97767519.0, "step": 748 }, { "epoch": 0.2988826815642458, "grad_norm": 0.22401118278503418, "learning_rate": 4.9697826936329256e-05, "loss": 0.4317, "num_tokens": 97898591.0, "step": 749 }, { "epoch": 0.29928172386272944, "grad_norm": 0.22760531306266785, "learning_rate": 4.969620819431281e-05, "loss": 0.3874, "num_tokens": 98029663.0, "step": 750 }, { "epoch": 0.2996807661612131, "grad_norm": 0.2403881549835205, "learning_rate": 4.969458515753699e-05, "loss": 0.4308, "num_tokens": 98160735.0, "step": 751 }, { "epoch": 0.30007980845969673, "grad_norm": 0.2479414939880371, "learning_rate": 4.969295782631583e-05, "loss": 0.4424, "num_tokens": 98291807.0, "step": 752 }, { "epoch": 0.30047885075818037, "grad_norm": 0.23063448071479797, "learning_rate": 4.9691326200964226e-05, "loss": 0.4182, "num_tokens": 98422879.0, "step": 753 }, { "epoch": 0.300877893056664, "grad_norm": 0.25147464871406555, "learning_rate": 4.968969028179786e-05, "loss": 0.4736, "num_tokens": 98553951.0, "step": 754 }, { "epoch": 0.30127693535514766, "grad_norm": 0.2501717507839203, "learning_rate": 4.968805006913327e-05, "loss": 0.4235, "num_tokens": 98685023.0, "step": 755 }, { "epoch": 0.3016759776536313, "grad_norm": 0.239242285490036, "learning_rate": 4.9686405563287826e-05, "loss": 0.4213, "num_tokens": 98816095.0, "step": 756 }, { "epoch": 0.30207501995211494, "grad_norm": 0.2785801589488983, "learning_rate": 4.968475676457972e-05, "loss": 0.4062, "num_tokens": 98947167.0, "step": 757 }, { "epoch": 0.3024740622505986, "grad_norm": 0.22786572575569153, "learning_rate": 4.9683103673328e-05, "loss": 0.3871, "num_tokens": 99078239.0, "step": 758 }, { "epoch": 0.30287310454908223, "grad_norm": 0.21995952725410461, "learning_rate": 4.9681446289852493e-05, "loss": 0.3726, "num_tokens": 99209311.0, "step": 759 }, { "epoch": 0.3032721468475658, "grad_norm": 0.2595064342021942, "learning_rate": 4.967978461447391e-05, "loss": 0.4458, "num_tokens": 99340383.0, "step": 760 }, { "epoch": 0.30367118914604946, "grad_norm": 0.2687545120716095, "learning_rate": 4.967811864751376e-05, "loss": 0.4544, "num_tokens": 99471455.0, "step": 761 }, { "epoch": 0.3040702314445331, "grad_norm": 0.25186339020729065, "learning_rate": 4.9676448389294386e-05, "loss": 0.4544, "num_tokens": 99602527.0, "step": 762 }, { "epoch": 0.30446927374301674, "grad_norm": 0.23419298231601715, "learning_rate": 4.967477384013898e-05, "loss": 0.372, "num_tokens": 99733599.0, "step": 763 }, { "epoch": 0.3048683160415004, "grad_norm": 0.23199345171451569, "learning_rate": 4.967309500037155e-05, "loss": 0.4079, "num_tokens": 99864671.0, "step": 764 }, { "epoch": 0.30526735833998403, "grad_norm": 0.2562766969203949, "learning_rate": 4.967141187031692e-05, "loss": 0.4231, "num_tokens": 99995743.0, "step": 765 }, { "epoch": 0.3056664006384677, "grad_norm": 0.2477886974811554, "learning_rate": 4.966972445030078e-05, "loss": 0.3916, "num_tokens": 100126815.0, "step": 766 }, { "epoch": 0.3060654429369513, "grad_norm": 0.23134931921958923, "learning_rate": 4.966803274064963e-05, "loss": 0.3822, "num_tokens": 100257887.0, "step": 767 }, { "epoch": 0.30646448523543496, "grad_norm": 0.2584376037120819, "learning_rate": 4.966633674169078e-05, "loss": 0.4552, "num_tokens": 100388959.0, "step": 768 }, { "epoch": 0.3068635275339186, "grad_norm": 0.274125337600708, "learning_rate": 4.966463645375241e-05, "loss": 0.413, "num_tokens": 100520031.0, "step": 769 }, { "epoch": 0.30726256983240224, "grad_norm": 0.4638015031814575, "learning_rate": 4.96629318771635e-05, "loss": 0.4442, "num_tokens": 100651103.0, "step": 770 }, { "epoch": 0.3076616121308859, "grad_norm": 0.2568604350090027, "learning_rate": 4.966122301225387e-05, "loss": 0.4853, "num_tokens": 100782175.0, "step": 771 }, { "epoch": 0.30806065442936953, "grad_norm": 0.9422112703323364, "learning_rate": 4.965950985935418e-05, "loss": 0.492, "num_tokens": 100913247.0, "step": 772 }, { "epoch": 0.30845969672785317, "grad_norm": 0.28515997529029846, "learning_rate": 4.9657792418795894e-05, "loss": 0.4843, "num_tokens": 101044319.0, "step": 773 }, { "epoch": 0.3088587390263368, "grad_norm": 0.256024032831192, "learning_rate": 4.965607069091133e-05, "loss": 0.43, "num_tokens": 101175391.0, "step": 774 }, { "epoch": 0.30925778132482046, "grad_norm": 0.25945326685905457, "learning_rate": 4.965434467603362e-05, "loss": 0.4564, "num_tokens": 101306463.0, "step": 775 }, { "epoch": 0.30965682362330404, "grad_norm": 0.26109421253204346, "learning_rate": 4.965261437449674e-05, "loss": 0.4146, "num_tokens": 101437535.0, "step": 776 }, { "epoch": 0.3100558659217877, "grad_norm": 0.23874928057193756, "learning_rate": 4.965087978663548e-05, "loss": 0.3876, "num_tokens": 101568607.0, "step": 777 }, { "epoch": 0.31045490822027133, "grad_norm": 0.2260579913854599, "learning_rate": 4.964914091278546e-05, "loss": 0.3929, "num_tokens": 101699679.0, "step": 778 }, { "epoch": 0.310853950518755, "grad_norm": 0.2431190311908722, "learning_rate": 4.964739775328316e-05, "loss": 0.4143, "num_tokens": 101830751.0, "step": 779 }, { "epoch": 0.3112529928172386, "grad_norm": 0.26888906955718994, "learning_rate": 4.964565030846584e-05, "loss": 0.3735, "num_tokens": 101961823.0, "step": 780 }, { "epoch": 0.31165203511572226, "grad_norm": 0.25507551431655884, "learning_rate": 4.9643898578671634e-05, "loss": 0.3953, "num_tokens": 102092895.0, "step": 781 }, { "epoch": 0.3120510774142059, "grad_norm": 0.2287735491991043, "learning_rate": 4.964214256423947e-05, "loss": 0.3863, "num_tokens": 102223967.0, "step": 782 }, { "epoch": 0.31245011971268954, "grad_norm": 0.4274979829788208, "learning_rate": 4.964038226550913e-05, "loss": 0.4483, "num_tokens": 102355039.0, "step": 783 }, { "epoch": 0.3128491620111732, "grad_norm": 0.26387083530426025, "learning_rate": 4.96386176828212e-05, "loss": 0.4723, "num_tokens": 102486111.0, "step": 784 }, { "epoch": 0.31324820430965683, "grad_norm": 0.22966977953910828, "learning_rate": 4.963684881651713e-05, "loss": 0.4133, "num_tokens": 102617183.0, "step": 785 }, { "epoch": 0.3136472466081405, "grad_norm": 0.2206268310546875, "learning_rate": 4.9635075666939174e-05, "loss": 0.4203, "num_tokens": 102748255.0, "step": 786 }, { "epoch": 0.3140462889066241, "grad_norm": 0.23710355162620544, "learning_rate": 4.963329823443042e-05, "loss": 0.4366, "num_tokens": 102878947.0, "step": 787 }, { "epoch": 0.31444533120510776, "grad_norm": 0.2538776695728302, "learning_rate": 4.963151651933477e-05, "loss": 0.4312, "num_tokens": 103010019.0, "step": 788 }, { "epoch": 0.3148443735035914, "grad_norm": 0.28161904215812683, "learning_rate": 4.962973052199699e-05, "loss": 0.425, "num_tokens": 103137314.0, "step": 789 }, { "epoch": 0.31524341580207504, "grad_norm": 0.2528238892555237, "learning_rate": 4.9627940242762633e-05, "loss": 0.4696, "num_tokens": 103252639.0, "step": 790 }, { "epoch": 0.31564245810055863, "grad_norm": 0.2240891307592392, "learning_rate": 4.962614568197813e-05, "loss": 0.39, "num_tokens": 103383711.0, "step": 791 }, { "epoch": 0.3160415003990423, "grad_norm": 0.2169046700000763, "learning_rate": 4.9624346839990686e-05, "loss": 0.3855, "num_tokens": 103514783.0, "step": 792 }, { "epoch": 0.3164405426975259, "grad_norm": 0.22143244743347168, "learning_rate": 4.9622543717148365e-05, "loss": 0.4062, "num_tokens": 103645855.0, "step": 793 }, { "epoch": 0.31683958499600956, "grad_norm": 0.2490648329257965, "learning_rate": 4.962073631380006e-05, "loss": 0.4143, "num_tokens": 103776927.0, "step": 794 }, { "epoch": 0.3172386272944932, "grad_norm": 0.2270108163356781, "learning_rate": 4.961892463029549e-05, "loss": 0.4188, "num_tokens": 103907999.0, "step": 795 }, { "epoch": 0.31763766959297685, "grad_norm": 0.21063947677612305, "learning_rate": 4.961710866698519e-05, "loss": 0.3548, "num_tokens": 104039071.0, "step": 796 }, { "epoch": 0.3180367118914605, "grad_norm": 0.23781223595142365, "learning_rate": 4.961528842422054e-05, "loss": 0.3982, "num_tokens": 104170143.0, "step": 797 }, { "epoch": 0.31843575418994413, "grad_norm": 0.2238934189081192, "learning_rate": 4.9613463902353734e-05, "loss": 0.4171, "num_tokens": 104301215.0, "step": 798 }, { "epoch": 0.3188347964884278, "grad_norm": 0.2390570044517517, "learning_rate": 4.9611635101737805e-05, "loss": 0.4391, "num_tokens": 104432287.0, "step": 799 }, { "epoch": 0.3192338387869114, "grad_norm": 0.2277417927980423, "learning_rate": 4.96098020227266e-05, "loss": 0.3958, "num_tokens": 104563359.0, "step": 800 }, { "epoch": 0.31963288108539506, "grad_norm": 0.2418816238641739, "learning_rate": 4.960796466567481e-05, "loss": 0.3927, "num_tokens": 104686451.0, "step": 801 }, { "epoch": 0.3200319233838787, "grad_norm": 0.2259652465581894, "learning_rate": 4.960612303093795e-05, "loss": 0.4037, "num_tokens": 104817523.0, "step": 802 }, { "epoch": 0.32043096568236235, "grad_norm": 0.26346057653427124, "learning_rate": 4.9604277118872345e-05, "loss": 0.4471, "num_tokens": 104948595.0, "step": 803 }, { "epoch": 0.320830007980846, "grad_norm": 0.22559155523777008, "learning_rate": 4.960242692983518e-05, "loss": 0.3939, "num_tokens": 105079667.0, "step": 804 }, { "epoch": 0.32122905027932963, "grad_norm": 0.23328982293605804, "learning_rate": 4.9600572464184434e-05, "loss": 0.3964, "num_tokens": 105210739.0, "step": 805 }, { "epoch": 0.3216280925778133, "grad_norm": 0.23869232833385468, "learning_rate": 4.959871372227894e-05, "loss": 0.4424, "num_tokens": 105341811.0, "step": 806 }, { "epoch": 0.32202713487629686, "grad_norm": 0.2294788509607315, "learning_rate": 4.9596850704478335e-05, "loss": 0.4208, "num_tokens": 105472883.0, "step": 807 }, { "epoch": 0.3224261771747805, "grad_norm": 0.2517457902431488, "learning_rate": 4.9594983411143113e-05, "loss": 0.4899, "num_tokens": 105603955.0, "step": 808 }, { "epoch": 0.32282521947326415, "grad_norm": 0.23611228168010712, "learning_rate": 4.959311184263456e-05, "loss": 0.4218, "num_tokens": 105735027.0, "step": 809 }, { "epoch": 0.3232242617717478, "grad_norm": 0.2566143572330475, "learning_rate": 4.9591235999314824e-05, "loss": 0.4572, "num_tokens": 105866099.0, "step": 810 }, { "epoch": 0.32362330407023143, "grad_norm": 0.23416730761528015, "learning_rate": 4.958935588154684e-05, "loss": 0.3802, "num_tokens": 105997171.0, "step": 811 }, { "epoch": 0.3240223463687151, "grad_norm": 0.22752174735069275, "learning_rate": 4.958747148969442e-05, "loss": 0.3736, "num_tokens": 106128243.0, "step": 812 }, { "epoch": 0.3244213886671987, "grad_norm": 0.2810189425945282, "learning_rate": 4.9585582824122155e-05, "loss": 0.4568, "num_tokens": 106259315.0, "step": 813 }, { "epoch": 0.32482043096568236, "grad_norm": 0.25620514154434204, "learning_rate": 4.95836898851955e-05, "loss": 0.454, "num_tokens": 106390387.0, "step": 814 }, { "epoch": 0.325219473264166, "grad_norm": 0.2313988208770752, "learning_rate": 4.958179267328071e-05, "loss": 0.402, "num_tokens": 106521459.0, "step": 815 }, { "epoch": 0.32561851556264965, "grad_norm": 0.22773709893226624, "learning_rate": 4.957989118874488e-05, "loss": 0.4036, "num_tokens": 106652531.0, "step": 816 }, { "epoch": 0.3260175578611333, "grad_norm": 0.2515188455581665, "learning_rate": 4.9577985431955935e-05, "loss": 0.4362, "num_tokens": 106783603.0, "step": 817 }, { "epoch": 0.32641660015961693, "grad_norm": 0.23373883962631226, "learning_rate": 4.957607540328261e-05, "loss": 0.4007, "num_tokens": 106914675.0, "step": 818 }, { "epoch": 0.3268156424581006, "grad_norm": 0.2412547916173935, "learning_rate": 4.95741611030945e-05, "loss": 0.4341, "num_tokens": 107045747.0, "step": 819 }, { "epoch": 0.3272146847565842, "grad_norm": 0.23111028969287872, "learning_rate": 4.957224253176197e-05, "loss": 0.436, "num_tokens": 107176819.0, "step": 820 }, { "epoch": 0.32761372705506786, "grad_norm": 1.557268738746643, "learning_rate": 4.957031968965627e-05, "loss": 0.4219, "num_tokens": 107307891.0, "step": 821 }, { "epoch": 0.3280127693535515, "grad_norm": 0.2897958755493164, "learning_rate": 4.9568392577149456e-05, "loss": 0.4103, "num_tokens": 107438963.0, "step": 822 }, { "epoch": 0.3284118116520351, "grad_norm": 0.22349046170711517, "learning_rate": 4.956646119461439e-05, "loss": 0.4132, "num_tokens": 107570035.0, "step": 823 }, { "epoch": 0.32881085395051873, "grad_norm": 0.24998736381530762, "learning_rate": 4.9564525542424786e-05, "loss": 0.4228, "num_tokens": 107701107.0, "step": 824 }, { "epoch": 0.3292098962490024, "grad_norm": 0.2312653660774231, "learning_rate": 4.9562585620955176e-05, "loss": 0.3929, "num_tokens": 107832179.0, "step": 825 }, { "epoch": 0.329608938547486, "grad_norm": 0.253259539604187, "learning_rate": 4.9560641430580905e-05, "loss": 0.376, "num_tokens": 107963251.0, "step": 826 }, { "epoch": 0.33000798084596966, "grad_norm": 0.2585609555244446, "learning_rate": 4.955869297167817e-05, "loss": 0.4762, "num_tokens": 108094323.0, "step": 827 }, { "epoch": 0.3304070231444533, "grad_norm": 0.25824785232543945, "learning_rate": 4.9556740244623964e-05, "loss": 0.4127, "num_tokens": 108225395.0, "step": 828 }, { "epoch": 0.33080606544293695, "grad_norm": 0.2377702295780182, "learning_rate": 4.955478324979615e-05, "loss": 0.4087, "num_tokens": 108356467.0, "step": 829 }, { "epoch": 0.3312051077414206, "grad_norm": 0.2345699667930603, "learning_rate": 4.955282198757335e-05, "loss": 0.3837, "num_tokens": 108487539.0, "step": 830 }, { "epoch": 0.33160415003990423, "grad_norm": 0.2423536330461502, "learning_rate": 4.9550856458335084e-05, "loss": 0.3853, "num_tokens": 108602547.0, "step": 831 }, { "epoch": 0.3320031923383879, "grad_norm": 0.2501680254936218, "learning_rate": 4.954888666246164e-05, "loss": 0.4221, "num_tokens": 108733619.0, "step": 832 }, { "epoch": 0.3324022346368715, "grad_norm": 0.22802689671516418, "learning_rate": 4.9546912600334165e-05, "loss": 0.3723, "num_tokens": 108864691.0, "step": 833 }, { "epoch": 0.33280127693535516, "grad_norm": 0.22061379253864288, "learning_rate": 4.9544934272334625e-05, "loss": 0.387, "num_tokens": 108995763.0, "step": 834 }, { "epoch": 0.3332003192338388, "grad_norm": 0.2460906058549881, "learning_rate": 4.95429516788458e-05, "loss": 0.4379, "num_tokens": 109126835.0, "step": 835 }, { "epoch": 0.33359936153232245, "grad_norm": 0.26019108295440674, "learning_rate": 4.9540964820251304e-05, "loss": 0.4568, "num_tokens": 109257907.0, "step": 836 }, { "epoch": 0.3339984038308061, "grad_norm": 0.22881627082824707, "learning_rate": 4.953897369693558e-05, "loss": 0.401, "num_tokens": 109388979.0, "step": 837 }, { "epoch": 0.3343974461292897, "grad_norm": 0.2263326346874237, "learning_rate": 4.953697830928389e-05, "loss": 0.3463, "num_tokens": 109520051.0, "step": 838 }, { "epoch": 0.3347964884277733, "grad_norm": 0.22798649966716766, "learning_rate": 4.953497865768233e-05, "loss": 0.3971, "num_tokens": 109651123.0, "step": 839 }, { "epoch": 0.33519553072625696, "grad_norm": 0.2553224265575409, "learning_rate": 4.9532974742517806e-05, "loss": 0.4501, "num_tokens": 109782195.0, "step": 840 }, { "epoch": 0.3355945730247406, "grad_norm": 0.2749249339103699, "learning_rate": 4.953096656417805e-05, "loss": 0.4627, "num_tokens": 109897297.0, "step": 841 }, { "epoch": 0.33599361532322425, "grad_norm": 0.24258951842784882, "learning_rate": 4.952895412305164e-05, "loss": 0.412, "num_tokens": 110028369.0, "step": 842 }, { "epoch": 0.3363926576217079, "grad_norm": 0.25379568338394165, "learning_rate": 4.952693741952796e-05, "loss": 0.4523, "num_tokens": 110159441.0, "step": 843 }, { "epoch": 0.33679169992019153, "grad_norm": 0.25204914808273315, "learning_rate": 4.952491645399721e-05, "loss": 0.471, "num_tokens": 110290513.0, "step": 844 }, { "epoch": 0.3371907422186752, "grad_norm": 0.23650255799293518, "learning_rate": 4.9522891226850455e-05, "loss": 0.401, "num_tokens": 110421585.0, "step": 845 }, { "epoch": 0.3375897845171588, "grad_norm": 0.25874796509742737, "learning_rate": 4.952086173847954e-05, "loss": 0.4671, "num_tokens": 110552657.0, "step": 846 }, { "epoch": 0.33798882681564246, "grad_norm": 0.24461820721626282, "learning_rate": 4.9518827989277147e-05, "loss": 0.4188, "num_tokens": 110683729.0, "step": 847 }, { "epoch": 0.3383878691141261, "grad_norm": 0.22432976961135864, "learning_rate": 4.95167899796368e-05, "loss": 0.3803, "num_tokens": 110814801.0, "step": 848 }, { "epoch": 0.33878691141260975, "grad_norm": 0.22130031883716583, "learning_rate": 4.9514747709952825e-05, "loss": 0.3768, "num_tokens": 110945873.0, "step": 849 }, { "epoch": 0.3391859537110934, "grad_norm": 0.2612733244895935, "learning_rate": 4.951270118062039e-05, "loss": 0.4686, "num_tokens": 111076945.0, "step": 850 }, { "epoch": 0.33958499600957703, "grad_norm": 0.2434006929397583, "learning_rate": 4.951065039203548e-05, "loss": 0.452, "num_tokens": 111208017.0, "step": 851 }, { "epoch": 0.3399840383080607, "grad_norm": 0.26104798913002014, "learning_rate": 4.9508595344594885e-05, "loss": 0.4388, "num_tokens": 111339089.0, "step": 852 }, { "epoch": 0.3403830806065443, "grad_norm": 0.23313556611537933, "learning_rate": 4.950653603869627e-05, "loss": 0.3706, "num_tokens": 111470161.0, "step": 853 }, { "epoch": 0.3407821229050279, "grad_norm": 0.23167890310287476, "learning_rate": 4.950447247473807e-05, "loss": 0.4019, "num_tokens": 111601233.0, "step": 854 }, { "epoch": 0.34118116520351155, "grad_norm": 0.2634051442146301, "learning_rate": 4.950240465311957e-05, "loss": 0.4841, "num_tokens": 111732305.0, "step": 855 }, { "epoch": 0.3415802075019952, "grad_norm": 0.25546830892562866, "learning_rate": 4.950033257424086e-05, "loss": 0.4118, "num_tokens": 111863377.0, "step": 856 }, { "epoch": 0.34197924980047884, "grad_norm": 0.23231828212738037, "learning_rate": 4.94982562385029e-05, "loss": 0.367, "num_tokens": 111994449.0, "step": 857 }, { "epoch": 0.3423782920989625, "grad_norm": 0.2305700182914734, "learning_rate": 4.949617564630742e-05, "loss": 0.3886, "num_tokens": 112125521.0, "step": 858 }, { "epoch": 0.3427773343974461, "grad_norm": 0.22786958515644073, "learning_rate": 4.949409079805698e-05, "loss": 0.3961, "num_tokens": 112256593.0, "step": 859 }, { "epoch": 0.34317637669592976, "grad_norm": 0.2982991933822632, "learning_rate": 4.949200169415501e-05, "loss": 0.426, "num_tokens": 112387665.0, "step": 860 }, { "epoch": 0.3435754189944134, "grad_norm": 0.232236847281456, "learning_rate": 4.948990833500572e-05, "loss": 0.3745, "num_tokens": 112518737.0, "step": 861 }, { "epoch": 0.34397446129289705, "grad_norm": 0.26228782534599304, "learning_rate": 4.948781072101415e-05, "loss": 0.4369, "num_tokens": 112634408.0, "step": 862 }, { "epoch": 0.3443735035913807, "grad_norm": 0.2229522168636322, "learning_rate": 4.948570885258618e-05, "loss": 0.3476, "num_tokens": 112765480.0, "step": 863 }, { "epoch": 0.34477254588986433, "grad_norm": 0.2669422924518585, "learning_rate": 4.948360273012849e-05, "loss": 0.3976, "num_tokens": 112896552.0, "step": 864 }, { "epoch": 0.345171588188348, "grad_norm": 0.2402997463941574, "learning_rate": 4.94814923540486e-05, "loss": 0.3955, "num_tokens": 113024596.0, "step": 865 }, { "epoch": 0.3455706304868316, "grad_norm": 0.24820592999458313, "learning_rate": 4.947937772475485e-05, "loss": 0.4012, "num_tokens": 113155668.0, "step": 866 }, { "epoch": 0.34596967278531526, "grad_norm": 0.2604999542236328, "learning_rate": 4.94772588426564e-05, "loss": 0.408, "num_tokens": 113286740.0, "step": 867 }, { "epoch": 0.3463687150837989, "grad_norm": 0.25201156735420227, "learning_rate": 4.9475135708163236e-05, "loss": 0.4551, "num_tokens": 113417812.0, "step": 868 }, { "epoch": 0.34676775738228255, "grad_norm": 0.25405463576316833, "learning_rate": 4.947300832168616e-05, "loss": 0.4712, "num_tokens": 113548884.0, "step": 869 }, { "epoch": 0.34716679968076614, "grad_norm": 0.24831584095954895, "learning_rate": 4.9470876683636804e-05, "loss": 0.4492, "num_tokens": 113679956.0, "step": 870 }, { "epoch": 0.3475658419792498, "grad_norm": 0.2993150055408478, "learning_rate": 4.946874079442761e-05, "loss": 0.4267, "num_tokens": 113811028.0, "step": 871 }, { "epoch": 0.3479648842777334, "grad_norm": 0.22640176117420197, "learning_rate": 4.946660065447187e-05, "loss": 0.424, "num_tokens": 113942100.0, "step": 872 }, { "epoch": 0.34836392657621706, "grad_norm": 0.22437971830368042, "learning_rate": 4.946445626418368e-05, "loss": 0.3798, "num_tokens": 114073172.0, "step": 873 }, { "epoch": 0.3487629688747007, "grad_norm": 0.24597689509391785, "learning_rate": 4.946230762397794e-05, "loss": 0.4533, "num_tokens": 114204244.0, "step": 874 }, { "epoch": 0.34916201117318435, "grad_norm": 0.2454037368297577, "learning_rate": 4.946015473427042e-05, "loss": 0.4634, "num_tokens": 114335316.0, "step": 875 }, { "epoch": 0.349561053471668, "grad_norm": 0.2240416258573532, "learning_rate": 4.945799759547766e-05, "loss": 0.3944, "num_tokens": 114466388.0, "step": 876 }, { "epoch": 0.34996009577015164, "grad_norm": 0.2234128713607788, "learning_rate": 4.945583620801705e-05, "loss": 0.3725, "num_tokens": 114597460.0, "step": 877 }, { "epoch": 0.3503591380686353, "grad_norm": 0.23259860277175903, "learning_rate": 4.945367057230681e-05, "loss": 0.3637, "num_tokens": 114728532.0, "step": 878 }, { "epoch": 0.3507581803671189, "grad_norm": 0.23096604645252228, "learning_rate": 4.945150068876596e-05, "loss": 0.4082, "num_tokens": 114859604.0, "step": 879 }, { "epoch": 0.35115722266560256, "grad_norm": 0.2714957594871521, "learning_rate": 4.944932655781436e-05, "loss": 0.4724, "num_tokens": 114990676.0, "step": 880 }, { "epoch": 0.3515562649640862, "grad_norm": 0.23257976770401, "learning_rate": 4.944714817987268e-05, "loss": 0.3876, "num_tokens": 115121748.0, "step": 881 }, { "epoch": 0.35195530726256985, "grad_norm": 0.21097324788570404, "learning_rate": 4.944496555536241e-05, "loss": 0.3221, "num_tokens": 115252820.0, "step": 882 }, { "epoch": 0.3523543495610535, "grad_norm": 0.25255143642425537, "learning_rate": 4.944277868470588e-05, "loss": 0.4561, "num_tokens": 115383892.0, "step": 883 }, { "epoch": 0.35275339185953714, "grad_norm": 0.24507516622543335, "learning_rate": 4.944058756832622e-05, "loss": 0.4308, "num_tokens": 115514964.0, "step": 884 }, { "epoch": 0.3531524341580207, "grad_norm": 0.23859632015228271, "learning_rate": 4.94383922066474e-05, "loss": 0.4369, "num_tokens": 115646036.0, "step": 885 }, { "epoch": 0.35355147645650437, "grad_norm": 0.2553892433643341, "learning_rate": 4.943619260009419e-05, "loss": 0.4502, "num_tokens": 115777108.0, "step": 886 }, { "epoch": 0.353950518754988, "grad_norm": 0.2609286904335022, "learning_rate": 4.9433988749092196e-05, "loss": 0.4608, "num_tokens": 115908180.0, "step": 887 }, { "epoch": 0.35434956105347165, "grad_norm": 0.2519763112068176, "learning_rate": 4.943178065406784e-05, "loss": 0.4151, "num_tokens": 116039252.0, "step": 888 }, { "epoch": 0.3547486033519553, "grad_norm": 0.25278693437576294, "learning_rate": 4.942956831544838e-05, "loss": 0.4337, "num_tokens": 116170324.0, "step": 889 }, { "epoch": 0.35514764565043894, "grad_norm": 0.21001729369163513, "learning_rate": 4.942735173366188e-05, "loss": 0.3153, "num_tokens": 116301396.0, "step": 890 }, { "epoch": 0.3555466879489226, "grad_norm": 0.24860066175460815, "learning_rate": 4.942513090913722e-05, "loss": 0.4228, "num_tokens": 116432468.0, "step": 891 }, { "epoch": 0.3559457302474062, "grad_norm": 0.2133292555809021, "learning_rate": 4.942290584230411e-05, "loss": 0.3373, "num_tokens": 116563540.0, "step": 892 }, { "epoch": 0.35634477254588987, "grad_norm": 0.25940096378326416, "learning_rate": 4.942067653359308e-05, "loss": 0.4317, "num_tokens": 116694612.0, "step": 893 }, { "epoch": 0.3567438148443735, "grad_norm": 0.24670924246311188, "learning_rate": 4.941844298343549e-05, "loss": 0.4211, "num_tokens": 116825684.0, "step": 894 }, { "epoch": 0.35714285714285715, "grad_norm": 0.21174213290214539, "learning_rate": 4.941620519226349e-05, "loss": 0.3358, "num_tokens": 116956756.0, "step": 895 }, { "epoch": 0.3575418994413408, "grad_norm": 0.22899273037910461, "learning_rate": 4.9413963160510104e-05, "loss": 0.3773, "num_tokens": 117087828.0, "step": 896 }, { "epoch": 0.35794094173982444, "grad_norm": 0.2571898400783539, "learning_rate": 4.9411716888609116e-05, "loss": 0.4018, "num_tokens": 117218900.0, "step": 897 }, { "epoch": 0.3583399840383081, "grad_norm": 0.2380102276802063, "learning_rate": 4.940946637699517e-05, "loss": 0.4279, "num_tokens": 117349972.0, "step": 898 }, { "epoch": 0.3587390263367917, "grad_norm": 0.2513631582260132, "learning_rate": 4.9407211626103716e-05, "loss": 0.4129, "num_tokens": 117481044.0, "step": 899 }, { "epoch": 0.35913806863527536, "grad_norm": 0.22116857767105103, "learning_rate": 4.940495263637103e-05, "loss": 0.3243, "num_tokens": 117612116.0, "step": 900 }, { "epoch": 0.35953711093375895, "grad_norm": 0.2532989978790283, "learning_rate": 4.940268940823421e-05, "loss": 0.4552, "num_tokens": 117743188.0, "step": 901 }, { "epoch": 0.3599361532322426, "grad_norm": 0.2227490395307541, "learning_rate": 4.940042194213115e-05, "loss": 0.3745, "num_tokens": 117874260.0, "step": 902 }, { "epoch": 0.36033519553072624, "grad_norm": 0.20309753715991974, "learning_rate": 4.9398150238500606e-05, "loss": 0.3274, "num_tokens": 118005332.0, "step": 903 }, { "epoch": 0.3607342378292099, "grad_norm": 0.24464842677116394, "learning_rate": 4.939587429778213e-05, "loss": 0.4142, "num_tokens": 118136404.0, "step": 904 }, { "epoch": 0.3611332801276935, "grad_norm": 0.244638592004776, "learning_rate": 4.939359412041608e-05, "loss": 0.4126, "num_tokens": 118267476.0, "step": 905 }, { "epoch": 0.36153232242617717, "grad_norm": 0.22363656759262085, "learning_rate": 4.9391309706843655e-05, "loss": 0.3798, "num_tokens": 118398548.0, "step": 906 }, { "epoch": 0.3619313647246608, "grad_norm": 0.26600977778434753, "learning_rate": 4.938902105750687e-05, "loss": 0.4313, "num_tokens": 118517496.0, "step": 907 }, { "epoch": 0.36233040702314445, "grad_norm": 0.24806679785251617, "learning_rate": 4.938672817284856e-05, "loss": 0.392, "num_tokens": 118648568.0, "step": 908 }, { "epoch": 0.3627294493216281, "grad_norm": 0.21176396310329437, "learning_rate": 4.938443105331238e-05, "loss": 0.3402, "num_tokens": 118779640.0, "step": 909 }, { "epoch": 0.36312849162011174, "grad_norm": 0.23178952932357788, "learning_rate": 4.938212969934279e-05, "loss": 0.3542, "num_tokens": 118910712.0, "step": 910 }, { "epoch": 0.3635275339185954, "grad_norm": 0.22842830419540405, "learning_rate": 4.937982411138507e-05, "loss": 0.3927, "num_tokens": 119041784.0, "step": 911 }, { "epoch": 0.363926576217079, "grad_norm": 0.24181503057479858, "learning_rate": 4.937751428988537e-05, "loss": 0.4076, "num_tokens": 119172856.0, "step": 912 }, { "epoch": 0.36432561851556267, "grad_norm": 0.32528600096702576, "learning_rate": 4.937520023529058e-05, "loss": 0.444, "num_tokens": 119303928.0, "step": 913 }, { "epoch": 0.3647246608140463, "grad_norm": 0.9125648736953735, "learning_rate": 4.937288194804847e-05, "loss": 0.3527, "num_tokens": 119435000.0, "step": 914 }, { "epoch": 0.36512370311252995, "grad_norm": 0.2293378710746765, "learning_rate": 4.93705594286076e-05, "loss": 0.3629, "num_tokens": 119566072.0, "step": 915 }, { "epoch": 0.3655227454110136, "grad_norm": 0.25666388869285583, "learning_rate": 4.936823267741735e-05, "loss": 0.4418, "num_tokens": 119697144.0, "step": 916 }, { "epoch": 0.3659217877094972, "grad_norm": 0.22739972174167633, "learning_rate": 4.936590169492794e-05, "loss": 0.3846, "num_tokens": 119828216.0, "step": 917 }, { "epoch": 0.3663208300079808, "grad_norm": 0.2562474012374878, "learning_rate": 4.936356648159037e-05, "loss": 0.4457, "num_tokens": 119959288.0, "step": 918 }, { "epoch": 0.36671987230646447, "grad_norm": 0.23732684552669525, "learning_rate": 4.93612270378565e-05, "loss": 0.4177, "num_tokens": 120090360.0, "step": 919 }, { "epoch": 0.3671189146049481, "grad_norm": 0.2626221776008606, "learning_rate": 4.935888336417898e-05, "loss": 0.4884, "num_tokens": 120221432.0, "step": 920 }, { "epoch": 0.36751795690343175, "grad_norm": 0.2570772171020508, "learning_rate": 4.9356535461011305e-05, "loss": 0.4407, "num_tokens": 120352504.0, "step": 921 }, { "epoch": 0.3679169992019154, "grad_norm": 0.239701047539711, "learning_rate": 4.9354183328807764e-05, "loss": 0.4187, "num_tokens": 120480864.0, "step": 922 }, { "epoch": 0.36831604150039904, "grad_norm": 0.2609691917896271, "learning_rate": 4.935182696802346e-05, "loss": 0.4221, "num_tokens": 120611936.0, "step": 923 }, { "epoch": 0.3687150837988827, "grad_norm": 0.24960602819919586, "learning_rate": 4.934946637911434e-05, "loss": 0.3994, "num_tokens": 120743008.0, "step": 924 }, { "epoch": 0.3691141260973663, "grad_norm": 0.2649988532066345, "learning_rate": 4.934710156253716e-05, "loss": 0.4542, "num_tokens": 120874080.0, "step": 925 }, { "epoch": 0.36951316839584997, "grad_norm": 0.2522490620613098, "learning_rate": 4.934473251874948e-05, "loss": 0.4541, "num_tokens": 121005152.0, "step": 926 }, { "epoch": 0.3699122106943336, "grad_norm": 0.24744214117527008, "learning_rate": 4.9342359248209695e-05, "loss": 0.4115, "num_tokens": 121136224.0, "step": 927 }, { "epoch": 0.37031125299281725, "grad_norm": 0.23244741559028625, "learning_rate": 4.933998175137701e-05, "loss": 0.3795, "num_tokens": 121267296.0, "step": 928 }, { "epoch": 0.3707102952913009, "grad_norm": 0.2418338656425476, "learning_rate": 4.933760002871144e-05, "loss": 0.406, "num_tokens": 121398368.0, "step": 929 }, { "epoch": 0.37110933758978454, "grad_norm": 0.2765147089958191, "learning_rate": 4.933521408067383e-05, "loss": 0.4765, "num_tokens": 121529440.0, "step": 930 }, { "epoch": 0.3715083798882682, "grad_norm": 0.24393853545188904, "learning_rate": 4.9332823907725846e-05, "loss": 0.4123, "num_tokens": 121660512.0, "step": 931 }, { "epoch": 0.37190742218675177, "grad_norm": 0.2346920520067215, "learning_rate": 4.9330429510329955e-05, "loss": 0.3609, "num_tokens": 121791584.0, "step": 932 }, { "epoch": 0.3723064644852354, "grad_norm": 0.2843295633792877, "learning_rate": 4.932803088894946e-05, "loss": 0.4803, "num_tokens": 121922656.0, "step": 933 }, { "epoch": 0.37270550678371905, "grad_norm": 0.2579941153526306, "learning_rate": 4.932562804404846e-05, "loss": 0.3998, "num_tokens": 122053728.0, "step": 934 }, { "epoch": 0.3731045490822027, "grad_norm": 0.24467889964580536, "learning_rate": 4.932322097609189e-05, "loss": 0.412, "num_tokens": 122184800.0, "step": 935 }, { "epoch": 0.37350359138068634, "grad_norm": 0.2522808611392975, "learning_rate": 4.9320809685545494e-05, "loss": 0.4224, "num_tokens": 122315872.0, "step": 936 }, { "epoch": 0.37390263367917, "grad_norm": 0.25705447793006897, "learning_rate": 4.931839417287584e-05, "loss": 0.4358, "num_tokens": 122446944.0, "step": 937 }, { "epoch": 0.3743016759776536, "grad_norm": 0.25336530804634094, "learning_rate": 4.93159744385503e-05, "loss": 0.4134, "num_tokens": 122578016.0, "step": 938 }, { "epoch": 0.37470071827613727, "grad_norm": 0.23511262238025665, "learning_rate": 4.931355048303708e-05, "loss": 0.4041, "num_tokens": 122709088.0, "step": 939 }, { "epoch": 0.3750997605746209, "grad_norm": 0.25172874331474304, "learning_rate": 4.931112230680517e-05, "loss": 0.4441, "num_tokens": 122840160.0, "step": 940 }, { "epoch": 0.37549880287310455, "grad_norm": 0.26307037472724915, "learning_rate": 4.9308689910324416e-05, "loss": 0.4328, "num_tokens": 122965949.0, "step": 941 }, { "epoch": 0.3758978451715882, "grad_norm": 0.24217285215854645, "learning_rate": 4.930625329406547e-05, "loss": 0.4225, "num_tokens": 123097021.0, "step": 942 }, { "epoch": 0.37629688747007184, "grad_norm": 0.24822460114955902, "learning_rate": 4.930381245849978e-05, "loss": 0.4612, "num_tokens": 123228093.0, "step": 943 }, { "epoch": 0.3766959297685555, "grad_norm": 0.24724623560905457, "learning_rate": 4.930136740409964e-05, "loss": 0.4091, "num_tokens": 123359165.0, "step": 944 }, { "epoch": 0.3770949720670391, "grad_norm": 0.2629963159561157, "learning_rate": 4.929891813133813e-05, "loss": 0.4419, "num_tokens": 123490237.0, "step": 945 }, { "epoch": 0.37749401436552277, "grad_norm": 0.22250542044639587, "learning_rate": 4.9296464640689175e-05, "loss": 0.4153, "num_tokens": 123621309.0, "step": 946 }, { "epoch": 0.3778930566640064, "grad_norm": 0.24717208743095398, "learning_rate": 4.929400693262749e-05, "loss": 0.4131, "num_tokens": 123752381.0, "step": 947 }, { "epoch": 0.37829209896249, "grad_norm": 0.22691695392131805, "learning_rate": 4.9291545007628626e-05, "loss": 0.4152, "num_tokens": 123883453.0, "step": 948 }, { "epoch": 0.37869114126097364, "grad_norm": 0.2309306561946869, "learning_rate": 4.928907886616895e-05, "loss": 0.3831, "num_tokens": 124014525.0, "step": 949 }, { "epoch": 0.3790901835594573, "grad_norm": 0.22901347279548645, "learning_rate": 4.9286608508725615e-05, "loss": 0.3921, "num_tokens": 124145597.0, "step": 950 }, { "epoch": 0.3794892258579409, "grad_norm": 0.2468089610338211, "learning_rate": 4.9284133935776625e-05, "loss": 0.4278, "num_tokens": 124276669.0, "step": 951 }, { "epoch": 0.37988826815642457, "grad_norm": 0.23998965322971344, "learning_rate": 4.92816551478008e-05, "loss": 0.4417, "num_tokens": 124407741.0, "step": 952 }, { "epoch": 0.3802873104549082, "grad_norm": 0.23197536170482635, "learning_rate": 4.927917214527774e-05, "loss": 0.4236, "num_tokens": 124538813.0, "step": 953 }, { "epoch": 0.38068635275339185, "grad_norm": 0.23089347779750824, "learning_rate": 4.92766849286879e-05, "loss": 0.4118, "num_tokens": 124669885.0, "step": 954 }, { "epoch": 0.3810853950518755, "grad_norm": 0.23733852803707123, "learning_rate": 4.9274193498512524e-05, "loss": 0.3374, "num_tokens": 124800957.0, "step": 955 }, { "epoch": 0.38148443735035914, "grad_norm": 0.2343587726354599, "learning_rate": 4.9271697855233686e-05, "loss": 0.3899, "num_tokens": 124932029.0, "step": 956 }, { "epoch": 0.3818834796488428, "grad_norm": 0.2481846958398819, "learning_rate": 4.926919799933426e-05, "loss": 0.4135, "num_tokens": 125063101.0, "step": 957 }, { "epoch": 0.3822825219473264, "grad_norm": 0.2337896227836609, "learning_rate": 4.926669393129795e-05, "loss": 0.3917, "num_tokens": 125194173.0, "step": 958 }, { "epoch": 0.38268156424581007, "grad_norm": 0.23461660742759705, "learning_rate": 4.9264185651609274e-05, "loss": 0.3823, "num_tokens": 125325245.0, "step": 959 }, { "epoch": 0.3830806065442937, "grad_norm": 0.2373945116996765, "learning_rate": 4.926167316075356e-05, "loss": 0.4187, "num_tokens": 125456317.0, "step": 960 }, { "epoch": 0.38347964884277735, "grad_norm": 0.23397387564182281, "learning_rate": 4.925915645921694e-05, "loss": 0.3859, "num_tokens": 125587389.0, "step": 961 }, { "epoch": 0.383878691141261, "grad_norm": 0.25463420152664185, "learning_rate": 4.925663554748639e-05, "loss": 0.454, "num_tokens": 125718461.0, "step": 962 }, { "epoch": 0.38427773343974464, "grad_norm": 0.22351078689098358, "learning_rate": 4.9254110426049665e-05, "loss": 0.3735, "num_tokens": 125849533.0, "step": 963 }, { "epoch": 0.3846767757382282, "grad_norm": 0.27185991406440735, "learning_rate": 4.9251581095395374e-05, "loss": 0.4121, "num_tokens": 125980605.0, "step": 964 }, { "epoch": 0.38507581803671187, "grad_norm": 0.26495522260665894, "learning_rate": 4.9249047556012895e-05, "loss": 0.419, "num_tokens": 126111677.0, "step": 965 }, { "epoch": 0.3854748603351955, "grad_norm": 0.24675332009792328, "learning_rate": 4.9246509808392464e-05, "loss": 0.4354, "num_tokens": 126242749.0, "step": 966 }, { "epoch": 0.38587390263367916, "grad_norm": 0.23834878206253052, "learning_rate": 4.9243967853025103e-05, "loss": 0.4117, "num_tokens": 126373821.0, "step": 967 }, { "epoch": 0.3862729449321628, "grad_norm": 0.23353902995586395, "learning_rate": 4.9241421690402654e-05, "loss": 0.3721, "num_tokens": 126504893.0, "step": 968 }, { "epoch": 0.38667198723064644, "grad_norm": 0.2309710532426834, "learning_rate": 4.9238871321017784e-05, "loss": 0.3881, "num_tokens": 126635965.0, "step": 969 }, { "epoch": 0.3870710295291301, "grad_norm": 0.24012404680252075, "learning_rate": 4.9236316745363954e-05, "loss": 0.4055, "num_tokens": 126754057.0, "step": 970 }, { "epoch": 0.3874700718276137, "grad_norm": 0.2406294047832489, "learning_rate": 4.923375796393546e-05, "loss": 0.3965, "num_tokens": 126885129.0, "step": 971 }, { "epoch": 0.38786911412609737, "grad_norm": 0.2485412061214447, "learning_rate": 4.92311949772274e-05, "loss": 0.4146, "num_tokens": 127016201.0, "step": 972 }, { "epoch": 0.388268156424581, "grad_norm": 0.22916454076766968, "learning_rate": 4.9228627785735686e-05, "loss": 0.3777, "num_tokens": 127147169.0, "step": 973 }, { "epoch": 0.38866719872306466, "grad_norm": 0.23086293041706085, "learning_rate": 4.9226056389957055e-05, "loss": 0.3898, "num_tokens": 127278241.0, "step": 974 }, { "epoch": 0.3890662410215483, "grad_norm": 0.2607508897781372, "learning_rate": 4.9223480790389034e-05, "loss": 0.4415, "num_tokens": 127409313.0, "step": 975 }, { "epoch": 0.38946528332003194, "grad_norm": 0.21808657050132751, "learning_rate": 4.9220900987529985e-05, "loss": 0.3786, "num_tokens": 127540385.0, "step": 976 }, { "epoch": 0.3898643256185156, "grad_norm": 0.2571830153465271, "learning_rate": 4.921831698187907e-05, "loss": 0.4494, "num_tokens": 127671457.0, "step": 977 }, { "epoch": 0.3902633679169992, "grad_norm": 0.2339329868555069, "learning_rate": 4.9215728773936274e-05, "loss": 0.3894, "num_tokens": 127799863.0, "step": 978 }, { "epoch": 0.3906624102154828, "grad_norm": 0.23201368749141693, "learning_rate": 4.921313636420239e-05, "loss": 0.4003, "num_tokens": 127930935.0, "step": 979 }, { "epoch": 0.39106145251396646, "grad_norm": 0.220558300614357, "learning_rate": 4.921053975317903e-05, "loss": 0.3939, "num_tokens": 128062007.0, "step": 980 }, { "epoch": 0.3914604948124501, "grad_norm": 0.22157630324363708, "learning_rate": 4.920793894136862e-05, "loss": 0.4127, "num_tokens": 128193079.0, "step": 981 }, { "epoch": 0.39185953711093374, "grad_norm": 0.21934671700000763, "learning_rate": 4.9205333929274376e-05, "loss": 0.4078, "num_tokens": 128324151.0, "step": 982 }, { "epoch": 0.3922585794094174, "grad_norm": 0.240092471241951, "learning_rate": 4.920272471740035e-05, "loss": 0.4036, "num_tokens": 128455223.0, "step": 983 }, { "epoch": 0.39265762170790103, "grad_norm": 0.2265879511833191, "learning_rate": 4.92001113062514e-05, "loss": 0.3955, "num_tokens": 128586295.0, "step": 984 }, { "epoch": 0.39305666400638467, "grad_norm": 0.2387651652097702, "learning_rate": 4.919749369633321e-05, "loss": 0.4256, "num_tokens": 128717367.0, "step": 985 }, { "epoch": 0.3934557063048683, "grad_norm": 0.25854554772377014, "learning_rate": 4.9194871888152236e-05, "loss": 0.4279, "num_tokens": 128848439.0, "step": 986 }, { "epoch": 0.39385474860335196, "grad_norm": 0.24528704583644867, "learning_rate": 4.9192245882215805e-05, "loss": 0.4408, "num_tokens": 128979511.0, "step": 987 }, { "epoch": 0.3942537909018356, "grad_norm": 0.2309814840555191, "learning_rate": 4.9189615679031994e-05, "loss": 0.3959, "num_tokens": 129110583.0, "step": 988 }, { "epoch": 0.39465283320031924, "grad_norm": 0.2062840610742569, "learning_rate": 4.9186981279109746e-05, "loss": 0.3331, "num_tokens": 129241655.0, "step": 989 }, { "epoch": 0.3950518754988029, "grad_norm": 0.23767264187335968, "learning_rate": 4.918434268295877e-05, "loss": 0.358, "num_tokens": 129357832.0, "step": 990 }, { "epoch": 0.3954509177972865, "grad_norm": 0.249285489320755, "learning_rate": 4.918169989108964e-05, "loss": 0.4098, "num_tokens": 129488904.0, "step": 991 }, { "epoch": 0.39584996009577017, "grad_norm": 0.2540219724178314, "learning_rate": 4.917905290401369e-05, "loss": 0.4314, "num_tokens": 129619976.0, "step": 992 }, { "epoch": 0.3962490023942538, "grad_norm": 0.2626572251319885, "learning_rate": 4.9176401722243095e-05, "loss": 0.4109, "num_tokens": 129751048.0, "step": 993 }, { "epoch": 0.39664804469273746, "grad_norm": 0.2736155688762665, "learning_rate": 4.9173746346290825e-05, "loss": 0.4952, "num_tokens": 129882120.0, "step": 994 }, { "epoch": 0.39704708699122104, "grad_norm": 0.22661526501178741, "learning_rate": 4.9171086776670675e-05, "loss": 0.3592, "num_tokens": 130013192.0, "step": 995 }, { "epoch": 0.3974461292897047, "grad_norm": 0.2171398103237152, "learning_rate": 4.916842301389726e-05, "loss": 0.3581, "num_tokens": 130144264.0, "step": 996 }, { "epoch": 0.39784517158818833, "grad_norm": 0.27003347873687744, "learning_rate": 4.916575505848597e-05, "loss": 0.4543, "num_tokens": 130275336.0, "step": 997 }, { "epoch": 0.39824421388667197, "grad_norm": 0.2132273018360138, "learning_rate": 4.916308291095304e-05, "loss": 0.3563, "num_tokens": 130406408.0, "step": 998 }, { "epoch": 0.3986432561851556, "grad_norm": 0.24525126814842224, "learning_rate": 4.9160406571815525e-05, "loss": 0.4112, "num_tokens": 130537480.0, "step": 999 }, { "epoch": 0.39904229848363926, "grad_norm": 0.2518914043903351, "learning_rate": 4.9157726041591245e-05, "loss": 0.4593, "num_tokens": 130668552.0, "step": 1000 }, { "epoch": 0.3994413407821229, "grad_norm": 0.2166762351989746, "learning_rate": 4.915504132079887e-05, "loss": 0.3202, "num_tokens": 130799624.0, "step": 1001 }, { "epoch": 0.39984038308060654, "grad_norm": 0.23452894389629364, "learning_rate": 4.915235240995786e-05, "loss": 0.416, "num_tokens": 130930696.0, "step": 1002 }, { "epoch": 0.4002394253790902, "grad_norm": 0.22649116814136505, "learning_rate": 4.91496593095885e-05, "loss": 0.3892, "num_tokens": 131061768.0, "step": 1003 }, { "epoch": 0.40063846767757383, "grad_norm": 0.249775692820549, "learning_rate": 4.914696202021187e-05, "loss": 0.3967, "num_tokens": 131192840.0, "step": 1004 }, { "epoch": 0.40103750997605747, "grad_norm": 0.22818569839000702, "learning_rate": 4.91442605423499e-05, "loss": 0.3741, "num_tokens": 131323912.0, "step": 1005 }, { "epoch": 0.4014365522745411, "grad_norm": 0.25476616621017456, "learning_rate": 4.914155487652527e-05, "loss": 0.4634, "num_tokens": 131454984.0, "step": 1006 }, { "epoch": 0.40183559457302476, "grad_norm": 0.2579227089881897, "learning_rate": 4.9138845023261504e-05, "loss": 0.4335, "num_tokens": 131586056.0, "step": 1007 }, { "epoch": 0.4022346368715084, "grad_norm": 0.24654360115528107, "learning_rate": 4.9136130983082946e-05, "loss": 0.4224, "num_tokens": 131717128.0, "step": 1008 }, { "epoch": 0.40263367916999204, "grad_norm": 0.25010061264038086, "learning_rate": 4.9133412756514736e-05, "loss": 0.4256, "num_tokens": 131848200.0, "step": 1009 }, { "epoch": 0.4030327214684757, "grad_norm": 0.25099441409111023, "learning_rate": 4.913069034408282e-05, "loss": 0.4297, "num_tokens": 131979272.0, "step": 1010 }, { "epoch": 0.4034317637669593, "grad_norm": 0.23241949081420898, "learning_rate": 4.912796374631396e-05, "loss": 0.4318, "num_tokens": 132110344.0, "step": 1011 }, { "epoch": 0.4038308060654429, "grad_norm": 0.23826991021633148, "learning_rate": 4.912523296373573e-05, "loss": 0.3673, "num_tokens": 132241416.0, "step": 1012 }, { "epoch": 0.40422984836392656, "grad_norm": 0.23206673562526703, "learning_rate": 4.912249799687651e-05, "loss": 0.4094, "num_tokens": 132372488.0, "step": 1013 }, { "epoch": 0.4046288906624102, "grad_norm": 0.23572944104671478, "learning_rate": 4.911975884626549e-05, "loss": 0.3982, "num_tokens": 132503560.0, "step": 1014 }, { "epoch": 0.40502793296089384, "grad_norm": 0.24068380892276764, "learning_rate": 4.911701551243267e-05, "loss": 0.4528, "num_tokens": 132634632.0, "step": 1015 }, { "epoch": 0.4054269752593775, "grad_norm": 0.21879500150680542, "learning_rate": 4.9114267995908855e-05, "loss": 0.3672, "num_tokens": 132765704.0, "step": 1016 }, { "epoch": 0.40582601755786113, "grad_norm": 0.2403414249420166, "learning_rate": 4.911151629722567e-05, "loss": 0.4004, "num_tokens": 132896776.0, "step": 1017 }, { "epoch": 0.4062250598563448, "grad_norm": 0.2358921468257904, "learning_rate": 4.910876041691555e-05, "loss": 0.38, "num_tokens": 133026869.0, "step": 1018 }, { "epoch": 0.4066241021548284, "grad_norm": 0.23308353126049042, "learning_rate": 4.910600035551171e-05, "loss": 0.4041, "num_tokens": 133157941.0, "step": 1019 }, { "epoch": 0.40702314445331206, "grad_norm": 0.22102466225624084, "learning_rate": 4.9103236113548215e-05, "loss": 0.3629, "num_tokens": 133289013.0, "step": 1020 }, { "epoch": 0.4074221867517957, "grad_norm": 0.22629860043525696, "learning_rate": 4.910046769155992e-05, "loss": 0.3886, "num_tokens": 133420085.0, "step": 1021 }, { "epoch": 0.40782122905027934, "grad_norm": 0.27576494216918945, "learning_rate": 4.9097695090082485e-05, "loss": 0.4258, "num_tokens": 133551157.0, "step": 1022 }, { "epoch": 0.408220271348763, "grad_norm": 0.24576598405838013, "learning_rate": 4.909491830965237e-05, "loss": 0.4142, "num_tokens": 133682229.0, "step": 1023 }, { "epoch": 0.40861931364724663, "grad_norm": 0.27200135588645935, "learning_rate": 4.909213735080688e-05, "loss": 0.4691, "num_tokens": 133813301.0, "step": 1024 }, { "epoch": 0.40901835594573027, "grad_norm": 0.2525199353694916, "learning_rate": 4.908935221408408e-05, "loss": 0.4302, "num_tokens": 133944373.0, "step": 1025 }, { "epoch": 0.40941739824421386, "grad_norm": 0.25072750449180603, "learning_rate": 4.908656290002289e-05, "loss": 0.4472, "num_tokens": 134075445.0, "step": 1026 }, { "epoch": 0.4098164405426975, "grad_norm": 0.25325438380241394, "learning_rate": 4.9083769409163e-05, "loss": 0.4573, "num_tokens": 134206517.0, "step": 1027 }, { "epoch": 0.41021548284118114, "grad_norm": 0.2403097301721573, "learning_rate": 4.908097174204493e-05, "loss": 0.4166, "num_tokens": 134337589.0, "step": 1028 }, { "epoch": 0.4106145251396648, "grad_norm": 0.2301785945892334, "learning_rate": 4.907816989921001e-05, "loss": 0.4144, "num_tokens": 134468661.0, "step": 1029 }, { "epoch": 0.41101356743814843, "grad_norm": 0.23710688948631287, "learning_rate": 4.907536388120036e-05, "loss": 0.39, "num_tokens": 134599733.0, "step": 1030 }, { "epoch": 0.4114126097366321, "grad_norm": 0.2352222055196762, "learning_rate": 4.907255368855892e-05, "loss": 0.4285, "num_tokens": 134730805.0, "step": 1031 }, { "epoch": 0.4118116520351157, "grad_norm": 0.2370745688676834, "learning_rate": 4.906973932182943e-05, "loss": 0.4267, "num_tokens": 134861877.0, "step": 1032 }, { "epoch": 0.41221069433359936, "grad_norm": 0.22946566343307495, "learning_rate": 4.906692078155647e-05, "loss": 0.3518, "num_tokens": 134992949.0, "step": 1033 }, { "epoch": 0.412609736632083, "grad_norm": 0.2451578825712204, "learning_rate": 4.9064098068285375e-05, "loss": 0.4501, "num_tokens": 135124021.0, "step": 1034 }, { "epoch": 0.41300877893056664, "grad_norm": 0.23352505266666412, "learning_rate": 4.9061271182562324e-05, "loss": 0.4426, "num_tokens": 135255093.0, "step": 1035 }, { "epoch": 0.4134078212290503, "grad_norm": 0.2567373216152191, "learning_rate": 4.9058440124934284e-05, "loss": 0.4696, "num_tokens": 135386165.0, "step": 1036 }, { "epoch": 0.41380686352753393, "grad_norm": 0.273186594247818, "learning_rate": 4.905560489594905e-05, "loss": 0.4449, "num_tokens": 135517237.0, "step": 1037 }, { "epoch": 0.4142059058260176, "grad_norm": 0.24823510646820068, "learning_rate": 4.9052765496155204e-05, "loss": 0.36, "num_tokens": 135648309.0, "step": 1038 }, { "epoch": 0.4146049481245012, "grad_norm": 0.27174657583236694, "learning_rate": 4.904992192610215e-05, "loss": 0.4124, "num_tokens": 135779381.0, "step": 1039 }, { "epoch": 0.41500399042298486, "grad_norm": 0.23398683965206146, "learning_rate": 4.904707418634009e-05, "loss": 0.4241, "num_tokens": 135910453.0, "step": 1040 }, { "epoch": 0.4154030327214685, "grad_norm": 0.25720298290252686, "learning_rate": 4.904422227742003e-05, "loss": 0.4426, "num_tokens": 136041525.0, "step": 1041 }, { "epoch": 0.4158020750199521, "grad_norm": 0.2515329420566559, "learning_rate": 4.9041366199893794e-05, "loss": 0.3948, "num_tokens": 136172597.0, "step": 1042 }, { "epoch": 0.41620111731843573, "grad_norm": 0.2209288477897644, "learning_rate": 4.9038505954314e-05, "loss": 0.3603, "num_tokens": 136303669.0, "step": 1043 }, { "epoch": 0.4166001596169194, "grad_norm": 0.20726008713245392, "learning_rate": 4.903564154123409e-05, "loss": 0.3632, "num_tokens": 136434741.0, "step": 1044 }, { "epoch": 0.416999201915403, "grad_norm": 0.23646280169487, "learning_rate": 4.90327729612083e-05, "loss": 0.426, "num_tokens": 136565813.0, "step": 1045 }, { "epoch": 0.41739824421388666, "grad_norm": 0.22986525297164917, "learning_rate": 4.9029900214791655e-05, "loss": 0.341, "num_tokens": 136696885.0, "step": 1046 }, { "epoch": 0.4177972865123703, "grad_norm": 0.2186344563961029, "learning_rate": 4.902702330254002e-05, "loss": 0.3543, "num_tokens": 136827957.0, "step": 1047 }, { "epoch": 0.41819632881085395, "grad_norm": 0.26891276240348816, "learning_rate": 4.902414222501005e-05, "loss": 0.4023, "num_tokens": 136945419.0, "step": 1048 }, { "epoch": 0.4185953711093376, "grad_norm": 0.25044214725494385, "learning_rate": 4.902125698275921e-05, "loss": 0.411, "num_tokens": 137076491.0, "step": 1049 }, { "epoch": 0.41899441340782123, "grad_norm": 0.2521921694278717, "learning_rate": 4.9018367576345755e-05, "loss": 0.3993, "num_tokens": 137207563.0, "step": 1050 }, { "epoch": 0.4193934557063049, "grad_norm": 0.23043562471866608, "learning_rate": 4.9015474006328773e-05, "loss": 0.3309, "num_tokens": 137338635.0, "step": 1051 }, { "epoch": 0.4197924980047885, "grad_norm": 0.2551150321960449, "learning_rate": 4.901257627326813e-05, "loss": 0.3925, "num_tokens": 137469707.0, "step": 1052 }, { "epoch": 0.42019154030327216, "grad_norm": 0.2564396262168884, "learning_rate": 4.900967437772452e-05, "loss": 0.409, "num_tokens": 137600779.0, "step": 1053 }, { "epoch": 0.4205905826017558, "grad_norm": 0.22078345715999603, "learning_rate": 4.900676832025942e-05, "loss": 0.3892, "num_tokens": 137731851.0, "step": 1054 }, { "epoch": 0.42098962490023945, "grad_norm": 0.24574358761310577, "learning_rate": 4.900385810143515e-05, "loss": 0.3625, "num_tokens": 137862923.0, "step": 1055 }, { "epoch": 0.4213886671987231, "grad_norm": 0.23114065825939178, "learning_rate": 4.900094372181478e-05, "loss": 0.3878, "num_tokens": 137993995.0, "step": 1056 }, { "epoch": 0.42178770949720673, "grad_norm": 0.22873620688915253, "learning_rate": 4.899802518196223e-05, "loss": 0.4051, "num_tokens": 138125067.0, "step": 1057 }, { "epoch": 0.4221867517956903, "grad_norm": 0.22453325986862183, "learning_rate": 4.899510248244221e-05, "loss": 0.4077, "num_tokens": 138256139.0, "step": 1058 }, { "epoch": 0.42258579409417396, "grad_norm": 0.24396121501922607, "learning_rate": 4.899217562382024e-05, "loss": 0.4022, "num_tokens": 138387211.0, "step": 1059 }, { "epoch": 0.4229848363926576, "grad_norm": 0.24797052145004272, "learning_rate": 4.8989244606662624e-05, "loss": 0.4841, "num_tokens": 138518283.0, "step": 1060 }, { "epoch": 0.42338387869114125, "grad_norm": 0.2738354206085205, "learning_rate": 4.898630943153651e-05, "loss": 0.4522, "num_tokens": 138649355.0, "step": 1061 }, { "epoch": 0.4237829209896249, "grad_norm": 0.2160312980413437, "learning_rate": 4.8983370099009804e-05, "loss": 0.3878, "num_tokens": 138780427.0, "step": 1062 }, { "epoch": 0.42418196328810853, "grad_norm": 0.22961612045764923, "learning_rate": 4.8980426609651246e-05, "loss": 0.409, "num_tokens": 138910089.0, "step": 1063 }, { "epoch": 0.4245810055865922, "grad_norm": 0.25964322686195374, "learning_rate": 4.897747896403039e-05, "loss": 0.4591, "num_tokens": 139041161.0, "step": 1064 }, { "epoch": 0.4249800478850758, "grad_norm": 0.2491307556629181, "learning_rate": 4.897452716271756e-05, "loss": 0.4368, "num_tokens": 139172233.0, "step": 1065 }, { "epoch": 0.42537909018355946, "grad_norm": 0.2521800994873047, "learning_rate": 4.89715712062839e-05, "loss": 0.4366, "num_tokens": 139303305.0, "step": 1066 }, { "epoch": 0.4257781324820431, "grad_norm": 0.2734006941318512, "learning_rate": 4.896861109530138e-05, "loss": 0.4971, "num_tokens": 139434377.0, "step": 1067 }, { "epoch": 0.42617717478052675, "grad_norm": 0.24202591180801392, "learning_rate": 4.896564683034273e-05, "loss": 0.4093, "num_tokens": 139565449.0, "step": 1068 }, { "epoch": 0.4265762170790104, "grad_norm": 0.24202241003513336, "learning_rate": 4.896267841198152e-05, "loss": 0.4034, "num_tokens": 139696521.0, "step": 1069 }, { "epoch": 0.42697525937749403, "grad_norm": 0.19214555621147156, "learning_rate": 4.8959705840792115e-05, "loss": 0.3142, "num_tokens": 139827593.0, "step": 1070 }, { "epoch": 0.4273743016759777, "grad_norm": 0.21968093514442444, "learning_rate": 4.895672911734968e-05, "loss": 0.359, "num_tokens": 139958665.0, "step": 1071 }, { "epoch": 0.4277733439744613, "grad_norm": 0.22096432745456696, "learning_rate": 4.895374824223016e-05, "loss": 0.3746, "num_tokens": 140089737.0, "step": 1072 }, { "epoch": 0.4281723862729449, "grad_norm": 0.22386065125465393, "learning_rate": 4.8950763216010353e-05, "loss": 0.4023, "num_tokens": 140220809.0, "step": 1073 }, { "epoch": 0.42857142857142855, "grad_norm": 0.2148187756538391, "learning_rate": 4.894777403926783e-05, "loss": 0.3489, "num_tokens": 140351881.0, "step": 1074 }, { "epoch": 0.4289704708699122, "grad_norm": 0.203452005982399, "learning_rate": 4.894478071258096e-05, "loss": 0.2906, "num_tokens": 140482953.0, "step": 1075 }, { "epoch": 0.42936951316839583, "grad_norm": 0.2658850848674774, "learning_rate": 4.894178323652892e-05, "loss": 0.4355, "num_tokens": 140614025.0, "step": 1076 }, { "epoch": 0.4297685554668795, "grad_norm": 0.2723584771156311, "learning_rate": 4.8938781611691715e-05, "loss": 0.4339, "num_tokens": 140745097.0, "step": 1077 }, { "epoch": 0.4301675977653631, "grad_norm": 0.25403282046318054, "learning_rate": 4.893577583865011e-05, "loss": 0.4332, "num_tokens": 140876169.0, "step": 1078 }, { "epoch": 0.43056664006384676, "grad_norm": 0.22321395576000214, "learning_rate": 4.893276591798571e-05, "loss": 0.3647, "num_tokens": 141007241.0, "step": 1079 }, { "epoch": 0.4309656823623304, "grad_norm": 0.25128188729286194, "learning_rate": 4.8929751850280885e-05, "loss": 0.4226, "num_tokens": 141138313.0, "step": 1080 }, { "epoch": 0.43136472466081405, "grad_norm": 0.23274032771587372, "learning_rate": 4.8926733636118854e-05, "loss": 0.3867, "num_tokens": 141269385.0, "step": 1081 }, { "epoch": 0.4317637669592977, "grad_norm": 0.2553775906562805, "learning_rate": 4.8923711276083597e-05, "loss": 0.4055, "num_tokens": 141400457.0, "step": 1082 }, { "epoch": 0.43216280925778133, "grad_norm": 0.229497030377388, "learning_rate": 4.892068477075992e-05, "loss": 0.3994, "num_tokens": 141531529.0, "step": 1083 }, { "epoch": 0.432561851556265, "grad_norm": 0.2526293098926544, "learning_rate": 4.891765412073342e-05, "loss": 0.458, "num_tokens": 141662601.0, "step": 1084 }, { "epoch": 0.4329608938547486, "grad_norm": 0.22455525398254395, "learning_rate": 4.891461932659049e-05, "loss": 0.3626, "num_tokens": 141793673.0, "step": 1085 }, { "epoch": 0.43335993615323226, "grad_norm": 0.2224312573671341, "learning_rate": 4.891158038891836e-05, "loss": 0.3843, "num_tokens": 141924745.0, "step": 1086 }, { "epoch": 0.4337589784517159, "grad_norm": 0.236037939786911, "learning_rate": 4.890853730830501e-05, "loss": 0.3972, "num_tokens": 142055817.0, "step": 1087 }, { "epoch": 0.43415802075019955, "grad_norm": 0.22449268400669098, "learning_rate": 4.8905490085339275e-05, "loss": 0.4068, "num_tokens": 142186889.0, "step": 1088 }, { "epoch": 0.43455706304868313, "grad_norm": 0.2394649088382721, "learning_rate": 4.890243872061073e-05, "loss": 0.4413, "num_tokens": 142317961.0, "step": 1089 }, { "epoch": 0.4349561053471668, "grad_norm": 0.2352701872587204, "learning_rate": 4.8899383214709816e-05, "loss": 0.3883, "num_tokens": 142449033.0, "step": 1090 }, { "epoch": 0.4353551476456504, "grad_norm": 0.24530865252017975, "learning_rate": 4.889632356822773e-05, "loss": 0.428, "num_tokens": 142580105.0, "step": 1091 }, { "epoch": 0.43575418994413406, "grad_norm": 0.23010948300361633, "learning_rate": 4.889325978175648e-05, "loss": 0.3836, "num_tokens": 142711177.0, "step": 1092 }, { "epoch": 0.4361532322426177, "grad_norm": 0.23871104419231415, "learning_rate": 4.889019185588889e-05, "loss": 0.3604, "num_tokens": 142842249.0, "step": 1093 }, { "epoch": 0.43655227454110135, "grad_norm": 0.23465506732463837, "learning_rate": 4.888711979121858e-05, "loss": 0.382, "num_tokens": 142973321.0, "step": 1094 }, { "epoch": 0.436951316839585, "grad_norm": 0.2650533616542816, "learning_rate": 4.888404358833995e-05, "loss": 0.4141, "num_tokens": 143104393.0, "step": 1095 }, { "epoch": 0.43735035913806863, "grad_norm": 0.25871187448501587, "learning_rate": 4.888096324784823e-05, "loss": 0.4329, "num_tokens": 143235465.0, "step": 1096 }, { "epoch": 0.4377494014365523, "grad_norm": 0.23703524470329285, "learning_rate": 4.8877878770339435e-05, "loss": 0.3692, "num_tokens": 143366537.0, "step": 1097 }, { "epoch": 0.4381484437350359, "grad_norm": 0.2427586317062378, "learning_rate": 4.8874790156410374e-05, "loss": 0.4253, "num_tokens": 143497609.0, "step": 1098 }, { "epoch": 0.43854748603351956, "grad_norm": 0.22351501882076263, "learning_rate": 4.8871697406658675e-05, "loss": 0.3793, "num_tokens": 143628681.0, "step": 1099 }, { "epoch": 0.4389465283320032, "grad_norm": 0.2258119136095047, "learning_rate": 4.886860052168275e-05, "loss": 0.3911, "num_tokens": 143759753.0, "step": 1100 }, { "epoch": 0.43934557063048685, "grad_norm": 0.24991117417812347, "learning_rate": 4.886549950208182e-05, "loss": 0.4453, "num_tokens": 143890825.0, "step": 1101 }, { "epoch": 0.4397446129289705, "grad_norm": 0.2270672768354416, "learning_rate": 4.88623943484559e-05, "loss": 0.4206, "num_tokens": 144021897.0, "step": 1102 }, { "epoch": 0.44014365522745413, "grad_norm": 0.21994784474372864, "learning_rate": 4.885928506140582e-05, "loss": 0.4227, "num_tokens": 144151827.0, "step": 1103 }, { "epoch": 0.4405426975259378, "grad_norm": 0.24185465276241302, "learning_rate": 4.8856171641533175e-05, "loss": 0.413, "num_tokens": 144268398.0, "step": 1104 }, { "epoch": 0.44094173982442136, "grad_norm": 0.23951034247875214, "learning_rate": 4.885305408944041e-05, "loss": 0.3977, "num_tokens": 144399470.0, "step": 1105 }, { "epoch": 0.441340782122905, "grad_norm": 0.24190612137317657, "learning_rate": 4.884993240573072e-05, "loss": 0.3929, "num_tokens": 144530542.0, "step": 1106 }, { "epoch": 0.44173982442138865, "grad_norm": 0.23221945762634277, "learning_rate": 4.884680659100814e-05, "loss": 0.4337, "num_tokens": 144657481.0, "step": 1107 }, { "epoch": 0.4421388667198723, "grad_norm": 0.23172974586486816, "learning_rate": 4.884367664587747e-05, "loss": 0.3598, "num_tokens": 144788553.0, "step": 1108 }, { "epoch": 0.44253790901835593, "grad_norm": 0.25297021865844727, "learning_rate": 4.884054257094434e-05, "loss": 0.3833, "num_tokens": 144919625.0, "step": 1109 }, { "epoch": 0.4429369513168396, "grad_norm": 0.23750896751880646, "learning_rate": 4.883740436681515e-05, "loss": 0.4037, "num_tokens": 145050697.0, "step": 1110 }, { "epoch": 0.4433359936153232, "grad_norm": 0.26143962144851685, "learning_rate": 4.8834262034097125e-05, "loss": 0.4656, "num_tokens": 145180297.0, "step": 1111 }, { "epoch": 0.44373503591380686, "grad_norm": 0.24561218917369843, "learning_rate": 4.8831115573398264e-05, "loss": 0.4393, "num_tokens": 145311369.0, "step": 1112 }, { "epoch": 0.4441340782122905, "grad_norm": 0.24085873365402222, "learning_rate": 4.8827964985327395e-05, "loss": 0.3879, "num_tokens": 145442441.0, "step": 1113 }, { "epoch": 0.44453312051077415, "grad_norm": 0.2374010533094406, "learning_rate": 4.882481027049411e-05, "loss": 0.4039, "num_tokens": 145573513.0, "step": 1114 }, { "epoch": 0.4449321628092578, "grad_norm": 0.22538204491138458, "learning_rate": 4.882165142950883e-05, "loss": 0.3766, "num_tokens": 145704585.0, "step": 1115 }, { "epoch": 0.44533120510774143, "grad_norm": 0.24725662171840668, "learning_rate": 4.8818488462982755e-05, "loss": 0.3801, "num_tokens": 145835657.0, "step": 1116 }, { "epoch": 0.4457302474062251, "grad_norm": 0.23480761051177979, "learning_rate": 4.881532137152789e-05, "loss": 0.3863, "num_tokens": 145966729.0, "step": 1117 }, { "epoch": 0.4461292897047087, "grad_norm": 0.2553716003894806, "learning_rate": 4.881215015575704e-05, "loss": 0.4162, "num_tokens": 146097801.0, "step": 1118 }, { "epoch": 0.44652833200319236, "grad_norm": 0.22007077932357788, "learning_rate": 4.88089748162838e-05, "loss": 0.3471, "num_tokens": 146228873.0, "step": 1119 }, { "epoch": 0.44692737430167595, "grad_norm": 0.2464483678340912, "learning_rate": 4.880579535372258e-05, "loss": 0.4239, "num_tokens": 146359945.0, "step": 1120 }, { "epoch": 0.4473264166001596, "grad_norm": 0.3099668323993683, "learning_rate": 4.880261176868857e-05, "loss": 0.4111, "num_tokens": 146489205.0, "step": 1121 }, { "epoch": 0.44772545889864324, "grad_norm": 0.2482464611530304, "learning_rate": 4.879942406179775e-05, "loss": 0.4138, "num_tokens": 146620277.0, "step": 1122 }, { "epoch": 0.4481245011971269, "grad_norm": 0.30161136388778687, "learning_rate": 4.8796232233666933e-05, "loss": 0.4415, "num_tokens": 146751349.0, "step": 1123 }, { "epoch": 0.4485235434956105, "grad_norm": 0.25428661704063416, "learning_rate": 4.87930362849137e-05, "loss": 0.4048, "num_tokens": 146882421.0, "step": 1124 }, { "epoch": 0.44892258579409416, "grad_norm": 0.25466498732566833, "learning_rate": 4.878983621615644e-05, "loss": 0.4252, "num_tokens": 147013493.0, "step": 1125 }, { "epoch": 0.4493216280925778, "grad_norm": 0.21333393454551697, "learning_rate": 4.8786632028014326e-05, "loss": 0.3812, "num_tokens": 147144565.0, "step": 1126 }, { "epoch": 0.44972067039106145, "grad_norm": 0.23273368179798126, "learning_rate": 4.878342372110735e-05, "loss": 0.3919, "num_tokens": 147275637.0, "step": 1127 }, { "epoch": 0.4501197126895451, "grad_norm": 0.2421995848417282, "learning_rate": 4.878021129605629e-05, "loss": 0.4096, "num_tokens": 147406709.0, "step": 1128 }, { "epoch": 0.45051875498802874, "grad_norm": 0.27584925293922424, "learning_rate": 4.877699475348271e-05, "loss": 0.4612, "num_tokens": 147537781.0, "step": 1129 }, { "epoch": 0.4509177972865124, "grad_norm": 0.21789047122001648, "learning_rate": 4.877377409400899e-05, "loss": 0.384, "num_tokens": 147668853.0, "step": 1130 }, { "epoch": 0.451316839584996, "grad_norm": 0.24938376247882843, "learning_rate": 4.877054931825829e-05, "loss": 0.4701, "num_tokens": 147799925.0, "step": 1131 }, { "epoch": 0.45171588188347966, "grad_norm": 0.21768148243427277, "learning_rate": 4.876732042685459e-05, "loss": 0.3954, "num_tokens": 147930997.0, "step": 1132 }, { "epoch": 0.4521149241819633, "grad_norm": 0.23633582890033722, "learning_rate": 4.876408742042263e-05, "loss": 0.4153, "num_tokens": 148062069.0, "step": 1133 }, { "epoch": 0.45251396648044695, "grad_norm": 0.2508683204650879, "learning_rate": 4.876085029958798e-05, "loss": 0.4108, "num_tokens": 148193141.0, "step": 1134 }, { "epoch": 0.4529130087789306, "grad_norm": 0.2344217598438263, "learning_rate": 4.8757609064976975e-05, "loss": 0.4004, "num_tokens": 148324213.0, "step": 1135 }, { "epoch": 0.4533120510774142, "grad_norm": 0.23961478471755981, "learning_rate": 4.875436371721679e-05, "loss": 0.4031, "num_tokens": 148455285.0, "step": 1136 }, { "epoch": 0.4537110933758978, "grad_norm": 0.24788688123226166, "learning_rate": 4.875111425693535e-05, "loss": 0.4292, "num_tokens": 148586357.0, "step": 1137 }, { "epoch": 0.45411013567438147, "grad_norm": 0.23069536685943604, "learning_rate": 4.8747860684761396e-05, "loss": 0.4045, "num_tokens": 148717429.0, "step": 1138 }, { "epoch": 0.4545091779728651, "grad_norm": 0.25887465476989746, "learning_rate": 4.874460300132448e-05, "loss": 0.4345, "num_tokens": 148848501.0, "step": 1139 }, { "epoch": 0.45490822027134875, "grad_norm": 0.27271223068237305, "learning_rate": 4.874134120725492e-05, "loss": 0.434, "num_tokens": 148979573.0, "step": 1140 }, { "epoch": 0.4553072625698324, "grad_norm": 0.23575612902641296, "learning_rate": 4.873807530318385e-05, "loss": 0.3906, "num_tokens": 149110645.0, "step": 1141 }, { "epoch": 0.45570630486831604, "grad_norm": 0.22991454601287842, "learning_rate": 4.873480528974318e-05, "loss": 0.3736, "num_tokens": 149241717.0, "step": 1142 }, { "epoch": 0.4561053471667997, "grad_norm": 0.2487708181142807, "learning_rate": 4.873153116756564e-05, "loss": 0.4383, "num_tokens": 149372789.0, "step": 1143 }, { "epoch": 0.4565043894652833, "grad_norm": 0.2492000311613083, "learning_rate": 4.872825293728474e-05, "loss": 0.4361, "num_tokens": 149503861.0, "step": 1144 }, { "epoch": 0.45690343176376697, "grad_norm": 0.2408650666475296, "learning_rate": 4.8724970599534775e-05, "loss": 0.3617, "num_tokens": 149634933.0, "step": 1145 }, { "epoch": 0.4573024740622506, "grad_norm": 0.2174694985151291, "learning_rate": 4.8721684154950855e-05, "loss": 0.3809, "num_tokens": 149766005.0, "step": 1146 }, { "epoch": 0.45770151636073425, "grad_norm": 0.2518158257007599, "learning_rate": 4.871839360416888e-05, "loss": 0.4721, "num_tokens": 149897077.0, "step": 1147 }, { "epoch": 0.4581005586592179, "grad_norm": 0.24100364744663239, "learning_rate": 4.871509894782554e-05, "loss": 0.427, "num_tokens": 150028149.0, "step": 1148 }, { "epoch": 0.45849960095770154, "grad_norm": 0.231802299618721, "learning_rate": 4.8711800186558326e-05, "loss": 0.4141, "num_tokens": 150159221.0, "step": 1149 }, { "epoch": 0.4588986432561852, "grad_norm": 0.19730067253112793, "learning_rate": 4.87084973210055e-05, "loss": 0.3301, "num_tokens": 150290293.0, "step": 1150 }, { "epoch": 0.4592976855546688, "grad_norm": 0.22622589766979218, "learning_rate": 4.870519035180615e-05, "loss": 0.3741, "num_tokens": 150421365.0, "step": 1151 }, { "epoch": 0.4596967278531524, "grad_norm": 0.24316184222698212, "learning_rate": 4.870187927960014e-05, "loss": 0.3993, "num_tokens": 150552437.0, "step": 1152 }, { "epoch": 0.46009577015163605, "grad_norm": 0.21177811920642853, "learning_rate": 4.8698564105028125e-05, "loss": 0.3167, "num_tokens": 150683509.0, "step": 1153 }, { "epoch": 0.4604948124501197, "grad_norm": 0.22367839515209198, "learning_rate": 4.869524482873156e-05, "loss": 0.3502, "num_tokens": 150814581.0, "step": 1154 }, { "epoch": 0.46089385474860334, "grad_norm": 0.22067248821258545, "learning_rate": 4.869192145135271e-05, "loss": 0.3681, "num_tokens": 150945653.0, "step": 1155 }, { "epoch": 0.461292897047087, "grad_norm": 0.22458504140377045, "learning_rate": 4.868859397353461e-05, "loss": 0.3734, "num_tokens": 151076725.0, "step": 1156 }, { "epoch": 0.4616919393455706, "grad_norm": 0.2533625066280365, "learning_rate": 4.868526239592109e-05, "loss": 0.4273, "num_tokens": 151207797.0, "step": 1157 }, { "epoch": 0.46209098164405427, "grad_norm": 0.2608823776245117, "learning_rate": 4.868192671915678e-05, "loss": 0.419, "num_tokens": 151338869.0, "step": 1158 }, { "epoch": 0.4624900239425379, "grad_norm": 0.24207037687301636, "learning_rate": 4.8678586943887106e-05, "loss": 0.406, "num_tokens": 151469941.0, "step": 1159 }, { "epoch": 0.46288906624102155, "grad_norm": 0.44661107659339905, "learning_rate": 4.867524307075829e-05, "loss": 0.4797, "num_tokens": 151601013.0, "step": 1160 }, { "epoch": 0.4632881085395052, "grad_norm": 0.23703503608703613, "learning_rate": 4.8671895100417325e-05, "loss": 0.3736, "num_tokens": 151732085.0, "step": 1161 }, { "epoch": 0.46368715083798884, "grad_norm": 0.22751738131046295, "learning_rate": 4.866854303351202e-05, "loss": 0.3746, "num_tokens": 151863157.0, "step": 1162 }, { "epoch": 0.4640861931364725, "grad_norm": 0.2765241265296936, "learning_rate": 4.866518687069097e-05, "loss": 0.3723, "num_tokens": 151994229.0, "step": 1163 }, { "epoch": 0.4644852354349561, "grad_norm": 0.2203981578350067, "learning_rate": 4.866182661260356e-05, "loss": 0.3428, "num_tokens": 152125301.0, "step": 1164 }, { "epoch": 0.46488427773343977, "grad_norm": 0.23018886148929596, "learning_rate": 4.8658462259899976e-05, "loss": 0.3928, "num_tokens": 152256373.0, "step": 1165 }, { "epoch": 0.4652833200319234, "grad_norm": 0.2314324676990509, "learning_rate": 4.865509381323117e-05, "loss": 0.358, "num_tokens": 152387445.0, "step": 1166 }, { "epoch": 0.465682362330407, "grad_norm": 0.2470618486404419, "learning_rate": 4.8651721273248923e-05, "loss": 0.4472, "num_tokens": 152518517.0, "step": 1167 }, { "epoch": 0.46608140462889064, "grad_norm": 0.21710218489170074, "learning_rate": 4.864834464060578e-05, "loss": 0.3511, "num_tokens": 152649589.0, "step": 1168 }, { "epoch": 0.4664804469273743, "grad_norm": 0.24353359639644623, "learning_rate": 4.8644963915955095e-05, "loss": 0.4089, "num_tokens": 152780661.0, "step": 1169 }, { "epoch": 0.4668794892258579, "grad_norm": 0.23200036585330963, "learning_rate": 4.864157909995101e-05, "loss": 0.4017, "num_tokens": 152911733.0, "step": 1170 }, { "epoch": 0.46727853152434157, "grad_norm": 0.23016157746315002, "learning_rate": 4.8638190193248436e-05, "loss": 0.3827, "num_tokens": 153042805.0, "step": 1171 }, { "epoch": 0.4676775738228252, "grad_norm": 0.22969157993793488, "learning_rate": 4.863479719650311e-05, "loss": 0.4117, "num_tokens": 153173877.0, "step": 1172 }, { "epoch": 0.46807661612130885, "grad_norm": 0.22814521193504333, "learning_rate": 4.863140011037154e-05, "loss": 0.3865, "num_tokens": 153304949.0, "step": 1173 }, { "epoch": 0.4684756584197925, "grad_norm": 0.2251121997833252, "learning_rate": 4.862799893551104e-05, "loss": 0.3583, "num_tokens": 153436021.0, "step": 1174 }, { "epoch": 0.46887470071827614, "grad_norm": 0.26610344648361206, "learning_rate": 4.86245936725797e-05, "loss": 0.4506, "num_tokens": 153567093.0, "step": 1175 }, { "epoch": 0.4692737430167598, "grad_norm": 0.23062559962272644, "learning_rate": 4.8621184322236396e-05, "loss": 0.4062, "num_tokens": 153698165.0, "step": 1176 }, { "epoch": 0.4696727853152434, "grad_norm": 0.21700796484947205, "learning_rate": 4.8617770885140816e-05, "loss": 0.3836, "num_tokens": 153829237.0, "step": 1177 }, { "epoch": 0.47007182761372707, "grad_norm": 0.24465042352676392, "learning_rate": 4.861435336195343e-05, "loss": 0.4216, "num_tokens": 153960309.0, "step": 1178 }, { "epoch": 0.4704708699122107, "grad_norm": 0.22411847114562988, "learning_rate": 4.8610931753335495e-05, "loss": 0.3769, "num_tokens": 154091381.0, "step": 1179 }, { "epoch": 0.47086991221069435, "grad_norm": 0.28318947553634644, "learning_rate": 4.8607506059949056e-05, "loss": 0.3536, "num_tokens": 154222453.0, "step": 1180 }, { "epoch": 0.471268954509178, "grad_norm": 0.21238596737384796, "learning_rate": 4.860407628245696e-05, "loss": 0.3654, "num_tokens": 154353525.0, "step": 1181 }, { "epoch": 0.47166799680766164, "grad_norm": 0.22183044254779816, "learning_rate": 4.8600642421522834e-05, "loss": 0.3778, "num_tokens": 154484597.0, "step": 1182 }, { "epoch": 0.4720670391061452, "grad_norm": 0.23275914788246155, "learning_rate": 4.8597204477811096e-05, "loss": 0.3817, "num_tokens": 154615669.0, "step": 1183 }, { "epoch": 0.47246608140462887, "grad_norm": 0.2358921468257904, "learning_rate": 4.859376245198695e-05, "loss": 0.3819, "num_tokens": 154746741.0, "step": 1184 }, { "epoch": 0.4728651237031125, "grad_norm": 0.22727173566818237, "learning_rate": 4.859031634471642e-05, "loss": 0.3607, "num_tokens": 154877813.0, "step": 1185 }, { "epoch": 0.47326416600159615, "grad_norm": 0.23894914984703064, "learning_rate": 4.858686615666626e-05, "loss": 0.3828, "num_tokens": 155008885.0, "step": 1186 }, { "epoch": 0.4736632083000798, "grad_norm": 0.2380039542913437, "learning_rate": 4.858341188850408e-05, "loss": 0.3854, "num_tokens": 155139957.0, "step": 1187 }, { "epoch": 0.47406225059856344, "grad_norm": 0.2671810984611511, "learning_rate": 4.8579953540898236e-05, "loss": 0.3996, "num_tokens": 155271029.0, "step": 1188 }, { "epoch": 0.4744612928970471, "grad_norm": 0.2476036250591278, "learning_rate": 4.857649111451788e-05, "loss": 0.3684, "num_tokens": 155402101.0, "step": 1189 }, { "epoch": 0.4748603351955307, "grad_norm": 0.2760142982006073, "learning_rate": 4.857302461003297e-05, "loss": 0.4215, "num_tokens": 155533173.0, "step": 1190 }, { "epoch": 0.47525937749401437, "grad_norm": 0.29001039266586304, "learning_rate": 4.8569554028114234e-05, "loss": 0.3801, "num_tokens": 155664245.0, "step": 1191 }, { "epoch": 0.475658419792498, "grad_norm": 0.23544996976852417, "learning_rate": 4.85660793694332e-05, "loss": 0.3659, "num_tokens": 155795317.0, "step": 1192 }, { "epoch": 0.47605746209098165, "grad_norm": 0.24226470291614532, "learning_rate": 4.85626006346622e-05, "loss": 0.4028, "num_tokens": 155926389.0, "step": 1193 }, { "epoch": 0.4764565043894653, "grad_norm": 0.2223259061574936, "learning_rate": 4.8559117824474306e-05, "loss": 0.3722, "num_tokens": 156057461.0, "step": 1194 }, { "epoch": 0.47685554668794894, "grad_norm": 0.2372438758611679, "learning_rate": 4.855563093954342e-05, "loss": 0.4174, "num_tokens": 156188533.0, "step": 1195 }, { "epoch": 0.4772545889864326, "grad_norm": 0.23429839313030243, "learning_rate": 4.855213998054424e-05, "loss": 0.4234, "num_tokens": 156319605.0, "step": 1196 }, { "epoch": 0.4776536312849162, "grad_norm": 0.22284162044525146, "learning_rate": 4.8548644948152206e-05, "loss": 0.3841, "num_tokens": 156450677.0, "step": 1197 }, { "epoch": 0.47805267358339987, "grad_norm": 0.21766556799411774, "learning_rate": 4.854514584304359e-05, "loss": 0.3464, "num_tokens": 156581749.0, "step": 1198 }, { "epoch": 0.47845171588188345, "grad_norm": 0.2587299048900604, "learning_rate": 4.854164266589544e-05, "loss": 0.4076, "num_tokens": 156708486.0, "step": 1199 }, { "epoch": 0.4788507581803671, "grad_norm": 0.22816893458366394, "learning_rate": 4.853813541738558e-05, "loss": 0.3971, "num_tokens": 156839558.0, "step": 1200 }, { "epoch": 0.47924980047885074, "grad_norm": 0.21507836878299713, "learning_rate": 4.8534624098192627e-05, "loss": 0.3535, "num_tokens": 156970630.0, "step": 1201 }, { "epoch": 0.4796488427773344, "grad_norm": 0.24180617928504944, "learning_rate": 4.8531108708995995e-05, "loss": 0.3847, "num_tokens": 157101702.0, "step": 1202 }, { "epoch": 0.480047885075818, "grad_norm": 0.24926842749118805, "learning_rate": 4.852758925047588e-05, "loss": 0.4063, "num_tokens": 157232774.0, "step": 1203 }, { "epoch": 0.48044692737430167, "grad_norm": 0.26046881079673767, "learning_rate": 4.852406572331326e-05, "loss": 0.4046, "num_tokens": 157363846.0, "step": 1204 }, { "epoch": 0.4808459696727853, "grad_norm": 0.22125153243541718, "learning_rate": 4.8520538128189907e-05, "loss": 0.3759, "num_tokens": 157494918.0, "step": 1205 }, { "epoch": 0.48124501197126895, "grad_norm": 0.24672771990299225, "learning_rate": 4.851700646578837e-05, "loss": 0.4137, "num_tokens": 157625990.0, "step": 1206 }, { "epoch": 0.4816440542697526, "grad_norm": 0.23931366205215454, "learning_rate": 4.851347073679201e-05, "loss": 0.3777, "num_tokens": 157757062.0, "step": 1207 }, { "epoch": 0.48204309656823624, "grad_norm": 0.2882852852344513, "learning_rate": 4.850993094188495e-05, "loss": 0.5566, "num_tokens": 157888134.0, "step": 1208 }, { "epoch": 0.4824421388667199, "grad_norm": 0.22389300167560577, "learning_rate": 4.8506387081752094e-05, "loss": 0.369, "num_tokens": 158019206.0, "step": 1209 }, { "epoch": 0.4828411811652035, "grad_norm": 0.2215418964624405, "learning_rate": 4.850283915707916e-05, "loss": 0.3644, "num_tokens": 158150278.0, "step": 1210 }, { "epoch": 0.48324022346368717, "grad_norm": 0.24570772051811218, "learning_rate": 4.8499287168552634e-05, "loss": 0.4052, "num_tokens": 158281350.0, "step": 1211 }, { "epoch": 0.4836392657621708, "grad_norm": 0.26873064041137695, "learning_rate": 4.84957311168598e-05, "loss": 0.451, "num_tokens": 158412422.0, "step": 1212 }, { "epoch": 0.48403830806065445, "grad_norm": 0.2283007800579071, "learning_rate": 4.849217100268871e-05, "loss": 0.3958, "num_tokens": 158543494.0, "step": 1213 }, { "epoch": 0.48443735035913804, "grad_norm": 0.21148055791854858, "learning_rate": 4.848860682672822e-05, "loss": 0.36, "num_tokens": 158674566.0, "step": 1214 }, { "epoch": 0.4848363926576217, "grad_norm": 0.21775299310684204, "learning_rate": 4.848503858966796e-05, "loss": 0.3566, "num_tokens": 158805638.0, "step": 1215 }, { "epoch": 0.4852354349561053, "grad_norm": 0.23346640169620514, "learning_rate": 4.848146629219836e-05, "loss": 0.4216, "num_tokens": 158936710.0, "step": 1216 }, { "epoch": 0.48563447725458897, "grad_norm": 0.24055804312229156, "learning_rate": 4.847788993501061e-05, "loss": 0.3968, "num_tokens": 159067782.0, "step": 1217 }, { "epoch": 0.4860335195530726, "grad_norm": 0.26227879524230957, "learning_rate": 4.847430951879671e-05, "loss": 0.44, "num_tokens": 159198854.0, "step": 1218 }, { "epoch": 0.48643256185155626, "grad_norm": 0.26303520798683167, "learning_rate": 4.847072504424944e-05, "loss": 0.4374, "num_tokens": 159329926.0, "step": 1219 }, { "epoch": 0.4868316041500399, "grad_norm": 0.21289582550525665, "learning_rate": 4.8467136512062364e-05, "loss": 0.3591, "num_tokens": 159460998.0, "step": 1220 }, { "epoch": 0.48723064644852354, "grad_norm": 0.25218823552131653, "learning_rate": 4.846354392292982e-05, "loss": 0.4482, "num_tokens": 159592070.0, "step": 1221 }, { "epoch": 0.4876296887470072, "grad_norm": 0.2215857356786728, "learning_rate": 4.8459947277546956e-05, "loss": 0.3656, "num_tokens": 159723142.0, "step": 1222 }, { "epoch": 0.4880287310454908, "grad_norm": 0.23712539672851562, "learning_rate": 4.8456346576609674e-05, "loss": 0.4067, "num_tokens": 159854214.0, "step": 1223 }, { "epoch": 0.48842777334397447, "grad_norm": 0.23874986171722412, "learning_rate": 4.8452741820814686e-05, "loss": 0.4362, "num_tokens": 159985286.0, "step": 1224 }, { "epoch": 0.4888268156424581, "grad_norm": 0.25946590304374695, "learning_rate": 4.844913301085947e-05, "loss": 0.4138, "num_tokens": 160100581.0, "step": 1225 }, { "epoch": 0.48922585794094176, "grad_norm": 0.2225686013698578, "learning_rate": 4.8445520147442304e-05, "loss": 0.3764, "num_tokens": 160231653.0, "step": 1226 }, { "epoch": 0.4896249002394254, "grad_norm": 0.2161647081375122, "learning_rate": 4.844190323126224e-05, "loss": 0.3461, "num_tokens": 160362725.0, "step": 1227 }, { "epoch": 0.49002394253790904, "grad_norm": 0.2299918532371521, "learning_rate": 4.8438282263019116e-05, "loss": 0.3733, "num_tokens": 160493797.0, "step": 1228 }, { "epoch": 0.4904229848363927, "grad_norm": 0.2587261199951172, "learning_rate": 4.843465724341357e-05, "loss": 0.3975, "num_tokens": 160624869.0, "step": 1229 }, { "epoch": 0.49082202713487627, "grad_norm": 0.28226250410079956, "learning_rate": 4.843102817314698e-05, "loss": 0.4605, "num_tokens": 160755941.0, "step": 1230 }, { "epoch": 0.4912210694333599, "grad_norm": 0.2540723383426666, "learning_rate": 4.8427395052921564e-05, "loss": 0.4063, "num_tokens": 160887013.0, "step": 1231 }, { "epoch": 0.49162011173184356, "grad_norm": 0.23127397894859314, "learning_rate": 4.842375788344029e-05, "loss": 0.3809, "num_tokens": 161018085.0, "step": 1232 }, { "epoch": 0.4920191540303272, "grad_norm": 0.25553974509239197, "learning_rate": 4.8420116665406916e-05, "loss": 0.4023, "num_tokens": 161149157.0, "step": 1233 }, { "epoch": 0.49241819632881084, "grad_norm": 0.2498599886894226, "learning_rate": 4.841647139952598e-05, "loss": 0.3909, "num_tokens": 161280229.0, "step": 1234 }, { "epoch": 0.4928172386272945, "grad_norm": 0.23148907721042633, "learning_rate": 4.8412822086502805e-05, "loss": 0.3682, "num_tokens": 161411301.0, "step": 1235 }, { "epoch": 0.4932162809257781, "grad_norm": 0.24046923220157623, "learning_rate": 4.840916872704351e-05, "loss": 0.4154, "num_tokens": 161542373.0, "step": 1236 }, { "epoch": 0.49361532322426177, "grad_norm": 0.2547067105770111, "learning_rate": 4.8405511321854976e-05, "loss": 0.4363, "num_tokens": 161673445.0, "step": 1237 }, { "epoch": 0.4940143655227454, "grad_norm": 0.2411329448223114, "learning_rate": 4.840184987164488e-05, "loss": 0.3874, "num_tokens": 161804517.0, "step": 1238 }, { "epoch": 0.49441340782122906, "grad_norm": 0.2042350471019745, "learning_rate": 4.839818437712167e-05, "loss": 0.3049, "num_tokens": 161935589.0, "step": 1239 }, { "epoch": 0.4948124501197127, "grad_norm": 0.23247702419757843, "learning_rate": 4.83945148389946e-05, "loss": 0.3864, "num_tokens": 162066661.0, "step": 1240 }, { "epoch": 0.49521149241819634, "grad_norm": 0.26091060042381287, "learning_rate": 4.839084125797369e-05, "loss": 0.4021, "num_tokens": 162197733.0, "step": 1241 }, { "epoch": 0.49561053471668, "grad_norm": 0.27820634841918945, "learning_rate": 4.838716363476973e-05, "loss": 0.4302, "num_tokens": 162328805.0, "step": 1242 }, { "epoch": 0.4960095770151636, "grad_norm": 0.24996870756149292, "learning_rate": 4.838348197009432e-05, "loss": 0.4073, "num_tokens": 162459877.0, "step": 1243 }, { "epoch": 0.49640861931364727, "grad_norm": 0.2547282874584198, "learning_rate": 4.837979626465982e-05, "loss": 0.4173, "num_tokens": 162590949.0, "step": 1244 }, { "epoch": 0.4968076616121309, "grad_norm": 0.25101861357688904, "learning_rate": 4.837610651917938e-05, "loss": 0.3898, "num_tokens": 162722021.0, "step": 1245 }, { "epoch": 0.4972067039106145, "grad_norm": 0.22675681114196777, "learning_rate": 4.837241273436694e-05, "loss": 0.3509, "num_tokens": 162853093.0, "step": 1246 }, { "epoch": 0.49760574620909814, "grad_norm": 0.2825370728969574, "learning_rate": 4.83687149109372e-05, "loss": 0.4369, "num_tokens": 162984165.0, "step": 1247 }, { "epoch": 0.4980047885075818, "grad_norm": 0.23414410650730133, "learning_rate": 4.836501304960566e-05, "loss": 0.4035, "num_tokens": 163115237.0, "step": 1248 }, { "epoch": 0.49840383080606543, "grad_norm": 0.2260817438364029, "learning_rate": 4.83613071510886e-05, "loss": 0.3338, "num_tokens": 163246309.0, "step": 1249 }, { "epoch": 0.49880287310454907, "grad_norm": 0.24036525189876556, "learning_rate": 4.835759721610307e-05, "loss": 0.3797, "num_tokens": 163377381.0, "step": 1250 }, { "epoch": 0.4992019154030327, "grad_norm": 0.21186316013336182, "learning_rate": 4.835388324536692e-05, "loss": 0.342, "num_tokens": 163508453.0, "step": 1251 }, { "epoch": 0.49960095770151636, "grad_norm": 0.2591238021850586, "learning_rate": 4.835016523959875e-05, "loss": 0.4449, "num_tokens": 163639525.0, "step": 1252 }, { "epoch": 0.5, "grad_norm": 0.2540800869464874, "learning_rate": 4.8346443199517966e-05, "loss": 0.4045, "num_tokens": 163755058.0, "step": 1253 }, { "epoch": 0.5003990422984836, "grad_norm": 0.24651819467544556, "learning_rate": 4.8342717125844756e-05, "loss": 0.3821, "num_tokens": 163886130.0, "step": 1254 }, { "epoch": 0.5007980845969673, "grad_norm": 0.2593960762023926, "learning_rate": 4.833898701930007e-05, "loss": 0.4434, "num_tokens": 164017202.0, "step": 1255 }, { "epoch": 0.5011971268954509, "grad_norm": 0.22900858521461487, "learning_rate": 4.833525288060566e-05, "loss": 0.384, "num_tokens": 164148274.0, "step": 1256 }, { "epoch": 0.5015961691939346, "grad_norm": 0.21387997269630432, "learning_rate": 4.833151471048404e-05, "loss": 0.3538, "num_tokens": 164279346.0, "step": 1257 }, { "epoch": 0.5019952114924182, "grad_norm": 0.23189452290534973, "learning_rate": 4.8327772509658517e-05, "loss": 0.4058, "num_tokens": 164410418.0, "step": 1258 }, { "epoch": 0.5023942537909019, "grad_norm": 0.2440505474805832, "learning_rate": 4.832402627885316e-05, "loss": 0.3984, "num_tokens": 164541490.0, "step": 1259 }, { "epoch": 0.5027932960893855, "grad_norm": 0.2396366447210312, "learning_rate": 4.8320276018792835e-05, "loss": 0.4282, "num_tokens": 164672562.0, "step": 1260 }, { "epoch": 0.5031923383878691, "grad_norm": 0.21431276202201843, "learning_rate": 4.831652173020319e-05, "loss": 0.341, "num_tokens": 164798714.0, "step": 1261 }, { "epoch": 0.5035913806863528, "grad_norm": 0.2237655520439148, "learning_rate": 4.831276341381064e-05, "loss": 0.3962, "num_tokens": 164929786.0, "step": 1262 }, { "epoch": 0.5039904229848364, "grad_norm": 0.21594059467315674, "learning_rate": 4.8309001070342384e-05, "loss": 0.3679, "num_tokens": 165060858.0, "step": 1263 }, { "epoch": 0.5043894652833201, "grad_norm": 0.23832666873931885, "learning_rate": 4.83052347005264e-05, "loss": 0.4408, "num_tokens": 165191930.0, "step": 1264 }, { "epoch": 0.5047885075818037, "grad_norm": 0.23550766706466675, "learning_rate": 4.830146430509145e-05, "loss": 0.4383, "num_tokens": 165323002.0, "step": 1265 }, { "epoch": 0.5051875498802874, "grad_norm": 0.23031435906887054, "learning_rate": 4.829768988476706e-05, "loss": 0.3769, "num_tokens": 165454074.0, "step": 1266 }, { "epoch": 0.505586592178771, "grad_norm": 0.21477274596691132, "learning_rate": 4.8293911440283544e-05, "loss": 0.3506, "num_tokens": 165585146.0, "step": 1267 }, { "epoch": 0.5059856344772546, "grad_norm": 0.2313506305217743, "learning_rate": 4.8290128972372014e-05, "loss": 0.3961, "num_tokens": 165716218.0, "step": 1268 }, { "epoch": 0.5063846767757382, "grad_norm": 0.2257659137248993, "learning_rate": 4.828634248176432e-05, "loss": 0.35, "num_tokens": 165847290.0, "step": 1269 }, { "epoch": 0.5067837190742218, "grad_norm": 0.21121034026145935, "learning_rate": 4.828255196919313e-05, "loss": 0.3363, "num_tokens": 165978362.0, "step": 1270 }, { "epoch": 0.5071827613727055, "grad_norm": 0.2775349020957947, "learning_rate": 4.827875743539187e-05, "loss": 0.416, "num_tokens": 166109434.0, "step": 1271 }, { "epoch": 0.5075818036711891, "grad_norm": 0.2563313841819763, "learning_rate": 4.827495888109474e-05, "loss": 0.4277, "num_tokens": 166240506.0, "step": 1272 }, { "epoch": 0.5079808459696727, "grad_norm": 0.2381243258714676, "learning_rate": 4.8271156307036724e-05, "loss": 0.4348, "num_tokens": 166371578.0, "step": 1273 }, { "epoch": 0.5083798882681564, "grad_norm": 0.23337367177009583, "learning_rate": 4.8267349713953594e-05, "loss": 0.4021, "num_tokens": 166502650.0, "step": 1274 }, { "epoch": 0.50877893056664, "grad_norm": 0.2428816854953766, "learning_rate": 4.826353910258188e-05, "loss": 0.415, "num_tokens": 166633722.0, "step": 1275 }, { "epoch": 0.5091779728651237, "grad_norm": 0.20524752140045166, "learning_rate": 4.8259724473658914e-05, "loss": 0.3218, "num_tokens": 166764794.0, "step": 1276 }, { "epoch": 0.5095770151636073, "grad_norm": 0.22452029585838318, "learning_rate": 4.825590582792277e-05, "loss": 0.3483, "num_tokens": 166895866.0, "step": 1277 }, { "epoch": 0.509976057462091, "grad_norm": 0.24089650809764862, "learning_rate": 4.825208316611234e-05, "loss": 0.3923, "num_tokens": 167026938.0, "step": 1278 }, { "epoch": 0.5103750997605746, "grad_norm": 0.2280811369419098, "learning_rate": 4.824825648896727e-05, "loss": 0.3855, "num_tokens": 167158010.0, "step": 1279 }, { "epoch": 0.5107741420590582, "grad_norm": 0.1775120496749878, "learning_rate": 4.8244425797227976e-05, "loss": 0.2512, "num_tokens": 167289082.0, "step": 1280 }, { "epoch": 0.5111731843575419, "grad_norm": 0.25033998489379883, "learning_rate": 4.824059109163567e-05, "loss": 0.3612, "num_tokens": 167420154.0, "step": 1281 }, { "epoch": 0.5115722266560255, "grad_norm": 0.24974502623081207, "learning_rate": 4.823675237293232e-05, "loss": 0.4141, "num_tokens": 167551226.0, "step": 1282 }, { "epoch": 0.5119712689545092, "grad_norm": 0.2647925317287445, "learning_rate": 4.8232909641860705e-05, "loss": 0.3709, "num_tokens": 167682298.0, "step": 1283 }, { "epoch": 0.5123703112529928, "grad_norm": 0.22931234538555145, "learning_rate": 4.822906289916434e-05, "loss": 0.3751, "num_tokens": 167813370.0, "step": 1284 }, { "epoch": 0.5127693535514765, "grad_norm": 0.22260557115077972, "learning_rate": 4.8225212145587535e-05, "loss": 0.3686, "num_tokens": 167944442.0, "step": 1285 }, { "epoch": 0.5131683958499601, "grad_norm": 4.600845813751221, "learning_rate": 4.822135738187537e-05, "loss": 0.4724, "num_tokens": 168075514.0, "step": 1286 }, { "epoch": 0.5135674381484437, "grad_norm": 0.22913137078285217, "learning_rate": 4.8217498608773724e-05, "loss": 0.3947, "num_tokens": 168206586.0, "step": 1287 }, { "epoch": 0.5139664804469274, "grad_norm": 0.2308468073606491, "learning_rate": 4.8213635827029225e-05, "loss": 0.389, "num_tokens": 168337658.0, "step": 1288 }, { "epoch": 0.514365522745411, "grad_norm": 0.23450513184070587, "learning_rate": 4.820976903738928e-05, "loss": 0.387, "num_tokens": 168468730.0, "step": 1289 }, { "epoch": 0.5147645650438947, "grad_norm": 0.24627985060214996, "learning_rate": 4.820589824060208e-05, "loss": 0.4031, "num_tokens": 168599802.0, "step": 1290 }, { "epoch": 0.5151636073423783, "grad_norm": 0.21351894736289978, "learning_rate": 4.8202023437416596e-05, "loss": 0.3668, "num_tokens": 168729137.0, "step": 1291 }, { "epoch": 0.515562649640862, "grad_norm": 0.22991439700126648, "learning_rate": 4.819814462858254e-05, "loss": 0.3736, "num_tokens": 168860209.0, "step": 1292 }, { "epoch": 0.5159616919393456, "grad_norm": 0.2383180409669876, "learning_rate": 4.819426181485046e-05, "loss": 0.3968, "num_tokens": 168991281.0, "step": 1293 }, { "epoch": 0.5163607342378292, "grad_norm": 0.2295846939086914, "learning_rate": 4.8190374996971624e-05, "loss": 0.3811, "num_tokens": 169122353.0, "step": 1294 }, { "epoch": 0.5167597765363129, "grad_norm": 0.23840932548046112, "learning_rate": 4.818648417569809e-05, "loss": 0.4025, "num_tokens": 169253425.0, "step": 1295 }, { "epoch": 0.5171588188347965, "grad_norm": 0.25158247351646423, "learning_rate": 4.818258935178272e-05, "loss": 0.4106, "num_tokens": 169384497.0, "step": 1296 }, { "epoch": 0.5175578611332802, "grad_norm": 0.24911774694919586, "learning_rate": 4.81786905259791e-05, "loss": 0.3684, "num_tokens": 169515569.0, "step": 1297 }, { "epoch": 0.5179569034317638, "grad_norm": 0.26996979117393494, "learning_rate": 4.817478769904164e-05, "loss": 0.4043, "num_tokens": 169646641.0, "step": 1298 }, { "epoch": 0.5183559457302475, "grad_norm": 0.23418278992176056, "learning_rate": 4.817088087172548e-05, "loss": 0.3963, "num_tokens": 169777713.0, "step": 1299 }, { "epoch": 0.518754988028731, "grad_norm": 0.2444092333316803, "learning_rate": 4.816697004478656e-05, "loss": 0.3937, "num_tokens": 169908785.0, "step": 1300 }, { "epoch": 0.5191540303272146, "grad_norm": 0.26055875420570374, "learning_rate": 4.8163055218981595e-05, "loss": 0.3937, "num_tokens": 170039857.0, "step": 1301 }, { "epoch": 0.5195530726256983, "grad_norm": 0.22837233543395996, "learning_rate": 4.815913639506806e-05, "loss": 0.3083, "num_tokens": 170170929.0, "step": 1302 }, { "epoch": 0.5199521149241819, "grad_norm": 0.24410507082939148, "learning_rate": 4.815521357380422e-05, "loss": 0.3666, "num_tokens": 170302001.0, "step": 1303 }, { "epoch": 0.5203511572226656, "grad_norm": 0.22957944869995117, "learning_rate": 4.81512867559491e-05, "loss": 0.3369, "num_tokens": 170433073.0, "step": 1304 }, { "epoch": 0.5207501995211492, "grad_norm": 0.2606019079685211, "learning_rate": 4.814735594226249e-05, "loss": 0.3902, "num_tokens": 170564145.0, "step": 1305 }, { "epoch": 0.5211492418196328, "grad_norm": 0.24088315665721893, "learning_rate": 4.8143421133505e-05, "loss": 0.3642, "num_tokens": 170695217.0, "step": 1306 }, { "epoch": 0.5215482841181165, "grad_norm": 0.23166100680828094, "learning_rate": 4.8139482330437934e-05, "loss": 0.3416, "num_tokens": 170826289.0, "step": 1307 }, { "epoch": 0.5219473264166001, "grad_norm": 0.24389950931072235, "learning_rate": 4.813553953382344e-05, "loss": 0.3428, "num_tokens": 170957361.0, "step": 1308 }, { "epoch": 0.5223463687150838, "grad_norm": 0.24814438819885254, "learning_rate": 4.8131592744424416e-05, "loss": 0.377, "num_tokens": 171088433.0, "step": 1309 }, { "epoch": 0.5227454110135674, "grad_norm": 0.2278224676847458, "learning_rate": 4.812764196300452e-05, "loss": 0.3507, "num_tokens": 171219505.0, "step": 1310 }, { "epoch": 0.5231444533120511, "grad_norm": 0.24936963617801666, "learning_rate": 4.812368719032819e-05, "loss": 0.4033, "num_tokens": 171350577.0, "step": 1311 }, { "epoch": 0.5235434956105347, "grad_norm": 0.24284981191158295, "learning_rate": 4.8119728427160636e-05, "loss": 0.4187, "num_tokens": 171481649.0, "step": 1312 }, { "epoch": 0.5239425379090183, "grad_norm": 0.24023480713367462, "learning_rate": 4.8115765674267854e-05, "loss": 0.4138, "num_tokens": 171612721.0, "step": 1313 }, { "epoch": 0.524341580207502, "grad_norm": 0.23608827590942383, "learning_rate": 4.811179893241658e-05, "loss": 0.4054, "num_tokens": 171743793.0, "step": 1314 }, { "epoch": 0.5247406225059856, "grad_norm": 0.243027463555336, "learning_rate": 4.810782820237437e-05, "loss": 0.3499, "num_tokens": 171874865.0, "step": 1315 }, { "epoch": 0.5251396648044693, "grad_norm": 0.22613704204559326, "learning_rate": 4.810385348490949e-05, "loss": 0.3748, "num_tokens": 172005937.0, "step": 1316 }, { "epoch": 0.5255387071029529, "grad_norm": 0.23087841272354126, "learning_rate": 4.8099874780791046e-05, "loss": 0.3748, "num_tokens": 172137009.0, "step": 1317 }, { "epoch": 0.5259377494014366, "grad_norm": 0.238919198513031, "learning_rate": 4.809589209078884e-05, "loss": 0.3508, "num_tokens": 172268081.0, "step": 1318 }, { "epoch": 0.5263367916999202, "grad_norm": 0.2549881637096405, "learning_rate": 4.8091905415673524e-05, "loss": 0.3722, "num_tokens": 172399153.0, "step": 1319 }, { "epoch": 0.5267358339984038, "grad_norm": 0.2549508213996887, "learning_rate": 4.8087914756216456e-05, "loss": 0.4047, "num_tokens": 172530225.0, "step": 1320 }, { "epoch": 0.5271348762968875, "grad_norm": 0.2574905455112457, "learning_rate": 4.80839201131898e-05, "loss": 0.4161, "num_tokens": 172661297.0, "step": 1321 }, { "epoch": 0.5275339185953711, "grad_norm": 0.24326953291893005, "learning_rate": 4.807992148736649e-05, "loss": 0.3717, "num_tokens": 172792369.0, "step": 1322 }, { "epoch": 0.5279329608938548, "grad_norm": 0.2461889535188675, "learning_rate": 4.807591887952021e-05, "loss": 0.4035, "num_tokens": 172923441.0, "step": 1323 }, { "epoch": 0.5283320031923384, "grad_norm": 0.2355051338672638, "learning_rate": 4.8071912290425435e-05, "loss": 0.3909, "num_tokens": 173054513.0, "step": 1324 }, { "epoch": 0.5287310454908221, "grad_norm": 0.2499934881925583, "learning_rate": 4.8067901720857405e-05, "loss": 0.4069, "num_tokens": 173185585.0, "step": 1325 }, { "epoch": 0.5291300877893057, "grad_norm": 0.2682345509529114, "learning_rate": 4.806388717159212e-05, "loss": 0.4248, "num_tokens": 173316657.0, "step": 1326 }, { "epoch": 0.5295291300877893, "grad_norm": 0.2104818969964981, "learning_rate": 4.805986864340636e-05, "loss": 0.3235, "num_tokens": 173447729.0, "step": 1327 }, { "epoch": 0.529928172386273, "grad_norm": 0.23288775980472565, "learning_rate": 4.8055846137077674e-05, "loss": 0.3817, "num_tokens": 173578801.0, "step": 1328 }, { "epoch": 0.5303272146847566, "grad_norm": 0.24767521023750305, "learning_rate": 4.805181965338438e-05, "loss": 0.3835, "num_tokens": 173709873.0, "step": 1329 }, { "epoch": 0.5307262569832403, "grad_norm": 0.23888815939426422, "learning_rate": 4.8047789193105576e-05, "loss": 0.3973, "num_tokens": 173840945.0, "step": 1330 }, { "epoch": 0.5311252992817239, "grad_norm": 0.25471997261047363, "learning_rate": 4.804375475702109e-05, "loss": 0.4064, "num_tokens": 173972017.0, "step": 1331 }, { "epoch": 0.5315243415802074, "grad_norm": 0.318845272064209, "learning_rate": 4.803971634591157e-05, "loss": 0.3922, "num_tokens": 174103089.0, "step": 1332 }, { "epoch": 0.5319233838786911, "grad_norm": 0.23327790200710297, "learning_rate": 4.8035673960558406e-05, "loss": 0.3761, "num_tokens": 174234161.0, "step": 1333 }, { "epoch": 0.5323224261771747, "grad_norm": 0.24052289128303528, "learning_rate": 4.803162760174377e-05, "loss": 0.398, "num_tokens": 174365233.0, "step": 1334 }, { "epoch": 0.5327214684756584, "grad_norm": 0.26066985726356506, "learning_rate": 4.8027577270250576e-05, "loss": 0.3579, "num_tokens": 174496305.0, "step": 1335 }, { "epoch": 0.533120510774142, "grad_norm": 0.24903567135334015, "learning_rate": 4.802352296686254e-05, "loss": 0.4046, "num_tokens": 174627377.0, "step": 1336 }, { "epoch": 0.5335195530726257, "grad_norm": 0.24882441759109497, "learning_rate": 4.801946469236413e-05, "loss": 0.3939, "num_tokens": 174758449.0, "step": 1337 }, { "epoch": 0.5339185953711093, "grad_norm": 0.21134699881076813, "learning_rate": 4.801540244754058e-05, "loss": 0.3347, "num_tokens": 174889521.0, "step": 1338 }, { "epoch": 0.534317637669593, "grad_norm": 0.21916697919368744, "learning_rate": 4.8011336233177895e-05, "loss": 0.3388, "num_tokens": 175020593.0, "step": 1339 }, { "epoch": 0.5347166799680766, "grad_norm": 0.2494548261165619, "learning_rate": 4.8007266050062856e-05, "loss": 0.4256, "num_tokens": 175151665.0, "step": 1340 }, { "epoch": 0.5351157222665602, "grad_norm": 0.22098617255687714, "learning_rate": 4.8003191898983e-05, "loss": 0.3412, "num_tokens": 175282737.0, "step": 1341 }, { "epoch": 0.5355147645650439, "grad_norm": 0.24652403593063354, "learning_rate": 4.799911378072665e-05, "loss": 0.403, "num_tokens": 175413809.0, "step": 1342 }, { "epoch": 0.5359138068635275, "grad_norm": 0.23446118831634521, "learning_rate": 4.799503169608286e-05, "loss": 0.4121, "num_tokens": 175544881.0, "step": 1343 }, { "epoch": 0.5363128491620112, "grad_norm": 0.2370143085718155, "learning_rate": 4.799094564584149e-05, "loss": 0.3668, "num_tokens": 175675953.0, "step": 1344 }, { "epoch": 0.5367118914604948, "grad_norm": 0.2201353758573532, "learning_rate": 4.7986855630793164e-05, "loss": 0.3349, "num_tokens": 175807025.0, "step": 1345 }, { "epoch": 0.5371109337589784, "grad_norm": 0.27754607796669006, "learning_rate": 4.7982761651729244e-05, "loss": 0.4737, "num_tokens": 175938097.0, "step": 1346 }, { "epoch": 0.5375099760574621, "grad_norm": 0.23015572130680084, "learning_rate": 4.797866370944188e-05, "loss": 0.3823, "num_tokens": 176069169.0, "step": 1347 }, { "epoch": 0.5379090183559457, "grad_norm": 0.21708917617797852, "learning_rate": 4.797456180472399e-05, "loss": 0.3661, "num_tokens": 176200241.0, "step": 1348 }, { "epoch": 0.5383080606544294, "grad_norm": 0.24395138025283813, "learning_rate": 4.797045593836925e-05, "loss": 0.4036, "num_tokens": 176331313.0, "step": 1349 }, { "epoch": 0.538707102952913, "grad_norm": 0.23276254534721375, "learning_rate": 4.796634611117211e-05, "loss": 0.386, "num_tokens": 176462385.0, "step": 1350 }, { "epoch": 0.5391061452513967, "grad_norm": 0.2211393117904663, "learning_rate": 4.796223232392779e-05, "loss": 0.3759, "num_tokens": 176593457.0, "step": 1351 }, { "epoch": 0.5395051875498803, "grad_norm": 0.2244272083044052, "learning_rate": 4.795811457743224e-05, "loss": 0.3314, "num_tokens": 176724529.0, "step": 1352 }, { "epoch": 0.539904229848364, "grad_norm": 0.25003060698509216, "learning_rate": 4.795399287248225e-05, "loss": 0.4048, "num_tokens": 176855601.0, "step": 1353 }, { "epoch": 0.5403032721468476, "grad_norm": 0.23474876582622528, "learning_rate": 4.79498672098753e-05, "loss": 0.3897, "num_tokens": 176986673.0, "step": 1354 }, { "epoch": 0.5407023144453312, "grad_norm": 0.25538626313209534, "learning_rate": 4.794573759040968e-05, "loss": 0.4073, "num_tokens": 177117745.0, "step": 1355 }, { "epoch": 0.5411013567438149, "grad_norm": 0.24012425541877747, "learning_rate": 4.794160401488442e-05, "loss": 0.3333, "num_tokens": 177248817.0, "step": 1356 }, { "epoch": 0.5415003990422985, "grad_norm": 0.2815791964530945, "learning_rate": 4.7937466484099346e-05, "loss": 0.4525, "num_tokens": 177379889.0, "step": 1357 }, { "epoch": 0.5418994413407822, "grad_norm": 0.2350900024175644, "learning_rate": 4.7933324998855014e-05, "loss": 0.4241, "num_tokens": 177510961.0, "step": 1358 }, { "epoch": 0.5422984836392658, "grad_norm": 0.21373572945594788, "learning_rate": 4.792917955995278e-05, "loss": 0.3292, "num_tokens": 177642033.0, "step": 1359 }, { "epoch": 0.5426975259377494, "grad_norm": 0.22907570004463196, "learning_rate": 4.792503016819474e-05, "loss": 0.3727, "num_tokens": 177773105.0, "step": 1360 }, { "epoch": 0.5430965682362331, "grad_norm": 0.24403101205825806, "learning_rate": 4.7920876824383766e-05, "loss": 0.402, "num_tokens": 177904177.0, "step": 1361 }, { "epoch": 0.5434956105347167, "grad_norm": 0.2213786393404007, "learning_rate": 4.791671952932347e-05, "loss": 0.3452, "num_tokens": 178035249.0, "step": 1362 }, { "epoch": 0.5438946528332003, "grad_norm": 0.23512272536754608, "learning_rate": 4.791255828381828e-05, "loss": 0.4069, "num_tokens": 178166321.0, "step": 1363 }, { "epoch": 0.5442936951316839, "grad_norm": 0.27182042598724365, "learning_rate": 4.790839308867335e-05, "loss": 0.4177, "num_tokens": 178297393.0, "step": 1364 }, { "epoch": 0.5446927374301676, "grad_norm": 0.23679578304290771, "learning_rate": 4.790422394469459e-05, "loss": 0.3687, "num_tokens": 178428465.0, "step": 1365 }, { "epoch": 0.5450917797286512, "grad_norm": 0.27414292097091675, "learning_rate": 4.790005085268872e-05, "loss": 0.4442, "num_tokens": 178559537.0, "step": 1366 }, { "epoch": 0.5454908220271348, "grad_norm": 0.23885533213615417, "learning_rate": 4.789587381346316e-05, "loss": 0.3917, "num_tokens": 178690609.0, "step": 1367 }, { "epoch": 0.5458898643256185, "grad_norm": 0.22698485851287842, "learning_rate": 4.789169282782615e-05, "loss": 0.3862, "num_tokens": 178821681.0, "step": 1368 }, { "epoch": 0.5462889066241021, "grad_norm": 0.278125524520874, "learning_rate": 4.7887507896586656e-05, "loss": 0.4947, "num_tokens": 178952753.0, "step": 1369 }, { "epoch": 0.5466879489225858, "grad_norm": 0.23914538323879242, "learning_rate": 4.788331902055445e-05, "loss": 0.4098, "num_tokens": 179083825.0, "step": 1370 }, { "epoch": 0.5470869912210694, "grad_norm": 0.2158413678407669, "learning_rate": 4.787912620054002e-05, "loss": 0.3856, "num_tokens": 179214897.0, "step": 1371 }, { "epoch": 0.547486033519553, "grad_norm": 0.4333561360836029, "learning_rate": 4.7874929437354645e-05, "loss": 0.3673, "num_tokens": 179345969.0, "step": 1372 }, { "epoch": 0.5478850758180367, "grad_norm": 0.220678448677063, "learning_rate": 4.7870728731810356e-05, "loss": 0.3973, "num_tokens": 179477041.0, "step": 1373 }, { "epoch": 0.5482841181165203, "grad_norm": 0.2258111834526062, "learning_rate": 4.786652408471994e-05, "loss": 0.3525, "num_tokens": 179608113.0, "step": 1374 }, { "epoch": 0.548683160415004, "grad_norm": 0.23679979145526886, "learning_rate": 4.7862315496896976e-05, "loss": 0.4152, "num_tokens": 179739185.0, "step": 1375 }, { "epoch": 0.5490822027134876, "grad_norm": 0.23263782262802124, "learning_rate": 4.785810296915578e-05, "loss": 0.4076, "num_tokens": 179870257.0, "step": 1376 }, { "epoch": 0.5494812450119713, "grad_norm": 0.23485501110553741, "learning_rate": 4.785388650231143e-05, "loss": 0.403, "num_tokens": 180001329.0, "step": 1377 }, { "epoch": 0.5498802873104549, "grad_norm": 0.22492335736751556, "learning_rate": 4.784966609717978e-05, "loss": 0.3853, "num_tokens": 180132401.0, "step": 1378 }, { "epoch": 0.5502793296089385, "grad_norm": 0.20617641508579254, "learning_rate": 4.7845441754577444e-05, "loss": 0.3337, "num_tokens": 180263473.0, "step": 1379 }, { "epoch": 0.5506783719074222, "grad_norm": 0.23610137403011322, "learning_rate": 4.7841213475321775e-05, "loss": 0.3974, "num_tokens": 180394545.0, "step": 1380 }, { "epoch": 0.5510774142059058, "grad_norm": 0.24656745791435242, "learning_rate": 4.783698126023092e-05, "loss": 0.3927, "num_tokens": 180525617.0, "step": 1381 }, { "epoch": 0.5514764565043895, "grad_norm": 0.2319117784500122, "learning_rate": 4.783274511012378e-05, "loss": 0.3599, "num_tokens": 180656689.0, "step": 1382 }, { "epoch": 0.5518754988028731, "grad_norm": 0.245074063539505, "learning_rate": 4.7828505025819995e-05, "loss": 0.3711, "num_tokens": 180787761.0, "step": 1383 }, { "epoch": 0.5522745411013568, "grad_norm": 0.2134205847978592, "learning_rate": 4.782426100813999e-05, "loss": 0.3374, "num_tokens": 180918833.0, "step": 1384 }, { "epoch": 0.5526735833998404, "grad_norm": 0.22095328569412231, "learning_rate": 4.7820013057904936e-05, "loss": 0.3219, "num_tokens": 181049905.0, "step": 1385 }, { "epoch": 0.553072625698324, "grad_norm": 0.2674618661403656, "learning_rate": 4.781576117593678e-05, "loss": 0.3983, "num_tokens": 181180977.0, "step": 1386 }, { "epoch": 0.5534716679968077, "grad_norm": 0.2568064033985138, "learning_rate": 4.7811505363058204e-05, "loss": 0.4144, "num_tokens": 181312049.0, "step": 1387 }, { "epoch": 0.5538707102952913, "grad_norm": 0.25013840198516846, "learning_rate": 4.78072456200927e-05, "loss": 0.3842, "num_tokens": 181443121.0, "step": 1388 }, { "epoch": 0.554269752593775, "grad_norm": 0.23093396425247192, "learning_rate": 4.780298194786446e-05, "loss": 0.3318, "num_tokens": 181574193.0, "step": 1389 }, { "epoch": 0.5546687948922586, "grad_norm": 0.24464863538742065, "learning_rate": 4.779871434719847e-05, "loss": 0.4026, "num_tokens": 181705265.0, "step": 1390 }, { "epoch": 0.5550678371907423, "grad_norm": 0.26446250081062317, "learning_rate": 4.7794442818920485e-05, "loss": 0.4122, "num_tokens": 181836337.0, "step": 1391 }, { "epoch": 0.5554668794892259, "grad_norm": 0.23206757009029388, "learning_rate": 4.7790167363856994e-05, "loss": 0.3416, "num_tokens": 181967409.0, "step": 1392 }, { "epoch": 0.5558659217877095, "grad_norm": 0.21707172691822052, "learning_rate": 4.7785887982835263e-05, "loss": 0.3362, "num_tokens": 182098481.0, "step": 1393 }, { "epoch": 0.5562649640861931, "grad_norm": 0.24156400561332703, "learning_rate": 4.7781604676683306e-05, "loss": 0.4016, "num_tokens": 182229553.0, "step": 1394 }, { "epoch": 0.5566640063846767, "grad_norm": 0.2419862300157547, "learning_rate": 4.777731744622991e-05, "loss": 0.3764, "num_tokens": 182360625.0, "step": 1395 }, { "epoch": 0.5570630486831604, "grad_norm": 0.2511381208896637, "learning_rate": 4.7773026292304615e-05, "loss": 0.388, "num_tokens": 182491697.0, "step": 1396 }, { "epoch": 0.557462090981644, "grad_norm": 0.23712654411792755, "learning_rate": 4.7768731215737706e-05, "loss": 0.3656, "num_tokens": 182622769.0, "step": 1397 }, { "epoch": 0.5578611332801277, "grad_norm": 0.2492290437221527, "learning_rate": 4.7764432217360256e-05, "loss": 0.3708, "num_tokens": 182753841.0, "step": 1398 }, { "epoch": 0.5582601755786113, "grad_norm": 0.2485152631998062, "learning_rate": 4.776012929800407e-05, "loss": 0.4076, "num_tokens": 182884913.0, "step": 1399 }, { "epoch": 0.5586592178770949, "grad_norm": 0.2449939101934433, "learning_rate": 4.7755822458501734e-05, "loss": 0.3689, "num_tokens": 183015985.0, "step": 1400 }, { "epoch": 0.5590582601755786, "grad_norm": 0.22810828685760498, "learning_rate": 4.775151169968657e-05, "loss": 0.3516, "num_tokens": 183147057.0, "step": 1401 }, { "epoch": 0.5594573024740622, "grad_norm": 0.23591849207878113, "learning_rate": 4.774719702239267e-05, "loss": 0.3677, "num_tokens": 183278129.0, "step": 1402 }, { "epoch": 0.5598563447725459, "grad_norm": 0.23860645294189453, "learning_rate": 4.77428784274549e-05, "loss": 0.3759, "num_tokens": 183409201.0, "step": 1403 }, { "epoch": 0.5602553870710295, "grad_norm": 0.21609750390052795, "learning_rate": 4.773855591570885e-05, "loss": 0.3212, "num_tokens": 183540273.0, "step": 1404 }, { "epoch": 0.5606544293695132, "grad_norm": 0.23188002407550812, "learning_rate": 4.773422948799088e-05, "loss": 0.3526, "num_tokens": 183671345.0, "step": 1405 }, { "epoch": 0.5610534716679968, "grad_norm": 0.2158224731683731, "learning_rate": 4.772989914513814e-05, "loss": 0.3295, "num_tokens": 183802417.0, "step": 1406 }, { "epoch": 0.5614525139664804, "grad_norm": 0.2349645495414734, "learning_rate": 4.772556488798849e-05, "loss": 0.3984, "num_tokens": 183933489.0, "step": 1407 }, { "epoch": 0.5618515562649641, "grad_norm": 0.21654053032398224, "learning_rate": 4.772122671738058e-05, "loss": 0.3104, "num_tokens": 184064561.0, "step": 1408 }, { "epoch": 0.5622505985634477, "grad_norm": 0.2688620686531067, "learning_rate": 4.7716884634153795e-05, "loss": 0.4125, "num_tokens": 184195633.0, "step": 1409 }, { "epoch": 0.5626496408619314, "grad_norm": 0.2515137493610382, "learning_rate": 4.77125386391483e-05, "loss": 0.4003, "num_tokens": 184326705.0, "step": 1410 }, { "epoch": 0.563048683160415, "grad_norm": 0.22129803895950317, "learning_rate": 4.7708188733204986e-05, "loss": 0.3772, "num_tokens": 184457777.0, "step": 1411 }, { "epoch": 0.5634477254588987, "grad_norm": 0.23119471967220306, "learning_rate": 4.7703834917165536e-05, "loss": 0.4043, "num_tokens": 184588849.0, "step": 1412 }, { "epoch": 0.5638467677573823, "grad_norm": 0.22949065268039703, "learning_rate": 4.7699477191872366e-05, "loss": 0.3515, "num_tokens": 184719921.0, "step": 1413 }, { "epoch": 0.5642458100558659, "grad_norm": 0.21783386170864105, "learning_rate": 4.7695115558168655e-05, "loss": 0.333, "num_tokens": 184850993.0, "step": 1414 }, { "epoch": 0.5646448523543496, "grad_norm": 0.23373043537139893, "learning_rate": 4.7690750016898345e-05, "loss": 0.3999, "num_tokens": 184982065.0, "step": 1415 }, { "epoch": 0.5650438946528332, "grad_norm": 0.24623017013072968, "learning_rate": 4.768638056890612e-05, "loss": 0.3848, "num_tokens": 185113137.0, "step": 1416 }, { "epoch": 0.5654429369513169, "grad_norm": 0.2294004261493683, "learning_rate": 4.7682007215037434e-05, "loss": 0.3671, "num_tokens": 185244209.0, "step": 1417 }, { "epoch": 0.5658419792498005, "grad_norm": 0.23807145655155182, "learning_rate": 4.767762995613848e-05, "loss": 0.3789, "num_tokens": 185375281.0, "step": 1418 }, { "epoch": 0.5662410215482842, "grad_norm": 0.25506073236465454, "learning_rate": 4.767324879305623e-05, "loss": 0.4224, "num_tokens": 185506353.0, "step": 1419 }, { "epoch": 0.5666400638467678, "grad_norm": 0.2599272131919861, "learning_rate": 4.766886372663839e-05, "loss": 0.4059, "num_tokens": 185637425.0, "step": 1420 }, { "epoch": 0.5670391061452514, "grad_norm": 0.23710083961486816, "learning_rate": 4.766447475773342e-05, "loss": 0.3613, "num_tokens": 185768497.0, "step": 1421 }, { "epoch": 0.5674381484437351, "grad_norm": 0.23883210122585297, "learning_rate": 4.766008188719056e-05, "loss": 0.3905, "num_tokens": 185899569.0, "step": 1422 }, { "epoch": 0.5678371907422187, "grad_norm": 0.2550155222415924, "learning_rate": 4.765568511585979e-05, "loss": 0.4099, "num_tokens": 186030641.0, "step": 1423 }, { "epoch": 0.5682362330407024, "grad_norm": 0.24070893228054047, "learning_rate": 4.765128444459183e-05, "loss": 0.4181, "num_tokens": 186161713.0, "step": 1424 }, { "epoch": 0.568635275339186, "grad_norm": 0.22086749970912933, "learning_rate": 4.7646879874238184e-05, "loss": 0.3785, "num_tokens": 186292785.0, "step": 1425 }, { "epoch": 0.5690343176376695, "grad_norm": 0.21364057064056396, "learning_rate": 4.7642471405651096e-05, "loss": 0.3645, "num_tokens": 186423857.0, "step": 1426 }, { "epoch": 0.5694333599361532, "grad_norm": 0.2465241551399231, "learning_rate": 4.763805903968355e-05, "loss": 0.3816, "num_tokens": 186554929.0, "step": 1427 }, { "epoch": 0.5698324022346368, "grad_norm": 0.24423010647296906, "learning_rate": 4.76336427771893e-05, "loss": 0.4138, "num_tokens": 186686001.0, "step": 1428 }, { "epoch": 0.5702314445331205, "grad_norm": 0.23985430598258972, "learning_rate": 4.762922261902286e-05, "loss": 0.409, "num_tokens": 186817073.0, "step": 1429 }, { "epoch": 0.5706304868316041, "grad_norm": 0.24222756922245026, "learning_rate": 4.762479856603949e-05, "loss": 0.3757, "num_tokens": 186948145.0, "step": 1430 }, { "epoch": 0.5710295291300878, "grad_norm": 0.1968884915113449, "learning_rate": 4.762037061909519e-05, "loss": 0.3087, "num_tokens": 187079217.0, "step": 1431 }, { "epoch": 0.5714285714285714, "grad_norm": 0.27335259318351746, "learning_rate": 4.761593877904674e-05, "loss": 0.4234, "num_tokens": 187210289.0, "step": 1432 }, { "epoch": 0.571827613727055, "grad_norm": 0.2439742535352707, "learning_rate": 4.7611503046751656e-05, "loss": 0.3999, "num_tokens": 187341361.0, "step": 1433 }, { "epoch": 0.5722266560255387, "grad_norm": 0.2423168569803238, "learning_rate": 4.7607063423068196e-05, "loss": 0.3823, "num_tokens": 187472433.0, "step": 1434 }, { "epoch": 0.5726256983240223, "grad_norm": 0.21023976802825928, "learning_rate": 4.760261990885542e-05, "loss": 0.3053, "num_tokens": 187603505.0, "step": 1435 }, { "epoch": 0.573024740622506, "grad_norm": 0.23374104499816895, "learning_rate": 4.7598172504973075e-05, "loss": 0.3774, "num_tokens": 187734577.0, "step": 1436 }, { "epoch": 0.5734237829209896, "grad_norm": 0.2697663903236389, "learning_rate": 4.7593721212281705e-05, "loss": 0.4425, "num_tokens": 187865649.0, "step": 1437 }, { "epoch": 0.5738228252194733, "grad_norm": 0.22713744640350342, "learning_rate": 4.7589266031642585e-05, "loss": 0.3846, "num_tokens": 187996721.0, "step": 1438 }, { "epoch": 0.5742218675179569, "grad_norm": 0.24449487030506134, "learning_rate": 4.758480696391777e-05, "loss": 0.4392, "num_tokens": 188127793.0, "step": 1439 }, { "epoch": 0.5746209098164405, "grad_norm": 0.23062774538993835, "learning_rate": 4.758034400997002e-05, "loss": 0.3748, "num_tokens": 188258865.0, "step": 1440 }, { "epoch": 0.5750199521149242, "grad_norm": 0.24656924605369568, "learning_rate": 4.7575877170662905e-05, "loss": 0.3937, "num_tokens": 188389937.0, "step": 1441 }, { "epoch": 0.5754189944134078, "grad_norm": 0.24404923617839813, "learning_rate": 4.75714064468607e-05, "loss": 0.3582, "num_tokens": 188521009.0, "step": 1442 }, { "epoch": 0.5758180367118915, "grad_norm": 0.22967953979969025, "learning_rate": 4.7566931839428446e-05, "loss": 0.3697, "num_tokens": 188652081.0, "step": 1443 }, { "epoch": 0.5762170790103751, "grad_norm": 0.23889845609664917, "learning_rate": 4.7562453349231944e-05, "loss": 0.3552, "num_tokens": 188783153.0, "step": 1444 }, { "epoch": 0.5766161213088588, "grad_norm": 0.29636991024017334, "learning_rate": 4.755797097713774e-05, "loss": 0.4116, "num_tokens": 188899680.0, "step": 1445 }, { "epoch": 0.5770151636073424, "grad_norm": 0.27768126130104065, "learning_rate": 4.755348472401313e-05, "loss": 0.411, "num_tokens": 189030752.0, "step": 1446 }, { "epoch": 0.577414205905826, "grad_norm": 0.25634145736694336, "learning_rate": 4.754899459072616e-05, "loss": 0.4228, "num_tokens": 189161824.0, "step": 1447 }, { "epoch": 0.5778132482043097, "grad_norm": 0.24053969979286194, "learning_rate": 4.754450057814564e-05, "loss": 0.3589, "num_tokens": 189292896.0, "step": 1448 }, { "epoch": 0.5782122905027933, "grad_norm": 0.25131523609161377, "learning_rate": 4.754000268714111e-05, "loss": 0.389, "num_tokens": 189423968.0, "step": 1449 }, { "epoch": 0.578611332801277, "grad_norm": 0.2393389493227005, "learning_rate": 4.7535500918582856e-05, "loss": 0.3656, "num_tokens": 189555040.0, "step": 1450 }, { "epoch": 0.5790103750997606, "grad_norm": 0.23281578719615936, "learning_rate": 4.753099527334195e-05, "loss": 0.3448, "num_tokens": 189686112.0, "step": 1451 }, { "epoch": 0.5794094173982443, "grad_norm": 0.24325023591518402, "learning_rate": 4.752648575229018e-05, "loss": 0.3928, "num_tokens": 189817184.0, "step": 1452 }, { "epoch": 0.5798084596967279, "grad_norm": 0.2678796947002411, "learning_rate": 4.7521972356300106e-05, "loss": 0.4113, "num_tokens": 189948256.0, "step": 1453 }, { "epoch": 0.5802075019952115, "grad_norm": 0.2828909754753113, "learning_rate": 4.7517455086245034e-05, "loss": 0.4334, "num_tokens": 190079328.0, "step": 1454 }, { "epoch": 0.5806065442936952, "grad_norm": 0.25116923451423645, "learning_rate": 4.751293394299899e-05, "loss": 0.4279, "num_tokens": 190210400.0, "step": 1455 }, { "epoch": 0.5810055865921788, "grad_norm": 0.25478285551071167, "learning_rate": 4.7508408927436793e-05, "loss": 0.4136, "num_tokens": 190341472.0, "step": 1456 }, { "epoch": 0.5814046288906624, "grad_norm": 0.2589797079563141, "learning_rate": 4.750388004043398e-05, "loss": 0.3554, "num_tokens": 190461617.0, "step": 1457 }, { "epoch": 0.581803671189146, "grad_norm": 0.35622885823249817, "learning_rate": 4.749934728286685e-05, "loss": 0.4261, "num_tokens": 190592689.0, "step": 1458 }, { "epoch": 0.5822027134876296, "grad_norm": 0.2356540560722351, "learning_rate": 4.749481065561246e-05, "loss": 0.4111, "num_tokens": 190723761.0, "step": 1459 }, { "epoch": 0.5826017557861133, "grad_norm": 0.26113009452819824, "learning_rate": 4.74902701595486e-05, "loss": 0.407, "num_tokens": 190854833.0, "step": 1460 }, { "epoch": 0.5830007980845969, "grad_norm": 0.2448427975177765, "learning_rate": 4.7485725795553795e-05, "loss": 0.3669, "num_tokens": 190985905.0, "step": 1461 }, { "epoch": 0.5833998403830806, "grad_norm": 0.24879632890224457, "learning_rate": 4.748117756450737e-05, "loss": 0.402, "num_tokens": 191116977.0, "step": 1462 }, { "epoch": 0.5837988826815642, "grad_norm": 0.23949289321899414, "learning_rate": 4.7476625467289334e-05, "loss": 0.384, "num_tokens": 191248049.0, "step": 1463 }, { "epoch": 0.5841979249800479, "grad_norm": 0.22880704700946808, "learning_rate": 4.74720695047805e-05, "loss": 0.3381, "num_tokens": 191379121.0, "step": 1464 }, { "epoch": 0.5845969672785315, "grad_norm": 0.2634265720844269, "learning_rate": 4.74675096778624e-05, "loss": 0.399, "num_tokens": 191510193.0, "step": 1465 }, { "epoch": 0.5849960095770151, "grad_norm": 0.2730858623981476, "learning_rate": 4.74629459874173e-05, "loss": 0.4424, "num_tokens": 191641265.0, "step": 1466 }, { "epoch": 0.5853950518754988, "grad_norm": 0.2202313095331192, "learning_rate": 4.745837843432825e-05, "loss": 0.3032, "num_tokens": 191772337.0, "step": 1467 }, { "epoch": 0.5857940941739824, "grad_norm": 0.23853744566440582, "learning_rate": 4.7453807019479015e-05, "loss": 0.3537, "num_tokens": 191903409.0, "step": 1468 }, { "epoch": 0.5861931364724661, "grad_norm": 0.23972411453723907, "learning_rate": 4.7449231743754136e-05, "loss": 0.3926, "num_tokens": 192034481.0, "step": 1469 }, { "epoch": 0.5865921787709497, "grad_norm": 0.23606295883655548, "learning_rate": 4.744465260803887e-05, "loss": 0.3556, "num_tokens": 192165553.0, "step": 1470 }, { "epoch": 0.5869912210694334, "grad_norm": 0.2685049772262573, "learning_rate": 4.744006961321926e-05, "loss": 0.4178, "num_tokens": 192296625.0, "step": 1471 }, { "epoch": 0.587390263367917, "grad_norm": 0.2354481816291809, "learning_rate": 4.743548276018205e-05, "loss": 0.3514, "num_tokens": 192419690.0, "step": 1472 }, { "epoch": 0.5877893056664006, "grad_norm": 0.21376155316829681, "learning_rate": 4.7430892049814765e-05, "loss": 0.3296, "num_tokens": 192550762.0, "step": 1473 }, { "epoch": 0.5881883479648843, "grad_norm": 0.2798317074775696, "learning_rate": 4.7426297483005666e-05, "loss": 0.457, "num_tokens": 192681834.0, "step": 1474 }, { "epoch": 0.5885873902633679, "grad_norm": 0.23147064447402954, "learning_rate": 4.742169906064374e-05, "loss": 0.3709, "num_tokens": 192812906.0, "step": 1475 }, { "epoch": 0.5889864325618516, "grad_norm": 0.24776965379714966, "learning_rate": 4.741709678361876e-05, "loss": 0.4108, "num_tokens": 192943978.0, "step": 1476 }, { "epoch": 0.5893854748603352, "grad_norm": 0.23423191905021667, "learning_rate": 4.741249065282123e-05, "loss": 0.3666, "num_tokens": 193075050.0, "step": 1477 }, { "epoch": 0.5897845171588189, "grad_norm": 0.24416574835777283, "learning_rate": 4.740788066914237e-05, "loss": 0.4008, "num_tokens": 193206122.0, "step": 1478 }, { "epoch": 0.5901835594573025, "grad_norm": 0.24685800075531006, "learning_rate": 4.740326683347418e-05, "loss": 0.4066, "num_tokens": 193337194.0, "step": 1479 }, { "epoch": 0.5905826017557861, "grad_norm": 0.23135945200920105, "learning_rate": 4.739864914670939e-05, "loss": 0.3395, "num_tokens": 193468266.0, "step": 1480 }, { "epoch": 0.5909816440542698, "grad_norm": 0.27672940492630005, "learning_rate": 4.7394027609741486e-05, "loss": 0.4246, "num_tokens": 193599338.0, "step": 1481 }, { "epoch": 0.5913806863527534, "grad_norm": 0.23234567046165466, "learning_rate": 4.738940222346469e-05, "loss": 0.3517, "num_tokens": 193730410.0, "step": 1482 }, { "epoch": 0.5917797286512371, "grad_norm": 0.24005524814128876, "learning_rate": 4.738477298877397e-05, "loss": 0.4086, "num_tokens": 193861482.0, "step": 1483 }, { "epoch": 0.5921787709497207, "grad_norm": 0.2284363955259323, "learning_rate": 4.738013990656504e-05, "loss": 0.3109, "num_tokens": 193992554.0, "step": 1484 }, { "epoch": 0.5925778132482044, "grad_norm": 0.2335686981678009, "learning_rate": 4.7375502977734366e-05, "loss": 0.364, "num_tokens": 194123626.0, "step": 1485 }, { "epoch": 0.592976855546688, "grad_norm": 0.25093719363212585, "learning_rate": 4.7370862203179135e-05, "loss": 0.3738, "num_tokens": 194254698.0, "step": 1486 }, { "epoch": 0.5933758978451716, "grad_norm": 0.23945926129817963, "learning_rate": 4.736621758379729e-05, "loss": 0.3903, "num_tokens": 194385770.0, "step": 1487 }, { "epoch": 0.5937749401436552, "grad_norm": 0.24280838668346405, "learning_rate": 4.7361569120487556e-05, "loss": 0.3618, "num_tokens": 194516842.0, "step": 1488 }, { "epoch": 0.5941739824421388, "grad_norm": 0.24606846272945404, "learning_rate": 4.735691681414933e-05, "loss": 0.3762, "num_tokens": 194647914.0, "step": 1489 }, { "epoch": 0.5945730247406225, "grad_norm": 0.2273358255624771, "learning_rate": 4.735226066568281e-05, "loss": 0.3827, "num_tokens": 194778986.0, "step": 1490 }, { "epoch": 0.5949720670391061, "grad_norm": 0.2209179699420929, "learning_rate": 4.734760067598891e-05, "loss": 0.3535, "num_tokens": 194910058.0, "step": 1491 }, { "epoch": 0.5953711093375897, "grad_norm": 0.22096160054206848, "learning_rate": 4.734293684596929e-05, "loss": 0.3237, "num_tokens": 195041130.0, "step": 1492 }, { "epoch": 0.5957701516360734, "grad_norm": 0.22849391400814056, "learning_rate": 4.733826917652637e-05, "loss": 0.3611, "num_tokens": 195172202.0, "step": 1493 }, { "epoch": 0.596169193934557, "grad_norm": 0.2495098114013672, "learning_rate": 4.73335976685633e-05, "loss": 0.4246, "num_tokens": 195303274.0, "step": 1494 }, { "epoch": 0.5965682362330407, "grad_norm": 0.2154218852519989, "learning_rate": 4.7328922322983956e-05, "loss": 0.3353, "num_tokens": 195434346.0, "step": 1495 }, { "epoch": 0.5969672785315243, "grad_norm": 0.2743958830833435, "learning_rate": 4.7324243140692993e-05, "loss": 0.4054, "num_tokens": 195565418.0, "step": 1496 }, { "epoch": 0.597366320830008, "grad_norm": 0.2158108949661255, "learning_rate": 4.731956012259578e-05, "loss": 0.2819, "num_tokens": 195696490.0, "step": 1497 }, { "epoch": 0.5977653631284916, "grad_norm": 0.24803948402404785, "learning_rate": 4.731487326959843e-05, "loss": 0.3613, "num_tokens": 195827562.0, "step": 1498 }, { "epoch": 0.5981644054269752, "grad_norm": 0.28453758358955383, "learning_rate": 4.7310182582607825e-05, "loss": 0.4226, "num_tokens": 195958634.0, "step": 1499 }, { "epoch": 0.5985634477254589, "grad_norm": 0.23280824720859528, "learning_rate": 4.730548806253156e-05, "loss": 0.3735, "num_tokens": 196089706.0, "step": 1500 }, { "epoch": 0.5989624900239425, "grad_norm": 0.23645620048046112, "learning_rate": 4.730078971027797e-05, "loss": 0.3873, "num_tokens": 196220778.0, "step": 1501 }, { "epoch": 0.5993615323224262, "grad_norm": 0.22690288722515106, "learning_rate": 4.729608752675616e-05, "loss": 0.3735, "num_tokens": 196351850.0, "step": 1502 }, { "epoch": 0.5997605746209098, "grad_norm": 0.2325858324766159, "learning_rate": 4.729138151287595e-05, "loss": 0.3746, "num_tokens": 196482922.0, "step": 1503 }, { "epoch": 0.6001596169193935, "grad_norm": 0.24142953753471375, "learning_rate": 4.728667166954791e-05, "loss": 0.3963, "num_tokens": 196607060.0, "step": 1504 }, { "epoch": 0.6005586592178771, "grad_norm": 0.22601617872714996, "learning_rate": 4.728195799768334e-05, "loss": 0.3345, "num_tokens": 196738132.0, "step": 1505 }, { "epoch": 0.6009577015163607, "grad_norm": 0.20779167115688324, "learning_rate": 4.727724049819431e-05, "loss": 0.3299, "num_tokens": 196869204.0, "step": 1506 }, { "epoch": 0.6013567438148444, "grad_norm": 0.2524639666080475, "learning_rate": 4.7272519171993615e-05, "loss": 0.3786, "num_tokens": 197000276.0, "step": 1507 }, { "epoch": 0.601755786113328, "grad_norm": 0.2527434229850769, "learning_rate": 4.726779401999476e-05, "loss": 0.3753, "num_tokens": 197131348.0, "step": 1508 }, { "epoch": 0.6021548284118117, "grad_norm": 0.23778794705867767, "learning_rate": 4.726306504311204e-05, "loss": 0.3645, "num_tokens": 197262420.0, "step": 1509 }, { "epoch": 0.6025538707102953, "grad_norm": 0.23667137324810028, "learning_rate": 4.7258332242260465e-05, "loss": 0.3731, "num_tokens": 197393492.0, "step": 1510 }, { "epoch": 0.602952913008779, "grad_norm": 0.2977621257305145, "learning_rate": 4.7253595618355794e-05, "loss": 0.4029, "num_tokens": 197524564.0, "step": 1511 }, { "epoch": 0.6033519553072626, "grad_norm": 0.25811487436294556, "learning_rate": 4.72488551723145e-05, "loss": 0.3914, "num_tokens": 197655636.0, "step": 1512 }, { "epoch": 0.6037509976057462, "grad_norm": 0.24754150211811066, "learning_rate": 4.7244110905053835e-05, "loss": 0.3736, "num_tokens": 197786708.0, "step": 1513 }, { "epoch": 0.6041500399042299, "grad_norm": 0.2327626347541809, "learning_rate": 4.723936281749176e-05, "loss": 0.3967, "num_tokens": 197917780.0, "step": 1514 }, { "epoch": 0.6045490822027135, "grad_norm": 0.2299155592918396, "learning_rate": 4.7234610910546985e-05, "loss": 0.3739, "num_tokens": 198048852.0, "step": 1515 }, { "epoch": 0.6049481245011972, "grad_norm": 0.24645516276359558, "learning_rate": 4.722985518513896e-05, "loss": 0.3806, "num_tokens": 198179924.0, "step": 1516 }, { "epoch": 0.6053471667996808, "grad_norm": 0.2162633240222931, "learning_rate": 4.722509564218788e-05, "loss": 0.3386, "num_tokens": 198310996.0, "step": 1517 }, { "epoch": 0.6057462090981645, "grad_norm": 0.24143588542938232, "learning_rate": 4.722033228261467e-05, "loss": 0.3835, "num_tokens": 198442068.0, "step": 1518 }, { "epoch": 0.6061452513966481, "grad_norm": 0.22420424222946167, "learning_rate": 4.721556510734099e-05, "loss": 0.3484, "num_tokens": 198573140.0, "step": 1519 }, { "epoch": 0.6065442936951316, "grad_norm": 0.22505518794059753, "learning_rate": 4.721079411728925e-05, "loss": 0.3449, "num_tokens": 198704212.0, "step": 1520 }, { "epoch": 0.6069433359936153, "grad_norm": 0.24951335787773132, "learning_rate": 4.720601931338259e-05, "loss": 0.3835, "num_tokens": 198835284.0, "step": 1521 }, { "epoch": 0.6073423782920989, "grad_norm": 0.2187250554561615, "learning_rate": 4.720124069654489e-05, "loss": 0.3692, "num_tokens": 198966356.0, "step": 1522 }, { "epoch": 0.6077414205905826, "grad_norm": 0.24107973277568817, "learning_rate": 4.7196458267700765e-05, "loss": 0.3604, "num_tokens": 199097428.0, "step": 1523 }, { "epoch": 0.6081404628890662, "grad_norm": 0.22874701023101807, "learning_rate": 4.719167202777558e-05, "loss": 0.3626, "num_tokens": 199228500.0, "step": 1524 }, { "epoch": 0.6085395051875498, "grad_norm": 0.25118911266326904, "learning_rate": 4.718688197769541e-05, "loss": 0.3997, "num_tokens": 199359572.0, "step": 1525 }, { "epoch": 0.6089385474860335, "grad_norm": 0.2444937527179718, "learning_rate": 4.71820881183871e-05, "loss": 0.3923, "num_tokens": 199490644.0, "step": 1526 }, { "epoch": 0.6093375897845171, "grad_norm": 0.21112364530563354, "learning_rate": 4.717729045077822e-05, "loss": 0.3268, "num_tokens": 199621716.0, "step": 1527 }, { "epoch": 0.6097366320830008, "grad_norm": 0.24560067057609558, "learning_rate": 4.717248897579706e-05, "loss": 0.3993, "num_tokens": 199752788.0, "step": 1528 }, { "epoch": 0.6101356743814844, "grad_norm": 0.2201462835073471, "learning_rate": 4.716768369437267e-05, "loss": 0.3406, "num_tokens": 199883860.0, "step": 1529 }, { "epoch": 0.6105347166799681, "grad_norm": 0.22212934494018555, "learning_rate": 4.716287460743483e-05, "loss": 0.372, "num_tokens": 200014932.0, "step": 1530 }, { "epoch": 0.6109337589784517, "grad_norm": 0.22740192711353302, "learning_rate": 4.715806171591405e-05, "loss": 0.3699, "num_tokens": 200146004.0, "step": 1531 }, { "epoch": 0.6113328012769353, "grad_norm": 0.2389640510082245, "learning_rate": 4.715324502074157e-05, "loss": 0.3841, "num_tokens": 200277076.0, "step": 1532 }, { "epoch": 0.611731843575419, "grad_norm": 0.23773618042469025, "learning_rate": 4.7148424522849386e-05, "loss": 0.4112, "num_tokens": 200408148.0, "step": 1533 }, { "epoch": 0.6121308858739026, "grad_norm": 0.24821507930755615, "learning_rate": 4.714360022317022e-05, "loss": 0.4045, "num_tokens": 200539220.0, "step": 1534 }, { "epoch": 0.6125299281723863, "grad_norm": 0.2788538336753845, "learning_rate": 4.713877212263752e-05, "loss": 0.467, "num_tokens": 200670292.0, "step": 1535 }, { "epoch": 0.6129289704708699, "grad_norm": 0.23772206902503967, "learning_rate": 4.7133940222185486e-05, "loss": 0.4037, "num_tokens": 200801364.0, "step": 1536 }, { "epoch": 0.6133280127693536, "grad_norm": 0.2261829674243927, "learning_rate": 4.7129104522749047e-05, "loss": 0.3144, "num_tokens": 200932436.0, "step": 1537 }, { "epoch": 0.6137270550678372, "grad_norm": 0.22849027812480927, "learning_rate": 4.712426502526386e-05, "loss": 0.374, "num_tokens": 201063508.0, "step": 1538 }, { "epoch": 0.6141260973663208, "grad_norm": 0.2546553611755371, "learning_rate": 4.711942173066633e-05, "loss": 0.4099, "num_tokens": 201194580.0, "step": 1539 }, { "epoch": 0.6145251396648045, "grad_norm": 0.24350625276565552, "learning_rate": 4.711457463989358e-05, "loss": 0.4117, "num_tokens": 201325652.0, "step": 1540 }, { "epoch": 0.6149241819632881, "grad_norm": 0.2516975700855255, "learning_rate": 4.71097237538835e-05, "loss": 0.3722, "num_tokens": 201456724.0, "step": 1541 }, { "epoch": 0.6153232242617718, "grad_norm": 0.2221686989068985, "learning_rate": 4.710486907357465e-05, "loss": 0.3502, "num_tokens": 201587796.0, "step": 1542 }, { "epoch": 0.6157222665602554, "grad_norm": 0.23403766751289368, "learning_rate": 4.710001059990639e-05, "loss": 0.3677, "num_tokens": 201718868.0, "step": 1543 }, { "epoch": 0.6161213088587391, "grad_norm": 0.24646225571632385, "learning_rate": 4.7095148333818795e-05, "loss": 0.3617, "num_tokens": 201849940.0, "step": 1544 }, { "epoch": 0.6165203511572227, "grad_norm": 0.23151850700378418, "learning_rate": 4.709028227625267e-05, "loss": 0.3792, "num_tokens": 201981012.0, "step": 1545 }, { "epoch": 0.6169193934557063, "grad_norm": 0.2508333623409271, "learning_rate": 4.708541242814953e-05, "loss": 0.3584, "num_tokens": 202112084.0, "step": 1546 }, { "epoch": 0.61731843575419, "grad_norm": 0.2320336103439331, "learning_rate": 4.7080538790451656e-05, "loss": 0.3799, "num_tokens": 202243156.0, "step": 1547 }, { "epoch": 0.6177174780526736, "grad_norm": 0.22957707941532135, "learning_rate": 4.707566136410206e-05, "loss": 0.3387, "num_tokens": 202374228.0, "step": 1548 }, { "epoch": 0.6181165203511573, "grad_norm": 0.2454901784658432, "learning_rate": 4.707078015004447e-05, "loss": 0.3633, "num_tokens": 202505300.0, "step": 1549 }, { "epoch": 0.6185155626496409, "grad_norm": 0.22839178144931793, "learning_rate": 4.706589514922336e-05, "loss": 0.3463, "num_tokens": 202636372.0, "step": 1550 }, { "epoch": 0.6189146049481244, "grad_norm": 0.23253288865089417, "learning_rate": 4.7061006362583924e-05, "loss": 0.3728, "num_tokens": 202767444.0, "step": 1551 }, { "epoch": 0.6193136472466081, "grad_norm": 0.23348717391490936, "learning_rate": 4.705611379107211e-05, "loss": 0.3373, "num_tokens": 202898516.0, "step": 1552 }, { "epoch": 0.6197126895450917, "grad_norm": 0.2428448349237442, "learning_rate": 4.7051217435634575e-05, "loss": 0.3715, "num_tokens": 203029588.0, "step": 1553 }, { "epoch": 0.6201117318435754, "grad_norm": 0.24240493774414062, "learning_rate": 4.7046317297218715e-05, "loss": 0.3985, "num_tokens": 203160660.0, "step": 1554 }, { "epoch": 0.620510774142059, "grad_norm": 0.24046558141708374, "learning_rate": 4.704141337677268e-05, "loss": 0.3543, "num_tokens": 203291732.0, "step": 1555 }, { "epoch": 0.6209098164405427, "grad_norm": 0.2342187911272049, "learning_rate": 4.7036505675245304e-05, "loss": 0.346, "num_tokens": 203422804.0, "step": 1556 }, { "epoch": 0.6213088587390263, "grad_norm": 0.2530529797077179, "learning_rate": 4.70315941935862e-05, "loss": 0.3848, "num_tokens": 203553876.0, "step": 1557 }, { "epoch": 0.62170790103751, "grad_norm": 0.26680564880371094, "learning_rate": 4.7026678932745697e-05, "loss": 0.3849, "num_tokens": 203684948.0, "step": 1558 }, { "epoch": 0.6221069433359936, "grad_norm": 0.22302822768688202, "learning_rate": 4.7021759893674836e-05, "loss": 0.3381, "num_tokens": 203816020.0, "step": 1559 }, { "epoch": 0.6225059856344772, "grad_norm": 0.237280011177063, "learning_rate": 4.701683707732542e-05, "loss": 0.3677, "num_tokens": 203947092.0, "step": 1560 }, { "epoch": 0.6229050279329609, "grad_norm": 0.22382856905460358, "learning_rate": 4.701191048464997e-05, "loss": 0.3443, "num_tokens": 204078164.0, "step": 1561 }, { "epoch": 0.6233040702314445, "grad_norm": 0.2336033582687378, "learning_rate": 4.7006980116601726e-05, "loss": 0.342, "num_tokens": 204196407.0, "step": 1562 }, { "epoch": 0.6237031125299282, "grad_norm": 0.22950860857963562, "learning_rate": 4.700204597413467e-05, "loss": 0.3392, "num_tokens": 204327479.0, "step": 1563 }, { "epoch": 0.6241021548284118, "grad_norm": 0.25726181268692017, "learning_rate": 4.699710805820351e-05, "loss": 0.3763, "num_tokens": 204458551.0, "step": 1564 }, { "epoch": 0.6245011971268954, "grad_norm": 0.2248077243566513, "learning_rate": 4.699216636976368e-05, "loss": 0.3645, "num_tokens": 204589623.0, "step": 1565 }, { "epoch": 0.6249002394253791, "grad_norm": 0.2215108424425125, "learning_rate": 4.698722090977139e-05, "loss": 0.2958, "num_tokens": 204712906.0, "step": 1566 }, { "epoch": 0.6252992817238627, "grad_norm": 0.228353351354599, "learning_rate": 4.698227167918349e-05, "loss": 0.3547, "num_tokens": 204843978.0, "step": 1567 }, { "epoch": 0.6256983240223464, "grad_norm": 0.23489312827587128, "learning_rate": 4.6977318678957634e-05, "loss": 0.3976, "num_tokens": 204975050.0, "step": 1568 }, { "epoch": 0.62609736632083, "grad_norm": 0.25991290807724, "learning_rate": 4.6972361910052176e-05, "loss": 0.3895, "num_tokens": 205106122.0, "step": 1569 }, { "epoch": 0.6264964086193137, "grad_norm": 0.21518956124782562, "learning_rate": 4.6967401373426214e-05, "loss": 0.3183, "num_tokens": 205237194.0, "step": 1570 }, { "epoch": 0.6268954509177973, "grad_norm": 0.2441665381193161, "learning_rate": 4.6962437070039566e-05, "loss": 0.4237, "num_tokens": 205368266.0, "step": 1571 }, { "epoch": 0.627294493216281, "grad_norm": 3.778937816619873, "learning_rate": 4.695746900085275e-05, "loss": 0.5319, "num_tokens": 205499338.0, "step": 1572 }, { "epoch": 0.6276935355147646, "grad_norm": 0.2437572181224823, "learning_rate": 4.695249716682709e-05, "loss": 0.3676, "num_tokens": 205630410.0, "step": 1573 }, { "epoch": 0.6280925778132482, "grad_norm": 0.2545948028564453, "learning_rate": 4.694752156892455e-05, "loss": 0.3615, "num_tokens": 205761482.0, "step": 1574 }, { "epoch": 0.6284916201117319, "grad_norm": 0.21523945033550262, "learning_rate": 4.6942542208107865e-05, "loss": 0.3377, "num_tokens": 205892554.0, "step": 1575 }, { "epoch": 0.6288906624102155, "grad_norm": 0.2908205986022949, "learning_rate": 4.693755908534052e-05, "loss": 0.3777, "num_tokens": 206023626.0, "step": 1576 }, { "epoch": 0.6292897047086992, "grad_norm": 0.2653482258319855, "learning_rate": 4.6932572201586674e-05, "loss": 0.3843, "num_tokens": 206154698.0, "step": 1577 }, { "epoch": 0.6296887470071828, "grad_norm": 0.2716182470321655, "learning_rate": 4.692758155781125e-05, "loss": 0.3945, "num_tokens": 206285770.0, "step": 1578 }, { "epoch": 0.6300877893056664, "grad_norm": 0.2543533444404602, "learning_rate": 4.6922587154979905e-05, "loss": 0.3731, "num_tokens": 206416842.0, "step": 1579 }, { "epoch": 0.6304868316041501, "grad_norm": 0.2226848304271698, "learning_rate": 4.6917588994059e-05, "loss": 0.3309, "num_tokens": 206547914.0, "step": 1580 }, { "epoch": 0.6308858739026337, "grad_norm": 0.22782780230045319, "learning_rate": 4.691258707601564e-05, "loss": 0.3201, "num_tokens": 206667238.0, "step": 1581 }, { "epoch": 0.6312849162011173, "grad_norm": 0.21581390500068665, "learning_rate": 4.690758140181764e-05, "loss": 0.3264, "num_tokens": 206798310.0, "step": 1582 }, { "epoch": 0.6316839584996009, "grad_norm": 0.22061395645141602, "learning_rate": 4.690257197243354e-05, "loss": 0.3418, "num_tokens": 206929382.0, "step": 1583 }, { "epoch": 0.6320830007980845, "grad_norm": 0.2471616566181183, "learning_rate": 4.689755878883264e-05, "loss": 0.3814, "num_tokens": 207054430.0, "step": 1584 }, { "epoch": 0.6324820430965682, "grad_norm": 0.22427381575107574, "learning_rate": 4.6892541851984937e-05, "loss": 0.3256, "num_tokens": 207185502.0, "step": 1585 }, { "epoch": 0.6328810853950518, "grad_norm": 0.2362249493598938, "learning_rate": 4.688752116286117e-05, "loss": 0.3919, "num_tokens": 207316574.0, "step": 1586 }, { "epoch": 0.6332801276935355, "grad_norm": 0.25508686900138855, "learning_rate": 4.6882496722432774e-05, "loss": 0.4093, "num_tokens": 207447646.0, "step": 1587 }, { "epoch": 0.6336791699920191, "grad_norm": 0.27362537384033203, "learning_rate": 4.687746853167195e-05, "loss": 0.4089, "num_tokens": 207578718.0, "step": 1588 }, { "epoch": 0.6340782122905028, "grad_norm": 0.22248585522174835, "learning_rate": 4.6872436591551595e-05, "loss": 0.319, "num_tokens": 207709790.0, "step": 1589 }, { "epoch": 0.6344772545889864, "grad_norm": 0.2337857335805893, "learning_rate": 4.686740090304534e-05, "loss": 0.3553, "num_tokens": 207840862.0, "step": 1590 }, { "epoch": 0.63487629688747, "grad_norm": 0.2641553580760956, "learning_rate": 4.686236146712757e-05, "loss": 0.3795, "num_tokens": 207971934.0, "step": 1591 }, { "epoch": 0.6352753391859537, "grad_norm": 0.2505098879337311, "learning_rate": 4.6857318284773336e-05, "loss": 0.3543, "num_tokens": 208103006.0, "step": 1592 }, { "epoch": 0.6356743814844373, "grad_norm": 0.25923246145248413, "learning_rate": 4.685227135695847e-05, "loss": 0.3962, "num_tokens": 208234078.0, "step": 1593 }, { "epoch": 0.636073423782921, "grad_norm": 0.2283843457698822, "learning_rate": 4.6847220684659485e-05, "loss": 0.3131, "num_tokens": 208365150.0, "step": 1594 }, { "epoch": 0.6364724660814046, "grad_norm": 0.242123544216156, "learning_rate": 4.6842166268853655e-05, "loss": 0.3788, "num_tokens": 208496222.0, "step": 1595 }, { "epoch": 0.6368715083798883, "grad_norm": 0.253057062625885, "learning_rate": 4.6837108110518956e-05, "loss": 0.3354, "num_tokens": 208627294.0, "step": 1596 }, { "epoch": 0.6372705506783719, "grad_norm": 0.22635065019130707, "learning_rate": 4.68320462106341e-05, "loss": 0.3395, "num_tokens": 208758366.0, "step": 1597 }, { "epoch": 0.6376695929768555, "grad_norm": 0.27500444650650024, "learning_rate": 4.682698057017851e-05, "loss": 0.4157, "num_tokens": 208889438.0, "step": 1598 }, { "epoch": 0.6380686352753392, "grad_norm": 0.2256556749343872, "learning_rate": 4.6821911190132344e-05, "loss": 0.3645, "num_tokens": 209020510.0, "step": 1599 }, { "epoch": 0.6384676775738228, "grad_norm": 0.28667208552360535, "learning_rate": 4.6816838071476475e-05, "loss": 0.4418, "num_tokens": 209151582.0, "step": 1600 }, { "epoch": 0.6388667198723065, "grad_norm": 0.2391665279865265, "learning_rate": 4.681176121519251e-05, "loss": 0.3529, "num_tokens": 209282654.0, "step": 1601 }, { "epoch": 0.6392657621707901, "grad_norm": 0.23740313947200775, "learning_rate": 4.6806680622262786e-05, "loss": 0.3633, "num_tokens": 209413726.0, "step": 1602 }, { "epoch": 0.6396648044692738, "grad_norm": 0.21744932234287262, "learning_rate": 4.680159629367032e-05, "loss": 0.3013, "num_tokens": 209544798.0, "step": 1603 }, { "epoch": 0.6400638467677574, "grad_norm": 0.23906275629997253, "learning_rate": 4.67965082303989e-05, "loss": 0.3635, "num_tokens": 209675870.0, "step": 1604 }, { "epoch": 0.640462889066241, "grad_norm": 0.3791855573654175, "learning_rate": 4.6791416433433025e-05, "loss": 0.4853, "num_tokens": 209806942.0, "step": 1605 }, { "epoch": 0.6408619313647247, "grad_norm": 0.2482745200395584, "learning_rate": 4.6786320903757894e-05, "loss": 0.3758, "num_tokens": 209938014.0, "step": 1606 }, { "epoch": 0.6412609736632083, "grad_norm": 0.251101016998291, "learning_rate": 4.678122164235946e-05, "loss": 0.3367, "num_tokens": 210069086.0, "step": 1607 }, { "epoch": 0.641660015961692, "grad_norm": 0.2855619788169861, "learning_rate": 4.677611865022437e-05, "loss": 0.3597, "num_tokens": 210200158.0, "step": 1608 }, { "epoch": 0.6420590582601756, "grad_norm": 0.22943083941936493, "learning_rate": 4.677101192834001e-05, "loss": 0.35, "num_tokens": 210331230.0, "step": 1609 }, { "epoch": 0.6424581005586593, "grad_norm": 0.2264997661113739, "learning_rate": 4.676590147769449e-05, "loss": 0.3731, "num_tokens": 210462302.0, "step": 1610 }, { "epoch": 0.6428571428571429, "grad_norm": 0.2464066594839096, "learning_rate": 4.676078729927662e-05, "loss": 0.4158, "num_tokens": 210593374.0, "step": 1611 }, { "epoch": 0.6432561851556265, "grad_norm": 0.336844265460968, "learning_rate": 4.6755669394075965e-05, "loss": 0.3461, "num_tokens": 210724446.0, "step": 1612 }, { "epoch": 0.6436552274541102, "grad_norm": 0.22348786890506744, "learning_rate": 4.6750547763082774e-05, "loss": 0.3429, "num_tokens": 210855518.0, "step": 1613 }, { "epoch": 0.6440542697525937, "grad_norm": 0.22455011308193207, "learning_rate": 4.674542240728805e-05, "loss": 0.3487, "num_tokens": 210986590.0, "step": 1614 }, { "epoch": 0.6444533120510774, "grad_norm": 0.2369423508644104, "learning_rate": 4.674029332768348e-05, "loss": 0.3585, "num_tokens": 211117662.0, "step": 1615 }, { "epoch": 0.644852354349561, "grad_norm": 0.24720437824726105, "learning_rate": 4.673516052526152e-05, "loss": 0.3591, "num_tokens": 211248734.0, "step": 1616 }, { "epoch": 0.6452513966480447, "grad_norm": 0.24041809141635895, "learning_rate": 4.67300240010153e-05, "loss": 0.342, "num_tokens": 211379806.0, "step": 1617 }, { "epoch": 0.6456504389465283, "grad_norm": 0.24602262675762177, "learning_rate": 4.672488375593869e-05, "loss": 0.3384, "num_tokens": 211510878.0, "step": 1618 }, { "epoch": 0.6460494812450119, "grad_norm": 0.2317705899477005, "learning_rate": 4.67197397910263e-05, "loss": 0.3481, "num_tokens": 211641950.0, "step": 1619 }, { "epoch": 0.6464485235434956, "grad_norm": 0.23307061195373535, "learning_rate": 4.6714592107273416e-05, "loss": 0.3429, "num_tokens": 211773022.0, "step": 1620 }, { "epoch": 0.6468475658419792, "grad_norm": 0.23166489601135254, "learning_rate": 4.670944070567608e-05, "loss": 0.3185, "num_tokens": 211904094.0, "step": 1621 }, { "epoch": 0.6472466081404629, "grad_norm": 0.23429669439792633, "learning_rate": 4.670428558723103e-05, "loss": 0.3361, "num_tokens": 212035166.0, "step": 1622 }, { "epoch": 0.6476456504389465, "grad_norm": 0.230388343334198, "learning_rate": 4.669912675293574e-05, "loss": 0.3498, "num_tokens": 212166238.0, "step": 1623 }, { "epoch": 0.6480446927374302, "grad_norm": 0.23836855590343475, "learning_rate": 4.6693964203788394e-05, "loss": 0.3609, "num_tokens": 212297310.0, "step": 1624 }, { "epoch": 0.6484437350359138, "grad_norm": 0.22835403680801392, "learning_rate": 4.6688797940787905e-05, "loss": 0.325, "num_tokens": 212428382.0, "step": 1625 }, { "epoch": 0.6488427773343974, "grad_norm": 0.23492023348808289, "learning_rate": 4.668362796493388e-05, "loss": 0.3828, "num_tokens": 212559454.0, "step": 1626 }, { "epoch": 0.6492418196328811, "grad_norm": 0.25201159715652466, "learning_rate": 4.667845427722668e-05, "loss": 0.4274, "num_tokens": 212690526.0, "step": 1627 }, { "epoch": 0.6496408619313647, "grad_norm": 0.2605859935283661, "learning_rate": 4.667327687866736e-05, "loss": 0.4089, "num_tokens": 212821598.0, "step": 1628 }, { "epoch": 0.6500399042298484, "grad_norm": 0.24837395548820496, "learning_rate": 4.666809577025768e-05, "loss": 0.3672, "num_tokens": 212952670.0, "step": 1629 }, { "epoch": 0.650438946528332, "grad_norm": 0.2535875141620636, "learning_rate": 4.666291095300016e-05, "loss": 0.3772, "num_tokens": 213083742.0, "step": 1630 }, { "epoch": 0.6508379888268156, "grad_norm": 0.23214951157569885, "learning_rate": 4.6657722427898005e-05, "loss": 0.3662, "num_tokens": 213214814.0, "step": 1631 }, { "epoch": 0.6512370311252993, "grad_norm": 0.2653745114803314, "learning_rate": 4.665253019595514e-05, "loss": 0.3929, "num_tokens": 213345886.0, "step": 1632 }, { "epoch": 0.6516360734237829, "grad_norm": 0.25630027055740356, "learning_rate": 4.664733425817623e-05, "loss": 0.361, "num_tokens": 213476958.0, "step": 1633 }, { "epoch": 0.6520351157222666, "grad_norm": 0.22960594296455383, "learning_rate": 4.6642134615566624e-05, "loss": 0.3514, "num_tokens": 213608030.0, "step": 1634 }, { "epoch": 0.6524341580207502, "grad_norm": 0.23587915301322937, "learning_rate": 4.663693126913242e-05, "loss": 0.3617, "num_tokens": 213739102.0, "step": 1635 }, { "epoch": 0.6528332003192339, "grad_norm": 0.234225332736969, "learning_rate": 4.663172421988039e-05, "loss": 0.3472, "num_tokens": 213870174.0, "step": 1636 }, { "epoch": 0.6532322426177175, "grad_norm": 0.23192743957042694, "learning_rate": 4.662651346881808e-05, "loss": 0.3434, "num_tokens": 214001246.0, "step": 1637 }, { "epoch": 0.6536312849162011, "grad_norm": 0.24840010702610016, "learning_rate": 4.662129901695371e-05, "loss": 0.3908, "num_tokens": 214132318.0, "step": 1638 }, { "epoch": 0.6540303272146848, "grad_norm": 0.2314678579568863, "learning_rate": 4.661608086529621e-05, "loss": 0.313, "num_tokens": 214263390.0, "step": 1639 }, { "epoch": 0.6544293695131684, "grad_norm": 0.2721535265445709, "learning_rate": 4.661085901485528e-05, "loss": 0.3884, "num_tokens": 214394462.0, "step": 1640 }, { "epoch": 0.6548284118116521, "grad_norm": 0.284516841173172, "learning_rate": 4.6605633466641276e-05, "loss": 0.4486, "num_tokens": 214525534.0, "step": 1641 }, { "epoch": 0.6552274541101357, "grad_norm": 0.20648829638957977, "learning_rate": 4.660040422166529e-05, "loss": 0.3259, "num_tokens": 214656606.0, "step": 1642 }, { "epoch": 0.6556264964086194, "grad_norm": 0.2515023946762085, "learning_rate": 4.659517128093914e-05, "loss": 0.3654, "num_tokens": 214787678.0, "step": 1643 }, { "epoch": 0.656025538707103, "grad_norm": 0.23108167946338654, "learning_rate": 4.658993464547535e-05, "loss": 0.3505, "num_tokens": 214918750.0, "step": 1644 }, { "epoch": 0.6564245810055865, "grad_norm": 0.2224903255701065, "learning_rate": 4.658469431628717e-05, "loss": 0.3629, "num_tokens": 215049822.0, "step": 1645 }, { "epoch": 0.6568236233040702, "grad_norm": 0.2260652482509613, "learning_rate": 4.657945029438854e-05, "loss": 0.3459, "num_tokens": 215180894.0, "step": 1646 }, { "epoch": 0.6572226656025538, "grad_norm": 0.23723602294921875, "learning_rate": 4.657420258079413e-05, "loss": 0.3586, "num_tokens": 215311966.0, "step": 1647 }, { "epoch": 0.6576217079010375, "grad_norm": 0.2272031456232071, "learning_rate": 4.656895117651933e-05, "loss": 0.3334, "num_tokens": 215443038.0, "step": 1648 }, { "epoch": 0.6580207501995211, "grad_norm": 0.24095909297466278, "learning_rate": 4.6563696082580236e-05, "loss": 0.3629, "num_tokens": 215574110.0, "step": 1649 }, { "epoch": 0.6584197924980048, "grad_norm": 0.2375655621290207, "learning_rate": 4.655843729999366e-05, "loss": 0.3645, "num_tokens": 215705182.0, "step": 1650 }, { "epoch": 0.6588188347964884, "grad_norm": 0.2946808338165283, "learning_rate": 4.655317482977713e-05, "loss": 0.4599, "num_tokens": 215836254.0, "step": 1651 }, { "epoch": 0.659217877094972, "grad_norm": 0.26911821961402893, "learning_rate": 4.654790867294887e-05, "loss": 0.3979, "num_tokens": 215967326.0, "step": 1652 }, { "epoch": 0.6596169193934557, "grad_norm": 0.2272089719772339, "learning_rate": 4.6542638830527854e-05, "loss": 0.3645, "num_tokens": 216098398.0, "step": 1653 }, { "epoch": 0.6600159616919393, "grad_norm": 0.28162631392478943, "learning_rate": 4.653736530353374e-05, "loss": 0.4079, "num_tokens": 216229470.0, "step": 1654 }, { "epoch": 0.660415003990423, "grad_norm": 0.239335834980011, "learning_rate": 4.653208809298689e-05, "loss": 0.375, "num_tokens": 216360542.0, "step": 1655 }, { "epoch": 0.6608140462889066, "grad_norm": 0.2283809334039688, "learning_rate": 4.652680719990842e-05, "loss": 0.3474, "num_tokens": 216491614.0, "step": 1656 }, { "epoch": 0.6612130885873903, "grad_norm": 0.21890071034431458, "learning_rate": 4.652152262532013e-05, "loss": 0.3227, "num_tokens": 216622686.0, "step": 1657 }, { "epoch": 0.6616121308858739, "grad_norm": 0.22649848461151123, "learning_rate": 4.651623437024451e-05, "loss": 0.3654, "num_tokens": 216753758.0, "step": 1658 }, { "epoch": 0.6620111731843575, "grad_norm": 0.5225958228111267, "learning_rate": 4.651094243570482e-05, "loss": 0.3223, "num_tokens": 216884830.0, "step": 1659 }, { "epoch": 0.6624102154828412, "grad_norm": 0.26485684514045715, "learning_rate": 4.650564682272498e-05, "loss": 0.3889, "num_tokens": 217015902.0, "step": 1660 }, { "epoch": 0.6628092577813248, "grad_norm": 0.2709636092185974, "learning_rate": 4.650034753232965e-05, "loss": 0.3868, "num_tokens": 217146974.0, "step": 1661 }, { "epoch": 0.6632083000798085, "grad_norm": 0.24957673251628876, "learning_rate": 4.64950445655442e-05, "loss": 0.3601, "num_tokens": 217270624.0, "step": 1662 }, { "epoch": 0.6636073423782921, "grad_norm": 0.34965449571609497, "learning_rate": 4.648973792339468e-05, "loss": 0.4559, "num_tokens": 217401696.0, "step": 1663 }, { "epoch": 0.6640063846767758, "grad_norm": 0.23266594111919403, "learning_rate": 4.64844276069079e-05, "loss": 0.3426, "num_tokens": 217532768.0, "step": 1664 }, { "epoch": 0.6644054269752594, "grad_norm": 0.23502330482006073, "learning_rate": 4.647911361711135e-05, "loss": 0.3464, "num_tokens": 217663840.0, "step": 1665 }, { "epoch": 0.664804469273743, "grad_norm": 0.25729721784591675, "learning_rate": 4.6473795955033234e-05, "loss": 0.4211, "num_tokens": 217794912.0, "step": 1666 }, { "epoch": 0.6652035115722267, "grad_norm": 0.2385796755552292, "learning_rate": 4.646847462170247e-05, "loss": 0.358, "num_tokens": 217925984.0, "step": 1667 }, { "epoch": 0.6656025538707103, "grad_norm": 0.2077520489692688, "learning_rate": 4.646314961814869e-05, "loss": 0.2953, "num_tokens": 218057056.0, "step": 1668 }, { "epoch": 0.666001596169194, "grad_norm": 0.2532923221588135, "learning_rate": 4.645782094540224e-05, "loss": 0.384, "num_tokens": 218188128.0, "step": 1669 }, { "epoch": 0.6664006384676776, "grad_norm": 0.2577701210975647, "learning_rate": 4.645248860449415e-05, "loss": 0.3727, "num_tokens": 218319200.0, "step": 1670 }, { "epoch": 0.6667996807661613, "grad_norm": 0.2568991482257843, "learning_rate": 4.644715259645619e-05, "loss": 0.3737, "num_tokens": 218450272.0, "step": 1671 }, { "epoch": 0.6671987230646449, "grad_norm": 0.23131567239761353, "learning_rate": 4.6441812922320826e-05, "loss": 0.3328, "num_tokens": 218581344.0, "step": 1672 }, { "epoch": 0.6675977653631285, "grad_norm": 0.23201270401477814, "learning_rate": 4.6436469583121235e-05, "loss": 0.3316, "num_tokens": 218712416.0, "step": 1673 }, { "epoch": 0.6679968076616122, "grad_norm": 0.20986998081207275, "learning_rate": 4.64311225798913e-05, "loss": 0.287, "num_tokens": 218843488.0, "step": 1674 }, { "epoch": 0.6683958499600958, "grad_norm": 0.2542083263397217, "learning_rate": 4.642577191366562e-05, "loss": 0.3739, "num_tokens": 218974560.0, "step": 1675 }, { "epoch": 0.6687948922585794, "grad_norm": 0.23834894597530365, "learning_rate": 4.64204175854795e-05, "loss": 0.342, "num_tokens": 219105632.0, "step": 1676 }, { "epoch": 0.669193934557063, "grad_norm": 0.2571513056755066, "learning_rate": 4.641505959636896e-05, "loss": 0.3893, "num_tokens": 219236704.0, "step": 1677 }, { "epoch": 0.6695929768555466, "grad_norm": 0.2579937279224396, "learning_rate": 4.64096979473707e-05, "loss": 0.3316, "num_tokens": 219367776.0, "step": 1678 }, { "epoch": 0.6699920191540303, "grad_norm": 0.23566696047782898, "learning_rate": 4.640433263952217e-05, "loss": 0.3623, "num_tokens": 219498848.0, "step": 1679 }, { "epoch": 0.6703910614525139, "grad_norm": 0.2572188675403595, "learning_rate": 4.639896367386149e-05, "loss": 0.3925, "num_tokens": 219629920.0, "step": 1680 }, { "epoch": 0.6707901037509976, "grad_norm": 0.21755632758140564, "learning_rate": 4.6393591051427515e-05, "loss": 0.3525, "num_tokens": 219760992.0, "step": 1681 }, { "epoch": 0.6711891460494812, "grad_norm": 0.23145905137062073, "learning_rate": 4.6388214773259795e-05, "loss": 0.3686, "num_tokens": 219892064.0, "step": 1682 }, { "epoch": 0.6715881883479649, "grad_norm": 0.2318158745765686, "learning_rate": 4.638283484039859e-05, "loss": 0.3613, "num_tokens": 220023136.0, "step": 1683 }, { "epoch": 0.6719872306464485, "grad_norm": 0.2489011287689209, "learning_rate": 4.637745125388488e-05, "loss": 0.3898, "num_tokens": 220154208.0, "step": 1684 }, { "epoch": 0.6723862729449321, "grad_norm": 0.239585280418396, "learning_rate": 4.6372064014760306e-05, "loss": 0.4224, "num_tokens": 220285280.0, "step": 1685 }, { "epoch": 0.6727853152434158, "grad_norm": 0.23236766457557678, "learning_rate": 4.6366673124067285e-05, "loss": 0.3375, "num_tokens": 220416352.0, "step": 1686 }, { "epoch": 0.6731843575418994, "grad_norm": 0.24872757494449615, "learning_rate": 4.636127858284887e-05, "loss": 0.3758, "num_tokens": 220547424.0, "step": 1687 }, { "epoch": 0.6735833998403831, "grad_norm": 0.2415829449892044, "learning_rate": 4.635588039214888e-05, "loss": 0.3811, "num_tokens": 220678496.0, "step": 1688 }, { "epoch": 0.6739824421388667, "grad_norm": 0.23774276673793793, "learning_rate": 4.635047855301181e-05, "loss": 0.3564, "num_tokens": 220809568.0, "step": 1689 }, { "epoch": 0.6743814844373504, "grad_norm": 0.24710381031036377, "learning_rate": 4.634507306648286e-05, "loss": 0.3722, "num_tokens": 220940640.0, "step": 1690 }, { "epoch": 0.674780526735834, "grad_norm": 0.21785998344421387, "learning_rate": 4.633966393360794e-05, "loss": 0.3362, "num_tokens": 221071712.0, "step": 1691 }, { "epoch": 0.6751795690343176, "grad_norm": 0.21657221019268036, "learning_rate": 4.633425115543368e-05, "loss": 0.3404, "num_tokens": 221202784.0, "step": 1692 }, { "epoch": 0.6755786113328013, "grad_norm": 0.26867467164993286, "learning_rate": 4.632883473300738e-05, "loss": 0.4332, "num_tokens": 221333856.0, "step": 1693 }, { "epoch": 0.6759776536312849, "grad_norm": 0.2684285342693329, "learning_rate": 4.632341466737708e-05, "loss": 0.4628, "num_tokens": 221464928.0, "step": 1694 }, { "epoch": 0.6763766959297686, "grad_norm": 0.23236003518104553, "learning_rate": 4.6317990959591515e-05, "loss": 0.3674, "num_tokens": 221596000.0, "step": 1695 }, { "epoch": 0.6767757382282522, "grad_norm": 0.2114923745393753, "learning_rate": 4.631256361070012e-05, "loss": 0.3368, "num_tokens": 221727072.0, "step": 1696 }, { "epoch": 0.6771747805267359, "grad_norm": 0.25512203574180603, "learning_rate": 4.630713262175304e-05, "loss": 0.4299, "num_tokens": 221858144.0, "step": 1697 }, { "epoch": 0.6775738228252195, "grad_norm": 0.23966360092163086, "learning_rate": 4.63016979938011e-05, "loss": 0.3949, "num_tokens": 221989216.0, "step": 1698 }, { "epoch": 0.6779728651237031, "grad_norm": 0.24683435261249542, "learning_rate": 4.629625972789588e-05, "loss": 0.367, "num_tokens": 222120288.0, "step": 1699 }, { "epoch": 0.6783719074221868, "grad_norm": 0.22370901703834534, "learning_rate": 4.629081782508961e-05, "loss": 0.3536, "num_tokens": 222251360.0, "step": 1700 }, { "epoch": 0.6787709497206704, "grad_norm": 0.2461879402399063, "learning_rate": 4.628537228643526e-05, "loss": 0.3282, "num_tokens": 222382432.0, "step": 1701 }, { "epoch": 0.6791699920191541, "grad_norm": 0.22130267322063446, "learning_rate": 4.6279923112986484e-05, "loss": 0.326, "num_tokens": 222513504.0, "step": 1702 }, { "epoch": 0.6795690343176377, "grad_norm": 0.23597007989883423, "learning_rate": 4.6274470305797654e-05, "loss": 0.3638, "num_tokens": 222644576.0, "step": 1703 }, { "epoch": 0.6799680766161214, "grad_norm": 0.22881248593330383, "learning_rate": 4.626901386592384e-05, "loss": 0.3122, "num_tokens": 222775648.0, "step": 1704 }, { "epoch": 0.680367118914605, "grad_norm": 0.2494865208864212, "learning_rate": 4.62635537944208e-05, "loss": 0.3615, "num_tokens": 222906720.0, "step": 1705 }, { "epoch": 0.6807661612130886, "grad_norm": 0.27674156427383423, "learning_rate": 4.6258090092345005e-05, "loss": 0.3753, "num_tokens": 223037792.0, "step": 1706 }, { "epoch": 0.6811652035115723, "grad_norm": 0.28793367743492126, "learning_rate": 4.625262276075364e-05, "loss": 0.4207, "num_tokens": 223168864.0, "step": 1707 }, { "epoch": 0.6815642458100558, "grad_norm": 0.2472468912601471, "learning_rate": 4.624715180070459e-05, "loss": 0.3624, "num_tokens": 223299936.0, "step": 1708 }, { "epoch": 0.6819632881085395, "grad_norm": 0.22885762155056, "learning_rate": 4.624167721325642e-05, "loss": 0.3259, "num_tokens": 223431008.0, "step": 1709 }, { "epoch": 0.6823623304070231, "grad_norm": 0.23651358485221863, "learning_rate": 4.623619899946841e-05, "loss": 0.3643, "num_tokens": 223562080.0, "step": 1710 }, { "epoch": 0.6827613727055067, "grad_norm": 0.24007350206375122, "learning_rate": 4.623071716040056e-05, "loss": 0.4105, "num_tokens": 223693152.0, "step": 1711 }, { "epoch": 0.6831604150039904, "grad_norm": 0.2262740135192871, "learning_rate": 4.622523169711354e-05, "loss": 0.3254, "num_tokens": 223824224.0, "step": 1712 }, { "epoch": 0.683559457302474, "grad_norm": 0.23368822038173676, "learning_rate": 4.6219742610668744e-05, "loss": 0.3528, "num_tokens": 223955296.0, "step": 1713 }, { "epoch": 0.6839584996009577, "grad_norm": 0.24881744384765625, "learning_rate": 4.6214249902128256e-05, "loss": 0.361, "num_tokens": 224086368.0, "step": 1714 }, { "epoch": 0.6843575418994413, "grad_norm": 0.22603969275951385, "learning_rate": 4.6208753572554864e-05, "loss": 0.331, "num_tokens": 224217440.0, "step": 1715 }, { "epoch": 0.684756584197925, "grad_norm": 0.2653196156024933, "learning_rate": 4.620325362301205e-05, "loss": 0.3963, "num_tokens": 224348512.0, "step": 1716 }, { "epoch": 0.6851556264964086, "grad_norm": 0.2195960283279419, "learning_rate": 4.6197750054564016e-05, "loss": 0.3164, "num_tokens": 224479584.0, "step": 1717 }, { "epoch": 0.6855546687948922, "grad_norm": 0.2259628027677536, "learning_rate": 4.619224286827565e-05, "loss": 0.3468, "num_tokens": 224610656.0, "step": 1718 }, { "epoch": 0.6859537110933759, "grad_norm": 0.234134703874588, "learning_rate": 4.618673206521252e-05, "loss": 0.3695, "num_tokens": 224741728.0, "step": 1719 }, { "epoch": 0.6863527533918595, "grad_norm": 0.23327548801898956, "learning_rate": 4.618121764644095e-05, "loss": 0.3485, "num_tokens": 224872800.0, "step": 1720 }, { "epoch": 0.6867517956903432, "grad_norm": 0.23316733539104462, "learning_rate": 4.6175699613027895e-05, "loss": 0.3105, "num_tokens": 225003872.0, "step": 1721 }, { "epoch": 0.6871508379888268, "grad_norm": 0.24678529798984528, "learning_rate": 4.6170177966041054e-05, "loss": 0.3641, "num_tokens": 225134944.0, "step": 1722 }, { "epoch": 0.6875498802873105, "grad_norm": 0.20099812746047974, "learning_rate": 4.616465270654883e-05, "loss": 0.2801, "num_tokens": 225266016.0, "step": 1723 }, { "epoch": 0.6879489225857941, "grad_norm": 0.27121099829673767, "learning_rate": 4.6159123835620296e-05, "loss": 0.3797, "num_tokens": 225397088.0, "step": 1724 }, { "epoch": 0.6883479648842777, "grad_norm": 0.2294893115758896, "learning_rate": 4.6153591354325225e-05, "loss": 0.2931, "num_tokens": 225528160.0, "step": 1725 }, { "epoch": 0.6887470071827614, "grad_norm": 0.2893443703651428, "learning_rate": 4.6148055263734126e-05, "loss": 0.3996, "num_tokens": 225659232.0, "step": 1726 }, { "epoch": 0.689146049481245, "grad_norm": 0.25916802883148193, "learning_rate": 4.6142515564918163e-05, "loss": 0.3647, "num_tokens": 225790304.0, "step": 1727 }, { "epoch": 0.6895450917797287, "grad_norm": 0.2524445354938507, "learning_rate": 4.6136972258949226e-05, "loss": 0.3884, "num_tokens": 225921376.0, "step": 1728 }, { "epoch": 0.6899441340782123, "grad_norm": 0.23450325429439545, "learning_rate": 4.613142534689989e-05, "loss": 0.3462, "num_tokens": 226052448.0, "step": 1729 }, { "epoch": 0.690343176376696, "grad_norm": 0.24412497878074646, "learning_rate": 4.6125874829843424e-05, "loss": 0.3549, "num_tokens": 226183520.0, "step": 1730 }, { "epoch": 0.6907422186751796, "grad_norm": 0.2524564862251282, "learning_rate": 4.612032070885381e-05, "loss": 0.4029, "num_tokens": 226314592.0, "step": 1731 }, { "epoch": 0.6911412609736632, "grad_norm": 0.2615528404712677, "learning_rate": 4.6114762985005714e-05, "loss": 0.4025, "num_tokens": 226445664.0, "step": 1732 }, { "epoch": 0.6915403032721469, "grad_norm": 0.23237405717372894, "learning_rate": 4.6109201659374504e-05, "loss": 0.339, "num_tokens": 226576736.0, "step": 1733 }, { "epoch": 0.6919393455706305, "grad_norm": 0.24597306549549103, "learning_rate": 4.6103636733036247e-05, "loss": 0.3481, "num_tokens": 226707808.0, "step": 1734 }, { "epoch": 0.6923383878691142, "grad_norm": 0.24529610574245453, "learning_rate": 4.6098068207067705e-05, "loss": 0.3705, "num_tokens": 226838880.0, "step": 1735 }, { "epoch": 0.6927374301675978, "grad_norm": 0.2518002688884735, "learning_rate": 4.6092496082546324e-05, "loss": 0.4173, "num_tokens": 226969952.0, "step": 1736 }, { "epoch": 0.6931364724660815, "grad_norm": 0.22408662736415863, "learning_rate": 4.608692036055028e-05, "loss": 0.3146, "num_tokens": 227101024.0, "step": 1737 }, { "epoch": 0.6935355147645651, "grad_norm": 0.26306313276290894, "learning_rate": 4.6081341042158396e-05, "loss": 0.4027, "num_tokens": 227232096.0, "step": 1738 }, { "epoch": 0.6939345570630486, "grad_norm": 0.2547127306461334, "learning_rate": 4.607575812845026e-05, "loss": 0.3829, "num_tokens": 227363168.0, "step": 1739 }, { "epoch": 0.6943335993615323, "grad_norm": 0.2371472716331482, "learning_rate": 4.6070171620506066e-05, "loss": 0.3524, "num_tokens": 227494240.0, "step": 1740 }, { "epoch": 0.6947326416600159, "grad_norm": 0.2300252616405487, "learning_rate": 4.606458151940678e-05, "loss": 0.3194, "num_tokens": 227625312.0, "step": 1741 }, { "epoch": 0.6951316839584996, "grad_norm": 0.23518584668636322, "learning_rate": 4.605898782623402e-05, "loss": 0.3538, "num_tokens": 227756384.0, "step": 1742 }, { "epoch": 0.6955307262569832, "grad_norm": 0.2358332872390747, "learning_rate": 4.605339054207013e-05, "loss": 0.3623, "num_tokens": 227887456.0, "step": 1743 }, { "epoch": 0.6959297685554668, "grad_norm": 0.2451018989086151, "learning_rate": 4.6047789667998114e-05, "loss": 0.3696, "num_tokens": 228018528.0, "step": 1744 }, { "epoch": 0.6963288108539505, "grad_norm": 0.22617052495479584, "learning_rate": 4.604218520510169e-05, "loss": 0.3032, "num_tokens": 228149600.0, "step": 1745 }, { "epoch": 0.6967278531524341, "grad_norm": 0.2325049787759781, "learning_rate": 4.60365771544653e-05, "loss": 0.3507, "num_tokens": 228280672.0, "step": 1746 }, { "epoch": 0.6971268954509178, "grad_norm": 0.257304310798645, "learning_rate": 4.603096551717401e-05, "loss": 0.335, "num_tokens": 228411744.0, "step": 1747 }, { "epoch": 0.6975259377494014, "grad_norm": 0.2238224595785141, "learning_rate": 4.6025350294313644e-05, "loss": 0.3538, "num_tokens": 228542816.0, "step": 1748 }, { "epoch": 0.6979249800478851, "grad_norm": 0.24686166644096375, "learning_rate": 4.6019731486970674e-05, "loss": 0.3756, "num_tokens": 228673888.0, "step": 1749 }, { "epoch": 0.6983240223463687, "grad_norm": 0.21703077852725983, "learning_rate": 4.6014109096232316e-05, "loss": 0.3196, "num_tokens": 228804960.0, "step": 1750 }, { "epoch": 0.6987230646448523, "grad_norm": 0.24004100263118744, "learning_rate": 4.6008483123186426e-05, "loss": 0.3494, "num_tokens": 228936032.0, "step": 1751 }, { "epoch": 0.699122106943336, "grad_norm": 0.23596657812595367, "learning_rate": 4.600285356892158e-05, "loss": 0.37, "num_tokens": 229067104.0, "step": 1752 }, { "epoch": 0.6995211492418196, "grad_norm": 0.25050097703933716, "learning_rate": 4.5997220434527064e-05, "loss": 0.3411, "num_tokens": 229183832.0, "step": 1753 }, { "epoch": 0.6999201915403033, "grad_norm": 0.23883187770843506, "learning_rate": 4.599158372109282e-05, "loss": 0.3851, "num_tokens": 229314904.0, "step": 1754 }, { "epoch": 0.7003192338387869, "grad_norm": 0.20599155128002167, "learning_rate": 4.598594342970949e-05, "loss": 0.2913, "num_tokens": 229445976.0, "step": 1755 }, { "epoch": 0.7007182761372706, "grad_norm": 0.2402682900428772, "learning_rate": 4.598029956146844e-05, "loss": 0.3597, "num_tokens": 229577048.0, "step": 1756 }, { "epoch": 0.7011173184357542, "grad_norm": 0.24023284018039703, "learning_rate": 4.59746521174617e-05, "loss": 0.3618, "num_tokens": 229708120.0, "step": 1757 }, { "epoch": 0.7015163607342378, "grad_norm": 0.2627139985561371, "learning_rate": 4.596900109878198e-05, "loss": 0.4415, "num_tokens": 229839192.0, "step": 1758 }, { "epoch": 0.7019154030327215, "grad_norm": 0.21663996577262878, "learning_rate": 4.596334650652273e-05, "loss": 0.3294, "num_tokens": 229970264.0, "step": 1759 }, { "epoch": 0.7023144453312051, "grad_norm": 0.23724661767482758, "learning_rate": 4.5957688341778034e-05, "loss": 0.3425, "num_tokens": 230100097.0, "step": 1760 }, { "epoch": 0.7027134876296888, "grad_norm": 0.2263474315404892, "learning_rate": 4.59520266056427e-05, "loss": 0.3503, "num_tokens": 230231169.0, "step": 1761 }, { "epoch": 0.7031125299281724, "grad_norm": 0.23182253539562225, "learning_rate": 4.594636129921223e-05, "loss": 0.3698, "num_tokens": 230362241.0, "step": 1762 }, { "epoch": 0.7035115722266561, "grad_norm": 0.22563281655311584, "learning_rate": 4.59406924235828e-05, "loss": 0.3526, "num_tokens": 230493313.0, "step": 1763 }, { "epoch": 0.7039106145251397, "grad_norm": 0.2249167263507843, "learning_rate": 4.5935019979851296e-05, "loss": 0.3079, "num_tokens": 230624385.0, "step": 1764 }, { "epoch": 0.7043096568236233, "grad_norm": 0.2428683042526245, "learning_rate": 4.5929343969115276e-05, "loss": 0.3576, "num_tokens": 230755457.0, "step": 1765 }, { "epoch": 0.704708699122107, "grad_norm": 0.21869781613349915, "learning_rate": 4.5923664392472984e-05, "loss": 0.3164, "num_tokens": 230886529.0, "step": 1766 }, { "epoch": 0.7051077414205906, "grad_norm": 0.24172109365463257, "learning_rate": 4.591798125102338e-05, "loss": 0.373, "num_tokens": 231017601.0, "step": 1767 }, { "epoch": 0.7055067837190743, "grad_norm": 0.262692391872406, "learning_rate": 4.5912294545866095e-05, "loss": 0.3631, "num_tokens": 231133005.0, "step": 1768 }, { "epoch": 0.7059058260175579, "grad_norm": 0.23104453086853027, "learning_rate": 4.590660427810146e-05, "loss": 0.3456, "num_tokens": 231264077.0, "step": 1769 }, { "epoch": 0.7063048683160414, "grad_norm": 0.2304685264825821, "learning_rate": 4.590091044883047e-05, "loss": 0.3565, "num_tokens": 231395149.0, "step": 1770 }, { "epoch": 0.7067039106145251, "grad_norm": 0.22152093052864075, "learning_rate": 4.589521305915484e-05, "loss": 0.3018, "num_tokens": 231526221.0, "step": 1771 }, { "epoch": 0.7071029529130087, "grad_norm": 0.26727649569511414, "learning_rate": 4.5889512110176965e-05, "loss": 0.4195, "num_tokens": 231657293.0, "step": 1772 }, { "epoch": 0.7075019952114924, "grad_norm": 0.23151135444641113, "learning_rate": 4.588380760299992e-05, "loss": 0.3346, "num_tokens": 231788365.0, "step": 1773 }, { "epoch": 0.707901037509976, "grad_norm": 0.25721821188926697, "learning_rate": 4.587809953872748e-05, "loss": 0.3694, "num_tokens": 231919437.0, "step": 1774 }, { "epoch": 0.7083000798084597, "grad_norm": 0.2473062127828598, "learning_rate": 4.5872387918464085e-05, "loss": 0.3606, "num_tokens": 232050509.0, "step": 1775 }, { "epoch": 0.7086991221069433, "grad_norm": 0.22233366966247559, "learning_rate": 4.5866672743314904e-05, "loss": 0.3218, "num_tokens": 232181581.0, "step": 1776 }, { "epoch": 0.709098164405427, "grad_norm": 0.25084763765335083, "learning_rate": 4.586095401438576e-05, "loss": 0.369, "num_tokens": 232312653.0, "step": 1777 }, { "epoch": 0.7094972067039106, "grad_norm": 0.22894692420959473, "learning_rate": 4.5855231732783164e-05, "loss": 0.3452, "num_tokens": 232443725.0, "step": 1778 }, { "epoch": 0.7098962490023942, "grad_norm": 0.22168545424938202, "learning_rate": 4.584950589961435e-05, "loss": 0.3409, "num_tokens": 232574797.0, "step": 1779 }, { "epoch": 0.7102952913008779, "grad_norm": 0.25615739822387695, "learning_rate": 4.584377651598718e-05, "loss": 0.3714, "num_tokens": 232705869.0, "step": 1780 }, { "epoch": 0.7106943335993615, "grad_norm": 0.2861572206020355, "learning_rate": 4.583804358301025e-05, "loss": 0.4077, "num_tokens": 232836941.0, "step": 1781 }, { "epoch": 0.7110933758978452, "grad_norm": 0.2305481880903244, "learning_rate": 4.5832307101792846e-05, "loss": 0.3443, "num_tokens": 232968013.0, "step": 1782 }, { "epoch": 0.7114924181963288, "grad_norm": 0.2276100069284439, "learning_rate": 4.58265670734449e-05, "loss": 0.3131, "num_tokens": 233099085.0, "step": 1783 }, { "epoch": 0.7118914604948124, "grad_norm": 0.2334500402212143, "learning_rate": 4.582082349907707e-05, "loss": 0.353, "num_tokens": 233230157.0, "step": 1784 }, { "epoch": 0.7122905027932961, "grad_norm": 0.24626420438289642, "learning_rate": 4.581507637980067e-05, "loss": 0.3158, "num_tokens": 233361229.0, "step": 1785 }, { "epoch": 0.7126895450917797, "grad_norm": 0.2924588918685913, "learning_rate": 4.580932571672773e-05, "loss": 0.3755, "num_tokens": 233492301.0, "step": 1786 }, { "epoch": 0.7130885873902634, "grad_norm": 0.2639521062374115, "learning_rate": 4.5803571510970935e-05, "loss": 0.3246, "num_tokens": 233623373.0, "step": 1787 }, { "epoch": 0.713487629688747, "grad_norm": 0.2731224298477173, "learning_rate": 4.579781376364368e-05, "loss": 0.4193, "num_tokens": 233754445.0, "step": 1788 }, { "epoch": 0.7138866719872307, "grad_norm": 0.253856897354126, "learning_rate": 4.579205247586003e-05, "loss": 0.3546, "num_tokens": 233885517.0, "step": 1789 }, { "epoch": 0.7142857142857143, "grad_norm": 0.276713490486145, "learning_rate": 4.578628764873473e-05, "loss": 0.4021, "num_tokens": 234016589.0, "step": 1790 }, { "epoch": 0.7146847565841979, "grad_norm": 0.24497100710868835, "learning_rate": 4.5780519283383246e-05, "loss": 0.3529, "num_tokens": 234147661.0, "step": 1791 }, { "epoch": 0.7150837988826816, "grad_norm": 0.23276549577713013, "learning_rate": 4.5774747380921697e-05, "loss": 0.3615, "num_tokens": 234278733.0, "step": 1792 }, { "epoch": 0.7154828411811652, "grad_norm": 0.2275933474302292, "learning_rate": 4.576897194246688e-05, "loss": 0.3214, "num_tokens": 234409805.0, "step": 1793 }, { "epoch": 0.7158818834796489, "grad_norm": 0.23631592094898224, "learning_rate": 4.5763192969136284e-05, "loss": 0.3331, "num_tokens": 234540877.0, "step": 1794 }, { "epoch": 0.7162809257781325, "grad_norm": 0.24897609651088715, "learning_rate": 4.5757410462048095e-05, "loss": 0.3332, "num_tokens": 234671949.0, "step": 1795 }, { "epoch": 0.7166799680766162, "grad_norm": 0.22607001662254333, "learning_rate": 4.5751624422321185e-05, "loss": 0.3382, "num_tokens": 234803021.0, "step": 1796 }, { "epoch": 0.7170790103750998, "grad_norm": 0.2812206745147705, "learning_rate": 4.574583485107509e-05, "loss": 0.414, "num_tokens": 234934093.0, "step": 1797 }, { "epoch": 0.7174780526735834, "grad_norm": 0.23856057226657867, "learning_rate": 4.574004174943002e-05, "loss": 0.3434, "num_tokens": 235065165.0, "step": 1798 }, { "epoch": 0.7178770949720671, "grad_norm": 0.23502303659915924, "learning_rate": 4.573424511850692e-05, "loss": 0.3475, "num_tokens": 235196237.0, "step": 1799 }, { "epoch": 0.7182761372705507, "grad_norm": 0.24361827969551086, "learning_rate": 4.5728444959427356e-05, "loss": 0.3031, "num_tokens": 235327309.0, "step": 1800 }, { "epoch": 0.7186751795690344, "grad_norm": 0.22514690458774567, "learning_rate": 4.5722641273313626e-05, "loss": 0.3181, "num_tokens": 235458381.0, "step": 1801 }, { "epoch": 0.7190742218675179, "grad_norm": 0.23050439357757568, "learning_rate": 4.571683406128867e-05, "loss": 0.3128, "num_tokens": 235589453.0, "step": 1802 }, { "epoch": 0.7194732641660015, "grad_norm": 0.27378159761428833, "learning_rate": 4.571102332447614e-05, "loss": 0.3856, "num_tokens": 235720525.0, "step": 1803 }, { "epoch": 0.7198723064644852, "grad_norm": 0.2892305254936218, "learning_rate": 4.570520906400037e-05, "loss": 0.4171, "num_tokens": 235851597.0, "step": 1804 }, { "epoch": 0.7202713487629688, "grad_norm": 0.22285079956054688, "learning_rate": 4.5699391280986335e-05, "loss": 0.3339, "num_tokens": 235982669.0, "step": 1805 }, { "epoch": 0.7206703910614525, "grad_norm": 0.2242184579372406, "learning_rate": 4.569356997655975e-05, "loss": 0.3273, "num_tokens": 236113741.0, "step": 1806 }, { "epoch": 0.7210694333599361, "grad_norm": 0.2378096580505371, "learning_rate": 4.568774515184697e-05, "loss": 0.3507, "num_tokens": 236244813.0, "step": 1807 }, { "epoch": 0.7214684756584198, "grad_norm": 0.22739417850971222, "learning_rate": 4.568191680797504e-05, "loss": 0.3426, "num_tokens": 236375885.0, "step": 1808 }, { "epoch": 0.7218675179569034, "grad_norm": 0.238536074757576, "learning_rate": 4.567608494607171e-05, "loss": 0.3564, "num_tokens": 236506957.0, "step": 1809 }, { "epoch": 0.722266560255387, "grad_norm": 0.24004794657230377, "learning_rate": 4.567024956726538e-05, "loss": 0.3377, "num_tokens": 236638029.0, "step": 1810 }, { "epoch": 0.7226656025538707, "grad_norm": 0.21935732662677765, "learning_rate": 4.5664410672685123e-05, "loss": 0.3055, "num_tokens": 236763557.0, "step": 1811 }, { "epoch": 0.7230646448523543, "grad_norm": 0.25351276993751526, "learning_rate": 4.5658568263460744e-05, "loss": 0.3876, "num_tokens": 236894629.0, "step": 1812 }, { "epoch": 0.723463687150838, "grad_norm": 0.2356441617012024, "learning_rate": 4.5652722340722673e-05, "loss": 0.3281, "num_tokens": 237025701.0, "step": 1813 }, { "epoch": 0.7238627294493216, "grad_norm": 0.23593054711818695, "learning_rate": 4.564687290560204e-05, "loss": 0.3275, "num_tokens": 237156773.0, "step": 1814 }, { "epoch": 0.7242617717478053, "grad_norm": 0.2542988657951355, "learning_rate": 4.564101995923067e-05, "loss": 0.3687, "num_tokens": 237287845.0, "step": 1815 }, { "epoch": 0.7246608140462889, "grad_norm": 0.23886166512966156, "learning_rate": 4.563516350274104e-05, "loss": 0.3315, "num_tokens": 237408447.0, "step": 1816 }, { "epoch": 0.7250598563447725, "grad_norm": 0.23280715942382812, "learning_rate": 4.562930353726633e-05, "loss": 0.3446, "num_tokens": 237539519.0, "step": 1817 }, { "epoch": 0.7254588986432562, "grad_norm": 0.22535935044288635, "learning_rate": 4.562344006394038e-05, "loss": 0.3345, "num_tokens": 237670591.0, "step": 1818 }, { "epoch": 0.7258579409417398, "grad_norm": 0.22868894040584564, "learning_rate": 4.561757308389773e-05, "loss": 0.3462, "num_tokens": 237801663.0, "step": 1819 }, { "epoch": 0.7262569832402235, "grad_norm": 0.23989373445510864, "learning_rate": 4.5611702598273575e-05, "loss": 0.3456, "num_tokens": 237932735.0, "step": 1820 }, { "epoch": 0.7266560255387071, "grad_norm": 0.2650548219680786, "learning_rate": 4.560582860820381e-05, "loss": 0.4059, "num_tokens": 238063807.0, "step": 1821 }, { "epoch": 0.7270550678371908, "grad_norm": 0.28703582286834717, "learning_rate": 4.5599951114824975e-05, "loss": 0.3999, "num_tokens": 238194879.0, "step": 1822 }, { "epoch": 0.7274541101356744, "grad_norm": 0.22445157170295715, "learning_rate": 4.559407011927433e-05, "loss": 0.3352, "num_tokens": 238325951.0, "step": 1823 }, { "epoch": 0.727853152434158, "grad_norm": 0.23498739302158356, "learning_rate": 4.55881856226898e-05, "loss": 0.3273, "num_tokens": 238457023.0, "step": 1824 }, { "epoch": 0.7282521947326417, "grad_norm": 0.2745290994644165, "learning_rate": 4.558229762620995e-05, "loss": 0.4463, "num_tokens": 238573085.0, "step": 1825 }, { "epoch": 0.7286512370311253, "grad_norm": 0.23476935923099518, "learning_rate": 4.557640613097408e-05, "loss": 0.344, "num_tokens": 238704157.0, "step": 1826 }, { "epoch": 0.729050279329609, "grad_norm": 0.2721022963523865, "learning_rate": 4.557051113812212e-05, "loss": 0.3558, "num_tokens": 238835229.0, "step": 1827 }, { "epoch": 0.7294493216280926, "grad_norm": 0.23970894515514374, "learning_rate": 4.5564612648794725e-05, "loss": 0.3821, "num_tokens": 238966301.0, "step": 1828 }, { "epoch": 0.7298483639265763, "grad_norm": 0.2265576869249344, "learning_rate": 4.555871066413316e-05, "loss": 0.3301, "num_tokens": 239097373.0, "step": 1829 }, { "epoch": 0.7302474062250599, "grad_norm": 0.2637074887752533, "learning_rate": 4.555280518527943e-05, "loss": 0.3776, "num_tokens": 239228445.0, "step": 1830 }, { "epoch": 0.7306464485235435, "grad_norm": 0.2572716772556305, "learning_rate": 4.554689621337618e-05, "loss": 0.3464, "num_tokens": 239359517.0, "step": 1831 }, { "epoch": 0.7310454908220272, "grad_norm": 0.22450117766857147, "learning_rate": 4.5540983749566746e-05, "loss": 0.314, "num_tokens": 239490589.0, "step": 1832 }, { "epoch": 0.7314445331205107, "grad_norm": 0.23633277416229248, "learning_rate": 4.553506779499513e-05, "loss": 0.3264, "num_tokens": 239621661.0, "step": 1833 }, { "epoch": 0.7318435754189944, "grad_norm": 0.2648417353630066, "learning_rate": 4.552914835080603e-05, "loss": 0.3816, "num_tokens": 239752733.0, "step": 1834 }, { "epoch": 0.732242617717478, "grad_norm": 0.24944835901260376, "learning_rate": 4.5523225418144773e-05, "loss": 0.3615, "num_tokens": 239883805.0, "step": 1835 }, { "epoch": 0.7326416600159616, "grad_norm": 0.24523721635341644, "learning_rate": 4.5517298998157415e-05, "loss": 0.3333, "num_tokens": 240014877.0, "step": 1836 }, { "epoch": 0.7330407023144453, "grad_norm": 0.23872818052768707, "learning_rate": 4.551136909199066e-05, "loss": 0.3357, "num_tokens": 240145949.0, "step": 1837 }, { "epoch": 0.7334397446129289, "grad_norm": 0.27358007431030273, "learning_rate": 4.550543570079188e-05, "loss": 0.38, "num_tokens": 240277021.0, "step": 1838 }, { "epoch": 0.7338387869114126, "grad_norm": 0.2492550015449524, "learning_rate": 4.549949882570913e-05, "loss": 0.3744, "num_tokens": 240408093.0, "step": 1839 }, { "epoch": 0.7342378292098962, "grad_norm": 0.27176588773727417, "learning_rate": 4.5493558467891154e-05, "loss": 0.4014, "num_tokens": 240539165.0, "step": 1840 }, { "epoch": 0.7346368715083799, "grad_norm": 0.26810744404792786, "learning_rate": 4.548761462848735e-05, "loss": 0.3658, "num_tokens": 240670237.0, "step": 1841 }, { "epoch": 0.7350359138068635, "grad_norm": 0.2640989124774933, "learning_rate": 4.548166730864779e-05, "loss": 0.3971, "num_tokens": 240801309.0, "step": 1842 }, { "epoch": 0.7354349561053471, "grad_norm": 0.2373330444097519, "learning_rate": 4.547571650952322e-05, "loss": 0.3525, "num_tokens": 240932381.0, "step": 1843 }, { "epoch": 0.7358339984038308, "grad_norm": 0.2694525718688965, "learning_rate": 4.5469762232265086e-05, "loss": 0.3493, "num_tokens": 241063453.0, "step": 1844 }, { "epoch": 0.7362330407023144, "grad_norm": 0.23396752774715424, "learning_rate": 4.546380447802547e-05, "loss": 0.3147, "num_tokens": 241188318.0, "step": 1845 }, { "epoch": 0.7366320830007981, "grad_norm": 0.25643324851989746, "learning_rate": 4.545784324795715e-05, "loss": 0.3847, "num_tokens": 241319390.0, "step": 1846 }, { "epoch": 0.7370311252992817, "grad_norm": 0.23555681109428406, "learning_rate": 4.5451878543213556e-05, "loss": 0.3301, "num_tokens": 241450462.0, "step": 1847 }, { "epoch": 0.7374301675977654, "grad_norm": 0.24193164706230164, "learning_rate": 4.5445910364948806e-05, "loss": 0.3455, "num_tokens": 241581534.0, "step": 1848 }, { "epoch": 0.737829209896249, "grad_norm": 0.24232876300811768, "learning_rate": 4.54399387143177e-05, "loss": 0.3146, "num_tokens": 241706401.0, "step": 1849 }, { "epoch": 0.7382282521947326, "grad_norm": 0.23919130861759186, "learning_rate": 4.543396359247567e-05, "loss": 0.3538, "num_tokens": 241837473.0, "step": 1850 }, { "epoch": 0.7386272944932163, "grad_norm": 0.23803569376468658, "learning_rate": 4.5427985000578874e-05, "loss": 0.3425, "num_tokens": 241968545.0, "step": 1851 }, { "epoch": 0.7390263367916999, "grad_norm": 0.24737133085727692, "learning_rate": 4.5422002939784104e-05, "loss": 0.3871, "num_tokens": 242099617.0, "step": 1852 }, { "epoch": 0.7394253790901836, "grad_norm": 0.25630930066108704, "learning_rate": 4.541601741124883e-05, "loss": 0.3558, "num_tokens": 242215413.0, "step": 1853 }, { "epoch": 0.7398244213886672, "grad_norm": 0.22527436912059784, "learning_rate": 4.54100284161312e-05, "loss": 0.342, "num_tokens": 242346485.0, "step": 1854 }, { "epoch": 0.7402234636871509, "grad_norm": 0.2806810140609741, "learning_rate": 4.540403595559002e-05, "loss": 0.3531, "num_tokens": 242477557.0, "step": 1855 }, { "epoch": 0.7406225059856345, "grad_norm": 0.22913658618927002, "learning_rate": 4.5398040030784786e-05, "loss": 0.3635, "num_tokens": 242608629.0, "step": 1856 }, { "epoch": 0.7410215482841181, "grad_norm": 0.25214260816574097, "learning_rate": 4.539204064287564e-05, "loss": 0.3474, "num_tokens": 242739701.0, "step": 1857 }, { "epoch": 0.7414205905826018, "grad_norm": 0.23994384706020355, "learning_rate": 4.538603779302343e-05, "loss": 0.3602, "num_tokens": 242870773.0, "step": 1858 }, { "epoch": 0.7418196328810854, "grad_norm": 0.22071820497512817, "learning_rate": 4.5380031482389635e-05, "loss": 0.3158, "num_tokens": 243001845.0, "step": 1859 }, { "epoch": 0.7422186751795691, "grad_norm": 0.24524851143360138, "learning_rate": 4.5374021712136425e-05, "loss": 0.3402, "num_tokens": 243132917.0, "step": 1860 }, { "epoch": 0.7426177174780527, "grad_norm": 0.2516942024230957, "learning_rate": 4.536800848342663e-05, "loss": 0.3757, "num_tokens": 243263989.0, "step": 1861 }, { "epoch": 0.7430167597765364, "grad_norm": 0.24261125922203064, "learning_rate": 4.5361991797423765e-05, "loss": 0.3308, "num_tokens": 243395061.0, "step": 1862 }, { "epoch": 0.74341580207502, "grad_norm": 0.26590612530708313, "learning_rate": 4.535597165529199e-05, "loss": 0.3531, "num_tokens": 243526133.0, "step": 1863 }, { "epoch": 0.7438148443735035, "grad_norm": 0.24520547688007355, "learning_rate": 4.5349948058196146e-05, "loss": 0.3824, "num_tokens": 243657205.0, "step": 1864 }, { "epoch": 0.7442138866719872, "grad_norm": 0.27370816469192505, "learning_rate": 4.534392100730175e-05, "loss": 0.3572, "num_tokens": 243788277.0, "step": 1865 }, { "epoch": 0.7446129289704708, "grad_norm": 0.2732410728931427, "learning_rate": 4.533789050377498e-05, "loss": 0.4359, "num_tokens": 243919349.0, "step": 1866 }, { "epoch": 0.7450119712689545, "grad_norm": 0.24942652881145477, "learning_rate": 4.533185654878268e-05, "loss": 0.3838, "num_tokens": 244050421.0, "step": 1867 }, { "epoch": 0.7454110135674381, "grad_norm": 0.2175293266773224, "learning_rate": 4.532581914349236e-05, "loss": 0.3207, "num_tokens": 244181493.0, "step": 1868 }, { "epoch": 0.7458100558659218, "grad_norm": 0.24478168785572052, "learning_rate": 4.531977828907221e-05, "loss": 0.3628, "num_tokens": 244312565.0, "step": 1869 }, { "epoch": 0.7462090981644054, "grad_norm": 0.2459690272808075, "learning_rate": 4.531373398669107e-05, "loss": 0.3812, "num_tokens": 244443637.0, "step": 1870 }, { "epoch": 0.746608140462889, "grad_norm": 0.2504676580429077, "learning_rate": 4.530768623751846e-05, "loss": 0.3817, "num_tokens": 244574709.0, "step": 1871 }, { "epoch": 0.7470071827613727, "grad_norm": 0.23720720410346985, "learning_rate": 4.5301635042724575e-05, "loss": 0.3219, "num_tokens": 244705781.0, "step": 1872 }, { "epoch": 0.7474062250598563, "grad_norm": 0.26137009263038635, "learning_rate": 4.5295580403480254e-05, "loss": 0.3548, "num_tokens": 244836853.0, "step": 1873 }, { "epoch": 0.74780526735834, "grad_norm": 0.26224613189697266, "learning_rate": 4.528952232095701e-05, "loss": 0.37, "num_tokens": 244967925.0, "step": 1874 }, { "epoch": 0.7482043096568236, "grad_norm": 0.27018412947654724, "learning_rate": 4.528346079632702e-05, "loss": 0.3132, "num_tokens": 245098997.0, "step": 1875 }, { "epoch": 0.7486033519553073, "grad_norm": 0.3434894382953644, "learning_rate": 4.5277395830763146e-05, "loss": 0.4435, "num_tokens": 245230069.0, "step": 1876 }, { "epoch": 0.7490023942537909, "grad_norm": 0.2668095827102661, "learning_rate": 4.52713274254389e-05, "loss": 0.3384, "num_tokens": 245361141.0, "step": 1877 }, { "epoch": 0.7494014365522745, "grad_norm": 0.3102114796638489, "learning_rate": 4.526525558152846e-05, "loss": 0.4129, "num_tokens": 245492213.0, "step": 1878 }, { "epoch": 0.7498004788507582, "grad_norm": 0.2668561041355133, "learning_rate": 4.525918030020667e-05, "loss": 0.3686, "num_tokens": 245623285.0, "step": 1879 }, { "epoch": 0.7501995211492418, "grad_norm": 0.27106529474258423, "learning_rate": 4.5253101582649046e-05, "loss": 0.4148, "num_tokens": 245754357.0, "step": 1880 }, { "epoch": 0.7505985634477255, "grad_norm": 0.28117772936820984, "learning_rate": 4.5247019430031744e-05, "loss": 0.3424, "num_tokens": 245885429.0, "step": 1881 }, { "epoch": 0.7509976057462091, "grad_norm": 0.23751315474510193, "learning_rate": 4.5240933843531624e-05, "loss": 0.3585, "num_tokens": 246016501.0, "step": 1882 }, { "epoch": 0.7513966480446927, "grad_norm": 0.23044255375862122, "learning_rate": 4.5234844824326186e-05, "loss": 0.322, "num_tokens": 246147573.0, "step": 1883 }, { "epoch": 0.7517956903431764, "grad_norm": 0.26268312335014343, "learning_rate": 4.5228752373593604e-05, "loss": 0.4197, "num_tokens": 246278645.0, "step": 1884 }, { "epoch": 0.75219473264166, "grad_norm": 0.22547094523906708, "learning_rate": 4.52226564925127e-05, "loss": 0.3393, "num_tokens": 246409717.0, "step": 1885 }, { "epoch": 0.7525937749401437, "grad_norm": 0.25988149642944336, "learning_rate": 4.5216557182262956e-05, "loss": 0.4034, "num_tokens": 246540789.0, "step": 1886 }, { "epoch": 0.7529928172386273, "grad_norm": 0.22044332325458527, "learning_rate": 4.521045444402457e-05, "loss": 0.345, "num_tokens": 246671861.0, "step": 1887 }, { "epoch": 0.753391859537111, "grad_norm": 0.26124316453933716, "learning_rate": 4.520434827897834e-05, "loss": 0.3766, "num_tokens": 246802933.0, "step": 1888 }, { "epoch": 0.7537909018355946, "grad_norm": 0.2660360634326935, "learning_rate": 4.519823868830575e-05, "loss": 0.4001, "num_tokens": 246934005.0, "step": 1889 }, { "epoch": 0.7541899441340782, "grad_norm": 0.24134868383407593, "learning_rate": 4.519212567318895e-05, "loss": 0.3948, "num_tokens": 247065077.0, "step": 1890 }, { "epoch": 0.7545889864325619, "grad_norm": 0.26884666085243225, "learning_rate": 4.518600923481077e-05, "loss": 0.3862, "num_tokens": 247196149.0, "step": 1891 }, { "epoch": 0.7549880287310455, "grad_norm": 0.20521016418933868, "learning_rate": 4.5179889374354655e-05, "loss": 0.2526, "num_tokens": 247327221.0, "step": 1892 }, { "epoch": 0.7553870710295292, "grad_norm": 0.2786053717136383, "learning_rate": 4.517376609300476e-05, "loss": 0.3942, "num_tokens": 247458293.0, "step": 1893 }, { "epoch": 0.7557861133280128, "grad_norm": 0.24238941073417664, "learning_rate": 4.516763939194588e-05, "loss": 0.3401, "num_tokens": 247589365.0, "step": 1894 }, { "epoch": 0.7561851556264965, "grad_norm": 0.25667524337768555, "learning_rate": 4.516150927236348e-05, "loss": 0.348, "num_tokens": 247720437.0, "step": 1895 }, { "epoch": 0.75658419792498, "grad_norm": 0.25683286786079407, "learning_rate": 4.5155375735443664e-05, "loss": 0.3851, "num_tokens": 247851509.0, "step": 1896 }, { "epoch": 0.7569832402234636, "grad_norm": 0.21688713133335114, "learning_rate": 4.514923878237324e-05, "loss": 0.3152, "num_tokens": 247982581.0, "step": 1897 }, { "epoch": 0.7573822825219473, "grad_norm": 0.2463337779045105, "learning_rate": 4.514309841433962e-05, "loss": 0.3797, "num_tokens": 248113653.0, "step": 1898 }, { "epoch": 0.7577813248204309, "grad_norm": 0.237442284822464, "learning_rate": 4.513695463253093e-05, "loss": 0.3871, "num_tokens": 248244725.0, "step": 1899 }, { "epoch": 0.7581803671189146, "grad_norm": 0.24287545680999756, "learning_rate": 4.513080743813593e-05, "loss": 0.3469, "num_tokens": 248375797.0, "step": 1900 }, { "epoch": 0.7585794094173982, "grad_norm": 0.24137474596500397, "learning_rate": 4.512465683234404e-05, "loss": 0.3762, "num_tokens": 248506869.0, "step": 1901 }, { "epoch": 0.7589784517158819, "grad_norm": 0.22830769419670105, "learning_rate": 4.511850281634535e-05, "loss": 0.3169, "num_tokens": 248637941.0, "step": 1902 }, { "epoch": 0.7593774940143655, "grad_norm": 0.2333011031150818, "learning_rate": 4.51123453913306e-05, "loss": 0.3428, "num_tokens": 248769013.0, "step": 1903 }, { "epoch": 0.7597765363128491, "grad_norm": 0.27677690982818604, "learning_rate": 4.5106184558491196e-05, "loss": 0.4009, "num_tokens": 248900085.0, "step": 1904 }, { "epoch": 0.7601755786113328, "grad_norm": 0.27697890996932983, "learning_rate": 4.5100020319019205e-05, "loss": 0.4338, "num_tokens": 249031157.0, "step": 1905 }, { "epoch": 0.7605746209098164, "grad_norm": 0.24475131928920746, "learning_rate": 4.509385267410735e-05, "loss": 0.3391, "num_tokens": 249162229.0, "step": 1906 }, { "epoch": 0.7609736632083001, "grad_norm": 0.2367834448814392, "learning_rate": 4.5087681624949e-05, "loss": 0.3619, "num_tokens": 249293301.0, "step": 1907 }, { "epoch": 0.7613727055067837, "grad_norm": 0.2249302864074707, "learning_rate": 4.508150717273822e-05, "loss": 0.3176, "num_tokens": 249424373.0, "step": 1908 }, { "epoch": 0.7617717478052674, "grad_norm": 0.25508058071136475, "learning_rate": 4.507532931866968e-05, "loss": 0.366, "num_tokens": 249555445.0, "step": 1909 }, { "epoch": 0.762170790103751, "grad_norm": 0.2461402267217636, "learning_rate": 4.506914806393877e-05, "loss": 0.3607, "num_tokens": 249686517.0, "step": 1910 }, { "epoch": 0.7625698324022346, "grad_norm": 0.2906559109687805, "learning_rate": 4.506296340974148e-05, "loss": 0.4512, "num_tokens": 249817589.0, "step": 1911 }, { "epoch": 0.7629688747007183, "grad_norm": 0.22820833325386047, "learning_rate": 4.505677535727448e-05, "loss": 0.3524, "num_tokens": 249948661.0, "step": 1912 }, { "epoch": 0.7633679169992019, "grad_norm": 0.22938503324985504, "learning_rate": 4.505058390773512e-05, "loss": 0.3379, "num_tokens": 250079733.0, "step": 1913 }, { "epoch": 0.7637669592976856, "grad_norm": 0.2682006359100342, "learning_rate": 4.504438906232138e-05, "loss": 0.3718, "num_tokens": 250210805.0, "step": 1914 }, { "epoch": 0.7641660015961692, "grad_norm": 0.23270809650421143, "learning_rate": 4.50381908222319e-05, "loss": 0.3427, "num_tokens": 250341877.0, "step": 1915 }, { "epoch": 0.7645650438946529, "grad_norm": 0.24306237697601318, "learning_rate": 4.503198918866598e-05, "loss": 0.3594, "num_tokens": 250472949.0, "step": 1916 }, { "epoch": 0.7649640861931365, "grad_norm": 0.23006799817085266, "learning_rate": 4.5025784162823586e-05, "loss": 0.3526, "num_tokens": 250604021.0, "step": 1917 }, { "epoch": 0.7653631284916201, "grad_norm": 0.2361437976360321, "learning_rate": 4.5019575745905335e-05, "loss": 0.3316, "num_tokens": 250732933.0, "step": 1918 }, { "epoch": 0.7657621707901038, "grad_norm": 0.2680376172065735, "learning_rate": 4.501336393911249e-05, "loss": 0.3374, "num_tokens": 250864005.0, "step": 1919 }, { "epoch": 0.7661612130885874, "grad_norm": 0.2608782947063446, "learning_rate": 4.500714874364698e-05, "loss": 0.3788, "num_tokens": 250995077.0, "step": 1920 }, { "epoch": 0.7665602553870711, "grad_norm": 0.23877593874931335, "learning_rate": 4.500093016071138e-05, "loss": 0.3195, "num_tokens": 251126149.0, "step": 1921 }, { "epoch": 0.7669592976855547, "grad_norm": 0.26460838317871094, "learning_rate": 4.4994708191508945e-05, "loss": 0.4002, "num_tokens": 251257221.0, "step": 1922 }, { "epoch": 0.7673583399840384, "grad_norm": 0.2617962062358856, "learning_rate": 4.498848283724356e-05, "loss": 0.3642, "num_tokens": 251388293.0, "step": 1923 }, { "epoch": 0.767757382282522, "grad_norm": 0.2578248977661133, "learning_rate": 4.498225409911977e-05, "loss": 0.3847, "num_tokens": 251519365.0, "step": 1924 }, { "epoch": 0.7681564245810056, "grad_norm": 0.23408545553684235, "learning_rate": 4.497602197834277e-05, "loss": 0.3329, "num_tokens": 251650437.0, "step": 1925 }, { "epoch": 0.7685554668794893, "grad_norm": 0.22340147197246552, "learning_rate": 4.496978647611844e-05, "loss": 0.3385, "num_tokens": 251781509.0, "step": 1926 }, { "epoch": 0.7689545091779728, "grad_norm": 0.2503754198551178, "learning_rate": 4.496354759365327e-05, "loss": 0.3617, "num_tokens": 251912581.0, "step": 1927 }, { "epoch": 0.7693535514764565, "grad_norm": 0.20649529993534088, "learning_rate": 4.4957305332154434e-05, "loss": 0.3079, "num_tokens": 252043653.0, "step": 1928 }, { "epoch": 0.7697525937749401, "grad_norm": 0.23854243755340576, "learning_rate": 4.495105969282975e-05, "loss": 0.329, "num_tokens": 252174725.0, "step": 1929 }, { "epoch": 0.7701516360734237, "grad_norm": 0.21922485530376434, "learning_rate": 4.4944810676887695e-05, "loss": 0.315, "num_tokens": 252305797.0, "step": 1930 }, { "epoch": 0.7705506783719074, "grad_norm": 0.23789270222187042, "learning_rate": 4.493855828553739e-05, "loss": 0.3542, "num_tokens": 252436869.0, "step": 1931 }, { "epoch": 0.770949720670391, "grad_norm": 0.22745750844478607, "learning_rate": 4.493230251998862e-05, "loss": 0.3016, "num_tokens": 252567941.0, "step": 1932 }, { "epoch": 0.7713487629688747, "grad_norm": 0.25994977355003357, "learning_rate": 4.4926043381451815e-05, "loss": 0.3734, "num_tokens": 252699013.0, "step": 1933 }, { "epoch": 0.7717478052673583, "grad_norm": 0.23676230013370514, "learning_rate": 4.491978087113806e-05, "loss": 0.3335, "num_tokens": 252830085.0, "step": 1934 }, { "epoch": 0.772146847565842, "grad_norm": 0.3609620928764343, "learning_rate": 4.4913514990259084e-05, "loss": 0.3014, "num_tokens": 252961157.0, "step": 1935 }, { "epoch": 0.7725458898643256, "grad_norm": 0.2660060524940491, "learning_rate": 4.4907245740027294e-05, "loss": 0.3676, "num_tokens": 253092229.0, "step": 1936 }, { "epoch": 0.7729449321628092, "grad_norm": 0.37755101919174194, "learning_rate": 4.490097312165571e-05, "loss": 0.37, "num_tokens": 253223301.0, "step": 1937 }, { "epoch": 0.7733439744612929, "grad_norm": 0.21236006915569305, "learning_rate": 4.489469713635804e-05, "loss": 0.2905, "num_tokens": 253354373.0, "step": 1938 }, { "epoch": 0.7737430167597765, "grad_norm": 0.25255465507507324, "learning_rate": 4.488841778534863e-05, "loss": 0.3697, "num_tokens": 253485445.0, "step": 1939 }, { "epoch": 0.7741420590582602, "grad_norm": 0.24302153289318085, "learning_rate": 4.488213506984248e-05, "loss": 0.3646, "num_tokens": 253616517.0, "step": 1940 }, { "epoch": 0.7745411013567438, "grad_norm": 0.256799578666687, "learning_rate": 4.487584899105522e-05, "loss": 0.3505, "num_tokens": 253747589.0, "step": 1941 }, { "epoch": 0.7749401436552275, "grad_norm": 0.249998077750206, "learning_rate": 4.486955955020315e-05, "loss": 0.338, "num_tokens": 253878661.0, "step": 1942 }, { "epoch": 0.7753391859537111, "grad_norm": 0.2512384355068207, "learning_rate": 4.486326674850323e-05, "loss": 0.3856, "num_tokens": 254009733.0, "step": 1943 }, { "epoch": 0.7757382282521947, "grad_norm": 0.2510078549385071, "learning_rate": 4.485697058717305e-05, "loss": 0.363, "num_tokens": 254140805.0, "step": 1944 }, { "epoch": 0.7761372705506784, "grad_norm": 0.24358859658241272, "learning_rate": 4.485067106743086e-05, "loss": 0.3675, "num_tokens": 254271877.0, "step": 1945 }, { "epoch": 0.776536312849162, "grad_norm": 0.24955879151821136, "learning_rate": 4.484436819049556e-05, "loss": 0.3491, "num_tokens": 254402949.0, "step": 1946 }, { "epoch": 0.7769353551476457, "grad_norm": 0.3617948889732361, "learning_rate": 4.48380619575867e-05, "loss": 0.3644, "num_tokens": 254534021.0, "step": 1947 }, { "epoch": 0.7773343974461293, "grad_norm": 0.259248286485672, "learning_rate": 4.483175236992447e-05, "loss": 0.3663, "num_tokens": 254665093.0, "step": 1948 }, { "epoch": 0.777733439744613, "grad_norm": 0.2404508888721466, "learning_rate": 4.4825439428729724e-05, "loss": 0.3442, "num_tokens": 254796165.0, "step": 1949 }, { "epoch": 0.7781324820430966, "grad_norm": 0.2241251915693283, "learning_rate": 4.4819123135223954e-05, "loss": 0.3223, "num_tokens": 254927237.0, "step": 1950 }, { "epoch": 0.7785315243415802, "grad_norm": 0.2551431357860565, "learning_rate": 4.48128034906293e-05, "loss": 0.3675, "num_tokens": 255058309.0, "step": 1951 }, { "epoch": 0.7789305666400639, "grad_norm": 0.27138182520866394, "learning_rate": 4.480648049616857e-05, "loss": 0.3552, "num_tokens": 255189381.0, "step": 1952 }, { "epoch": 0.7793296089385475, "grad_norm": 0.2733224034309387, "learning_rate": 4.480015415306518e-05, "loss": 0.405, "num_tokens": 255320453.0, "step": 1953 }, { "epoch": 0.7797286512370312, "grad_norm": 0.27597638964653015, "learning_rate": 4.4793824462543236e-05, "loss": 0.3476, "num_tokens": 255451525.0, "step": 1954 }, { "epoch": 0.7801276935355148, "grad_norm": 0.24516451358795166, "learning_rate": 4.478749142582748e-05, "loss": 0.3339, "num_tokens": 255582597.0, "step": 1955 }, { "epoch": 0.7805267358339985, "grad_norm": 0.2417314350605011, "learning_rate": 4.478115504414327e-05, "loss": 0.3333, "num_tokens": 255713669.0, "step": 1956 }, { "epoch": 0.7809257781324821, "grad_norm": 0.24002289772033691, "learning_rate": 4.4774815318716656e-05, "loss": 0.3486, "num_tokens": 255844741.0, "step": 1957 }, { "epoch": 0.7813248204309656, "grad_norm": 0.25771209597587585, "learning_rate": 4.4768472250774316e-05, "loss": 0.3723, "num_tokens": 255975813.0, "step": 1958 }, { "epoch": 0.7817238627294493, "grad_norm": 0.23095504939556122, "learning_rate": 4.476212584154357e-05, "loss": 0.3273, "num_tokens": 256106885.0, "step": 1959 }, { "epoch": 0.7821229050279329, "grad_norm": 0.2438811957836151, "learning_rate": 4.475577609225238e-05, "loss": 0.3691, "num_tokens": 256237957.0, "step": 1960 }, { "epoch": 0.7825219473264166, "grad_norm": 0.2393071949481964, "learning_rate": 4.474942300412938e-05, "loss": 0.344, "num_tokens": 256369029.0, "step": 1961 }, { "epoch": 0.7829209896249002, "grad_norm": 0.234316885471344, "learning_rate": 4.474306657840382e-05, "loss": 0.3176, "num_tokens": 256500101.0, "step": 1962 }, { "epoch": 0.7833200319233838, "grad_norm": 0.2856549322605133, "learning_rate": 4.4736706816305626e-05, "loss": 0.4002, "num_tokens": 256631173.0, "step": 1963 }, { "epoch": 0.7837190742218675, "grad_norm": 0.23693233728408813, "learning_rate": 4.473034371906533e-05, "loss": 0.3375, "num_tokens": 256762245.0, "step": 1964 }, { "epoch": 0.7841181165203511, "grad_norm": 0.23394504189491272, "learning_rate": 4.472397728791415e-05, "loss": 0.3256, "num_tokens": 256893317.0, "step": 1965 }, { "epoch": 0.7845171588188348, "grad_norm": 0.2255672812461853, "learning_rate": 4.471760752408393e-05, "loss": 0.2958, "num_tokens": 257024389.0, "step": 1966 }, { "epoch": 0.7849162011173184, "grad_norm": 0.2521219253540039, "learning_rate": 4.471123442880714e-05, "loss": 0.3665, "num_tokens": 257155461.0, "step": 1967 }, { "epoch": 0.7853152434158021, "grad_norm": 0.25448235869407654, "learning_rate": 4.470485800331694e-05, "loss": 0.3106, "num_tokens": 257286533.0, "step": 1968 }, { "epoch": 0.7857142857142857, "grad_norm": 0.25113627314567566, "learning_rate": 4.469847824884709e-05, "loss": 0.3706, "num_tokens": 257417605.0, "step": 1969 }, { "epoch": 0.7861133280127693, "grad_norm": 0.22846677899360657, "learning_rate": 4.4692095166632026e-05, "loss": 0.3144, "num_tokens": 257548677.0, "step": 1970 }, { "epoch": 0.786512370311253, "grad_norm": 0.24914321303367615, "learning_rate": 4.4685708757906806e-05, "loss": 0.3719, "num_tokens": 257679749.0, "step": 1971 }, { "epoch": 0.7869114126097366, "grad_norm": 0.24714256823062897, "learning_rate": 4.4679319023907156e-05, "loss": 0.3742, "num_tokens": 257810821.0, "step": 1972 }, { "epoch": 0.7873104549082203, "grad_norm": 0.24609144032001495, "learning_rate": 4.467292596586941e-05, "loss": 0.3584, "num_tokens": 257941893.0, "step": 1973 }, { "epoch": 0.7877094972067039, "grad_norm": 0.253236323595047, "learning_rate": 4.4666529585030575e-05, "loss": 0.3712, "num_tokens": 258072965.0, "step": 1974 }, { "epoch": 0.7881085395051876, "grad_norm": 0.24374112486839294, "learning_rate": 4.466012988262829e-05, "loss": 0.3394, "num_tokens": 258204037.0, "step": 1975 }, { "epoch": 0.7885075818036712, "grad_norm": 0.24664127826690674, "learning_rate": 4.4653726859900835e-05, "loss": 0.3325, "num_tokens": 258335109.0, "step": 1976 }, { "epoch": 0.7889066241021548, "grad_norm": 0.2558926045894623, "learning_rate": 4.4647320518087136e-05, "loss": 0.3696, "num_tokens": 258466181.0, "step": 1977 }, { "epoch": 0.7893056664006385, "grad_norm": 0.2480066865682602, "learning_rate": 4.464091085842676e-05, "loss": 0.355, "num_tokens": 258597253.0, "step": 1978 }, { "epoch": 0.7897047086991221, "grad_norm": 0.24536685645580292, "learning_rate": 4.463449788215993e-05, "loss": 0.3548, "num_tokens": 258728325.0, "step": 1979 }, { "epoch": 0.7901037509976058, "grad_norm": 0.2362922877073288, "learning_rate": 4.462808159052748e-05, "loss": 0.3242, "num_tokens": 258859397.0, "step": 1980 }, { "epoch": 0.7905027932960894, "grad_norm": 0.24763847887516022, "learning_rate": 4.4621661984770906e-05, "loss": 0.338, "num_tokens": 258990469.0, "step": 1981 }, { "epoch": 0.790901835594573, "grad_norm": 0.2554798424243927, "learning_rate": 4.461523906613235e-05, "loss": 0.3398, "num_tokens": 259121541.0, "step": 1982 }, { "epoch": 0.7913008778930567, "grad_norm": 0.23366227746009827, "learning_rate": 4.460881283585458e-05, "loss": 0.3293, "num_tokens": 259252613.0, "step": 1983 }, { "epoch": 0.7916999201915403, "grad_norm": 0.24841199815273285, "learning_rate": 4.4602383295181e-05, "loss": 0.3232, "num_tokens": 259383685.0, "step": 1984 }, { "epoch": 0.792098962490024, "grad_norm": 0.24563249945640564, "learning_rate": 4.459595044535569e-05, "loss": 0.3373, "num_tokens": 259514757.0, "step": 1985 }, { "epoch": 0.7924980047885076, "grad_norm": 0.23763519525527954, "learning_rate": 4.4589514287623335e-05, "loss": 0.3013, "num_tokens": 259645829.0, "step": 1986 }, { "epoch": 0.7928970470869913, "grad_norm": 0.23731818795204163, "learning_rate": 4.458307482322927e-05, "loss": 0.2888, "num_tokens": 259776901.0, "step": 1987 }, { "epoch": 0.7932960893854749, "grad_norm": 0.2604166865348816, "learning_rate": 4.4576632053419465e-05, "loss": 0.3725, "num_tokens": 259907973.0, "step": 1988 }, { "epoch": 0.7936951316839586, "grad_norm": 0.2391255646944046, "learning_rate": 4.4570185979440556e-05, "loss": 0.3249, "num_tokens": 260039045.0, "step": 1989 }, { "epoch": 0.7940941739824421, "grad_norm": 0.236599400639534, "learning_rate": 4.456373660253978e-05, "loss": 0.3013, "num_tokens": 260170117.0, "step": 1990 }, { "epoch": 0.7944932162809257, "grad_norm": 0.27772530913352966, "learning_rate": 4.4557283923965034e-05, "loss": 0.3593, "num_tokens": 260301189.0, "step": 1991 }, { "epoch": 0.7948922585794094, "grad_norm": 0.26887747645378113, "learning_rate": 4.4550827944964865e-05, "loss": 0.4108, "num_tokens": 260432261.0, "step": 1992 }, { "epoch": 0.795291300877893, "grad_norm": 0.2566475570201874, "learning_rate": 4.454436866678843e-05, "loss": 0.3508, "num_tokens": 260563333.0, "step": 1993 }, { "epoch": 0.7956903431763767, "grad_norm": 0.2680147886276245, "learning_rate": 4.4537906090685545e-05, "loss": 0.3554, "num_tokens": 260694405.0, "step": 1994 }, { "epoch": 0.7960893854748603, "grad_norm": 0.22787593305110931, "learning_rate": 4.453144021790665e-05, "loss": 0.3191, "num_tokens": 260825477.0, "step": 1995 }, { "epoch": 0.7964884277733439, "grad_norm": 0.2632122337818146, "learning_rate": 4.452497104970285e-05, "loss": 0.38, "num_tokens": 260956549.0, "step": 1996 }, { "epoch": 0.7968874700718276, "grad_norm": 0.27051690220832825, "learning_rate": 4.451849858732585e-05, "loss": 0.4248, "num_tokens": 261087621.0, "step": 1997 }, { "epoch": 0.7972865123703112, "grad_norm": 0.2227378636598587, "learning_rate": 4.451202283202801e-05, "loss": 0.2978, "num_tokens": 261218693.0, "step": 1998 }, { "epoch": 0.7976855546687949, "grad_norm": 0.24633271992206573, "learning_rate": 4.4505543785062343e-05, "loss": 0.3767, "num_tokens": 261349765.0, "step": 1999 }, { "epoch": 0.7980845969672785, "grad_norm": 0.23288355767726898, "learning_rate": 4.4499061447682475e-05, "loss": 0.2847, "num_tokens": 261480837.0, "step": 2000 }, { "epoch": 0.7984836392657622, "grad_norm": 0.23987238109111786, "learning_rate": 4.449257582114269e-05, "loss": 0.3489, "num_tokens": 261611909.0, "step": 2001 }, { "epoch": 0.7988826815642458, "grad_norm": 0.2521800100803375, "learning_rate": 4.4486086906697876e-05, "loss": 0.3426, "num_tokens": 261742981.0, "step": 2002 }, { "epoch": 0.7992817238627294, "grad_norm": 0.24878950417041779, "learning_rate": 4.447959470560359e-05, "loss": 0.3465, "num_tokens": 261874053.0, "step": 2003 }, { "epoch": 0.7996807661612131, "grad_norm": 0.2678702175617218, "learning_rate": 4.447309921911601e-05, "loss": 0.3495, "num_tokens": 262005125.0, "step": 2004 }, { "epoch": 0.8000798084596967, "grad_norm": 0.23627321422100067, "learning_rate": 4.446660044849196e-05, "loss": 0.304, "num_tokens": 262136197.0, "step": 2005 }, { "epoch": 0.8004788507581804, "grad_norm": 0.23795214295387268, "learning_rate": 4.446009839498887e-05, "loss": 0.343, "num_tokens": 262267269.0, "step": 2006 }, { "epoch": 0.800877893056664, "grad_norm": 0.260052353143692, "learning_rate": 4.445359305986485e-05, "loss": 0.39, "num_tokens": 262398341.0, "step": 2007 }, { "epoch": 0.8012769353551477, "grad_norm": 0.413921058177948, "learning_rate": 4.44470844443786e-05, "loss": 0.3703, "num_tokens": 262529413.0, "step": 2008 }, { "epoch": 0.8016759776536313, "grad_norm": 0.2313305139541626, "learning_rate": 4.44405725497895e-05, "loss": 0.2982, "num_tokens": 262660485.0, "step": 2009 }, { "epoch": 0.8020750199521149, "grad_norm": 0.2477758675813675, "learning_rate": 4.443405737735753e-05, "loss": 0.3525, "num_tokens": 262791557.0, "step": 2010 }, { "epoch": 0.8024740622505986, "grad_norm": 0.23392878472805023, "learning_rate": 4.442753892834332e-05, "loss": 0.307, "num_tokens": 262922629.0, "step": 2011 }, { "epoch": 0.8028731045490822, "grad_norm": 0.2399139404296875, "learning_rate": 4.4421017204008116e-05, "loss": 0.3414, "num_tokens": 263053701.0, "step": 2012 }, { "epoch": 0.8032721468475659, "grad_norm": 0.22978447377681732, "learning_rate": 4.441449220561382e-05, "loss": 0.3561, "num_tokens": 263184773.0, "step": 2013 }, { "epoch": 0.8036711891460495, "grad_norm": 0.23214983940124512, "learning_rate": 4.440796393442296e-05, "loss": 0.3328, "num_tokens": 263315845.0, "step": 2014 }, { "epoch": 0.8040702314445332, "grad_norm": 0.2661607265472412, "learning_rate": 4.440143239169869e-05, "loss": 0.3807, "num_tokens": 263446917.0, "step": 2015 }, { "epoch": 0.8044692737430168, "grad_norm": 0.2646477222442627, "learning_rate": 4.43948975787048e-05, "loss": 0.3667, "num_tokens": 263577989.0, "step": 2016 }, { "epoch": 0.8048683160415004, "grad_norm": 0.24434193968772888, "learning_rate": 4.438835949670573e-05, "loss": 0.3542, "num_tokens": 263709061.0, "step": 2017 }, { "epoch": 0.8052673583399841, "grad_norm": 0.24945983290672302, "learning_rate": 4.438181814696652e-05, "loss": 0.3775, "num_tokens": 263840133.0, "step": 2018 }, { "epoch": 0.8056664006384677, "grad_norm": 0.25673386454582214, "learning_rate": 4.437527353075287e-05, "loss": 0.373, "num_tokens": 263971205.0, "step": 2019 }, { "epoch": 0.8060654429369514, "grad_norm": 0.2313484400510788, "learning_rate": 4.436872564933111e-05, "loss": 0.3234, "num_tokens": 264102277.0, "step": 2020 }, { "epoch": 0.8064644852354349, "grad_norm": 0.2361186444759369, "learning_rate": 4.436217450396819e-05, "loss": 0.3346, "num_tokens": 264233349.0, "step": 2021 }, { "epoch": 0.8068635275339185, "grad_norm": 0.23278428614139557, "learning_rate": 4.4355620095931666e-05, "loss": 0.344, "num_tokens": 264364421.0, "step": 2022 }, { "epoch": 0.8072625698324022, "grad_norm": 0.2199227511882782, "learning_rate": 4.4349062426489805e-05, "loss": 0.3175, "num_tokens": 264495493.0, "step": 2023 }, { "epoch": 0.8076616121308858, "grad_norm": 0.24408626556396484, "learning_rate": 4.434250149691142e-05, "loss": 0.3281, "num_tokens": 264626565.0, "step": 2024 }, { "epoch": 0.8080606544293695, "grad_norm": 0.26277804374694824, "learning_rate": 4.4335937308465985e-05, "loss": 0.3494, "num_tokens": 264757637.0, "step": 2025 }, { "epoch": 0.8084596967278531, "grad_norm": 0.26663994789123535, "learning_rate": 4.4329369862423636e-05, "loss": 0.3653, "num_tokens": 264888709.0, "step": 2026 }, { "epoch": 0.8088587390263368, "grad_norm": 0.2733791172504425, "learning_rate": 4.43227991600551e-05, "loss": 0.3881, "num_tokens": 265019781.0, "step": 2027 }, { "epoch": 0.8092577813248204, "grad_norm": 0.22251459956169128, "learning_rate": 4.431622520263175e-05, "loss": 0.3083, "num_tokens": 265150853.0, "step": 2028 }, { "epoch": 0.809656823623304, "grad_norm": 0.2608453333377838, "learning_rate": 4.430964799142558e-05, "loss": 0.3721, "num_tokens": 265281925.0, "step": 2029 }, { "epoch": 0.8100558659217877, "grad_norm": 0.250461608171463, "learning_rate": 4.430306752770921e-05, "loss": 0.3527, "num_tokens": 265412997.0, "step": 2030 }, { "epoch": 0.8104549082202713, "grad_norm": 0.264813095331192, "learning_rate": 4.429648381275592e-05, "loss": 0.396, "num_tokens": 265544069.0, "step": 2031 }, { "epoch": 0.810853950518755, "grad_norm": 0.22180864214897156, "learning_rate": 4.4289896847839594e-05, "loss": 0.3177, "num_tokens": 265675141.0, "step": 2032 }, { "epoch": 0.8112529928172386, "grad_norm": 0.22752566635608673, "learning_rate": 4.428330663423474e-05, "loss": 0.3171, "num_tokens": 265806213.0, "step": 2033 }, { "epoch": 0.8116520351157223, "grad_norm": 0.23724736273288727, "learning_rate": 4.427671317321648e-05, "loss": 0.3302, "num_tokens": 265937285.0, "step": 2034 }, { "epoch": 0.8120510774142059, "grad_norm": 0.29506000876426697, "learning_rate": 4.427011646606064e-05, "loss": 0.4351, "num_tokens": 266068357.0, "step": 2035 }, { "epoch": 0.8124501197126895, "grad_norm": 0.23543955385684967, "learning_rate": 4.4263516514043585e-05, "loss": 0.29, "num_tokens": 266199429.0, "step": 2036 }, { "epoch": 0.8128491620111732, "grad_norm": 0.2542259991168976, "learning_rate": 4.4256913318442343e-05, "loss": 0.3416, "num_tokens": 266330501.0, "step": 2037 }, { "epoch": 0.8132482043096568, "grad_norm": 0.30073028802871704, "learning_rate": 4.4250306880534586e-05, "loss": 0.344, "num_tokens": 266446375.0, "step": 2038 }, { "epoch": 0.8136472466081405, "grad_norm": 0.2374417632818222, "learning_rate": 4.4243697201598595e-05, "loss": 0.2827, "num_tokens": 266577447.0, "step": 2039 }, { "epoch": 0.8140462889066241, "grad_norm": 0.23957057297229767, "learning_rate": 4.423708428291328e-05, "loss": 0.3347, "num_tokens": 266708519.0, "step": 2040 }, { "epoch": 0.8144453312051078, "grad_norm": 0.28375548124313354, "learning_rate": 4.423046812575817e-05, "loss": 0.3514, "num_tokens": 266830707.0, "step": 2041 }, { "epoch": 0.8148443735035914, "grad_norm": 0.30867519974708557, "learning_rate": 4.422384873141344e-05, "loss": 0.3845, "num_tokens": 266961779.0, "step": 2042 }, { "epoch": 0.815243415802075, "grad_norm": 0.2402118295431137, "learning_rate": 4.4217226101159876e-05, "loss": 0.3457, "num_tokens": 267092851.0, "step": 2043 }, { "epoch": 0.8156424581005587, "grad_norm": 0.20252764225006104, "learning_rate": 4.42106002362789e-05, "loss": 0.2555, "num_tokens": 267223923.0, "step": 2044 }, { "epoch": 0.8160415003990423, "grad_norm": 0.27721545100212097, "learning_rate": 4.4203971138052546e-05, "loss": 0.3964, "num_tokens": 267354995.0, "step": 2045 }, { "epoch": 0.816440542697526, "grad_norm": 0.22714270651340485, "learning_rate": 4.41973388077635e-05, "loss": 0.3045, "num_tokens": 267486067.0, "step": 2046 }, { "epoch": 0.8168395849960096, "grad_norm": 0.24140863120555878, "learning_rate": 4.4190703246695044e-05, "loss": 0.3233, "num_tokens": 267612291.0, "step": 2047 }, { "epoch": 0.8172386272944933, "grad_norm": 0.23460598289966583, "learning_rate": 4.418406445613109e-05, "loss": 0.3186, "num_tokens": 267743363.0, "step": 2048 }, { "epoch": 0.8176376695929769, "grad_norm": 0.2315589189529419, "learning_rate": 4.4177422437356194e-05, "loss": 0.326, "num_tokens": 267874435.0, "step": 2049 }, { "epoch": 0.8180367118914605, "grad_norm": 0.25093647837638855, "learning_rate": 4.4170777191655534e-05, "loss": 0.3524, "num_tokens": 268005507.0, "step": 2050 }, { "epoch": 0.8184357541899442, "grad_norm": 0.22990460693836212, "learning_rate": 4.4164128720314876e-05, "loss": 0.3094, "num_tokens": 268136579.0, "step": 2051 }, { "epoch": 0.8188347964884277, "grad_norm": 0.23255591094493866, "learning_rate": 4.415747702462065e-05, "loss": 0.3273, "num_tokens": 268267651.0, "step": 2052 }, { "epoch": 0.8192338387869114, "grad_norm": 0.22928477823734283, "learning_rate": 4.4150822105859905e-05, "loss": 0.2941, "num_tokens": 268398723.0, "step": 2053 }, { "epoch": 0.819632881085395, "grad_norm": 0.23637403547763824, "learning_rate": 4.41441639653203e-05, "loss": 0.3275, "num_tokens": 268529795.0, "step": 2054 }, { "epoch": 0.8200319233838786, "grad_norm": 0.27356040477752686, "learning_rate": 4.413750260429012e-05, "loss": 0.3945, "num_tokens": 268660867.0, "step": 2055 }, { "epoch": 0.8204309656823623, "grad_norm": 0.2449980527162552, "learning_rate": 4.413083802405829e-05, "loss": 0.3574, "num_tokens": 268791939.0, "step": 2056 }, { "epoch": 0.8208300079808459, "grad_norm": 0.25510647892951965, "learning_rate": 4.412417022591431e-05, "loss": 0.4084, "num_tokens": 268923011.0, "step": 2057 }, { "epoch": 0.8212290502793296, "grad_norm": 0.22739435732364655, "learning_rate": 4.411749921114837e-05, "loss": 0.3439, "num_tokens": 269054083.0, "step": 2058 }, { "epoch": 0.8216280925778132, "grad_norm": 0.2401002049446106, "learning_rate": 4.411082498105125e-05, "loss": 0.3372, "num_tokens": 269170198.0, "step": 2059 }, { "epoch": 0.8220271348762969, "grad_norm": 0.22522075474262238, "learning_rate": 4.410414753691433e-05, "loss": 0.3038, "num_tokens": 269301270.0, "step": 2060 }, { "epoch": 0.8224261771747805, "grad_norm": 0.2507190704345703, "learning_rate": 4.409746688002965e-05, "loss": 0.3181, "num_tokens": 269432342.0, "step": 2061 }, { "epoch": 0.8228252194732641, "grad_norm": 0.23452675342559814, "learning_rate": 4.409078301168984e-05, "loss": 0.3481, "num_tokens": 269563414.0, "step": 2062 }, { "epoch": 0.8232242617717478, "grad_norm": 0.2433987259864807, "learning_rate": 4.408409593318818e-05, "loss": 0.3876, "num_tokens": 269694486.0, "step": 2063 }, { "epoch": 0.8236233040702314, "grad_norm": 0.278442919254303, "learning_rate": 4.407740564581854e-05, "loss": 0.4022, "num_tokens": 269825558.0, "step": 2064 }, { "epoch": 0.8240223463687151, "grad_norm": 0.2260449230670929, "learning_rate": 4.407071215087546e-05, "loss": 0.334, "num_tokens": 269956630.0, "step": 2065 }, { "epoch": 0.8244213886671987, "grad_norm": 0.22159309685230255, "learning_rate": 4.406401544965404e-05, "loss": 0.3226, "num_tokens": 270087702.0, "step": 2066 }, { "epoch": 0.8248204309656824, "grad_norm": 0.21655622124671936, "learning_rate": 4.4057315543450034e-05, "loss": 0.2766, "num_tokens": 270218774.0, "step": 2067 }, { "epoch": 0.825219473264166, "grad_norm": 0.27863866090774536, "learning_rate": 4.4050612433559825e-05, "loss": 0.3758, "num_tokens": 270349846.0, "step": 2068 }, { "epoch": 0.8256185155626496, "grad_norm": 0.2585558295249939, "learning_rate": 4.404390612128038e-05, "loss": 0.3629, "num_tokens": 270480918.0, "step": 2069 }, { "epoch": 0.8260175578611333, "grad_norm": 0.23856832087039948, "learning_rate": 4.4037196607909334e-05, "loss": 0.3497, "num_tokens": 270611990.0, "step": 2070 }, { "epoch": 0.8264166001596169, "grad_norm": 0.25979071855545044, "learning_rate": 4.4030483894744895e-05, "loss": 0.3642, "num_tokens": 270743062.0, "step": 2071 }, { "epoch": 0.8268156424581006, "grad_norm": 0.2408258616924286, "learning_rate": 4.402376798308592e-05, "loss": 0.3531, "num_tokens": 270874134.0, "step": 2072 }, { "epoch": 0.8272146847565842, "grad_norm": 0.22528713941574097, "learning_rate": 4.4017048874231867e-05, "loss": 0.3084, "num_tokens": 271005206.0, "step": 2073 }, { "epoch": 0.8276137270550679, "grad_norm": 0.24818755686283112, "learning_rate": 4.4010326569482835e-05, "loss": 0.3394, "num_tokens": 271136278.0, "step": 2074 }, { "epoch": 0.8280127693535515, "grad_norm": 0.24339158833026886, "learning_rate": 4.4003601070139515e-05, "loss": 0.3544, "num_tokens": 271267350.0, "step": 2075 }, { "epoch": 0.8284118116520351, "grad_norm": 0.2316940873861313, "learning_rate": 4.3996872377503234e-05, "loss": 0.3415, "num_tokens": 271398422.0, "step": 2076 }, { "epoch": 0.8288108539505188, "grad_norm": 0.23472285270690918, "learning_rate": 4.399014049287593e-05, "loss": 0.3091, "num_tokens": 271529494.0, "step": 2077 }, { "epoch": 0.8292098962490024, "grad_norm": 0.24647746980190277, "learning_rate": 4.3983405417560155e-05, "loss": 0.3732, "num_tokens": 271660566.0, "step": 2078 }, { "epoch": 0.8296089385474861, "grad_norm": 0.23827393352985382, "learning_rate": 4.3976667152859094e-05, "loss": 0.3225, "num_tokens": 271791638.0, "step": 2079 }, { "epoch": 0.8300079808459697, "grad_norm": 0.22852405905723572, "learning_rate": 4.396992570007653e-05, "loss": 0.3105, "num_tokens": 271922710.0, "step": 2080 }, { "epoch": 0.8304070231444534, "grad_norm": 0.23707377910614014, "learning_rate": 4.396318106051687e-05, "loss": 0.3276, "num_tokens": 272053782.0, "step": 2081 }, { "epoch": 0.830806065442937, "grad_norm": 0.260907381772995, "learning_rate": 4.395643323548515e-05, "loss": 0.3605, "num_tokens": 272184854.0, "step": 2082 }, { "epoch": 0.8312051077414206, "grad_norm": 0.23202970623970032, "learning_rate": 4.3949682226287e-05, "loss": 0.2989, "num_tokens": 272315926.0, "step": 2083 }, { "epoch": 0.8316041500399042, "grad_norm": 0.23117733001708984, "learning_rate": 4.394292803422869e-05, "loss": 0.3068, "num_tokens": 272446998.0, "step": 2084 }, { "epoch": 0.8320031923383878, "grad_norm": 0.24788816273212433, "learning_rate": 4.393617066061707e-05, "loss": 0.3592, "num_tokens": 272578070.0, "step": 2085 }, { "epoch": 0.8324022346368715, "grad_norm": 0.23702505230903625, "learning_rate": 4.392941010675965e-05, "loss": 0.3311, "num_tokens": 272704362.0, "step": 2086 }, { "epoch": 0.8328012769353551, "grad_norm": 0.24031054973602295, "learning_rate": 4.392264637396453e-05, "loss": 0.3265, "num_tokens": 272835434.0, "step": 2087 }, { "epoch": 0.8332003192338387, "grad_norm": 0.25172826647758484, "learning_rate": 4.391587946354042e-05, "loss": 0.3376, "num_tokens": 272966506.0, "step": 2088 }, { "epoch": 0.8335993615323224, "grad_norm": 0.23530422151088715, "learning_rate": 4.3909109376796665e-05, "loss": 0.3209, "num_tokens": 273097578.0, "step": 2089 }, { "epoch": 0.833998403830806, "grad_norm": 0.22267931699752808, "learning_rate": 4.390233611504321e-05, "loss": 0.3063, "num_tokens": 273228650.0, "step": 2090 }, { "epoch": 0.8343974461292897, "grad_norm": 0.23810240626335144, "learning_rate": 4.389555967959062e-05, "loss": 0.3424, "num_tokens": 273359722.0, "step": 2091 }, { "epoch": 0.8347964884277733, "grad_norm": 0.24975009262561798, "learning_rate": 4.388878007175006e-05, "loss": 0.3501, "num_tokens": 273490794.0, "step": 2092 }, { "epoch": 0.835195530726257, "grad_norm": 0.24497386813163757, "learning_rate": 4.388199729283333e-05, "loss": 0.3233, "num_tokens": 273621866.0, "step": 2093 }, { "epoch": 0.8355945730247406, "grad_norm": 0.2294919341802597, "learning_rate": 4.387521134415285e-05, "loss": 0.2829, "num_tokens": 273752938.0, "step": 2094 }, { "epoch": 0.8359936153232242, "grad_norm": 0.2744426429271698, "learning_rate": 4.386842222702161e-05, "loss": 0.337, "num_tokens": 273884010.0, "step": 2095 }, { "epoch": 0.8363926576217079, "grad_norm": 0.27739614248275757, "learning_rate": 4.3861629942753254e-05, "loss": 0.343, "num_tokens": 274015082.0, "step": 2096 }, { "epoch": 0.8367916999201915, "grad_norm": 0.23570169508457184, "learning_rate": 4.385483449266203e-05, "loss": 0.3047, "num_tokens": 274146154.0, "step": 2097 }, { "epoch": 0.8371907422186752, "grad_norm": 0.25138095021247864, "learning_rate": 4.3848035878062786e-05, "loss": 0.3648, "num_tokens": 274277226.0, "step": 2098 }, { "epoch": 0.8375897845171588, "grad_norm": 0.24551820755004883, "learning_rate": 4.3841234100271e-05, "loss": 0.3684, "num_tokens": 274408298.0, "step": 2099 }, { "epoch": 0.8379888268156425, "grad_norm": 0.26767396926879883, "learning_rate": 4.383442916060274e-05, "loss": 0.3396, "num_tokens": 274539370.0, "step": 2100 }, { "epoch": 0.8383878691141261, "grad_norm": 0.2705978751182556, "learning_rate": 4.382762106037471e-05, "loss": 0.3452, "num_tokens": 274670442.0, "step": 2101 }, { "epoch": 0.8387869114126097, "grad_norm": 0.2425340861082077, "learning_rate": 4.38208098009042e-05, "loss": 0.3256, "num_tokens": 274801514.0, "step": 2102 }, { "epoch": 0.8391859537110934, "grad_norm": 0.2411920577287674, "learning_rate": 4.3813995383509146e-05, "loss": 0.3402, "num_tokens": 274932586.0, "step": 2103 }, { "epoch": 0.839584996009577, "grad_norm": 0.276152104139328, "learning_rate": 4.380717780950805e-05, "loss": 0.3653, "num_tokens": 275063658.0, "step": 2104 }, { "epoch": 0.8399840383080607, "grad_norm": 0.256797730922699, "learning_rate": 4.380035708022007e-05, "loss": 0.3375, "num_tokens": 275194730.0, "step": 2105 }, { "epoch": 0.8403830806065443, "grad_norm": 0.2317029982805252, "learning_rate": 4.3793533196964946e-05, "loss": 0.313, "num_tokens": 275325802.0, "step": 2106 }, { "epoch": 0.840782122905028, "grad_norm": 0.24100305140018463, "learning_rate": 4.378670616106303e-05, "loss": 0.3283, "num_tokens": 275456874.0, "step": 2107 }, { "epoch": 0.8411811652035116, "grad_norm": 0.22653789818286896, "learning_rate": 4.377987597383529e-05, "loss": 0.2979, "num_tokens": 275587946.0, "step": 2108 }, { "epoch": 0.8415802075019952, "grad_norm": 0.2590484917163849, "learning_rate": 4.377304263660332e-05, "loss": 0.3454, "num_tokens": 275719018.0, "step": 2109 }, { "epoch": 0.8419792498004789, "grad_norm": 0.23368537425994873, "learning_rate": 4.376620615068929e-05, "loss": 0.318, "num_tokens": 275850090.0, "step": 2110 }, { "epoch": 0.8423782920989625, "grad_norm": 0.2580960690975189, "learning_rate": 4.3759366517416e-05, "loss": 0.3635, "num_tokens": 275981162.0, "step": 2111 }, { "epoch": 0.8427773343974462, "grad_norm": 0.23585079610347748, "learning_rate": 4.375252373810686e-05, "loss": 0.3206, "num_tokens": 276112234.0, "step": 2112 }, { "epoch": 0.8431763766959298, "grad_norm": 0.2578468322753906, "learning_rate": 4.374567781408588e-05, "loss": 0.3787, "num_tokens": 276243306.0, "step": 2113 }, { "epoch": 0.8435754189944135, "grad_norm": 0.2565101385116577, "learning_rate": 4.373882874667768e-05, "loss": 0.3515, "num_tokens": 276374378.0, "step": 2114 }, { "epoch": 0.843974461292897, "grad_norm": 0.2372492253780365, "learning_rate": 4.3731976537207494e-05, "loss": 0.3179, "num_tokens": 276505450.0, "step": 2115 }, { "epoch": 0.8443735035913806, "grad_norm": 0.24094761908054352, "learning_rate": 4.372512118700117e-05, "loss": 0.317, "num_tokens": 276636522.0, "step": 2116 }, { "epoch": 0.8447725458898643, "grad_norm": 0.2237534373998642, "learning_rate": 4.371826269738513e-05, "loss": 0.2735, "num_tokens": 276767594.0, "step": 2117 }, { "epoch": 0.8451715881883479, "grad_norm": 0.2938021421432495, "learning_rate": 4.371140106968645e-05, "loss": 0.3758, "num_tokens": 276898666.0, "step": 2118 }, { "epoch": 0.8455706304868316, "grad_norm": 0.27855634689331055, "learning_rate": 4.370453630523279e-05, "loss": 0.3631, "num_tokens": 277029738.0, "step": 2119 }, { "epoch": 0.8459696727853152, "grad_norm": 0.2599278688430786, "learning_rate": 4.369766840535241e-05, "loss": 0.403, "num_tokens": 277160810.0, "step": 2120 }, { "epoch": 0.8463687150837989, "grad_norm": 0.2425663322210312, "learning_rate": 4.369079737137418e-05, "loss": 0.3559, "num_tokens": 277291882.0, "step": 2121 }, { "epoch": 0.8467677573822825, "grad_norm": 0.2362755686044693, "learning_rate": 4.368392320462759e-05, "loss": 0.3467, "num_tokens": 277422954.0, "step": 2122 }, { "epoch": 0.8471667996807661, "grad_norm": 0.24917466938495636, "learning_rate": 4.367704590644272e-05, "loss": 0.3721, "num_tokens": 277554026.0, "step": 2123 }, { "epoch": 0.8475658419792498, "grad_norm": 0.21521437168121338, "learning_rate": 4.367016547815027e-05, "loss": 0.2992, "num_tokens": 277685098.0, "step": 2124 }, { "epoch": 0.8479648842777334, "grad_norm": 0.22532565891742706, "learning_rate": 4.366328192108154e-05, "loss": 0.3519, "num_tokens": 277816170.0, "step": 2125 }, { "epoch": 0.8483639265762171, "grad_norm": 0.239088237285614, "learning_rate": 4.365639523656843e-05, "loss": 0.3589, "num_tokens": 277947242.0, "step": 2126 }, { "epoch": 0.8487629688747007, "grad_norm": 0.23330296576023102, "learning_rate": 4.364950542594345e-05, "loss": 0.3117, "num_tokens": 278078314.0, "step": 2127 }, { "epoch": 0.8491620111731844, "grad_norm": 0.2230599969625473, "learning_rate": 4.364261249053971e-05, "loss": 0.3025, "num_tokens": 278209386.0, "step": 2128 }, { "epoch": 0.849561053471668, "grad_norm": 0.2511444389820099, "learning_rate": 4.3635716431690935e-05, "loss": 0.3434, "num_tokens": 278340458.0, "step": 2129 }, { "epoch": 0.8499600957701516, "grad_norm": 0.24717512726783752, "learning_rate": 4.362881725073144e-05, "loss": 0.337, "num_tokens": 278471530.0, "step": 2130 }, { "epoch": 0.8503591380686353, "grad_norm": 0.265218585729599, "learning_rate": 4.3621914948996166e-05, "loss": 0.3564, "num_tokens": 278597208.0, "step": 2131 }, { "epoch": 0.8507581803671189, "grad_norm": 0.238608717918396, "learning_rate": 4.361500952782063e-05, "loss": 0.3251, "num_tokens": 278728280.0, "step": 2132 }, { "epoch": 0.8511572226656026, "grad_norm": 0.23946015536785126, "learning_rate": 4.3608100988540975e-05, "loss": 0.3159, "num_tokens": 278859352.0, "step": 2133 }, { "epoch": 0.8515562649640862, "grad_norm": 0.24490341544151306, "learning_rate": 4.360118933249394e-05, "loss": 0.315, "num_tokens": 278990424.0, "step": 2134 }, { "epoch": 0.8519553072625698, "grad_norm": 0.28809064626693726, "learning_rate": 4.359427456101686e-05, "loss": 0.3645, "num_tokens": 279121496.0, "step": 2135 }, { "epoch": 0.8523543495610535, "grad_norm": 0.2366572469472885, "learning_rate": 4.358735667544768e-05, "loss": 0.3032, "num_tokens": 279252568.0, "step": 2136 }, { "epoch": 0.8527533918595371, "grad_norm": 0.30037549138069153, "learning_rate": 4.358043567712495e-05, "loss": 0.3533, "num_tokens": 279383640.0, "step": 2137 }, { "epoch": 0.8531524341580208, "grad_norm": 0.27704918384552, "learning_rate": 4.3573511567387814e-05, "loss": 0.3393, "num_tokens": 279514712.0, "step": 2138 }, { "epoch": 0.8535514764565044, "grad_norm": 0.2731804847717285, "learning_rate": 4.3566584347576026e-05, "loss": 0.3592, "num_tokens": 279645784.0, "step": 2139 }, { "epoch": 0.8539505187549881, "grad_norm": 0.2490735948085785, "learning_rate": 4.355965401902994e-05, "loss": 0.3349, "num_tokens": 279776856.0, "step": 2140 }, { "epoch": 0.8543495610534717, "grad_norm": 0.2354428470134735, "learning_rate": 4.35527205830905e-05, "loss": 0.3509, "num_tokens": 279907928.0, "step": 2141 }, { "epoch": 0.8547486033519553, "grad_norm": 0.2263154834508896, "learning_rate": 4.354578404109928e-05, "loss": 0.3215, "num_tokens": 280039000.0, "step": 2142 }, { "epoch": 0.855147645650439, "grad_norm": 0.2342406064271927, "learning_rate": 4.353884439439841e-05, "loss": 0.3408, "num_tokens": 280170072.0, "step": 2143 }, { "epoch": 0.8555466879489226, "grad_norm": 0.24031639099121094, "learning_rate": 4.3531901644330684e-05, "loss": 0.3152, "num_tokens": 280301144.0, "step": 2144 }, { "epoch": 0.8559457302474063, "grad_norm": 0.2553277015686035, "learning_rate": 4.352495579223942e-05, "loss": 0.3265, "num_tokens": 280432216.0, "step": 2145 }, { "epoch": 0.8563447725458898, "grad_norm": 0.23577016592025757, "learning_rate": 4.35180068394686e-05, "loss": 0.3238, "num_tokens": 280563288.0, "step": 2146 }, { "epoch": 0.8567438148443735, "grad_norm": 0.2635398209095001, "learning_rate": 4.351105478736277e-05, "loss": 0.3581, "num_tokens": 280694360.0, "step": 2147 }, { "epoch": 0.8571428571428571, "grad_norm": 0.2319735586643219, "learning_rate": 4.3504099637267094e-05, "loss": 0.3048, "num_tokens": 280825432.0, "step": 2148 }, { "epoch": 0.8575418994413407, "grad_norm": 0.2675778865814209, "learning_rate": 4.3497141390527325e-05, "loss": 0.3965, "num_tokens": 280956504.0, "step": 2149 }, { "epoch": 0.8579409417398244, "grad_norm": 0.2611248195171356, "learning_rate": 4.349018004848982e-05, "loss": 0.3534, "num_tokens": 281087576.0, "step": 2150 }, { "epoch": 0.858339984038308, "grad_norm": 0.22721591591835022, "learning_rate": 4.348321561250154e-05, "loss": 0.2958, "num_tokens": 281218648.0, "step": 2151 }, { "epoch": 0.8587390263367917, "grad_norm": 0.24702423810958862, "learning_rate": 4.3476248083910025e-05, "loss": 0.3388, "num_tokens": 281349720.0, "step": 2152 }, { "epoch": 0.8591380686352753, "grad_norm": 0.2504909038543701, "learning_rate": 4.346927746406344e-05, "loss": 0.3365, "num_tokens": 281480792.0, "step": 2153 }, { "epoch": 0.859537110933759, "grad_norm": 0.21726372838020325, "learning_rate": 4.346230375431052e-05, "loss": 0.2419, "num_tokens": 281611864.0, "step": 2154 }, { "epoch": 0.8599361532322426, "grad_norm": 0.25157201290130615, "learning_rate": 4.3455326956000634e-05, "loss": 0.3305, "num_tokens": 281742936.0, "step": 2155 }, { "epoch": 0.8603351955307262, "grad_norm": 0.30889859795570374, "learning_rate": 4.3448347070483716e-05, "loss": 0.4227, "num_tokens": 281874008.0, "step": 2156 }, { "epoch": 0.8607342378292099, "grad_norm": 0.24808473885059357, "learning_rate": 4.34413640991103e-05, "loss": 0.3253, "num_tokens": 282005080.0, "step": 2157 }, { "epoch": 0.8611332801276935, "grad_norm": 0.22456157207489014, "learning_rate": 4.343437804323154e-05, "loss": 0.2981, "num_tokens": 282136152.0, "step": 2158 }, { "epoch": 0.8615323224261772, "grad_norm": 0.2431740015745163, "learning_rate": 4.3427388904199164e-05, "loss": 0.3339, "num_tokens": 282267224.0, "step": 2159 }, { "epoch": 0.8619313647246608, "grad_norm": 0.2201022058725357, "learning_rate": 4.3420396683365505e-05, "loss": 0.275, "num_tokens": 282398296.0, "step": 2160 }, { "epoch": 0.8623304070231445, "grad_norm": 0.2290540188550949, "learning_rate": 4.341340138208351e-05, "loss": 0.3371, "num_tokens": 282529368.0, "step": 2161 }, { "epoch": 0.8627294493216281, "grad_norm": 0.2395392805337906, "learning_rate": 4.3406403001706676e-05, "loss": 0.329, "num_tokens": 282660440.0, "step": 2162 }, { "epoch": 0.8631284916201117, "grad_norm": 0.22000625729560852, "learning_rate": 4.339940154358914e-05, "loss": 0.3018, "num_tokens": 282791512.0, "step": 2163 }, { "epoch": 0.8635275339185954, "grad_norm": 0.2659313678741455, "learning_rate": 4.3392397009085614e-05, "loss": 0.3273, "num_tokens": 282922584.0, "step": 2164 }, { "epoch": 0.863926576217079, "grad_norm": 0.26277658343315125, "learning_rate": 4.338538939955142e-05, "loss": 0.395, "num_tokens": 283053656.0, "step": 2165 }, { "epoch": 0.8643256185155627, "grad_norm": 0.22450503706932068, "learning_rate": 4.337837871634246e-05, "loss": 0.3002, "num_tokens": 283184728.0, "step": 2166 }, { "epoch": 0.8647246608140463, "grad_norm": 0.2487247735261917, "learning_rate": 4.337136496081523e-05, "loss": 0.3073, "num_tokens": 283315800.0, "step": 2167 }, { "epoch": 0.86512370311253, "grad_norm": 0.3146573305130005, "learning_rate": 4.336434813432684e-05, "loss": 0.4024, "num_tokens": 283446872.0, "step": 2168 }, { "epoch": 0.8655227454110136, "grad_norm": 0.22533275187015533, "learning_rate": 4.3357328238234964e-05, "loss": 0.3085, "num_tokens": 283577944.0, "step": 2169 }, { "epoch": 0.8659217877094972, "grad_norm": 0.2260666787624359, "learning_rate": 4.3350305273897906e-05, "loss": 0.3063, "num_tokens": 283709016.0, "step": 2170 }, { "epoch": 0.8663208300079809, "grad_norm": 0.2431541532278061, "learning_rate": 4.3343279242674524e-05, "loss": 0.3418, "num_tokens": 283840088.0, "step": 2171 }, { "epoch": 0.8667198723064645, "grad_norm": 0.285733163356781, "learning_rate": 4.333625014592429e-05, "loss": 0.3711, "num_tokens": 283971160.0, "step": 2172 }, { "epoch": 0.8671189146049482, "grad_norm": 0.22740131616592407, "learning_rate": 4.332921798500729e-05, "loss": 0.3402, "num_tokens": 284102232.0, "step": 2173 }, { "epoch": 0.8675179569034318, "grad_norm": 0.2644554376602173, "learning_rate": 4.332218276128416e-05, "loss": 0.3902, "num_tokens": 284233304.0, "step": 2174 }, { "epoch": 0.8679169992019155, "grad_norm": 0.2302287369966507, "learning_rate": 4.331514447611615e-05, "loss": 0.3448, "num_tokens": 284364376.0, "step": 2175 }, { "epoch": 0.8683160415003991, "grad_norm": 0.23024185001850128, "learning_rate": 4.330810313086513e-05, "loss": 0.3164, "num_tokens": 284495448.0, "step": 2176 }, { "epoch": 0.8687150837988827, "grad_norm": 0.24541953206062317, "learning_rate": 4.3301058726893504e-05, "loss": 0.3288, "num_tokens": 284626520.0, "step": 2177 }, { "epoch": 0.8691141260973663, "grad_norm": 0.2507837414741516, "learning_rate": 4.329401126556431e-05, "loss": 0.3555, "num_tokens": 284757592.0, "step": 2178 }, { "epoch": 0.8695131683958499, "grad_norm": 0.23358213901519775, "learning_rate": 4.328696074824117e-05, "loss": 0.3297, "num_tokens": 284884802.0, "step": 2179 }, { "epoch": 0.8699122106943336, "grad_norm": 0.23635688424110413, "learning_rate": 4.327990717628829e-05, "loss": 0.326, "num_tokens": 285015874.0, "step": 2180 }, { "epoch": 0.8703112529928172, "grad_norm": 0.236753910779953, "learning_rate": 4.327285055107046e-05, "loss": 0.332, "num_tokens": 285146946.0, "step": 2181 }, { "epoch": 0.8707102952913008, "grad_norm": 0.2605739235877991, "learning_rate": 4.326579087395309e-05, "loss": 0.3605, "num_tokens": 285278018.0, "step": 2182 }, { "epoch": 0.8711093375897845, "grad_norm": 0.23801898956298828, "learning_rate": 4.325872814630215e-05, "loss": 0.3268, "num_tokens": 285409090.0, "step": 2183 }, { "epoch": 0.8715083798882681, "grad_norm": 0.24870337545871735, "learning_rate": 4.3251662369484215e-05, "loss": 0.349, "num_tokens": 285540162.0, "step": 2184 }, { "epoch": 0.8719074221867518, "grad_norm": 0.2297748327255249, "learning_rate": 4.324459354486644e-05, "loss": 0.2999, "num_tokens": 285671234.0, "step": 2185 }, { "epoch": 0.8723064644852354, "grad_norm": 0.22629337012767792, "learning_rate": 4.323752167381659e-05, "loss": 0.2925, "num_tokens": 285802306.0, "step": 2186 }, { "epoch": 0.872705506783719, "grad_norm": 0.2394433468580246, "learning_rate": 4.323044675770301e-05, "loss": 0.2889, "num_tokens": 285933378.0, "step": 2187 }, { "epoch": 0.8731045490822027, "grad_norm": 0.25784847140312195, "learning_rate": 4.322336879789461e-05, "loss": 0.3465, "num_tokens": 286064450.0, "step": 2188 }, { "epoch": 0.8735035913806863, "grad_norm": 0.25489968061447144, "learning_rate": 4.321628779576092e-05, "loss": 0.345, "num_tokens": 286195522.0, "step": 2189 }, { "epoch": 0.87390263367917, "grad_norm": 0.2558664381504059, "learning_rate": 4.320920375267204e-05, "loss": 0.3413, "num_tokens": 286326594.0, "step": 2190 }, { "epoch": 0.8743016759776536, "grad_norm": 0.2715689241886139, "learning_rate": 4.320211666999867e-05, "loss": 0.3927, "num_tokens": 286457666.0, "step": 2191 }, { "epoch": 0.8747007182761373, "grad_norm": 0.25068336725234985, "learning_rate": 4.3195026549112114e-05, "loss": 0.3228, "num_tokens": 286588738.0, "step": 2192 }, { "epoch": 0.8750997605746209, "grad_norm": 0.23186591267585754, "learning_rate": 4.3187933391384215e-05, "loss": 0.3136, "num_tokens": 286719810.0, "step": 2193 }, { "epoch": 0.8754988028731046, "grad_norm": 0.2332015186548233, "learning_rate": 4.3180837198187454e-05, "loss": 0.3208, "num_tokens": 286850882.0, "step": 2194 }, { "epoch": 0.8758978451715882, "grad_norm": 0.2583736777305603, "learning_rate": 4.3173737970894864e-05, "loss": 0.361, "num_tokens": 286967779.0, "step": 2195 }, { "epoch": 0.8762968874700718, "grad_norm": 0.22821982204914093, "learning_rate": 4.3166635710880096e-05, "loss": 0.2968, "num_tokens": 287084114.0, "step": 2196 }, { "epoch": 0.8766959297685555, "grad_norm": 0.2644006907939911, "learning_rate": 4.315953041951735e-05, "loss": 0.4343, "num_tokens": 287215186.0, "step": 2197 }, { "epoch": 0.8770949720670391, "grad_norm": 0.22953657805919647, "learning_rate": 4.3152422098181436e-05, "loss": 0.3334, "num_tokens": 287346258.0, "step": 2198 }, { "epoch": 0.8774940143655228, "grad_norm": 0.24873247742652893, "learning_rate": 4.314531074824777e-05, "loss": 0.3645, "num_tokens": 287477330.0, "step": 2199 }, { "epoch": 0.8778930566640064, "grad_norm": 0.21559423208236694, "learning_rate": 4.313819637109232e-05, "loss": 0.2788, "num_tokens": 287608402.0, "step": 2200 }, { "epoch": 0.87829209896249, "grad_norm": 0.2228280007839203, "learning_rate": 4.3131078968091646e-05, "loss": 0.2697, "num_tokens": 287739474.0, "step": 2201 }, { "epoch": 0.8786911412609737, "grad_norm": 0.2613068222999573, "learning_rate": 4.31239585406229e-05, "loss": 0.3862, "num_tokens": 287870546.0, "step": 2202 }, { "epoch": 0.8790901835594573, "grad_norm": 0.25899505615234375, "learning_rate": 4.3116835090063817e-05, "loss": 0.3534, "num_tokens": 288001618.0, "step": 2203 }, { "epoch": 0.879489225857941, "grad_norm": 0.2649589478969574, "learning_rate": 4.310970861779274e-05, "loss": 0.3749, "num_tokens": 288132690.0, "step": 2204 }, { "epoch": 0.8798882681564246, "grad_norm": 0.2227754145860672, "learning_rate": 4.310257912518854e-05, "loss": 0.3072, "num_tokens": 288263762.0, "step": 2205 }, { "epoch": 0.8802873104549083, "grad_norm": 0.22944197058677673, "learning_rate": 4.309544661363073e-05, "loss": 0.3233, "num_tokens": 288394834.0, "step": 2206 }, { "epoch": 0.8806863527533919, "grad_norm": 0.25079184770584106, "learning_rate": 4.308831108449937e-05, "loss": 0.3506, "num_tokens": 288525906.0, "step": 2207 }, { "epoch": 0.8810853950518756, "grad_norm": 0.23437565565109253, "learning_rate": 4.308117253917514e-05, "loss": 0.3349, "num_tokens": 288656978.0, "step": 2208 }, { "epoch": 0.8814844373503591, "grad_norm": 0.23620320856571198, "learning_rate": 4.307403097903927e-05, "loss": 0.3482, "num_tokens": 288788050.0, "step": 2209 }, { "epoch": 0.8818834796488427, "grad_norm": 0.25681838393211365, "learning_rate": 4.3066886405473575e-05, "loss": 0.3436, "num_tokens": 288903699.0, "step": 2210 }, { "epoch": 0.8822825219473264, "grad_norm": 0.24761393666267395, "learning_rate": 4.305973881986048e-05, "loss": 0.3538, "num_tokens": 289034771.0, "step": 2211 }, { "epoch": 0.88268156424581, "grad_norm": 0.23949843645095825, "learning_rate": 4.305258822358297e-05, "loss": 0.3491, "num_tokens": 289165843.0, "step": 2212 }, { "epoch": 0.8830806065442937, "grad_norm": 0.24557146430015564, "learning_rate": 4.304543461802461e-05, "loss": 0.3415, "num_tokens": 289296915.0, "step": 2213 }, { "epoch": 0.8834796488427773, "grad_norm": 0.23404951393604279, "learning_rate": 4.303827800456957e-05, "loss": 0.2966, "num_tokens": 289427987.0, "step": 2214 }, { "epoch": 0.8838786911412609, "grad_norm": 0.27333498001098633, "learning_rate": 4.303111838460258e-05, "loss": 0.3452, "num_tokens": 289559059.0, "step": 2215 }, { "epoch": 0.8842777334397446, "grad_norm": 0.2335207164287567, "learning_rate": 4.302395575950897e-05, "loss": 0.3081, "num_tokens": 289690131.0, "step": 2216 }, { "epoch": 0.8846767757382282, "grad_norm": 0.266804575920105, "learning_rate": 4.301679013067463e-05, "loss": 0.3593, "num_tokens": 289821203.0, "step": 2217 }, { "epoch": 0.8850758180367119, "grad_norm": 0.2281840592622757, "learning_rate": 4.300962149948604e-05, "loss": 0.3119, "num_tokens": 289952275.0, "step": 2218 }, { "epoch": 0.8854748603351955, "grad_norm": 0.24825216829776764, "learning_rate": 4.300244986733027e-05, "loss": 0.384, "num_tokens": 290083347.0, "step": 2219 }, { "epoch": 0.8858739026336792, "grad_norm": 0.22796088457107544, "learning_rate": 4.299527523559496e-05, "loss": 0.3029, "num_tokens": 290214419.0, "step": 2220 }, { "epoch": 0.8862729449321628, "grad_norm": 0.24715283513069153, "learning_rate": 4.298809760566835e-05, "loss": 0.3109, "num_tokens": 290345491.0, "step": 2221 }, { "epoch": 0.8866719872306464, "grad_norm": 0.2431965172290802, "learning_rate": 4.298091697893922e-05, "loss": 0.3362, "num_tokens": 290476563.0, "step": 2222 }, { "epoch": 0.8870710295291301, "grad_norm": 0.2372230589389801, "learning_rate": 4.297373335679697e-05, "loss": 0.3161, "num_tokens": 290607635.0, "step": 2223 }, { "epoch": 0.8874700718276137, "grad_norm": 0.23079483211040497, "learning_rate": 4.296654674063155e-05, "loss": 0.2999, "num_tokens": 290738707.0, "step": 2224 }, { "epoch": 0.8878691141260974, "grad_norm": 0.2471322864294052, "learning_rate": 4.295935713183352e-05, "loss": 0.3295, "num_tokens": 290869779.0, "step": 2225 }, { "epoch": 0.888268156424581, "grad_norm": 0.2467479705810547, "learning_rate": 4.2952164531794e-05, "loss": 0.3426, "num_tokens": 291000851.0, "step": 2226 }, { "epoch": 0.8886671987230647, "grad_norm": 0.24176470935344696, "learning_rate": 4.2944968941904685e-05, "loss": 0.3004, "num_tokens": 291131923.0, "step": 2227 }, { "epoch": 0.8890662410215483, "grad_norm": 0.2571839988231659, "learning_rate": 4.2937770363557856e-05, "loss": 0.3568, "num_tokens": 291262995.0, "step": 2228 }, { "epoch": 0.8894652833200319, "grad_norm": 0.2659177780151367, "learning_rate": 4.2930568798146364e-05, "loss": 0.336, "num_tokens": 291394067.0, "step": 2229 }, { "epoch": 0.8898643256185156, "grad_norm": 0.25462454557418823, "learning_rate": 4.292336424706365e-05, "loss": 0.3546, "num_tokens": 291510847.0, "step": 2230 }, { "epoch": 0.8902633679169992, "grad_norm": 0.2550539970397949, "learning_rate": 4.291615671170374e-05, "loss": 0.362, "num_tokens": 291641919.0, "step": 2231 }, { "epoch": 0.8906624102154829, "grad_norm": 0.23871548473834991, "learning_rate": 4.290894619346121e-05, "loss": 0.3006, "num_tokens": 291772991.0, "step": 2232 }, { "epoch": 0.8910614525139665, "grad_norm": 0.2695563733577728, "learning_rate": 4.2901732693731236e-05, "loss": 0.3691, "num_tokens": 291904063.0, "step": 2233 }, { "epoch": 0.8914604948124502, "grad_norm": 0.24669960141181946, "learning_rate": 4.289451621390955e-05, "loss": 0.3155, "num_tokens": 292035135.0, "step": 2234 }, { "epoch": 0.8918595371109338, "grad_norm": 0.2540874779224396, "learning_rate": 4.288729675539249e-05, "loss": 0.3153, "num_tokens": 292166207.0, "step": 2235 }, { "epoch": 0.8922585794094174, "grad_norm": 0.2494598776102066, "learning_rate": 4.288007431957694e-05, "loss": 0.3253, "num_tokens": 292297279.0, "step": 2236 }, { "epoch": 0.8926576217079011, "grad_norm": 0.2802734375, "learning_rate": 4.2872848907860374e-05, "loss": 0.3665, "num_tokens": 292428351.0, "step": 2237 }, { "epoch": 0.8930566640063847, "grad_norm": 0.23193944990634918, "learning_rate": 4.286562052164086e-05, "loss": 0.3174, "num_tokens": 292559423.0, "step": 2238 }, { "epoch": 0.8934557063048684, "grad_norm": 0.24699781835079193, "learning_rate": 4.285838916231701e-05, "loss": 0.348, "num_tokens": 292690495.0, "step": 2239 }, { "epoch": 0.8938547486033519, "grad_norm": 0.262796550989151, "learning_rate": 4.285115483128803e-05, "loss": 0.3252, "num_tokens": 292821567.0, "step": 2240 }, { "epoch": 0.8942537909018355, "grad_norm": 0.2267792522907257, "learning_rate": 4.284391752995368e-05, "loss": 0.3236, "num_tokens": 292952639.0, "step": 2241 }, { "epoch": 0.8946528332003192, "grad_norm": 0.35013484954833984, "learning_rate": 4.283667725971433e-05, "loss": 0.3694, "num_tokens": 293083711.0, "step": 2242 }, { "epoch": 0.8950518754988028, "grad_norm": 0.23772664368152618, "learning_rate": 4.2829434021970896e-05, "loss": 0.3253, "num_tokens": 293214783.0, "step": 2243 }, { "epoch": 0.8954509177972865, "grad_norm": 0.23944170773029327, "learning_rate": 4.2822187818124874e-05, "loss": 0.3439, "num_tokens": 293345855.0, "step": 2244 }, { "epoch": 0.8958499600957701, "grad_norm": 0.23064298927783966, "learning_rate": 4.2814938649578345e-05, "loss": 0.3126, "num_tokens": 293476927.0, "step": 2245 }, { "epoch": 0.8962490023942538, "grad_norm": 0.22668804228305817, "learning_rate": 4.280768651773395e-05, "loss": 0.2984, "num_tokens": 293607999.0, "step": 2246 }, { "epoch": 0.8966480446927374, "grad_norm": 0.25286200642585754, "learning_rate": 4.2800431423994914e-05, "loss": 0.3369, "num_tokens": 293739071.0, "step": 2247 }, { "epoch": 0.897047086991221, "grad_norm": 0.24937167763710022, "learning_rate": 4.279317336976502e-05, "loss": 0.3387, "num_tokens": 293870143.0, "step": 2248 }, { "epoch": 0.8974461292897047, "grad_norm": 0.2610321640968323, "learning_rate": 4.278591235644864e-05, "loss": 0.3618, "num_tokens": 294001215.0, "step": 2249 }, { "epoch": 0.8978451715881883, "grad_norm": 0.22454063594341278, "learning_rate": 4.277864838545072e-05, "loss": 0.2904, "num_tokens": 294132287.0, "step": 2250 }, { "epoch": 0.898244213886672, "grad_norm": 0.2480129450559616, "learning_rate": 4.277138145817676e-05, "loss": 0.3277, "num_tokens": 294257874.0, "step": 2251 }, { "epoch": 0.8986432561851556, "grad_norm": 0.24169230461120605, "learning_rate": 4.276411157603284e-05, "loss": 0.3187, "num_tokens": 294388946.0, "step": 2252 }, { "epoch": 0.8990422984836393, "grad_norm": 0.21200178563594818, "learning_rate": 4.275683874042563e-05, "loss": 0.2865, "num_tokens": 294520018.0, "step": 2253 }, { "epoch": 0.8994413407821229, "grad_norm": 0.25012439489364624, "learning_rate": 4.274956295276234e-05, "loss": 0.3357, "num_tokens": 294651090.0, "step": 2254 }, { "epoch": 0.8998403830806065, "grad_norm": 0.24387037754058838, "learning_rate": 4.2742284214450774e-05, "loss": 0.3406, "num_tokens": 294782162.0, "step": 2255 }, { "epoch": 0.9002394253790902, "grad_norm": 0.2504501938819885, "learning_rate": 4.27350025268993e-05, "loss": 0.3464, "num_tokens": 294913234.0, "step": 2256 }, { "epoch": 0.9006384676775738, "grad_norm": 0.23297522962093353, "learning_rate": 4.2727717891516855e-05, "loss": 0.3087, "num_tokens": 295044306.0, "step": 2257 }, { "epoch": 0.9010375099760575, "grad_norm": 0.251549631357193, "learning_rate": 4.272043030971295e-05, "loss": 0.3556, "num_tokens": 295175378.0, "step": 2258 }, { "epoch": 0.9014365522745411, "grad_norm": 0.22935964167118073, "learning_rate": 4.271313978289766e-05, "loss": 0.3102, "num_tokens": 295306450.0, "step": 2259 }, { "epoch": 0.9018355945730248, "grad_norm": 0.24093550443649292, "learning_rate": 4.270584631248164e-05, "loss": 0.341, "num_tokens": 295437522.0, "step": 2260 }, { "epoch": 0.9022346368715084, "grad_norm": 0.22717109322547913, "learning_rate": 4.269854989987611e-05, "loss": 0.3388, "num_tokens": 295568594.0, "step": 2261 }, { "epoch": 0.902633679169992, "grad_norm": 0.22743409872055054, "learning_rate": 4.269125054649284e-05, "loss": 0.2898, "num_tokens": 295699666.0, "step": 2262 }, { "epoch": 0.9030327214684757, "grad_norm": 0.25884032249450684, "learning_rate": 4.2683948253744214e-05, "loss": 0.3554, "num_tokens": 295830738.0, "step": 2263 }, { "epoch": 0.9034317637669593, "grad_norm": 0.2158237248659134, "learning_rate": 4.267664302304314e-05, "loss": 0.2549, "num_tokens": 295961810.0, "step": 2264 }, { "epoch": 0.903830806065443, "grad_norm": 0.24226851761341095, "learning_rate": 4.2669334855803114e-05, "loss": 0.3346, "num_tokens": 296092882.0, "step": 2265 }, { "epoch": 0.9042298483639266, "grad_norm": 0.2530994117259979, "learning_rate": 4.2662023753438194e-05, "loss": 0.3573, "num_tokens": 296223954.0, "step": 2266 }, { "epoch": 0.9046288906624103, "grad_norm": 0.25482410192489624, "learning_rate": 4.2654709717363026e-05, "loss": 0.3027, "num_tokens": 296355026.0, "step": 2267 }, { "epoch": 0.9050279329608939, "grad_norm": 0.2342139035463333, "learning_rate": 4.26473927489928e-05, "loss": 0.3151, "num_tokens": 296486098.0, "step": 2268 }, { "epoch": 0.9054269752593775, "grad_norm": 0.24262967705726624, "learning_rate": 4.264007284974327e-05, "loss": 0.2973, "num_tokens": 296617170.0, "step": 2269 }, { "epoch": 0.9058260175578612, "grad_norm": 0.24069802463054657, "learning_rate": 4.263275002103079e-05, "loss": 0.3098, "num_tokens": 296748242.0, "step": 2270 }, { "epoch": 0.9062250598563448, "grad_norm": 0.2682357430458069, "learning_rate": 4.262542426427223e-05, "loss": 0.3491, "num_tokens": 296879314.0, "step": 2271 }, { "epoch": 0.9066241021548284, "grad_norm": 0.2633809447288513, "learning_rate": 4.261809558088509e-05, "loss": 0.376, "num_tokens": 297010386.0, "step": 2272 }, { "epoch": 0.907023144453312, "grad_norm": 0.1923515349626541, "learning_rate": 4.261076397228737e-05, "loss": 0.2452, "num_tokens": 297141458.0, "step": 2273 }, { "epoch": 0.9074221867517956, "grad_norm": 0.23861265182495117, "learning_rate": 4.260342943989768e-05, "loss": 0.321, "num_tokens": 297272530.0, "step": 2274 }, { "epoch": 0.9078212290502793, "grad_norm": 0.246604323387146, "learning_rate": 4.25960919851352e-05, "loss": 0.2923, "num_tokens": 297403602.0, "step": 2275 }, { "epoch": 0.9082202713487629, "grad_norm": 0.23457218706607819, "learning_rate": 4.258875160941963e-05, "loss": 0.3387, "num_tokens": 297534674.0, "step": 2276 }, { "epoch": 0.9086193136472466, "grad_norm": 0.2565194070339203, "learning_rate": 4.25814083141713e-05, "loss": 0.3295, "num_tokens": 297665746.0, "step": 2277 }, { "epoch": 0.9090183559457302, "grad_norm": 0.25593435764312744, "learning_rate": 4.257406210081103e-05, "loss": 0.3031, "num_tokens": 297796818.0, "step": 2278 }, { "epoch": 0.9094173982442139, "grad_norm": 0.22848482429981232, "learning_rate": 4.256671297076027e-05, "loss": 0.301, "num_tokens": 297927890.0, "step": 2279 }, { "epoch": 0.9098164405426975, "grad_norm": 0.2504516839981079, "learning_rate": 4.2559360925441e-05, "loss": 0.34, "num_tokens": 298058962.0, "step": 2280 }, { "epoch": 0.9102154828411811, "grad_norm": 0.24585747718811035, "learning_rate": 4.255200596627577e-05, "loss": 0.3332, "num_tokens": 298190034.0, "step": 2281 }, { "epoch": 0.9106145251396648, "grad_norm": 0.239228293299675, "learning_rate": 4.2544648094687696e-05, "loss": 0.3222, "num_tokens": 298321106.0, "step": 2282 }, { "epoch": 0.9110135674381484, "grad_norm": 0.23923185467720032, "learning_rate": 4.253728731210046e-05, "loss": 0.2961, "num_tokens": 298452178.0, "step": 2283 }, { "epoch": 0.9114126097366321, "grad_norm": 0.22699204087257385, "learning_rate": 4.252992361993831e-05, "loss": 0.285, "num_tokens": 298583250.0, "step": 2284 }, { "epoch": 0.9118116520351157, "grad_norm": 0.23731230199337006, "learning_rate": 4.2522557019626046e-05, "loss": 0.2932, "num_tokens": 298714322.0, "step": 2285 }, { "epoch": 0.9122106943335994, "grad_norm": 0.2364581972360611, "learning_rate": 4.2515187512589036e-05, "loss": 0.2924, "num_tokens": 298845394.0, "step": 2286 }, { "epoch": 0.912609736632083, "grad_norm": 0.26736363768577576, "learning_rate": 4.250781510025321e-05, "loss": 0.3683, "num_tokens": 298976466.0, "step": 2287 }, { "epoch": 0.9130087789305666, "grad_norm": 0.254006028175354, "learning_rate": 4.2500439784045074e-05, "loss": 0.3157, "num_tokens": 299107538.0, "step": 2288 }, { "epoch": 0.9134078212290503, "grad_norm": 0.23926149308681488, "learning_rate": 4.2493061565391654e-05, "loss": 0.295, "num_tokens": 299238610.0, "step": 2289 }, { "epoch": 0.9138068635275339, "grad_norm": 0.2750110626220703, "learning_rate": 4.24856804457206e-05, "loss": 0.3488, "num_tokens": 299369682.0, "step": 2290 }, { "epoch": 0.9142059058260176, "grad_norm": 0.21542692184448242, "learning_rate": 4.247829642646006e-05, "loss": 0.2641, "num_tokens": 299500754.0, "step": 2291 }, { "epoch": 0.9146049481245012, "grad_norm": 0.24847930669784546, "learning_rate": 4.24709095090388e-05, "loss": 0.3492, "num_tokens": 299631826.0, "step": 2292 }, { "epoch": 0.9150039904229849, "grad_norm": 0.2856829762458801, "learning_rate": 4.24635196948861e-05, "loss": 0.3628, "num_tokens": 299762898.0, "step": 2293 }, { "epoch": 0.9154030327214685, "grad_norm": 0.23228754103183746, "learning_rate": 4.2456126985431835e-05, "loss": 0.3249, "num_tokens": 299893970.0, "step": 2294 }, { "epoch": 0.9158020750199521, "grad_norm": 0.23144854605197906, "learning_rate": 4.244873138210641e-05, "loss": 0.2963, "num_tokens": 300025042.0, "step": 2295 }, { "epoch": 0.9162011173184358, "grad_norm": 0.25441303849220276, "learning_rate": 4.244133288634081e-05, "loss": 0.3607, "num_tokens": 300156114.0, "step": 2296 }, { "epoch": 0.9166001596169194, "grad_norm": 0.24845394492149353, "learning_rate": 4.2433931499566593e-05, "loss": 0.2976, "num_tokens": 300287186.0, "step": 2297 }, { "epoch": 0.9169992019154031, "grad_norm": 0.2743920385837555, "learning_rate": 4.2426527223215826e-05, "loss": 0.3647, "num_tokens": 300418258.0, "step": 2298 }, { "epoch": 0.9173982442138867, "grad_norm": 0.2842317521572113, "learning_rate": 4.241912005872119e-05, "loss": 0.3551, "num_tokens": 300549330.0, "step": 2299 }, { "epoch": 0.9177972865123704, "grad_norm": 0.30254775285720825, "learning_rate": 4.2411710007515906e-05, "loss": 0.3911, "num_tokens": 300680402.0, "step": 2300 }, { "epoch": 0.918196328810854, "grad_norm": 0.22995178401470184, "learning_rate": 4.240429707103373e-05, "loss": 0.2877, "num_tokens": 300811474.0, "step": 2301 }, { "epoch": 0.9185953711093376, "grad_norm": 0.25402095913887024, "learning_rate": 4.2396881250709015e-05, "loss": 0.3419, "num_tokens": 300942546.0, "step": 2302 }, { "epoch": 0.9189944134078212, "grad_norm": 0.2586497366428375, "learning_rate": 4.2389462547976636e-05, "loss": 0.2748, "num_tokens": 301073618.0, "step": 2303 }, { "epoch": 0.9193934557063048, "grad_norm": 0.2344413846731186, "learning_rate": 4.238204096427205e-05, "loss": 0.3289, "num_tokens": 301204690.0, "step": 2304 }, { "epoch": 0.9197924980047885, "grad_norm": 0.2387811839580536, "learning_rate": 4.237461650103128e-05, "loss": 0.3131, "num_tokens": 301335762.0, "step": 2305 }, { "epoch": 0.9201915403032721, "grad_norm": 0.24863941967487335, "learning_rate": 4.236718915969087e-05, "loss": 0.3522, "num_tokens": 301466834.0, "step": 2306 }, { "epoch": 0.9205905826017557, "grad_norm": 0.2270064651966095, "learning_rate": 4.235975894168794e-05, "loss": 0.303, "num_tokens": 301597906.0, "step": 2307 }, { "epoch": 0.9209896249002394, "grad_norm": 0.26370716094970703, "learning_rate": 4.2352325848460184e-05, "loss": 0.346, "num_tokens": 301728978.0, "step": 2308 }, { "epoch": 0.921388667198723, "grad_norm": 0.2258700579404831, "learning_rate": 4.234488988144583e-05, "loss": 0.3182, "num_tokens": 301860050.0, "step": 2309 }, { "epoch": 0.9217877094972067, "grad_norm": 0.24043641984462738, "learning_rate": 4.2337451042083665e-05, "loss": 0.322, "num_tokens": 301991122.0, "step": 2310 }, { "epoch": 0.9221867517956903, "grad_norm": 0.2483738660812378, "learning_rate": 4.233000933181303e-05, "loss": 0.3128, "num_tokens": 302122194.0, "step": 2311 }, { "epoch": 0.922585794094174, "grad_norm": 0.23721584677696228, "learning_rate": 4.232256475207384e-05, "loss": 0.3174, "num_tokens": 302253266.0, "step": 2312 }, { "epoch": 0.9229848363926576, "grad_norm": 0.2557238042354584, "learning_rate": 4.231511730430653e-05, "loss": 0.3633, "num_tokens": 302384338.0, "step": 2313 }, { "epoch": 0.9233838786911412, "grad_norm": 0.24351565539836884, "learning_rate": 4.230766698995214e-05, "loss": 0.3006, "num_tokens": 302515410.0, "step": 2314 }, { "epoch": 0.9237829209896249, "grad_norm": 0.27008479833602905, "learning_rate": 4.2300213810452224e-05, "loss": 0.3414, "num_tokens": 302646482.0, "step": 2315 }, { "epoch": 0.9241819632881085, "grad_norm": 0.2380354106426239, "learning_rate": 4.229275776724889e-05, "loss": 0.2884, "num_tokens": 302777554.0, "step": 2316 }, { "epoch": 0.9245810055865922, "grad_norm": 0.24092280864715576, "learning_rate": 4.228529886178484e-05, "loss": 0.3134, "num_tokens": 302908626.0, "step": 2317 }, { "epoch": 0.9249800478850758, "grad_norm": 0.25967422127723694, "learning_rate": 4.227783709550328e-05, "loss": 0.3378, "num_tokens": 303039698.0, "step": 2318 }, { "epoch": 0.9253790901835595, "grad_norm": 0.2630384564399719, "learning_rate": 4.2270372469847995e-05, "loss": 0.3179, "num_tokens": 303170770.0, "step": 2319 }, { "epoch": 0.9257781324820431, "grad_norm": 0.26462024450302124, "learning_rate": 4.2262904986263326e-05, "loss": 0.3471, "num_tokens": 303301842.0, "step": 2320 }, { "epoch": 0.9261771747805267, "grad_norm": 0.2772708237171173, "learning_rate": 4.225543464619417e-05, "loss": 0.3939, "num_tokens": 303432914.0, "step": 2321 }, { "epoch": 0.9265762170790104, "grad_norm": 0.24646234512329102, "learning_rate": 4.2247961451085945e-05, "loss": 0.3418, "num_tokens": 303563986.0, "step": 2322 }, { "epoch": 0.926975259377494, "grad_norm": 0.25141796469688416, "learning_rate": 4.2240485402384664e-05, "loss": 0.34, "num_tokens": 303695058.0, "step": 2323 }, { "epoch": 0.9273743016759777, "grad_norm": 0.23681145906448364, "learning_rate": 4.2233006501536865e-05, "loss": 0.326, "num_tokens": 303826130.0, "step": 2324 }, { "epoch": 0.9277733439744613, "grad_norm": 0.2539687156677246, "learning_rate": 4.222552474998965e-05, "loss": 0.3642, "num_tokens": 303957202.0, "step": 2325 }, { "epoch": 0.928172386272945, "grad_norm": 0.2682996690273285, "learning_rate": 4.221804014919066e-05, "loss": 0.2802, "num_tokens": 304088274.0, "step": 2326 }, { "epoch": 0.9285714285714286, "grad_norm": 0.22967272996902466, "learning_rate": 4.2210552700588114e-05, "loss": 0.2947, "num_tokens": 304219346.0, "step": 2327 }, { "epoch": 0.9289704708699122, "grad_norm": 0.24943242967128754, "learning_rate": 4.2203062405630735e-05, "loss": 0.3412, "num_tokens": 304350418.0, "step": 2328 }, { "epoch": 0.9293695131683959, "grad_norm": 0.24623893201351166, "learning_rate": 4.219556926576785e-05, "loss": 0.3536, "num_tokens": 304481490.0, "step": 2329 }, { "epoch": 0.9297685554668795, "grad_norm": 0.2304745316505432, "learning_rate": 4.2188073282449306e-05, "loss": 0.2991, "num_tokens": 304612562.0, "step": 2330 }, { "epoch": 0.9301675977653632, "grad_norm": 0.22750736773014069, "learning_rate": 4.218057445712549e-05, "loss": 0.2854, "num_tokens": 304743634.0, "step": 2331 }, { "epoch": 0.9305666400638468, "grad_norm": 0.28484922647476196, "learning_rate": 4.217307279124738e-05, "loss": 0.3762, "num_tokens": 304874706.0, "step": 2332 }, { "epoch": 0.9309656823623305, "grad_norm": 0.23554323613643646, "learning_rate": 4.216556828626645e-05, "loss": 0.3023, "num_tokens": 305005778.0, "step": 2333 }, { "epoch": 0.931364724660814, "grad_norm": 0.25114113092422485, "learning_rate": 4.215806094363478e-05, "loss": 0.2711, "num_tokens": 305136850.0, "step": 2334 }, { "epoch": 0.9317637669592976, "grad_norm": 0.24427975714206696, "learning_rate": 4.215055076480496e-05, "loss": 0.307, "num_tokens": 305267922.0, "step": 2335 }, { "epoch": 0.9321628092577813, "grad_norm": 0.2763548493385315, "learning_rate": 4.214303775123013e-05, "loss": 0.3869, "num_tokens": 305398994.0, "step": 2336 }, { "epoch": 0.9325618515562649, "grad_norm": 0.2620449364185333, "learning_rate": 4.213552190436399e-05, "loss": 0.3198, "num_tokens": 305530066.0, "step": 2337 }, { "epoch": 0.9329608938547486, "grad_norm": 0.2661764621734619, "learning_rate": 4.21280032256608e-05, "loss": 0.3778, "num_tokens": 305661138.0, "step": 2338 }, { "epoch": 0.9333599361532322, "grad_norm": 0.25924235582351685, "learning_rate": 4.2120481716575344e-05, "loss": 0.2947, "num_tokens": 305792210.0, "step": 2339 }, { "epoch": 0.9337589784517158, "grad_norm": 0.2511110007762909, "learning_rate": 4.2112957378562975e-05, "loss": 0.3293, "num_tokens": 305923282.0, "step": 2340 }, { "epoch": 0.9341580207501995, "grad_norm": 0.2582775950431824, "learning_rate": 4.210543021307957e-05, "loss": 0.3162, "num_tokens": 306054354.0, "step": 2341 }, { "epoch": 0.9345570630486831, "grad_norm": 0.2570054829120636, "learning_rate": 4.209790022158156e-05, "loss": 0.3262, "num_tokens": 306185426.0, "step": 2342 }, { "epoch": 0.9349561053471668, "grad_norm": 0.2682472765445709, "learning_rate": 4.2090367405525944e-05, "loss": 0.3635, "num_tokens": 306316498.0, "step": 2343 }, { "epoch": 0.9353551476456504, "grad_norm": 0.2755666971206665, "learning_rate": 4.208283176637024e-05, "loss": 0.3585, "num_tokens": 306447570.0, "step": 2344 }, { "epoch": 0.9357541899441341, "grad_norm": 0.23105987906455994, "learning_rate": 4.207529330557253e-05, "loss": 0.2709, "num_tokens": 306578642.0, "step": 2345 }, { "epoch": 0.9361532322426177, "grad_norm": 0.2509336471557617, "learning_rate": 4.206775202459144e-05, "loss": 0.3296, "num_tokens": 306709714.0, "step": 2346 }, { "epoch": 0.9365522745411013, "grad_norm": 0.22905580699443817, "learning_rate": 4.206020792488613e-05, "loss": 0.3071, "num_tokens": 306840786.0, "step": 2347 }, { "epoch": 0.936951316839585, "grad_norm": 0.2602418065071106, "learning_rate": 4.2052661007916314e-05, "loss": 0.3037, "num_tokens": 306971858.0, "step": 2348 }, { "epoch": 0.9373503591380686, "grad_norm": 0.231643944978714, "learning_rate": 4.204511127514225e-05, "loss": 0.2846, "num_tokens": 307102930.0, "step": 2349 }, { "epoch": 0.9377494014365523, "grad_norm": 0.21995815634727478, "learning_rate": 4.203755872802474e-05, "loss": 0.3132, "num_tokens": 307234002.0, "step": 2350 }, { "epoch": 0.9381484437350359, "grad_norm": 0.2666962146759033, "learning_rate": 4.203000336802514e-05, "loss": 0.3601, "num_tokens": 307365074.0, "step": 2351 }, { "epoch": 0.9385474860335196, "grad_norm": 0.2524389624595642, "learning_rate": 4.202244519660533e-05, "loss": 0.3562, "num_tokens": 307496146.0, "step": 2352 }, { "epoch": 0.9389465283320032, "grad_norm": 0.2455148547887802, "learning_rate": 4.201488421522775e-05, "loss": 0.3354, "num_tokens": 307627218.0, "step": 2353 }, { "epoch": 0.9393455706304868, "grad_norm": 0.23710866272449493, "learning_rate": 4.2007320425355384e-05, "loss": 0.3227, "num_tokens": 307758290.0, "step": 2354 }, { "epoch": 0.9397446129289705, "grad_norm": 0.2330685406923294, "learning_rate": 4.199975382845176e-05, "loss": 0.2831, "num_tokens": 307889362.0, "step": 2355 }, { "epoch": 0.9401436552274541, "grad_norm": 0.2761240005493164, "learning_rate": 4.199218442598091e-05, "loss": 0.3909, "num_tokens": 308020434.0, "step": 2356 }, { "epoch": 0.9405426975259378, "grad_norm": 0.27712196111679077, "learning_rate": 4.198461221940749e-05, "loss": 0.3752, "num_tokens": 308151506.0, "step": 2357 }, { "epoch": 0.9409417398244214, "grad_norm": 0.2908826172351837, "learning_rate": 4.197703721019662e-05, "loss": 0.3703, "num_tokens": 308282578.0, "step": 2358 }, { "epoch": 0.9413407821229051, "grad_norm": 0.2697485387325287, "learning_rate": 4.196945939981401e-05, "loss": 0.3356, "num_tokens": 308413650.0, "step": 2359 }, { "epoch": 0.9417398244213887, "grad_norm": 0.2527014911174774, "learning_rate": 4.1961878789725885e-05, "loss": 0.3507, "num_tokens": 308544722.0, "step": 2360 }, { "epoch": 0.9421388667198723, "grad_norm": 0.22366905212402344, "learning_rate": 4.195429538139902e-05, "loss": 0.2792, "num_tokens": 308675794.0, "step": 2361 }, { "epoch": 0.942537909018356, "grad_norm": 0.2575071156024933, "learning_rate": 4.1946709176300754e-05, "loss": 0.3653, "num_tokens": 308806866.0, "step": 2362 }, { "epoch": 0.9429369513168396, "grad_norm": 0.2298453450202942, "learning_rate": 4.193912017589892e-05, "loss": 0.3021, "num_tokens": 308937938.0, "step": 2363 }, { "epoch": 0.9433359936153233, "grad_norm": 0.2492106705904007, "learning_rate": 4.1931528381661946e-05, "loss": 0.3131, "num_tokens": 309069010.0, "step": 2364 }, { "epoch": 0.9437350359138069, "grad_norm": 0.24844644963741302, "learning_rate": 4.1923933795058745e-05, "loss": 0.3314, "num_tokens": 309200082.0, "step": 2365 }, { "epoch": 0.9441340782122905, "grad_norm": 0.22942954301834106, "learning_rate": 4.191633641755881e-05, "loss": 0.3203, "num_tokens": 309331154.0, "step": 2366 }, { "epoch": 0.9445331205107741, "grad_norm": 0.27289316058158875, "learning_rate": 4.190873625063218e-05, "loss": 0.3719, "num_tokens": 309462226.0, "step": 2367 }, { "epoch": 0.9449321628092577, "grad_norm": 0.22660449147224426, "learning_rate": 4.190113329574939e-05, "loss": 0.3129, "num_tokens": 309593298.0, "step": 2368 }, { "epoch": 0.9453312051077414, "grad_norm": 0.2714550197124481, "learning_rate": 4.1893527554381555e-05, "loss": 0.3567, "num_tokens": 309724370.0, "step": 2369 }, { "epoch": 0.945730247406225, "grad_norm": 0.2571345269680023, "learning_rate": 4.188591902800032e-05, "loss": 0.3599, "num_tokens": 309855442.0, "step": 2370 }, { "epoch": 0.9461292897047087, "grad_norm": 0.2475048154592514, "learning_rate": 4.187830771807785e-05, "loss": 0.3162, "num_tokens": 309975264.0, "step": 2371 }, { "epoch": 0.9465283320031923, "grad_norm": 0.2472931146621704, "learning_rate": 4.1870693626086876e-05, "loss": 0.3218, "num_tokens": 310106336.0, "step": 2372 }, { "epoch": 0.946927374301676, "grad_norm": 0.25088009238243103, "learning_rate": 4.186307675350064e-05, "loss": 0.3055, "num_tokens": 310228397.0, "step": 2373 }, { "epoch": 0.9473264166001596, "grad_norm": 0.24741363525390625, "learning_rate": 4.185545710179295e-05, "loss": 0.3336, "num_tokens": 310359469.0, "step": 2374 }, { "epoch": 0.9477254588986432, "grad_norm": 0.2361244410276413, "learning_rate": 4.184783467243813e-05, "loss": 0.2976, "num_tokens": 310490541.0, "step": 2375 }, { "epoch": 0.9481245011971269, "grad_norm": 0.24017943441867828, "learning_rate": 4.184020946691104e-05, "loss": 0.3112, "num_tokens": 310621613.0, "step": 2376 }, { "epoch": 0.9485235434956105, "grad_norm": 0.2560124695301056, "learning_rate": 4.1832581486687105e-05, "loss": 0.3356, "num_tokens": 310752685.0, "step": 2377 }, { "epoch": 0.9489225857940942, "grad_norm": 0.24279150366783142, "learning_rate": 4.182495073324225e-05, "loss": 0.3188, "num_tokens": 310883757.0, "step": 2378 }, { "epoch": 0.9493216280925778, "grad_norm": 0.2684642970561981, "learning_rate": 4.181731720805297e-05, "loss": 0.3448, "num_tokens": 311014829.0, "step": 2379 }, { "epoch": 0.9497206703910615, "grad_norm": 0.2694222331047058, "learning_rate": 4.1809680912596285e-05, "loss": 0.3475, "num_tokens": 311145901.0, "step": 2380 }, { "epoch": 0.9501197126895451, "grad_norm": 0.2687327265739441, "learning_rate": 4.180204184834972e-05, "loss": 0.3783, "num_tokens": 311276973.0, "step": 2381 }, { "epoch": 0.9505187549880287, "grad_norm": 0.260840505361557, "learning_rate": 4.179440001679139e-05, "loss": 0.3318, "num_tokens": 311408045.0, "step": 2382 }, { "epoch": 0.9509177972865124, "grad_norm": 0.2548711597919464, "learning_rate": 4.17867554193999e-05, "loss": 0.3207, "num_tokens": 311539117.0, "step": 2383 }, { "epoch": 0.951316839584996, "grad_norm": 0.26584187150001526, "learning_rate": 4.177910805765442e-05, "loss": 0.3281, "num_tokens": 311670189.0, "step": 2384 }, { "epoch": 0.9517158818834797, "grad_norm": 0.25778424739837646, "learning_rate": 4.177145793303464e-05, "loss": 0.3129, "num_tokens": 311801261.0, "step": 2385 }, { "epoch": 0.9521149241819633, "grad_norm": 0.2724926173686981, "learning_rate": 4.176380504702078e-05, "loss": 0.3324, "num_tokens": 311932333.0, "step": 2386 }, { "epoch": 0.952513966480447, "grad_norm": 0.25228336453437805, "learning_rate": 4.175614940109362e-05, "loss": 0.3392, "num_tokens": 312063405.0, "step": 2387 }, { "epoch": 0.9529130087789306, "grad_norm": 0.2349732369184494, "learning_rate": 4.174849099673444e-05, "loss": 0.2968, "num_tokens": 312194477.0, "step": 2388 }, { "epoch": 0.9533120510774142, "grad_norm": 0.24870355427265167, "learning_rate": 4.174082983542506e-05, "loss": 0.3057, "num_tokens": 312325549.0, "step": 2389 }, { "epoch": 0.9537110933758979, "grad_norm": 0.2582143545150757, "learning_rate": 4.173316591864788e-05, "loss": 0.3213, "num_tokens": 312456621.0, "step": 2390 }, { "epoch": 0.9541101356743815, "grad_norm": 0.26432889699935913, "learning_rate": 4.172549924788577e-05, "loss": 0.3174, "num_tokens": 312587693.0, "step": 2391 }, { "epoch": 0.9545091779728652, "grad_norm": 0.26333189010620117, "learning_rate": 4.171782982462216e-05, "loss": 0.3581, "num_tokens": 312718765.0, "step": 2392 }, { "epoch": 0.9549082202713488, "grad_norm": 0.2490028291940689, "learning_rate": 4.171015765034101e-05, "loss": 0.3178, "num_tokens": 312849837.0, "step": 2393 }, { "epoch": 0.9553072625698324, "grad_norm": 0.26519930362701416, "learning_rate": 4.1702482726526823e-05, "loss": 0.2942, "num_tokens": 312980909.0, "step": 2394 }, { "epoch": 0.9557063048683161, "grad_norm": 0.2648569941520691, "learning_rate": 4.169480505466463e-05, "loss": 0.3524, "num_tokens": 313111981.0, "step": 2395 }, { "epoch": 0.9561053471667997, "grad_norm": 0.23439668118953705, "learning_rate": 4.168712463623998e-05, "loss": 0.2883, "num_tokens": 313243053.0, "step": 2396 }, { "epoch": 0.9565043894652833, "grad_norm": 0.24129962921142578, "learning_rate": 4.167944147273896e-05, "loss": 0.304, "num_tokens": 313374125.0, "step": 2397 }, { "epoch": 0.9569034317637669, "grad_norm": 0.2586396634578705, "learning_rate": 4.167175556564819e-05, "loss": 0.3393, "num_tokens": 313505197.0, "step": 2398 }, { "epoch": 0.9573024740622506, "grad_norm": 0.24226389825344086, "learning_rate": 4.166406691645483e-05, "loss": 0.3224, "num_tokens": 313636269.0, "step": 2399 }, { "epoch": 0.9577015163607342, "grad_norm": 0.23617257177829742, "learning_rate": 4.165637552664656e-05, "loss": 0.2973, "num_tokens": 313767341.0, "step": 2400 }, { "epoch": 0.9581005586592178, "grad_norm": 0.23590604960918427, "learning_rate": 4.164868139771159e-05, "loss": 0.2708, "num_tokens": 313898413.0, "step": 2401 }, { "epoch": 0.9584996009577015, "grad_norm": 0.24904684722423553, "learning_rate": 4.1640984531138657e-05, "loss": 0.3043, "num_tokens": 314029485.0, "step": 2402 }, { "epoch": 0.9588986432561851, "grad_norm": 0.26274389028549194, "learning_rate": 4.1633284928417034e-05, "loss": 0.3198, "num_tokens": 314160557.0, "step": 2403 }, { "epoch": 0.9592976855546688, "grad_norm": 0.26305070519447327, "learning_rate": 4.162558259103653e-05, "loss": 0.3419, "num_tokens": 314291629.0, "step": 2404 }, { "epoch": 0.9596967278531524, "grad_norm": 0.24780619144439697, "learning_rate": 4.161787752048748e-05, "loss": 0.3133, "num_tokens": 314422701.0, "step": 2405 }, { "epoch": 0.960095770151636, "grad_norm": 0.258165180683136, "learning_rate": 4.161016971826073e-05, "loss": 0.3385, "num_tokens": 314553773.0, "step": 2406 }, { "epoch": 0.9604948124501197, "grad_norm": 0.29703131318092346, "learning_rate": 4.160245918584766e-05, "loss": 0.3882, "num_tokens": 314684845.0, "step": 2407 }, { "epoch": 0.9608938547486033, "grad_norm": 0.2516064941883087, "learning_rate": 4.159474592474022e-05, "loss": 0.3217, "num_tokens": 314815917.0, "step": 2408 }, { "epoch": 0.961292897047087, "grad_norm": 0.25579002499580383, "learning_rate": 4.158702993643082e-05, "loss": 0.3288, "num_tokens": 314946989.0, "step": 2409 }, { "epoch": 0.9616919393455706, "grad_norm": 0.26105180382728577, "learning_rate": 4.1579311222412455e-05, "loss": 0.3376, "num_tokens": 315078061.0, "step": 2410 }, { "epoch": 0.9620909816440543, "grad_norm": 0.2243991643190384, "learning_rate": 4.157158978417861e-05, "loss": 0.2756, "num_tokens": 315207317.0, "step": 2411 }, { "epoch": 0.9624900239425379, "grad_norm": 0.26057136058807373, "learning_rate": 4.156386562322331e-05, "loss": 0.3054, "num_tokens": 315338389.0, "step": 2412 }, { "epoch": 0.9628890662410216, "grad_norm": 0.25022000074386597, "learning_rate": 4.1556138741041117e-05, "loss": 0.337, "num_tokens": 315469461.0, "step": 2413 }, { "epoch": 0.9632881085395052, "grad_norm": 0.24092495441436768, "learning_rate": 4.15484091391271e-05, "loss": 0.3134, "num_tokens": 315600533.0, "step": 2414 }, { "epoch": 0.9636871508379888, "grad_norm": 0.2468123435974121, "learning_rate": 4.154067681897687e-05, "loss": 0.308, "num_tokens": 315731605.0, "step": 2415 }, { "epoch": 0.9640861931364725, "grad_norm": 0.2736469507217407, "learning_rate": 4.1532941782086576e-05, "loss": 0.3568, "num_tokens": 315862677.0, "step": 2416 }, { "epoch": 0.9644852354349561, "grad_norm": 0.227487251162529, "learning_rate": 4.152520402995286e-05, "loss": 0.2821, "num_tokens": 315993749.0, "step": 2417 }, { "epoch": 0.9648842777334398, "grad_norm": 0.26171422004699707, "learning_rate": 4.1517463564072886e-05, "loss": 0.3395, "num_tokens": 316124821.0, "step": 2418 }, { "epoch": 0.9652833200319234, "grad_norm": 0.25364336371421814, "learning_rate": 4.1509720385944404e-05, "loss": 0.3122, "num_tokens": 316255893.0, "step": 2419 }, { "epoch": 0.965682362330407, "grad_norm": 0.24324442446231842, "learning_rate": 4.1501974497065624e-05, "loss": 0.3091, "num_tokens": 316386965.0, "step": 2420 }, { "epoch": 0.9660814046288907, "grad_norm": 0.24700483679771423, "learning_rate": 4.149422589893529e-05, "loss": 0.2773, "num_tokens": 316518037.0, "step": 2421 }, { "epoch": 0.9664804469273743, "grad_norm": 0.26294589042663574, "learning_rate": 4.14864745930527e-05, "loss": 0.3372, "num_tokens": 316649109.0, "step": 2422 }, { "epoch": 0.966879489225858, "grad_norm": 0.23518191277980804, "learning_rate": 4.147872058091766e-05, "loss": 0.2532, "num_tokens": 316780181.0, "step": 2423 }, { "epoch": 0.9672785315243416, "grad_norm": 0.2897636294364929, "learning_rate": 4.1470963864030495e-05, "loss": 0.3787, "num_tokens": 316911253.0, "step": 2424 }, { "epoch": 0.9676775738228253, "grad_norm": 0.24206317961215973, "learning_rate": 4.146320444389206e-05, "loss": 0.2974, "num_tokens": 317042325.0, "step": 2425 }, { "epoch": 0.9680766161213089, "grad_norm": 0.2448725551366806, "learning_rate": 4.1455442322003726e-05, "loss": 0.321, "num_tokens": 317173397.0, "step": 2426 }, { "epoch": 0.9684756584197926, "grad_norm": 0.2454589307308197, "learning_rate": 4.14476774998674e-05, "loss": 0.301, "num_tokens": 317304469.0, "step": 2427 }, { "epoch": 0.9688747007182761, "grad_norm": 0.252428263425827, "learning_rate": 4.1439909978985475e-05, "loss": 0.3255, "num_tokens": 317435541.0, "step": 2428 }, { "epoch": 0.9692737430167597, "grad_norm": 0.22105541825294495, "learning_rate": 4.143213976086093e-05, "loss": 0.2617, "num_tokens": 317566613.0, "step": 2429 }, { "epoch": 0.9696727853152434, "grad_norm": 0.25562775135040283, "learning_rate": 4.1424366846997226e-05, "loss": 0.3281, "num_tokens": 317697685.0, "step": 2430 }, { "epoch": 0.970071827613727, "grad_norm": 0.26938799023628235, "learning_rate": 4.1416591238898326e-05, "loss": 0.3256, "num_tokens": 317828757.0, "step": 2431 }, { "epoch": 0.9704708699122107, "grad_norm": 0.24169793725013733, "learning_rate": 4.140881293806875e-05, "loss": 0.2869, "num_tokens": 317959829.0, "step": 2432 }, { "epoch": 0.9708699122106943, "grad_norm": 0.2830032706260681, "learning_rate": 4.140103194601353e-05, "loss": 0.3034, "num_tokens": 318090901.0, "step": 2433 }, { "epoch": 0.9712689545091779, "grad_norm": 0.25531086325645447, "learning_rate": 4.139324826423821e-05, "loss": 0.2964, "num_tokens": 318221973.0, "step": 2434 }, { "epoch": 0.9716679968076616, "grad_norm": 0.2717881202697754, "learning_rate": 4.138546189424885e-05, "loss": 0.3071, "num_tokens": 318353045.0, "step": 2435 }, { "epoch": 0.9720670391061452, "grad_norm": 0.23789264261722565, "learning_rate": 4.137767283755207e-05, "loss": 0.3013, "num_tokens": 318484117.0, "step": 2436 }, { "epoch": 0.9724660814046289, "grad_norm": 0.2443450540304184, "learning_rate": 4.136988109565496e-05, "loss": 0.285, "num_tokens": 318615189.0, "step": 2437 }, { "epoch": 0.9728651237031125, "grad_norm": 0.25266826152801514, "learning_rate": 4.1362086670065156e-05, "loss": 0.2861, "num_tokens": 318746261.0, "step": 2438 }, { "epoch": 0.9732641660015962, "grad_norm": 0.24918505549430847, "learning_rate": 4.135428956229078e-05, "loss": 0.3387, "num_tokens": 318877333.0, "step": 2439 }, { "epoch": 0.9736632083000798, "grad_norm": 0.26284244656562805, "learning_rate": 4.1346489773840544e-05, "loss": 0.3541, "num_tokens": 319008405.0, "step": 2440 }, { "epoch": 0.9740622505985634, "grad_norm": 0.2325076460838318, "learning_rate": 4.13386873062236e-05, "loss": 0.2814, "num_tokens": 319139477.0, "step": 2441 }, { "epoch": 0.9744612928970471, "grad_norm": 0.23701144754886627, "learning_rate": 4.133088216094967e-05, "loss": 0.2911, "num_tokens": 319270549.0, "step": 2442 }, { "epoch": 0.9748603351955307, "grad_norm": 0.23706136643886566, "learning_rate": 4.132307433952897e-05, "loss": 0.2881, "num_tokens": 319401621.0, "step": 2443 }, { "epoch": 0.9752593774940144, "grad_norm": 0.23431847989559174, "learning_rate": 4.1315263843472244e-05, "loss": 0.303, "num_tokens": 319532693.0, "step": 2444 }, { "epoch": 0.975658419792498, "grad_norm": 0.2390219122171402, "learning_rate": 4.130745067429075e-05, "loss": 0.2895, "num_tokens": 319663765.0, "step": 2445 }, { "epoch": 0.9760574620909817, "grad_norm": 0.26233628392219543, "learning_rate": 4.1299634833496256e-05, "loss": 0.3505, "num_tokens": 319794837.0, "step": 2446 }, { "epoch": 0.9764565043894653, "grad_norm": 0.23623515665531158, "learning_rate": 4.129181632260107e-05, "loss": 0.303, "num_tokens": 319925909.0, "step": 2447 }, { "epoch": 0.9768555466879489, "grad_norm": 0.22992728650569916, "learning_rate": 4.128399514311798e-05, "loss": 0.2974, "num_tokens": 320056981.0, "step": 2448 }, { "epoch": 0.9772545889864326, "grad_norm": 0.23581773042678833, "learning_rate": 4.1276171296560326e-05, "loss": 0.3142, "num_tokens": 320188053.0, "step": 2449 }, { "epoch": 0.9776536312849162, "grad_norm": 0.26416027545928955, "learning_rate": 4.126834478444195e-05, "loss": 0.3106, "num_tokens": 320319125.0, "step": 2450 }, { "epoch": 0.9780526735833999, "grad_norm": 0.28383907675743103, "learning_rate": 4.12605156082772e-05, "loss": 0.3979, "num_tokens": 320450197.0, "step": 2451 }, { "epoch": 0.9784517158818835, "grad_norm": 0.2291620373725891, "learning_rate": 4.125268376958095e-05, "loss": 0.297, "num_tokens": 320581269.0, "step": 2452 }, { "epoch": 0.9788507581803672, "grad_norm": 0.27577996253967285, "learning_rate": 4.12448492698686e-05, "loss": 0.3144, "num_tokens": 320712341.0, "step": 2453 }, { "epoch": 0.9792498004788508, "grad_norm": 0.24361225962638855, "learning_rate": 4.123701211065605e-05, "loss": 0.3288, "num_tokens": 320843413.0, "step": 2454 }, { "epoch": 0.9796488427773344, "grad_norm": 0.2512906789779663, "learning_rate": 4.12291722934597e-05, "loss": 0.3165, "num_tokens": 320974485.0, "step": 2455 }, { "epoch": 0.9800478850758181, "grad_norm": 0.23503723740577698, "learning_rate": 4.122132981979649e-05, "loss": 0.2972, "num_tokens": 321105557.0, "step": 2456 }, { "epoch": 0.9804469273743017, "grad_norm": 0.23513303697109222, "learning_rate": 4.1213484691183876e-05, "loss": 0.3118, "num_tokens": 321236629.0, "step": 2457 }, { "epoch": 0.9808459696727854, "grad_norm": 0.2765871584415436, "learning_rate": 4.120563690913981e-05, "loss": 0.369, "num_tokens": 321367701.0, "step": 2458 }, { "epoch": 0.981245011971269, "grad_norm": 0.2274627834558487, "learning_rate": 4.119778647518276e-05, "loss": 0.2664, "num_tokens": 321498773.0, "step": 2459 }, { "epoch": 0.9816440542697525, "grad_norm": 0.2402145117521286, "learning_rate": 4.118993339083172e-05, "loss": 0.2738, "num_tokens": 321629845.0, "step": 2460 }, { "epoch": 0.9820430965682362, "grad_norm": 0.23694437742233276, "learning_rate": 4.118207765760619e-05, "loss": 0.2924, "num_tokens": 321760917.0, "step": 2461 }, { "epoch": 0.9824421388667198, "grad_norm": 0.2653818130493164, "learning_rate": 4.1174219277026166e-05, "loss": 0.345, "num_tokens": 321891989.0, "step": 2462 }, { "epoch": 0.9828411811652035, "grad_norm": 0.24834679067134857, "learning_rate": 4.1166358250612185e-05, "loss": 0.2866, "num_tokens": 322023061.0, "step": 2463 }, { "epoch": 0.9832402234636871, "grad_norm": 0.24214798212051392, "learning_rate": 4.115849457988528e-05, "loss": 0.314, "num_tokens": 322154133.0, "step": 2464 }, { "epoch": 0.9836392657621708, "grad_norm": 0.26151883602142334, "learning_rate": 4.1150628266366995e-05, "loss": 0.3118, "num_tokens": 322285205.0, "step": 2465 }, { "epoch": 0.9840383080606544, "grad_norm": 0.24779194593429565, "learning_rate": 4.114275931157939e-05, "loss": 0.2861, "num_tokens": 322416277.0, "step": 2466 }, { "epoch": 0.984437350359138, "grad_norm": 0.2556101679801941, "learning_rate": 4.113488771704503e-05, "loss": 0.2817, "num_tokens": 322547349.0, "step": 2467 }, { "epoch": 0.9848363926576217, "grad_norm": 0.27815350890159607, "learning_rate": 4.112701348428701e-05, "loss": 0.3446, "num_tokens": 322678421.0, "step": 2468 }, { "epoch": 0.9852354349561053, "grad_norm": 0.23878347873687744, "learning_rate": 4.11191366148289e-05, "loss": 0.3086, "num_tokens": 322809493.0, "step": 2469 }, { "epoch": 0.985634477254589, "grad_norm": 0.23686456680297852, "learning_rate": 4.111125711019482e-05, "loss": 0.2694, "num_tokens": 322940565.0, "step": 2470 }, { "epoch": 0.9860335195530726, "grad_norm": 0.2442149817943573, "learning_rate": 4.110337497190937e-05, "loss": 0.3417, "num_tokens": 323071637.0, "step": 2471 }, { "epoch": 0.9864325618515563, "grad_norm": 0.24188686907291412, "learning_rate": 4.109549020149767e-05, "loss": 0.311, "num_tokens": 323195604.0, "step": 2472 }, { "epoch": 0.9868316041500399, "grad_norm": 0.22935940325260162, "learning_rate": 4.1087602800485344e-05, "loss": 0.2856, "num_tokens": 323326676.0, "step": 2473 }, { "epoch": 0.9872306464485235, "grad_norm": 0.2698732316493988, "learning_rate": 4.107971277039854e-05, "loss": 0.356, "num_tokens": 323457748.0, "step": 2474 }, { "epoch": 0.9876296887470072, "grad_norm": 0.2709592580795288, "learning_rate": 4.1071820112763914e-05, "loss": 0.3613, "num_tokens": 323588820.0, "step": 2475 }, { "epoch": 0.9880287310454908, "grad_norm": 0.32749783992767334, "learning_rate": 4.1063924829108616e-05, "loss": 0.3347, "num_tokens": 323719892.0, "step": 2476 }, { "epoch": 0.9884277733439745, "grad_norm": 0.21810176968574524, "learning_rate": 4.105602692096029e-05, "loss": 0.2538, "num_tokens": 323850964.0, "step": 2477 }, { "epoch": 0.9888268156424581, "grad_norm": 0.2412751317024231, "learning_rate": 4.104812638984712e-05, "loss": 0.2972, "num_tokens": 323982036.0, "step": 2478 }, { "epoch": 0.9892258579409418, "grad_norm": 0.24531637132167816, "learning_rate": 4.1040223237297795e-05, "loss": 0.3156, "num_tokens": 324113108.0, "step": 2479 }, { "epoch": 0.9896249002394254, "grad_norm": 0.27096956968307495, "learning_rate": 4.103231746484149e-05, "loss": 0.3495, "num_tokens": 324244180.0, "step": 2480 }, { "epoch": 0.990023942537909, "grad_norm": 0.2480773627758026, "learning_rate": 4.102440907400791e-05, "loss": 0.3067, "num_tokens": 324375252.0, "step": 2481 }, { "epoch": 0.9904229848363927, "grad_norm": 0.2647719383239746, "learning_rate": 4.1016498066327244e-05, "loss": 0.3448, "num_tokens": 324506324.0, "step": 2482 }, { "epoch": 0.9908220271348763, "grad_norm": 0.2413339465856552, "learning_rate": 4.10085844433302e-05, "loss": 0.2896, "num_tokens": 324637396.0, "step": 2483 }, { "epoch": 0.99122106943336, "grad_norm": 0.25479966402053833, "learning_rate": 4.100066820654799e-05, "loss": 0.3173, "num_tokens": 324768468.0, "step": 2484 }, { "epoch": 0.9916201117318436, "grad_norm": 0.24488231539726257, "learning_rate": 4.099274935751233e-05, "loss": 0.3126, "num_tokens": 324899540.0, "step": 2485 }, { "epoch": 0.9920191540303273, "grad_norm": 0.2793008089065552, "learning_rate": 4.0984827897755464e-05, "loss": 0.3221, "num_tokens": 325030612.0, "step": 2486 }, { "epoch": 0.9924181963288109, "grad_norm": 0.23654180765151978, "learning_rate": 4.097690382881009e-05, "loss": 0.3168, "num_tokens": 325161684.0, "step": 2487 }, { "epoch": 0.9928172386272945, "grad_norm": 0.24748198688030243, "learning_rate": 4.096897715220945e-05, "loss": 0.3221, "num_tokens": 325292756.0, "step": 2488 }, { "epoch": 0.9932162809257782, "grad_norm": 0.2695292830467224, "learning_rate": 4.0961047869487294e-05, "loss": 0.3397, "num_tokens": 325423828.0, "step": 2489 }, { "epoch": 0.9936153232242618, "grad_norm": 0.2557673454284668, "learning_rate": 4.095311598217786e-05, "loss": 0.3417, "num_tokens": 325554900.0, "step": 2490 }, { "epoch": 0.9940143655227454, "grad_norm": 0.27085596323013306, "learning_rate": 4.094518149181589e-05, "loss": 0.3869, "num_tokens": 325685972.0, "step": 2491 }, { "epoch": 0.994413407821229, "grad_norm": 0.2385408580303192, "learning_rate": 4.0937244399936634e-05, "loss": 0.2872, "num_tokens": 325817044.0, "step": 2492 }, { "epoch": 0.9948124501197126, "grad_norm": 0.28309282660484314, "learning_rate": 4.092930470807585e-05, "loss": 0.3415, "num_tokens": 325936400.0, "step": 2493 }, { "epoch": 0.9952114924181963, "grad_norm": 0.23499281704425812, "learning_rate": 4.0921362417769796e-05, "loss": 0.269, "num_tokens": 326067472.0, "step": 2494 }, { "epoch": 0.9956105347166799, "grad_norm": 0.267282634973526, "learning_rate": 4.0913417530555214e-05, "loss": 0.3238, "num_tokens": 326198544.0, "step": 2495 }, { "epoch": 0.9960095770151636, "grad_norm": 0.2614571750164032, "learning_rate": 4.090547004796939e-05, "loss": 0.3298, "num_tokens": 326329616.0, "step": 2496 }, { "epoch": 0.9964086193136472, "grad_norm": 0.23279409110546112, "learning_rate": 4.089751997155006e-05, "loss": 0.2905, "num_tokens": 326460688.0, "step": 2497 }, { "epoch": 0.9968076616121309, "grad_norm": 0.26686999201774597, "learning_rate": 4.088956730283552e-05, "loss": 0.3391, "num_tokens": 326591760.0, "step": 2498 }, { "epoch": 0.9972067039106145, "grad_norm": 0.2584298253059387, "learning_rate": 4.088161204336453e-05, "loss": 0.3099, "num_tokens": 326722832.0, "step": 2499 }, { "epoch": 0.9976057462090981, "grad_norm": 0.28388655185699463, "learning_rate": 4.087365419467633e-05, "loss": 0.3796, "num_tokens": 326853904.0, "step": 2500 }, { "epoch": 0.9980047885075818, "grad_norm": 0.2710479497909546, "learning_rate": 4.086569375831072e-05, "loss": 0.3223, "num_tokens": 326984976.0, "step": 2501 }, { "epoch": 0.9984038308060654, "grad_norm": 0.24949762225151062, "learning_rate": 4.085773073580796e-05, "loss": 0.3305, "num_tokens": 327116048.0, "step": 2502 }, { "epoch": 0.9988028731045491, "grad_norm": 0.26963746547698975, "learning_rate": 4.0849765128708825e-05, "loss": 0.3348, "num_tokens": 327247120.0, "step": 2503 }, { "epoch": 0.9992019154030327, "grad_norm": 0.23889362812042236, "learning_rate": 4.084179693855459e-05, "loss": 0.2718, "num_tokens": 327378192.0, "step": 2504 }, { "epoch": 0.9996009577015164, "grad_norm": 0.23853899538516998, "learning_rate": 4.083382616688701e-05, "loss": 0.3207, "num_tokens": 327509264.0, "step": 2505 }, { "epoch": 1.0, "grad_norm": 0.35875174403190613, "learning_rate": 4.0825852815248355e-05, "loss": 0.3396, "num_tokens": 327574800.0, "step": 2506 }, { "epoch": 1.0003990422984836, "grad_norm": 0.2864607274532318, "learning_rate": 4.0817876885181416e-05, "loss": 0.2625, "num_tokens": 327705872.0, "step": 2507 }, { "epoch": 1.0007980845969673, "grad_norm": 0.25238510966300964, "learning_rate": 4.080989837822944e-05, "loss": 0.2539, "num_tokens": 327836944.0, "step": 2508 }, { "epoch": 1.001197126895451, "grad_norm": 0.2410300076007843, "learning_rate": 4.080191729593621e-05, "loss": 0.2502, "num_tokens": 327968016.0, "step": 2509 }, { "epoch": 1.0015961691939346, "grad_norm": 0.28292709589004517, "learning_rate": 4.079393363984597e-05, "loss": 0.2775, "num_tokens": 328099088.0, "step": 2510 }, { "epoch": 1.0019952114924182, "grad_norm": 0.3690752387046814, "learning_rate": 4.07859474115035e-05, "loss": 0.2719, "num_tokens": 328230160.0, "step": 2511 }, { "epoch": 1.0023942537909019, "grad_norm": 0.2541307806968689, "learning_rate": 4.077795861245405e-05, "loss": 0.2122, "num_tokens": 328361232.0, "step": 2512 }, { "epoch": 1.0027932960893855, "grad_norm": 0.27259570360183716, "learning_rate": 4.076996724424339e-05, "loss": 0.2771, "num_tokens": 328492304.0, "step": 2513 }, { "epoch": 1.0031923383878691, "grad_norm": 0.26315346360206604, "learning_rate": 4.076197330841777e-05, "loss": 0.2632, "num_tokens": 328623376.0, "step": 2514 }, { "epoch": 1.0035913806863528, "grad_norm": 0.2999420762062073, "learning_rate": 4.0753976806523946e-05, "loss": 0.3108, "num_tokens": 328754448.0, "step": 2515 }, { "epoch": 1.0039904229848364, "grad_norm": 0.26324397325515747, "learning_rate": 4.074597774010916e-05, "loss": 0.2778, "num_tokens": 328885520.0, "step": 2516 }, { "epoch": 1.00438946528332, "grad_norm": 0.26062676310539246, "learning_rate": 4.073797611072115e-05, "loss": 0.2522, "num_tokens": 329016592.0, "step": 2517 }, { "epoch": 1.0047885075818037, "grad_norm": 0.2650856077671051, "learning_rate": 4.072997191990817e-05, "loss": 0.2533, "num_tokens": 329147664.0, "step": 2518 }, { "epoch": 1.0051875498802874, "grad_norm": 0.27328863739967346, "learning_rate": 4.072196516921895e-05, "loss": 0.2541, "num_tokens": 329278736.0, "step": 2519 }, { "epoch": 1.005586592178771, "grad_norm": 0.2420499324798584, "learning_rate": 4.0713955860202724e-05, "loss": 0.2399, "num_tokens": 329409808.0, "step": 2520 }, { "epoch": 1.0059856344772546, "grad_norm": 0.25273823738098145, "learning_rate": 4.070594399440922e-05, "loss": 0.2636, "num_tokens": 329540880.0, "step": 2521 }, { "epoch": 1.0063846767757383, "grad_norm": 0.24881474673748016, "learning_rate": 4.069792957338866e-05, "loss": 0.2256, "num_tokens": 329671952.0, "step": 2522 }, { "epoch": 1.006783719074222, "grad_norm": 0.27606943249702454, "learning_rate": 4.0689912598691755e-05, "loss": 0.2607, "num_tokens": 329803024.0, "step": 2523 }, { "epoch": 1.0071827613727056, "grad_norm": 0.25869306921958923, "learning_rate": 4.068189307186972e-05, "loss": 0.2401, "num_tokens": 329934096.0, "step": 2524 }, { "epoch": 1.0075818036711892, "grad_norm": 0.25133711099624634, "learning_rate": 4.0673870994474246e-05, "loss": 0.2269, "num_tokens": 330065168.0, "step": 2525 }, { "epoch": 1.0079808459696729, "grad_norm": 0.29459258913993835, "learning_rate": 4.0665846368057546e-05, "loss": 0.2896, "num_tokens": 330196240.0, "step": 2526 }, { "epoch": 1.0083798882681565, "grad_norm": 0.2822306156158447, "learning_rate": 4.06578191941723e-05, "loss": 0.2668, "num_tokens": 330327312.0, "step": 2527 }, { "epoch": 1.0087789305666401, "grad_norm": 0.24491479992866516, "learning_rate": 4.06497894743717e-05, "loss": 0.2148, "num_tokens": 330458384.0, "step": 2528 }, { "epoch": 1.0091779728651238, "grad_norm": 0.24548129737377167, "learning_rate": 4.0641757210209424e-05, "loss": 0.2559, "num_tokens": 330589456.0, "step": 2529 }, { "epoch": 1.0095770151636074, "grad_norm": 0.25723332166671753, "learning_rate": 4.063372240323963e-05, "loss": 0.274, "num_tokens": 330720528.0, "step": 2530 }, { "epoch": 1.009976057462091, "grad_norm": 0.25704455375671387, "learning_rate": 4.062568505501697e-05, "loss": 0.2528, "num_tokens": 330851600.0, "step": 2531 }, { "epoch": 1.0103750997605747, "grad_norm": 0.25082993507385254, "learning_rate": 4.061764516709662e-05, "loss": 0.2297, "num_tokens": 330982672.0, "step": 2532 }, { "epoch": 1.0107741420590584, "grad_norm": 0.22342276573181152, "learning_rate": 4.0609602741034204e-05, "loss": 0.2191, "num_tokens": 331113744.0, "step": 2533 }, { "epoch": 1.011173184357542, "grad_norm": 0.24881432950496674, "learning_rate": 4.060155777838587e-05, "loss": 0.255, "num_tokens": 331244816.0, "step": 2534 }, { "epoch": 1.0115722266560256, "grad_norm": 0.2545049488544464, "learning_rate": 4.0593510280708234e-05, "loss": 0.2624, "num_tokens": 331375888.0, "step": 2535 }, { "epoch": 1.0119712689545093, "grad_norm": 0.23697416484355927, "learning_rate": 4.0585460249558414e-05, "loss": 0.2244, "num_tokens": 331506960.0, "step": 2536 }, { "epoch": 1.012370311252993, "grad_norm": 0.24726331233978271, "learning_rate": 4.0577407686494015e-05, "loss": 0.2472, "num_tokens": 331638032.0, "step": 2537 }, { "epoch": 1.0127693535514763, "grad_norm": 0.2545377016067505, "learning_rate": 4.0569352593073144e-05, "loss": 0.246, "num_tokens": 331769104.0, "step": 2538 }, { "epoch": 1.01316839584996, "grad_norm": 0.2588341534137726, "learning_rate": 4.056129497085437e-05, "loss": 0.254, "num_tokens": 331900176.0, "step": 2539 }, { "epoch": 1.0135674381484436, "grad_norm": 0.2491654008626938, "learning_rate": 4.055323482139677e-05, "loss": 0.2457, "num_tokens": 332031248.0, "step": 2540 }, { "epoch": 1.0139664804469273, "grad_norm": 0.22818206250667572, "learning_rate": 4.0545172146259926e-05, "loss": 0.2213, "num_tokens": 332162320.0, "step": 2541 }, { "epoch": 1.014365522745411, "grad_norm": 0.22517570853233337, "learning_rate": 4.0537106947003874e-05, "loss": 0.1964, "num_tokens": 332293392.0, "step": 2542 }, { "epoch": 1.0147645650438946, "grad_norm": 0.24480487406253815, "learning_rate": 4.0529039225189155e-05, "loss": 0.2484, "num_tokens": 332424464.0, "step": 2543 }, { "epoch": 1.0151636073423782, "grad_norm": 0.27617278695106506, "learning_rate": 4.05209689823768e-05, "loss": 0.2706, "num_tokens": 332555536.0, "step": 2544 }, { "epoch": 1.0155626496408618, "grad_norm": 0.23551981151103973, "learning_rate": 4.0512896220128324e-05, "loss": 0.2038, "num_tokens": 332686608.0, "step": 2545 }, { "epoch": 1.0159616919393455, "grad_norm": 0.23971806466579437, "learning_rate": 4.050482094000574e-05, "loss": 0.2314, "num_tokens": 332817680.0, "step": 2546 }, { "epoch": 1.0163607342378291, "grad_norm": 0.2959333658218384, "learning_rate": 4.049674314357153e-05, "loss": 0.2916, "num_tokens": 332948752.0, "step": 2547 }, { "epoch": 1.0167597765363128, "grad_norm": 0.24664153158664703, "learning_rate": 4.0488662832388676e-05, "loss": 0.2309, "num_tokens": 333079824.0, "step": 2548 }, { "epoch": 1.0171588188347964, "grad_norm": 0.24237492680549622, "learning_rate": 4.0480580008020635e-05, "loss": 0.2369, "num_tokens": 333210896.0, "step": 2549 }, { "epoch": 1.01755786113328, "grad_norm": 0.24457518756389618, "learning_rate": 4.047249467203138e-05, "loss": 0.2213, "num_tokens": 333341968.0, "step": 2550 }, { "epoch": 1.0179569034317637, "grad_norm": 0.2336169332265854, "learning_rate": 4.046440682598533e-05, "loss": 0.241, "num_tokens": 333473040.0, "step": 2551 }, { "epoch": 1.0183559457302473, "grad_norm": 0.2414979189634323, "learning_rate": 4.04563164714474e-05, "loss": 0.2434, "num_tokens": 333604112.0, "step": 2552 }, { "epoch": 1.018754988028731, "grad_norm": 0.25803953409194946, "learning_rate": 4.0448223609983025e-05, "loss": 0.2433, "num_tokens": 333735184.0, "step": 2553 }, { "epoch": 1.0191540303272146, "grad_norm": 0.26287275552749634, "learning_rate": 4.0440128243158076e-05, "loss": 0.2545, "num_tokens": 333866256.0, "step": 2554 }, { "epoch": 1.0195530726256983, "grad_norm": 0.28360623121261597, "learning_rate": 4.0432030372538945e-05, "loss": 0.2697, "num_tokens": 333997328.0, "step": 2555 }, { "epoch": 1.019952114924182, "grad_norm": 0.2526775300502777, "learning_rate": 4.0423929999692486e-05, "loss": 0.2115, "num_tokens": 334128400.0, "step": 2556 }, { "epoch": 1.0203511572226656, "grad_norm": 0.2788691222667694, "learning_rate": 4.041582712618604e-05, "loss": 0.2539, "num_tokens": 334259472.0, "step": 2557 }, { "epoch": 1.0207501995211492, "grad_norm": 0.2509864568710327, "learning_rate": 4.040772175358745e-05, "loss": 0.244, "num_tokens": 334390544.0, "step": 2558 }, { "epoch": 1.0211492418196328, "grad_norm": 0.2808547019958496, "learning_rate": 4.039961388346503e-05, "loss": 0.2407, "num_tokens": 334521616.0, "step": 2559 }, { "epoch": 1.0215482841181165, "grad_norm": 0.27894943952560425, "learning_rate": 4.039150351738756e-05, "loss": 0.2931, "num_tokens": 334652688.0, "step": 2560 }, { "epoch": 1.0219473264166001, "grad_norm": 0.24696297943592072, "learning_rate": 4.038339065692434e-05, "loss": 0.2512, "num_tokens": 334783760.0, "step": 2561 }, { "epoch": 1.0223463687150838, "grad_norm": 0.28581053018569946, "learning_rate": 4.037527530364513e-05, "loss": 0.2904, "num_tokens": 334914832.0, "step": 2562 }, { "epoch": 1.0227454110135674, "grad_norm": 0.24500679969787598, "learning_rate": 4.036715745912016e-05, "loss": 0.2278, "num_tokens": 335045904.0, "step": 2563 }, { "epoch": 1.023144453312051, "grad_norm": 0.29023435711860657, "learning_rate": 4.035903712492017e-05, "loss": 0.2902, "num_tokens": 335176976.0, "step": 2564 }, { "epoch": 1.0235434956105347, "grad_norm": 0.2704614996910095, "learning_rate": 4.035091430261637e-05, "loss": 0.2325, "num_tokens": 335308048.0, "step": 2565 }, { "epoch": 1.0239425379090183, "grad_norm": 0.24296759068965912, "learning_rate": 4.034278899378045e-05, "loss": 0.276, "num_tokens": 335439120.0, "step": 2566 }, { "epoch": 1.024341580207502, "grad_norm": 0.241933673620224, "learning_rate": 4.033466119998457e-05, "loss": 0.2486, "num_tokens": 335570192.0, "step": 2567 }, { "epoch": 1.0247406225059856, "grad_norm": 0.25083598494529724, "learning_rate": 4.03265309228014e-05, "loss": 0.2471, "num_tokens": 335701264.0, "step": 2568 }, { "epoch": 1.0251396648044693, "grad_norm": 0.2467009574174881, "learning_rate": 4.031839816380406e-05, "loss": 0.2465, "num_tokens": 335832336.0, "step": 2569 }, { "epoch": 1.025538707102953, "grad_norm": 0.2347041815519333, "learning_rate": 4.0310262924566164e-05, "loss": 0.226, "num_tokens": 335963408.0, "step": 2570 }, { "epoch": 1.0259377494014366, "grad_norm": 0.281582236289978, "learning_rate": 4.030212520666182e-05, "loss": 0.2941, "num_tokens": 336094480.0, "step": 2571 }, { "epoch": 1.0263367916999202, "grad_norm": 0.24313384294509888, "learning_rate": 4.029398501166558e-05, "loss": 0.222, "num_tokens": 336225552.0, "step": 2572 }, { "epoch": 1.0267358339984038, "grad_norm": 0.2574528753757477, "learning_rate": 4.0285842341152496e-05, "loss": 0.2241, "num_tokens": 336356624.0, "step": 2573 }, { "epoch": 1.0271348762968875, "grad_norm": 0.282589852809906, "learning_rate": 4.0277697196698114e-05, "loss": 0.2631, "num_tokens": 336487696.0, "step": 2574 }, { "epoch": 1.0275339185953711, "grad_norm": 0.2485627979040146, "learning_rate": 4.0269549579878436e-05, "loss": 0.223, "num_tokens": 336618768.0, "step": 2575 }, { "epoch": 1.0279329608938548, "grad_norm": 0.25891903042793274, "learning_rate": 4.026139949226995e-05, "loss": 0.2492, "num_tokens": 336749840.0, "step": 2576 }, { "epoch": 1.0283320031923384, "grad_norm": 0.2789004445075989, "learning_rate": 4.0253246935449636e-05, "loss": 0.2695, "num_tokens": 336880912.0, "step": 2577 }, { "epoch": 1.028731045490822, "grad_norm": 0.26253947615623474, "learning_rate": 4.0245091910994906e-05, "loss": 0.2417, "num_tokens": 337011984.0, "step": 2578 }, { "epoch": 1.0291300877893057, "grad_norm": 0.25829100608825684, "learning_rate": 4.02369344204837e-05, "loss": 0.2608, "num_tokens": 337143056.0, "step": 2579 }, { "epoch": 1.0295291300877893, "grad_norm": 0.25004395842552185, "learning_rate": 4.022877446549443e-05, "loss": 0.2458, "num_tokens": 337274128.0, "step": 2580 }, { "epoch": 1.029928172386273, "grad_norm": 0.25800052285194397, "learning_rate": 4.022061204760595e-05, "loss": 0.244, "num_tokens": 337405200.0, "step": 2581 }, { "epoch": 1.0303272146847566, "grad_norm": 0.23711687326431274, "learning_rate": 4.0212447168397614e-05, "loss": 0.2249, "num_tokens": 337536272.0, "step": 2582 }, { "epoch": 1.0307262569832403, "grad_norm": 0.2483048290014267, "learning_rate": 4.020427982944926e-05, "loss": 0.2005, "num_tokens": 337667344.0, "step": 2583 }, { "epoch": 1.031125299281724, "grad_norm": 0.3205045163631439, "learning_rate": 4.019611003234118e-05, "loss": 0.3047, "num_tokens": 337793496.0, "step": 2584 }, { "epoch": 1.0315243415802076, "grad_norm": 0.2825545072555542, "learning_rate": 4.018793777865417e-05, "loss": 0.2324, "num_tokens": 337919234.0, "step": 2585 }, { "epoch": 1.0319233838786912, "grad_norm": 0.2926850914955139, "learning_rate": 4.017976306996947e-05, "loss": 0.2445, "num_tokens": 338029018.0, "step": 2586 }, { "epoch": 1.0323224261771748, "grad_norm": 0.26770198345184326, "learning_rate": 4.017158590786881e-05, "loss": 0.2671, "num_tokens": 338160090.0, "step": 2587 }, { "epoch": 1.0327214684756585, "grad_norm": 0.2605718672275543, "learning_rate": 4.016340629393441e-05, "loss": 0.241, "num_tokens": 338291162.0, "step": 2588 }, { "epoch": 1.0331205107741421, "grad_norm": 0.27042117714881897, "learning_rate": 4.015522422974894e-05, "loss": 0.2582, "num_tokens": 338422234.0, "step": 2589 }, { "epoch": 1.0335195530726258, "grad_norm": 0.2928811311721802, "learning_rate": 4.0147039716895544e-05, "loss": 0.281, "num_tokens": 338553306.0, "step": 2590 }, { "epoch": 1.0339185953711094, "grad_norm": 0.25216183066368103, "learning_rate": 4.013885275695786e-05, "loss": 0.2795, "num_tokens": 338684378.0, "step": 2591 }, { "epoch": 1.034317637669593, "grad_norm": 0.25910529494285583, "learning_rate": 4.0130663351519985e-05, "loss": 0.2706, "num_tokens": 338815450.0, "step": 2592 }, { "epoch": 1.0347166799680767, "grad_norm": 0.25415095686912537, "learning_rate": 4.01224715021665e-05, "loss": 0.2759, "num_tokens": 338946522.0, "step": 2593 }, { "epoch": 1.0351157222665603, "grad_norm": 0.2621246576309204, "learning_rate": 4.011427721048243e-05, "loss": 0.2647, "num_tokens": 339077594.0, "step": 2594 }, { "epoch": 1.035514764565044, "grad_norm": 0.22939810156822205, "learning_rate": 4.0106080478053324e-05, "loss": 0.2381, "num_tokens": 339208666.0, "step": 2595 }, { "epoch": 1.0359138068635276, "grad_norm": 0.24413049221038818, "learning_rate": 4.009788130646515e-05, "loss": 0.2533, "num_tokens": 339339738.0, "step": 2596 }, { "epoch": 1.0363128491620113, "grad_norm": 0.28912919759750366, "learning_rate": 4.0089679697304376e-05, "loss": 0.2808, "num_tokens": 339470810.0, "step": 2597 }, { "epoch": 1.036711891460495, "grad_norm": 0.271080881357193, "learning_rate": 4.008147565215794e-05, "loss": 0.2683, "num_tokens": 339601882.0, "step": 2598 }, { "epoch": 1.0371109337589783, "grad_norm": 0.2581632733345032, "learning_rate": 4.0073269172613256e-05, "loss": 0.255, "num_tokens": 339732954.0, "step": 2599 }, { "epoch": 1.037509976057462, "grad_norm": 0.27542734146118164, "learning_rate": 4.006506026025819e-05, "loss": 0.2395, "num_tokens": 339864026.0, "step": 2600 }, { "epoch": 1.0379090183559456, "grad_norm": 0.2647814452648163, "learning_rate": 4.0056848916681096e-05, "loss": 0.2424, "num_tokens": 339995098.0, "step": 2601 }, { "epoch": 1.0383080606544293, "grad_norm": 0.2883111834526062, "learning_rate": 4.004863514347078e-05, "loss": 0.2422, "num_tokens": 340126170.0, "step": 2602 }, { "epoch": 1.038707102952913, "grad_norm": 0.26203376054763794, "learning_rate": 4.004041894221655e-05, "loss": 0.2334, "num_tokens": 340257242.0, "step": 2603 }, { "epoch": 1.0391061452513966, "grad_norm": 0.24490594863891602, "learning_rate": 4.003220031450815e-05, "loss": 0.205, "num_tokens": 340388314.0, "step": 2604 }, { "epoch": 1.0395051875498802, "grad_norm": 0.27299442887306213, "learning_rate": 4.0023979261935816e-05, "loss": 0.2561, "num_tokens": 340519386.0, "step": 2605 }, { "epoch": 1.0399042298483638, "grad_norm": 0.251897931098938, "learning_rate": 4.001575578609025e-05, "loss": 0.2211, "num_tokens": 340650458.0, "step": 2606 }, { "epoch": 1.0403032721468475, "grad_norm": 0.2656695544719696, "learning_rate": 4.00075298885626e-05, "loss": 0.243, "num_tokens": 340781530.0, "step": 2607 }, { "epoch": 1.0407023144453311, "grad_norm": 0.2780751585960388, "learning_rate": 3.999930157094451e-05, "loss": 0.2588, "num_tokens": 340912602.0, "step": 2608 }, { "epoch": 1.0411013567438148, "grad_norm": 0.2741522192955017, "learning_rate": 3.999107083482808e-05, "loss": 0.2243, "num_tokens": 341043674.0, "step": 2609 }, { "epoch": 1.0415003990422984, "grad_norm": 0.2629493474960327, "learning_rate": 3.998283768180589e-05, "loss": 0.2325, "num_tokens": 341174746.0, "step": 2610 }, { "epoch": 1.041899441340782, "grad_norm": 0.27646806836128235, "learning_rate": 3.9974602113470974e-05, "loss": 0.2448, "num_tokens": 341305818.0, "step": 2611 }, { "epoch": 1.0422984836392657, "grad_norm": 0.2841443717479706, "learning_rate": 3.9966364131416835e-05, "loss": 0.249, "num_tokens": 341436890.0, "step": 2612 }, { "epoch": 1.0426975259377493, "grad_norm": 0.2686232626438141, "learning_rate": 3.995812373723744e-05, "loss": 0.2566, "num_tokens": 341567962.0, "step": 2613 }, { "epoch": 1.043096568236233, "grad_norm": 0.26508283615112305, "learning_rate": 3.994988093252724e-05, "loss": 0.2376, "num_tokens": 341699034.0, "step": 2614 }, { "epoch": 1.0434956105347166, "grad_norm": 0.22192248702049255, "learning_rate": 3.994163571888113e-05, "loss": 0.2133, "num_tokens": 341830106.0, "step": 2615 }, { "epoch": 1.0438946528332003, "grad_norm": 0.24969999492168427, "learning_rate": 3.993338809789449e-05, "loss": 0.2564, "num_tokens": 341961178.0, "step": 2616 }, { "epoch": 1.044293695131684, "grad_norm": 0.2722051739692688, "learning_rate": 3.9925138071163174e-05, "loss": 0.2339, "num_tokens": 342092250.0, "step": 2617 }, { "epoch": 1.0446927374301676, "grad_norm": 0.26268434524536133, "learning_rate": 3.991688564028344e-05, "loss": 0.2648, "num_tokens": 342223322.0, "step": 2618 }, { "epoch": 1.0450917797286512, "grad_norm": 0.2516385614871979, "learning_rate": 3.99086308068521e-05, "loss": 0.2176, "num_tokens": 342354394.0, "step": 2619 }, { "epoch": 1.0454908220271348, "grad_norm": 0.2863660156726837, "learning_rate": 3.9900373572466365e-05, "loss": 0.2377, "num_tokens": 342485466.0, "step": 2620 }, { "epoch": 1.0458898643256185, "grad_norm": 0.3007698655128479, "learning_rate": 3.989211393872394e-05, "loss": 0.2597, "num_tokens": 342616538.0, "step": 2621 }, { "epoch": 1.0462889066241021, "grad_norm": 0.29933053255081177, "learning_rate": 3.988385190722297e-05, "loss": 0.2377, "num_tokens": 342747610.0, "step": 2622 }, { "epoch": 1.0466879489225858, "grad_norm": 0.2960887551307678, "learning_rate": 3.987558747956212e-05, "loss": 0.2764, "num_tokens": 342878682.0, "step": 2623 }, { "epoch": 1.0470869912210694, "grad_norm": 0.24705930054187775, "learning_rate": 3.9867320657340446e-05, "loss": 0.233, "num_tokens": 343009754.0, "step": 2624 }, { "epoch": 1.047486033519553, "grad_norm": 0.26314252614974976, "learning_rate": 3.985905144215752e-05, "loss": 0.2577, "num_tokens": 343140826.0, "step": 2625 }, { "epoch": 1.0478850758180367, "grad_norm": 0.279839426279068, "learning_rate": 3.9850779835613334e-05, "loss": 0.2785, "num_tokens": 343271898.0, "step": 2626 }, { "epoch": 1.0482841181165203, "grad_norm": 0.27531078457832336, "learning_rate": 3.984250583930839e-05, "loss": 0.264, "num_tokens": 343402970.0, "step": 2627 }, { "epoch": 1.048683160415004, "grad_norm": 0.24975000321865082, "learning_rate": 3.9834229454843626e-05, "loss": 0.2436, "num_tokens": 343534042.0, "step": 2628 }, { "epoch": 1.0490822027134876, "grad_norm": 0.24460090696811676, "learning_rate": 3.9825950683820436e-05, "loss": 0.2499, "num_tokens": 343665114.0, "step": 2629 }, { "epoch": 1.0494812450119713, "grad_norm": 0.259815514087677, "learning_rate": 3.981766952784069e-05, "loss": 0.2468, "num_tokens": 343796186.0, "step": 2630 }, { "epoch": 1.049880287310455, "grad_norm": 0.2504110336303711, "learning_rate": 3.980938598850671e-05, "loss": 0.244, "num_tokens": 343927258.0, "step": 2631 }, { "epoch": 1.0502793296089385, "grad_norm": 0.24373598396778107, "learning_rate": 3.9801100067421296e-05, "loss": 0.2026, "num_tokens": 344058330.0, "step": 2632 }, { "epoch": 1.0506783719074222, "grad_norm": 0.2738605737686157, "learning_rate": 3.9792811766187684e-05, "loss": 0.2487, "num_tokens": 344189402.0, "step": 2633 }, { "epoch": 1.0510774142059058, "grad_norm": 0.26463550329208374, "learning_rate": 3.9784521086409594e-05, "loss": 0.2422, "num_tokens": 344320474.0, "step": 2634 }, { "epoch": 1.0514764565043895, "grad_norm": 0.25527313351631165, "learning_rate": 3.977622802969118e-05, "loss": 0.2292, "num_tokens": 344451546.0, "step": 2635 }, { "epoch": 1.0518754988028731, "grad_norm": 0.29168903827667236, "learning_rate": 3.976793259763709e-05, "loss": 0.2701, "num_tokens": 344582618.0, "step": 2636 }, { "epoch": 1.0522745411013568, "grad_norm": 0.2967548370361328, "learning_rate": 3.975963479185239e-05, "loss": 0.2451, "num_tokens": 344713690.0, "step": 2637 }, { "epoch": 1.0526735833998404, "grad_norm": 0.26267099380493164, "learning_rate": 3.9751334613942646e-05, "loss": 0.2781, "num_tokens": 344844762.0, "step": 2638 }, { "epoch": 1.053072625698324, "grad_norm": 0.25018396973609924, "learning_rate": 3.974303206551386e-05, "loss": 0.218, "num_tokens": 344975834.0, "step": 2639 }, { "epoch": 1.0534716679968077, "grad_norm": 0.26383745670318604, "learning_rate": 3.9734727148172495e-05, "loss": 0.2443, "num_tokens": 345106906.0, "step": 2640 }, { "epoch": 1.0538707102952913, "grad_norm": 0.26972371339797974, "learning_rate": 3.972641986352548e-05, "loss": 0.2724, "num_tokens": 345237978.0, "step": 2641 }, { "epoch": 1.054269752593775, "grad_norm": 0.27606791257858276, "learning_rate": 3.971811021318019e-05, "loss": 0.2733, "num_tokens": 345369050.0, "step": 2642 }, { "epoch": 1.0546687948922586, "grad_norm": 0.23622918128967285, "learning_rate": 3.970979819874448e-05, "loss": 0.2319, "num_tokens": 345500122.0, "step": 2643 }, { "epoch": 1.0550678371907423, "grad_norm": 0.25927549600601196, "learning_rate": 3.970148382182663e-05, "loss": 0.2438, "num_tokens": 345631194.0, "step": 2644 }, { "epoch": 1.055466879489226, "grad_norm": 0.2658373713493347, "learning_rate": 3.9693167084035404e-05, "loss": 0.2601, "num_tokens": 345762266.0, "step": 2645 }, { "epoch": 1.0558659217877095, "grad_norm": 0.2564108967781067, "learning_rate": 3.9684847986980014e-05, "loss": 0.2718, "num_tokens": 345893338.0, "step": 2646 }, { "epoch": 1.0562649640861932, "grad_norm": 0.24238120019435883, "learning_rate": 3.967652653227011e-05, "loss": 0.2232, "num_tokens": 346024410.0, "step": 2647 }, { "epoch": 1.0566640063846768, "grad_norm": 0.27534666657447815, "learning_rate": 3.9668202721515844e-05, "loss": 0.224, "num_tokens": 346155482.0, "step": 2648 }, { "epoch": 1.0570630486831605, "grad_norm": 0.23789475858211517, "learning_rate": 3.9659876556327774e-05, "loss": 0.2223, "num_tokens": 346286554.0, "step": 2649 }, { "epoch": 1.0574620909816441, "grad_norm": 0.28098535537719727, "learning_rate": 3.965154803831695e-05, "loss": 0.2657, "num_tokens": 346417626.0, "step": 2650 }, { "epoch": 1.0578611332801278, "grad_norm": 0.2741132080554962, "learning_rate": 3.964321716909485e-05, "loss": 0.2451, "num_tokens": 346548698.0, "step": 2651 }, { "epoch": 1.0582601755786114, "grad_norm": 0.26957303285598755, "learning_rate": 3.9634883950273425e-05, "loss": 0.2347, "num_tokens": 346679770.0, "step": 2652 }, { "epoch": 1.058659217877095, "grad_norm": 0.2525608539581299, "learning_rate": 3.962654838346508e-05, "loss": 0.2414, "num_tokens": 346810842.0, "step": 2653 }, { "epoch": 1.0590582601755787, "grad_norm": 0.2614566683769226, "learning_rate": 3.961821047028266e-05, "loss": 0.2497, "num_tokens": 346941914.0, "step": 2654 }, { "epoch": 1.0594573024740623, "grad_norm": 0.2636739909648895, "learning_rate": 3.9609870212339484e-05, "loss": 0.2785, "num_tokens": 347072986.0, "step": 2655 }, { "epoch": 1.059856344772546, "grad_norm": 0.22829458117485046, "learning_rate": 3.960152761124931e-05, "loss": 0.2208, "num_tokens": 347204058.0, "step": 2656 }, { "epoch": 1.0602553870710296, "grad_norm": 0.2712019681930542, "learning_rate": 3.9593182668626344e-05, "loss": 0.2704, "num_tokens": 347335130.0, "step": 2657 }, { "epoch": 1.0606544293695133, "grad_norm": 0.2737249732017517, "learning_rate": 3.9584835386085264e-05, "loss": 0.2554, "num_tokens": 347466202.0, "step": 2658 }, { "epoch": 1.061053471667997, "grad_norm": 0.27479231357574463, "learning_rate": 3.957648576524119e-05, "loss": 0.2473, "num_tokens": 347597274.0, "step": 2659 }, { "epoch": 1.0614525139664805, "grad_norm": 0.24419759213924408, "learning_rate": 3.956813380770971e-05, "loss": 0.252, "num_tokens": 347728346.0, "step": 2660 }, { "epoch": 1.0618515562649642, "grad_norm": 0.2832251787185669, "learning_rate": 3.955977951510682e-05, "loss": 0.2588, "num_tokens": 347859418.0, "step": 2661 }, { "epoch": 1.0622505985634478, "grad_norm": 0.2825734317302704, "learning_rate": 3.955142288904902e-05, "loss": 0.2604, "num_tokens": 347990490.0, "step": 2662 }, { "epoch": 1.0626496408619315, "grad_norm": 0.2868100106716156, "learning_rate": 3.9543063931153236e-05, "loss": 0.2467, "num_tokens": 348121562.0, "step": 2663 }, { "epoch": 1.0630486831604151, "grad_norm": 0.2816430926322937, "learning_rate": 3.9534702643036844e-05, "loss": 0.2475, "num_tokens": 348252634.0, "step": 2664 }, { "epoch": 1.0634477254588985, "grad_norm": 0.27936941385269165, "learning_rate": 3.9526339026317674e-05, "loss": 0.2751, "num_tokens": 348383706.0, "step": 2665 }, { "epoch": 1.0638467677573822, "grad_norm": 0.26324188709259033, "learning_rate": 3.9517973082614006e-05, "loss": 0.2376, "num_tokens": 348514778.0, "step": 2666 }, { "epoch": 1.0642458100558658, "grad_norm": 0.24067986011505127, "learning_rate": 3.950960481354458e-05, "loss": 0.2231, "num_tokens": 348645850.0, "step": 2667 }, { "epoch": 1.0646448523543495, "grad_norm": 0.2728956341743469, "learning_rate": 3.950123422072858e-05, "loss": 0.2563, "num_tokens": 348776922.0, "step": 2668 }, { "epoch": 1.0650438946528331, "grad_norm": 0.27653074264526367, "learning_rate": 3.9492861305785625e-05, "loss": 0.2649, "num_tokens": 348907994.0, "step": 2669 }, { "epoch": 1.0654429369513168, "grad_norm": 0.2826322913169861, "learning_rate": 3.9484486070335806e-05, "loss": 0.2698, "num_tokens": 349039066.0, "step": 2670 }, { "epoch": 1.0658419792498004, "grad_norm": 0.26242807507514954, "learning_rate": 3.947610851599965e-05, "loss": 0.2599, "num_tokens": 349170138.0, "step": 2671 }, { "epoch": 1.066241021548284, "grad_norm": 0.27482858300209045, "learning_rate": 3.9467728644398134e-05, "loss": 0.267, "num_tokens": 349301210.0, "step": 2672 }, { "epoch": 1.0666400638467677, "grad_norm": 0.28095725178718567, "learning_rate": 3.945934645715269e-05, "loss": 0.292, "num_tokens": 349432282.0, "step": 2673 }, { "epoch": 1.0670391061452513, "grad_norm": 0.24551977217197418, "learning_rate": 3.94509619558852e-05, "loss": 0.2192, "num_tokens": 349563354.0, "step": 2674 }, { "epoch": 1.067438148443735, "grad_norm": 0.2560215890407562, "learning_rate": 3.9442575142217956e-05, "loss": 0.2154, "num_tokens": 349694426.0, "step": 2675 }, { "epoch": 1.0678371907422186, "grad_norm": 0.24182535707950592, "learning_rate": 3.9434186017773764e-05, "loss": 0.2199, "num_tokens": 349825498.0, "step": 2676 }, { "epoch": 1.0682362330407023, "grad_norm": 0.26785123348236084, "learning_rate": 3.942579458417582e-05, "loss": 0.2359, "num_tokens": 349956570.0, "step": 2677 }, { "epoch": 1.068635275339186, "grad_norm": 0.28709566593170166, "learning_rate": 3.94174008430478e-05, "loss": 0.2425, "num_tokens": 350087642.0, "step": 2678 }, { "epoch": 1.0690343176376695, "grad_norm": 0.2626877725124359, "learning_rate": 3.940900479601381e-05, "loss": 0.2458, "num_tokens": 350218714.0, "step": 2679 }, { "epoch": 1.0694333599361532, "grad_norm": 0.2476435750722885, "learning_rate": 3.94006064446984e-05, "loss": 0.2221, "num_tokens": 350349786.0, "step": 2680 }, { "epoch": 1.0698324022346368, "grad_norm": 0.2503296732902527, "learning_rate": 3.939220579072658e-05, "loss": 0.1717, "num_tokens": 350480858.0, "step": 2681 }, { "epoch": 1.0702314445331205, "grad_norm": 0.30429863929748535, "learning_rate": 3.93838028357238e-05, "loss": 0.2532, "num_tokens": 350611930.0, "step": 2682 }, { "epoch": 1.0706304868316041, "grad_norm": 0.2327633649110794, "learning_rate": 3.9375397581315955e-05, "loss": 0.2038, "num_tokens": 350743002.0, "step": 2683 }, { "epoch": 1.0710295291300878, "grad_norm": 0.2813763916492462, "learning_rate": 3.936699002912937e-05, "loss": 0.2494, "num_tokens": 350874074.0, "step": 2684 }, { "epoch": 1.0714285714285714, "grad_norm": 0.33540016412734985, "learning_rate": 3.935858018079085e-05, "loss": 0.2541, "num_tokens": 351005146.0, "step": 2685 }, { "epoch": 1.071827613727055, "grad_norm": 0.24154022336006165, "learning_rate": 3.93501680379276e-05, "loss": 0.234, "num_tokens": 351136218.0, "step": 2686 }, { "epoch": 1.0722266560255387, "grad_norm": 0.25469329953193665, "learning_rate": 3.93417536021673e-05, "loss": 0.249, "num_tokens": 351267290.0, "step": 2687 }, { "epoch": 1.0726256983240223, "grad_norm": 0.29177209734916687, "learning_rate": 3.933333687513806e-05, "loss": 0.2871, "num_tokens": 351398362.0, "step": 2688 }, { "epoch": 1.073024740622506, "grad_norm": 0.26985400915145874, "learning_rate": 3.9324917858468455e-05, "loss": 0.269, "num_tokens": 351529434.0, "step": 2689 }, { "epoch": 1.0734237829209896, "grad_norm": 0.24728736281394958, "learning_rate": 3.931649655378747e-05, "loss": 0.2493, "num_tokens": 351660506.0, "step": 2690 }, { "epoch": 1.0738228252194733, "grad_norm": 0.25645747780799866, "learning_rate": 3.930807296272455e-05, "loss": 0.2108, "num_tokens": 351791578.0, "step": 2691 }, { "epoch": 1.074221867517957, "grad_norm": 0.2613445520401001, "learning_rate": 3.9299647086909586e-05, "loss": 0.2364, "num_tokens": 351922650.0, "step": 2692 }, { "epoch": 1.0746209098164405, "grad_norm": 0.25398680567741394, "learning_rate": 3.929121892797291e-05, "loss": 0.1931, "num_tokens": 352053722.0, "step": 2693 }, { "epoch": 1.0750199521149242, "grad_norm": 0.2722673714160919, "learning_rate": 3.928278848754529e-05, "loss": 0.2525, "num_tokens": 352184794.0, "step": 2694 }, { "epoch": 1.0754189944134078, "grad_norm": 0.34402304887771606, "learning_rate": 3.927435576725793e-05, "loss": 0.251, "num_tokens": 352305246.0, "step": 2695 }, { "epoch": 1.0758180367118915, "grad_norm": 0.28296494483947754, "learning_rate": 3.926592076874249e-05, "loss": 0.2737, "num_tokens": 352436318.0, "step": 2696 }, { "epoch": 1.076217079010375, "grad_norm": 0.25526100397109985, "learning_rate": 3.925748349363107e-05, "loss": 0.227, "num_tokens": 352567390.0, "step": 2697 }, { "epoch": 1.0766161213088588, "grad_norm": 0.2391214668750763, "learning_rate": 3.924904394355618e-05, "loss": 0.1999, "num_tokens": 352698462.0, "step": 2698 }, { "epoch": 1.0770151636073424, "grad_norm": 0.28210291266441345, "learning_rate": 3.924060212015082e-05, "loss": 0.254, "num_tokens": 352829534.0, "step": 2699 }, { "epoch": 1.077414205905826, "grad_norm": 0.25461387634277344, "learning_rate": 3.9232158025048396e-05, "loss": 0.2535, "num_tokens": 352960606.0, "step": 2700 }, { "epoch": 1.0778132482043097, "grad_norm": 0.24033312499523163, "learning_rate": 3.9223711659882744e-05, "loss": 0.2228, "num_tokens": 353091678.0, "step": 2701 }, { "epoch": 1.0782122905027933, "grad_norm": 0.2688683271408081, "learning_rate": 3.92152630262882e-05, "loss": 0.2398, "num_tokens": 353222750.0, "step": 2702 }, { "epoch": 1.078611332801277, "grad_norm": 0.27229979634284973, "learning_rate": 3.920681212589945e-05, "loss": 0.2742, "num_tokens": 353353822.0, "step": 2703 }, { "epoch": 1.0790103750997606, "grad_norm": 0.255007803440094, "learning_rate": 3.9198358960351684e-05, "loss": 0.242, "num_tokens": 353484894.0, "step": 2704 }, { "epoch": 1.0794094173982443, "grad_norm": 0.2504991888999939, "learning_rate": 3.9189903531280515e-05, "loss": 0.2299, "num_tokens": 353615966.0, "step": 2705 }, { "epoch": 1.079808459696728, "grad_norm": 0.2945723235607147, "learning_rate": 3.918144584032198e-05, "loss": 0.2849, "num_tokens": 353747038.0, "step": 2706 }, { "epoch": 1.0802075019952115, "grad_norm": 0.26047801971435547, "learning_rate": 3.917298588911257e-05, "loss": 0.2566, "num_tokens": 353878110.0, "step": 2707 }, { "epoch": 1.0806065442936952, "grad_norm": 0.2648461163043976, "learning_rate": 3.916452367928921e-05, "loss": 0.2401, "num_tokens": 354009182.0, "step": 2708 }, { "epoch": 1.0810055865921788, "grad_norm": 0.3123200237751007, "learning_rate": 3.9156059212489244e-05, "loss": 0.2996, "num_tokens": 354140254.0, "step": 2709 }, { "epoch": 1.0814046288906625, "grad_norm": 0.2762521207332611, "learning_rate": 3.914759249035048e-05, "loss": 0.272, "num_tokens": 354271326.0, "step": 2710 }, { "epoch": 1.081803671189146, "grad_norm": 0.26416438817977905, "learning_rate": 3.913912351451115e-05, "loss": 0.2215, "num_tokens": 354402398.0, "step": 2711 }, { "epoch": 1.0822027134876298, "grad_norm": 0.28158730268478394, "learning_rate": 3.913065228660991e-05, "loss": 0.2575, "num_tokens": 354533470.0, "step": 2712 }, { "epoch": 1.0826017557861134, "grad_norm": 0.2661091685295105, "learning_rate": 3.912217880828588e-05, "loss": 0.2156, "num_tokens": 354664542.0, "step": 2713 }, { "epoch": 1.083000798084597, "grad_norm": 0.24977968633174896, "learning_rate": 3.9113703081178594e-05, "loss": 0.2186, "num_tokens": 354795614.0, "step": 2714 }, { "epoch": 1.0833998403830807, "grad_norm": 0.24435678124427795, "learning_rate": 3.910522510692802e-05, "loss": 0.2036, "num_tokens": 354926686.0, "step": 2715 }, { "epoch": 1.0837988826815643, "grad_norm": 0.27716514468193054, "learning_rate": 3.9096744887174565e-05, "loss": 0.2315, "num_tokens": 355057758.0, "step": 2716 }, { "epoch": 1.084197924980048, "grad_norm": 0.2902389168739319, "learning_rate": 3.908826242355908e-05, "loss": 0.2558, "num_tokens": 355188830.0, "step": 2717 }, { "epoch": 1.0845969672785316, "grad_norm": 0.2916581928730011, "learning_rate": 3.9079777717722846e-05, "loss": 0.2408, "num_tokens": 355319902.0, "step": 2718 }, { "epoch": 1.0849960095770153, "grad_norm": 0.27059027552604675, "learning_rate": 3.9071290771307564e-05, "loss": 0.2626, "num_tokens": 355450974.0, "step": 2719 }, { "epoch": 1.085395051875499, "grad_norm": 0.258573979139328, "learning_rate": 3.9062801585955374e-05, "loss": 0.2623, "num_tokens": 355582046.0, "step": 2720 }, { "epoch": 1.0857940941739825, "grad_norm": 0.2800734341144562, "learning_rate": 3.905431016330889e-05, "loss": 0.2996, "num_tokens": 355713118.0, "step": 2721 }, { "epoch": 1.0861931364724662, "grad_norm": 0.25246307253837585, "learning_rate": 3.904581650501108e-05, "loss": 0.2256, "num_tokens": 355844190.0, "step": 2722 }, { "epoch": 1.0865921787709498, "grad_norm": 0.25966793298721313, "learning_rate": 3.9037320612705404e-05, "loss": 0.2742, "num_tokens": 355975262.0, "step": 2723 }, { "epoch": 1.0869912210694332, "grad_norm": 0.30418938398361206, "learning_rate": 3.902882248803574e-05, "loss": 0.3152, "num_tokens": 356090364.0, "step": 2724 }, { "epoch": 1.0873902633679169, "grad_norm": 0.24654151499271393, "learning_rate": 3.90203221326464e-05, "loss": 0.2301, "num_tokens": 356221436.0, "step": 2725 }, { "epoch": 1.0877893056664005, "grad_norm": 0.23652710020542145, "learning_rate": 3.9011819548182115e-05, "loss": 0.2304, "num_tokens": 356352508.0, "step": 2726 }, { "epoch": 1.0881883479648842, "grad_norm": 0.2508876919746399, "learning_rate": 3.900331473628806e-05, "loss": 0.2721, "num_tokens": 356483580.0, "step": 2727 }, { "epoch": 1.0885873902633678, "grad_norm": 0.2529725134372711, "learning_rate": 3.8994807698609845e-05, "loss": 0.2421, "num_tokens": 356614652.0, "step": 2728 }, { "epoch": 1.0889864325618515, "grad_norm": 0.2684747874736786, "learning_rate": 3.8986298436793475e-05, "loss": 0.2428, "num_tokens": 356745724.0, "step": 2729 }, { "epoch": 1.089385474860335, "grad_norm": 0.2729847729206085, "learning_rate": 3.8977786952485445e-05, "loss": 0.2477, "num_tokens": 356876796.0, "step": 2730 }, { "epoch": 1.0897845171588187, "grad_norm": 0.27207815647125244, "learning_rate": 3.896927324733263e-05, "loss": 0.2358, "num_tokens": 357007868.0, "step": 2731 }, { "epoch": 1.0901835594573024, "grad_norm": 0.25044775009155273, "learning_rate": 3.896075732298236e-05, "loss": 0.2303, "num_tokens": 357138940.0, "step": 2732 }, { "epoch": 1.090582601755786, "grad_norm": 0.29815444350242615, "learning_rate": 3.895223918108238e-05, "loss": 0.2646, "num_tokens": 357270012.0, "step": 2733 }, { "epoch": 1.0909816440542697, "grad_norm": 0.2902053892612457, "learning_rate": 3.894371882328088e-05, "loss": 0.2521, "num_tokens": 357401084.0, "step": 2734 }, { "epoch": 1.0913806863527533, "grad_norm": 0.30985933542251587, "learning_rate": 3.893519625122645e-05, "loss": 0.2682, "num_tokens": 357532156.0, "step": 2735 }, { "epoch": 1.091779728651237, "grad_norm": 0.2556045353412628, "learning_rate": 3.8926671466568165e-05, "loss": 0.2415, "num_tokens": 357663228.0, "step": 2736 }, { "epoch": 1.0921787709497206, "grad_norm": 0.26400184631347656, "learning_rate": 3.891814447095545e-05, "loss": 0.2556, "num_tokens": 357794300.0, "step": 2737 }, { "epoch": 1.0925778132482042, "grad_norm": 0.2796989977359772, "learning_rate": 3.890961526603823e-05, "loss": 0.2511, "num_tokens": 357925372.0, "step": 2738 }, { "epoch": 1.0929768555466879, "grad_norm": 0.25795990228652954, "learning_rate": 3.890108385346681e-05, "loss": 0.2457, "num_tokens": 358056444.0, "step": 2739 }, { "epoch": 1.0933758978451715, "grad_norm": 0.2389097362756729, "learning_rate": 3.8892550234891944e-05, "loss": 0.2375, "num_tokens": 358187516.0, "step": 2740 }, { "epoch": 1.0937749401436552, "grad_norm": 0.24171750247478485, "learning_rate": 3.8884014411964815e-05, "loss": 0.2009, "num_tokens": 358318588.0, "step": 2741 }, { "epoch": 1.0941739824421388, "grad_norm": 0.25069528818130493, "learning_rate": 3.887547638633701e-05, "loss": 0.228, "num_tokens": 358449660.0, "step": 2742 }, { "epoch": 1.0945730247406225, "grad_norm": 0.29829922318458557, "learning_rate": 3.8866936159660565e-05, "loss": 0.2557, "num_tokens": 358580732.0, "step": 2743 }, { "epoch": 1.094972067039106, "grad_norm": 0.27977409958839417, "learning_rate": 3.8858393733587935e-05, "loss": 0.2625, "num_tokens": 358711804.0, "step": 2744 }, { "epoch": 1.0953711093375897, "grad_norm": 0.29007214307785034, "learning_rate": 3.8849849109772e-05, "loss": 0.2992, "num_tokens": 358842876.0, "step": 2745 }, { "epoch": 1.0957701516360734, "grad_norm": 0.2761898636817932, "learning_rate": 3.884130228986606e-05, "loss": 0.2379, "num_tokens": 358973948.0, "step": 2746 }, { "epoch": 1.096169193934557, "grad_norm": 0.2611282765865326, "learning_rate": 3.8832753275523856e-05, "loss": 0.2669, "num_tokens": 359105020.0, "step": 2747 }, { "epoch": 1.0965682362330407, "grad_norm": 0.2519286870956421, "learning_rate": 3.8824202068399536e-05, "loss": 0.2309, "num_tokens": 359236092.0, "step": 2748 }, { "epoch": 1.0969672785315243, "grad_norm": 0.24900899827480316, "learning_rate": 3.881564867014767e-05, "loss": 0.2671, "num_tokens": 359367164.0, "step": 2749 }, { "epoch": 1.097366320830008, "grad_norm": 0.2603757977485657, "learning_rate": 3.880709308242328e-05, "loss": 0.2466, "num_tokens": 359498236.0, "step": 2750 }, { "epoch": 1.0977653631284916, "grad_norm": 0.23598866164684296, "learning_rate": 3.879853530688178e-05, "loss": 0.1971, "num_tokens": 359629308.0, "step": 2751 }, { "epoch": 1.0981644054269752, "grad_norm": 0.2720547318458557, "learning_rate": 3.878997534517902e-05, "loss": 0.2547, "num_tokens": 359760380.0, "step": 2752 }, { "epoch": 1.0985634477254589, "grad_norm": 0.27272599935531616, "learning_rate": 3.8781413198971275e-05, "loss": 0.2516, "num_tokens": 359891452.0, "step": 2753 }, { "epoch": 1.0989624900239425, "grad_norm": 0.2895665168762207, "learning_rate": 3.877284886991524e-05, "loss": 0.2716, "num_tokens": 360022524.0, "step": 2754 }, { "epoch": 1.0993615323224262, "grad_norm": 0.280557781457901, "learning_rate": 3.8764282359668026e-05, "loss": 0.2561, "num_tokens": 360153596.0, "step": 2755 }, { "epoch": 1.0997605746209098, "grad_norm": 0.2706461250782013, "learning_rate": 3.875571366988718e-05, "loss": 0.2502, "num_tokens": 360284668.0, "step": 2756 }, { "epoch": 1.1001596169193935, "grad_norm": 0.26498284935951233, "learning_rate": 3.874714280223067e-05, "loss": 0.2356, "num_tokens": 360415740.0, "step": 2757 }, { "epoch": 1.100558659217877, "grad_norm": 0.2639971673488617, "learning_rate": 3.873856975835688e-05, "loss": 0.2385, "num_tokens": 360546812.0, "step": 2758 }, { "epoch": 1.1009577015163607, "grad_norm": 0.2596653401851654, "learning_rate": 3.872999453992459e-05, "loss": 0.2422, "num_tokens": 360677884.0, "step": 2759 }, { "epoch": 1.1013567438148444, "grad_norm": 0.25658175349235535, "learning_rate": 3.872141714859305e-05, "loss": 0.2479, "num_tokens": 360808956.0, "step": 2760 }, { "epoch": 1.101755786113328, "grad_norm": 0.27312493324279785, "learning_rate": 3.8712837586021894e-05, "loss": 0.2727, "num_tokens": 360925527.0, "step": 2761 }, { "epoch": 1.1021548284118117, "grad_norm": 0.26163411140441895, "learning_rate": 3.870425585387119e-05, "loss": 0.2592, "num_tokens": 361056599.0, "step": 2762 }, { "epoch": 1.1025538707102953, "grad_norm": 0.24804946780204773, "learning_rate": 3.869567195380142e-05, "loss": 0.2173, "num_tokens": 361187671.0, "step": 2763 }, { "epoch": 1.102952913008779, "grad_norm": 0.2845690846443176, "learning_rate": 3.86870858874735e-05, "loss": 0.2853, "num_tokens": 361318743.0, "step": 2764 }, { "epoch": 1.1033519553072626, "grad_norm": 0.2573176324367523, "learning_rate": 3.867849765654874e-05, "loss": 0.2598, "num_tokens": 361449815.0, "step": 2765 }, { "epoch": 1.1037509976057462, "grad_norm": 0.2884998619556427, "learning_rate": 3.8669907262688884e-05, "loss": 0.2907, "num_tokens": 361580887.0, "step": 2766 }, { "epoch": 1.1041500399042299, "grad_norm": 0.27549999952316284, "learning_rate": 3.8661314707556105e-05, "loss": 0.255, "num_tokens": 361711959.0, "step": 2767 }, { "epoch": 1.1045490822027135, "grad_norm": 0.25970104336738586, "learning_rate": 3.865271999281297e-05, "loss": 0.2594, "num_tokens": 361843031.0, "step": 2768 }, { "epoch": 1.1049481245011972, "grad_norm": 0.2492161989212036, "learning_rate": 3.864412312012249e-05, "loss": 0.2099, "num_tokens": 361974103.0, "step": 2769 }, { "epoch": 1.1053471667996808, "grad_norm": 0.24677342176437378, "learning_rate": 3.8635524091148055e-05, "loss": 0.2333, "num_tokens": 362105175.0, "step": 2770 }, { "epoch": 1.1057462090981645, "grad_norm": 0.2515520453453064, "learning_rate": 3.862692290755353e-05, "loss": 0.2557, "num_tokens": 362236247.0, "step": 2771 }, { "epoch": 1.106145251396648, "grad_norm": 0.23491795361042023, "learning_rate": 3.861831957100313e-05, "loss": 0.2044, "num_tokens": 362367319.0, "step": 2772 }, { "epoch": 1.1065442936951317, "grad_norm": 0.28495723009109497, "learning_rate": 3.860971408316154e-05, "loss": 0.2563, "num_tokens": 362498391.0, "step": 2773 }, { "epoch": 1.1069433359936154, "grad_norm": 0.26380860805511475, "learning_rate": 3.8601106445693845e-05, "loss": 0.2589, "num_tokens": 362629463.0, "step": 2774 }, { "epoch": 1.107342378292099, "grad_norm": 0.25733840465545654, "learning_rate": 3.859249666026554e-05, "loss": 0.2702, "num_tokens": 362760535.0, "step": 2775 }, { "epoch": 1.1077414205905827, "grad_norm": 0.2779097855091095, "learning_rate": 3.8583884728542516e-05, "loss": 0.2748, "num_tokens": 362891607.0, "step": 2776 }, { "epoch": 1.1081404628890663, "grad_norm": 0.3105486035346985, "learning_rate": 3.857527065219113e-05, "loss": 0.3016, "num_tokens": 363022679.0, "step": 2777 }, { "epoch": 1.10853950518755, "grad_norm": 0.2679941654205322, "learning_rate": 3.856665443287812e-05, "loss": 0.1946, "num_tokens": 363153751.0, "step": 2778 }, { "epoch": 1.1089385474860336, "grad_norm": 0.25799405574798584, "learning_rate": 3.855803607227063e-05, "loss": 0.2336, "num_tokens": 363284823.0, "step": 2779 }, { "epoch": 1.1093375897845172, "grad_norm": 0.30558034777641296, "learning_rate": 3.854941557203625e-05, "loss": 0.2638, "num_tokens": 363415895.0, "step": 2780 }, { "epoch": 1.1097366320830009, "grad_norm": 0.24977371096611023, "learning_rate": 3.854079293384296e-05, "loss": 0.2059, "num_tokens": 363546967.0, "step": 2781 }, { "epoch": 1.1101356743814845, "grad_norm": 0.2554592788219452, "learning_rate": 3.853216815935915e-05, "loss": 0.2536, "num_tokens": 363678039.0, "step": 2782 }, { "epoch": 1.1105347166799682, "grad_norm": 0.28342297673225403, "learning_rate": 3.8523541250253645e-05, "loss": 0.2775, "num_tokens": 363809111.0, "step": 2783 }, { "epoch": 1.1109337589784518, "grad_norm": 0.2532767355442047, "learning_rate": 3.851491220819566e-05, "loss": 0.2346, "num_tokens": 363940183.0, "step": 2784 }, { "epoch": 1.1113328012769355, "grad_norm": 0.2519476115703583, "learning_rate": 3.850628103485485e-05, "loss": 0.2197, "num_tokens": 364071255.0, "step": 2785 }, { "epoch": 1.111731843575419, "grad_norm": 0.2955625653266907, "learning_rate": 3.849764773190127e-05, "loss": 0.2708, "num_tokens": 364202327.0, "step": 2786 }, { "epoch": 1.1121308858739027, "grad_norm": 0.24673157930374146, "learning_rate": 3.848901230100536e-05, "loss": 0.2126, "num_tokens": 364333399.0, "step": 2787 }, { "epoch": 1.1125299281723864, "grad_norm": 0.26677316427230835, "learning_rate": 3.8480374743838e-05, "loss": 0.239, "num_tokens": 364464471.0, "step": 2788 }, { "epoch": 1.11292897047087, "grad_norm": 0.412015438079834, "learning_rate": 3.84717350620705e-05, "loss": 0.2584, "num_tokens": 364587536.0, "step": 2789 }, { "epoch": 1.1133280127693534, "grad_norm": 0.25721168518066406, "learning_rate": 3.8463093257374534e-05, "loss": 0.2498, "num_tokens": 364718608.0, "step": 2790 }, { "epoch": 1.113727055067837, "grad_norm": 0.2417345494031906, "learning_rate": 3.845444933142222e-05, "loss": 0.1993, "num_tokens": 364834943.0, "step": 2791 }, { "epoch": 1.1141260973663207, "grad_norm": 0.28244471549987793, "learning_rate": 3.844580328588609e-05, "loss": 0.2713, "num_tokens": 364966015.0, "step": 2792 }, { "epoch": 1.1145251396648044, "grad_norm": 0.2795141041278839, "learning_rate": 3.843715512243904e-05, "loss": 0.2453, "num_tokens": 365097087.0, "step": 2793 }, { "epoch": 1.114924181963288, "grad_norm": 0.25416192412376404, "learning_rate": 3.8428504842754446e-05, "loss": 0.2281, "num_tokens": 365228159.0, "step": 2794 }, { "epoch": 1.1153232242617717, "grad_norm": 0.24296224117279053, "learning_rate": 3.841985244850603e-05, "loss": 0.2154, "num_tokens": 365359231.0, "step": 2795 }, { "epoch": 1.1157222665602553, "grad_norm": 0.27065911889076233, "learning_rate": 3.8411197941367956e-05, "loss": 0.2231, "num_tokens": 365490303.0, "step": 2796 }, { "epoch": 1.116121308858739, "grad_norm": 0.26817914843559265, "learning_rate": 3.84025413230148e-05, "loss": 0.2439, "num_tokens": 365621375.0, "step": 2797 }, { "epoch": 1.1165203511572226, "grad_norm": 0.2529165744781494, "learning_rate": 3.839388259512152e-05, "loss": 0.2234, "num_tokens": 365752447.0, "step": 2798 }, { "epoch": 1.1169193934557062, "grad_norm": 0.26794466376304626, "learning_rate": 3.838522175936352e-05, "loss": 0.2456, "num_tokens": 365883519.0, "step": 2799 }, { "epoch": 1.1173184357541899, "grad_norm": 0.2593991756439209, "learning_rate": 3.837655881741658e-05, "loss": 0.2437, "num_tokens": 366014591.0, "step": 2800 }, { "epoch": 1.1177174780526735, "grad_norm": 0.2794917821884155, "learning_rate": 3.836789377095689e-05, "loss": 0.2564, "num_tokens": 366145663.0, "step": 2801 }, { "epoch": 1.1181165203511572, "grad_norm": 0.25045496225357056, "learning_rate": 3.835922662166107e-05, "loss": 0.2063, "num_tokens": 366276735.0, "step": 2802 }, { "epoch": 1.1185155626496408, "grad_norm": 0.2636718451976776, "learning_rate": 3.835055737120613e-05, "loss": 0.2118, "num_tokens": 366407807.0, "step": 2803 }, { "epoch": 1.1189146049481244, "grad_norm": 0.23951755464076996, "learning_rate": 3.8341886021269487e-05, "loss": 0.1913, "num_tokens": 366538879.0, "step": 2804 }, { "epoch": 1.119313647246608, "grad_norm": 0.3040103018283844, "learning_rate": 3.8333212573528957e-05, "loss": 0.256, "num_tokens": 366669951.0, "step": 2805 }, { "epoch": 1.1197126895450917, "grad_norm": 0.3024981915950775, "learning_rate": 3.8324537029662785e-05, "loss": 0.2289, "num_tokens": 366801023.0, "step": 2806 }, { "epoch": 1.1201117318435754, "grad_norm": 0.23946331441402435, "learning_rate": 3.83158593913496e-05, "loss": 0.1896, "num_tokens": 366932095.0, "step": 2807 }, { "epoch": 1.120510774142059, "grad_norm": 0.2544771730899811, "learning_rate": 3.830717966026845e-05, "loss": 0.2098, "num_tokens": 367063167.0, "step": 2808 }, { "epoch": 1.1209098164405427, "grad_norm": 0.29927363991737366, "learning_rate": 3.829849783809879e-05, "loss": 0.2714, "num_tokens": 367194239.0, "step": 2809 }, { "epoch": 1.1213088587390263, "grad_norm": 0.270767480134964, "learning_rate": 3.828981392652044e-05, "loss": 0.2459, "num_tokens": 367325311.0, "step": 2810 }, { "epoch": 1.12170790103751, "grad_norm": 0.312520831823349, "learning_rate": 3.828112792721369e-05, "loss": 0.2922, "num_tokens": 367456383.0, "step": 2811 }, { "epoch": 1.1221069433359936, "grad_norm": 0.2664516866207123, "learning_rate": 3.827243984185918e-05, "loss": 0.2467, "num_tokens": 367587455.0, "step": 2812 }, { "epoch": 1.1225059856344772, "grad_norm": 0.25445109605789185, "learning_rate": 3.8263749672137976e-05, "loss": 0.2721, "num_tokens": 367718527.0, "step": 2813 }, { "epoch": 1.1229050279329609, "grad_norm": 0.26117759943008423, "learning_rate": 3.825505741973155e-05, "loss": 0.2533, "num_tokens": 367849599.0, "step": 2814 }, { "epoch": 1.1233040702314445, "grad_norm": 0.2560768127441406, "learning_rate": 3.8246363086321784e-05, "loss": 0.2083, "num_tokens": 367980671.0, "step": 2815 }, { "epoch": 1.1237031125299282, "grad_norm": 0.2725880444049835, "learning_rate": 3.823766667359093e-05, "loss": 0.2767, "num_tokens": 368111743.0, "step": 2816 }, { "epoch": 1.1241021548284118, "grad_norm": 0.2523708641529083, "learning_rate": 3.822896818322167e-05, "loss": 0.2244, "num_tokens": 368242815.0, "step": 2817 }, { "epoch": 1.1245011971268954, "grad_norm": 0.23952580988407135, "learning_rate": 3.822026761689708e-05, "loss": 0.2333, "num_tokens": 368373887.0, "step": 2818 }, { "epoch": 1.124900239425379, "grad_norm": 0.2654784619808197, "learning_rate": 3.821156497630064e-05, "loss": 0.2471, "num_tokens": 368504959.0, "step": 2819 }, { "epoch": 1.1252992817238627, "grad_norm": 0.26839810609817505, "learning_rate": 3.8202860263116233e-05, "loss": 0.2296, "num_tokens": 368636031.0, "step": 2820 }, { "epoch": 1.1256983240223464, "grad_norm": 0.28835737705230713, "learning_rate": 3.819415347902812e-05, "loss": 0.2288, "num_tokens": 368767103.0, "step": 2821 }, { "epoch": 1.12609736632083, "grad_norm": 0.27858468890190125, "learning_rate": 3.8185444625721015e-05, "loss": 0.2592, "num_tokens": 368898175.0, "step": 2822 }, { "epoch": 1.1264964086193137, "grad_norm": 0.2687182128429413, "learning_rate": 3.8176733704879975e-05, "loss": 0.2501, "num_tokens": 369028470.0, "step": 2823 }, { "epoch": 1.1268954509177973, "grad_norm": 0.25309792160987854, "learning_rate": 3.816802071819049e-05, "loss": 0.2102, "num_tokens": 369159542.0, "step": 2824 }, { "epoch": 1.127294493216281, "grad_norm": 0.25167080760002136, "learning_rate": 3.8159305667338444e-05, "loss": 0.255, "num_tokens": 369290614.0, "step": 2825 }, { "epoch": 1.1276935355147646, "grad_norm": 0.25478294491767883, "learning_rate": 3.815058855401012e-05, "loss": 0.2493, "num_tokens": 369421686.0, "step": 2826 }, { "epoch": 1.1280925778132482, "grad_norm": 0.3649832010269165, "learning_rate": 3.814186937989219e-05, "loss": 0.229, "num_tokens": 369552758.0, "step": 2827 }, { "epoch": 1.1284916201117319, "grad_norm": 0.2681099772453308, "learning_rate": 3.8133148146671745e-05, "loss": 0.2457, "num_tokens": 369683830.0, "step": 2828 }, { "epoch": 1.1288906624102155, "grad_norm": 0.2522943615913391, "learning_rate": 3.812442485603624e-05, "loss": 0.2339, "num_tokens": 369814902.0, "step": 2829 }, { "epoch": 1.1292897047086992, "grad_norm": 0.23925776779651642, "learning_rate": 3.811569950967358e-05, "loss": 0.2034, "num_tokens": 369945974.0, "step": 2830 }, { "epoch": 1.1296887470071828, "grad_norm": 0.262844443321228, "learning_rate": 3.810697210927201e-05, "loss": 0.206, "num_tokens": 370077046.0, "step": 2831 }, { "epoch": 1.1300877893056664, "grad_norm": 0.2687090039253235, "learning_rate": 3.809824265652022e-05, "loss": 0.2439, "num_tokens": 370208118.0, "step": 2832 }, { "epoch": 1.13048683160415, "grad_norm": 0.279893159866333, "learning_rate": 3.808951115310727e-05, "loss": 0.2215, "num_tokens": 370339190.0, "step": 2833 }, { "epoch": 1.1308858739026337, "grad_norm": 0.28782913088798523, "learning_rate": 3.808077760072263e-05, "loss": 0.2359, "num_tokens": 370470262.0, "step": 2834 }, { "epoch": 1.1312849162011174, "grad_norm": 0.3185310363769531, "learning_rate": 3.8072042001056154e-05, "loss": 0.2932, "num_tokens": 370601334.0, "step": 2835 }, { "epoch": 1.131683958499601, "grad_norm": 0.23491151630878448, "learning_rate": 3.80633043557981e-05, "loss": 0.1846, "num_tokens": 370732406.0, "step": 2836 }, { "epoch": 1.1320830007980847, "grad_norm": 0.25792601704597473, "learning_rate": 3.805456466663913e-05, "loss": 0.2365, "num_tokens": 370863478.0, "step": 2837 }, { "epoch": 1.1324820430965683, "grad_norm": 0.25623756647109985, "learning_rate": 3.804582293527028e-05, "loss": 0.2335, "num_tokens": 370994550.0, "step": 2838 }, { "epoch": 1.132881085395052, "grad_norm": 0.248980313539505, "learning_rate": 3.8037079163383004e-05, "loss": 0.2379, "num_tokens": 371125622.0, "step": 2839 }, { "epoch": 1.1332801276935356, "grad_norm": 0.25982460379600525, "learning_rate": 3.8028333352669134e-05, "loss": 0.2475, "num_tokens": 371256694.0, "step": 2840 }, { "epoch": 1.1336791699920192, "grad_norm": 0.25303536653518677, "learning_rate": 3.80195855048209e-05, "loss": 0.2461, "num_tokens": 371387766.0, "step": 2841 }, { "epoch": 1.1340782122905029, "grad_norm": 0.2497238665819168, "learning_rate": 3.801083562153093e-05, "loss": 0.247, "num_tokens": 371518838.0, "step": 2842 }, { "epoch": 1.1344772545889865, "grad_norm": 0.2603890001773834, "learning_rate": 3.800208370449225e-05, "loss": 0.2552, "num_tokens": 371649910.0, "step": 2843 }, { "epoch": 1.1348762968874702, "grad_norm": 0.29318711161613464, "learning_rate": 3.799332975539828e-05, "loss": 0.2782, "num_tokens": 371780982.0, "step": 2844 }, { "epoch": 1.1352753391859538, "grad_norm": 0.2392076849937439, "learning_rate": 3.7984573775942815e-05, "loss": 0.2244, "num_tokens": 371912054.0, "step": 2845 }, { "epoch": 1.1356743814844374, "grad_norm": 0.2488396167755127, "learning_rate": 3.797581576782006e-05, "loss": 0.2498, "num_tokens": 372043126.0, "step": 2846 }, { "epoch": 1.1360734237829209, "grad_norm": 0.2687898278236389, "learning_rate": 3.796705573272461e-05, "loss": 0.2483, "num_tokens": 372174198.0, "step": 2847 }, { "epoch": 1.1364724660814045, "grad_norm": 0.2998049259185791, "learning_rate": 3.795829367235145e-05, "loss": 0.2526, "num_tokens": 372305270.0, "step": 2848 }, { "epoch": 1.1368715083798882, "grad_norm": 0.26774078607559204, "learning_rate": 3.794952958839595e-05, "loss": 0.2615, "num_tokens": 372436342.0, "step": 2849 }, { "epoch": 1.1372705506783718, "grad_norm": 0.2696712911128998, "learning_rate": 3.7940763482553884e-05, "loss": 0.2169, "num_tokens": 372567414.0, "step": 2850 }, { "epoch": 1.1376695929768554, "grad_norm": 0.2822346091270447, "learning_rate": 3.7931995356521414e-05, "loss": 0.2799, "num_tokens": 372698486.0, "step": 2851 }, { "epoch": 1.138068635275339, "grad_norm": 0.25830480456352234, "learning_rate": 3.7923225211995086e-05, "loss": 0.2052, "num_tokens": 372829558.0, "step": 2852 }, { "epoch": 1.1384676775738227, "grad_norm": 0.30076509714126587, "learning_rate": 3.791445305067185e-05, "loss": 0.2816, "num_tokens": 372960630.0, "step": 2853 }, { "epoch": 1.1388667198723064, "grad_norm": 0.27032312750816345, "learning_rate": 3.790567887424902e-05, "loss": 0.2661, "num_tokens": 373091702.0, "step": 2854 }, { "epoch": 1.13926576217079, "grad_norm": 0.2390596568584442, "learning_rate": 3.789690268442434e-05, "loss": 0.221, "num_tokens": 373222774.0, "step": 2855 }, { "epoch": 1.1396648044692737, "grad_norm": 0.23703311383724213, "learning_rate": 3.78881244828959e-05, "loss": 0.2217, "num_tokens": 373352030.0, "step": 2856 }, { "epoch": 1.1400638467677573, "grad_norm": 0.27515995502471924, "learning_rate": 3.787934427136221e-05, "loss": 0.2891, "num_tokens": 373483102.0, "step": 2857 }, { "epoch": 1.140462889066241, "grad_norm": 0.27156901359558105, "learning_rate": 3.787056205152217e-05, "loss": 0.2546, "num_tokens": 373614174.0, "step": 2858 }, { "epoch": 1.1408619313647246, "grad_norm": 0.23075884580612183, "learning_rate": 3.7861777825075035e-05, "loss": 0.2088, "num_tokens": 373745246.0, "step": 2859 }, { "epoch": 1.1412609736632082, "grad_norm": 0.26896026730537415, "learning_rate": 3.785299159372049e-05, "loss": 0.2535, "num_tokens": 373876318.0, "step": 2860 }, { "epoch": 1.1416600159616919, "grad_norm": 0.2903745174407959, "learning_rate": 3.784420335915857e-05, "loss": 0.2937, "num_tokens": 374007390.0, "step": 2861 }, { "epoch": 1.1420590582601755, "grad_norm": 0.23914146423339844, "learning_rate": 3.783541312308974e-05, "loss": 0.1983, "num_tokens": 374138462.0, "step": 2862 }, { "epoch": 1.1424581005586592, "grad_norm": 0.23927627503871918, "learning_rate": 3.7826620887214804e-05, "loss": 0.2183, "num_tokens": 374269534.0, "step": 2863 }, { "epoch": 1.1428571428571428, "grad_norm": 0.24611470103263855, "learning_rate": 3.7817826653235e-05, "loss": 0.2137, "num_tokens": 374400606.0, "step": 2864 }, { "epoch": 1.1432561851556264, "grad_norm": 0.25868409872055054, "learning_rate": 3.780903042285192e-05, "loss": 0.2573, "num_tokens": 374531678.0, "step": 2865 }, { "epoch": 1.14365522745411, "grad_norm": 0.254180908203125, "learning_rate": 3.780023219776755e-05, "loss": 0.2335, "num_tokens": 374662750.0, "step": 2866 }, { "epoch": 1.1440542697525937, "grad_norm": 0.24956095218658447, "learning_rate": 3.779143197968426e-05, "loss": 0.2173, "num_tokens": 374793822.0, "step": 2867 }, { "epoch": 1.1444533120510774, "grad_norm": 0.265445739030838, "learning_rate": 3.7782629770304826e-05, "loss": 0.2432, "num_tokens": 374924894.0, "step": 2868 }, { "epoch": 1.144852354349561, "grad_norm": 0.2925141155719757, "learning_rate": 3.7773825571332386e-05, "loss": 0.2456, "num_tokens": 375055966.0, "step": 2869 }, { "epoch": 1.1452513966480447, "grad_norm": 0.24947917461395264, "learning_rate": 3.7765019384470464e-05, "loss": 0.2047, "num_tokens": 375187038.0, "step": 2870 }, { "epoch": 1.1456504389465283, "grad_norm": 0.25192326307296753, "learning_rate": 3.775621121142298e-05, "loss": 0.2284, "num_tokens": 375318110.0, "step": 2871 }, { "epoch": 1.146049481245012, "grad_norm": 0.30038711428642273, "learning_rate": 3.774740105389423e-05, "loss": 0.2638, "num_tokens": 375449182.0, "step": 2872 }, { "epoch": 1.1464485235434956, "grad_norm": 0.2710505723953247, "learning_rate": 3.773858891358891e-05, "loss": 0.2053, "num_tokens": 375580254.0, "step": 2873 }, { "epoch": 1.1468475658419792, "grad_norm": 0.2926495671272278, "learning_rate": 3.7729774792212075e-05, "loss": 0.2599, "num_tokens": 375695843.0, "step": 2874 }, { "epoch": 1.1472466081404629, "grad_norm": 0.2604917585849762, "learning_rate": 3.772095869146918e-05, "loss": 0.2322, "num_tokens": 375826915.0, "step": 2875 }, { "epoch": 1.1476456504389465, "grad_norm": 0.2742220461368561, "learning_rate": 3.771214061306605e-05, "loss": 0.2791, "num_tokens": 375957987.0, "step": 2876 }, { "epoch": 1.1480446927374302, "grad_norm": 0.2682681381702423, "learning_rate": 3.770332055870892e-05, "loss": 0.2794, "num_tokens": 376089059.0, "step": 2877 }, { "epoch": 1.1484437350359138, "grad_norm": 0.30361586809158325, "learning_rate": 3.769449853010437e-05, "loss": 0.2996, "num_tokens": 376220131.0, "step": 2878 }, { "epoch": 1.1488427773343974, "grad_norm": 0.26706135272979736, "learning_rate": 3.768567452895938e-05, "loss": 0.2287, "num_tokens": 376351203.0, "step": 2879 }, { "epoch": 1.149241819632881, "grad_norm": 0.25713542103767395, "learning_rate": 3.7676848556981323e-05, "loss": 0.2274, "num_tokens": 376466999.0, "step": 2880 }, { "epoch": 1.1496408619313647, "grad_norm": 0.24531209468841553, "learning_rate": 3.766802061587794e-05, "loss": 0.2352, "num_tokens": 376598071.0, "step": 2881 }, { "epoch": 1.1500399042298484, "grad_norm": 0.24771881103515625, "learning_rate": 3.7659190707357356e-05, "loss": 0.2579, "num_tokens": 376729143.0, "step": 2882 }, { "epoch": 1.150438946528332, "grad_norm": 0.22290070354938507, "learning_rate": 3.765035883312807e-05, "loss": 0.2005, "num_tokens": 376860215.0, "step": 2883 }, { "epoch": 1.1508379888268156, "grad_norm": 0.2695004642009735, "learning_rate": 3.764152499489896e-05, "loss": 0.2479, "num_tokens": 376991287.0, "step": 2884 }, { "epoch": 1.1512370311252993, "grad_norm": 0.24251654744148254, "learning_rate": 3.763268919437931e-05, "loss": 0.2168, "num_tokens": 377122359.0, "step": 2885 }, { "epoch": 1.151636073423783, "grad_norm": 0.2708433270454407, "learning_rate": 3.762385143327876e-05, "loss": 0.2335, "num_tokens": 377253431.0, "step": 2886 }, { "epoch": 1.1520351157222666, "grad_norm": 0.2700658440589905, "learning_rate": 3.761501171330733e-05, "loss": 0.2517, "num_tokens": 377384503.0, "step": 2887 }, { "epoch": 1.1524341580207502, "grad_norm": 0.27727439999580383, "learning_rate": 3.760617003617542e-05, "loss": 0.2678, "num_tokens": 377515575.0, "step": 2888 }, { "epoch": 1.1528332003192339, "grad_norm": 0.2440616935491562, "learning_rate": 3.7597326403593816e-05, "loss": 0.2251, "num_tokens": 377646647.0, "step": 2889 }, { "epoch": 1.1532322426177175, "grad_norm": 0.26584094762802124, "learning_rate": 3.758848081727369e-05, "loss": 0.2591, "num_tokens": 377777719.0, "step": 2890 }, { "epoch": 1.1536312849162011, "grad_norm": 0.26218563318252563, "learning_rate": 3.757963327892656e-05, "loss": 0.2392, "num_tokens": 377908791.0, "step": 2891 }, { "epoch": 1.1540303272146848, "grad_norm": 0.2433926910161972, "learning_rate": 3.757078379026435e-05, "loss": 0.2057, "num_tokens": 378039863.0, "step": 2892 }, { "epoch": 1.1544293695131684, "grad_norm": 0.223220556974411, "learning_rate": 3.756193235299936e-05, "loss": 0.1686, "num_tokens": 378170935.0, "step": 2893 }, { "epoch": 1.154828411811652, "grad_norm": 0.25562769174575806, "learning_rate": 3.755307896884425e-05, "loss": 0.2218, "num_tokens": 378302007.0, "step": 2894 }, { "epoch": 1.1552274541101357, "grad_norm": 0.29452580213546753, "learning_rate": 3.754422363951208e-05, "loss": 0.256, "num_tokens": 378433079.0, "step": 2895 }, { "epoch": 1.1556264964086194, "grad_norm": 0.2804104685783386, "learning_rate": 3.753536636671626e-05, "loss": 0.2266, "num_tokens": 378564151.0, "step": 2896 }, { "epoch": 1.156025538707103, "grad_norm": 0.24637949466705322, "learning_rate": 3.75265071521706e-05, "loss": 0.1834, "num_tokens": 378695223.0, "step": 2897 }, { "epoch": 1.1564245810055866, "grad_norm": 0.2601831555366516, "learning_rate": 3.751764599758927e-05, "loss": 0.2218, "num_tokens": 378826295.0, "step": 2898 }, { "epoch": 1.1568236233040703, "grad_norm": 0.25022193789482117, "learning_rate": 3.750878290468683e-05, "loss": 0.2168, "num_tokens": 378957367.0, "step": 2899 }, { "epoch": 1.157222665602554, "grad_norm": 0.267226904630661, "learning_rate": 3.7499917875178195e-05, "loss": 0.2454, "num_tokens": 379088439.0, "step": 2900 }, { "epoch": 1.1576217079010376, "grad_norm": 0.2445257604122162, "learning_rate": 3.749105091077867e-05, "loss": 0.2389, "num_tokens": 379219511.0, "step": 2901 }, { "epoch": 1.1580207501995212, "grad_norm": 0.2599896192550659, "learning_rate": 3.748218201320392e-05, "loss": 0.2355, "num_tokens": 379350583.0, "step": 2902 }, { "epoch": 1.1584197924980049, "grad_norm": 0.2596380412578583, "learning_rate": 3.747331118417001e-05, "loss": 0.2341, "num_tokens": 379481655.0, "step": 2903 }, { "epoch": 1.1588188347964885, "grad_norm": 0.2501952052116394, "learning_rate": 3.746443842539336e-05, "loss": 0.2158, "num_tokens": 379612727.0, "step": 2904 }, { "epoch": 1.1592178770949721, "grad_norm": 0.27322301268577576, "learning_rate": 3.745556373859076e-05, "loss": 0.2383, "num_tokens": 379743799.0, "step": 2905 }, { "epoch": 1.1596169193934558, "grad_norm": 0.25564631819725037, "learning_rate": 3.744668712547938e-05, "loss": 0.1985, "num_tokens": 379874871.0, "step": 2906 }, { "epoch": 1.1600159616919394, "grad_norm": 0.27758461236953735, "learning_rate": 3.743780858777676e-05, "loss": 0.2796, "num_tokens": 380005943.0, "step": 2907 }, { "epoch": 1.160415003990423, "grad_norm": 0.28747567534446716, "learning_rate": 3.742892812720082e-05, "loss": 0.2594, "num_tokens": 380137015.0, "step": 2908 }, { "epoch": 1.1608140462889067, "grad_norm": 0.23891855776309967, "learning_rate": 3.742004574546984e-05, "loss": 0.219, "num_tokens": 380268087.0, "step": 2909 }, { "epoch": 1.1612130885873904, "grad_norm": 0.3024536967277527, "learning_rate": 3.741116144430249e-05, "loss": 0.2934, "num_tokens": 380399159.0, "step": 2910 }, { "epoch": 1.161612130885874, "grad_norm": 0.28662776947021484, "learning_rate": 3.740227522541778e-05, "loss": 0.2698, "num_tokens": 380530231.0, "step": 2911 }, { "epoch": 1.1620111731843576, "grad_norm": 0.2583773136138916, "learning_rate": 3.7393387090535135e-05, "loss": 0.2271, "num_tokens": 380661303.0, "step": 2912 }, { "epoch": 1.1624102154828413, "grad_norm": 0.28213998675346375, "learning_rate": 3.73844970413743e-05, "loss": 0.2645, "num_tokens": 380792375.0, "step": 2913 }, { "epoch": 1.162809257781325, "grad_norm": 0.25343674421310425, "learning_rate": 3.7375605079655434e-05, "loss": 0.1796, "num_tokens": 380923447.0, "step": 2914 }, { "epoch": 1.1632083000798086, "grad_norm": 0.27742281556129456, "learning_rate": 3.7366711207099053e-05, "loss": 0.2544, "num_tokens": 381054519.0, "step": 2915 }, { "epoch": 1.1636073423782922, "grad_norm": 0.27305638790130615, "learning_rate": 3.7357815425426016e-05, "loss": 0.252, "num_tokens": 381185591.0, "step": 2916 }, { "epoch": 1.1640063846767759, "grad_norm": 0.28595665097236633, "learning_rate": 3.73489177363576e-05, "loss": 0.2717, "num_tokens": 381316663.0, "step": 2917 }, { "epoch": 1.1644054269752593, "grad_norm": 0.2667753994464874, "learning_rate": 3.7340018141615405e-05, "loss": 0.2482, "num_tokens": 381447735.0, "step": 2918 }, { "epoch": 1.164804469273743, "grad_norm": 0.26339656114578247, "learning_rate": 3.733111664292143e-05, "loss": 0.2482, "num_tokens": 381578807.0, "step": 2919 }, { "epoch": 1.1652035115722266, "grad_norm": 0.260299950838089, "learning_rate": 3.732221324199802e-05, "loss": 0.2499, "num_tokens": 381707719.0, "step": 2920 }, { "epoch": 1.1656025538707102, "grad_norm": 0.28312715888023376, "learning_rate": 3.731330794056793e-05, "loss": 0.2379, "num_tokens": 381838791.0, "step": 2921 }, { "epoch": 1.1660015961691939, "grad_norm": 0.26955312490463257, "learning_rate": 3.730440074035421e-05, "loss": 0.2086, "num_tokens": 381969863.0, "step": 2922 }, { "epoch": 1.1664006384676775, "grad_norm": 0.2640886902809143, "learning_rate": 3.7295491643080354e-05, "loss": 0.2479, "num_tokens": 382100935.0, "step": 2923 }, { "epoch": 1.1667996807661611, "grad_norm": 0.2671072781085968, "learning_rate": 3.728658065047017e-05, "loss": 0.2588, "num_tokens": 382232007.0, "step": 2924 }, { "epoch": 1.1671987230646448, "grad_norm": 0.2503524422645569, "learning_rate": 3.7277667764247864e-05, "loss": 0.2238, "num_tokens": 382363079.0, "step": 2925 }, { "epoch": 1.1675977653631284, "grad_norm": 0.2593925893306732, "learning_rate": 3.7268752986138e-05, "loss": 0.2341, "num_tokens": 382494151.0, "step": 2926 }, { "epoch": 1.167996807661612, "grad_norm": 0.279983252286911, "learning_rate": 3.7259836317865475e-05, "loss": 0.2444, "num_tokens": 382625223.0, "step": 2927 }, { "epoch": 1.1683958499600957, "grad_norm": 0.2673163115978241, "learning_rate": 3.7250917761155623e-05, "loss": 0.238, "num_tokens": 382756295.0, "step": 2928 }, { "epoch": 1.1687948922585794, "grad_norm": 0.2572965621948242, "learning_rate": 3.724199731773407e-05, "loss": 0.2168, "num_tokens": 382887367.0, "step": 2929 }, { "epoch": 1.169193934557063, "grad_norm": 0.2557694613933563, "learning_rate": 3.723307498932686e-05, "loss": 0.2311, "num_tokens": 383018439.0, "step": 2930 }, { "epoch": 1.1695929768555466, "grad_norm": 0.2587423622608185, "learning_rate": 3.7224150777660364e-05, "loss": 0.2331, "num_tokens": 383149511.0, "step": 2931 }, { "epoch": 1.1699920191540303, "grad_norm": 0.29109978675842285, "learning_rate": 3.721522468446134e-05, "loss": 0.2914, "num_tokens": 383280583.0, "step": 2932 }, { "epoch": 1.170391061452514, "grad_norm": 0.2434864193201065, "learning_rate": 3.72062967114569e-05, "loss": 0.2116, "num_tokens": 383411655.0, "step": 2933 }, { "epoch": 1.1707901037509976, "grad_norm": 0.2788693606853485, "learning_rate": 3.7197366860374534e-05, "loss": 0.2782, "num_tokens": 383542727.0, "step": 2934 }, { "epoch": 1.1711891460494812, "grad_norm": 0.2718040645122528, "learning_rate": 3.7188435132942065e-05, "loss": 0.2627, "num_tokens": 383673799.0, "step": 2935 }, { "epoch": 1.1715881883479649, "grad_norm": 0.2948501706123352, "learning_rate": 3.7179501530887734e-05, "loss": 0.3002, "num_tokens": 383804871.0, "step": 2936 }, { "epoch": 1.1719872306464485, "grad_norm": 0.29716283082962036, "learning_rate": 3.7170566055940075e-05, "loss": 0.2545, "num_tokens": 383935943.0, "step": 2937 }, { "epoch": 1.1723862729449321, "grad_norm": 0.26395025849342346, "learning_rate": 3.7161628709828026e-05, "loss": 0.2554, "num_tokens": 384067015.0, "step": 2938 }, { "epoch": 1.1727853152434158, "grad_norm": 0.2691676914691925, "learning_rate": 3.7152689494280904e-05, "loss": 0.2366, "num_tokens": 384198087.0, "step": 2939 }, { "epoch": 1.1731843575418994, "grad_norm": 0.29525014758110046, "learning_rate": 3.714374841102833e-05, "loss": 0.265, "num_tokens": 384329159.0, "step": 2940 }, { "epoch": 1.173583399840383, "grad_norm": 0.2510020434856415, "learning_rate": 3.713480546180034e-05, "loss": 0.2117, "num_tokens": 384460231.0, "step": 2941 }, { "epoch": 1.1739824421388667, "grad_norm": 0.27139440178871155, "learning_rate": 3.712586064832731e-05, "loss": 0.2111, "num_tokens": 384591303.0, "step": 2942 }, { "epoch": 1.1743814844373504, "grad_norm": 0.2557418644428253, "learning_rate": 3.711691397233997e-05, "loss": 0.2431, "num_tokens": 384722375.0, "step": 2943 }, { "epoch": 1.174780526735834, "grad_norm": 0.25793367624282837, "learning_rate": 3.710796543556943e-05, "loss": 0.2466, "num_tokens": 384853447.0, "step": 2944 }, { "epoch": 1.1751795690343176, "grad_norm": 0.23752456903457642, "learning_rate": 3.709901503974715e-05, "loss": 0.1865, "num_tokens": 384984519.0, "step": 2945 }, { "epoch": 1.1755786113328013, "grad_norm": 0.2891918420791626, "learning_rate": 3.709006278660493e-05, "loss": 0.2407, "num_tokens": 385115591.0, "step": 2946 }, { "epoch": 1.175977653631285, "grad_norm": 0.2684735059738159, "learning_rate": 3.708110867787496e-05, "loss": 0.2282, "num_tokens": 385246663.0, "step": 2947 }, { "epoch": 1.1763766959297686, "grad_norm": 0.27970612049102783, "learning_rate": 3.7072152715289775e-05, "loss": 0.2567, "num_tokens": 385371530.0, "step": 2948 }, { "epoch": 1.1767757382282522, "grad_norm": 0.23270869255065918, "learning_rate": 3.706319490058227e-05, "loss": 0.1719, "num_tokens": 385502602.0, "step": 2949 }, { "epoch": 1.1771747805267359, "grad_norm": 0.28960996866226196, "learning_rate": 3.70542352354857e-05, "loss": 0.2353, "num_tokens": 385633674.0, "step": 2950 }, { "epoch": 1.1775738228252195, "grad_norm": 0.2800772190093994, "learning_rate": 3.7045273721733676e-05, "loss": 0.2423, "num_tokens": 385764746.0, "step": 2951 }, { "epoch": 1.1779728651237031, "grad_norm": 0.2656400203704834, "learning_rate": 3.7036310361060164e-05, "loss": 0.2158, "num_tokens": 385895818.0, "step": 2952 }, { "epoch": 1.1783719074221868, "grad_norm": 0.2728855609893799, "learning_rate": 3.7027345155199496e-05, "loss": 0.23, "num_tokens": 386026890.0, "step": 2953 }, { "epoch": 1.1787709497206704, "grad_norm": 0.24832309782505035, "learning_rate": 3.701837810588635e-05, "loss": 0.2258, "num_tokens": 386157962.0, "step": 2954 }, { "epoch": 1.179169992019154, "grad_norm": 0.25434252619743347, "learning_rate": 3.7009409214855765e-05, "loss": 0.1962, "num_tokens": 386289034.0, "step": 2955 }, { "epoch": 1.1795690343176377, "grad_norm": 0.2805410623550415, "learning_rate": 3.700043848384316e-05, "loss": 0.2393, "num_tokens": 386420106.0, "step": 2956 }, { "epoch": 1.1799680766161214, "grad_norm": 0.28157228231430054, "learning_rate": 3.6991465914584256e-05, "loss": 0.2528, "num_tokens": 386551178.0, "step": 2957 }, { "epoch": 1.180367118914605, "grad_norm": 0.2970176935195923, "learning_rate": 3.6982491508815174e-05, "loss": 0.2655, "num_tokens": 386682250.0, "step": 2958 }, { "epoch": 1.1807661612130886, "grad_norm": 0.26235827803611755, "learning_rate": 3.697351526827239e-05, "loss": 0.1969, "num_tokens": 386813322.0, "step": 2959 }, { "epoch": 1.1811652035115723, "grad_norm": 0.3053875267505646, "learning_rate": 3.696453719469271e-05, "loss": 0.271, "num_tokens": 386944394.0, "step": 2960 }, { "epoch": 1.181564245810056, "grad_norm": 0.2724939286708832, "learning_rate": 3.6955557289813303e-05, "loss": 0.2138, "num_tokens": 387075466.0, "step": 2961 }, { "epoch": 1.1819632881085396, "grad_norm": 0.2724032700061798, "learning_rate": 3.6946575555371715e-05, "loss": 0.2309, "num_tokens": 387206538.0, "step": 2962 }, { "epoch": 1.1823623304070232, "grad_norm": 0.2590891420841217, "learning_rate": 3.693759199310581e-05, "loss": 0.1981, "num_tokens": 387337610.0, "step": 2963 }, { "epoch": 1.1827613727055069, "grad_norm": 0.28421398997306824, "learning_rate": 3.692860660475383e-05, "loss": 0.2348, "num_tokens": 387468682.0, "step": 2964 }, { "epoch": 1.1831604150039905, "grad_norm": 0.279293954372406, "learning_rate": 3.691961939205436e-05, "loss": 0.2416, "num_tokens": 387599754.0, "step": 2965 }, { "epoch": 1.1835594573024741, "grad_norm": 0.2894001603126526, "learning_rate": 3.691063035674635e-05, "loss": 0.2413, "num_tokens": 387730826.0, "step": 2966 }, { "epoch": 1.1839584996009578, "grad_norm": 0.2624591588973999, "learning_rate": 3.6901639500569095e-05, "loss": 0.2296, "num_tokens": 387861898.0, "step": 2967 }, { "epoch": 1.1843575418994414, "grad_norm": 0.28523287177085876, "learning_rate": 3.689264682526224e-05, "loss": 0.2591, "num_tokens": 387992970.0, "step": 2968 }, { "epoch": 1.184756584197925, "grad_norm": 0.2582738399505615, "learning_rate": 3.688365233256578e-05, "loss": 0.2371, "num_tokens": 388124042.0, "step": 2969 }, { "epoch": 1.1851556264964087, "grad_norm": 0.2630091607570648, "learning_rate": 3.687465602422006e-05, "loss": 0.2436, "num_tokens": 388255114.0, "step": 2970 }, { "epoch": 1.1855546687948924, "grad_norm": 0.2684389054775238, "learning_rate": 3.686565790196579e-05, "loss": 0.2279, "num_tokens": 388386186.0, "step": 2971 }, { "epoch": 1.1859537110933758, "grad_norm": 0.26433679461479187, "learning_rate": 3.685665796754402e-05, "loss": 0.2423, "num_tokens": 388517258.0, "step": 2972 }, { "epoch": 1.1863527533918594, "grad_norm": 0.2555650770664215, "learning_rate": 3.684765622269617e-05, "loss": 0.2384, "num_tokens": 388648330.0, "step": 2973 }, { "epoch": 1.186751795690343, "grad_norm": 0.2735762894153595, "learning_rate": 3.683865266916396e-05, "loss": 0.2427, "num_tokens": 388779402.0, "step": 2974 }, { "epoch": 1.1871508379888267, "grad_norm": 0.26956430077552795, "learning_rate": 3.682964730868952e-05, "loss": 0.2521, "num_tokens": 388910474.0, "step": 2975 }, { "epoch": 1.1875498802873103, "grad_norm": 0.2561458945274353, "learning_rate": 3.682064014301529e-05, "loss": 0.2413, "num_tokens": 389041546.0, "step": 2976 }, { "epoch": 1.187948922585794, "grad_norm": 0.2953096926212311, "learning_rate": 3.681163117388409e-05, "loss": 0.2751, "num_tokens": 389172618.0, "step": 2977 }, { "epoch": 1.1883479648842776, "grad_norm": 0.2693132758140564, "learning_rate": 3.680262040303905e-05, "loss": 0.2364, "num_tokens": 389303690.0, "step": 2978 }, { "epoch": 1.1887470071827613, "grad_norm": 0.26382577419281006, "learning_rate": 3.6793607832223686e-05, "loss": 0.2663, "num_tokens": 389434762.0, "step": 2979 }, { "epoch": 1.189146049481245, "grad_norm": 0.24627913534641266, "learning_rate": 3.6784593463181835e-05, "loss": 0.2388, "num_tokens": 389565834.0, "step": 2980 }, { "epoch": 1.1895450917797286, "grad_norm": 0.24479223787784576, "learning_rate": 3.67755772976577e-05, "loss": 0.2254, "num_tokens": 389696906.0, "step": 2981 }, { "epoch": 1.1899441340782122, "grad_norm": 0.2513037323951721, "learning_rate": 3.676655933739583e-05, "loss": 0.2262, "num_tokens": 389827978.0, "step": 2982 }, { "epoch": 1.1903431763766958, "grad_norm": 0.251008540391922, "learning_rate": 3.67575395841411e-05, "loss": 0.2338, "num_tokens": 389959050.0, "step": 2983 }, { "epoch": 1.1907422186751795, "grad_norm": 0.2305690348148346, "learning_rate": 3.6748518039638775e-05, "loss": 0.1761, "num_tokens": 390090122.0, "step": 2984 }, { "epoch": 1.1911412609736631, "grad_norm": 0.2566947340965271, "learning_rate": 3.67394947056344e-05, "loss": 0.2249, "num_tokens": 390221194.0, "step": 2985 }, { "epoch": 1.1915403032721468, "grad_norm": 0.31115996837615967, "learning_rate": 3.6730469583873944e-05, "loss": 0.2778, "num_tokens": 390352266.0, "step": 2986 }, { "epoch": 1.1919393455706304, "grad_norm": 0.29044902324676514, "learning_rate": 3.672144267610366e-05, "loss": 0.2727, "num_tokens": 390483338.0, "step": 2987 }, { "epoch": 1.192338387869114, "grad_norm": 0.25906234979629517, "learning_rate": 3.6712413984070186e-05, "loss": 0.2027, "num_tokens": 390614410.0, "step": 2988 }, { "epoch": 1.1927374301675977, "grad_norm": 0.2405567318201065, "learning_rate": 3.6703383509520484e-05, "loss": 0.1896, "num_tokens": 390745482.0, "step": 2989 }, { "epoch": 1.1931364724660813, "grad_norm": 0.2548977732658386, "learning_rate": 3.669435125420186e-05, "loss": 0.2273, "num_tokens": 390876554.0, "step": 2990 }, { "epoch": 1.193535514764565, "grad_norm": 0.2927868962287903, "learning_rate": 3.668531721986198e-05, "loss": 0.2457, "num_tokens": 391007626.0, "step": 2991 }, { "epoch": 1.1939345570630486, "grad_norm": 0.24851462244987488, "learning_rate": 3.667628140824885e-05, "loss": 0.2152, "num_tokens": 391138698.0, "step": 2992 }, { "epoch": 1.1943335993615323, "grad_norm": 0.2721596658229828, "learning_rate": 3.6667243821110796e-05, "loss": 0.2356, "num_tokens": 391269770.0, "step": 2993 }, { "epoch": 1.194732641660016, "grad_norm": 0.2898278832435608, "learning_rate": 3.665820446019652e-05, "loss": 0.287, "num_tokens": 391400842.0, "step": 2994 }, { "epoch": 1.1951316839584996, "grad_norm": 0.2757013738155365, "learning_rate": 3.664916332725506e-05, "loss": 0.202, "num_tokens": 391531914.0, "step": 2995 }, { "epoch": 1.1955307262569832, "grad_norm": 0.2619911730289459, "learning_rate": 3.6640120424035776e-05, "loss": 0.2287, "num_tokens": 391662986.0, "step": 2996 }, { "epoch": 1.1959297685554668, "grad_norm": 0.2807578146457672, "learning_rate": 3.6631075752288396e-05, "loss": 0.2455, "num_tokens": 391794058.0, "step": 2997 }, { "epoch": 1.1963288108539505, "grad_norm": 0.2649844288825989, "learning_rate": 3.662202931376298e-05, "loss": 0.2198, "num_tokens": 391925130.0, "step": 2998 }, { "epoch": 1.1967278531524341, "grad_norm": 0.2613646388053894, "learning_rate": 3.661298111020992e-05, "loss": 0.2316, "num_tokens": 392056202.0, "step": 2999 }, { "epoch": 1.1971268954509178, "grad_norm": 0.2787191867828369, "learning_rate": 3.6603931143379966e-05, "loss": 0.2462, "num_tokens": 392187274.0, "step": 3000 }, { "epoch": 1.1975259377494014, "grad_norm": 0.28249889612197876, "learning_rate": 3.659487941502421e-05, "loss": 0.2661, "num_tokens": 392318346.0, "step": 3001 }, { "epoch": 1.197924980047885, "grad_norm": 0.24696393311023712, "learning_rate": 3.658582592689406e-05, "loss": 0.2367, "num_tokens": 392449418.0, "step": 3002 }, { "epoch": 1.1983240223463687, "grad_norm": 0.26505082845687866, "learning_rate": 3.657677068074129e-05, "loss": 0.2329, "num_tokens": 392580490.0, "step": 3003 }, { "epoch": 1.1987230646448523, "grad_norm": 0.27141037583351135, "learning_rate": 3.656771367831801e-05, "loss": 0.239, "num_tokens": 392711562.0, "step": 3004 }, { "epoch": 1.199122106943336, "grad_norm": 0.23604846000671387, "learning_rate": 3.6558654921376663e-05, "loss": 0.1906, "num_tokens": 392842634.0, "step": 3005 }, { "epoch": 1.1995211492418196, "grad_norm": 0.25986814498901367, "learning_rate": 3.6549594411670023e-05, "loss": 0.2116, "num_tokens": 392973706.0, "step": 3006 }, { "epoch": 1.1999201915403033, "grad_norm": 0.2723316550254822, "learning_rate": 3.6540532150951233e-05, "loss": 0.2524, "num_tokens": 393104778.0, "step": 3007 }, { "epoch": 1.200319233838787, "grad_norm": 0.286944180727005, "learning_rate": 3.6531468140973747e-05, "loss": 0.2454, "num_tokens": 393235850.0, "step": 3008 }, { "epoch": 1.2007182761372706, "grad_norm": 0.30599290132522583, "learning_rate": 3.652240238349136e-05, "loss": 0.2526, "num_tokens": 393366922.0, "step": 3009 }, { "epoch": 1.2011173184357542, "grad_norm": 0.3482339680194855, "learning_rate": 3.651333488025822e-05, "loss": 0.2363, "num_tokens": 393497994.0, "step": 3010 }, { "epoch": 1.2015163607342378, "grad_norm": 0.2667556703090668, "learning_rate": 3.65042656330288e-05, "loss": 0.2142, "num_tokens": 393629066.0, "step": 3011 }, { "epoch": 1.2019154030327215, "grad_norm": 0.2906259298324585, "learning_rate": 3.649519464355792e-05, "loss": 0.2286, "num_tokens": 393744940.0, "step": 3012 }, { "epoch": 1.2023144453312051, "grad_norm": 0.2440439611673355, "learning_rate": 3.648612191360073e-05, "loss": 0.18, "num_tokens": 393876012.0, "step": 3013 }, { "epoch": 1.2027134876296888, "grad_norm": 0.25131550431251526, "learning_rate": 3.647704744491271e-05, "loss": 0.1946, "num_tokens": 394007084.0, "step": 3014 }, { "epoch": 1.2031125299281724, "grad_norm": 0.2564995288848877, "learning_rate": 3.64679712392497e-05, "loss": 0.2221, "num_tokens": 394138156.0, "step": 3015 }, { "epoch": 1.203511572226656, "grad_norm": 0.28217294812202454, "learning_rate": 3.645889329836785e-05, "loss": 0.243, "num_tokens": 394269228.0, "step": 3016 }, { "epoch": 1.2039106145251397, "grad_norm": 0.3172319531440735, "learning_rate": 3.644981362402366e-05, "loss": 0.2525, "num_tokens": 394400300.0, "step": 3017 }, { "epoch": 1.2043096568236233, "grad_norm": 0.3127821981906891, "learning_rate": 3.6440732217973974e-05, "loss": 0.2579, "num_tokens": 394531372.0, "step": 3018 }, { "epoch": 1.204708699122107, "grad_norm": 0.25579169392585754, "learning_rate": 3.643164908197593e-05, "loss": 0.224, "num_tokens": 394662444.0, "step": 3019 }, { "epoch": 1.2051077414205906, "grad_norm": 0.2564641833305359, "learning_rate": 3.642256421778705e-05, "loss": 0.2069, "num_tokens": 394793516.0, "step": 3020 }, { "epoch": 1.2055067837190743, "grad_norm": 0.28162169456481934, "learning_rate": 3.641347762716518e-05, "loss": 0.2595, "num_tokens": 394923609.0, "step": 3021 }, { "epoch": 1.205905826017558, "grad_norm": 0.2588123679161072, "learning_rate": 3.6404389311868475e-05, "loss": 0.2246, "num_tokens": 395054681.0, "step": 3022 }, { "epoch": 1.2063048683160416, "grad_norm": 0.2844312787055969, "learning_rate": 3.6395299273655445e-05, "loss": 0.2582, "num_tokens": 395185753.0, "step": 3023 }, { "epoch": 1.2067039106145252, "grad_norm": 0.25516951084136963, "learning_rate": 3.6386207514284924e-05, "loss": 0.2158, "num_tokens": 395316825.0, "step": 3024 }, { "epoch": 1.2071029529130088, "grad_norm": 0.2642728090286255, "learning_rate": 3.6377114035516084e-05, "loss": 0.2186, "num_tokens": 395447897.0, "step": 3025 }, { "epoch": 1.2075019952114925, "grad_norm": 0.27571019530296326, "learning_rate": 3.6368018839108436e-05, "loss": 0.237, "num_tokens": 395578969.0, "step": 3026 }, { "epoch": 1.2079010375099761, "grad_norm": 0.24815049767494202, "learning_rate": 3.635892192682181e-05, "loss": 0.1922, "num_tokens": 395710041.0, "step": 3027 }, { "epoch": 1.2083000798084598, "grad_norm": 0.2579498887062073, "learning_rate": 3.634982330041636e-05, "loss": 0.2169, "num_tokens": 395841113.0, "step": 3028 }, { "epoch": 1.2086991221069434, "grad_norm": 0.30377253890037537, "learning_rate": 3.6340722961652615e-05, "loss": 0.2734, "num_tokens": 395972185.0, "step": 3029 }, { "epoch": 1.209098164405427, "grad_norm": 0.30332937836647034, "learning_rate": 3.633162091229138e-05, "loss": 0.2663, "num_tokens": 396103257.0, "step": 3030 }, { "epoch": 1.2094972067039107, "grad_norm": 0.2663392722606659, "learning_rate": 3.632251715409382e-05, "loss": 0.2427, "num_tokens": 396234329.0, "step": 3031 }, { "epoch": 1.2098962490023943, "grad_norm": 0.24197375774383545, "learning_rate": 3.6313411688821444e-05, "loss": 0.1685, "num_tokens": 396365401.0, "step": 3032 }, { "epoch": 1.210295291300878, "grad_norm": 0.27894771099090576, "learning_rate": 3.6304304518236057e-05, "loss": 0.2297, "num_tokens": 396496473.0, "step": 3033 }, { "epoch": 1.2106943335993616, "grad_norm": 0.25084245204925537, "learning_rate": 3.6295195644099815e-05, "loss": 0.2247, "num_tokens": 396627545.0, "step": 3034 }, { "epoch": 1.2110933758978453, "grad_norm": 0.27467721700668335, "learning_rate": 3.628608506817522e-05, "loss": 0.2438, "num_tokens": 396758617.0, "step": 3035 }, { "epoch": 1.211492418196329, "grad_norm": 0.2581498920917511, "learning_rate": 3.627697279222506e-05, "loss": 0.2454, "num_tokens": 396889689.0, "step": 3036 }, { "epoch": 1.2118914604948126, "grad_norm": 0.2822883725166321, "learning_rate": 3.626785881801247e-05, "loss": 0.2206, "num_tokens": 397020761.0, "step": 3037 }, { "epoch": 1.2122905027932962, "grad_norm": 0.32666370272636414, "learning_rate": 3.6258743147300944e-05, "loss": 0.238, "num_tokens": 397151833.0, "step": 3038 }, { "epoch": 1.2126895450917798, "grad_norm": 0.3268517851829529, "learning_rate": 3.624962578185426e-05, "loss": 0.2638, "num_tokens": 397282905.0, "step": 3039 }, { "epoch": 1.2130885873902635, "grad_norm": 0.24769780039787292, "learning_rate": 3.624050672343656e-05, "loss": 0.1973, "num_tokens": 397413977.0, "step": 3040 }, { "epoch": 1.2134876296887471, "grad_norm": 0.3010913133621216, "learning_rate": 3.623138597381228e-05, "loss": 0.2231, "num_tokens": 397530562.0, "step": 3041 }, { "epoch": 1.2138866719872308, "grad_norm": 0.26657021045684814, "learning_rate": 3.6222263534746204e-05, "loss": 0.2634, "num_tokens": 397661634.0, "step": 3042 }, { "epoch": 1.2142857142857142, "grad_norm": 0.2552860379219055, "learning_rate": 3.621313940800345e-05, "loss": 0.2062, "num_tokens": 397792706.0, "step": 3043 }, { "epoch": 1.2146847565841978, "grad_norm": 0.27954405546188354, "learning_rate": 3.6204013595349436e-05, "loss": 0.2346, "num_tokens": 397923778.0, "step": 3044 }, { "epoch": 1.2150837988826815, "grad_norm": 0.31262022256851196, "learning_rate": 3.619488609854993e-05, "loss": 0.2724, "num_tokens": 398054850.0, "step": 3045 }, { "epoch": 1.2154828411811651, "grad_norm": 0.25687533617019653, "learning_rate": 3.6185756919371026e-05, "loss": 0.229, "num_tokens": 398185922.0, "step": 3046 }, { "epoch": 1.2158818834796488, "grad_norm": 0.249969482421875, "learning_rate": 3.617662605957912e-05, "loss": 0.1999, "num_tokens": 398305278.0, "step": 3047 }, { "epoch": 1.2162809257781324, "grad_norm": 0.2509939968585968, "learning_rate": 3.6167493520940946e-05, "loss": 0.2309, "num_tokens": 398436350.0, "step": 3048 }, { "epoch": 1.216679968076616, "grad_norm": 0.27055853605270386, "learning_rate": 3.615835930522358e-05, "loss": 0.2189, "num_tokens": 398567422.0, "step": 3049 }, { "epoch": 1.2170790103750997, "grad_norm": 0.3258304297924042, "learning_rate": 3.61492234141944e-05, "loss": 0.2507, "num_tokens": 398698494.0, "step": 3050 }, { "epoch": 1.2174780526735833, "grad_norm": 0.2996593713760376, "learning_rate": 3.6140085849621116e-05, "loss": 0.2207, "num_tokens": 398829566.0, "step": 3051 }, { "epoch": 1.217877094972067, "grad_norm": 0.2739870846271515, "learning_rate": 3.613094661327176e-05, "loss": 0.2263, "num_tokens": 398960638.0, "step": 3052 }, { "epoch": 1.2182761372705506, "grad_norm": 0.3096087872982025, "learning_rate": 3.6121805706914694e-05, "loss": 0.2748, "num_tokens": 399091710.0, "step": 3053 }, { "epoch": 1.2186751795690343, "grad_norm": 0.2530144453048706, "learning_rate": 3.61126631323186e-05, "loss": 0.2064, "num_tokens": 399222782.0, "step": 3054 }, { "epoch": 1.219074221867518, "grad_norm": 0.30333560705184937, "learning_rate": 3.610351889125247e-05, "loss": 0.3227, "num_tokens": 399353854.0, "step": 3055 }, { "epoch": 1.2194732641660015, "grad_norm": 0.2614760398864746, "learning_rate": 3.6094372985485634e-05, "loss": 0.2039, "num_tokens": 399484926.0, "step": 3056 }, { "epoch": 1.2198723064644852, "grad_norm": 0.2816551923751831, "learning_rate": 3.6085225416787745e-05, "loss": 0.2673, "num_tokens": 399615998.0, "step": 3057 }, { "epoch": 1.2202713487629688, "grad_norm": 0.2734028697013855, "learning_rate": 3.6076076186928766e-05, "loss": 0.2075, "num_tokens": 399747070.0, "step": 3058 }, { "epoch": 1.2206703910614525, "grad_norm": 0.24348852038383484, "learning_rate": 3.6066925297678996e-05, "loss": 0.2037, "num_tokens": 399878142.0, "step": 3059 }, { "epoch": 1.2210694333599361, "grad_norm": 0.2735104262828827, "learning_rate": 3.605777275080903e-05, "loss": 0.2278, "num_tokens": 400009214.0, "step": 3060 }, { "epoch": 1.2214684756584198, "grad_norm": 0.30788177251815796, "learning_rate": 3.6048618548089814e-05, "loss": 0.2481, "num_tokens": 400140286.0, "step": 3061 }, { "epoch": 1.2218675179569034, "grad_norm": 0.31161612272262573, "learning_rate": 3.60394626912926e-05, "loss": 0.2369, "num_tokens": 400271358.0, "step": 3062 }, { "epoch": 1.222266560255387, "grad_norm": 0.2658703327178955, "learning_rate": 3.603030518218897e-05, "loss": 0.2165, "num_tokens": 400402430.0, "step": 3063 }, { "epoch": 1.2226656025538707, "grad_norm": 0.30107587575912476, "learning_rate": 3.602114602255079e-05, "loss": 0.2525, "num_tokens": 400533502.0, "step": 3064 }, { "epoch": 1.2230646448523543, "grad_norm": 0.27772659063339233, "learning_rate": 3.60119852141503e-05, "loss": 0.2261, "num_tokens": 400664574.0, "step": 3065 }, { "epoch": 1.223463687150838, "grad_norm": 0.27586403489112854, "learning_rate": 3.6002822758760005e-05, "loss": 0.2444, "num_tokens": 400795646.0, "step": 3066 }, { "epoch": 1.2238627294493216, "grad_norm": 0.2912740111351013, "learning_rate": 3.599365865815277e-05, "loss": 0.232, "num_tokens": 400926718.0, "step": 3067 }, { "epoch": 1.2242617717478053, "grad_norm": 0.26976123452186584, "learning_rate": 3.598449291410177e-05, "loss": 0.2088, "num_tokens": 401057790.0, "step": 3068 }, { "epoch": 1.224660814046289, "grad_norm": 0.2613649368286133, "learning_rate": 3.597532552838048e-05, "loss": 0.2011, "num_tokens": 401188862.0, "step": 3069 }, { "epoch": 1.2250598563447725, "grad_norm": 0.33522626757621765, "learning_rate": 3.596615650276271e-05, "loss": 0.2724, "num_tokens": 401319934.0, "step": 3070 }, { "epoch": 1.2254588986432562, "grad_norm": 0.31792396306991577, "learning_rate": 3.595698583902257e-05, "loss": 0.2427, "num_tokens": 401451006.0, "step": 3071 }, { "epoch": 1.2258579409417398, "grad_norm": 0.29086342453956604, "learning_rate": 3.594781353893451e-05, "loss": 0.2739, "num_tokens": 401582078.0, "step": 3072 }, { "epoch": 1.2262569832402235, "grad_norm": 0.286123126745224, "learning_rate": 3.593863960427328e-05, "loss": 0.2114, "num_tokens": 401713150.0, "step": 3073 }, { "epoch": 1.2266560255387071, "grad_norm": 0.2791669964790344, "learning_rate": 3.5929464036813946e-05, "loss": 0.2386, "num_tokens": 401844222.0, "step": 3074 }, { "epoch": 1.2270550678371908, "grad_norm": 0.24426215887069702, "learning_rate": 3.5920286838331904e-05, "loss": 0.2164, "num_tokens": 401975294.0, "step": 3075 }, { "epoch": 1.2274541101356744, "grad_norm": 0.2617713510990143, "learning_rate": 3.591110801060286e-05, "loss": 0.1724, "num_tokens": 402106366.0, "step": 3076 }, { "epoch": 1.227853152434158, "grad_norm": 0.2584490478038788, "learning_rate": 3.590192755540281e-05, "loss": 0.2068, "num_tokens": 402237438.0, "step": 3077 }, { "epoch": 1.2282521947326417, "grad_norm": 0.32998618483543396, "learning_rate": 3.58927454745081e-05, "loss": 0.2692, "num_tokens": 402368510.0, "step": 3078 }, { "epoch": 1.2286512370311253, "grad_norm": 0.3067409098148346, "learning_rate": 3.588356176969538e-05, "loss": 0.2508, "num_tokens": 402499582.0, "step": 3079 }, { "epoch": 1.229050279329609, "grad_norm": 0.2936437427997589, "learning_rate": 3.5874376442741605e-05, "loss": 0.2512, "num_tokens": 402630654.0, "step": 3080 }, { "epoch": 1.2294493216280926, "grad_norm": 0.27870017290115356, "learning_rate": 3.5865189495424055e-05, "loss": 0.2316, "num_tokens": 402761726.0, "step": 3081 }, { "epoch": 1.2298483639265763, "grad_norm": 0.2609879970550537, "learning_rate": 3.585600092952032e-05, "loss": 0.2035, "num_tokens": 402892798.0, "step": 3082 }, { "epoch": 1.23024740622506, "grad_norm": 0.29423654079437256, "learning_rate": 3.584681074680829e-05, "loss": 0.2626, "num_tokens": 403023870.0, "step": 3083 }, { "epoch": 1.2306464485235435, "grad_norm": 0.2848937213420868, "learning_rate": 3.5837618949066204e-05, "loss": 0.2359, "num_tokens": 403154942.0, "step": 3084 }, { "epoch": 1.2310454908220272, "grad_norm": 0.2730304002761841, "learning_rate": 3.582842553807256e-05, "loss": 0.2361, "num_tokens": 403286014.0, "step": 3085 }, { "epoch": 1.2314445331205108, "grad_norm": 0.2579815089702606, "learning_rate": 3.5819230515606214e-05, "loss": 0.2004, "num_tokens": 403417086.0, "step": 3086 }, { "epoch": 1.2318435754189945, "grad_norm": 0.2883319556713104, "learning_rate": 3.5810033883446326e-05, "loss": 0.2315, "num_tokens": 403548158.0, "step": 3087 }, { "epoch": 1.2322426177174781, "grad_norm": 0.2906952202320099, "learning_rate": 3.580083564337233e-05, "loss": 0.2304, "num_tokens": 403679230.0, "step": 3088 }, { "epoch": 1.2326416600159618, "grad_norm": 0.3618379831314087, "learning_rate": 3.5791635797164025e-05, "loss": 0.2912, "num_tokens": 403810302.0, "step": 3089 }, { "epoch": 1.2330407023144454, "grad_norm": 0.30438825488090515, "learning_rate": 3.578243434660149e-05, "loss": 0.2801, "num_tokens": 403941374.0, "step": 3090 }, { "epoch": 1.233439744612929, "grad_norm": 0.32047924399375916, "learning_rate": 3.577323129346511e-05, "loss": 0.2801, "num_tokens": 404072446.0, "step": 3091 }, { "epoch": 1.2338387869114127, "grad_norm": 0.28751063346862793, "learning_rate": 3.5764026639535594e-05, "loss": 0.2184, "num_tokens": 404203518.0, "step": 3092 }, { "epoch": 1.2342378292098963, "grad_norm": 0.24854883551597595, "learning_rate": 3.575482038659397e-05, "loss": 0.2155, "num_tokens": 404334590.0, "step": 3093 }, { "epoch": 1.23463687150838, "grad_norm": 0.2693951725959778, "learning_rate": 3.5745612536421546e-05, "loss": 0.2197, "num_tokens": 404465662.0, "step": 3094 }, { "epoch": 1.2350359138068636, "grad_norm": 0.2953576147556305, "learning_rate": 3.573640309079996e-05, "loss": 0.2724, "num_tokens": 404596734.0, "step": 3095 }, { "epoch": 1.2354349561053473, "grad_norm": 0.24969594180583954, "learning_rate": 3.572719205151115e-05, "loss": 0.2119, "num_tokens": 404727806.0, "step": 3096 }, { "epoch": 1.235833998403831, "grad_norm": 0.249301016330719, "learning_rate": 3.571797942033738e-05, "loss": 0.2219, "num_tokens": 404858878.0, "step": 3097 }, { "epoch": 1.2362330407023143, "grad_norm": 0.24545696377754211, "learning_rate": 3.5708765199061196e-05, "loss": 0.1944, "num_tokens": 404989950.0, "step": 3098 }, { "epoch": 1.236632083000798, "grad_norm": 0.32153353095054626, "learning_rate": 3.569954938946547e-05, "loss": 0.2595, "num_tokens": 405121022.0, "step": 3099 }, { "epoch": 1.2370311252992816, "grad_norm": 0.27827033400535583, "learning_rate": 3.569033199333336e-05, "loss": 0.2258, "num_tokens": 405252094.0, "step": 3100 }, { "epoch": 1.2374301675977653, "grad_norm": 0.32585206627845764, "learning_rate": 3.568111301244837e-05, "loss": 0.2831, "num_tokens": 405383166.0, "step": 3101 }, { "epoch": 1.237829209896249, "grad_norm": 0.2925906479358673, "learning_rate": 3.567189244859427e-05, "loss": 0.2122, "num_tokens": 405514238.0, "step": 3102 }, { "epoch": 1.2382282521947325, "grad_norm": 0.3007200360298157, "learning_rate": 3.5662670303555166e-05, "loss": 0.2525, "num_tokens": 405645310.0, "step": 3103 }, { "epoch": 1.2386272944932162, "grad_norm": 0.2524136006832123, "learning_rate": 3.565344657911545e-05, "loss": 0.2354, "num_tokens": 405776382.0, "step": 3104 }, { "epoch": 1.2390263367916998, "grad_norm": 0.23693886399269104, "learning_rate": 3.5644221277059826e-05, "loss": 0.2169, "num_tokens": 405907454.0, "step": 3105 }, { "epoch": 1.2394253790901835, "grad_norm": 0.2666848301887512, "learning_rate": 3.5634994399173286e-05, "loss": 0.2623, "num_tokens": 406038526.0, "step": 3106 }, { "epoch": 1.239824421388667, "grad_norm": 0.2624644637107849, "learning_rate": 3.5625765947241185e-05, "loss": 0.2429, "num_tokens": 406169598.0, "step": 3107 }, { "epoch": 1.2402234636871508, "grad_norm": 0.25363418459892273, "learning_rate": 3.561653592304911e-05, "loss": 0.2405, "num_tokens": 406300670.0, "step": 3108 }, { "epoch": 1.2406225059856344, "grad_norm": 0.2659546434879303, "learning_rate": 3.5607304328383e-05, "loss": 0.26, "num_tokens": 406426459.0, "step": 3109 }, { "epoch": 1.241021548284118, "grad_norm": 0.2555299997329712, "learning_rate": 3.559807116502908e-05, "loss": 0.2352, "num_tokens": 406557531.0, "step": 3110 }, { "epoch": 1.2414205905826017, "grad_norm": 0.27478429675102234, "learning_rate": 3.558883643477387e-05, "loss": 0.2379, "num_tokens": 406688603.0, "step": 3111 }, { "epoch": 1.2418196328810853, "grad_norm": 0.28888827562332153, "learning_rate": 3.5579600139404216e-05, "loss": 0.2532, "num_tokens": 406819675.0, "step": 3112 }, { "epoch": 1.242218675179569, "grad_norm": 0.2996925115585327, "learning_rate": 3.557036228070725e-05, "loss": 0.2469, "num_tokens": 406950747.0, "step": 3113 }, { "epoch": 1.2426177174780526, "grad_norm": 0.2594625949859619, "learning_rate": 3.556112286047043e-05, "loss": 0.2051, "num_tokens": 407081819.0, "step": 3114 }, { "epoch": 1.2430167597765363, "grad_norm": 0.3057543933391571, "learning_rate": 3.555188188048147e-05, "loss": 0.2458, "num_tokens": 407212891.0, "step": 3115 }, { "epoch": 1.24341580207502, "grad_norm": 0.2612190544605255, "learning_rate": 3.5542639342528415e-05, "loss": 0.2114, "num_tokens": 407343963.0, "step": 3116 }, { "epoch": 1.2438148443735035, "grad_norm": 0.25859373807907104, "learning_rate": 3.553339524839963e-05, "loss": 0.2161, "num_tokens": 407475035.0, "step": 3117 }, { "epoch": 1.2442138866719872, "grad_norm": 0.26335635781288147, "learning_rate": 3.552414959988375e-05, "loss": 0.1855, "num_tokens": 407606107.0, "step": 3118 }, { "epoch": 1.2446129289704708, "grad_norm": 0.25833749771118164, "learning_rate": 3.551490239876972e-05, "loss": 0.2033, "num_tokens": 407737179.0, "step": 3119 }, { "epoch": 1.2450119712689545, "grad_norm": 0.2856054902076721, "learning_rate": 3.550565364684679e-05, "loss": 0.241, "num_tokens": 407861317.0, "step": 3120 }, { "epoch": 1.245411013567438, "grad_norm": 0.276944100856781, "learning_rate": 3.549640334590451e-05, "loss": 0.237, "num_tokens": 407992389.0, "step": 3121 }, { "epoch": 1.2458100558659218, "grad_norm": 0.2728588581085205, "learning_rate": 3.548715149773273e-05, "loss": 0.1778, "num_tokens": 408123461.0, "step": 3122 }, { "epoch": 1.2462090981644054, "grad_norm": 0.27141618728637695, "learning_rate": 3.5477898104121575e-05, "loss": 0.2112, "num_tokens": 408254533.0, "step": 3123 }, { "epoch": 1.246608140462889, "grad_norm": 0.29365843534469604, "learning_rate": 3.546864316686151e-05, "loss": 0.2532, "num_tokens": 408385605.0, "step": 3124 }, { "epoch": 1.2470071827613727, "grad_norm": 0.29750773310661316, "learning_rate": 3.545938668774329e-05, "loss": 0.2609, "num_tokens": 408516677.0, "step": 3125 }, { "epoch": 1.2474062250598563, "grad_norm": 0.236628919839859, "learning_rate": 3.545012866855793e-05, "loss": 0.1852, "num_tokens": 408647749.0, "step": 3126 }, { "epoch": 1.24780526735834, "grad_norm": 0.2732941508293152, "learning_rate": 3.5440869111096784e-05, "loss": 0.2259, "num_tokens": 408778821.0, "step": 3127 }, { "epoch": 1.2482043096568236, "grad_norm": 0.2643182575702667, "learning_rate": 3.5431608017151485e-05, "loss": 0.2358, "num_tokens": 408909893.0, "step": 3128 }, { "epoch": 1.2486033519553073, "grad_norm": 0.25589317083358765, "learning_rate": 3.542234538851398e-05, "loss": 0.2348, "num_tokens": 409040965.0, "step": 3129 }, { "epoch": 1.249002394253791, "grad_norm": 0.2657163441181183, "learning_rate": 3.5413081226976494e-05, "loss": 0.2178, "num_tokens": 409172037.0, "step": 3130 }, { "epoch": 1.2494014365522745, "grad_norm": 0.26940786838531494, "learning_rate": 3.540381553433155e-05, "loss": 0.2514, "num_tokens": 409303109.0, "step": 3131 }, { "epoch": 1.2498004788507582, "grad_norm": 0.2548622786998749, "learning_rate": 3.539454831237198e-05, "loss": 0.2299, "num_tokens": 409434181.0, "step": 3132 }, { "epoch": 1.2501995211492418, "grad_norm": 0.2621879577636719, "learning_rate": 3.5385279562890904e-05, "loss": 0.1846, "num_tokens": 409565253.0, "step": 3133 }, { "epoch": 1.2505985634477255, "grad_norm": 0.264638751745224, "learning_rate": 3.537600928768173e-05, "loss": 0.2005, "num_tokens": 409680596.0, "step": 3134 }, { "epoch": 1.250997605746209, "grad_norm": 0.2772424817085266, "learning_rate": 3.5366737488538195e-05, "loss": 0.2244, "num_tokens": 409811668.0, "step": 3135 }, { "epoch": 1.2513966480446927, "grad_norm": 0.26830410957336426, "learning_rate": 3.5357464167254275e-05, "loss": 0.2119, "num_tokens": 409942740.0, "step": 3136 }, { "epoch": 1.2517956903431764, "grad_norm": 0.2743564248085022, "learning_rate": 3.534818932562429e-05, "loss": 0.2212, "num_tokens": 410073812.0, "step": 3137 }, { "epoch": 1.25219473264166, "grad_norm": 0.2799679934978485, "learning_rate": 3.5338912965442835e-05, "loss": 0.2378, "num_tokens": 410204884.0, "step": 3138 }, { "epoch": 1.2525937749401437, "grad_norm": 0.2836279571056366, "learning_rate": 3.532963508850478e-05, "loss": 0.2635, "num_tokens": 410335956.0, "step": 3139 }, { "epoch": 1.2529928172386273, "grad_norm": 15.86454963684082, "learning_rate": 3.532035569660534e-05, "loss": 0.275, "num_tokens": 410467028.0, "step": 3140 }, { "epoch": 1.253391859537111, "grad_norm": 0.2724427282810211, "learning_rate": 3.531107479153996e-05, "loss": 0.2209, "num_tokens": 410598100.0, "step": 3141 }, { "epoch": 1.2537909018355946, "grad_norm": 0.2608371078968048, "learning_rate": 3.5301792375104434e-05, "loss": 0.231, "num_tokens": 410729172.0, "step": 3142 }, { "epoch": 1.2541899441340782, "grad_norm": 0.2826032042503357, "learning_rate": 3.5292508449094805e-05, "loss": 0.2729, "num_tokens": 410860244.0, "step": 3143 }, { "epoch": 1.254588986432562, "grad_norm": 0.25010114908218384, "learning_rate": 3.528322301530743e-05, "loss": 0.2029, "num_tokens": 410991316.0, "step": 3144 }, { "epoch": 1.2549880287310455, "grad_norm": 0.28464874625205994, "learning_rate": 3.527393607553895e-05, "loss": 0.2611, "num_tokens": 411122388.0, "step": 3145 }, { "epoch": 1.2553870710295292, "grad_norm": 0.2583189010620117, "learning_rate": 3.526464763158632e-05, "loss": 0.2122, "num_tokens": 411253460.0, "step": 3146 }, { "epoch": 1.2557861133280128, "grad_norm": 0.278774231672287, "learning_rate": 3.5255357685246744e-05, "loss": 0.2338, "num_tokens": 411384532.0, "step": 3147 }, { "epoch": 1.2561851556264965, "grad_norm": 0.2714381515979767, "learning_rate": 3.5246066238317756e-05, "loss": 0.2496, "num_tokens": 411515604.0, "step": 3148 }, { "epoch": 1.25658419792498, "grad_norm": 0.2656354308128357, "learning_rate": 3.523677329259715e-05, "loss": 0.2072, "num_tokens": 411646676.0, "step": 3149 }, { "epoch": 1.2569832402234637, "grad_norm": 0.30979469418525696, "learning_rate": 3.522747884988304e-05, "loss": 0.2158, "num_tokens": 411777748.0, "step": 3150 }, { "epoch": 1.2573822825219474, "grad_norm": 0.25317054986953735, "learning_rate": 3.521818291197381e-05, "loss": 0.2053, "num_tokens": 411908820.0, "step": 3151 }, { "epoch": 1.257781324820431, "grad_norm": 0.27912330627441406, "learning_rate": 3.520888548066813e-05, "loss": 0.2758, "num_tokens": 412039892.0, "step": 3152 }, { "epoch": 1.2581803671189147, "grad_norm": 0.28363150358200073, "learning_rate": 3.519958655776497e-05, "loss": 0.2649, "num_tokens": 412170964.0, "step": 3153 }, { "epoch": 1.2585794094173983, "grad_norm": 0.28807562589645386, "learning_rate": 3.51902861450636e-05, "loss": 0.2429, "num_tokens": 412302036.0, "step": 3154 }, { "epoch": 1.258978451715882, "grad_norm": 0.27097082138061523, "learning_rate": 3.518098424436353e-05, "loss": 0.2381, "num_tokens": 412433108.0, "step": 3155 }, { "epoch": 1.2593774940143656, "grad_norm": 0.29787054657936096, "learning_rate": 3.517168085746461e-05, "loss": 0.2482, "num_tokens": 412564180.0, "step": 3156 }, { "epoch": 1.2597765363128492, "grad_norm": 0.25278240442276, "learning_rate": 3.516237598616697e-05, "loss": 0.2164, "num_tokens": 412695252.0, "step": 3157 }, { "epoch": 1.260175578611333, "grad_norm": 0.2647288739681244, "learning_rate": 3.5153069632271e-05, "loss": 0.2152, "num_tokens": 412826324.0, "step": 3158 }, { "epoch": 1.2605746209098165, "grad_norm": 0.5193159580230713, "learning_rate": 3.5143761797577404e-05, "loss": 0.2508, "num_tokens": 412957396.0, "step": 3159 }, { "epoch": 1.2609736632083002, "grad_norm": 0.2660059332847595, "learning_rate": 3.513445248388715e-05, "loss": 0.2111, "num_tokens": 413088468.0, "step": 3160 }, { "epoch": 1.2613727055067838, "grad_norm": 0.24833393096923828, "learning_rate": 3.5125141693001496e-05, "loss": 0.1615, "num_tokens": 413219540.0, "step": 3161 }, { "epoch": 1.2617717478052675, "grad_norm": 0.2742668688297272, "learning_rate": 3.511582942672202e-05, "loss": 0.2179, "num_tokens": 413350612.0, "step": 3162 }, { "epoch": 1.262170790103751, "grad_norm": 0.30818837881088257, "learning_rate": 3.510651568685054e-05, "loss": 0.2799, "num_tokens": 413481684.0, "step": 3163 }, { "epoch": 1.2625698324022347, "grad_norm": 0.2759415805339813, "learning_rate": 3.5097200475189176e-05, "loss": 0.2781, "num_tokens": 413612756.0, "step": 3164 }, { "epoch": 1.2629688747007184, "grad_norm": 0.2929840683937073, "learning_rate": 3.508788379354035e-05, "loss": 0.2593, "num_tokens": 413743828.0, "step": 3165 }, { "epoch": 1.263367916999202, "grad_norm": 0.2735459804534912, "learning_rate": 3.507856564370673e-05, "loss": 0.2292, "num_tokens": 413874900.0, "step": 3166 }, { "epoch": 1.2637669592976857, "grad_norm": 0.2808898687362671, "learning_rate": 3.506924602749131e-05, "loss": 0.241, "num_tokens": 414005972.0, "step": 3167 }, { "epoch": 1.2641660015961693, "grad_norm": 0.24365870654582977, "learning_rate": 3.505992494669735e-05, "loss": 0.1889, "num_tokens": 414137044.0, "step": 3168 }, { "epoch": 1.264565043894653, "grad_norm": 0.2911984324455261, "learning_rate": 3.505060240312838e-05, "loss": 0.2352, "num_tokens": 414268116.0, "step": 3169 }, { "epoch": 1.2649640861931366, "grad_norm": 0.2680138647556305, "learning_rate": 3.5041278398588225e-05, "loss": 0.2162, "num_tokens": 414399188.0, "step": 3170 }, { "epoch": 1.2653631284916202, "grad_norm": 0.2683767080307007, "learning_rate": 3.503195293488101e-05, "loss": 0.2035, "num_tokens": 414530260.0, "step": 3171 }, { "epoch": 1.2657621707901037, "grad_norm": 0.31429678201675415, "learning_rate": 3.502262601381111e-05, "loss": 0.2659, "num_tokens": 414661332.0, "step": 3172 }, { "epoch": 1.2661612130885873, "grad_norm": 0.2515171766281128, "learning_rate": 3.50132976371832e-05, "loss": 0.2059, "num_tokens": 414792404.0, "step": 3173 }, { "epoch": 1.266560255387071, "grad_norm": 0.28467661142349243, "learning_rate": 3.500396780680224e-05, "loss": 0.2436, "num_tokens": 414923476.0, "step": 3174 }, { "epoch": 1.2669592976855546, "grad_norm": 0.27324020862579346, "learning_rate": 3.4994636524473454e-05, "loss": 0.2191, "num_tokens": 415054548.0, "step": 3175 }, { "epoch": 1.2673583399840382, "grad_norm": 0.26603248715400696, "learning_rate": 3.498530379200237e-05, "loss": 0.2481, "num_tokens": 415185620.0, "step": 3176 }, { "epoch": 1.2677573822825219, "grad_norm": 0.3004314601421356, "learning_rate": 3.497596961119478e-05, "loss": 0.2438, "num_tokens": 415316692.0, "step": 3177 }, { "epoch": 1.2681564245810055, "grad_norm": 0.25994741916656494, "learning_rate": 3.496663398385675e-05, "loss": 0.2183, "num_tokens": 415447764.0, "step": 3178 }, { "epoch": 1.2685554668794892, "grad_norm": 0.2566371262073517, "learning_rate": 3.4957296911794654e-05, "loss": 0.2148, "num_tokens": 415578836.0, "step": 3179 }, { "epoch": 1.2689545091779728, "grad_norm": 0.2558838725090027, "learning_rate": 3.4947958396815125e-05, "loss": 0.2263, "num_tokens": 415709908.0, "step": 3180 }, { "epoch": 1.2693535514764565, "grad_norm": 0.26418307423591614, "learning_rate": 3.4938618440725066e-05, "loss": 0.2158, "num_tokens": 415840980.0, "step": 3181 }, { "epoch": 1.26975259377494, "grad_norm": 0.29796722531318665, "learning_rate": 3.4929277045331694e-05, "loss": 0.2448, "num_tokens": 415972052.0, "step": 3182 }, { "epoch": 1.2701516360734237, "grad_norm": 0.2650129795074463, "learning_rate": 3.491993421244246e-05, "loss": 0.228, "num_tokens": 416103124.0, "step": 3183 }, { "epoch": 1.2705506783719074, "grad_norm": 0.2843376696109772, "learning_rate": 3.4910589943865124e-05, "loss": 0.2149, "num_tokens": 416234196.0, "step": 3184 }, { "epoch": 1.270949720670391, "grad_norm": 0.25949516892433167, "learning_rate": 3.4901244241407715e-05, "loss": 0.1996, "num_tokens": 416365268.0, "step": 3185 }, { "epoch": 1.2713487629688747, "grad_norm": 0.21667584776878357, "learning_rate": 3.489189710687854e-05, "loss": 0.1628, "num_tokens": 416496340.0, "step": 3186 }, { "epoch": 1.2717478052673583, "grad_norm": 0.2947506010532379, "learning_rate": 3.488254854208617e-05, "loss": 0.2248, "num_tokens": 416627412.0, "step": 3187 }, { "epoch": 1.272146847565842, "grad_norm": 0.2604304552078247, "learning_rate": 3.48731985488395e-05, "loss": 0.2114, "num_tokens": 416758484.0, "step": 3188 }, { "epoch": 1.2725458898643256, "grad_norm": 0.2569146156311035, "learning_rate": 3.4863847128947625e-05, "loss": 0.2073, "num_tokens": 416889556.0, "step": 3189 }, { "epoch": 1.2729449321628092, "grad_norm": 0.2612835168838501, "learning_rate": 3.485449428421998e-05, "loss": 0.2229, "num_tokens": 417020628.0, "step": 3190 }, { "epoch": 1.2733439744612929, "grad_norm": 0.2524476647377014, "learning_rate": 3.484514001646625e-05, "loss": 0.1999, "num_tokens": 417151700.0, "step": 3191 }, { "epoch": 1.2737430167597765, "grad_norm": 0.2655681371688843, "learning_rate": 3.48357843274964e-05, "loss": 0.2423, "num_tokens": 417282772.0, "step": 3192 }, { "epoch": 1.2741420590582602, "grad_norm": 0.2864425480365753, "learning_rate": 3.4826427219120664e-05, "loss": 0.2754, "num_tokens": 417413844.0, "step": 3193 }, { "epoch": 1.2745411013567438, "grad_norm": 0.2532077431678772, "learning_rate": 3.481706869314955e-05, "loss": 0.1956, "num_tokens": 417544916.0, "step": 3194 }, { "epoch": 1.2749401436552275, "grad_norm": 0.2595662772655487, "learning_rate": 3.480770875139385e-05, "loss": 0.1983, "num_tokens": 417675988.0, "step": 3195 }, { "epoch": 1.275339185953711, "grad_norm": 0.2852509915828705, "learning_rate": 3.4798347395664645e-05, "loss": 0.2245, "num_tokens": 417807060.0, "step": 3196 }, { "epoch": 1.2757382282521947, "grad_norm": 0.26649290323257446, "learning_rate": 3.478898462777324e-05, "loss": 0.2246, "num_tokens": 417938132.0, "step": 3197 }, { "epoch": 1.2761372705506784, "grad_norm": 0.2846236526966095, "learning_rate": 3.477962044953126e-05, "loss": 0.2418, "num_tokens": 418069204.0, "step": 3198 }, { "epoch": 1.276536312849162, "grad_norm": 0.2795015573501587, "learning_rate": 3.477025486275059e-05, "loss": 0.2423, "num_tokens": 418200276.0, "step": 3199 }, { "epoch": 1.2769353551476457, "grad_norm": 0.27608513832092285, "learning_rate": 3.476088786924337e-05, "loss": 0.2242, "num_tokens": 418331348.0, "step": 3200 }, { "epoch": 1.2773343974461293, "grad_norm": 0.28571873903274536, "learning_rate": 3.475151947082203e-05, "loss": 0.239, "num_tokens": 418462420.0, "step": 3201 }, { "epoch": 1.277733439744613, "grad_norm": 0.2527835965156555, "learning_rate": 3.474214966929927e-05, "loss": 0.1868, "num_tokens": 418593492.0, "step": 3202 }, { "epoch": 1.2781324820430966, "grad_norm": 0.2773018777370453, "learning_rate": 3.473277846648806e-05, "loss": 0.247, "num_tokens": 418724564.0, "step": 3203 }, { "epoch": 1.2785315243415802, "grad_norm": 0.2697538733482361, "learning_rate": 3.4723405864201655e-05, "loss": 0.2562, "num_tokens": 418855636.0, "step": 3204 }, { "epoch": 1.2789305666400639, "grad_norm": 0.23441249132156372, "learning_rate": 3.471403186425354e-05, "loss": 0.1957, "num_tokens": 418986708.0, "step": 3205 }, { "epoch": 1.2793296089385475, "grad_norm": 0.2686779797077179, "learning_rate": 3.47046564684575e-05, "loss": 0.234, "num_tokens": 419117780.0, "step": 3206 }, { "epoch": 1.2797286512370312, "grad_norm": 0.27625250816345215, "learning_rate": 3.4695279678627594e-05, "loss": 0.2485, "num_tokens": 419248852.0, "step": 3207 }, { "epoch": 1.2801276935355148, "grad_norm": 0.28226688504219055, "learning_rate": 3.468590149657815e-05, "loss": 0.2483, "num_tokens": 419379924.0, "step": 3208 }, { "epoch": 1.2805267358339985, "grad_norm": 0.25663572549819946, "learning_rate": 3.467652192412375e-05, "loss": 0.2098, "num_tokens": 419510996.0, "step": 3209 }, { "epoch": 1.280925778132482, "grad_norm": 0.27014413475990295, "learning_rate": 3.466714096307926e-05, "loss": 0.2355, "num_tokens": 419642068.0, "step": 3210 }, { "epoch": 1.2813248204309657, "grad_norm": 0.2801295518875122, "learning_rate": 3.4657758615259814e-05, "loss": 0.2417, "num_tokens": 419773140.0, "step": 3211 }, { "epoch": 1.2817238627294494, "grad_norm": 0.2602136731147766, "learning_rate": 3.4648374882480775e-05, "loss": 0.2161, "num_tokens": 419904212.0, "step": 3212 }, { "epoch": 1.282122905027933, "grad_norm": 0.2762306332588196, "learning_rate": 3.463898976655785e-05, "loss": 0.218, "num_tokens": 420035284.0, "step": 3213 }, { "epoch": 1.2825219473264167, "grad_norm": 0.27672335505485535, "learning_rate": 3.4629603269306956e-05, "loss": 0.2536, "num_tokens": 420166356.0, "step": 3214 }, { "epoch": 1.2829209896249003, "grad_norm": 0.25960612297058105, "learning_rate": 3.4620215392544286e-05, "loss": 0.2455, "num_tokens": 420297428.0, "step": 3215 }, { "epoch": 1.283320031923384, "grad_norm": 0.2503014802932739, "learning_rate": 3.461082613808631e-05, "loss": 0.2063, "num_tokens": 420428500.0, "step": 3216 }, { "epoch": 1.2837190742218676, "grad_norm": 0.2645862400531769, "learning_rate": 3.4601435507749766e-05, "loss": 0.1973, "num_tokens": 420559572.0, "step": 3217 }, { "epoch": 1.2841181165203512, "grad_norm": 0.26763808727264404, "learning_rate": 3.459204350335165e-05, "loss": 0.2169, "num_tokens": 420690644.0, "step": 3218 }, { "epoch": 1.2845171588188347, "grad_norm": 0.30468955636024475, "learning_rate": 3.4582650126709224e-05, "loss": 0.2436, "num_tokens": 420821716.0, "step": 3219 }, { "epoch": 1.2849162011173183, "grad_norm": 0.2957240045070648, "learning_rate": 3.457325537964001e-05, "loss": 0.2258, "num_tokens": 420952788.0, "step": 3220 }, { "epoch": 1.285315243415802, "grad_norm": 0.2756819427013397, "learning_rate": 3.456385926396184e-05, "loss": 0.2429, "num_tokens": 421083860.0, "step": 3221 }, { "epoch": 1.2857142857142856, "grad_norm": 0.29314881563186646, "learning_rate": 3.455446178149274e-05, "loss": 0.2657, "num_tokens": 421214932.0, "step": 3222 }, { "epoch": 1.2861133280127692, "grad_norm": 0.26969385147094727, "learning_rate": 3.454506293405104e-05, "loss": 0.23, "num_tokens": 421338021.0, "step": 3223 }, { "epoch": 1.2865123703112529, "grad_norm": 0.24977929890155792, "learning_rate": 3.4535662723455345e-05, "loss": 0.2051, "num_tokens": 421469093.0, "step": 3224 }, { "epoch": 1.2869114126097365, "grad_norm": 0.2565845549106598, "learning_rate": 3.452626115152449e-05, "loss": 0.2419, "num_tokens": 421600165.0, "step": 3225 }, { "epoch": 1.2873104549082202, "grad_norm": 0.2548902928829193, "learning_rate": 3.451685822007761e-05, "loss": 0.2022, "num_tokens": 421731237.0, "step": 3226 }, { "epoch": 1.2877094972067038, "grad_norm": 0.2611278295516968, "learning_rate": 3.4507453930934074e-05, "loss": 0.2395, "num_tokens": 421862309.0, "step": 3227 }, { "epoch": 1.2881085395051874, "grad_norm": 0.27125537395477295, "learning_rate": 3.449804828591352e-05, "loss": 0.2465, "num_tokens": 421993381.0, "step": 3228 }, { "epoch": 1.288507581803671, "grad_norm": 0.3713916838169098, "learning_rate": 3.4488641286835855e-05, "loss": 0.2903, "num_tokens": 422124453.0, "step": 3229 }, { "epoch": 1.2889066241021547, "grad_norm": 0.2510152757167816, "learning_rate": 3.447923293552125e-05, "loss": 0.2239, "num_tokens": 422255525.0, "step": 3230 }, { "epoch": 1.2893056664006384, "grad_norm": 0.2771870493888855, "learning_rate": 3.4469823233790136e-05, "loss": 0.2202, "num_tokens": 422386597.0, "step": 3231 }, { "epoch": 1.289704708699122, "grad_norm": 0.29420042037963867, "learning_rate": 3.4460412183463195e-05, "loss": 0.201, "num_tokens": 422517669.0, "step": 3232 }, { "epoch": 1.2901037509976057, "grad_norm": 0.33073318004608154, "learning_rate": 3.445099978636138e-05, "loss": 0.2457, "num_tokens": 422648741.0, "step": 3233 }, { "epoch": 1.2905027932960893, "grad_norm": 0.25483179092407227, "learning_rate": 3.44415860443059e-05, "loss": 0.1767, "num_tokens": 422779813.0, "step": 3234 }, { "epoch": 1.290901835594573, "grad_norm": 0.3306039869785309, "learning_rate": 3.443217095911824e-05, "loss": 0.2462, "num_tokens": 422910885.0, "step": 3235 }, { "epoch": 1.2913008778930566, "grad_norm": 0.34112420678138733, "learning_rate": 3.442275453262012e-05, "loss": 0.247, "num_tokens": 423041957.0, "step": 3236 }, { "epoch": 1.2916999201915402, "grad_norm": 0.2512013614177704, "learning_rate": 3.441333676663351e-05, "loss": 0.1892, "num_tokens": 423173029.0, "step": 3237 }, { "epoch": 1.2920989624900239, "grad_norm": 0.29952603578567505, "learning_rate": 3.440391766298071e-05, "loss": 0.2501, "num_tokens": 423304101.0, "step": 3238 }, { "epoch": 1.2924980047885075, "grad_norm": 0.29562172293663025, "learning_rate": 3.439449722348418e-05, "loss": 0.2393, "num_tokens": 423435173.0, "step": 3239 }, { "epoch": 1.2928970470869912, "grad_norm": 0.29196596145629883, "learning_rate": 3.4385075449966706e-05, "loss": 0.2603, "num_tokens": 423566245.0, "step": 3240 }, { "epoch": 1.2932960893854748, "grad_norm": 0.32511624693870544, "learning_rate": 3.4375652344251326e-05, "loss": 0.2921, "num_tokens": 423697317.0, "step": 3241 }, { "epoch": 1.2936951316839584, "grad_norm": 0.26531705260276794, "learning_rate": 3.43662279081613e-05, "loss": 0.2096, "num_tokens": 423828389.0, "step": 3242 }, { "epoch": 1.294094173982442, "grad_norm": 0.2861347794532776, "learning_rate": 3.435680214352019e-05, "loss": 0.2551, "num_tokens": 423956749.0, "step": 3243 }, { "epoch": 1.2944932162809257, "grad_norm": 0.29935869574546814, "learning_rate": 3.434737505215178e-05, "loss": 0.292, "num_tokens": 424087821.0, "step": 3244 }, { "epoch": 1.2948922585794094, "grad_norm": 0.23732827603816986, "learning_rate": 3.4337946635880117e-05, "loss": 0.1792, "num_tokens": 424218893.0, "step": 3245 }, { "epoch": 1.295291300877893, "grad_norm": 0.2520672380924225, "learning_rate": 3.432851689652954e-05, "loss": 0.1913, "num_tokens": 424349965.0, "step": 3246 }, { "epoch": 1.2956903431763767, "grad_norm": 0.2962278425693512, "learning_rate": 3.4319085835924585e-05, "loss": 0.2506, "num_tokens": 424481037.0, "step": 3247 }, { "epoch": 1.2960893854748603, "grad_norm": 0.260063111782074, "learning_rate": 3.430965345589009e-05, "loss": 0.2135, "num_tokens": 424612109.0, "step": 3248 }, { "epoch": 1.296488427773344, "grad_norm": 0.2745390832424164, "learning_rate": 3.430021975825114e-05, "loss": 0.2451, "num_tokens": 424743181.0, "step": 3249 }, { "epoch": 1.2968874700718276, "grad_norm": 0.31170886754989624, "learning_rate": 3.429078474483305e-05, "loss": 0.2607, "num_tokens": 424870120.0, "step": 3250 }, { "epoch": 1.2972865123703112, "grad_norm": 0.2647586464881897, "learning_rate": 3.428134841746142e-05, "loss": 0.207, "num_tokens": 425001192.0, "step": 3251 }, { "epoch": 1.2976855546687949, "grad_norm": 0.2548483610153198, "learning_rate": 3.427191077796208e-05, "loss": 0.2037, "num_tokens": 425132264.0, "step": 3252 }, { "epoch": 1.2980845969672785, "grad_norm": 0.2518162727355957, "learning_rate": 3.426247182816113e-05, "loss": 0.2055, "num_tokens": 425263336.0, "step": 3253 }, { "epoch": 1.2984836392657622, "grad_norm": 0.3536842167377472, "learning_rate": 3.425303156988492e-05, "loss": 0.2522, "num_tokens": 425394408.0, "step": 3254 }, { "epoch": 1.2988826815642458, "grad_norm": 0.25199639797210693, "learning_rate": 3.424359000496006e-05, "loss": 0.1943, "num_tokens": 425525480.0, "step": 3255 }, { "epoch": 1.2992817238627294, "grad_norm": 0.26961013674736023, "learning_rate": 3.423414713521339e-05, "loss": 0.205, "num_tokens": 425656552.0, "step": 3256 }, { "epoch": 1.299680766161213, "grad_norm": 0.24939067661762238, "learning_rate": 3.422470296247203e-05, "loss": 0.184, "num_tokens": 425787624.0, "step": 3257 }, { "epoch": 1.3000798084596967, "grad_norm": 0.29800280928611755, "learning_rate": 3.421525748856333e-05, "loss": 0.1902, "num_tokens": 425918696.0, "step": 3258 }, { "epoch": 1.3004788507581804, "grad_norm": 0.28535082936286926, "learning_rate": 3.4205810715314904e-05, "loss": 0.2097, "num_tokens": 426049768.0, "step": 3259 }, { "epoch": 1.300877893056664, "grad_norm": 0.3614991009235382, "learning_rate": 3.419636264455461e-05, "loss": 0.2016, "num_tokens": 426165830.0, "step": 3260 }, { "epoch": 1.3012769353551477, "grad_norm": 0.2508677840232849, "learning_rate": 3.418691327811058e-05, "loss": 0.1654, "num_tokens": 426296902.0, "step": 3261 }, { "epoch": 1.3016759776536313, "grad_norm": 0.28452616930007935, "learning_rate": 3.417746261781116e-05, "loss": 0.2339, "num_tokens": 426427974.0, "step": 3262 }, { "epoch": 1.302075019952115, "grad_norm": 0.30919116735458374, "learning_rate": 3.416801066548497e-05, "loss": 0.2265, "num_tokens": 426559046.0, "step": 3263 }, { "epoch": 1.3024740622505986, "grad_norm": 0.2487834393978119, "learning_rate": 3.415855742296087e-05, "loss": 0.1855, "num_tokens": 426690118.0, "step": 3264 }, { "epoch": 1.3028731045490822, "grad_norm": 0.25947582721710205, "learning_rate": 3.414910289206798e-05, "loss": 0.2067, "num_tokens": 426821190.0, "step": 3265 }, { "epoch": 1.3032721468475659, "grad_norm": 0.3038939833641052, "learning_rate": 3.413964707463567e-05, "loss": 0.2542, "num_tokens": 426952262.0, "step": 3266 }, { "epoch": 1.3036711891460495, "grad_norm": 0.280528724193573, "learning_rate": 3.413018997249354e-05, "loss": 0.2479, "num_tokens": 427083334.0, "step": 3267 }, { "epoch": 1.3040702314445332, "grad_norm": 0.3278982639312744, "learning_rate": 3.4120731587471456e-05, "loss": 0.2637, "num_tokens": 427214406.0, "step": 3268 }, { "epoch": 1.3044692737430168, "grad_norm": 0.2783149778842926, "learning_rate": 3.411127192139953e-05, "loss": 0.268, "num_tokens": 427345478.0, "step": 3269 }, { "epoch": 1.3048683160415004, "grad_norm": 0.27046629786491394, "learning_rate": 3.410181097610811e-05, "loss": 0.2014, "num_tokens": 427476550.0, "step": 3270 }, { "epoch": 1.305267358339984, "grad_norm": 0.2667049765586853, "learning_rate": 3.4092348753427814e-05, "loss": 0.2346, "num_tokens": 427607622.0, "step": 3271 }, { "epoch": 1.3056664006384677, "grad_norm": 0.27237147092819214, "learning_rate": 3.408288525518949e-05, "loss": 0.2326, "num_tokens": 427738694.0, "step": 3272 }, { "epoch": 1.3060654429369514, "grad_norm": 0.2503720819950104, "learning_rate": 3.4073420483224225e-05, "loss": 0.1923, "num_tokens": 427869766.0, "step": 3273 }, { "epoch": 1.306464485235435, "grad_norm": 0.2903197705745697, "learning_rate": 3.4063954439363374e-05, "loss": 0.2489, "num_tokens": 428000838.0, "step": 3274 }, { "epoch": 1.3068635275339187, "grad_norm": 0.24536947906017303, "learning_rate": 3.4054487125438534e-05, "loss": 0.2023, "num_tokens": 428131910.0, "step": 3275 }, { "epoch": 1.3072625698324023, "grad_norm": 0.35567620396614075, "learning_rate": 3.4045018543281524e-05, "loss": 0.2681, "num_tokens": 428262982.0, "step": 3276 }, { "epoch": 1.307661612130886, "grad_norm": 0.2793028950691223, "learning_rate": 3.403554869472445e-05, "loss": 0.2125, "num_tokens": 428394054.0, "step": 3277 }, { "epoch": 1.3080606544293696, "grad_norm": 0.28123676776885986, "learning_rate": 3.402607758159963e-05, "loss": 0.2441, "num_tokens": 428525126.0, "step": 3278 }, { "epoch": 1.3084596967278532, "grad_norm": 0.2603437602519989, "learning_rate": 3.4016605205739616e-05, "loss": 0.1785, "num_tokens": 428656198.0, "step": 3279 }, { "epoch": 1.3088587390263369, "grad_norm": 0.2943451404571533, "learning_rate": 3.400713156897726e-05, "loss": 0.2143, "num_tokens": 428778259.0, "step": 3280 }, { "epoch": 1.3092577813248205, "grad_norm": 0.2662917375564575, "learning_rate": 3.3997656673145595e-05, "loss": 0.2153, "num_tokens": 428909331.0, "step": 3281 }, { "epoch": 1.3096568236233042, "grad_norm": 0.26739317178726196, "learning_rate": 3.398818052007793e-05, "loss": 0.1977, "num_tokens": 429040403.0, "step": 3282 }, { "epoch": 1.3100558659217878, "grad_norm": 0.3350427448749542, "learning_rate": 3.397870311160782e-05, "loss": 0.2788, "num_tokens": 429171475.0, "step": 3283 }, { "epoch": 1.3104549082202714, "grad_norm": 0.25832435488700867, "learning_rate": 3.396922444956905e-05, "loss": 0.2028, "num_tokens": 429302547.0, "step": 3284 }, { "epoch": 1.310853950518755, "grad_norm": 0.31066250801086426, "learning_rate": 3.395974453579567e-05, "loss": 0.216, "num_tokens": 429433619.0, "step": 3285 }, { "epoch": 1.3112529928172387, "grad_norm": 0.24955326318740845, "learning_rate": 3.395026337212192e-05, "loss": 0.1526, "num_tokens": 429564691.0, "step": 3286 }, { "epoch": 1.3116520351157224, "grad_norm": 0.3091687262058258, "learning_rate": 3.394078096038234e-05, "loss": 0.2865, "num_tokens": 429695763.0, "step": 3287 }, { "epoch": 1.312051077414206, "grad_norm": 0.27115535736083984, "learning_rate": 3.3931297302411684e-05, "loss": 0.2229, "num_tokens": 429826835.0, "step": 3288 }, { "epoch": 1.3124501197126897, "grad_norm": 0.2692015469074249, "learning_rate": 3.3921812400044964e-05, "loss": 0.2425, "num_tokens": 429957907.0, "step": 3289 }, { "epoch": 1.3128491620111733, "grad_norm": 0.2680787444114685, "learning_rate": 3.3912326255117394e-05, "loss": 0.2183, "num_tokens": 430088979.0, "step": 3290 }, { "epoch": 1.313248204309657, "grad_norm": 0.2971503734588623, "learning_rate": 3.3902838869464474e-05, "loss": 0.268, "num_tokens": 430220051.0, "step": 3291 }, { "epoch": 1.3136472466081406, "grad_norm": 0.2713411748409271, "learning_rate": 3.389335024492192e-05, "loss": 0.222, "num_tokens": 430351123.0, "step": 3292 }, { "epoch": 1.3140462889066242, "grad_norm": 0.28463801741600037, "learning_rate": 3.388386038332569e-05, "loss": 0.2405, "num_tokens": 430482195.0, "step": 3293 }, { "epoch": 1.3144453312051079, "grad_norm": 0.24604511260986328, "learning_rate": 3.387436928651198e-05, "loss": 0.2081, "num_tokens": 430613267.0, "step": 3294 }, { "epoch": 1.3148443735035915, "grad_norm": 0.2642531394958496, "learning_rate": 3.386487695631724e-05, "loss": 0.2297, "num_tokens": 430744339.0, "step": 3295 }, { "epoch": 1.3152434158020752, "grad_norm": 0.23504570126533508, "learning_rate": 3.385538339457815e-05, "loss": 0.1678, "num_tokens": 430875411.0, "step": 3296 }, { "epoch": 1.3156424581005586, "grad_norm": 0.31621143221855164, "learning_rate": 3.384588860313161e-05, "loss": 0.2731, "num_tokens": 431006483.0, "step": 3297 }, { "epoch": 1.3160415003990422, "grad_norm": 0.2689412236213684, "learning_rate": 3.3836392583814774e-05, "loss": 0.2339, "num_tokens": 431137555.0, "step": 3298 }, { "epoch": 1.3164405426975259, "grad_norm": 0.2522696554660797, "learning_rate": 3.382689533846505e-05, "loss": 0.2172, "num_tokens": 431268627.0, "step": 3299 }, { "epoch": 1.3168395849960095, "grad_norm": 0.2494988590478897, "learning_rate": 3.381739686892005e-05, "loss": 0.2083, "num_tokens": 431399699.0, "step": 3300 }, { "epoch": 1.3172386272944931, "grad_norm": 0.25539925694465637, "learning_rate": 3.380789717701765e-05, "loss": 0.2035, "num_tokens": 431530771.0, "step": 3301 }, { "epoch": 1.3176376695929768, "grad_norm": 0.24013635516166687, "learning_rate": 3.379839626459594e-05, "loss": 0.2055, "num_tokens": 431661843.0, "step": 3302 }, { "epoch": 1.3180367118914604, "grad_norm": 0.2461404949426651, "learning_rate": 3.3788894133493275e-05, "loss": 0.1668, "num_tokens": 431792915.0, "step": 3303 }, { "epoch": 1.318435754189944, "grad_norm": 0.25050410628318787, "learning_rate": 3.377939078554822e-05, "loss": 0.1644, "num_tokens": 431923987.0, "step": 3304 }, { "epoch": 1.3188347964884277, "grad_norm": 0.29820919036865234, "learning_rate": 3.376988622259958e-05, "loss": 0.2326, "num_tokens": 432055059.0, "step": 3305 }, { "epoch": 1.3192338387869114, "grad_norm": 0.26607584953308105, "learning_rate": 3.37603804464864e-05, "loss": 0.2124, "num_tokens": 432186131.0, "step": 3306 }, { "epoch": 1.319632881085395, "grad_norm": 0.3069382607936859, "learning_rate": 3.375087345904797e-05, "loss": 0.2144, "num_tokens": 432317203.0, "step": 3307 }, { "epoch": 1.3200319233838786, "grad_norm": 0.26247352361679077, "learning_rate": 3.3741365262123784e-05, "loss": 0.2069, "num_tokens": 432448275.0, "step": 3308 }, { "epoch": 1.3204309656823623, "grad_norm": 0.2558422386646271, "learning_rate": 3.373185585755361e-05, "loss": 0.2007, "num_tokens": 432579347.0, "step": 3309 }, { "epoch": 1.320830007980846, "grad_norm": 0.2902807295322418, "learning_rate": 3.372234524717742e-05, "loss": 0.2525, "num_tokens": 432710419.0, "step": 3310 }, { "epoch": 1.3212290502793296, "grad_norm": 0.31972992420196533, "learning_rate": 3.3712833432835426e-05, "loss": 0.2697, "num_tokens": 432841491.0, "step": 3311 }, { "epoch": 1.3216280925778132, "grad_norm": 0.2860593795776367, "learning_rate": 3.370332041636808e-05, "loss": 0.2209, "num_tokens": 432972563.0, "step": 3312 }, { "epoch": 1.3220271348762969, "grad_norm": 0.2832627296447754, "learning_rate": 3.3693806199616056e-05, "loss": 0.2219, "num_tokens": 433103635.0, "step": 3313 }, { "epoch": 1.3224261771747805, "grad_norm": 0.27662697434425354, "learning_rate": 3.3684290784420277e-05, "loss": 0.2441, "num_tokens": 433234707.0, "step": 3314 }, { "epoch": 1.3228252194732641, "grad_norm": 0.28982725739479065, "learning_rate": 3.367477417262187e-05, "loss": 0.2412, "num_tokens": 433365779.0, "step": 3315 }, { "epoch": 1.3232242617717478, "grad_norm": 0.27235957980155945, "learning_rate": 3.366525636606223e-05, "loss": 0.261, "num_tokens": 433496851.0, "step": 3316 }, { "epoch": 1.3236233040702314, "grad_norm": 0.28449520468711853, "learning_rate": 3.3655737366582956e-05, "loss": 0.2284, "num_tokens": 433627923.0, "step": 3317 }, { "epoch": 1.324022346368715, "grad_norm": 0.2877282202243805, "learning_rate": 3.364621717602588e-05, "loss": 0.2443, "num_tokens": 433758995.0, "step": 3318 }, { "epoch": 1.3244213886671987, "grad_norm": 0.26703500747680664, "learning_rate": 3.363669579623308e-05, "loss": 0.2325, "num_tokens": 433890067.0, "step": 3319 }, { "epoch": 1.3248204309656824, "grad_norm": 0.2964073419570923, "learning_rate": 3.362717322904684e-05, "loss": 0.2318, "num_tokens": 434021139.0, "step": 3320 }, { "epoch": 1.325219473264166, "grad_norm": 0.32655540108680725, "learning_rate": 3.36176494763097e-05, "loss": 0.2618, "num_tokens": 434152211.0, "step": 3321 }, { "epoch": 1.3256185155626496, "grad_norm": 0.29240939021110535, "learning_rate": 3.3608124539864425e-05, "loss": 0.1604, "num_tokens": 434283283.0, "step": 3322 }, { "epoch": 1.3260175578611333, "grad_norm": 0.2748970687389374, "learning_rate": 3.3598598421553975e-05, "loss": 0.1985, "num_tokens": 434414355.0, "step": 3323 }, { "epoch": 1.326416600159617, "grad_norm": 0.3245055377483368, "learning_rate": 3.358907112322159e-05, "loss": 0.2916, "num_tokens": 434545427.0, "step": 3324 }, { "epoch": 1.3268156424581006, "grad_norm": 0.2576633393764496, "learning_rate": 3.3579542646710706e-05, "loss": 0.2001, "num_tokens": 434676499.0, "step": 3325 }, { "epoch": 1.3272146847565842, "grad_norm": 0.24590933322906494, "learning_rate": 3.357001299386498e-05, "loss": 0.1928, "num_tokens": 434807571.0, "step": 3326 }, { "epoch": 1.3276137270550679, "grad_norm": 0.3365423381328583, "learning_rate": 3.356048216652833e-05, "loss": 0.2788, "num_tokens": 434938643.0, "step": 3327 }, { "epoch": 1.3280127693535515, "grad_norm": 0.33672842383384705, "learning_rate": 3.355095016654488e-05, "loss": 0.2786, "num_tokens": 435069715.0, "step": 3328 }, { "epoch": 1.3284118116520351, "grad_norm": 0.2792149484157562, "learning_rate": 3.3541416995758984e-05, "loss": 0.2501, "num_tokens": 435200787.0, "step": 3329 }, { "epoch": 1.3288108539505188, "grad_norm": 0.2796356678009033, "learning_rate": 3.353188265601521e-05, "loss": 0.1832, "num_tokens": 435331859.0, "step": 3330 }, { "epoch": 1.3292098962490024, "grad_norm": 0.2808972895145416, "learning_rate": 3.3522347149158366e-05, "loss": 0.2306, "num_tokens": 435461119.0, "step": 3331 }, { "epoch": 1.329608938547486, "grad_norm": 0.3120262026786804, "learning_rate": 3.351281047703349e-05, "loss": 0.2799, "num_tokens": 435592191.0, "step": 3332 }, { "epoch": 1.3300079808459697, "grad_norm": 0.2935926616191864, "learning_rate": 3.3503272641485845e-05, "loss": 0.2465, "num_tokens": 435723263.0, "step": 3333 }, { "epoch": 1.3304070231444534, "grad_norm": 0.2593836784362793, "learning_rate": 3.34937336443609e-05, "loss": 0.195, "num_tokens": 435854335.0, "step": 3334 }, { "epoch": 1.330806065442937, "grad_norm": 0.268829345703125, "learning_rate": 3.3484193487504366e-05, "loss": 0.2018, "num_tokens": 435985407.0, "step": 3335 }, { "epoch": 1.3312051077414206, "grad_norm": 0.304250568151474, "learning_rate": 3.3474652172762184e-05, "loss": 0.2633, "num_tokens": 436116479.0, "step": 3336 }, { "epoch": 1.3316041500399043, "grad_norm": 0.2744540274143219, "learning_rate": 3.346510970198049e-05, "loss": 0.1918, "num_tokens": 436247551.0, "step": 3337 }, { "epoch": 1.332003192338388, "grad_norm": 0.25765347480773926, "learning_rate": 3.345556607700568e-05, "loss": 0.2229, "num_tokens": 436378623.0, "step": 3338 }, { "epoch": 1.3324022346368716, "grad_norm": 0.28824582695961, "learning_rate": 3.344602129968435e-05, "loss": 0.2534, "num_tokens": 436509695.0, "step": 3339 }, { "epoch": 1.3328012769353552, "grad_norm": 0.2663598358631134, "learning_rate": 3.3436475371863336e-05, "loss": 0.2068, "num_tokens": 436640767.0, "step": 3340 }, { "epoch": 1.3332003192338389, "grad_norm": 0.2625400125980377, "learning_rate": 3.342692829538966e-05, "loss": 0.2369, "num_tokens": 436771839.0, "step": 3341 }, { "epoch": 1.3335993615323225, "grad_norm": 0.2689318358898163, "learning_rate": 3.341738007211063e-05, "loss": 0.2329, "num_tokens": 436902911.0, "step": 3342 }, { "epoch": 1.3339984038308061, "grad_norm": 0.2607790231704712, "learning_rate": 3.34078307038737e-05, "loss": 0.2183, "num_tokens": 437026957.0, "step": 3343 }, { "epoch": 1.3343974461292896, "grad_norm": 0.25033825635910034, "learning_rate": 3.3398280192526605e-05, "loss": 0.1954, "num_tokens": 437158029.0, "step": 3344 }, { "epoch": 1.3347964884277732, "grad_norm": 0.2565325200557709, "learning_rate": 3.338872853991728e-05, "loss": 0.2368, "num_tokens": 437289101.0, "step": 3345 }, { "epoch": 1.3351955307262569, "grad_norm": 0.25580504536628723, "learning_rate": 3.337917574789388e-05, "loss": 0.2109, "num_tokens": 437420173.0, "step": 3346 }, { "epoch": 1.3355945730247405, "grad_norm": 0.2562626898288727, "learning_rate": 3.3369621818304765e-05, "loss": 0.2149, "num_tokens": 437551245.0, "step": 3347 }, { "epoch": 1.3359936153232241, "grad_norm": 0.2564926743507385, "learning_rate": 3.3360066752998554e-05, "loss": 0.1982, "num_tokens": 437682317.0, "step": 3348 }, { "epoch": 1.3363926576217078, "grad_norm": 0.3004336655139923, "learning_rate": 3.335051055382404e-05, "loss": 0.263, "num_tokens": 437813389.0, "step": 3349 }, { "epoch": 1.3367916999201914, "grad_norm": 0.25291335582733154, "learning_rate": 3.334095322263028e-05, "loss": 0.1936, "num_tokens": 437944461.0, "step": 3350 }, { "epoch": 1.337190742218675, "grad_norm": 0.2821957468986511, "learning_rate": 3.3331394761266526e-05, "loss": 0.2354, "num_tokens": 438075533.0, "step": 3351 }, { "epoch": 1.3375897845171587, "grad_norm": 0.2833091616630554, "learning_rate": 3.3321835171582243e-05, "loss": 0.2581, "num_tokens": 438206605.0, "step": 3352 }, { "epoch": 1.3379888268156424, "grad_norm": 0.2449241280555725, "learning_rate": 3.3312274455427124e-05, "loss": 0.193, "num_tokens": 438337677.0, "step": 3353 }, { "epoch": 1.338387869114126, "grad_norm": 0.25246232748031616, "learning_rate": 3.330271261465107e-05, "loss": 0.1856, "num_tokens": 438468749.0, "step": 3354 }, { "epoch": 1.3387869114126096, "grad_norm": 0.26893407106399536, "learning_rate": 3.329314965110423e-05, "loss": 0.2282, "num_tokens": 438599821.0, "step": 3355 }, { "epoch": 1.3391859537110933, "grad_norm": 0.2557627558708191, "learning_rate": 3.328358556663693e-05, "loss": 0.2302, "num_tokens": 438730893.0, "step": 3356 }, { "epoch": 1.339584996009577, "grad_norm": 0.2849770188331604, "learning_rate": 3.3274020363099736e-05, "loss": 0.2485, "num_tokens": 438861965.0, "step": 3357 }, { "epoch": 1.3399840383080606, "grad_norm": 0.25927695631980896, "learning_rate": 3.326445404234343e-05, "loss": 0.2005, "num_tokens": 438993037.0, "step": 3358 }, { "epoch": 1.3403830806065442, "grad_norm": 0.2674713134765625, "learning_rate": 3.3254886606219e-05, "loss": 0.2264, "num_tokens": 439124109.0, "step": 3359 }, { "epoch": 1.3407821229050279, "grad_norm": 0.3062312602996826, "learning_rate": 3.324531805657765e-05, "loss": 0.2483, "num_tokens": 439255181.0, "step": 3360 }, { "epoch": 1.3411811652035115, "grad_norm": 0.26299431920051575, "learning_rate": 3.323574839527083e-05, "loss": 0.2036, "num_tokens": 439386253.0, "step": 3361 }, { "epoch": 1.3415802075019951, "grad_norm": 0.2584640085697174, "learning_rate": 3.322617762415015e-05, "loss": 0.1901, "num_tokens": 439517325.0, "step": 3362 }, { "epoch": 1.3419792498004788, "grad_norm": 0.300149530172348, "learning_rate": 3.3216605745067484e-05, "loss": 0.2304, "num_tokens": 439648397.0, "step": 3363 }, { "epoch": 1.3423782920989624, "grad_norm": 0.2427777498960495, "learning_rate": 3.320703275987489e-05, "loss": 0.1685, "num_tokens": 439779469.0, "step": 3364 }, { "epoch": 1.342777334397446, "grad_norm": 0.30382224917411804, "learning_rate": 3.319745867042466e-05, "loss": 0.2234, "num_tokens": 439910541.0, "step": 3365 }, { "epoch": 1.3431763766959297, "grad_norm": 0.2803923189640045, "learning_rate": 3.31878834785693e-05, "loss": 0.2033, "num_tokens": 440041613.0, "step": 3366 }, { "epoch": 1.3435754189944134, "grad_norm": 0.2746635973453522, "learning_rate": 3.31783071861615e-05, "loss": 0.1853, "num_tokens": 440172685.0, "step": 3367 }, { "epoch": 1.343974461292897, "grad_norm": 0.3333316743373871, "learning_rate": 3.3168729795054194e-05, "loss": 0.2775, "num_tokens": 440303757.0, "step": 3368 }, { "epoch": 1.3443735035913806, "grad_norm": 0.25866571068763733, "learning_rate": 3.315915130710051e-05, "loss": 0.2004, "num_tokens": 440434829.0, "step": 3369 }, { "epoch": 1.3447725458898643, "grad_norm": 0.25511685013771057, "learning_rate": 3.314957172415381e-05, "loss": 0.1998, "num_tokens": 440565901.0, "step": 3370 }, { "epoch": 1.345171588188348, "grad_norm": 0.2646564543247223, "learning_rate": 3.3139991048067636e-05, "loss": 0.2244, "num_tokens": 440696973.0, "step": 3371 }, { "epoch": 1.3455706304868316, "grad_norm": 0.2418920248746872, "learning_rate": 3.313040928069577e-05, "loss": 0.1973, "num_tokens": 440828045.0, "step": 3372 }, { "epoch": 1.3459696727853152, "grad_norm": 0.2799352705478668, "learning_rate": 3.3120826423892205e-05, "loss": 0.2651, "num_tokens": 440959117.0, "step": 3373 }, { "epoch": 1.3463687150837989, "grad_norm": 0.28576597571372986, "learning_rate": 3.3111242479511115e-05, "loss": 0.2742, "num_tokens": 441090189.0, "step": 3374 }, { "epoch": 1.3467677573822825, "grad_norm": 0.2688186466693878, "learning_rate": 3.310165744940692e-05, "loss": 0.2249, "num_tokens": 441221261.0, "step": 3375 }, { "epoch": 1.3471667996807661, "grad_norm": 0.2847127616405487, "learning_rate": 3.309207133543422e-05, "loss": 0.2642, "num_tokens": 441352333.0, "step": 3376 }, { "epoch": 1.3475658419792498, "grad_norm": 0.25879213213920593, "learning_rate": 3.308248413944784e-05, "loss": 0.2107, "num_tokens": 441477861.0, "step": 3377 }, { "epoch": 1.3479648842777334, "grad_norm": 0.27393805980682373, "learning_rate": 3.307289586330283e-05, "loss": 0.1602, "num_tokens": 441608933.0, "step": 3378 }, { "epoch": 1.348363926576217, "grad_norm": 0.26552045345306396, "learning_rate": 3.306330650885442e-05, "loss": 0.2105, "num_tokens": 441740005.0, "step": 3379 }, { "epoch": 1.3487629688747007, "grad_norm": 0.3007065951824188, "learning_rate": 3.3053716077958055e-05, "loss": 0.1986, "num_tokens": 441871077.0, "step": 3380 }, { "epoch": 1.3491620111731844, "grad_norm": 0.28009718656539917, "learning_rate": 3.3044124572469396e-05, "loss": 0.2318, "num_tokens": 442002149.0, "step": 3381 }, { "epoch": 1.349561053471668, "grad_norm": 0.2688160240650177, "learning_rate": 3.303453199424432e-05, "loss": 0.1823, "num_tokens": 442133221.0, "step": 3382 }, { "epoch": 1.3499600957701516, "grad_norm": 0.2844490706920624, "learning_rate": 3.3024938345138896e-05, "loss": 0.2021, "num_tokens": 442264293.0, "step": 3383 }, { "epoch": 1.3503591380686353, "grad_norm": 0.28725290298461914, "learning_rate": 3.30153436270094e-05, "loss": 0.2283, "num_tokens": 442395365.0, "step": 3384 }, { "epoch": 1.350758180367119, "grad_norm": 0.26830974221229553, "learning_rate": 3.300574784171234e-05, "loss": 0.1865, "num_tokens": 442526437.0, "step": 3385 }, { "epoch": 1.3511572226656026, "grad_norm": 0.2771764099597931, "learning_rate": 3.299615099110438e-05, "loss": 0.2456, "num_tokens": 442657509.0, "step": 3386 }, { "epoch": 1.3515562649640862, "grad_norm": 0.28123849630355835, "learning_rate": 3.298655307704244e-05, "loss": 0.2309, "num_tokens": 442778111.0, "step": 3387 }, { "epoch": 1.3519553072625698, "grad_norm": 0.27132442593574524, "learning_rate": 3.297695410138363e-05, "loss": 0.2469, "num_tokens": 442909183.0, "step": 3388 }, { "epoch": 1.3523543495610535, "grad_norm": 0.2585133910179138, "learning_rate": 3.2967354065985255e-05, "loss": 0.2092, "num_tokens": 443040255.0, "step": 3389 }, { "epoch": 1.3527533918595371, "grad_norm": 0.2590591609477997, "learning_rate": 3.2957752972704834e-05, "loss": 0.1973, "num_tokens": 443171327.0, "step": 3390 }, { "epoch": 1.3531524341580208, "grad_norm": 0.2756709158420563, "learning_rate": 3.294815082340009e-05, "loss": 0.2432, "num_tokens": 443302399.0, "step": 3391 }, { "epoch": 1.3535514764565044, "grad_norm": 0.25411364436149597, "learning_rate": 3.293854761992895e-05, "loss": 0.2163, "num_tokens": 443433471.0, "step": 3392 }, { "epoch": 1.353950518754988, "grad_norm": 0.26580098271369934, "learning_rate": 3.292894336414954e-05, "loss": 0.2074, "num_tokens": 443564543.0, "step": 3393 }, { "epoch": 1.3543495610534717, "grad_norm": 0.2579074501991272, "learning_rate": 3.291933805792019e-05, "loss": 0.2244, "num_tokens": 443695615.0, "step": 3394 }, { "epoch": 1.3547486033519553, "grad_norm": 0.2907930314540863, "learning_rate": 3.290973170309946e-05, "loss": 0.2585, "num_tokens": 443826687.0, "step": 3395 }, { "epoch": 1.355147645650439, "grad_norm": 0.26205500960350037, "learning_rate": 3.2900124301546066e-05, "loss": 0.209, "num_tokens": 443957759.0, "step": 3396 }, { "epoch": 1.3555466879489226, "grad_norm": 0.2806469202041626, "learning_rate": 3.289051585511896e-05, "loss": 0.2046, "num_tokens": 444088831.0, "step": 3397 }, { "epoch": 1.3559457302474063, "grad_norm": 0.30818280577659607, "learning_rate": 3.288090636567729e-05, "loss": 0.2531, "num_tokens": 444219903.0, "step": 3398 }, { "epoch": 1.35634477254589, "grad_norm": 0.27101725339889526, "learning_rate": 3.287129583508039e-05, "loss": 0.2128, "num_tokens": 444350975.0, "step": 3399 }, { "epoch": 1.3567438148443736, "grad_norm": 0.31242838501930237, "learning_rate": 3.286168426518782e-05, "loss": 0.2556, "num_tokens": 444482047.0, "step": 3400 }, { "epoch": 1.3571428571428572, "grad_norm": 0.3066098392009735, "learning_rate": 3.285207165785933e-05, "loss": 0.2194, "num_tokens": 444613119.0, "step": 3401 }, { "epoch": 1.3575418994413408, "grad_norm": 0.27392417192459106, "learning_rate": 3.284245801495486e-05, "loss": 0.2257, "num_tokens": 444744191.0, "step": 3402 }, { "epoch": 1.3579409417398245, "grad_norm": 0.2629651725292206, "learning_rate": 3.283284333833457e-05, "loss": 0.2007, "num_tokens": 444875263.0, "step": 3403 }, { "epoch": 1.3583399840383081, "grad_norm": 0.27054861187934875, "learning_rate": 3.28232276298588e-05, "loss": 0.2132, "num_tokens": 445006335.0, "step": 3404 }, { "epoch": 1.3587390263367918, "grad_norm": 0.26430776715278625, "learning_rate": 3.281361089138811e-05, "loss": 0.2297, "num_tokens": 445137407.0, "step": 3405 }, { "epoch": 1.3591380686352754, "grad_norm": 0.25734323263168335, "learning_rate": 3.280399312478325e-05, "loss": 0.2165, "num_tokens": 445268479.0, "step": 3406 }, { "epoch": 1.359537110933759, "grad_norm": 0.2493169605731964, "learning_rate": 3.279437433190515e-05, "loss": 0.2005, "num_tokens": 445399551.0, "step": 3407 }, { "epoch": 1.3599361532322427, "grad_norm": 0.289325088262558, "learning_rate": 3.278475451461499e-05, "loss": 0.2733, "num_tokens": 445530623.0, "step": 3408 }, { "epoch": 1.3603351955307263, "grad_norm": 0.2917884290218353, "learning_rate": 3.277513367477408e-05, "loss": 0.2728, "num_tokens": 445661695.0, "step": 3409 }, { "epoch": 1.36073423782921, "grad_norm": 0.24224643409252167, "learning_rate": 3.2765511814243983e-05, "loss": 0.2164, "num_tokens": 445792767.0, "step": 3410 }, { "epoch": 1.3611332801276936, "grad_norm": 0.2461376041173935, "learning_rate": 3.275588893488643e-05, "loss": 0.2139, "num_tokens": 445923839.0, "step": 3411 }, { "epoch": 1.3615323224261773, "grad_norm": 0.2675681710243225, "learning_rate": 3.274626503856337e-05, "loss": 0.2301, "num_tokens": 446054911.0, "step": 3412 }, { "epoch": 1.361931364724661, "grad_norm": 0.27300551533699036, "learning_rate": 3.2736640127136925e-05, "loss": 0.2109, "num_tokens": 446185983.0, "step": 3413 }, { "epoch": 1.3623304070231446, "grad_norm": 0.28959447145462036, "learning_rate": 3.2727014202469435e-05, "loss": 0.2306, "num_tokens": 446317055.0, "step": 3414 }, { "epoch": 1.3627294493216282, "grad_norm": 0.2785934507846832, "learning_rate": 3.271738726642342e-05, "loss": 0.2243, "num_tokens": 446448127.0, "step": 3415 }, { "epoch": 1.3631284916201118, "grad_norm": 0.25609642267227173, "learning_rate": 3.27077593208616e-05, "loss": 0.1933, "num_tokens": 446579199.0, "step": 3416 }, { "epoch": 1.3635275339185955, "grad_norm": 0.24390128254890442, "learning_rate": 3.269813036764691e-05, "loss": 0.1747, "num_tokens": 446710271.0, "step": 3417 }, { "epoch": 1.3639265762170791, "grad_norm": 0.270803838968277, "learning_rate": 3.2688500408642443e-05, "loss": 0.1649, "num_tokens": 446841343.0, "step": 3418 }, { "epoch": 1.3643256185155628, "grad_norm": 0.29082784056663513, "learning_rate": 3.2678869445711513e-05, "loss": 0.1888, "num_tokens": 446972415.0, "step": 3419 }, { "epoch": 1.3647246608140464, "grad_norm": 0.41033491492271423, "learning_rate": 3.266923748071762e-05, "loss": 0.3211, "num_tokens": 447103487.0, "step": 3420 }, { "epoch": 1.36512370311253, "grad_norm": 0.33151623606681824, "learning_rate": 3.2659604515524464e-05, "loss": 0.2605, "num_tokens": 447234559.0, "step": 3421 }, { "epoch": 1.3655227454110137, "grad_norm": 0.28600138425827026, "learning_rate": 3.2649970551995934e-05, "loss": 0.215, "num_tokens": 447365631.0, "step": 3422 }, { "epoch": 1.3659217877094971, "grad_norm": 0.2738858163356781, "learning_rate": 3.26403355919961e-05, "loss": 0.2368, "num_tokens": 447496703.0, "step": 3423 }, { "epoch": 1.3663208300079808, "grad_norm": 0.2834329605102539, "learning_rate": 3.2630699637389253e-05, "loss": 0.2013, "num_tokens": 447627775.0, "step": 3424 }, { "epoch": 1.3667198723064644, "grad_norm": 0.25541070103645325, "learning_rate": 3.2621062690039864e-05, "loss": 0.1852, "num_tokens": 447758847.0, "step": 3425 }, { "epoch": 1.367118914604948, "grad_norm": 0.27482083439826965, "learning_rate": 3.261142475181257e-05, "loss": 0.2023, "num_tokens": 447889919.0, "step": 3426 }, { "epoch": 1.3675179569034317, "grad_norm": 1.0339648723602295, "learning_rate": 3.260178582457223e-05, "loss": 0.2244, "num_tokens": 448020991.0, "step": 3427 }, { "epoch": 1.3679169992019153, "grad_norm": 0.2657701373100281, "learning_rate": 3.25921459101839e-05, "loss": 0.2172, "num_tokens": 448152063.0, "step": 3428 }, { "epoch": 1.368316041500399, "grad_norm": 0.3460908532142639, "learning_rate": 3.2582505010512806e-05, "loss": 0.2583, "num_tokens": 448283135.0, "step": 3429 }, { "epoch": 1.3687150837988826, "grad_norm": 0.2711128890514374, "learning_rate": 3.257286312742437e-05, "loss": 0.1917, "num_tokens": 448414207.0, "step": 3430 }, { "epoch": 1.3691141260973663, "grad_norm": 0.27308669686317444, "learning_rate": 3.256322026278421e-05, "loss": 0.2096, "num_tokens": 448545279.0, "step": 3431 }, { "epoch": 1.36951316839585, "grad_norm": 0.2744976878166199, "learning_rate": 3.2553576418458115e-05, "loss": 0.2213, "num_tokens": 448676351.0, "step": 3432 }, { "epoch": 1.3699122106943336, "grad_norm": 0.273420512676239, "learning_rate": 3.2543931596312106e-05, "loss": 0.1876, "num_tokens": 448807423.0, "step": 3433 }, { "epoch": 1.3703112529928172, "grad_norm": 0.2880530059337616, "learning_rate": 3.2534285798212343e-05, "loss": 0.2252, "num_tokens": 448938495.0, "step": 3434 }, { "epoch": 1.3707102952913008, "grad_norm": 0.2754301130771637, "learning_rate": 3.2524639026025206e-05, "loss": 0.2101, "num_tokens": 449069567.0, "step": 3435 }, { "epoch": 1.3711093375897845, "grad_norm": 0.27848371863365173, "learning_rate": 3.251499128161727e-05, "loss": 0.2135, "num_tokens": 449200639.0, "step": 3436 }, { "epoch": 1.3715083798882681, "grad_norm": 0.2880766987800598, "learning_rate": 3.2505342566855255e-05, "loss": 0.2335, "num_tokens": 449331711.0, "step": 3437 }, { "epoch": 1.3719074221867518, "grad_norm": 0.23932579159736633, "learning_rate": 3.249569288360612e-05, "loss": 0.1765, "num_tokens": 449462783.0, "step": 3438 }, { "epoch": 1.3723064644852354, "grad_norm": 0.28641316294670105, "learning_rate": 3.248604223373698e-05, "loss": 0.2277, "num_tokens": 449593855.0, "step": 3439 }, { "epoch": 1.372705506783719, "grad_norm": 0.27746281027793884, "learning_rate": 3.247639061911515e-05, "loss": 0.207, "num_tokens": 449724927.0, "step": 3440 }, { "epoch": 1.3731045490822027, "grad_norm": 0.31755873560905457, "learning_rate": 3.246673804160811e-05, "loss": 0.2474, "num_tokens": 449855999.0, "step": 3441 }, { "epoch": 1.3735035913806863, "grad_norm": 0.26683706045150757, "learning_rate": 3.2457084503083585e-05, "loss": 0.2006, "num_tokens": 449987071.0, "step": 3442 }, { "epoch": 1.37390263367917, "grad_norm": 0.23695704340934753, "learning_rate": 3.2447430005409395e-05, "loss": 0.1604, "num_tokens": 450118143.0, "step": 3443 }, { "epoch": 1.3743016759776536, "grad_norm": 0.26926466822624207, "learning_rate": 3.243777455045362e-05, "loss": 0.2072, "num_tokens": 450249215.0, "step": 3444 }, { "epoch": 1.3747007182761373, "grad_norm": 0.24462400376796722, "learning_rate": 3.242811814008451e-05, "loss": 0.1734, "num_tokens": 450380287.0, "step": 3445 }, { "epoch": 1.375099760574621, "grad_norm": 0.358814001083374, "learning_rate": 3.241846077617048e-05, "loss": 0.2594, "num_tokens": 450511359.0, "step": 3446 }, { "epoch": 1.3754988028731046, "grad_norm": 0.3109686076641083, "learning_rate": 3.2408802460580134e-05, "loss": 0.2245, "num_tokens": 450642431.0, "step": 3447 }, { "epoch": 1.3758978451715882, "grad_norm": 0.26507338881492615, "learning_rate": 3.2399143195182273e-05, "loss": 0.2196, "num_tokens": 450773503.0, "step": 3448 }, { "epoch": 1.3762968874700718, "grad_norm": 0.26267075538635254, "learning_rate": 3.238948298184587e-05, "loss": 0.2191, "num_tokens": 450904575.0, "step": 3449 }, { "epoch": 1.3766959297685555, "grad_norm": 0.28380683064460754, "learning_rate": 3.2379821822440086e-05, "loss": 0.2131, "num_tokens": 451035647.0, "step": 3450 }, { "epoch": 1.3770949720670391, "grad_norm": 0.25237780809402466, "learning_rate": 3.237015971883427e-05, "loss": 0.2086, "num_tokens": 451166719.0, "step": 3451 }, { "epoch": 1.3774940143655228, "grad_norm": 0.24168649315834045, "learning_rate": 3.236049667289795e-05, "loss": 0.2014, "num_tokens": 451297791.0, "step": 3452 }, { "epoch": 1.3778930566640064, "grad_norm": 0.261209100484848, "learning_rate": 3.235083268650084e-05, "loss": 0.1952, "num_tokens": 451428863.0, "step": 3453 }, { "epoch": 1.37829209896249, "grad_norm": 0.3036017119884491, "learning_rate": 3.234116776151281e-05, "loss": 0.2092, "num_tokens": 451559935.0, "step": 3454 }, { "epoch": 1.3786911412609737, "grad_norm": 0.2547227442264557, "learning_rate": 3.233150189980395e-05, "loss": 0.2034, "num_tokens": 451691007.0, "step": 3455 }, { "epoch": 1.3790901835594573, "grad_norm": 0.2642313838005066, "learning_rate": 3.2321835103244516e-05, "loss": 0.2034, "num_tokens": 451822079.0, "step": 3456 }, { "epoch": 1.379489225857941, "grad_norm": 0.2925601303577423, "learning_rate": 3.231216737370494e-05, "loss": 0.201, "num_tokens": 451953151.0, "step": 3457 }, { "epoch": 1.3798882681564246, "grad_norm": 0.27874577045440674, "learning_rate": 3.230249871305583e-05, "loss": 0.223, "num_tokens": 452084223.0, "step": 3458 }, { "epoch": 1.3802873104549083, "grad_norm": 0.2743988633155823, "learning_rate": 3.229282912316799e-05, "loss": 0.1891, "num_tokens": 452215295.0, "step": 3459 }, { "epoch": 1.380686352753392, "grad_norm": 0.2532285749912262, "learning_rate": 3.2283158605912396e-05, "loss": 0.1893, "num_tokens": 452346367.0, "step": 3460 }, { "epoch": 1.3810853950518756, "grad_norm": 0.27407780289649963, "learning_rate": 3.2273487163160195e-05, "loss": 0.2195, "num_tokens": 452477439.0, "step": 3461 }, { "epoch": 1.3814844373503592, "grad_norm": 0.2786031663417816, "learning_rate": 3.2263814796782724e-05, "loss": 0.2479, "num_tokens": 452608511.0, "step": 3462 }, { "epoch": 1.3818834796488428, "grad_norm": 0.2593863606452942, "learning_rate": 3.2254141508651497e-05, "loss": 0.1946, "num_tokens": 452739583.0, "step": 3463 }, { "epoch": 1.3822825219473265, "grad_norm": 0.2702851891517639, "learning_rate": 3.224446730063821e-05, "loss": 0.2136, "num_tokens": 452870655.0, "step": 3464 }, { "epoch": 1.3826815642458101, "grad_norm": 0.2749949097633362, "learning_rate": 3.2234792174614734e-05, "loss": 0.209, "num_tokens": 453001727.0, "step": 3465 }, { "epoch": 1.3830806065442938, "grad_norm": 0.2843162417411804, "learning_rate": 3.22251161324531e-05, "loss": 0.2243, "num_tokens": 453132799.0, "step": 3466 }, { "epoch": 1.3834796488427774, "grad_norm": 0.37601524591445923, "learning_rate": 3.221543917602554e-05, "loss": 0.286, "num_tokens": 453248094.0, "step": 3467 }, { "epoch": 1.383878691141261, "grad_norm": 0.27650153636932373, "learning_rate": 3.220576130720446e-05, "loss": 0.1835, "num_tokens": 453379166.0, "step": 3468 }, { "epoch": 1.3842777334397447, "grad_norm": 0.27639931440353394, "learning_rate": 3.2196082527862434e-05, "loss": 0.2555, "num_tokens": 453510238.0, "step": 3469 }, { "epoch": 1.3846767757382281, "grad_norm": 0.2570118010044098, "learning_rate": 3.218640283987222e-05, "loss": 0.1718, "num_tokens": 453641310.0, "step": 3470 }, { "epoch": 1.3850758180367118, "grad_norm": 0.2507155239582062, "learning_rate": 3.217672224510673e-05, "loss": 0.1786, "num_tokens": 453772382.0, "step": 3471 }, { "epoch": 1.3854748603351954, "grad_norm": 0.2848343253135681, "learning_rate": 3.216704074543909e-05, "loss": 0.2399, "num_tokens": 453903454.0, "step": 3472 }, { "epoch": 1.385873902633679, "grad_norm": 0.27149197459220886, "learning_rate": 3.2157358342742575e-05, "loss": 0.2157, "num_tokens": 454034526.0, "step": 3473 }, { "epoch": 1.3862729449321627, "grad_norm": 0.27238497138023376, "learning_rate": 3.2147675038890625e-05, "loss": 0.1827, "num_tokens": 454165598.0, "step": 3474 }, { "epoch": 1.3866719872306463, "grad_norm": 0.2801794707775116, "learning_rate": 3.213799083575689e-05, "loss": 0.2029, "num_tokens": 454296670.0, "step": 3475 }, { "epoch": 1.38707102952913, "grad_norm": 0.32965630292892456, "learning_rate": 3.212830573521516e-05, "loss": 0.2425, "num_tokens": 454427742.0, "step": 3476 }, { "epoch": 1.3874700718276136, "grad_norm": 0.26935505867004395, "learning_rate": 3.211861973913941e-05, "loss": 0.2048, "num_tokens": 454558814.0, "step": 3477 }, { "epoch": 1.3878691141260973, "grad_norm": 0.28447040915489197, "learning_rate": 3.21089328494038e-05, "loss": 0.2216, "num_tokens": 454689886.0, "step": 3478 }, { "epoch": 1.388268156424581, "grad_norm": 0.2687246799468994, "learning_rate": 3.209924506788265e-05, "loss": 0.2128, "num_tokens": 454820958.0, "step": 3479 }, { "epoch": 1.3886671987230645, "grad_norm": 0.2636450231075287, "learning_rate": 3.208955639645044e-05, "loss": 0.2225, "num_tokens": 454952030.0, "step": 3480 }, { "epoch": 1.3890662410215482, "grad_norm": 0.2516140043735504, "learning_rate": 3.207986683698187e-05, "loss": 0.2003, "num_tokens": 455083102.0, "step": 3481 }, { "epoch": 1.3894652833200318, "grad_norm": 0.2953377962112427, "learning_rate": 3.2070176391351744e-05, "loss": 0.2626, "num_tokens": 455214174.0, "step": 3482 }, { "epoch": 1.3898643256185155, "grad_norm": 0.27770692110061646, "learning_rate": 3.20604850614351e-05, "loss": 0.2184, "num_tokens": 455345246.0, "step": 3483 }, { "epoch": 1.3902633679169991, "grad_norm": 0.291289746761322, "learning_rate": 3.20507928491071e-05, "loss": 0.2335, "num_tokens": 455476318.0, "step": 3484 }, { "epoch": 1.3906624102154828, "grad_norm": 0.26569634675979614, "learning_rate": 3.204109975624311e-05, "loss": 0.2172, "num_tokens": 455607390.0, "step": 3485 }, { "epoch": 1.3910614525139664, "grad_norm": 0.2814808785915375, "learning_rate": 3.2031405784718644e-05, "loss": 0.206, "num_tokens": 455738462.0, "step": 3486 }, { "epoch": 1.39146049481245, "grad_norm": 0.34411343932151794, "learning_rate": 3.202171093640941e-05, "loss": 0.2532, "num_tokens": 455869534.0, "step": 3487 }, { "epoch": 1.3918595371109337, "grad_norm": 0.24700002372264862, "learning_rate": 3.2012015213191246e-05, "loss": 0.1921, "num_tokens": 456000606.0, "step": 3488 }, { "epoch": 1.3922585794094173, "grad_norm": 0.30019474029541016, "learning_rate": 3.2002318616940214e-05, "loss": 0.2326, "num_tokens": 456131678.0, "step": 3489 }, { "epoch": 1.392657621707901, "grad_norm": 0.2662086486816406, "learning_rate": 3.199262114953249e-05, "loss": 0.2092, "num_tokens": 456262750.0, "step": 3490 }, { "epoch": 1.3930566640063846, "grad_norm": 0.2365063726902008, "learning_rate": 3.1982922812844456e-05, "loss": 0.1832, "num_tokens": 456393822.0, "step": 3491 }, { "epoch": 1.3934557063048683, "grad_norm": 0.27106553316116333, "learning_rate": 3.197322360875264e-05, "loss": 0.2023, "num_tokens": 456524894.0, "step": 3492 }, { "epoch": 1.393854748603352, "grad_norm": 0.26766982674598694, "learning_rate": 3.196352353913377e-05, "loss": 0.1972, "num_tokens": 456655966.0, "step": 3493 }, { "epoch": 1.3942537909018355, "grad_norm": 0.28061702847480774, "learning_rate": 3.1953822605864696e-05, "loss": 0.2297, "num_tokens": 456787038.0, "step": 3494 }, { "epoch": 1.3946528332003192, "grad_norm": 0.25629574060440063, "learning_rate": 3.1944120810822464e-05, "loss": 0.1873, "num_tokens": 456918110.0, "step": 3495 }, { "epoch": 1.3950518754988028, "grad_norm": 0.27620401978492737, "learning_rate": 3.193441815588429e-05, "loss": 0.2184, "num_tokens": 457049182.0, "step": 3496 }, { "epoch": 1.3954509177972865, "grad_norm": 0.24993787705898285, "learning_rate": 3.1924714642927525e-05, "loss": 0.1973, "num_tokens": 457180254.0, "step": 3497 }, { "epoch": 1.3958499600957701, "grad_norm": 0.32927316427230835, "learning_rate": 3.1915010273829735e-05, "loss": 0.2385, "num_tokens": 457311326.0, "step": 3498 }, { "epoch": 1.3962490023942538, "grad_norm": 0.2592323124408722, "learning_rate": 3.1905305050468616e-05, "loss": 0.1912, "num_tokens": 457442398.0, "step": 3499 }, { "epoch": 1.3966480446927374, "grad_norm": 0.2746477723121643, "learning_rate": 3.189559897472203e-05, "loss": 0.1937, "num_tokens": 457573470.0, "step": 3500 }, { "epoch": 1.397047086991221, "grad_norm": 0.31359001994132996, "learning_rate": 3.1885892048468015e-05, "loss": 0.2778, "num_tokens": 457704542.0, "step": 3501 }, { "epoch": 1.3974461292897047, "grad_norm": 0.2779493033885956, "learning_rate": 3.187618427358477e-05, "loss": 0.2321, "num_tokens": 457835614.0, "step": 3502 }, { "epoch": 1.3978451715881883, "grad_norm": 0.2731007933616638, "learning_rate": 3.1866475651950664e-05, "loss": 0.1941, "num_tokens": 457966686.0, "step": 3503 }, { "epoch": 1.398244213886672, "grad_norm": 0.26674094796180725, "learning_rate": 3.185676618544424e-05, "loss": 0.2302, "num_tokens": 458097758.0, "step": 3504 }, { "epoch": 1.3986432561851556, "grad_norm": 0.27240726351737976, "learning_rate": 3.184705587594415e-05, "loss": 0.1615, "num_tokens": 458228830.0, "step": 3505 }, { "epoch": 1.3990422984836393, "grad_norm": 0.29221564531326294, "learning_rate": 3.183734472532928e-05, "loss": 0.2331, "num_tokens": 458359902.0, "step": 3506 }, { "epoch": 1.399441340782123, "grad_norm": 0.2689189016819, "learning_rate": 3.182763273547864e-05, "loss": 0.2039, "num_tokens": 458490974.0, "step": 3507 }, { "epoch": 1.3998403830806065, "grad_norm": 0.27896443009376526, "learning_rate": 3.181791990827141e-05, "loss": 0.2169, "num_tokens": 458622046.0, "step": 3508 }, { "epoch": 1.4002394253790902, "grad_norm": 0.2574767768383026, "learning_rate": 3.180820624558692e-05, "loss": 0.1881, "num_tokens": 458753118.0, "step": 3509 }, { "epoch": 1.4006384676775738, "grad_norm": 0.2731451988220215, "learning_rate": 3.179849174930469e-05, "loss": 0.1934, "num_tokens": 458884190.0, "step": 3510 }, { "epoch": 1.4010375099760575, "grad_norm": 0.31752490997314453, "learning_rate": 3.178877642130437e-05, "loss": 0.2461, "num_tokens": 459015262.0, "step": 3511 }, { "epoch": 1.4014365522745411, "grad_norm": 0.287647008895874, "learning_rate": 3.1779060263465796e-05, "loss": 0.1895, "num_tokens": 459130270.0, "step": 3512 }, { "epoch": 1.4018355945730248, "grad_norm": 0.2730090916156769, "learning_rate": 3.1769343277668945e-05, "loss": 0.2311, "num_tokens": 459261342.0, "step": 3513 }, { "epoch": 1.4022346368715084, "grad_norm": 0.26828134059906006, "learning_rate": 3.175962546579398e-05, "loss": 0.1947, "num_tokens": 459392414.0, "step": 3514 }, { "epoch": 1.402633679169992, "grad_norm": 0.2725279629230499, "learning_rate": 3.174990682972119e-05, "loss": 0.2331, "num_tokens": 459523486.0, "step": 3515 }, { "epoch": 1.4030327214684757, "grad_norm": 0.2773493826389313, "learning_rate": 3.1740187371331034e-05, "loss": 0.2226, "num_tokens": 459654558.0, "step": 3516 }, { "epoch": 1.4034317637669593, "grad_norm": 0.2515292763710022, "learning_rate": 3.1730467092504164e-05, "loss": 0.1936, "num_tokens": 459785630.0, "step": 3517 }, { "epoch": 1.403830806065443, "grad_norm": 0.2334597259759903, "learning_rate": 3.1720745995121336e-05, "loss": 0.1727, "num_tokens": 459916702.0, "step": 3518 }, { "epoch": 1.4042298483639266, "grad_norm": 0.2978777289390564, "learning_rate": 3.171102408106351e-05, "loss": 0.2328, "num_tokens": 460047774.0, "step": 3519 }, { "epoch": 1.4046288906624103, "grad_norm": 0.24765312671661377, "learning_rate": 3.170130135221177e-05, "loss": 0.1727, "num_tokens": 460178846.0, "step": 3520 }, { "epoch": 1.405027932960894, "grad_norm": 0.3047025799751282, "learning_rate": 3.1691577810447396e-05, "loss": 0.2245, "num_tokens": 460309918.0, "step": 3521 }, { "epoch": 1.4054269752593775, "grad_norm": 0.2721783518791199, "learning_rate": 3.1681853457651777e-05, "loss": 0.2291, "num_tokens": 460440990.0, "step": 3522 }, { "epoch": 1.4058260175578612, "grad_norm": 0.24662384390830994, "learning_rate": 3.167212829570651e-05, "loss": 0.1708, "num_tokens": 460572062.0, "step": 3523 }, { "epoch": 1.4062250598563448, "grad_norm": 0.25717607140541077, "learning_rate": 3.16624023264933e-05, "loss": 0.1961, "num_tokens": 460703134.0, "step": 3524 }, { "epoch": 1.4066241021548285, "grad_norm": 0.3208004832267761, "learning_rate": 3.1652675551894035e-05, "loss": 0.2615, "num_tokens": 460834206.0, "step": 3525 }, { "epoch": 1.4070231444533121, "grad_norm": 0.3395702838897705, "learning_rate": 3.1642947973790774e-05, "loss": 0.2021, "num_tokens": 460965278.0, "step": 3526 }, { "epoch": 1.4074221867517958, "grad_norm": 0.2512945234775543, "learning_rate": 3.163321959406569e-05, "loss": 0.1742, "num_tokens": 461096350.0, "step": 3527 }, { "epoch": 1.4078212290502794, "grad_norm": 0.2735863924026489, "learning_rate": 3.162349041460115e-05, "loss": 0.2172, "num_tokens": 461227422.0, "step": 3528 }, { "epoch": 1.408220271348763, "grad_norm": 0.25129377841949463, "learning_rate": 3.161376043727966e-05, "loss": 0.1725, "num_tokens": 461358494.0, "step": 3529 }, { "epoch": 1.4086193136472467, "grad_norm": 0.24500823020935059, "learning_rate": 3.160402966398386e-05, "loss": 0.1791, "num_tokens": 461489566.0, "step": 3530 }, { "epoch": 1.4090183559457303, "grad_norm": 0.31312039494514465, "learning_rate": 3.1594298096596584e-05, "loss": 0.2661, "num_tokens": 461620638.0, "step": 3531 }, { "epoch": 1.409417398244214, "grad_norm": 0.25891053676605225, "learning_rate": 3.15845657370008e-05, "loss": 0.1823, "num_tokens": 461751710.0, "step": 3532 }, { "epoch": 1.4098164405426976, "grad_norm": 0.3156015872955322, "learning_rate": 3.157483258707961e-05, "loss": 0.2365, "num_tokens": 461882782.0, "step": 3533 }, { "epoch": 1.4102154828411813, "grad_norm": 0.24231845140457153, "learning_rate": 3.15650986487163e-05, "loss": 0.1586, "num_tokens": 462013854.0, "step": 3534 }, { "epoch": 1.410614525139665, "grad_norm": 0.26890817284584045, "learning_rate": 3.1555363923794303e-05, "loss": 0.2024, "num_tokens": 462144926.0, "step": 3535 }, { "epoch": 1.4110135674381485, "grad_norm": 0.27809154987335205, "learning_rate": 3.154562841419718e-05, "loss": 0.2309, "num_tokens": 462275998.0, "step": 3536 }, { "epoch": 1.4114126097366322, "grad_norm": 0.2325095236301422, "learning_rate": 3.1535892121808687e-05, "loss": 0.1489, "num_tokens": 462407070.0, "step": 3537 }, { "epoch": 1.4118116520351158, "grad_norm": 0.3105730414390564, "learning_rate": 3.152615504851268e-05, "loss": 0.2455, "num_tokens": 462538142.0, "step": 3538 }, { "epoch": 1.4122106943335995, "grad_norm": 0.3193943500518799, "learning_rate": 3.15164171961932e-05, "loss": 0.2454, "num_tokens": 462653813.0, "step": 3539 }, { "epoch": 1.4126097366320831, "grad_norm": 0.274627685546875, "learning_rate": 3.150667856673444e-05, "loss": 0.2174, "num_tokens": 462784885.0, "step": 3540 }, { "epoch": 1.4130087789305668, "grad_norm": 0.312776118516922, "learning_rate": 3.149693916202072e-05, "loss": 0.1521, "num_tokens": 462915957.0, "step": 3541 }, { "epoch": 1.4134078212290504, "grad_norm": 0.2992699444293976, "learning_rate": 3.148719898393654e-05, "loss": 0.2324, "num_tokens": 463047029.0, "step": 3542 }, { "epoch": 1.413806863527534, "grad_norm": 0.27984419465065, "learning_rate": 3.147745803436652e-05, "loss": 0.2038, "num_tokens": 463178101.0, "step": 3543 }, { "epoch": 1.4142059058260177, "grad_norm": 0.2803422212600708, "learning_rate": 3.146771631519545e-05, "loss": 0.2226, "num_tokens": 463309173.0, "step": 3544 }, { "epoch": 1.4146049481245013, "grad_norm": 0.2590304911136627, "learning_rate": 3.1457973828308256e-05, "loss": 0.2026, "num_tokens": 463440245.0, "step": 3545 }, { "epoch": 1.415003990422985, "grad_norm": 0.31192460656166077, "learning_rate": 3.144823057559003e-05, "loss": 0.2338, "num_tokens": 463571317.0, "step": 3546 }, { "epoch": 1.4154030327214686, "grad_norm": 0.2765780985355377, "learning_rate": 3.1438486558925975e-05, "loss": 0.1763, "num_tokens": 463702389.0, "step": 3547 }, { "epoch": 1.415802075019952, "grad_norm": 0.2849842607975006, "learning_rate": 3.14287417802015e-05, "loss": 0.2148, "num_tokens": 463833461.0, "step": 3548 }, { "epoch": 1.4162011173184357, "grad_norm": 0.293760746717453, "learning_rate": 3.141899624130211e-05, "loss": 0.2281, "num_tokens": 463964533.0, "step": 3549 }, { "epoch": 1.4166001596169193, "grad_norm": 0.2740161418914795, "learning_rate": 3.1409249944113485e-05, "loss": 0.1973, "num_tokens": 464095605.0, "step": 3550 }, { "epoch": 1.416999201915403, "grad_norm": 0.25783202052116394, "learning_rate": 3.139950289052143e-05, "loss": 0.1867, "num_tokens": 464226677.0, "step": 3551 }, { "epoch": 1.4173982442138866, "grad_norm": 0.2597563564777374, "learning_rate": 3.1389755082411914e-05, "loss": 0.1979, "num_tokens": 464357749.0, "step": 3552 }, { "epoch": 1.4177972865123702, "grad_norm": 0.29385727643966675, "learning_rate": 3.1380006521671056e-05, "loss": 0.2467, "num_tokens": 464488821.0, "step": 3553 }, { "epoch": 1.418196328810854, "grad_norm": 0.282897025346756, "learning_rate": 3.13702572101851e-05, "loss": 0.2343, "num_tokens": 464619893.0, "step": 3554 }, { "epoch": 1.4185953711093375, "grad_norm": 0.2538983225822449, "learning_rate": 3.1360507149840465e-05, "loss": 0.1776, "num_tokens": 464750965.0, "step": 3555 }, { "epoch": 1.4189944134078212, "grad_norm": 0.27422022819519043, "learning_rate": 3.135075634252367e-05, "loss": 0.2367, "num_tokens": 464882037.0, "step": 3556 }, { "epoch": 1.4193934557063048, "grad_norm": 0.29161858558654785, "learning_rate": 3.134100479012143e-05, "loss": 0.2398, "num_tokens": 465013109.0, "step": 3557 }, { "epoch": 1.4197924980047885, "grad_norm": 0.27361252903938293, "learning_rate": 3.1331252494520553e-05, "loss": 0.2309, "num_tokens": 465144181.0, "step": 3558 }, { "epoch": 1.420191540303272, "grad_norm": 0.2684592306613922, "learning_rate": 3.132149945760804e-05, "loss": 0.2347, "num_tokens": 465275253.0, "step": 3559 }, { "epoch": 1.4205905826017557, "grad_norm": 0.28720805048942566, "learning_rate": 3.1311745681271015e-05, "loss": 0.2526, "num_tokens": 465406325.0, "step": 3560 }, { "epoch": 1.4209896249002394, "grad_norm": 0.2501641511917114, "learning_rate": 3.1301991167396724e-05, "loss": 0.1821, "num_tokens": 465537397.0, "step": 3561 }, { "epoch": 1.421388667198723, "grad_norm": 0.2618212401866913, "learning_rate": 3.1292235917872583e-05, "loss": 0.2148, "num_tokens": 465668469.0, "step": 3562 }, { "epoch": 1.4217877094972067, "grad_norm": 0.27313002943992615, "learning_rate": 3.128247993458614e-05, "loss": 0.2252, "num_tokens": 465799541.0, "step": 3563 }, { "epoch": 1.4221867517956903, "grad_norm": 0.2763058841228485, "learning_rate": 3.127272321942509e-05, "loss": 0.1894, "num_tokens": 465930613.0, "step": 3564 }, { "epoch": 1.422585794094174, "grad_norm": 0.27954477071762085, "learning_rate": 3.126296577427727e-05, "loss": 0.1977, "num_tokens": 466061685.0, "step": 3565 }, { "epoch": 1.4229848363926576, "grad_norm": 0.31183531880378723, "learning_rate": 3.125320760103065e-05, "loss": 0.1959, "num_tokens": 466192757.0, "step": 3566 }, { "epoch": 1.4233838786911412, "grad_norm": 0.31387361884117126, "learning_rate": 3.1243448701573346e-05, "loss": 0.2434, "num_tokens": 466323829.0, "step": 3567 }, { "epoch": 1.423782920989625, "grad_norm": 0.29337042570114136, "learning_rate": 3.123368907779362e-05, "loss": 0.2184, "num_tokens": 466454901.0, "step": 3568 }, { "epoch": 1.4241819632881085, "grad_norm": 0.2299572080373764, "learning_rate": 3.122392873157986e-05, "loss": 0.1604, "num_tokens": 466585973.0, "step": 3569 }, { "epoch": 1.4245810055865922, "grad_norm": 0.287905216217041, "learning_rate": 3.1214167664820595e-05, "loss": 0.2267, "num_tokens": 466717045.0, "step": 3570 }, { "epoch": 1.4249800478850758, "grad_norm": 0.26598668098449707, "learning_rate": 3.1204405879404516e-05, "loss": 0.2053, "num_tokens": 466848117.0, "step": 3571 }, { "epoch": 1.4253790901835595, "grad_norm": 0.27707353234291077, "learning_rate": 3.1194643377220437e-05, "loss": 0.2241, "num_tokens": 466979189.0, "step": 3572 }, { "epoch": 1.425778132482043, "grad_norm": 0.30189937353134155, "learning_rate": 3.118488016015731e-05, "loss": 0.2218, "num_tokens": 467099245.0, "step": 3573 }, { "epoch": 1.4261771747805267, "grad_norm": 0.23559780418872833, "learning_rate": 3.11751162301042e-05, "loss": 0.1748, "num_tokens": 467230317.0, "step": 3574 }, { "epoch": 1.4265762170790104, "grad_norm": 0.2766824960708618, "learning_rate": 3.116535158895037e-05, "loss": 0.2003, "num_tokens": 467361389.0, "step": 3575 }, { "epoch": 1.426975259377494, "grad_norm": 0.30628058314323425, "learning_rate": 3.115558623858518e-05, "loss": 0.2531, "num_tokens": 467492461.0, "step": 3576 }, { "epoch": 1.4273743016759777, "grad_norm": 0.27551230788230896, "learning_rate": 3.114582018089812e-05, "loss": 0.1854, "num_tokens": 467623533.0, "step": 3577 }, { "epoch": 1.4277733439744613, "grad_norm": 0.33087190985679626, "learning_rate": 3.113605341777885e-05, "loss": 0.2264, "num_tokens": 467754605.0, "step": 3578 }, { "epoch": 1.428172386272945, "grad_norm": 0.24514663219451904, "learning_rate": 3.112628595111714e-05, "loss": 0.1539, "num_tokens": 467885677.0, "step": 3579 }, { "epoch": 1.4285714285714286, "grad_norm": 0.32663366198539734, "learning_rate": 3.111651778280289e-05, "loss": 0.2322, "num_tokens": 468016749.0, "step": 3580 }, { "epoch": 1.4289704708699122, "grad_norm": 0.2681521773338318, "learning_rate": 3.110674891472618e-05, "loss": 0.175, "num_tokens": 468147821.0, "step": 3581 }, { "epoch": 1.4293695131683959, "grad_norm": 0.2446098029613495, "learning_rate": 3.109697934877716e-05, "loss": 0.1747, "num_tokens": 468278893.0, "step": 3582 }, { "epoch": 1.4297685554668795, "grad_norm": 0.27826374769210815, "learning_rate": 3.1087209086846174e-05, "loss": 0.2303, "num_tokens": 468409965.0, "step": 3583 }, { "epoch": 1.4301675977653632, "grad_norm": 0.27860063314437866, "learning_rate": 3.107743813082367e-05, "loss": 0.2238, "num_tokens": 468541037.0, "step": 3584 }, { "epoch": 1.4305666400638468, "grad_norm": 0.2611236274242401, "learning_rate": 3.106766648260024e-05, "loss": 0.2249, "num_tokens": 468672109.0, "step": 3585 }, { "epoch": 1.4309656823623305, "grad_norm": 0.2791889011859894, "learning_rate": 3.105789414406659e-05, "loss": 0.199, "num_tokens": 468803181.0, "step": 3586 }, { "epoch": 1.431364724660814, "grad_norm": 0.24324554204940796, "learning_rate": 3.10481211171136e-05, "loss": 0.1725, "num_tokens": 468934253.0, "step": 3587 }, { "epoch": 1.4317637669592977, "grad_norm": 0.2828904986381531, "learning_rate": 3.1038347403632246e-05, "loss": 0.2212, "num_tokens": 469065325.0, "step": 3588 }, { "epoch": 1.4321628092577814, "grad_norm": 0.2821882665157318, "learning_rate": 3.102857300551365e-05, "loss": 0.2115, "num_tokens": 469196397.0, "step": 3589 }, { "epoch": 1.432561851556265, "grad_norm": 0.3306630253791809, "learning_rate": 3.101879792464907e-05, "loss": 0.2688, "num_tokens": 469327469.0, "step": 3590 }, { "epoch": 1.4329608938547487, "grad_norm": 0.27289459109306335, "learning_rate": 3.10090221629299e-05, "loss": 0.1695, "num_tokens": 469458541.0, "step": 3591 }, { "epoch": 1.4333599361532323, "grad_norm": 0.26642507314682007, "learning_rate": 3.099924572224764e-05, "loss": 0.1981, "num_tokens": 469589613.0, "step": 3592 }, { "epoch": 1.433758978451716, "grad_norm": 0.3004327416419983, "learning_rate": 3.098946860449397e-05, "loss": 0.2135, "num_tokens": 469720685.0, "step": 3593 }, { "epoch": 1.4341580207501996, "grad_norm": 0.25541242957115173, "learning_rate": 3.097969081156065e-05, "loss": 0.1893, "num_tokens": 469851757.0, "step": 3594 }, { "epoch": 1.434557063048683, "grad_norm": 0.2788648009300232, "learning_rate": 3.0969912345339584e-05, "loss": 0.1947, "num_tokens": 469982829.0, "step": 3595 }, { "epoch": 1.4349561053471667, "grad_norm": 0.2853514552116394, "learning_rate": 3.0960133207722835e-05, "loss": 0.1813, "num_tokens": 470109121.0, "step": 3596 }, { "epoch": 1.4353551476456503, "grad_norm": 0.31364747881889343, "learning_rate": 3.095035340060256e-05, "loss": 0.2511, "num_tokens": 470240193.0, "step": 3597 }, { "epoch": 1.435754189944134, "grad_norm": 0.2619701623916626, "learning_rate": 3.0940572925871064e-05, "loss": 0.1682, "num_tokens": 470371265.0, "step": 3598 }, { "epoch": 1.4361532322426176, "grad_norm": 0.2555064857006073, "learning_rate": 3.093079178542079e-05, "loss": 0.164, "num_tokens": 470502337.0, "step": 3599 }, { "epoch": 1.4365522745411012, "grad_norm": 0.27491363883018494, "learning_rate": 3.092100998114428e-05, "loss": 0.1833, "num_tokens": 470633409.0, "step": 3600 }, { "epoch": 1.4369513168395849, "grad_norm": 0.273861289024353, "learning_rate": 3.091122751493423e-05, "loss": 0.2011, "num_tokens": 470764481.0, "step": 3601 }, { "epoch": 1.4373503591380685, "grad_norm": 0.288114994764328, "learning_rate": 3.090144438868345e-05, "loss": 0.1848, "num_tokens": 470895553.0, "step": 3602 }, { "epoch": 1.4377494014365522, "grad_norm": 0.2455291748046875, "learning_rate": 3.089166060428489e-05, "loss": 0.1676, "num_tokens": 471020418.0, "step": 3603 }, { "epoch": 1.4381484437350358, "grad_norm": 0.3802087604999542, "learning_rate": 3.0881876163631615e-05, "loss": 0.2461, "num_tokens": 471151490.0, "step": 3604 }, { "epoch": 1.4385474860335195, "grad_norm": 0.2504412531852722, "learning_rate": 3.0872091068616846e-05, "loss": 0.1751, "num_tokens": 471282562.0, "step": 3605 }, { "epoch": 1.438946528332003, "grad_norm": 0.2636655271053314, "learning_rate": 3.086230532113387e-05, "loss": 0.193, "num_tokens": 471413634.0, "step": 3606 }, { "epoch": 1.4393455706304867, "grad_norm": 0.28414273262023926, "learning_rate": 3.0852518923076176e-05, "loss": 0.2285, "num_tokens": 471544706.0, "step": 3607 }, { "epoch": 1.4397446129289704, "grad_norm": 0.31490272283554077, "learning_rate": 3.08427318763373e-05, "loss": 0.252, "num_tokens": 471675778.0, "step": 3608 }, { "epoch": 1.440143655227454, "grad_norm": 0.2710915505886078, "learning_rate": 3.083294418281099e-05, "loss": 0.221, "num_tokens": 471806850.0, "step": 3609 }, { "epoch": 1.4405426975259377, "grad_norm": 0.27974894642829895, "learning_rate": 3.0823155844391035e-05, "loss": 0.192, "num_tokens": 471937922.0, "step": 3610 }, { "epoch": 1.4409417398244213, "grad_norm": 0.2738189995288849, "learning_rate": 3.08133668629714e-05, "loss": 0.2291, "num_tokens": 472068994.0, "step": 3611 }, { "epoch": 1.441340782122905, "grad_norm": 0.27407434582710266, "learning_rate": 3.0803577240446174e-05, "loss": 0.2189, "num_tokens": 472200066.0, "step": 3612 }, { "epoch": 1.4417398244213886, "grad_norm": 0.26608550548553467, "learning_rate": 3.079378697870954e-05, "loss": 0.2053, "num_tokens": 472331138.0, "step": 3613 }, { "epoch": 1.4421388667198722, "grad_norm": 0.27923524379730225, "learning_rate": 3.078399607965584e-05, "loss": 0.2153, "num_tokens": 472462210.0, "step": 3614 }, { "epoch": 1.4425379090183559, "grad_norm": 0.3085688054561615, "learning_rate": 3.0774204545179494e-05, "loss": 0.258, "num_tokens": 472593282.0, "step": 3615 }, { "epoch": 1.4429369513168395, "grad_norm": 0.2670856714248657, "learning_rate": 3.0764412377175104e-05, "loss": 0.1953, "num_tokens": 472724354.0, "step": 3616 }, { "epoch": 1.4433359936153232, "grad_norm": 0.2905615270137787, "learning_rate": 3.0754619577537344e-05, "loss": 0.1815, "num_tokens": 472855426.0, "step": 3617 }, { "epoch": 1.4437350359138068, "grad_norm": 0.27160516381263733, "learning_rate": 3.074482614816104e-05, "loss": 0.1887, "num_tokens": 472986498.0, "step": 3618 }, { "epoch": 1.4441340782122905, "grad_norm": 0.2817628085613251, "learning_rate": 3.073503209094112e-05, "loss": 0.2152, "num_tokens": 473117570.0, "step": 3619 }, { "epoch": 1.444533120510774, "grad_norm": 0.2812592685222626, "learning_rate": 3.072523740777264e-05, "loss": 0.2199, "num_tokens": 473248642.0, "step": 3620 }, { "epoch": 1.4449321628092577, "grad_norm": 0.3052583932876587, "learning_rate": 3.07154421005508e-05, "loss": 0.2527, "num_tokens": 473379714.0, "step": 3621 }, { "epoch": 1.4453312051077414, "grad_norm": 0.28866681456565857, "learning_rate": 3.070564617117089e-05, "loss": 0.2222, "num_tokens": 473510786.0, "step": 3622 }, { "epoch": 1.445730247406225, "grad_norm": 0.2719711363315582, "learning_rate": 3.069584962152832e-05, "loss": 0.2275, "num_tokens": 473641858.0, "step": 3623 }, { "epoch": 1.4461292897047087, "grad_norm": 0.2708131670951843, "learning_rate": 3.068605245351864e-05, "loss": 0.2378, "num_tokens": 473772930.0, "step": 3624 }, { "epoch": 1.4465283320031923, "grad_norm": 0.26343458890914917, "learning_rate": 3.067625466903751e-05, "loss": 0.2319, "num_tokens": 473904002.0, "step": 3625 }, { "epoch": 1.446927374301676, "grad_norm": 0.26029351353645325, "learning_rate": 3.066645626998071e-05, "loss": 0.1399, "num_tokens": 474035074.0, "step": 3626 }, { "epoch": 1.4473264166001596, "grad_norm": 0.28380635380744934, "learning_rate": 3.065665725824414e-05, "loss": 0.2122, "num_tokens": 474166146.0, "step": 3627 }, { "epoch": 1.4477254588986432, "grad_norm": 0.27922147512435913, "learning_rate": 3.0646857635723816e-05, "loss": 0.2296, "num_tokens": 474297218.0, "step": 3628 }, { "epoch": 1.4481245011971269, "grad_norm": 0.3054995834827423, "learning_rate": 3.063705740431588e-05, "loss": 0.2006, "num_tokens": 474428290.0, "step": 3629 }, { "epoch": 1.4485235434956105, "grad_norm": 0.32033196091651917, "learning_rate": 3.0627256565916566e-05, "loss": 0.2501, "num_tokens": 474559362.0, "step": 3630 }, { "epoch": 1.4489225857940942, "grad_norm": 0.3694227337837219, "learning_rate": 3.061745512242227e-05, "loss": 0.2373, "num_tokens": 474690434.0, "step": 3631 }, { "epoch": 1.4493216280925778, "grad_norm": 0.28921911120414734, "learning_rate": 3.060765307572947e-05, "loss": 0.2283, "num_tokens": 474821506.0, "step": 3632 }, { "epoch": 1.4497206703910615, "grad_norm": 0.274308979511261, "learning_rate": 3.0597850427734756e-05, "loss": 0.2228, "num_tokens": 474952578.0, "step": 3633 }, { "epoch": 1.450119712689545, "grad_norm": 0.31110385060310364, "learning_rate": 3.058804718033488e-05, "loss": 0.2232, "num_tokens": 475083650.0, "step": 3634 }, { "epoch": 1.4505187549880287, "grad_norm": 0.27315181493759155, "learning_rate": 3.057824333542664e-05, "loss": 0.2307, "num_tokens": 475214722.0, "step": 3635 }, { "epoch": 1.4509177972865124, "grad_norm": 0.2785116136074066, "learning_rate": 3.056843889490703e-05, "loss": 0.1857, "num_tokens": 475345794.0, "step": 3636 }, { "epoch": 1.451316839584996, "grad_norm": 0.2712843716144562, "learning_rate": 3.0558633860673084e-05, "loss": 0.2372, "num_tokens": 475476866.0, "step": 3637 }, { "epoch": 1.4517158818834797, "grad_norm": 0.28672829270362854, "learning_rate": 3.0548828234622e-05, "loss": 0.2485, "num_tokens": 475607938.0, "step": 3638 }, { "epoch": 1.4521149241819633, "grad_norm": 0.2921507656574249, "learning_rate": 3.0539022018651076e-05, "loss": 0.2182, "num_tokens": 475739010.0, "step": 3639 }, { "epoch": 1.452513966480447, "grad_norm": 0.2608116865158081, "learning_rate": 3.0529215214657726e-05, "loss": 0.2164, "num_tokens": 475870082.0, "step": 3640 }, { "epoch": 1.4529130087789306, "grad_norm": 0.2692593038082123, "learning_rate": 3.051940782453946e-05, "loss": 0.2142, "num_tokens": 476001154.0, "step": 3641 }, { "epoch": 1.4533120510774142, "grad_norm": 0.26146724820137024, "learning_rate": 3.0509599850193933e-05, "loss": 0.1973, "num_tokens": 476132226.0, "step": 3642 }, { "epoch": 1.4537110933758979, "grad_norm": 0.26121464371681213, "learning_rate": 3.0499791293518886e-05, "loss": 0.1872, "num_tokens": 476263298.0, "step": 3643 }, { "epoch": 1.4541101356743815, "grad_norm": 0.29987290501594543, "learning_rate": 3.0489982156412195e-05, "loss": 0.1891, "num_tokens": 476394370.0, "step": 3644 }, { "epoch": 1.4545091779728652, "grad_norm": 0.24404871463775635, "learning_rate": 3.048017244077182e-05, "loss": 0.1388, "num_tokens": 476525442.0, "step": 3645 }, { "epoch": 1.4549082202713488, "grad_norm": 0.325107604265213, "learning_rate": 3.047036214849587e-05, "loss": 0.2321, "num_tokens": 476656514.0, "step": 3646 }, { "epoch": 1.4553072625698324, "grad_norm": 0.2822294533252716, "learning_rate": 3.046055128148252e-05, "loss": 0.2075, "num_tokens": 476787586.0, "step": 3647 }, { "epoch": 1.455706304868316, "grad_norm": 0.27316808700561523, "learning_rate": 3.04507398416301e-05, "loss": 0.2127, "num_tokens": 476918658.0, "step": 3648 }, { "epoch": 1.4561053471667997, "grad_norm": 0.28435018658638, "learning_rate": 3.044092783083703e-05, "loss": 0.2091, "num_tokens": 477049730.0, "step": 3649 }, { "epoch": 1.4565043894652834, "grad_norm": 0.29405272006988525, "learning_rate": 3.043111525100183e-05, "loss": 0.2384, "num_tokens": 477180802.0, "step": 3650 }, { "epoch": 1.456903431763767, "grad_norm": 0.2335299253463745, "learning_rate": 3.0421302104023163e-05, "loss": 0.1601, "num_tokens": 477311874.0, "step": 3651 }, { "epoch": 1.4573024740622507, "grad_norm": 0.29161855578422546, "learning_rate": 3.041148839179977e-05, "loss": 0.2018, "num_tokens": 477442946.0, "step": 3652 }, { "epoch": 1.4577015163607343, "grad_norm": 0.27354422211647034, "learning_rate": 3.040167411623051e-05, "loss": 0.1766, "num_tokens": 477574018.0, "step": 3653 }, { "epoch": 1.458100558659218, "grad_norm": 0.2554587423801422, "learning_rate": 3.039185927921436e-05, "loss": 0.1676, "num_tokens": 477705090.0, "step": 3654 }, { "epoch": 1.4584996009577016, "grad_norm": 0.2528599798679352, "learning_rate": 3.0382043882650397e-05, "loss": 0.1712, "num_tokens": 477836162.0, "step": 3655 }, { "epoch": 1.4588986432561852, "grad_norm": 0.29723379015922546, "learning_rate": 3.037222792843781e-05, "loss": 0.2186, "num_tokens": 477967234.0, "step": 3656 }, { "epoch": 1.4592976855546689, "grad_norm": 0.30962762236595154, "learning_rate": 3.0362411418475888e-05, "loss": 0.2225, "num_tokens": 478098306.0, "step": 3657 }, { "epoch": 1.4596967278531525, "grad_norm": 0.27094337344169617, "learning_rate": 3.0352594354664037e-05, "loss": 0.1991, "num_tokens": 478229378.0, "step": 3658 }, { "epoch": 1.4600957701516362, "grad_norm": 0.26345089077949524, "learning_rate": 3.0342776738901786e-05, "loss": 0.1942, "num_tokens": 478360450.0, "step": 3659 }, { "epoch": 1.4604948124501198, "grad_norm": 0.256896048784256, "learning_rate": 3.0332958573088727e-05, "loss": 0.2208, "num_tokens": 478491522.0, "step": 3660 }, { "epoch": 1.4608938547486034, "grad_norm": 0.2742554545402527, "learning_rate": 3.0323139859124588e-05, "loss": 0.1976, "num_tokens": 478622594.0, "step": 3661 }, { "epoch": 1.461292897047087, "grad_norm": 0.261782169342041, "learning_rate": 3.0313320598909206e-05, "loss": 0.2066, "num_tokens": 478753666.0, "step": 3662 }, { "epoch": 1.4616919393455707, "grad_norm": 0.250905841588974, "learning_rate": 3.0303500794342516e-05, "loss": 0.1757, "num_tokens": 478884738.0, "step": 3663 }, { "epoch": 1.4620909816440544, "grad_norm": 0.28067323565483093, "learning_rate": 3.029368044732455e-05, "loss": 0.1893, "num_tokens": 479015810.0, "step": 3664 }, { "epoch": 1.462490023942538, "grad_norm": 0.2634584307670593, "learning_rate": 3.0283859559755466e-05, "loss": 0.2015, "num_tokens": 479140858.0, "step": 3665 }, { "epoch": 1.4628890662410217, "grad_norm": 0.29652947187423706, "learning_rate": 3.027403813353551e-05, "loss": 0.2225, "num_tokens": 479271930.0, "step": 3666 }, { "epoch": 1.4632881085395053, "grad_norm": 0.25107359886169434, "learning_rate": 3.0264216170565023e-05, "loss": 0.1664, "num_tokens": 479403002.0, "step": 3667 }, { "epoch": 1.463687150837989, "grad_norm": 0.34190165996551514, "learning_rate": 3.025439367274449e-05, "loss": 0.228, "num_tokens": 479534074.0, "step": 3668 }, { "epoch": 1.4640861931364726, "grad_norm": 0.3451598584651947, "learning_rate": 3.0244570641974453e-05, "loss": 0.2311, "num_tokens": 479665146.0, "step": 3669 }, { "epoch": 1.4644852354349562, "grad_norm": 0.26447367668151855, "learning_rate": 3.0234747080155567e-05, "loss": 0.1929, "num_tokens": 479796218.0, "step": 3670 }, { "epoch": 1.4648842777334399, "grad_norm": 0.268831729888916, "learning_rate": 3.022492298918863e-05, "loss": 0.219, "num_tokens": 479927290.0, "step": 3671 }, { "epoch": 1.4652833200319235, "grad_norm": 0.3030756115913391, "learning_rate": 3.0215098370974494e-05, "loss": 0.2362, "num_tokens": 480058362.0, "step": 3672 }, { "epoch": 1.465682362330407, "grad_norm": 0.27646490931510925, "learning_rate": 3.0205273227414133e-05, "loss": 0.2068, "num_tokens": 480189434.0, "step": 3673 }, { "epoch": 1.4660814046288906, "grad_norm": 0.26762136816978455, "learning_rate": 3.0195447560408613e-05, "loss": 0.1869, "num_tokens": 480320506.0, "step": 3674 }, { "epoch": 1.4664804469273742, "grad_norm": 0.26831698417663574, "learning_rate": 3.0185621371859124e-05, "loss": 0.1729, "num_tokens": 480451578.0, "step": 3675 }, { "epoch": 1.4668794892258579, "grad_norm": 0.2966892719268799, "learning_rate": 3.017579466366693e-05, "loss": 0.2368, "num_tokens": 480582650.0, "step": 3676 }, { "epoch": 1.4672785315243415, "grad_norm": 0.27261045575141907, "learning_rate": 3.0165967437733412e-05, "loss": 0.1673, "num_tokens": 480713722.0, "step": 3677 }, { "epoch": 1.4676775738228252, "grad_norm": 0.28037315607070923, "learning_rate": 3.0156139695960045e-05, "loss": 0.204, "num_tokens": 480844794.0, "step": 3678 }, { "epoch": 1.4680766161213088, "grad_norm": 0.2851794958114624, "learning_rate": 3.014631144024842e-05, "loss": 0.2117, "num_tokens": 480975866.0, "step": 3679 }, { "epoch": 1.4684756584197924, "grad_norm": 0.3012806475162506, "learning_rate": 3.0136482672500195e-05, "loss": 0.2301, "num_tokens": 481106938.0, "step": 3680 }, { "epoch": 1.468874700718276, "grad_norm": 0.24827489256858826, "learning_rate": 3.012665339461713e-05, "loss": 0.1816, "num_tokens": 481238010.0, "step": 3681 }, { "epoch": 1.4692737430167597, "grad_norm": 0.2702547609806061, "learning_rate": 3.0116823608501143e-05, "loss": 0.1779, "num_tokens": 481369082.0, "step": 3682 }, { "epoch": 1.4696727853152434, "grad_norm": 0.27335479855537415, "learning_rate": 3.0106993316054172e-05, "loss": 0.2088, "num_tokens": 481500154.0, "step": 3683 }, { "epoch": 1.470071827613727, "grad_norm": 0.24235594272613525, "learning_rate": 3.0097162519178294e-05, "loss": 0.1217, "num_tokens": 481631226.0, "step": 3684 }, { "epoch": 1.4704708699122107, "grad_norm": 0.2293480783700943, "learning_rate": 3.008733121977569e-05, "loss": 0.1495, "num_tokens": 481762298.0, "step": 3685 }, { "epoch": 1.4708699122106943, "grad_norm": 0.30206066370010376, "learning_rate": 3.0077499419748602e-05, "loss": 0.2202, "num_tokens": 481893370.0, "step": 3686 }, { "epoch": 1.471268954509178, "grad_norm": 0.2521743178367615, "learning_rate": 3.0067667120999405e-05, "loss": 0.1854, "num_tokens": 482024442.0, "step": 3687 }, { "epoch": 1.4716679968076616, "grad_norm": 0.34465670585632324, "learning_rate": 3.005783432543056e-05, "loss": 0.2015, "num_tokens": 482155514.0, "step": 3688 }, { "epoch": 1.4720670391061452, "grad_norm": 0.3027593195438385, "learning_rate": 3.0048001034944612e-05, "loss": 0.1811, "num_tokens": 482286586.0, "step": 3689 }, { "epoch": 1.4724660814046289, "grad_norm": 0.30358636379241943, "learning_rate": 3.003816725144422e-05, "loss": 0.22, "num_tokens": 482417658.0, "step": 3690 }, { "epoch": 1.4728651237031125, "grad_norm": 0.27229583263397217, "learning_rate": 3.002833297683213e-05, "loss": 0.1873, "num_tokens": 482548730.0, "step": 3691 }, { "epoch": 1.4732641660015962, "grad_norm": 0.2801007926464081, "learning_rate": 3.0018498213011163e-05, "loss": 0.1961, "num_tokens": 482679802.0, "step": 3692 }, { "epoch": 1.4736632083000798, "grad_norm": 0.2804814577102661, "learning_rate": 3.000866296188428e-05, "loss": 0.2002, "num_tokens": 482810874.0, "step": 3693 }, { "epoch": 1.4740622505985634, "grad_norm": 0.2566916346549988, "learning_rate": 2.9998827225354503e-05, "loss": 0.1827, "num_tokens": 482941946.0, "step": 3694 }, { "epoch": 1.474461292897047, "grad_norm": 0.2712838649749756, "learning_rate": 2.9988991005324936e-05, "loss": 0.1729, "num_tokens": 483073018.0, "step": 3695 }, { "epoch": 1.4748603351955307, "grad_norm": 0.24502691626548767, "learning_rate": 2.9979154303698832e-05, "loss": 0.1724, "num_tokens": 483204090.0, "step": 3696 }, { "epoch": 1.4752593774940144, "grad_norm": 0.28386208415031433, "learning_rate": 2.996931712237946e-05, "loss": 0.2289, "num_tokens": 483335162.0, "step": 3697 }, { "epoch": 1.475658419792498, "grad_norm": 0.29188039898872375, "learning_rate": 2.9959479463270257e-05, "loss": 0.2233, "num_tokens": 483466234.0, "step": 3698 }, { "epoch": 1.4760574620909817, "grad_norm": 0.2743642032146454, "learning_rate": 2.99496413282747e-05, "loss": 0.1911, "num_tokens": 483597306.0, "step": 3699 }, { "epoch": 1.4764565043894653, "grad_norm": 0.31044909358024597, "learning_rate": 2.993980271929639e-05, "loss": 0.2548, "num_tokens": 483728378.0, "step": 3700 }, { "epoch": 1.476855546687949, "grad_norm": 0.2460850179195404, "learning_rate": 2.9929963638238982e-05, "loss": 0.1666, "num_tokens": 483859450.0, "step": 3701 }, { "epoch": 1.4772545889864326, "grad_norm": 0.24638396501541138, "learning_rate": 2.9920124087006273e-05, "loss": 0.1463, "num_tokens": 483990522.0, "step": 3702 }, { "epoch": 1.4776536312849162, "grad_norm": 0.24486136436462402, "learning_rate": 2.991028406750211e-05, "loss": 0.1709, "num_tokens": 484121594.0, "step": 3703 }, { "epoch": 1.4780526735833999, "grad_norm": 0.2791242003440857, "learning_rate": 2.9900443581630444e-05, "loss": 0.1714, "num_tokens": 484252666.0, "step": 3704 }, { "epoch": 1.4784517158818835, "grad_norm": 0.28470590710639954, "learning_rate": 2.989060263129533e-05, "loss": 0.1831, "num_tokens": 484383738.0, "step": 3705 }, { "epoch": 1.4788507581803672, "grad_norm": 0.28164464235305786, "learning_rate": 2.9880761218400883e-05, "loss": 0.1745, "num_tokens": 484514810.0, "step": 3706 }, { "epoch": 1.4792498004788508, "grad_norm": 0.2889656722545624, "learning_rate": 2.9870919344851346e-05, "loss": 0.201, "num_tokens": 484645882.0, "step": 3707 }, { "epoch": 1.4796488427773344, "grad_norm": 0.2669346034526825, "learning_rate": 2.9861077012551007e-05, "loss": 0.2127, "num_tokens": 484776954.0, "step": 3708 }, { "epoch": 1.480047885075818, "grad_norm": 0.27117016911506653, "learning_rate": 2.9851234223404272e-05, "loss": 0.1868, "num_tokens": 484908026.0, "step": 3709 }, { "epoch": 1.4804469273743017, "grad_norm": 0.2859686017036438, "learning_rate": 2.9841390979315642e-05, "loss": 0.2316, "num_tokens": 485039098.0, "step": 3710 }, { "epoch": 1.4808459696727854, "grad_norm": 0.279319167137146, "learning_rate": 2.9831547282189683e-05, "loss": 0.1991, "num_tokens": 485170170.0, "step": 3711 }, { "epoch": 1.481245011971269, "grad_norm": 0.2558983266353607, "learning_rate": 2.982170313393105e-05, "loss": 0.1864, "num_tokens": 485301242.0, "step": 3712 }, { "epoch": 1.4816440542697527, "grad_norm": 0.2873346507549286, "learning_rate": 2.9811858536444516e-05, "loss": 0.2264, "num_tokens": 485432314.0, "step": 3713 }, { "epoch": 1.4820430965682363, "grad_norm": 0.29918548464775085, "learning_rate": 2.98020134916349e-05, "loss": 0.2424, "num_tokens": 485563386.0, "step": 3714 }, { "epoch": 1.48244213886672, "grad_norm": 0.2691568434238434, "learning_rate": 2.9792168001407127e-05, "loss": 0.173, "num_tokens": 485694458.0, "step": 3715 }, { "epoch": 1.4828411811652036, "grad_norm": 0.26011887192726135, "learning_rate": 2.9782322067666218e-05, "loss": 0.1899, "num_tokens": 485825530.0, "step": 3716 }, { "epoch": 1.4832402234636872, "grad_norm": 0.27237144112586975, "learning_rate": 2.9772475692317265e-05, "loss": 0.1871, "num_tokens": 485956602.0, "step": 3717 }, { "epoch": 1.4836392657621709, "grad_norm": 0.25285935401916504, "learning_rate": 2.9762628877265452e-05, "loss": 0.1915, "num_tokens": 486087674.0, "step": 3718 }, { "epoch": 1.4840383080606545, "grad_norm": 0.26207634806632996, "learning_rate": 2.975278162441604e-05, "loss": 0.181, "num_tokens": 486218746.0, "step": 3719 }, { "epoch": 1.484437350359138, "grad_norm": 0.2980406880378723, "learning_rate": 2.9742933935674382e-05, "loss": 0.2231, "num_tokens": 486349818.0, "step": 3720 }, { "epoch": 1.4848363926576216, "grad_norm": 0.2776920795440674, "learning_rate": 2.973308581294592e-05, "loss": 0.202, "num_tokens": 486480890.0, "step": 3721 }, { "epoch": 1.4852354349561052, "grad_norm": 0.27714431285858154, "learning_rate": 2.9723237258136172e-05, "loss": 0.207, "num_tokens": 486611962.0, "step": 3722 }, { "epoch": 1.4856344772545889, "grad_norm": 0.27480706572532654, "learning_rate": 2.9713388273150727e-05, "loss": 0.1889, "num_tokens": 486743034.0, "step": 3723 }, { "epoch": 1.4860335195530725, "grad_norm": 0.32735100388526917, "learning_rate": 2.9703538859895304e-05, "loss": 0.2344, "num_tokens": 486874106.0, "step": 3724 }, { "epoch": 1.4864325618515561, "grad_norm": 0.27089229226112366, "learning_rate": 2.9693689020275634e-05, "loss": 0.1866, "num_tokens": 487005178.0, "step": 3725 }, { "epoch": 1.4868316041500398, "grad_norm": 0.2583320438861847, "learning_rate": 2.9683838756197597e-05, "loss": 0.1861, "num_tokens": 487136250.0, "step": 3726 }, { "epoch": 1.4872306464485234, "grad_norm": 0.2534624934196472, "learning_rate": 2.9673988069567115e-05, "loss": 0.1773, "num_tokens": 487267322.0, "step": 3727 }, { "epoch": 1.487629688747007, "grad_norm": 0.2575865089893341, "learning_rate": 2.9664136962290206e-05, "loss": 0.1768, "num_tokens": 487398394.0, "step": 3728 }, { "epoch": 1.4880287310454907, "grad_norm": 0.31613194942474365, "learning_rate": 2.9654285436272967e-05, "loss": 0.2177, "num_tokens": 487529466.0, "step": 3729 }, { "epoch": 1.4884277733439744, "grad_norm": 0.2761107385158539, "learning_rate": 2.9644433493421585e-05, "loss": 0.2177, "num_tokens": 487660538.0, "step": 3730 }, { "epoch": 1.488826815642458, "grad_norm": 0.2717515230178833, "learning_rate": 2.9634581135642315e-05, "loss": 0.1981, "num_tokens": 487791610.0, "step": 3731 }, { "epoch": 1.4892258579409416, "grad_norm": 0.2750496566295624, "learning_rate": 2.962472836484148e-05, "loss": 0.2197, "num_tokens": 487922682.0, "step": 3732 }, { "epoch": 1.4896249002394253, "grad_norm": 0.3074258863925934, "learning_rate": 2.961487518292552e-05, "loss": 0.2338, "num_tokens": 488053754.0, "step": 3733 }, { "epoch": 1.490023942537909, "grad_norm": 0.2859503924846649, "learning_rate": 2.960502159180093e-05, "loss": 0.2151, "num_tokens": 488181571.0, "step": 3734 }, { "epoch": 1.4904229848363926, "grad_norm": 0.27749237418174744, "learning_rate": 2.9595167593374278e-05, "loss": 0.2275, "num_tokens": 488312643.0, "step": 3735 }, { "epoch": 1.4908220271348762, "grad_norm": 0.2783571481704712, "learning_rate": 2.9585313189552233e-05, "loss": 0.245, "num_tokens": 488443715.0, "step": 3736 }, { "epoch": 1.4912210694333599, "grad_norm": 0.3042837679386139, "learning_rate": 2.9575458382241527e-05, "loss": 0.2487, "num_tokens": 488574787.0, "step": 3737 }, { "epoch": 1.4916201117318435, "grad_norm": 0.25264376401901245, "learning_rate": 2.956560317334897e-05, "loss": 0.1778, "num_tokens": 488705859.0, "step": 3738 }, { "epoch": 1.4920191540303271, "grad_norm": 0.27441975474357605, "learning_rate": 2.955574756478145e-05, "loss": 0.2163, "num_tokens": 488836931.0, "step": 3739 }, { "epoch": 1.4924181963288108, "grad_norm": 0.3765122890472412, "learning_rate": 2.9545891558445942e-05, "loss": 0.2645, "num_tokens": 488968003.0, "step": 3740 }, { "epoch": 1.4928172386272944, "grad_norm": 0.2885056734085083, "learning_rate": 2.9536035156249493e-05, "loss": 0.2313, "num_tokens": 489099075.0, "step": 3741 }, { "epoch": 1.493216280925778, "grad_norm": 0.257290780544281, "learning_rate": 2.952617836009921e-05, "loss": 0.1816, "num_tokens": 489230147.0, "step": 3742 }, { "epoch": 1.4936153232242617, "grad_norm": 0.24999181926250458, "learning_rate": 2.9516321171902306e-05, "loss": 0.1883, "num_tokens": 489361219.0, "step": 3743 }, { "epoch": 1.4940143655227454, "grad_norm": 0.27011942863464355, "learning_rate": 2.9506463593566058e-05, "loss": 0.1852, "num_tokens": 489492291.0, "step": 3744 }, { "epoch": 1.494413407821229, "grad_norm": 0.2728675603866577, "learning_rate": 2.9496605626997804e-05, "loss": 0.2348, "num_tokens": 489623363.0, "step": 3745 }, { "epoch": 1.4948124501197126, "grad_norm": 0.27456265687942505, "learning_rate": 2.948674727410497e-05, "loss": 0.2148, "num_tokens": 489754435.0, "step": 3746 }, { "epoch": 1.4952114924181963, "grad_norm": 0.2593889534473419, "learning_rate": 2.9476888536795068e-05, "loss": 0.1825, "num_tokens": 489885507.0, "step": 3747 }, { "epoch": 1.49561053471668, "grad_norm": 0.2800792157649994, "learning_rate": 2.9467029416975655e-05, "loss": 0.2211, "num_tokens": 490016579.0, "step": 3748 }, { "epoch": 1.4960095770151636, "grad_norm": 0.25801005959510803, "learning_rate": 2.9457169916554388e-05, "loss": 0.1726, "num_tokens": 490147651.0, "step": 3749 }, { "epoch": 1.4964086193136472, "grad_norm": 0.2643730938434601, "learning_rate": 2.944731003743899e-05, "loss": 0.1786, "num_tokens": 490278723.0, "step": 3750 }, { "epoch": 1.4968076616121309, "grad_norm": 0.3037342429161072, "learning_rate": 2.943744978153724e-05, "loss": 0.2182, "num_tokens": 490409795.0, "step": 3751 }, { "epoch": 1.4972067039106145, "grad_norm": 0.31813159584999084, "learning_rate": 2.942758915075703e-05, "loss": 0.233, "num_tokens": 490540867.0, "step": 3752 }, { "epoch": 1.4976057462090981, "grad_norm": 0.29036441445350647, "learning_rate": 2.9417728147006286e-05, "loss": 0.2324, "num_tokens": 490671939.0, "step": 3753 }, { "epoch": 1.4980047885075818, "grad_norm": 0.23308052122592926, "learning_rate": 2.9407866772193026e-05, "loss": 0.1558, "num_tokens": 490803011.0, "step": 3754 }, { "epoch": 1.4984038308060654, "grad_norm": 0.3000848889350891, "learning_rate": 2.9398005028225322e-05, "loss": 0.2595, "num_tokens": 490934083.0, "step": 3755 }, { "epoch": 1.498802873104549, "grad_norm": 0.25070640444755554, "learning_rate": 2.9388142917011342e-05, "loss": 0.1379, "num_tokens": 491065155.0, "step": 3756 }, { "epoch": 1.4992019154030327, "grad_norm": 0.2939552962779999, "learning_rate": 2.9378280440459306e-05, "loss": 0.2116, "num_tokens": 491196227.0, "step": 3757 }, { "epoch": 1.4996009577015164, "grad_norm": 0.3389385938644409, "learning_rate": 2.936841760047752e-05, "loss": 0.2334, "num_tokens": 491327299.0, "step": 3758 }, { "epoch": 1.5, "grad_norm": 0.3008688986301422, "learning_rate": 2.9358554398974342e-05, "loss": 0.266, "num_tokens": 491458371.0, "step": 3759 }, { "epoch": 1.5003990422984836, "grad_norm": 0.25962674617767334, "learning_rate": 2.934869083785821e-05, "loss": 0.1986, "num_tokens": 491589443.0, "step": 3760 }, { "epoch": 1.5007980845969673, "grad_norm": 0.2656900882720947, "learning_rate": 2.933882691903764e-05, "loss": 0.2021, "num_tokens": 491720515.0, "step": 3761 }, { "epoch": 1.501197126895451, "grad_norm": 0.2577642500400543, "learning_rate": 2.9328962644421192e-05, "loss": 0.1845, "num_tokens": 491851587.0, "step": 3762 }, { "epoch": 1.5015961691939346, "grad_norm": 0.25446292757987976, "learning_rate": 2.9319098015917534e-05, "loss": 0.1714, "num_tokens": 491982659.0, "step": 3763 }, { "epoch": 1.5019952114924182, "grad_norm": 0.2696647644042969, "learning_rate": 2.9309233035435363e-05, "loss": 0.1766, "num_tokens": 492113731.0, "step": 3764 }, { "epoch": 1.5023942537909019, "grad_norm": 0.24507446587085724, "learning_rate": 2.9299367704883467e-05, "loss": 0.1675, "num_tokens": 492244803.0, "step": 3765 }, { "epoch": 1.5027932960893855, "grad_norm": 0.29424959421157837, "learning_rate": 2.928950202617069e-05, "loss": 0.2329, "num_tokens": 492373209.0, "step": 3766 }, { "epoch": 1.5031923383878691, "grad_norm": 0.38026225566864014, "learning_rate": 2.9279636001205957e-05, "loss": 0.2205, "num_tokens": 492504281.0, "step": 3767 }, { "epoch": 1.5035913806863528, "grad_norm": 0.34447795152664185, "learning_rate": 2.9269769631898235e-05, "loss": 0.2206, "num_tokens": 492635353.0, "step": 3768 }, { "epoch": 1.5039904229848364, "grad_norm": 0.2637166678905487, "learning_rate": 2.92599029201566e-05, "loss": 0.1654, "num_tokens": 492766425.0, "step": 3769 }, { "epoch": 1.50438946528332, "grad_norm": 0.27366870641708374, "learning_rate": 2.9250035867890153e-05, "loss": 0.2131, "num_tokens": 492897497.0, "step": 3770 }, { "epoch": 1.5047885075818037, "grad_norm": 0.29787954688072205, "learning_rate": 2.924016847700808e-05, "loss": 0.2023, "num_tokens": 493028569.0, "step": 3771 }, { "epoch": 1.5051875498802874, "grad_norm": 0.29077062010765076, "learning_rate": 2.9230300749419627e-05, "loss": 0.2074, "num_tokens": 493159641.0, "step": 3772 }, { "epoch": 1.505586592178771, "grad_norm": 0.24644076824188232, "learning_rate": 2.92204326870341e-05, "loss": 0.1833, "num_tokens": 493290713.0, "step": 3773 }, { "epoch": 1.5059856344772546, "grad_norm": 0.25626707077026367, "learning_rate": 2.9210564291760894e-05, "loss": 0.1826, "num_tokens": 493421785.0, "step": 3774 }, { "epoch": 1.5063846767757383, "grad_norm": 0.26838573813438416, "learning_rate": 2.920069556550945e-05, "loss": 0.185, "num_tokens": 493552857.0, "step": 3775 }, { "epoch": 1.506783719074222, "grad_norm": 0.24372278153896332, "learning_rate": 2.9190826510189255e-05, "loss": 0.1621, "num_tokens": 493683929.0, "step": 3776 }, { "epoch": 1.5071827613727056, "grad_norm": 0.2566086947917938, "learning_rate": 2.91809571277099e-05, "loss": 0.1686, "num_tokens": 493815001.0, "step": 3777 }, { "epoch": 1.5075818036711892, "grad_norm": 0.2666475176811218, "learning_rate": 2.917108741998101e-05, "loss": 0.1921, "num_tokens": 493946073.0, "step": 3778 }, { "epoch": 1.5079808459696729, "grad_norm": 0.30271807312965393, "learning_rate": 2.9161217388912275e-05, "loss": 0.2292, "num_tokens": 494077145.0, "step": 3779 }, { "epoch": 1.5083798882681565, "grad_norm": 0.2649134397506714, "learning_rate": 2.9151347036413467e-05, "loss": 0.1869, "num_tokens": 494208217.0, "step": 3780 }, { "epoch": 1.5087789305666401, "grad_norm": 0.27352508902549744, "learning_rate": 2.9141476364394404e-05, "loss": 0.1921, "num_tokens": 494339289.0, "step": 3781 }, { "epoch": 1.5091779728651238, "grad_norm": 0.30826354026794434, "learning_rate": 2.9131605374764958e-05, "loss": 0.2354, "num_tokens": 494470361.0, "step": 3782 }, { "epoch": 1.5095770151636074, "grad_norm": 0.32138264179229736, "learning_rate": 2.9121734069435084e-05, "loss": 0.2323, "num_tokens": 494601433.0, "step": 3783 }, { "epoch": 1.509976057462091, "grad_norm": 0.2715482711791992, "learning_rate": 2.9111862450314788e-05, "loss": 0.1881, "num_tokens": 494732505.0, "step": 3784 }, { "epoch": 1.5103750997605747, "grad_norm": 0.25889867544174194, "learning_rate": 2.910199051931413e-05, "loss": 0.1827, "num_tokens": 494863577.0, "step": 3785 }, { "epoch": 1.5107741420590584, "grad_norm": 0.2848370671272278, "learning_rate": 2.909211827834325e-05, "loss": 0.2145, "num_tokens": 494994649.0, "step": 3786 }, { "epoch": 1.511173184357542, "grad_norm": 0.272899329662323, "learning_rate": 2.9082245729312312e-05, "loss": 0.1918, "num_tokens": 495125721.0, "step": 3787 }, { "epoch": 1.5115722266560256, "grad_norm": 0.2994959354400635, "learning_rate": 2.907237287413158e-05, "loss": 0.233, "num_tokens": 495256793.0, "step": 3788 }, { "epoch": 1.5119712689545093, "grad_norm": 0.2691904902458191, "learning_rate": 2.9062499714711356e-05, "loss": 0.1899, "num_tokens": 495387865.0, "step": 3789 }, { "epoch": 1.512370311252993, "grad_norm": 0.30693042278289795, "learning_rate": 2.9052626252961997e-05, "loss": 0.2142, "num_tokens": 495518937.0, "step": 3790 }, { "epoch": 1.5127693535514766, "grad_norm": 0.2619163990020752, "learning_rate": 2.9042752490793934e-05, "loss": 0.1884, "num_tokens": 495650009.0, "step": 3791 }, { "epoch": 1.5131683958499602, "grad_norm": 0.2872045934200287, "learning_rate": 2.903287843011765e-05, "loss": 0.2358, "num_tokens": 495781081.0, "step": 3792 }, { "epoch": 1.5135674381484439, "grad_norm": 0.25837141275405884, "learning_rate": 2.9023004072843672e-05, "loss": 0.2061, "num_tokens": 495912153.0, "step": 3793 }, { "epoch": 1.5139664804469275, "grad_norm": 0.23414722084999084, "learning_rate": 2.9013129420882605e-05, "loss": 0.1479, "num_tokens": 496043225.0, "step": 3794 }, { "epoch": 1.5143655227454111, "grad_norm": 0.246663436293602, "learning_rate": 2.9003254476145102e-05, "loss": 0.1229, "num_tokens": 496174297.0, "step": 3795 }, { "epoch": 1.5147645650438948, "grad_norm": 0.27752402424812317, "learning_rate": 2.8993379240541875e-05, "loss": 0.2068, "num_tokens": 496305369.0, "step": 3796 }, { "epoch": 1.5151636073423784, "grad_norm": 0.4167660176753998, "learning_rate": 2.898350371598368e-05, "loss": 0.272, "num_tokens": 496436441.0, "step": 3797 }, { "epoch": 1.515562649640862, "grad_norm": 0.2839062213897705, "learning_rate": 2.897362790438135e-05, "loss": 0.2365, "num_tokens": 496567513.0, "step": 3798 }, { "epoch": 1.5159616919393457, "grad_norm": 0.25588998198509216, "learning_rate": 2.896375180764575e-05, "loss": 0.1843, "num_tokens": 496698585.0, "step": 3799 }, { "epoch": 1.5163607342378294, "grad_norm": 0.2578417956829071, "learning_rate": 2.8953875427687832e-05, "loss": 0.1805, "num_tokens": 496829657.0, "step": 3800 }, { "epoch": 1.516759776536313, "grad_norm": 0.2589520215988159, "learning_rate": 2.8943998766418556e-05, "loss": 0.1843, "num_tokens": 496960729.0, "step": 3801 }, { "epoch": 1.5171588188347966, "grad_norm": 0.2767258584499359, "learning_rate": 2.8934121825748993e-05, "loss": 0.2097, "num_tokens": 497091801.0, "step": 3802 }, { "epoch": 1.5175578611332803, "grad_norm": 0.30120736360549927, "learning_rate": 2.892424460759022e-05, "loss": 0.2287, "num_tokens": 497222873.0, "step": 3803 }, { "epoch": 1.517956903431764, "grad_norm": 0.3003835380077362, "learning_rate": 2.8914367113853397e-05, "loss": 0.2446, "num_tokens": 497353945.0, "step": 3804 }, { "epoch": 1.5183559457302476, "grad_norm": 0.25545403361320496, "learning_rate": 2.8904489346449714e-05, "loss": 0.1906, "num_tokens": 497485017.0, "step": 3805 }, { "epoch": 1.518754988028731, "grad_norm": 0.27487918734550476, "learning_rate": 2.889461130729044e-05, "loss": 0.2204, "num_tokens": 497616089.0, "step": 3806 }, { "epoch": 1.5191540303272146, "grad_norm": 0.3028474748134613, "learning_rate": 2.8884732998286874e-05, "loss": 0.2553, "num_tokens": 497747161.0, "step": 3807 }, { "epoch": 1.5195530726256983, "grad_norm": 0.28559738397598267, "learning_rate": 2.8874854421350385e-05, "loss": 0.2223, "num_tokens": 497878233.0, "step": 3808 }, { "epoch": 1.519952114924182, "grad_norm": 0.2740841507911682, "learning_rate": 2.886497557839238e-05, "loss": 0.1804, "num_tokens": 498009305.0, "step": 3809 }, { "epoch": 1.5203511572226656, "grad_norm": 0.29095181822776794, "learning_rate": 2.8855096471324328e-05, "loss": 0.2168, "num_tokens": 498140377.0, "step": 3810 }, { "epoch": 1.5207501995211492, "grad_norm": 0.27267229557037354, "learning_rate": 2.8845217102057735e-05, "loss": 0.2341, "num_tokens": 498271449.0, "step": 3811 }, { "epoch": 1.5211492418196328, "grad_norm": 0.2642599046230316, "learning_rate": 2.883533747250417e-05, "loss": 0.2107, "num_tokens": 498402521.0, "step": 3812 }, { "epoch": 1.5215482841181165, "grad_norm": 0.24923431873321533, "learning_rate": 2.882545758457526e-05, "loss": 0.1701, "num_tokens": 498533593.0, "step": 3813 }, { "epoch": 1.5219473264166001, "grad_norm": 0.271694540977478, "learning_rate": 2.8815577440182663e-05, "loss": 0.1866, "num_tokens": 498664665.0, "step": 3814 }, { "epoch": 1.5223463687150838, "grad_norm": 0.24618367850780487, "learning_rate": 2.8805697041238093e-05, "loss": 0.1732, "num_tokens": 498795737.0, "step": 3815 }, { "epoch": 1.5227454110135674, "grad_norm": 0.2879878878593445, "learning_rate": 2.879581638965332e-05, "loss": 0.1911, "num_tokens": 498926809.0, "step": 3816 }, { "epoch": 1.523144453312051, "grad_norm": 0.2777702510356903, "learning_rate": 2.8785935487340153e-05, "loss": 0.1791, "num_tokens": 499057881.0, "step": 3817 }, { "epoch": 1.5235434956105347, "grad_norm": 0.2918568551540375, "learning_rate": 2.877605433621045e-05, "loss": 0.2272, "num_tokens": 499188953.0, "step": 3818 }, { "epoch": 1.5239425379090183, "grad_norm": 0.29494309425354004, "learning_rate": 2.8766172938176138e-05, "loss": 0.2342, "num_tokens": 499320025.0, "step": 3819 }, { "epoch": 1.524341580207502, "grad_norm": 0.26749491691589355, "learning_rate": 2.8756291295149155e-05, "loss": 0.197, "num_tokens": 499451097.0, "step": 3820 }, { "epoch": 1.5247406225059856, "grad_norm": 0.2703985273838043, "learning_rate": 2.8746409409041532e-05, "loss": 0.1746, "num_tokens": 499582169.0, "step": 3821 }, { "epoch": 1.5251396648044693, "grad_norm": 0.29885485768318176, "learning_rate": 2.8736527281765295e-05, "loss": 0.2355, "num_tokens": 499713241.0, "step": 3822 }, { "epoch": 1.525538707102953, "grad_norm": 0.2819456458091736, "learning_rate": 2.8726644915232558e-05, "loss": 0.208, "num_tokens": 499844313.0, "step": 3823 }, { "epoch": 1.5259377494014366, "grad_norm": 0.2583574652671814, "learning_rate": 2.8716762311355466e-05, "loss": 0.1833, "num_tokens": 499975385.0, "step": 3824 }, { "epoch": 1.5263367916999202, "grad_norm": 0.3162993788719177, "learning_rate": 2.870687947204621e-05, "loss": 0.2231, "num_tokens": 500106457.0, "step": 3825 }, { "epoch": 1.5267358339984038, "grad_norm": 0.2543315291404724, "learning_rate": 2.869699639921703e-05, "loss": 0.1383, "num_tokens": 500237529.0, "step": 3826 }, { "epoch": 1.5271348762968875, "grad_norm": 0.2743787467479706, "learning_rate": 2.86871130947802e-05, "loss": 0.1986, "num_tokens": 500368601.0, "step": 3827 }, { "epoch": 1.5275339185953711, "grad_norm": 0.2652971148490906, "learning_rate": 2.8677229560648055e-05, "loss": 0.1905, "num_tokens": 500499673.0, "step": 3828 }, { "epoch": 1.5279329608938548, "grad_norm": 0.25177353620529175, "learning_rate": 2.866734579873296e-05, "loss": 0.1699, "num_tokens": 500630745.0, "step": 3829 }, { "epoch": 1.5283320031923384, "grad_norm": 0.3133644461631775, "learning_rate": 2.8657461810947332e-05, "loss": 0.2204, "num_tokens": 500761817.0, "step": 3830 }, { "epoch": 1.528731045490822, "grad_norm": 0.27436184883117676, "learning_rate": 2.8647577599203647e-05, "loss": 0.1842, "num_tokens": 500892889.0, "step": 3831 }, { "epoch": 1.5291300877893057, "grad_norm": 0.2877216041088104, "learning_rate": 2.8637693165414387e-05, "loss": 0.2201, "num_tokens": 501023961.0, "step": 3832 }, { "epoch": 1.5295291300877893, "grad_norm": 0.24850089848041534, "learning_rate": 2.8627808511492103e-05, "loss": 0.1569, "num_tokens": 501155033.0, "step": 3833 }, { "epoch": 1.529928172386273, "grad_norm": 0.2587386965751648, "learning_rate": 2.861792363934938e-05, "loss": 0.1633, "num_tokens": 501286105.0, "step": 3834 }, { "epoch": 1.5303272146847566, "grad_norm": 0.2690245509147644, "learning_rate": 2.8608038550898857e-05, "loss": 0.1866, "num_tokens": 501417177.0, "step": 3835 }, { "epoch": 1.5307262569832403, "grad_norm": 0.25807443261146545, "learning_rate": 2.859815324805321e-05, "loss": 0.1783, "num_tokens": 501548249.0, "step": 3836 }, { "epoch": 1.531125299281724, "grad_norm": 0.2974543273448944, "learning_rate": 2.8588267732725145e-05, "loss": 0.2256, "num_tokens": 501679321.0, "step": 3837 }, { "epoch": 1.5315243415802073, "grad_norm": 0.3442535400390625, "learning_rate": 2.8578382006827416e-05, "loss": 0.2426, "num_tokens": 501810393.0, "step": 3838 }, { "epoch": 1.531923383878691, "grad_norm": 0.26536792516708374, "learning_rate": 2.856849607227282e-05, "loss": 0.1731, "num_tokens": 501941465.0, "step": 3839 }, { "epoch": 1.5323224261771746, "grad_norm": 0.2962387502193451, "learning_rate": 2.855860993097419e-05, "loss": 0.1872, "num_tokens": 502072537.0, "step": 3840 }, { "epoch": 1.5327214684756583, "grad_norm": 0.26772964000701904, "learning_rate": 2.8548723584844418e-05, "loss": 0.1751, "num_tokens": 502203609.0, "step": 3841 }, { "epoch": 1.533120510774142, "grad_norm": 0.30726808309555054, "learning_rate": 2.85388370357964e-05, "loss": 0.2336, "num_tokens": 502334681.0, "step": 3842 }, { "epoch": 1.5335195530726256, "grad_norm": 0.33293411135673523, "learning_rate": 2.85289502857431e-05, "loss": 0.2504, "num_tokens": 502465753.0, "step": 3843 }, { "epoch": 1.5339185953711092, "grad_norm": 0.30645081400871277, "learning_rate": 2.851906333659752e-05, "loss": 0.2215, "num_tokens": 502596825.0, "step": 3844 }, { "epoch": 1.5343176376695928, "grad_norm": 0.2677578926086426, "learning_rate": 2.850917619027268e-05, "loss": 0.1894, "num_tokens": 502727897.0, "step": 3845 }, { "epoch": 1.5347166799680765, "grad_norm": 0.27350881695747375, "learning_rate": 2.8499288848681643e-05, "loss": 0.206, "num_tokens": 502858969.0, "step": 3846 }, { "epoch": 1.5351157222665601, "grad_norm": 0.27273377776145935, "learning_rate": 2.848940131373754e-05, "loss": 0.2075, "num_tokens": 502990041.0, "step": 3847 }, { "epoch": 1.5355147645650438, "grad_norm": 0.26666054129600525, "learning_rate": 2.8479513587353506e-05, "loss": 0.2097, "num_tokens": 503121113.0, "step": 3848 }, { "epoch": 1.5359138068635274, "grad_norm": 0.2609703242778778, "learning_rate": 2.8469625671442724e-05, "loss": 0.185, "num_tokens": 503252185.0, "step": 3849 }, { "epoch": 1.536312849162011, "grad_norm": 0.2789168953895569, "learning_rate": 2.845973756791841e-05, "loss": 0.2031, "num_tokens": 503374439.0, "step": 3850 }, { "epoch": 1.5367118914604947, "grad_norm": 0.2749777138233185, "learning_rate": 2.844984927869382e-05, "loss": 0.2025, "num_tokens": 503505407.0, "step": 3851 }, { "epoch": 1.5371109337589783, "grad_norm": 0.265414834022522, "learning_rate": 2.843996080568225e-05, "loss": 0.1919, "num_tokens": 503636479.0, "step": 3852 }, { "epoch": 1.537509976057462, "grad_norm": 0.33723825216293335, "learning_rate": 2.843007215079704e-05, "loss": 0.1877, "num_tokens": 503767551.0, "step": 3853 }, { "epoch": 1.5379090183559456, "grad_norm": 0.3218732178211212, "learning_rate": 2.8420183315951522e-05, "loss": 0.2026, "num_tokens": 503898623.0, "step": 3854 }, { "epoch": 1.5383080606544293, "grad_norm": 0.2760921120643616, "learning_rate": 2.841029430305912e-05, "loss": 0.2124, "num_tokens": 504029695.0, "step": 3855 }, { "epoch": 1.538707102952913, "grad_norm": 0.2839585840702057, "learning_rate": 2.840040511403325e-05, "loss": 0.1859, "num_tokens": 504160767.0, "step": 3856 }, { "epoch": 1.5391061452513966, "grad_norm": 0.28340765833854675, "learning_rate": 2.839051575078738e-05, "loss": 0.2343, "num_tokens": 504291839.0, "step": 3857 }, { "epoch": 1.5395051875498802, "grad_norm": 0.3007073998451233, "learning_rate": 2.8380626215235028e-05, "loss": 0.2287, "num_tokens": 504422911.0, "step": 3858 }, { "epoch": 1.5399042298483638, "grad_norm": 0.26651495695114136, "learning_rate": 2.8370736509289704e-05, "loss": 0.1809, "num_tokens": 504553983.0, "step": 3859 }, { "epoch": 1.5403032721468475, "grad_norm": 0.285408616065979, "learning_rate": 2.8360846634864984e-05, "loss": 0.2099, "num_tokens": 504685055.0, "step": 3860 }, { "epoch": 1.5407023144453311, "grad_norm": 0.27089834213256836, "learning_rate": 2.835095659387446e-05, "loss": 0.1905, "num_tokens": 504816127.0, "step": 3861 }, { "epoch": 1.5411013567438148, "grad_norm": 0.27571648359298706, "learning_rate": 2.8341066388231774e-05, "loss": 0.2109, "num_tokens": 504947199.0, "step": 3862 }, { "epoch": 1.5415003990422984, "grad_norm": 0.2649659216403961, "learning_rate": 2.833117601985058e-05, "loss": 0.1796, "num_tokens": 505078271.0, "step": 3863 }, { "epoch": 1.541899441340782, "grad_norm": 1.6517648696899414, "learning_rate": 2.832128549064457e-05, "loss": 0.2402, "num_tokens": 505209343.0, "step": 3864 }, { "epoch": 1.5422984836392657, "grad_norm": 0.26485082507133484, "learning_rate": 2.8311394802527473e-05, "loss": 0.1753, "num_tokens": 505340415.0, "step": 3865 }, { "epoch": 1.5426975259377493, "grad_norm": 0.2940175533294678, "learning_rate": 2.830150395741306e-05, "loss": 0.2222, "num_tokens": 505471487.0, "step": 3866 }, { "epoch": 1.543096568236233, "grad_norm": 0.25892406702041626, "learning_rate": 2.8291612957215093e-05, "loss": 0.1691, "num_tokens": 505602559.0, "step": 3867 }, { "epoch": 1.5434956105347166, "grad_norm": 0.2816174030303955, "learning_rate": 2.8281721803847394e-05, "loss": 0.1962, "num_tokens": 505733631.0, "step": 3868 }, { "epoch": 1.5438946528332003, "grad_norm": 0.27282044291496277, "learning_rate": 2.8271830499223812e-05, "loss": 0.1928, "num_tokens": 505864703.0, "step": 3869 }, { "epoch": 1.544293695131684, "grad_norm": 0.2664358913898468, "learning_rate": 2.8261939045258235e-05, "loss": 0.21, "num_tokens": 505995775.0, "step": 3870 }, { "epoch": 1.5446927374301676, "grad_norm": 0.26940757036209106, "learning_rate": 2.8252047443864548e-05, "loss": 0.1819, "num_tokens": 506126847.0, "step": 3871 }, { "epoch": 1.5450917797286512, "grad_norm": 0.3294673562049866, "learning_rate": 2.824215569695669e-05, "loss": 0.2574, "num_tokens": 506257919.0, "step": 3872 }, { "epoch": 1.5454908220271348, "grad_norm": 0.2651679217815399, "learning_rate": 2.8232263806448623e-05, "loss": 0.171, "num_tokens": 506388991.0, "step": 3873 }, { "epoch": 1.5458898643256185, "grad_norm": 0.27355918288230896, "learning_rate": 2.8222371774254335e-05, "loss": 0.1888, "num_tokens": 506520063.0, "step": 3874 }, { "epoch": 1.5462889066241021, "grad_norm": 0.27515900135040283, "learning_rate": 2.821247960228785e-05, "loss": 0.1972, "num_tokens": 506651135.0, "step": 3875 }, { "epoch": 1.5466879489225858, "grad_norm": 0.26130610704421997, "learning_rate": 2.8202587292463195e-05, "loss": 0.1712, "num_tokens": 506782207.0, "step": 3876 }, { "epoch": 1.5470869912210694, "grad_norm": 0.2644554376602173, "learning_rate": 2.819269484669445e-05, "loss": 0.1841, "num_tokens": 506913279.0, "step": 3877 }, { "epoch": 1.547486033519553, "grad_norm": 0.2680239677429199, "learning_rate": 2.8182802266895714e-05, "loss": 0.1636, "num_tokens": 507036929.0, "step": 3878 }, { "epoch": 1.5478850758180367, "grad_norm": 0.2723112404346466, "learning_rate": 2.8172909554981104e-05, "loss": 0.1582, "num_tokens": 507168001.0, "step": 3879 }, { "epoch": 1.5482841181165203, "grad_norm": 0.35253387689590454, "learning_rate": 2.816301671286476e-05, "loss": 0.2306, "num_tokens": 507299073.0, "step": 3880 }, { "epoch": 1.548683160415004, "grad_norm": 0.31584659218788147, "learning_rate": 2.8153123742460868e-05, "loss": 0.2334, "num_tokens": 507430145.0, "step": 3881 }, { "epoch": 1.5490822027134876, "grad_norm": 0.28825509548187256, "learning_rate": 2.8143230645683614e-05, "loss": 0.1976, "num_tokens": 507561217.0, "step": 3882 }, { "epoch": 1.5494812450119713, "grad_norm": 0.2780022919178009, "learning_rate": 2.8133337424447243e-05, "loss": 0.2059, "num_tokens": 507692289.0, "step": 3883 }, { "epoch": 1.549880287310455, "grad_norm": 0.27826815843582153, "learning_rate": 2.8123444080665973e-05, "loss": 0.2214, "num_tokens": 507823361.0, "step": 3884 }, { "epoch": 1.5502793296089385, "grad_norm": 0.30537641048431396, "learning_rate": 2.8113550616254075e-05, "loss": 0.2342, "num_tokens": 507940638.0, "step": 3885 }, { "epoch": 1.5506783719074222, "grad_norm": 0.2783980667591095, "learning_rate": 2.810365703312587e-05, "loss": 0.2208, "num_tokens": 508058881.0, "step": 3886 }, { "epoch": 1.5510774142059058, "grad_norm": 0.2665088474750519, "learning_rate": 2.809376333319565e-05, "loss": 0.1687, "num_tokens": 508189953.0, "step": 3887 }, { "epoch": 1.5514764565043895, "grad_norm": 0.3100331723690033, "learning_rate": 2.808386951837776e-05, "loss": 0.2252, "num_tokens": 508321025.0, "step": 3888 }, { "epoch": 1.5518754988028731, "grad_norm": 0.27574804425239563, "learning_rate": 2.807397559058656e-05, "loss": 0.1975, "num_tokens": 508452097.0, "step": 3889 }, { "epoch": 1.5522745411013568, "grad_norm": 0.29496392607688904, "learning_rate": 2.8064081551736427e-05, "loss": 0.2096, "num_tokens": 508577489.0, "step": 3890 }, { "epoch": 1.5526735833998404, "grad_norm": 0.2863272428512573, "learning_rate": 2.805418740374178e-05, "loss": 0.2061, "num_tokens": 508708561.0, "step": 3891 }, { "epoch": 1.553072625698324, "grad_norm": 0.24450404942035675, "learning_rate": 2.8044293148517035e-05, "loss": 0.1618, "num_tokens": 508839633.0, "step": 3892 }, { "epoch": 1.5534716679968077, "grad_norm": 0.28413018584251404, "learning_rate": 2.803439878797664e-05, "loss": 0.1779, "num_tokens": 508970705.0, "step": 3893 }, { "epoch": 1.5538707102952913, "grad_norm": 0.29625454545021057, "learning_rate": 2.8024504324035066e-05, "loss": 0.22, "num_tokens": 509101777.0, "step": 3894 }, { "epoch": 1.554269752593775, "grad_norm": 0.2780323326587677, "learning_rate": 2.8014609758606793e-05, "loss": 0.1392, "num_tokens": 509232849.0, "step": 3895 }, { "epoch": 1.5546687948922586, "grad_norm": 0.27381372451782227, "learning_rate": 2.8004715093606317e-05, "loss": 0.1951, "num_tokens": 509363921.0, "step": 3896 }, { "epoch": 1.5550678371907423, "grad_norm": 0.2838456332683563, "learning_rate": 2.7994820330948186e-05, "loss": 0.1868, "num_tokens": 509494993.0, "step": 3897 }, { "epoch": 1.555466879489226, "grad_norm": 0.3055649995803833, "learning_rate": 2.798492547254693e-05, "loss": 0.2201, "num_tokens": 509626065.0, "step": 3898 }, { "epoch": 1.5558659217877095, "grad_norm": 0.3347962200641632, "learning_rate": 2.7975030520317118e-05, "loss": 0.2412, "num_tokens": 509757137.0, "step": 3899 }, { "epoch": 1.5562649640861932, "grad_norm": 0.2690644860267639, "learning_rate": 2.796513547617333e-05, "loss": 0.1795, "num_tokens": 509888209.0, "step": 3900 }, { "epoch": 1.5566640063846768, "grad_norm": 0.2750261127948761, "learning_rate": 2.7955240342030155e-05, "loss": 0.1914, "num_tokens": 510019281.0, "step": 3901 }, { "epoch": 1.5570630486831605, "grad_norm": 0.2743460536003113, "learning_rate": 2.7945345119802217e-05, "loss": 0.2024, "num_tokens": 510150353.0, "step": 3902 }, { "epoch": 1.5574620909816441, "grad_norm": 0.2792988419532776, "learning_rate": 2.7935449811404164e-05, "loss": 0.2072, "num_tokens": 510281425.0, "step": 3903 }, { "epoch": 1.5578611332801278, "grad_norm": 0.2967151999473572, "learning_rate": 2.7925554418750616e-05, "loss": 0.2005, "num_tokens": 510412497.0, "step": 3904 }, { "epoch": 1.5582601755786114, "grad_norm": 0.2864798605442047, "learning_rate": 2.7915658943756273e-05, "loss": 0.2036, "num_tokens": 510543569.0, "step": 3905 }, { "epoch": 1.558659217877095, "grad_norm": 0.2849509119987488, "learning_rate": 2.790576338833579e-05, "loss": 0.2269, "num_tokens": 510674641.0, "step": 3906 }, { "epoch": 1.5590582601755787, "grad_norm": 0.2540939450263977, "learning_rate": 2.7895867754403865e-05, "loss": 0.1499, "num_tokens": 510805713.0, "step": 3907 }, { "epoch": 1.5594573024740623, "grad_norm": 0.28982383012771606, "learning_rate": 2.788597204387523e-05, "loss": 0.1793, "num_tokens": 510923175.0, "step": 3908 }, { "epoch": 1.559856344772546, "grad_norm": 0.25644052028656006, "learning_rate": 2.78760762586646e-05, "loss": 0.1874, "num_tokens": 511054247.0, "step": 3909 }, { "epoch": 1.5602553870710296, "grad_norm": 0.31652331352233887, "learning_rate": 2.7866180400686716e-05, "loss": 0.2268, "num_tokens": 511185319.0, "step": 3910 }, { "epoch": 1.5606544293695133, "grad_norm": 0.31643742322921753, "learning_rate": 2.7856284471856348e-05, "loss": 0.1826, "num_tokens": 511316391.0, "step": 3911 }, { "epoch": 1.561053471667997, "grad_norm": 0.29983997344970703, "learning_rate": 2.7846388474088242e-05, "loss": 0.1939, "num_tokens": 511447463.0, "step": 3912 }, { "epoch": 1.5614525139664805, "grad_norm": 0.28074541687965393, "learning_rate": 2.7836492409297204e-05, "loss": 0.1848, "num_tokens": 511578535.0, "step": 3913 }, { "epoch": 1.5618515562649642, "grad_norm": 0.31587305665016174, "learning_rate": 2.782659627939802e-05, "loss": 0.2221, "num_tokens": 511709607.0, "step": 3914 }, { "epoch": 1.5622505985634478, "grad_norm": 0.3422064483165741, "learning_rate": 2.7816700086305502e-05, "loss": 0.2351, "num_tokens": 511840679.0, "step": 3915 }, { "epoch": 1.5626496408619315, "grad_norm": 0.2754622995853424, "learning_rate": 2.780680383193447e-05, "loss": 0.1984, "num_tokens": 511971751.0, "step": 3916 }, { "epoch": 1.5630486831604151, "grad_norm": 0.2838393747806549, "learning_rate": 2.7796907518199755e-05, "loss": 0.1745, "num_tokens": 512102823.0, "step": 3917 }, { "epoch": 1.5634477254588988, "grad_norm": 0.3040533661842346, "learning_rate": 2.7787011147016202e-05, "loss": 0.2233, "num_tokens": 512233895.0, "step": 3918 }, { "epoch": 1.5638467677573824, "grad_norm": 0.27028170228004456, "learning_rate": 2.7777114720298665e-05, "loss": 0.1889, "num_tokens": 512364967.0, "step": 3919 }, { "epoch": 1.564245810055866, "grad_norm": 0.28310975432395935, "learning_rate": 2.7767218239962018e-05, "loss": 0.2098, "num_tokens": 512496039.0, "step": 3920 }, { "epoch": 1.5646448523543497, "grad_norm": 0.27882248163223267, "learning_rate": 2.7757321707921125e-05, "loss": 0.2016, "num_tokens": 512627111.0, "step": 3921 }, { "epoch": 1.5650438946528333, "grad_norm": 0.2696886956691742, "learning_rate": 2.7747425126090893e-05, "loss": 0.2048, "num_tokens": 512758183.0, "step": 3922 }, { "epoch": 1.565442936951317, "grad_norm": 0.24850812554359436, "learning_rate": 2.7737528496386196e-05, "loss": 0.1618, "num_tokens": 512889255.0, "step": 3923 }, { "epoch": 1.5658419792498006, "grad_norm": 0.26635295152664185, "learning_rate": 2.7727631820721946e-05, "loss": 0.1757, "num_tokens": 513020327.0, "step": 3924 }, { "epoch": 1.5662410215482843, "grad_norm": 0.2743992805480957, "learning_rate": 2.771773510101307e-05, "loss": 0.1712, "num_tokens": 513151399.0, "step": 3925 }, { "epoch": 1.566640063846768, "grad_norm": 0.34458810091018677, "learning_rate": 2.770783833917448e-05, "loss": 0.2091, "num_tokens": 513282471.0, "step": 3926 }, { "epoch": 1.5670391061452515, "grad_norm": 0.2829618752002716, "learning_rate": 2.7697941537121102e-05, "loss": 0.1695, "num_tokens": 513413543.0, "step": 3927 }, { "epoch": 1.5674381484437352, "grad_norm": 0.32701072096824646, "learning_rate": 2.768804469676789e-05, "loss": 0.251, "num_tokens": 513544615.0, "step": 3928 }, { "epoch": 1.5678371907422188, "grad_norm": 0.279048353433609, "learning_rate": 2.7678147820029774e-05, "loss": 0.1977, "num_tokens": 513675687.0, "step": 3929 }, { "epoch": 1.5682362330407025, "grad_norm": 0.28772833943367004, "learning_rate": 2.7668250908821725e-05, "loss": 0.2094, "num_tokens": 513806759.0, "step": 3930 }, { "epoch": 1.5686352753391861, "grad_norm": 0.3264067471027374, "learning_rate": 2.7658353965058685e-05, "loss": 0.2458, "num_tokens": 513937831.0, "step": 3931 }, { "epoch": 1.5690343176376695, "grad_norm": 0.2845422923564911, "learning_rate": 2.764845699065563e-05, "loss": 0.1889, "num_tokens": 514068903.0, "step": 3932 }, { "epoch": 1.5694333599361532, "grad_norm": 0.30000266432762146, "learning_rate": 2.763855998752754e-05, "loss": 0.2381, "num_tokens": 514199975.0, "step": 3933 }, { "epoch": 1.5698324022346368, "grad_norm": 0.2546398341655731, "learning_rate": 2.762866295758938e-05, "loss": 0.1628, "num_tokens": 514331047.0, "step": 3934 }, { "epoch": 1.5702314445331205, "grad_norm": 0.2799123227596283, "learning_rate": 2.7618765902756126e-05, "loss": 0.1721, "num_tokens": 514462119.0, "step": 3935 }, { "epoch": 1.5706304868316041, "grad_norm": 0.2832193076610565, "learning_rate": 2.7608868824942784e-05, "loss": 0.2182, "num_tokens": 514593191.0, "step": 3936 }, { "epoch": 1.5710295291300878, "grad_norm": 0.25268328189849854, "learning_rate": 2.7598971726064342e-05, "loss": 0.1615, "num_tokens": 514724263.0, "step": 3937 }, { "epoch": 1.5714285714285714, "grad_norm": 0.2655186951160431, "learning_rate": 2.7589074608035793e-05, "loss": 0.1902, "num_tokens": 514855335.0, "step": 3938 }, { "epoch": 1.571827613727055, "grad_norm": 0.2639869451522827, "learning_rate": 2.7579177472772143e-05, "loss": 0.1826, "num_tokens": 514986407.0, "step": 3939 }, { "epoch": 1.5722266560255387, "grad_norm": 0.32119840383529663, "learning_rate": 2.7569280322188378e-05, "loss": 0.2451, "num_tokens": 515117479.0, "step": 3940 }, { "epoch": 1.5726256983240223, "grad_norm": 0.2835025489330292, "learning_rate": 2.755938315819952e-05, "loss": 0.2054, "num_tokens": 515248551.0, "step": 3941 }, { "epoch": 1.573024740622506, "grad_norm": 0.2837352156639099, "learning_rate": 2.7549485982720585e-05, "loss": 0.1821, "num_tokens": 515379623.0, "step": 3942 }, { "epoch": 1.5734237829209896, "grad_norm": 0.2716274559497833, "learning_rate": 2.753958879766656e-05, "loss": 0.191, "num_tokens": 515510695.0, "step": 3943 }, { "epoch": 1.5738228252194733, "grad_norm": 0.28407010436058044, "learning_rate": 2.752969160495248e-05, "loss": 0.2197, "num_tokens": 515641767.0, "step": 3944 }, { "epoch": 1.574221867517957, "grad_norm": 0.25699731707572937, "learning_rate": 2.7519794406493355e-05, "loss": 0.1542, "num_tokens": 515772839.0, "step": 3945 }, { "epoch": 1.5746209098164405, "grad_norm": 0.2743999660015106, "learning_rate": 2.750989720420418e-05, "loss": 0.2071, "num_tokens": 515903911.0, "step": 3946 }, { "epoch": 1.5750199521149242, "grad_norm": 0.28185078501701355, "learning_rate": 2.7500000000000004e-05, "loss": 0.166, "num_tokens": 516034983.0, "step": 3947 }, { "epoch": 1.5754189944134078, "grad_norm": 0.2649089992046356, "learning_rate": 2.7490102795795824e-05, "loss": 0.1781, "num_tokens": 516166055.0, "step": 3948 }, { "epoch": 1.5758180367118915, "grad_norm": 0.2657147943973541, "learning_rate": 2.7480205593506654e-05, "loss": 0.1679, "num_tokens": 516297127.0, "step": 3949 }, { "epoch": 1.576217079010375, "grad_norm": 0.25667059421539307, "learning_rate": 2.7470308395047522e-05, "loss": 0.1743, "num_tokens": 516428199.0, "step": 3950 }, { "epoch": 1.5766161213088588, "grad_norm": 0.2800499498844147, "learning_rate": 2.7460411202333442e-05, "loss": 0.1745, "num_tokens": 516559271.0, "step": 3951 }, { "epoch": 1.5770151636073424, "grad_norm": 0.2958112359046936, "learning_rate": 2.745051401727942e-05, "loss": 0.1735, "num_tokens": 516690343.0, "step": 3952 }, { "epoch": 1.577414205905826, "grad_norm": 0.2677721679210663, "learning_rate": 2.7440616841800486e-05, "loss": 0.1721, "num_tokens": 516821415.0, "step": 3953 }, { "epoch": 1.5778132482043097, "grad_norm": 0.30667349696159363, "learning_rate": 2.7430719677811628e-05, "loss": 0.1842, "num_tokens": 516952487.0, "step": 3954 }, { "epoch": 1.5782122905027933, "grad_norm": 0.29839015007019043, "learning_rate": 2.742082252722787e-05, "loss": 0.2068, "num_tokens": 517083559.0, "step": 3955 }, { "epoch": 1.578611332801277, "grad_norm": 0.2801090478897095, "learning_rate": 2.7410925391964216e-05, "loss": 0.1843, "num_tokens": 517214631.0, "step": 3956 }, { "epoch": 1.5790103750997606, "grad_norm": 0.2918819785118103, "learning_rate": 2.740102827393566e-05, "loss": 0.1975, "num_tokens": 517345703.0, "step": 3957 }, { "epoch": 1.5794094173982443, "grad_norm": 0.3046629726886749, "learning_rate": 2.7391131175057215e-05, "loss": 0.2063, "num_tokens": 517476775.0, "step": 3958 }, { "epoch": 1.579808459696728, "grad_norm": 0.2629203498363495, "learning_rate": 2.738123409724388e-05, "loss": 0.1764, "num_tokens": 517607847.0, "step": 3959 }, { "epoch": 1.5802075019952115, "grad_norm": 0.2830955982208252, "learning_rate": 2.7371337042410626e-05, "loss": 0.1973, "num_tokens": 517738919.0, "step": 3960 }, { "epoch": 1.5806065442936952, "grad_norm": 0.2569398283958435, "learning_rate": 2.7361440012472467e-05, "loss": 0.167, "num_tokens": 517869991.0, "step": 3961 }, { "epoch": 1.5810055865921788, "grad_norm": 0.2694927155971527, "learning_rate": 2.735154300934437e-05, "loss": 0.1908, "num_tokens": 518001063.0, "step": 3962 }, { "epoch": 1.5814046288906622, "grad_norm": 0.30389270186424255, "learning_rate": 2.7341646034941314e-05, "loss": 0.2132, "num_tokens": 518132135.0, "step": 3963 }, { "epoch": 1.5818036711891459, "grad_norm": 0.34676438570022583, "learning_rate": 2.7331749091178294e-05, "loss": 0.2293, "num_tokens": 518263207.0, "step": 3964 }, { "epoch": 1.5822027134876295, "grad_norm": 0.28244322538375854, "learning_rate": 2.732185217997023e-05, "loss": 0.2098, "num_tokens": 518394279.0, "step": 3965 }, { "epoch": 1.5826017557861132, "grad_norm": 0.31839925050735474, "learning_rate": 2.7311955303232122e-05, "loss": 0.2185, "num_tokens": 518525351.0, "step": 3966 }, { "epoch": 1.5830007980845968, "grad_norm": 0.2694072723388672, "learning_rate": 2.7302058462878904e-05, "loss": 0.1911, "num_tokens": 518656423.0, "step": 3967 }, { "epoch": 1.5833998403830805, "grad_norm": 0.2356661856174469, "learning_rate": 2.7292161660825526e-05, "loss": 0.1343, "num_tokens": 518787495.0, "step": 3968 }, { "epoch": 1.583798882681564, "grad_norm": 0.2867469787597656, "learning_rate": 2.7282264898986936e-05, "loss": 0.1665, "num_tokens": 518918567.0, "step": 3969 }, { "epoch": 1.5841979249800477, "grad_norm": 0.2908707857131958, "learning_rate": 2.7272368179278056e-05, "loss": 0.2291, "num_tokens": 519049639.0, "step": 3970 }, { "epoch": 1.5845969672785314, "grad_norm": 0.2728838622570038, "learning_rate": 2.7262471503613806e-05, "loss": 0.2011, "num_tokens": 519180711.0, "step": 3971 }, { "epoch": 1.584996009577015, "grad_norm": 0.2744534909725189, "learning_rate": 2.7252574873909116e-05, "loss": 0.1749, "num_tokens": 519311783.0, "step": 3972 }, { "epoch": 1.5853950518754987, "grad_norm": 0.29258689284324646, "learning_rate": 2.7242678292078877e-05, "loss": 0.2165, "num_tokens": 519427316.0, "step": 3973 }, { "epoch": 1.5857940941739823, "grad_norm": 0.2586306631565094, "learning_rate": 2.7232781760037984e-05, "loss": 0.1578, "num_tokens": 519558388.0, "step": 3974 }, { "epoch": 1.586193136472466, "grad_norm": 0.28726786375045776, "learning_rate": 2.7222885279701337e-05, "loss": 0.1964, "num_tokens": 519689460.0, "step": 3975 }, { "epoch": 1.5865921787709496, "grad_norm": 0.2722485065460205, "learning_rate": 2.721298885298381e-05, "loss": 0.2037, "num_tokens": 519820532.0, "step": 3976 }, { "epoch": 1.5869912210694332, "grad_norm": 0.2777278423309326, "learning_rate": 2.7203092481800257e-05, "loss": 0.2003, "num_tokens": 519951604.0, "step": 3977 }, { "epoch": 1.5873902633679169, "grad_norm": 0.2740626335144043, "learning_rate": 2.7193196168065545e-05, "loss": 0.1964, "num_tokens": 520082676.0, "step": 3978 }, { "epoch": 1.5877893056664005, "grad_norm": 0.2400423139333725, "learning_rate": 2.7183299913694503e-05, "loss": 0.1381, "num_tokens": 520213748.0, "step": 3979 }, { "epoch": 1.5881883479648842, "grad_norm": 0.2972320318222046, "learning_rate": 2.7173403720601985e-05, "loss": 0.2204, "num_tokens": 520344820.0, "step": 3980 }, { "epoch": 1.5885873902633678, "grad_norm": 0.2548244297504425, "learning_rate": 2.71635075907028e-05, "loss": 0.1638, "num_tokens": 520474155.0, "step": 3981 }, { "epoch": 1.5889864325618515, "grad_norm": 0.26077231764793396, "learning_rate": 2.7153611525911766e-05, "loss": 0.1687, "num_tokens": 520605227.0, "step": 3982 }, { "epoch": 1.589385474860335, "grad_norm": 0.28131911158561707, "learning_rate": 2.714371552814366e-05, "loss": 0.1784, "num_tokens": 520726223.0, "step": 3983 }, { "epoch": 1.5897845171588187, "grad_norm": 0.41059964895248413, "learning_rate": 2.7133819599313286e-05, "loss": 0.2695, "num_tokens": 520842750.0, "step": 3984 }, { "epoch": 1.5901835594573024, "grad_norm": 0.30501917004585266, "learning_rate": 2.712392374133541e-05, "loss": 0.2087, "num_tokens": 520973822.0, "step": 3985 }, { "epoch": 1.590582601755786, "grad_norm": 0.28161635994911194, "learning_rate": 2.7114027956124772e-05, "loss": 0.2133, "num_tokens": 521104894.0, "step": 3986 }, { "epoch": 1.5909816440542697, "grad_norm": 0.28188735246658325, "learning_rate": 2.7104132245596144e-05, "loss": 0.1882, "num_tokens": 521235966.0, "step": 3987 }, { "epoch": 1.5913806863527533, "grad_norm": 0.30421605706214905, "learning_rate": 2.7094236611664227e-05, "loss": 0.2054, "num_tokens": 521367038.0, "step": 3988 }, { "epoch": 1.591779728651237, "grad_norm": 0.28407254815101624, "learning_rate": 2.708434105624374e-05, "loss": 0.1937, "num_tokens": 521498110.0, "step": 3989 }, { "epoch": 1.5921787709497206, "grad_norm": 0.25941240787506104, "learning_rate": 2.707444558124939e-05, "loss": 0.1566, "num_tokens": 521624769.0, "step": 3990 }, { "epoch": 1.5925778132482042, "grad_norm": 0.33079561591148376, "learning_rate": 2.7064550188595845e-05, "loss": 0.2234, "num_tokens": 521755841.0, "step": 3991 }, { "epoch": 1.5929768555466879, "grad_norm": 0.2716871500015259, "learning_rate": 2.705465488019778e-05, "loss": 0.1673, "num_tokens": 521886913.0, "step": 3992 }, { "epoch": 1.5933758978451715, "grad_norm": 0.3234659731388092, "learning_rate": 2.7044759657969854e-05, "loss": 0.1959, "num_tokens": 522016575.0, "step": 3993 }, { "epoch": 1.5937749401436552, "grad_norm": 0.2623682916164398, "learning_rate": 2.703486452382668e-05, "loss": 0.1706, "num_tokens": 522147647.0, "step": 3994 }, { "epoch": 1.5941739824421388, "grad_norm": 0.272506982088089, "learning_rate": 2.7024969479682888e-05, "loss": 0.1976, "num_tokens": 522278719.0, "step": 3995 }, { "epoch": 1.5945730247406225, "grad_norm": 0.27181464433670044, "learning_rate": 2.7015074527453078e-05, "loss": 0.1603, "num_tokens": 522409791.0, "step": 3996 }, { "epoch": 1.594972067039106, "grad_norm": 0.260633260011673, "learning_rate": 2.7005179669051816e-05, "loss": 0.1541, "num_tokens": 522540863.0, "step": 3997 }, { "epoch": 1.5953711093375897, "grad_norm": 0.3113543391227722, "learning_rate": 2.69952849063937e-05, "loss": 0.2121, "num_tokens": 522671935.0, "step": 3998 }, { "epoch": 1.5957701516360734, "grad_norm": 0.2849595844745636, "learning_rate": 2.6985390241393223e-05, "loss": 0.1894, "num_tokens": 522803007.0, "step": 3999 }, { "epoch": 1.596169193934557, "grad_norm": 0.24443769454956055, "learning_rate": 2.6975495675964946e-05, "loss": 0.1408, "num_tokens": 522934079.0, "step": 4000 }, { "epoch": 1.5965682362330407, "grad_norm": 0.29691559076309204, "learning_rate": 2.6965601212023373e-05, "loss": 0.2126, "num_tokens": 523065151.0, "step": 4001 }, { "epoch": 1.5969672785315243, "grad_norm": 0.3301299810409546, "learning_rate": 2.695570685148297e-05, "loss": 0.2108, "num_tokens": 523196223.0, "step": 4002 }, { "epoch": 1.597366320830008, "grad_norm": 0.33713725209236145, "learning_rate": 2.6945812596258223e-05, "loss": 0.2148, "num_tokens": 523327295.0, "step": 4003 }, { "epoch": 1.5977653631284916, "grad_norm": 0.39263850450515747, "learning_rate": 2.6935918448263575e-05, "loss": 0.2572, "num_tokens": 523458367.0, "step": 4004 }, { "epoch": 1.5981644054269752, "grad_norm": 0.32304906845092773, "learning_rate": 2.6926024409413448e-05, "loss": 0.2552, "num_tokens": 523589439.0, "step": 4005 }, { "epoch": 1.5985634477254589, "grad_norm": 0.2575148940086365, "learning_rate": 2.691613048162225e-05, "loss": 0.1751, "num_tokens": 523720511.0, "step": 4006 }, { "epoch": 1.5989624900239425, "grad_norm": 0.29043182730674744, "learning_rate": 2.690623666680436e-05, "loss": 0.211, "num_tokens": 523851583.0, "step": 4007 }, { "epoch": 1.5993615323224262, "grad_norm": 0.26500725746154785, "learning_rate": 2.6896342966874133e-05, "loss": 0.1755, "num_tokens": 523982655.0, "step": 4008 }, { "epoch": 1.5997605746209098, "grad_norm": 0.27658790349960327, "learning_rate": 2.688644938374592e-05, "loss": 0.1958, "num_tokens": 524113727.0, "step": 4009 }, { "epoch": 1.6001596169193935, "grad_norm": 0.27614137530326843, "learning_rate": 2.687655591933404e-05, "loss": 0.2096, "num_tokens": 524244799.0, "step": 4010 }, { "epoch": 1.600558659217877, "grad_norm": 0.26409316062927246, "learning_rate": 2.686666257555277e-05, "loss": 0.1833, "num_tokens": 524375871.0, "step": 4011 }, { "epoch": 1.6009577015163607, "grad_norm": 0.2572740912437439, "learning_rate": 2.685676935431639e-05, "loss": 0.1754, "num_tokens": 524506943.0, "step": 4012 }, { "epoch": 1.6013567438148444, "grad_norm": 0.2743318974971771, "learning_rate": 2.6846876257539137e-05, "loss": 0.2168, "num_tokens": 524638015.0, "step": 4013 }, { "epoch": 1.601755786113328, "grad_norm": 0.2933703064918518, "learning_rate": 2.683698328713525e-05, "loss": 0.2006, "num_tokens": 524769087.0, "step": 4014 }, { "epoch": 1.6021548284118117, "grad_norm": 0.2793980538845062, "learning_rate": 2.682709044501891e-05, "loss": 0.2019, "num_tokens": 524900159.0, "step": 4015 }, { "epoch": 1.6025538707102953, "grad_norm": 0.26972007751464844, "learning_rate": 2.6817197733104295e-05, "loss": 0.168, "num_tokens": 525031231.0, "step": 4016 }, { "epoch": 1.602952913008779, "grad_norm": 0.2748717665672302, "learning_rate": 2.680730515330555e-05, "loss": 0.1778, "num_tokens": 525162303.0, "step": 4017 }, { "epoch": 1.6033519553072626, "grad_norm": 0.28213974833488464, "learning_rate": 2.679741270753681e-05, "loss": 0.1942, "num_tokens": 525293375.0, "step": 4018 }, { "epoch": 1.6037509976057462, "grad_norm": 0.25648820400238037, "learning_rate": 2.6787520397712156e-05, "loss": 0.1463, "num_tokens": 525424447.0, "step": 4019 }, { "epoch": 1.6041500399042299, "grad_norm": 0.3492911458015442, "learning_rate": 2.6777628225745667e-05, "loss": 0.2005, "num_tokens": 525555519.0, "step": 4020 }, { "epoch": 1.6045490822027135, "grad_norm": 0.33867397904396057, "learning_rate": 2.6767736193551385e-05, "loss": 0.205, "num_tokens": 525686591.0, "step": 4021 }, { "epoch": 1.6049481245011972, "grad_norm": 0.3056991994380951, "learning_rate": 2.6757844303043323e-05, "loss": 0.1997, "num_tokens": 525817663.0, "step": 4022 }, { "epoch": 1.6053471667996808, "grad_norm": 0.3125373423099518, "learning_rate": 2.6747952556135464e-05, "loss": 0.2327, "num_tokens": 525948735.0, "step": 4023 }, { "epoch": 1.6057462090981645, "grad_norm": 0.2812942564487457, "learning_rate": 2.6738060954741773e-05, "loss": 0.2092, "num_tokens": 526079807.0, "step": 4024 }, { "epoch": 1.606145251396648, "grad_norm": 0.33859091997146606, "learning_rate": 2.672816950077619e-05, "loss": 0.2645, "num_tokens": 526210879.0, "step": 4025 }, { "epoch": 1.6065442936951317, "grad_norm": 0.27659091353416443, "learning_rate": 2.671827819615261e-05, "loss": 0.2128, "num_tokens": 526341951.0, "step": 4026 }, { "epoch": 1.6069433359936154, "grad_norm": 0.27430981397628784, "learning_rate": 2.6708387042784916e-05, "loss": 0.2117, "num_tokens": 526473023.0, "step": 4027 }, { "epoch": 1.607342378292099, "grad_norm": 0.28160950541496277, "learning_rate": 2.6698496042586947e-05, "loss": 0.1771, "num_tokens": 526604095.0, "step": 4028 }, { "epoch": 1.6077414205905827, "grad_norm": 0.28420013189315796, "learning_rate": 2.6688605197472526e-05, "loss": 0.1627, "num_tokens": 526735167.0, "step": 4029 }, { "epoch": 1.6081404628890663, "grad_norm": 0.2801090478897095, "learning_rate": 2.667871450935543e-05, "loss": 0.2078, "num_tokens": 526866239.0, "step": 4030 }, { "epoch": 1.60853950518755, "grad_norm": 0.24750599265098572, "learning_rate": 2.666882398014942e-05, "loss": 0.158, "num_tokens": 526997311.0, "step": 4031 }, { "epoch": 1.6089385474860336, "grad_norm": 0.2694101333618164, "learning_rate": 2.665893361176824e-05, "loss": 0.1775, "num_tokens": 527128383.0, "step": 4032 }, { "epoch": 1.6093375897845172, "grad_norm": 0.30313795804977417, "learning_rate": 2.6649043406125545e-05, "loss": 0.1993, "num_tokens": 527259455.0, "step": 4033 }, { "epoch": 1.6097366320830009, "grad_norm": 0.33013755083084106, "learning_rate": 2.6639153365135028e-05, "loss": 0.2269, "num_tokens": 527390527.0, "step": 4034 }, { "epoch": 1.6101356743814845, "grad_norm": 0.29080474376678467, "learning_rate": 2.6629263490710305e-05, "loss": 0.1615, "num_tokens": 527521599.0, "step": 4035 }, { "epoch": 1.6105347166799682, "grad_norm": 0.32044854760169983, "learning_rate": 2.6619373784764978e-05, "loss": 0.2155, "num_tokens": 527652671.0, "step": 4036 }, { "epoch": 1.6109337589784518, "grad_norm": 0.28613683581352234, "learning_rate": 2.6609484249212617e-05, "loss": 0.1835, "num_tokens": 527783743.0, "step": 4037 }, { "epoch": 1.6113328012769355, "grad_norm": 0.24518175423145294, "learning_rate": 2.659959488596675e-05, "loss": 0.1455, "num_tokens": 527914815.0, "step": 4038 }, { "epoch": 1.611731843575419, "grad_norm": 0.3069591224193573, "learning_rate": 2.6589705696940885e-05, "loss": 0.1796, "num_tokens": 528040493.0, "step": 4039 }, { "epoch": 1.6121308858739027, "grad_norm": 0.31441885232925415, "learning_rate": 2.657981668404848e-05, "loss": 0.2094, "num_tokens": 528171565.0, "step": 4040 }, { "epoch": 1.6125299281723864, "grad_norm": 0.2783651649951935, "learning_rate": 2.6569927849202963e-05, "loss": 0.1974, "num_tokens": 528302637.0, "step": 4041 }, { "epoch": 1.61292897047087, "grad_norm": 0.27671924233436584, "learning_rate": 2.6560039194317744e-05, "loss": 0.1578, "num_tokens": 528433709.0, "step": 4042 }, { "epoch": 1.6133280127693537, "grad_norm": 0.2616335153579712, "learning_rate": 2.6550150721306182e-05, "loss": 0.1735, "num_tokens": 528564781.0, "step": 4043 }, { "epoch": 1.6137270550678373, "grad_norm": 0.24027691781520844, "learning_rate": 2.6540262432081603e-05, "loss": 0.1563, "num_tokens": 528695853.0, "step": 4044 }, { "epoch": 1.614126097366321, "grad_norm": 0.28910312056541443, "learning_rate": 2.653037432855729e-05, "loss": 0.2019, "num_tokens": 528826925.0, "step": 4045 }, { "epoch": 1.6145251396648046, "grad_norm": 0.26745983958244324, "learning_rate": 2.65204864126465e-05, "loss": 0.1838, "num_tokens": 528957997.0, "step": 4046 }, { "epoch": 1.6149241819632882, "grad_norm": 0.27155447006225586, "learning_rate": 2.6510598686262466e-05, "loss": 0.1861, "num_tokens": 529089069.0, "step": 4047 }, { "epoch": 1.6153232242617719, "grad_norm": 0.2816094160079956, "learning_rate": 2.6500711151318362e-05, "loss": 0.2082, "num_tokens": 529220141.0, "step": 4048 }, { "epoch": 1.6157222665602555, "grad_norm": 0.2613070607185364, "learning_rate": 2.6490823809727326e-05, "loss": 0.1764, "num_tokens": 529351213.0, "step": 4049 }, { "epoch": 1.6161213088587392, "grad_norm": 0.25685256719589233, "learning_rate": 2.648093666340249e-05, "loss": 0.1816, "num_tokens": 529482285.0, "step": 4050 }, { "epoch": 1.6165203511572228, "grad_norm": 0.26850461959838867, "learning_rate": 2.6471049714256903e-05, "loss": 0.2082, "num_tokens": 529613357.0, "step": 4051 }, { "epoch": 1.6169193934557065, "grad_norm": 0.3085417151451111, "learning_rate": 2.6461162964203607e-05, "loss": 0.238, "num_tokens": 529744429.0, "step": 4052 }, { "epoch": 1.61731843575419, "grad_norm": 0.28152498602867126, "learning_rate": 2.6451276415155584e-05, "loss": 0.1976, "num_tokens": 529875501.0, "step": 4053 }, { "epoch": 1.6177174780526737, "grad_norm": 0.258532851934433, "learning_rate": 2.6441390069025806e-05, "loss": 0.1739, "num_tokens": 530006573.0, "step": 4054 }, { "epoch": 1.6181165203511574, "grad_norm": 0.2694863975048065, "learning_rate": 2.6431503927727192e-05, "loss": 0.1889, "num_tokens": 530137645.0, "step": 4055 }, { "epoch": 1.618515562649641, "grad_norm": 0.27224451303482056, "learning_rate": 2.64216179931726e-05, "loss": 0.1671, "num_tokens": 530268717.0, "step": 4056 }, { "epoch": 1.6189146049481244, "grad_norm": 0.2949518859386444, "learning_rate": 2.6411732267274864e-05, "loss": 0.1787, "num_tokens": 530399789.0, "step": 4057 }, { "epoch": 1.619313647246608, "grad_norm": 0.2710557281970978, "learning_rate": 2.6401846751946795e-05, "loss": 0.1862, "num_tokens": 530530861.0, "step": 4058 }, { "epoch": 1.6197126895450917, "grad_norm": 0.28886812925338745, "learning_rate": 2.639196144910115e-05, "loss": 0.1887, "num_tokens": 530661933.0, "step": 4059 }, { "epoch": 1.6201117318435754, "grad_norm": 0.28294721245765686, "learning_rate": 2.6382076360650627e-05, "loss": 0.1741, "num_tokens": 530793005.0, "step": 4060 }, { "epoch": 1.620510774142059, "grad_norm": 0.27861887216567993, "learning_rate": 2.637219148850791e-05, "loss": 0.1862, "num_tokens": 530924077.0, "step": 4061 }, { "epoch": 1.6209098164405427, "grad_norm": 0.2830356955528259, "learning_rate": 2.6362306834585625e-05, "loss": 0.1905, "num_tokens": 531055149.0, "step": 4062 }, { "epoch": 1.6213088587390263, "grad_norm": 0.2706197500228882, "learning_rate": 2.6352422400796362e-05, "loss": 0.2061, "num_tokens": 531186221.0, "step": 4063 }, { "epoch": 1.62170790103751, "grad_norm": 0.27941933274269104, "learning_rate": 2.634253818905267e-05, "loss": 0.1868, "num_tokens": 531317293.0, "step": 4064 }, { "epoch": 1.6221069433359936, "grad_norm": 0.2921290397644043, "learning_rate": 2.6332654201267043e-05, "loss": 0.1424, "num_tokens": 531447985.0, "step": 4065 }, { "epoch": 1.6225059856344772, "grad_norm": 0.2824172079563141, "learning_rate": 2.6322770439351957e-05, "loss": 0.2121, "num_tokens": 531579057.0, "step": 4066 }, { "epoch": 1.6229050279329609, "grad_norm": 0.28025734424591064, "learning_rate": 2.6312886905219813e-05, "loss": 0.2048, "num_tokens": 531710129.0, "step": 4067 }, { "epoch": 1.6233040702314445, "grad_norm": 0.27519187331199646, "learning_rate": 2.6303003600782984e-05, "loss": 0.1871, "num_tokens": 531841201.0, "step": 4068 }, { "epoch": 1.6237031125299282, "grad_norm": 0.2633807063102722, "learning_rate": 2.6293120527953795e-05, "loss": 0.1502, "num_tokens": 531972273.0, "step": 4069 }, { "epoch": 1.6241021548284118, "grad_norm": 0.33897876739501953, "learning_rate": 2.628323768864454e-05, "loss": 0.2326, "num_tokens": 532103345.0, "step": 4070 }, { "epoch": 1.6245011971268954, "grad_norm": 0.2682015597820282, "learning_rate": 2.6273355084767447e-05, "loss": 0.1733, "num_tokens": 532234417.0, "step": 4071 }, { "epoch": 1.624900239425379, "grad_norm": 0.28848811984062195, "learning_rate": 2.626347271823471e-05, "loss": 0.197, "num_tokens": 532365489.0, "step": 4072 }, { "epoch": 1.6252992817238627, "grad_norm": 0.2396237999200821, "learning_rate": 2.6253590590958477e-05, "loss": 0.158, "num_tokens": 532496561.0, "step": 4073 }, { "epoch": 1.6256983240223464, "grad_norm": 0.27285417914390564, "learning_rate": 2.624370870485085e-05, "loss": 0.1812, "num_tokens": 532627633.0, "step": 4074 }, { "epoch": 1.62609736632083, "grad_norm": 0.2651785910129547, "learning_rate": 2.6233827061823868e-05, "loss": 0.1684, "num_tokens": 532758705.0, "step": 4075 }, { "epoch": 1.6264964086193137, "grad_norm": 0.26783403754234314, "learning_rate": 2.622394566378955e-05, "loss": 0.1665, "num_tokens": 532889777.0, "step": 4076 }, { "epoch": 1.6268954509177973, "grad_norm": 0.2626391053199768, "learning_rate": 2.621406451265986e-05, "loss": 0.1736, "num_tokens": 533020849.0, "step": 4077 }, { "epoch": 1.627294493216281, "grad_norm": 0.24615713953971863, "learning_rate": 2.6204183610346696e-05, "loss": 0.1629, "num_tokens": 533151921.0, "step": 4078 }, { "epoch": 1.6276935355147646, "grad_norm": 0.32197368144989014, "learning_rate": 2.619430295876192e-05, "loss": 0.2139, "num_tokens": 533282993.0, "step": 4079 }, { "epoch": 1.6280925778132482, "grad_norm": 0.3712855875492096, "learning_rate": 2.6184422559817346e-05, "loss": 0.2598, "num_tokens": 533414065.0, "step": 4080 }, { "epoch": 1.6284916201117319, "grad_norm": 0.2853964567184448, "learning_rate": 2.6174542415424747e-05, "loss": 0.1916, "num_tokens": 533545137.0, "step": 4081 }, { "epoch": 1.6288906624102155, "grad_norm": 0.28164130449295044, "learning_rate": 2.6164662527495832e-05, "loss": 0.1856, "num_tokens": 533676209.0, "step": 4082 }, { "epoch": 1.6292897047086992, "grad_norm": 0.26333168148994446, "learning_rate": 2.615478289794227e-05, "loss": 0.1397, "num_tokens": 533807281.0, "step": 4083 }, { "epoch": 1.6296887470071828, "grad_norm": 0.2690143883228302, "learning_rate": 2.614490352867568e-05, "loss": 0.1821, "num_tokens": 533938353.0, "step": 4084 }, { "epoch": 1.6300877893056664, "grad_norm": 0.2562699317932129, "learning_rate": 2.613502442160763e-05, "loss": 0.1735, "num_tokens": 534069425.0, "step": 4085 }, { "epoch": 1.63048683160415, "grad_norm": 0.27799999713897705, "learning_rate": 2.6125145578649617e-05, "loss": 0.1574, "num_tokens": 534200497.0, "step": 4086 }, { "epoch": 1.6308858739026337, "grad_norm": 0.22686244547367096, "learning_rate": 2.6115267001713128e-05, "loss": 0.12, "num_tokens": 534331569.0, "step": 4087 }, { "epoch": 1.6312849162011172, "grad_norm": 0.29233983159065247, "learning_rate": 2.6105388692709565e-05, "loss": 0.1935, "num_tokens": 534462641.0, "step": 4088 }, { "epoch": 1.6316839584996008, "grad_norm": 0.3860621750354767, "learning_rate": 2.60955106535503e-05, "loss": 0.2278, "num_tokens": 534593713.0, "step": 4089 }, { "epoch": 1.6320830007980844, "grad_norm": 0.3166235387325287, "learning_rate": 2.6085632886146622e-05, "loss": 0.1711, "num_tokens": 534724785.0, "step": 4090 }, { "epoch": 1.632482043096568, "grad_norm": 0.3542315661907196, "learning_rate": 2.607575539240979e-05, "loss": 0.2221, "num_tokens": 534855857.0, "step": 4091 }, { "epoch": 1.6328810853950517, "grad_norm": 0.24898357689380646, "learning_rate": 2.606587817425102e-05, "loss": 0.1389, "num_tokens": 534986929.0, "step": 4092 }, { "epoch": 1.6332801276935354, "grad_norm": 0.30847859382629395, "learning_rate": 2.605600123358145e-05, "loss": 0.2047, "num_tokens": 535118001.0, "step": 4093 }, { "epoch": 1.633679169992019, "grad_norm": 0.27827492356300354, "learning_rate": 2.6046124572312173e-05, "loss": 0.1972, "num_tokens": 535249073.0, "step": 4094 }, { "epoch": 1.6340782122905027, "grad_norm": 0.28493911027908325, "learning_rate": 2.603624819235425e-05, "loss": 0.1778, "num_tokens": 535380145.0, "step": 4095 }, { "epoch": 1.6344772545889863, "grad_norm": 0.292887419462204, "learning_rate": 2.6026372095618655e-05, "loss": 0.199, "num_tokens": 535511217.0, "step": 4096 }, { "epoch": 1.63487629688747, "grad_norm": 0.2559294104576111, "learning_rate": 2.6016496284016317e-05, "loss": 0.1665, "num_tokens": 535642289.0, "step": 4097 }, { "epoch": 1.6352753391859536, "grad_norm": 0.2775655686855316, "learning_rate": 2.6006620759458127e-05, "loss": 0.1937, "num_tokens": 535765677.0, "step": 4098 }, { "epoch": 1.6356743814844372, "grad_norm": 0.2793176770210266, "learning_rate": 2.59967455238549e-05, "loss": 0.1898, "num_tokens": 535893721.0, "step": 4099 }, { "epoch": 1.6360734237829209, "grad_norm": 0.25680264830589294, "learning_rate": 2.59868705791174e-05, "loss": 0.1541, "num_tokens": 536024793.0, "step": 4100 }, { "epoch": 1.6364724660814045, "grad_norm": 0.27137577533721924, "learning_rate": 2.597699592715634e-05, "loss": 0.1891, "num_tokens": 536155865.0, "step": 4101 }, { "epoch": 1.6368715083798882, "grad_norm": 0.2809312343597412, "learning_rate": 2.596712156988236e-05, "loss": 0.1645, "num_tokens": 536286937.0, "step": 4102 }, { "epoch": 1.6372705506783718, "grad_norm": 0.2785973846912384, "learning_rate": 2.5957247509206072e-05, "loss": 0.1837, "num_tokens": 536418009.0, "step": 4103 }, { "epoch": 1.6376695929768554, "grad_norm": 0.3011190891265869, "learning_rate": 2.5947373747038012e-05, "loss": 0.1873, "num_tokens": 536549081.0, "step": 4104 }, { "epoch": 1.638068635275339, "grad_norm": 0.3178425431251526, "learning_rate": 2.5937500285288652e-05, "loss": 0.2036, "num_tokens": 536680153.0, "step": 4105 }, { "epoch": 1.6384676775738227, "grad_norm": 0.25014862418174744, "learning_rate": 2.5927627125868425e-05, "loss": 0.1705, "num_tokens": 536811225.0, "step": 4106 }, { "epoch": 1.6388667198723064, "grad_norm": 0.24155506491661072, "learning_rate": 2.5917754270687693e-05, "loss": 0.1348, "num_tokens": 536942297.0, "step": 4107 }, { "epoch": 1.63926576217079, "grad_norm": 0.2689487338066101, "learning_rate": 2.5907881721656758e-05, "loss": 0.16, "num_tokens": 537073369.0, "step": 4108 }, { "epoch": 1.6396648044692737, "grad_norm": 0.3371633291244507, "learning_rate": 2.589800948068587e-05, "loss": 0.247, "num_tokens": 537204441.0, "step": 4109 }, { "epoch": 1.6400638467677573, "grad_norm": 0.3020670413970947, "learning_rate": 2.5888137549685214e-05, "loss": 0.2084, "num_tokens": 537335513.0, "step": 4110 }, { "epoch": 1.640462889066241, "grad_norm": 0.3054909110069275, "learning_rate": 2.5878265930564928e-05, "loss": 0.1925, "num_tokens": 537466585.0, "step": 4111 }, { "epoch": 1.6408619313647246, "grad_norm": 0.3116736114025116, "learning_rate": 2.5868394625235054e-05, "loss": 0.2216, "num_tokens": 537597657.0, "step": 4112 }, { "epoch": 1.6412609736632082, "grad_norm": 0.27213579416275024, "learning_rate": 2.585852363560561e-05, "loss": 0.1897, "num_tokens": 537728729.0, "step": 4113 }, { "epoch": 1.6416600159616919, "grad_norm": 0.26627689599990845, "learning_rate": 2.5848652963586538e-05, "loss": 0.1697, "num_tokens": 537859801.0, "step": 4114 }, { "epoch": 1.6420590582601755, "grad_norm": 0.2838033139705658, "learning_rate": 2.583878261108773e-05, "loss": 0.1737, "num_tokens": 537990873.0, "step": 4115 }, { "epoch": 1.6424581005586592, "grad_norm": 0.2553898096084595, "learning_rate": 2.5828912580019e-05, "loss": 0.1917, "num_tokens": 538121945.0, "step": 4116 }, { "epoch": 1.6428571428571428, "grad_norm": 0.2738194763660431, "learning_rate": 2.5819042872290107e-05, "loss": 0.1809, "num_tokens": 538253017.0, "step": 4117 }, { "epoch": 1.6432561851556264, "grad_norm": 0.27148231863975525, "learning_rate": 2.580917348981075e-05, "loss": 0.1772, "num_tokens": 538384089.0, "step": 4118 }, { "epoch": 1.64365522745411, "grad_norm": 0.31687480211257935, "learning_rate": 2.5799304434490558e-05, "loss": 0.2444, "num_tokens": 538515161.0, "step": 4119 }, { "epoch": 1.6440542697525937, "grad_norm": 0.28228363394737244, "learning_rate": 2.5789435708239108e-05, "loss": 0.2033, "num_tokens": 538646233.0, "step": 4120 }, { "epoch": 1.6444533120510774, "grad_norm": 0.3129139840602875, "learning_rate": 2.5779567312965903e-05, "loss": 0.2273, "num_tokens": 538777305.0, "step": 4121 }, { "epoch": 1.644852354349561, "grad_norm": 0.31471478939056396, "learning_rate": 2.576969925058038e-05, "loss": 0.2087, "num_tokens": 538908377.0, "step": 4122 }, { "epoch": 1.6452513966480447, "grad_norm": 0.27197110652923584, "learning_rate": 2.5759831522991933e-05, "loss": 0.1647, "num_tokens": 539039449.0, "step": 4123 }, { "epoch": 1.6456504389465283, "grad_norm": 0.2578393518924713, "learning_rate": 2.5749964132109856e-05, "loss": 0.1646, "num_tokens": 539170521.0, "step": 4124 }, { "epoch": 1.646049481245012, "grad_norm": 0.2586592435836792, "learning_rate": 2.5740097079843405e-05, "loss": 0.1593, "num_tokens": 539301593.0, "step": 4125 }, { "epoch": 1.6464485235434956, "grad_norm": 0.28617721796035767, "learning_rate": 2.5730230368101764e-05, "loss": 0.1784, "num_tokens": 539432665.0, "step": 4126 }, { "epoch": 1.6468475658419792, "grad_norm": 0.28233274817466736, "learning_rate": 2.572036399879405e-05, "loss": 0.1715, "num_tokens": 539563737.0, "step": 4127 }, { "epoch": 1.6472466081404629, "grad_norm": 0.2865526080131531, "learning_rate": 2.5710497973829316e-05, "loss": 0.1928, "num_tokens": 539694809.0, "step": 4128 }, { "epoch": 1.6476456504389465, "grad_norm": 0.26442259550094604, "learning_rate": 2.5700632295116538e-05, "loss": 0.1737, "num_tokens": 539825881.0, "step": 4129 }, { "epoch": 1.6480446927374302, "grad_norm": 0.27147555351257324, "learning_rate": 2.5690766964564645e-05, "loss": 0.2129, "num_tokens": 539956953.0, "step": 4130 }, { "epoch": 1.6484437350359138, "grad_norm": 0.28681430220603943, "learning_rate": 2.5680901984082468e-05, "loss": 0.2076, "num_tokens": 540088025.0, "step": 4131 }, { "epoch": 1.6488427773343974, "grad_norm": 0.2783071994781494, "learning_rate": 2.5671037355578803e-05, "loss": 0.1858, "num_tokens": 540219097.0, "step": 4132 }, { "epoch": 1.649241819632881, "grad_norm": 0.24816793203353882, "learning_rate": 2.566117308096237e-05, "loss": 0.1601, "num_tokens": 540350169.0, "step": 4133 }, { "epoch": 1.6496408619313647, "grad_norm": 0.29156509041786194, "learning_rate": 2.5651309162141802e-05, "loss": 0.2217, "num_tokens": 540481241.0, "step": 4134 }, { "epoch": 1.6500399042298484, "grad_norm": 0.2688640356063843, "learning_rate": 2.564144560102567e-05, "loss": 0.1717, "num_tokens": 540612313.0, "step": 4135 }, { "epoch": 1.650438946528332, "grad_norm": 0.2439279705286026, "learning_rate": 2.563158239952249e-05, "loss": 0.1576, "num_tokens": 540743385.0, "step": 4136 }, { "epoch": 1.6508379888268156, "grad_norm": 0.3093673586845398, "learning_rate": 2.5621719559540696e-05, "loss": 0.159, "num_tokens": 540874457.0, "step": 4137 }, { "epoch": 1.6512370311252993, "grad_norm": 0.3126278221607208, "learning_rate": 2.5611857082988667e-05, "loss": 0.1951, "num_tokens": 541005529.0, "step": 4138 }, { "epoch": 1.651636073423783, "grad_norm": 0.24784965813159943, "learning_rate": 2.5601994971774683e-05, "loss": 0.1542, "num_tokens": 541136601.0, "step": 4139 }, { "epoch": 1.6520351157222666, "grad_norm": 0.30249136686325073, "learning_rate": 2.5592133227806986e-05, "loss": 0.2037, "num_tokens": 541267673.0, "step": 4140 }, { "epoch": 1.6524341580207502, "grad_norm": 0.2850802540779114, "learning_rate": 2.5582271852993723e-05, "loss": 0.1966, "num_tokens": 541398745.0, "step": 4141 }, { "epoch": 1.6528332003192339, "grad_norm": 0.43383949995040894, "learning_rate": 2.557241084924297e-05, "loss": 0.2674, "num_tokens": 541529817.0, "step": 4142 }, { "epoch": 1.6532322426177175, "grad_norm": 0.3023286461830139, "learning_rate": 2.556255021846276e-05, "loss": 0.2307, "num_tokens": 541660889.0, "step": 4143 }, { "epoch": 1.6536312849162011, "grad_norm": 0.25499165058135986, "learning_rate": 2.555268996256102e-05, "loss": 0.1687, "num_tokens": 541791961.0, "step": 4144 }, { "epoch": 1.6540303272146848, "grad_norm": 0.2693023979663849, "learning_rate": 2.5542830083445624e-05, "loss": 0.1617, "num_tokens": 541923033.0, "step": 4145 }, { "epoch": 1.6544293695131684, "grad_norm": 0.269458532333374, "learning_rate": 2.5532970583024357e-05, "loss": 0.2228, "num_tokens": 542054105.0, "step": 4146 }, { "epoch": 1.654828411811652, "grad_norm": 0.25763005018234253, "learning_rate": 2.552311146320494e-05, "loss": 0.1692, "num_tokens": 542185177.0, "step": 4147 }, { "epoch": 1.6552274541101357, "grad_norm": 0.2762984037399292, "learning_rate": 2.5513252725895036e-05, "loss": 0.2072, "num_tokens": 542316249.0, "step": 4148 }, { "epoch": 1.6556264964086194, "grad_norm": 0.26149681210517883, "learning_rate": 2.5503394373002204e-05, "loss": 0.1657, "num_tokens": 542447321.0, "step": 4149 }, { "epoch": 1.656025538707103, "grad_norm": 0.24166521430015564, "learning_rate": 2.5493536406433948e-05, "loss": 0.1589, "num_tokens": 542578393.0, "step": 4150 }, { "epoch": 1.6564245810055866, "grad_norm": 0.2597380578517914, "learning_rate": 2.5483678828097696e-05, "loss": 0.1635, "num_tokens": 542709465.0, "step": 4151 }, { "epoch": 1.6568236233040703, "grad_norm": 0.3446083962917328, "learning_rate": 2.5473821639900795e-05, "loss": 0.2298, "num_tokens": 542840537.0, "step": 4152 }, { "epoch": 1.657222665602554, "grad_norm": 0.29408812522888184, "learning_rate": 2.5463964843750515e-05, "loss": 0.1752, "num_tokens": 542971609.0, "step": 4153 }, { "epoch": 1.6576217079010376, "grad_norm": 0.2825676202774048, "learning_rate": 2.5454108441554063e-05, "loss": 0.1922, "num_tokens": 543102681.0, "step": 4154 }, { "epoch": 1.6580207501995212, "grad_norm": 0.2901427149772644, "learning_rate": 2.5444252435218557e-05, "loss": 0.1867, "num_tokens": 543233753.0, "step": 4155 }, { "epoch": 1.6584197924980049, "grad_norm": 0.2719218134880066, "learning_rate": 2.543439682665103e-05, "loss": 0.1636, "num_tokens": 543364825.0, "step": 4156 }, { "epoch": 1.6588188347964885, "grad_norm": 0.3097497224807739, "learning_rate": 2.5424541617758485e-05, "loss": 0.2041, "num_tokens": 543495897.0, "step": 4157 }, { "epoch": 1.6592178770949721, "grad_norm": 0.26042115688323975, "learning_rate": 2.5414686810447776e-05, "loss": 0.1655, "num_tokens": 543626969.0, "step": 4158 }, { "epoch": 1.6596169193934558, "grad_norm": 0.2846719026565552, "learning_rate": 2.5404832406625724e-05, "loss": 0.1984, "num_tokens": 543758041.0, "step": 4159 }, { "epoch": 1.6600159616919394, "grad_norm": 0.3168075978755951, "learning_rate": 2.539497840819908e-05, "loss": 0.2205, "num_tokens": 543885101.0, "step": 4160 }, { "epoch": 1.660415003990423, "grad_norm": 0.2942720353603363, "learning_rate": 2.538512481707448e-05, "loss": 0.1984, "num_tokens": 544016173.0, "step": 4161 }, { "epoch": 1.6608140462889067, "grad_norm": 0.2803962826728821, "learning_rate": 2.537527163515852e-05, "loss": 0.188, "num_tokens": 544147245.0, "step": 4162 }, { "epoch": 1.6612130885873904, "grad_norm": 0.2914383113384247, "learning_rate": 2.5365418864357697e-05, "loss": 0.2061, "num_tokens": 544278317.0, "step": 4163 }, { "epoch": 1.661612130885874, "grad_norm": 0.2671652138233185, "learning_rate": 2.5355566506578417e-05, "loss": 0.1778, "num_tokens": 544409389.0, "step": 4164 }, { "epoch": 1.6620111731843576, "grad_norm": 0.2779952585697174, "learning_rate": 2.534571456372703e-05, "loss": 0.1823, "num_tokens": 544540461.0, "step": 4165 }, { "epoch": 1.6624102154828413, "grad_norm": 0.3175660967826843, "learning_rate": 2.5335863037709796e-05, "loss": 0.2233, "num_tokens": 544671533.0, "step": 4166 }, { "epoch": 1.662809257781325, "grad_norm": 0.2601615786552429, "learning_rate": 2.5326011930432886e-05, "loss": 0.177, "num_tokens": 544802605.0, "step": 4167 }, { "epoch": 1.6632083000798086, "grad_norm": 0.26676443219184875, "learning_rate": 2.5316161243802418e-05, "loss": 0.1981, "num_tokens": 544933677.0, "step": 4168 }, { "epoch": 1.6636073423782922, "grad_norm": 0.25276729464530945, "learning_rate": 2.5306310979724375e-05, "loss": 0.1584, "num_tokens": 545064749.0, "step": 4169 }, { "epoch": 1.6640063846767759, "grad_norm": 0.25891008973121643, "learning_rate": 2.529646114010471e-05, "loss": 0.1668, "num_tokens": 545195821.0, "step": 4170 }, { "epoch": 1.6644054269752595, "grad_norm": 0.26182493567466736, "learning_rate": 2.5286611726849275e-05, "loss": 0.1517, "num_tokens": 545326893.0, "step": 4171 }, { "epoch": 1.6648044692737431, "grad_norm": 0.2878994047641754, "learning_rate": 2.5276762741863837e-05, "loss": 0.1848, "num_tokens": 545457965.0, "step": 4172 }, { "epoch": 1.6652035115722268, "grad_norm": 0.2756292521953583, "learning_rate": 2.5266914187054086e-05, "loss": 0.1643, "num_tokens": 545589037.0, "step": 4173 }, { "epoch": 1.6656025538707104, "grad_norm": 0.3047356605529785, "learning_rate": 2.525706606432562e-05, "loss": 0.2017, "num_tokens": 545720109.0, "step": 4174 }, { "epoch": 1.666001596169194, "grad_norm": 0.27690592408180237, "learning_rate": 2.524721837558396e-05, "loss": 0.1745, "num_tokens": 545851181.0, "step": 4175 }, { "epoch": 1.6664006384676777, "grad_norm": 0.2657877504825592, "learning_rate": 2.523737112273455e-05, "loss": 0.1799, "num_tokens": 545982253.0, "step": 4176 }, { "epoch": 1.6667996807661614, "grad_norm": 0.2659766376018524, "learning_rate": 2.5227524307682737e-05, "loss": 0.1438, "num_tokens": 546113325.0, "step": 4177 }, { "epoch": 1.667198723064645, "grad_norm": 0.33044618368148804, "learning_rate": 2.5217677932333778e-05, "loss": 0.1924, "num_tokens": 546244397.0, "step": 4178 }, { "epoch": 1.6675977653631286, "grad_norm": 0.2579883337020874, "learning_rate": 2.520783199859288e-05, "loss": 0.1357, "num_tokens": 546375469.0, "step": 4179 }, { "epoch": 1.6679968076616123, "grad_norm": 0.27037695050239563, "learning_rate": 2.5197986508365112e-05, "loss": 0.1659, "num_tokens": 546506541.0, "step": 4180 }, { "epoch": 1.668395849960096, "grad_norm": 0.3259350061416626, "learning_rate": 2.5188141463555497e-05, "loss": 0.2165, "num_tokens": 546637613.0, "step": 4181 }, { "epoch": 1.6687948922585794, "grad_norm": 0.26360026001930237, "learning_rate": 2.5178296866068957e-05, "loss": 0.1544, "num_tokens": 546768685.0, "step": 4182 }, { "epoch": 1.669193934557063, "grad_norm": 0.3176211416721344, "learning_rate": 2.5168452717810326e-05, "loss": 0.1974, "num_tokens": 546899757.0, "step": 4183 }, { "epoch": 1.6695929768555466, "grad_norm": 0.2720339894294739, "learning_rate": 2.5158609020684366e-05, "loss": 0.184, "num_tokens": 547030829.0, "step": 4184 }, { "epoch": 1.6699920191540303, "grad_norm": 0.2645837068557739, "learning_rate": 2.514876577659573e-05, "loss": 0.1815, "num_tokens": 547161901.0, "step": 4185 }, { "epoch": 1.670391061452514, "grad_norm": 0.25379034876823425, "learning_rate": 2.5138922987448998e-05, "loss": 0.1627, "num_tokens": 547292973.0, "step": 4186 }, { "epoch": 1.6707901037509976, "grad_norm": 0.2625942528247833, "learning_rate": 2.5129080655148663e-05, "loss": 0.1525, "num_tokens": 547416065.0, "step": 4187 }, { "epoch": 1.6711891460494812, "grad_norm": 0.2820582389831543, "learning_rate": 2.511923878159912e-05, "loss": 0.1602, "num_tokens": 547547137.0, "step": 4188 }, { "epoch": 1.6715881883479649, "grad_norm": 0.2817007899284363, "learning_rate": 2.510939736870468e-05, "loss": 0.1607, "num_tokens": 547678209.0, "step": 4189 }, { "epoch": 1.6719872306464485, "grad_norm": 0.2942366600036621, "learning_rate": 2.509955641836956e-05, "loss": 0.1907, "num_tokens": 547809281.0, "step": 4190 }, { "epoch": 1.6723862729449321, "grad_norm": 0.2582591772079468, "learning_rate": 2.5089715932497896e-05, "loss": 0.1472, "num_tokens": 547940353.0, "step": 4191 }, { "epoch": 1.6727853152434158, "grad_norm": 0.3488966226577759, "learning_rate": 2.507987591299374e-05, "loss": 0.1735, "num_tokens": 548071425.0, "step": 4192 }, { "epoch": 1.6731843575418994, "grad_norm": 0.2890528440475464, "learning_rate": 2.5070036361761023e-05, "loss": 0.1615, "num_tokens": 548202497.0, "step": 4193 }, { "epoch": 1.673583399840383, "grad_norm": 0.26299455761909485, "learning_rate": 2.5060197280703618e-05, "loss": 0.1302, "num_tokens": 548333569.0, "step": 4194 }, { "epoch": 1.6739824421388667, "grad_norm": 0.3152711093425751, "learning_rate": 2.5050358671725305e-05, "loss": 0.2032, "num_tokens": 548464641.0, "step": 4195 }, { "epoch": 1.6743814844373504, "grad_norm": 0.2776562571525574, "learning_rate": 2.504052053672975e-05, "loss": 0.1637, "num_tokens": 548595713.0, "step": 4196 }, { "epoch": 1.674780526735834, "grad_norm": 0.2720348834991455, "learning_rate": 2.5030682877620542e-05, "loss": 0.1786, "num_tokens": 548726785.0, "step": 4197 }, { "epoch": 1.6751795690343176, "grad_norm": 0.2746421694755554, "learning_rate": 2.5020845696301177e-05, "loss": 0.1599, "num_tokens": 548857857.0, "step": 4198 }, { "epoch": 1.6755786113328013, "grad_norm": 0.29827776551246643, "learning_rate": 2.5011008994675063e-05, "loss": 0.2091, "num_tokens": 548988929.0, "step": 4199 }, { "epoch": 1.675977653631285, "grad_norm": 0.2585022747516632, "learning_rate": 2.500117277464551e-05, "loss": 0.156, "num_tokens": 549120001.0, "step": 4200 }, { "epoch": 1.6763766959297686, "grad_norm": 0.2568703591823578, "learning_rate": 2.499133703811572e-05, "loss": 0.1615, "num_tokens": 549251073.0, "step": 4201 }, { "epoch": 1.6767757382282522, "grad_norm": 0.24818459153175354, "learning_rate": 2.4981501786988842e-05, "loss": 0.1481, "num_tokens": 549382145.0, "step": 4202 }, { "epoch": 1.6771747805267359, "grad_norm": 0.2951441705226898, "learning_rate": 2.4971667023167882e-05, "loss": 0.1866, "num_tokens": 549513217.0, "step": 4203 }, { "epoch": 1.6775738228252195, "grad_norm": 0.2824135422706604, "learning_rate": 2.4961832748555785e-05, "loss": 0.1659, "num_tokens": 549644289.0, "step": 4204 }, { "epoch": 1.6779728651237031, "grad_norm": 0.27058374881744385, "learning_rate": 2.4951998965055396e-05, "loss": 0.1663, "num_tokens": 549775361.0, "step": 4205 }, { "epoch": 1.6783719074221868, "grad_norm": 0.2672697901725769, "learning_rate": 2.4942165674569444e-05, "loss": 0.1529, "num_tokens": 549906433.0, "step": 4206 }, { "epoch": 1.6787709497206704, "grad_norm": 0.27277684211730957, "learning_rate": 2.4932332879000597e-05, "loss": 0.1657, "num_tokens": 550037505.0, "step": 4207 }, { "epoch": 1.679169992019154, "grad_norm": 0.33556967973709106, "learning_rate": 2.4922500580251407e-05, "loss": 0.1872, "num_tokens": 550168577.0, "step": 4208 }, { "epoch": 1.6795690343176377, "grad_norm": 0.3175736665725708, "learning_rate": 2.491266878022432e-05, "loss": 0.1904, "num_tokens": 550299649.0, "step": 4209 }, { "epoch": 1.6799680766161214, "grad_norm": 0.26450422406196594, "learning_rate": 2.4902837480821704e-05, "loss": 0.1539, "num_tokens": 550430721.0, "step": 4210 }, { "epoch": 1.680367118914605, "grad_norm": 0.3444281816482544, "learning_rate": 2.4893006683945837e-05, "loss": 0.2343, "num_tokens": 550561793.0, "step": 4211 }, { "epoch": 1.6807661612130886, "grad_norm": 0.262057900428772, "learning_rate": 2.488317639149886e-05, "loss": 0.177, "num_tokens": 550692865.0, "step": 4212 }, { "epoch": 1.6811652035115723, "grad_norm": 0.2561437487602234, "learning_rate": 2.4873346605382874e-05, "loss": 0.1517, "num_tokens": 550823937.0, "step": 4213 }, { "epoch": 1.6815642458100557, "grad_norm": 0.2704043686389923, "learning_rate": 2.486351732749982e-05, "loss": 0.177, "num_tokens": 550955009.0, "step": 4214 }, { "epoch": 1.6819632881085393, "grad_norm": 0.28553062677383423, "learning_rate": 2.4853688559751592e-05, "loss": 0.2048, "num_tokens": 551086081.0, "step": 4215 }, { "epoch": 1.682362330407023, "grad_norm": 0.30038323998451233, "learning_rate": 2.4843860304039957e-05, "loss": 0.1913, "num_tokens": 551217153.0, "step": 4216 }, { "epoch": 1.6827613727055066, "grad_norm": 0.2668524980545044, "learning_rate": 2.483403256226659e-05, "loss": 0.17, "num_tokens": 551348225.0, "step": 4217 }, { "epoch": 1.6831604150039903, "grad_norm": 0.2649167478084564, "learning_rate": 2.482420533633308e-05, "loss": 0.1811, "num_tokens": 551479297.0, "step": 4218 }, { "epoch": 1.683559457302474, "grad_norm": 0.257382869720459, "learning_rate": 2.4814378628140888e-05, "loss": 0.186, "num_tokens": 551610369.0, "step": 4219 }, { "epoch": 1.6839584996009576, "grad_norm": 0.2652987241744995, "learning_rate": 2.480455243959139e-05, "loss": 0.1712, "num_tokens": 551741441.0, "step": 4220 }, { "epoch": 1.6843575418994412, "grad_norm": 0.25936976075172424, "learning_rate": 2.479472677258588e-05, "loss": 0.1626, "num_tokens": 551872513.0, "step": 4221 }, { "epoch": 1.6847565841979248, "grad_norm": 0.29460179805755615, "learning_rate": 2.4784901629025515e-05, "loss": 0.1759, "num_tokens": 552003585.0, "step": 4222 }, { "epoch": 1.6851556264964085, "grad_norm": 0.31670886278152466, "learning_rate": 2.477507701081137e-05, "loss": 0.1751, "num_tokens": 552134657.0, "step": 4223 }, { "epoch": 1.6855546687948921, "grad_norm": 0.3122711777687073, "learning_rate": 2.476525291984444e-05, "loss": 0.1981, "num_tokens": 552265729.0, "step": 4224 }, { "epoch": 1.6859537110933758, "grad_norm": 0.2604844868183136, "learning_rate": 2.4755429358025563e-05, "loss": 0.1466, "num_tokens": 552396801.0, "step": 4225 }, { "epoch": 1.6863527533918594, "grad_norm": 0.23552721738815308, "learning_rate": 2.474560632725552e-05, "loss": 0.1124, "num_tokens": 552527873.0, "step": 4226 }, { "epoch": 1.686751795690343, "grad_norm": 0.2990885078907013, "learning_rate": 2.4735783829434982e-05, "loss": 0.1942, "num_tokens": 552658945.0, "step": 4227 }, { "epoch": 1.6871508379888267, "grad_norm": 0.27241894602775574, "learning_rate": 2.4725961866464496e-05, "loss": 0.1587, "num_tokens": 552790017.0, "step": 4228 }, { "epoch": 1.6875498802873103, "grad_norm": 0.31700214743614197, "learning_rate": 2.471614044024454e-05, "loss": 0.2192, "num_tokens": 552921089.0, "step": 4229 }, { "epoch": 1.687948922585794, "grad_norm": 0.3359934687614441, "learning_rate": 2.4706319552675455e-05, "loss": 0.2134, "num_tokens": 553052161.0, "step": 4230 }, { "epoch": 1.6883479648842776, "grad_norm": 0.2723410129547119, "learning_rate": 2.4696499205657493e-05, "loss": 0.1875, "num_tokens": 553183233.0, "step": 4231 }, { "epoch": 1.6887470071827613, "grad_norm": 0.26356804370880127, "learning_rate": 2.46866794010908e-05, "loss": 0.1678, "num_tokens": 553314305.0, "step": 4232 }, { "epoch": 1.689146049481245, "grad_norm": 0.27211815118789673, "learning_rate": 2.4676860140875418e-05, "loss": 0.1933, "num_tokens": 553440529.0, "step": 4233 }, { "epoch": 1.6895450917797286, "grad_norm": 0.28025561571121216, "learning_rate": 2.4667041426911282e-05, "loss": 0.1718, "num_tokens": 553571601.0, "step": 4234 }, { "epoch": 1.6899441340782122, "grad_norm": 0.2744252681732178, "learning_rate": 2.4657223261098222e-05, "loss": 0.1745, "num_tokens": 553702673.0, "step": 4235 }, { "epoch": 1.6903431763766958, "grad_norm": 0.2633771002292633, "learning_rate": 2.4647405645335965e-05, "loss": 0.153, "num_tokens": 553833745.0, "step": 4236 }, { "epoch": 1.6907422186751795, "grad_norm": 0.2679210305213928, "learning_rate": 2.4637588581524117e-05, "loss": 0.1678, "num_tokens": 553964817.0, "step": 4237 }, { "epoch": 1.6911412609736631, "grad_norm": 0.3200596570968628, "learning_rate": 2.4627772071562204e-05, "loss": 0.2119, "num_tokens": 554095889.0, "step": 4238 }, { "epoch": 1.6915403032721468, "grad_norm": 0.2850991487503052, "learning_rate": 2.461795611734961e-05, "loss": 0.1701, "num_tokens": 554226961.0, "step": 4239 }, { "epoch": 1.6919393455706304, "grad_norm": 0.28065457940101624, "learning_rate": 2.460814072078565e-05, "loss": 0.1915, "num_tokens": 554358033.0, "step": 4240 }, { "epoch": 1.692338387869114, "grad_norm": 0.2703116834163666, "learning_rate": 2.4598325883769498e-05, "loss": 0.1459, "num_tokens": 554489105.0, "step": 4241 }, { "epoch": 1.6927374301675977, "grad_norm": 0.28830045461654663, "learning_rate": 2.458851160820023e-05, "loss": 0.1821, "num_tokens": 554620177.0, "step": 4242 }, { "epoch": 1.6931364724660813, "grad_norm": 0.24231773614883423, "learning_rate": 2.4578697895976836e-05, "loss": 0.131, "num_tokens": 554751249.0, "step": 4243 }, { "epoch": 1.693535514764565, "grad_norm": 0.2843542695045471, "learning_rate": 2.456888474899817e-05, "loss": 0.1682, "num_tokens": 554882321.0, "step": 4244 }, { "epoch": 1.6939345570630486, "grad_norm": 0.3186599612236023, "learning_rate": 2.4559072169162976e-05, "loss": 0.2119, "num_tokens": 555013393.0, "step": 4245 }, { "epoch": 1.6943335993615323, "grad_norm": 0.31230804324150085, "learning_rate": 2.4549260158369902e-05, "loss": 0.1936, "num_tokens": 555144465.0, "step": 4246 }, { "epoch": 1.694732641660016, "grad_norm": 0.278644323348999, "learning_rate": 2.4539448718517492e-05, "loss": 0.1658, "num_tokens": 555275537.0, "step": 4247 }, { "epoch": 1.6951316839584996, "grad_norm": 0.2604803144931793, "learning_rate": 2.452963785150415e-05, "loss": 0.1331, "num_tokens": 555406609.0, "step": 4248 }, { "epoch": 1.6955307262569832, "grad_norm": 0.2835054099559784, "learning_rate": 2.4519827559228186e-05, "loss": 0.1991, "num_tokens": 555537681.0, "step": 4249 }, { "epoch": 1.6959297685554668, "grad_norm": 0.24543990194797516, "learning_rate": 2.4510017843587818e-05, "loss": 0.1557, "num_tokens": 555668753.0, "step": 4250 }, { "epoch": 1.6963288108539505, "grad_norm": 0.2818555235862732, "learning_rate": 2.450020870648112e-05, "loss": 0.173, "num_tokens": 555799825.0, "step": 4251 }, { "epoch": 1.6967278531524341, "grad_norm": 0.2701456546783447, "learning_rate": 2.4490400149806076e-05, "loss": 0.1646, "num_tokens": 555930897.0, "step": 4252 }, { "epoch": 1.6971268954509178, "grad_norm": 0.26454073190689087, "learning_rate": 2.4480592175460543e-05, "loss": 0.162, "num_tokens": 556061969.0, "step": 4253 }, { "epoch": 1.6975259377494014, "grad_norm": 0.28504684567451477, "learning_rate": 2.4470784785342283e-05, "loss": 0.1712, "num_tokens": 556193041.0, "step": 4254 }, { "epoch": 1.697924980047885, "grad_norm": 0.24984797835350037, "learning_rate": 2.4460977981348926e-05, "loss": 0.1419, "num_tokens": 556324113.0, "step": 4255 }, { "epoch": 1.6983240223463687, "grad_norm": 0.28046247363090515, "learning_rate": 2.4451171765377993e-05, "loss": 0.1777, "num_tokens": 556455185.0, "step": 4256 }, { "epoch": 1.6987230646448523, "grad_norm": 0.43499574065208435, "learning_rate": 2.4441366139326914e-05, "loss": 0.2313, "num_tokens": 556586257.0, "step": 4257 }, { "epoch": 1.699122106943336, "grad_norm": 0.267236590385437, "learning_rate": 2.4431561105092982e-05, "loss": 0.1609, "num_tokens": 556717329.0, "step": 4258 }, { "epoch": 1.6995211492418196, "grad_norm": 0.28725284337997437, "learning_rate": 2.4421756664573363e-05, "loss": 0.1841, "num_tokens": 556848401.0, "step": 4259 }, { "epoch": 1.6999201915403033, "grad_norm": 0.2662345767021179, "learning_rate": 2.4411952819665135e-05, "loss": 0.1676, "num_tokens": 556979473.0, "step": 4260 }, { "epoch": 1.700319233838787, "grad_norm": 0.2807331383228302, "learning_rate": 2.440214957226525e-05, "loss": 0.1548, "num_tokens": 557110545.0, "step": 4261 }, { "epoch": 1.7007182761372706, "grad_norm": 0.26476630568504333, "learning_rate": 2.4392346924270544e-05, "loss": 0.1883, "num_tokens": 557241617.0, "step": 4262 }, { "epoch": 1.7011173184357542, "grad_norm": 0.2930554151535034, "learning_rate": 2.4382544877577734e-05, "loss": 0.182, "num_tokens": 557372689.0, "step": 4263 }, { "epoch": 1.7015163607342378, "grad_norm": 0.27152690291404724, "learning_rate": 2.4372743434083432e-05, "loss": 0.1799, "num_tokens": 557503761.0, "step": 4264 }, { "epoch": 1.7019154030327215, "grad_norm": 0.24374490976333618, "learning_rate": 2.4362942595684128e-05, "loss": 0.1477, "num_tokens": 557634833.0, "step": 4265 }, { "epoch": 1.7023144453312051, "grad_norm": 0.25860950350761414, "learning_rate": 2.435314236427619e-05, "loss": 0.164, "num_tokens": 557765905.0, "step": 4266 }, { "epoch": 1.7027134876296888, "grad_norm": 0.33721259236335754, "learning_rate": 2.4343342741755865e-05, "loss": 0.2115, "num_tokens": 557896977.0, "step": 4267 }, { "epoch": 1.7031125299281724, "grad_norm": 0.25290799140930176, "learning_rate": 2.4333543730019294e-05, "loss": 0.137, "num_tokens": 558028049.0, "step": 4268 }, { "epoch": 1.703511572226656, "grad_norm": 0.29262930154800415, "learning_rate": 2.4323745330962493e-05, "loss": 0.18, "num_tokens": 558159121.0, "step": 4269 }, { "epoch": 1.7039106145251397, "grad_norm": 0.28408920764923096, "learning_rate": 2.4313947546481368e-05, "loss": 0.1392, "num_tokens": 558290193.0, "step": 4270 }, { "epoch": 1.7043096568236233, "grad_norm": 0.326280802488327, "learning_rate": 2.4304150378471692e-05, "loss": 0.1899, "num_tokens": 558421265.0, "step": 4271 }, { "epoch": 1.704708699122107, "grad_norm": 0.316866397857666, "learning_rate": 2.429435382882912e-05, "loss": 0.193, "num_tokens": 558552337.0, "step": 4272 }, { "epoch": 1.7051077414205906, "grad_norm": 0.2621220350265503, "learning_rate": 2.42845578994492e-05, "loss": 0.1527, "num_tokens": 558683409.0, "step": 4273 }, { "epoch": 1.7055067837190743, "grad_norm": 0.27891474962234497, "learning_rate": 2.4274762592227357e-05, "loss": 0.1597, "num_tokens": 558814481.0, "step": 4274 }, { "epoch": 1.705905826017558, "grad_norm": 0.26278582215309143, "learning_rate": 2.426496790905889e-05, "loss": 0.1713, "num_tokens": 558945553.0, "step": 4275 }, { "epoch": 1.7063048683160416, "grad_norm": 0.2752237319946289, "learning_rate": 2.4255173851838965e-05, "loss": 0.1837, "num_tokens": 559076625.0, "step": 4276 }, { "epoch": 1.7067039106145252, "grad_norm": 0.2668383717536926, "learning_rate": 2.4245380422462655e-05, "loss": 0.1665, "num_tokens": 559207697.0, "step": 4277 }, { "epoch": 1.7071029529130088, "grad_norm": 0.25718259811401367, "learning_rate": 2.42355876228249e-05, "loss": 0.1864, "num_tokens": 559338769.0, "step": 4278 }, { "epoch": 1.7075019952114925, "grad_norm": 0.2684590220451355, "learning_rate": 2.4225795454820505e-05, "loss": 0.1655, "num_tokens": 559469841.0, "step": 4279 }, { "epoch": 1.7079010375099761, "grad_norm": 0.2557418644428253, "learning_rate": 2.4216003920344172e-05, "loss": 0.1695, "num_tokens": 559600913.0, "step": 4280 }, { "epoch": 1.7083000798084598, "grad_norm": 0.23578663170337677, "learning_rate": 2.420621302129047e-05, "loss": 0.1417, "num_tokens": 559731985.0, "step": 4281 }, { "epoch": 1.7086991221069434, "grad_norm": 0.27435365319252014, "learning_rate": 2.419642275955384e-05, "loss": 0.1629, "num_tokens": 559860160.0, "step": 4282 }, { "epoch": 1.709098164405427, "grad_norm": 0.24830815196037292, "learning_rate": 2.418663313702861e-05, "loss": 0.1462, "num_tokens": 559991232.0, "step": 4283 }, { "epoch": 1.7094972067039107, "grad_norm": 0.2597522735595703, "learning_rate": 2.4176844155608974e-05, "loss": 0.1558, "num_tokens": 560122304.0, "step": 4284 }, { "epoch": 1.7098962490023943, "grad_norm": 0.27987906336784363, "learning_rate": 2.416705581718902e-05, "loss": 0.1321, "num_tokens": 560253376.0, "step": 4285 }, { "epoch": 1.710295291300878, "grad_norm": 0.29506248235702515, "learning_rate": 2.4157268123662703e-05, "loss": 0.1796, "num_tokens": 560384448.0, "step": 4286 }, { "epoch": 1.7106943335993616, "grad_norm": 0.27434638142585754, "learning_rate": 2.4147481076923833e-05, "loss": 0.1659, "num_tokens": 560515520.0, "step": 4287 }, { "epoch": 1.7110933758978453, "grad_norm": 0.3223726153373718, "learning_rate": 2.413769467886613e-05, "loss": 0.2215, "num_tokens": 560646592.0, "step": 4288 }, { "epoch": 1.711492418196329, "grad_norm": 0.2894812524318695, "learning_rate": 2.4127908931383163e-05, "loss": 0.1856, "num_tokens": 560777664.0, "step": 4289 }, { "epoch": 1.7118914604948126, "grad_norm": 0.30416733026504517, "learning_rate": 2.4118123836368377e-05, "loss": 0.185, "num_tokens": 560908736.0, "step": 4290 }, { "epoch": 1.7122905027932962, "grad_norm": 0.27557656168937683, "learning_rate": 2.4108339395715112e-05, "loss": 0.1558, "num_tokens": 561025321.0, "step": 4291 }, { "epoch": 1.7126895450917798, "grad_norm": 0.2265263944864273, "learning_rate": 2.409855561131656e-05, "loss": 0.1321, "num_tokens": 561156393.0, "step": 4292 }, { "epoch": 1.7130885873902635, "grad_norm": 0.2956455945968628, "learning_rate": 2.4088772485065786e-05, "loss": 0.1929, "num_tokens": 561287465.0, "step": 4293 }, { "epoch": 1.7134876296887471, "grad_norm": 0.2763887345790863, "learning_rate": 2.4078990018855736e-05, "loss": 0.175, "num_tokens": 561418537.0, "step": 4294 }, { "epoch": 1.7138866719872308, "grad_norm": 0.285745233297348, "learning_rate": 2.406920821457922e-05, "loss": 0.1952, "num_tokens": 561549609.0, "step": 4295 }, { "epoch": 1.7142857142857144, "grad_norm": 0.27184250950813293, "learning_rate": 2.4059427074128938e-05, "loss": 0.166, "num_tokens": 561680681.0, "step": 4296 }, { "epoch": 1.714684756584198, "grad_norm": 0.25227734446525574, "learning_rate": 2.4049646599397448e-05, "loss": 0.1646, "num_tokens": 561811753.0, "step": 4297 }, { "epoch": 1.7150837988826817, "grad_norm": 0.2664255201816559, "learning_rate": 2.403986679227717e-05, "loss": 0.1453, "num_tokens": 561942825.0, "step": 4298 }, { "epoch": 1.7154828411811653, "grad_norm": 0.273945689201355, "learning_rate": 2.4030087654660415e-05, "loss": 0.152, "num_tokens": 562073897.0, "step": 4299 }, { "epoch": 1.715881883479649, "grad_norm": 0.2766798734664917, "learning_rate": 2.402030918843936e-05, "loss": 0.1687, "num_tokens": 562204969.0, "step": 4300 }, { "epoch": 1.7162809257781326, "grad_norm": 0.3497006893157959, "learning_rate": 2.401053139550603e-05, "loss": 0.1808, "num_tokens": 562336041.0, "step": 4301 }, { "epoch": 1.7166799680766163, "grad_norm": 0.2913517355918884, "learning_rate": 2.4000754277752352e-05, "loss": 0.1663, "num_tokens": 562467113.0, "step": 4302 }, { "epoch": 1.7170790103751, "grad_norm": 0.29772302508354187, "learning_rate": 2.399097783707011e-05, "loss": 0.1633, "num_tokens": 562598185.0, "step": 4303 }, { "epoch": 1.7174780526735836, "grad_norm": 0.24462002515792847, "learning_rate": 2.3981202075350935e-05, "loss": 0.1372, "num_tokens": 562729257.0, "step": 4304 }, { "epoch": 1.7178770949720672, "grad_norm": 0.28364717960357666, "learning_rate": 2.3971426994486364e-05, "loss": 0.1738, "num_tokens": 562860329.0, "step": 4305 }, { "epoch": 1.7182761372705508, "grad_norm": 0.2675878703594208, "learning_rate": 2.3961652596367763e-05, "loss": 0.1724, "num_tokens": 562991401.0, "step": 4306 }, { "epoch": 1.7186751795690345, "grad_norm": 0.2691749036312103, "learning_rate": 2.3951878882886415e-05, "loss": 0.1667, "num_tokens": 563122473.0, "step": 4307 }, { "epoch": 1.719074221867518, "grad_norm": 0.27411672472953796, "learning_rate": 2.3942105855933416e-05, "loss": 0.1644, "num_tokens": 563253545.0, "step": 4308 }, { "epoch": 1.7194732641660015, "grad_norm": 0.29222989082336426, "learning_rate": 2.3932333517399767e-05, "loss": 0.1954, "num_tokens": 563384617.0, "step": 4309 }, { "epoch": 1.7198723064644852, "grad_norm": 0.27813026309013367, "learning_rate": 2.3922561869176337e-05, "loss": 0.166, "num_tokens": 563515689.0, "step": 4310 }, { "epoch": 1.7202713487629688, "grad_norm": 0.26263588666915894, "learning_rate": 2.3912790913153832e-05, "loss": 0.1612, "num_tokens": 563646761.0, "step": 4311 }, { "epoch": 1.7206703910614525, "grad_norm": 0.29226547479629517, "learning_rate": 2.390302065122284e-05, "loss": 0.1947, "num_tokens": 563777833.0, "step": 4312 }, { "epoch": 1.7210694333599361, "grad_norm": 0.23278234899044037, "learning_rate": 2.3893251085273825e-05, "loss": 0.1216, "num_tokens": 563908905.0, "step": 4313 }, { "epoch": 1.7214684756584198, "grad_norm": 0.2838627099990845, "learning_rate": 2.3883482217197107e-05, "loss": 0.1811, "num_tokens": 564039977.0, "step": 4314 }, { "epoch": 1.7218675179569034, "grad_norm": 0.27034345269203186, "learning_rate": 2.3873714048882874e-05, "loss": 0.1739, "num_tokens": 564171049.0, "step": 4315 }, { "epoch": 1.722266560255387, "grad_norm": 0.24682067334651947, "learning_rate": 2.3863946582221157e-05, "loss": 0.1279, "num_tokens": 564302121.0, "step": 4316 }, { "epoch": 1.7226656025538707, "grad_norm": 0.26349470019340515, "learning_rate": 2.385417981910188e-05, "loss": 0.1585, "num_tokens": 564433193.0, "step": 4317 }, { "epoch": 1.7230646448523543, "grad_norm": 0.2545405626296997, "learning_rate": 2.384441376141483e-05, "loss": 0.1465, "num_tokens": 564553338.0, "step": 4318 }, { "epoch": 1.723463687150838, "grad_norm": 0.29233670234680176, "learning_rate": 2.3834648411049638e-05, "loss": 0.1814, "num_tokens": 564684410.0, "step": 4319 }, { "epoch": 1.7238627294493216, "grad_norm": 0.29778099060058594, "learning_rate": 2.38248837698958e-05, "loss": 0.1787, "num_tokens": 564815482.0, "step": 4320 }, { "epoch": 1.7242617717478053, "grad_norm": 0.30904996395111084, "learning_rate": 2.3815119839842702e-05, "loss": 0.1924, "num_tokens": 564946554.0, "step": 4321 }, { "epoch": 1.724660814046289, "grad_norm": 0.2942698895931244, "learning_rate": 2.380535662277957e-05, "loss": 0.1931, "num_tokens": 565077626.0, "step": 4322 }, { "epoch": 1.7250598563447725, "grad_norm": 0.2962704300880432, "learning_rate": 2.3795594120595482e-05, "loss": 0.1979, "num_tokens": 565208698.0, "step": 4323 }, { "epoch": 1.7254588986432562, "grad_norm": 0.2995184659957886, "learning_rate": 2.3785832335179404e-05, "loss": 0.1854, "num_tokens": 565339770.0, "step": 4324 }, { "epoch": 1.7258579409417398, "grad_norm": 0.25258171558380127, "learning_rate": 2.3776071268420148e-05, "loss": 0.1519, "num_tokens": 565470842.0, "step": 4325 }, { "epoch": 1.7262569832402235, "grad_norm": 0.2841314971446991, "learning_rate": 2.3766310922206393e-05, "loss": 0.1973, "num_tokens": 565601914.0, "step": 4326 }, { "epoch": 1.7266560255387071, "grad_norm": 0.29861339926719666, "learning_rate": 2.375655129842666e-05, "loss": 0.2053, "num_tokens": 565732986.0, "step": 4327 }, { "epoch": 1.7270550678371908, "grad_norm": 0.294125497341156, "learning_rate": 2.3746792398969353e-05, "loss": 0.1646, "num_tokens": 565864058.0, "step": 4328 }, { "epoch": 1.7274541101356744, "grad_norm": 0.30897200107574463, "learning_rate": 2.3737034225722733e-05, "loss": 0.2033, "num_tokens": 565995130.0, "step": 4329 }, { "epoch": 1.727853152434158, "grad_norm": 0.2920216917991638, "learning_rate": 2.3727276780574915e-05, "loss": 0.1648, "num_tokens": 566126202.0, "step": 4330 }, { "epoch": 1.7282521947326417, "grad_norm": 0.24681727588176727, "learning_rate": 2.3717520065413863e-05, "loss": 0.1407, "num_tokens": 566257274.0, "step": 4331 }, { "epoch": 1.7286512370311253, "grad_norm": 0.2521235942840576, "learning_rate": 2.3707764082127425e-05, "loss": 0.1402, "num_tokens": 566388346.0, "step": 4332 }, { "epoch": 1.729050279329609, "grad_norm": 0.268913209438324, "learning_rate": 2.3698008832603285e-05, "loss": 0.184, "num_tokens": 566519418.0, "step": 4333 }, { "epoch": 1.7294493216280926, "grad_norm": 0.24599218368530273, "learning_rate": 2.3688254318728997e-05, "loss": 0.1452, "num_tokens": 566650490.0, "step": 4334 }, { "epoch": 1.7298483639265763, "grad_norm": 0.25087833404541016, "learning_rate": 2.367850054239196e-05, "loss": 0.1342, "num_tokens": 566781562.0, "step": 4335 }, { "epoch": 1.73024740622506, "grad_norm": 0.26598212122917175, "learning_rate": 2.366874750547945e-05, "loss": 0.1764, "num_tokens": 566910269.0, "step": 4336 }, { "epoch": 1.7306464485235435, "grad_norm": 0.29655149579048157, "learning_rate": 2.3658995209878587e-05, "loss": 0.1778, "num_tokens": 567041341.0, "step": 4337 }, { "epoch": 1.7310454908220272, "grad_norm": 0.272661030292511, "learning_rate": 2.364924365747634e-05, "loss": 0.1467, "num_tokens": 567156666.0, "step": 4338 }, { "epoch": 1.7314445331205106, "grad_norm": 0.2779832184314728, "learning_rate": 2.3639492850159543e-05, "loss": 0.1413, "num_tokens": 567287738.0, "step": 4339 }, { "epoch": 1.7318435754189943, "grad_norm": 0.2618694603443146, "learning_rate": 2.3629742789814903e-05, "loss": 0.155, "num_tokens": 567418810.0, "step": 4340 }, { "epoch": 1.732242617717478, "grad_norm": 0.27600422501564026, "learning_rate": 2.361999347832895e-05, "loss": 0.1581, "num_tokens": 567549882.0, "step": 4341 }, { "epoch": 1.7326416600159615, "grad_norm": 0.2501336634159088, "learning_rate": 2.361024491758809e-05, "loss": 0.128, "num_tokens": 567680954.0, "step": 4342 }, { "epoch": 1.7330407023144452, "grad_norm": 0.2894516885280609, "learning_rate": 2.3600497109478575e-05, "loss": 0.1821, "num_tokens": 567812026.0, "step": 4343 }, { "epoch": 1.7334397446129288, "grad_norm": 0.31335920095443726, "learning_rate": 2.3590750055886523e-05, "loss": 0.1705, "num_tokens": 567943098.0, "step": 4344 }, { "epoch": 1.7338387869114125, "grad_norm": 0.272489458322525, "learning_rate": 2.3581003758697895e-05, "loss": 0.1591, "num_tokens": 568074170.0, "step": 4345 }, { "epoch": 1.734237829209896, "grad_norm": 0.26037320494651794, "learning_rate": 2.3571258219798498e-05, "loss": 0.1692, "num_tokens": 568205242.0, "step": 4346 }, { "epoch": 1.7346368715083798, "grad_norm": 0.28111347556114197, "learning_rate": 2.3561513441074023e-05, "loss": 0.1886, "num_tokens": 568336314.0, "step": 4347 }, { "epoch": 1.7350359138068634, "grad_norm": 0.2729122042655945, "learning_rate": 2.355176942440998e-05, "loss": 0.147, "num_tokens": 568467386.0, "step": 4348 }, { "epoch": 1.735434956105347, "grad_norm": 0.2807735502719879, "learning_rate": 2.3542026171691756e-05, "loss": 0.181, "num_tokens": 568598458.0, "step": 4349 }, { "epoch": 1.7358339984038307, "grad_norm": 0.2608790397644043, "learning_rate": 2.3532283684804564e-05, "loss": 0.1734, "num_tokens": 568729530.0, "step": 4350 }, { "epoch": 1.7362330407023143, "grad_norm": 0.27580511569976807, "learning_rate": 2.3522541965633487e-05, "loss": 0.1781, "num_tokens": 568860602.0, "step": 4351 }, { "epoch": 1.736632083000798, "grad_norm": 0.2597658932209015, "learning_rate": 2.3512801016063467e-05, "loss": 0.1575, "num_tokens": 568991674.0, "step": 4352 }, { "epoch": 1.7370311252992816, "grad_norm": 0.22169318795204163, "learning_rate": 2.3503060837979286e-05, "loss": 0.1081, "num_tokens": 569122746.0, "step": 4353 }, { "epoch": 1.7374301675977653, "grad_norm": 0.29490646719932556, "learning_rate": 2.3493321433265564e-05, "loss": 0.1811, "num_tokens": 569253818.0, "step": 4354 }, { "epoch": 1.737829209896249, "grad_norm": 0.284266859292984, "learning_rate": 2.3483582803806803e-05, "loss": 0.1835, "num_tokens": 569384890.0, "step": 4355 }, { "epoch": 1.7382282521947325, "grad_norm": 0.26475363969802856, "learning_rate": 2.3473844951487328e-05, "loss": 0.1731, "num_tokens": 569515962.0, "step": 4356 }, { "epoch": 1.7386272944932162, "grad_norm": 0.29217371344566345, "learning_rate": 2.3464107878191322e-05, "loss": 0.1907, "num_tokens": 569647034.0, "step": 4357 }, { "epoch": 1.7390263367916998, "grad_norm": 0.31449246406555176, "learning_rate": 2.345437158580282e-05, "loss": 0.1946, "num_tokens": 569778106.0, "step": 4358 }, { "epoch": 1.7394253790901835, "grad_norm": 0.27436399459838867, "learning_rate": 2.3444636076205705e-05, "loss": 0.1532, "num_tokens": 569894221.0, "step": 4359 }, { "epoch": 1.739824421388667, "grad_norm": 0.31800273060798645, "learning_rate": 2.343490135128371e-05, "loss": 0.204, "num_tokens": 570025293.0, "step": 4360 }, { "epoch": 1.7402234636871508, "grad_norm": 0.2542210519313812, "learning_rate": 2.34251674129204e-05, "loss": 0.1668, "num_tokens": 570156365.0, "step": 4361 }, { "epoch": 1.7406225059856344, "grad_norm": 0.28392454981803894, "learning_rate": 2.3415434262999213e-05, "loss": 0.1858, "num_tokens": 570287437.0, "step": 4362 }, { "epoch": 1.741021548284118, "grad_norm": 0.25869375467300415, "learning_rate": 2.340570190340342e-05, "loss": 0.1728, "num_tokens": 570418509.0, "step": 4363 }, { "epoch": 1.7414205905826017, "grad_norm": 0.276434063911438, "learning_rate": 2.339597033601615e-05, "loss": 0.1886, "num_tokens": 570549581.0, "step": 4364 }, { "epoch": 1.7418196328810853, "grad_norm": 0.26701775193214417, "learning_rate": 2.338623956272035e-05, "loss": 0.1699, "num_tokens": 570680653.0, "step": 4365 }, { "epoch": 1.742218675179569, "grad_norm": 0.26866576075553894, "learning_rate": 2.3376509585398853e-05, "loss": 0.1669, "num_tokens": 570811725.0, "step": 4366 }, { "epoch": 1.7426177174780526, "grad_norm": 0.24821588397026062, "learning_rate": 2.3366780405934313e-05, "loss": 0.1504, "num_tokens": 570942797.0, "step": 4367 }, { "epoch": 1.7430167597765363, "grad_norm": 0.2816043496131897, "learning_rate": 2.335705202620923e-05, "loss": 0.1837, "num_tokens": 571073869.0, "step": 4368 }, { "epoch": 1.74341580207502, "grad_norm": 0.2702094614505768, "learning_rate": 2.3347324448105967e-05, "loss": 0.1669, "num_tokens": 571204941.0, "step": 4369 }, { "epoch": 1.7438148443735035, "grad_norm": 0.2935172915458679, "learning_rate": 2.3337597673506705e-05, "loss": 0.188, "num_tokens": 571336013.0, "step": 4370 }, { "epoch": 1.7442138866719872, "grad_norm": 0.25912436842918396, "learning_rate": 2.3327871704293508e-05, "loss": 0.1457, "num_tokens": 571467085.0, "step": 4371 }, { "epoch": 1.7446129289704708, "grad_norm": 0.2848103642463684, "learning_rate": 2.331814654234823e-05, "loss": 0.1763, "num_tokens": 571598157.0, "step": 4372 }, { "epoch": 1.7450119712689545, "grad_norm": 0.24243299663066864, "learning_rate": 2.3308422189552613e-05, "loss": 0.142, "num_tokens": 571729229.0, "step": 4373 }, { "epoch": 1.745411013567438, "grad_norm": 0.28365737199783325, "learning_rate": 2.3298698647788235e-05, "loss": 0.1558, "num_tokens": 571860301.0, "step": 4374 }, { "epoch": 1.7458100558659218, "grad_norm": 0.26024726033210754, "learning_rate": 2.3288975918936502e-05, "loss": 0.1355, "num_tokens": 571991373.0, "step": 4375 }, { "epoch": 1.7462090981644054, "grad_norm": 0.2811335027217865, "learning_rate": 2.327925400487867e-05, "loss": 0.1495, "num_tokens": 572122445.0, "step": 4376 }, { "epoch": 1.746608140462889, "grad_norm": 0.30148985981941223, "learning_rate": 2.3269532907495845e-05, "loss": 0.1459, "num_tokens": 572253517.0, "step": 4377 }, { "epoch": 1.7470071827613727, "grad_norm": 0.3732163608074188, "learning_rate": 2.325981262866897e-05, "loss": 0.2175, "num_tokens": 572384589.0, "step": 4378 }, { "epoch": 1.7474062250598563, "grad_norm": 0.2568719685077667, "learning_rate": 2.325009317027882e-05, "loss": 0.1458, "num_tokens": 572515661.0, "step": 4379 }, { "epoch": 1.74780526735834, "grad_norm": 0.3092815577983856, "learning_rate": 2.324037453420603e-05, "loss": 0.1902, "num_tokens": 572646733.0, "step": 4380 }, { "epoch": 1.7482043096568236, "grad_norm": 0.3097609579563141, "learning_rate": 2.3230656722331057e-05, "loss": 0.1824, "num_tokens": 572777805.0, "step": 4381 }, { "epoch": 1.7486033519553073, "grad_norm": 0.2700271010398865, "learning_rate": 2.3220939736534203e-05, "loss": 0.1539, "num_tokens": 572908877.0, "step": 4382 }, { "epoch": 1.749002394253791, "grad_norm": 0.2827499508857727, "learning_rate": 2.321122357869564e-05, "loss": 0.1631, "num_tokens": 573039949.0, "step": 4383 }, { "epoch": 1.7494014365522745, "grad_norm": 0.28208300471305847, "learning_rate": 2.3201508250695318e-05, "loss": 0.1801, "num_tokens": 573171021.0, "step": 4384 }, { "epoch": 1.7498004788507582, "grad_norm": 0.24314603209495544, "learning_rate": 2.3191793754413084e-05, "loss": 0.1598, "num_tokens": 573302093.0, "step": 4385 }, { "epoch": 1.7501995211492418, "grad_norm": 0.2635961174964905, "learning_rate": 2.3182080091728604e-05, "loss": 0.1698, "num_tokens": 573433165.0, "step": 4386 }, { "epoch": 1.7505985634477255, "grad_norm": 0.31976833939552307, "learning_rate": 2.317236726452136e-05, "loss": 0.2246, "num_tokens": 573564237.0, "step": 4387 }, { "epoch": 1.750997605746209, "grad_norm": 0.2872903048992157, "learning_rate": 2.316265527467072e-05, "loss": 0.1977, "num_tokens": 573695309.0, "step": 4388 }, { "epoch": 1.7513966480446927, "grad_norm": 0.2603287696838379, "learning_rate": 2.3152944124055853e-05, "loss": 0.1542, "num_tokens": 573826381.0, "step": 4389 }, { "epoch": 1.7517956903431764, "grad_norm": 0.2674221992492676, "learning_rate": 2.3143233814555765e-05, "loss": 0.1706, "num_tokens": 573957453.0, "step": 4390 }, { "epoch": 1.75219473264166, "grad_norm": 0.36510124802589417, "learning_rate": 2.3133524348049335e-05, "loss": 0.209, "num_tokens": 574088525.0, "step": 4391 }, { "epoch": 1.7525937749401437, "grad_norm": 0.2833033800125122, "learning_rate": 2.3123815726415233e-05, "loss": 0.1688, "num_tokens": 574219597.0, "step": 4392 }, { "epoch": 1.7529928172386273, "grad_norm": 0.33359193801879883, "learning_rate": 2.3114107951531987e-05, "loss": 0.2292, "num_tokens": 574350669.0, "step": 4393 }, { "epoch": 1.753391859537111, "grad_norm": 0.27343010902404785, "learning_rate": 2.3104401025277982e-05, "loss": 0.173, "num_tokens": 574481741.0, "step": 4394 }, { "epoch": 1.7537909018355946, "grad_norm": 0.29944878816604614, "learning_rate": 2.30946949495314e-05, "loss": 0.1864, "num_tokens": 574612813.0, "step": 4395 }, { "epoch": 1.7541899441340782, "grad_norm": 0.2621328830718994, "learning_rate": 2.308498972617027e-05, "loss": 0.1488, "num_tokens": 574743885.0, "step": 4396 }, { "epoch": 1.754588986432562, "grad_norm": 0.2860303521156311, "learning_rate": 2.307528535707248e-05, "loss": 0.193, "num_tokens": 574874957.0, "step": 4397 }, { "epoch": 1.7549880287310455, "grad_norm": 0.27516376972198486, "learning_rate": 2.3065581844115718e-05, "loss": 0.1702, "num_tokens": 575006029.0, "step": 4398 }, { "epoch": 1.7553870710295292, "grad_norm": 0.3117944598197937, "learning_rate": 2.3055879189177538e-05, "loss": 0.2001, "num_tokens": 575137101.0, "step": 4399 }, { "epoch": 1.7557861133280128, "grad_norm": 0.3139278292655945, "learning_rate": 2.304617739413531e-05, "loss": 0.182, "num_tokens": 575251994.0, "step": 4400 }, { "epoch": 1.7561851556264965, "grad_norm": 0.28909197449684143, "learning_rate": 2.303647646086623e-05, "loss": 0.1788, "num_tokens": 575383066.0, "step": 4401 }, { "epoch": 1.75658419792498, "grad_norm": 0.3350515067577362, "learning_rate": 2.302677639124735e-05, "loss": 0.1678, "num_tokens": 575514138.0, "step": 4402 }, { "epoch": 1.7569832402234637, "grad_norm": 0.27651000022888184, "learning_rate": 2.3017077187155546e-05, "loss": 0.1429, "num_tokens": 575645210.0, "step": 4403 }, { "epoch": 1.7573822825219474, "grad_norm": 0.3254581689834595, "learning_rate": 2.300737885046751e-05, "loss": 0.1848, "num_tokens": 575776282.0, "step": 4404 }, { "epoch": 1.757781324820431, "grad_norm": 0.2711760699748993, "learning_rate": 2.2997681383059798e-05, "loss": 0.1593, "num_tokens": 575907354.0, "step": 4405 }, { "epoch": 1.7581803671189147, "grad_norm": 0.28030452132225037, "learning_rate": 2.2987984786808753e-05, "loss": 0.1635, "num_tokens": 576038426.0, "step": 4406 }, { "epoch": 1.7585794094173983, "grad_norm": 0.3028453290462494, "learning_rate": 2.2978289063590602e-05, "loss": 0.2187, "num_tokens": 576169498.0, "step": 4407 }, { "epoch": 1.758978451715882, "grad_norm": 0.25872546434402466, "learning_rate": 2.2968594215281358e-05, "loss": 0.1341, "num_tokens": 576300570.0, "step": 4408 }, { "epoch": 1.7593774940143656, "grad_norm": 0.26542171835899353, "learning_rate": 2.2958900243756893e-05, "loss": 0.1728, "num_tokens": 576431642.0, "step": 4409 }, { "epoch": 1.7597765363128492, "grad_norm": 0.3178917169570923, "learning_rate": 2.2949207150892905e-05, "loss": 0.2025, "num_tokens": 576562714.0, "step": 4410 }, { "epoch": 1.760175578611333, "grad_norm": 0.27982014417648315, "learning_rate": 2.293951493856491e-05, "loss": 0.2039, "num_tokens": 576693786.0, "step": 4411 }, { "epoch": 1.7605746209098165, "grad_norm": 0.2684275209903717, "learning_rate": 2.292982360864826e-05, "loss": 0.1684, "num_tokens": 576824858.0, "step": 4412 }, { "epoch": 1.7609736632083002, "grad_norm": 0.259306401014328, "learning_rate": 2.2920133163018136e-05, "loss": 0.1685, "num_tokens": 576955930.0, "step": 4413 }, { "epoch": 1.7613727055067838, "grad_norm": 0.271083265542984, "learning_rate": 2.291044360354956e-05, "loss": 0.1785, "num_tokens": 577087002.0, "step": 4414 }, { "epoch": 1.7617717478052675, "grad_norm": 0.28447431325912476, "learning_rate": 2.2900754932117356e-05, "loss": 0.1852, "num_tokens": 577214826.0, "step": 4415 }, { "epoch": 1.762170790103751, "grad_norm": 0.26015788316726685, "learning_rate": 2.289106715059621e-05, "loss": 0.1599, "num_tokens": 577345898.0, "step": 4416 }, { "epoch": 1.7625698324022347, "grad_norm": 0.2635069489479065, "learning_rate": 2.288138026086059e-05, "loss": 0.1563, "num_tokens": 577476970.0, "step": 4417 }, { "epoch": 1.7629688747007184, "grad_norm": 0.24431924521923065, "learning_rate": 2.2871694264784847e-05, "loss": 0.133, "num_tokens": 577608042.0, "step": 4418 }, { "epoch": 1.763367916999202, "grad_norm": 0.3064325451850891, "learning_rate": 2.286200916424312e-05, "loss": 0.1766, "num_tokens": 577739114.0, "step": 4419 }, { "epoch": 1.7637669592976857, "grad_norm": 0.27735069394111633, "learning_rate": 2.2852324961109376e-05, "loss": 0.1735, "num_tokens": 577870186.0, "step": 4420 }, { "epoch": 1.7641660015961693, "grad_norm": 0.2642071843147278, "learning_rate": 2.284264165725743e-05, "loss": 0.1678, "num_tokens": 578001258.0, "step": 4421 }, { "epoch": 1.764565043894653, "grad_norm": 0.2634292542934418, "learning_rate": 2.2832959254560917e-05, "loss": 0.1516, "num_tokens": 578132330.0, "step": 4422 }, { "epoch": 1.7649640861931366, "grad_norm": 0.26517799496650696, "learning_rate": 2.2823277754893275e-05, "loss": 0.1503, "num_tokens": 578263402.0, "step": 4423 }, { "epoch": 1.7653631284916202, "grad_norm": 0.24643772840499878, "learning_rate": 2.2813597160127787e-05, "loss": 0.1367, "num_tokens": 578394474.0, "step": 4424 }, { "epoch": 1.765762170790104, "grad_norm": 0.3137742877006531, "learning_rate": 2.280391747213757e-05, "loss": 0.206, "num_tokens": 578525546.0, "step": 4425 }, { "epoch": 1.7661612130885875, "grad_norm": 0.26292046904563904, "learning_rate": 2.279423869279554e-05, "loss": 0.1542, "num_tokens": 578656618.0, "step": 4426 }, { "epoch": 1.7665602553870712, "grad_norm": 0.2387421876192093, "learning_rate": 2.2784560823974453e-05, "loss": 0.1202, "num_tokens": 578786451.0, "step": 4427 }, { "epoch": 1.7669592976855548, "grad_norm": 0.2715749740600586, "learning_rate": 2.2774883867546907e-05, "loss": 0.1709, "num_tokens": 578917523.0, "step": 4428 }, { "epoch": 1.7673583399840385, "grad_norm": 0.32105687260627747, "learning_rate": 2.2765207825385275e-05, "loss": 0.1678, "num_tokens": 579048595.0, "step": 4429 }, { "epoch": 1.767757382282522, "grad_norm": 0.36620017886161804, "learning_rate": 2.2755532699361793e-05, "loss": 0.211, "num_tokens": 579179667.0, "step": 4430 }, { "epoch": 1.7681564245810057, "grad_norm": 0.260130375623703, "learning_rate": 2.2745858491348505e-05, "loss": 0.1632, "num_tokens": 579310739.0, "step": 4431 }, { "epoch": 1.7685554668794894, "grad_norm": 0.2644048035144806, "learning_rate": 2.273618520321728e-05, "loss": 0.1698, "num_tokens": 579441811.0, "step": 4432 }, { "epoch": 1.7689545091779728, "grad_norm": 0.262416273355484, "learning_rate": 2.2726512836839813e-05, "loss": 0.1512, "num_tokens": 579572883.0, "step": 4433 }, { "epoch": 1.7693535514764565, "grad_norm": 0.25043943524360657, "learning_rate": 2.2716841394087613e-05, "loss": 0.1414, "num_tokens": 579703955.0, "step": 4434 }, { "epoch": 1.76975259377494, "grad_norm": 0.28079330921173096, "learning_rate": 2.2707170876832013e-05, "loss": 0.178, "num_tokens": 579835027.0, "step": 4435 }, { "epoch": 1.7701516360734237, "grad_norm": 0.2637503147125244, "learning_rate": 2.269750128694417e-05, "loss": 0.1625, "num_tokens": 579966099.0, "step": 4436 }, { "epoch": 1.7705506783719074, "grad_norm": 0.29997262358665466, "learning_rate": 2.2687832626295068e-05, "loss": 0.2038, "num_tokens": 580097171.0, "step": 4437 }, { "epoch": 1.770949720670391, "grad_norm": 0.25677236914634705, "learning_rate": 2.2678164896755483e-05, "loss": 0.1571, "num_tokens": 580228243.0, "step": 4438 }, { "epoch": 1.7713487629688747, "grad_norm": 0.3421724736690521, "learning_rate": 2.2668498100196055e-05, "loss": 0.2239, "num_tokens": 580359315.0, "step": 4439 }, { "epoch": 1.7717478052673583, "grad_norm": 0.2862696945667267, "learning_rate": 2.26588322384872e-05, "loss": 0.1862, "num_tokens": 580490387.0, "step": 4440 }, { "epoch": 1.772146847565842, "grad_norm": 0.3037320375442505, "learning_rate": 2.2649167313499175e-05, "loss": 0.2034, "num_tokens": 580621459.0, "step": 4441 }, { "epoch": 1.7725458898643256, "grad_norm": 0.2456110715866089, "learning_rate": 2.2639503327102058e-05, "loss": 0.1525, "num_tokens": 580752531.0, "step": 4442 }, { "epoch": 1.7729449321628092, "grad_norm": 0.25556719303131104, "learning_rate": 2.262984028116573e-05, "loss": 0.145, "num_tokens": 580883603.0, "step": 4443 }, { "epoch": 1.7733439744612929, "grad_norm": 0.2886292338371277, "learning_rate": 2.262017817755992e-05, "loss": 0.1836, "num_tokens": 581014675.0, "step": 4444 }, { "epoch": 1.7737430167597765, "grad_norm": 0.280586302280426, "learning_rate": 2.261051701815414e-05, "loss": 0.1877, "num_tokens": 581145747.0, "step": 4445 }, { "epoch": 1.7741420590582602, "grad_norm": 0.2738528251647949, "learning_rate": 2.2600856804817732e-05, "loss": 0.1431, "num_tokens": 581276819.0, "step": 4446 }, { "epoch": 1.7745411013567438, "grad_norm": 0.28129494190216064, "learning_rate": 2.259119753941987e-05, "loss": 0.1534, "num_tokens": 581407891.0, "step": 4447 }, { "epoch": 1.7749401436552275, "grad_norm": 0.282339483499527, "learning_rate": 2.2581539223829528e-05, "loss": 0.1647, "num_tokens": 581538963.0, "step": 4448 }, { "epoch": 1.775339185953711, "grad_norm": 0.3198738098144531, "learning_rate": 2.257188185991549e-05, "loss": 0.1667, "num_tokens": 581670035.0, "step": 4449 }, { "epoch": 1.7757382282521947, "grad_norm": 0.2628544270992279, "learning_rate": 2.2562225449546383e-05, "loss": 0.1444, "num_tokens": 581801107.0, "step": 4450 }, { "epoch": 1.7761372705506784, "grad_norm": 0.2767835557460785, "learning_rate": 2.255256999459061e-05, "loss": 0.1592, "num_tokens": 581932179.0, "step": 4451 }, { "epoch": 1.776536312849162, "grad_norm": 0.2962551414966583, "learning_rate": 2.2542915496916427e-05, "loss": 0.1761, "num_tokens": 582063251.0, "step": 4452 }, { "epoch": 1.7769353551476457, "grad_norm": 0.25614234805107117, "learning_rate": 2.2533261958391888e-05, "loss": 0.1569, "num_tokens": 582194323.0, "step": 4453 }, { "epoch": 1.7773343974461293, "grad_norm": 0.2847142517566681, "learning_rate": 2.252360938088486e-05, "loss": 0.1693, "num_tokens": 582325395.0, "step": 4454 }, { "epoch": 1.777733439744613, "grad_norm": 0.2844516634941101, "learning_rate": 2.2513957766263027e-05, "loss": 0.1893, "num_tokens": 582456467.0, "step": 4455 }, { "epoch": 1.7781324820430966, "grad_norm": 0.2898090183734894, "learning_rate": 2.2504307116393887e-05, "loss": 0.1499, "num_tokens": 582587539.0, "step": 4456 }, { "epoch": 1.7785315243415802, "grad_norm": 0.30908575654029846, "learning_rate": 2.2494657433144744e-05, "loss": 0.1894, "num_tokens": 582718611.0, "step": 4457 }, { "epoch": 1.7789305666400639, "grad_norm": 0.33315640687942505, "learning_rate": 2.2485008718382738e-05, "loss": 0.1749, "num_tokens": 582845117.0, "step": 4458 }, { "epoch": 1.7793296089385475, "grad_norm": 0.2640952467918396, "learning_rate": 2.2475360973974796e-05, "loss": 0.1541, "num_tokens": 582976189.0, "step": 4459 }, { "epoch": 1.7797286512370312, "grad_norm": 0.26711031794548035, "learning_rate": 2.246571420178766e-05, "loss": 0.1684, "num_tokens": 583107261.0, "step": 4460 }, { "epoch": 1.7801276935355148, "grad_norm": 0.27726539969444275, "learning_rate": 2.2456068403687902e-05, "loss": 0.1604, "num_tokens": 583238333.0, "step": 4461 }, { "epoch": 1.7805267358339985, "grad_norm": 0.299640953540802, "learning_rate": 2.2446423581541893e-05, "loss": 0.1805, "num_tokens": 583369405.0, "step": 4462 }, { "epoch": 1.780925778132482, "grad_norm": 0.2766530513763428, "learning_rate": 2.2436779737215808e-05, "loss": 0.166, "num_tokens": 583500477.0, "step": 4463 }, { "epoch": 1.7813248204309655, "grad_norm": 0.26752969622612, "learning_rate": 2.2427136872575645e-05, "loss": 0.1664, "num_tokens": 583631549.0, "step": 4464 }, { "epoch": 1.7817238627294492, "grad_norm": 0.2649976909160614, "learning_rate": 2.24174949894872e-05, "loss": 0.1505, "num_tokens": 583762621.0, "step": 4465 }, { "epoch": 1.7821229050279328, "grad_norm": 0.2427225261926651, "learning_rate": 2.2407854089816104e-05, "loss": 0.1332, "num_tokens": 583893693.0, "step": 4466 }, { "epoch": 1.7825219473264164, "grad_norm": 0.3077233135700226, "learning_rate": 2.2398214175427774e-05, "loss": 0.1668, "num_tokens": 584024765.0, "step": 4467 }, { "epoch": 1.7829209896249, "grad_norm": 0.30425387620925903, "learning_rate": 2.238857524818744e-05, "loss": 0.1643, "num_tokens": 584155837.0, "step": 4468 }, { "epoch": 1.7833200319233837, "grad_norm": 0.26802554726600647, "learning_rate": 2.2378937309960145e-05, "loss": 0.1497, "num_tokens": 584286909.0, "step": 4469 }, { "epoch": 1.7837190742218674, "grad_norm": 0.3298308253288269, "learning_rate": 2.2369300362610752e-05, "loss": 0.1893, "num_tokens": 584417981.0, "step": 4470 }, { "epoch": 1.784118116520351, "grad_norm": 0.2867576777935028, "learning_rate": 2.23596644080039e-05, "loss": 0.159, "num_tokens": 584549053.0, "step": 4471 }, { "epoch": 1.7845171588188347, "grad_norm": 0.27939754724502563, "learning_rate": 2.235002944800407e-05, "loss": 0.1577, "num_tokens": 584680125.0, "step": 4472 }, { "epoch": 1.7849162011173183, "grad_norm": 0.281315416097641, "learning_rate": 2.2340395484475544e-05, "loss": 0.1552, "num_tokens": 584811197.0, "step": 4473 }, { "epoch": 1.785315243415802, "grad_norm": 0.29112404584884644, "learning_rate": 2.233076251928239e-05, "loss": 0.1711, "num_tokens": 584942269.0, "step": 4474 }, { "epoch": 1.7857142857142856, "grad_norm": 0.27633094787597656, "learning_rate": 2.23211305542885e-05, "loss": 0.1294, "num_tokens": 585073341.0, "step": 4475 }, { "epoch": 1.7861133280127692, "grad_norm": 0.267872154712677, "learning_rate": 2.2311499591357565e-05, "loss": 0.1623, "num_tokens": 585204413.0, "step": 4476 }, { "epoch": 1.7865123703112529, "grad_norm": 0.2573646306991577, "learning_rate": 2.23018696323531e-05, "loss": 0.124, "num_tokens": 585321193.0, "step": 4477 }, { "epoch": 1.7869114126097365, "grad_norm": 0.26535895466804504, "learning_rate": 2.2292240679138406e-05, "loss": 0.1439, "num_tokens": 585452265.0, "step": 4478 }, { "epoch": 1.7873104549082202, "grad_norm": 0.34165725111961365, "learning_rate": 2.228261273357658e-05, "loss": 0.1379, "num_tokens": 585583337.0, "step": 4479 }, { "epoch": 1.7877094972067038, "grad_norm": 0.26002034544944763, "learning_rate": 2.227298579753057e-05, "loss": 0.1412, "num_tokens": 585714409.0, "step": 4480 }, { "epoch": 1.7881085395051874, "grad_norm": 0.2630173861980438, "learning_rate": 2.2263359872863077e-05, "loss": 0.1425, "num_tokens": 585845481.0, "step": 4481 }, { "epoch": 1.788507581803671, "grad_norm": 0.26904016733169556, "learning_rate": 2.2253734961436635e-05, "loss": 0.1503, "num_tokens": 585976553.0, "step": 4482 }, { "epoch": 1.7889066241021547, "grad_norm": 0.31599321961402893, "learning_rate": 2.2244111065113567e-05, "loss": 0.1693, "num_tokens": 586107625.0, "step": 4483 }, { "epoch": 1.7893056664006384, "grad_norm": 0.3509141802787781, "learning_rate": 2.2234488185756025e-05, "loss": 0.1925, "num_tokens": 586238697.0, "step": 4484 }, { "epoch": 1.789704708699122, "grad_norm": 0.3297590911388397, "learning_rate": 2.222486632522593e-05, "loss": 0.2174, "num_tokens": 586369769.0, "step": 4485 }, { "epoch": 1.7901037509976057, "grad_norm": 0.26775845885276794, "learning_rate": 2.221524548538502e-05, "loss": 0.1668, "num_tokens": 586500841.0, "step": 4486 }, { "epoch": 1.7905027932960893, "grad_norm": 0.28758320212364197, "learning_rate": 2.220562566809485e-05, "loss": 0.1541, "num_tokens": 586631913.0, "step": 4487 }, { "epoch": 1.790901835594573, "grad_norm": 0.25836965441703796, "learning_rate": 2.2196006875216758e-05, "loss": 0.1481, "num_tokens": 586762985.0, "step": 4488 }, { "epoch": 1.7913008778930566, "grad_norm": 0.3162532448768616, "learning_rate": 2.2186389108611892e-05, "loss": 0.2016, "num_tokens": 586894057.0, "step": 4489 }, { "epoch": 1.7916999201915402, "grad_norm": 0.2709476053714752, "learning_rate": 2.21767723701412e-05, "loss": 0.1665, "num_tokens": 587025129.0, "step": 4490 }, { "epoch": 1.7920989624900239, "grad_norm": 0.26411113142967224, "learning_rate": 2.2167156661665438e-05, "loss": 0.1486, "num_tokens": 587156201.0, "step": 4491 }, { "epoch": 1.7924980047885075, "grad_norm": 0.24684034287929535, "learning_rate": 2.2157541985045143e-05, "loss": 0.139, "num_tokens": 587287273.0, "step": 4492 }, { "epoch": 1.7928970470869912, "grad_norm": 0.25525060296058655, "learning_rate": 2.214792834214068e-05, "loss": 0.1238, "num_tokens": 587418345.0, "step": 4493 }, { "epoch": 1.7932960893854748, "grad_norm": 0.2698892056941986, "learning_rate": 2.2138315734812177e-05, "loss": 0.1567, "num_tokens": 587542312.0, "step": 4494 }, { "epoch": 1.7936951316839584, "grad_norm": 0.3034287393093109, "learning_rate": 2.212870416491961e-05, "loss": 0.1619, "num_tokens": 587673384.0, "step": 4495 }, { "epoch": 1.794094173982442, "grad_norm": 0.307041734457016, "learning_rate": 2.2119093634322724e-05, "loss": 0.1672, "num_tokens": 587804456.0, "step": 4496 }, { "epoch": 1.7944932162809257, "grad_norm": 0.306416779756546, "learning_rate": 2.2109484144881047e-05, "loss": 0.1684, "num_tokens": 587935528.0, "step": 4497 }, { "epoch": 1.7948922585794094, "grad_norm": 0.27029839158058167, "learning_rate": 2.2099875698453942e-05, "loss": 0.1316, "num_tokens": 588066600.0, "step": 4498 }, { "epoch": 1.795291300877893, "grad_norm": 0.2708685100078583, "learning_rate": 2.2090268296900545e-05, "loss": 0.1569, "num_tokens": 588197672.0, "step": 4499 }, { "epoch": 1.7956903431763767, "grad_norm": 0.2828521132469177, "learning_rate": 2.2080661942079805e-05, "loss": 0.1719, "num_tokens": 588328744.0, "step": 4500 }, { "epoch": 1.7960893854748603, "grad_norm": 0.30545538663864136, "learning_rate": 2.207105663585047e-05, "loss": 0.1834, "num_tokens": 588459816.0, "step": 4501 }, { "epoch": 1.796488427773344, "grad_norm": 0.24621565639972687, "learning_rate": 2.2061452380071057e-05, "loss": 0.1225, "num_tokens": 588590888.0, "step": 4502 }, { "epoch": 1.7968874700718276, "grad_norm": 0.27706578373908997, "learning_rate": 2.2051849176599914e-05, "loss": 0.1537, "num_tokens": 588721960.0, "step": 4503 }, { "epoch": 1.7972865123703112, "grad_norm": 0.29173779487609863, "learning_rate": 2.204224702729517e-05, "loss": 0.1816, "num_tokens": 588853032.0, "step": 4504 }, { "epoch": 1.7976855546687949, "grad_norm": 0.2997051179409027, "learning_rate": 2.2032645934014747e-05, "loss": 0.1826, "num_tokens": 588984104.0, "step": 4505 }, { "epoch": 1.7980845969672785, "grad_norm": 0.2499833106994629, "learning_rate": 2.2023045898616375e-05, "loss": 0.14, "num_tokens": 589115176.0, "step": 4506 }, { "epoch": 1.7984836392657622, "grad_norm": 0.26422762870788574, "learning_rate": 2.2013446922957566e-05, "loss": 0.1503, "num_tokens": 589244737.0, "step": 4507 }, { "epoch": 1.7988826815642458, "grad_norm": 0.2806051969528198, "learning_rate": 2.2003849008895633e-05, "loss": 0.1705, "num_tokens": 589375809.0, "step": 4508 }, { "epoch": 1.7992817238627294, "grad_norm": 0.2853122055530548, "learning_rate": 2.1994252158287675e-05, "loss": 0.1535, "num_tokens": 589506881.0, "step": 4509 }, { "epoch": 1.799680766161213, "grad_norm": 0.3058789074420929, "learning_rate": 2.1984656372990602e-05, "loss": 0.163, "num_tokens": 589637953.0, "step": 4510 }, { "epoch": 1.8000798084596967, "grad_norm": 0.29494577646255493, "learning_rate": 2.1975061654861113e-05, "loss": 0.1586, "num_tokens": 589766710.0, "step": 4511 }, { "epoch": 1.8004788507581804, "grad_norm": 0.280139297246933, "learning_rate": 2.1965468005755685e-05, "loss": 0.1848, "num_tokens": 589897782.0, "step": 4512 }, { "epoch": 1.800877893056664, "grad_norm": 0.2701534628868103, "learning_rate": 2.1955875427530603e-05, "loss": 0.1811, "num_tokens": 590028854.0, "step": 4513 }, { "epoch": 1.8012769353551477, "grad_norm": 0.27711230516433716, "learning_rate": 2.1946283922041954e-05, "loss": 0.1711, "num_tokens": 590159926.0, "step": 4514 }, { "epoch": 1.8016759776536313, "grad_norm": 0.26667436957359314, "learning_rate": 2.1936693491145594e-05, "loss": 0.157, "num_tokens": 590290998.0, "step": 4515 }, { "epoch": 1.802075019952115, "grad_norm": 0.2948875427246094, "learning_rate": 2.1927104136697175e-05, "loss": 0.2029, "num_tokens": 590422070.0, "step": 4516 }, { "epoch": 1.8024740622505986, "grad_norm": 0.2613692581653595, "learning_rate": 2.1917515860552158e-05, "loss": 0.1529, "num_tokens": 590553142.0, "step": 4517 }, { "epoch": 1.8028731045490822, "grad_norm": 0.28478771448135376, "learning_rate": 2.190792866456579e-05, "loss": 0.1701, "num_tokens": 590684214.0, "step": 4518 }, { "epoch": 1.8032721468475659, "grad_norm": 0.2595609128475189, "learning_rate": 2.189834255059309e-05, "loss": 0.1478, "num_tokens": 590815286.0, "step": 4519 }, { "epoch": 1.8036711891460495, "grad_norm": 0.28063109517097473, "learning_rate": 2.1888757520488893e-05, "loss": 0.1677, "num_tokens": 590946358.0, "step": 4520 }, { "epoch": 1.8040702314445332, "grad_norm": 0.2701372504234314, "learning_rate": 2.1879173576107804e-05, "loss": 0.1509, "num_tokens": 591077430.0, "step": 4521 }, { "epoch": 1.8044692737430168, "grad_norm": 0.32586193084716797, "learning_rate": 2.1869590719304235e-05, "loss": 0.158, "num_tokens": 591208502.0, "step": 4522 }, { "epoch": 1.8048683160415004, "grad_norm": 0.411683052778244, "learning_rate": 2.186000895193237e-05, "loss": 0.2018, "num_tokens": 591339574.0, "step": 4523 }, { "epoch": 1.805267358339984, "grad_norm": 0.33143189549446106, "learning_rate": 2.1850428275846203e-05, "loss": 0.1898, "num_tokens": 591470646.0, "step": 4524 }, { "epoch": 1.8056664006384677, "grad_norm": 0.30931657552719116, "learning_rate": 2.1840848692899498e-05, "loss": 0.1871, "num_tokens": 591601718.0, "step": 4525 }, { "epoch": 1.8060654429369514, "grad_norm": 0.29219239950180054, "learning_rate": 2.1831270204945815e-05, "loss": 0.209, "num_tokens": 591732790.0, "step": 4526 }, { "epoch": 1.806464485235435, "grad_norm": 0.2795408070087433, "learning_rate": 2.18216928138385e-05, "loss": 0.1653, "num_tokens": 591851738.0, "step": 4527 }, { "epoch": 1.8068635275339187, "grad_norm": 0.2796103060245514, "learning_rate": 2.1812116521430702e-05, "loss": 0.1647, "num_tokens": 591981668.0, "step": 4528 }, { "epoch": 1.8072625698324023, "grad_norm": 0.24987907707691193, "learning_rate": 2.1802541329575338e-05, "loss": 0.1261, "num_tokens": 592112740.0, "step": 4529 }, { "epoch": 1.807661612130886, "grad_norm": 0.24125000834465027, "learning_rate": 2.1792967240125112e-05, "loss": 0.1284, "num_tokens": 592243812.0, "step": 4530 }, { "epoch": 1.8080606544293696, "grad_norm": 0.27744653820991516, "learning_rate": 2.1783394254932525e-05, "loss": 0.1644, "num_tokens": 592374884.0, "step": 4531 }, { "epoch": 1.8084596967278532, "grad_norm": 0.2664555609226227, "learning_rate": 2.1773822375849855e-05, "loss": 0.1558, "num_tokens": 592505956.0, "step": 4532 }, { "epoch": 1.8088587390263369, "grad_norm": 0.24797624349594116, "learning_rate": 2.176425160472918e-05, "loss": 0.1308, "num_tokens": 592637028.0, "step": 4533 }, { "epoch": 1.8092577813248205, "grad_norm": 0.3064050078392029, "learning_rate": 2.175468194342235e-05, "loss": 0.1777, "num_tokens": 592768100.0, "step": 4534 }, { "epoch": 1.8096568236233042, "grad_norm": 0.31726524233818054, "learning_rate": 2.1745113393781e-05, "loss": 0.21, "num_tokens": 592894389.0, "step": 4535 }, { "epoch": 1.8100558659217878, "grad_norm": 0.2780548334121704, "learning_rate": 2.1735545957656577e-05, "loss": 0.1576, "num_tokens": 593025461.0, "step": 4536 }, { "epoch": 1.8104549082202714, "grad_norm": 0.2809200882911682, "learning_rate": 2.172597963690027e-05, "loss": 0.1311, "num_tokens": 593156533.0, "step": 4537 }, { "epoch": 1.810853950518755, "grad_norm": 0.367643266916275, "learning_rate": 2.1716414433363068e-05, "loss": 0.1596, "num_tokens": 593272182.0, "step": 4538 }, { "epoch": 1.8112529928172387, "grad_norm": 0.31830134987831116, "learning_rate": 2.1706850348895775e-05, "loss": 0.1671, "num_tokens": 593403254.0, "step": 4539 }, { "epoch": 1.8116520351157224, "grad_norm": 0.279570072889328, "learning_rate": 2.169728738534893e-05, "loss": 0.1402, "num_tokens": 593534326.0, "step": 4540 }, { "epoch": 1.812051077414206, "grad_norm": 0.2922140061855316, "learning_rate": 2.168772554457289e-05, "loss": 0.1786, "num_tokens": 593665398.0, "step": 4541 }, { "epoch": 1.8124501197126897, "grad_norm": 0.32028016448020935, "learning_rate": 2.167816482841777e-05, "loss": 0.1853, "num_tokens": 593796470.0, "step": 4542 }, { "epoch": 1.8128491620111733, "grad_norm": 0.2992406487464905, "learning_rate": 2.166860523873348e-05, "loss": 0.1624, "num_tokens": 593927542.0, "step": 4543 }, { "epoch": 1.813248204309657, "grad_norm": 0.3148556053638458, "learning_rate": 2.1659046777369718e-05, "loss": 0.1988, "num_tokens": 594058614.0, "step": 4544 }, { "epoch": 1.8136472466081406, "grad_norm": 0.31319084763526917, "learning_rate": 2.1649489446175964e-05, "loss": 0.1896, "num_tokens": 594189686.0, "step": 4545 }, { "epoch": 1.8140462889066242, "grad_norm": 0.28509509563446045, "learning_rate": 2.1639933247001454e-05, "loss": 0.1859, "num_tokens": 594320758.0, "step": 4546 }, { "epoch": 1.8144453312051079, "grad_norm": 0.27899086475372314, "learning_rate": 2.1630378181695237e-05, "loss": 0.1621, "num_tokens": 594451830.0, "step": 4547 }, { "epoch": 1.8148443735035915, "grad_norm": 0.29061752557754517, "learning_rate": 2.162082425210613e-05, "loss": 0.194, "num_tokens": 594582902.0, "step": 4548 }, { "epoch": 1.8152434158020752, "grad_norm": 0.25488200783729553, "learning_rate": 2.161127146008273e-05, "loss": 0.1311, "num_tokens": 594713974.0, "step": 4549 }, { "epoch": 1.8156424581005588, "grad_norm": 0.29333221912384033, "learning_rate": 2.1601719807473397e-05, "loss": 0.1783, "num_tokens": 594845046.0, "step": 4550 }, { "epoch": 1.8160415003990424, "grad_norm": 0.25833845138549805, "learning_rate": 2.15921692961263e-05, "loss": 0.1454, "num_tokens": 594976118.0, "step": 4551 }, { "epoch": 1.816440542697526, "grad_norm": 0.2826990783214569, "learning_rate": 2.1582619927889385e-05, "loss": 0.1723, "num_tokens": 595107190.0, "step": 4552 }, { "epoch": 1.8168395849960097, "grad_norm": 0.31584060192108154, "learning_rate": 2.1573071704610343e-05, "loss": 0.1782, "num_tokens": 595238262.0, "step": 4553 }, { "epoch": 1.8172386272944934, "grad_norm": 0.2740747034549713, "learning_rate": 2.1563524628136672e-05, "loss": 0.1538, "num_tokens": 595369334.0, "step": 4554 }, { "epoch": 1.817637669592977, "grad_norm": 0.32338082790374756, "learning_rate": 2.1553978700315654e-05, "loss": 0.1737, "num_tokens": 595500406.0, "step": 4555 }, { "epoch": 1.8180367118914607, "grad_norm": 0.27584490180015564, "learning_rate": 2.1544433922994323e-05, "loss": 0.1463, "num_tokens": 595631478.0, "step": 4556 }, { "epoch": 1.8184357541899443, "grad_norm": 0.33590295910835266, "learning_rate": 2.1534890298019515e-05, "loss": 0.1722, "num_tokens": 595762550.0, "step": 4557 }, { "epoch": 1.8188347964884277, "grad_norm": 0.3283941447734833, "learning_rate": 2.1525347827237825e-05, "loss": 0.1597, "num_tokens": 595893622.0, "step": 4558 }, { "epoch": 1.8192338387869114, "grad_norm": 0.25013574957847595, "learning_rate": 2.151580651249564e-05, "loss": 0.1307, "num_tokens": 596024694.0, "step": 4559 }, { "epoch": 1.819632881085395, "grad_norm": 0.28266382217407227, "learning_rate": 2.150626635563911e-05, "loss": 0.1619, "num_tokens": 596155766.0, "step": 4560 }, { "epoch": 1.8200319233838786, "grad_norm": 0.3024260401725769, "learning_rate": 2.1496727358514163e-05, "loss": 0.1838, "num_tokens": 596286838.0, "step": 4561 }, { "epoch": 1.8204309656823623, "grad_norm": 0.27329108119010925, "learning_rate": 2.148718952296651e-05, "loss": 0.171, "num_tokens": 596417910.0, "step": 4562 }, { "epoch": 1.820830007980846, "grad_norm": 0.2816303074359894, "learning_rate": 2.1477652850841646e-05, "loss": 0.1744, "num_tokens": 596548982.0, "step": 4563 }, { "epoch": 1.8212290502793296, "grad_norm": 0.28422683477401733, "learning_rate": 2.14681173439848e-05, "loss": 0.1732, "num_tokens": 596680054.0, "step": 4564 }, { "epoch": 1.8216280925778132, "grad_norm": 0.2801678478717804, "learning_rate": 2.1458583004241035e-05, "loss": 0.1707, "num_tokens": 596811126.0, "step": 4565 }, { "epoch": 1.8220271348762969, "grad_norm": 0.27586299180984497, "learning_rate": 2.1449049833455126e-05, "loss": 0.1608, "num_tokens": 596942198.0, "step": 4566 }, { "epoch": 1.8224261771747805, "grad_norm": 0.26205477118492126, "learning_rate": 2.1439517833471672e-05, "loss": 0.1589, "num_tokens": 597073270.0, "step": 4567 }, { "epoch": 1.8228252194732641, "grad_norm": 0.26473596692085266, "learning_rate": 2.1429987006135027e-05, "loss": 0.1404, "num_tokens": 597195458.0, "step": 4568 }, { "epoch": 1.8232242617717478, "grad_norm": 0.34251153469085693, "learning_rate": 2.1420457353289302e-05, "loss": 0.1528, "num_tokens": 597326530.0, "step": 4569 }, { "epoch": 1.8236233040702314, "grad_norm": 0.29211661219596863, "learning_rate": 2.141092887677842e-05, "loss": 0.1701, "num_tokens": 597457602.0, "step": 4570 }, { "epoch": 1.824022346368715, "grad_norm": 0.3007408082485199, "learning_rate": 2.1401401578446027e-05, "loss": 0.1659, "num_tokens": 597588674.0, "step": 4571 }, { "epoch": 1.8244213886671987, "grad_norm": 0.3401590585708618, "learning_rate": 2.1391875460135584e-05, "loss": 0.1879, "num_tokens": 597719746.0, "step": 4572 }, { "epoch": 1.8248204309656824, "grad_norm": 0.2701089680194855, "learning_rate": 2.1382350523690304e-05, "loss": 0.1518, "num_tokens": 597850818.0, "step": 4573 }, { "epoch": 1.825219473264166, "grad_norm": 0.3141736090183258, "learning_rate": 2.1372826770953164e-05, "loss": 0.1869, "num_tokens": 597981890.0, "step": 4574 }, { "epoch": 1.8256185155626496, "grad_norm": 0.31888073682785034, "learning_rate": 2.136330420376694e-05, "loss": 0.1782, "num_tokens": 598112962.0, "step": 4575 }, { "epoch": 1.8260175578611333, "grad_norm": 0.2749354839324951, "learning_rate": 2.1353782823974135e-05, "loss": 0.1626, "num_tokens": 598244034.0, "step": 4576 }, { "epoch": 1.826416600159617, "grad_norm": 0.2842924892902374, "learning_rate": 2.134426263341705e-05, "loss": 0.1735, "num_tokens": 598375106.0, "step": 4577 }, { "epoch": 1.8268156424581006, "grad_norm": 0.28917747735977173, "learning_rate": 2.1334743633937776e-05, "loss": 0.1559, "num_tokens": 598492003.0, "step": 4578 }, { "epoch": 1.8272146847565842, "grad_norm": 0.242210254073143, "learning_rate": 2.1325225827378133e-05, "loss": 0.1246, "num_tokens": 598623075.0, "step": 4579 }, { "epoch": 1.8276137270550679, "grad_norm": 0.23907527327537537, "learning_rate": 2.1315709215579725e-05, "loss": 0.1187, "num_tokens": 598754147.0, "step": 4580 }, { "epoch": 1.8280127693535515, "grad_norm": 0.34363535046577454, "learning_rate": 2.1306193800383943e-05, "loss": 0.1856, "num_tokens": 598885219.0, "step": 4581 }, { "epoch": 1.8284118116520351, "grad_norm": 0.28864187002182007, "learning_rate": 2.129667958363193e-05, "loss": 0.1688, "num_tokens": 599016291.0, "step": 4582 }, { "epoch": 1.8288108539505188, "grad_norm": 0.2967870831489563, "learning_rate": 2.128716656716458e-05, "loss": 0.1722, "num_tokens": 599147363.0, "step": 4583 }, { "epoch": 1.8292098962490024, "grad_norm": 0.26447781920433044, "learning_rate": 2.1277654752822584e-05, "loss": 0.169, "num_tokens": 599278435.0, "step": 4584 }, { "epoch": 1.829608938547486, "grad_norm": 0.281638503074646, "learning_rate": 2.126814414244639e-05, "loss": 0.1848, "num_tokens": 599409507.0, "step": 4585 }, { "epoch": 1.8300079808459697, "grad_norm": 0.26737090945243835, "learning_rate": 2.125863473787622e-05, "loss": 0.1457, "num_tokens": 599540579.0, "step": 4586 }, { "epoch": 1.8304070231444534, "grad_norm": 0.2744874060153961, "learning_rate": 2.1249126540952046e-05, "loss": 0.1662, "num_tokens": 599671651.0, "step": 4587 }, { "epoch": 1.830806065442937, "grad_norm": 0.2896144688129425, "learning_rate": 2.1239619553513606e-05, "loss": 0.189, "num_tokens": 599802723.0, "step": 4588 }, { "epoch": 1.8312051077414206, "grad_norm": 0.28260865807533264, "learning_rate": 2.123011377740043e-05, "loss": 0.1593, "num_tokens": 599933795.0, "step": 4589 }, { "epoch": 1.831604150039904, "grad_norm": 0.25704073905944824, "learning_rate": 2.122060921445179e-05, "loss": 0.1491, "num_tokens": 600064867.0, "step": 4590 }, { "epoch": 1.8320031923383877, "grad_norm": 0.25906381011009216, "learning_rate": 2.1211105866506724e-05, "loss": 0.1346, "num_tokens": 600195939.0, "step": 4591 }, { "epoch": 1.8324022346368714, "grad_norm": 0.3084948658943176, "learning_rate": 2.1201603735404056e-05, "loss": 0.1724, "num_tokens": 600327011.0, "step": 4592 }, { "epoch": 1.832801276935355, "grad_norm": 0.3437713384628296, "learning_rate": 2.1192102822982357e-05, "loss": 0.1764, "num_tokens": 600458083.0, "step": 4593 }, { "epoch": 1.8332003192338386, "grad_norm": 0.2908129394054413, "learning_rate": 2.1182603131079952e-05, "loss": 0.1782, "num_tokens": 600589155.0, "step": 4594 }, { "epoch": 1.8335993615323223, "grad_norm": 0.2694728970527649, "learning_rate": 2.1173104661534954e-05, "loss": 0.1721, "num_tokens": 600720227.0, "step": 4595 }, { "epoch": 1.833998403830806, "grad_norm": 0.30067527294158936, "learning_rate": 2.1163607416185228e-05, "loss": 0.1881, "num_tokens": 600851299.0, "step": 4596 }, { "epoch": 1.8343974461292896, "grad_norm": 0.26905661821365356, "learning_rate": 2.1154111396868405e-05, "loss": 0.1336, "num_tokens": 600966132.0, "step": 4597 }, { "epoch": 1.8347964884277732, "grad_norm": 0.2797291874885559, "learning_rate": 2.1144616605421862e-05, "loss": 0.1471, "num_tokens": 601097204.0, "step": 4598 }, { "epoch": 1.8351955307262569, "grad_norm": 0.259999543428421, "learning_rate": 2.113512304368276e-05, "loss": 0.1458, "num_tokens": 601228276.0, "step": 4599 }, { "epoch": 1.8355945730247405, "grad_norm": 0.27302098274230957, "learning_rate": 2.112563071348802e-05, "loss": 0.1768, "num_tokens": 601359348.0, "step": 4600 }, { "epoch": 1.8359936153232241, "grad_norm": 0.28875410556793213, "learning_rate": 2.111613961667432e-05, "loss": 0.1769, "num_tokens": 601490420.0, "step": 4601 }, { "epoch": 1.8363926576217078, "grad_norm": 0.24268954992294312, "learning_rate": 2.1106649755078084e-05, "loss": 0.1203, "num_tokens": 601621492.0, "step": 4602 }, { "epoch": 1.8367916999201914, "grad_norm": 0.2634657025337219, "learning_rate": 2.109716113053553e-05, "loss": 0.15, "num_tokens": 601752564.0, "step": 4603 }, { "epoch": 1.837190742218675, "grad_norm": 0.2888979911804199, "learning_rate": 2.1087673744882608e-05, "loss": 0.1695, "num_tokens": 601883636.0, "step": 4604 }, { "epoch": 1.8375897845171587, "grad_norm": 0.35051482915878296, "learning_rate": 2.107818759995504e-05, "loss": 0.1762, "num_tokens": 602014708.0, "step": 4605 }, { "epoch": 1.8379888268156424, "grad_norm": 0.28751668334007263, "learning_rate": 2.106870269758831e-05, "loss": 0.1503, "num_tokens": 602145780.0, "step": 4606 }, { "epoch": 1.838387869114126, "grad_norm": 0.2969897985458374, "learning_rate": 2.105921903961766e-05, "loss": 0.1845, "num_tokens": 602276852.0, "step": 4607 }, { "epoch": 1.8387869114126096, "grad_norm": 0.3015400767326355, "learning_rate": 2.104973662787808e-05, "loss": 0.1755, "num_tokens": 602407924.0, "step": 4608 }, { "epoch": 1.8391859537110933, "grad_norm": 0.2657748758792877, "learning_rate": 2.104025546420435e-05, "loss": 0.1552, "num_tokens": 602538996.0, "step": 4609 }, { "epoch": 1.839584996009577, "grad_norm": 0.258259117603302, "learning_rate": 2.1030775550430947e-05, "loss": 0.1371, "num_tokens": 602670068.0, "step": 4610 }, { "epoch": 1.8399840383080606, "grad_norm": 0.31439003348350525, "learning_rate": 2.1021296888392184e-05, "loss": 0.1785, "num_tokens": 602801140.0, "step": 4611 }, { "epoch": 1.8403830806065442, "grad_norm": 0.28833135962486267, "learning_rate": 2.1011819479922078e-05, "loss": 0.1696, "num_tokens": 602932212.0, "step": 4612 }, { "epoch": 1.8407821229050279, "grad_norm": 0.28184932470321655, "learning_rate": 2.1002343326854413e-05, "loss": 0.1739, "num_tokens": 603063284.0, "step": 4613 }, { "epoch": 1.8411811652035115, "grad_norm": 0.28888359665870667, "learning_rate": 2.099286843102275e-05, "loss": 0.1889, "num_tokens": 603194356.0, "step": 4614 }, { "epoch": 1.8415802075019951, "grad_norm": 0.2725869417190552, "learning_rate": 2.0983394794260386e-05, "loss": 0.1518, "num_tokens": 603325428.0, "step": 4615 }, { "epoch": 1.8419792498004788, "grad_norm": 0.2649199068546295, "learning_rate": 2.0973922418400384e-05, "loss": 0.1581, "num_tokens": 603456500.0, "step": 4616 }, { "epoch": 1.8423782920989624, "grad_norm": 0.2888917922973633, "learning_rate": 2.096445130527555e-05, "loss": 0.1738, "num_tokens": 603587572.0, "step": 4617 }, { "epoch": 1.842777334397446, "grad_norm": 0.2890585660934448, "learning_rate": 2.0954981456718474e-05, "loss": 0.1668, "num_tokens": 603718644.0, "step": 4618 }, { "epoch": 1.8431763766959297, "grad_norm": 0.3034583330154419, "learning_rate": 2.0945512874561475e-05, "loss": 0.1808, "num_tokens": 603849716.0, "step": 4619 }, { "epoch": 1.8435754189944134, "grad_norm": 0.2984965741634369, "learning_rate": 2.0936045560636634e-05, "loss": 0.176, "num_tokens": 603980788.0, "step": 4620 }, { "epoch": 1.843974461292897, "grad_norm": 0.2828384339809418, "learning_rate": 2.0926579516775783e-05, "loss": 0.1765, "num_tokens": 604111860.0, "step": 4621 }, { "epoch": 1.8443735035913806, "grad_norm": 0.3045828342437744, "learning_rate": 2.0917114744810523e-05, "loss": 0.1727, "num_tokens": 604242932.0, "step": 4622 }, { "epoch": 1.8447725458898643, "grad_norm": 0.26895084977149963, "learning_rate": 2.0907651246572198e-05, "loss": 0.1476, "num_tokens": 604374004.0, "step": 4623 }, { "epoch": 1.845171588188348, "grad_norm": 0.27580592036247253, "learning_rate": 2.0898189023891895e-05, "loss": 0.1498, "num_tokens": 604505076.0, "step": 4624 }, { "epoch": 1.8455706304868316, "grad_norm": 0.265511691570282, "learning_rate": 2.088872807860048e-05, "loss": 0.1448, "num_tokens": 604636148.0, "step": 4625 }, { "epoch": 1.8459696727853152, "grad_norm": 0.27689510583877563, "learning_rate": 2.0879268412528553e-05, "loss": 0.1507, "num_tokens": 604767220.0, "step": 4626 }, { "epoch": 1.8463687150837989, "grad_norm": 0.2839341163635254, "learning_rate": 2.0869810027506465e-05, "loss": 0.1742, "num_tokens": 604898292.0, "step": 4627 }, { "epoch": 1.8467677573822825, "grad_norm": 0.26579445600509644, "learning_rate": 2.0860352925364335e-05, "loss": 0.1505, "num_tokens": 605029364.0, "step": 4628 }, { "epoch": 1.8471667996807661, "grad_norm": 0.27534806728363037, "learning_rate": 2.085089710793202e-05, "loss": 0.1389, "num_tokens": 605160436.0, "step": 4629 }, { "epoch": 1.8475658419792498, "grad_norm": 0.26927897334098816, "learning_rate": 2.0841442577039134e-05, "loss": 0.1469, "num_tokens": 605291508.0, "step": 4630 }, { "epoch": 1.8479648842777334, "grad_norm": 0.3058408200740814, "learning_rate": 2.0831989334515045e-05, "loss": 0.188, "num_tokens": 605422580.0, "step": 4631 }, { "epoch": 1.848363926576217, "grad_norm": 0.26983997225761414, "learning_rate": 2.0822537382188846e-05, "loss": 0.1541, "num_tokens": 605553652.0, "step": 4632 }, { "epoch": 1.8487629688747007, "grad_norm": 0.2696993052959442, "learning_rate": 2.0813086721889424e-05, "loss": 0.1559, "num_tokens": 605684724.0, "step": 4633 }, { "epoch": 1.8491620111731844, "grad_norm": 0.27096283435821533, "learning_rate": 2.080363735544539e-05, "loss": 0.151, "num_tokens": 605815796.0, "step": 4634 }, { "epoch": 1.849561053471668, "grad_norm": 0.3125078082084656, "learning_rate": 2.0794189284685105e-05, "loss": 0.1783, "num_tokens": 605946868.0, "step": 4635 }, { "epoch": 1.8499600957701516, "grad_norm": 0.3975423574447632, "learning_rate": 2.0784742511436673e-05, "loss": 0.192, "num_tokens": 606077940.0, "step": 4636 }, { "epoch": 1.8503591380686353, "grad_norm": 0.2962251305580139, "learning_rate": 2.0775297037527974e-05, "loss": 0.1776, "num_tokens": 606209012.0, "step": 4637 }, { "epoch": 1.850758180367119, "grad_norm": 0.26206156611442566, "learning_rate": 2.076585286478661e-05, "loss": 0.1461, "num_tokens": 606340084.0, "step": 4638 }, { "epoch": 1.8511572226656026, "grad_norm": 0.26812493801116943, "learning_rate": 2.0756409995039945e-05, "loss": 0.15, "num_tokens": 606471156.0, "step": 4639 }, { "epoch": 1.8515562649640862, "grad_norm": 0.2715696096420288, "learning_rate": 2.0746968430115083e-05, "loss": 0.1384, "num_tokens": 606602228.0, "step": 4640 }, { "epoch": 1.8519553072625698, "grad_norm": 0.2904975116252899, "learning_rate": 2.073752817183887e-05, "loss": 0.1824, "num_tokens": 606733300.0, "step": 4641 }, { "epoch": 1.8523543495610535, "grad_norm": 0.2467641532421112, "learning_rate": 2.0728089222037933e-05, "loss": 0.1456, "num_tokens": 606864372.0, "step": 4642 }, { "epoch": 1.8527533918595371, "grad_norm": 0.32032209634780884, "learning_rate": 2.0718651582538596e-05, "loss": 0.1952, "num_tokens": 606995444.0, "step": 4643 }, { "epoch": 1.8531524341580208, "grad_norm": 0.2731793224811554, "learning_rate": 2.0709215255166953e-05, "loss": 0.1532, "num_tokens": 607126516.0, "step": 4644 }, { "epoch": 1.8535514764565044, "grad_norm": 0.2749151289463043, "learning_rate": 2.0699780241748868e-05, "loss": 0.1712, "num_tokens": 607257588.0, "step": 4645 }, { "epoch": 1.853950518754988, "grad_norm": 0.29031145572662354, "learning_rate": 2.0690346544109913e-05, "loss": 0.169, "num_tokens": 607388660.0, "step": 4646 }, { "epoch": 1.8543495610534717, "grad_norm": 0.23796017467975616, "learning_rate": 2.0680914164075417e-05, "loss": 0.1411, "num_tokens": 607519732.0, "step": 4647 }, { "epoch": 1.8547486033519553, "grad_norm": 0.2795376181602478, "learning_rate": 2.067148310347047e-05, "loss": 0.173, "num_tokens": 607650804.0, "step": 4648 }, { "epoch": 1.855147645650439, "grad_norm": 0.23717747628688812, "learning_rate": 2.0662053364119882e-05, "loss": 0.1278, "num_tokens": 607781876.0, "step": 4649 }, { "epoch": 1.8555466879489226, "grad_norm": 0.24518275260925293, "learning_rate": 2.0652624947848227e-05, "loss": 0.1323, "num_tokens": 607912948.0, "step": 4650 }, { "epoch": 1.8559457302474063, "grad_norm": 0.3091040849685669, "learning_rate": 2.0643197856479812e-05, "loss": 0.1641, "num_tokens": 608044020.0, "step": 4651 }, { "epoch": 1.85634477254589, "grad_norm": 0.26418671011924744, "learning_rate": 2.0633772091838706e-05, "loss": 0.145, "num_tokens": 608175092.0, "step": 4652 }, { "epoch": 1.8567438148443736, "grad_norm": 0.2785569429397583, "learning_rate": 2.0624347655748676e-05, "loss": 0.1475, "num_tokens": 608306164.0, "step": 4653 }, { "epoch": 1.8571428571428572, "grad_norm": 0.27946189045906067, "learning_rate": 2.06149245500333e-05, "loss": 0.1352, "num_tokens": 608437236.0, "step": 4654 }, { "epoch": 1.8575418994413408, "grad_norm": 0.32216960191726685, "learning_rate": 2.0605502776515828e-05, "loss": 0.1906, "num_tokens": 608568308.0, "step": 4655 }, { "epoch": 1.8579409417398245, "grad_norm": 0.28206896781921387, "learning_rate": 2.0596082337019307e-05, "loss": 0.1594, "num_tokens": 608699380.0, "step": 4656 }, { "epoch": 1.8583399840383081, "grad_norm": 0.30422529578208923, "learning_rate": 2.0586663233366494e-05, "loss": 0.1989, "num_tokens": 608830452.0, "step": 4657 }, { "epoch": 1.8587390263367918, "grad_norm": 0.2631191611289978, "learning_rate": 2.0577245467379895e-05, "loss": 0.1516, "num_tokens": 608961524.0, "step": 4658 }, { "epoch": 1.8591380686352754, "grad_norm": 0.28729283809661865, "learning_rate": 2.056782904088177e-05, "loss": 0.1711, "num_tokens": 609092596.0, "step": 4659 }, { "epoch": 1.859537110933759, "grad_norm": 0.27798527479171753, "learning_rate": 2.05584139556941e-05, "loss": 0.1506, "num_tokens": 609223668.0, "step": 4660 }, { "epoch": 1.8599361532322427, "grad_norm": 0.246001198887825, "learning_rate": 2.054900021363862e-05, "loss": 0.1126, "num_tokens": 609354740.0, "step": 4661 }, { "epoch": 1.8603351955307263, "grad_norm": 0.26112645864486694, "learning_rate": 2.053958781653681e-05, "loss": 0.1453, "num_tokens": 609481477.0, "step": 4662 }, { "epoch": 1.86073423782921, "grad_norm": 0.33577844500541687, "learning_rate": 2.053017676620987e-05, "loss": 0.2195, "num_tokens": 609612549.0, "step": 4663 }, { "epoch": 1.8611332801276936, "grad_norm": 0.2668742835521698, "learning_rate": 2.0520767064478745e-05, "loss": 0.1499, "num_tokens": 609743621.0, "step": 4664 }, { "epoch": 1.8615323224261773, "grad_norm": 0.2668294608592987, "learning_rate": 2.0511358713164157e-05, "loss": 0.1538, "num_tokens": 609874693.0, "step": 4665 }, { "epoch": 1.861931364724661, "grad_norm": 0.24892570078372955, "learning_rate": 2.0501951714086492e-05, "loss": 0.1392, "num_tokens": 610005765.0, "step": 4666 }, { "epoch": 1.8623304070231446, "grad_norm": 0.26387107372283936, "learning_rate": 2.0492546069065938e-05, "loss": 0.149, "num_tokens": 610136837.0, "step": 4667 }, { "epoch": 1.8627294493216282, "grad_norm": 0.26970386505126953, "learning_rate": 2.0483141779922397e-05, "loss": 0.1587, "num_tokens": 610267909.0, "step": 4668 }, { "epoch": 1.8631284916201118, "grad_norm": 0.27653008699417114, "learning_rate": 2.047373884847551e-05, "loss": 0.1601, "num_tokens": 610398981.0, "step": 4669 }, { "epoch": 1.8635275339185955, "grad_norm": 0.30821794271469116, "learning_rate": 2.046433727654466e-05, "loss": 0.1833, "num_tokens": 610520041.0, "step": 4670 }, { "epoch": 1.8639265762170791, "grad_norm": 0.2552759647369385, "learning_rate": 2.0454937065948966e-05, "loss": 0.1447, "num_tokens": 610651113.0, "step": 4671 }, { "epoch": 1.8643256185155628, "grad_norm": 0.2944701910018921, "learning_rate": 2.0445538218507267e-05, "loss": 0.1477, "num_tokens": 610782185.0, "step": 4672 }, { "epoch": 1.8647246608140464, "grad_norm": 0.26804956793785095, "learning_rate": 2.043614073603817e-05, "loss": 0.151, "num_tokens": 610913257.0, "step": 4673 }, { "epoch": 1.86512370311253, "grad_norm": 0.26316455006599426, "learning_rate": 2.0426744620359987e-05, "loss": 0.1322, "num_tokens": 611044329.0, "step": 4674 }, { "epoch": 1.8655227454110137, "grad_norm": 0.27406394481658936, "learning_rate": 2.041734987329078e-05, "loss": 0.1505, "num_tokens": 611175401.0, "step": 4675 }, { "epoch": 1.8659217877094973, "grad_norm": 0.2702923119068146, "learning_rate": 2.0407956496648368e-05, "loss": 0.1541, "num_tokens": 611306473.0, "step": 4676 }, { "epoch": 1.866320830007981, "grad_norm": 0.2897327244281769, "learning_rate": 2.0398564492250243e-05, "loss": 0.1677, "num_tokens": 611437545.0, "step": 4677 }, { "epoch": 1.8667198723064646, "grad_norm": 0.2765655219554901, "learning_rate": 2.03891738619137e-05, "loss": 0.1454, "num_tokens": 611568617.0, "step": 4678 }, { "epoch": 1.8671189146049483, "grad_norm": 0.25528937578201294, "learning_rate": 2.0379784607455726e-05, "loss": 0.1485, "num_tokens": 611699689.0, "step": 4679 }, { "epoch": 1.867517956903432, "grad_norm": 0.24879090487957, "learning_rate": 2.0370396730693053e-05, "loss": 0.1298, "num_tokens": 611830761.0, "step": 4680 }, { "epoch": 1.8679169992019156, "grad_norm": 0.28349700570106506, "learning_rate": 2.0361010233442153e-05, "loss": 0.159, "num_tokens": 611961833.0, "step": 4681 }, { "epoch": 1.8683160415003992, "grad_norm": 0.2960546910762787, "learning_rate": 2.0351625117519223e-05, "loss": 0.1644, "num_tokens": 612092905.0, "step": 4682 }, { "epoch": 1.8687150837988828, "grad_norm": 0.3173968493938446, "learning_rate": 2.0342241384740195e-05, "loss": 0.173, "num_tokens": 612217001.0, "step": 4683 }, { "epoch": 1.8691141260973663, "grad_norm": 0.2952680289745331, "learning_rate": 2.0332859036920737e-05, "loss": 0.1646, "num_tokens": 612348073.0, "step": 4684 }, { "epoch": 1.86951316839585, "grad_norm": 0.28715255856513977, "learning_rate": 2.0323478075876252e-05, "loss": 0.1644, "num_tokens": 612479145.0, "step": 4685 }, { "epoch": 1.8699122106943336, "grad_norm": 0.30468666553497314, "learning_rate": 2.0314098503421845e-05, "loss": 0.1795, "num_tokens": 612610217.0, "step": 4686 }, { "epoch": 1.8703112529928172, "grad_norm": 0.26656582951545715, "learning_rate": 2.0304720321372405e-05, "loss": 0.1598, "num_tokens": 612741289.0, "step": 4687 }, { "epoch": 1.8707102952913008, "grad_norm": 0.2623894214630127, "learning_rate": 2.029534353154251e-05, "loss": 0.1492, "num_tokens": 612872361.0, "step": 4688 }, { "epoch": 1.8711093375897845, "grad_norm": 0.25171327590942383, "learning_rate": 2.028596813574648e-05, "loss": 0.1392, "num_tokens": 613003433.0, "step": 4689 }, { "epoch": 1.8715083798882681, "grad_norm": 0.30928701162338257, "learning_rate": 2.027659413579836e-05, "loss": 0.1961, "num_tokens": 613134505.0, "step": 4690 }, { "epoch": 1.8719074221867518, "grad_norm": 0.28863632678985596, "learning_rate": 2.0267221533511937e-05, "loss": 0.1644, "num_tokens": 613265577.0, "step": 4691 }, { "epoch": 1.8723064644852354, "grad_norm": 0.26582297682762146, "learning_rate": 2.0257850330700732e-05, "loss": 0.1714, "num_tokens": 613396649.0, "step": 4692 }, { "epoch": 1.872705506783719, "grad_norm": 0.2642582952976227, "learning_rate": 2.0248480529177975e-05, "loss": 0.1585, "num_tokens": 613527721.0, "step": 4693 }, { "epoch": 1.8731045490822027, "grad_norm": 0.2594408094882965, "learning_rate": 2.023911213075664e-05, "loss": 0.1444, "num_tokens": 613658793.0, "step": 4694 }, { "epoch": 1.8735035913806863, "grad_norm": 0.2619324326515198, "learning_rate": 2.0229745137249418e-05, "loss": 0.1431, "num_tokens": 613789865.0, "step": 4695 }, { "epoch": 1.87390263367917, "grad_norm": 0.2576291561126709, "learning_rate": 2.0220379550468738e-05, "loss": 0.1422, "num_tokens": 613920937.0, "step": 4696 }, { "epoch": 1.8743016759776536, "grad_norm": 0.26687827706336975, "learning_rate": 2.0211015372226766e-05, "loss": 0.1476, "num_tokens": 614052009.0, "step": 4697 }, { "epoch": 1.8747007182761373, "grad_norm": 0.3417443037033081, "learning_rate": 2.0201652604335357e-05, "loss": 0.1678, "num_tokens": 614183081.0, "step": 4698 }, { "epoch": 1.875099760574621, "grad_norm": 0.2621183395385742, "learning_rate": 2.0192291248606147e-05, "loss": 0.1585, "num_tokens": 614314153.0, "step": 4699 }, { "epoch": 1.8754988028731046, "grad_norm": 0.2956498861312866, "learning_rate": 2.018293130685046e-05, "loss": 0.1858, "num_tokens": 614445225.0, "step": 4700 }, { "epoch": 1.8758978451715882, "grad_norm": 0.3210277855396271, "learning_rate": 2.0173572780879352e-05, "loss": 0.1683, "num_tokens": 614576297.0, "step": 4701 }, { "epoch": 1.8762968874700718, "grad_norm": 0.24702316522598267, "learning_rate": 2.0164215672503618e-05, "loss": 0.1255, "num_tokens": 614707369.0, "step": 4702 }, { "epoch": 1.8766959297685555, "grad_norm": 0.2636061906814575, "learning_rate": 2.0154859983533757e-05, "loss": 0.1312, "num_tokens": 614838441.0, "step": 4703 }, { "epoch": 1.8770949720670391, "grad_norm": 0.2452799379825592, "learning_rate": 2.0145505715780028e-05, "loss": 0.1165, "num_tokens": 614969513.0, "step": 4704 }, { "epoch": 1.8774940143655228, "grad_norm": 0.2844592034816742, "learning_rate": 2.0136152871052383e-05, "loss": 0.1558, "num_tokens": 615100585.0, "step": 4705 }, { "epoch": 1.8778930566640064, "grad_norm": 0.26500141620635986, "learning_rate": 2.0126801451160503e-05, "loss": 0.1421, "num_tokens": 615231657.0, "step": 4706 }, { "epoch": 1.87829209896249, "grad_norm": 0.2870892882347107, "learning_rate": 2.0117451457913822e-05, "loss": 0.1815, "num_tokens": 615362729.0, "step": 4707 }, { "epoch": 1.8786911412609737, "grad_norm": 0.2955906093120575, "learning_rate": 2.0108102893121467e-05, "loss": 0.1791, "num_tokens": 615493801.0, "step": 4708 }, { "epoch": 1.8790901835594573, "grad_norm": 0.2660255432128906, "learning_rate": 2.0098755758592287e-05, "loss": 0.1537, "num_tokens": 615624873.0, "step": 4709 }, { "epoch": 1.879489225857941, "grad_norm": 0.27738112211227417, "learning_rate": 2.0089410056134888e-05, "loss": 0.1511, "num_tokens": 615755945.0, "step": 4710 }, { "epoch": 1.8798882681564246, "grad_norm": 0.2770874500274658, "learning_rate": 2.0080065787557552e-05, "loss": 0.1641, "num_tokens": 615887017.0, "step": 4711 }, { "epoch": 1.8802873104549083, "grad_norm": 0.2923499643802643, "learning_rate": 2.007072295466832e-05, "loss": 0.1604, "num_tokens": 616003745.0, "step": 4712 }, { "epoch": 1.880686352753392, "grad_norm": 0.25602683424949646, "learning_rate": 2.006138155927494e-05, "loss": 0.1436, "num_tokens": 616134817.0, "step": 4713 }, { "epoch": 1.8810853950518756, "grad_norm": 0.2550933361053467, "learning_rate": 2.005204160318488e-05, "loss": 0.1364, "num_tokens": 616265889.0, "step": 4714 }, { "epoch": 1.881484437350359, "grad_norm": 0.2783567011356354, "learning_rate": 2.004270308820535e-05, "loss": 0.1617, "num_tokens": 616396961.0, "step": 4715 }, { "epoch": 1.8818834796488426, "grad_norm": 0.2753680646419525, "learning_rate": 2.0033366016143253e-05, "loss": 0.1371, "num_tokens": 616528033.0, "step": 4716 }, { "epoch": 1.8822825219473263, "grad_norm": 0.27395883202552795, "learning_rate": 2.0024030388805225e-05, "loss": 0.1385, "num_tokens": 616659105.0, "step": 4717 }, { "epoch": 1.88268156424581, "grad_norm": 0.27019187808036804, "learning_rate": 2.0014696207997637e-05, "loss": 0.1401, "num_tokens": 616790177.0, "step": 4718 }, { "epoch": 1.8830806065442935, "grad_norm": 0.2592538297176361, "learning_rate": 2.0005363475526552e-05, "loss": 0.1224, "num_tokens": 616921249.0, "step": 4719 }, { "epoch": 1.8834796488427772, "grad_norm": 0.278021901845932, "learning_rate": 1.999603219319776e-05, "loss": 0.1659, "num_tokens": 617052321.0, "step": 4720 }, { "epoch": 1.8838786911412608, "grad_norm": 0.26750752329826355, "learning_rate": 1.99867023628168e-05, "loss": 0.1416, "num_tokens": 617183393.0, "step": 4721 }, { "epoch": 1.8842777334397445, "grad_norm": 0.31203821301460266, "learning_rate": 1.9977373986188896e-05, "loss": 0.1668, "num_tokens": 617314465.0, "step": 4722 }, { "epoch": 1.8846767757382281, "grad_norm": 0.25446397066116333, "learning_rate": 1.9968047065118993e-05, "loss": 0.1403, "num_tokens": 617445537.0, "step": 4723 }, { "epoch": 1.8850758180367118, "grad_norm": 0.29104772210121155, "learning_rate": 1.995872160141178e-05, "loss": 0.1602, "num_tokens": 617576609.0, "step": 4724 }, { "epoch": 1.8854748603351954, "grad_norm": 0.2610938549041748, "learning_rate": 1.9949397596871627e-05, "loss": 0.1456, "num_tokens": 617707681.0, "step": 4725 }, { "epoch": 1.885873902633679, "grad_norm": 0.3046073615550995, "learning_rate": 1.994007505330266e-05, "loss": 0.1999, "num_tokens": 617838753.0, "step": 4726 }, { "epoch": 1.8862729449321627, "grad_norm": 0.2301899939775467, "learning_rate": 1.9930753972508692e-05, "loss": 0.1115, "num_tokens": 617969825.0, "step": 4727 }, { "epoch": 1.8866719872306463, "grad_norm": 0.267874538898468, "learning_rate": 1.9921434356293274e-05, "loss": 0.1415, "num_tokens": 618100897.0, "step": 4728 }, { "epoch": 1.88707102952913, "grad_norm": 0.2545256018638611, "learning_rate": 1.9912116206459662e-05, "loss": 0.1308, "num_tokens": 618231969.0, "step": 4729 }, { "epoch": 1.8874700718276136, "grad_norm": 0.288018137216568, "learning_rate": 1.990279952481083e-05, "loss": 0.164, "num_tokens": 618363041.0, "step": 4730 }, { "epoch": 1.8878691141260973, "grad_norm": 0.30255410075187683, "learning_rate": 1.9893484313149468e-05, "loss": 0.1784, "num_tokens": 618494113.0, "step": 4731 }, { "epoch": 1.888268156424581, "grad_norm": 0.2892838716506958, "learning_rate": 1.9884170573277984e-05, "loss": 0.167, "num_tokens": 618625185.0, "step": 4732 }, { "epoch": 1.8886671987230645, "grad_norm": 0.283429354429245, "learning_rate": 1.9874858306998506e-05, "loss": 0.1474, "num_tokens": 618756257.0, "step": 4733 }, { "epoch": 1.8890662410215482, "grad_norm": 0.32518452405929565, "learning_rate": 1.9865547516112865e-05, "loss": 0.2046, "num_tokens": 618887329.0, "step": 4734 }, { "epoch": 1.8894652833200318, "grad_norm": 0.2831816077232361, "learning_rate": 1.985623820242261e-05, "loss": 0.1701, "num_tokens": 619018401.0, "step": 4735 }, { "epoch": 1.8898643256185155, "grad_norm": 0.2615741789340973, "learning_rate": 1.9846930367729007e-05, "loss": 0.1209, "num_tokens": 619134988.0, "step": 4736 }, { "epoch": 1.8902633679169991, "grad_norm": 0.26918673515319824, "learning_rate": 1.9837624013833034e-05, "loss": 0.1457, "num_tokens": 619266060.0, "step": 4737 }, { "epoch": 1.8906624102154828, "grad_norm": 0.2882469892501831, "learning_rate": 1.982831914253539e-05, "loss": 0.1601, "num_tokens": 619397132.0, "step": 4738 }, { "epoch": 1.8910614525139664, "grad_norm": 0.25247716903686523, "learning_rate": 1.981901575563647e-05, "loss": 0.1228, "num_tokens": 619528204.0, "step": 4739 }, { "epoch": 1.89146049481245, "grad_norm": 0.29222822189331055, "learning_rate": 1.9809713854936412e-05, "loss": 0.1826, "num_tokens": 619659276.0, "step": 4740 }, { "epoch": 1.8918595371109337, "grad_norm": 0.29857319593429565, "learning_rate": 1.9800413442235028e-05, "loss": 0.1591, "num_tokens": 619790348.0, "step": 4741 }, { "epoch": 1.8922585794094173, "grad_norm": 0.3171519339084625, "learning_rate": 1.979111451933187e-05, "loss": 0.1909, "num_tokens": 619921420.0, "step": 4742 }, { "epoch": 1.892657621707901, "grad_norm": 0.2658904194831848, "learning_rate": 1.9781817088026196e-05, "loss": 0.1435, "num_tokens": 620052492.0, "step": 4743 }, { "epoch": 1.8930566640063846, "grad_norm": 0.3099147379398346, "learning_rate": 1.9772521150116964e-05, "loss": 0.1863, "num_tokens": 620183564.0, "step": 4744 }, { "epoch": 1.8934557063048683, "grad_norm": 0.2988841235637665, "learning_rate": 1.9763226707402854e-05, "loss": 0.1565, "num_tokens": 620314636.0, "step": 4745 }, { "epoch": 1.893854748603352, "grad_norm": 0.7945353984832764, "learning_rate": 1.975393376168226e-05, "loss": 0.1594, "num_tokens": 620445708.0, "step": 4746 }, { "epoch": 1.8942537909018355, "grad_norm": 0.2657429277896881, "learning_rate": 1.9744642314753264e-05, "loss": 0.1511, "num_tokens": 620576780.0, "step": 4747 }, { "epoch": 1.8946528332003192, "grad_norm": 0.2874740660190582, "learning_rate": 1.9735352368413697e-05, "loss": 0.1347, "num_tokens": 620707852.0, "step": 4748 }, { "epoch": 1.8950518754988028, "grad_norm": 0.3133399784564972, "learning_rate": 1.972606392446106e-05, "loss": 0.168, "num_tokens": 620838924.0, "step": 4749 }, { "epoch": 1.8954509177972865, "grad_norm": 0.2848142385482788, "learning_rate": 1.971677698469258e-05, "loss": 0.1631, "num_tokens": 620969996.0, "step": 4750 }, { "epoch": 1.8958499600957701, "grad_norm": 0.2674168348312378, "learning_rate": 1.9707491550905204e-05, "loss": 0.1507, "num_tokens": 621101068.0, "step": 4751 }, { "epoch": 1.8962490023942538, "grad_norm": 0.22475649416446686, "learning_rate": 1.9698207624895575e-05, "loss": 0.0967, "num_tokens": 621232140.0, "step": 4752 }, { "epoch": 1.8966480446927374, "grad_norm": 0.2787877917289734, "learning_rate": 1.9688925208460042e-05, "loss": 0.1665, "num_tokens": 621363212.0, "step": 4753 }, { "epoch": 1.897047086991221, "grad_norm": 0.26679790019989014, "learning_rate": 1.9679644303394666e-05, "loss": 0.1543, "num_tokens": 621494284.0, "step": 4754 }, { "epoch": 1.8974461292897047, "grad_norm": 0.3026093542575836, "learning_rate": 1.967036491149522e-05, "loss": 0.1817, "num_tokens": 621625356.0, "step": 4755 }, { "epoch": 1.8978451715881883, "grad_norm": 0.25293615460395813, "learning_rate": 1.966108703455718e-05, "loss": 0.1385, "num_tokens": 621756428.0, "step": 4756 }, { "epoch": 1.898244213886672, "grad_norm": 0.3188513219356537, "learning_rate": 1.965181067437572e-05, "loss": 0.1803, "num_tokens": 621887500.0, "step": 4757 }, { "epoch": 1.8986432561851556, "grad_norm": 0.26476722955703735, "learning_rate": 1.964253583274573e-05, "loss": 0.1439, "num_tokens": 622018572.0, "step": 4758 }, { "epoch": 1.8990422984836393, "grad_norm": 0.31781917810440063, "learning_rate": 1.9633262511461814e-05, "loss": 0.1542, "num_tokens": 622149644.0, "step": 4759 }, { "epoch": 1.899441340782123, "grad_norm": 0.28601235151290894, "learning_rate": 1.962399071231827e-05, "loss": 0.1508, "num_tokens": 622280716.0, "step": 4760 }, { "epoch": 1.8998403830806065, "grad_norm": 0.2456696331501007, "learning_rate": 1.96147204371091e-05, "loss": 0.1338, "num_tokens": 622411788.0, "step": 4761 }, { "epoch": 1.9002394253790902, "grad_norm": 0.276528537273407, "learning_rate": 1.9605451687628024e-05, "loss": 0.1522, "num_tokens": 622542860.0, "step": 4762 }, { "epoch": 1.9006384676775738, "grad_norm": 0.24140019714832306, "learning_rate": 1.9596184465668455e-05, "loss": 0.1305, "num_tokens": 622673932.0, "step": 4763 }, { "epoch": 1.9010375099760575, "grad_norm": 0.2627866864204407, "learning_rate": 1.9586918773023514e-05, "loss": 0.1384, "num_tokens": 622805004.0, "step": 4764 }, { "epoch": 1.9014365522745411, "grad_norm": 0.2914610803127289, "learning_rate": 1.957765461148602e-05, "loss": 0.1418, "num_tokens": 622936076.0, "step": 4765 }, { "epoch": 1.9018355945730248, "grad_norm": 0.3447662889957428, "learning_rate": 1.956839198284851e-05, "loss": 0.2016, "num_tokens": 623067148.0, "step": 4766 }, { "epoch": 1.9022346368715084, "grad_norm": 0.28880617022514343, "learning_rate": 1.955913088890322e-05, "loss": 0.1399, "num_tokens": 623198220.0, "step": 4767 }, { "epoch": 1.902633679169992, "grad_norm": 0.293468177318573, "learning_rate": 1.9549871331442083e-05, "loss": 0.1701, "num_tokens": 623329292.0, "step": 4768 }, { "epoch": 1.9030327214684757, "grad_norm": 0.2917512059211731, "learning_rate": 1.9540613312256717e-05, "loss": 0.1718, "num_tokens": 623460364.0, "step": 4769 }, { "epoch": 1.9034317637669593, "grad_norm": 0.2710530757904053, "learning_rate": 1.9531356833138487e-05, "loss": 0.1302, "num_tokens": 623591436.0, "step": 4770 }, { "epoch": 1.903830806065443, "grad_norm": 0.26246190071105957, "learning_rate": 1.9522101895878424e-05, "loss": 0.1525, "num_tokens": 623722508.0, "step": 4771 }, { "epoch": 1.9042298483639266, "grad_norm": 0.3439481258392334, "learning_rate": 1.9512848502267284e-05, "loss": 0.2048, "num_tokens": 623839683.0, "step": 4772 }, { "epoch": 1.9046288906624103, "grad_norm": 0.2710173726081848, "learning_rate": 1.950359665409549e-05, "loss": 0.1579, "num_tokens": 623970755.0, "step": 4773 }, { "epoch": 1.905027932960894, "grad_norm": 0.2648380398750305, "learning_rate": 1.949434635315321e-05, "loss": 0.1456, "num_tokens": 624101827.0, "step": 4774 }, { "epoch": 1.9054269752593775, "grad_norm": 0.286812961101532, "learning_rate": 1.9485097601230284e-05, "loss": 0.1705, "num_tokens": 624232899.0, "step": 4775 }, { "epoch": 1.9058260175578612, "grad_norm": 0.2816013991832733, "learning_rate": 1.947585040011625e-05, "loss": 0.1497, "num_tokens": 624363971.0, "step": 4776 }, { "epoch": 1.9062250598563448, "grad_norm": 0.30558285117149353, "learning_rate": 1.946660475160037e-05, "loss": 0.1534, "num_tokens": 624495043.0, "step": 4777 }, { "epoch": 1.9066241021548285, "grad_norm": 0.2506742775440216, "learning_rate": 1.945736065747159e-05, "loss": 0.1409, "num_tokens": 624626115.0, "step": 4778 }, { "epoch": 1.9070231444533121, "grad_norm": 0.26348403096199036, "learning_rate": 1.9448118119518545e-05, "loss": 0.1375, "num_tokens": 624757187.0, "step": 4779 }, { "epoch": 1.9074221867517958, "grad_norm": 0.3074258267879486, "learning_rate": 1.943887713952958e-05, "loss": 0.1737, "num_tokens": 624888259.0, "step": 4780 }, { "epoch": 1.9078212290502794, "grad_norm": 0.27469632029533386, "learning_rate": 1.9429637719292748e-05, "loss": 0.1398, "num_tokens": 625019331.0, "step": 4781 }, { "epoch": 1.908220271348763, "grad_norm": 0.2762533128261566, "learning_rate": 1.942039986059579e-05, "loss": 0.1674, "num_tokens": 625150403.0, "step": 4782 }, { "epoch": 1.9086193136472467, "grad_norm": 0.31449806690216064, "learning_rate": 1.9411163565226136e-05, "loss": 0.1584, "num_tokens": 625281475.0, "step": 4783 }, { "epoch": 1.9090183559457303, "grad_norm": 0.2886739671230316, "learning_rate": 1.940192883497093e-05, "loss": 0.1527, "num_tokens": 625412547.0, "step": 4784 }, { "epoch": 1.909417398244214, "grad_norm": 0.25768524408340454, "learning_rate": 1.9392695671617006e-05, "loss": 0.1346, "num_tokens": 625543619.0, "step": 4785 }, { "epoch": 1.9098164405426976, "grad_norm": 0.2968479096889496, "learning_rate": 1.938346407695089e-05, "loss": 0.1677, "num_tokens": 625674691.0, "step": 4786 }, { "epoch": 1.9102154828411813, "grad_norm": 0.3006088137626648, "learning_rate": 1.9374234052758817e-05, "loss": 0.1744, "num_tokens": 625805763.0, "step": 4787 }, { "epoch": 1.910614525139665, "grad_norm": 0.2814566195011139, "learning_rate": 1.936500560082671e-05, "loss": 0.1386, "num_tokens": 625936835.0, "step": 4788 }, { "epoch": 1.9110135674381485, "grad_norm": 0.27051427960395813, "learning_rate": 1.9355778722940193e-05, "loss": 0.1561, "num_tokens": 626067907.0, "step": 4789 }, { "epoch": 1.9114126097366322, "grad_norm": 0.2728572487831116, "learning_rate": 1.934655342088456e-05, "loss": 0.1483, "num_tokens": 626198979.0, "step": 4790 }, { "epoch": 1.9118116520351158, "grad_norm": 0.26922208070755005, "learning_rate": 1.933732969644484e-05, "loss": 0.1317, "num_tokens": 626330051.0, "step": 4791 }, { "epoch": 1.9122106943335995, "grad_norm": 0.2871982157230377, "learning_rate": 1.9328107551405728e-05, "loss": 0.1431, "num_tokens": 626461123.0, "step": 4792 }, { "epoch": 1.9126097366320831, "grad_norm": 0.2657373249530792, "learning_rate": 1.931888698755163e-05, "loss": 0.1488, "num_tokens": 626592195.0, "step": 4793 }, { "epoch": 1.9130087789305668, "grad_norm": 0.3095463216304779, "learning_rate": 1.9309668006666642e-05, "loss": 0.1664, "num_tokens": 626723267.0, "step": 4794 }, { "epoch": 1.9134078212290504, "grad_norm": 0.3117087781429291, "learning_rate": 1.9300450610534537e-05, "loss": 0.1597, "num_tokens": 626854339.0, "step": 4795 }, { "epoch": 1.913806863527534, "grad_norm": 0.250316858291626, "learning_rate": 1.9291234800938806e-05, "loss": 0.1335, "num_tokens": 626985411.0, "step": 4796 }, { "epoch": 1.9142059058260177, "grad_norm": 0.24483349919319153, "learning_rate": 1.9282020579662624e-05, "loss": 0.1252, "num_tokens": 627116483.0, "step": 4797 }, { "epoch": 1.9146049481245013, "grad_norm": 0.2821343243122101, "learning_rate": 1.9272807948488848e-05, "loss": 0.1703, "num_tokens": 627247555.0, "step": 4798 }, { "epoch": 1.915003990422985, "grad_norm": 0.25191450119018555, "learning_rate": 1.9263596909200045e-05, "loss": 0.1282, "num_tokens": 627378627.0, "step": 4799 }, { "epoch": 1.9154030327214686, "grad_norm": 0.3093520402908325, "learning_rate": 1.925438746357846e-05, "loss": 0.1709, "num_tokens": 627509699.0, "step": 4800 }, { "epoch": 1.9158020750199523, "grad_norm": 0.3133426308631897, "learning_rate": 1.924517961340604e-05, "loss": 0.1554, "num_tokens": 627640771.0, "step": 4801 }, { "epoch": 1.916201117318436, "grad_norm": 0.30001866817474365, "learning_rate": 1.923597336046441e-05, "loss": 0.159, "num_tokens": 627771843.0, "step": 4802 }, { "epoch": 1.9166001596169195, "grad_norm": 0.29550591111183167, "learning_rate": 1.9226768706534897e-05, "loss": 0.1519, "num_tokens": 627888020.0, "step": 4803 }, { "epoch": 1.9169992019154032, "grad_norm": 0.2849859893321991, "learning_rate": 1.9217565653398523e-05, "loss": 0.1538, "num_tokens": 628019092.0, "step": 4804 }, { "epoch": 1.9173982442138868, "grad_norm": 0.25299403071403503, "learning_rate": 1.9208364202835984e-05, "loss": 0.1277, "num_tokens": 628150164.0, "step": 4805 }, { "epoch": 1.9177972865123705, "grad_norm": 0.29666566848754883, "learning_rate": 1.919916435662767e-05, "loss": 0.1633, "num_tokens": 628281236.0, "step": 4806 }, { "epoch": 1.9181963288108541, "grad_norm": 0.23365016281604767, "learning_rate": 1.9189966116553683e-05, "loss": 0.1136, "num_tokens": 628412308.0, "step": 4807 }, { "epoch": 1.9185953711093378, "grad_norm": 0.284586638212204, "learning_rate": 1.9180769484393785e-05, "loss": 0.1659, "num_tokens": 628543380.0, "step": 4808 }, { "epoch": 1.9189944134078212, "grad_norm": 0.2758043706417084, "learning_rate": 1.917157446192744e-05, "loss": 0.1521, "num_tokens": 628674452.0, "step": 4809 }, { "epoch": 1.9193934557063048, "grad_norm": 0.2776883840560913, "learning_rate": 1.91623810509338e-05, "loss": 0.1587, "num_tokens": 628805524.0, "step": 4810 }, { "epoch": 1.9197924980047885, "grad_norm": 0.3061305582523346, "learning_rate": 1.9153189253191705e-05, "loss": 0.1708, "num_tokens": 628936596.0, "step": 4811 }, { "epoch": 1.920191540303272, "grad_norm": 0.28023597598075867, "learning_rate": 1.914399907047969e-05, "loss": 0.1387, "num_tokens": 629067668.0, "step": 4812 }, { "epoch": 1.9205905826017557, "grad_norm": 0.26647984981536865, "learning_rate": 1.9134810504575957e-05, "loss": 0.1406, "num_tokens": 629198740.0, "step": 4813 }, { "epoch": 1.9209896249002394, "grad_norm": 0.2439822107553482, "learning_rate": 1.91256235572584e-05, "loss": 0.1232, "num_tokens": 629329812.0, "step": 4814 }, { "epoch": 1.921388667198723, "grad_norm": 0.2903727889060974, "learning_rate": 1.911643823030463e-05, "loss": 0.1639, "num_tokens": 629460884.0, "step": 4815 }, { "epoch": 1.9217877094972067, "grad_norm": 0.26046985387802124, "learning_rate": 1.9107254525491912e-05, "loss": 0.1481, "num_tokens": 629591956.0, "step": 4816 }, { "epoch": 1.9221867517956903, "grad_norm": 0.271754652261734, "learning_rate": 1.90980724445972e-05, "loss": 0.1477, "num_tokens": 629723028.0, "step": 4817 }, { "epoch": 1.922585794094174, "grad_norm": 0.3135685324668884, "learning_rate": 1.9088891989397158e-05, "loss": 0.1756, "num_tokens": 629854100.0, "step": 4818 }, { "epoch": 1.9229848363926576, "grad_norm": 0.26674285531044006, "learning_rate": 1.9079713161668102e-05, "loss": 0.1362, "num_tokens": 629985172.0, "step": 4819 }, { "epoch": 1.9233838786911412, "grad_norm": 0.2505815327167511, "learning_rate": 1.9070535963186052e-05, "loss": 0.1239, "num_tokens": 630116244.0, "step": 4820 }, { "epoch": 1.923782920989625, "grad_norm": 0.32660698890686035, "learning_rate": 1.9061360395726723e-05, "loss": 0.1717, "num_tokens": 630247316.0, "step": 4821 }, { "epoch": 1.9241819632881085, "grad_norm": 0.30552515387535095, "learning_rate": 1.905218646106549e-05, "loss": 0.1614, "num_tokens": 630378388.0, "step": 4822 }, { "epoch": 1.9245810055865922, "grad_norm": 0.2930552065372467, "learning_rate": 1.9043014160977435e-05, "loss": 0.1766, "num_tokens": 630498210.0, "step": 4823 }, { "epoch": 1.9249800478850758, "grad_norm": 0.27066290378570557, "learning_rate": 1.9033843497237303e-05, "loss": 0.1501, "num_tokens": 630629282.0, "step": 4824 }, { "epoch": 1.9253790901835595, "grad_norm": 0.25806504487991333, "learning_rate": 1.902467447161952e-05, "loss": 0.1284, "num_tokens": 630760354.0, "step": 4825 }, { "epoch": 1.925778132482043, "grad_norm": 0.25691288709640503, "learning_rate": 1.901550708589823e-05, "loss": 0.1226, "num_tokens": 630891426.0, "step": 4826 }, { "epoch": 1.9261771747805267, "grad_norm": 0.2937837839126587, "learning_rate": 1.9006341341847227e-05, "loss": 0.1572, "num_tokens": 631022498.0, "step": 4827 }, { "epoch": 1.9265762170790104, "grad_norm": 0.2853134572505951, "learning_rate": 1.899717724124e-05, "loss": 0.1589, "num_tokens": 631153570.0, "step": 4828 }, { "epoch": 1.926975259377494, "grad_norm": 0.2605464458465576, "learning_rate": 1.898801478584971e-05, "loss": 0.1324, "num_tokens": 631284642.0, "step": 4829 }, { "epoch": 1.9273743016759777, "grad_norm": 0.3079090118408203, "learning_rate": 1.897885397744921e-05, "loss": 0.1567, "num_tokens": 631415714.0, "step": 4830 }, { "epoch": 1.9277733439744613, "grad_norm": 0.3058723211288452, "learning_rate": 1.8969694817811035e-05, "loss": 0.1476, "num_tokens": 631546786.0, "step": 4831 }, { "epoch": 1.928172386272945, "grad_norm": 0.283477783203125, "learning_rate": 1.89605373087074e-05, "loss": 0.1341, "num_tokens": 631677858.0, "step": 4832 }, { "epoch": 1.9285714285714286, "grad_norm": 0.265773206949234, "learning_rate": 1.8951381451910184e-05, "loss": 0.1444, "num_tokens": 631808930.0, "step": 4833 }, { "epoch": 1.9289704708699122, "grad_norm": 0.24960243701934814, "learning_rate": 1.8942227249190968e-05, "loss": 0.1198, "num_tokens": 631940002.0, "step": 4834 }, { "epoch": 1.9293695131683959, "grad_norm": 0.2644064426422119, "learning_rate": 1.893307470232102e-05, "loss": 0.1249, "num_tokens": 632071074.0, "step": 4835 }, { "epoch": 1.9297685554668795, "grad_norm": 0.28568941354751587, "learning_rate": 1.8923923813071236e-05, "loss": 0.1369, "num_tokens": 632202146.0, "step": 4836 }, { "epoch": 1.9301675977653632, "grad_norm": 0.3105751872062683, "learning_rate": 1.8914774583212263e-05, "loss": 0.1633, "num_tokens": 632333218.0, "step": 4837 }, { "epoch": 1.9305666400638468, "grad_norm": 0.2796390652656555, "learning_rate": 1.890562701451437e-05, "loss": 0.1457, "num_tokens": 632464290.0, "step": 4838 }, { "epoch": 1.9309656823623305, "grad_norm": 0.362404465675354, "learning_rate": 1.8896481108747537e-05, "loss": 0.1965, "num_tokens": 632595362.0, "step": 4839 }, { "epoch": 1.9313647246608139, "grad_norm": 0.2489413619041443, "learning_rate": 1.8887336867681405e-05, "loss": 0.1227, "num_tokens": 632726434.0, "step": 4840 }, { "epoch": 1.9317637669592975, "grad_norm": 0.2807775139808655, "learning_rate": 1.8878194293085304e-05, "loss": 0.1588, "num_tokens": 632857506.0, "step": 4841 }, { "epoch": 1.9321628092577812, "grad_norm": 0.27780991792678833, "learning_rate": 1.886905338672824e-05, "loss": 0.1505, "num_tokens": 632988578.0, "step": 4842 }, { "epoch": 1.9325618515562648, "grad_norm": 0.24019593000411987, "learning_rate": 1.885991415037889e-05, "loss": 0.1067, "num_tokens": 633119650.0, "step": 4843 }, { "epoch": 1.9329608938547485, "grad_norm": 0.2891428470611572, "learning_rate": 1.88507765858056e-05, "loss": 0.152, "num_tokens": 633250722.0, "step": 4844 }, { "epoch": 1.933359936153232, "grad_norm": 0.28493422269821167, "learning_rate": 1.8841640694776426e-05, "loss": 0.1739, "num_tokens": 633381794.0, "step": 4845 }, { "epoch": 1.9337589784517157, "grad_norm": 0.29808223247528076, "learning_rate": 1.8832506479059063e-05, "loss": 0.1699, "num_tokens": 633512866.0, "step": 4846 }, { "epoch": 1.9341580207501994, "grad_norm": 0.27592548727989197, "learning_rate": 1.882337394042089e-05, "loss": 0.1492, "num_tokens": 633643938.0, "step": 4847 }, { "epoch": 1.934557063048683, "grad_norm": 0.2662820518016815, "learning_rate": 1.881424308062899e-05, "loss": 0.1379, "num_tokens": 633775010.0, "step": 4848 }, { "epoch": 1.9349561053471667, "grad_norm": 0.3211502432823181, "learning_rate": 1.8805113901450077e-05, "loss": 0.1867, "num_tokens": 633906082.0, "step": 4849 }, { "epoch": 1.9353551476456503, "grad_norm": 0.2591944932937622, "learning_rate": 1.8795986404650572e-05, "loss": 0.1289, "num_tokens": 634037154.0, "step": 4850 }, { "epoch": 1.935754189944134, "grad_norm": 0.2592165172100067, "learning_rate": 1.8786860591996557e-05, "loss": 0.1314, "num_tokens": 634168226.0, "step": 4851 }, { "epoch": 1.9361532322426176, "grad_norm": 0.3025195896625519, "learning_rate": 1.8777736465253802e-05, "loss": 0.1736, "num_tokens": 634299298.0, "step": 4852 }, { "epoch": 1.9365522745411012, "grad_norm": 0.30569547414779663, "learning_rate": 1.876861402618773e-05, "loss": 0.1645, "num_tokens": 634430370.0, "step": 4853 }, { "epoch": 1.9369513168395849, "grad_norm": 0.27731209993362427, "learning_rate": 1.875949327656345e-05, "loss": 0.1425, "num_tokens": 634561442.0, "step": 4854 }, { "epoch": 1.9373503591380685, "grad_norm": 0.25800883769989014, "learning_rate": 1.8750374218145744e-05, "loss": 0.1319, "num_tokens": 634692514.0, "step": 4855 }, { "epoch": 1.9377494014365522, "grad_norm": 0.2774898111820221, "learning_rate": 1.874125685269906e-05, "loss": 0.1454, "num_tokens": 634823586.0, "step": 4856 }, { "epoch": 1.9381484437350358, "grad_norm": 0.3357482850551605, "learning_rate": 1.8732141181987538e-05, "loss": 0.1521, "num_tokens": 634954658.0, "step": 4857 }, { "epoch": 1.9385474860335195, "grad_norm": 0.28751084208488464, "learning_rate": 1.8723027207774957e-05, "loss": 0.1616, "num_tokens": 635085730.0, "step": 4858 }, { "epoch": 1.938946528332003, "grad_norm": 0.2987452745437622, "learning_rate": 1.8713914931824793e-05, "loss": 0.154, "num_tokens": 635216802.0, "step": 4859 }, { "epoch": 1.9393455706304867, "grad_norm": 0.26134535670280457, "learning_rate": 1.8704804355900187e-05, "loss": 0.1278, "num_tokens": 635347874.0, "step": 4860 }, { "epoch": 1.9397446129289704, "grad_norm": 0.33148905634880066, "learning_rate": 1.8695695481763952e-05, "loss": 0.1785, "num_tokens": 635478946.0, "step": 4861 }, { "epoch": 1.940143655227454, "grad_norm": 0.2716238796710968, "learning_rate": 1.868658831117856e-05, "loss": 0.1615, "num_tokens": 635610018.0, "step": 4862 }, { "epoch": 1.9405426975259377, "grad_norm": 0.2685895264148712, "learning_rate": 1.867748284590618e-05, "loss": 0.1513, "num_tokens": 635741090.0, "step": 4863 }, { "epoch": 1.9409417398244213, "grad_norm": 0.28633251786231995, "learning_rate": 1.866837908770863e-05, "loss": 0.1547, "num_tokens": 635872162.0, "step": 4864 }, { "epoch": 1.941340782122905, "grad_norm": 0.30078762769699097, "learning_rate": 1.8659277038347394e-05, "loss": 0.1716, "num_tokens": 636003234.0, "step": 4865 }, { "epoch": 1.9417398244213886, "grad_norm": 0.2709192633628845, "learning_rate": 1.8650176699583638e-05, "loss": 0.1499, "num_tokens": 636134306.0, "step": 4866 }, { "epoch": 1.9421388667198722, "grad_norm": 0.2672830820083618, "learning_rate": 1.8641078073178202e-05, "loss": 0.1309, "num_tokens": 636265378.0, "step": 4867 }, { "epoch": 1.9425379090183559, "grad_norm": 0.3117278814315796, "learning_rate": 1.8631981160891566e-05, "loss": 0.1755, "num_tokens": 636396450.0, "step": 4868 }, { "epoch": 1.9429369513168395, "grad_norm": 0.2693993151187897, "learning_rate": 1.862288596448392e-05, "loss": 0.1589, "num_tokens": 636527522.0, "step": 4869 }, { "epoch": 1.9433359936153232, "grad_norm": 0.3054552376270294, "learning_rate": 1.861379248571508e-05, "loss": 0.1686, "num_tokens": 636658594.0, "step": 4870 }, { "epoch": 1.9437350359138068, "grad_norm": 0.25360339879989624, "learning_rate": 1.860470072634456e-05, "loss": 0.128, "num_tokens": 636789666.0, "step": 4871 }, { "epoch": 1.9441340782122905, "grad_norm": 0.25576546788215637, "learning_rate": 1.859561068813153e-05, "loss": 0.138, "num_tokens": 636920738.0, "step": 4872 }, { "epoch": 1.944533120510774, "grad_norm": 0.28363165259361267, "learning_rate": 1.8586522372834826e-05, "loss": 0.1531, "num_tokens": 637051810.0, "step": 4873 }, { "epoch": 1.9449321628092577, "grad_norm": 0.2684517502784729, "learning_rate": 1.857743578221295e-05, "loss": 0.1507, "num_tokens": 637182882.0, "step": 4874 }, { "epoch": 1.9453312051077414, "grad_norm": 0.2785923480987549, "learning_rate": 1.856835091802408e-05, "loss": 0.1632, "num_tokens": 637313954.0, "step": 4875 }, { "epoch": 1.945730247406225, "grad_norm": 0.26127105951309204, "learning_rate": 1.8559267782026042e-05, "loss": 0.1428, "num_tokens": 637445026.0, "step": 4876 }, { "epoch": 1.9461292897047087, "grad_norm": 0.31553399562835693, "learning_rate": 1.8550186375976342e-05, "loss": 0.1771, "num_tokens": 637576098.0, "step": 4877 }, { "epoch": 1.9465283320031923, "grad_norm": 0.30795571208000183, "learning_rate": 1.8541106701632153e-05, "loss": 0.1668, "num_tokens": 637707170.0, "step": 4878 }, { "epoch": 1.946927374301676, "grad_norm": 0.2508847713470459, "learning_rate": 1.8532028760750297e-05, "loss": 0.1162, "num_tokens": 637838242.0, "step": 4879 }, { "epoch": 1.9473264166001596, "grad_norm": 0.29632997512817383, "learning_rate": 1.8522952555087298e-05, "loss": 0.1653, "num_tokens": 637969314.0, "step": 4880 }, { "epoch": 1.9477254588986432, "grad_norm": 0.28033480048179626, "learning_rate": 1.8513878086399277e-05, "loss": 0.1339, "num_tokens": 638100386.0, "step": 4881 }, { "epoch": 1.9481245011971269, "grad_norm": 0.323214054107666, "learning_rate": 1.8504805356442086e-05, "loss": 0.1571, "num_tokens": 638231458.0, "step": 4882 }, { "epoch": 1.9485235434956105, "grad_norm": 0.29960140585899353, "learning_rate": 1.8495734366971206e-05, "loss": 0.1637, "num_tokens": 638362530.0, "step": 4883 }, { "epoch": 1.9489225857940942, "grad_norm": 0.3090386688709259, "learning_rate": 1.8486665119741786e-05, "loss": 0.146, "num_tokens": 638493602.0, "step": 4884 }, { "epoch": 1.9493216280925778, "grad_norm": 0.28964415192604065, "learning_rate": 1.8477597616508647e-05, "loss": 0.1517, "num_tokens": 638624674.0, "step": 4885 }, { "epoch": 1.9497206703910615, "grad_norm": 0.2869834899902344, "learning_rate": 1.8468531859026262e-05, "loss": 0.1452, "num_tokens": 638742766.0, "step": 4886 }, { "epoch": 1.950119712689545, "grad_norm": 0.3294309079647064, "learning_rate": 1.845946784904877e-05, "loss": 0.1828, "num_tokens": 638873838.0, "step": 4887 }, { "epoch": 1.9505187549880287, "grad_norm": 0.238773375749588, "learning_rate": 1.845040558832998e-05, "loss": 0.1301, "num_tokens": 639004910.0, "step": 4888 }, { "epoch": 1.9509177972865124, "grad_norm": 0.2875337302684784, "learning_rate": 1.8441345078623345e-05, "loss": 0.16, "num_tokens": 639135982.0, "step": 4889 }, { "epoch": 1.951316839584996, "grad_norm": 0.2802906334400177, "learning_rate": 1.843228632168199e-05, "loss": 0.1497, "num_tokens": 639267054.0, "step": 4890 }, { "epoch": 1.9517158818834797, "grad_norm": 0.25751033425331116, "learning_rate": 1.8423229319258718e-05, "loss": 0.1338, "num_tokens": 639398126.0, "step": 4891 }, { "epoch": 1.9521149241819633, "grad_norm": 0.2887055575847626, "learning_rate": 1.8414174073105943e-05, "loss": 0.1549, "num_tokens": 639513530.0, "step": 4892 }, { "epoch": 1.952513966480447, "grad_norm": 0.26192158460617065, "learning_rate": 1.84051205849758e-05, "loss": 0.1269, "num_tokens": 639644602.0, "step": 4893 }, { "epoch": 1.9529130087789306, "grad_norm": 0.31711575388908386, "learning_rate": 1.8396068856620036e-05, "loss": 0.1775, "num_tokens": 639775674.0, "step": 4894 }, { "epoch": 1.9533120510774142, "grad_norm": 0.2758648097515106, "learning_rate": 1.8387018889790087e-05, "loss": 0.1496, "num_tokens": 639906746.0, "step": 4895 }, { "epoch": 1.9537110933758979, "grad_norm": 0.25977712869644165, "learning_rate": 1.8377970686237028e-05, "loss": 0.1296, "num_tokens": 640037818.0, "step": 4896 }, { "epoch": 1.9541101356743815, "grad_norm": 0.268991082906723, "learning_rate": 1.8368924247711606e-05, "loss": 0.1449, "num_tokens": 640168890.0, "step": 4897 }, { "epoch": 1.9545091779728652, "grad_norm": 0.3363057076931, "learning_rate": 1.8359879575964223e-05, "loss": 0.1824, "num_tokens": 640288214.0, "step": 4898 }, { "epoch": 1.9549082202713488, "grad_norm": 0.27532482147216797, "learning_rate": 1.8350836672744947e-05, "loss": 0.1509, "num_tokens": 640419286.0, "step": 4899 }, { "epoch": 1.9553072625698324, "grad_norm": 0.2710345983505249, "learning_rate": 1.834179553980348e-05, "loss": 0.1413, "num_tokens": 640550358.0, "step": 4900 }, { "epoch": 1.955706304868316, "grad_norm": 0.34661462903022766, "learning_rate": 1.8332756178889206e-05, "loss": 0.1908, "num_tokens": 640681430.0, "step": 4901 }, { "epoch": 1.9561053471667997, "grad_norm": 0.28478145599365234, "learning_rate": 1.8323718591751167e-05, "loss": 0.1583, "num_tokens": 640812502.0, "step": 4902 }, { "epoch": 1.9565043894652834, "grad_norm": 0.2809225618839264, "learning_rate": 1.831468278013802e-05, "loss": 0.1472, "num_tokens": 640943574.0, "step": 4903 }, { "epoch": 1.956903431763767, "grad_norm": 0.23539406061172485, "learning_rate": 1.8305648745798142e-05, "loss": 0.1138, "num_tokens": 641074646.0, "step": 4904 }, { "epoch": 1.9573024740622507, "grad_norm": 0.2715570628643036, "learning_rate": 1.8296616490479525e-05, "loss": 0.1374, "num_tokens": 641205718.0, "step": 4905 }, { "epoch": 1.9577015163607343, "grad_norm": 0.29766395688056946, "learning_rate": 1.8287586015929812e-05, "loss": 0.1751, "num_tokens": 641336790.0, "step": 4906 }, { "epoch": 1.958100558659218, "grad_norm": 0.27352556586265564, "learning_rate": 1.827855732389634e-05, "loss": 0.1489, "num_tokens": 641467862.0, "step": 4907 }, { "epoch": 1.9584996009577016, "grad_norm": 0.26888683438301086, "learning_rate": 1.8269530416126064e-05, "loss": 0.1496, "num_tokens": 641598934.0, "step": 4908 }, { "epoch": 1.9588986432561852, "grad_norm": 0.2891605794429779, "learning_rate": 1.8260505294365606e-05, "loss": 0.1635, "num_tokens": 641730006.0, "step": 4909 }, { "epoch": 1.9592976855546689, "grad_norm": 0.2524432837963104, "learning_rate": 1.8251481960361237e-05, "loss": 0.1215, "num_tokens": 641861078.0, "step": 4910 }, { "epoch": 1.9596967278531525, "grad_norm": 0.2813820540904999, "learning_rate": 1.8242460415858904e-05, "loss": 0.1618, "num_tokens": 641992150.0, "step": 4911 }, { "epoch": 1.9600957701516362, "grad_norm": 0.26300784945487976, "learning_rate": 1.8233440662604174e-05, "loss": 0.1274, "num_tokens": 642123222.0, "step": 4912 }, { "epoch": 1.9604948124501198, "grad_norm": 0.28320249915122986, "learning_rate": 1.8224422702342296e-05, "loss": 0.1617, "num_tokens": 642254294.0, "step": 4913 }, { "epoch": 1.9608938547486034, "grad_norm": 0.2902042865753174, "learning_rate": 1.821540653681817e-05, "loss": 0.1536, "num_tokens": 642385366.0, "step": 4914 }, { "epoch": 1.961292897047087, "grad_norm": 0.30216437578201294, "learning_rate": 1.820639216777632e-05, "loss": 0.1508, "num_tokens": 642516438.0, "step": 4915 }, { "epoch": 1.9616919393455707, "grad_norm": 0.2730481028556824, "learning_rate": 1.8197379596960956e-05, "loss": 0.1359, "num_tokens": 642647510.0, "step": 4916 }, { "epoch": 1.9620909816440544, "grad_norm": 0.2856051027774811, "learning_rate": 1.818836882611592e-05, "loss": 0.1677, "num_tokens": 642778582.0, "step": 4917 }, { "epoch": 1.962490023942538, "grad_norm": 0.30184149742126465, "learning_rate": 1.8179359856984712e-05, "loss": 0.1546, "num_tokens": 642909654.0, "step": 4918 }, { "epoch": 1.9628890662410217, "grad_norm": 0.2580302059650421, "learning_rate": 1.8170352691310487e-05, "loss": 0.1314, "num_tokens": 643040726.0, "step": 4919 }, { "epoch": 1.9632881085395053, "grad_norm": 0.2706855833530426, "learning_rate": 1.8161347330836047e-05, "loss": 0.1455, "num_tokens": 643171798.0, "step": 4920 }, { "epoch": 1.963687150837989, "grad_norm": 0.2802872955799103, "learning_rate": 1.8152343777303843e-05, "loss": 0.1641, "num_tokens": 643302870.0, "step": 4921 }, { "epoch": 1.9640861931364726, "grad_norm": 0.2779759168624878, "learning_rate": 1.814334203245598e-05, "loss": 0.1592, "num_tokens": 643433942.0, "step": 4922 }, { "epoch": 1.9644852354349562, "grad_norm": 0.2956942021846771, "learning_rate": 1.8134342098034206e-05, "loss": 0.1506, "num_tokens": 643565014.0, "step": 4923 }, { "epoch": 1.9648842777334399, "grad_norm": 0.3013925552368164, "learning_rate": 1.8125343975779944e-05, "loss": 0.1732, "num_tokens": 643696086.0, "step": 4924 }, { "epoch": 1.9652833200319235, "grad_norm": 0.27629730105400085, "learning_rate": 1.8116347667434237e-05, "loss": 0.1486, "num_tokens": 643827158.0, "step": 4925 }, { "epoch": 1.9656823623304072, "grad_norm": 0.2742159962654114, "learning_rate": 1.8107353174737768e-05, "loss": 0.1467, "num_tokens": 643958230.0, "step": 4926 }, { "epoch": 1.9660814046288908, "grad_norm": 0.26441314816474915, "learning_rate": 1.8098360499430904e-05, "loss": 0.144, "num_tokens": 644089302.0, "step": 4927 }, { "epoch": 1.9664804469273744, "grad_norm": 0.281169593334198, "learning_rate": 1.808936964325365e-05, "loss": 0.169, "num_tokens": 644220374.0, "step": 4928 }, { "epoch": 1.966879489225858, "grad_norm": 0.24173204600811005, "learning_rate": 1.8080380607945642e-05, "loss": 0.1027, "num_tokens": 644351446.0, "step": 4929 }, { "epoch": 1.9672785315243417, "grad_norm": 0.25832462310791016, "learning_rate": 1.807139339524617e-05, "loss": 0.122, "num_tokens": 644482518.0, "step": 4930 }, { "epoch": 1.9676775738228254, "grad_norm": 0.3270050883293152, "learning_rate": 1.8062408006894198e-05, "loss": 0.1773, "num_tokens": 644613590.0, "step": 4931 }, { "epoch": 1.968076616121309, "grad_norm": 0.3115576505661011, "learning_rate": 1.8053424444628293e-05, "loss": 0.1496, "num_tokens": 644744662.0, "step": 4932 }, { "epoch": 1.9684756584197927, "grad_norm": 0.29445961117744446, "learning_rate": 1.804444271018669e-05, "loss": 0.1432, "num_tokens": 644875734.0, "step": 4933 }, { "epoch": 1.968874700718276, "grad_norm": 0.31683221459388733, "learning_rate": 1.8035462805307298e-05, "loss": 0.1429, "num_tokens": 645006806.0, "step": 4934 }, { "epoch": 1.9692737430167597, "grad_norm": 0.2645573616027832, "learning_rate": 1.802648473172761e-05, "loss": 0.1329, "num_tokens": 645137878.0, "step": 4935 }, { "epoch": 1.9696727853152434, "grad_norm": 0.27395427227020264, "learning_rate": 1.8017508491184825e-05, "loss": 0.1529, "num_tokens": 645268950.0, "step": 4936 }, { "epoch": 1.970071827613727, "grad_norm": 0.30080580711364746, "learning_rate": 1.8008534085415756e-05, "loss": 0.1848, "num_tokens": 645400022.0, "step": 4937 }, { "epoch": 1.9704708699122107, "grad_norm": 0.27929970622062683, "learning_rate": 1.799956151615685e-05, "loss": 0.1462, "num_tokens": 645531094.0, "step": 4938 }, { "epoch": 1.9708699122106943, "grad_norm": 0.2493930160999298, "learning_rate": 1.7990590785144234e-05, "loss": 0.1259, "num_tokens": 645662166.0, "step": 4939 }, { "epoch": 1.971268954509178, "grad_norm": 0.2599642872810364, "learning_rate": 1.798162189411366e-05, "loss": 0.1373, "num_tokens": 645793238.0, "step": 4940 }, { "epoch": 1.9716679968076616, "grad_norm": 0.27624547481536865, "learning_rate": 1.7972654844800513e-05, "loss": 0.1669, "num_tokens": 645924310.0, "step": 4941 }, { "epoch": 1.9720670391061452, "grad_norm": 0.26243528723716736, "learning_rate": 1.796368963893984e-05, "loss": 0.1268, "num_tokens": 646055382.0, "step": 4942 }, { "epoch": 1.9724660814046289, "grad_norm": 0.30114784836769104, "learning_rate": 1.7954726278266333e-05, "loss": 0.1626, "num_tokens": 646186454.0, "step": 4943 }, { "epoch": 1.9728651237031125, "grad_norm": 0.24754011631011963, "learning_rate": 1.7945764764514305e-05, "loss": 0.1322, "num_tokens": 646317526.0, "step": 4944 }, { "epoch": 1.9732641660015962, "grad_norm": 0.2672450542449951, "learning_rate": 1.7936805099417732e-05, "loss": 0.1433, "num_tokens": 646448598.0, "step": 4945 }, { "epoch": 1.9736632083000798, "grad_norm": 0.27068495750427246, "learning_rate": 1.792784728471023e-05, "loss": 0.133, "num_tokens": 646579670.0, "step": 4946 }, { "epoch": 1.9740622505985634, "grad_norm": 0.27539777755737305, "learning_rate": 1.7918891322125042e-05, "loss": 0.1391, "num_tokens": 646710742.0, "step": 4947 }, { "epoch": 1.974461292897047, "grad_norm": 0.23902243375778198, "learning_rate": 1.7909937213395078e-05, "loss": 0.1138, "num_tokens": 646841814.0, "step": 4948 }, { "epoch": 1.9748603351955307, "grad_norm": 0.3058764636516571, "learning_rate": 1.790098496025286e-05, "loss": 0.1621, "num_tokens": 646972886.0, "step": 4949 }, { "epoch": 1.9752593774940144, "grad_norm": 0.29743701219558716, "learning_rate": 1.7892034564430576e-05, "loss": 0.1729, "num_tokens": 647103958.0, "step": 4950 }, { "epoch": 1.975658419792498, "grad_norm": 0.3087639808654785, "learning_rate": 1.788308602766003e-05, "loss": 0.1812, "num_tokens": 647235030.0, "step": 4951 }, { "epoch": 1.9760574620909817, "grad_norm": 0.35488155484199524, "learning_rate": 1.7874139351672703e-05, "loss": 0.1593, "num_tokens": 647366102.0, "step": 4952 }, { "epoch": 1.9764565043894653, "grad_norm": 0.2526816129684448, "learning_rate": 1.7865194538199666e-05, "loss": 0.1219, "num_tokens": 647497174.0, "step": 4953 }, { "epoch": 1.976855546687949, "grad_norm": 0.2821393311023712, "learning_rate": 1.7856251588971674e-05, "loss": 0.1604, "num_tokens": 647628246.0, "step": 4954 }, { "epoch": 1.9772545889864326, "grad_norm": 0.32125136256217957, "learning_rate": 1.7847310505719112e-05, "loss": 0.1752, "num_tokens": 647759318.0, "step": 4955 }, { "epoch": 1.9776536312849162, "grad_norm": 0.24517779052257538, "learning_rate": 1.7838371290171972e-05, "loss": 0.111, "num_tokens": 647890390.0, "step": 4956 }, { "epoch": 1.9780526735833999, "grad_norm": 0.2454320788383484, "learning_rate": 1.782943394405993e-05, "loss": 0.1182, "num_tokens": 648021462.0, "step": 4957 }, { "epoch": 1.9784517158818835, "grad_norm": 0.28143125772476196, "learning_rate": 1.782049846911228e-05, "loss": 0.1379, "num_tokens": 648148757.0, "step": 4958 }, { "epoch": 1.9788507581803672, "grad_norm": 0.30146023631095886, "learning_rate": 1.781156486705794e-05, "loss": 0.1413, "num_tokens": 648279829.0, "step": 4959 }, { "epoch": 1.9792498004788508, "grad_norm": 0.27881672978401184, "learning_rate": 1.7802633139625475e-05, "loss": 0.1286, "num_tokens": 648410901.0, "step": 4960 }, { "epoch": 1.9796488427773344, "grad_norm": 0.28061163425445557, "learning_rate": 1.7793703288543102e-05, "loss": 0.1584, "num_tokens": 648541973.0, "step": 4961 }, { "epoch": 1.980047885075818, "grad_norm": 0.28344154357910156, "learning_rate": 1.778477531553867e-05, "loss": 0.1373, "num_tokens": 648673045.0, "step": 4962 }, { "epoch": 1.9804469273743017, "grad_norm": 0.27311038970947266, "learning_rate": 1.7775849222339648e-05, "loss": 0.1582, "num_tokens": 648804117.0, "step": 4963 }, { "epoch": 1.9808459696727854, "grad_norm": 0.2590904235839844, "learning_rate": 1.7766925010673146e-05, "loss": 0.1339, "num_tokens": 648935189.0, "step": 4964 }, { "epoch": 1.981245011971269, "grad_norm": 0.2598787844181061, "learning_rate": 1.7758002682265934e-05, "loss": 0.1465, "num_tokens": 649066261.0, "step": 4965 }, { "epoch": 1.9816440542697524, "grad_norm": 0.29292890429496765, "learning_rate": 1.774908223884438e-05, "loss": 0.1473, "num_tokens": 649197333.0, "step": 4966 }, { "epoch": 1.982043096568236, "grad_norm": 0.26417481899261475, "learning_rate": 1.774016368213452e-05, "loss": 0.1183, "num_tokens": 649328405.0, "step": 4967 }, { "epoch": 1.9824421388667197, "grad_norm": 0.28850483894348145, "learning_rate": 1.7731247013862012e-05, "loss": 0.1442, "num_tokens": 649459477.0, "step": 4968 }, { "epoch": 1.9828411811652034, "grad_norm": 0.3080179989337921, "learning_rate": 1.7722332235752138e-05, "loss": 0.1696, "num_tokens": 649590549.0, "step": 4969 }, { "epoch": 1.983240223463687, "grad_norm": 0.3034377694129944, "learning_rate": 1.7713419349529837e-05, "loss": 0.1456, "num_tokens": 649721621.0, "step": 4970 }, { "epoch": 1.9836392657621706, "grad_norm": 0.2788216173648834, "learning_rate": 1.7704508356919658e-05, "loss": 0.1542, "num_tokens": 649852693.0, "step": 4971 }, { "epoch": 1.9840383080606543, "grad_norm": 0.26000386476516724, "learning_rate": 1.7695599259645794e-05, "loss": 0.145, "num_tokens": 649983765.0, "step": 4972 }, { "epoch": 1.984437350359138, "grad_norm": 0.25566476583480835, "learning_rate": 1.768669205943208e-05, "loss": 0.1313, "num_tokens": 650114837.0, "step": 4973 }, { "epoch": 1.9848363926576216, "grad_norm": 0.3152273893356323, "learning_rate": 1.7677786758001985e-05, "loss": 0.1541, "num_tokens": 650245909.0, "step": 4974 }, { "epoch": 1.9852354349561052, "grad_norm": 0.2990151643753052, "learning_rate": 1.766888335707858e-05, "loss": 0.179, "num_tokens": 650376981.0, "step": 4975 }, { "epoch": 1.9856344772545889, "grad_norm": 0.28668922185897827, "learning_rate": 1.76599818583846e-05, "loss": 0.1649, "num_tokens": 650502568.0, "step": 4976 }, { "epoch": 1.9860335195530725, "grad_norm": 0.29657527804374695, "learning_rate": 1.765108226364241e-05, "loss": 0.1692, "num_tokens": 650633640.0, "step": 4977 }, { "epoch": 1.9864325618515561, "grad_norm": 0.2965400815010071, "learning_rate": 1.7642184574573983e-05, "loss": 0.1543, "num_tokens": 650764712.0, "step": 4978 }, { "epoch": 1.9868316041500398, "grad_norm": 0.28708916902542114, "learning_rate": 1.7633288792900955e-05, "loss": 0.1683, "num_tokens": 650895784.0, "step": 4979 }, { "epoch": 1.9872306464485234, "grad_norm": 0.31523454189300537, "learning_rate": 1.7624394920344567e-05, "loss": 0.1612, "num_tokens": 651026856.0, "step": 4980 }, { "epoch": 1.987629688747007, "grad_norm": 0.2567962110042572, "learning_rate": 1.7615502958625702e-05, "loss": 0.1124, "num_tokens": 651157928.0, "step": 4981 }, { "epoch": 1.9880287310454907, "grad_norm": 0.25202611088752747, "learning_rate": 1.7606612909464877e-05, "loss": 0.1378, "num_tokens": 651289000.0, "step": 4982 }, { "epoch": 1.9884277733439744, "grad_norm": 0.2560426890850067, "learning_rate": 1.759772477458222e-05, "loss": 0.1345, "num_tokens": 651420072.0, "step": 4983 }, { "epoch": 1.988826815642458, "grad_norm": 0.2478024661540985, "learning_rate": 1.7588838555697524e-05, "loss": 0.1134, "num_tokens": 651551144.0, "step": 4984 }, { "epoch": 1.9892258579409416, "grad_norm": 0.33434781432151794, "learning_rate": 1.7579954254530164e-05, "loss": 0.1814, "num_tokens": 651682216.0, "step": 4985 }, { "epoch": 1.9896249002394253, "grad_norm": 0.27538228034973145, "learning_rate": 1.7571071872799184e-05, "loss": 0.1451, "num_tokens": 651813288.0, "step": 4986 }, { "epoch": 1.990023942537909, "grad_norm": 0.2727046310901642, "learning_rate": 1.756219141222325e-05, "loss": 0.1298, "num_tokens": 651944360.0, "step": 4987 }, { "epoch": 1.9904229848363926, "grad_norm": 0.2832128703594208, "learning_rate": 1.7553312874520626e-05, "loss": 0.1531, "num_tokens": 652075432.0, "step": 4988 }, { "epoch": 1.9908220271348762, "grad_norm": 0.2500453591346741, "learning_rate": 1.754443626140924e-05, "loss": 0.1209, "num_tokens": 652206504.0, "step": 4989 }, { "epoch": 1.9912210694333599, "grad_norm": 0.2677362263202667, "learning_rate": 1.7535561574606645e-05, "loss": 0.138, "num_tokens": 652337576.0, "step": 4990 }, { "epoch": 1.9916201117318435, "grad_norm": 0.29476985335350037, "learning_rate": 1.7526688815829988e-05, "loss": 0.1579, "num_tokens": 652468648.0, "step": 4991 }, { "epoch": 1.9920191540303271, "grad_norm": 0.26009368896484375, "learning_rate": 1.7517817986796077e-05, "loss": 0.1248, "num_tokens": 652599720.0, "step": 4992 }, { "epoch": 1.9924181963288108, "grad_norm": 0.2550017237663269, "learning_rate": 1.7508949089221343e-05, "loss": 0.1317, "num_tokens": 652730792.0, "step": 4993 }, { "epoch": 1.9928172386272944, "grad_norm": 0.2539675235748291, "learning_rate": 1.7500082124821814e-05, "loss": 0.1211, "num_tokens": 652861864.0, "step": 4994 }, { "epoch": 1.993216280925778, "grad_norm": 0.27416643500328064, "learning_rate": 1.749121709531317e-05, "loss": 0.139, "num_tokens": 652992936.0, "step": 4995 }, { "epoch": 1.9936153232242617, "grad_norm": 0.2605080008506775, "learning_rate": 1.7482354002410735e-05, "loss": 0.128, "num_tokens": 653124008.0, "step": 4996 }, { "epoch": 1.9940143655227454, "grad_norm": 0.302816241979599, "learning_rate": 1.7473492847829405e-05, "loss": 0.1527, "num_tokens": 653255080.0, "step": 4997 }, { "epoch": 1.994413407821229, "grad_norm": 0.23716455698013306, "learning_rate": 1.7464633633283738e-05, "loss": 0.1088, "num_tokens": 653386152.0, "step": 4998 }, { "epoch": 1.9948124501197126, "grad_norm": 0.311217337846756, "learning_rate": 1.745577636048793e-05, "loss": 0.1453, "num_tokens": 653517224.0, "step": 4999 }, { "epoch": 1.9952114924181963, "grad_norm": 0.3308781385421753, "learning_rate": 1.7446921031155754e-05, "loss": 0.177, "num_tokens": 653648296.0, "step": 5000 }, { "epoch": 1.99561053471668, "grad_norm": 0.30452874302864075, "learning_rate": 1.7438067647000643e-05, "loss": 0.1612, "num_tokens": 653779368.0, "step": 5001 }, { "epoch": 1.9960095770151636, "grad_norm": 0.26780828833580017, "learning_rate": 1.7429216209735654e-05, "loss": 0.1471, "num_tokens": 653910440.0, "step": 5002 }, { "epoch": 1.9964086193136472, "grad_norm": 0.2851787507534027, "learning_rate": 1.7420366721073443e-05, "loss": 0.1342, "num_tokens": 654041512.0, "step": 5003 }, { "epoch": 1.9968076616121309, "grad_norm": 0.2818174958229065, "learning_rate": 1.741151918272632e-05, "loss": 0.1279, "num_tokens": 654172584.0, "step": 5004 }, { "epoch": 1.9972067039106145, "grad_norm": 0.2383393496274948, "learning_rate": 1.7402673596406183e-05, "loss": 0.1114, "num_tokens": 654303656.0, "step": 5005 }, { "epoch": 1.9976057462090981, "grad_norm": 0.30172857642173767, "learning_rate": 1.7393829963824586e-05, "loss": 0.1542, "num_tokens": 654434728.0, "step": 5006 }, { "epoch": 1.9980047885075818, "grad_norm": 0.30749160051345825, "learning_rate": 1.7384988286692678e-05, "loss": 0.1727, "num_tokens": 654565800.0, "step": 5007 }, { "epoch": 1.9984038308060654, "grad_norm": 0.2897140681743622, "learning_rate": 1.737614856672124e-05, "loss": 0.1488, "num_tokens": 654696872.0, "step": 5008 }, { "epoch": 1.998802873104549, "grad_norm": 0.27981966733932495, "learning_rate": 1.736731080562069e-05, "loss": 0.1343, "num_tokens": 654827944.0, "step": 5009 }, { "epoch": 1.9992019154030327, "grad_norm": 0.27037981152534485, "learning_rate": 1.7358475005101037e-05, "loss": 0.1453, "num_tokens": 654959016.0, "step": 5010 }, { "epoch": 1.9996009577015164, "grad_norm": 0.2309848964214325, "learning_rate": 1.7349641166871934e-05, "loss": 0.1064, "num_tokens": 655084064.0, "step": 5011 }, { "epoch": 2.0, "grad_norm": 0.44606828689575195, "learning_rate": 1.734080929264265e-05, "loss": 0.2073, "num_tokens": 655149600.0, "step": 5012 }, { "epoch": 2.0003990422984836, "grad_norm": 0.25026118755340576, "learning_rate": 1.733197938412206e-05, "loss": 0.1063, "num_tokens": 655280672.0, "step": 5013 }, { "epoch": 2.0007980845969673, "grad_norm": 0.2576446831226349, "learning_rate": 1.7323151443018675e-05, "loss": 0.1129, "num_tokens": 655411744.0, "step": 5014 }, { "epoch": 2.001197126895451, "grad_norm": 0.229672372341156, "learning_rate": 1.7314325471040626e-05, "loss": 0.1015, "num_tokens": 655542816.0, "step": 5015 }, { "epoch": 2.0015961691939346, "grad_norm": 0.2378011792898178, "learning_rate": 1.7305501469895642e-05, "loss": 0.1025, "num_tokens": 655673888.0, "step": 5016 }, { "epoch": 2.001995211492418, "grad_norm": 0.2661912441253662, "learning_rate": 1.7296679441291083e-05, "loss": 0.0989, "num_tokens": 655804960.0, "step": 5017 }, { "epoch": 2.002394253790902, "grad_norm": 0.28996923565864563, "learning_rate": 1.7287859386933953e-05, "loss": 0.0992, "num_tokens": 655936032.0, "step": 5018 }, { "epoch": 2.0027932960893855, "grad_norm": 0.26787781715393066, "learning_rate": 1.7279041308530823e-05, "loss": 0.0889, "num_tokens": 656067104.0, "step": 5019 }, { "epoch": 2.003192338387869, "grad_norm": 0.2624953091144562, "learning_rate": 1.7270225207787927e-05, "loss": 0.0937, "num_tokens": 656198176.0, "step": 5020 }, { "epoch": 2.003591380686353, "grad_norm": 0.28011998534202576, "learning_rate": 1.7261411086411097e-05, "loss": 0.0998, "num_tokens": 656329248.0, "step": 5021 }, { "epoch": 2.0039904229848364, "grad_norm": 0.3201409876346588, "learning_rate": 1.725259894610577e-05, "loss": 0.0976, "num_tokens": 656460320.0, "step": 5022 }, { "epoch": 2.00438946528332, "grad_norm": 0.31391045451164246, "learning_rate": 1.724378878857702e-05, "loss": 0.1075, "num_tokens": 656591392.0, "step": 5023 }, { "epoch": 2.0047885075818037, "grad_norm": 0.20403356850147247, "learning_rate": 1.7234980615529544e-05, "loss": 0.0727, "num_tokens": 656722464.0, "step": 5024 }, { "epoch": 2.0051875498802874, "grad_norm": 0.24993924796581268, "learning_rate": 1.722617442866762e-05, "loss": 0.0824, "num_tokens": 656853536.0, "step": 5025 }, { "epoch": 2.005586592178771, "grad_norm": 0.25799983739852905, "learning_rate": 1.7217370229695173e-05, "loss": 0.0929, "num_tokens": 656984608.0, "step": 5026 }, { "epoch": 2.0059856344772546, "grad_norm": 0.2527852952480316, "learning_rate": 1.7208568020315742e-05, "loss": 0.0997, "num_tokens": 657111345.0, "step": 5027 }, { "epoch": 2.0063846767757383, "grad_norm": 0.2590341567993164, "learning_rate": 1.7199767802232464e-05, "loss": 0.0947, "num_tokens": 657242417.0, "step": 5028 }, { "epoch": 2.006783719074222, "grad_norm": 0.27879762649536133, "learning_rate": 1.7190969577148088e-05, "loss": 0.1109, "num_tokens": 657373489.0, "step": 5029 }, { "epoch": 2.0071827613727056, "grad_norm": 0.21099498867988586, "learning_rate": 1.718217334676501e-05, "loss": 0.0776, "num_tokens": 657504561.0, "step": 5030 }, { "epoch": 2.007581803671189, "grad_norm": 0.24321971833705902, "learning_rate": 1.7173379112785198e-05, "loss": 0.0988, "num_tokens": 657635633.0, "step": 5031 }, { "epoch": 2.007980845969673, "grad_norm": 0.2480831742286682, "learning_rate": 1.7164586876910266e-05, "loss": 0.0965, "num_tokens": 657766705.0, "step": 5032 }, { "epoch": 2.0083798882681565, "grad_norm": 0.30429279804229736, "learning_rate": 1.7155796640841433e-05, "loss": 0.1229, "num_tokens": 657897777.0, "step": 5033 }, { "epoch": 2.00877893056664, "grad_norm": 0.2579600214958191, "learning_rate": 1.7147008406279518e-05, "loss": 0.1056, "num_tokens": 658028849.0, "step": 5034 }, { "epoch": 2.009177972865124, "grad_norm": 0.259514182806015, "learning_rate": 1.7138222174924967e-05, "loss": 0.1046, "num_tokens": 658159921.0, "step": 5035 }, { "epoch": 2.0095770151636074, "grad_norm": 0.244869202375412, "learning_rate": 1.7129437948477838e-05, "loss": 0.0832, "num_tokens": 658290993.0, "step": 5036 }, { "epoch": 2.009976057462091, "grad_norm": 0.23286627233028412, "learning_rate": 1.7120655728637786e-05, "loss": 0.094, "num_tokens": 658422065.0, "step": 5037 }, { "epoch": 2.0103750997605747, "grad_norm": 0.2768542170524597, "learning_rate": 1.7111875517104107e-05, "loss": 0.0987, "num_tokens": 658553137.0, "step": 5038 }, { "epoch": 2.0107741420590584, "grad_norm": 0.26099973917007446, "learning_rate": 1.7103097315575668e-05, "loss": 0.1017, "num_tokens": 658684209.0, "step": 5039 }, { "epoch": 2.011173184357542, "grad_norm": 0.24997834861278534, "learning_rate": 1.7094321125750983e-05, "loss": 0.1053, "num_tokens": 658815281.0, "step": 5040 }, { "epoch": 2.0115722266560256, "grad_norm": 0.3270493149757385, "learning_rate": 1.708554694932816e-05, "loss": 0.1079, "num_tokens": 658940673.0, "step": 5041 }, { "epoch": 2.0119712689545093, "grad_norm": 0.22230447828769684, "learning_rate": 1.7076774788004916e-05, "loss": 0.0783, "num_tokens": 659066260.0, "step": 5042 }, { "epoch": 2.012370311252993, "grad_norm": 0.3263758420944214, "learning_rate": 1.7068004643478598e-05, "loss": 0.0949, "num_tokens": 659197332.0, "step": 5043 }, { "epoch": 2.0127693535514766, "grad_norm": 0.2588503062725067, "learning_rate": 1.705923651744612e-05, "loss": 0.1034, "num_tokens": 659328404.0, "step": 5044 }, { "epoch": 2.01316839584996, "grad_norm": 0.2635151147842407, "learning_rate": 1.7050470411604053e-05, "loss": 0.1119, "num_tokens": 659459476.0, "step": 5045 }, { "epoch": 2.013567438148444, "grad_norm": 0.22538325190544128, "learning_rate": 1.7041706327648563e-05, "loss": 0.0928, "num_tokens": 659590548.0, "step": 5046 }, { "epoch": 2.0139664804469275, "grad_norm": 0.27179527282714844, "learning_rate": 1.7032944267275393e-05, "loss": 0.1048, "num_tokens": 659721620.0, "step": 5047 }, { "epoch": 2.014365522745411, "grad_norm": 0.2595054805278778, "learning_rate": 1.702418423217994e-05, "loss": 0.1059, "num_tokens": 659852692.0, "step": 5048 }, { "epoch": 2.014764565043895, "grad_norm": 0.24089588224887848, "learning_rate": 1.70154262240572e-05, "loss": 0.1003, "num_tokens": 659983764.0, "step": 5049 }, { "epoch": 2.0151636073423784, "grad_norm": 0.2574286460876465, "learning_rate": 1.700667024460173e-05, "loss": 0.105, "num_tokens": 660114836.0, "step": 5050 }, { "epoch": 2.015562649640862, "grad_norm": 0.245759055018425, "learning_rate": 1.6997916295507755e-05, "loss": 0.0989, "num_tokens": 660245908.0, "step": 5051 }, { "epoch": 2.0159616919393457, "grad_norm": 0.2703142464160919, "learning_rate": 1.6989164378469078e-05, "loss": 0.0996, "num_tokens": 660374716.0, "step": 5052 }, { "epoch": 2.0163607342378294, "grad_norm": 0.25876083970069885, "learning_rate": 1.6980414495179113e-05, "loss": 0.1049, "num_tokens": 660505788.0, "step": 5053 }, { "epoch": 2.016759776536313, "grad_norm": 0.25149548053741455, "learning_rate": 1.6971666647330875e-05, "loss": 0.0878, "num_tokens": 660636860.0, "step": 5054 }, { "epoch": 2.0171588188347966, "grad_norm": 0.26487794518470764, "learning_rate": 1.6962920836617008e-05, "loss": 0.0987, "num_tokens": 660767932.0, "step": 5055 }, { "epoch": 2.0175578611332803, "grad_norm": 0.2602387070655823, "learning_rate": 1.6954177064729724e-05, "loss": 0.0992, "num_tokens": 660899004.0, "step": 5056 }, { "epoch": 2.017956903431764, "grad_norm": 0.24634268879890442, "learning_rate": 1.694543533336087e-05, "loss": 0.0918, "num_tokens": 661030076.0, "step": 5057 }, { "epoch": 2.0183559457302476, "grad_norm": 0.2589421570301056, "learning_rate": 1.69366956442019e-05, "loss": 0.0914, "num_tokens": 661161148.0, "step": 5058 }, { "epoch": 2.018754988028731, "grad_norm": 0.2696773111820221, "learning_rate": 1.692795799894385e-05, "loss": 0.0854, "num_tokens": 661292220.0, "step": 5059 }, { "epoch": 2.019154030327215, "grad_norm": 0.23851417005062103, "learning_rate": 1.6919222399277368e-05, "loss": 0.0924, "num_tokens": 661423292.0, "step": 5060 }, { "epoch": 2.0195530726256985, "grad_norm": 0.2629394233226776, "learning_rate": 1.691048884689273e-05, "loss": 0.1086, "num_tokens": 661554364.0, "step": 5061 }, { "epoch": 2.019952114924182, "grad_norm": 0.28777167201042175, "learning_rate": 1.6901757343479792e-05, "loss": 0.0965, "num_tokens": 661685436.0, "step": 5062 }, { "epoch": 2.020351157222666, "grad_norm": 0.190824493765831, "learning_rate": 1.6893027890727995e-05, "loss": 0.0683, "num_tokens": 661816508.0, "step": 5063 }, { "epoch": 2.0207501995211494, "grad_norm": 0.26710206270217896, "learning_rate": 1.6884300490326428e-05, "loss": 0.1098, "num_tokens": 661947580.0, "step": 5064 }, { "epoch": 2.021149241819633, "grad_norm": 0.27418285608291626, "learning_rate": 1.6875575143963763e-05, "loss": 0.1134, "num_tokens": 662078652.0, "step": 5065 }, { "epoch": 2.0215482841181167, "grad_norm": 0.2293187975883484, "learning_rate": 1.6866851853328264e-05, "loss": 0.0799, "num_tokens": 662209724.0, "step": 5066 }, { "epoch": 2.0219473264166004, "grad_norm": 0.2504882514476776, "learning_rate": 1.685813062010781e-05, "loss": 0.1025, "num_tokens": 662340796.0, "step": 5067 }, { "epoch": 2.022346368715084, "grad_norm": 0.28384286165237427, "learning_rate": 1.684941144598989e-05, "loss": 0.1126, "num_tokens": 662471868.0, "step": 5068 }, { "epoch": 2.0227454110135676, "grad_norm": 0.2878536581993103, "learning_rate": 1.6840694332661554e-05, "loss": 0.1178, "num_tokens": 662602940.0, "step": 5069 }, { "epoch": 2.0231444533120513, "grad_norm": 0.2455444037914276, "learning_rate": 1.6831979281809508e-05, "loss": 0.096, "num_tokens": 662734012.0, "step": 5070 }, { "epoch": 2.023543495610535, "grad_norm": 0.29075708985328674, "learning_rate": 1.6823266295120027e-05, "loss": 0.105, "num_tokens": 662865084.0, "step": 5071 }, { "epoch": 2.0239425379090186, "grad_norm": 0.28989654779434204, "learning_rate": 1.6814555374278994e-05, "loss": 0.0979, "num_tokens": 662996156.0, "step": 5072 }, { "epoch": 2.024341580207502, "grad_norm": 0.26041385531425476, "learning_rate": 1.6805846520971883e-05, "loss": 0.1014, "num_tokens": 663127228.0, "step": 5073 }, { "epoch": 2.024740622505986, "grad_norm": 0.24265246093273163, "learning_rate": 1.679713973688379e-05, "loss": 0.0991, "num_tokens": 663254523.0, "step": 5074 }, { "epoch": 2.0251396648044695, "grad_norm": 0.24686846137046814, "learning_rate": 1.678843502369937e-05, "loss": 0.0893, "num_tokens": 663385595.0, "step": 5075 }, { "epoch": 2.0255387071029527, "grad_norm": 0.23240764439105988, "learning_rate": 1.6779732383102925e-05, "loss": 0.092, "num_tokens": 663516667.0, "step": 5076 }, { "epoch": 2.0259377494014363, "grad_norm": 0.23090136051177979, "learning_rate": 1.6771031816778337e-05, "loss": 0.0943, "num_tokens": 663647739.0, "step": 5077 }, { "epoch": 2.02633679169992, "grad_norm": 0.25947922468185425, "learning_rate": 1.6762333326409073e-05, "loss": 0.1093, "num_tokens": 663778811.0, "step": 5078 }, { "epoch": 2.0267358339984036, "grad_norm": 0.23431770503520966, "learning_rate": 1.6753636913678218e-05, "loss": 0.0798, "num_tokens": 663909883.0, "step": 5079 }, { "epoch": 2.0271348762968873, "grad_norm": 0.25368428230285645, "learning_rate": 1.6744942580268446e-05, "loss": 0.0921, "num_tokens": 664040955.0, "step": 5080 }, { "epoch": 2.027533918595371, "grad_norm": 0.23270346224308014, "learning_rate": 1.6736250327862023e-05, "loss": 0.0876, "num_tokens": 664172027.0, "step": 5081 }, { "epoch": 2.0279329608938546, "grad_norm": 0.25988176465034485, "learning_rate": 1.672756015814082e-05, "loss": 0.1051, "num_tokens": 664303099.0, "step": 5082 }, { "epoch": 2.028332003192338, "grad_norm": 0.21902155876159668, "learning_rate": 1.6718872072786317e-05, "loss": 0.0935, "num_tokens": 664434171.0, "step": 5083 }, { "epoch": 2.028731045490822, "grad_norm": 0.22985559701919556, "learning_rate": 1.671018607347957e-05, "loss": 0.0869, "num_tokens": 664565243.0, "step": 5084 }, { "epoch": 2.0291300877893055, "grad_norm": 0.28620049357414246, "learning_rate": 1.6701502161901225e-05, "loss": 0.1091, "num_tokens": 664696315.0, "step": 5085 }, { "epoch": 2.029529130087789, "grad_norm": 0.21781760454177856, "learning_rate": 1.669282033973155e-05, "loss": 0.0712, "num_tokens": 664827387.0, "step": 5086 }, { "epoch": 2.0299281723862728, "grad_norm": 0.24416114389896393, "learning_rate": 1.6684140608650406e-05, "loss": 0.0741, "num_tokens": 664958459.0, "step": 5087 }, { "epoch": 2.0303272146847564, "grad_norm": 0.23535752296447754, "learning_rate": 1.667546297033722e-05, "loss": 0.0852, "num_tokens": 665089531.0, "step": 5088 }, { "epoch": 2.03072625698324, "grad_norm": 0.27204471826553345, "learning_rate": 1.6666787426471052e-05, "loss": 0.1122, "num_tokens": 665220603.0, "step": 5089 }, { "epoch": 2.0311252992817237, "grad_norm": 0.2622685730457306, "learning_rate": 1.6658113978730526e-05, "loss": 0.0908, "num_tokens": 665351675.0, "step": 5090 }, { "epoch": 2.0315243415802073, "grad_norm": 0.28353792428970337, "learning_rate": 1.6649442628793877e-05, "loss": 0.1009, "num_tokens": 665482747.0, "step": 5091 }, { "epoch": 2.031923383878691, "grad_norm": 0.2572707235813141, "learning_rate": 1.6640773378338936e-05, "loss": 0.0994, "num_tokens": 665613819.0, "step": 5092 }, { "epoch": 2.0323224261771746, "grad_norm": 0.23212222754955292, "learning_rate": 1.6632106229043112e-05, "loss": 0.0846, "num_tokens": 665744891.0, "step": 5093 }, { "epoch": 2.0327214684756583, "grad_norm": 0.25306466221809387, "learning_rate": 1.662344118258342e-05, "loss": 0.0948, "num_tokens": 665875963.0, "step": 5094 }, { "epoch": 2.033120510774142, "grad_norm": 0.20731188356876373, "learning_rate": 1.6614778240636486e-05, "loss": 0.0642, "num_tokens": 666007035.0, "step": 5095 }, { "epoch": 2.0335195530726256, "grad_norm": 0.2659528851509094, "learning_rate": 1.6606117404878486e-05, "loss": 0.0971, "num_tokens": 666138107.0, "step": 5096 }, { "epoch": 2.033918595371109, "grad_norm": 0.2972564697265625, "learning_rate": 1.659745867698521e-05, "loss": 0.1052, "num_tokens": 666269179.0, "step": 5097 }, { "epoch": 2.034317637669593, "grad_norm": 0.23436006903648376, "learning_rate": 1.6588802058632046e-05, "loss": 0.0859, "num_tokens": 666400251.0, "step": 5098 }, { "epoch": 2.0347166799680765, "grad_norm": 0.2582715153694153, "learning_rate": 1.6580147551493983e-05, "loss": 0.1005, "num_tokens": 666531323.0, "step": 5099 }, { "epoch": 2.03511572226656, "grad_norm": 0.25213414430618286, "learning_rate": 1.6571495157245563e-05, "loss": 0.0898, "num_tokens": 666662395.0, "step": 5100 }, { "epoch": 2.0355147645650438, "grad_norm": 0.22923649847507477, "learning_rate": 1.6562844877560957e-05, "loss": 0.0928, "num_tokens": 666793467.0, "step": 5101 }, { "epoch": 2.0359138068635274, "grad_norm": 0.24716125428676605, "learning_rate": 1.655419671411392e-05, "loss": 0.0898, "num_tokens": 666924539.0, "step": 5102 }, { "epoch": 2.036312849162011, "grad_norm": 0.2904532253742218, "learning_rate": 1.654555066857778e-05, "loss": 0.1075, "num_tokens": 667055611.0, "step": 5103 }, { "epoch": 2.0367118914604947, "grad_norm": 0.2554458677768707, "learning_rate": 1.653690674262546e-05, "loss": 0.0997, "num_tokens": 667186683.0, "step": 5104 }, { "epoch": 2.0371109337589783, "grad_norm": 0.266044944524765, "learning_rate": 1.6528264937929507e-05, "loss": 0.0835, "num_tokens": 667317755.0, "step": 5105 }, { "epoch": 2.037509976057462, "grad_norm": 0.2577478289604187, "learning_rate": 1.6519625256162003e-05, "loss": 0.104, "num_tokens": 667448827.0, "step": 5106 }, { "epoch": 2.0379090183559456, "grad_norm": 0.2716936767101288, "learning_rate": 1.6510987698994647e-05, "loss": 0.0967, "num_tokens": 667579899.0, "step": 5107 }, { "epoch": 2.0383080606544293, "grad_norm": 0.25308501720428467, "learning_rate": 1.6502352268098747e-05, "loss": 0.0911, "num_tokens": 667710971.0, "step": 5108 }, { "epoch": 2.038707102952913, "grad_norm": 0.23507998883724213, "learning_rate": 1.6493718965145156e-05, "loss": 0.0989, "num_tokens": 667842043.0, "step": 5109 }, { "epoch": 2.0391061452513966, "grad_norm": 0.2582968771457672, "learning_rate": 1.648508779180434e-05, "loss": 0.102, "num_tokens": 667973115.0, "step": 5110 }, { "epoch": 2.03950518754988, "grad_norm": 0.24462084472179413, "learning_rate": 1.6476458749746367e-05, "loss": 0.0904, "num_tokens": 668104187.0, "step": 5111 }, { "epoch": 2.039904229848364, "grad_norm": 0.2922656834125519, "learning_rate": 1.6467831840640857e-05, "loss": 0.1083, "num_tokens": 668235259.0, "step": 5112 }, { "epoch": 2.0403032721468475, "grad_norm": 0.25589221715927124, "learning_rate": 1.6459207066157047e-05, "loss": 0.0923, "num_tokens": 668366331.0, "step": 5113 }, { "epoch": 2.040702314445331, "grad_norm": 0.246869757771492, "learning_rate": 1.6450584427963757e-05, "loss": 0.0881, "num_tokens": 668497403.0, "step": 5114 }, { "epoch": 2.0411013567438148, "grad_norm": 0.2687055766582489, "learning_rate": 1.644196392772937e-05, "loss": 0.0981, "num_tokens": 668628475.0, "step": 5115 }, { "epoch": 2.0415003990422984, "grad_norm": 0.20395801961421967, "learning_rate": 1.6433345567121884e-05, "loss": 0.0622, "num_tokens": 668759547.0, "step": 5116 }, { "epoch": 2.041899441340782, "grad_norm": 0.2372984141111374, "learning_rate": 1.642472934780887e-05, "loss": 0.0929, "num_tokens": 668890619.0, "step": 5117 }, { "epoch": 2.0422984836392657, "grad_norm": 0.25199952721595764, "learning_rate": 1.641611527145749e-05, "loss": 0.0981, "num_tokens": 669021691.0, "step": 5118 }, { "epoch": 2.0426975259377493, "grad_norm": 0.2724079191684723, "learning_rate": 1.6407503339734473e-05, "loss": 0.1085, "num_tokens": 669152763.0, "step": 5119 }, { "epoch": 2.043096568236233, "grad_norm": 0.2247786819934845, "learning_rate": 1.639889355430616e-05, "loss": 0.0917, "num_tokens": 669283835.0, "step": 5120 }, { "epoch": 2.0434956105347166, "grad_norm": 0.23629923164844513, "learning_rate": 1.6390285916838467e-05, "loss": 0.0882, "num_tokens": 669414907.0, "step": 5121 }, { "epoch": 2.0438946528332003, "grad_norm": 0.24817746877670288, "learning_rate": 1.6381680428996876e-05, "loss": 0.0974, "num_tokens": 669545979.0, "step": 5122 }, { "epoch": 2.044293695131684, "grad_norm": 0.30708709359169006, "learning_rate": 1.6373077092446475e-05, "loss": 0.114, "num_tokens": 669677051.0, "step": 5123 }, { "epoch": 2.0446927374301676, "grad_norm": 0.27037110924720764, "learning_rate": 1.6364475908851947e-05, "loss": 0.1032, "num_tokens": 669808123.0, "step": 5124 }, { "epoch": 2.045091779728651, "grad_norm": 0.29406115412712097, "learning_rate": 1.635587687987752e-05, "loss": 0.1107, "num_tokens": 669939195.0, "step": 5125 }, { "epoch": 2.045490822027135, "grad_norm": 0.2784251272678375, "learning_rate": 1.6347280007187024e-05, "loss": 0.0886, "num_tokens": 670070267.0, "step": 5126 }, { "epoch": 2.0458898643256185, "grad_norm": 0.2437441498041153, "learning_rate": 1.6338685292443894e-05, "loss": 0.0825, "num_tokens": 670198091.0, "step": 5127 }, { "epoch": 2.046288906624102, "grad_norm": 0.23534275591373444, "learning_rate": 1.633009273731112e-05, "loss": 0.0853, "num_tokens": 670329163.0, "step": 5128 }, { "epoch": 2.0466879489225858, "grad_norm": 0.2220175415277481, "learning_rate": 1.632150234345127e-05, "loss": 0.0799, "num_tokens": 670460235.0, "step": 5129 }, { "epoch": 2.0470869912210694, "grad_norm": 0.2726437449455261, "learning_rate": 1.631291411252651e-05, "loss": 0.0998, "num_tokens": 670588595.0, "step": 5130 }, { "epoch": 2.047486033519553, "grad_norm": 0.25928995013237, "learning_rate": 1.6304328046198582e-05, "loss": 0.1047, "num_tokens": 670719667.0, "step": 5131 }, { "epoch": 2.0478850758180367, "grad_norm": 0.23086576163768768, "learning_rate": 1.6295744146128814e-05, "loss": 0.0971, "num_tokens": 670850739.0, "step": 5132 }, { "epoch": 2.0482841181165203, "grad_norm": 0.3124454915523529, "learning_rate": 1.6287162413978115e-05, "loss": 0.1096, "num_tokens": 670981811.0, "step": 5133 }, { "epoch": 2.048683160415004, "grad_norm": 0.2635868489742279, "learning_rate": 1.6278582851406955e-05, "loss": 0.0964, "num_tokens": 671112883.0, "step": 5134 }, { "epoch": 2.0490822027134876, "grad_norm": 0.25161027908325195, "learning_rate": 1.627000546007541e-05, "loss": 0.1066, "num_tokens": 671243955.0, "step": 5135 }, { "epoch": 2.0494812450119713, "grad_norm": 0.23663924634456635, "learning_rate": 1.6261430241643133e-05, "loss": 0.0941, "num_tokens": 671375027.0, "step": 5136 }, { "epoch": 2.049880287310455, "grad_norm": 0.274173378944397, "learning_rate": 1.625285719776933e-05, "loss": 0.091, "num_tokens": 671506099.0, "step": 5137 }, { "epoch": 2.0502793296089385, "grad_norm": 0.22721873223781586, "learning_rate": 1.6244286330112814e-05, "loss": 0.0865, "num_tokens": 671637171.0, "step": 5138 }, { "epoch": 2.050678371907422, "grad_norm": 0.2820066809654236, "learning_rate": 1.6235717640331983e-05, "loss": 0.1098, "num_tokens": 671768243.0, "step": 5139 }, { "epoch": 2.051077414205906, "grad_norm": 0.25458523631095886, "learning_rate": 1.6227151130084774e-05, "loss": 0.0928, "num_tokens": 671899315.0, "step": 5140 }, { "epoch": 2.0514764565043895, "grad_norm": 0.3193248510360718, "learning_rate": 1.6218586801028737e-05, "loss": 0.1162, "num_tokens": 672030387.0, "step": 5141 }, { "epoch": 2.051875498802873, "grad_norm": 0.23025017976760864, "learning_rate": 1.6210024654820986e-05, "loss": 0.091, "num_tokens": 672161459.0, "step": 5142 }, { "epoch": 2.0522745411013568, "grad_norm": 0.26153871417045593, "learning_rate": 1.620146469311823e-05, "loss": 0.1075, "num_tokens": 672292531.0, "step": 5143 }, { "epoch": 2.0526735833998404, "grad_norm": 0.23551082611083984, "learning_rate": 1.6192906917576723e-05, "loss": 0.0833, "num_tokens": 672423603.0, "step": 5144 }, { "epoch": 2.053072625698324, "grad_norm": 0.2644636332988739, "learning_rate": 1.6184351329852326e-05, "loss": 0.1015, "num_tokens": 672554675.0, "step": 5145 }, { "epoch": 2.0534716679968077, "grad_norm": 0.2585759460926056, "learning_rate": 1.6175797931600473e-05, "loss": 0.0868, "num_tokens": 672685747.0, "step": 5146 }, { "epoch": 2.0538707102952913, "grad_norm": 0.2446611374616623, "learning_rate": 1.6167246724476142e-05, "loss": 0.0898, "num_tokens": 672816819.0, "step": 5147 }, { "epoch": 2.054269752593775, "grad_norm": 0.25556716322898865, "learning_rate": 1.6158697710133934e-05, "loss": 0.099, "num_tokens": 672947891.0, "step": 5148 }, { "epoch": 2.0546687948922586, "grad_norm": 0.2601032555103302, "learning_rate": 1.6150150890228002e-05, "loss": 0.0998, "num_tokens": 673078963.0, "step": 5149 }, { "epoch": 2.0550678371907423, "grad_norm": 0.2816649377346039, "learning_rate": 1.6141606266412067e-05, "loss": 0.1006, "num_tokens": 673210035.0, "step": 5150 }, { "epoch": 2.055466879489226, "grad_norm": 0.24979685246944427, "learning_rate": 1.613306384033944e-05, "loss": 0.0924, "num_tokens": 673341107.0, "step": 5151 }, { "epoch": 2.0558659217877095, "grad_norm": 0.26875054836273193, "learning_rate": 1.6124523613663004e-05, "loss": 0.1003, "num_tokens": 673472179.0, "step": 5152 }, { "epoch": 2.056264964086193, "grad_norm": 0.23988687992095947, "learning_rate": 1.6115985588035197e-05, "loss": 0.0885, "num_tokens": 673603251.0, "step": 5153 }, { "epoch": 2.056664006384677, "grad_norm": 0.276387095451355, "learning_rate": 1.6107449765108058e-05, "loss": 0.0966, "num_tokens": 673734323.0, "step": 5154 }, { "epoch": 2.0570630486831605, "grad_norm": 0.2749072313308716, "learning_rate": 1.60989161465332e-05, "loss": 0.1025, "num_tokens": 673865395.0, "step": 5155 }, { "epoch": 2.057462090981644, "grad_norm": 0.2584982216358185, "learning_rate": 1.6090384733961778e-05, "loss": 0.0953, "num_tokens": 673996467.0, "step": 5156 }, { "epoch": 2.0578611332801278, "grad_norm": 0.2838713228702545, "learning_rate": 1.6081855529044547e-05, "loss": 0.1156, "num_tokens": 674127539.0, "step": 5157 }, { "epoch": 2.0582601755786114, "grad_norm": 0.26632073521614075, "learning_rate": 1.6073328533431847e-05, "loss": 0.0962, "num_tokens": 674258611.0, "step": 5158 }, { "epoch": 2.058659217877095, "grad_norm": 0.2599281966686249, "learning_rate": 1.6064803748773548e-05, "loss": 0.0969, "num_tokens": 674389683.0, "step": 5159 }, { "epoch": 2.0590582601755787, "grad_norm": 0.2538995146751404, "learning_rate": 1.6056281176719122e-05, "loss": 0.0933, "num_tokens": 674520755.0, "step": 5160 }, { "epoch": 2.0594573024740623, "grad_norm": 0.2887440323829651, "learning_rate": 1.6047760818917623e-05, "loss": 0.1083, "num_tokens": 674651827.0, "step": 5161 }, { "epoch": 2.059856344772546, "grad_norm": 0.26094967126846313, "learning_rate": 1.603924267701765e-05, "loss": 0.1008, "num_tokens": 674782899.0, "step": 5162 }, { "epoch": 2.0602553870710296, "grad_norm": 0.24909308552742004, "learning_rate": 1.6030726752667375e-05, "loss": 0.0911, "num_tokens": 674913971.0, "step": 5163 }, { "epoch": 2.0606544293695133, "grad_norm": 0.2460092306137085, "learning_rate": 1.6022213047514557e-05, "loss": 0.0928, "num_tokens": 675045043.0, "step": 5164 }, { "epoch": 2.061053471667997, "grad_norm": 0.2589065134525299, "learning_rate": 1.601370156320653e-05, "loss": 0.1061, "num_tokens": 675176115.0, "step": 5165 }, { "epoch": 2.0614525139664805, "grad_norm": 0.2521878182888031, "learning_rate": 1.6005192301390164e-05, "loss": 0.0838, "num_tokens": 675307187.0, "step": 5166 }, { "epoch": 2.061851556264964, "grad_norm": 0.2497243881225586, "learning_rate": 1.5996685263711946e-05, "loss": 0.1012, "num_tokens": 675438259.0, "step": 5167 }, { "epoch": 2.062250598563448, "grad_norm": 0.2736142575740814, "learning_rate": 1.5988180451817887e-05, "loss": 0.0961, "num_tokens": 675569331.0, "step": 5168 }, { "epoch": 2.0626496408619315, "grad_norm": 0.20804940164089203, "learning_rate": 1.5979677867353602e-05, "loss": 0.0658, "num_tokens": 675685393.0, "step": 5169 }, { "epoch": 2.063048683160415, "grad_norm": 0.2574089467525482, "learning_rate": 1.5971177511964265e-05, "loss": 0.1033, "num_tokens": 675816465.0, "step": 5170 }, { "epoch": 2.0634477254588988, "grad_norm": 0.25266337394714355, "learning_rate": 1.59626793872946e-05, "loss": 0.0974, "num_tokens": 675947537.0, "step": 5171 }, { "epoch": 2.0638467677573824, "grad_norm": 0.27821671962738037, "learning_rate": 1.595418349498892e-05, "loss": 0.1009, "num_tokens": 676078609.0, "step": 5172 }, { "epoch": 2.064245810055866, "grad_norm": 0.21965548396110535, "learning_rate": 1.5945689836691118e-05, "loss": 0.0766, "num_tokens": 676209681.0, "step": 5173 }, { "epoch": 2.0646448523543497, "grad_norm": 0.2837827801704407, "learning_rate": 1.5937198414044624e-05, "loss": 0.1073, "num_tokens": 676340753.0, "step": 5174 }, { "epoch": 2.0650438946528333, "grad_norm": 0.32559317350387573, "learning_rate": 1.5928709228692445e-05, "loss": 0.104, "num_tokens": 676471825.0, "step": 5175 }, { "epoch": 2.065442936951317, "grad_norm": 0.23818044364452362, "learning_rate": 1.592022228227716e-05, "loss": 0.0902, "num_tokens": 676602897.0, "step": 5176 }, { "epoch": 2.0658419792498006, "grad_norm": 0.2691860795021057, "learning_rate": 1.5911737576440927e-05, "loss": 0.0908, "num_tokens": 676733969.0, "step": 5177 }, { "epoch": 2.0662410215482843, "grad_norm": 0.2463994175195694, "learning_rate": 1.590325511282544e-05, "loss": 0.0857, "num_tokens": 676865041.0, "step": 5178 }, { "epoch": 2.066640063846768, "grad_norm": 0.2416936606168747, "learning_rate": 1.5894774893071984e-05, "loss": 0.0866, "num_tokens": 676996113.0, "step": 5179 }, { "epoch": 2.0670391061452515, "grad_norm": 0.24776259064674377, "learning_rate": 1.5886296918821415e-05, "loss": 0.0918, "num_tokens": 677127185.0, "step": 5180 }, { "epoch": 2.067438148443735, "grad_norm": 0.2530735433101654, "learning_rate": 1.587782119171412e-05, "loss": 0.0902, "num_tokens": 677258257.0, "step": 5181 }, { "epoch": 2.067837190742219, "grad_norm": 0.23316791653633118, "learning_rate": 1.5869347713390087e-05, "loss": 0.0854, "num_tokens": 677389329.0, "step": 5182 }, { "epoch": 2.0682362330407025, "grad_norm": 0.28606149554252625, "learning_rate": 1.5860876485488857e-05, "loss": 0.0992, "num_tokens": 677520401.0, "step": 5183 }, { "epoch": 2.068635275339186, "grad_norm": 0.24344909191131592, "learning_rate": 1.5852407509649523e-05, "loss": 0.0859, "num_tokens": 677651473.0, "step": 5184 }, { "epoch": 2.0690343176376698, "grad_norm": 0.26987674832344055, "learning_rate": 1.5843940787510765e-05, "loss": 0.1018, "num_tokens": 677782545.0, "step": 5185 }, { "epoch": 2.0694333599361534, "grad_norm": 0.25789812207221985, "learning_rate": 1.5835476320710806e-05, "loss": 0.0968, "num_tokens": 677913617.0, "step": 5186 }, { "epoch": 2.069832402234637, "grad_norm": 0.238260880112648, "learning_rate": 1.582701411088744e-05, "loss": 0.0868, "num_tokens": 678044689.0, "step": 5187 }, { "epoch": 2.0702314445331207, "grad_norm": 0.246490940451622, "learning_rate": 1.5818554159678022e-05, "loss": 0.0912, "num_tokens": 678175761.0, "step": 5188 }, { "epoch": 2.0706304868316043, "grad_norm": 0.27089953422546387, "learning_rate": 1.58100964687195e-05, "loss": 0.1023, "num_tokens": 678306833.0, "step": 5189 }, { "epoch": 2.071029529130088, "grad_norm": 0.24749958515167236, "learning_rate": 1.580164103964832e-05, "loss": 0.1015, "num_tokens": 678437905.0, "step": 5190 }, { "epoch": 2.0714285714285716, "grad_norm": 0.2263050079345703, "learning_rate": 1.5793187874100554e-05, "loss": 0.0822, "num_tokens": 678568977.0, "step": 5191 }, { "epoch": 2.0718276137270553, "grad_norm": 0.27600693702697754, "learning_rate": 1.5784736973711817e-05, "loss": 0.1049, "num_tokens": 678700049.0, "step": 5192 }, { "epoch": 2.072226656025539, "grad_norm": 0.2947969436645508, "learning_rate": 1.577628834011725e-05, "loss": 0.1048, "num_tokens": 678831121.0, "step": 5193 }, { "epoch": 2.0726256983240225, "grad_norm": 0.2814992070198059, "learning_rate": 1.576784197495161e-05, "loss": 0.0958, "num_tokens": 678962193.0, "step": 5194 }, { "epoch": 2.073024740622506, "grad_norm": 0.27093765139579773, "learning_rate": 1.5759397879849184e-05, "loss": 0.0997, "num_tokens": 679093265.0, "step": 5195 }, { "epoch": 2.07342378292099, "grad_norm": 0.2822183668613434, "learning_rate": 1.5750956056443828e-05, "loss": 0.0917, "num_tokens": 679224337.0, "step": 5196 }, { "epoch": 2.0738228252194735, "grad_norm": 0.2661369740962982, "learning_rate": 1.574251650636894e-05, "loss": 0.1006, "num_tokens": 679355409.0, "step": 5197 }, { "epoch": 2.0742218675179567, "grad_norm": 0.2869323492050171, "learning_rate": 1.573407923125751e-05, "loss": 0.1216, "num_tokens": 679486481.0, "step": 5198 }, { "epoch": 2.0746209098164403, "grad_norm": 0.23573040962219238, "learning_rate": 1.5725644232742076e-05, "loss": 0.09, "num_tokens": 679612270.0, "step": 5199 }, { "epoch": 2.075019952114924, "grad_norm": 0.26939496397972107, "learning_rate": 1.5717211512454716e-05, "loss": 0.112, "num_tokens": 679743342.0, "step": 5200 }, { "epoch": 2.0754189944134076, "grad_norm": 0.27579089999198914, "learning_rate": 1.5708781072027083e-05, "loss": 0.1015, "num_tokens": 679874414.0, "step": 5201 }, { "epoch": 2.0758180367118912, "grad_norm": 0.2741278409957886, "learning_rate": 1.5700352913090416e-05, "loss": 0.0939, "num_tokens": 680005486.0, "step": 5202 }, { "epoch": 2.076217079010375, "grad_norm": 0.2545732259750366, "learning_rate": 1.569192703727545e-05, "loss": 0.1023, "num_tokens": 680136558.0, "step": 5203 }, { "epoch": 2.0766161213088585, "grad_norm": 0.22798605263233185, "learning_rate": 1.5683503446212533e-05, "loss": 0.087, "num_tokens": 680267630.0, "step": 5204 }, { "epoch": 2.077015163607342, "grad_norm": 0.2300693243741989, "learning_rate": 1.567508214153155e-05, "loss": 0.0895, "num_tokens": 680398702.0, "step": 5205 }, { "epoch": 2.077414205905826, "grad_norm": 0.2220923751592636, "learning_rate": 1.566666312486194e-05, "loss": 0.0849, "num_tokens": 680529774.0, "step": 5206 }, { "epoch": 2.0778132482043095, "grad_norm": 0.24761933088302612, "learning_rate": 1.56582463978327e-05, "loss": 0.0846, "num_tokens": 680660846.0, "step": 5207 }, { "epoch": 2.078212290502793, "grad_norm": 0.2564047574996948, "learning_rate": 1.5649831962072417e-05, "loss": 0.0878, "num_tokens": 680791918.0, "step": 5208 }, { "epoch": 2.0786113328012767, "grad_norm": 0.23802481591701508, "learning_rate": 1.5641419819209163e-05, "loss": 0.0895, "num_tokens": 680922990.0, "step": 5209 }, { "epoch": 2.0790103750997604, "grad_norm": 0.3003624677658081, "learning_rate": 1.5633009970870633e-05, "loss": 0.1172, "num_tokens": 681054062.0, "step": 5210 }, { "epoch": 2.079409417398244, "grad_norm": 0.30082595348358154, "learning_rate": 1.5624602418684057e-05, "loss": 0.0988, "num_tokens": 681185134.0, "step": 5211 }, { "epoch": 2.0798084596967277, "grad_norm": 0.25044625997543335, "learning_rate": 1.5616197164276202e-05, "loss": 0.0958, "num_tokens": 681316206.0, "step": 5212 }, { "epoch": 2.0802075019952113, "grad_norm": 0.2580897808074951, "learning_rate": 1.560779420927342e-05, "loss": 0.0904, "num_tokens": 681447278.0, "step": 5213 }, { "epoch": 2.080606544293695, "grad_norm": 0.280537873506546, "learning_rate": 1.559939355530161e-05, "loss": 0.1036, "num_tokens": 681578350.0, "step": 5214 }, { "epoch": 2.0810055865921786, "grad_norm": 0.288200706243515, "learning_rate": 1.5590995203986198e-05, "loss": 0.0958, "num_tokens": 681709422.0, "step": 5215 }, { "epoch": 2.0814046288906622, "grad_norm": 0.2697729468345642, "learning_rate": 1.5582599156952203e-05, "loss": 0.0977, "num_tokens": 681840494.0, "step": 5216 }, { "epoch": 2.081803671189146, "grad_norm": 0.22670507431030273, "learning_rate": 1.5574205415824182e-05, "loss": 0.0819, "num_tokens": 681971566.0, "step": 5217 }, { "epoch": 2.0822027134876295, "grad_norm": 0.28450632095336914, "learning_rate": 1.556581398222624e-05, "loss": 0.103, "num_tokens": 682102638.0, "step": 5218 }, { "epoch": 2.082601755786113, "grad_norm": 0.24032755196094513, "learning_rate": 1.555742485778205e-05, "loss": 0.0962, "num_tokens": 682233710.0, "step": 5219 }, { "epoch": 2.083000798084597, "grad_norm": 0.24568872153759003, "learning_rate": 1.554903804411481e-05, "loss": 0.0869, "num_tokens": 682364782.0, "step": 5220 }, { "epoch": 2.0833998403830805, "grad_norm": 0.23910702764987946, "learning_rate": 1.5540653542847315e-05, "loss": 0.0833, "num_tokens": 682495854.0, "step": 5221 }, { "epoch": 2.083798882681564, "grad_norm": 0.23404863476753235, "learning_rate": 1.5532271355601868e-05, "loss": 0.0753, "num_tokens": 682626926.0, "step": 5222 }, { "epoch": 2.0841979249800477, "grad_norm": 0.27791866660118103, "learning_rate": 1.5523891484000353e-05, "loss": 0.0908, "num_tokens": 682757998.0, "step": 5223 }, { "epoch": 2.0845969672785314, "grad_norm": 0.26835718750953674, "learning_rate": 1.5515513929664203e-05, "loss": 0.0885, "num_tokens": 682889070.0, "step": 5224 }, { "epoch": 2.084996009577015, "grad_norm": 0.19958624243736267, "learning_rate": 1.550713869421438e-05, "loss": 0.065, "num_tokens": 683020142.0, "step": 5225 }, { "epoch": 2.0853950518754987, "grad_norm": 0.24840345978736877, "learning_rate": 1.5498765779271425e-05, "loss": 0.0772, "num_tokens": 683151214.0, "step": 5226 }, { "epoch": 2.0857940941739823, "grad_norm": 0.23071737587451935, "learning_rate": 1.5490395186455424e-05, "loss": 0.0769, "num_tokens": 683282286.0, "step": 5227 }, { "epoch": 2.086193136472466, "grad_norm": 0.2914970815181732, "learning_rate": 1.5482026917386e-05, "loss": 0.1169, "num_tokens": 683400378.0, "step": 5228 }, { "epoch": 2.0865921787709496, "grad_norm": 0.2657875716686249, "learning_rate": 1.5473660973682338e-05, "loss": 0.0902, "num_tokens": 683531450.0, "step": 5229 }, { "epoch": 2.0869912210694332, "grad_norm": 0.25190410017967224, "learning_rate": 1.5465297356963175e-05, "loss": 0.0927, "num_tokens": 683662522.0, "step": 5230 }, { "epoch": 2.087390263367917, "grad_norm": 0.24391527473926544, "learning_rate": 1.5456936068846776e-05, "loss": 0.0836, "num_tokens": 683793594.0, "step": 5231 }, { "epoch": 2.0877893056664005, "grad_norm": 0.22347381711006165, "learning_rate": 1.5448577110950986e-05, "loss": 0.0761, "num_tokens": 683924666.0, "step": 5232 }, { "epoch": 2.088188347964884, "grad_norm": 0.2717994749546051, "learning_rate": 1.544022048489319e-05, "loss": 0.1025, "num_tokens": 684055738.0, "step": 5233 }, { "epoch": 2.088587390263368, "grad_norm": 0.22176052629947662, "learning_rate": 1.54318661922903e-05, "loss": 0.0851, "num_tokens": 684186810.0, "step": 5234 }, { "epoch": 2.0889864325618515, "grad_norm": 0.288007915019989, "learning_rate": 1.5423514234758807e-05, "loss": 0.0929, "num_tokens": 684317882.0, "step": 5235 }, { "epoch": 2.089385474860335, "grad_norm": 0.28085261583328247, "learning_rate": 1.541516461391474e-05, "loss": 0.1008, "num_tokens": 684448954.0, "step": 5236 }, { "epoch": 2.0897845171588187, "grad_norm": 0.2340182512998581, "learning_rate": 1.5406817331373657e-05, "loss": 0.0684, "num_tokens": 684580026.0, "step": 5237 }, { "epoch": 2.0901835594573024, "grad_norm": 0.2907900810241699, "learning_rate": 1.5398472388750696e-05, "loss": 0.1017, "num_tokens": 684711098.0, "step": 5238 }, { "epoch": 2.090582601755786, "grad_norm": 0.28057950735092163, "learning_rate": 1.5390129787660525e-05, "loss": 0.1002, "num_tokens": 684842170.0, "step": 5239 }, { "epoch": 2.0909816440542697, "grad_norm": 0.21128550171852112, "learning_rate": 1.5381789529717338e-05, "loss": 0.071, "num_tokens": 684973242.0, "step": 5240 }, { "epoch": 2.0913806863527533, "grad_norm": 0.238867849111557, "learning_rate": 1.5373451616534927e-05, "loss": 0.0847, "num_tokens": 685104314.0, "step": 5241 }, { "epoch": 2.091779728651237, "grad_norm": 0.2345530241727829, "learning_rate": 1.5365116049726587e-05, "loss": 0.0858, "num_tokens": 685235386.0, "step": 5242 }, { "epoch": 2.0921787709497206, "grad_norm": 0.2850872278213501, "learning_rate": 1.5356782830905158e-05, "loss": 0.099, "num_tokens": 685366458.0, "step": 5243 }, { "epoch": 2.0925778132482042, "grad_norm": 0.3029438257217407, "learning_rate": 1.5348451961683058e-05, "loss": 0.1046, "num_tokens": 685497530.0, "step": 5244 }, { "epoch": 2.092976855546688, "grad_norm": 0.26187410950660706, "learning_rate": 1.5340123443672238e-05, "loss": 0.0909, "num_tokens": 685628602.0, "step": 5245 }, { "epoch": 2.0933758978451715, "grad_norm": 0.2590905725955963, "learning_rate": 1.5331797278484168e-05, "loss": 0.0995, "num_tokens": 685759674.0, "step": 5246 }, { "epoch": 2.093774940143655, "grad_norm": 0.26061177253723145, "learning_rate": 1.532347346772989e-05, "loss": 0.0993, "num_tokens": 685890746.0, "step": 5247 }, { "epoch": 2.094173982442139, "grad_norm": 0.23591475188732147, "learning_rate": 1.531515201302e-05, "loss": 0.0801, "num_tokens": 686021818.0, "step": 5248 }, { "epoch": 2.0945730247406225, "grad_norm": 0.25580665469169617, "learning_rate": 1.5306832915964602e-05, "loss": 0.0969, "num_tokens": 686152890.0, "step": 5249 }, { "epoch": 2.094972067039106, "grad_norm": 0.24747566878795624, "learning_rate": 1.5298516178173373e-05, "loss": 0.0993, "num_tokens": 686283962.0, "step": 5250 }, { "epoch": 2.0953711093375897, "grad_norm": 0.2659134864807129, "learning_rate": 1.5290201801255527e-05, "loss": 0.0941, "num_tokens": 686415034.0, "step": 5251 }, { "epoch": 2.0957701516360734, "grad_norm": 0.2535739541053772, "learning_rate": 1.5281889786819808e-05, "loss": 0.0906, "num_tokens": 686546106.0, "step": 5252 }, { "epoch": 2.096169193934557, "grad_norm": 0.2781166434288025, "learning_rate": 1.5273580136474525e-05, "loss": 0.1206, "num_tokens": 686677178.0, "step": 5253 }, { "epoch": 2.0965682362330407, "grad_norm": 0.2686415910720825, "learning_rate": 1.5265272851827507e-05, "loss": 0.0923, "num_tokens": 686808250.0, "step": 5254 }, { "epoch": 2.0969672785315243, "grad_norm": 0.2582976520061493, "learning_rate": 1.525696793448615e-05, "loss": 0.0908, "num_tokens": 686939322.0, "step": 5255 }, { "epoch": 2.097366320830008, "grad_norm": 0.25297027826309204, "learning_rate": 1.524866538605736e-05, "loss": 0.0885, "num_tokens": 687070394.0, "step": 5256 }, { "epoch": 2.0977653631284916, "grad_norm": 0.24399013817310333, "learning_rate": 1.5240365208147611e-05, "loss": 0.0847, "num_tokens": 687201466.0, "step": 5257 }, { "epoch": 2.0981644054269752, "grad_norm": 0.2635231912136078, "learning_rate": 1.5232067402362923e-05, "loss": 0.0998, "num_tokens": 687332538.0, "step": 5258 }, { "epoch": 2.098563447725459, "grad_norm": 0.25950464606285095, "learning_rate": 1.5223771970308824e-05, "loss": 0.1016, "num_tokens": 687463610.0, "step": 5259 }, { "epoch": 2.0989624900239425, "grad_norm": 0.2710079550743103, "learning_rate": 1.521547891359041e-05, "loss": 0.0978, "num_tokens": 687594682.0, "step": 5260 }, { "epoch": 2.099361532322426, "grad_norm": 0.2515143156051636, "learning_rate": 1.520718823381232e-05, "loss": 0.0872, "num_tokens": 687725754.0, "step": 5261 }, { "epoch": 2.09976057462091, "grad_norm": 0.2859064042568207, "learning_rate": 1.5198899932578703e-05, "loss": 0.0978, "num_tokens": 687856826.0, "step": 5262 }, { "epoch": 2.1001596169193935, "grad_norm": 0.24276025593280792, "learning_rate": 1.5190614011493281e-05, "loss": 0.086, "num_tokens": 687987898.0, "step": 5263 }, { "epoch": 2.100558659217877, "grad_norm": 0.32722675800323486, "learning_rate": 1.5182330472159323e-05, "loss": 0.0843, "num_tokens": 688118970.0, "step": 5264 }, { "epoch": 2.1009577015163607, "grad_norm": 0.2756243050098419, "learning_rate": 1.5174049316179572e-05, "loss": 0.102, "num_tokens": 688250042.0, "step": 5265 }, { "epoch": 2.1013567438148444, "grad_norm": 0.2527812719345093, "learning_rate": 1.5165770545156383e-05, "loss": 0.0822, "num_tokens": 688381114.0, "step": 5266 }, { "epoch": 2.101755786113328, "grad_norm": 0.25296130776405334, "learning_rate": 1.5157494160691615e-05, "loss": 0.0965, "num_tokens": 688512186.0, "step": 5267 }, { "epoch": 2.1021548284118117, "grad_norm": 0.25903579592704773, "learning_rate": 1.5149220164386674e-05, "loss": 0.0959, "num_tokens": 688643258.0, "step": 5268 }, { "epoch": 2.1025538707102953, "grad_norm": 0.22580836713314056, "learning_rate": 1.5140948557842488e-05, "loss": 0.088, "num_tokens": 688774330.0, "step": 5269 }, { "epoch": 2.102952913008779, "grad_norm": 0.24403679370880127, "learning_rate": 1.5132679342659561e-05, "loss": 0.0907, "num_tokens": 688889673.0, "step": 5270 }, { "epoch": 2.1033519553072626, "grad_norm": 0.27555862069129944, "learning_rate": 1.5124412520437886e-05, "loss": 0.1079, "num_tokens": 689020745.0, "step": 5271 }, { "epoch": 2.1037509976057462, "grad_norm": 0.22226987779140472, "learning_rate": 1.5116148092777022e-05, "loss": 0.0706, "num_tokens": 689151817.0, "step": 5272 }, { "epoch": 2.10415003990423, "grad_norm": 0.29182711243629456, "learning_rate": 1.5107886061276073e-05, "loss": 0.1238, "num_tokens": 689282889.0, "step": 5273 }, { "epoch": 2.1045490822027135, "grad_norm": 0.25121647119522095, "learning_rate": 1.5099626427533642e-05, "loss": 0.0977, "num_tokens": 689413961.0, "step": 5274 }, { "epoch": 2.104948124501197, "grad_norm": 0.2018858641386032, "learning_rate": 1.5091369193147914e-05, "loss": 0.0673, "num_tokens": 689545033.0, "step": 5275 }, { "epoch": 2.105347166799681, "grad_norm": 0.26599156856536865, "learning_rate": 1.5083114359716566e-05, "loss": 0.0973, "num_tokens": 689676105.0, "step": 5276 }, { "epoch": 2.1057462090981645, "grad_norm": 0.24134604632854462, "learning_rate": 1.5074861928836845e-05, "loss": 0.066, "num_tokens": 689807177.0, "step": 5277 }, { "epoch": 2.106145251396648, "grad_norm": 0.2541530132293701, "learning_rate": 1.5066611902105509e-05, "loss": 0.0914, "num_tokens": 689938249.0, "step": 5278 }, { "epoch": 2.1065442936951317, "grad_norm": 0.22026674449443817, "learning_rate": 1.505836428111887e-05, "loss": 0.0694, "num_tokens": 690069321.0, "step": 5279 }, { "epoch": 2.1069433359936154, "grad_norm": 0.31775403022766113, "learning_rate": 1.505011906747277e-05, "loss": 0.1098, "num_tokens": 690200393.0, "step": 5280 }, { "epoch": 2.107342378292099, "grad_norm": 0.2964625358581543, "learning_rate": 1.5041876262762566e-05, "loss": 0.0931, "num_tokens": 690331465.0, "step": 5281 }, { "epoch": 2.1077414205905827, "grad_norm": 0.2200324386358261, "learning_rate": 1.5033635868583168e-05, "loss": 0.0698, "num_tokens": 690462537.0, "step": 5282 }, { "epoch": 2.1081404628890663, "grad_norm": 0.25600120425224304, "learning_rate": 1.5025397886529035e-05, "loss": 0.0899, "num_tokens": 690593609.0, "step": 5283 }, { "epoch": 2.10853950518755, "grad_norm": 0.2334771603345871, "learning_rate": 1.5017162318194112e-05, "loss": 0.069, "num_tokens": 690724681.0, "step": 5284 }, { "epoch": 2.1089385474860336, "grad_norm": 0.2698242962360382, "learning_rate": 1.5008929165171917e-05, "loss": 0.0989, "num_tokens": 690855753.0, "step": 5285 }, { "epoch": 2.1093375897845172, "grad_norm": 0.30599361658096313, "learning_rate": 1.5000698429055496e-05, "loss": 0.1208, "num_tokens": 690986825.0, "step": 5286 }, { "epoch": 2.109736632083001, "grad_norm": 0.2883162200450897, "learning_rate": 1.4992470111437412e-05, "loss": 0.1173, "num_tokens": 691117897.0, "step": 5287 }, { "epoch": 2.1101356743814845, "grad_norm": 0.2576785087585449, "learning_rate": 1.498424421390976e-05, "loss": 0.0963, "num_tokens": 691248969.0, "step": 5288 }, { "epoch": 2.110534716679968, "grad_norm": 0.25673767924308777, "learning_rate": 1.497602073806419e-05, "loss": 0.1066, "num_tokens": 691380041.0, "step": 5289 }, { "epoch": 2.110933758978452, "grad_norm": 0.2847711741924286, "learning_rate": 1.4967799685491851e-05, "loss": 0.1215, "num_tokens": 691511113.0, "step": 5290 }, { "epoch": 2.1113328012769355, "grad_norm": 0.22542837262153625, "learning_rate": 1.495958105778345e-05, "loss": 0.0762, "num_tokens": 691642185.0, "step": 5291 }, { "epoch": 2.111731843575419, "grad_norm": 0.2800033390522003, "learning_rate": 1.4951364856529224e-05, "loss": 0.0972, "num_tokens": 691773257.0, "step": 5292 }, { "epoch": 2.1121308858739027, "grad_norm": 0.2539764642715454, "learning_rate": 1.4943151083318913e-05, "loss": 0.0944, "num_tokens": 691898785.0, "step": 5293 }, { "epoch": 2.1125299281723864, "grad_norm": 0.27319005131721497, "learning_rate": 1.493493973974181e-05, "loss": 0.1065, "num_tokens": 692018607.0, "step": 5294 }, { "epoch": 2.11292897047087, "grad_norm": 0.2608278691768646, "learning_rate": 1.4926730827386751e-05, "loss": 0.0951, "num_tokens": 692149679.0, "step": 5295 }, { "epoch": 2.1133280127693537, "grad_norm": 0.23097066581249237, "learning_rate": 1.4918524347842059e-05, "loss": 0.0842, "num_tokens": 692280751.0, "step": 5296 }, { "epoch": 2.1137270550678373, "grad_norm": 0.244726300239563, "learning_rate": 1.4910320302695624e-05, "loss": 0.0897, "num_tokens": 692411823.0, "step": 5297 }, { "epoch": 2.114126097366321, "grad_norm": 0.28408539295196533, "learning_rate": 1.4902118693534858e-05, "loss": 0.1025, "num_tokens": 692542895.0, "step": 5298 }, { "epoch": 2.1145251396648046, "grad_norm": 0.2597079575061798, "learning_rate": 1.489391952194669e-05, "loss": 0.0994, "num_tokens": 692673967.0, "step": 5299 }, { "epoch": 2.1149241819632882, "grad_norm": 0.2871098816394806, "learning_rate": 1.488572278951757e-05, "loss": 0.0978, "num_tokens": 692805039.0, "step": 5300 }, { "epoch": 2.115323224261772, "grad_norm": 0.24773824214935303, "learning_rate": 1.4877528497833504e-05, "loss": 0.0851, "num_tokens": 692936111.0, "step": 5301 }, { "epoch": 2.1157222665602555, "grad_norm": 0.2993561625480652, "learning_rate": 1.486933664848002e-05, "loss": 0.0893, "num_tokens": 693067183.0, "step": 5302 }, { "epoch": 2.116121308858739, "grad_norm": 0.25337299704551697, "learning_rate": 1.486114724304214e-05, "loss": 0.0889, "num_tokens": 693198255.0, "step": 5303 }, { "epoch": 2.116520351157223, "grad_norm": 0.24655015766620636, "learning_rate": 1.4852960283104461e-05, "loss": 0.099, "num_tokens": 693329327.0, "step": 5304 }, { "epoch": 2.1169193934557065, "grad_norm": 0.2993987500667572, "learning_rate": 1.4844775770251065e-05, "loss": 0.0911, "num_tokens": 693460399.0, "step": 5305 }, { "epoch": 2.11731843575419, "grad_norm": 0.2611291706562042, "learning_rate": 1.4836593706065588e-05, "loss": 0.0729, "num_tokens": 693591471.0, "step": 5306 }, { "epoch": 2.1177174780526737, "grad_norm": 0.2530462443828583, "learning_rate": 1.4828414092131191e-05, "loss": 0.1047, "num_tokens": 693722543.0, "step": 5307 }, { "epoch": 2.1181165203511574, "grad_norm": 0.23949067294597626, "learning_rate": 1.4820236930030534e-05, "loss": 0.091, "num_tokens": 693853615.0, "step": 5308 }, { "epoch": 2.118515562649641, "grad_norm": 0.24652813374996185, "learning_rate": 1.4812062221345841e-05, "loss": 0.0834, "num_tokens": 693984687.0, "step": 5309 }, { "epoch": 2.1189146049481247, "grad_norm": 0.25863349437713623, "learning_rate": 1.4803889967658818e-05, "loss": 0.1031, "num_tokens": 694115759.0, "step": 5310 }, { "epoch": 2.1193136472466083, "grad_norm": 0.25642192363739014, "learning_rate": 1.479572017055075e-05, "loss": 0.0944, "num_tokens": 694246831.0, "step": 5311 }, { "epoch": 2.119712689545092, "grad_norm": 0.25201836228370667, "learning_rate": 1.4787552831602391e-05, "loss": 0.0948, "num_tokens": 694377903.0, "step": 5312 }, { "epoch": 2.1201117318435756, "grad_norm": 0.2201770842075348, "learning_rate": 1.4779387952394058e-05, "loss": 0.0774, "num_tokens": 694507464.0, "step": 5313 }, { "epoch": 2.1205107741420592, "grad_norm": 0.22814267873764038, "learning_rate": 1.4771225534505581e-05, "loss": 0.0707, "num_tokens": 694638536.0, "step": 5314 }, { "epoch": 2.120909816440543, "grad_norm": 0.26428166031837463, "learning_rate": 1.4763065579516297e-05, "loss": 0.1021, "num_tokens": 694769608.0, "step": 5315 }, { "epoch": 2.1213088587390265, "grad_norm": 0.260089248418808, "learning_rate": 1.4754908089005092e-05, "loss": 0.1102, "num_tokens": 694900680.0, "step": 5316 }, { "epoch": 2.12170790103751, "grad_norm": 0.22528105974197388, "learning_rate": 1.4746753064550377e-05, "loss": 0.0743, "num_tokens": 695031752.0, "step": 5317 }, { "epoch": 2.122106943335994, "grad_norm": 0.28762122988700867, "learning_rate": 1.473860050773005e-05, "loss": 0.1044, "num_tokens": 695162824.0, "step": 5318 }, { "epoch": 2.1225059856344775, "grad_norm": 0.24442747235298157, "learning_rate": 1.4730450420121558e-05, "loss": 0.0818, "num_tokens": 695293896.0, "step": 5319 }, { "epoch": 2.122905027932961, "grad_norm": 0.25879642367362976, "learning_rate": 1.4722302803301888e-05, "loss": 0.0985, "num_tokens": 695424968.0, "step": 5320 }, { "epoch": 2.1233040702314447, "grad_norm": 0.2528887987136841, "learning_rate": 1.471415765884751e-05, "loss": 0.0893, "num_tokens": 695556040.0, "step": 5321 }, { "epoch": 2.1237031125299284, "grad_norm": 0.2558688223361969, "learning_rate": 1.4706014988334432e-05, "loss": 0.0747, "num_tokens": 695687112.0, "step": 5322 }, { "epoch": 2.124102154828412, "grad_norm": 0.2676118314266205, "learning_rate": 1.4697874793338195e-05, "loss": 0.0981, "num_tokens": 695812790.0, "step": 5323 }, { "epoch": 2.1245011971268957, "grad_norm": 0.2732531726360321, "learning_rate": 1.4689737075433841e-05, "loss": 0.0974, "num_tokens": 695943862.0, "step": 5324 }, { "epoch": 2.1249002394253793, "grad_norm": 0.29154765605926514, "learning_rate": 1.4681601836195943e-05, "loss": 0.1042, "num_tokens": 696074934.0, "step": 5325 }, { "epoch": 2.125299281723863, "grad_norm": 0.24300667643547058, "learning_rate": 1.467346907719861e-05, "loss": 0.0874, "num_tokens": 696206006.0, "step": 5326 }, { "epoch": 2.1256983240223466, "grad_norm": 0.33158719539642334, "learning_rate": 1.4665338800015433e-05, "loss": 0.087, "num_tokens": 696337078.0, "step": 5327 }, { "epoch": 2.1260973663208302, "grad_norm": 0.2340565025806427, "learning_rate": 1.4657211006219554e-05, "loss": 0.0763, "num_tokens": 696468150.0, "step": 5328 }, { "epoch": 2.126496408619314, "grad_norm": 0.2590377926826477, "learning_rate": 1.4649085697383636e-05, "loss": 0.0956, "num_tokens": 696599222.0, "step": 5329 }, { "epoch": 2.126895450917797, "grad_norm": 0.2403193861246109, "learning_rate": 1.4640962875079833e-05, "loss": 0.0816, "num_tokens": 696730294.0, "step": 5330 }, { "epoch": 2.1272944932162807, "grad_norm": 0.27568602561950684, "learning_rate": 1.4632842540879837e-05, "loss": 0.0938, "num_tokens": 696861366.0, "step": 5331 }, { "epoch": 2.1276935355147644, "grad_norm": 0.23281078040599823, "learning_rate": 1.462472469635488e-05, "loss": 0.0792, "num_tokens": 696992438.0, "step": 5332 }, { "epoch": 2.128092577813248, "grad_norm": 0.2551184296607971, "learning_rate": 1.4616609343075672e-05, "loss": 0.0775, "num_tokens": 697123510.0, "step": 5333 }, { "epoch": 2.1284916201117317, "grad_norm": 0.24630768597126007, "learning_rate": 1.4608496482612444e-05, "loss": 0.0814, "num_tokens": 697254582.0, "step": 5334 }, { "epoch": 2.1288906624102153, "grad_norm": 0.2460245043039322, "learning_rate": 1.4600386116534978e-05, "loss": 0.0918, "num_tokens": 697385654.0, "step": 5335 }, { "epoch": 2.129289704708699, "grad_norm": 0.28456172347068787, "learning_rate": 1.4592278246412561e-05, "loss": 0.093, "num_tokens": 697516726.0, "step": 5336 }, { "epoch": 2.1296887470071826, "grad_norm": 0.28489378094673157, "learning_rate": 1.4584172873813964e-05, "loss": 0.1131, "num_tokens": 697647798.0, "step": 5337 }, { "epoch": 2.1300877893056662, "grad_norm": 0.23367473483085632, "learning_rate": 1.4576070000307518e-05, "loss": 0.0814, "num_tokens": 697778870.0, "step": 5338 }, { "epoch": 2.13048683160415, "grad_norm": 0.24876239895820618, "learning_rate": 1.4567969627461062e-05, "loss": 0.0803, "num_tokens": 697909942.0, "step": 5339 }, { "epoch": 2.1308858739026335, "grad_norm": 0.25205767154693604, "learning_rate": 1.4559871756841923e-05, "loss": 0.0905, "num_tokens": 698041014.0, "step": 5340 }, { "epoch": 2.131284916201117, "grad_norm": 0.22398681938648224, "learning_rate": 1.4551776390016974e-05, "loss": 0.0776, "num_tokens": 698172086.0, "step": 5341 }, { "epoch": 2.131683958499601, "grad_norm": 0.2623038589954376, "learning_rate": 1.4543683528552599e-05, "loss": 0.0821, "num_tokens": 698303158.0, "step": 5342 }, { "epoch": 2.1320830007980844, "grad_norm": 0.2824520766735077, "learning_rate": 1.4535593174014684e-05, "loss": 0.1077, "num_tokens": 698434230.0, "step": 5343 }, { "epoch": 2.132482043096568, "grad_norm": 0.298564612865448, "learning_rate": 1.452750532796863e-05, "loss": 0.0976, "num_tokens": 698565302.0, "step": 5344 }, { "epoch": 2.1328810853950517, "grad_norm": 0.2904967963695526, "learning_rate": 1.4519419991979372e-05, "loss": 0.0959, "num_tokens": 698696374.0, "step": 5345 }, { "epoch": 2.1332801276935354, "grad_norm": 0.28202998638153076, "learning_rate": 1.4511337167611336e-05, "loss": 0.1033, "num_tokens": 698827446.0, "step": 5346 }, { "epoch": 2.133679169992019, "grad_norm": 0.2573715150356293, "learning_rate": 1.4503256856428479e-05, "loss": 0.0877, "num_tokens": 698958518.0, "step": 5347 }, { "epoch": 2.1340782122905027, "grad_norm": 0.24956706166267395, "learning_rate": 1.4495179059994274e-05, "loss": 0.0775, "num_tokens": 699089590.0, "step": 5348 }, { "epoch": 2.1344772545889863, "grad_norm": 0.2238130271434784, "learning_rate": 1.4487103779871683e-05, "loss": 0.0802, "num_tokens": 699212978.0, "step": 5349 }, { "epoch": 2.13487629688747, "grad_norm": 0.26594433188438416, "learning_rate": 1.4479031017623206e-05, "loss": 0.0952, "num_tokens": 699344050.0, "step": 5350 }, { "epoch": 2.1352753391859536, "grad_norm": 0.27623918652534485, "learning_rate": 1.447096077481086e-05, "loss": 0.1105, "num_tokens": 699475122.0, "step": 5351 }, { "epoch": 2.1356743814844372, "grad_norm": 0.2685277760028839, "learning_rate": 1.4462893052996135e-05, "loss": 0.0845, "num_tokens": 699606194.0, "step": 5352 }, { "epoch": 2.136073423782921, "grad_norm": 0.26447439193725586, "learning_rate": 1.4454827853740077e-05, "loss": 0.0829, "num_tokens": 699737266.0, "step": 5353 }, { "epoch": 2.1364724660814045, "grad_norm": 0.24299533665180206, "learning_rate": 1.444676517860323e-05, "loss": 0.0966, "num_tokens": 699868338.0, "step": 5354 }, { "epoch": 2.136871508379888, "grad_norm": 0.25144901871681213, "learning_rate": 1.4438705029145641e-05, "loss": 0.0923, "num_tokens": 699999410.0, "step": 5355 }, { "epoch": 2.137270550678372, "grad_norm": 0.2766573429107666, "learning_rate": 1.4430647406926865e-05, "loss": 0.1065, "num_tokens": 700130482.0, "step": 5356 }, { "epoch": 2.1376695929768554, "grad_norm": 0.24776050448417664, "learning_rate": 1.4422592313505984e-05, "loss": 0.0851, "num_tokens": 700261554.0, "step": 5357 }, { "epoch": 2.138068635275339, "grad_norm": 0.2585161626338959, "learning_rate": 1.4414539750441592e-05, "loss": 0.0994, "num_tokens": 700392626.0, "step": 5358 }, { "epoch": 2.1384676775738227, "grad_norm": 0.2854871451854706, "learning_rate": 1.4406489719291771e-05, "loss": 0.1008, "num_tokens": 700523698.0, "step": 5359 }, { "epoch": 2.1388667198723064, "grad_norm": 0.3134814202785492, "learning_rate": 1.439844222161413e-05, "loss": 0.1132, "num_tokens": 700654770.0, "step": 5360 }, { "epoch": 2.13926576217079, "grad_norm": 0.25020700693130493, "learning_rate": 1.4390397258965799e-05, "loss": 0.0875, "num_tokens": 700785842.0, "step": 5361 }, { "epoch": 2.1396648044692737, "grad_norm": 0.25458934903144836, "learning_rate": 1.4382354832903383e-05, "loss": 0.0984, "num_tokens": 700916914.0, "step": 5362 }, { "epoch": 2.1400638467677573, "grad_norm": 0.2809029519557953, "learning_rate": 1.4374314944983025e-05, "loss": 0.1082, "num_tokens": 701047986.0, "step": 5363 }, { "epoch": 2.140462889066241, "grad_norm": 0.2579606771469116, "learning_rate": 1.436627759676038e-05, "loss": 0.0992, "num_tokens": 701179058.0, "step": 5364 }, { "epoch": 2.1408619313647246, "grad_norm": 0.22905705869197845, "learning_rate": 1.4358242789790582e-05, "loss": 0.0823, "num_tokens": 701310130.0, "step": 5365 }, { "epoch": 2.1412609736632082, "grad_norm": 0.24298743903636932, "learning_rate": 1.4350210525628302e-05, "loss": 0.0753, "num_tokens": 701426910.0, "step": 5366 }, { "epoch": 2.141660015961692, "grad_norm": 0.2744641900062561, "learning_rate": 1.4342180805827706e-05, "loss": 0.1004, "num_tokens": 701557982.0, "step": 5367 }, { "epoch": 2.1420590582601755, "grad_norm": 0.28143301606178284, "learning_rate": 1.433415363194246e-05, "loss": 0.1032, "num_tokens": 701689054.0, "step": 5368 }, { "epoch": 2.142458100558659, "grad_norm": 0.2602781355381012, "learning_rate": 1.4326129005525758e-05, "loss": 0.0814, "num_tokens": 701820126.0, "step": 5369 }, { "epoch": 2.142857142857143, "grad_norm": 0.24087253212928772, "learning_rate": 1.4318106928130293e-05, "loss": 0.0915, "num_tokens": 701951198.0, "step": 5370 }, { "epoch": 2.1432561851556264, "grad_norm": 0.26998716592788696, "learning_rate": 1.431008740130825e-05, "loss": 0.0833, "num_tokens": 702082270.0, "step": 5371 }, { "epoch": 2.14365522745411, "grad_norm": 0.27572718262672424, "learning_rate": 1.4302070426611342e-05, "loss": 0.1011, "num_tokens": 702213342.0, "step": 5372 }, { "epoch": 2.1440542697525937, "grad_norm": 0.2381904423236847, "learning_rate": 1.4294056005590784e-05, "loss": 0.095, "num_tokens": 702344414.0, "step": 5373 }, { "epoch": 2.1444533120510774, "grad_norm": 0.2571406662464142, "learning_rate": 1.4286044139797278e-05, "loss": 0.0983, "num_tokens": 702475486.0, "step": 5374 }, { "epoch": 2.144852354349561, "grad_norm": 0.24746505916118622, "learning_rate": 1.427803483078105e-05, "loss": 0.073, "num_tokens": 702606558.0, "step": 5375 }, { "epoch": 2.1452513966480447, "grad_norm": 0.25236353278160095, "learning_rate": 1.4270028080091834e-05, "loss": 0.0832, "num_tokens": 702737630.0, "step": 5376 }, { "epoch": 2.1456504389465283, "grad_norm": 0.23352022469043732, "learning_rate": 1.4262023889278859e-05, "loss": 0.0726, "num_tokens": 702868702.0, "step": 5377 }, { "epoch": 2.146049481245012, "grad_norm": 0.27496156096458435, "learning_rate": 1.4254022259890853e-05, "loss": 0.0966, "num_tokens": 702999774.0, "step": 5378 }, { "epoch": 2.1464485235434956, "grad_norm": 0.2859739661216736, "learning_rate": 1.424602319347606e-05, "loss": 0.0858, "num_tokens": 703130846.0, "step": 5379 }, { "epoch": 2.146847565841979, "grad_norm": 0.2708335816860199, "learning_rate": 1.4238026691582235e-05, "loss": 0.095, "num_tokens": 703261918.0, "step": 5380 }, { "epoch": 2.147246608140463, "grad_norm": 0.26316913962364197, "learning_rate": 1.4230032755756611e-05, "loss": 0.0933, "num_tokens": 703392990.0, "step": 5381 }, { "epoch": 2.1476456504389465, "grad_norm": 0.25505614280700684, "learning_rate": 1.4222041387545953e-05, "loss": 0.1028, "num_tokens": 703524062.0, "step": 5382 }, { "epoch": 2.14804469273743, "grad_norm": 0.2616204023361206, "learning_rate": 1.4214052588496507e-05, "loss": 0.1035, "num_tokens": 703655134.0, "step": 5383 }, { "epoch": 2.148443735035914, "grad_norm": 0.2574276924133301, "learning_rate": 1.4206066360154035e-05, "loss": 0.1022, "num_tokens": 703786206.0, "step": 5384 }, { "epoch": 2.1488427773343974, "grad_norm": 0.2314714938402176, "learning_rate": 1.4198082704063802e-05, "loss": 0.084, "num_tokens": 703913145.0, "step": 5385 }, { "epoch": 2.149241819632881, "grad_norm": 0.2769245505332947, "learning_rate": 1.4190101621770563e-05, "loss": 0.1163, "num_tokens": 704044217.0, "step": 5386 }, { "epoch": 2.1496408619313647, "grad_norm": 0.2563117444515228, "learning_rate": 1.4182123114818588e-05, "loss": 0.0846, "num_tokens": 704175289.0, "step": 5387 }, { "epoch": 2.1500399042298484, "grad_norm": 0.22780242562294006, "learning_rate": 1.4174147184751646e-05, "loss": 0.086, "num_tokens": 704306361.0, "step": 5388 }, { "epoch": 2.150438946528332, "grad_norm": 0.24257655441761017, "learning_rate": 1.4166173833113005e-05, "loss": 0.0893, "num_tokens": 704437433.0, "step": 5389 }, { "epoch": 2.1508379888268156, "grad_norm": 0.22813604772090912, "learning_rate": 1.4158203061445421e-05, "loss": 0.0766, "num_tokens": 704568505.0, "step": 5390 }, { "epoch": 2.1512370311252993, "grad_norm": 0.2664669156074524, "learning_rate": 1.4150234871291173e-05, "loss": 0.0876, "num_tokens": 704699577.0, "step": 5391 }, { "epoch": 2.151636073423783, "grad_norm": 0.2563427984714508, "learning_rate": 1.4142269264192043e-05, "loss": 0.0877, "num_tokens": 704830649.0, "step": 5392 }, { "epoch": 2.1520351157222666, "grad_norm": 0.29113081097602844, "learning_rate": 1.4134306241689282e-05, "loss": 0.0856, "num_tokens": 704961721.0, "step": 5393 }, { "epoch": 2.15243415802075, "grad_norm": 0.26565903425216675, "learning_rate": 1.4126345805323671e-05, "loss": 0.1067, "num_tokens": 705092793.0, "step": 5394 }, { "epoch": 2.152833200319234, "grad_norm": 0.2607060968875885, "learning_rate": 1.4118387956635485e-05, "loss": 0.0724, "num_tokens": 705208088.0, "step": 5395 }, { "epoch": 2.1532322426177175, "grad_norm": 0.29538270831108093, "learning_rate": 1.4110432697164483e-05, "loss": 0.097, "num_tokens": 705339160.0, "step": 5396 }, { "epoch": 2.153631284916201, "grad_norm": 0.3237660229206085, "learning_rate": 1.4102480028449932e-05, "loss": 0.1133, "num_tokens": 705470232.0, "step": 5397 }, { "epoch": 2.154030327214685, "grad_norm": 0.24727380275726318, "learning_rate": 1.4094529952030619e-05, "loss": 0.0814, "num_tokens": 705601304.0, "step": 5398 }, { "epoch": 2.1544293695131684, "grad_norm": 0.2825491726398468, "learning_rate": 1.4086582469444786e-05, "loss": 0.0998, "num_tokens": 705732376.0, "step": 5399 }, { "epoch": 2.154828411811652, "grad_norm": 0.23671145737171173, "learning_rate": 1.4078637582230215e-05, "loss": 0.0797, "num_tokens": 705863448.0, "step": 5400 }, { "epoch": 2.1552274541101357, "grad_norm": 0.27195852994918823, "learning_rate": 1.4070695291924163e-05, "loss": 0.0924, "num_tokens": 705994520.0, "step": 5401 }, { "epoch": 2.1556264964086194, "grad_norm": 0.2715493440628052, "learning_rate": 1.406275560006337e-05, "loss": 0.1099, "num_tokens": 706125592.0, "step": 5402 }, { "epoch": 2.156025538707103, "grad_norm": 0.24388940632343292, "learning_rate": 1.4054818508184114e-05, "loss": 0.0939, "num_tokens": 706256664.0, "step": 5403 }, { "epoch": 2.1564245810055866, "grad_norm": 0.2546651065349579, "learning_rate": 1.4046884017822148e-05, "loss": 0.0977, "num_tokens": 706387736.0, "step": 5404 }, { "epoch": 2.1568236233040703, "grad_norm": 0.2764383852481842, "learning_rate": 1.403895213051271e-05, "loss": 0.0998, "num_tokens": 706518808.0, "step": 5405 }, { "epoch": 2.157222665602554, "grad_norm": 0.2513122260570526, "learning_rate": 1.4031022847790547e-05, "loss": 0.0933, "num_tokens": 706649880.0, "step": 5406 }, { "epoch": 2.1576217079010376, "grad_norm": 0.23325268924236298, "learning_rate": 1.402309617118992e-05, "loss": 0.0824, "num_tokens": 706780952.0, "step": 5407 }, { "epoch": 2.158020750199521, "grad_norm": 0.257170170545578, "learning_rate": 1.4015172102244547e-05, "loss": 0.0844, "num_tokens": 706912024.0, "step": 5408 }, { "epoch": 2.158419792498005, "grad_norm": 0.24436545372009277, "learning_rate": 1.4007250642487666e-05, "loss": 0.0942, "num_tokens": 707043096.0, "step": 5409 }, { "epoch": 2.1588188347964885, "grad_norm": 0.23790238797664642, "learning_rate": 1.3999331793452014e-05, "loss": 0.086, "num_tokens": 707174168.0, "step": 5410 }, { "epoch": 2.159217877094972, "grad_norm": 0.25539615750312805, "learning_rate": 1.3991415556669813e-05, "loss": 0.0691, "num_tokens": 707305240.0, "step": 5411 }, { "epoch": 2.159616919393456, "grad_norm": 0.23875443637371063, "learning_rate": 1.3983501933672766e-05, "loss": 0.0823, "num_tokens": 707436312.0, "step": 5412 }, { "epoch": 2.1600159616919394, "grad_norm": 0.2480897158384323, "learning_rate": 1.3975590925992093e-05, "loss": 0.0927, "num_tokens": 707567384.0, "step": 5413 }, { "epoch": 2.160415003990423, "grad_norm": 0.26727333664894104, "learning_rate": 1.3967682535158515e-05, "loss": 0.0825, "num_tokens": 707698456.0, "step": 5414 }, { "epoch": 2.1608140462889067, "grad_norm": 0.2925439774990082, "learning_rate": 1.3959776762702209e-05, "loss": 0.0914, "num_tokens": 707829528.0, "step": 5415 }, { "epoch": 2.1612130885873904, "grad_norm": 0.22530792653560638, "learning_rate": 1.3951873610152878e-05, "loss": 0.0708, "num_tokens": 707960600.0, "step": 5416 }, { "epoch": 2.161612130885874, "grad_norm": 0.2709457278251648, "learning_rate": 1.3943973079039721e-05, "loss": 0.0824, "num_tokens": 708091672.0, "step": 5417 }, { "epoch": 2.1620111731843576, "grad_norm": 0.2866235375404358, "learning_rate": 1.3936075170891396e-05, "loss": 0.1089, "num_tokens": 708222744.0, "step": 5418 }, { "epoch": 2.1624102154828413, "grad_norm": 0.27242109179496765, "learning_rate": 1.3928179887236084e-05, "loss": 0.0861, "num_tokens": 708353816.0, "step": 5419 }, { "epoch": 2.162809257781325, "grad_norm": 0.24379438161849976, "learning_rate": 1.3920287229601459e-05, "loss": 0.0957, "num_tokens": 708484888.0, "step": 5420 }, { "epoch": 2.1632083000798086, "grad_norm": 0.25462958216667175, "learning_rate": 1.3912397199514654e-05, "loss": 0.0846, "num_tokens": 708615960.0, "step": 5421 }, { "epoch": 2.163607342378292, "grad_norm": 0.2653394937515259, "learning_rate": 1.390450979850234e-05, "loss": 0.1089, "num_tokens": 708741008.0, "step": 5422 }, { "epoch": 2.164006384676776, "grad_norm": 0.2695676386356354, "learning_rate": 1.3896625028090643e-05, "loss": 0.1046, "num_tokens": 708872080.0, "step": 5423 }, { "epoch": 2.1644054269752595, "grad_norm": 0.21613799035549164, "learning_rate": 1.3888742889805187e-05, "loss": 0.0793, "num_tokens": 709003152.0, "step": 5424 }, { "epoch": 2.164804469273743, "grad_norm": 0.23988644778728485, "learning_rate": 1.38808633851711e-05, "loss": 0.082, "num_tokens": 709134224.0, "step": 5425 }, { "epoch": 2.165203511572227, "grad_norm": 0.23344460129737854, "learning_rate": 1.3872986515712999e-05, "loss": 0.0743, "num_tokens": 709265296.0, "step": 5426 }, { "epoch": 2.1656025538707104, "grad_norm": 0.3220936954021454, "learning_rate": 1.386511228295497e-05, "loss": 0.1081, "num_tokens": 709396368.0, "step": 5427 }, { "epoch": 2.166001596169194, "grad_norm": 0.23957990109920502, "learning_rate": 1.3857240688420611e-05, "loss": 0.0789, "num_tokens": 709527440.0, "step": 5428 }, { "epoch": 2.1664006384676777, "grad_norm": 0.28487569093704224, "learning_rate": 1.3849371733633012e-05, "loss": 0.1014, "num_tokens": 709658512.0, "step": 5429 }, { "epoch": 2.1667996807661614, "grad_norm": 0.27794748544692993, "learning_rate": 1.3841505420114725e-05, "loss": 0.0881, "num_tokens": 709789584.0, "step": 5430 }, { "epoch": 2.167198723064645, "grad_norm": 0.2517331540584564, "learning_rate": 1.3833641749387815e-05, "loss": 0.0879, "num_tokens": 709920656.0, "step": 5431 }, { "epoch": 2.1675977653631286, "grad_norm": 0.28452637791633606, "learning_rate": 1.3825780722973841e-05, "loss": 0.095, "num_tokens": 710051728.0, "step": 5432 }, { "epoch": 2.1679968076616123, "grad_norm": 0.23696793615818024, "learning_rate": 1.3817922342393816e-05, "loss": 0.0916, "num_tokens": 710182800.0, "step": 5433 }, { "epoch": 2.168395849960096, "grad_norm": 0.22848913073539734, "learning_rate": 1.3810066609168281e-05, "loss": 0.0899, "num_tokens": 710313872.0, "step": 5434 }, { "epoch": 2.1687948922585796, "grad_norm": 0.2948366403579712, "learning_rate": 1.3802213524817237e-05, "loss": 0.1123, "num_tokens": 710444944.0, "step": 5435 }, { "epoch": 2.169193934557063, "grad_norm": 0.23025192320346832, "learning_rate": 1.3794363090860196e-05, "loss": 0.0786, "num_tokens": 710576016.0, "step": 5436 }, { "epoch": 2.169592976855547, "grad_norm": 0.22697851061820984, "learning_rate": 1.3786515308816122e-05, "loss": 0.081, "num_tokens": 710707088.0, "step": 5437 }, { "epoch": 2.1699920191540305, "grad_norm": 0.27136939764022827, "learning_rate": 1.3778670180203507e-05, "loss": 0.0881, "num_tokens": 710838160.0, "step": 5438 }, { "epoch": 2.170391061452514, "grad_norm": 0.24829168617725372, "learning_rate": 1.3770827706540306e-05, "loss": 0.0789, "num_tokens": 710969232.0, "step": 5439 }, { "epoch": 2.170790103750998, "grad_norm": 0.18994712829589844, "learning_rate": 1.3762987889343959e-05, "loss": 0.0635, "num_tokens": 711100304.0, "step": 5440 }, { "epoch": 2.1711891460494814, "grad_norm": 0.2450675517320633, "learning_rate": 1.3755150730131397e-05, "loss": 0.0894, "num_tokens": 711231376.0, "step": 5441 }, { "epoch": 2.171588188347965, "grad_norm": 0.2608329653739929, "learning_rate": 1.3747316230419049e-05, "loss": 0.1049, "num_tokens": 711362448.0, "step": 5442 }, { "epoch": 2.1719872306464487, "grad_norm": 0.25191542506217957, "learning_rate": 1.3739484391722801e-05, "loss": 0.0957, "num_tokens": 711493520.0, "step": 5443 }, { "epoch": 2.1723862729449324, "grad_norm": 0.2606600522994995, "learning_rate": 1.3731655215558058e-05, "loss": 0.0677, "num_tokens": 711624592.0, "step": 5444 }, { "epoch": 2.172785315243416, "grad_norm": 0.27636146545410156, "learning_rate": 1.3723828703439684e-05, "loss": 0.0842, "num_tokens": 711755664.0, "step": 5445 }, { "epoch": 2.1731843575418996, "grad_norm": 0.2838155925273895, "learning_rate": 1.3716004856882028e-05, "loss": 0.0959, "num_tokens": 711886736.0, "step": 5446 }, { "epoch": 2.173583399840383, "grad_norm": 0.2411348521709442, "learning_rate": 1.3708183677398938e-05, "loss": 0.0799, "num_tokens": 712017808.0, "step": 5447 }, { "epoch": 2.1739824421388665, "grad_norm": 0.2869575619697571, "learning_rate": 1.3700365166503753e-05, "loss": 0.1069, "num_tokens": 712148880.0, "step": 5448 }, { "epoch": 2.17438148443735, "grad_norm": 0.2561345100402832, "learning_rate": 1.3692549325709258e-05, "loss": 0.0969, "num_tokens": 712279952.0, "step": 5449 }, { "epoch": 2.1747805267358338, "grad_norm": 0.21354137361049652, "learning_rate": 1.3684736156527758e-05, "loss": 0.0597, "num_tokens": 712411024.0, "step": 5450 }, { "epoch": 2.1751795690343174, "grad_norm": 0.25900593400001526, "learning_rate": 1.3676925660471035e-05, "loss": 0.0956, "num_tokens": 712542096.0, "step": 5451 }, { "epoch": 2.175578611332801, "grad_norm": 0.2912136912345886, "learning_rate": 1.3669117839050333e-05, "loss": 0.1088, "num_tokens": 712673168.0, "step": 5452 }, { "epoch": 2.1759776536312847, "grad_norm": 0.28470662236213684, "learning_rate": 1.3661312693776398e-05, "loss": 0.101, "num_tokens": 712804240.0, "step": 5453 }, { "epoch": 2.1763766959297683, "grad_norm": 0.25287121534347534, "learning_rate": 1.3653510226159465e-05, "loss": 0.0907, "num_tokens": 712935312.0, "step": 5454 }, { "epoch": 2.176775738228252, "grad_norm": 0.23018261790275574, "learning_rate": 1.3645710437709213e-05, "loss": 0.0764, "num_tokens": 713066384.0, "step": 5455 }, { "epoch": 2.1771747805267356, "grad_norm": 0.24081605672836304, "learning_rate": 1.363791332993486e-05, "loss": 0.0869, "num_tokens": 713197456.0, "step": 5456 }, { "epoch": 2.1775738228252193, "grad_norm": 0.22440670430660248, "learning_rate": 1.3630118904345051e-05, "loss": 0.0736, "num_tokens": 713328528.0, "step": 5457 }, { "epoch": 2.177972865123703, "grad_norm": 0.23060020804405212, "learning_rate": 1.3622327162447935e-05, "loss": 0.0788, "num_tokens": 713459600.0, "step": 5458 }, { "epoch": 2.1783719074221866, "grad_norm": 0.22544197738170624, "learning_rate": 1.3614538105751146e-05, "loss": 0.0765, "num_tokens": 713590672.0, "step": 5459 }, { "epoch": 2.17877094972067, "grad_norm": 0.25826147198677063, "learning_rate": 1.3606751735761803e-05, "loss": 0.0886, "num_tokens": 713721744.0, "step": 5460 }, { "epoch": 2.179169992019154, "grad_norm": 0.24411790072917938, "learning_rate": 1.359896805398648e-05, "loss": 0.0775, "num_tokens": 713852816.0, "step": 5461 }, { "epoch": 2.1795690343176375, "grad_norm": 0.2644922435283661, "learning_rate": 1.3591187061931253e-05, "loss": 0.0884, "num_tokens": 713983888.0, "step": 5462 }, { "epoch": 2.179968076616121, "grad_norm": 0.2650998532772064, "learning_rate": 1.3583408761101688e-05, "loss": 0.0705, "num_tokens": 714114960.0, "step": 5463 }, { "epoch": 2.1803671189146048, "grad_norm": 0.2413022518157959, "learning_rate": 1.3575633153002785e-05, "loss": 0.0799, "num_tokens": 714246032.0, "step": 5464 }, { "epoch": 2.1807661612130884, "grad_norm": 0.24914075434207916, "learning_rate": 1.3567860239139068e-05, "loss": 0.0804, "num_tokens": 714377104.0, "step": 5465 }, { "epoch": 2.181165203511572, "grad_norm": 0.27624139189720154, "learning_rate": 1.3560090021014527e-05, "loss": 0.0759, "num_tokens": 714508176.0, "step": 5466 }, { "epoch": 2.1815642458100557, "grad_norm": 0.2890135943889618, "learning_rate": 1.3552322500132619e-05, "loss": 0.089, "num_tokens": 714639248.0, "step": 5467 }, { "epoch": 2.1819632881085393, "grad_norm": 0.3130837380886078, "learning_rate": 1.3544557677996284e-05, "loss": 0.104, "num_tokens": 714770320.0, "step": 5468 }, { "epoch": 2.182362330407023, "grad_norm": 0.2588851749897003, "learning_rate": 1.3536795556107949e-05, "loss": 0.1007, "num_tokens": 714901392.0, "step": 5469 }, { "epoch": 2.1827613727055066, "grad_norm": 0.2920726537704468, "learning_rate": 1.3529036135969514e-05, "loss": 0.0947, "num_tokens": 715032464.0, "step": 5470 }, { "epoch": 2.1831604150039903, "grad_norm": 0.2341538965702057, "learning_rate": 1.3521279419082344e-05, "loss": 0.0848, "num_tokens": 715163536.0, "step": 5471 }, { "epoch": 2.183559457302474, "grad_norm": 0.2051447480916977, "learning_rate": 1.35135254069473e-05, "loss": 0.0625, "num_tokens": 715294608.0, "step": 5472 }, { "epoch": 2.1839584996009576, "grad_norm": 0.2587980628013611, "learning_rate": 1.3505774101064716e-05, "loss": 0.0983, "num_tokens": 715425680.0, "step": 5473 }, { "epoch": 2.184357541899441, "grad_norm": 0.2564922273159027, "learning_rate": 1.3498025502934386e-05, "loss": 0.0989, "num_tokens": 715556752.0, "step": 5474 }, { "epoch": 2.184756584197925, "grad_norm": 0.27819910645484924, "learning_rate": 1.3490279614055596e-05, "loss": 0.1044, "num_tokens": 715687824.0, "step": 5475 }, { "epoch": 2.1851556264964085, "grad_norm": 0.22980037331581116, "learning_rate": 1.3482536435927109e-05, "loss": 0.0724, "num_tokens": 715818896.0, "step": 5476 }, { "epoch": 2.185554668794892, "grad_norm": 0.2688083350658417, "learning_rate": 1.3474795970047144e-05, "loss": 0.096, "num_tokens": 715949968.0, "step": 5477 }, { "epoch": 2.1859537110933758, "grad_norm": 0.24304711818695068, "learning_rate": 1.3467058217913419e-05, "loss": 0.0859, "num_tokens": 716081040.0, "step": 5478 }, { "epoch": 2.1863527533918594, "grad_norm": 0.27392202615737915, "learning_rate": 1.3459323181023131e-05, "loss": 0.0858, "num_tokens": 716212112.0, "step": 5479 }, { "epoch": 2.186751795690343, "grad_norm": 0.24454988539218903, "learning_rate": 1.345159086087291e-05, "loss": 0.0837, "num_tokens": 716343184.0, "step": 5480 }, { "epoch": 2.1871508379888267, "grad_norm": 0.23189538717269897, "learning_rate": 1.3443861258958896e-05, "loss": 0.0722, "num_tokens": 716474256.0, "step": 5481 }, { "epoch": 2.1875498802873103, "grad_norm": 0.2670873701572418, "learning_rate": 1.3436134376776704e-05, "loss": 0.0962, "num_tokens": 716605328.0, "step": 5482 }, { "epoch": 2.187948922585794, "grad_norm": 0.3085730969905853, "learning_rate": 1.3428410215821405e-05, "loss": 0.0883, "num_tokens": 716736400.0, "step": 5483 }, { "epoch": 2.1883479648842776, "grad_norm": 0.2993391752243042, "learning_rate": 1.3420688777587554e-05, "loss": 0.0979, "num_tokens": 716867472.0, "step": 5484 }, { "epoch": 2.1887470071827613, "grad_norm": 0.26871341466903687, "learning_rate": 1.3412970063569185e-05, "loss": 0.0947, "num_tokens": 716998544.0, "step": 5485 }, { "epoch": 2.189146049481245, "grad_norm": 0.2737202048301697, "learning_rate": 1.3405254075259788e-05, "loss": 0.089, "num_tokens": 717129616.0, "step": 5486 }, { "epoch": 2.1895450917797286, "grad_norm": 0.3402664065361023, "learning_rate": 1.3397540814152334e-05, "loss": 0.0902, "num_tokens": 717246203.0, "step": 5487 }, { "epoch": 2.189944134078212, "grad_norm": 0.28710341453552246, "learning_rate": 1.338983028173928e-05, "loss": 0.0791, "num_tokens": 717377275.0, "step": 5488 }, { "epoch": 2.190343176376696, "grad_norm": 0.2416934370994568, "learning_rate": 1.3382122479512524e-05, "loss": 0.0836, "num_tokens": 717508347.0, "step": 5489 }, { "epoch": 2.1907422186751795, "grad_norm": 0.2740766704082489, "learning_rate": 1.3374417408963474e-05, "loss": 0.0968, "num_tokens": 717639419.0, "step": 5490 }, { "epoch": 2.191141260973663, "grad_norm": 0.24491474032402039, "learning_rate": 1.3366715071582966e-05, "loss": 0.0777, "num_tokens": 717770491.0, "step": 5491 }, { "epoch": 2.1915403032721468, "grad_norm": 0.2648502290248871, "learning_rate": 1.3359015468861352e-05, "loss": 0.0972, "num_tokens": 717901563.0, "step": 5492 }, { "epoch": 2.1919393455706304, "grad_norm": 0.27518197894096375, "learning_rate": 1.3351318602288418e-05, "loss": 0.1056, "num_tokens": 718032635.0, "step": 5493 }, { "epoch": 2.192338387869114, "grad_norm": 0.2324870526790619, "learning_rate": 1.3343624473353442e-05, "loss": 0.0759, "num_tokens": 718163707.0, "step": 5494 }, { "epoch": 2.1927374301675977, "grad_norm": 0.27818235754966736, "learning_rate": 1.3335933083545174e-05, "loss": 0.1114, "num_tokens": 718294779.0, "step": 5495 }, { "epoch": 2.1931364724660813, "grad_norm": 0.2868809103965759, "learning_rate": 1.3328244434351811e-05, "loss": 0.1164, "num_tokens": 718425851.0, "step": 5496 }, { "epoch": 2.193535514764565, "grad_norm": 0.21687164902687073, "learning_rate": 1.332055852726104e-05, "loss": 0.0772, "num_tokens": 718556923.0, "step": 5497 }, { "epoch": 2.1939345570630486, "grad_norm": 0.2648732662200928, "learning_rate": 1.3312875363760025e-05, "loss": 0.0914, "num_tokens": 718686183.0, "step": 5498 }, { "epoch": 2.1943335993615323, "grad_norm": 0.284452885389328, "learning_rate": 1.3305194945335369e-05, "loss": 0.1, "num_tokens": 718817255.0, "step": 5499 }, { "epoch": 2.194732641660016, "grad_norm": 0.25429490208625793, "learning_rate": 1.329751727347317e-05, "loss": 0.0961, "num_tokens": 718948327.0, "step": 5500 }, { "epoch": 2.1951316839584996, "grad_norm": 0.25387951731681824, "learning_rate": 1.3289842349658998e-05, "loss": 0.0808, "num_tokens": 719079399.0, "step": 5501 }, { "epoch": 2.195530726256983, "grad_norm": 0.2878473997116089, "learning_rate": 1.328217017537785e-05, "loss": 0.1049, "num_tokens": 719210471.0, "step": 5502 }, { "epoch": 2.195929768555467, "grad_norm": 0.29287025332450867, "learning_rate": 1.3274500752114238e-05, "loss": 0.0985, "num_tokens": 719341543.0, "step": 5503 }, { "epoch": 2.1963288108539505, "grad_norm": 0.24583755433559418, "learning_rate": 1.326683408135213e-05, "loss": 0.0771, "num_tokens": 719472615.0, "step": 5504 }, { "epoch": 2.196727853152434, "grad_norm": 0.24476595222949982, "learning_rate": 1.3259170164574935e-05, "loss": 0.0904, "num_tokens": 719603687.0, "step": 5505 }, { "epoch": 2.1971268954509178, "grad_norm": 0.25204911828041077, "learning_rate": 1.3251509003265566e-05, "loss": 0.0936, "num_tokens": 719734759.0, "step": 5506 }, { "epoch": 2.1975259377494014, "grad_norm": 0.23697884380817413, "learning_rate": 1.324385059890639e-05, "loss": 0.0924, "num_tokens": 719865831.0, "step": 5507 }, { "epoch": 2.197924980047885, "grad_norm": 0.26484668254852295, "learning_rate": 1.3236194952979223e-05, "loss": 0.075, "num_tokens": 719996903.0, "step": 5508 }, { "epoch": 2.1983240223463687, "grad_norm": 0.2165287733078003, "learning_rate": 1.322854206696536e-05, "loss": 0.072, "num_tokens": 720127975.0, "step": 5509 }, { "epoch": 2.1987230646448523, "grad_norm": 0.2459428757429123, "learning_rate": 1.3220891942345587e-05, "loss": 0.0792, "num_tokens": 720259047.0, "step": 5510 }, { "epoch": 2.199122106943336, "grad_norm": 0.2968209683895111, "learning_rate": 1.3213244580600104e-05, "loss": 0.0988, "num_tokens": 720390119.0, "step": 5511 }, { "epoch": 2.1995211492418196, "grad_norm": 0.2280660718679428, "learning_rate": 1.3205599983208611e-05, "loss": 0.0711, "num_tokens": 720521191.0, "step": 5512 }, { "epoch": 2.1999201915403033, "grad_norm": 0.284128338098526, "learning_rate": 1.319795815165028e-05, "loss": 0.0958, "num_tokens": 720652263.0, "step": 5513 }, { "epoch": 2.200319233838787, "grad_norm": 0.2969081401824951, "learning_rate": 1.3190319087403729e-05, "loss": 0.1048, "num_tokens": 720783335.0, "step": 5514 }, { "epoch": 2.2007182761372706, "grad_norm": 0.2408367246389389, "learning_rate": 1.3182682791947026e-05, "loss": 0.082, "num_tokens": 720914407.0, "step": 5515 }, { "epoch": 2.201117318435754, "grad_norm": 0.3037334978580475, "learning_rate": 1.3175049266757744e-05, "loss": 0.0949, "num_tokens": 721045479.0, "step": 5516 }, { "epoch": 2.201516360734238, "grad_norm": 0.23958458006381989, "learning_rate": 1.31674185133129e-05, "loss": 0.0787, "num_tokens": 721176551.0, "step": 5517 }, { "epoch": 2.2019154030327215, "grad_norm": 0.28634902834892273, "learning_rate": 1.3159790533088962e-05, "loss": 0.0908, "num_tokens": 721307623.0, "step": 5518 }, { "epoch": 2.202314445331205, "grad_norm": 0.27093416452407837, "learning_rate": 1.3152165327561883e-05, "loss": 0.0802, "num_tokens": 721438695.0, "step": 5519 }, { "epoch": 2.2027134876296888, "grad_norm": 0.23015059530735016, "learning_rate": 1.3144542898207057e-05, "loss": 0.092, "num_tokens": 721569767.0, "step": 5520 }, { "epoch": 2.2031125299281724, "grad_norm": 0.23272277414798737, "learning_rate": 1.313692324649936e-05, "loss": 0.0789, "num_tokens": 721700839.0, "step": 5521 }, { "epoch": 2.203511572226656, "grad_norm": 0.3105199635028839, "learning_rate": 1.3129306373913133e-05, "loss": 0.098, "num_tokens": 721831911.0, "step": 5522 }, { "epoch": 2.2039106145251397, "grad_norm": 0.23133394122123718, "learning_rate": 1.3121692281922152e-05, "loss": 0.0784, "num_tokens": 721962983.0, "step": 5523 }, { "epoch": 2.2043096568236233, "grad_norm": 0.2731321454048157, "learning_rate": 1.3114080971999688e-05, "loss": 0.0931, "num_tokens": 722094055.0, "step": 5524 }, { "epoch": 2.204708699122107, "grad_norm": 0.2494775652885437, "learning_rate": 1.3106472445618446e-05, "loss": 0.0942, "num_tokens": 722225127.0, "step": 5525 }, { "epoch": 2.2051077414205906, "grad_norm": 0.24756945669651031, "learning_rate": 1.3098866704250617e-05, "loss": 0.0871, "num_tokens": 722356199.0, "step": 5526 }, { "epoch": 2.2055067837190743, "grad_norm": 0.27178725600242615, "learning_rate": 1.309126374936783e-05, "loss": 0.1031, "num_tokens": 722487271.0, "step": 5527 }, { "epoch": 2.205905826017558, "grad_norm": 0.2931436002254486, "learning_rate": 1.3083663582441192e-05, "loss": 0.0911, "num_tokens": 722618343.0, "step": 5528 }, { "epoch": 2.2063048683160416, "grad_norm": 0.29814717173576355, "learning_rate": 1.3076066204941265e-05, "loss": 0.0938, "num_tokens": 722748436.0, "step": 5529 }, { "epoch": 2.206703910614525, "grad_norm": 0.27837425470352173, "learning_rate": 1.3068471618338066e-05, "loss": 0.1088, "num_tokens": 722879508.0, "step": 5530 }, { "epoch": 2.207102952913009, "grad_norm": 0.25436827540397644, "learning_rate": 1.306087982410108e-05, "loss": 0.0898, "num_tokens": 723010580.0, "step": 5531 }, { "epoch": 2.2075019952114925, "grad_norm": 0.2506464719772339, "learning_rate": 1.3053290823699255e-05, "loss": 0.0917, "num_tokens": 723141652.0, "step": 5532 }, { "epoch": 2.207901037509976, "grad_norm": 0.24820928275585175, "learning_rate": 1.3045704618600976e-05, "loss": 0.0783, "num_tokens": 723272724.0, "step": 5533 }, { "epoch": 2.2083000798084598, "grad_norm": 0.2716352641582489, "learning_rate": 1.3038121210274115e-05, "loss": 0.0814, "num_tokens": 723403796.0, "step": 5534 }, { "epoch": 2.2086991221069434, "grad_norm": 0.33272725343704224, "learning_rate": 1.3030540600186e-05, "loss": 0.1054, "num_tokens": 723534868.0, "step": 5535 }, { "epoch": 2.209098164405427, "grad_norm": 0.21947580575942993, "learning_rate": 1.3022962789803383e-05, "loss": 0.0704, "num_tokens": 723665940.0, "step": 5536 }, { "epoch": 2.2094972067039107, "grad_norm": 0.2504221498966217, "learning_rate": 1.3015387780592513e-05, "loss": 0.0896, "num_tokens": 723797012.0, "step": 5537 }, { "epoch": 2.2098962490023943, "grad_norm": 0.2882482409477234, "learning_rate": 1.3007815574019089e-05, "loss": 0.0975, "num_tokens": 723928084.0, "step": 5538 }, { "epoch": 2.210295291300878, "grad_norm": 0.3142159879207611, "learning_rate": 1.3000246171548253e-05, "loss": 0.0756, "num_tokens": 724059156.0, "step": 5539 }, { "epoch": 2.2106943335993616, "grad_norm": 0.2362218052148819, "learning_rate": 1.2992679574644617e-05, "loss": 0.0749, "num_tokens": 724190228.0, "step": 5540 }, { "epoch": 2.2110933758978453, "grad_norm": 0.22716060280799866, "learning_rate": 1.2985115784772256e-05, "loss": 0.0751, "num_tokens": 724321300.0, "step": 5541 }, { "epoch": 2.211492418196329, "grad_norm": 0.27317312359809875, "learning_rate": 1.2977554803394676e-05, "loss": 0.0859, "num_tokens": 724452372.0, "step": 5542 }, { "epoch": 2.2118914604948126, "grad_norm": 0.2431045025587082, "learning_rate": 1.2969996631974862e-05, "loss": 0.0896, "num_tokens": 724583444.0, "step": 5543 }, { "epoch": 2.212290502793296, "grad_norm": 0.2618215084075928, "learning_rate": 1.2962441271975261e-05, "loss": 0.0831, "num_tokens": 724714516.0, "step": 5544 }, { "epoch": 2.21268954509178, "grad_norm": 0.2843715250492096, "learning_rate": 1.2954888724857756e-05, "loss": 0.0919, "num_tokens": 724845588.0, "step": 5545 }, { "epoch": 2.2130885873902635, "grad_norm": 0.2815777659416199, "learning_rate": 1.2947338992083685e-05, "loss": 0.102, "num_tokens": 724962173.0, "step": 5546 }, { "epoch": 2.213487629688747, "grad_norm": 0.26413494348526, "learning_rate": 1.2939792075113876e-05, "loss": 0.0953, "num_tokens": 725093245.0, "step": 5547 }, { "epoch": 2.2138866719872308, "grad_norm": 0.2710987627506256, "learning_rate": 1.2932247975408568e-05, "loss": 0.0794, "num_tokens": 725208253.0, "step": 5548 }, { "epoch": 2.2142857142857144, "grad_norm": 0.21818433701992035, "learning_rate": 1.2924706694427471e-05, "loss": 0.0679, "num_tokens": 725339325.0, "step": 5549 }, { "epoch": 2.214684756584198, "grad_norm": 0.25729992985725403, "learning_rate": 1.2917168233629762e-05, "loss": 0.0842, "num_tokens": 725470397.0, "step": 5550 }, { "epoch": 2.2150837988826817, "grad_norm": 0.2234911173582077, "learning_rate": 1.2909632594474063e-05, "loss": 0.0731, "num_tokens": 725601469.0, "step": 5551 }, { "epoch": 2.2154828411811653, "grad_norm": 0.24604903161525726, "learning_rate": 1.2902099778418447e-05, "loss": 0.0738, "num_tokens": 725732541.0, "step": 5552 }, { "epoch": 2.215881883479649, "grad_norm": 0.25128552317619324, "learning_rate": 1.2894569786920436e-05, "loss": 0.084, "num_tokens": 725863613.0, "step": 5553 }, { "epoch": 2.2162809257781326, "grad_norm": 0.21487018465995789, "learning_rate": 1.2887042621437035e-05, "loss": 0.0704, "num_tokens": 725994685.0, "step": 5554 }, { "epoch": 2.2166799680766163, "grad_norm": 0.3246888518333435, "learning_rate": 1.2879518283424654e-05, "loss": 0.1267, "num_tokens": 726125757.0, "step": 5555 }, { "epoch": 2.2170790103751, "grad_norm": 0.2705341577529907, "learning_rate": 1.2871996774339196e-05, "loss": 0.0907, "num_tokens": 726256829.0, "step": 5556 }, { "epoch": 2.2174780526735836, "grad_norm": 0.27992966771125793, "learning_rate": 1.2864478095636012e-05, "loss": 0.0914, "num_tokens": 726387901.0, "step": 5557 }, { "epoch": 2.217877094972067, "grad_norm": 0.26464053988456726, "learning_rate": 1.2856962248769882e-05, "loss": 0.0818, "num_tokens": 726518973.0, "step": 5558 }, { "epoch": 2.218276137270551, "grad_norm": 0.2467387318611145, "learning_rate": 1.2849449235195052e-05, "loss": 0.0942, "num_tokens": 726650045.0, "step": 5559 }, { "epoch": 2.2186751795690345, "grad_norm": 0.2290252447128296, "learning_rate": 1.2841939056365231e-05, "loss": 0.0842, "num_tokens": 726781117.0, "step": 5560 }, { "epoch": 2.219074221867518, "grad_norm": 0.2533304989337921, "learning_rate": 1.2834431713733553e-05, "loss": 0.0774, "num_tokens": 726912189.0, "step": 5561 }, { "epoch": 2.2194732641660018, "grad_norm": 0.22400949895381927, "learning_rate": 1.2826927208752628e-05, "loss": 0.0819, "num_tokens": 727043261.0, "step": 5562 }, { "epoch": 2.2198723064644854, "grad_norm": 0.25428900122642517, "learning_rate": 1.2819425542874517e-05, "loss": 0.0891, "num_tokens": 727174333.0, "step": 5563 }, { "epoch": 2.220271348762969, "grad_norm": 0.25821396708488464, "learning_rate": 1.2811926717550705e-05, "loss": 0.1048, "num_tokens": 727305405.0, "step": 5564 }, { "epoch": 2.2206703910614527, "grad_norm": 0.24597761034965515, "learning_rate": 1.2804430734232148e-05, "loss": 0.0875, "num_tokens": 727436477.0, "step": 5565 }, { "epoch": 2.2210694333599363, "grad_norm": 0.30077075958251953, "learning_rate": 1.2796937594369267e-05, "loss": 0.0996, "num_tokens": 727567549.0, "step": 5566 }, { "epoch": 2.22146847565842, "grad_norm": 0.28985148668289185, "learning_rate": 1.2789447299411891e-05, "loss": 0.0977, "num_tokens": 727698621.0, "step": 5567 }, { "epoch": 2.2218675179569036, "grad_norm": 0.19541506469249725, "learning_rate": 1.2781959850809334e-05, "loss": 0.0406, "num_tokens": 727829693.0, "step": 5568 }, { "epoch": 2.2222665602553873, "grad_norm": 0.2685944736003876, "learning_rate": 1.277447525001035e-05, "loss": 0.0798, "num_tokens": 727960765.0, "step": 5569 }, { "epoch": 2.222665602553871, "grad_norm": 0.24102580547332764, "learning_rate": 1.2766993498463142e-05, "loss": 0.0807, "num_tokens": 728091837.0, "step": 5570 }, { "epoch": 2.2230646448523546, "grad_norm": 0.26060721278190613, "learning_rate": 1.2759514597615341e-05, "loss": 0.0872, "num_tokens": 728222909.0, "step": 5571 }, { "epoch": 2.223463687150838, "grad_norm": 0.28165876865386963, "learning_rate": 1.2752038548914055e-05, "loss": 0.0864, "num_tokens": 728353981.0, "step": 5572 }, { "epoch": 2.223862729449322, "grad_norm": 0.2422114759683609, "learning_rate": 1.2744565353805842e-05, "loss": 0.0847, "num_tokens": 728485053.0, "step": 5573 }, { "epoch": 2.2242617717478055, "grad_norm": 0.27128565311431885, "learning_rate": 1.2737095013736674e-05, "loss": 0.0864, "num_tokens": 728616125.0, "step": 5574 }, { "epoch": 2.224660814046289, "grad_norm": 0.22362683713436127, "learning_rate": 1.2729627530152006e-05, "loss": 0.0704, "num_tokens": 728747197.0, "step": 5575 }, { "epoch": 2.2250598563447728, "grad_norm": 0.2531062364578247, "learning_rate": 1.2722162904496726e-05, "loss": 0.0811, "num_tokens": 728878269.0, "step": 5576 }, { "epoch": 2.2254588986432564, "grad_norm": 0.27446451783180237, "learning_rate": 1.2714701138215163e-05, "loss": 0.0803, "num_tokens": 729009341.0, "step": 5577 }, { "epoch": 2.22585794094174, "grad_norm": 0.2560904622077942, "learning_rate": 1.2707242232751102e-05, "loss": 0.0918, "num_tokens": 729140413.0, "step": 5578 }, { "epoch": 2.2262569832402237, "grad_norm": 0.2748163342475891, "learning_rate": 1.269978618954778e-05, "loss": 0.1012, "num_tokens": 729271485.0, "step": 5579 }, { "epoch": 2.226656025538707, "grad_norm": 0.2136528491973877, "learning_rate": 1.2692333010047866e-05, "loss": 0.0656, "num_tokens": 729402557.0, "step": 5580 }, { "epoch": 2.2270550678371905, "grad_norm": 0.21852412819862366, "learning_rate": 1.2684882695693467e-05, "loss": 0.0807, "num_tokens": 729533629.0, "step": 5581 }, { "epoch": 2.227454110135674, "grad_norm": 0.2539307177066803, "learning_rate": 1.2677435247926175e-05, "loss": 0.0808, "num_tokens": 729664701.0, "step": 5582 }, { "epoch": 2.227853152434158, "grad_norm": 0.24204058945178986, "learning_rate": 1.2669990668186979e-05, "loss": 0.0873, "num_tokens": 729795773.0, "step": 5583 }, { "epoch": 2.2282521947326415, "grad_norm": 0.25490501523017883, "learning_rate": 1.2662548957916342e-05, "loss": 0.0768, "num_tokens": 729926845.0, "step": 5584 }, { "epoch": 2.228651237031125, "grad_norm": 0.29674232006073, "learning_rate": 1.2655110118554181e-05, "loss": 0.0936, "num_tokens": 730057917.0, "step": 5585 }, { "epoch": 2.2290502793296088, "grad_norm": 0.26204153895378113, "learning_rate": 1.264767415153982e-05, "loss": 0.0735, "num_tokens": 730188989.0, "step": 5586 }, { "epoch": 2.2294493216280924, "grad_norm": 0.27590692043304443, "learning_rate": 1.2640241058312057e-05, "loss": 0.0942, "num_tokens": 730320061.0, "step": 5587 }, { "epoch": 2.229848363926576, "grad_norm": 0.28087547421455383, "learning_rate": 1.263281084030914e-05, "loss": 0.1041, "num_tokens": 730451133.0, "step": 5588 }, { "epoch": 2.2302474062250597, "grad_norm": 0.3004404306411743, "learning_rate": 1.2625383498968723e-05, "loss": 0.111, "num_tokens": 730582205.0, "step": 5589 }, { "epoch": 2.2306464485235433, "grad_norm": 0.2917776107788086, "learning_rate": 1.2617959035727944e-05, "loss": 0.0993, "num_tokens": 730713277.0, "step": 5590 }, { "epoch": 2.231045490822027, "grad_norm": 0.2693692743778229, "learning_rate": 1.261053745202337e-05, "loss": 0.1131, "num_tokens": 730844349.0, "step": 5591 }, { "epoch": 2.2314445331205106, "grad_norm": 0.2910778522491455, "learning_rate": 1.2603118749290999e-05, "loss": 0.0998, "num_tokens": 730975421.0, "step": 5592 }, { "epoch": 2.2318435754189943, "grad_norm": 0.247566819190979, "learning_rate": 1.2595702928966277e-05, "loss": 0.0832, "num_tokens": 731106493.0, "step": 5593 }, { "epoch": 2.232242617717478, "grad_norm": 0.24267052114009857, "learning_rate": 1.2588289992484097e-05, "loss": 0.0784, "num_tokens": 731237565.0, "step": 5594 }, { "epoch": 2.2326416600159615, "grad_norm": 0.2535766661167145, "learning_rate": 1.2580879941278811e-05, "loss": 0.0804, "num_tokens": 731368637.0, "step": 5595 }, { "epoch": 2.233040702314445, "grad_norm": 0.2851705551147461, "learning_rate": 1.257347277678418e-05, "loss": 0.1037, "num_tokens": 731499709.0, "step": 5596 }, { "epoch": 2.233439744612929, "grad_norm": 0.228702574968338, "learning_rate": 1.256606850043342e-05, "loss": 0.0804, "num_tokens": 731630781.0, "step": 5597 }, { "epoch": 2.2338387869114125, "grad_norm": 0.2361905425786972, "learning_rate": 1.255866711365919e-05, "loss": 0.0801, "num_tokens": 731761853.0, "step": 5598 }, { "epoch": 2.234237829209896, "grad_norm": 0.23459413647651672, "learning_rate": 1.2551268617893591e-05, "loss": 0.0776, "num_tokens": 731892925.0, "step": 5599 }, { "epoch": 2.2346368715083798, "grad_norm": 0.2719022333621979, "learning_rate": 1.2543873014568175e-05, "loss": 0.1075, "num_tokens": 732023997.0, "step": 5600 }, { "epoch": 2.2350359138068634, "grad_norm": 0.23220141232013702, "learning_rate": 1.2536480305113902e-05, "loss": 0.0762, "num_tokens": 732155069.0, "step": 5601 }, { "epoch": 2.235434956105347, "grad_norm": 0.2911236882209778, "learning_rate": 1.25290904909612e-05, "loss": 0.0907, "num_tokens": 732286141.0, "step": 5602 }, { "epoch": 2.2358339984038307, "grad_norm": 0.271823912858963, "learning_rate": 1.252170357353994e-05, "loss": 0.0854, "num_tokens": 732417213.0, "step": 5603 }, { "epoch": 2.2362330407023143, "grad_norm": 0.2389519065618515, "learning_rate": 1.2514319554279413e-05, "loss": 0.0659, "num_tokens": 732548285.0, "step": 5604 }, { "epoch": 2.236632083000798, "grad_norm": 0.29575082659721375, "learning_rate": 1.2506938434608351e-05, "loss": 0.1017, "num_tokens": 732679357.0, "step": 5605 }, { "epoch": 2.2370311252992816, "grad_norm": 0.24885442852973938, "learning_rate": 1.2499560215954936e-05, "loss": 0.0807, "num_tokens": 732810429.0, "step": 5606 }, { "epoch": 2.2374301675977653, "grad_norm": 0.23958462476730347, "learning_rate": 1.2492184899746794e-05, "loss": 0.0765, "num_tokens": 732941501.0, "step": 5607 }, { "epoch": 2.237829209896249, "grad_norm": 0.22491878271102905, "learning_rate": 1.248481248741097e-05, "loss": 0.0783, "num_tokens": 733072573.0, "step": 5608 }, { "epoch": 2.2382282521947325, "grad_norm": 0.25810369849205017, "learning_rate": 1.2477442980373956e-05, "loss": 0.086, "num_tokens": 733203645.0, "step": 5609 }, { "epoch": 2.238627294493216, "grad_norm": 0.25954675674438477, "learning_rate": 1.2470076380061697e-05, "loss": 0.0877, "num_tokens": 733334717.0, "step": 5610 }, { "epoch": 2.2390263367917, "grad_norm": 0.23253211379051208, "learning_rate": 1.2462712687899542e-05, "loss": 0.0706, "num_tokens": 733463973.0, "step": 5611 }, { "epoch": 2.2394253790901835, "grad_norm": 0.25098443031311035, "learning_rate": 1.2455351905312304e-05, "loss": 0.083, "num_tokens": 733595045.0, "step": 5612 }, { "epoch": 2.239824421388667, "grad_norm": 0.2315799444913864, "learning_rate": 1.2447994033724238e-05, "loss": 0.0643, "num_tokens": 733726117.0, "step": 5613 }, { "epoch": 2.2402234636871508, "grad_norm": 0.2597739100456238, "learning_rate": 1.2440639074559012e-05, "loss": 0.1023, "num_tokens": 733857189.0, "step": 5614 }, { "epoch": 2.2406225059856344, "grad_norm": 0.277934193611145, "learning_rate": 1.2433287029239736e-05, "loss": 0.0962, "num_tokens": 733988261.0, "step": 5615 }, { "epoch": 2.241021548284118, "grad_norm": 0.2383849322795868, "learning_rate": 1.242593789918898e-05, "loss": 0.0696, "num_tokens": 734119333.0, "step": 5616 }, { "epoch": 2.2414205905826017, "grad_norm": 0.2286609560251236, "learning_rate": 1.241859168582871e-05, "loss": 0.0659, "num_tokens": 734250405.0, "step": 5617 }, { "epoch": 2.2418196328810853, "grad_norm": 0.29611265659332275, "learning_rate": 1.2411248390580365e-05, "loss": 0.0906, "num_tokens": 734381477.0, "step": 5618 }, { "epoch": 2.242218675179569, "grad_norm": 0.2545422315597534, "learning_rate": 1.2403908014864807e-05, "loss": 0.0875, "num_tokens": 734512549.0, "step": 5619 }, { "epoch": 2.2426177174780526, "grad_norm": 0.21842247247695923, "learning_rate": 1.2396570560102316e-05, "loss": 0.0643, "num_tokens": 734643621.0, "step": 5620 }, { "epoch": 2.2430167597765363, "grad_norm": 0.22235234081745148, "learning_rate": 1.2389236027712631e-05, "loss": 0.0615, "num_tokens": 734774693.0, "step": 5621 }, { "epoch": 2.24341580207502, "grad_norm": 0.2957931458950043, "learning_rate": 1.2381904419114921e-05, "loss": 0.1038, "num_tokens": 734905765.0, "step": 5622 }, { "epoch": 2.2438148443735035, "grad_norm": 0.25971725583076477, "learning_rate": 1.2374575735727768e-05, "loss": 0.0856, "num_tokens": 735036837.0, "step": 5623 }, { "epoch": 2.244213886671987, "grad_norm": 0.23747040331363678, "learning_rate": 1.2367249978969215e-05, "loss": 0.0711, "num_tokens": 735167909.0, "step": 5624 }, { "epoch": 2.244612928970471, "grad_norm": 0.2574291229248047, "learning_rate": 1.235992715025673e-05, "loss": 0.1007, "num_tokens": 735298981.0, "step": 5625 }, { "epoch": 2.2450119712689545, "grad_norm": 0.2963469624519348, "learning_rate": 1.2352607251007211e-05, "loss": 0.0991, "num_tokens": 735430053.0, "step": 5626 }, { "epoch": 2.245411013567438, "grad_norm": 0.28111520409584045, "learning_rate": 1.2345290282636976e-05, "loss": 0.1048, "num_tokens": 735561125.0, "step": 5627 }, { "epoch": 2.2458100558659218, "grad_norm": 0.2537689507007599, "learning_rate": 1.2337976246561805e-05, "loss": 0.0857, "num_tokens": 735692197.0, "step": 5628 }, { "epoch": 2.2462090981644054, "grad_norm": 0.2614124119281769, "learning_rate": 1.2330665144196893e-05, "loss": 0.0866, "num_tokens": 735823269.0, "step": 5629 }, { "epoch": 2.246608140462889, "grad_norm": 0.25831055641174316, "learning_rate": 1.2323356976956867e-05, "loss": 0.0885, "num_tokens": 735954341.0, "step": 5630 }, { "epoch": 2.2470071827613727, "grad_norm": 0.27435559034347534, "learning_rate": 1.2316051746255788e-05, "loss": 0.0899, "num_tokens": 736085413.0, "step": 5631 }, { "epoch": 2.2474062250598563, "grad_norm": 0.30205926299095154, "learning_rate": 1.2308749453507163e-05, "loss": 0.1118, "num_tokens": 736216485.0, "step": 5632 }, { "epoch": 2.24780526735834, "grad_norm": 0.2824940085411072, "learning_rate": 1.2301450100123898e-05, "loss": 0.0868, "num_tokens": 736347557.0, "step": 5633 }, { "epoch": 2.2482043096568236, "grad_norm": 0.2558128237724304, "learning_rate": 1.2294153687518357e-05, "loss": 0.0764, "num_tokens": 736478629.0, "step": 5634 }, { "epoch": 2.2486033519553073, "grad_norm": 0.2734002470970154, "learning_rate": 1.2286860217102344e-05, "loss": 0.0924, "num_tokens": 736609701.0, "step": 5635 }, { "epoch": 2.249002394253791, "grad_norm": 0.26251259446144104, "learning_rate": 1.2279569690287053e-05, "loss": 0.0888, "num_tokens": 736740773.0, "step": 5636 }, { "epoch": 2.2494014365522745, "grad_norm": 0.23679479956626892, "learning_rate": 1.227228210848315e-05, "loss": 0.0741, "num_tokens": 736871845.0, "step": 5637 }, { "epoch": 2.249800478850758, "grad_norm": 0.26547175645828247, "learning_rate": 1.2264997473100712e-05, "loss": 0.0883, "num_tokens": 737002917.0, "step": 5638 }, { "epoch": 2.250199521149242, "grad_norm": 0.2586525082588196, "learning_rate": 1.2257715785549233e-05, "loss": 0.0895, "num_tokens": 737133989.0, "step": 5639 }, { "epoch": 2.2505985634477255, "grad_norm": 0.2695193886756897, "learning_rate": 1.2250437047237665e-05, "loss": 0.0952, "num_tokens": 737265061.0, "step": 5640 }, { "epoch": 2.250997605746209, "grad_norm": 0.24728737771511078, "learning_rate": 1.224316125957438e-05, "loss": 0.0737, "num_tokens": 737396133.0, "step": 5641 }, { "epoch": 2.2513966480446927, "grad_norm": 0.2617636024951935, "learning_rate": 1.2235888423967164e-05, "loss": 0.0908, "num_tokens": 737527205.0, "step": 5642 }, { "epoch": 2.2517956903431764, "grad_norm": 0.2804737687110901, "learning_rate": 1.2228618541823245e-05, "loss": 0.1033, "num_tokens": 737658277.0, "step": 5643 }, { "epoch": 2.25219473264166, "grad_norm": 0.26550209522247314, "learning_rate": 1.2221351614549287e-05, "loss": 0.0908, "num_tokens": 737789349.0, "step": 5644 }, { "epoch": 2.2525937749401437, "grad_norm": 0.31612110137939453, "learning_rate": 1.221408764355136e-05, "loss": 0.115, "num_tokens": 737920421.0, "step": 5645 }, { "epoch": 2.2529928172386273, "grad_norm": 0.2905711531639099, "learning_rate": 1.220682663023498e-05, "loss": 0.1017, "num_tokens": 738051493.0, "step": 5646 }, { "epoch": 2.253391859537111, "grad_norm": 0.2202995866537094, "learning_rate": 1.2199568576005094e-05, "loss": 0.0684, "num_tokens": 738182565.0, "step": 5647 }, { "epoch": 2.2537909018355946, "grad_norm": 0.21919895708560944, "learning_rate": 1.219231348226606e-05, "loss": 0.0609, "num_tokens": 738313637.0, "step": 5648 }, { "epoch": 2.2541899441340782, "grad_norm": 0.22904255986213684, "learning_rate": 1.218506135042166e-05, "loss": 0.0689, "num_tokens": 738444709.0, "step": 5649 }, { "epoch": 2.254588986432562, "grad_norm": 0.22145222127437592, "learning_rate": 1.2177812181875128e-05, "loss": 0.0687, "num_tokens": 738575781.0, "step": 5650 }, { "epoch": 2.2549880287310455, "grad_norm": 0.2787730097770691, "learning_rate": 1.2170565978029114e-05, "loss": 0.1053, "num_tokens": 738706853.0, "step": 5651 }, { "epoch": 2.255387071029529, "grad_norm": 0.26685285568237305, "learning_rate": 1.2163322740285677e-05, "loss": 0.0966, "num_tokens": 738837925.0, "step": 5652 }, { "epoch": 2.255786113328013, "grad_norm": 0.24138972163200378, "learning_rate": 1.215608247004632e-05, "loss": 0.0784, "num_tokens": 738968997.0, "step": 5653 }, { "epoch": 2.2561851556264965, "grad_norm": 0.2574106752872467, "learning_rate": 1.2148845168711983e-05, "loss": 0.0886, "num_tokens": 739100069.0, "step": 5654 }, { "epoch": 2.25658419792498, "grad_norm": 0.3616604804992676, "learning_rate": 1.2141610837682992e-05, "loss": 0.1184, "num_tokens": 739231141.0, "step": 5655 }, { "epoch": 2.2569832402234637, "grad_norm": 0.2944536805152893, "learning_rate": 1.2134379478359139e-05, "loss": 0.0962, "num_tokens": 739362213.0, "step": 5656 }, { "epoch": 2.2573822825219474, "grad_norm": 0.22803613543510437, "learning_rate": 1.2127151092139626e-05, "loss": 0.0737, "num_tokens": 739493285.0, "step": 5657 }, { "epoch": 2.257781324820431, "grad_norm": 0.27919501066207886, "learning_rate": 1.2119925680423065e-05, "loss": 0.0992, "num_tokens": 739624357.0, "step": 5658 }, { "epoch": 2.2581803671189147, "grad_norm": 0.24759003520011902, "learning_rate": 1.2112703244607523e-05, "loss": 0.0835, "num_tokens": 739755429.0, "step": 5659 }, { "epoch": 2.2585794094173983, "grad_norm": 0.2305126190185547, "learning_rate": 1.2105483786090463e-05, "loss": 0.0689, "num_tokens": 739886501.0, "step": 5660 }, { "epoch": 2.258978451715882, "grad_norm": 0.23855796456336975, "learning_rate": 1.2098267306268776e-05, "loss": 0.076, "num_tokens": 740017573.0, "step": 5661 }, { "epoch": 2.2593774940143656, "grad_norm": 0.2916867434978485, "learning_rate": 1.2091053806538796e-05, "loss": 0.1094, "num_tokens": 740148645.0, "step": 5662 }, { "epoch": 2.2597765363128492, "grad_norm": 0.2437431961297989, "learning_rate": 1.2083843288296268e-05, "loss": 0.0791, "num_tokens": 740279717.0, "step": 5663 }, { "epoch": 2.260175578611333, "grad_norm": 0.24397604167461395, "learning_rate": 1.207663575293635e-05, "loss": 0.0779, "num_tokens": 740401905.0, "step": 5664 }, { "epoch": 2.2605746209098165, "grad_norm": 0.2514888346195221, "learning_rate": 1.2069431201853636e-05, "loss": 0.0922, "num_tokens": 740532977.0, "step": 5665 }, { "epoch": 2.2609736632083, "grad_norm": 0.24481810629367828, "learning_rate": 1.2062229636442154e-05, "loss": 0.0801, "num_tokens": 740664049.0, "step": 5666 }, { "epoch": 2.261372705506784, "grad_norm": 0.2557942867279053, "learning_rate": 1.205503105809532e-05, "loss": 0.0976, "num_tokens": 740795121.0, "step": 5667 }, { "epoch": 2.2617717478052675, "grad_norm": 0.3092436194419861, "learning_rate": 1.2047835468205999e-05, "loss": 0.0964, "num_tokens": 740926193.0, "step": 5668 }, { "epoch": 2.262170790103751, "grad_norm": 0.25794973969459534, "learning_rate": 1.2040642868166481e-05, "loss": 0.0972, "num_tokens": 741057265.0, "step": 5669 }, { "epoch": 2.2625698324022347, "grad_norm": 0.3397315442562103, "learning_rate": 1.2033453259368448e-05, "loss": 0.0967, "num_tokens": 741188337.0, "step": 5670 }, { "epoch": 2.2629688747007184, "grad_norm": 0.24755224585533142, "learning_rate": 1.2026266643203043e-05, "loss": 0.091, "num_tokens": 741319409.0, "step": 5671 }, { "epoch": 2.263367916999202, "grad_norm": 0.25457534193992615, "learning_rate": 1.2019083021060793e-05, "loss": 0.1008, "num_tokens": 741450481.0, "step": 5672 }, { "epoch": 2.2637669592976857, "grad_norm": 0.24158567190170288, "learning_rate": 1.2011902394331662e-05, "loss": 0.0862, "num_tokens": 741581553.0, "step": 5673 }, { "epoch": 2.2641660015961693, "grad_norm": 0.26654595136642456, "learning_rate": 1.2004724764405042e-05, "loss": 0.0879, "num_tokens": 741712625.0, "step": 5674 }, { "epoch": 2.264565043894653, "grad_norm": 0.24310381710529327, "learning_rate": 1.1997550132669739e-05, "loss": 0.0812, "num_tokens": 741843697.0, "step": 5675 }, { "epoch": 2.2649640861931366, "grad_norm": 0.26471537351608276, "learning_rate": 1.1990378500513967e-05, "loss": 0.0808, "num_tokens": 741974769.0, "step": 5676 }, { "epoch": 2.2653631284916202, "grad_norm": 0.24970580637454987, "learning_rate": 1.1983209869325374e-05, "loss": 0.0694, "num_tokens": 742105841.0, "step": 5677 }, { "epoch": 2.265762170790104, "grad_norm": 0.262268990278244, "learning_rate": 1.1976044240491037e-05, "loss": 0.0833, "num_tokens": 742236913.0, "step": 5678 }, { "epoch": 2.2661612130885875, "grad_norm": 0.3005431890487671, "learning_rate": 1.1968881615397418e-05, "loss": 0.0901, "num_tokens": 742367985.0, "step": 5679 }, { "epoch": 2.266560255387071, "grad_norm": 0.272178590297699, "learning_rate": 1.1961721995430428e-05, "loss": 0.1029, "num_tokens": 742499057.0, "step": 5680 }, { "epoch": 2.266959297685555, "grad_norm": 0.2308669239282608, "learning_rate": 1.1954565381975391e-05, "loss": 0.0709, "num_tokens": 742630129.0, "step": 5681 }, { "epoch": 2.2673583399840385, "grad_norm": 0.27762937545776367, "learning_rate": 1.1947411776417044e-05, "loss": 0.0945, "num_tokens": 742761201.0, "step": 5682 }, { "epoch": 2.267757382282522, "grad_norm": 0.24114815890789032, "learning_rate": 1.1940261180139528e-05, "loss": 0.0835, "num_tokens": 742892273.0, "step": 5683 }, { "epoch": 2.2681564245810057, "grad_norm": 0.29287928342819214, "learning_rate": 1.193311359452643e-05, "loss": 0.0933, "num_tokens": 743023345.0, "step": 5684 }, { "epoch": 2.2685554668794894, "grad_norm": 0.2283366173505783, "learning_rate": 1.1925969020960743e-05, "loss": 0.0702, "num_tokens": 743154417.0, "step": 5685 }, { "epoch": 2.268954509177973, "grad_norm": 0.27485257387161255, "learning_rate": 1.1918827460824865e-05, "loss": 0.0969, "num_tokens": 743285489.0, "step": 5686 }, { "epoch": 2.2693535514764567, "grad_norm": 0.3011326491832733, "learning_rate": 1.191168891550063e-05, "loss": 0.0961, "num_tokens": 743416561.0, "step": 5687 }, { "epoch": 2.2697525937749403, "grad_norm": 0.2717224657535553, "learning_rate": 1.190455338636928e-05, "loss": 0.0894, "num_tokens": 743547633.0, "step": 5688 }, { "epoch": 2.270151636073424, "grad_norm": 0.26019909977912903, "learning_rate": 1.189742087481147e-05, "loss": 0.1021, "num_tokens": 743678705.0, "step": 5689 }, { "epoch": 2.2705506783719076, "grad_norm": 0.2807009816169739, "learning_rate": 1.189029138220727e-05, "loss": 0.1041, "num_tokens": 743809777.0, "step": 5690 }, { "epoch": 2.2709497206703912, "grad_norm": 0.2872382402420044, "learning_rate": 1.1883164909936182e-05, "loss": 0.0949, "num_tokens": 743940849.0, "step": 5691 }, { "epoch": 2.271348762968875, "grad_norm": 0.257282018661499, "learning_rate": 1.1876041459377102e-05, "loss": 0.0942, "num_tokens": 744071921.0, "step": 5692 }, { "epoch": 2.2717478052673585, "grad_norm": 0.2842228412628174, "learning_rate": 1.1868921031908361e-05, "loss": 0.1015, "num_tokens": 744202993.0, "step": 5693 }, { "epoch": 2.2721468475658417, "grad_norm": 0.27696993947029114, "learning_rate": 1.1861803628907691e-05, "loss": 0.0982, "num_tokens": 744326643.0, "step": 5694 }, { "epoch": 2.2725458898643254, "grad_norm": 0.23279549181461334, "learning_rate": 1.1854689251752232e-05, "loss": 0.0774, "num_tokens": 744457715.0, "step": 5695 }, { "epoch": 2.272944932162809, "grad_norm": 0.38120853900909424, "learning_rate": 1.184757790181856e-05, "loss": 0.1253, "num_tokens": 744588787.0, "step": 5696 }, { "epoch": 2.2733439744612927, "grad_norm": 0.27600154280662537, "learning_rate": 1.1840469580482661e-05, "loss": 0.1067, "num_tokens": 744719859.0, "step": 5697 }, { "epoch": 2.2737430167597763, "grad_norm": 0.2244713008403778, "learning_rate": 1.1833364289119913e-05, "loss": 0.0837, "num_tokens": 744850931.0, "step": 5698 }, { "epoch": 2.27414205905826, "grad_norm": 0.25534725189208984, "learning_rate": 1.1826262029105136e-05, "loss": 0.0853, "num_tokens": 744982003.0, "step": 5699 }, { "epoch": 2.2745411013567436, "grad_norm": 0.22289633750915527, "learning_rate": 1.1819162801812553e-05, "loss": 0.079, "num_tokens": 745113075.0, "step": 5700 }, { "epoch": 2.2749401436552272, "grad_norm": 0.2588086426258087, "learning_rate": 1.1812066608615784e-05, "loss": 0.0977, "num_tokens": 745244147.0, "step": 5701 }, { "epoch": 2.275339185953711, "grad_norm": 0.28964582085609436, "learning_rate": 1.180497345088789e-05, "loss": 0.0921, "num_tokens": 745375219.0, "step": 5702 }, { "epoch": 2.2757382282521945, "grad_norm": 0.2550879120826721, "learning_rate": 1.1797883330001327e-05, "loss": 0.084, "num_tokens": 745506291.0, "step": 5703 }, { "epoch": 2.276137270550678, "grad_norm": 0.24393253028392792, "learning_rate": 1.1790796247327964e-05, "loss": 0.0769, "num_tokens": 745637363.0, "step": 5704 }, { "epoch": 2.276536312849162, "grad_norm": 0.23652243614196777, "learning_rate": 1.1783712204239091e-05, "loss": 0.0873, "num_tokens": 745768435.0, "step": 5705 }, { "epoch": 2.2769353551476454, "grad_norm": 0.3166716694831848, "learning_rate": 1.1776631202105398e-05, "loss": 0.097, "num_tokens": 745899507.0, "step": 5706 }, { "epoch": 2.277334397446129, "grad_norm": 0.25061166286468506, "learning_rate": 1.1769553242297001e-05, "loss": 0.0813, "num_tokens": 746030579.0, "step": 5707 }, { "epoch": 2.2777334397446127, "grad_norm": 0.2246667891740799, "learning_rate": 1.1762478326183407e-05, "loss": 0.0755, "num_tokens": 746161651.0, "step": 5708 }, { "epoch": 2.2781324820430964, "grad_norm": 0.27496033906936646, "learning_rate": 1.1755406455133555e-05, "loss": 0.0896, "num_tokens": 746292723.0, "step": 5709 }, { "epoch": 2.27853152434158, "grad_norm": 0.2622411251068115, "learning_rate": 1.1748337630515792e-05, "loss": 0.0979, "num_tokens": 746423795.0, "step": 5710 }, { "epoch": 2.2789305666400637, "grad_norm": 0.24495570361614227, "learning_rate": 1.1741271853697854e-05, "loss": 0.0876, "num_tokens": 746554867.0, "step": 5711 }, { "epoch": 2.2793296089385473, "grad_norm": 0.20915593206882477, "learning_rate": 1.1734209126046912e-05, "loss": 0.0683, "num_tokens": 746685939.0, "step": 5712 }, { "epoch": 2.279728651237031, "grad_norm": 0.3316577076911926, "learning_rate": 1.1727149448929543e-05, "loss": 0.0961, "num_tokens": 746817011.0, "step": 5713 }, { "epoch": 2.2801276935355146, "grad_norm": 0.26709622144699097, "learning_rate": 1.1720092823711714e-05, "loss": 0.0988, "num_tokens": 746948083.0, "step": 5714 }, { "epoch": 2.2805267358339982, "grad_norm": 0.2506950795650482, "learning_rate": 1.171303925175883e-05, "loss": 0.0887, "num_tokens": 747079155.0, "step": 5715 }, { "epoch": 2.280925778132482, "grad_norm": 0.21843764185905457, "learning_rate": 1.1705988734435699e-05, "loss": 0.0634, "num_tokens": 747210227.0, "step": 5716 }, { "epoch": 2.2813248204309655, "grad_norm": 0.25258103013038635, "learning_rate": 1.1698941273106505e-05, "loss": 0.0869, "num_tokens": 747341299.0, "step": 5717 }, { "epoch": 2.281723862729449, "grad_norm": 0.22691737115383148, "learning_rate": 1.1691896869134876e-05, "loss": 0.0763, "num_tokens": 747472371.0, "step": 5718 }, { "epoch": 2.282122905027933, "grad_norm": 0.261686235666275, "learning_rate": 1.168485552388385e-05, "loss": 0.0858, "num_tokens": 747603443.0, "step": 5719 }, { "epoch": 2.2825219473264164, "grad_norm": 0.24332818388938904, "learning_rate": 1.1677817238715846e-05, "loss": 0.0879, "num_tokens": 747734515.0, "step": 5720 }, { "epoch": 2.2829209896249, "grad_norm": 0.24918079376220703, "learning_rate": 1.1670782014992714e-05, "loss": 0.0839, "num_tokens": 747865587.0, "step": 5721 }, { "epoch": 2.2833200319233837, "grad_norm": 0.3215422034263611, "learning_rate": 1.1663749854075712e-05, "loss": 0.0885, "num_tokens": 747996659.0, "step": 5722 }, { "epoch": 2.2837190742218674, "grad_norm": 0.2953811287879944, "learning_rate": 1.1656720757325481e-05, "loss": 0.0786, "num_tokens": 748127731.0, "step": 5723 }, { "epoch": 2.284118116520351, "grad_norm": 0.28416162729263306, "learning_rate": 1.1649694726102096e-05, "loss": 0.0927, "num_tokens": 748258803.0, "step": 5724 }, { "epoch": 2.2845171588188347, "grad_norm": 0.2264484465122223, "learning_rate": 1.1642671761765034e-05, "loss": 0.0737, "num_tokens": 748389875.0, "step": 5725 }, { "epoch": 2.2849162011173183, "grad_norm": 0.28358718752861023, "learning_rate": 1.1635651865673156e-05, "loss": 0.0945, "num_tokens": 748520947.0, "step": 5726 }, { "epoch": 2.285315243415802, "grad_norm": 0.2255762219429016, "learning_rate": 1.1628635039184767e-05, "loss": 0.0788, "num_tokens": 748652019.0, "step": 5727 }, { "epoch": 2.2857142857142856, "grad_norm": 0.2641640901565552, "learning_rate": 1.1621621283657537e-05, "loss": 0.0909, "num_tokens": 748783091.0, "step": 5728 }, { "epoch": 2.2861133280127692, "grad_norm": 0.32051801681518555, "learning_rate": 1.1614610600448581e-05, "loss": 0.1097, "num_tokens": 748914163.0, "step": 5729 }, { "epoch": 2.286512370311253, "grad_norm": 0.29099464416503906, "learning_rate": 1.1607602990914384e-05, "loss": 0.0989, "num_tokens": 749045235.0, "step": 5730 }, { "epoch": 2.2869114126097365, "grad_norm": 0.29106032848358154, "learning_rate": 1.1600598456410862e-05, "loss": 0.0909, "num_tokens": 749176307.0, "step": 5731 }, { "epoch": 2.28731045490822, "grad_norm": 0.2127559632062912, "learning_rate": 1.1593596998293334e-05, "loss": 0.0564, "num_tokens": 749307379.0, "step": 5732 }, { "epoch": 2.287709497206704, "grad_norm": 0.25363320112228394, "learning_rate": 1.1586598617916501e-05, "loss": 0.08, "num_tokens": 749438451.0, "step": 5733 }, { "epoch": 2.2881085395051874, "grad_norm": 0.22807207703590393, "learning_rate": 1.1579603316634499e-05, "loss": 0.0807, "num_tokens": 749567786.0, "step": 5734 }, { "epoch": 2.288507581803671, "grad_norm": 0.25166717171669006, "learning_rate": 1.1572611095800843e-05, "loss": 0.0732, "num_tokens": 749690851.0, "step": 5735 }, { "epoch": 2.2889066241021547, "grad_norm": 0.2512472867965698, "learning_rate": 1.1565621956768465e-05, "loss": 0.0816, "num_tokens": 749821923.0, "step": 5736 }, { "epoch": 2.2893056664006384, "grad_norm": 0.2967715561389923, "learning_rate": 1.1558635900889705e-05, "loss": 0.0897, "num_tokens": 749952995.0, "step": 5737 }, { "epoch": 2.289704708699122, "grad_norm": 0.25441956520080566, "learning_rate": 1.155165292951629e-05, "loss": 0.0896, "num_tokens": 750084067.0, "step": 5738 }, { "epoch": 2.2901037509976057, "grad_norm": 0.2740528881549835, "learning_rate": 1.1544673043999373e-05, "loss": 0.0943, "num_tokens": 750215139.0, "step": 5739 }, { "epoch": 2.2905027932960893, "grad_norm": 0.2809271812438965, "learning_rate": 1.1537696245689478e-05, "loss": 0.0937, "num_tokens": 750346211.0, "step": 5740 }, { "epoch": 2.290901835594573, "grad_norm": 0.3063168227672577, "learning_rate": 1.1530722535936572e-05, "loss": 0.1057, "num_tokens": 750477283.0, "step": 5741 }, { "epoch": 2.2913008778930566, "grad_norm": 0.29423436522483826, "learning_rate": 1.152375191608998e-05, "loss": 0.0861, "num_tokens": 750608355.0, "step": 5742 }, { "epoch": 2.2916999201915402, "grad_norm": 0.24757830798625946, "learning_rate": 1.1516784387498463e-05, "loss": 0.0762, "num_tokens": 750739427.0, "step": 5743 }, { "epoch": 2.292098962490024, "grad_norm": 0.256295382976532, "learning_rate": 1.1509819951510187e-05, "loss": 0.0954, "num_tokens": 750870499.0, "step": 5744 }, { "epoch": 2.2924980047885075, "grad_norm": 0.2615513801574707, "learning_rate": 1.1502858609472679e-05, "loss": 0.0806, "num_tokens": 751001571.0, "step": 5745 }, { "epoch": 2.292897047086991, "grad_norm": 0.21977290511131287, "learning_rate": 1.1495900362732908e-05, "loss": 0.0736, "num_tokens": 751132643.0, "step": 5746 }, { "epoch": 2.293296089385475, "grad_norm": 0.253801167011261, "learning_rate": 1.1488945212637234e-05, "loss": 0.0914, "num_tokens": 751263715.0, "step": 5747 }, { "epoch": 2.2936951316839584, "grad_norm": 0.2503456473350525, "learning_rate": 1.1481993160531404e-05, "loss": 0.092, "num_tokens": 751394787.0, "step": 5748 }, { "epoch": 2.294094173982442, "grad_norm": 0.2897633910179138, "learning_rate": 1.1475044207760578e-05, "loss": 0.0938, "num_tokens": 751509889.0, "step": 5749 }, { "epoch": 2.2944932162809257, "grad_norm": 0.3229263126850128, "learning_rate": 1.146809835566933e-05, "loss": 0.0901, "num_tokens": 751640961.0, "step": 5750 }, { "epoch": 2.2948922585794094, "grad_norm": 0.23606353998184204, "learning_rate": 1.1461155605601588e-05, "loss": 0.0883, "num_tokens": 751772033.0, "step": 5751 }, { "epoch": 2.295291300877893, "grad_norm": 0.30329227447509766, "learning_rate": 1.1454215958900725e-05, "loss": 0.1153, "num_tokens": 751903105.0, "step": 5752 }, { "epoch": 2.2956903431763767, "grad_norm": 0.25604331493377686, "learning_rate": 1.1447279416909502e-05, "loss": 0.0786, "num_tokens": 752034177.0, "step": 5753 }, { "epoch": 2.2960893854748603, "grad_norm": 0.28138279914855957, "learning_rate": 1.1440345980970065e-05, "loss": 0.0876, "num_tokens": 752165249.0, "step": 5754 }, { "epoch": 2.296488427773344, "grad_norm": 0.27236419916152954, "learning_rate": 1.1433415652423974e-05, "loss": 0.0923, "num_tokens": 752296321.0, "step": 5755 }, { "epoch": 2.2968874700718276, "grad_norm": 0.2431136518716812, "learning_rate": 1.1426488432612192e-05, "loss": 0.0762, "num_tokens": 752427393.0, "step": 5756 }, { "epoch": 2.2972865123703112, "grad_norm": 0.29786416888237, "learning_rate": 1.1419564322875054e-05, "loss": 0.0941, "num_tokens": 752558465.0, "step": 5757 }, { "epoch": 2.297685554668795, "grad_norm": 0.2684553563594818, "learning_rate": 1.1412643324552318e-05, "loss": 0.0951, "num_tokens": 752689537.0, "step": 5758 }, { "epoch": 2.2980845969672785, "grad_norm": 0.24016232788562775, "learning_rate": 1.1405725438983144e-05, "loss": 0.078, "num_tokens": 752820609.0, "step": 5759 }, { "epoch": 2.298483639265762, "grad_norm": 0.24640780687332153, "learning_rate": 1.139881066750606e-05, "loss": 0.0819, "num_tokens": 752951681.0, "step": 5760 }, { "epoch": 2.298882681564246, "grad_norm": 0.27494046092033386, "learning_rate": 1.1391899011459027e-05, "loss": 0.0845, "num_tokens": 753082753.0, "step": 5761 }, { "epoch": 2.2992817238627294, "grad_norm": 0.2607726752758026, "learning_rate": 1.1384990472179372e-05, "loss": 0.0993, "num_tokens": 753213825.0, "step": 5762 }, { "epoch": 2.299680766161213, "grad_norm": 0.25995370745658875, "learning_rate": 1.1378085051003843e-05, "loss": 0.0904, "num_tokens": 753344897.0, "step": 5763 }, { "epoch": 2.3000798084596967, "grad_norm": 0.27892005443573, "learning_rate": 1.1371182749268562e-05, "loss": 0.0921, "num_tokens": 753475969.0, "step": 5764 }, { "epoch": 2.3004788507581804, "grad_norm": 0.22955462336540222, "learning_rate": 1.1364283568309069e-05, "loss": 0.0696, "num_tokens": 753607041.0, "step": 5765 }, { "epoch": 2.300877893056664, "grad_norm": 0.2338985800743103, "learning_rate": 1.1357387509460297e-05, "loss": 0.0818, "num_tokens": 753738113.0, "step": 5766 }, { "epoch": 2.3012769353551477, "grad_norm": 0.21577054262161255, "learning_rate": 1.1350494574056559e-05, "loss": 0.062, "num_tokens": 753869185.0, "step": 5767 }, { "epoch": 2.3016759776536313, "grad_norm": 0.3153913617134094, "learning_rate": 1.1343604763431575e-05, "loss": 0.0996, "num_tokens": 754000257.0, "step": 5768 }, { "epoch": 2.302075019952115, "grad_norm": 0.2041197568178177, "learning_rate": 1.1336718078918467e-05, "loss": 0.0604, "num_tokens": 754131329.0, "step": 5769 }, { "epoch": 2.3024740622505986, "grad_norm": 0.2804412245750427, "learning_rate": 1.132983452184973e-05, "loss": 0.0833, "num_tokens": 754262401.0, "step": 5770 }, { "epoch": 2.3028731045490822, "grad_norm": 0.22150245308876038, "learning_rate": 1.1322954093557282e-05, "loss": 0.066, "num_tokens": 754393473.0, "step": 5771 }, { "epoch": 2.303272146847566, "grad_norm": 0.26165592670440674, "learning_rate": 1.1316076795372419e-05, "loss": 0.0879, "num_tokens": 754524545.0, "step": 5772 }, { "epoch": 2.3036711891460495, "grad_norm": 0.23843206465244293, "learning_rate": 1.1309202628625831e-05, "loss": 0.0725, "num_tokens": 754655617.0, "step": 5773 }, { "epoch": 2.304070231444533, "grad_norm": 0.22863641381263733, "learning_rate": 1.13023315946476e-05, "loss": 0.0726, "num_tokens": 754786689.0, "step": 5774 }, { "epoch": 2.304469273743017, "grad_norm": 0.22030676901340485, "learning_rate": 1.1295463694767222e-05, "loss": 0.0714, "num_tokens": 754917761.0, "step": 5775 }, { "epoch": 2.3048683160415004, "grad_norm": 0.328285276889801, "learning_rate": 1.1288598930313552e-05, "loss": 0.0933, "num_tokens": 755048833.0, "step": 5776 }, { "epoch": 2.305267358339984, "grad_norm": 0.2416951209306717, "learning_rate": 1.128173730261487e-05, "loss": 0.0659, "num_tokens": 755179905.0, "step": 5777 }, { "epoch": 2.3056664006384677, "grad_norm": 0.2745537757873535, "learning_rate": 1.1274878812998842e-05, "loss": 0.0976, "num_tokens": 755310977.0, "step": 5778 }, { "epoch": 2.3060654429369514, "grad_norm": 0.29461610317230225, "learning_rate": 1.1268023462792508e-05, "loss": 0.0923, "num_tokens": 755442049.0, "step": 5779 }, { "epoch": 2.306464485235435, "grad_norm": 0.2693931758403778, "learning_rate": 1.126117125332232e-05, "loss": 0.0853, "num_tokens": 755573121.0, "step": 5780 }, { "epoch": 2.3068635275339187, "grad_norm": 0.25990960001945496, "learning_rate": 1.1254322185914128e-05, "loss": 0.0762, "num_tokens": 755704193.0, "step": 5781 }, { "epoch": 2.3072625698324023, "grad_norm": 0.24571958184242249, "learning_rate": 1.1247476261893143e-05, "loss": 0.0755, "num_tokens": 755835265.0, "step": 5782 }, { "epoch": 2.307661612130886, "grad_norm": 0.29004883766174316, "learning_rate": 1.1240633482583998e-05, "loss": 0.0999, "num_tokens": 755966337.0, "step": 5783 }, { "epoch": 2.3080606544293696, "grad_norm": 0.2885898947715759, "learning_rate": 1.1233793849310714e-05, "loss": 0.0938, "num_tokens": 756097409.0, "step": 5784 }, { "epoch": 2.3084596967278532, "grad_norm": 0.24238260090351105, "learning_rate": 1.1226957363396687e-05, "loss": 0.0755, "num_tokens": 756228481.0, "step": 5785 }, { "epoch": 2.308858739026337, "grad_norm": 0.21354886889457703, "learning_rate": 1.122012402616471e-05, "loss": 0.0663, "num_tokens": 756359553.0, "step": 5786 }, { "epoch": 2.3092577813248205, "grad_norm": 0.20768828690052032, "learning_rate": 1.1213293838936973e-05, "loss": 0.0638, "num_tokens": 756490625.0, "step": 5787 }, { "epoch": 2.309656823623304, "grad_norm": 0.27304887771606445, "learning_rate": 1.1206466803035065e-05, "loss": 0.0826, "num_tokens": 756621697.0, "step": 5788 }, { "epoch": 2.310055865921788, "grad_norm": 0.23547327518463135, "learning_rate": 1.1199642919779933e-05, "loss": 0.0856, "num_tokens": 756752769.0, "step": 5789 }, { "epoch": 2.3104549082202714, "grad_norm": 0.23493196070194244, "learning_rate": 1.1192822190491947e-05, "loss": 0.0818, "num_tokens": 756883841.0, "step": 5790 }, { "epoch": 2.310853950518755, "grad_norm": 0.2606714069843292, "learning_rate": 1.1186004616490864e-05, "loss": 0.0851, "num_tokens": 757014913.0, "step": 5791 }, { "epoch": 2.3112529928172387, "grad_norm": 0.23736980557441711, "learning_rate": 1.1179190199095801e-05, "loss": 0.0717, "num_tokens": 757145985.0, "step": 5792 }, { "epoch": 2.3116520351157224, "grad_norm": 0.24880351126194, "learning_rate": 1.1172378939625295e-05, "loss": 0.0651, "num_tokens": 757277057.0, "step": 5793 }, { "epoch": 2.312051077414206, "grad_norm": 0.21656718850135803, "learning_rate": 1.1165570839397265e-05, "loss": 0.0678, "num_tokens": 757408129.0, "step": 5794 }, { "epoch": 2.3124501197126897, "grad_norm": 0.30091583728790283, "learning_rate": 1.1158765899729013e-05, "loss": 0.0888, "num_tokens": 757539201.0, "step": 5795 }, { "epoch": 2.3128491620111733, "grad_norm": 0.277305543422699, "learning_rate": 1.1151964121937217e-05, "loss": 0.0828, "num_tokens": 757670273.0, "step": 5796 }, { "epoch": 2.313248204309657, "grad_norm": 0.23346072435379028, "learning_rate": 1.1145165507337979e-05, "loss": 0.0793, "num_tokens": 757801345.0, "step": 5797 }, { "epoch": 2.3136472466081406, "grad_norm": 0.258736252784729, "learning_rate": 1.1138370057246752e-05, "loss": 0.0832, "num_tokens": 757932417.0, "step": 5798 }, { "epoch": 2.3140462889066242, "grad_norm": 0.25339969992637634, "learning_rate": 1.1131577772978394e-05, "loss": 0.0861, "num_tokens": 758063489.0, "step": 5799 }, { "epoch": 2.314445331205108, "grad_norm": 0.2773624062538147, "learning_rate": 1.1124788655847163e-05, "loss": 0.0879, "num_tokens": 758194561.0, "step": 5800 }, { "epoch": 2.3148443735035915, "grad_norm": 0.26776567101478577, "learning_rate": 1.1118002707166669e-05, "loss": 0.0814, "num_tokens": 758325633.0, "step": 5801 }, { "epoch": 2.315243415802075, "grad_norm": 0.29193443059921265, "learning_rate": 1.1111219928249944e-05, "loss": 0.0889, "num_tokens": 758456705.0, "step": 5802 }, { "epoch": 2.315642458100559, "grad_norm": 0.287931352853775, "learning_rate": 1.1104440320409394e-05, "loss": 0.095, "num_tokens": 758587777.0, "step": 5803 }, { "epoch": 2.3160415003990424, "grad_norm": 0.25280481576919556, "learning_rate": 1.1097663884956794e-05, "loss": 0.0761, "num_tokens": 758718849.0, "step": 5804 }, { "epoch": 2.316440542697526, "grad_norm": 0.26503077149391174, "learning_rate": 1.1090890623203337e-05, "loss": 0.0964, "num_tokens": 758849921.0, "step": 5805 }, { "epoch": 2.3168395849960097, "grad_norm": 0.2660253047943115, "learning_rate": 1.1084120536459583e-05, "loss": 0.0883, "num_tokens": 758980993.0, "step": 5806 }, { "epoch": 2.3172386272944934, "grad_norm": 0.2592334747314453, "learning_rate": 1.107735362603548e-05, "loss": 0.0973, "num_tokens": 759112065.0, "step": 5807 }, { "epoch": 2.317637669592977, "grad_norm": 0.26618409156799316, "learning_rate": 1.1070589893240357e-05, "loss": 0.1001, "num_tokens": 759243137.0, "step": 5808 }, { "epoch": 2.3180367118914607, "grad_norm": 0.252021849155426, "learning_rate": 1.1063829339382939e-05, "loss": 0.0913, "num_tokens": 759374209.0, "step": 5809 }, { "epoch": 2.3184357541899443, "grad_norm": 0.2373129278421402, "learning_rate": 1.1057071965771324e-05, "loss": 0.0803, "num_tokens": 759505281.0, "step": 5810 }, { "epoch": 2.318834796488428, "grad_norm": 0.244569793343544, "learning_rate": 1.1050317773713001e-05, "loss": 0.0827, "num_tokens": 759636353.0, "step": 5811 }, { "epoch": 2.3192338387869116, "grad_norm": 0.2972978949546814, "learning_rate": 1.1043566764514856e-05, "loss": 0.0928, "num_tokens": 759767425.0, "step": 5812 }, { "epoch": 2.3196328810853952, "grad_norm": 0.25091537833213806, "learning_rate": 1.103681893948313e-05, "loss": 0.0717, "num_tokens": 759898497.0, "step": 5813 }, { "epoch": 2.320031923383879, "grad_norm": 0.2263517528772354, "learning_rate": 1.1030074299923474e-05, "loss": 0.0662, "num_tokens": 760029569.0, "step": 5814 }, { "epoch": 2.3204309656823625, "grad_norm": 0.28668391704559326, "learning_rate": 1.1023332847140913e-05, "loss": 0.0948, "num_tokens": 760160641.0, "step": 5815 }, { "epoch": 2.320830007980846, "grad_norm": 0.26252707839012146, "learning_rate": 1.1016594582439845e-05, "loss": 0.0742, "num_tokens": 760291713.0, "step": 5816 }, { "epoch": 2.32122905027933, "grad_norm": 0.2749096155166626, "learning_rate": 1.1009859507124073e-05, "loss": 0.0933, "num_tokens": 760422785.0, "step": 5817 }, { "epoch": 2.3216280925778134, "grad_norm": 0.23328599333763123, "learning_rate": 1.1003127622496773e-05, "loss": 0.0718, "num_tokens": 760553857.0, "step": 5818 }, { "epoch": 2.322027134876297, "grad_norm": 0.2614191472530365, "learning_rate": 1.0996398929860494e-05, "loss": 0.0893, "num_tokens": 760684929.0, "step": 5819 }, { "epoch": 2.3224261771747807, "grad_norm": 0.3092884123325348, "learning_rate": 1.098967343051717e-05, "loss": 0.0951, "num_tokens": 760816001.0, "step": 5820 }, { "epoch": 2.3228252194732644, "grad_norm": 0.28212910890579224, "learning_rate": 1.0982951125768134e-05, "loss": 0.0944, "num_tokens": 760947073.0, "step": 5821 }, { "epoch": 2.323224261771748, "grad_norm": 0.24788641929626465, "learning_rate": 1.0976232016914088e-05, "loss": 0.0858, "num_tokens": 761078145.0, "step": 5822 }, { "epoch": 2.3236233040702317, "grad_norm": 0.28941839933395386, "learning_rate": 1.0969516105255112e-05, "loss": 0.0936, "num_tokens": 761209217.0, "step": 5823 }, { "epoch": 2.3240223463687153, "grad_norm": 0.25491851568222046, "learning_rate": 1.0962803392090671e-05, "loss": 0.0821, "num_tokens": 761340289.0, "step": 5824 }, { "epoch": 2.324421388667199, "grad_norm": 0.26934128999710083, "learning_rate": 1.0956093878719622e-05, "loss": 0.0943, "num_tokens": 761471361.0, "step": 5825 }, { "epoch": 2.3248204309656826, "grad_norm": 0.28205522894859314, "learning_rate": 1.094938756644018e-05, "loss": 0.0917, "num_tokens": 761602433.0, "step": 5826 }, { "epoch": 2.3252194732641662, "grad_norm": 0.23939087986946106, "learning_rate": 1.0942684456549965e-05, "loss": 0.0793, "num_tokens": 761733505.0, "step": 5827 }, { "epoch": 2.32561851556265, "grad_norm": 0.26286301016807556, "learning_rate": 1.0935984550345967e-05, "loss": 0.0907, "num_tokens": 761864577.0, "step": 5828 }, { "epoch": 2.3260175578611335, "grad_norm": 0.2527277171611786, "learning_rate": 1.0929287849124551e-05, "loss": 0.0919, "num_tokens": 761995649.0, "step": 5829 }, { "epoch": 2.326416600159617, "grad_norm": 0.23013024032115936, "learning_rate": 1.0922594354181456e-05, "loss": 0.0738, "num_tokens": 762126721.0, "step": 5830 }, { "epoch": 2.326815642458101, "grad_norm": 0.32205119729042053, "learning_rate": 1.0915904066811832e-05, "loss": 0.1047, "num_tokens": 762257793.0, "step": 5831 }, { "epoch": 2.3272146847565844, "grad_norm": 0.278568297624588, "learning_rate": 1.0909216988310166e-05, "loss": 0.0988, "num_tokens": 762388865.0, "step": 5832 }, { "epoch": 2.327613727055068, "grad_norm": 0.2095905989408493, "learning_rate": 1.090253311997036e-05, "loss": 0.0641, "num_tokens": 762508189.0, "step": 5833 }, { "epoch": 2.3280127693535517, "grad_norm": 0.25596946477890015, "learning_rate": 1.089585246308568e-05, "loss": 0.0829, "num_tokens": 762639261.0, "step": 5834 }, { "epoch": 2.328411811652035, "grad_norm": 0.2648681700229645, "learning_rate": 1.0889175018948758e-05, "loss": 0.0792, "num_tokens": 762770333.0, "step": 5835 }, { "epoch": 2.3288108539505186, "grad_norm": 0.3150343596935272, "learning_rate": 1.0882500788851626e-05, "loss": 0.0814, "num_tokens": 762901405.0, "step": 5836 }, { "epoch": 2.329209896249002, "grad_norm": 0.26953497529029846, "learning_rate": 1.0875829774085693e-05, "loss": 0.0874, "num_tokens": 763032477.0, "step": 5837 }, { "epoch": 2.329608938547486, "grad_norm": 0.24860823154449463, "learning_rate": 1.0869161975941725e-05, "loss": 0.0671, "num_tokens": 763163549.0, "step": 5838 }, { "epoch": 2.3300079808459695, "grad_norm": 0.2504879832267761, "learning_rate": 1.0862497395709883e-05, "loss": 0.082, "num_tokens": 763294621.0, "step": 5839 }, { "epoch": 2.330407023144453, "grad_norm": 0.27471643686294556, "learning_rate": 1.0855836034679706e-05, "loss": 0.0926, "num_tokens": 763425693.0, "step": 5840 }, { "epoch": 2.330806065442937, "grad_norm": 0.23921598494052887, "learning_rate": 1.0849177894140103e-05, "loss": 0.0681, "num_tokens": 763556765.0, "step": 5841 }, { "epoch": 2.3312051077414204, "grad_norm": 0.2413603961467743, "learning_rate": 1.0842522975379354e-05, "loss": 0.0747, "num_tokens": 763687837.0, "step": 5842 }, { "epoch": 2.331604150039904, "grad_norm": 0.23095113039016724, "learning_rate": 1.083587127968513e-05, "loss": 0.0741, "num_tokens": 763818909.0, "step": 5843 }, { "epoch": 2.3320031923383877, "grad_norm": 0.27331361174583435, "learning_rate": 1.082922280834448e-05, "loss": 0.0863, "num_tokens": 763949981.0, "step": 5844 }, { "epoch": 2.3324022346368714, "grad_norm": 0.2516181468963623, "learning_rate": 1.0822577562643808e-05, "loss": 0.0691, "num_tokens": 764081053.0, "step": 5845 }, { "epoch": 2.332801276935355, "grad_norm": 0.3332504332065582, "learning_rate": 1.0815935543868907e-05, "loss": 0.1009, "num_tokens": 764212125.0, "step": 5846 }, { "epoch": 2.3332003192338386, "grad_norm": 0.2625446915626526, "learning_rate": 1.0809296753304964e-05, "loss": 0.0704, "num_tokens": 764343197.0, "step": 5847 }, { "epoch": 2.3335993615323223, "grad_norm": 0.2463679015636444, "learning_rate": 1.0802661192236503e-05, "loss": 0.0665, "num_tokens": 764474269.0, "step": 5848 }, { "epoch": 2.333998403830806, "grad_norm": 0.24728047847747803, "learning_rate": 1.0796028861947449e-05, "loss": 0.0739, "num_tokens": 764605341.0, "step": 5849 }, { "epoch": 2.3343974461292896, "grad_norm": 0.25950756669044495, "learning_rate": 1.0789399763721105e-05, "loss": 0.082, "num_tokens": 764736413.0, "step": 5850 }, { "epoch": 2.334796488427773, "grad_norm": 0.27059999108314514, "learning_rate": 1.0782773898840124e-05, "loss": 0.0901, "num_tokens": 764867485.0, "step": 5851 }, { "epoch": 2.335195530726257, "grad_norm": 0.2762960195541382, "learning_rate": 1.0776151268586568e-05, "loss": 0.096, "num_tokens": 764998557.0, "step": 5852 }, { "epoch": 2.3355945730247405, "grad_norm": 0.24134162068367004, "learning_rate": 1.0769531874241842e-05, "loss": 0.0787, "num_tokens": 765129629.0, "step": 5853 }, { "epoch": 2.335993615323224, "grad_norm": 0.23902252316474915, "learning_rate": 1.0762915717086734e-05, "loss": 0.0831, "num_tokens": 765260701.0, "step": 5854 }, { "epoch": 2.336392657621708, "grad_norm": 0.2761825919151306, "learning_rate": 1.075630279840141e-05, "loss": 0.1045, "num_tokens": 765391773.0, "step": 5855 }, { "epoch": 2.3367916999201914, "grad_norm": 0.24503016471862793, "learning_rate": 1.0749693119465419e-05, "loss": 0.0838, "num_tokens": 765522845.0, "step": 5856 }, { "epoch": 2.337190742218675, "grad_norm": 0.2542482018470764, "learning_rate": 1.074308668155766e-05, "loss": 0.0883, "num_tokens": 765653917.0, "step": 5857 }, { "epoch": 2.3375897845171587, "grad_norm": 0.3000507652759552, "learning_rate": 1.0736483485956422e-05, "loss": 0.0948, "num_tokens": 765784989.0, "step": 5858 }, { "epoch": 2.3379888268156424, "grad_norm": 0.2820254862308502, "learning_rate": 1.0729883533939369e-05, "loss": 0.0928, "num_tokens": 765916061.0, "step": 5859 }, { "epoch": 2.338387869114126, "grad_norm": 0.2884752154350281, "learning_rate": 1.0723286826783516e-05, "loss": 0.0865, "num_tokens": 766047133.0, "step": 5860 }, { "epoch": 2.3387869114126096, "grad_norm": 0.29069849848747253, "learning_rate": 1.071669336576527e-05, "loss": 0.0998, "num_tokens": 766178205.0, "step": 5861 }, { "epoch": 2.3391859537110933, "grad_norm": 0.24535636603832245, "learning_rate": 1.0710103152160413e-05, "loss": 0.0786, "num_tokens": 766309277.0, "step": 5862 }, { "epoch": 2.339584996009577, "grad_norm": 0.27822205424308777, "learning_rate": 1.0703516187244084e-05, "loss": 0.0954, "num_tokens": 766440349.0, "step": 5863 }, { "epoch": 2.3399840383080606, "grad_norm": 0.23513570427894592, "learning_rate": 1.069693247229079e-05, "loss": 0.0692, "num_tokens": 766571421.0, "step": 5864 }, { "epoch": 2.340383080606544, "grad_norm": 0.264771044254303, "learning_rate": 1.0690352008574425e-05, "loss": 0.0974, "num_tokens": 766702493.0, "step": 5865 }, { "epoch": 2.340782122905028, "grad_norm": 0.2915026545524597, "learning_rate": 1.068377479736826e-05, "loss": 0.0964, "num_tokens": 766833565.0, "step": 5866 }, { "epoch": 2.3411811652035115, "grad_norm": 0.25376731157302856, "learning_rate": 1.0677200839944904e-05, "loss": 0.0876, "num_tokens": 766964637.0, "step": 5867 }, { "epoch": 2.341580207501995, "grad_norm": 0.24227683246135712, "learning_rate": 1.0670630137576365e-05, "loss": 0.0733, "num_tokens": 767095709.0, "step": 5868 }, { "epoch": 2.341979249800479, "grad_norm": 0.2553679347038269, "learning_rate": 1.0664062691534018e-05, "loss": 0.0899, "num_tokens": 767226781.0, "step": 5869 }, { "epoch": 2.3423782920989624, "grad_norm": 0.2710818648338318, "learning_rate": 1.0657498503088596e-05, "loss": 0.1017, "num_tokens": 767357853.0, "step": 5870 }, { "epoch": 2.342777334397446, "grad_norm": 0.2420322299003601, "learning_rate": 1.065093757351021e-05, "loss": 0.0762, "num_tokens": 767488925.0, "step": 5871 }, { "epoch": 2.3431763766959297, "grad_norm": 0.2331504374742508, "learning_rate": 1.0644379904068334e-05, "loss": 0.078, "num_tokens": 767619997.0, "step": 5872 }, { "epoch": 2.3435754189944134, "grad_norm": 0.23322540521621704, "learning_rate": 1.0637825496031823e-05, "loss": 0.0648, "num_tokens": 767751069.0, "step": 5873 }, { "epoch": 2.343974461292897, "grad_norm": 0.25766533613204956, "learning_rate": 1.0631274350668896e-05, "loss": 0.0864, "num_tokens": 767882141.0, "step": 5874 }, { "epoch": 2.3443735035913806, "grad_norm": 0.2936195135116577, "learning_rate": 1.0624726469247133e-05, "loss": 0.0894, "num_tokens": 768013213.0, "step": 5875 }, { "epoch": 2.3447725458898643, "grad_norm": 0.32990315556526184, "learning_rate": 1.0618181853033485e-05, "loss": 0.1069, "num_tokens": 768144285.0, "step": 5876 }, { "epoch": 2.345171588188348, "grad_norm": 0.2898375988006592, "learning_rate": 1.0611640503294276e-05, "loss": 0.0958, "num_tokens": 768275357.0, "step": 5877 }, { "epoch": 2.3455706304868316, "grad_norm": 0.21129193902015686, "learning_rate": 1.0605102421295204e-05, "loss": 0.056, "num_tokens": 768406429.0, "step": 5878 }, { "epoch": 2.345969672785315, "grad_norm": 0.2634439766407013, "learning_rate": 1.0598567608301317e-05, "loss": 0.0853, "num_tokens": 768537501.0, "step": 5879 }, { "epoch": 2.346368715083799, "grad_norm": 0.3111710548400879, "learning_rate": 1.0592036065577043e-05, "loss": 0.1076, "num_tokens": 768668573.0, "step": 5880 }, { "epoch": 2.3467677573822825, "grad_norm": 0.24987754225730896, "learning_rate": 1.0585507794386187e-05, "loss": 0.0836, "num_tokens": 768799645.0, "step": 5881 }, { "epoch": 2.347166799680766, "grad_norm": 0.26506683230400085, "learning_rate": 1.0578982795991886e-05, "loss": 0.0893, "num_tokens": 768930717.0, "step": 5882 }, { "epoch": 2.34756584197925, "grad_norm": 0.22897450625896454, "learning_rate": 1.057246107165668e-05, "loss": 0.0822, "num_tokens": 769061789.0, "step": 5883 }, { "epoch": 2.3479648842777334, "grad_norm": 0.2533230483531952, "learning_rate": 1.0565942622642469e-05, "loss": 0.0915, "num_tokens": 769192861.0, "step": 5884 }, { "epoch": 2.348363926576217, "grad_norm": 0.28400206565856934, "learning_rate": 1.0559427450210493e-05, "loss": 0.0747, "num_tokens": 769323933.0, "step": 5885 }, { "epoch": 2.3487629688747007, "grad_norm": 0.24313758313655853, "learning_rate": 1.0552915555621397e-05, "loss": 0.0758, "num_tokens": 769455005.0, "step": 5886 }, { "epoch": 2.3491620111731844, "grad_norm": 0.22923308610916138, "learning_rate": 1.0546406940135159e-05, "loss": 0.0705, "num_tokens": 769586077.0, "step": 5887 }, { "epoch": 2.349561053471668, "grad_norm": 0.2769458293914795, "learning_rate": 1.0539901605011135e-05, "loss": 0.0823, "num_tokens": 769717149.0, "step": 5888 }, { "epoch": 2.3499600957701516, "grad_norm": 0.2858191430568695, "learning_rate": 1.053339955150805e-05, "loss": 0.0998, "num_tokens": 769848221.0, "step": 5889 }, { "epoch": 2.3503591380686353, "grad_norm": 0.2663721442222595, "learning_rate": 1.0526900780883996e-05, "loss": 0.0966, "num_tokens": 769979293.0, "step": 5890 }, { "epoch": 2.350758180367119, "grad_norm": 0.26459717750549316, "learning_rate": 1.0520405294396415e-05, "loss": 0.0908, "num_tokens": 770110365.0, "step": 5891 }, { "epoch": 2.3511572226656026, "grad_norm": 0.28481537103652954, "learning_rate": 1.0513913093302128e-05, "loss": 0.0883, "num_tokens": 770241437.0, "step": 5892 }, { "epoch": 2.351556264964086, "grad_norm": 0.2060687094926834, "learning_rate": 1.0507424178857322e-05, "loss": 0.0613, "num_tokens": 770372509.0, "step": 5893 }, { "epoch": 2.35195530726257, "grad_norm": 0.3066929876804352, "learning_rate": 1.0500938552317525e-05, "loss": 0.0806, "num_tokens": 770503581.0, "step": 5894 }, { "epoch": 2.3523543495610535, "grad_norm": 0.22861036658287048, "learning_rate": 1.0494456214937657e-05, "loss": 0.0726, "num_tokens": 770634653.0, "step": 5895 }, { "epoch": 2.352753391859537, "grad_norm": 0.2629014551639557, "learning_rate": 1.0487977167971994e-05, "loss": 0.0887, "num_tokens": 770765725.0, "step": 5896 }, { "epoch": 2.353152434158021, "grad_norm": 0.3118445575237274, "learning_rate": 1.0481501412674166e-05, "loss": 0.1008, "num_tokens": 770896797.0, "step": 5897 }, { "epoch": 2.3535514764565044, "grad_norm": 0.3058840036392212, "learning_rate": 1.047502895029716e-05, "loss": 0.0925, "num_tokens": 771027869.0, "step": 5898 }, { "epoch": 2.353950518754988, "grad_norm": 0.261152058839798, "learning_rate": 1.0468559782093352e-05, "loss": 0.0862, "num_tokens": 771158941.0, "step": 5899 }, { "epoch": 2.3543495610534717, "grad_norm": 0.25541433691978455, "learning_rate": 1.0462093909314465e-05, "loss": 0.0766, "num_tokens": 771290013.0, "step": 5900 }, { "epoch": 2.3547486033519553, "grad_norm": 0.25727105140686035, "learning_rate": 1.0455631333211576e-05, "loss": 0.0801, "num_tokens": 771421085.0, "step": 5901 }, { "epoch": 2.355147645650439, "grad_norm": 0.2649167478084564, "learning_rate": 1.0449172055035139e-05, "loss": 0.0829, "num_tokens": 771552157.0, "step": 5902 }, { "epoch": 2.3555466879489226, "grad_norm": 0.2548748552799225, "learning_rate": 1.0442716076034967e-05, "loss": 0.0817, "num_tokens": 771683229.0, "step": 5903 }, { "epoch": 2.3559457302474063, "grad_norm": 0.2745678424835205, "learning_rate": 1.0436263397460225e-05, "loss": 0.0763, "num_tokens": 771806321.0, "step": 5904 }, { "epoch": 2.35634477254589, "grad_norm": 0.26379433274269104, "learning_rate": 1.0429814020559448e-05, "loss": 0.087, "num_tokens": 771937393.0, "step": 5905 }, { "epoch": 2.3567438148443736, "grad_norm": 0.23883265256881714, "learning_rate": 1.0423367946580537e-05, "loss": 0.0769, "num_tokens": 772068465.0, "step": 5906 }, { "epoch": 2.357142857142857, "grad_norm": 0.23812539875507355, "learning_rate": 1.0416925176770736e-05, "loss": 0.0847, "num_tokens": 772199537.0, "step": 5907 }, { "epoch": 2.357541899441341, "grad_norm": 0.18571066856384277, "learning_rate": 1.0410485712376674e-05, "loss": 0.0538, "num_tokens": 772330609.0, "step": 5908 }, { "epoch": 2.3579409417398245, "grad_norm": 0.24267473816871643, "learning_rate": 1.0404049554644323e-05, "loss": 0.0705, "num_tokens": 772461681.0, "step": 5909 }, { "epoch": 2.358339984038308, "grad_norm": 0.24723553657531738, "learning_rate": 1.0397616704819008e-05, "loss": 0.0806, "num_tokens": 772592753.0, "step": 5910 }, { "epoch": 2.3587390263367918, "grad_norm": 0.2683330476284027, "learning_rate": 1.0391187164145433e-05, "loss": 0.0844, "num_tokens": 772723825.0, "step": 5911 }, { "epoch": 2.3591380686352754, "grad_norm": 0.2760429084300995, "learning_rate": 1.0384760933867663e-05, "loss": 0.094, "num_tokens": 772854897.0, "step": 5912 }, { "epoch": 2.359537110933759, "grad_norm": 0.23659224808216095, "learning_rate": 1.03783380152291e-05, "loss": 0.0736, "num_tokens": 772985969.0, "step": 5913 }, { "epoch": 2.3599361532322427, "grad_norm": 0.2569538950920105, "learning_rate": 1.0371918409472522e-05, "loss": 0.0871, "num_tokens": 773117041.0, "step": 5914 }, { "epoch": 2.3603351955307263, "grad_norm": 0.22704698145389557, "learning_rate": 1.0365502117840079e-05, "loss": 0.067, "num_tokens": 773248113.0, "step": 5915 }, { "epoch": 2.36073423782921, "grad_norm": 0.26431939005851746, "learning_rate": 1.0359089141573238e-05, "loss": 0.086, "num_tokens": 773379185.0, "step": 5916 }, { "epoch": 2.3611332801276936, "grad_norm": 0.2660019099712372, "learning_rate": 1.0352679481912864e-05, "loss": 0.0958, "num_tokens": 773510257.0, "step": 5917 }, { "epoch": 2.3615323224261773, "grad_norm": 0.2673163414001465, "learning_rate": 1.0346273140099172e-05, "loss": 0.0867, "num_tokens": 773641329.0, "step": 5918 }, { "epoch": 2.361931364724661, "grad_norm": 0.2215685397386551, "learning_rate": 1.0339870117371723e-05, "loss": 0.068, "num_tokens": 773772401.0, "step": 5919 }, { "epoch": 2.3623304070231446, "grad_norm": 0.24509452283382416, "learning_rate": 1.0333470414969432e-05, "loss": 0.066, "num_tokens": 773903473.0, "step": 5920 }, { "epoch": 2.362729449321628, "grad_norm": 0.27254578471183777, "learning_rate": 1.0327074034130597e-05, "loss": 0.0927, "num_tokens": 774034545.0, "step": 5921 }, { "epoch": 2.363128491620112, "grad_norm": 0.22243620455265045, "learning_rate": 1.0320680976092853e-05, "loss": 0.0634, "num_tokens": 774165617.0, "step": 5922 }, { "epoch": 2.3635275339185955, "grad_norm": 0.29010865092277527, "learning_rate": 1.0314291242093191e-05, "loss": 0.1003, "num_tokens": 774296689.0, "step": 5923 }, { "epoch": 2.363926576217079, "grad_norm": 0.22751078009605408, "learning_rate": 1.0307904833367974e-05, "loss": 0.0768, "num_tokens": 774427761.0, "step": 5924 }, { "epoch": 2.3643256185155628, "grad_norm": 0.27993422746658325, "learning_rate": 1.0301521751152914e-05, "loss": 0.0895, "num_tokens": 774558833.0, "step": 5925 }, { "epoch": 2.3647246608140464, "grad_norm": 0.3143235743045807, "learning_rate": 1.0295141996683064e-05, "loss": 0.0865, "num_tokens": 774689905.0, "step": 5926 }, { "epoch": 2.36512370311253, "grad_norm": 0.24497146904468536, "learning_rate": 1.028876557119286e-05, "loss": 0.0813, "num_tokens": 774820977.0, "step": 5927 }, { "epoch": 2.3655227454110137, "grad_norm": 0.2460256963968277, "learning_rate": 1.0282392475916081e-05, "loss": 0.0901, "num_tokens": 774952049.0, "step": 5928 }, { "epoch": 2.3659217877094973, "grad_norm": 0.27837926149368286, "learning_rate": 1.0276022712085853e-05, "loss": 0.0848, "num_tokens": 775083121.0, "step": 5929 }, { "epoch": 2.366320830007981, "grad_norm": 0.30514487624168396, "learning_rate": 1.0269656280934668e-05, "loss": 0.1053, "num_tokens": 775214193.0, "step": 5930 }, { "epoch": 2.3667198723064646, "grad_norm": 0.25046467781066895, "learning_rate": 1.0263293183694387e-05, "loss": 0.0765, "num_tokens": 775345265.0, "step": 5931 }, { "epoch": 2.3671189146049483, "grad_norm": 0.2250608503818512, "learning_rate": 1.0256933421596182e-05, "loss": 0.0727, "num_tokens": 775476337.0, "step": 5932 }, { "epoch": 2.367517956903432, "grad_norm": 0.23007294535636902, "learning_rate": 1.0250576995870625e-05, "loss": 0.0766, "num_tokens": 775607409.0, "step": 5933 }, { "epoch": 2.3679169992019156, "grad_norm": 0.24371184408664703, "learning_rate": 1.0244223907747626e-05, "loss": 0.0773, "num_tokens": 775738481.0, "step": 5934 }, { "epoch": 2.368316041500399, "grad_norm": 0.25036653876304626, "learning_rate": 1.023787415845644e-05, "loss": 0.0782, "num_tokens": 775869553.0, "step": 5935 }, { "epoch": 2.368715083798883, "grad_norm": 0.26359352469444275, "learning_rate": 1.023152774922569e-05, "loss": 0.0749, "num_tokens": 775984957.0, "step": 5936 }, { "epoch": 2.3691141260973665, "grad_norm": 0.2501247823238373, "learning_rate": 1.022518468128335e-05, "loss": 0.0718, "num_tokens": 776116029.0, "step": 5937 }, { "epoch": 2.36951316839585, "grad_norm": 0.29740241169929504, "learning_rate": 1.0218844955856735e-05, "loss": 0.1005, "num_tokens": 776247101.0, "step": 5938 }, { "epoch": 2.3699122106943338, "grad_norm": 0.295870840549469, "learning_rate": 1.0212508574172526e-05, "loss": 0.0923, "num_tokens": 776378173.0, "step": 5939 }, { "epoch": 2.3703112529928174, "grad_norm": 0.280807226896286, "learning_rate": 1.0206175537456766e-05, "loss": 0.0939, "num_tokens": 776509245.0, "step": 5940 }, { "epoch": 2.370710295291301, "grad_norm": 0.28501126170158386, "learning_rate": 1.0199845846934818e-05, "loss": 0.0907, "num_tokens": 776640317.0, "step": 5941 }, { "epoch": 2.3711093375897847, "grad_norm": 0.23420339822769165, "learning_rate": 1.0193519503831438e-05, "loss": 0.069, "num_tokens": 776771389.0, "step": 5942 }, { "epoch": 2.3715083798882683, "grad_norm": 0.2534431219100952, "learning_rate": 1.01871965093707e-05, "loss": 0.0828, "num_tokens": 776902461.0, "step": 5943 }, { "epoch": 2.3719074221867515, "grad_norm": 0.2843075394630432, "learning_rate": 1.0180876864776052e-05, "loss": 0.0999, "num_tokens": 777033533.0, "step": 5944 }, { "epoch": 2.372306464485235, "grad_norm": 0.2522859275341034, "learning_rate": 1.017456057127028e-05, "loss": 0.0852, "num_tokens": 777164605.0, "step": 5945 }, { "epoch": 2.372705506783719, "grad_norm": 0.2609074115753174, "learning_rate": 1.0168247630075533e-05, "loss": 0.0731, "num_tokens": 777295677.0, "step": 5946 }, { "epoch": 2.3731045490822025, "grad_norm": 0.24575135111808777, "learning_rate": 1.0161938042413309e-05, "loss": 0.079, "num_tokens": 777426749.0, "step": 5947 }, { "epoch": 2.373503591380686, "grad_norm": 0.23901699483394623, "learning_rate": 1.0155631809504442e-05, "loss": 0.0774, "num_tokens": 777557821.0, "step": 5948 }, { "epoch": 2.3739026336791698, "grad_norm": 0.23992538452148438, "learning_rate": 1.0149328932569146e-05, "loss": 0.067, "num_tokens": 777688893.0, "step": 5949 }, { "epoch": 2.3743016759776534, "grad_norm": 0.2561595141887665, "learning_rate": 1.0143029412826957e-05, "loss": 0.0879, "num_tokens": 777819965.0, "step": 5950 }, { "epoch": 2.374700718276137, "grad_norm": 0.2258121520280838, "learning_rate": 1.0136733251496775e-05, "loss": 0.0666, "num_tokens": 777951037.0, "step": 5951 }, { "epoch": 2.3750997605746207, "grad_norm": 0.20894671976566315, "learning_rate": 1.0130440449796858e-05, "loss": 0.068, "num_tokens": 778082109.0, "step": 5952 }, { "epoch": 2.3754988028731043, "grad_norm": 0.27299872040748596, "learning_rate": 1.0124151008944795e-05, "loss": 0.0875, "num_tokens": 778213181.0, "step": 5953 }, { "epoch": 2.375897845171588, "grad_norm": 0.24248750507831573, "learning_rate": 1.011786493015753e-05, "loss": 0.076, "num_tokens": 778344253.0, "step": 5954 }, { "epoch": 2.3762968874700716, "grad_norm": 0.2984206974506378, "learning_rate": 1.0111582214651368e-05, "loss": 0.0914, "num_tokens": 778475325.0, "step": 5955 }, { "epoch": 2.3766959297685553, "grad_norm": 0.2634698450565338, "learning_rate": 1.010530286364196e-05, "loss": 0.0801, "num_tokens": 778606397.0, "step": 5956 }, { "epoch": 2.377094972067039, "grad_norm": 0.2664738893508911, "learning_rate": 1.0099026878344291e-05, "loss": 0.0774, "num_tokens": 778737469.0, "step": 5957 }, { "epoch": 2.3774940143655225, "grad_norm": 0.2613382339477539, "learning_rate": 1.0092754259972711e-05, "loss": 0.0881, "num_tokens": 778868541.0, "step": 5958 }, { "epoch": 2.377893056664006, "grad_norm": 0.2255934327840805, "learning_rate": 1.0086485009740921e-05, "loss": 0.0686, "num_tokens": 778999613.0, "step": 5959 }, { "epoch": 2.37829209896249, "grad_norm": 0.2567315399646759, "learning_rate": 1.0080219128861945e-05, "loss": 0.0881, "num_tokens": 779130685.0, "step": 5960 }, { "epoch": 2.3786911412609735, "grad_norm": 0.25406506657600403, "learning_rate": 1.0073956618548187e-05, "loss": 0.0748, "num_tokens": 779261757.0, "step": 5961 }, { "epoch": 2.379090183559457, "grad_norm": 0.2715385854244232, "learning_rate": 1.0067697480011384e-05, "loss": 0.1016, "num_tokens": 779392829.0, "step": 5962 }, { "epoch": 2.3794892258579408, "grad_norm": 0.2177722007036209, "learning_rate": 1.006144171446261e-05, "loss": 0.0631, "num_tokens": 779523901.0, "step": 5963 }, { "epoch": 2.3798882681564244, "grad_norm": 0.22595499455928802, "learning_rate": 1.0055189323112305e-05, "loss": 0.073, "num_tokens": 779654973.0, "step": 5964 }, { "epoch": 2.380287310454908, "grad_norm": 0.25189489126205444, "learning_rate": 1.0048940307170256e-05, "loss": 0.0749, "num_tokens": 779786045.0, "step": 5965 }, { "epoch": 2.3806863527533917, "grad_norm": 0.2603067457675934, "learning_rate": 1.0042694667845575e-05, "loss": 0.0874, "num_tokens": 779917117.0, "step": 5966 }, { "epoch": 2.3810853950518753, "grad_norm": 0.22815488278865814, "learning_rate": 1.0036452406346736e-05, "loss": 0.0742, "num_tokens": 780048189.0, "step": 5967 }, { "epoch": 2.381484437350359, "grad_norm": 0.2506844997406006, "learning_rate": 1.0030213523881573e-05, "loss": 0.0707, "num_tokens": 780179261.0, "step": 5968 }, { "epoch": 2.3818834796488426, "grad_norm": 0.21254408359527588, "learning_rate": 1.0023978021657232e-05, "loss": 0.061, "num_tokens": 780310333.0, "step": 5969 }, { "epoch": 2.3822825219473263, "grad_norm": 0.29428353905677795, "learning_rate": 1.0017745900880238e-05, "loss": 0.0871, "num_tokens": 780441405.0, "step": 5970 }, { "epoch": 2.38268156424581, "grad_norm": 0.2934053838253021, "learning_rate": 1.001151716275645e-05, "loss": 0.0936, "num_tokens": 780572477.0, "step": 5971 }, { "epoch": 2.3830806065442935, "grad_norm": 0.273734986782074, "learning_rate": 1.0005291808491057e-05, "loss": 0.0912, "num_tokens": 780702407.0, "step": 5972 }, { "epoch": 2.383479648842777, "grad_norm": 0.23114028573036194, "learning_rate": 9.999069839288616e-06, "loss": 0.062, "num_tokens": 780833479.0, "step": 5973 }, { "epoch": 2.383878691141261, "grad_norm": 0.2967807352542877, "learning_rate": 9.992851256353029e-06, "loss": 0.0917, "num_tokens": 780964551.0, "step": 5974 }, { "epoch": 2.3842777334397445, "grad_norm": 0.26398879289627075, "learning_rate": 9.986636060887514e-06, "loss": 0.0895, "num_tokens": 781095623.0, "step": 5975 }, { "epoch": 2.384676775738228, "grad_norm": 0.2598453462123871, "learning_rate": 9.980424254094672e-06, "loss": 0.0764, "num_tokens": 781226695.0, "step": 5976 }, { "epoch": 2.3850758180367118, "grad_norm": 0.24835799634456635, "learning_rate": 9.974215837176416e-06, "loss": 0.0727, "num_tokens": 781357767.0, "step": 5977 }, { "epoch": 2.3854748603351954, "grad_norm": 0.23039554059505463, "learning_rate": 9.968010811334026e-06, "loss": 0.0692, "num_tokens": 781477912.0, "step": 5978 }, { "epoch": 2.385873902633679, "grad_norm": 0.2731437087059021, "learning_rate": 9.961809177768109e-06, "loss": 0.1012, "num_tokens": 781608984.0, "step": 5979 }, { "epoch": 2.3862729449321627, "grad_norm": 0.2537217438220978, "learning_rate": 9.955610937678624e-06, "loss": 0.0843, "num_tokens": 781740056.0, "step": 5980 }, { "epoch": 2.3866719872306463, "grad_norm": 0.2166801244020462, "learning_rate": 9.949416092264885e-06, "loss": 0.0664, "num_tokens": 781871128.0, "step": 5981 }, { "epoch": 2.38707102952913, "grad_norm": 0.3027551770210266, "learning_rate": 9.943224642725521e-06, "loss": 0.0865, "num_tokens": 782002200.0, "step": 5982 }, { "epoch": 2.3874700718276136, "grad_norm": 0.28338751196861267, "learning_rate": 9.937036590258524e-06, "loss": 0.0929, "num_tokens": 782133272.0, "step": 5983 }, { "epoch": 2.3878691141260973, "grad_norm": 0.2608962059020996, "learning_rate": 9.930851936061236e-06, "loss": 0.0918, "num_tokens": 782264344.0, "step": 5984 }, { "epoch": 2.388268156424581, "grad_norm": 0.21832877397537231, "learning_rate": 9.924670681330314e-06, "loss": 0.0734, "num_tokens": 782395416.0, "step": 5985 }, { "epoch": 2.3886671987230645, "grad_norm": 0.26174256205558777, "learning_rate": 9.91849282726178e-06, "loss": 0.0995, "num_tokens": 782526488.0, "step": 5986 }, { "epoch": 2.389066241021548, "grad_norm": 0.22257082164287567, "learning_rate": 9.912318375051004e-06, "loss": 0.0711, "num_tokens": 782657560.0, "step": 5987 }, { "epoch": 2.389465283320032, "grad_norm": 0.34056711196899414, "learning_rate": 9.906147325892659e-06, "loss": 0.1056, "num_tokens": 782788632.0, "step": 5988 }, { "epoch": 2.3898643256185155, "grad_norm": 0.23815251886844635, "learning_rate": 9.899979680980797e-06, "loss": 0.0722, "num_tokens": 782919704.0, "step": 5989 }, { "epoch": 2.390263367916999, "grad_norm": 0.2452695071697235, "learning_rate": 9.893815441508811e-06, "loss": 0.0713, "num_tokens": 783050776.0, "step": 5990 }, { "epoch": 2.3906624102154828, "grad_norm": 0.21289362013339996, "learning_rate": 9.887654608669408e-06, "loss": 0.0707, "num_tokens": 783181848.0, "step": 5991 }, { "epoch": 2.3910614525139664, "grad_norm": 0.2569212317466736, "learning_rate": 9.881497183654653e-06, "loss": 0.0751, "num_tokens": 783312920.0, "step": 5992 }, { "epoch": 2.39146049481245, "grad_norm": 0.2721082270145416, "learning_rate": 9.875343167655968e-06, "loss": 0.0774, "num_tokens": 783443992.0, "step": 5993 }, { "epoch": 2.3918595371109337, "grad_norm": 0.2746914327144623, "learning_rate": 9.869192561864076e-06, "loss": 0.0832, "num_tokens": 783575064.0, "step": 5994 }, { "epoch": 2.3922585794094173, "grad_norm": 0.2756274938583374, "learning_rate": 9.863045367469071e-06, "loss": 0.1006, "num_tokens": 783706136.0, "step": 5995 }, { "epoch": 2.392657621707901, "grad_norm": 0.23269274830818176, "learning_rate": 9.856901585660382e-06, "loss": 0.0781, "num_tokens": 783837208.0, "step": 5996 }, { "epoch": 2.3930566640063846, "grad_norm": 0.2495374232530594, "learning_rate": 9.850761217626766e-06, "loss": 0.0788, "num_tokens": 783968280.0, "step": 5997 }, { "epoch": 2.3934557063048683, "grad_norm": 0.2722617983818054, "learning_rate": 9.844624264556333e-06, "loss": 0.0873, "num_tokens": 784099352.0, "step": 5998 }, { "epoch": 2.393854748603352, "grad_norm": 0.25174063444137573, "learning_rate": 9.838490727636525e-06, "loss": 0.0837, "num_tokens": 784230424.0, "step": 5999 }, { "epoch": 2.3942537909018355, "grad_norm": 0.25760161876678467, "learning_rate": 9.832360608054122e-06, "loss": 0.0856, "num_tokens": 784355289.0, "step": 6000 }, { "epoch": 2.394652833200319, "grad_norm": 0.27246376872062683, "learning_rate": 9.826233906995243e-06, "loss": 0.0967, "num_tokens": 784486361.0, "step": 6001 }, { "epoch": 2.395051875498803, "grad_norm": 0.27430981397628784, "learning_rate": 9.820110625645347e-06, "loss": 0.0904, "num_tokens": 784617433.0, "step": 6002 }, { "epoch": 2.3954509177972865, "grad_norm": 0.3453753888607025, "learning_rate": 9.813990765189243e-06, "loss": 0.0997, "num_tokens": 784748505.0, "step": 6003 }, { "epoch": 2.39584996009577, "grad_norm": 0.23907718062400818, "learning_rate": 9.807874326811053e-06, "loss": 0.0705, "num_tokens": 784879577.0, "step": 6004 }, { "epoch": 2.3962490023942538, "grad_norm": 0.26934242248535156, "learning_rate": 9.801761311694257e-06, "loss": 0.083, "num_tokens": 785010649.0, "step": 6005 }, { "epoch": 2.3966480446927374, "grad_norm": 0.2686123847961426, "learning_rate": 9.795651721021671e-06, "loss": 0.0848, "num_tokens": 785126320.0, "step": 6006 }, { "epoch": 2.397047086991221, "grad_norm": 0.25789719820022583, "learning_rate": 9.789545555975435e-06, "loss": 0.0723, "num_tokens": 785257392.0, "step": 6007 }, { "epoch": 2.3974461292897047, "grad_norm": 0.29040247201919556, "learning_rate": 9.783442817737039e-06, "loss": 0.0991, "num_tokens": 785388464.0, "step": 6008 }, { "epoch": 2.3978451715881883, "grad_norm": 0.2589728832244873, "learning_rate": 9.77734350748731e-06, "loss": 0.0836, "num_tokens": 785519536.0, "step": 6009 }, { "epoch": 2.398244213886672, "grad_norm": 0.21833862364292145, "learning_rate": 9.771247626406407e-06, "loss": 0.0664, "num_tokens": 785650608.0, "step": 6010 }, { "epoch": 2.3986432561851556, "grad_norm": 0.22149908542633057, "learning_rate": 9.765155175673815e-06, "loss": 0.0636, "num_tokens": 785781680.0, "step": 6011 }, { "epoch": 2.3990422984836393, "grad_norm": 0.22769923508167267, "learning_rate": 9.75906615646838e-06, "loss": 0.0656, "num_tokens": 785912752.0, "step": 6012 }, { "epoch": 2.399441340782123, "grad_norm": 0.2760605216026306, "learning_rate": 9.752980569968258e-06, "loss": 0.1044, "num_tokens": 786043824.0, "step": 6013 }, { "epoch": 2.3998403830806065, "grad_norm": 0.2447952926158905, "learning_rate": 9.74689841735096e-06, "loss": 0.0776, "num_tokens": 786174896.0, "step": 6014 }, { "epoch": 2.40023942537909, "grad_norm": 0.28642553091049194, "learning_rate": 9.740819699793337e-06, "loss": 0.0837, "num_tokens": 786305968.0, "step": 6015 }, { "epoch": 2.400638467677574, "grad_norm": 0.2395893633365631, "learning_rate": 9.734744418471543e-06, "loss": 0.0665, "num_tokens": 786437040.0, "step": 6016 }, { "epoch": 2.4010375099760575, "grad_norm": 0.2818344831466675, "learning_rate": 9.728672574561099e-06, "loss": 0.0859, "num_tokens": 786568112.0, "step": 6017 }, { "epoch": 2.401436552274541, "grad_norm": 0.25815054774284363, "learning_rate": 9.722604169236854e-06, "loss": 0.0831, "num_tokens": 786699184.0, "step": 6018 }, { "epoch": 2.4018355945730248, "grad_norm": 0.24941366910934448, "learning_rate": 9.716539203672981e-06, "loss": 0.0764, "num_tokens": 786830256.0, "step": 6019 }, { "epoch": 2.4022346368715084, "grad_norm": 0.3047906458377838, "learning_rate": 9.710477679042995e-06, "loss": 0.0833, "num_tokens": 786961328.0, "step": 6020 }, { "epoch": 2.402633679169992, "grad_norm": 0.28655269742012024, "learning_rate": 9.704419596519751e-06, "loss": 0.0907, "num_tokens": 787081930.0, "step": 6021 }, { "epoch": 2.4030327214684757, "grad_norm": 0.2126879245042801, "learning_rate": 9.698364957275429e-06, "loss": 0.0607, "num_tokens": 787213002.0, "step": 6022 }, { "epoch": 2.4034317637669593, "grad_norm": 0.24497616291046143, "learning_rate": 9.692313762481535e-06, "loss": 0.0832, "num_tokens": 787344074.0, "step": 6023 }, { "epoch": 2.403830806065443, "grad_norm": 0.305337518453598, "learning_rate": 9.686266013308933e-06, "loss": 0.0904, "num_tokens": 787473674.0, "step": 6024 }, { "epoch": 2.4042298483639266, "grad_norm": 0.2653108239173889, "learning_rate": 9.680221710927797e-06, "loss": 0.091, "num_tokens": 787604746.0, "step": 6025 }, { "epoch": 2.4046288906624103, "grad_norm": 0.25963401794433594, "learning_rate": 9.674180856507643e-06, "loss": 0.0789, "num_tokens": 787735818.0, "step": 6026 }, { "epoch": 2.405027932960894, "grad_norm": 0.2552826404571533, "learning_rate": 9.668143451217332e-06, "loss": 0.0886, "num_tokens": 787866890.0, "step": 6027 }, { "epoch": 2.4054269752593775, "grad_norm": 0.2534872889518738, "learning_rate": 9.662109496225028e-06, "loss": 0.0839, "num_tokens": 787997962.0, "step": 6028 }, { "epoch": 2.405826017557861, "grad_norm": 0.23991107940673828, "learning_rate": 9.656078992698254e-06, "loss": 0.0801, "num_tokens": 788129034.0, "step": 6029 }, { "epoch": 2.406225059856345, "grad_norm": 0.253737211227417, "learning_rate": 9.65005194180386e-06, "loss": 0.0858, "num_tokens": 788260106.0, "step": 6030 }, { "epoch": 2.4066241021548285, "grad_norm": 0.24198178946971893, "learning_rate": 9.64402834470802e-06, "loss": 0.0732, "num_tokens": 788391178.0, "step": 6031 }, { "epoch": 2.407023144453312, "grad_norm": 0.2521749436855316, "learning_rate": 9.638008202576246e-06, "loss": 0.0803, "num_tokens": 788522250.0, "step": 6032 }, { "epoch": 2.4074221867517958, "grad_norm": 0.29590168595314026, "learning_rate": 9.631991516573372e-06, "loss": 0.0747, "num_tokens": 788653322.0, "step": 6033 }, { "epoch": 2.4078212290502794, "grad_norm": 0.257378488779068, "learning_rate": 9.625978287863582e-06, "loss": 0.0787, "num_tokens": 788784394.0, "step": 6034 }, { "epoch": 2.408220271348763, "grad_norm": 0.20451322197914124, "learning_rate": 9.61996851761037e-06, "loss": 0.0544, "num_tokens": 788915466.0, "step": 6035 }, { "epoch": 2.4086193136472467, "grad_norm": 0.24570809304714203, "learning_rate": 9.613962206976569e-06, "loss": 0.0777, "num_tokens": 789046538.0, "step": 6036 }, { "epoch": 2.4090183559457303, "grad_norm": 0.26734068989753723, "learning_rate": 9.60795935712436e-06, "loss": 0.082, "num_tokens": 789177610.0, "step": 6037 }, { "epoch": 2.409417398244214, "grad_norm": 0.29525813460350037, "learning_rate": 9.60195996921522e-06, "loss": 0.0864, "num_tokens": 789308682.0, "step": 6038 }, { "epoch": 2.4098164405426976, "grad_norm": 0.21458423137664795, "learning_rate": 9.595964044409982e-06, "loss": 0.0683, "num_tokens": 789439754.0, "step": 6039 }, { "epoch": 2.4102154828411813, "grad_norm": 0.26334139704704285, "learning_rate": 9.589971583868811e-06, "loss": 0.0859, "num_tokens": 789570826.0, "step": 6040 }, { "epoch": 2.410614525139665, "grad_norm": 0.2727080285549164, "learning_rate": 9.583982588751176e-06, "loss": 0.0832, "num_tokens": 789701898.0, "step": 6041 }, { "epoch": 2.4110135674381485, "grad_norm": 0.2797369956970215, "learning_rate": 9.577997060215898e-06, "loss": 0.1002, "num_tokens": 789832970.0, "step": 6042 }, { "epoch": 2.411412609736632, "grad_norm": 0.24110008776187897, "learning_rate": 9.572014999421131e-06, "loss": 0.0776, "num_tokens": 789955224.0, "step": 6043 }, { "epoch": 2.411811652035116, "grad_norm": 0.2553565204143524, "learning_rate": 9.566036407524335e-06, "loss": 0.0706, "num_tokens": 790086296.0, "step": 6044 }, { "epoch": 2.4122106943335995, "grad_norm": 0.24299737811088562, "learning_rate": 9.560061285682312e-06, "loss": 0.0746, "num_tokens": 790217368.0, "step": 6045 }, { "epoch": 2.412609736632083, "grad_norm": 0.2880520522594452, "learning_rate": 9.554089635051203e-06, "loss": 0.0919, "num_tokens": 790341414.0, "step": 6046 }, { "epoch": 2.4130087789305668, "grad_norm": 0.2885577976703644, "learning_rate": 9.548121456786452e-06, "loss": 0.0906, "num_tokens": 790472486.0, "step": 6047 }, { "epoch": 2.4134078212290504, "grad_norm": 0.24222710728645325, "learning_rate": 9.542156752042854e-06, "loss": 0.0793, "num_tokens": 790603558.0, "step": 6048 }, { "epoch": 2.413806863527534, "grad_norm": 0.2840733528137207, "learning_rate": 9.536195521974531e-06, "loss": 0.0842, "num_tokens": 790734630.0, "step": 6049 }, { "epoch": 2.4142059058260177, "grad_norm": 0.24073426425457, "learning_rate": 9.530237767734912e-06, "loss": 0.0702, "num_tokens": 790865702.0, "step": 6050 }, { "epoch": 2.4146049481245013, "grad_norm": 0.23214887082576752, "learning_rate": 9.52428349047677e-06, "loss": 0.0726, "num_tokens": 790996774.0, "step": 6051 }, { "epoch": 2.415003990422985, "grad_norm": 0.26394638419151306, "learning_rate": 9.518332691352214e-06, "loss": 0.0856, "num_tokens": 791127846.0, "step": 6052 }, { "epoch": 2.4154030327214686, "grad_norm": 0.2346683293581009, "learning_rate": 9.512385371512655e-06, "loss": 0.0776, "num_tokens": 791258918.0, "step": 6053 }, { "epoch": 2.4158020750199523, "grad_norm": 0.2513706684112549, "learning_rate": 9.506441532108843e-06, "loss": 0.0746, "num_tokens": 791389990.0, "step": 6054 }, { "epoch": 2.416201117318436, "grad_norm": 0.3184030055999756, "learning_rate": 9.500501174290872e-06, "loss": 0.1016, "num_tokens": 791521062.0, "step": 6055 }, { "epoch": 2.4166001596169195, "grad_norm": 0.2789475619792938, "learning_rate": 9.494564299208131e-06, "loss": 0.0913, "num_tokens": 791652134.0, "step": 6056 }, { "epoch": 2.416999201915403, "grad_norm": 0.25250694155693054, "learning_rate": 9.48863090800935e-06, "loss": 0.0871, "num_tokens": 791783206.0, "step": 6057 }, { "epoch": 2.417398244213887, "grad_norm": 0.25873836874961853, "learning_rate": 9.482701001842587e-06, "loss": 0.0795, "num_tokens": 791914278.0, "step": 6058 }, { "epoch": 2.4177972865123705, "grad_norm": 0.22162695229053497, "learning_rate": 9.476774581855232e-06, "loss": 0.066, "num_tokens": 792045350.0, "step": 6059 }, { "epoch": 2.418196328810854, "grad_norm": 0.22034412622451782, "learning_rate": 9.470851649193978e-06, "loss": 0.0639, "num_tokens": 792176422.0, "step": 6060 }, { "epoch": 2.4185953711093378, "grad_norm": 0.23994380235671997, "learning_rate": 9.464932205004864e-06, "loss": 0.0751, "num_tokens": 792307494.0, "step": 6061 }, { "epoch": 2.4189944134078214, "grad_norm": 0.24787339568138123, "learning_rate": 9.459016250433256e-06, "loss": 0.0723, "num_tokens": 792438566.0, "step": 6062 }, { "epoch": 2.419393455706305, "grad_norm": 0.2667214572429657, "learning_rate": 9.453103786623819e-06, "loss": 0.0888, "num_tokens": 792569638.0, "step": 6063 }, { "epoch": 2.4197924980047887, "grad_norm": 0.2637821435928345, "learning_rate": 9.447194814720567e-06, "loss": 0.0846, "num_tokens": 792700710.0, "step": 6064 }, { "epoch": 2.4201915403032723, "grad_norm": 0.2928113639354706, "learning_rate": 9.44128933586684e-06, "loss": 0.0745, "num_tokens": 792827920.0, "step": 6065 }, { "epoch": 2.420590582601756, "grad_norm": 0.25857529044151306, "learning_rate": 9.435387351205288e-06, "loss": 0.0937, "num_tokens": 792958992.0, "step": 6066 }, { "epoch": 2.4209896249002396, "grad_norm": 0.24573972821235657, "learning_rate": 9.429488861877878e-06, "loss": 0.0783, "num_tokens": 793090064.0, "step": 6067 }, { "epoch": 2.4213886671987233, "grad_norm": 0.30438539385795593, "learning_rate": 9.42359386902593e-06, "loss": 0.0961, "num_tokens": 793221136.0, "step": 6068 }, { "epoch": 2.421787709497207, "grad_norm": 0.21002653241157532, "learning_rate": 9.417702373790054e-06, "loss": 0.0567, "num_tokens": 793352208.0, "step": 6069 }, { "epoch": 2.4221867517956905, "grad_norm": 0.2541816830635071, "learning_rate": 9.411814377310208e-06, "loss": 0.0807, "num_tokens": 793483280.0, "step": 6070 }, { "epoch": 2.422585794094174, "grad_norm": 0.30711862444877625, "learning_rate": 9.405929880725675e-06, "loss": 0.1118, "num_tokens": 793614352.0, "step": 6071 }, { "epoch": 2.422984836392658, "grad_norm": 0.24663059413433075, "learning_rate": 9.400048885175028e-06, "loss": 0.0721, "num_tokens": 793745424.0, "step": 6072 }, { "epoch": 2.4233838786911415, "grad_norm": 0.2371881604194641, "learning_rate": 9.394171391796197e-06, "loss": 0.0651, "num_tokens": 793876496.0, "step": 6073 }, { "epoch": 2.423782920989625, "grad_norm": 0.2588050663471222, "learning_rate": 9.388297401726429e-06, "loss": 0.0819, "num_tokens": 794007568.0, "step": 6074 }, { "epoch": 2.4241819632881088, "grad_norm": 0.25343844294548035, "learning_rate": 9.382426916102272e-06, "loss": 0.0804, "num_tokens": 794138640.0, "step": 6075 }, { "epoch": 2.4245810055865924, "grad_norm": 0.22541484236717224, "learning_rate": 9.376559936059614e-06, "loss": 0.0735, "num_tokens": 794269712.0, "step": 6076 }, { "epoch": 2.424980047885076, "grad_norm": 0.29124024510383606, "learning_rate": 9.370696462733671e-06, "loss": 0.0952, "num_tokens": 794400784.0, "step": 6077 }, { "epoch": 2.4253790901835597, "grad_norm": 0.283490389585495, "learning_rate": 9.364836497258966e-06, "loss": 0.0939, "num_tokens": 794531856.0, "step": 6078 }, { "epoch": 2.4257781324820433, "grad_norm": 0.24468128383159637, "learning_rate": 9.358980040769337e-06, "loss": 0.087, "num_tokens": 794662928.0, "step": 6079 }, { "epoch": 2.426177174780527, "grad_norm": 0.2326081395149231, "learning_rate": 9.353127094397962e-06, "loss": 0.0695, "num_tokens": 794794000.0, "step": 6080 }, { "epoch": 2.4265762170790106, "grad_norm": 0.20415982604026794, "learning_rate": 9.347277659277337e-06, "loss": 0.0631, "num_tokens": 794925072.0, "step": 6081 }, { "epoch": 2.4269752593774943, "grad_norm": 0.2514685094356537, "learning_rate": 9.341431736539261e-06, "loss": 0.0827, "num_tokens": 795056144.0, "step": 6082 }, { "epoch": 2.427374301675978, "grad_norm": 0.27839216589927673, "learning_rate": 9.335589327314873e-06, "loss": 0.0867, "num_tokens": 795187216.0, "step": 6083 }, { "epoch": 2.4277733439744615, "grad_norm": 0.2554825246334076, "learning_rate": 9.32975043273463e-06, "loss": 0.0785, "num_tokens": 795318288.0, "step": 6084 }, { "epoch": 2.4281723862729447, "grad_norm": 0.2890157103538513, "learning_rate": 9.323915053928292e-06, "loss": 0.1025, "num_tokens": 795449360.0, "step": 6085 }, { "epoch": 2.4285714285714284, "grad_norm": 0.22712086141109467, "learning_rate": 9.318083192024958e-06, "loss": 0.0632, "num_tokens": 795580432.0, "step": 6086 }, { "epoch": 2.428970470869912, "grad_norm": 0.270965576171875, "learning_rate": 9.312254848153031e-06, "loss": 0.0852, "num_tokens": 795711504.0, "step": 6087 }, { "epoch": 2.4293695131683957, "grad_norm": 0.30321359634399414, "learning_rate": 9.306430023440251e-06, "loss": 0.0893, "num_tokens": 795842576.0, "step": 6088 }, { "epoch": 2.4297685554668793, "grad_norm": 0.2750170826911926, "learning_rate": 9.300608719013667e-06, "loss": 0.0759, "num_tokens": 795973648.0, "step": 6089 }, { "epoch": 2.430167597765363, "grad_norm": 0.24477618932724, "learning_rate": 9.294790935999646e-06, "loss": 0.0849, "num_tokens": 796099937.0, "step": 6090 }, { "epoch": 2.4305666400638466, "grad_norm": 0.260320782661438, "learning_rate": 9.28897667552386e-06, "loss": 0.0863, "num_tokens": 796231009.0, "step": 6091 }, { "epoch": 2.4309656823623302, "grad_norm": 0.3064257800579071, "learning_rate": 9.283165938711332e-06, "loss": 0.095, "num_tokens": 796362081.0, "step": 6092 }, { "epoch": 2.431364724660814, "grad_norm": 0.26524603366851807, "learning_rate": 9.277358726686383e-06, "loss": 0.0923, "num_tokens": 796493153.0, "step": 6093 }, { "epoch": 2.4317637669592975, "grad_norm": 0.30879679322242737, "learning_rate": 9.271555040572646e-06, "loss": 0.0974, "num_tokens": 796624225.0, "step": 6094 }, { "epoch": 2.432162809257781, "grad_norm": 0.24068328738212585, "learning_rate": 9.265754881493083e-06, "loss": 0.0735, "num_tokens": 796755297.0, "step": 6095 }, { "epoch": 2.432561851556265, "grad_norm": 0.2467721402645111, "learning_rate": 9.259958250569984e-06, "loss": 0.0727, "num_tokens": 796886369.0, "step": 6096 }, { "epoch": 2.4329608938547485, "grad_norm": 0.280124694108963, "learning_rate": 9.254165148924922e-06, "loss": 0.0864, "num_tokens": 797001202.0, "step": 6097 }, { "epoch": 2.433359936153232, "grad_norm": 0.2540997862815857, "learning_rate": 9.248375577678817e-06, "loss": 0.0826, "num_tokens": 797132274.0, "step": 6098 }, { "epoch": 2.4337589784517157, "grad_norm": 0.24853679537773132, "learning_rate": 9.242589537951907e-06, "loss": 0.0828, "num_tokens": 797263346.0, "step": 6099 }, { "epoch": 2.4341580207501994, "grad_norm": 0.22589464485645294, "learning_rate": 9.236807030863728e-06, "loss": 0.0805, "num_tokens": 797394418.0, "step": 6100 }, { "epoch": 2.434557063048683, "grad_norm": 0.2243574559688568, "learning_rate": 9.231028057533134e-06, "loss": 0.0603, "num_tokens": 797525490.0, "step": 6101 }, { "epoch": 2.4349561053471667, "grad_norm": 0.2729145884513855, "learning_rate": 9.225252619078314e-06, "loss": 0.0838, "num_tokens": 797656562.0, "step": 6102 }, { "epoch": 2.4353551476456503, "grad_norm": 0.2727215886116028, "learning_rate": 9.219480716616754e-06, "loss": 0.0879, "num_tokens": 797787634.0, "step": 6103 }, { "epoch": 2.435754189944134, "grad_norm": 0.24720853567123413, "learning_rate": 9.213712351265269e-06, "loss": 0.0921, "num_tokens": 797918706.0, "step": 6104 }, { "epoch": 2.4361532322426176, "grad_norm": 0.264274537563324, "learning_rate": 9.207947524139984e-06, "loss": 0.0864, "num_tokens": 798049778.0, "step": 6105 }, { "epoch": 2.4365522745411012, "grad_norm": 0.2455579936504364, "learning_rate": 9.202186236356331e-06, "loss": 0.0739, "num_tokens": 798180850.0, "step": 6106 }, { "epoch": 2.436951316839585, "grad_norm": 0.2595941424369812, "learning_rate": 9.19642848902907e-06, "loss": 0.0825, "num_tokens": 798297435.0, "step": 6107 }, { "epoch": 2.4373503591380685, "grad_norm": 0.24609199166297913, "learning_rate": 9.19067428327228e-06, "loss": 0.0691, "num_tokens": 798428507.0, "step": 6108 }, { "epoch": 2.437749401436552, "grad_norm": 0.30669230222702026, "learning_rate": 9.184923620199334e-06, "loss": 0.0903, "num_tokens": 798559579.0, "step": 6109 }, { "epoch": 2.438148443735036, "grad_norm": 0.27655622363090515, "learning_rate": 9.179176500922931e-06, "loss": 0.0836, "num_tokens": 798690651.0, "step": 6110 }, { "epoch": 2.4385474860335195, "grad_norm": 0.23051193356513977, "learning_rate": 9.173432926555103e-06, "loss": 0.0725, "num_tokens": 798821723.0, "step": 6111 }, { "epoch": 2.438946528332003, "grad_norm": 0.2527804374694824, "learning_rate": 9.167692898207161e-06, "loss": 0.0736, "num_tokens": 798952795.0, "step": 6112 }, { "epoch": 2.4393455706304867, "grad_norm": 0.26793763041496277, "learning_rate": 9.16195641698975e-06, "loss": 0.0746, "num_tokens": 799083867.0, "step": 6113 }, { "epoch": 2.4397446129289704, "grad_norm": 0.292929083108902, "learning_rate": 9.156223484012824e-06, "loss": 0.0972, "num_tokens": 799214939.0, "step": 6114 }, { "epoch": 2.440143655227454, "grad_norm": 0.2483973503112793, "learning_rate": 9.150494100385664e-06, "loss": 0.0709, "num_tokens": 799346011.0, "step": 6115 }, { "epoch": 2.4405426975259377, "grad_norm": 0.2765411138534546, "learning_rate": 9.144768267216835e-06, "loss": 0.0883, "num_tokens": 799477083.0, "step": 6116 }, { "epoch": 2.4409417398244213, "grad_norm": 0.2624064087867737, "learning_rate": 9.139045985614242e-06, "loss": 0.0857, "num_tokens": 799608155.0, "step": 6117 }, { "epoch": 2.441340782122905, "grad_norm": 0.28134819865226746, "learning_rate": 9.1333272566851e-06, "loss": 0.0984, "num_tokens": 799739227.0, "step": 6118 }, { "epoch": 2.4417398244213886, "grad_norm": 0.255049467086792, "learning_rate": 9.127612081535913e-06, "loss": 0.092, "num_tokens": 799870299.0, "step": 6119 }, { "epoch": 2.4421388667198722, "grad_norm": 0.21745628118515015, "learning_rate": 9.121900461272523e-06, "loss": 0.0718, "num_tokens": 800001371.0, "step": 6120 }, { "epoch": 2.442537909018356, "grad_norm": 0.2480260580778122, "learning_rate": 9.116192397000084e-06, "loss": 0.0752, "num_tokens": 800132443.0, "step": 6121 }, { "epoch": 2.4429369513168395, "grad_norm": 0.3117411136627197, "learning_rate": 9.110487889823039e-06, "loss": 0.1028, "num_tokens": 800263515.0, "step": 6122 }, { "epoch": 2.443335993615323, "grad_norm": 0.24121195077896118, "learning_rate": 9.104786940845165e-06, "loss": 0.0726, "num_tokens": 800394587.0, "step": 6123 }, { "epoch": 2.443735035913807, "grad_norm": 0.24566689133644104, "learning_rate": 9.09908955116954e-06, "loss": 0.09, "num_tokens": 800525659.0, "step": 6124 }, { "epoch": 2.4441340782122905, "grad_norm": 0.2633350193500519, "learning_rate": 9.09339572189855e-06, "loss": 0.0881, "num_tokens": 800656731.0, "step": 6125 }, { "epoch": 2.444533120510774, "grad_norm": 0.22906675934791565, "learning_rate": 9.087705454133906e-06, "loss": 0.0742, "num_tokens": 800787803.0, "step": 6126 }, { "epoch": 2.4449321628092577, "grad_norm": 0.2833649516105652, "learning_rate": 9.082018748976625e-06, "loss": 0.0859, "num_tokens": 800918875.0, "step": 6127 }, { "epoch": 2.4453312051077414, "grad_norm": 0.28228187561035156, "learning_rate": 9.076335607527018e-06, "loss": 0.0882, "num_tokens": 801049947.0, "step": 6128 }, { "epoch": 2.445730247406225, "grad_norm": 0.20745590329170227, "learning_rate": 9.07065603088473e-06, "loss": 0.0592, "num_tokens": 801181019.0, "step": 6129 }, { "epoch": 2.4461292897047087, "grad_norm": 0.26851752400398254, "learning_rate": 9.064980020148709e-06, "loss": 0.0842, "num_tokens": 801312091.0, "step": 6130 }, { "epoch": 2.4465283320031923, "grad_norm": 0.28376930952072144, "learning_rate": 9.059307576417199e-06, "loss": 0.0955, "num_tokens": 801443163.0, "step": 6131 }, { "epoch": 2.446927374301676, "grad_norm": 0.22610433399677277, "learning_rate": 9.05363870078777e-06, "loss": 0.0752, "num_tokens": 801574235.0, "step": 6132 }, { "epoch": 2.4473264166001596, "grad_norm": 0.253171443939209, "learning_rate": 9.047973394357305e-06, "loss": 0.0768, "num_tokens": 801705307.0, "step": 6133 }, { "epoch": 2.4477254588986432, "grad_norm": 0.2246374636888504, "learning_rate": 9.04231165822198e-06, "loss": 0.0717, "num_tokens": 801836379.0, "step": 6134 }, { "epoch": 2.448124501197127, "grad_norm": 0.24468642473220825, "learning_rate": 9.036653493477281e-06, "loss": 0.0755, "num_tokens": 801967451.0, "step": 6135 }, { "epoch": 2.4485235434956105, "grad_norm": 0.24258452653884888, "learning_rate": 9.03099890121802e-06, "loss": 0.0811, "num_tokens": 802098523.0, "step": 6136 }, { "epoch": 2.448922585794094, "grad_norm": 0.17774371802806854, "learning_rate": 9.025347882538312e-06, "loss": 0.046, "num_tokens": 802229595.0, "step": 6137 }, { "epoch": 2.449321628092578, "grad_norm": 0.24982382357120514, "learning_rate": 9.019700438531562e-06, "loss": 0.0848, "num_tokens": 802360667.0, "step": 6138 }, { "epoch": 2.4497206703910615, "grad_norm": 0.29078468680381775, "learning_rate": 9.014056570290507e-06, "loss": 0.1013, "num_tokens": 802491739.0, "step": 6139 }, { "epoch": 2.450119712689545, "grad_norm": 0.323041707277298, "learning_rate": 9.00841627890719e-06, "loss": 0.0838, "num_tokens": 802622811.0, "step": 6140 }, { "epoch": 2.4505187549880287, "grad_norm": 0.37260305881500244, "learning_rate": 9.00277956547294e-06, "loss": 0.1136, "num_tokens": 802753883.0, "step": 6141 }, { "epoch": 2.4509177972865124, "grad_norm": 0.2782590687274933, "learning_rate": 8.997146431078413e-06, "loss": 0.0982, "num_tokens": 802884955.0, "step": 6142 }, { "epoch": 2.451316839584996, "grad_norm": 0.24416477978229523, "learning_rate": 8.991516876813578e-06, "loss": 0.073, "num_tokens": 803016027.0, "step": 6143 }, { "epoch": 2.4517158818834797, "grad_norm": 0.24879950284957886, "learning_rate": 8.985890903767688e-06, "loss": 0.0737, "num_tokens": 803147099.0, "step": 6144 }, { "epoch": 2.4521149241819633, "grad_norm": 0.2344399243593216, "learning_rate": 8.980268513029325e-06, "loss": 0.0713, "num_tokens": 803278171.0, "step": 6145 }, { "epoch": 2.452513966480447, "grad_norm": 0.29257676005363464, "learning_rate": 8.974649705686367e-06, "loss": 0.0937, "num_tokens": 803409243.0, "step": 6146 }, { "epoch": 2.4529130087789306, "grad_norm": 0.28694334626197815, "learning_rate": 8.969034482825996e-06, "loss": 0.0903, "num_tokens": 803540315.0, "step": 6147 }, { "epoch": 2.4533120510774142, "grad_norm": 0.22861291468143463, "learning_rate": 8.963422845534705e-06, "loss": 0.066, "num_tokens": 803671387.0, "step": 6148 }, { "epoch": 2.453711093375898, "grad_norm": 0.2614575922489166, "learning_rate": 8.957814794898309e-06, "loss": 0.0908, "num_tokens": 803802459.0, "step": 6149 }, { "epoch": 2.4541101356743815, "grad_norm": 0.2541465759277344, "learning_rate": 8.952210332001892e-06, "loss": 0.0853, "num_tokens": 803933531.0, "step": 6150 }, { "epoch": 2.454509177972865, "grad_norm": 0.25579941272735596, "learning_rate": 8.946609457929877e-06, "loss": 0.0761, "num_tokens": 804064603.0, "step": 6151 }, { "epoch": 2.454908220271349, "grad_norm": 0.23417827486991882, "learning_rate": 8.941012173765984e-06, "loss": 0.0659, "num_tokens": 804195675.0, "step": 6152 }, { "epoch": 2.4553072625698324, "grad_norm": 0.24074891209602356, "learning_rate": 8.935418480593226e-06, "loss": 0.0795, "num_tokens": 804326747.0, "step": 6153 }, { "epoch": 2.455706304868316, "grad_norm": 0.3005199432373047, "learning_rate": 8.929828379493936e-06, "loss": 0.0982, "num_tokens": 804457819.0, "step": 6154 }, { "epoch": 2.4561053471667997, "grad_norm": 0.20844995975494385, "learning_rate": 8.924241871549752e-06, "loss": 0.0605, "num_tokens": 804588891.0, "step": 6155 }, { "epoch": 2.4565043894652834, "grad_norm": 0.24184614419937134, "learning_rate": 8.918658957841597e-06, "loss": 0.0744, "num_tokens": 804719963.0, "step": 6156 }, { "epoch": 2.456903431763767, "grad_norm": 0.23946553468704224, "learning_rate": 8.913079639449726e-06, "loss": 0.0669, "num_tokens": 804851035.0, "step": 6157 }, { "epoch": 2.4573024740622507, "grad_norm": 0.308823823928833, "learning_rate": 8.907503917453678e-06, "loss": 0.089, "num_tokens": 804982107.0, "step": 6158 }, { "epoch": 2.4577015163607343, "grad_norm": 0.19988323748111725, "learning_rate": 8.901931792932306e-06, "loss": 0.0601, "num_tokens": 805113179.0, "step": 6159 }, { "epoch": 2.458100558659218, "grad_norm": 0.21727165579795837, "learning_rate": 8.896363266963759e-06, "loss": 0.0652, "num_tokens": 805244251.0, "step": 6160 }, { "epoch": 2.4584996009577016, "grad_norm": 0.25095370411872864, "learning_rate": 8.890798340625498e-06, "loss": 0.0794, "num_tokens": 805375323.0, "step": 6161 }, { "epoch": 2.4588986432561852, "grad_norm": 0.21988432109355927, "learning_rate": 8.885237014994293e-06, "loss": 0.0671, "num_tokens": 805506395.0, "step": 6162 }, { "epoch": 2.459297685554669, "grad_norm": 0.26140710711479187, "learning_rate": 8.879679291146193e-06, "loss": 0.0829, "num_tokens": 805637467.0, "step": 6163 }, { "epoch": 2.4596967278531525, "grad_norm": 0.22264164686203003, "learning_rate": 8.87412517015658e-06, "loss": 0.0632, "num_tokens": 805762515.0, "step": 6164 }, { "epoch": 2.460095770151636, "grad_norm": 0.2976977229118347, "learning_rate": 8.868574653100115e-06, "loss": 0.0987, "num_tokens": 805893587.0, "step": 6165 }, { "epoch": 2.46049481245012, "grad_norm": 0.23200681805610657, "learning_rate": 8.863027741050772e-06, "loss": 0.0674, "num_tokens": 806024659.0, "step": 6166 }, { "epoch": 2.4608938547486034, "grad_norm": 0.28028640151023865, "learning_rate": 8.857484435081838e-06, "loss": 0.0888, "num_tokens": 806155731.0, "step": 6167 }, { "epoch": 2.461292897047087, "grad_norm": 0.2453768253326416, "learning_rate": 8.851944736265879e-06, "loss": 0.0811, "num_tokens": 806286803.0, "step": 6168 }, { "epoch": 2.4616919393455707, "grad_norm": 0.25423434376716614, "learning_rate": 8.846408645674775e-06, "loss": 0.0859, "num_tokens": 806417875.0, "step": 6169 }, { "epoch": 2.4620909816440544, "grad_norm": 0.2817750573158264, "learning_rate": 8.840876164379711e-06, "loss": 0.0619, "num_tokens": 806548947.0, "step": 6170 }, { "epoch": 2.462490023942538, "grad_norm": 0.19413048028945923, "learning_rate": 8.835347293451177e-06, "loss": 0.0471, "num_tokens": 806680019.0, "step": 6171 }, { "epoch": 2.4628890662410217, "grad_norm": 0.27701136469841003, "learning_rate": 8.829822033958945e-06, "loss": 0.0908, "num_tokens": 806811091.0, "step": 6172 }, { "epoch": 2.4632881085395053, "grad_norm": 0.2590743601322174, "learning_rate": 8.824300386972112e-06, "loss": 0.0795, "num_tokens": 806942163.0, "step": 6173 }, { "epoch": 2.463687150837989, "grad_norm": 0.22856676578521729, "learning_rate": 8.818782353559064e-06, "loss": 0.0635, "num_tokens": 807073235.0, "step": 6174 }, { "epoch": 2.4640861931364726, "grad_norm": 0.28586870431900024, "learning_rate": 8.81326793478748e-06, "loss": 0.0792, "num_tokens": 807204307.0, "step": 6175 }, { "epoch": 2.4644852354349562, "grad_norm": 0.22065889835357666, "learning_rate": 8.807757131724357e-06, "loss": 0.0718, "num_tokens": 807335379.0, "step": 6176 }, { "epoch": 2.46488427773344, "grad_norm": 0.2058670073747635, "learning_rate": 8.802249945435986e-06, "loss": 0.0607, "num_tokens": 807466451.0, "step": 6177 }, { "epoch": 2.4652833200319235, "grad_norm": 0.22874023020267487, "learning_rate": 8.796746376987952e-06, "loss": 0.0774, "num_tokens": 807597523.0, "step": 6178 }, { "epoch": 2.465682362330407, "grad_norm": 0.24838143587112427, "learning_rate": 8.791246427445144e-06, "loss": 0.0747, "num_tokens": 807728595.0, "step": 6179 }, { "epoch": 2.466081404628891, "grad_norm": 0.23307166993618011, "learning_rate": 8.785750097871753e-06, "loss": 0.0777, "num_tokens": 807859667.0, "step": 6180 }, { "epoch": 2.4664804469273744, "grad_norm": 0.24164621531963348, "learning_rate": 8.780257389331261e-06, "loss": 0.0734, "num_tokens": 807990739.0, "step": 6181 }, { "epoch": 2.466879489225858, "grad_norm": 0.259752482175827, "learning_rate": 8.77476830288646e-06, "loss": 0.0679, "num_tokens": 808121811.0, "step": 6182 }, { "epoch": 2.4672785315243417, "grad_norm": 0.22938594222068787, "learning_rate": 8.769282839599447e-06, "loss": 0.0775, "num_tokens": 808252883.0, "step": 6183 }, { "epoch": 2.4676775738228254, "grad_norm": 0.25470179319381714, "learning_rate": 8.763801000531591e-06, "loss": 0.069, "num_tokens": 808383955.0, "step": 6184 }, { "epoch": 2.468076616121309, "grad_norm": 0.2180585116147995, "learning_rate": 8.758322786743584e-06, "loss": 0.0587, "num_tokens": 808515027.0, "step": 6185 }, { "epoch": 2.4684756584197927, "grad_norm": 0.22259747982025146, "learning_rate": 8.75284819929542e-06, "loss": 0.0685, "num_tokens": 808646099.0, "step": 6186 }, { "epoch": 2.4688747007182763, "grad_norm": 0.2718345820903778, "learning_rate": 8.747377239246359e-06, "loss": 0.08, "num_tokens": 808777171.0, "step": 6187 }, { "epoch": 2.46927374301676, "grad_norm": 0.28169894218444824, "learning_rate": 8.741909907654997e-06, "loss": 0.0853, "num_tokens": 808908243.0, "step": 6188 }, { "epoch": 2.4696727853152436, "grad_norm": 0.2332223355770111, "learning_rate": 8.736446205579208e-06, "loss": 0.0642, "num_tokens": 809039315.0, "step": 6189 }, { "epoch": 2.4700718276137272, "grad_norm": 0.226493239402771, "learning_rate": 8.730986134076166e-06, "loss": 0.0735, "num_tokens": 809170387.0, "step": 6190 }, { "epoch": 2.470470869912211, "grad_norm": 0.2583167850971222, "learning_rate": 8.725529694202348e-06, "loss": 0.0737, "num_tokens": 809301459.0, "step": 6191 }, { "epoch": 2.4708699122106945, "grad_norm": 0.28291791677474976, "learning_rate": 8.720076887013515e-06, "loss": 0.0862, "num_tokens": 809432531.0, "step": 6192 }, { "epoch": 2.471268954509178, "grad_norm": 0.2765946388244629, "learning_rate": 8.714627713564746e-06, "loss": 0.0773, "num_tokens": 809563603.0, "step": 6193 }, { "epoch": 2.471667996807662, "grad_norm": 0.2577129900455475, "learning_rate": 8.709182174910393e-06, "loss": 0.0702, "num_tokens": 809694675.0, "step": 6194 }, { "epoch": 2.472067039106145, "grad_norm": 0.23008394241333008, "learning_rate": 8.703740272104124e-06, "loss": 0.0669, "num_tokens": 809825747.0, "step": 6195 }, { "epoch": 2.4724660814046286, "grad_norm": 0.24526743590831757, "learning_rate": 8.698302006198902e-06, "loss": 0.0814, "num_tokens": 809952253.0, "step": 6196 }, { "epoch": 2.4728651237031123, "grad_norm": 0.22950401902198792, "learning_rate": 8.69286737824697e-06, "loss": 0.0698, "num_tokens": 810083325.0, "step": 6197 }, { "epoch": 2.473264166001596, "grad_norm": 0.248492032289505, "learning_rate": 8.68743638929988e-06, "loss": 0.0788, "num_tokens": 810214397.0, "step": 6198 }, { "epoch": 2.4736632083000796, "grad_norm": 0.22054438292980194, "learning_rate": 8.682009040408487e-06, "loss": 0.0576, "num_tokens": 810345469.0, "step": 6199 }, { "epoch": 2.474062250598563, "grad_norm": 0.24658942222595215, "learning_rate": 8.676585332622921e-06, "loss": 0.0702, "num_tokens": 810476541.0, "step": 6200 }, { "epoch": 2.474461292897047, "grad_norm": 0.2756114900112152, "learning_rate": 8.67116526699262e-06, "loss": 0.0848, "num_tokens": 810607613.0, "step": 6201 }, { "epoch": 2.4748603351955305, "grad_norm": 0.21703456342220306, "learning_rate": 8.665748844566335e-06, "loss": 0.0644, "num_tokens": 810738685.0, "step": 6202 }, { "epoch": 2.475259377494014, "grad_norm": 0.24150654673576355, "learning_rate": 8.660336066392065e-06, "loss": 0.0736, "num_tokens": 810869757.0, "step": 6203 }, { "epoch": 2.475658419792498, "grad_norm": 0.2575679421424866, "learning_rate": 8.654926933517146e-06, "loss": 0.0749, "num_tokens": 811000829.0, "step": 6204 }, { "epoch": 2.4760574620909814, "grad_norm": 0.25503280758857727, "learning_rate": 8.649521446988195e-06, "loss": 0.0755, "num_tokens": 811131901.0, "step": 6205 }, { "epoch": 2.476456504389465, "grad_norm": 0.26518815755844116, "learning_rate": 8.64411960785112e-06, "loss": 0.0693, "num_tokens": 811262973.0, "step": 6206 }, { "epoch": 2.4768555466879487, "grad_norm": 0.25018200278282166, "learning_rate": 8.63872141715113e-06, "loss": 0.0885, "num_tokens": 811394045.0, "step": 6207 }, { "epoch": 2.4772545889864324, "grad_norm": 0.3070366680622101, "learning_rate": 8.633326875932726e-06, "loss": 0.0884, "num_tokens": 811518912.0, "step": 6208 }, { "epoch": 2.477653631284916, "grad_norm": 0.2699578106403351, "learning_rate": 8.627935985239695e-06, "loss": 0.1035, "num_tokens": 811649984.0, "step": 6209 }, { "epoch": 2.4780526735833996, "grad_norm": 0.25582918524742126, "learning_rate": 8.622548746115128e-06, "loss": 0.0777, "num_tokens": 811777044.0, "step": 6210 }, { "epoch": 2.4784517158818833, "grad_norm": 0.25283128023147583, "learning_rate": 8.617165159601411e-06, "loss": 0.0738, "num_tokens": 811908116.0, "step": 6211 }, { "epoch": 2.478850758180367, "grad_norm": 0.24548982083797455, "learning_rate": 8.611785226740205e-06, "loss": 0.0694, "num_tokens": 812039188.0, "step": 6212 }, { "epoch": 2.4792498004788506, "grad_norm": 0.3026086091995239, "learning_rate": 8.606408948572489e-06, "loss": 0.0922, "num_tokens": 812170260.0, "step": 6213 }, { "epoch": 2.479648842777334, "grad_norm": 0.32009240984916687, "learning_rate": 8.601036326138511e-06, "loss": 0.0976, "num_tokens": 812301332.0, "step": 6214 }, { "epoch": 2.480047885075818, "grad_norm": 0.28900375962257385, "learning_rate": 8.595667360477842e-06, "loss": 0.0754, "num_tokens": 812416657.0, "step": 6215 }, { "epoch": 2.4804469273743015, "grad_norm": 0.2562401294708252, "learning_rate": 8.590302052629301e-06, "loss": 0.0894, "num_tokens": 812547729.0, "step": 6216 }, { "epoch": 2.480845969672785, "grad_norm": 0.3116680085659027, "learning_rate": 8.584940403631043e-06, "loss": 0.0934, "num_tokens": 812678801.0, "step": 6217 }, { "epoch": 2.481245011971269, "grad_norm": 0.28610846400260925, "learning_rate": 8.579582414520502e-06, "loss": 0.0998, "num_tokens": 812809873.0, "step": 6218 }, { "epoch": 2.4816440542697524, "grad_norm": 0.2452412098646164, "learning_rate": 8.574228086334379e-06, "loss": 0.0772, "num_tokens": 812940945.0, "step": 6219 }, { "epoch": 2.482043096568236, "grad_norm": 0.25839918851852417, "learning_rate": 8.568877420108698e-06, "loss": 0.0825, "num_tokens": 813072017.0, "step": 6220 }, { "epoch": 2.4824421388667197, "grad_norm": 0.23226645588874817, "learning_rate": 8.56353041687877e-06, "loss": 0.072, "num_tokens": 813203089.0, "step": 6221 }, { "epoch": 2.4828411811652034, "grad_norm": 0.228684201836586, "learning_rate": 8.558187077679176e-06, "loss": 0.0699, "num_tokens": 813334161.0, "step": 6222 }, { "epoch": 2.483240223463687, "grad_norm": 0.22601328790187836, "learning_rate": 8.55284740354381e-06, "loss": 0.0723, "num_tokens": 813465233.0, "step": 6223 }, { "epoch": 2.4836392657621706, "grad_norm": 0.2558053135871887, "learning_rate": 8.547511395505852e-06, "loss": 0.0949, "num_tokens": 813596305.0, "step": 6224 }, { "epoch": 2.4840383080606543, "grad_norm": 0.2446179986000061, "learning_rate": 8.542179054597769e-06, "loss": 0.0796, "num_tokens": 813727377.0, "step": 6225 }, { "epoch": 2.484437350359138, "grad_norm": 0.2087671160697937, "learning_rate": 8.536850381851309e-06, "loss": 0.0499, "num_tokens": 813858449.0, "step": 6226 }, { "epoch": 2.4848363926576216, "grad_norm": 0.23514719307422638, "learning_rate": 8.531525378297533e-06, "loss": 0.07, "num_tokens": 813989521.0, "step": 6227 }, { "epoch": 2.485235434956105, "grad_norm": 0.23626339435577393, "learning_rate": 8.52620404496677e-06, "loss": 0.0786, "num_tokens": 814120593.0, "step": 6228 }, { "epoch": 2.485634477254589, "grad_norm": 0.2755141854286194, "learning_rate": 8.520886382888652e-06, "loss": 0.079, "num_tokens": 814251665.0, "step": 6229 }, { "epoch": 2.4860335195530725, "grad_norm": 0.24729351699352264, "learning_rate": 8.515572393092103e-06, "loss": 0.0819, "num_tokens": 814382737.0, "step": 6230 }, { "epoch": 2.486432561851556, "grad_norm": 0.309573233127594, "learning_rate": 8.51026207660532e-06, "loss": 0.1005, "num_tokens": 814513809.0, "step": 6231 }, { "epoch": 2.48683160415004, "grad_norm": 0.2604709267616272, "learning_rate": 8.504955434455805e-06, "loss": 0.0758, "num_tokens": 814644881.0, "step": 6232 }, { "epoch": 2.4872306464485234, "grad_norm": 0.2506506145000458, "learning_rate": 8.499652467670353e-06, "loss": 0.0788, "num_tokens": 814775953.0, "step": 6233 }, { "epoch": 2.487629688747007, "grad_norm": 0.2922656238079071, "learning_rate": 8.494353177275022e-06, "loss": 0.1036, "num_tokens": 814907025.0, "step": 6234 }, { "epoch": 2.4880287310454907, "grad_norm": 0.2855193018913269, "learning_rate": 8.489057564295181e-06, "loss": 0.0805, "num_tokens": 815038097.0, "step": 6235 }, { "epoch": 2.4884277733439744, "grad_norm": 0.3075793385505676, "learning_rate": 8.483765629755489e-06, "loss": 0.0844, "num_tokens": 815169169.0, "step": 6236 }, { "epoch": 2.488826815642458, "grad_norm": 0.27902546525001526, "learning_rate": 8.478477374679883e-06, "loss": 0.0883, "num_tokens": 815300241.0, "step": 6237 }, { "epoch": 2.4892258579409416, "grad_norm": 0.272691547870636, "learning_rate": 8.473192800091581e-06, "loss": 0.0841, "num_tokens": 815420693.0, "step": 6238 }, { "epoch": 2.4896249002394253, "grad_norm": 0.26428791880607605, "learning_rate": 8.467911907013113e-06, "loss": 0.0736, "num_tokens": 815551765.0, "step": 6239 }, { "epoch": 2.490023942537909, "grad_norm": 0.27335938811302185, "learning_rate": 8.46263469646627e-06, "loss": 0.0723, "num_tokens": 815682837.0, "step": 6240 }, { "epoch": 2.4904229848363926, "grad_norm": 0.251392126083374, "learning_rate": 8.45736116947215e-06, "loss": 0.0715, "num_tokens": 815813909.0, "step": 6241 }, { "epoch": 2.490822027134876, "grad_norm": 0.28328338265419006, "learning_rate": 8.452091327051134e-06, "loss": 0.0753, "num_tokens": 815930480.0, "step": 6242 }, { "epoch": 2.49122106943336, "grad_norm": 0.279096782207489, "learning_rate": 8.44682517022288e-06, "loss": 0.09, "num_tokens": 816061552.0, "step": 6243 }, { "epoch": 2.4916201117318435, "grad_norm": 0.2285812795162201, "learning_rate": 8.441562700006347e-06, "loss": 0.0797, "num_tokens": 816192624.0, "step": 6244 }, { "epoch": 2.492019154030327, "grad_norm": 0.22238372266292572, "learning_rate": 8.436303917419771e-06, "loss": 0.0666, "num_tokens": 816323696.0, "step": 6245 }, { "epoch": 2.492418196328811, "grad_norm": 0.2675339877605438, "learning_rate": 8.431048823480679e-06, "loss": 0.0794, "num_tokens": 816454768.0, "step": 6246 }, { "epoch": 2.4928172386272944, "grad_norm": 0.2959097921848297, "learning_rate": 8.425797419205877e-06, "loss": 0.0822, "num_tokens": 816585840.0, "step": 6247 }, { "epoch": 2.493216280925778, "grad_norm": 0.22396211326122284, "learning_rate": 8.420549705611468e-06, "loss": 0.0682, "num_tokens": 816716912.0, "step": 6248 }, { "epoch": 2.4936153232242617, "grad_norm": 0.26828575134277344, "learning_rate": 8.415305683712838e-06, "loss": 0.0928, "num_tokens": 816847984.0, "step": 6249 }, { "epoch": 2.4940143655227454, "grad_norm": 0.24033339321613312, "learning_rate": 8.410065354524651e-06, "loss": 0.0641, "num_tokens": 816979056.0, "step": 6250 }, { "epoch": 2.494413407821229, "grad_norm": 0.23588529229164124, "learning_rate": 8.40482871906086e-06, "loss": 0.0766, "num_tokens": 817110128.0, "step": 6251 }, { "epoch": 2.4948124501197126, "grad_norm": 0.26182839274406433, "learning_rate": 8.399595778334716e-06, "loss": 0.0813, "num_tokens": 817241200.0, "step": 6252 }, { "epoch": 2.4952114924181963, "grad_norm": 0.241607204079628, "learning_rate": 8.39436653335873e-06, "loss": 0.0607, "num_tokens": 817372272.0, "step": 6253 }, { "epoch": 2.49561053471668, "grad_norm": 0.19963446259498596, "learning_rate": 8.389140985144722e-06, "loss": 0.0659, "num_tokens": 817503344.0, "step": 6254 }, { "epoch": 2.4960095770151636, "grad_norm": 0.2730841040611267, "learning_rate": 8.383919134703785e-06, "loss": 0.085, "num_tokens": 817634416.0, "step": 6255 }, { "epoch": 2.496408619313647, "grad_norm": 0.2590927481651306, "learning_rate": 8.378700983046298e-06, "loss": 0.0825, "num_tokens": 817765488.0, "step": 6256 }, { "epoch": 2.496807661612131, "grad_norm": 0.270079642534256, "learning_rate": 8.37348653118192e-06, "loss": 0.0828, "num_tokens": 817896560.0, "step": 6257 }, { "epoch": 2.4972067039106145, "grad_norm": 0.26980161666870117, "learning_rate": 8.368275780119617e-06, "loss": 0.0846, "num_tokens": 818027632.0, "step": 6258 }, { "epoch": 2.497605746209098, "grad_norm": 0.24610795080661774, "learning_rate": 8.363068730867597e-06, "loss": 0.0739, "num_tokens": 818158704.0, "step": 6259 }, { "epoch": 2.498004788507582, "grad_norm": 0.3180829882621765, "learning_rate": 8.35786538443338e-06, "loss": 0.1053, "num_tokens": 818289776.0, "step": 6260 }, { "epoch": 2.4984038308060654, "grad_norm": 0.2388184517621994, "learning_rate": 8.352665741823779e-06, "loss": 0.0692, "num_tokens": 818420848.0, "step": 6261 }, { "epoch": 2.498802873104549, "grad_norm": 0.27556878328323364, "learning_rate": 8.347469804044863e-06, "loss": 0.0926, "num_tokens": 818551920.0, "step": 6262 }, { "epoch": 2.4992019154030327, "grad_norm": 0.24900707602500916, "learning_rate": 8.342277572102e-06, "loss": 0.0766, "num_tokens": 818682992.0, "step": 6263 }, { "epoch": 2.4996009577015164, "grad_norm": 0.24781368672847748, "learning_rate": 8.33708904699985e-06, "loss": 0.086, "num_tokens": 818814064.0, "step": 6264 }, { "epoch": 2.5, "grad_norm": 0.25579118728637695, "learning_rate": 8.331904229742325e-06, "loss": 0.0671, "num_tokens": 818945136.0, "step": 6265 }, { "epoch": 2.5003990422984836, "grad_norm": 0.2396298497915268, "learning_rate": 8.326723121332652e-06, "loss": 0.077, "num_tokens": 819076208.0, "step": 6266 }, { "epoch": 2.5007980845969673, "grad_norm": 0.2628444731235504, "learning_rate": 8.321545722773327e-06, "loss": 0.0672, "num_tokens": 819207280.0, "step": 6267 }, { "epoch": 2.501197126895451, "grad_norm": 0.2447202503681183, "learning_rate": 8.316372035066124e-06, "loss": 0.0748, "num_tokens": 819338352.0, "step": 6268 }, { "epoch": 2.5015961691939346, "grad_norm": 0.2787201404571533, "learning_rate": 8.3112020592121e-06, "loss": 0.0903, "num_tokens": 819469424.0, "step": 6269 }, { "epoch": 2.501995211492418, "grad_norm": 0.23821331560611725, "learning_rate": 8.306035796211611e-06, "loss": 0.0744, "num_tokens": 819600496.0, "step": 6270 }, { "epoch": 2.502394253790902, "grad_norm": 0.2469310164451599, "learning_rate": 8.300873247064269e-06, "loss": 0.0744, "num_tokens": 819729253.0, "step": 6271 }, { "epoch": 2.5027932960893855, "grad_norm": 0.2558734714984894, "learning_rate": 8.295714412768977e-06, "loss": 0.0718, "num_tokens": 819860325.0, "step": 6272 }, { "epoch": 2.503192338387869, "grad_norm": 0.226039856672287, "learning_rate": 8.290559294323928e-06, "loss": 0.0695, "num_tokens": 819991397.0, "step": 6273 }, { "epoch": 2.503591380686353, "grad_norm": 0.2833957374095917, "learning_rate": 8.28540789272659e-06, "loss": 0.0986, "num_tokens": 820122469.0, "step": 6274 }, { "epoch": 2.5039904229848364, "grad_norm": 0.2317790985107422, "learning_rate": 8.280260208973706e-06, "loss": 0.0721, "num_tokens": 820253541.0, "step": 6275 }, { "epoch": 2.50438946528332, "grad_norm": 0.25660595297813416, "learning_rate": 8.275116244061304e-06, "loss": 0.0831, "num_tokens": 820384613.0, "step": 6276 }, { "epoch": 2.5047885075818037, "grad_norm": 0.2520752251148224, "learning_rate": 8.269975998984705e-06, "loss": 0.0694, "num_tokens": 820515685.0, "step": 6277 }, { "epoch": 2.5051875498802874, "grad_norm": 0.24781407415866852, "learning_rate": 8.264839474738483e-06, "loss": 0.0878, "num_tokens": 820646757.0, "step": 6278 }, { "epoch": 2.505586592178771, "grad_norm": 0.2733555734157562, "learning_rate": 8.259706672316516e-06, "loss": 0.0913, "num_tokens": 820777829.0, "step": 6279 }, { "epoch": 2.5059856344772546, "grad_norm": 0.1986246109008789, "learning_rate": 8.254577592711956e-06, "loss": 0.0624, "num_tokens": 820908901.0, "step": 6280 }, { "epoch": 2.5063846767757383, "grad_norm": 0.2610623836517334, "learning_rate": 8.249452236917233e-06, "loss": 0.0795, "num_tokens": 821039973.0, "step": 6281 }, { "epoch": 2.506783719074222, "grad_norm": 0.31664443016052246, "learning_rate": 8.244330605924038e-06, "loss": 0.099, "num_tokens": 821171045.0, "step": 6282 }, { "epoch": 2.5071827613727056, "grad_norm": 0.23291434347629547, "learning_rate": 8.239212700723384e-06, "loss": 0.0594, "num_tokens": 821302117.0, "step": 6283 }, { "epoch": 2.507581803671189, "grad_norm": 0.22509697079658508, "learning_rate": 8.234098522305517e-06, "loss": 0.0688, "num_tokens": 821433189.0, "step": 6284 }, { "epoch": 2.507980845969673, "grad_norm": 0.22029469907283783, "learning_rate": 8.228988071659991e-06, "loss": 0.0657, "num_tokens": 821564261.0, "step": 6285 }, { "epoch": 2.5083798882681565, "grad_norm": 0.29958686232566833, "learning_rate": 8.223881349775636e-06, "loss": 0.0904, "num_tokens": 821680438.0, "step": 6286 }, { "epoch": 2.50877893056664, "grad_norm": 0.2236645221710205, "learning_rate": 8.218778357640548e-06, "loss": 0.0617, "num_tokens": 821811510.0, "step": 6287 }, { "epoch": 2.509177972865124, "grad_norm": 0.2889237403869629, "learning_rate": 8.213679096242106e-06, "loss": 0.0836, "num_tokens": 821942582.0, "step": 6288 }, { "epoch": 2.5095770151636074, "grad_norm": 0.24781867861747742, "learning_rate": 8.208583566566982e-06, "loss": 0.0785, "num_tokens": 822073654.0, "step": 6289 }, { "epoch": 2.509976057462091, "grad_norm": 0.28902727365493774, "learning_rate": 8.2034917696011e-06, "loss": 0.095, "num_tokens": 822204726.0, "step": 6290 }, { "epoch": 2.5103750997605747, "grad_norm": 0.21646998822689056, "learning_rate": 8.198403706329683e-06, "loss": 0.0694, "num_tokens": 822335798.0, "step": 6291 }, { "epoch": 2.5107741420590584, "grad_norm": 0.2772504687309265, "learning_rate": 8.193319377737225e-06, "loss": 0.0839, "num_tokens": 822466870.0, "step": 6292 }, { "epoch": 2.511173184357542, "grad_norm": 0.2810984253883362, "learning_rate": 8.188238784807492e-06, "loss": 0.0893, "num_tokens": 822597942.0, "step": 6293 }, { "epoch": 2.5115722266560256, "grad_norm": 0.22819481790065765, "learning_rate": 8.183161928523527e-06, "loss": 0.0689, "num_tokens": 822726117.0, "step": 6294 }, { "epoch": 2.5119712689545093, "grad_norm": 0.26762357354164124, "learning_rate": 8.17808880986766e-06, "loss": 0.0893, "num_tokens": 822857189.0, "step": 6295 }, { "epoch": 2.512370311252993, "grad_norm": 0.25766128301620483, "learning_rate": 8.173019429821497e-06, "loss": 0.0697, "num_tokens": 822988261.0, "step": 6296 }, { "epoch": 2.5127693535514766, "grad_norm": 0.2471921294927597, "learning_rate": 8.167953789365904e-06, "loss": 0.0874, "num_tokens": 823119333.0, "step": 6297 }, { "epoch": 2.51316839584996, "grad_norm": 0.2624203562736511, "learning_rate": 8.162891889481044e-06, "loss": 0.0824, "num_tokens": 823250405.0, "step": 6298 }, { "epoch": 2.513567438148444, "grad_norm": 0.24715253710746765, "learning_rate": 8.15783373114635e-06, "loss": 0.0723, "num_tokens": 823381477.0, "step": 6299 }, { "epoch": 2.5139664804469275, "grad_norm": 0.24780148267745972, "learning_rate": 8.152779315340517e-06, "loss": 0.0688, "num_tokens": 823498004.0, "step": 6300 }, { "epoch": 2.514365522745411, "grad_norm": 0.25511613488197327, "learning_rate": 8.147728643041542e-06, "loss": 0.0858, "num_tokens": 823629076.0, "step": 6301 }, { "epoch": 2.514764565043895, "grad_norm": 0.2521195411682129, "learning_rate": 8.14268171522667e-06, "loss": 0.0783, "num_tokens": 823760148.0, "step": 6302 }, { "epoch": 2.5151636073423784, "grad_norm": 0.18491335213184357, "learning_rate": 8.137638532872436e-06, "loss": 0.0494, "num_tokens": 823891220.0, "step": 6303 }, { "epoch": 2.515562649640862, "grad_norm": 0.29102417826652527, "learning_rate": 8.132599096954655e-06, "loss": 0.0776, "num_tokens": 824022292.0, "step": 6304 }, { "epoch": 2.5159616919393457, "grad_norm": 0.26413774490356445, "learning_rate": 8.127563408448413e-06, "loss": 0.0775, "num_tokens": 824153364.0, "step": 6305 }, { "epoch": 2.5163607342378294, "grad_norm": 0.25798600912094116, "learning_rate": 8.12253146832806e-06, "loss": 0.0728, "num_tokens": 824284436.0, "step": 6306 }, { "epoch": 2.516759776536313, "grad_norm": 0.2743913531303406, "learning_rate": 8.11750327756723e-06, "loss": 0.0795, "num_tokens": 824415508.0, "step": 6307 }, { "epoch": 2.5171588188347966, "grad_norm": 0.22358165681362152, "learning_rate": 8.11247883713884e-06, "loss": 0.0647, "num_tokens": 824546580.0, "step": 6308 }, { "epoch": 2.5175578611332803, "grad_norm": 0.2847610116004944, "learning_rate": 8.107458148015062e-06, "loss": 0.0885, "num_tokens": 824677652.0, "step": 6309 }, { "epoch": 2.517956903431764, "grad_norm": 0.2357686161994934, "learning_rate": 8.102441211167358e-06, "loss": 0.0704, "num_tokens": 824808724.0, "step": 6310 }, { "epoch": 2.5183559457302476, "grad_norm": 0.22052016854286194, "learning_rate": 8.097428027566465e-06, "loss": 0.0622, "num_tokens": 824939796.0, "step": 6311 }, { "epoch": 2.518754988028731, "grad_norm": 0.25403735041618347, "learning_rate": 8.09241859818237e-06, "loss": 0.0696, "num_tokens": 825070868.0, "step": 6312 }, { "epoch": 2.519154030327215, "grad_norm": 0.22801557183265686, "learning_rate": 8.087412923984367e-06, "loss": 0.0627, "num_tokens": 825201940.0, "step": 6313 }, { "epoch": 2.5195530726256985, "grad_norm": 0.23646172881126404, "learning_rate": 8.082411005941002e-06, "loss": 0.0752, "num_tokens": 825333012.0, "step": 6314 }, { "epoch": 2.519952114924182, "grad_norm": 0.23913146555423737, "learning_rate": 8.0774128450201e-06, "loss": 0.0845, "num_tokens": 825464084.0, "step": 6315 }, { "epoch": 2.520351157222666, "grad_norm": 0.273538738489151, "learning_rate": 8.072418442188753e-06, "loss": 0.0808, "num_tokens": 825595156.0, "step": 6316 }, { "epoch": 2.5207501995211494, "grad_norm": 0.2554095685482025, "learning_rate": 8.067427798413341e-06, "loss": 0.0765, "num_tokens": 825726228.0, "step": 6317 }, { "epoch": 2.521149241819633, "grad_norm": 0.26883772015571594, "learning_rate": 8.062440914659494e-06, "loss": 0.0877, "num_tokens": 825857300.0, "step": 6318 }, { "epoch": 2.5215482841181167, "grad_norm": 0.22110524773597717, "learning_rate": 8.057457791892139e-06, "loss": 0.0597, "num_tokens": 825985117.0, "step": 6319 }, { "epoch": 2.5219473264166004, "grad_norm": 0.2586497366428375, "learning_rate": 8.052478431075464e-06, "loss": 0.0828, "num_tokens": 826116189.0, "step": 6320 }, { "epoch": 2.522346368715084, "grad_norm": 0.3032453954219818, "learning_rate": 8.047502833172921e-06, "loss": 0.0846, "num_tokens": 826247261.0, "step": 6321 }, { "epoch": 2.5227454110135676, "grad_norm": 0.2323305308818817, "learning_rate": 8.042530999147246e-06, "loss": 0.0681, "num_tokens": 826378333.0, "step": 6322 }, { "epoch": 2.5231444533120513, "grad_norm": 0.2644166350364685, "learning_rate": 8.037562929960446e-06, "loss": 0.0802, "num_tokens": 826509405.0, "step": 6323 }, { "epoch": 2.523543495610535, "grad_norm": 0.23894688487052917, "learning_rate": 8.032598626573788e-06, "loss": 0.0762, "num_tokens": 826640477.0, "step": 6324 }, { "epoch": 2.5239425379090186, "grad_norm": 0.2541283369064331, "learning_rate": 8.027638089947823e-06, "loss": 0.0874, "num_tokens": 826771549.0, "step": 6325 }, { "epoch": 2.524341580207502, "grad_norm": 0.22111542522907257, "learning_rate": 8.022681321042373e-06, "loss": 0.0642, "num_tokens": 826902621.0, "step": 6326 }, { "epoch": 2.524740622505986, "grad_norm": 0.19218164682388306, "learning_rate": 8.017728320816523e-06, "loss": 0.0532, "num_tokens": 827033693.0, "step": 6327 }, { "epoch": 2.5251396648044695, "grad_norm": 0.25617825984954834, "learning_rate": 8.012779090228626e-06, "loss": 0.0777, "num_tokens": 827164765.0, "step": 6328 }, { "epoch": 2.525538707102953, "grad_norm": 0.26261913776397705, "learning_rate": 8.007833630236316e-06, "loss": 0.076, "num_tokens": 827295837.0, "step": 6329 }, { "epoch": 2.525937749401437, "grad_norm": 0.26587703824043274, "learning_rate": 8.002891941796501e-06, "loss": 0.0679, "num_tokens": 827426909.0, "step": 6330 }, { "epoch": 2.5263367916999204, "grad_norm": 0.23683901131153107, "learning_rate": 7.997954025865343e-06, "loss": 0.0679, "num_tokens": 827557981.0, "step": 6331 }, { "epoch": 2.526735833998404, "grad_norm": 0.22668907046318054, "learning_rate": 7.993019883398282e-06, "loss": 0.0584, "num_tokens": 827689053.0, "step": 6332 }, { "epoch": 2.5271348762968877, "grad_norm": 0.20310010015964508, "learning_rate": 7.98808951535004e-06, "loss": 0.0613, "num_tokens": 827820125.0, "step": 6333 }, { "epoch": 2.5275339185953714, "grad_norm": 0.2502084970474243, "learning_rate": 7.983162922674581e-06, "loss": 0.0717, "num_tokens": 827951197.0, "step": 6334 }, { "epoch": 2.527932960893855, "grad_norm": 0.2162812054157257, "learning_rate": 7.978240106325163e-06, "loss": 0.0584, "num_tokens": 828082269.0, "step": 6335 }, { "epoch": 2.5283320031923386, "grad_norm": 0.24180392920970917, "learning_rate": 7.97332106725431e-06, "loss": 0.0625, "num_tokens": 828213341.0, "step": 6336 }, { "epoch": 2.5287310454908223, "grad_norm": 0.23363707959651947, "learning_rate": 7.968405806413803e-06, "loss": 0.0662, "num_tokens": 828344413.0, "step": 6337 }, { "epoch": 2.529130087789306, "grad_norm": 0.23284488916397095, "learning_rate": 7.963494324754705e-06, "loss": 0.0707, "num_tokens": 828475485.0, "step": 6338 }, { "epoch": 2.5295291300877896, "grad_norm": 0.2847291827201843, "learning_rate": 7.958586623227336e-06, "loss": 0.0851, "num_tokens": 828606557.0, "step": 6339 }, { "epoch": 2.529928172386273, "grad_norm": 0.28578388690948486, "learning_rate": 7.95368270278129e-06, "loss": 0.0717, "num_tokens": 828737629.0, "step": 6340 }, { "epoch": 2.530327214684757, "grad_norm": 0.24455735087394714, "learning_rate": 7.948782564365432e-06, "loss": 0.0698, "num_tokens": 828868701.0, "step": 6341 }, { "epoch": 2.5307262569832405, "grad_norm": 0.2355247288942337, "learning_rate": 7.943886208927897e-06, "loss": 0.0755, "num_tokens": 828999773.0, "step": 6342 }, { "epoch": 2.531125299281724, "grad_norm": 0.2025708705186844, "learning_rate": 7.938993637416078e-06, "loss": 0.0551, "num_tokens": 829130845.0, "step": 6343 }, { "epoch": 2.5315243415802073, "grad_norm": 0.2500779926776886, "learning_rate": 7.934104850776645e-06, "loss": 0.0649, "num_tokens": 829246434.0, "step": 6344 }, { "epoch": 2.531923383878691, "grad_norm": 0.22936533391475677, "learning_rate": 7.929219849955535e-06, "loss": 0.0612, "num_tokens": 829377506.0, "step": 6345 }, { "epoch": 2.5323224261771746, "grad_norm": 0.2276192605495453, "learning_rate": 7.924338635897942e-06, "loss": 0.0711, "num_tokens": 829508578.0, "step": 6346 }, { "epoch": 2.5327214684756583, "grad_norm": 0.28833600878715515, "learning_rate": 7.919461209548344e-06, "loss": 0.081, "num_tokens": 829639650.0, "step": 6347 }, { "epoch": 2.533120510774142, "grad_norm": 0.22838257253170013, "learning_rate": 7.914587571850477e-06, "loss": 0.0706, "num_tokens": 829770722.0, "step": 6348 }, { "epoch": 2.5335195530726256, "grad_norm": 0.23492136597633362, "learning_rate": 7.909717723747346e-06, "loss": 0.0732, "num_tokens": 829901794.0, "step": 6349 }, { "epoch": 2.533918595371109, "grad_norm": 0.2653530240058899, "learning_rate": 7.904851666181207e-06, "loss": 0.0681, "num_tokens": 830032866.0, "step": 6350 }, { "epoch": 2.534317637669593, "grad_norm": 0.2738246023654938, "learning_rate": 7.89998940009361e-06, "loss": 0.0814, "num_tokens": 830163938.0, "step": 6351 }, { "epoch": 2.5347166799680765, "grad_norm": 0.2363423854112625, "learning_rate": 7.895130926425361e-06, "loss": 0.0727, "num_tokens": 830295010.0, "step": 6352 }, { "epoch": 2.53511572226656, "grad_norm": 0.23410922288894653, "learning_rate": 7.890276246116515e-06, "loss": 0.0766, "num_tokens": 830426082.0, "step": 6353 }, { "epoch": 2.5355147645650438, "grad_norm": 0.23846697807312012, "learning_rate": 7.885425360106419e-06, "loss": 0.0656, "num_tokens": 830557154.0, "step": 6354 }, { "epoch": 2.5359138068635274, "grad_norm": 0.27073267102241516, "learning_rate": 7.880578269333674e-06, "loss": 0.0833, "num_tokens": 830688226.0, "step": 6355 }, { "epoch": 2.536312849162011, "grad_norm": 0.25427788496017456, "learning_rate": 7.87573497473614e-06, "loss": 0.0683, "num_tokens": 830819298.0, "step": 6356 }, { "epoch": 2.5367118914604947, "grad_norm": 0.2601880133152008, "learning_rate": 7.870895477250954e-06, "loss": 0.0607, "num_tokens": 830950370.0, "step": 6357 }, { "epoch": 2.5371109337589783, "grad_norm": 0.3107542395591736, "learning_rate": 7.866059777814518e-06, "loss": 0.0718, "num_tokens": 831081442.0, "step": 6358 }, { "epoch": 2.537509976057462, "grad_norm": 0.2922208309173584, "learning_rate": 7.861227877362482e-06, "loss": 0.0908, "num_tokens": 831212514.0, "step": 6359 }, { "epoch": 2.5379090183559456, "grad_norm": 0.28979772329330444, "learning_rate": 7.856399776829788e-06, "loss": 0.0674, "num_tokens": 831343586.0, "step": 6360 }, { "epoch": 2.5383080606544293, "grad_norm": 0.25422486662864685, "learning_rate": 7.851575477150623e-06, "loss": 0.0742, "num_tokens": 831474658.0, "step": 6361 }, { "epoch": 2.538707102952913, "grad_norm": 0.27624398469924927, "learning_rate": 7.846754979258436e-06, "loss": 0.0884, "num_tokens": 831605730.0, "step": 6362 }, { "epoch": 2.5391061452513966, "grad_norm": 0.20280152559280396, "learning_rate": 7.841938284085958e-06, "loss": 0.0611, "num_tokens": 831736802.0, "step": 6363 }, { "epoch": 2.53950518754988, "grad_norm": 0.23674213886260986, "learning_rate": 7.837125392565175e-06, "loss": 0.08, "num_tokens": 831867874.0, "step": 6364 }, { "epoch": 2.539904229848364, "grad_norm": 0.23362529277801514, "learning_rate": 7.83231630562733e-06, "loss": 0.0786, "num_tokens": 831998946.0, "step": 6365 }, { "epoch": 2.5403032721468475, "grad_norm": 0.22067375481128693, "learning_rate": 7.827511024202937e-06, "loss": 0.0607, "num_tokens": 832130018.0, "step": 6366 }, { "epoch": 2.540702314445331, "grad_norm": 0.22164960205554962, "learning_rate": 7.822709549221784e-06, "loss": 0.0684, "num_tokens": 832261090.0, "step": 6367 }, { "epoch": 2.5411013567438148, "grad_norm": 0.22997213900089264, "learning_rate": 7.817911881612896e-06, "loss": 0.0744, "num_tokens": 832392162.0, "step": 6368 }, { "epoch": 2.5415003990422984, "grad_norm": 0.2718501389026642, "learning_rate": 7.813118022304589e-06, "loss": 0.0922, "num_tokens": 832523234.0, "step": 6369 }, { "epoch": 2.541899441340782, "grad_norm": 0.2525506317615509, "learning_rate": 7.808327972224427e-06, "loss": 0.0735, "num_tokens": 832654306.0, "step": 6370 }, { "epoch": 2.5422984836392657, "grad_norm": 0.37526771426200867, "learning_rate": 7.80354173229924e-06, "loss": 0.1052, "num_tokens": 832785378.0, "step": 6371 }, { "epoch": 2.5426975259377493, "grad_norm": 0.24525800347328186, "learning_rate": 7.798759303455117e-06, "loss": 0.0676, "num_tokens": 832916450.0, "step": 6372 }, { "epoch": 2.543096568236233, "grad_norm": 0.25810033082962036, "learning_rate": 7.793980686617413e-06, "loss": 0.078, "num_tokens": 833047522.0, "step": 6373 }, { "epoch": 2.5434956105347166, "grad_norm": 0.2743109464645386, "learning_rate": 7.789205882710756e-06, "loss": 0.0841, "num_tokens": 833178594.0, "step": 6374 }, { "epoch": 2.5438946528332003, "grad_norm": 0.2881927192211151, "learning_rate": 7.784434892659013e-06, "loss": 0.0981, "num_tokens": 833309666.0, "step": 6375 }, { "epoch": 2.544293695131684, "grad_norm": 0.2229156792163849, "learning_rate": 7.779667717385335e-06, "loss": 0.0666, "num_tokens": 833440738.0, "step": 6376 }, { "epoch": 2.5446927374301676, "grad_norm": 0.26129308342933655, "learning_rate": 7.774904357812125e-06, "loss": 0.0813, "num_tokens": 833571810.0, "step": 6377 }, { "epoch": 2.545091779728651, "grad_norm": 0.2894098162651062, "learning_rate": 7.770144814861043e-06, "loss": 0.0875, "num_tokens": 833702882.0, "step": 6378 }, { "epoch": 2.545490822027135, "grad_norm": 0.243643656373024, "learning_rate": 7.765389089453024e-06, "loss": 0.0748, "num_tokens": 833833954.0, "step": 6379 }, { "epoch": 2.5458898643256185, "grad_norm": 0.27809229493141174, "learning_rate": 7.760637182508249e-06, "loss": 0.0951, "num_tokens": 833965026.0, "step": 6380 }, { "epoch": 2.546288906624102, "grad_norm": 0.27483683824539185, "learning_rate": 7.755889094946167e-06, "loss": 0.0862, "num_tokens": 834096098.0, "step": 6381 }, { "epoch": 2.5466879489225858, "grad_norm": 0.2844800651073456, "learning_rate": 7.751144827685503e-06, "loss": 0.0912, "num_tokens": 834227170.0, "step": 6382 }, { "epoch": 2.5470869912210694, "grad_norm": 0.2824946641921997, "learning_rate": 7.746404381644215e-06, "loss": 0.0902, "num_tokens": 834358242.0, "step": 6383 }, { "epoch": 2.547486033519553, "grad_norm": 0.2487231194972992, "learning_rate": 7.741667757739534e-06, "loss": 0.078, "num_tokens": 834489314.0, "step": 6384 }, { "epoch": 2.5478850758180367, "grad_norm": 0.2481115460395813, "learning_rate": 7.736934956887959e-06, "loss": 0.0783, "num_tokens": 834620386.0, "step": 6385 }, { "epoch": 2.5482841181165203, "grad_norm": 0.27620595693588257, "learning_rate": 7.732205980005245e-06, "loss": 0.0837, "num_tokens": 834751458.0, "step": 6386 }, { "epoch": 2.548683160415004, "grad_norm": 0.27393463253974915, "learning_rate": 7.727480828006396e-06, "loss": 0.0852, "num_tokens": 834882530.0, "step": 6387 }, { "epoch": 2.5490822027134876, "grad_norm": 0.17530442774295807, "learning_rate": 7.72275950180569e-06, "loss": 0.0379, "num_tokens": 835013602.0, "step": 6388 }, { "epoch": 2.5494812450119713, "grad_norm": 0.2560563087463379, "learning_rate": 7.718042002316663e-06, "loss": 0.0755, "num_tokens": 835144674.0, "step": 6389 }, { "epoch": 2.549880287310455, "grad_norm": 0.2570211887359619, "learning_rate": 7.713328330452099e-06, "loss": 0.0817, "num_tokens": 835275746.0, "step": 6390 }, { "epoch": 2.5502793296089385, "grad_norm": 0.29149875044822693, "learning_rate": 7.708618487124052e-06, "loss": 0.0834, "num_tokens": 835406818.0, "step": 6391 }, { "epoch": 2.550678371907422, "grad_norm": 0.24034596979618073, "learning_rate": 7.703912473243844e-06, "loss": 0.075, "num_tokens": 835537890.0, "step": 6392 }, { "epoch": 2.551077414205906, "grad_norm": 0.2213699370622635, "learning_rate": 7.699210289722027e-06, "loss": 0.0602, "num_tokens": 835668962.0, "step": 6393 }, { "epoch": 2.5514764565043895, "grad_norm": 0.29321223497390747, "learning_rate": 7.694511937468447e-06, "loss": 0.0988, "num_tokens": 835800034.0, "step": 6394 }, { "epoch": 2.551875498802873, "grad_norm": 0.25904104113578796, "learning_rate": 7.68981741739218e-06, "loss": 0.0708, "num_tokens": 835931106.0, "step": 6395 }, { "epoch": 2.5522745411013568, "grad_norm": 0.28803035616874695, "learning_rate": 7.68512673040157e-06, "loss": 0.0977, "num_tokens": 836062178.0, "step": 6396 }, { "epoch": 2.5526735833998404, "grad_norm": 0.2427288144826889, "learning_rate": 7.680439877404227e-06, "loss": 0.0787, "num_tokens": 836193250.0, "step": 6397 }, { "epoch": 2.553072625698324, "grad_norm": 0.2661665380001068, "learning_rate": 7.675756859307014e-06, "loss": 0.0805, "num_tokens": 836324322.0, "step": 6398 }, { "epoch": 2.5534716679968077, "grad_norm": 0.2642863392829895, "learning_rate": 7.671077677016047e-06, "loss": 0.0684, "num_tokens": 836455394.0, "step": 6399 }, { "epoch": 2.5538707102952913, "grad_norm": 0.2514996826648712, "learning_rate": 7.666402331436707e-06, "loss": 0.0842, "num_tokens": 836586466.0, "step": 6400 }, { "epoch": 2.554269752593775, "grad_norm": 0.2601536512374878, "learning_rate": 7.661730823473632e-06, "loss": 0.0867, "num_tokens": 836717538.0, "step": 6401 }, { "epoch": 2.5546687948922586, "grad_norm": 0.23079724609851837, "learning_rate": 7.65706315403071e-06, "loss": 0.0719, "num_tokens": 836848610.0, "step": 6402 }, { "epoch": 2.5550678371907423, "grad_norm": 0.2357131540775299, "learning_rate": 7.652399324011094e-06, "loss": 0.0629, "num_tokens": 836979682.0, "step": 6403 }, { "epoch": 2.555466879489226, "grad_norm": 0.24661299586296082, "learning_rate": 7.647739334317195e-06, "loss": 0.066, "num_tokens": 837110754.0, "step": 6404 }, { "epoch": 2.5558659217877095, "grad_norm": 0.25239020586013794, "learning_rate": 7.643083185850677e-06, "loss": 0.0815, "num_tokens": 837241826.0, "step": 6405 }, { "epoch": 2.556264964086193, "grad_norm": 0.24337798357009888, "learning_rate": 7.638430879512454e-06, "loss": 0.0745, "num_tokens": 837372898.0, "step": 6406 }, { "epoch": 2.556664006384677, "grad_norm": 0.2581634521484375, "learning_rate": 7.633782416202705e-06, "loss": 0.0797, "num_tokens": 837503970.0, "step": 6407 }, { "epoch": 2.5570630486831605, "grad_norm": 0.22649936378002167, "learning_rate": 7.629137796820876e-06, "loss": 0.0715, "num_tokens": 837635042.0, "step": 6408 }, { "epoch": 2.557462090981644, "grad_norm": 0.23528656363487244, "learning_rate": 7.624497022265645e-06, "loss": 0.0697, "num_tokens": 837766114.0, "step": 6409 }, { "epoch": 2.5578611332801278, "grad_norm": 0.2414967566728592, "learning_rate": 7.61986009343496e-06, "loss": 0.0759, "num_tokens": 837897186.0, "step": 6410 }, { "epoch": 2.5582601755786114, "grad_norm": 0.3348754942417145, "learning_rate": 7.6152270112260335e-06, "loss": 0.0975, "num_tokens": 838028258.0, "step": 6411 }, { "epoch": 2.558659217877095, "grad_norm": 0.24878425896167755, "learning_rate": 7.610597776535312e-06, "loss": 0.079, "num_tokens": 838159330.0, "step": 6412 }, { "epoch": 2.5590582601755787, "grad_norm": 0.23265984654426575, "learning_rate": 7.605972390258514e-06, "loss": 0.0681, "num_tokens": 838290402.0, "step": 6413 }, { "epoch": 2.5594573024740623, "grad_norm": 0.2475285530090332, "learning_rate": 7.601350853290612e-06, "loss": 0.0786, "num_tokens": 838421474.0, "step": 6414 }, { "epoch": 2.559856344772546, "grad_norm": 0.22522173821926117, "learning_rate": 7.596733166525824e-06, "loss": 0.0749, "num_tokens": 838552546.0, "step": 6415 }, { "epoch": 2.5602553870710296, "grad_norm": 0.27341124415397644, "learning_rate": 7.592119330857633e-06, "loss": 0.0713, "num_tokens": 838683618.0, "step": 6416 }, { "epoch": 2.5606544293695133, "grad_norm": 0.2747167646884918, "learning_rate": 7.587509347178778e-06, "loss": 0.0979, "num_tokens": 838814690.0, "step": 6417 }, { "epoch": 2.561053471667997, "grad_norm": 0.27249759435653687, "learning_rate": 7.582903216381237e-06, "loss": 0.0761, "num_tokens": 838945762.0, "step": 6418 }, { "epoch": 2.5614525139664805, "grad_norm": 0.2441834956407547, "learning_rate": 7.578300939356259e-06, "loss": 0.0643, "num_tokens": 839076834.0, "step": 6419 }, { "epoch": 2.561851556264964, "grad_norm": 0.2696549594402313, "learning_rate": 7.573702516994345e-06, "loss": 0.0751, "num_tokens": 839207906.0, "step": 6420 }, { "epoch": 2.562250598563448, "grad_norm": 0.24176834523677826, "learning_rate": 7.569107950185238e-06, "loss": 0.0699, "num_tokens": 839338978.0, "step": 6421 }, { "epoch": 2.5626496408619315, "grad_norm": 0.29501286149024963, "learning_rate": 7.564517239817951e-06, "loss": 0.0848, "num_tokens": 839470050.0, "step": 6422 }, { "epoch": 2.563048683160415, "grad_norm": 0.2858971059322357, "learning_rate": 7.5599303867807434e-06, "loss": 0.079, "num_tokens": 839601122.0, "step": 6423 }, { "epoch": 2.5634477254588988, "grad_norm": 0.22274278104305267, "learning_rate": 7.555347391961125e-06, "loss": 0.0651, "num_tokens": 839732194.0, "step": 6424 }, { "epoch": 2.5638467677573824, "grad_norm": 0.22474071383476257, "learning_rate": 7.550768256245865e-06, "loss": 0.0659, "num_tokens": 839863266.0, "step": 6425 }, { "epoch": 2.564245810055866, "grad_norm": 0.2395879626274109, "learning_rate": 7.546192980520987e-06, "loss": 0.0713, "num_tokens": 839994338.0, "step": 6426 }, { "epoch": 2.5646448523543497, "grad_norm": 0.2607446312904358, "learning_rate": 7.541621565671756e-06, "loss": 0.0791, "num_tokens": 840120997.0, "step": 6427 }, { "epoch": 2.5650438946528333, "grad_norm": 0.2587488293647766, "learning_rate": 7.537054012582706e-06, "loss": 0.0703, "num_tokens": 840252069.0, "step": 6428 }, { "epoch": 2.565442936951317, "grad_norm": 0.24710175395011902, "learning_rate": 7.532490322137609e-06, "loss": 0.0693, "num_tokens": 840383141.0, "step": 6429 }, { "epoch": 2.5658419792498006, "grad_norm": 0.30420902371406555, "learning_rate": 7.5279304952195015e-06, "loss": 0.0878, "num_tokens": 840514213.0, "step": 6430 }, { "epoch": 2.5662410215482843, "grad_norm": 0.2632255256175995, "learning_rate": 7.5233745327106625e-06, "loss": 0.085, "num_tokens": 840645285.0, "step": 6431 }, { "epoch": 2.566640063846768, "grad_norm": 0.24368615448474884, "learning_rate": 7.518822435492634e-06, "loss": 0.0759, "num_tokens": 840776357.0, "step": 6432 }, { "epoch": 2.5670391061452515, "grad_norm": 0.22832967340946198, "learning_rate": 7.5142742044462045e-06, "loss": 0.0656, "num_tokens": 840907429.0, "step": 6433 }, { "epoch": 2.567438148443735, "grad_norm": 0.23254409432411194, "learning_rate": 7.509729840451407e-06, "loss": 0.0752, "num_tokens": 841038501.0, "step": 6434 }, { "epoch": 2.567837190742219, "grad_norm": 0.22104136645793915, "learning_rate": 7.5051893443875405e-06, "loss": 0.0554, "num_tokens": 841169573.0, "step": 6435 }, { "epoch": 2.5682362330407025, "grad_norm": 0.27521422505378723, "learning_rate": 7.50065271713315e-06, "loss": 0.0927, "num_tokens": 841300645.0, "step": 6436 }, { "epoch": 2.568635275339186, "grad_norm": 0.2625024616718292, "learning_rate": 7.496119959566024e-06, "loss": 0.0783, "num_tokens": 841421641.0, "step": 6437 }, { "epoch": 2.5690343176376693, "grad_norm": 0.2542937397956848, "learning_rate": 7.4915910725632144e-06, "loss": 0.0772, "num_tokens": 841552713.0, "step": 6438 }, { "epoch": 2.569433359936153, "grad_norm": 0.2635970413684845, "learning_rate": 7.487066057001019e-06, "loss": 0.0849, "num_tokens": 841683785.0, "step": 6439 }, { "epoch": 2.5698324022346366, "grad_norm": 0.2733047306537628, "learning_rate": 7.482544913754974e-06, "loss": 0.083, "num_tokens": 841814857.0, "step": 6440 }, { "epoch": 2.5702314445331202, "grad_norm": 0.2836439609527588, "learning_rate": 7.478027643699892e-06, "loss": 0.0877, "num_tokens": 841945929.0, "step": 6441 }, { "epoch": 2.570630486831604, "grad_norm": 0.2716209590435028, "learning_rate": 7.473514247709823e-06, "loss": 0.0862, "num_tokens": 842077001.0, "step": 6442 }, { "epoch": 2.5710295291300875, "grad_norm": 0.2673908770084381, "learning_rate": 7.469004726658056e-06, "loss": 0.0858, "num_tokens": 842208073.0, "step": 6443 }, { "epoch": 2.571428571428571, "grad_norm": 0.26209747791290283, "learning_rate": 7.464499081417149e-06, "loss": 0.0796, "num_tokens": 842339145.0, "step": 6444 }, { "epoch": 2.571827613727055, "grad_norm": 0.22466397285461426, "learning_rate": 7.459997312858904e-06, "loss": 0.0672, "num_tokens": 842470217.0, "step": 6445 }, { "epoch": 2.5722266560255385, "grad_norm": 0.2330635040998459, "learning_rate": 7.455499421854366e-06, "loss": 0.0696, "num_tokens": 842601289.0, "step": 6446 }, { "epoch": 2.572625698324022, "grad_norm": 0.222026064991951, "learning_rate": 7.451005409273837e-06, "loss": 0.0633, "num_tokens": 842732361.0, "step": 6447 }, { "epoch": 2.5730247406225057, "grad_norm": 0.29886388778686523, "learning_rate": 7.446515275986872e-06, "loss": 0.0877, "num_tokens": 842863433.0, "step": 6448 }, { "epoch": 2.5734237829209894, "grad_norm": 0.29983335733413696, "learning_rate": 7.44202902286226e-06, "loss": 0.0801, "num_tokens": 842994505.0, "step": 6449 }, { "epoch": 2.573822825219473, "grad_norm": 0.25860920548439026, "learning_rate": 7.437546650768053e-06, "loss": 0.0782, "num_tokens": 843125577.0, "step": 6450 }, { "epoch": 2.5742218675179567, "grad_norm": 0.24694709479808807, "learning_rate": 7.433068160571555e-06, "loss": 0.0746, "num_tokens": 843256649.0, "step": 6451 }, { "epoch": 2.5746209098164403, "grad_norm": 0.29044309258461, "learning_rate": 7.428593553139306e-06, "loss": 0.0872, "num_tokens": 843387721.0, "step": 6452 }, { "epoch": 2.575019952114924, "grad_norm": 0.2764839231967926, "learning_rate": 7.424122829337097e-06, "loss": 0.0788, "num_tokens": 843518793.0, "step": 6453 }, { "epoch": 2.5754189944134076, "grad_norm": 0.26316216588020325, "learning_rate": 7.419655990029979e-06, "loss": 0.0834, "num_tokens": 843649865.0, "step": 6454 }, { "epoch": 2.5758180367118912, "grad_norm": 0.2180396169424057, "learning_rate": 7.415193036082235e-06, "loss": 0.0668, "num_tokens": 843780937.0, "step": 6455 }, { "epoch": 2.576217079010375, "grad_norm": 0.2701539993286133, "learning_rate": 7.410733968357414e-06, "loss": 0.0793, "num_tokens": 843912009.0, "step": 6456 }, { "epoch": 2.5766161213088585, "grad_norm": 0.24942463636398315, "learning_rate": 7.406278787718302e-06, "loss": 0.0828, "num_tokens": 844043081.0, "step": 6457 }, { "epoch": 2.577015163607342, "grad_norm": 0.28800255060195923, "learning_rate": 7.401827495026931e-06, "loss": 0.0889, "num_tokens": 844174153.0, "step": 6458 }, { "epoch": 2.577414205905826, "grad_norm": 0.25116199254989624, "learning_rate": 7.397380091144585e-06, "loss": 0.07, "num_tokens": 844305225.0, "step": 6459 }, { "epoch": 2.5778132482043095, "grad_norm": 0.19496046006679535, "learning_rate": 7.3929365769317994e-06, "loss": 0.0606, "num_tokens": 844436297.0, "step": 6460 }, { "epoch": 2.578212290502793, "grad_norm": 0.2379983514547348, "learning_rate": 7.38849695324835e-06, "loss": 0.0768, "num_tokens": 844567369.0, "step": 6461 }, { "epoch": 2.5786113328012767, "grad_norm": 0.2819608747959137, "learning_rate": 7.384061220953266e-06, "loss": 0.0957, "num_tokens": 844698441.0, "step": 6462 }, { "epoch": 2.5790103750997604, "grad_norm": 0.2357417643070221, "learning_rate": 7.379629380904814e-06, "loss": 0.0607, "num_tokens": 844829513.0, "step": 6463 }, { "epoch": 2.579409417398244, "grad_norm": 0.2414522022008896, "learning_rate": 7.3752014339605215e-06, "loss": 0.0689, "num_tokens": 844960585.0, "step": 6464 }, { "epoch": 2.5798084596967277, "grad_norm": 0.25092580914497375, "learning_rate": 7.370777380977145e-06, "loss": 0.0786, "num_tokens": 845091657.0, "step": 6465 }, { "epoch": 2.5802075019952113, "grad_norm": 0.25045540928840637, "learning_rate": 7.366357222810702e-06, "loss": 0.0679, "num_tokens": 845222729.0, "step": 6466 }, { "epoch": 2.580606544293695, "grad_norm": 0.23115144670009613, "learning_rate": 7.36194096031646e-06, "loss": 0.071, "num_tokens": 845353801.0, "step": 6467 }, { "epoch": 2.5810055865921786, "grad_norm": 0.2508176863193512, "learning_rate": 7.357528594348911e-06, "loss": 0.0722, "num_tokens": 845484873.0, "step": 6468 }, { "epoch": 2.5814046288906622, "grad_norm": 0.24292725324630737, "learning_rate": 7.353120125761813e-06, "loss": 0.0737, "num_tokens": 845615945.0, "step": 6469 }, { "epoch": 2.581803671189146, "grad_norm": 0.23893386125564575, "learning_rate": 7.348715555408171e-06, "loss": 0.0667, "num_tokens": 845747017.0, "step": 6470 }, { "epoch": 2.5822027134876295, "grad_norm": 0.22446276247501373, "learning_rate": 7.344314884140213e-06, "loss": 0.0629, "num_tokens": 845878089.0, "step": 6471 }, { "epoch": 2.582601755786113, "grad_norm": 0.21112900972366333, "learning_rate": 7.339918112809439e-06, "loss": 0.0547, "num_tokens": 846009161.0, "step": 6472 }, { "epoch": 2.583000798084597, "grad_norm": 0.26974180340766907, "learning_rate": 7.33552524226659e-06, "loss": 0.0669, "num_tokens": 846140233.0, "step": 6473 }, { "epoch": 2.5833998403830805, "grad_norm": 0.25675472617149353, "learning_rate": 7.331136273361624e-06, "loss": 0.0802, "num_tokens": 846271305.0, "step": 6474 }, { "epoch": 2.583798882681564, "grad_norm": 0.2724335789680481, "learning_rate": 7.326751206943779e-06, "loss": 0.0812, "num_tokens": 846402377.0, "step": 6475 }, { "epoch": 2.5841979249800477, "grad_norm": 0.2689724266529083, "learning_rate": 7.322370043861528e-06, "loss": 0.082, "num_tokens": 846533449.0, "step": 6476 }, { "epoch": 2.5845969672785314, "grad_norm": 0.24647413194179535, "learning_rate": 7.317992784962574e-06, "loss": 0.067, "num_tokens": 846664521.0, "step": 6477 }, { "epoch": 2.584996009577015, "grad_norm": 0.24694664776325226, "learning_rate": 7.3136194310938805e-06, "loss": 0.0813, "num_tokens": 846795593.0, "step": 6478 }, { "epoch": 2.5853950518754987, "grad_norm": 0.28030380606651306, "learning_rate": 7.309249983101658e-06, "loss": 0.0706, "num_tokens": 846926665.0, "step": 6479 }, { "epoch": 2.5857940941739823, "grad_norm": 0.2509288191795349, "learning_rate": 7.304884441831344e-06, "loss": 0.0627, "num_tokens": 847057737.0, "step": 6480 }, { "epoch": 2.586193136472466, "grad_norm": 0.24855713546276093, "learning_rate": 7.3005228081276355e-06, "loss": 0.0741, "num_tokens": 847188809.0, "step": 6481 }, { "epoch": 2.5865921787709496, "grad_norm": 0.32111072540283203, "learning_rate": 7.296165082834469e-06, "loss": 0.0866, "num_tokens": 847319881.0, "step": 6482 }, { "epoch": 2.5869912210694332, "grad_norm": 0.28604504466056824, "learning_rate": 7.291811266795016e-06, "loss": 0.0746, "num_tokens": 847450953.0, "step": 6483 }, { "epoch": 2.587390263367917, "grad_norm": 0.226244255900383, "learning_rate": 7.287461360851711e-06, "loss": 0.0569, "num_tokens": 847582025.0, "step": 6484 }, { "epoch": 2.5877893056664005, "grad_norm": 0.2491714358329773, "learning_rate": 7.2831153658462074e-06, "loss": 0.0697, "num_tokens": 847713097.0, "step": 6485 }, { "epoch": 2.588188347964884, "grad_norm": 0.25394490361213684, "learning_rate": 7.278773282619427e-06, "loss": 0.0769, "num_tokens": 847844169.0, "step": 6486 }, { "epoch": 2.588587390263368, "grad_norm": 0.235739067196846, "learning_rate": 7.274435112011512e-06, "loss": 0.0797, "num_tokens": 847975241.0, "step": 6487 }, { "epoch": 2.5889864325618515, "grad_norm": 0.2623063623905182, "learning_rate": 7.270100854861861e-06, "loss": 0.0689, "num_tokens": 848106313.0, "step": 6488 }, { "epoch": 2.589385474860335, "grad_norm": 0.2536422312259674, "learning_rate": 7.26577051200912e-06, "loss": 0.0703, "num_tokens": 848237385.0, "step": 6489 }, { "epoch": 2.5897845171588187, "grad_norm": 0.30627885460853577, "learning_rate": 7.261444084291161e-06, "loss": 0.107, "num_tokens": 848368457.0, "step": 6490 }, { "epoch": 2.5901835594573024, "grad_norm": 0.23215384781360626, "learning_rate": 7.2571215725451075e-06, "loss": 0.0636, "num_tokens": 848499529.0, "step": 6491 }, { "epoch": 2.590582601755786, "grad_norm": 0.25042614340782166, "learning_rate": 7.2528029776073315e-06, "loss": 0.0693, "num_tokens": 848630601.0, "step": 6492 }, { "epoch": 2.5909816440542697, "grad_norm": 0.2609964907169342, "learning_rate": 7.248488300313433e-06, "loss": 0.0742, "num_tokens": 848761673.0, "step": 6493 }, { "epoch": 2.5913806863527533, "grad_norm": 0.2681480050086975, "learning_rate": 7.2441775414982694e-06, "loss": 0.0793, "num_tokens": 848877469.0, "step": 6494 }, { "epoch": 2.591779728651237, "grad_norm": 0.23832198977470398, "learning_rate": 7.239870701995931e-06, "loss": 0.0701, "num_tokens": 849008541.0, "step": 6495 }, { "epoch": 2.5921787709497206, "grad_norm": 0.27883556485176086, "learning_rate": 7.235567782639751e-06, "loss": 0.0822, "num_tokens": 849139613.0, "step": 6496 }, { "epoch": 2.5925778132482042, "grad_norm": 0.2705577313899994, "learning_rate": 7.231268784262297e-06, "loss": 0.0799, "num_tokens": 849270685.0, "step": 6497 }, { "epoch": 2.592976855546688, "grad_norm": 0.23296208679676056, "learning_rate": 7.226973707695396e-06, "loss": 0.0675, "num_tokens": 849401757.0, "step": 6498 }, { "epoch": 2.5933758978451715, "grad_norm": 0.24083302915096283, "learning_rate": 7.222682553770094e-06, "loss": 0.0829, "num_tokens": 849532829.0, "step": 6499 }, { "epoch": 2.593774940143655, "grad_norm": 0.24664664268493652, "learning_rate": 7.218395323316696e-06, "loss": 0.0756, "num_tokens": 849663901.0, "step": 6500 }, { "epoch": 2.594173982442139, "grad_norm": 0.2528845965862274, "learning_rate": 7.214112017164743e-06, "loss": 0.0728, "num_tokens": 849794973.0, "step": 6501 }, { "epoch": 2.5945730247406225, "grad_norm": 0.25481486320495605, "learning_rate": 7.20983263614301e-06, "loss": 0.0739, "num_tokens": 849926045.0, "step": 6502 }, { "epoch": 2.594972067039106, "grad_norm": 0.2127763330936432, "learning_rate": 7.205557181079517e-06, "loss": 0.0667, "num_tokens": 850057117.0, "step": 6503 }, { "epoch": 2.5953711093375897, "grad_norm": 0.2594301104545593, "learning_rate": 7.201285652801531e-06, "loss": 0.0769, "num_tokens": 850188189.0, "step": 6504 }, { "epoch": 2.5957701516360734, "grad_norm": 0.2781384587287903, "learning_rate": 7.197018052135549e-06, "loss": 0.093, "num_tokens": 850319261.0, "step": 6505 }, { "epoch": 2.596169193934557, "grad_norm": 0.35326018929481506, "learning_rate": 7.19275437990731e-06, "loss": 0.0992, "num_tokens": 850450333.0, "step": 6506 }, { "epoch": 2.5965682362330407, "grad_norm": 0.31396040320396423, "learning_rate": 7.188494636941799e-06, "loss": 0.0841, "num_tokens": 850581405.0, "step": 6507 }, { "epoch": 2.5969672785315243, "grad_norm": 0.24528370797634125, "learning_rate": 7.184238824063236e-06, "loss": 0.0792, "num_tokens": 850712477.0, "step": 6508 }, { "epoch": 2.597366320830008, "grad_norm": 0.23690985143184662, "learning_rate": 7.179986942095074e-06, "loss": 0.0635, "num_tokens": 850843549.0, "step": 6509 }, { "epoch": 2.5977653631284916, "grad_norm": 0.2261689305305481, "learning_rate": 7.175738991860019e-06, "loss": 0.0706, "num_tokens": 850974621.0, "step": 6510 }, { "epoch": 2.5981644054269752, "grad_norm": 0.240886852145195, "learning_rate": 7.171494974180013e-06, "loss": 0.0816, "num_tokens": 851105693.0, "step": 6511 }, { "epoch": 2.598563447725459, "grad_norm": 0.2574295699596405, "learning_rate": 7.1672548898762235e-06, "loss": 0.077, "num_tokens": 851236765.0, "step": 6512 }, { "epoch": 2.5989624900239425, "grad_norm": 0.2777496874332428, "learning_rate": 7.163018739769076e-06, "loss": 0.0845, "num_tokens": 851367837.0, "step": 6513 }, { "epoch": 2.599361532322426, "grad_norm": 0.258266806602478, "learning_rate": 7.158786524678227e-06, "loss": 0.0787, "num_tokens": 851498909.0, "step": 6514 }, { "epoch": 2.59976057462091, "grad_norm": 0.2374909371137619, "learning_rate": 7.154558245422561e-06, "loss": 0.0658, "num_tokens": 851629981.0, "step": 6515 }, { "epoch": 2.6001596169193935, "grad_norm": 0.24196484684944153, "learning_rate": 7.150333902820222e-06, "loss": 0.0726, "num_tokens": 851761053.0, "step": 6516 }, { "epoch": 2.600558659217877, "grad_norm": 0.2512267827987671, "learning_rate": 7.146113497688574e-06, "loss": 0.087, "num_tokens": 851892125.0, "step": 6517 }, { "epoch": 2.6009577015163607, "grad_norm": 0.24417421221733093, "learning_rate": 7.141897030844227e-06, "loss": 0.0673, "num_tokens": 852023197.0, "step": 6518 }, { "epoch": 2.6013567438148444, "grad_norm": 0.25751209259033203, "learning_rate": 7.1376845031030285e-06, "loss": 0.0735, "num_tokens": 852154269.0, "step": 6519 }, { "epoch": 2.601755786113328, "grad_norm": 0.3002362847328186, "learning_rate": 7.133475915280066e-06, "loss": 0.0946, "num_tokens": 852285341.0, "step": 6520 }, { "epoch": 2.6021548284118117, "grad_norm": 0.2614586353302002, "learning_rate": 7.129271268189655e-06, "loss": 0.0765, "num_tokens": 852415003.0, "step": 6521 }, { "epoch": 2.6025538707102953, "grad_norm": 0.241773322224617, "learning_rate": 7.125070562645362e-06, "loss": 0.075, "num_tokens": 852546075.0, "step": 6522 }, { "epoch": 2.602952913008779, "grad_norm": 0.2909981310367584, "learning_rate": 7.120873799459986e-06, "loss": 0.0903, "num_tokens": 852677147.0, "step": 6523 }, { "epoch": 2.6033519553072626, "grad_norm": 0.2614644467830658, "learning_rate": 7.1166809794455534e-06, "loss": 0.0905, "num_tokens": 852808219.0, "step": 6524 }, { "epoch": 2.6037509976057462, "grad_norm": 0.24387918412685394, "learning_rate": 7.1124921034133384e-06, "loss": 0.0704, "num_tokens": 852939291.0, "step": 6525 }, { "epoch": 2.60415003990423, "grad_norm": 0.2504347860813141, "learning_rate": 7.108307172173859e-06, "loss": 0.0773, "num_tokens": 853070363.0, "step": 6526 }, { "epoch": 2.6045490822027135, "grad_norm": 0.247173473238945, "learning_rate": 7.104126186536847e-06, "loss": 0.0672, "num_tokens": 853201435.0, "step": 6527 }, { "epoch": 2.604948124501197, "grad_norm": 0.23926693201065063, "learning_rate": 7.09994914731129e-06, "loss": 0.0664, "num_tokens": 853332507.0, "step": 6528 }, { "epoch": 2.605347166799681, "grad_norm": 0.23737424612045288, "learning_rate": 7.095776055305411e-06, "loss": 0.0705, "num_tokens": 853463579.0, "step": 6529 }, { "epoch": 2.6057462090981645, "grad_norm": 0.26529401540756226, "learning_rate": 7.091606911326661e-06, "loss": 0.0725, "num_tokens": 853594651.0, "step": 6530 }, { "epoch": 2.606145251396648, "grad_norm": 0.28623154759407043, "learning_rate": 7.087441716181721e-06, "loss": 0.0804, "num_tokens": 853725723.0, "step": 6531 }, { "epoch": 2.6065442936951317, "grad_norm": 0.24318651854991913, "learning_rate": 7.083280470676531e-06, "loss": 0.0693, "num_tokens": 853856795.0, "step": 6532 }, { "epoch": 2.6069433359936154, "grad_norm": 0.275333046913147, "learning_rate": 7.079123175616245e-06, "loss": 0.0842, "num_tokens": 853987867.0, "step": 6533 }, { "epoch": 2.607342378292099, "grad_norm": 0.24441401660442352, "learning_rate": 7.074969831805263e-06, "loss": 0.0666, "num_tokens": 854118939.0, "step": 6534 }, { "epoch": 2.6077414205905827, "grad_norm": 0.22014491260051727, "learning_rate": 7.070820440047223e-06, "loss": 0.0555, "num_tokens": 854250011.0, "step": 6535 }, { "epoch": 2.6081404628890663, "grad_norm": 0.2250956892967224, "learning_rate": 7.066675001144985e-06, "loss": 0.0616, "num_tokens": 854381083.0, "step": 6536 }, { "epoch": 2.60853950518755, "grad_norm": 0.2459006905555725, "learning_rate": 7.0625335159006555e-06, "loss": 0.0802, "num_tokens": 854512155.0, "step": 6537 }, { "epoch": 2.6089385474860336, "grad_norm": 0.22197820246219635, "learning_rate": 7.058395985115582e-06, "loss": 0.0623, "num_tokens": 854643227.0, "step": 6538 }, { "epoch": 2.6093375897845172, "grad_norm": 0.2459946721792221, "learning_rate": 7.054262409590324e-06, "loss": 0.0744, "num_tokens": 854774299.0, "step": 6539 }, { "epoch": 2.609736632083001, "grad_norm": 0.2534506320953369, "learning_rate": 7.0501327901246996e-06, "loss": 0.0671, "num_tokens": 854905371.0, "step": 6540 }, { "epoch": 2.6101356743814845, "grad_norm": 0.26609572768211365, "learning_rate": 7.046007127517753e-06, "loss": 0.0752, "num_tokens": 855036443.0, "step": 6541 }, { "epoch": 2.610534716679968, "grad_norm": 0.26001814007759094, "learning_rate": 7.041885422567757e-06, "loss": 0.0698, "num_tokens": 855167515.0, "step": 6542 }, { "epoch": 2.610933758978452, "grad_norm": 0.22274167835712433, "learning_rate": 7.0377676760722196e-06, "loss": 0.0533, "num_tokens": 855298587.0, "step": 6543 }, { "epoch": 2.6113328012769355, "grad_norm": 0.24517206847667694, "learning_rate": 7.033653888827891e-06, "loss": 0.0689, "num_tokens": 855429659.0, "step": 6544 }, { "epoch": 2.611731843575419, "grad_norm": 0.2749052047729492, "learning_rate": 7.029544061630754e-06, "loss": 0.0793, "num_tokens": 855560731.0, "step": 6545 }, { "epoch": 2.6121308858739027, "grad_norm": 0.3104390799999237, "learning_rate": 7.025438195276013e-06, "loss": 0.0939, "num_tokens": 855691803.0, "step": 6546 }, { "epoch": 2.6125299281723864, "grad_norm": 0.2624402642250061, "learning_rate": 7.0213362905581195e-06, "loss": 0.0694, "num_tokens": 855822875.0, "step": 6547 }, { "epoch": 2.61292897047087, "grad_norm": 0.31405773758888245, "learning_rate": 7.01723834827076e-06, "loss": 0.0927, "num_tokens": 855953947.0, "step": 6548 }, { "epoch": 2.6133280127693537, "grad_norm": 0.2166426181793213, "learning_rate": 7.0131443692068354e-06, "loss": 0.0686, "num_tokens": 856085019.0, "step": 6549 }, { "epoch": 2.6137270550678373, "grad_norm": 0.23802275955677032, "learning_rate": 7.0090543541585025e-06, "loss": 0.0761, "num_tokens": 856216091.0, "step": 6550 }, { "epoch": 2.614126097366321, "grad_norm": 0.2817743420600891, "learning_rate": 7.004968303917141e-06, "loss": 0.0849, "num_tokens": 856347163.0, "step": 6551 }, { "epoch": 2.6145251396648046, "grad_norm": 0.2688118517398834, "learning_rate": 7.00088621927336e-06, "loss": 0.0932, "num_tokens": 856478235.0, "step": 6552 }, { "epoch": 2.6149241819632882, "grad_norm": 0.25512880086898804, "learning_rate": 6.996808101016999e-06, "loss": 0.0834, "num_tokens": 856609307.0, "step": 6553 }, { "epoch": 2.615323224261772, "grad_norm": 0.22532230615615845, "learning_rate": 6.99273394993715e-06, "loss": 0.0704, "num_tokens": 856740379.0, "step": 6554 }, { "epoch": 2.6157222665602555, "grad_norm": 0.2415996789932251, "learning_rate": 6.98866376682211e-06, "loss": 0.076, "num_tokens": 856871451.0, "step": 6555 }, { "epoch": 2.616121308858739, "grad_norm": 0.2717762589454651, "learning_rate": 6.984597552459425e-06, "loss": 0.0838, "num_tokens": 857002523.0, "step": 6556 }, { "epoch": 2.616520351157223, "grad_norm": 0.19912581145763397, "learning_rate": 6.980535307635876e-06, "loss": 0.0507, "num_tokens": 857133595.0, "step": 6557 }, { "epoch": 2.6169193934557065, "grad_norm": 0.21227477490901947, "learning_rate": 6.976477033137463e-06, "loss": 0.0679, "num_tokens": 857264667.0, "step": 6558 }, { "epoch": 2.61731843575419, "grad_norm": 0.2402472048997879, "learning_rate": 6.972422729749425e-06, "loss": 0.0784, "num_tokens": 857395739.0, "step": 6559 }, { "epoch": 2.6177174780526737, "grad_norm": 0.24412260949611664, "learning_rate": 6.968372398256236e-06, "loss": 0.0763, "num_tokens": 857526811.0, "step": 6560 }, { "epoch": 2.6181165203511574, "grad_norm": 0.28240373730659485, "learning_rate": 6.9643260394415935e-06, "loss": 0.0915, "num_tokens": 857657883.0, "step": 6561 }, { "epoch": 2.618515562649641, "grad_norm": 0.2621654272079468, "learning_rate": 6.960283654088429e-06, "loss": 0.0782, "num_tokens": 857788955.0, "step": 6562 }, { "epoch": 2.6189146049481247, "grad_norm": 0.2798207104206085, "learning_rate": 6.956245242978912e-06, "loss": 0.0816, "num_tokens": 857920027.0, "step": 6563 }, { "epoch": 2.6193136472466083, "grad_norm": 0.26652342081069946, "learning_rate": 6.952210806894437e-06, "loss": 0.0724, "num_tokens": 858051099.0, "step": 6564 }, { "epoch": 2.619712689545092, "grad_norm": 0.30957111716270447, "learning_rate": 6.94818034661562e-06, "loss": 0.0853, "num_tokens": 858182171.0, "step": 6565 }, { "epoch": 2.6201117318435756, "grad_norm": 0.22804859280586243, "learning_rate": 6.944153862922324e-06, "loss": 0.0636, "num_tokens": 858313243.0, "step": 6566 }, { "epoch": 2.6205107741420592, "grad_norm": 0.269625186920166, "learning_rate": 6.9401313565936435e-06, "loss": 0.0813, "num_tokens": 858444315.0, "step": 6567 }, { "epoch": 2.620909816440543, "grad_norm": 0.2457120418548584, "learning_rate": 6.936112828407883e-06, "loss": 0.0723, "num_tokens": 858575387.0, "step": 6568 }, { "epoch": 2.6213088587390265, "grad_norm": 0.22992916405200958, "learning_rate": 6.932098279142595e-06, "loss": 0.0677, "num_tokens": 858706459.0, "step": 6569 }, { "epoch": 2.62170790103751, "grad_norm": 0.23590721189975739, "learning_rate": 6.928087709574565e-06, "loss": 0.0675, "num_tokens": 858837531.0, "step": 6570 }, { "epoch": 2.622106943335994, "grad_norm": 0.2175644487142563, "learning_rate": 6.924081120479789e-06, "loss": 0.068, "num_tokens": 858968603.0, "step": 6571 }, { "epoch": 2.6225059856344775, "grad_norm": 0.24517077207565308, "learning_rate": 6.920078512633509e-06, "loss": 0.0676, "num_tokens": 859099675.0, "step": 6572 }, { "epoch": 2.622905027932961, "grad_norm": 0.24023400247097015, "learning_rate": 6.916079886810198e-06, "loss": 0.0704, "num_tokens": 859230747.0, "step": 6573 }, { "epoch": 2.6233040702314447, "grad_norm": 0.2632116675376892, "learning_rate": 6.912085243783544e-06, "loss": 0.0768, "num_tokens": 859361819.0, "step": 6574 }, { "epoch": 2.6237031125299284, "grad_norm": 0.23852109909057617, "learning_rate": 6.908094584326481e-06, "loss": 0.059, "num_tokens": 859492891.0, "step": 6575 }, { "epoch": 2.624102154828412, "grad_norm": 0.2750861644744873, "learning_rate": 6.9041079092111605e-06, "loss": 0.0772, "num_tokens": 859623963.0, "step": 6576 }, { "epoch": 2.6245011971268957, "grad_norm": 0.28678208589553833, "learning_rate": 6.900125219208965e-06, "loss": 0.0791, "num_tokens": 859755035.0, "step": 6577 }, { "epoch": 2.6249002394253793, "grad_norm": 0.23783962428569794, "learning_rate": 6.8961465150905065e-06, "loss": 0.0701, "num_tokens": 859886107.0, "step": 6578 }, { "epoch": 2.625299281723863, "grad_norm": 0.27431944012641907, "learning_rate": 6.892171797625638e-06, "loss": 0.0792, "num_tokens": 860017179.0, "step": 6579 }, { "epoch": 2.6256983240223466, "grad_norm": 0.21964260935783386, "learning_rate": 6.888201067583418e-06, "loss": 0.065, "num_tokens": 860148251.0, "step": 6580 }, { "epoch": 2.6260973663208302, "grad_norm": 0.23871323466300964, "learning_rate": 6.884234325732149e-06, "loss": 0.0585, "num_tokens": 860267607.0, "step": 6581 }, { "epoch": 2.626496408619314, "grad_norm": 0.23332016170024872, "learning_rate": 6.880271572839365e-06, "loss": 0.0706, "num_tokens": 860398679.0, "step": 6582 }, { "epoch": 2.6268954509177975, "grad_norm": 0.3080889582633972, "learning_rate": 6.876312809671813e-06, "loss": 0.0993, "num_tokens": 860529751.0, "step": 6583 }, { "epoch": 2.627294493216281, "grad_norm": 0.21193991601467133, "learning_rate": 6.8723580369954825e-06, "loss": 0.0614, "num_tokens": 860660823.0, "step": 6584 }, { "epoch": 2.627693535514765, "grad_norm": 0.23921595513820648, "learning_rate": 6.868407255575586e-06, "loss": 0.0658, "num_tokens": 860791895.0, "step": 6585 }, { "epoch": 2.6280925778132485, "grad_norm": 0.23230373859405518, "learning_rate": 6.86446046617656e-06, "loss": 0.0598, "num_tokens": 860922967.0, "step": 6586 }, { "epoch": 2.628491620111732, "grad_norm": 0.24744422733783722, "learning_rate": 6.860517669562069e-06, "loss": 0.0762, "num_tokens": 861054039.0, "step": 6587 }, { "epoch": 2.6288906624102157, "grad_norm": 0.24898098409175873, "learning_rate": 6.85657886649501e-06, "loss": 0.0738, "num_tokens": 861185111.0, "step": 6588 }, { "epoch": 2.6292897047086994, "grad_norm": 0.24049974977970123, "learning_rate": 6.852644057737508e-06, "loss": 0.0687, "num_tokens": 861316183.0, "step": 6589 }, { "epoch": 2.629688747007183, "grad_norm": 0.23136182129383087, "learning_rate": 6.848713244050905e-06, "loss": 0.0713, "num_tokens": 861447255.0, "step": 6590 }, { "epoch": 2.6300877893056667, "grad_norm": 0.24700026214122772, "learning_rate": 6.844786426195783e-06, "loss": 0.0782, "num_tokens": 861578327.0, "step": 6591 }, { "epoch": 2.6304868316041503, "grad_norm": 0.21959088742733002, "learning_rate": 6.840863604931938e-06, "loss": 0.0729, "num_tokens": 861709399.0, "step": 6592 }, { "epoch": 2.630885873902634, "grad_norm": 0.24739427864551544, "learning_rate": 6.836944781018408e-06, "loss": 0.0658, "num_tokens": 861840471.0, "step": 6593 }, { "epoch": 2.631284916201117, "grad_norm": 0.2792763411998749, "learning_rate": 6.833029955213444e-06, "loss": 0.0718, "num_tokens": 861971543.0, "step": 6594 }, { "epoch": 2.631683958499601, "grad_norm": 0.2626914083957672, "learning_rate": 6.8291191282745246e-06, "loss": 0.0708, "num_tokens": 862102615.0, "step": 6595 }, { "epoch": 2.6320830007980844, "grad_norm": 0.24548661708831787, "learning_rate": 6.825212300958364e-06, "loss": 0.0744, "num_tokens": 862233687.0, "step": 6596 }, { "epoch": 2.632482043096568, "grad_norm": 0.25544023513793945, "learning_rate": 6.821309474020899e-06, "loss": 0.0811, "num_tokens": 862364759.0, "step": 6597 }, { "epoch": 2.6328810853950517, "grad_norm": 0.24730947613716125, "learning_rate": 6.8174106482172875e-06, "loss": 0.0814, "num_tokens": 862495831.0, "step": 6598 }, { "epoch": 2.6332801276935354, "grad_norm": 0.24610406160354614, "learning_rate": 6.813515824301907e-06, "loss": 0.0764, "num_tokens": 862626903.0, "step": 6599 }, { "epoch": 2.633679169992019, "grad_norm": 0.2411700040102005, "learning_rate": 6.80962500302838e-06, "loss": 0.0744, "num_tokens": 862757975.0, "step": 6600 }, { "epoch": 2.6340782122905027, "grad_norm": 0.25543174147605896, "learning_rate": 6.805738185149548e-06, "loss": 0.0791, "num_tokens": 862889047.0, "step": 6601 }, { "epoch": 2.6344772545889863, "grad_norm": 0.3053950071334839, "learning_rate": 6.801855371417461e-06, "loss": 0.0874, "num_tokens": 863020119.0, "step": 6602 }, { "epoch": 2.63487629688747, "grad_norm": 0.24572129547595978, "learning_rate": 6.797976562583414e-06, "loss": 0.0796, "num_tokens": 863151191.0, "step": 6603 }, { "epoch": 2.6352753391859536, "grad_norm": 0.2610989809036255, "learning_rate": 6.794101759397927e-06, "loss": 0.0666, "num_tokens": 863282263.0, "step": 6604 }, { "epoch": 2.6356743814844372, "grad_norm": 0.21390710771083832, "learning_rate": 6.7902309626107236e-06, "loss": 0.0705, "num_tokens": 863413335.0, "step": 6605 }, { "epoch": 2.636073423782921, "grad_norm": 0.26091575622558594, "learning_rate": 6.786364172970777e-06, "loss": 0.0656, "num_tokens": 863544407.0, "step": 6606 }, { "epoch": 2.6364724660814045, "grad_norm": 0.24865789711475372, "learning_rate": 6.782501391226278e-06, "loss": 0.0676, "num_tokens": 863675479.0, "step": 6607 }, { "epoch": 2.636871508379888, "grad_norm": 0.21990881860256195, "learning_rate": 6.7786426181246265e-06, "loss": 0.0628, "num_tokens": 863806551.0, "step": 6608 }, { "epoch": 2.637270550678372, "grad_norm": 0.21318982541561127, "learning_rate": 6.774787854412473e-06, "loss": 0.0595, "num_tokens": 863937623.0, "step": 6609 }, { "epoch": 2.6376695929768554, "grad_norm": 0.24407516419887543, "learning_rate": 6.77093710083567e-06, "loss": 0.069, "num_tokens": 864068695.0, "step": 6610 }, { "epoch": 2.638068635275339, "grad_norm": 0.28581759333610535, "learning_rate": 6.767090358139302e-06, "loss": 0.0897, "num_tokens": 864199767.0, "step": 6611 }, { "epoch": 2.6384676775738227, "grad_norm": 0.23602023720741272, "learning_rate": 6.763247627067679e-06, "loss": 0.0673, "num_tokens": 864330839.0, "step": 6612 }, { "epoch": 2.6388667198723064, "grad_norm": 0.2445565164089203, "learning_rate": 6.75940890836434e-06, "loss": 0.0641, "num_tokens": 864461911.0, "step": 6613 }, { "epoch": 2.63926576217079, "grad_norm": 0.26253852248191833, "learning_rate": 6.75557420277203e-06, "loss": 0.0746, "num_tokens": 864592983.0, "step": 6614 }, { "epoch": 2.6396648044692737, "grad_norm": 0.3275136351585388, "learning_rate": 6.7517435110327345e-06, "loss": 0.0802, "num_tokens": 864724055.0, "step": 6615 }, { "epoch": 2.6400638467677573, "grad_norm": 0.22215710580348969, "learning_rate": 6.7479168338876615e-06, "loss": 0.061, "num_tokens": 864855127.0, "step": 6616 }, { "epoch": 2.640462889066241, "grad_norm": 0.2564559876918793, "learning_rate": 6.744094172077228e-06, "loss": 0.0772, "num_tokens": 864986199.0, "step": 6617 }, { "epoch": 2.6408619313647246, "grad_norm": 0.23642928898334503, "learning_rate": 6.740275526341092e-06, "loss": 0.0668, "num_tokens": 865117271.0, "step": 6618 }, { "epoch": 2.6412609736632082, "grad_norm": 0.20798039436340332, "learning_rate": 6.736460897418121e-06, "loss": 0.0526, "num_tokens": 865248343.0, "step": 6619 }, { "epoch": 2.641660015961692, "grad_norm": 0.21616074442863464, "learning_rate": 6.732650286046413e-06, "loss": 0.0598, "num_tokens": 865379415.0, "step": 6620 }, { "epoch": 2.6420590582601755, "grad_norm": 0.19870801270008087, "learning_rate": 6.72884369296328e-06, "loss": 0.0524, "num_tokens": 865510487.0, "step": 6621 }, { "epoch": 2.642458100558659, "grad_norm": 0.22352047264575958, "learning_rate": 6.725041118905266e-06, "loss": 0.0715, "num_tokens": 865641559.0, "step": 6622 }, { "epoch": 2.642857142857143, "grad_norm": 0.21608591079711914, "learning_rate": 6.721242564608139e-06, "loss": 0.056, "num_tokens": 865772631.0, "step": 6623 }, { "epoch": 2.6432561851556264, "grad_norm": 0.21894559264183044, "learning_rate": 6.717448030806872e-06, "loss": 0.0694, "num_tokens": 865903703.0, "step": 6624 }, { "epoch": 2.64365522745411, "grad_norm": 0.2544962167739868, "learning_rate": 6.71365751823568e-06, "loss": 0.07, "num_tokens": 866034775.0, "step": 6625 }, { "epoch": 2.6440542697525937, "grad_norm": 0.24697095155715942, "learning_rate": 6.709871027627995e-06, "loss": 0.0742, "num_tokens": 866165847.0, "step": 6626 }, { "epoch": 2.6444533120510774, "grad_norm": 0.23440511524677277, "learning_rate": 6.706088559716458e-06, "loss": 0.0688, "num_tokens": 866296919.0, "step": 6627 }, { "epoch": 2.644852354349561, "grad_norm": 0.25951534509658813, "learning_rate": 6.702310115232949e-06, "loss": 0.0849, "num_tokens": 866427991.0, "step": 6628 }, { "epoch": 2.6452513966480447, "grad_norm": 0.2906496524810791, "learning_rate": 6.698535694908562e-06, "loss": 0.0744, "num_tokens": 866545166.0, "step": 6629 }, { "epoch": 2.6456504389465283, "grad_norm": 0.29581403732299805, "learning_rate": 6.694765299473603e-06, "loss": 0.0797, "num_tokens": 866676238.0, "step": 6630 }, { "epoch": 2.646049481245012, "grad_norm": 0.22353637218475342, "learning_rate": 6.690998929657618e-06, "loss": 0.0689, "num_tokens": 866807310.0, "step": 6631 }, { "epoch": 2.6464485235434956, "grad_norm": 0.24812555313110352, "learning_rate": 6.687236586189363e-06, "loss": 0.0754, "num_tokens": 866938382.0, "step": 6632 }, { "epoch": 2.646847565841979, "grad_norm": 0.23857349157333374, "learning_rate": 6.683478269796811e-06, "loss": 0.0737, "num_tokens": 867069454.0, "step": 6633 }, { "epoch": 2.647246608140463, "grad_norm": 0.23235563933849335, "learning_rate": 6.679723981207164e-06, "loss": 0.0675, "num_tokens": 867200526.0, "step": 6634 }, { "epoch": 2.6476456504389465, "grad_norm": 0.2641178071498871, "learning_rate": 6.675973721146846e-06, "loss": 0.0817, "num_tokens": 867331598.0, "step": 6635 }, { "epoch": 2.64804469273743, "grad_norm": 0.2519122362136841, "learning_rate": 6.67222749034149e-06, "loss": 0.0682, "num_tokens": 867462670.0, "step": 6636 }, { "epoch": 2.648443735035914, "grad_norm": 0.22822275757789612, "learning_rate": 6.668485289515962e-06, "loss": 0.0733, "num_tokens": 867593742.0, "step": 6637 }, { "epoch": 2.6488427773343974, "grad_norm": 0.27649810910224915, "learning_rate": 6.664747119394343e-06, "loss": 0.083, "num_tokens": 867724814.0, "step": 6638 }, { "epoch": 2.649241819632881, "grad_norm": 0.24733395874500275, "learning_rate": 6.661012980699931e-06, "loss": 0.0563, "num_tokens": 867855886.0, "step": 6639 }, { "epoch": 2.6496408619313647, "grad_norm": 0.2866870164871216, "learning_rate": 6.657282874155246e-06, "loss": 0.0791, "num_tokens": 867986958.0, "step": 6640 }, { "epoch": 2.6500399042298484, "grad_norm": 0.30427709221839905, "learning_rate": 6.65355680048204e-06, "loss": 0.0825, "num_tokens": 868118030.0, "step": 6641 }, { "epoch": 2.650438946528332, "grad_norm": 0.2923352122306824, "learning_rate": 6.6498347604012574e-06, "loss": 0.0797, "num_tokens": 868249102.0, "step": 6642 }, { "epoch": 2.6508379888268156, "grad_norm": 0.27064865827560425, "learning_rate": 6.646116754633091e-06, "loss": 0.0805, "num_tokens": 868380174.0, "step": 6643 }, { "epoch": 2.6512370311252993, "grad_norm": 0.23861809074878693, "learning_rate": 6.642402783896932e-06, "loss": 0.0664, "num_tokens": 868511246.0, "step": 6644 }, { "epoch": 2.651636073423783, "grad_norm": 0.3944731056690216, "learning_rate": 6.638692848911404e-06, "loss": 0.0983, "num_tokens": 868628523.0, "step": 6645 }, { "epoch": 2.6520351157222666, "grad_norm": 0.2157188206911087, "learning_rate": 6.6349869503943424e-06, "loss": 0.0646, "num_tokens": 868759595.0, "step": 6646 }, { "epoch": 2.65243415802075, "grad_norm": 0.23589329421520233, "learning_rate": 6.6312850890628015e-06, "loss": 0.0729, "num_tokens": 868890667.0, "step": 6647 }, { "epoch": 2.652833200319234, "grad_norm": 0.23920275270938873, "learning_rate": 6.627587265633066e-06, "loss": 0.0657, "num_tokens": 869021739.0, "step": 6648 }, { "epoch": 2.6532322426177175, "grad_norm": 0.24358361959457397, "learning_rate": 6.6238934808206185e-06, "loss": 0.0812, "num_tokens": 869152811.0, "step": 6649 }, { "epoch": 2.653631284916201, "grad_norm": 0.21979445219039917, "learning_rate": 6.6202037353401806e-06, "loss": 0.0691, "num_tokens": 869283883.0, "step": 6650 }, { "epoch": 2.654030327214685, "grad_norm": 0.21609970927238464, "learning_rate": 6.6165180299056825e-06, "loss": 0.0562, "num_tokens": 869414955.0, "step": 6651 }, { "epoch": 2.6544293695131684, "grad_norm": 0.26334336400032043, "learning_rate": 6.612836365230269e-06, "loss": 0.0784, "num_tokens": 869546027.0, "step": 6652 }, { "epoch": 2.654828411811652, "grad_norm": 0.2169889211654663, "learning_rate": 6.609158742026317e-06, "loss": 0.0703, "num_tokens": 869677099.0, "step": 6653 }, { "epoch": 2.6552274541101357, "grad_norm": 0.22155849635601044, "learning_rate": 6.605485161005404e-06, "loss": 0.0672, "num_tokens": 869808171.0, "step": 6654 }, { "epoch": 2.6556264964086194, "grad_norm": 0.2237139344215393, "learning_rate": 6.6018156228783335e-06, "loss": 0.0656, "num_tokens": 869939243.0, "step": 6655 }, { "epoch": 2.656025538707103, "grad_norm": 0.2539966106414795, "learning_rate": 6.598150128355128e-06, "loss": 0.0787, "num_tokens": 870070315.0, "step": 6656 }, { "epoch": 2.6564245810055866, "grad_norm": 0.2514484226703644, "learning_rate": 6.594488678145033e-06, "loss": 0.0784, "num_tokens": 870201387.0, "step": 6657 }, { "epoch": 2.6568236233040703, "grad_norm": 0.25587940216064453, "learning_rate": 6.590831272956498e-06, "loss": 0.0806, "num_tokens": 870332459.0, "step": 6658 }, { "epoch": 2.657222665602554, "grad_norm": 0.2339974194765091, "learning_rate": 6.587177913497197e-06, "loss": 0.0776, "num_tokens": 870463531.0, "step": 6659 }, { "epoch": 2.6576217079010376, "grad_norm": 0.21430443227291107, "learning_rate": 6.583528600474028e-06, "loss": 0.0589, "num_tokens": 870594603.0, "step": 6660 }, { "epoch": 2.658020750199521, "grad_norm": 0.2787502706050873, "learning_rate": 6.579883334593087e-06, "loss": 0.0887, "num_tokens": 870725675.0, "step": 6661 }, { "epoch": 2.658419792498005, "grad_norm": 0.24077636003494263, "learning_rate": 6.57624211655971e-06, "loss": 0.0615, "num_tokens": 870856747.0, "step": 6662 }, { "epoch": 2.6588188347964885, "grad_norm": 0.26253578066825867, "learning_rate": 6.572604947078435e-06, "loss": 0.0823, "num_tokens": 870987819.0, "step": 6663 }, { "epoch": 2.659217877094972, "grad_norm": 0.29637691378593445, "learning_rate": 6.568971826853018e-06, "loss": 0.0735, "num_tokens": 871118891.0, "step": 6664 }, { "epoch": 2.659616919393456, "grad_norm": 0.27069389820098877, "learning_rate": 6.56534275658644e-06, "loss": 0.0735, "num_tokens": 871249963.0, "step": 6665 }, { "epoch": 2.6600159616919394, "grad_norm": 0.2395847588777542, "learning_rate": 6.561717736980884e-06, "loss": 0.0676, "num_tokens": 871381035.0, "step": 6666 }, { "epoch": 2.660415003990423, "grad_norm": 0.24818459153175354, "learning_rate": 6.558096768737766e-06, "loss": 0.0723, "num_tokens": 871512107.0, "step": 6667 }, { "epoch": 2.6608140462889067, "grad_norm": 0.2711902856826782, "learning_rate": 6.5544798525577e-06, "loss": 0.081, "num_tokens": 871643179.0, "step": 6668 }, { "epoch": 2.6612130885873904, "grad_norm": 0.24929608404636383, "learning_rate": 6.550866989140536e-06, "loss": 0.0728, "num_tokens": 871774251.0, "step": 6669 }, { "epoch": 2.661612130885874, "grad_norm": 0.23198732733726501, "learning_rate": 6.547258179185319e-06, "loss": 0.0685, "num_tokens": 871905323.0, "step": 6670 }, { "epoch": 2.6620111731843576, "grad_norm": 0.2656128704547882, "learning_rate": 6.543653423390328e-06, "loss": 0.0724, "num_tokens": 872036395.0, "step": 6671 }, { "epoch": 2.6624102154828413, "grad_norm": 0.2302136868238449, "learning_rate": 6.540052722453048e-06, "loss": 0.0642, "num_tokens": 872167467.0, "step": 6672 }, { "epoch": 2.662809257781325, "grad_norm": 0.2484729290008545, "learning_rate": 6.536456077070178e-06, "loss": 0.0718, "num_tokens": 872298539.0, "step": 6673 }, { "epoch": 2.6632083000798086, "grad_norm": 0.25350821018218994, "learning_rate": 6.5328634879376375e-06, "loss": 0.0758, "num_tokens": 872429611.0, "step": 6674 }, { "epoch": 2.663607342378292, "grad_norm": 0.2293877899646759, "learning_rate": 6.529274955750563e-06, "loss": 0.0613, "num_tokens": 872560683.0, "step": 6675 }, { "epoch": 2.664006384676776, "grad_norm": 0.21239440143108368, "learning_rate": 6.5256904812032936e-06, "loss": 0.0521, "num_tokens": 872691755.0, "step": 6676 }, { "epoch": 2.6644054269752595, "grad_norm": 0.2687729597091675, "learning_rate": 6.5221100649893996e-06, "loss": 0.0688, "num_tokens": 872822827.0, "step": 6677 }, { "epoch": 2.664804469273743, "grad_norm": 0.24452093243598938, "learning_rate": 6.51853370780165e-06, "loss": 0.0799, "num_tokens": 872953899.0, "step": 6678 }, { "epoch": 2.665203511572227, "grad_norm": 0.2237691879272461, "learning_rate": 6.514961410332044e-06, "loss": 0.065, "num_tokens": 873084971.0, "step": 6679 }, { "epoch": 2.6656025538707104, "grad_norm": 0.25648388266563416, "learning_rate": 6.511393173271785e-06, "loss": 0.0734, "num_tokens": 873216043.0, "step": 6680 }, { "epoch": 2.666001596169194, "grad_norm": 0.29591843485832214, "learning_rate": 6.507828997311291e-06, "loss": 0.0865, "num_tokens": 873331576.0, "step": 6681 }, { "epoch": 2.6664006384676777, "grad_norm": 0.25821995735168457, "learning_rate": 6.504268883140205e-06, "loss": 0.0788, "num_tokens": 873462648.0, "step": 6682 }, { "epoch": 2.6667996807661614, "grad_norm": 0.2462000548839569, "learning_rate": 6.500712831447365e-06, "loss": 0.0671, "num_tokens": 873593720.0, "step": 6683 }, { "epoch": 2.667198723064645, "grad_norm": 0.2691582143306732, "learning_rate": 6.497160842920841e-06, "loss": 0.0837, "num_tokens": 873724792.0, "step": 6684 }, { "epoch": 2.6675977653631286, "grad_norm": 0.2681458294391632, "learning_rate": 6.49361291824791e-06, "loss": 0.0754, "num_tokens": 873855864.0, "step": 6685 }, { "epoch": 2.6679968076616123, "grad_norm": 0.23911529779434204, "learning_rate": 6.4900690581150585e-06, "loss": 0.0635, "num_tokens": 873986936.0, "step": 6686 }, { "epoch": 2.668395849960096, "grad_norm": 0.22282803058624268, "learning_rate": 6.486529263207991e-06, "loss": 0.0587, "num_tokens": 874118008.0, "step": 6687 }, { "epoch": 2.668794892258579, "grad_norm": 0.27179333567619324, "learning_rate": 6.482993534211631e-06, "loss": 0.0896, "num_tokens": 874249080.0, "step": 6688 }, { "epoch": 2.6691939345570628, "grad_norm": 0.2186886966228485, "learning_rate": 6.479461871810101e-06, "loss": 0.0559, "num_tokens": 874380152.0, "step": 6689 }, { "epoch": 2.6695929768555464, "grad_norm": 0.24534928798675537, "learning_rate": 6.475934276686743e-06, "loss": 0.0799, "num_tokens": 874511224.0, "step": 6690 }, { "epoch": 2.66999201915403, "grad_norm": 0.21132782101631165, "learning_rate": 6.472410749524127e-06, "loss": 0.0602, "num_tokens": 874642296.0, "step": 6691 }, { "epoch": 2.6703910614525137, "grad_norm": 0.24937142431735992, "learning_rate": 6.468891291004009e-06, "loss": 0.0785, "num_tokens": 874773368.0, "step": 6692 }, { "epoch": 2.6707901037509973, "grad_norm": 0.21911323070526123, "learning_rate": 6.465375901807377e-06, "loss": 0.069, "num_tokens": 874904440.0, "step": 6693 }, { "epoch": 2.671189146049481, "grad_norm": 0.32706212997436523, "learning_rate": 6.461864582614428e-06, "loss": 0.0904, "num_tokens": 875035512.0, "step": 6694 }, { "epoch": 2.6715881883479646, "grad_norm": 0.23666033148765564, "learning_rate": 6.458357334104563e-06, "loss": 0.0572, "num_tokens": 875166584.0, "step": 6695 }, { "epoch": 2.6719872306464483, "grad_norm": 0.24541710317134857, "learning_rate": 6.454854156956408e-06, "loss": 0.071, "num_tokens": 875297656.0, "step": 6696 }, { "epoch": 2.672386272944932, "grad_norm": 0.23813168704509735, "learning_rate": 6.451355051847796e-06, "loss": 0.0719, "num_tokens": 875428728.0, "step": 6697 }, { "epoch": 2.6727853152434156, "grad_norm": 0.2817200720310211, "learning_rate": 6.447860019455766e-06, "loss": 0.0676, "num_tokens": 875559800.0, "step": 6698 }, { "epoch": 2.673184357541899, "grad_norm": 0.249411940574646, "learning_rate": 6.444369060456582e-06, "loss": 0.0704, "num_tokens": 875690872.0, "step": 6699 }, { "epoch": 2.673583399840383, "grad_norm": 0.25091803073883057, "learning_rate": 6.440882175525699e-06, "loss": 0.074, "num_tokens": 875821944.0, "step": 6700 }, { "epoch": 2.6739824421388665, "grad_norm": 0.23880034685134888, "learning_rate": 6.437399365337809e-06, "loss": 0.0773, "num_tokens": 875953016.0, "step": 6701 }, { "epoch": 2.67438148443735, "grad_norm": 0.21275003254413605, "learning_rate": 6.433920630566796e-06, "loss": 0.0642, "num_tokens": 876084088.0, "step": 6702 }, { "epoch": 2.6747805267358338, "grad_norm": 0.23203185200691223, "learning_rate": 6.4304459718857655e-06, "loss": 0.0647, "num_tokens": 876215160.0, "step": 6703 }, { "epoch": 2.6751795690343174, "grad_norm": 0.289623498916626, "learning_rate": 6.4269753899670355e-06, "loss": 0.0777, "num_tokens": 876346232.0, "step": 6704 }, { "epoch": 2.675578611332801, "grad_norm": 0.2543196678161621, "learning_rate": 6.423508885482124e-06, "loss": 0.0694, "num_tokens": 876477304.0, "step": 6705 }, { "epoch": 2.6759776536312847, "grad_norm": 0.25052255392074585, "learning_rate": 6.42004645910177e-06, "loss": 0.0699, "num_tokens": 876608376.0, "step": 6706 }, { "epoch": 2.6763766959297683, "grad_norm": 0.2724342346191406, "learning_rate": 6.416588111495922e-06, "loss": 0.0733, "num_tokens": 876739448.0, "step": 6707 }, { "epoch": 2.676775738228252, "grad_norm": 0.24187085032463074, "learning_rate": 6.413133843333738e-06, "loss": 0.0575, "num_tokens": 876870520.0, "step": 6708 }, { "epoch": 2.6771747805267356, "grad_norm": 0.2993219792842865, "learning_rate": 6.409683655283585e-06, "loss": 0.0779, "num_tokens": 877001592.0, "step": 6709 }, { "epoch": 2.6775738228252193, "grad_norm": 0.22420243918895721, "learning_rate": 6.406237548013047e-06, "loss": 0.0618, "num_tokens": 877125688.0, "step": 6710 }, { "epoch": 2.677972865123703, "grad_norm": 0.23457619547843933, "learning_rate": 6.402795522188909e-06, "loss": 0.0713, "num_tokens": 877256760.0, "step": 6711 }, { "epoch": 2.6783719074221866, "grad_norm": 0.3007448613643646, "learning_rate": 6.399357578477169e-06, "loss": 0.0853, "num_tokens": 877387832.0, "step": 6712 }, { "epoch": 2.67877094972067, "grad_norm": 0.29002586007118225, "learning_rate": 6.3959237175430426e-06, "loss": 0.096, "num_tokens": 877518904.0, "step": 6713 }, { "epoch": 2.679169992019154, "grad_norm": 0.24576929211616516, "learning_rate": 6.392493940050944e-06, "loss": 0.0732, "num_tokens": 877649976.0, "step": 6714 }, { "epoch": 2.6795690343176375, "grad_norm": 0.20420844852924347, "learning_rate": 6.389068246664508e-06, "loss": 0.0465, "num_tokens": 877781048.0, "step": 6715 }, { "epoch": 2.679968076616121, "grad_norm": 0.2197461724281311, "learning_rate": 6.385646638046572e-06, "loss": 0.0733, "num_tokens": 877912120.0, "step": 6716 }, { "epoch": 2.6803671189146048, "grad_norm": 0.2581785321235657, "learning_rate": 6.382229114859187e-06, "loss": 0.0607, "num_tokens": 878043192.0, "step": 6717 }, { "epoch": 2.6807661612130884, "grad_norm": 0.22009770572185516, "learning_rate": 6.3788156777636085e-06, "loss": 0.0633, "num_tokens": 878174264.0, "step": 6718 }, { "epoch": 2.681165203511572, "grad_norm": 0.23746876418590546, "learning_rate": 6.375406327420311e-06, "loss": 0.0675, "num_tokens": 878305336.0, "step": 6719 }, { "epoch": 2.6815642458100557, "grad_norm": 0.30084842443466187, "learning_rate": 6.372001064488966e-06, "loss": 0.0862, "num_tokens": 878436408.0, "step": 6720 }, { "epoch": 2.6819632881085393, "grad_norm": 0.20873907208442688, "learning_rate": 6.36859988962846e-06, "loss": 0.0541, "num_tokens": 878567480.0, "step": 6721 }, { "epoch": 2.682362330407023, "grad_norm": 0.2456807792186737, "learning_rate": 6.365202803496896e-06, "loss": 0.0669, "num_tokens": 878698552.0, "step": 6722 }, { "epoch": 2.6827613727055066, "grad_norm": 0.2270788848400116, "learning_rate": 6.361809806751573e-06, "loss": 0.071, "num_tokens": 878829624.0, "step": 6723 }, { "epoch": 2.6831604150039903, "grad_norm": 0.22560559213161469, "learning_rate": 6.358420900049e-06, "loss": 0.0726, "num_tokens": 878960696.0, "step": 6724 }, { "epoch": 2.683559457302474, "grad_norm": 0.24346232414245605, "learning_rate": 6.3550360840449065e-06, "loss": 0.062, "num_tokens": 879091768.0, "step": 6725 }, { "epoch": 2.6839584996009576, "grad_norm": 0.24559253454208374, "learning_rate": 6.351655359394221e-06, "loss": 0.0629, "num_tokens": 879222840.0, "step": 6726 }, { "epoch": 2.684357541899441, "grad_norm": 0.2293895184993744, "learning_rate": 6.348278726751078e-06, "loss": 0.066, "num_tokens": 879353912.0, "step": 6727 }, { "epoch": 2.684756584197925, "grad_norm": 0.2196570634841919, "learning_rate": 6.344906186768828e-06, "loss": 0.0626, "num_tokens": 879484984.0, "step": 6728 }, { "epoch": 2.6851556264964085, "grad_norm": 0.2722876965999603, "learning_rate": 6.341537740100029e-06, "loss": 0.0721, "num_tokens": 879616056.0, "step": 6729 }, { "epoch": 2.685554668794892, "grad_norm": 0.2390490472316742, "learning_rate": 6.338173387396438e-06, "loss": 0.067, "num_tokens": 879747128.0, "step": 6730 }, { "epoch": 2.6859537110933758, "grad_norm": 0.21773448586463928, "learning_rate": 6.334813129309032e-06, "loss": 0.0613, "num_tokens": 879878200.0, "step": 6731 }, { "epoch": 2.6863527533918594, "grad_norm": 0.2567402422428131, "learning_rate": 6.331456966487982e-06, "loss": 0.0673, "num_tokens": 880009272.0, "step": 6732 }, { "epoch": 2.686751795690343, "grad_norm": 0.2705787718296051, "learning_rate": 6.3281048995826804e-06, "loss": 0.0736, "num_tokens": 880140344.0, "step": 6733 }, { "epoch": 2.6871508379888267, "grad_norm": 0.2014043927192688, "learning_rate": 6.324756929241717e-06, "loss": 0.0574, "num_tokens": 880271416.0, "step": 6734 }, { "epoch": 2.6875498802873103, "grad_norm": 0.21410973370075226, "learning_rate": 6.321413056112898e-06, "loss": 0.0544, "num_tokens": 880402488.0, "step": 6735 }, { "epoch": 2.687948922585794, "grad_norm": 0.2109813392162323, "learning_rate": 6.318073280843223e-06, "loss": 0.0505, "num_tokens": 880533560.0, "step": 6736 }, { "epoch": 2.6883479648842776, "grad_norm": 0.2235998809337616, "learning_rate": 6.314737604078916e-06, "loss": 0.0639, "num_tokens": 880664632.0, "step": 6737 }, { "epoch": 2.6887470071827613, "grad_norm": 0.21907328069210052, "learning_rate": 6.311406026465398e-06, "loss": 0.0588, "num_tokens": 880795704.0, "step": 6738 }, { "epoch": 2.689146049481245, "grad_norm": 0.2285134494304657, "learning_rate": 6.308078548647294e-06, "loss": 0.0703, "num_tokens": 880926776.0, "step": 6739 }, { "epoch": 2.6895450917797286, "grad_norm": 0.22270184755325317, "learning_rate": 6.304755171268438e-06, "loss": 0.0541, "num_tokens": 881057848.0, "step": 6740 }, { "epoch": 2.689944134078212, "grad_norm": 0.2775857746601105, "learning_rate": 6.301435894971885e-06, "loss": 0.0718, "num_tokens": 881188920.0, "step": 6741 }, { "epoch": 2.690343176376696, "grad_norm": 0.2898983955383301, "learning_rate": 6.2981207203998695e-06, "loss": 0.0823, "num_tokens": 881319992.0, "step": 6742 }, { "epoch": 2.6907422186751795, "grad_norm": 0.2370651662349701, "learning_rate": 6.294809648193856e-06, "loss": 0.0629, "num_tokens": 881451064.0, "step": 6743 }, { "epoch": 2.691141260973663, "grad_norm": 0.23457421362400055, "learning_rate": 6.291502678994507e-06, "loss": 0.0609, "num_tokens": 881582136.0, "step": 6744 }, { "epoch": 2.6915403032721468, "grad_norm": 0.2600736618041992, "learning_rate": 6.288199813441682e-06, "loss": 0.0575, "num_tokens": 881713208.0, "step": 6745 }, { "epoch": 2.6919393455706304, "grad_norm": 0.24500957131385803, "learning_rate": 6.284901052174458e-06, "loss": 0.0732, "num_tokens": 881844280.0, "step": 6746 }, { "epoch": 2.692338387869114, "grad_norm": 0.2255602329969406, "learning_rate": 6.28160639583112e-06, "loss": 0.0694, "num_tokens": 881975352.0, "step": 6747 }, { "epoch": 2.6927374301675977, "grad_norm": 0.2291984260082245, "learning_rate": 6.278315845049144e-06, "loss": 0.0698, "num_tokens": 882106424.0, "step": 6748 }, { "epoch": 2.6931364724660813, "grad_norm": 0.21636556088924408, "learning_rate": 6.275029400465228e-06, "loss": 0.0622, "num_tokens": 882237496.0, "step": 6749 }, { "epoch": 2.693535514764565, "grad_norm": 0.2680993974208832, "learning_rate": 6.271747062715268e-06, "loss": 0.0695, "num_tokens": 882368568.0, "step": 6750 }, { "epoch": 2.6939345570630486, "grad_norm": 0.28778955340385437, "learning_rate": 6.268468832434364e-06, "loss": 0.0963, "num_tokens": 882499640.0, "step": 6751 }, { "epoch": 2.6943335993615323, "grad_norm": 0.2416885793209076, "learning_rate": 6.265194710256819e-06, "loss": 0.0637, "num_tokens": 882630712.0, "step": 6752 }, { "epoch": 2.694732641660016, "grad_norm": 0.25083813071250916, "learning_rate": 6.261924696816156e-06, "loss": 0.0759, "num_tokens": 882761784.0, "step": 6753 }, { "epoch": 2.6951316839584996, "grad_norm": 0.18644259870052338, "learning_rate": 6.25865879274508e-06, "loss": 0.0563, "num_tokens": 882892856.0, "step": 6754 }, { "epoch": 2.695530726256983, "grad_norm": 0.2472320944070816, "learning_rate": 6.255396998675518e-06, "loss": 0.0723, "num_tokens": 883023928.0, "step": 6755 }, { "epoch": 2.695929768555467, "grad_norm": 0.22197562456130981, "learning_rate": 6.252139315238602e-06, "loss": 0.0611, "num_tokens": 883155000.0, "step": 6756 }, { "epoch": 2.6963288108539505, "grad_norm": 0.2815890908241272, "learning_rate": 6.248885743064657e-06, "loss": 0.0794, "num_tokens": 883286072.0, "step": 6757 }, { "epoch": 2.696727853152434, "grad_norm": 0.27899712324142456, "learning_rate": 6.2456362827832176e-06, "loss": 0.0803, "num_tokens": 883417144.0, "step": 6758 }, { "epoch": 2.6971268954509178, "grad_norm": 0.23111481964588165, "learning_rate": 6.242390935023027e-06, "loss": 0.0619, "num_tokens": 883548216.0, "step": 6759 }, { "epoch": 2.6975259377494014, "grad_norm": 0.2629075050354004, "learning_rate": 6.239149700412033e-06, "loss": 0.0822, "num_tokens": 883679288.0, "step": 6760 }, { "epoch": 2.697924980047885, "grad_norm": 0.29244309663772583, "learning_rate": 6.235912579577378e-06, "loss": 0.079, "num_tokens": 883810360.0, "step": 6761 }, { "epoch": 2.6983240223463687, "grad_norm": 0.2835069000720978, "learning_rate": 6.232679573145418e-06, "loss": 0.0898, "num_tokens": 883941432.0, "step": 6762 }, { "epoch": 2.6987230646448523, "grad_norm": 0.2133008986711502, "learning_rate": 6.22945068174171e-06, "loss": 0.0627, "num_tokens": 884072504.0, "step": 6763 }, { "epoch": 2.699122106943336, "grad_norm": 0.27043670415878296, "learning_rate": 6.2262259059910125e-06, "loss": 0.0734, "num_tokens": 884203576.0, "step": 6764 }, { "epoch": 2.6995211492418196, "grad_norm": 0.24651353061199188, "learning_rate": 6.22300524651729e-06, "loss": 0.081, "num_tokens": 884334648.0, "step": 6765 }, { "epoch": 2.6999201915403033, "grad_norm": 0.2862454056739807, "learning_rate": 6.219788703943713e-06, "loss": 0.086, "num_tokens": 884465720.0, "step": 6766 }, { "epoch": 2.700319233838787, "grad_norm": 0.28055456280708313, "learning_rate": 6.216576278892649e-06, "loss": 0.0806, "num_tokens": 884596792.0, "step": 6767 }, { "epoch": 2.7007182761372706, "grad_norm": 0.24005822837352753, "learning_rate": 6.213367971985673e-06, "loss": 0.0703, "num_tokens": 884727864.0, "step": 6768 }, { "epoch": 2.701117318435754, "grad_norm": 0.29807817935943604, "learning_rate": 6.210163783843567e-06, "loss": 0.0871, "num_tokens": 884858936.0, "step": 6769 }, { "epoch": 2.701516360734238, "grad_norm": 0.22817322611808777, "learning_rate": 6.206963715086301e-06, "loss": 0.0666, "num_tokens": 884990008.0, "step": 6770 }, { "epoch": 2.7019154030327215, "grad_norm": 0.2509955167770386, "learning_rate": 6.203767766333065e-06, "loss": 0.0722, "num_tokens": 885121080.0, "step": 6771 }, { "epoch": 2.702314445331205, "grad_norm": 0.25954800844192505, "learning_rate": 6.2005759382022504e-06, "loss": 0.0721, "num_tokens": 885252152.0, "step": 6772 }, { "epoch": 2.7027134876296888, "grad_norm": 0.2267390638589859, "learning_rate": 6.197388231311438e-06, "loss": 0.0712, "num_tokens": 885383224.0, "step": 6773 }, { "epoch": 2.7031125299281724, "grad_norm": 0.24094848334789276, "learning_rate": 6.194204646277423e-06, "loss": 0.0684, "num_tokens": 885499339.0, "step": 6774 }, { "epoch": 2.703511572226656, "grad_norm": 0.23350289463996887, "learning_rate": 6.1910251837161995e-06, "loss": 0.0653, "num_tokens": 885630411.0, "step": 6775 }, { "epoch": 2.7039106145251397, "grad_norm": 0.23846711218357086, "learning_rate": 6.1878498442429615e-06, "loss": 0.0657, "num_tokens": 885761483.0, "step": 6776 }, { "epoch": 2.7043096568236233, "grad_norm": 0.21961252391338348, "learning_rate": 6.18467862847211e-06, "loss": 0.0612, "num_tokens": 885892555.0, "step": 6777 }, { "epoch": 2.704708699122107, "grad_norm": 0.23805490136146545, "learning_rate": 6.181511537017246e-06, "loss": 0.0696, "num_tokens": 886023627.0, "step": 6778 }, { "epoch": 2.7051077414205906, "grad_norm": 0.22882339358329773, "learning_rate": 6.178348570491173e-06, "loss": 0.0665, "num_tokens": 886154699.0, "step": 6779 }, { "epoch": 2.7055067837190743, "grad_norm": 0.3039524555206299, "learning_rate": 6.1751897295058905e-06, "loss": 0.0874, "num_tokens": 886285771.0, "step": 6780 }, { "epoch": 2.705905826017558, "grad_norm": 0.2682531774044037, "learning_rate": 6.172035014672608e-06, "loss": 0.0638, "num_tokens": 886416843.0, "step": 6781 }, { "epoch": 2.7063048683160416, "grad_norm": 0.22756558656692505, "learning_rate": 6.168884426601736e-06, "loss": 0.0581, "num_tokens": 886547915.0, "step": 6782 }, { "epoch": 2.706703910614525, "grad_norm": 0.20773734152317047, "learning_rate": 6.1657379659028775e-06, "loss": 0.0514, "num_tokens": 886678607.0, "step": 6783 }, { "epoch": 2.707102952913009, "grad_norm": 0.2537747323513031, "learning_rate": 6.162595633184849e-06, "loss": 0.0701, "num_tokens": 886809679.0, "step": 6784 }, { "epoch": 2.7075019952114925, "grad_norm": 0.36847421526908875, "learning_rate": 6.159457429055663e-06, "loss": 0.0873, "num_tokens": 886940751.0, "step": 6785 }, { "epoch": 2.707901037509976, "grad_norm": 0.25271883606910706, "learning_rate": 6.156323354122528e-06, "loss": 0.0624, "num_tokens": 887071823.0, "step": 6786 }, { "epoch": 2.7083000798084598, "grad_norm": 0.22215254604816437, "learning_rate": 6.15319340899186e-06, "loss": 0.0693, "num_tokens": 887202895.0, "step": 6787 }, { "epoch": 2.7086991221069434, "grad_norm": 0.232585147023201, "learning_rate": 6.1500675942692775e-06, "loss": 0.0646, "num_tokens": 887333967.0, "step": 6788 }, { "epoch": 2.709098164405427, "grad_norm": 0.292721688747406, "learning_rate": 6.146945910559592e-06, "loss": 0.0737, "num_tokens": 887465039.0, "step": 6789 }, { "epoch": 2.7094972067039107, "grad_norm": 0.2602023184299469, "learning_rate": 6.1438283584668245e-06, "loss": 0.0613, "num_tokens": 887596111.0, "step": 6790 }, { "epoch": 2.7098962490023943, "grad_norm": 0.2338029146194458, "learning_rate": 6.140714938594189e-06, "loss": 0.069, "num_tokens": 887727183.0, "step": 6791 }, { "epoch": 2.710295291300878, "grad_norm": 0.2557222545146942, "learning_rate": 6.1376056515441015e-06, "loss": 0.0733, "num_tokens": 887858255.0, "step": 6792 }, { "epoch": 2.7106943335993616, "grad_norm": 0.32366329431533813, "learning_rate": 6.134500497918183e-06, "loss": 0.0897, "num_tokens": 887989327.0, "step": 6793 }, { "epoch": 2.7110933758978453, "grad_norm": 0.26044344902038574, "learning_rate": 6.1313994783172565e-06, "loss": 0.082, "num_tokens": 888120399.0, "step": 6794 }, { "epoch": 2.711492418196329, "grad_norm": 0.2541799247264862, "learning_rate": 6.1283025933413295e-06, "loss": 0.0685, "num_tokens": 888251471.0, "step": 6795 }, { "epoch": 2.7118914604948126, "grad_norm": 0.25502100586891174, "learning_rate": 6.125209843589629e-06, "loss": 0.0745, "num_tokens": 888382543.0, "step": 6796 }, { "epoch": 2.712290502793296, "grad_norm": 0.2309325784444809, "learning_rate": 6.122121229660572e-06, "loss": 0.0643, "num_tokens": 888513615.0, "step": 6797 }, { "epoch": 2.71268954509178, "grad_norm": 0.25080275535583496, "learning_rate": 6.119036752151772e-06, "loss": 0.0711, "num_tokens": 888644687.0, "step": 6798 }, { "epoch": 2.7130885873902635, "grad_norm": 0.22349946200847626, "learning_rate": 6.11595641166005e-06, "loss": 0.0631, "num_tokens": 888775759.0, "step": 6799 }, { "epoch": 2.713487629688747, "grad_norm": 0.28091537952423096, "learning_rate": 6.112880208781426e-06, "loss": 0.0832, "num_tokens": 888906831.0, "step": 6800 }, { "epoch": 2.7138866719872308, "grad_norm": 0.2087143212556839, "learning_rate": 6.109808144111112e-06, "loss": 0.0546, "num_tokens": 889037903.0, "step": 6801 }, { "epoch": 2.7142857142857144, "grad_norm": 0.25975027680397034, "learning_rate": 6.106740218243522e-06, "loss": 0.0864, "num_tokens": 889168975.0, "step": 6802 }, { "epoch": 2.714684756584198, "grad_norm": 0.2703101336956024, "learning_rate": 6.1036764317722775e-06, "loss": 0.0868, "num_tokens": 889300047.0, "step": 6803 }, { "epoch": 2.7150837988826817, "grad_norm": 0.23176972568035126, "learning_rate": 6.100616785290189e-06, "loss": 0.0739, "num_tokens": 889431119.0, "step": 6804 }, { "epoch": 2.7154828411811653, "grad_norm": 0.24016806483268738, "learning_rate": 6.097561279389269e-06, "loss": 0.0698, "num_tokens": 889562191.0, "step": 6805 }, { "epoch": 2.715881883479649, "grad_norm": 0.27223798632621765, "learning_rate": 6.094509914660734e-06, "loss": 0.0805, "num_tokens": 889693263.0, "step": 6806 }, { "epoch": 2.7162809257781326, "grad_norm": 0.2450779229402542, "learning_rate": 6.091462691694987e-06, "loss": 0.0684, "num_tokens": 889824335.0, "step": 6807 }, { "epoch": 2.7166799680766163, "grad_norm": 0.31320521235466003, "learning_rate": 6.0884196110816405e-06, "loss": 0.08, "num_tokens": 889955407.0, "step": 6808 }, { "epoch": 2.7170790103751, "grad_norm": 0.26741817593574524, "learning_rate": 6.0853806734095064e-06, "loss": 0.0735, "num_tokens": 890086479.0, "step": 6809 }, { "epoch": 2.7174780526735836, "grad_norm": 0.26351332664489746, "learning_rate": 6.082345879266588e-06, "loss": 0.0602, "num_tokens": 890217551.0, "step": 6810 }, { "epoch": 2.717877094972067, "grad_norm": 0.2198760211467743, "learning_rate": 6.0793152292400835e-06, "loss": 0.0627, "num_tokens": 890348623.0, "step": 6811 }, { "epoch": 2.718276137270551, "grad_norm": 0.26880931854248047, "learning_rate": 6.076288723916408e-06, "loss": 0.0838, "num_tokens": 890479695.0, "step": 6812 }, { "epoch": 2.7186751795690345, "grad_norm": 0.2727065980434418, "learning_rate": 6.073266363881152e-06, "loss": 0.0739, "num_tokens": 890610767.0, "step": 6813 }, { "epoch": 2.719074221867518, "grad_norm": 0.23541754484176636, "learning_rate": 6.0702481497191166e-06, "loss": 0.0699, "num_tokens": 890741839.0, "step": 6814 }, { "epoch": 2.7194732641660018, "grad_norm": 0.23554863035678864, "learning_rate": 6.067234082014299e-06, "loss": 0.0747, "num_tokens": 890872911.0, "step": 6815 }, { "epoch": 2.7198723064644854, "grad_norm": 0.2366277575492859, "learning_rate": 6.064224161349893e-06, "loss": 0.0814, "num_tokens": 891003983.0, "step": 6816 }, { "epoch": 2.720271348762969, "grad_norm": 0.2889820337295532, "learning_rate": 6.061218388308292e-06, "loss": 0.0737, "num_tokens": 891135055.0, "step": 6817 }, { "epoch": 2.7206703910614527, "grad_norm": 0.22654299437999725, "learning_rate": 6.058216763471077e-06, "loss": 0.0677, "num_tokens": 891266127.0, "step": 6818 }, { "epoch": 2.7210694333599363, "grad_norm": 0.22455652058124542, "learning_rate": 6.055219287419049e-06, "loss": 0.0625, "num_tokens": 891397199.0, "step": 6819 }, { "epoch": 2.72146847565842, "grad_norm": 0.23802663385868073, "learning_rate": 6.052225960732176e-06, "loss": 0.0629, "num_tokens": 891528271.0, "step": 6820 }, { "epoch": 2.7218675179569036, "grad_norm": 0.2662467062473297, "learning_rate": 6.0492367839896485e-06, "loss": 0.0761, "num_tokens": 891659343.0, "step": 6821 }, { "epoch": 2.7222665602553873, "grad_norm": 0.25182411074638367, "learning_rate": 6.046251757769845e-06, "loss": 0.0748, "num_tokens": 891790415.0, "step": 6822 }, { "epoch": 2.722665602553871, "grad_norm": 0.25723540782928467, "learning_rate": 6.043270882650332e-06, "loss": 0.0744, "num_tokens": 891921487.0, "step": 6823 }, { "epoch": 2.7230646448523546, "grad_norm": 0.18623611330986023, "learning_rate": 6.040294159207891e-06, "loss": 0.0487, "num_tokens": 892052559.0, "step": 6824 }, { "epoch": 2.723463687150838, "grad_norm": 0.22485986351966858, "learning_rate": 6.0373215880184845e-06, "loss": 0.0659, "num_tokens": 892183631.0, "step": 6825 }, { "epoch": 2.723862729449322, "grad_norm": 0.228757843375206, "learning_rate": 6.034353169657274e-06, "loss": 0.0791, "num_tokens": 892314703.0, "step": 6826 }, { "epoch": 2.7242617717478055, "grad_norm": 0.25993579626083374, "learning_rate": 6.031388904698628e-06, "loss": 0.0799, "num_tokens": 892445775.0, "step": 6827 }, { "epoch": 2.724660814046289, "grad_norm": 0.2394494116306305, "learning_rate": 6.0284287937161e-06, "loss": 0.072, "num_tokens": 892576847.0, "step": 6828 }, { "epoch": 2.7250598563447728, "grad_norm": 0.3042345941066742, "learning_rate": 6.025472837282444e-06, "loss": 0.0756, "num_tokens": 892707919.0, "step": 6829 }, { "epoch": 2.7254588986432564, "grad_norm": 0.2645387649536133, "learning_rate": 6.022521035969613e-06, "loss": 0.0694, "num_tokens": 892838991.0, "step": 6830 }, { "epoch": 2.72585794094174, "grad_norm": 0.29954952001571655, "learning_rate": 6.019573390348751e-06, "loss": 0.0893, "num_tokens": 892970063.0, "step": 6831 }, { "epoch": 2.7262569832402237, "grad_norm": 0.2957085371017456, "learning_rate": 6.0166299009902e-06, "loss": 0.0798, "num_tokens": 893101135.0, "step": 6832 }, { "epoch": 2.7266560255387073, "grad_norm": 0.24861469864845276, "learning_rate": 6.013690568463495e-06, "loss": 0.0582, "num_tokens": 893232207.0, "step": 6833 }, { "epoch": 2.727055067837191, "grad_norm": 0.2875213921070099, "learning_rate": 6.010755393337376e-06, "loss": 0.0859, "num_tokens": 893363279.0, "step": 6834 }, { "epoch": 2.7274541101356746, "grad_norm": 0.20805178582668304, "learning_rate": 6.007824376179768e-06, "loss": 0.0694, "num_tokens": 893494351.0, "step": 6835 }, { "epoch": 2.7278531524341583, "grad_norm": 0.22310684621334076, "learning_rate": 6.0048975175577925e-06, "loss": 0.0679, "num_tokens": 893625423.0, "step": 6836 }, { "epoch": 2.728252194732642, "grad_norm": 0.26039040088653564, "learning_rate": 6.0019748180377745e-06, "loss": 0.0763, "num_tokens": 893756495.0, "step": 6837 }, { "epoch": 2.7286512370311256, "grad_norm": 0.22814816236495972, "learning_rate": 5.999056278185227e-06, "loss": 0.0688, "num_tokens": 893887567.0, "step": 6838 }, { "epoch": 2.729050279329609, "grad_norm": 0.25572091341018677, "learning_rate": 5.996141898564862e-06, "loss": 0.071, "num_tokens": 894018639.0, "step": 6839 }, { "epoch": 2.729449321628093, "grad_norm": 0.2312411665916443, "learning_rate": 5.993231679740579e-06, "loss": 0.0649, "num_tokens": 894149711.0, "step": 6840 }, { "epoch": 2.7298483639265765, "grad_norm": 0.24726326763629913, "learning_rate": 5.990325622275487e-06, "loss": 0.068, "num_tokens": 894280783.0, "step": 6841 }, { "epoch": 2.73024740622506, "grad_norm": 0.2341303676366806, "learning_rate": 5.987423726731873e-06, "loss": 0.0769, "num_tokens": 894411855.0, "step": 6842 }, { "epoch": 2.7306464485235438, "grad_norm": 0.2772409915924072, "learning_rate": 5.98452599367123e-06, "loss": 0.0815, "num_tokens": 894542927.0, "step": 6843 }, { "epoch": 2.7310454908220274, "grad_norm": 0.2817765176296234, "learning_rate": 5.981632423654245e-06, "loss": 0.0853, "num_tokens": 894673999.0, "step": 6844 }, { "epoch": 2.7314445331205106, "grad_norm": 0.24930909276008606, "learning_rate": 5.978743017240794e-06, "loss": 0.0724, "num_tokens": 894805071.0, "step": 6845 }, { "epoch": 2.7318435754189943, "grad_norm": 0.25453850626945496, "learning_rate": 5.975857774989953e-06, "loss": 0.0667, "num_tokens": 894936143.0, "step": 6846 }, { "epoch": 2.732242617717478, "grad_norm": 0.24801307916641235, "learning_rate": 5.972976697459985e-06, "loss": 0.0639, "num_tokens": 895067215.0, "step": 6847 }, { "epoch": 2.7326416600159615, "grad_norm": 0.23947741091251373, "learning_rate": 5.970099785208352e-06, "loss": 0.063, "num_tokens": 895198287.0, "step": 6848 }, { "epoch": 2.733040702314445, "grad_norm": 0.23010240495204926, "learning_rate": 5.96722703879171e-06, "loss": 0.0645, "num_tokens": 895329359.0, "step": 6849 }, { "epoch": 2.733439744612929, "grad_norm": 0.25727686285972595, "learning_rate": 5.964358458765916e-06, "loss": 0.0678, "num_tokens": 895460431.0, "step": 6850 }, { "epoch": 2.7338387869114125, "grad_norm": 0.2704242765903473, "learning_rate": 5.961494045686e-06, "loss": 0.0767, "num_tokens": 895591503.0, "step": 6851 }, { "epoch": 2.734237829209896, "grad_norm": 0.2884243130683899, "learning_rate": 5.958633800106212e-06, "loss": 0.0853, "num_tokens": 895722575.0, "step": 6852 }, { "epoch": 2.7346368715083798, "grad_norm": 0.23909267783164978, "learning_rate": 5.955777722579976e-06, "loss": 0.059, "num_tokens": 895853647.0, "step": 6853 }, { "epoch": 2.7350359138068634, "grad_norm": 0.25105753540992737, "learning_rate": 5.952925813659918e-06, "loss": 0.0754, "num_tokens": 895984719.0, "step": 6854 }, { "epoch": 2.735434956105347, "grad_norm": 0.22443048655986786, "learning_rate": 5.950078073897856e-06, "loss": 0.0669, "num_tokens": 896115791.0, "step": 6855 }, { "epoch": 2.7358339984038307, "grad_norm": 0.25779327750205994, "learning_rate": 5.947234503844802e-06, "loss": 0.08, "num_tokens": 896246863.0, "step": 6856 }, { "epoch": 2.7362330407023143, "grad_norm": 0.2740059196949005, "learning_rate": 5.944395104050959e-06, "loss": 0.086, "num_tokens": 896365811.0, "step": 6857 }, { "epoch": 2.736632083000798, "grad_norm": 0.26783204078674316, "learning_rate": 5.941559875065724e-06, "loss": 0.0778, "num_tokens": 896496883.0, "step": 6858 }, { "epoch": 2.7370311252992816, "grad_norm": 0.22103355824947357, "learning_rate": 5.938728817437686e-06, "loss": 0.0592, "num_tokens": 896627955.0, "step": 6859 }, { "epoch": 2.7374301675977653, "grad_norm": 0.18962393701076508, "learning_rate": 5.935901931714632e-06, "loss": 0.0428, "num_tokens": 896759027.0, "step": 6860 }, { "epoch": 2.737829209896249, "grad_norm": 0.2275771200656891, "learning_rate": 5.933079218443535e-06, "loss": 0.064, "num_tokens": 896890099.0, "step": 6861 }, { "epoch": 2.7382282521947325, "grad_norm": 0.25456252694129944, "learning_rate": 5.930260678170565e-06, "loss": 0.0696, "num_tokens": 897021171.0, "step": 6862 }, { "epoch": 2.738627294493216, "grad_norm": 0.2904651463031769, "learning_rate": 5.927446311441086e-06, "loss": 0.086, "num_tokens": 897152243.0, "step": 6863 }, { "epoch": 2.7390263367917, "grad_norm": 0.27733004093170166, "learning_rate": 5.9246361187996465e-06, "loss": 0.0751, "num_tokens": 897283315.0, "step": 6864 }, { "epoch": 2.7394253790901835, "grad_norm": 0.22707609832286835, "learning_rate": 5.921830100789995e-06, "loss": 0.0576, "num_tokens": 897414387.0, "step": 6865 }, { "epoch": 2.739824421388667, "grad_norm": 0.29252034425735474, "learning_rate": 5.919028257955072e-06, "loss": 0.0808, "num_tokens": 897545459.0, "step": 6866 }, { "epoch": 2.7402234636871508, "grad_norm": 0.21921603381633759, "learning_rate": 5.916230590837004e-06, "loss": 0.0644, "num_tokens": 897676531.0, "step": 6867 }, { "epoch": 2.7406225059856344, "grad_norm": 0.2484056055545807, "learning_rate": 5.913437099977117e-06, "loss": 0.057, "num_tokens": 897807603.0, "step": 6868 }, { "epoch": 2.741021548284118, "grad_norm": 0.25754356384277344, "learning_rate": 5.910647785915925e-06, "loss": 0.0746, "num_tokens": 897938675.0, "step": 6869 }, { "epoch": 2.7414205905826017, "grad_norm": 0.25315216183662415, "learning_rate": 5.907862649193129e-06, "loss": 0.0712, "num_tokens": 898069747.0, "step": 6870 }, { "epoch": 2.7418196328810853, "grad_norm": 0.21571050584316254, "learning_rate": 5.905081690347631e-06, "loss": 0.0606, "num_tokens": 898200819.0, "step": 6871 }, { "epoch": 2.742218675179569, "grad_norm": 0.2689554989337921, "learning_rate": 5.902304909917524e-06, "loss": 0.0892, "num_tokens": 898331891.0, "step": 6872 }, { "epoch": 2.7426177174780526, "grad_norm": 0.24346256256103516, "learning_rate": 5.8995323084400835e-06, "loss": 0.0742, "num_tokens": 898462963.0, "step": 6873 }, { "epoch": 2.7430167597765363, "grad_norm": 0.23590430617332458, "learning_rate": 5.896763886451784e-06, "loss": 0.0652, "num_tokens": 898594035.0, "step": 6874 }, { "epoch": 2.74341580207502, "grad_norm": 0.239288792014122, "learning_rate": 5.893999644488292e-06, "loss": 0.0693, "num_tokens": 898725107.0, "step": 6875 }, { "epoch": 2.7438148443735035, "grad_norm": 0.3122444748878479, "learning_rate": 5.8912395830844565e-06, "loss": 0.0718, "num_tokens": 898856179.0, "step": 6876 }, { "epoch": 2.744213886671987, "grad_norm": 0.21229611337184906, "learning_rate": 5.8884837027743296e-06, "loss": 0.0668, "num_tokens": 898987251.0, "step": 6877 }, { "epoch": 2.744612928970471, "grad_norm": 0.23748642206192017, "learning_rate": 5.885732004091147e-06, "loss": 0.0761, "num_tokens": 899118323.0, "step": 6878 }, { "epoch": 2.7450119712689545, "grad_norm": 0.20984522998332977, "learning_rate": 5.882984487567334e-06, "loss": 0.0606, "num_tokens": 899249395.0, "step": 6879 }, { "epoch": 2.745411013567438, "grad_norm": 0.2327217012643814, "learning_rate": 5.8802411537345145e-06, "loss": 0.0698, "num_tokens": 899380467.0, "step": 6880 }, { "epoch": 2.7458100558659218, "grad_norm": 0.24881501495838165, "learning_rate": 5.877502003123493e-06, "loss": 0.0778, "num_tokens": 899511539.0, "step": 6881 }, { "epoch": 2.7462090981644054, "grad_norm": 0.23539508879184723, "learning_rate": 5.874767036264273e-06, "loss": 0.0695, "num_tokens": 899642611.0, "step": 6882 }, { "epoch": 2.746608140462889, "grad_norm": 0.23625290393829346, "learning_rate": 5.872036253686041e-06, "loss": 0.0616, "num_tokens": 899765700.0, "step": 6883 }, { "epoch": 2.7470071827613727, "grad_norm": 0.24670425057411194, "learning_rate": 5.869309655917184e-06, "loss": 0.0783, "num_tokens": 899896772.0, "step": 6884 }, { "epoch": 2.7474062250598563, "grad_norm": 0.23781414330005646, "learning_rate": 5.8665872434852655e-06, "loss": 0.06, "num_tokens": 900027844.0, "step": 6885 }, { "epoch": 2.74780526735834, "grad_norm": 0.26880741119384766, "learning_rate": 5.863869016917053e-06, "loss": 0.0804, "num_tokens": 900158916.0, "step": 6886 }, { "epoch": 2.7482043096568236, "grad_norm": 0.23528458178043365, "learning_rate": 5.8611549767384976e-06, "loss": 0.0617, "num_tokens": 900289988.0, "step": 6887 }, { "epoch": 2.7486033519553073, "grad_norm": 0.20549263060092926, "learning_rate": 5.858445123474738e-06, "loss": 0.0565, "num_tokens": 900421060.0, "step": 6888 }, { "epoch": 2.749002394253791, "grad_norm": 0.2696191668510437, "learning_rate": 5.855739457650105e-06, "loss": 0.0627, "num_tokens": 900552132.0, "step": 6889 }, { "epoch": 2.7494014365522745, "grad_norm": 0.26747485995292664, "learning_rate": 5.853037979788128e-06, "loss": 0.0794, "num_tokens": 900683204.0, "step": 6890 }, { "epoch": 2.749800478850758, "grad_norm": 0.2485882192850113, "learning_rate": 5.850340690411509e-06, "loss": 0.0618, "num_tokens": 900814276.0, "step": 6891 }, { "epoch": 2.750199521149242, "grad_norm": 0.2679028809070587, "learning_rate": 5.84764759004215e-06, "loss": 0.0716, "num_tokens": 900940500.0, "step": 6892 }, { "epoch": 2.7505985634477255, "grad_norm": 0.2552708387374878, "learning_rate": 5.844958679201138e-06, "loss": 0.0815, "num_tokens": 901071572.0, "step": 6893 }, { "epoch": 2.750997605746209, "grad_norm": 0.22381176054477692, "learning_rate": 5.8422739584087626e-06, "loss": 0.0624, "num_tokens": 901202644.0, "step": 6894 }, { "epoch": 2.7513966480446927, "grad_norm": 0.2698310911655426, "learning_rate": 5.8395934281844815e-06, "loss": 0.0786, "num_tokens": 901333716.0, "step": 6895 }, { "epoch": 2.7517956903431764, "grad_norm": 0.25396162271499634, "learning_rate": 5.836917089046956e-06, "loss": 0.0689, "num_tokens": 901464788.0, "step": 6896 }, { "epoch": 2.75219473264166, "grad_norm": 0.1929519921541214, "learning_rate": 5.834244941514035e-06, "loss": 0.0502, "num_tokens": 901595860.0, "step": 6897 }, { "epoch": 2.7525937749401437, "grad_norm": 0.2501050531864166, "learning_rate": 5.83157698610275e-06, "loss": 0.0636, "num_tokens": 901726932.0, "step": 6898 }, { "epoch": 2.7529928172386273, "grad_norm": 0.2108270674943924, "learning_rate": 5.828913223329327e-06, "loss": 0.0558, "num_tokens": 901858004.0, "step": 6899 }, { "epoch": 2.753391859537111, "grad_norm": 0.2848677337169647, "learning_rate": 5.8262536537091825e-06, "loss": 0.077, "num_tokens": 901989076.0, "step": 6900 }, { "epoch": 2.7537909018355946, "grad_norm": 0.2872179448604584, "learning_rate": 5.823598277756915e-06, "loss": 0.0711, "num_tokens": 902120148.0, "step": 6901 }, { "epoch": 2.7541899441340782, "grad_norm": 0.2277965396642685, "learning_rate": 5.820947095986316e-06, "loss": 0.0648, "num_tokens": 902251220.0, "step": 6902 }, { "epoch": 2.754588986432562, "grad_norm": 0.23137684166431427, "learning_rate": 5.818300108910369e-06, "loss": 0.0665, "num_tokens": 902382292.0, "step": 6903 }, { "epoch": 2.7549880287310455, "grad_norm": 0.24533206224441528, "learning_rate": 5.815657317041231e-06, "loss": 0.0685, "num_tokens": 902513364.0, "step": 6904 }, { "epoch": 2.755387071029529, "grad_norm": 0.22570082545280457, "learning_rate": 5.813018720890264e-06, "loss": 0.0615, "num_tokens": 902644436.0, "step": 6905 }, { "epoch": 2.755786113328013, "grad_norm": 0.24998368322849274, "learning_rate": 5.8103843209680115e-06, "loss": 0.0699, "num_tokens": 902775508.0, "step": 6906 }, { "epoch": 2.7561851556264965, "grad_norm": 0.2540256977081299, "learning_rate": 5.807754117784205e-06, "loss": 0.0722, "num_tokens": 902906580.0, "step": 6907 }, { "epoch": 2.75658419792498, "grad_norm": 0.24735291302204132, "learning_rate": 5.805128111847764e-06, "loss": 0.0776, "num_tokens": 903037652.0, "step": 6908 }, { "epoch": 2.7569832402234637, "grad_norm": 0.2751026749610901, "learning_rate": 5.8025063036667975e-06, "loss": 0.0719, "num_tokens": 903168724.0, "step": 6909 }, { "epoch": 2.7573822825219474, "grad_norm": 0.223767951130867, "learning_rate": 5.799888693748597e-06, "loss": 0.0626, "num_tokens": 903299796.0, "step": 6910 }, { "epoch": 2.757781324820431, "grad_norm": 0.26281359791755676, "learning_rate": 5.797275282599649e-06, "loss": 0.0743, "num_tokens": 903430868.0, "step": 6911 }, { "epoch": 2.7581803671189147, "grad_norm": 0.23236693441867828, "learning_rate": 5.794666070725626e-06, "loss": 0.0621, "num_tokens": 903561940.0, "step": 6912 }, { "epoch": 2.7585794094173983, "grad_norm": 0.2824847102165222, "learning_rate": 5.792061058631382e-06, "loss": 0.0757, "num_tokens": 903693012.0, "step": 6913 }, { "epoch": 2.758978451715882, "grad_norm": 0.21301408112049103, "learning_rate": 5.789460246820966e-06, "loss": 0.0588, "num_tokens": 903824084.0, "step": 6914 }, { "epoch": 2.7593774940143656, "grad_norm": 0.24952644109725952, "learning_rate": 5.7868636357976085e-06, "loss": 0.0715, "num_tokens": 903955156.0, "step": 6915 }, { "epoch": 2.7597765363128492, "grad_norm": 0.23890122771263123, "learning_rate": 5.784271226063729e-06, "loss": 0.0631, "num_tokens": 904086228.0, "step": 6916 }, { "epoch": 2.760175578611333, "grad_norm": 0.2680458128452301, "learning_rate": 5.781683018120938e-06, "loss": 0.0715, "num_tokens": 904217300.0, "step": 6917 }, { "epoch": 2.7605746209098165, "grad_norm": 0.26437678933143616, "learning_rate": 5.779099012470023e-06, "loss": 0.0706, "num_tokens": 904348372.0, "step": 6918 }, { "epoch": 2.7609736632083, "grad_norm": 0.2462121695280075, "learning_rate": 5.7765192096109735e-06, "loss": 0.0727, "num_tokens": 904479444.0, "step": 6919 }, { "epoch": 2.761372705506784, "grad_norm": 0.24407699704170227, "learning_rate": 5.773943610042952e-06, "loss": 0.0737, "num_tokens": 904610516.0, "step": 6920 }, { "epoch": 2.7617717478052675, "grad_norm": 0.2757738530635834, "learning_rate": 5.771372214264313e-06, "loss": 0.0809, "num_tokens": 904741588.0, "step": 6921 }, { "epoch": 2.762170790103751, "grad_norm": 0.2931033670902252, "learning_rate": 5.768805022772601e-06, "loss": 0.0751, "num_tokens": 904872660.0, "step": 6922 }, { "epoch": 2.7625698324022347, "grad_norm": 0.22775480151176453, "learning_rate": 5.766242036064541e-06, "loss": 0.0689, "num_tokens": 905003732.0, "step": 6923 }, { "epoch": 2.7629688747007184, "grad_norm": 0.24635343253612518, "learning_rate": 5.763683254636047e-06, "loss": 0.0676, "num_tokens": 905134804.0, "step": 6924 }, { "epoch": 2.763367916999202, "grad_norm": 0.23052255809307098, "learning_rate": 5.761128678982222e-06, "loss": 0.0626, "num_tokens": 905265876.0, "step": 6925 }, { "epoch": 2.7637669592976857, "grad_norm": 0.21475070714950562, "learning_rate": 5.758578309597347e-06, "loss": 0.0517, "num_tokens": 905396948.0, "step": 6926 }, { "epoch": 2.7641660015961693, "grad_norm": 0.2395886778831482, "learning_rate": 5.756032146974899e-06, "loss": 0.067, "num_tokens": 905528020.0, "step": 6927 }, { "epoch": 2.764565043894653, "grad_norm": 0.2520934045314789, "learning_rate": 5.75349019160754e-06, "loss": 0.0638, "num_tokens": 905659092.0, "step": 6928 }, { "epoch": 2.7649640861931366, "grad_norm": 0.2716347277164459, "learning_rate": 5.750952443987104e-06, "loss": 0.0759, "num_tokens": 905790164.0, "step": 6929 }, { "epoch": 2.7653631284916202, "grad_norm": 0.2599424421787262, "learning_rate": 5.7484189046046314e-06, "loss": 0.0697, "num_tokens": 905921236.0, "step": 6930 }, { "epoch": 2.765762170790104, "grad_norm": 0.2554665207862854, "learning_rate": 5.745889573950338e-06, "loss": 0.0709, "num_tokens": 906052308.0, "step": 6931 }, { "epoch": 2.7661612130885875, "grad_norm": 0.28224125504493713, "learning_rate": 5.743364452513618e-06, "loss": 0.0802, "num_tokens": 906183380.0, "step": 6932 }, { "epoch": 2.766560255387071, "grad_norm": 0.19750751554965973, "learning_rate": 5.740843540783064e-06, "loss": 0.0581, "num_tokens": 906314452.0, "step": 6933 }, { "epoch": 2.766959297685555, "grad_norm": 0.17663490772247314, "learning_rate": 5.738326839246451e-06, "loss": 0.0417, "num_tokens": 906445524.0, "step": 6934 }, { "epoch": 2.7673583399840385, "grad_norm": 0.25046631693840027, "learning_rate": 5.735814348390733e-06, "loss": 0.0706, "num_tokens": 906576596.0, "step": 6935 }, { "epoch": 2.767757382282522, "grad_norm": 0.2100781798362732, "learning_rate": 5.733306068702054e-06, "loss": 0.0601, "num_tokens": 906707668.0, "step": 6936 }, { "epoch": 2.7681564245810057, "grad_norm": 0.2451092153787613, "learning_rate": 5.7308020006657475e-06, "loss": 0.0742, "num_tokens": 906838740.0, "step": 6937 }, { "epoch": 2.7685554668794894, "grad_norm": 0.2361602634191513, "learning_rate": 5.728302144766325e-06, "loss": 0.0664, "num_tokens": 906969812.0, "step": 6938 }, { "epoch": 2.7689545091779726, "grad_norm": 0.22405380010604858, "learning_rate": 5.72580650148748e-06, "loss": 0.0613, "num_tokens": 907100884.0, "step": 6939 }, { "epoch": 2.7693535514764562, "grad_norm": 0.2404336929321289, "learning_rate": 5.723315071312104e-06, "loss": 0.059, "num_tokens": 907231956.0, "step": 6940 }, { "epoch": 2.76975259377494, "grad_norm": 0.2140873372554779, "learning_rate": 5.720827854722261e-06, "loss": 0.0551, "num_tokens": 907363028.0, "step": 6941 }, { "epoch": 2.7701516360734235, "grad_norm": 0.2576693892478943, "learning_rate": 5.718344852199205e-06, "loss": 0.0759, "num_tokens": 907494100.0, "step": 6942 }, { "epoch": 2.770550678371907, "grad_norm": 0.22581639885902405, "learning_rate": 5.7158660642233705e-06, "loss": 0.0642, "num_tokens": 907625172.0, "step": 6943 }, { "epoch": 2.770949720670391, "grad_norm": 0.27105364203453064, "learning_rate": 5.7133914912743895e-06, "loss": 0.0847, "num_tokens": 907756244.0, "step": 6944 }, { "epoch": 2.7713487629688744, "grad_norm": 0.2453732192516327, "learning_rate": 5.710921133831059e-06, "loss": 0.0676, "num_tokens": 907887316.0, "step": 6945 }, { "epoch": 2.771747805267358, "grad_norm": 0.2672586143016815, "learning_rate": 5.708454992371376e-06, "loss": 0.0849, "num_tokens": 908018388.0, "step": 6946 }, { "epoch": 2.7721468475658417, "grad_norm": 0.2564227879047394, "learning_rate": 5.705993067372512e-06, "loss": 0.0686, "num_tokens": 908149460.0, "step": 6947 }, { "epoch": 2.7725458898643254, "grad_norm": 0.23473456501960754, "learning_rate": 5.703535359310833e-06, "loss": 0.0707, "num_tokens": 908280532.0, "step": 6948 }, { "epoch": 2.772944932162809, "grad_norm": 0.2522266209125519, "learning_rate": 5.701081868661872e-06, "loss": 0.0637, "num_tokens": 908411604.0, "step": 6949 }, { "epoch": 2.7733439744612927, "grad_norm": 0.24200089275836945, "learning_rate": 5.698632595900364e-06, "loss": 0.0663, "num_tokens": 908537896.0, "step": 6950 }, { "epoch": 2.7737430167597763, "grad_norm": 0.2438981831073761, "learning_rate": 5.6961875415002214e-06, "loss": 0.0756, "num_tokens": 908668968.0, "step": 6951 }, { "epoch": 2.77414205905826, "grad_norm": 0.29766231775283813, "learning_rate": 5.693746705934534e-06, "loss": 0.0886, "num_tokens": 908800040.0, "step": 6952 }, { "epoch": 2.7745411013567436, "grad_norm": 0.2560945451259613, "learning_rate": 5.6913100896755855e-06, "loss": 0.0742, "num_tokens": 908931112.0, "step": 6953 }, { "epoch": 2.7749401436552272, "grad_norm": 0.21753300726413727, "learning_rate": 5.688877693194835e-06, "loss": 0.0575, "num_tokens": 909062184.0, "step": 6954 }, { "epoch": 2.775339185953711, "grad_norm": 0.2663625180721283, "learning_rate": 5.686449516962931e-06, "loss": 0.073, "num_tokens": 909193256.0, "step": 6955 }, { "epoch": 2.7757382282521945, "grad_norm": 0.26208314299583435, "learning_rate": 5.684025561449702e-06, "loss": 0.0778, "num_tokens": 909324328.0, "step": 6956 }, { "epoch": 2.776137270550678, "grad_norm": 0.24087809026241302, "learning_rate": 5.681605827124161e-06, "loss": 0.0714, "num_tokens": 909455400.0, "step": 6957 }, { "epoch": 2.776536312849162, "grad_norm": 0.22333061695098877, "learning_rate": 5.679190314454503e-06, "loss": 0.0564, "num_tokens": 909586472.0, "step": 6958 }, { "epoch": 2.7769353551476454, "grad_norm": 0.2765280306339264, "learning_rate": 5.6767790239081144e-06, "loss": 0.0836, "num_tokens": 909717544.0, "step": 6959 }, { "epoch": 2.777334397446129, "grad_norm": 0.22729238867759705, "learning_rate": 5.6743719559515435e-06, "loss": 0.0564, "num_tokens": 909848616.0, "step": 6960 }, { "epoch": 2.7777334397446127, "grad_norm": 0.22925427556037903, "learning_rate": 5.671969111050545e-06, "loss": 0.0678, "num_tokens": 909979688.0, "step": 6961 }, { "epoch": 2.7781324820430964, "grad_norm": 0.21808123588562012, "learning_rate": 5.669570489670049e-06, "loss": 0.0646, "num_tokens": 910110760.0, "step": 6962 }, { "epoch": 2.77853152434158, "grad_norm": 0.23976927995681763, "learning_rate": 5.667176092274158e-06, "loss": 0.069, "num_tokens": 910241832.0, "step": 6963 }, { "epoch": 2.7789305666400637, "grad_norm": 0.2651802897453308, "learning_rate": 5.664785919326169e-06, "loss": 0.0697, "num_tokens": 910372904.0, "step": 6964 }, { "epoch": 2.7793296089385473, "grad_norm": 0.23824431002140045, "learning_rate": 5.662399971288565e-06, "loss": 0.0655, "num_tokens": 910503976.0, "step": 6965 }, { "epoch": 2.779728651237031, "grad_norm": 0.31801682710647583, "learning_rate": 5.660018248622995e-06, "loss": 0.0967, "num_tokens": 910635048.0, "step": 6966 }, { "epoch": 2.7801276935355146, "grad_norm": 0.22670073807239532, "learning_rate": 5.6576407517903046e-06, "loss": 0.067, "num_tokens": 910766120.0, "step": 6967 }, { "epoch": 2.7805267358339982, "grad_norm": 0.1965225487947464, "learning_rate": 5.6552674812505184e-06, "loss": 0.0515, "num_tokens": 910897192.0, "step": 6968 }, { "epoch": 2.780925778132482, "grad_norm": 0.21203437447547913, "learning_rate": 5.652898437462839e-06, "loss": 0.063, "num_tokens": 911028264.0, "step": 6969 }, { "epoch": 2.7813248204309655, "grad_norm": 0.24256160855293274, "learning_rate": 5.650533620885658e-06, "loss": 0.063, "num_tokens": 911159336.0, "step": 6970 }, { "epoch": 2.781723862729449, "grad_norm": 0.24111707508563995, "learning_rate": 5.648173031976541e-06, "loss": 0.0703, "num_tokens": 911290408.0, "step": 6971 }, { "epoch": 2.782122905027933, "grad_norm": 0.25687840580940247, "learning_rate": 5.645816671192244e-06, "loss": 0.06, "num_tokens": 911421480.0, "step": 6972 }, { "epoch": 2.7825219473264164, "grad_norm": 0.21610881388187408, "learning_rate": 5.643464538988697e-06, "loss": 0.0557, "num_tokens": 911552552.0, "step": 6973 }, { "epoch": 2.7829209896249, "grad_norm": 0.20072883367538452, "learning_rate": 5.64111663582102e-06, "loss": 0.0464, "num_tokens": 911683624.0, "step": 6974 }, { "epoch": 2.7833200319233837, "grad_norm": 0.2172076255083084, "learning_rate": 5.638772962143508e-06, "loss": 0.0635, "num_tokens": 911814696.0, "step": 6975 }, { "epoch": 2.7837190742218674, "grad_norm": 0.2873257100582123, "learning_rate": 5.636433518409636e-06, "loss": 0.0789, "num_tokens": 911945768.0, "step": 6976 }, { "epoch": 2.784118116520351, "grad_norm": 0.2264101207256317, "learning_rate": 5.63409830507207e-06, "loss": 0.0567, "num_tokens": 912076840.0, "step": 6977 }, { "epoch": 2.7845171588188347, "grad_norm": 0.2900168299674988, "learning_rate": 5.6317673225826535e-06, "loss": 0.0721, "num_tokens": 912207912.0, "step": 6978 }, { "epoch": 2.7849162011173183, "grad_norm": 0.2573409080505371, "learning_rate": 5.6294405713924055e-06, "loss": 0.0745, "num_tokens": 912338984.0, "step": 6979 }, { "epoch": 2.785315243415802, "grad_norm": 0.22894126176834106, "learning_rate": 5.627118051951531e-06, "loss": 0.066, "num_tokens": 912470056.0, "step": 6980 }, { "epoch": 2.7857142857142856, "grad_norm": 0.23010696470737457, "learning_rate": 5.62479976470942e-06, "loss": 0.0713, "num_tokens": 912601128.0, "step": 6981 }, { "epoch": 2.7861133280127692, "grad_norm": 0.3062782287597656, "learning_rate": 5.622485710114637e-06, "loss": 0.0892, "num_tokens": 912732200.0, "step": 6982 }, { "epoch": 2.786512370311253, "grad_norm": 0.2118476927280426, "learning_rate": 5.620175888614926e-06, "loss": 0.061, "num_tokens": 912863272.0, "step": 6983 }, { "epoch": 2.7869114126097365, "grad_norm": 0.2865534722805023, "learning_rate": 5.617870300657222e-06, "loss": 0.0886, "num_tokens": 912994344.0, "step": 6984 }, { "epoch": 2.78731045490822, "grad_norm": 0.23097340762615204, "learning_rate": 5.61556894668763e-06, "loss": 0.0738, "num_tokens": 913125416.0, "step": 6985 }, { "epoch": 2.787709497206704, "grad_norm": 0.22887377440929413, "learning_rate": 5.613271827151442e-06, "loss": 0.0569, "num_tokens": 913256488.0, "step": 6986 }, { "epoch": 2.7881085395051874, "grad_norm": 0.23115375638008118, "learning_rate": 5.6109789424931335e-06, "loss": 0.063, "num_tokens": 913387560.0, "step": 6987 }, { "epoch": 2.788507581803671, "grad_norm": 0.2699355483055115, "learning_rate": 5.60869029315635e-06, "loss": 0.0727, "num_tokens": 913518632.0, "step": 6988 }, { "epoch": 2.7889066241021547, "grad_norm": 0.31713443994522095, "learning_rate": 5.606405879583926e-06, "loss": 0.0747, "num_tokens": 913649704.0, "step": 6989 }, { "epoch": 2.7893056664006384, "grad_norm": 0.2598823606967926, "learning_rate": 5.604125702217878e-06, "loss": 0.0795, "num_tokens": 913780776.0, "step": 6990 }, { "epoch": 2.789704708699122, "grad_norm": 0.2390412837266922, "learning_rate": 5.601849761499394e-06, "loss": 0.064, "num_tokens": 913902837.0, "step": 6991 }, { "epoch": 2.7901037509976057, "grad_norm": 0.2608378231525421, "learning_rate": 5.599578057868847e-06, "loss": 0.0828, "num_tokens": 914033909.0, "step": 6992 }, { "epoch": 2.7905027932960893, "grad_norm": 0.2558082044124603, "learning_rate": 5.597310591765799e-06, "loss": 0.0701, "num_tokens": 914164981.0, "step": 6993 }, { "epoch": 2.790901835594573, "grad_norm": 0.3102640211582184, "learning_rate": 5.595047363628973e-06, "loss": 0.0881, "num_tokens": 914296053.0, "step": 6994 }, { "epoch": 2.7913008778930566, "grad_norm": 0.23138359189033508, "learning_rate": 5.592788373896288e-06, "loss": 0.0702, "num_tokens": 914427125.0, "step": 6995 }, { "epoch": 2.7916999201915402, "grad_norm": 0.19714930653572083, "learning_rate": 5.590533623004832e-06, "loss": 0.0596, "num_tokens": 914558197.0, "step": 6996 }, { "epoch": 2.792098962490024, "grad_norm": 0.22276277840137482, "learning_rate": 5.588283111390887e-06, "loss": 0.0649, "num_tokens": 914689269.0, "step": 6997 }, { "epoch": 2.7924980047885075, "grad_norm": 0.21328546106815338, "learning_rate": 5.586036839489901e-06, "loss": 0.0566, "num_tokens": 914820341.0, "step": 6998 }, { "epoch": 2.792897047086991, "grad_norm": 0.2866112291812897, "learning_rate": 5.583794807736504e-06, "loss": 0.0907, "num_tokens": 914951413.0, "step": 6999 }, { "epoch": 2.793296089385475, "grad_norm": 0.23847702145576477, "learning_rate": 5.581557016564515e-06, "loss": 0.0744, "num_tokens": 915082485.0, "step": 7000 }, { "epoch": 2.7936951316839584, "grad_norm": 0.23093105852603912, "learning_rate": 5.579323466406922e-06, "loss": 0.0647, "num_tokens": 915213557.0, "step": 7001 }, { "epoch": 2.794094173982442, "grad_norm": 0.23827970027923584, "learning_rate": 5.577094157695894e-06, "loss": 0.0752, "num_tokens": 915344629.0, "step": 7002 }, { "epoch": 2.7944932162809257, "grad_norm": 0.27851560711860657, "learning_rate": 5.574869090862785e-06, "loss": 0.0726, "num_tokens": 915475701.0, "step": 7003 }, { "epoch": 2.7948922585794094, "grad_norm": 0.2643626630306244, "learning_rate": 5.5726482663381264e-06, "loss": 0.067, "num_tokens": 915606773.0, "step": 7004 }, { "epoch": 2.795291300877893, "grad_norm": 0.2778271734714508, "learning_rate": 5.57043168455162e-06, "loss": 0.0775, "num_tokens": 915737845.0, "step": 7005 }, { "epoch": 2.7956903431763767, "grad_norm": 0.23841601610183716, "learning_rate": 5.568219345932159e-06, "loss": 0.0623, "num_tokens": 915868917.0, "step": 7006 }, { "epoch": 2.7960893854748603, "grad_norm": 0.24014458060264587, "learning_rate": 5.566011250907809e-06, "loss": 0.0659, "num_tokens": 915999989.0, "step": 7007 }, { "epoch": 2.796488427773344, "grad_norm": 0.23880846798419952, "learning_rate": 5.563807399905816e-06, "loss": 0.0598, "num_tokens": 916131061.0, "step": 7008 }, { "epoch": 2.7968874700718276, "grad_norm": 0.2345060557126999, "learning_rate": 5.5616077933526055e-06, "loss": 0.0706, "num_tokens": 916262133.0, "step": 7009 }, { "epoch": 2.7972865123703112, "grad_norm": 0.2278069704771042, "learning_rate": 5.559412431673779e-06, "loss": 0.0599, "num_tokens": 916393205.0, "step": 7010 }, { "epoch": 2.797685554668795, "grad_norm": 0.20945940911769867, "learning_rate": 5.557221315294119e-06, "loss": 0.0519, "num_tokens": 916524277.0, "step": 7011 }, { "epoch": 2.7980845969672785, "grad_norm": 0.26475828886032104, "learning_rate": 5.5550344446375864e-06, "loss": 0.0669, "num_tokens": 916655349.0, "step": 7012 }, { "epoch": 2.798483639265762, "grad_norm": 0.21477173268795013, "learning_rate": 5.552851820127322e-06, "loss": 0.0627, "num_tokens": 916786421.0, "step": 7013 }, { "epoch": 2.798882681564246, "grad_norm": 0.2405458390712738, "learning_rate": 5.550673442185639e-06, "loss": 0.0582, "num_tokens": 916917493.0, "step": 7014 }, { "epoch": 2.7992817238627294, "grad_norm": 0.22927455604076385, "learning_rate": 5.54849931123404e-06, "loss": 0.059, "num_tokens": 917048565.0, "step": 7015 }, { "epoch": 2.799680766161213, "grad_norm": 0.24158282577991486, "learning_rate": 5.546329427693192e-06, "loss": 0.0676, "num_tokens": 917179637.0, "step": 7016 }, { "epoch": 2.8000798084596967, "grad_norm": 0.2699021100997925, "learning_rate": 5.544163791982952e-06, "loss": 0.0786, "num_tokens": 917310709.0, "step": 7017 }, { "epoch": 2.8004788507581804, "grad_norm": 0.23281966149806976, "learning_rate": 5.542002404522345e-06, "loss": 0.0694, "num_tokens": 917441781.0, "step": 7018 }, { "epoch": 2.800877893056664, "grad_norm": 0.22215154767036438, "learning_rate": 5.539845265729587e-06, "loss": 0.0556, "num_tokens": 917572853.0, "step": 7019 }, { "epoch": 2.8012769353551477, "grad_norm": 0.21468986570835114, "learning_rate": 5.537692376022057e-06, "loss": 0.0564, "num_tokens": 917703925.0, "step": 7020 }, { "epoch": 2.8016759776536313, "grad_norm": 0.2721964716911316, "learning_rate": 5.535543735816325e-06, "loss": 0.0682, "num_tokens": 917834997.0, "step": 7021 }, { "epoch": 2.802075019952115, "grad_norm": 0.22393085062503815, "learning_rate": 5.533399345528128e-06, "loss": 0.0638, "num_tokens": 917966069.0, "step": 7022 }, { "epoch": 2.8024740622505986, "grad_norm": 0.24510405957698822, "learning_rate": 5.531259205572388e-06, "loss": 0.0619, "num_tokens": 918097141.0, "step": 7023 }, { "epoch": 2.8028731045490822, "grad_norm": 0.21258492767810822, "learning_rate": 5.529123316363202e-06, "loss": 0.0669, "num_tokens": 918228213.0, "step": 7024 }, { "epoch": 2.803272146847566, "grad_norm": 0.31739288568496704, "learning_rate": 5.5269916783138455e-06, "loss": 0.0663, "num_tokens": 918359285.0, "step": 7025 }, { "epoch": 2.8036711891460495, "grad_norm": 0.30208706855773926, "learning_rate": 5.524864291836766e-06, "loss": 0.0794, "num_tokens": 918483423.0, "step": 7026 }, { "epoch": 2.804070231444533, "grad_norm": 0.2650231420993805, "learning_rate": 5.5227411573436e-06, "loss": 0.07, "num_tokens": 918614495.0, "step": 7027 }, { "epoch": 2.804469273743017, "grad_norm": 0.22416247427463531, "learning_rate": 5.520622275245152e-06, "loss": 0.0542, "num_tokens": 918745567.0, "step": 7028 }, { "epoch": 2.8048683160415004, "grad_norm": 0.22079363465309143, "learning_rate": 5.5185076459513996e-06, "loss": 0.063, "num_tokens": 918876639.0, "step": 7029 }, { "epoch": 2.805267358339984, "grad_norm": 0.2435310035943985, "learning_rate": 5.516397269871511e-06, "loss": 0.0681, "num_tokens": 919007711.0, "step": 7030 }, { "epoch": 2.8056664006384677, "grad_norm": 0.28269514441490173, "learning_rate": 5.514291147413823e-06, "loss": 0.0803, "num_tokens": 919138783.0, "step": 7031 }, { "epoch": 2.8060654429369514, "grad_norm": 0.21743428707122803, "learning_rate": 5.5121892789858496e-06, "loss": 0.0614, "num_tokens": 919269855.0, "step": 7032 }, { "epoch": 2.806464485235435, "grad_norm": 0.2426513135433197, "learning_rate": 5.510091664994281e-06, "loss": 0.0645, "num_tokens": 919400927.0, "step": 7033 }, { "epoch": 2.8068635275339187, "grad_norm": 0.23751573264598846, "learning_rate": 5.50799830584499e-06, "loss": 0.0665, "num_tokens": 919531999.0, "step": 7034 }, { "epoch": 2.8072625698324023, "grad_norm": 0.2216905951499939, "learning_rate": 5.50590920194302e-06, "loss": 0.0578, "num_tokens": 919650242.0, "step": 7035 }, { "epoch": 2.807661612130886, "grad_norm": 0.2179393768310547, "learning_rate": 5.503824353692591e-06, "loss": 0.0607, "num_tokens": 919781314.0, "step": 7036 }, { "epoch": 2.8080606544293696, "grad_norm": 0.22234876453876495, "learning_rate": 5.501743761497106e-06, "loss": 0.0705, "num_tokens": 919912386.0, "step": 7037 }, { "epoch": 2.8084596967278532, "grad_norm": 0.22114117443561554, "learning_rate": 5.49966742575914e-06, "loss": 0.0599, "num_tokens": 920043458.0, "step": 7038 }, { "epoch": 2.808858739026337, "grad_norm": 0.21528202295303345, "learning_rate": 5.497595346880438e-06, "loss": 0.058, "num_tokens": 920174530.0, "step": 7039 }, { "epoch": 2.8092577813248205, "grad_norm": 0.224170982837677, "learning_rate": 5.495527525261936e-06, "loss": 0.0602, "num_tokens": 920305602.0, "step": 7040 }, { "epoch": 2.809656823623304, "grad_norm": 0.24831652641296387, "learning_rate": 5.493463961303734e-06, "loss": 0.0742, "num_tokens": 920436674.0, "step": 7041 }, { "epoch": 2.810055865921788, "grad_norm": 0.2725951373577118, "learning_rate": 5.491404655405111e-06, "loss": 0.0801, "num_tokens": 920566969.0, "step": 7042 }, { "epoch": 2.8104549082202714, "grad_norm": 0.20513416826725006, "learning_rate": 5.48934960796453e-06, "loss": 0.0521, "num_tokens": 920698041.0, "step": 7043 }, { "epoch": 2.810853950518755, "grad_norm": 0.2529098391532898, "learning_rate": 5.487298819379614e-06, "loss": 0.0784, "num_tokens": 920829113.0, "step": 7044 }, { "epoch": 2.8112529928172387, "grad_norm": 0.2144371122121811, "learning_rate": 5.485252290047179e-06, "loss": 0.0538, "num_tokens": 920960185.0, "step": 7045 }, { "epoch": 2.8116520351157224, "grad_norm": 0.2644861042499542, "learning_rate": 5.483210020363208e-06, "loss": 0.0681, "num_tokens": 921091257.0, "step": 7046 }, { "epoch": 2.812051077414206, "grad_norm": 0.24934016168117523, "learning_rate": 5.48117201072286e-06, "loss": 0.0725, "num_tokens": 921222329.0, "step": 7047 }, { "epoch": 2.8124501197126897, "grad_norm": 0.2479560375213623, "learning_rate": 5.479138261520469e-06, "loss": 0.0728, "num_tokens": 921353401.0, "step": 7048 }, { "epoch": 2.8128491620111733, "grad_norm": 0.2994833290576935, "learning_rate": 5.4771087731495504e-06, "loss": 0.085, "num_tokens": 921484473.0, "step": 7049 }, { "epoch": 2.813248204309657, "grad_norm": 0.23314185440540314, "learning_rate": 5.475083546002788e-06, "loss": 0.0495, "num_tokens": 921615545.0, "step": 7050 }, { "epoch": 2.8136472466081406, "grad_norm": 0.29486364126205444, "learning_rate": 5.473062580472047e-06, "loss": 0.0866, "num_tokens": 921746617.0, "step": 7051 }, { "epoch": 2.8140462889066242, "grad_norm": 0.1959981769323349, "learning_rate": 5.471045876948363e-06, "loss": 0.0529, "num_tokens": 921877689.0, "step": 7052 }, { "epoch": 2.814445331205108, "grad_norm": 0.22046677768230438, "learning_rate": 5.469033435821952e-06, "loss": 0.0594, "num_tokens": 922008761.0, "step": 7053 }, { "epoch": 2.8148443735035915, "grad_norm": 0.2656269967556, "learning_rate": 5.4670252574822006e-06, "loss": 0.0718, "num_tokens": 922139833.0, "step": 7054 }, { "epoch": 2.815243415802075, "grad_norm": 0.2195153534412384, "learning_rate": 5.465021342317672e-06, "loss": 0.0606, "num_tokens": 922270905.0, "step": 7055 }, { "epoch": 2.815642458100559, "grad_norm": 0.2121296525001526, "learning_rate": 5.463021690716109e-06, "loss": 0.0569, "num_tokens": 922401977.0, "step": 7056 }, { "epoch": 2.8160415003990424, "grad_norm": 0.19777938723564148, "learning_rate": 5.4610263030644195e-06, "loss": 0.0485, "num_tokens": 922533049.0, "step": 7057 }, { "epoch": 2.816440542697526, "grad_norm": 0.22515492141246796, "learning_rate": 5.459035179748696e-06, "loss": 0.0639, "num_tokens": 922664121.0, "step": 7058 }, { "epoch": 2.8168395849960097, "grad_norm": 0.26326844096183777, "learning_rate": 5.457048321154205e-06, "loss": 0.079, "num_tokens": 922795193.0, "step": 7059 }, { "epoch": 2.8172386272944934, "grad_norm": 0.24531519412994385, "learning_rate": 5.455065727665379e-06, "loss": 0.0746, "num_tokens": 922926265.0, "step": 7060 }, { "epoch": 2.817637669592977, "grad_norm": 0.2634330987930298, "learning_rate": 5.453087399665835e-06, "loss": 0.0713, "num_tokens": 923057337.0, "step": 7061 }, { "epoch": 2.8180367118914607, "grad_norm": 0.24545033276081085, "learning_rate": 5.451113337538365e-06, "loss": 0.0688, "num_tokens": 923188409.0, "step": 7062 }, { "epoch": 2.8184357541899443, "grad_norm": 0.2319684475660324, "learning_rate": 5.449143541664921e-06, "loss": 0.0707, "num_tokens": 923319481.0, "step": 7063 }, { "epoch": 2.818834796488428, "grad_norm": 0.262109637260437, "learning_rate": 5.4471780124266475e-06, "loss": 0.0737, "num_tokens": 923450553.0, "step": 7064 }, { "epoch": 2.8192338387869116, "grad_norm": 0.26889726519584656, "learning_rate": 5.445216750203856e-06, "loss": 0.077, "num_tokens": 923581625.0, "step": 7065 }, { "epoch": 2.8196328810853952, "grad_norm": 0.2752494812011719, "learning_rate": 5.443259755376031e-06, "loss": 0.0827, "num_tokens": 923712697.0, "step": 7066 }, { "epoch": 2.820031923383879, "grad_norm": 0.20148926973342896, "learning_rate": 5.4413070283218306e-06, "loss": 0.0527, "num_tokens": 923843769.0, "step": 7067 }, { "epoch": 2.8204309656823625, "grad_norm": 0.24536137282848358, "learning_rate": 5.439358569419095e-06, "loss": 0.072, "num_tokens": 923974841.0, "step": 7068 }, { "epoch": 2.820830007980846, "grad_norm": 0.27366065979003906, "learning_rate": 5.437414379044828e-06, "loss": 0.077, "num_tokens": 924105913.0, "step": 7069 }, { "epoch": 2.82122905027933, "grad_norm": 0.22946324944496155, "learning_rate": 5.435474457575216e-06, "loss": 0.0621, "num_tokens": 924235746.0, "step": 7070 }, { "epoch": 2.8216280925778134, "grad_norm": 0.24860776960849762, "learning_rate": 5.433538805385613e-06, "loss": 0.0816, "num_tokens": 924366818.0, "step": 7071 }, { "epoch": 2.822027134876297, "grad_norm": 0.2299092411994934, "learning_rate": 5.43160742285055e-06, "loss": 0.0552, "num_tokens": 924497890.0, "step": 7072 }, { "epoch": 2.8224261771747807, "grad_norm": 0.2539072334766388, "learning_rate": 5.42968031034373e-06, "loss": 0.0686, "num_tokens": 924628962.0, "step": 7073 }, { "epoch": 2.8228252194732644, "grad_norm": 0.21306093037128448, "learning_rate": 5.4277574682380334e-06, "loss": 0.0547, "num_tokens": 924760034.0, "step": 7074 }, { "epoch": 2.823224261771748, "grad_norm": 0.24137453734874725, "learning_rate": 5.425838896905513e-06, "loss": 0.0641, "num_tokens": 924891106.0, "step": 7075 }, { "epoch": 2.8236233040702317, "grad_norm": 0.2226228415966034, "learning_rate": 5.423924596717393e-06, "loss": 0.0578, "num_tokens": 925022178.0, "step": 7076 }, { "epoch": 2.8240223463687153, "grad_norm": 0.25784680247306824, "learning_rate": 5.422014568044071e-06, "loss": 0.0749, "num_tokens": 925153250.0, "step": 7077 }, { "epoch": 2.824421388667199, "grad_norm": 0.21190626919269562, "learning_rate": 5.420108811255125e-06, "loss": 0.0612, "num_tokens": 925284322.0, "step": 7078 }, { "epoch": 2.8248204309656826, "grad_norm": 0.32668501138687134, "learning_rate": 5.418207326719293e-06, "loss": 0.0709, "num_tokens": 925415394.0, "step": 7079 }, { "epoch": 2.8252194732641662, "grad_norm": 0.212937593460083, "learning_rate": 5.4163101148045045e-06, "loss": 0.053, "num_tokens": 925546466.0, "step": 7080 }, { "epoch": 2.82561851556265, "grad_norm": 0.22936134040355682, "learning_rate": 5.414417175877845e-06, "loss": 0.0592, "num_tokens": 925677538.0, "step": 7081 }, { "epoch": 2.8260175578611335, "grad_norm": 0.241791233420372, "learning_rate": 5.412528510305584e-06, "loss": 0.0758, "num_tokens": 925808610.0, "step": 7082 }, { "epoch": 2.826416600159617, "grad_norm": 0.23264671862125397, "learning_rate": 5.410644118453159e-06, "loss": 0.0673, "num_tokens": 925939682.0, "step": 7083 }, { "epoch": 2.826815642458101, "grad_norm": 0.22642937302589417, "learning_rate": 5.408764000685184e-06, "loss": 0.0712, "num_tokens": 926070754.0, "step": 7084 }, { "epoch": 2.8272146847565844, "grad_norm": 0.23637765645980835, "learning_rate": 5.4068881573654405e-06, "loss": 0.0646, "num_tokens": 926201826.0, "step": 7085 }, { "epoch": 2.827613727055068, "grad_norm": 0.24027059972286224, "learning_rate": 5.405016588856889e-06, "loss": 0.0735, "num_tokens": 926332898.0, "step": 7086 }, { "epoch": 2.8280127693535517, "grad_norm": 0.2466897815465927, "learning_rate": 5.403149295521663e-06, "loss": 0.0709, "num_tokens": 926463970.0, "step": 7087 }, { "epoch": 2.8284118116520354, "grad_norm": 0.21363860368728638, "learning_rate": 5.401286277721063e-06, "loss": 0.0608, "num_tokens": 926595042.0, "step": 7088 }, { "epoch": 2.828810853950519, "grad_norm": 0.23802490532398224, "learning_rate": 5.3994275358155665e-06, "loss": 0.0736, "num_tokens": 926726114.0, "step": 7089 }, { "epoch": 2.8292098962490027, "grad_norm": 0.22393769025802612, "learning_rate": 5.397573070164823e-06, "loss": 0.0541, "num_tokens": 926857186.0, "step": 7090 }, { "epoch": 2.8296089385474863, "grad_norm": 0.2614094018936157, "learning_rate": 5.395722881127655e-06, "loss": 0.0802, "num_tokens": 926988258.0, "step": 7091 }, { "epoch": 2.83000798084597, "grad_norm": 0.2370891273021698, "learning_rate": 5.393876969062054e-06, "loss": 0.0745, "num_tokens": 927119330.0, "step": 7092 }, { "epoch": 2.8304070231444536, "grad_norm": 0.30636537075042725, "learning_rate": 5.392035334325191e-06, "loss": 0.0723, "num_tokens": 927250402.0, "step": 7093 }, { "epoch": 2.8308060654429372, "grad_norm": 0.326753169298172, "learning_rate": 5.390197977273403e-06, "loss": 0.0909, "num_tokens": 927381474.0, "step": 7094 }, { "epoch": 2.831205107741421, "grad_norm": 0.22460950911045074, "learning_rate": 5.388364898262201e-06, "loss": 0.0567, "num_tokens": 927512546.0, "step": 7095 }, { "epoch": 2.831604150039904, "grad_norm": 0.26499325037002563, "learning_rate": 5.386536097646269e-06, "loss": 0.081, "num_tokens": 927643618.0, "step": 7096 }, { "epoch": 2.8320031923383877, "grad_norm": 0.23037628829479218, "learning_rate": 5.384711575779463e-06, "loss": 0.0717, "num_tokens": 927774690.0, "step": 7097 }, { "epoch": 2.8324022346368714, "grad_norm": 0.24025501310825348, "learning_rate": 5.382891333014809e-06, "loss": 0.0509, "num_tokens": 927905762.0, "step": 7098 }, { "epoch": 2.832801276935355, "grad_norm": 0.24993547797203064, "learning_rate": 5.381075369704514e-06, "loss": 0.0583, "num_tokens": 928036834.0, "step": 7099 }, { "epoch": 2.8332003192338386, "grad_norm": 0.21887598931789398, "learning_rate": 5.3792636861999405e-06, "loss": 0.0595, "num_tokens": 928167906.0, "step": 7100 }, { "epoch": 2.8335993615323223, "grad_norm": 0.2375381588935852, "learning_rate": 5.377456282851637e-06, "loss": 0.0713, "num_tokens": 928288966.0, "step": 7101 }, { "epoch": 2.833998403830806, "grad_norm": 0.22616378962993622, "learning_rate": 5.37565316000932e-06, "loss": 0.0645, "num_tokens": 928420038.0, "step": 7102 }, { "epoch": 2.8343974461292896, "grad_norm": 0.23813676834106445, "learning_rate": 5.373854318021876e-06, "loss": 0.0689, "num_tokens": 928551110.0, "step": 7103 }, { "epoch": 2.834796488427773, "grad_norm": 0.23946630954742432, "learning_rate": 5.372059757237365e-06, "loss": 0.0625, "num_tokens": 928682182.0, "step": 7104 }, { "epoch": 2.835195530726257, "grad_norm": 0.26962196826934814, "learning_rate": 5.370269478003016e-06, "loss": 0.0669, "num_tokens": 928813254.0, "step": 7105 }, { "epoch": 2.8355945730247405, "grad_norm": 0.2232246696949005, "learning_rate": 5.368483480665234e-06, "loss": 0.068, "num_tokens": 928944326.0, "step": 7106 }, { "epoch": 2.835993615323224, "grad_norm": 0.2418430894613266, "learning_rate": 5.36670176556959e-06, "loss": 0.0683, "num_tokens": 929075398.0, "step": 7107 }, { "epoch": 2.836392657621708, "grad_norm": 0.24358029663562775, "learning_rate": 5.364924333060829e-06, "loss": 0.0701, "num_tokens": 929206470.0, "step": 7108 }, { "epoch": 2.8367916999201914, "grad_norm": 0.24836207926273346, "learning_rate": 5.363151183482871e-06, "loss": 0.064, "num_tokens": 929337542.0, "step": 7109 }, { "epoch": 2.837190742218675, "grad_norm": 0.20779214799404144, "learning_rate": 5.3613823171788e-06, "loss": 0.0549, "num_tokens": 929468614.0, "step": 7110 }, { "epoch": 2.8375897845171587, "grad_norm": 0.2344399243593216, "learning_rate": 5.359617734490876e-06, "loss": 0.072, "num_tokens": 929599686.0, "step": 7111 }, { "epoch": 2.8379888268156424, "grad_norm": 0.24097378551959991, "learning_rate": 5.357857435760534e-06, "loss": 0.072, "num_tokens": 929728092.0, "step": 7112 }, { "epoch": 2.838387869114126, "grad_norm": 0.2263268679380417, "learning_rate": 5.35610142132837e-06, "loss": 0.0611, "num_tokens": 929859164.0, "step": 7113 }, { "epoch": 2.8387869114126096, "grad_norm": 0.2555994987487793, "learning_rate": 5.354349691534159e-06, "loss": 0.0776, "num_tokens": 929990236.0, "step": 7114 }, { "epoch": 2.8391859537110933, "grad_norm": 0.22751963138580322, "learning_rate": 5.352602246716842e-06, "loss": 0.0677, "num_tokens": 930121308.0, "step": 7115 }, { "epoch": 2.839584996009577, "grad_norm": 0.26344165205955505, "learning_rate": 5.350859087214537e-06, "loss": 0.069, "num_tokens": 930252380.0, "step": 7116 }, { "epoch": 2.8399840383080606, "grad_norm": 0.22950853407382965, "learning_rate": 5.349120213364526e-06, "loss": 0.0674, "num_tokens": 930383452.0, "step": 7117 }, { "epoch": 2.840383080606544, "grad_norm": 0.23322245478630066, "learning_rate": 5.347385625503265e-06, "loss": 0.0712, "num_tokens": 930514524.0, "step": 7118 }, { "epoch": 2.840782122905028, "grad_norm": 0.2725805938243866, "learning_rate": 5.345655323966383e-06, "loss": 0.0818, "num_tokens": 930645596.0, "step": 7119 }, { "epoch": 2.8411811652035115, "grad_norm": 0.26852938532829285, "learning_rate": 5.3439293090886734e-06, "loss": 0.071, "num_tokens": 930776668.0, "step": 7120 }, { "epoch": 2.841580207501995, "grad_norm": 0.2612791657447815, "learning_rate": 5.342207581204109e-06, "loss": 0.0634, "num_tokens": 930907740.0, "step": 7121 }, { "epoch": 2.841979249800479, "grad_norm": 0.24024027585983276, "learning_rate": 5.340490140645822e-06, "loss": 0.0635, "num_tokens": 931038812.0, "step": 7122 }, { "epoch": 2.8423782920989624, "grad_norm": 0.22544138133525848, "learning_rate": 5.338776987746127e-06, "loss": 0.0572, "num_tokens": 931169884.0, "step": 7123 }, { "epoch": 2.842777334397446, "grad_norm": 0.26366016268730164, "learning_rate": 5.3370681228364996e-06, "loss": 0.0814, "num_tokens": 931300956.0, "step": 7124 }, { "epoch": 2.8431763766959297, "grad_norm": 0.2741961181163788, "learning_rate": 5.335363546247591e-06, "loss": 0.0751, "num_tokens": 931432028.0, "step": 7125 }, { "epoch": 2.8435754189944134, "grad_norm": 0.26320767402648926, "learning_rate": 5.3336632583092185e-06, "loss": 0.0589, "num_tokens": 931563100.0, "step": 7126 }, { "epoch": 2.843974461292897, "grad_norm": 0.29272353649139404, "learning_rate": 5.331967259350376e-06, "loss": 0.0838, "num_tokens": 931694172.0, "step": 7127 }, { "epoch": 2.8443735035913806, "grad_norm": 0.2228487879037857, "learning_rate": 5.3302755496992195e-06, "loss": 0.0533, "num_tokens": 931825244.0, "step": 7128 }, { "epoch": 2.8447725458898643, "grad_norm": 0.2355557680130005, "learning_rate": 5.328588129683081e-06, "loss": 0.0684, "num_tokens": 931956316.0, "step": 7129 }, { "epoch": 2.845171588188348, "grad_norm": 0.23513805866241455, "learning_rate": 5.32690499962846e-06, "loss": 0.0683, "num_tokens": 932087388.0, "step": 7130 }, { "epoch": 2.8455706304868316, "grad_norm": 0.2168637365102768, "learning_rate": 5.325226159861028e-06, "loss": 0.0645, "num_tokens": 932218460.0, "step": 7131 }, { "epoch": 2.845969672785315, "grad_norm": 0.2271224558353424, "learning_rate": 5.323551610705621e-06, "loss": 0.0683, "num_tokens": 932349532.0, "step": 7132 }, { "epoch": 2.846368715083799, "grad_norm": 0.2239273637533188, "learning_rate": 5.321881352486251e-06, "loss": 0.0675, "num_tokens": 932480604.0, "step": 7133 }, { "epoch": 2.8467677573822825, "grad_norm": 0.23288536071777344, "learning_rate": 5.320215385526098e-06, "loss": 0.0711, "num_tokens": 932611676.0, "step": 7134 }, { "epoch": 2.847166799680766, "grad_norm": 0.2589460611343384, "learning_rate": 5.318553710147511e-06, "loss": 0.0689, "num_tokens": 932742748.0, "step": 7135 }, { "epoch": 2.84756584197925, "grad_norm": 0.26540571451187134, "learning_rate": 5.316896326672009e-06, "loss": 0.0716, "num_tokens": 932873820.0, "step": 7136 }, { "epoch": 2.8479648842777334, "grad_norm": 0.2477266490459442, "learning_rate": 5.315243235420279e-06, "loss": 0.0691, "num_tokens": 933004892.0, "step": 7137 }, { "epoch": 2.848363926576217, "grad_norm": 0.23702974617481232, "learning_rate": 5.3135944367121785e-06, "loss": 0.0687, "num_tokens": 933135964.0, "step": 7138 }, { "epoch": 2.8487629688747007, "grad_norm": 0.23340649902820587, "learning_rate": 5.3119499308667345e-06, "loss": 0.0707, "num_tokens": 933267036.0, "step": 7139 }, { "epoch": 2.8491620111731844, "grad_norm": 0.27717483043670654, "learning_rate": 5.310309718202151e-06, "loss": 0.0832, "num_tokens": 933398108.0, "step": 7140 }, { "epoch": 2.849561053471668, "grad_norm": 0.24917644262313843, "learning_rate": 5.308673799035781e-06, "loss": 0.0657, "num_tokens": 933529180.0, "step": 7141 }, { "epoch": 2.8499600957701516, "grad_norm": 0.2346237152814865, "learning_rate": 5.307042173684166e-06, "loss": 0.0678, "num_tokens": 933660252.0, "step": 7142 }, { "epoch": 2.8503591380686353, "grad_norm": 0.26348963379859924, "learning_rate": 5.305414842463014e-06, "loss": 0.0783, "num_tokens": 933791324.0, "step": 7143 }, { "epoch": 2.850758180367119, "grad_norm": 0.2509525716304779, "learning_rate": 5.303791805687192e-06, "loss": 0.0721, "num_tokens": 933922396.0, "step": 7144 }, { "epoch": 2.8511572226656026, "grad_norm": 0.260263055562973, "learning_rate": 5.302173063670745e-06, "loss": 0.0617, "num_tokens": 934053468.0, "step": 7145 }, { "epoch": 2.851556264964086, "grad_norm": 0.2650441825389862, "learning_rate": 5.300558616726885e-06, "loss": 0.0781, "num_tokens": 934184540.0, "step": 7146 }, { "epoch": 2.85195530726257, "grad_norm": 0.2532416582107544, "learning_rate": 5.298948465167991e-06, "loss": 0.0693, "num_tokens": 934315612.0, "step": 7147 }, { "epoch": 2.8523543495610535, "grad_norm": 0.21406280994415283, "learning_rate": 5.297342609305616e-06, "loss": 0.0556, "num_tokens": 934446684.0, "step": 7148 }, { "epoch": 2.852753391859537, "grad_norm": 0.2464185357093811, "learning_rate": 5.295741049450476e-06, "loss": 0.0729, "num_tokens": 934577756.0, "step": 7149 }, { "epoch": 2.853152434158021, "grad_norm": 0.24840806424617767, "learning_rate": 5.294143785912455e-06, "loss": 0.063, "num_tokens": 934708828.0, "step": 7150 }, { "epoch": 2.8535514764565044, "grad_norm": 0.23215436935424805, "learning_rate": 5.292550819000611e-06, "loss": 0.0648, "num_tokens": 934839900.0, "step": 7151 }, { "epoch": 2.853950518754988, "grad_norm": 0.25144854187965393, "learning_rate": 5.290962149023171e-06, "loss": 0.0688, "num_tokens": 934970972.0, "step": 7152 }, { "epoch": 2.8543495610534717, "grad_norm": 0.25279700756073, "learning_rate": 5.289377776287522e-06, "loss": 0.0739, "num_tokens": 935102044.0, "step": 7153 }, { "epoch": 2.8547486033519553, "grad_norm": 0.22470836341381073, "learning_rate": 5.287797701100229e-06, "loss": 0.0692, "num_tokens": 935233116.0, "step": 7154 }, { "epoch": 2.855147645650439, "grad_norm": 0.1991012543439865, "learning_rate": 5.286221923767023e-06, "loss": 0.0495, "num_tokens": 935364188.0, "step": 7155 }, { "epoch": 2.8555466879489226, "grad_norm": 0.24608023464679718, "learning_rate": 5.284650444592803e-06, "loss": 0.0562, "num_tokens": 935495260.0, "step": 7156 }, { "epoch": 2.8559457302474063, "grad_norm": 0.2666679322719574, "learning_rate": 5.2830832638816286e-06, "loss": 0.0662, "num_tokens": 935626332.0, "step": 7157 }, { "epoch": 2.85634477254589, "grad_norm": 0.2841046154499054, "learning_rate": 5.281520381936739e-06, "loss": 0.0591, "num_tokens": 935757404.0, "step": 7158 }, { "epoch": 2.8567438148443736, "grad_norm": 0.22797495126724243, "learning_rate": 5.27996179906054e-06, "loss": 0.0588, "num_tokens": 935888476.0, "step": 7159 }, { "epoch": 2.857142857142857, "grad_norm": 0.2592228949069977, "learning_rate": 5.278407515554597e-06, "loss": 0.0714, "num_tokens": 936019548.0, "step": 7160 }, { "epoch": 2.857541899441341, "grad_norm": 0.19361038506031036, "learning_rate": 5.276857531719656e-06, "loss": 0.0493, "num_tokens": 936150620.0, "step": 7161 }, { "epoch": 2.8579409417398245, "grad_norm": 0.242516428232193, "learning_rate": 5.27531184785562e-06, "loss": 0.0633, "num_tokens": 936281692.0, "step": 7162 }, { "epoch": 2.858339984038308, "grad_norm": 0.2763614058494568, "learning_rate": 5.273770464261568e-06, "loss": 0.067, "num_tokens": 936412764.0, "step": 7163 }, { "epoch": 2.8587390263367918, "grad_norm": 0.23078519105911255, "learning_rate": 5.272233381235736e-06, "loss": 0.0613, "num_tokens": 936543836.0, "step": 7164 }, { "epoch": 2.8591380686352754, "grad_norm": 0.2023816555738449, "learning_rate": 5.270700599075545e-06, "loss": 0.057, "num_tokens": 936674908.0, "step": 7165 }, { "epoch": 2.859537110933759, "grad_norm": 0.26369309425354004, "learning_rate": 5.269172118077565e-06, "loss": 0.0698, "num_tokens": 936805980.0, "step": 7166 }, { "epoch": 2.8599361532322427, "grad_norm": 0.26830917596817017, "learning_rate": 5.267647938537549e-06, "loss": 0.0773, "num_tokens": 936937052.0, "step": 7167 }, { "epoch": 2.8603351955307263, "grad_norm": 0.24447008967399597, "learning_rate": 5.266128060750411e-06, "loss": 0.0666, "num_tokens": 937068124.0, "step": 7168 }, { "epoch": 2.86073423782921, "grad_norm": 0.20905499160289764, "learning_rate": 5.264612485010229e-06, "loss": 0.0616, "num_tokens": 937199196.0, "step": 7169 }, { "epoch": 2.8611332801276936, "grad_norm": 0.24322384595870972, "learning_rate": 5.263101211610255e-06, "loss": 0.0703, "num_tokens": 937330268.0, "step": 7170 }, { "epoch": 2.8615323224261773, "grad_norm": 0.24450314044952393, "learning_rate": 5.26159424084291e-06, "loss": 0.0755, "num_tokens": 937461340.0, "step": 7171 }, { "epoch": 2.861931364724661, "grad_norm": 0.22331103682518005, "learning_rate": 5.260091572999773e-06, "loss": 0.0574, "num_tokens": 937592412.0, "step": 7172 }, { "epoch": 2.8623304070231446, "grad_norm": 0.18828991055488586, "learning_rate": 5.258593208371601e-06, "loss": 0.0518, "num_tokens": 937723484.0, "step": 7173 }, { "epoch": 2.862729449321628, "grad_norm": 0.26188233494758606, "learning_rate": 5.257099147248312e-06, "loss": 0.0604, "num_tokens": 937854556.0, "step": 7174 }, { "epoch": 2.863128491620112, "grad_norm": 0.24331440031528473, "learning_rate": 5.255609389918991e-06, "loss": 0.0627, "num_tokens": 937985628.0, "step": 7175 }, { "epoch": 2.8635275339185955, "grad_norm": 0.2664337754249573, "learning_rate": 5.254123936671891e-06, "loss": 0.0678, "num_tokens": 938116700.0, "step": 7176 }, { "epoch": 2.863926576217079, "grad_norm": 0.23985722661018372, "learning_rate": 5.25264278779444e-06, "loss": 0.0644, "num_tokens": 938247772.0, "step": 7177 }, { "epoch": 2.8643256185155628, "grad_norm": 0.19639867544174194, "learning_rate": 5.251165943573219e-06, "loss": 0.0531, "num_tokens": 938362665.0, "step": 7178 }, { "epoch": 2.8647246608140464, "grad_norm": 0.286617636680603, "learning_rate": 5.249693404293988e-06, "loss": 0.071, "num_tokens": 938493737.0, "step": 7179 }, { "epoch": 2.86512370311253, "grad_norm": 0.23846952617168427, "learning_rate": 5.248225170241669e-06, "loss": 0.0658, "num_tokens": 938624809.0, "step": 7180 }, { "epoch": 2.8655227454110137, "grad_norm": 0.25104033946990967, "learning_rate": 5.246761241700351e-06, "loss": 0.0674, "num_tokens": 938755881.0, "step": 7181 }, { "epoch": 2.8659217877094973, "grad_norm": 0.2497238665819168, "learning_rate": 5.24530161895329e-06, "loss": 0.0712, "num_tokens": 938886953.0, "step": 7182 }, { "epoch": 2.866320830007981, "grad_norm": 0.2758033871650696, "learning_rate": 5.243846302282912e-06, "loss": 0.0652, "num_tokens": 939018025.0, "step": 7183 }, { "epoch": 2.8667198723064646, "grad_norm": 0.23592102527618408, "learning_rate": 5.242395291970803e-06, "loss": 0.0546, "num_tokens": 939149097.0, "step": 7184 }, { "epoch": 2.8671189146049483, "grad_norm": 0.24188700318336487, "learning_rate": 5.240948588297726e-06, "loss": 0.0694, "num_tokens": 939280169.0, "step": 7185 }, { "epoch": 2.867517956903432, "grad_norm": 0.2575422525405884, "learning_rate": 5.239506191543596e-06, "loss": 0.0722, "num_tokens": 939411241.0, "step": 7186 }, { "epoch": 2.8679169992019156, "grad_norm": 0.22991329431533813, "learning_rate": 5.23806810198751e-06, "loss": 0.056, "num_tokens": 939542313.0, "step": 7187 }, { "epoch": 2.868316041500399, "grad_norm": 0.24938881397247314, "learning_rate": 5.236634319907725e-06, "loss": 0.0673, "num_tokens": 939673385.0, "step": 7188 }, { "epoch": 2.868715083798883, "grad_norm": 0.2602991461753845, "learning_rate": 5.235204845581661e-06, "loss": 0.0751, "num_tokens": 939804457.0, "step": 7189 }, { "epoch": 2.869114126097366, "grad_norm": 0.2529003322124481, "learning_rate": 5.2337796792859095e-06, "loss": 0.0738, "num_tokens": 939935529.0, "step": 7190 }, { "epoch": 2.8695131683958497, "grad_norm": 0.21140863001346588, "learning_rate": 5.2323588212962274e-06, "loss": 0.0522, "num_tokens": 940066601.0, "step": 7191 }, { "epoch": 2.8699122106943333, "grad_norm": 0.2199697643518448, "learning_rate": 5.230942271887537e-06, "loss": 0.0534, "num_tokens": 940182475.0, "step": 7192 }, { "epoch": 2.870311252992817, "grad_norm": 0.22595784068107605, "learning_rate": 5.229530031333927e-06, "loss": 0.0579, "num_tokens": 940313547.0, "step": 7193 }, { "epoch": 2.8707102952913006, "grad_norm": 0.22413961589336395, "learning_rate": 5.228122099908654e-06, "loss": 0.0623, "num_tokens": 940444619.0, "step": 7194 }, { "epoch": 2.8711093375897843, "grad_norm": 0.22053630650043488, "learning_rate": 5.226718477884137e-06, "loss": 0.0524, "num_tokens": 940575691.0, "step": 7195 }, { "epoch": 2.871508379888268, "grad_norm": 0.23020394146442413, "learning_rate": 5.225319165531971e-06, "loss": 0.0621, "num_tokens": 940706763.0, "step": 7196 }, { "epoch": 2.8719074221867515, "grad_norm": 0.21746926009655, "learning_rate": 5.223924163122898e-06, "loss": 0.0611, "num_tokens": 940837835.0, "step": 7197 }, { "epoch": 2.872306464485235, "grad_norm": 0.3210603594779968, "learning_rate": 5.222533470926849e-06, "loss": 0.0828, "num_tokens": 940968907.0, "step": 7198 }, { "epoch": 2.872705506783719, "grad_norm": 0.2156962752342224, "learning_rate": 5.221147089212904e-06, "loss": 0.0544, "num_tokens": 941084556.0, "step": 7199 }, { "epoch": 2.8731045490822025, "grad_norm": 0.25660309195518494, "learning_rate": 5.219765018249315e-06, "loss": 0.0792, "num_tokens": 941215628.0, "step": 7200 }, { "epoch": 2.873503591380686, "grad_norm": 0.2561297118663788, "learning_rate": 5.218387258303502e-06, "loss": 0.0727, "num_tokens": 941346700.0, "step": 7201 }, { "epoch": 2.8739026336791698, "grad_norm": 0.2642289102077484, "learning_rate": 5.217013809642051e-06, "loss": 0.0807, "num_tokens": 941477772.0, "step": 7202 }, { "epoch": 2.8743016759776534, "grad_norm": 0.27795177698135376, "learning_rate": 5.215644672530703e-06, "loss": 0.0683, "num_tokens": 941608844.0, "step": 7203 }, { "epoch": 2.874700718276137, "grad_norm": 0.2611403465270996, "learning_rate": 5.214279847234381e-06, "loss": 0.0753, "num_tokens": 941739916.0, "step": 7204 }, { "epoch": 2.8750997605746207, "grad_norm": 0.23658855259418488, "learning_rate": 5.2129193340171665e-06, "loss": 0.0618, "num_tokens": 941870988.0, "step": 7205 }, { "epoch": 2.8754988028731043, "grad_norm": 0.23404628038406372, "learning_rate": 5.211563133142301e-06, "loss": 0.0586, "num_tokens": 942002060.0, "step": 7206 }, { "epoch": 2.875897845171588, "grad_norm": 0.2557176649570465, "learning_rate": 5.2102112448722e-06, "loss": 0.0784, "num_tokens": 942133132.0, "step": 7207 }, { "epoch": 2.8762968874700716, "grad_norm": 0.20416630804538727, "learning_rate": 5.208863669468439e-06, "loss": 0.0498, "num_tokens": 942264204.0, "step": 7208 }, { "epoch": 2.8766959297685553, "grad_norm": 0.223649799823761, "learning_rate": 5.207520407191765e-06, "loss": 0.0744, "num_tokens": 942395276.0, "step": 7209 }, { "epoch": 2.877094972067039, "grad_norm": 0.2247677892446518, "learning_rate": 5.206181458302083e-06, "loss": 0.065, "num_tokens": 942526348.0, "step": 7210 }, { "epoch": 2.8774940143655225, "grad_norm": 0.24817286431789398, "learning_rate": 5.204846823058466e-06, "loss": 0.0682, "num_tokens": 942657420.0, "step": 7211 }, { "epoch": 2.877893056664006, "grad_norm": 0.2896495759487152, "learning_rate": 5.203516501719159e-06, "loss": 0.0776, "num_tokens": 942788492.0, "step": 7212 }, { "epoch": 2.87829209896249, "grad_norm": 0.24917501211166382, "learning_rate": 5.202190494541563e-06, "loss": 0.0713, "num_tokens": 942919564.0, "step": 7213 }, { "epoch": 2.8786911412609735, "grad_norm": 0.28238487243652344, "learning_rate": 5.200868801782249e-06, "loss": 0.0597, "num_tokens": 943050636.0, "step": 7214 }, { "epoch": 2.879090183559457, "grad_norm": 0.23089809715747833, "learning_rate": 5.199551423696951e-06, "loss": 0.0718, "num_tokens": 943181708.0, "step": 7215 }, { "epoch": 2.8794892258579408, "grad_norm": 0.23711460828781128, "learning_rate": 5.198238360540571e-06, "loss": 0.0616, "num_tokens": 943312780.0, "step": 7216 }, { "epoch": 2.8798882681564244, "grad_norm": 0.2520810067653656, "learning_rate": 5.196929612567172e-06, "loss": 0.0717, "num_tokens": 943443852.0, "step": 7217 }, { "epoch": 2.880287310454908, "grad_norm": 0.20766234397888184, "learning_rate": 5.195625180029989e-06, "loss": 0.0639, "num_tokens": 943574924.0, "step": 7218 }, { "epoch": 2.8806863527533917, "grad_norm": 0.276949405670166, "learning_rate": 5.194325063181417e-06, "loss": 0.0725, "num_tokens": 943705996.0, "step": 7219 }, { "epoch": 2.8810853950518753, "grad_norm": 0.2484394758939743, "learning_rate": 5.193029262273011e-06, "loss": 0.0681, "num_tokens": 943837068.0, "step": 7220 }, { "epoch": 2.881484437350359, "grad_norm": 0.28630179166793823, "learning_rate": 5.191737777555501e-06, "loss": 0.0855, "num_tokens": 943968140.0, "step": 7221 }, { "epoch": 2.8818834796488426, "grad_norm": 0.22916635870933533, "learning_rate": 5.190450609278774e-06, "loss": 0.0632, "num_tokens": 944099212.0, "step": 7222 }, { "epoch": 2.8822825219473263, "grad_norm": 0.2351728230714798, "learning_rate": 5.189167757691887e-06, "loss": 0.0615, "num_tokens": 944230284.0, "step": 7223 }, { "epoch": 2.88268156424581, "grad_norm": 0.23964092135429382, "learning_rate": 5.1878892230430635e-06, "loss": 0.0654, "num_tokens": 944361356.0, "step": 7224 }, { "epoch": 2.8830806065442935, "grad_norm": 0.27451208233833313, "learning_rate": 5.186615005579683e-06, "loss": 0.068, "num_tokens": 944492428.0, "step": 7225 }, { "epoch": 2.883479648842777, "grad_norm": 0.265251487493515, "learning_rate": 5.185345105548294e-06, "loss": 0.0686, "num_tokens": 944623500.0, "step": 7226 }, { "epoch": 2.883878691141261, "grad_norm": 0.2685023546218872, "learning_rate": 5.184079523194616e-06, "loss": 0.0653, "num_tokens": 944754572.0, "step": 7227 }, { "epoch": 2.8842777334397445, "grad_norm": 0.2621097266674042, "learning_rate": 5.182818258763523e-06, "loss": 0.0747, "num_tokens": 944885644.0, "step": 7228 }, { "epoch": 2.884676775738228, "grad_norm": 0.2747798562049866, "learning_rate": 5.181561312499056e-06, "loss": 0.0844, "num_tokens": 945016716.0, "step": 7229 }, { "epoch": 2.8850758180367118, "grad_norm": 0.23549719154834747, "learning_rate": 5.180308684644429e-06, "loss": 0.0617, "num_tokens": 945147788.0, "step": 7230 }, { "epoch": 2.8854748603351954, "grad_norm": 0.24951207637786865, "learning_rate": 5.179060375442007e-06, "loss": 0.0616, "num_tokens": 945278860.0, "step": 7231 }, { "epoch": 2.885873902633679, "grad_norm": 0.25882285833358765, "learning_rate": 5.177816385133332e-06, "loss": 0.0833, "num_tokens": 945409932.0, "step": 7232 }, { "epoch": 2.8862729449321627, "grad_norm": 0.24056531488895416, "learning_rate": 5.176576713959097e-06, "loss": 0.0662, "num_tokens": 945541004.0, "step": 7233 }, { "epoch": 2.8866719872306463, "grad_norm": 0.2160925418138504, "learning_rate": 5.175341362159178e-06, "loss": 0.0636, "num_tokens": 945672076.0, "step": 7234 }, { "epoch": 2.88707102952913, "grad_norm": 0.22828440368175507, "learning_rate": 5.174110329972594e-06, "loss": 0.0627, "num_tokens": 945803148.0, "step": 7235 }, { "epoch": 2.8874700718276136, "grad_norm": 0.2543781101703644, "learning_rate": 5.1728836176375425e-06, "loss": 0.0625, "num_tokens": 945934220.0, "step": 7236 }, { "epoch": 2.8878691141260973, "grad_norm": 0.28236356377601624, "learning_rate": 5.1716612253913794e-06, "loss": 0.0729, "num_tokens": 946065292.0, "step": 7237 }, { "epoch": 2.888268156424581, "grad_norm": 0.22335241734981537, "learning_rate": 5.170443153470627e-06, "loss": 0.063, "num_tokens": 946196364.0, "step": 7238 }, { "epoch": 2.8886671987230645, "grad_norm": 0.22496041655540466, "learning_rate": 5.169229402110971e-06, "loss": 0.0631, "num_tokens": 946327436.0, "step": 7239 }, { "epoch": 2.889066241021548, "grad_norm": 0.22252412140369415, "learning_rate": 5.168019971547263e-06, "loss": 0.0515, "num_tokens": 946458508.0, "step": 7240 }, { "epoch": 2.889465283320032, "grad_norm": 0.20609959959983826, "learning_rate": 5.166814862013512e-06, "loss": 0.0614, "num_tokens": 946589580.0, "step": 7241 }, { "epoch": 2.8898643256185155, "grad_norm": 0.2439575493335724, "learning_rate": 5.1656140737429e-06, "loss": 0.0698, "num_tokens": 946720652.0, "step": 7242 }, { "epoch": 2.890263367916999, "grad_norm": 0.23982609808444977, "learning_rate": 5.1644176069677645e-06, "loss": 0.0637, "num_tokens": 946851724.0, "step": 7243 }, { "epoch": 2.8906624102154828, "grad_norm": 0.20240013301372528, "learning_rate": 5.163225461919613e-06, "loss": 0.05, "num_tokens": 946982796.0, "step": 7244 }, { "epoch": 2.8910614525139664, "grad_norm": 0.21637922525405884, "learning_rate": 5.162037638829113e-06, "loss": 0.0585, "num_tokens": 947113868.0, "step": 7245 }, { "epoch": 2.89146049481245, "grad_norm": 0.22336873412132263, "learning_rate": 5.160854137926099e-06, "loss": 0.0545, "num_tokens": 947244940.0, "step": 7246 }, { "epoch": 2.8918595371109337, "grad_norm": 0.2443746030330658, "learning_rate": 5.1596749594395654e-06, "loss": 0.0684, "num_tokens": 947376012.0, "step": 7247 }, { "epoch": 2.8922585794094173, "grad_norm": 0.2837192118167877, "learning_rate": 5.158500103597675e-06, "loss": 0.0687, "num_tokens": 947507084.0, "step": 7248 }, { "epoch": 2.892657621707901, "grad_norm": 0.22570204734802246, "learning_rate": 5.157329570627748e-06, "loss": 0.0602, "num_tokens": 947638156.0, "step": 7249 }, { "epoch": 2.8930566640063846, "grad_norm": 0.2253519743680954, "learning_rate": 5.156163360756276e-06, "loss": 0.0576, "num_tokens": 947769228.0, "step": 7250 }, { "epoch": 2.8934557063048683, "grad_norm": 0.25851473212242126, "learning_rate": 5.155001474208904e-06, "loss": 0.0687, "num_tokens": 947900300.0, "step": 7251 }, { "epoch": 2.893854748603352, "grad_norm": 0.2632109522819519, "learning_rate": 5.1538439112104515e-06, "loss": 0.0785, "num_tokens": 948031372.0, "step": 7252 }, { "epoch": 2.8942537909018355, "grad_norm": 0.2154509574174881, "learning_rate": 5.152690671984894e-06, "loss": 0.0653, "num_tokens": 948162444.0, "step": 7253 }, { "epoch": 2.894652833200319, "grad_norm": 0.23016461730003357, "learning_rate": 5.15154175675537e-06, "loss": 0.0702, "num_tokens": 948293516.0, "step": 7254 }, { "epoch": 2.895051875498803, "grad_norm": 0.24700641632080078, "learning_rate": 5.150397165744189e-06, "loss": 0.0705, "num_tokens": 948424588.0, "step": 7255 }, { "epoch": 2.8954509177972865, "grad_norm": 0.21719051897525787, "learning_rate": 5.149256899172815e-06, "loss": 0.054, "num_tokens": 948555660.0, "step": 7256 }, { "epoch": 2.89584996009577, "grad_norm": 0.2168189138174057, "learning_rate": 5.14812095726188e-06, "loss": 0.0596, "num_tokens": 948686732.0, "step": 7257 }, { "epoch": 2.8962490023942538, "grad_norm": 0.2727406322956085, "learning_rate": 5.146989340231179e-06, "loss": 0.0747, "num_tokens": 948817804.0, "step": 7258 }, { "epoch": 2.8966480446927374, "grad_norm": 0.28132492303848267, "learning_rate": 5.145862048299667e-06, "loss": 0.0705, "num_tokens": 948948876.0, "step": 7259 }, { "epoch": 2.897047086991221, "grad_norm": 0.20041897892951965, "learning_rate": 5.144739081685467e-06, "loss": 0.0492, "num_tokens": 949079948.0, "step": 7260 }, { "epoch": 2.8974461292897047, "grad_norm": 0.24244317412376404, "learning_rate": 5.143620440605863e-06, "loss": 0.0671, "num_tokens": 949211020.0, "step": 7261 }, { "epoch": 2.8978451715881883, "grad_norm": 0.2860756814479828, "learning_rate": 5.142506125277298e-06, "loss": 0.0774, "num_tokens": 949342092.0, "step": 7262 }, { "epoch": 2.898244213886672, "grad_norm": 0.24829061329364777, "learning_rate": 5.141396135915383e-06, "loss": 0.0759, "num_tokens": 949473164.0, "step": 7263 }, { "epoch": 2.8986432561851556, "grad_norm": 0.2859116494655609, "learning_rate": 5.140290472734893e-06, "loss": 0.0689, "num_tokens": 949604236.0, "step": 7264 }, { "epoch": 2.8990422984836393, "grad_norm": 0.2552219331264496, "learning_rate": 5.139189135949764e-06, "loss": 0.0741, "num_tokens": 949735308.0, "step": 7265 }, { "epoch": 2.899441340782123, "grad_norm": 0.25535136461257935, "learning_rate": 5.1380921257730905e-06, "loss": 0.0709, "num_tokens": 949866380.0, "step": 7266 }, { "epoch": 2.8998403830806065, "grad_norm": 0.21687127649784088, "learning_rate": 5.1369994424171345e-06, "loss": 0.0647, "num_tokens": 949994424.0, "step": 7267 }, { "epoch": 2.90023942537909, "grad_norm": 0.2542450726032257, "learning_rate": 5.135911086093322e-06, "loss": 0.0762, "num_tokens": 950125496.0, "step": 7268 }, { "epoch": 2.900638467677574, "grad_norm": 0.2697810232639313, "learning_rate": 5.134827057012239e-06, "loss": 0.066, "num_tokens": 950256568.0, "step": 7269 }, { "epoch": 2.9010375099760575, "grad_norm": 0.21589519083499908, "learning_rate": 5.133747355383634e-06, "loss": 0.0577, "num_tokens": 950387640.0, "step": 7270 }, { "epoch": 2.901436552274541, "grad_norm": 0.25640785694122314, "learning_rate": 5.1326719814164225e-06, "loss": 0.0771, "num_tokens": 950518712.0, "step": 7271 }, { "epoch": 2.9018355945730248, "grad_norm": 0.2515494227409363, "learning_rate": 5.131600935318676e-06, "loss": 0.0661, "num_tokens": 950649784.0, "step": 7272 }, { "epoch": 2.9022346368715084, "grad_norm": 0.23531262576580048, "learning_rate": 5.1305342172976315e-06, "loss": 0.0621, "num_tokens": 950780856.0, "step": 7273 }, { "epoch": 2.902633679169992, "grad_norm": 0.2433822751045227, "learning_rate": 5.129471827559692e-06, "loss": 0.0726, "num_tokens": 950911928.0, "step": 7274 }, { "epoch": 2.9030327214684757, "grad_norm": 0.2842177152633667, "learning_rate": 5.128413766310419e-06, "loss": 0.0802, "num_tokens": 951043000.0, "step": 7275 }, { "epoch": 2.9034317637669593, "grad_norm": 0.2557470500469208, "learning_rate": 5.127360033754535e-06, "loss": 0.0695, "num_tokens": 951174072.0, "step": 7276 }, { "epoch": 2.903830806065443, "grad_norm": 0.25447404384613037, "learning_rate": 5.126310630095932e-06, "loss": 0.0738, "num_tokens": 951305144.0, "step": 7277 }, { "epoch": 2.9042298483639266, "grad_norm": 0.2538101375102997, "learning_rate": 5.1252655555376546e-06, "loss": 0.0686, "num_tokens": 951436216.0, "step": 7278 }, { "epoch": 2.9046288906624103, "grad_norm": 0.18466448783874512, "learning_rate": 5.124224810281919e-06, "loss": 0.049, "num_tokens": 951567288.0, "step": 7279 }, { "epoch": 2.905027932960894, "grad_norm": 0.2216443568468094, "learning_rate": 5.123188394530098e-06, "loss": 0.0538, "num_tokens": 951698360.0, "step": 7280 }, { "epoch": 2.9054269752593775, "grad_norm": 0.25917285680770874, "learning_rate": 5.12215630848273e-06, "loss": 0.0744, "num_tokens": 951829432.0, "step": 7281 }, { "epoch": 2.905826017557861, "grad_norm": 0.295954167842865, "learning_rate": 5.12112855233951e-06, "loss": 0.0983, "num_tokens": 951960504.0, "step": 7282 }, { "epoch": 2.906225059856345, "grad_norm": 0.23274368047714233, "learning_rate": 5.120105126299303e-06, "loss": 0.0689, "num_tokens": 952091576.0, "step": 7283 }, { "epoch": 2.9066241021548285, "grad_norm": 0.22918246686458588, "learning_rate": 5.119086030560131e-06, "loss": 0.0581, "num_tokens": 952222648.0, "step": 7284 }, { "epoch": 2.907023144453312, "grad_norm": 0.2631692588329315, "learning_rate": 5.118071265319183e-06, "loss": 0.0753, "num_tokens": 952353720.0, "step": 7285 }, { "epoch": 2.9074221867517958, "grad_norm": 0.23263512551784515, "learning_rate": 5.117060830772799e-06, "loss": 0.0643, "num_tokens": 952484792.0, "step": 7286 }, { "epoch": 2.9078212290502794, "grad_norm": 0.247759610414505, "learning_rate": 5.1160547271164965e-06, "loss": 0.0685, "num_tokens": 952615864.0, "step": 7287 }, { "epoch": 2.908220271348763, "grad_norm": 0.2599925696849823, "learning_rate": 5.115052954544941e-06, "loss": 0.0876, "num_tokens": 952746936.0, "step": 7288 }, { "epoch": 2.9086193136472467, "grad_norm": 0.2743052840232849, "learning_rate": 5.114055513251971e-06, "loss": 0.0828, "num_tokens": 952878008.0, "step": 7289 }, { "epoch": 2.9090183559457303, "grad_norm": 0.2661561369895935, "learning_rate": 5.113062403430579e-06, "loss": 0.0698, "num_tokens": 953009080.0, "step": 7290 }, { "epoch": 2.909417398244214, "grad_norm": 0.21693797409534454, "learning_rate": 5.112073625272926e-06, "loss": 0.0628, "num_tokens": 953140152.0, "step": 7291 }, { "epoch": 2.9098164405426976, "grad_norm": 0.21950063109397888, "learning_rate": 5.111089178970327e-06, "loss": 0.0598, "num_tokens": 953271224.0, "step": 7292 }, { "epoch": 2.9102154828411813, "grad_norm": 0.22336043417453766, "learning_rate": 5.110109064713265e-06, "loss": 0.0607, "num_tokens": 953402296.0, "step": 7293 }, { "epoch": 2.910614525139665, "grad_norm": 0.19355955719947815, "learning_rate": 5.1091332826913815e-06, "loss": 0.048, "num_tokens": 953533368.0, "step": 7294 }, { "epoch": 2.9110135674381485, "grad_norm": 0.2458193451166153, "learning_rate": 5.108161833093485e-06, "loss": 0.0664, "num_tokens": 953664440.0, "step": 7295 }, { "epoch": 2.911412609736632, "grad_norm": 0.23437093198299408, "learning_rate": 5.10719471610754e-06, "loss": 0.0649, "num_tokens": 953795512.0, "step": 7296 }, { "epoch": 2.911811652035116, "grad_norm": 0.1956474632024765, "learning_rate": 5.106231931920672e-06, "loss": 0.0504, "num_tokens": 953926584.0, "step": 7297 }, { "epoch": 2.9122106943335995, "grad_norm": 0.24146567285060883, "learning_rate": 5.105273480719175e-06, "loss": 0.0625, "num_tokens": 954057656.0, "step": 7298 }, { "epoch": 2.912609736632083, "grad_norm": 0.22863809764385223, "learning_rate": 5.104319362688499e-06, "loss": 0.0565, "num_tokens": 954188728.0, "step": 7299 }, { "epoch": 2.9130087789305668, "grad_norm": 0.22803939878940582, "learning_rate": 5.103369578013257e-06, "loss": 0.0654, "num_tokens": 954319800.0, "step": 7300 }, { "epoch": 2.9134078212290504, "grad_norm": 0.21363748610019684, "learning_rate": 5.1024241268772205e-06, "loss": 0.0535, "num_tokens": 954450872.0, "step": 7301 }, { "epoch": 2.913806863527534, "grad_norm": 0.25809353590011597, "learning_rate": 5.101483009463329e-06, "loss": 0.0654, "num_tokens": 954581944.0, "step": 7302 }, { "epoch": 2.9142059058260177, "grad_norm": 0.2321193963289261, "learning_rate": 5.1005462259536795e-06, "loss": 0.0688, "num_tokens": 954713016.0, "step": 7303 }, { "epoch": 2.9146049481245013, "grad_norm": 0.28505703806877136, "learning_rate": 5.099613776529529e-06, "loss": 0.0758, "num_tokens": 954844088.0, "step": 7304 }, { "epoch": 2.915003990422985, "grad_norm": 0.21743160486221313, "learning_rate": 5.098685661371298e-06, "loss": 0.0495, "num_tokens": 954975160.0, "step": 7305 }, { "epoch": 2.9154030327214686, "grad_norm": 0.21301040053367615, "learning_rate": 5.09776188065857e-06, "loss": 0.0619, "num_tokens": 955106232.0, "step": 7306 }, { "epoch": 2.9158020750199523, "grad_norm": 0.24618178606033325, "learning_rate": 5.0968424345700875e-06, "loss": 0.0681, "num_tokens": 955237304.0, "step": 7307 }, { "epoch": 2.916201117318436, "grad_norm": 0.2570669949054718, "learning_rate": 5.095927323283755e-06, "loss": 0.0666, "num_tokens": 955368376.0, "step": 7308 }, { "epoch": 2.9166001596169195, "grad_norm": 0.24960127472877502, "learning_rate": 5.0950165469766375e-06, "loss": 0.0689, "num_tokens": 955499448.0, "step": 7309 }, { "epoch": 2.916999201915403, "grad_norm": 0.2718104422092438, "learning_rate": 5.094110105824962e-06, "loss": 0.0706, "num_tokens": 955630520.0, "step": 7310 }, { "epoch": 2.917398244213887, "grad_norm": 0.24822081625461578, "learning_rate": 5.093208000004116e-06, "loss": 0.072, "num_tokens": 955746855.0, "step": 7311 }, { "epoch": 2.9177972865123705, "grad_norm": 0.2360982596874237, "learning_rate": 5.092310229688646e-06, "loss": 0.0617, "num_tokens": 955877927.0, "step": 7312 }, { "epoch": 2.918196328810854, "grad_norm": 0.26366114616394043, "learning_rate": 5.091416795052266e-06, "loss": 0.0679, "num_tokens": 956008999.0, "step": 7313 }, { "epoch": 2.9185953711093378, "grad_norm": 0.29470306634902954, "learning_rate": 5.090527696267848e-06, "loss": 0.0796, "num_tokens": 956140071.0, "step": 7314 }, { "epoch": 2.9189944134078214, "grad_norm": 0.2374231219291687, "learning_rate": 5.08964293350742e-06, "loss": 0.0568, "num_tokens": 956271143.0, "step": 7315 }, { "epoch": 2.919393455706305, "grad_norm": 0.23592981696128845, "learning_rate": 5.088762506942177e-06, "loss": 0.0637, "num_tokens": 956402215.0, "step": 7316 }, { "epoch": 2.9197924980047887, "grad_norm": 0.2540907561779022, "learning_rate": 5.087886416742475e-06, "loss": 0.0657, "num_tokens": 956533287.0, "step": 7317 }, { "epoch": 2.9201915403032723, "grad_norm": 0.2428675889968872, "learning_rate": 5.087014663077828e-06, "loss": 0.0711, "num_tokens": 956664359.0, "step": 7318 }, { "epoch": 2.920590582601756, "grad_norm": 0.208888441324234, "learning_rate": 5.086147246116913e-06, "loss": 0.0676, "num_tokens": 956795431.0, "step": 7319 }, { "epoch": 2.9209896249002396, "grad_norm": 0.24456320703029633, "learning_rate": 5.0852841660275665e-06, "loss": 0.0614, "num_tokens": 956926503.0, "step": 7320 }, { "epoch": 2.9213886671987233, "grad_norm": 0.23053596913814545, "learning_rate": 5.084425422976788e-06, "loss": 0.0611, "num_tokens": 957057575.0, "step": 7321 }, { "epoch": 2.921787709497207, "grad_norm": 0.22405976057052612, "learning_rate": 5.083571017130734e-06, "loss": 0.0666, "num_tokens": 957188647.0, "step": 7322 }, { "epoch": 2.9221867517956905, "grad_norm": 0.2515329122543335, "learning_rate": 5.082720948654723e-06, "loss": 0.0757, "num_tokens": 957319719.0, "step": 7323 }, { "epoch": 2.922585794094174, "grad_norm": 0.23072896897792816, "learning_rate": 5.08187521771324e-06, "loss": 0.0585, "num_tokens": 957450791.0, "step": 7324 }, { "epoch": 2.922984836392658, "grad_norm": 0.22582665085792542, "learning_rate": 5.081033824469922e-06, "loss": 0.0596, "num_tokens": 957581863.0, "step": 7325 }, { "epoch": 2.9233838786911415, "grad_norm": 0.31993407011032104, "learning_rate": 5.080196769087571e-06, "loss": 0.0773, "num_tokens": 957712935.0, "step": 7326 }, { "epoch": 2.923782920989625, "grad_norm": 0.2731166183948517, "learning_rate": 5.079364051728152e-06, "loss": 0.0787, "num_tokens": 957844007.0, "step": 7327 }, { "epoch": 2.9241819632881088, "grad_norm": 0.2475508749485016, "learning_rate": 5.078535672552784e-06, "loss": 0.0587, "num_tokens": 957975079.0, "step": 7328 }, { "epoch": 2.9245810055865924, "grad_norm": 0.23806187510490417, "learning_rate": 5.077711631721756e-06, "loss": 0.0616, "num_tokens": 958106151.0, "step": 7329 }, { "epoch": 2.924980047885076, "grad_norm": 0.22852762043476105, "learning_rate": 5.0768919293945084e-06, "loss": 0.0557, "num_tokens": 958237223.0, "step": 7330 }, { "epoch": 2.9253790901835597, "grad_norm": 0.20922568440437317, "learning_rate": 5.076076565729648e-06, "loss": 0.0502, "num_tokens": 958368295.0, "step": 7331 }, { "epoch": 2.9257781324820433, "grad_norm": 0.22071543335914612, "learning_rate": 5.075265540884938e-06, "loss": 0.0555, "num_tokens": 958499367.0, "step": 7332 }, { "epoch": 2.926177174780527, "grad_norm": 0.22525233030319214, "learning_rate": 5.0744588550173055e-06, "loss": 0.0576, "num_tokens": 958630439.0, "step": 7333 }, { "epoch": 2.9265762170790106, "grad_norm": 0.2528274655342102, "learning_rate": 5.073656508282837e-06, "loss": 0.0723, "num_tokens": 958761511.0, "step": 7334 }, { "epoch": 2.9269752593774943, "grad_norm": 0.2118932008743286, "learning_rate": 5.072858500836775e-06, "loss": 0.061, "num_tokens": 958892583.0, "step": 7335 }, { "epoch": 2.927374301675978, "grad_norm": 0.26827552914619446, "learning_rate": 5.072064832833532e-06, "loss": 0.077, "num_tokens": 959023655.0, "step": 7336 }, { "epoch": 2.9277733439744615, "grad_norm": 0.2483556717634201, "learning_rate": 5.071275504426673e-06, "loss": 0.0714, "num_tokens": 959154727.0, "step": 7337 }, { "epoch": 2.928172386272945, "grad_norm": 0.20475876331329346, "learning_rate": 5.070490515768928e-06, "loss": 0.0558, "num_tokens": 959283434.0, "step": 7338 }, { "epoch": 2.928571428571429, "grad_norm": 0.2553362548351288, "learning_rate": 5.06970986701218e-06, "loss": 0.0672, "num_tokens": 959414506.0, "step": 7339 }, { "epoch": 2.9289704708699125, "grad_norm": 0.23015160858631134, "learning_rate": 5.068933558307483e-06, "loss": 0.0577, "num_tokens": 959545578.0, "step": 7340 }, { "epoch": 2.929369513168396, "grad_norm": 0.2165604531764984, "learning_rate": 5.068161589805041e-06, "loss": 0.0587, "num_tokens": 959676650.0, "step": 7341 }, { "epoch": 2.9297685554668798, "grad_norm": 0.25970256328582764, "learning_rate": 5.067393961654224e-06, "loss": 0.0836, "num_tokens": 959807722.0, "step": 7342 }, { "epoch": 2.9301675977653634, "grad_norm": 0.22001948952674866, "learning_rate": 5.066630674003562e-06, "loss": 0.0649, "num_tokens": 959938794.0, "step": 7343 }, { "epoch": 2.930566640063847, "grad_norm": 0.3009316325187683, "learning_rate": 5.065871727000743e-06, "loss": 0.0778, "num_tokens": 960069866.0, "step": 7344 }, { "epoch": 2.9309656823623307, "grad_norm": 0.2517809271812439, "learning_rate": 5.065117120792619e-06, "loss": 0.0621, "num_tokens": 960200938.0, "step": 7345 }, { "epoch": 2.931364724660814, "grad_norm": 0.21254876255989075, "learning_rate": 5.064366855525193e-06, "loss": 0.0549, "num_tokens": 960332010.0, "step": 7346 }, { "epoch": 2.9317637669592975, "grad_norm": 0.22421063482761383, "learning_rate": 5.06362093134364e-06, "loss": 0.0625, "num_tokens": 960463082.0, "step": 7347 }, { "epoch": 2.932162809257781, "grad_norm": 0.2343800961971283, "learning_rate": 5.062879348392289e-06, "loss": 0.0663, "num_tokens": 960594154.0, "step": 7348 }, { "epoch": 2.932561851556265, "grad_norm": 0.2810558080673218, "learning_rate": 5.062142106814626e-06, "loss": 0.0684, "num_tokens": 960725226.0, "step": 7349 }, { "epoch": 2.9329608938547485, "grad_norm": 0.28122544288635254, "learning_rate": 5.061409206753303e-06, "loss": 0.0685, "num_tokens": 960856298.0, "step": 7350 }, { "epoch": 2.933359936153232, "grad_norm": 0.23905602097511292, "learning_rate": 5.0606806483501285e-06, "loss": 0.0618, "num_tokens": 960987370.0, "step": 7351 }, { "epoch": 2.9337589784517157, "grad_norm": 0.23733286559581757, "learning_rate": 5.0599564317460716e-06, "loss": 0.0699, "num_tokens": 961118442.0, "step": 7352 }, { "epoch": 2.9341580207501994, "grad_norm": 0.22470740973949432, "learning_rate": 5.059236557081261e-06, "loss": 0.058, "num_tokens": 961249514.0, "step": 7353 }, { "epoch": 2.934557063048683, "grad_norm": 0.2544892132282257, "learning_rate": 5.0585210244949865e-06, "loss": 0.0748, "num_tokens": 961380586.0, "step": 7354 }, { "epoch": 2.9349561053471667, "grad_norm": 0.21579158306121826, "learning_rate": 5.0578098341257005e-06, "loss": 0.0592, "num_tokens": 961511658.0, "step": 7355 }, { "epoch": 2.9353551476456503, "grad_norm": 0.28928330540657043, "learning_rate": 5.0571029861110035e-06, "loss": 0.0761, "num_tokens": 961642730.0, "step": 7356 }, { "epoch": 2.935754189944134, "grad_norm": 0.23917318880558014, "learning_rate": 5.056400480587669e-06, "loss": 0.066, "num_tokens": 961773802.0, "step": 7357 }, { "epoch": 2.9361532322426176, "grad_norm": 0.254374235868454, "learning_rate": 5.05570231769163e-06, "loss": 0.0624, "num_tokens": 961890699.0, "step": 7358 }, { "epoch": 2.9365522745411012, "grad_norm": 0.2590443193912506, "learning_rate": 5.055008497557966e-06, "loss": 0.0654, "num_tokens": 962021771.0, "step": 7359 }, { "epoch": 2.936951316839585, "grad_norm": 0.23057562112808228, "learning_rate": 5.054319020320927e-06, "loss": 0.0626, "num_tokens": 962152843.0, "step": 7360 }, { "epoch": 2.9373503591380685, "grad_norm": 0.23916932940483093, "learning_rate": 5.053633886113922e-06, "loss": 0.0647, "num_tokens": 962283915.0, "step": 7361 }, { "epoch": 2.937749401436552, "grad_norm": 0.2200157344341278, "learning_rate": 5.052953095069518e-06, "loss": 0.0545, "num_tokens": 962414987.0, "step": 7362 }, { "epoch": 2.938148443735036, "grad_norm": 0.2654156982898712, "learning_rate": 5.052276647319441e-06, "loss": 0.0678, "num_tokens": 962546059.0, "step": 7363 }, { "epoch": 2.9385474860335195, "grad_norm": 0.2969902455806732, "learning_rate": 5.051604542994579e-06, "loss": 0.0827, "num_tokens": 962677131.0, "step": 7364 }, { "epoch": 2.938946528332003, "grad_norm": 0.22995516657829285, "learning_rate": 5.050936782224978e-06, "loss": 0.0645, "num_tokens": 962808203.0, "step": 7365 }, { "epoch": 2.9393455706304867, "grad_norm": 0.22737736999988556, "learning_rate": 5.0502733651398405e-06, "loss": 0.0604, "num_tokens": 962939275.0, "step": 7366 }, { "epoch": 2.9397446129289704, "grad_norm": 0.2273491770029068, "learning_rate": 5.049614291867533e-06, "loss": 0.0598, "num_tokens": 963070347.0, "step": 7367 }, { "epoch": 2.940143655227454, "grad_norm": 0.21057361364364624, "learning_rate": 5.04895956253558e-06, "loss": 0.05, "num_tokens": 963201419.0, "step": 7368 }, { "epoch": 2.9405426975259377, "grad_norm": 0.26654526591300964, "learning_rate": 5.048309177270666e-06, "loss": 0.0736, "num_tokens": 963332491.0, "step": 7369 }, { "epoch": 2.9409417398244213, "grad_norm": 0.2450336068868637, "learning_rate": 5.047663136198634e-06, "loss": 0.0682, "num_tokens": 963463563.0, "step": 7370 }, { "epoch": 2.941340782122905, "grad_norm": 0.2224462777376175, "learning_rate": 5.047021439444488e-06, "loss": 0.062, "num_tokens": 963594635.0, "step": 7371 }, { "epoch": 2.9417398244213886, "grad_norm": 0.24816037714481354, "learning_rate": 5.046384087132389e-06, "loss": 0.0634, "num_tokens": 963725707.0, "step": 7372 }, { "epoch": 2.9421388667198722, "grad_norm": 0.2542414665222168, "learning_rate": 5.045751079385659e-06, "loss": 0.0666, "num_tokens": 963856779.0, "step": 7373 }, { "epoch": 2.942537909018356, "grad_norm": 0.2637385427951813, "learning_rate": 5.045122416326779e-06, "loss": 0.0661, "num_tokens": 963987851.0, "step": 7374 }, { "epoch": 2.9429369513168395, "grad_norm": 0.24119599163532257, "learning_rate": 5.0444980980773886e-06, "loss": 0.0691, "num_tokens": 964118923.0, "step": 7375 }, { "epoch": 2.943335993615323, "grad_norm": 0.22650454938411713, "learning_rate": 5.04387812475829e-06, "loss": 0.068, "num_tokens": 964249995.0, "step": 7376 }, { "epoch": 2.943735035913807, "grad_norm": 0.2547233998775482, "learning_rate": 5.043262496489443e-06, "loss": 0.0748, "num_tokens": 964381067.0, "step": 7377 }, { "epoch": 2.9441340782122905, "grad_norm": 0.25409528613090515, "learning_rate": 5.042651213389961e-06, "loss": 0.0677, "num_tokens": 964512139.0, "step": 7378 }, { "epoch": 2.944533120510774, "grad_norm": 0.2640683054924011, "learning_rate": 5.0420442755781245e-06, "loss": 0.0736, "num_tokens": 964643211.0, "step": 7379 }, { "epoch": 2.9449321628092577, "grad_norm": 0.2577494978904724, "learning_rate": 5.041441683171372e-06, "loss": 0.0645, "num_tokens": 964774283.0, "step": 7380 }, { "epoch": 2.9453312051077414, "grad_norm": 0.24202293157577515, "learning_rate": 5.040843436286296e-06, "loss": 0.0663, "num_tokens": 964905355.0, "step": 7381 }, { "epoch": 2.945730247406225, "grad_norm": 0.21782414615154266, "learning_rate": 5.040249535038655e-06, "loss": 0.0555, "num_tokens": 965036427.0, "step": 7382 }, { "epoch": 2.9461292897047087, "grad_norm": 0.287676602602005, "learning_rate": 5.039659979543362e-06, "loss": 0.0783, "num_tokens": 965167499.0, "step": 7383 }, { "epoch": 2.9465283320031923, "grad_norm": 0.2475801408290863, "learning_rate": 5.039074769914488e-06, "loss": 0.0725, "num_tokens": 965298571.0, "step": 7384 }, { "epoch": 2.946927374301676, "grad_norm": 0.22944869101047516, "learning_rate": 5.0384939062652685e-06, "loss": 0.0657, "num_tokens": 965429643.0, "step": 7385 }, { "epoch": 2.9473264166001596, "grad_norm": 0.23082703351974487, "learning_rate": 5.037917388708097e-06, "loss": 0.0654, "num_tokens": 965560715.0, "step": 7386 }, { "epoch": 2.9477254588986432, "grad_norm": 0.2647269070148468, "learning_rate": 5.037345217354521e-06, "loss": 0.0704, "num_tokens": 965691787.0, "step": 7387 }, { "epoch": 2.948124501197127, "grad_norm": 0.24096572399139404, "learning_rate": 5.0367773923152495e-06, "loss": 0.0716, "num_tokens": 965822859.0, "step": 7388 }, { "epoch": 2.9485235434956105, "grad_norm": 0.22859235107898712, "learning_rate": 5.036213913700155e-06, "loss": 0.065, "num_tokens": 965953931.0, "step": 7389 }, { "epoch": 2.948922585794094, "grad_norm": 0.2047329992055893, "learning_rate": 5.035654781618263e-06, "loss": 0.0615, "num_tokens": 966085003.0, "step": 7390 }, { "epoch": 2.949321628092578, "grad_norm": 0.2569017708301544, "learning_rate": 5.035099996177761e-06, "loss": 0.0601, "num_tokens": 966216075.0, "step": 7391 }, { "epoch": 2.9497206703910615, "grad_norm": 0.21430155634880066, "learning_rate": 5.034549557485994e-06, "loss": 0.0474, "num_tokens": 966347147.0, "step": 7392 }, { "epoch": 2.950119712689545, "grad_norm": 0.21897347271442413, "learning_rate": 5.034003465649468e-06, "loss": 0.0532, "num_tokens": 966478219.0, "step": 7393 }, { "epoch": 2.9505187549880287, "grad_norm": 0.2626478970050812, "learning_rate": 5.033461720773847e-06, "loss": 0.0713, "num_tokens": 966609291.0, "step": 7394 }, { "epoch": 2.9509177972865124, "grad_norm": 0.22342263162136078, "learning_rate": 5.032924322963949e-06, "loss": 0.0594, "num_tokens": 966740363.0, "step": 7395 }, { "epoch": 2.951316839584996, "grad_norm": 0.24774233996868134, "learning_rate": 5.032391272323764e-06, "loss": 0.063, "num_tokens": 966871435.0, "step": 7396 }, { "epoch": 2.9517158818834797, "grad_norm": 0.2607285976409912, "learning_rate": 5.031862568956423e-06, "loss": 0.0599, "num_tokens": 967002507.0, "step": 7397 }, { "epoch": 2.9521149241819633, "grad_norm": 0.24559533596038818, "learning_rate": 5.031338212964231e-06, "loss": 0.0596, "num_tokens": 967133579.0, "step": 7398 }, { "epoch": 2.952513966480447, "grad_norm": 0.2982461452484131, "learning_rate": 5.030818204448646e-06, "loss": 0.0772, "num_tokens": 967264651.0, "step": 7399 }, { "epoch": 2.9529130087789306, "grad_norm": 0.22761009633541107, "learning_rate": 5.030302543510282e-06, "loss": 0.0641, "num_tokens": 967395723.0, "step": 7400 }, { "epoch": 2.9533120510774142, "grad_norm": 0.2526581585407257, "learning_rate": 5.029791230248915e-06, "loss": 0.0621, "num_tokens": 967526795.0, "step": 7401 }, { "epoch": 2.953711093375898, "grad_norm": 0.2716800570487976, "learning_rate": 5.029284264763484e-06, "loss": 0.0688, "num_tokens": 967657867.0, "step": 7402 }, { "epoch": 2.9541101356743815, "grad_norm": 0.23889930546283722, "learning_rate": 5.0287816471520755e-06, "loss": 0.0601, "num_tokens": 967788939.0, "step": 7403 }, { "epoch": 2.954509177972865, "grad_norm": 0.22163081169128418, "learning_rate": 5.028283377511945e-06, "loss": 0.0586, "num_tokens": 967920011.0, "step": 7404 }, { "epoch": 2.954908220271349, "grad_norm": 0.2527618408203125, "learning_rate": 5.027789455939503e-06, "loss": 0.0671, "num_tokens": 968043978.0, "step": 7405 }, { "epoch": 2.9553072625698324, "grad_norm": 0.23052442073822021, "learning_rate": 5.027299882530316e-06, "loss": 0.0588, "num_tokens": 968175050.0, "step": 7406 }, { "epoch": 2.955706304868316, "grad_norm": 0.23177789151668549, "learning_rate": 5.026814657379114e-06, "loss": 0.0604, "num_tokens": 968306122.0, "step": 7407 }, { "epoch": 2.9561053471667997, "grad_norm": 0.24763637781143188, "learning_rate": 5.026333780579787e-06, "loss": 0.0776, "num_tokens": 968437194.0, "step": 7408 }, { "epoch": 2.9565043894652834, "grad_norm": 0.2404882162809372, "learning_rate": 5.025857252225373e-06, "loss": 0.0574, "num_tokens": 968568266.0, "step": 7409 }, { "epoch": 2.956903431763767, "grad_norm": 0.23705250024795532, "learning_rate": 5.0253850724080796e-06, "loss": 0.056, "num_tokens": 968699338.0, "step": 7410 }, { "epoch": 2.9573024740622507, "grad_norm": 0.22000934183597565, "learning_rate": 5.024917241219271e-06, "loss": 0.0507, "num_tokens": 968830410.0, "step": 7411 }, { "epoch": 2.9577015163607343, "grad_norm": 0.231242835521698, "learning_rate": 5.024453758749465e-06, "loss": 0.0643, "num_tokens": 968961482.0, "step": 7412 }, { "epoch": 2.958100558659218, "grad_norm": 0.2256074845790863, "learning_rate": 5.023994625088342e-06, "loss": 0.0584, "num_tokens": 969092554.0, "step": 7413 }, { "epoch": 2.9584996009577016, "grad_norm": 0.24625606834888458, "learning_rate": 5.023539840324741e-06, "loss": 0.0671, "num_tokens": 969223626.0, "step": 7414 }, { "epoch": 2.9588986432561852, "grad_norm": 0.24512171745300293, "learning_rate": 5.023089404546658e-06, "loss": 0.0724, "num_tokens": 969354698.0, "step": 7415 }, { "epoch": 2.959297685554669, "grad_norm": 0.23285970091819763, "learning_rate": 5.022643317841247e-06, "loss": 0.058, "num_tokens": 969485770.0, "step": 7416 }, { "epoch": 2.9596967278531525, "grad_norm": 0.26133257150650024, "learning_rate": 5.0222015802948236e-06, "loss": 0.0711, "num_tokens": 969616842.0, "step": 7417 }, { "epoch": 2.960095770151636, "grad_norm": 0.23848891258239746, "learning_rate": 5.021764191992859e-06, "loss": 0.0495, "num_tokens": 969747914.0, "step": 7418 }, { "epoch": 2.96049481245012, "grad_norm": 0.23898379504680634, "learning_rate": 5.021331153019984e-06, "loss": 0.0723, "num_tokens": 969878986.0, "step": 7419 }, { "epoch": 2.9608938547486034, "grad_norm": 0.2765321135520935, "learning_rate": 5.020902463459989e-06, "loss": 0.0761, "num_tokens": 970010058.0, "step": 7420 }, { "epoch": 2.961292897047087, "grad_norm": 0.24555136263370514, "learning_rate": 5.020478123395817e-06, "loss": 0.0713, "num_tokens": 970141130.0, "step": 7421 }, { "epoch": 2.9616919393455707, "grad_norm": 0.25788119435310364, "learning_rate": 5.020058132909578e-06, "loss": 0.0688, "num_tokens": 970272202.0, "step": 7422 }, { "epoch": 2.9620909816440544, "grad_norm": 0.20824183523654938, "learning_rate": 5.019642492082534e-06, "loss": 0.0524, "num_tokens": 970403274.0, "step": 7423 }, { "epoch": 2.962490023942538, "grad_norm": 0.26416924595832825, "learning_rate": 5.019231200995111e-06, "loss": 0.0765, "num_tokens": 970534346.0, "step": 7424 }, { "epoch": 2.9628890662410217, "grad_norm": 0.24152499437332153, "learning_rate": 5.0188242597268865e-06, "loss": 0.0598, "num_tokens": 970665418.0, "step": 7425 }, { "epoch": 2.9632881085395053, "grad_norm": 0.24689015746116638, "learning_rate": 5.018421668356597e-06, "loss": 0.0619, "num_tokens": 970796490.0, "step": 7426 }, { "epoch": 2.963687150837989, "grad_norm": 0.21418899297714233, "learning_rate": 5.018023426962148e-06, "loss": 0.05, "num_tokens": 970927562.0, "step": 7427 }, { "epoch": 2.9640861931364726, "grad_norm": 0.24021756649017334, "learning_rate": 5.017629535620592e-06, "loss": 0.0559, "num_tokens": 971058634.0, "step": 7428 }, { "epoch": 2.9644852354349562, "grad_norm": 0.2213914841413498, "learning_rate": 5.017239994408141e-06, "loss": 0.0576, "num_tokens": 971189706.0, "step": 7429 }, { "epoch": 2.96488427773344, "grad_norm": 0.21926064789295197, "learning_rate": 5.016854803400171e-06, "loss": 0.0564, "num_tokens": 971320778.0, "step": 7430 }, { "epoch": 2.9652833200319235, "grad_norm": 0.25519856810569763, "learning_rate": 5.0164739626712105e-06, "loss": 0.0663, "num_tokens": 971451850.0, "step": 7431 }, { "epoch": 2.965682362330407, "grad_norm": 0.24747610092163086, "learning_rate": 5.016097472294948e-06, "loss": 0.0699, "num_tokens": 971582922.0, "step": 7432 }, { "epoch": 2.966081404628891, "grad_norm": 0.25593528151512146, "learning_rate": 5.015725332344236e-06, "loss": 0.0678, "num_tokens": 971702978.0, "step": 7433 }, { "epoch": 2.9664804469273744, "grad_norm": 0.23044684529304504, "learning_rate": 5.0153575428910725e-06, "loss": 0.0629, "num_tokens": 971834050.0, "step": 7434 }, { "epoch": 2.966879489225858, "grad_norm": 0.23214930295944214, "learning_rate": 5.014994104006627e-06, "loss": 0.0647, "num_tokens": 971965122.0, "step": 7435 }, { "epoch": 2.9672785315243417, "grad_norm": 0.262819766998291, "learning_rate": 5.014635015761222e-06, "loss": 0.0592, "num_tokens": 972096194.0, "step": 7436 }, { "epoch": 2.9676775738228254, "grad_norm": 0.24962599575519562, "learning_rate": 5.014280278224332e-06, "loss": 0.0708, "num_tokens": 972227266.0, "step": 7437 }, { "epoch": 2.968076616121309, "grad_norm": 0.24456585943698883, "learning_rate": 5.0139298914646e-06, "loss": 0.0627, "num_tokens": 972358338.0, "step": 7438 }, { "epoch": 2.9684756584197927, "grad_norm": 0.23016907274723053, "learning_rate": 5.013583855549823e-06, "loss": 0.0579, "num_tokens": 972489410.0, "step": 7439 }, { "epoch": 2.968874700718276, "grad_norm": 0.21035078167915344, "learning_rate": 5.013242170546955e-06, "loss": 0.0526, "num_tokens": 972620482.0, "step": 7440 }, { "epoch": 2.9692737430167595, "grad_norm": 0.2541937530040741, "learning_rate": 5.012904836522105e-06, "loss": 0.0797, "num_tokens": 972751554.0, "step": 7441 }, { "epoch": 2.969672785315243, "grad_norm": 0.25653091073036194, "learning_rate": 5.012571853540547e-06, "loss": 0.0742, "num_tokens": 972882626.0, "step": 7442 }, { "epoch": 2.970071827613727, "grad_norm": 0.25525662302970886, "learning_rate": 5.012243221666714e-06, "loss": 0.076, "num_tokens": 973013698.0, "step": 7443 }, { "epoch": 2.9704708699122104, "grad_norm": 0.22963854670524597, "learning_rate": 5.011918940964186e-06, "loss": 0.0582, "num_tokens": 973144770.0, "step": 7444 }, { "epoch": 2.970869912210694, "grad_norm": 0.2366836965084076, "learning_rate": 5.011599011495713e-06, "loss": 0.0553, "num_tokens": 973275842.0, "step": 7445 }, { "epoch": 2.9712689545091777, "grad_norm": 0.20801182091236115, "learning_rate": 5.0112834333231966e-06, "loss": 0.0588, "num_tokens": 973406914.0, "step": 7446 }, { "epoch": 2.9716679968076614, "grad_norm": 0.2544686198234558, "learning_rate": 5.010972206507697e-06, "loss": 0.0749, "num_tokens": 973537986.0, "step": 7447 }, { "epoch": 2.972067039106145, "grad_norm": 0.28580278158187866, "learning_rate": 5.01066533110944e-06, "loss": 0.0822, "num_tokens": 973669058.0, "step": 7448 }, { "epoch": 2.9724660814046286, "grad_norm": 0.2408013939857483, "learning_rate": 5.010362807187794e-06, "loss": 0.0577, "num_tokens": 973786520.0, "step": 7449 }, { "epoch": 2.9728651237031123, "grad_norm": 0.23088328540325165, "learning_rate": 5.010064634801302e-06, "loss": 0.06, "num_tokens": 973917592.0, "step": 7450 }, { "epoch": 2.973264166001596, "grad_norm": 0.2374231517314911, "learning_rate": 5.009770814007654e-06, "loss": 0.0616, "num_tokens": 974048664.0, "step": 7451 }, { "epoch": 2.9736632083000796, "grad_norm": 0.25037840008735657, "learning_rate": 5.009481344863702e-06, "loss": 0.077, "num_tokens": 974179736.0, "step": 7452 }, { "epoch": 2.974062250598563, "grad_norm": 0.24030672013759613, "learning_rate": 5.0091962274254556e-06, "loss": 0.0544, "num_tokens": 974310808.0, "step": 7453 }, { "epoch": 2.974461292897047, "grad_norm": 0.22688890993595123, "learning_rate": 5.0089154617480805e-06, "loss": 0.0601, "num_tokens": 974441880.0, "step": 7454 }, { "epoch": 2.9748603351955305, "grad_norm": 0.2642395496368408, "learning_rate": 5.008639047885906e-06, "loss": 0.0802, "num_tokens": 974572952.0, "step": 7455 }, { "epoch": 2.975259377494014, "grad_norm": 0.23803818225860596, "learning_rate": 5.0083669858924155e-06, "loss": 0.0807, "num_tokens": 974704024.0, "step": 7456 }, { "epoch": 2.975658419792498, "grad_norm": 0.24102851748466492, "learning_rate": 5.008099275820248e-06, "loss": 0.0645, "num_tokens": 974835096.0, "step": 7457 }, { "epoch": 2.9760574620909814, "grad_norm": 0.23522165417671204, "learning_rate": 5.007835917721204e-06, "loss": 0.0627, "num_tokens": 974966168.0, "step": 7458 }, { "epoch": 2.976456504389465, "grad_norm": 0.20479775965213776, "learning_rate": 5.007576911646239e-06, "loss": 0.0466, "num_tokens": 975097240.0, "step": 7459 }, { "epoch": 2.9768555466879487, "grad_norm": 0.2509640157222748, "learning_rate": 5.007322257645473e-06, "loss": 0.0638, "num_tokens": 975228312.0, "step": 7460 }, { "epoch": 2.9772545889864324, "grad_norm": 0.2430078536272049, "learning_rate": 5.007071955768173e-06, "loss": 0.0543, "num_tokens": 975345885.0, "step": 7461 }, { "epoch": 2.977653631284916, "grad_norm": 0.22064107656478882, "learning_rate": 5.006826006062773e-06, "loss": 0.0556, "num_tokens": 975476957.0, "step": 7462 }, { "epoch": 2.9780526735833996, "grad_norm": 0.2563459873199463, "learning_rate": 5.006584408576864e-06, "loss": 0.0635, "num_tokens": 975608029.0, "step": 7463 }, { "epoch": 2.9784517158818833, "grad_norm": 0.2670905590057373, "learning_rate": 5.006347163357192e-06, "loss": 0.0715, "num_tokens": 975739101.0, "step": 7464 }, { "epoch": 2.978850758180367, "grad_norm": 0.19600869715213776, "learning_rate": 5.006114270449658e-06, "loss": 0.0491, "num_tokens": 975870173.0, "step": 7465 }, { "epoch": 2.9792498004788506, "grad_norm": 0.26792800426483154, "learning_rate": 5.005885729899328e-06, "loss": 0.0751, "num_tokens": 976001245.0, "step": 7466 }, { "epoch": 2.979648842777334, "grad_norm": 0.23989790678024292, "learning_rate": 5.005661541750422e-06, "loss": 0.0678, "num_tokens": 976132317.0, "step": 7467 }, { "epoch": 2.980047885075818, "grad_norm": 0.253813773393631, "learning_rate": 5.00544170604632e-06, "loss": 0.0795, "num_tokens": 976263389.0, "step": 7468 }, { "epoch": 2.9804469273743015, "grad_norm": 0.24641427397727966, "learning_rate": 5.005226222829555e-06, "loss": 0.0603, "num_tokens": 976394461.0, "step": 7469 }, { "epoch": 2.980845969672785, "grad_norm": 0.259956955909729, "learning_rate": 5.005015092141823e-06, "loss": 0.0589, "num_tokens": 976525533.0, "step": 7470 }, { "epoch": 2.981245011971269, "grad_norm": 0.22088973224163055, "learning_rate": 5.004808314023974e-06, "loss": 0.0596, "num_tokens": 976656605.0, "step": 7471 }, { "epoch": 2.9816440542697524, "grad_norm": 0.2501309812068939, "learning_rate": 5.004605888516018e-06, "loss": 0.0615, "num_tokens": 976782757.0, "step": 7472 }, { "epoch": 2.982043096568236, "grad_norm": 0.23003895580768585, "learning_rate": 5.004407815657124e-06, "loss": 0.0683, "num_tokens": 976913829.0, "step": 7473 }, { "epoch": 2.9824421388667197, "grad_norm": 0.2385275810956955, "learning_rate": 5.004214095485617e-06, "loss": 0.0724, "num_tokens": 977044901.0, "step": 7474 }, { "epoch": 2.9828411811652034, "grad_norm": 0.2414790838956833, "learning_rate": 5.004024728038978e-06, "loss": 0.0617, "num_tokens": 977175973.0, "step": 7475 }, { "epoch": 2.983240223463687, "grad_norm": 0.26087939739227295, "learning_rate": 5.00383971335385e-06, "loss": 0.0757, "num_tokens": 977307045.0, "step": 7476 }, { "epoch": 2.9836392657621706, "grad_norm": 0.2349415272474289, "learning_rate": 5.003659051466031e-06, "loss": 0.0663, "num_tokens": 977438117.0, "step": 7477 }, { "epoch": 2.9840383080606543, "grad_norm": 0.24725060164928436, "learning_rate": 5.0034827424104755e-06, "loss": 0.0624, "num_tokens": 977569189.0, "step": 7478 }, { "epoch": 2.984437350359138, "grad_norm": 0.2231065183877945, "learning_rate": 5.0033107862213034e-06, "loss": 0.058, "num_tokens": 977700261.0, "step": 7479 }, { "epoch": 2.9848363926576216, "grad_norm": 0.2354707419872284, "learning_rate": 5.003143182931781e-06, "loss": 0.0632, "num_tokens": 977831333.0, "step": 7480 }, { "epoch": 2.985235434956105, "grad_norm": 0.24374766647815704, "learning_rate": 5.002979932574339e-06, "loss": 0.0514, "num_tokens": 977962405.0, "step": 7481 }, { "epoch": 2.985634477254589, "grad_norm": 0.2691224217414856, "learning_rate": 5.002821035180566e-06, "loss": 0.0747, "num_tokens": 978093477.0, "step": 7482 }, { "epoch": 2.9860335195530725, "grad_norm": 0.24978767335414886, "learning_rate": 5.002666490781206e-06, "loss": 0.0641, "num_tokens": 978224549.0, "step": 7483 }, { "epoch": 2.986432561851556, "grad_norm": 0.23648063838481903, "learning_rate": 5.002516299406164e-06, "loss": 0.0621, "num_tokens": 978355621.0, "step": 7484 }, { "epoch": 2.98683160415004, "grad_norm": 0.2263132631778717, "learning_rate": 5.0023704610844975e-06, "loss": 0.0596, "num_tokens": 978486693.0, "step": 7485 }, { "epoch": 2.9872306464485234, "grad_norm": 0.22349877655506134, "learning_rate": 5.002228975844428e-06, "loss": 0.0642, "num_tokens": 978617765.0, "step": 7486 }, { "epoch": 2.987629688747007, "grad_norm": 0.19764064252376556, "learning_rate": 5.002091843713328e-06, "loss": 0.0479, "num_tokens": 978748837.0, "step": 7487 }, { "epoch": 2.9880287310454907, "grad_norm": 0.22837486863136292, "learning_rate": 5.001959064717737e-06, "loss": 0.0661, "num_tokens": 978879909.0, "step": 7488 }, { "epoch": 2.9884277733439744, "grad_norm": 0.28588053584098816, "learning_rate": 5.001830638883341e-06, "loss": 0.0915, "num_tokens": 979010981.0, "step": 7489 }, { "epoch": 2.988826815642458, "grad_norm": 0.2314884215593338, "learning_rate": 5.00170656623499e-06, "loss": 0.0649, "num_tokens": 979142053.0, "step": 7490 }, { "epoch": 2.9892258579409416, "grad_norm": 0.2695004642009735, "learning_rate": 5.0015868467966925e-06, "loss": 0.0717, "num_tokens": 979273125.0, "step": 7491 }, { "epoch": 2.9896249002394253, "grad_norm": 0.2365013211965561, "learning_rate": 5.001471480591613e-06, "loss": 0.0686, "num_tokens": 979404197.0, "step": 7492 }, { "epoch": 2.990023942537909, "grad_norm": 0.26102668046951294, "learning_rate": 5.0013604676420755e-06, "loss": 0.0599, "num_tokens": 979527480.0, "step": 7493 }, { "epoch": 2.9904229848363926, "grad_norm": 0.23678801953792572, "learning_rate": 5.001253807969553e-06, "loss": 0.0641, "num_tokens": 979658552.0, "step": 7494 }, { "epoch": 2.990822027134876, "grad_norm": 0.253866046667099, "learning_rate": 5.001151501594692e-06, "loss": 0.0674, "num_tokens": 979789624.0, "step": 7495 }, { "epoch": 2.99122106943336, "grad_norm": 0.2456185668706894, "learning_rate": 5.001053548537281e-06, "loss": 0.0695, "num_tokens": 979920696.0, "step": 7496 }, { "epoch": 2.9916201117318435, "grad_norm": 0.25525012612342834, "learning_rate": 5.00095994881628e-06, "loss": 0.0743, "num_tokens": 980051768.0, "step": 7497 }, { "epoch": 2.992019154030327, "grad_norm": 0.23447129130363464, "learning_rate": 5.000870702449794e-06, "loss": 0.0618, "num_tokens": 980182840.0, "step": 7498 }, { "epoch": 2.992418196328811, "grad_norm": 0.24641910195350647, "learning_rate": 5.000785809455089e-06, "loss": 0.0685, "num_tokens": 980313912.0, "step": 7499 }, { "epoch": 2.9928172386272944, "grad_norm": 0.20573578774929047, "learning_rate": 5.000705269848597e-06, "loss": 0.0528, "num_tokens": 980444984.0, "step": 7500 }, { "epoch": 2.993216280925778, "grad_norm": 0.24012432992458344, "learning_rate": 5.000629083645901e-06, "loss": 0.0637, "num_tokens": 980576056.0, "step": 7501 }, { "epoch": 2.9936153232242617, "grad_norm": 0.2662348747253418, "learning_rate": 5.00055725086174e-06, "loss": 0.0747, "num_tokens": 980707128.0, "step": 7502 }, { "epoch": 2.9940143655227454, "grad_norm": 0.27035096287727356, "learning_rate": 5.000489771510016e-06, "loss": 0.0702, "num_tokens": 980838200.0, "step": 7503 }, { "epoch": 2.994413407821229, "grad_norm": 0.27072781324386597, "learning_rate": 5.000426645603782e-06, "loss": 0.0741, "num_tokens": 980969272.0, "step": 7504 }, { "epoch": 2.9948124501197126, "grad_norm": 0.25150004029273987, "learning_rate": 5.000367873155251e-06, "loss": 0.0808, "num_tokens": 981100344.0, "step": 7505 }, { "epoch": 2.9952114924181963, "grad_norm": 0.26677966117858887, "learning_rate": 5.000313454175801e-06, "loss": 0.0686, "num_tokens": 981231416.0, "step": 7506 }, { "epoch": 2.99561053471668, "grad_norm": 0.2200930416584015, "learning_rate": 5.000263388675958e-06, "loss": 0.0571, "num_tokens": 981362488.0, "step": 7507 }, { "epoch": 2.9960095770151636, "grad_norm": 0.26524868607521057, "learning_rate": 5.0002176766654055e-06, "loss": 0.0736, "num_tokens": 981493560.0, "step": 7508 }, { "epoch": 2.996408619313647, "grad_norm": 0.22819185256958008, "learning_rate": 5.000176318152996e-06, "loss": 0.0678, "num_tokens": 981624632.0, "step": 7509 }, { "epoch": 2.996807661612131, "grad_norm": 0.2139161080121994, "learning_rate": 5.0001393131467285e-06, "loss": 0.0583, "num_tokens": 981755704.0, "step": 7510 }, { "epoch": 2.9972067039106145, "grad_norm": 0.22668002545833588, "learning_rate": 5.000106661653762e-06, "loss": 0.059, "num_tokens": 981886776.0, "step": 7511 }, { "epoch": 2.997605746209098, "grad_norm": 0.2742205858230591, "learning_rate": 5.000078363680415e-06, "loss": 0.0723, "num_tokens": 982017848.0, "step": 7512 }, { "epoch": 2.998004788507582, "grad_norm": 0.2557747960090637, "learning_rate": 5.000054419232161e-06, "loss": 0.0663, "num_tokens": 982148920.0, "step": 7513 }, { "epoch": 2.9984038308060654, "grad_norm": 0.22821351885795593, "learning_rate": 5.000034828313639e-06, "loss": 0.057, "num_tokens": 982279992.0, "step": 7514 }, { "epoch": 2.998802873104549, "grad_norm": 0.2543848752975464, "learning_rate": 5.000019590928632e-06, "loss": 0.0658, "num_tokens": 982411064.0, "step": 7515 }, { "epoch": 2.9992019154030327, "grad_norm": 0.2511218190193176, "learning_rate": 5.000008707080095e-06, "loss": 0.0681, "num_tokens": 982527792.0, "step": 7516 }, { "epoch": 2.9996009577015164, "grad_norm": 0.2634769678115845, "learning_rate": 5.00000217677013e-06, "loss": 0.0762, "num_tokens": 982658864.0, "step": 7517 }, { "epoch": 3.0, "grad_norm": 0.3487602472305298, "learning_rate": 5e-06, "loss": 0.0585, "num_tokens": 982724400.0, "step": 7518 }, { "epoch": 3.0, "step": 7518, "total_flos": 9.125228890829095e+18, "train_loss": 0.22417377430849506, "train_runtime": 62104.7918, "train_samples_per_second": 0.968, "train_steps_per_second": 0.121 } ], "logging_steps": 1, "max_steps": 7518, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 50, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 9.125228890829095e+18, "train_batch_size": 8, "trial_name": null, "trial_params": null }