| { | |
| "best_global_step": 1008, | |
| "best_metric": 0.38490504026412964, | |
| "best_model_checkpoint": "saves/prefix-tuning/llama-3-8b-instruct/train_wsc_1754652157/checkpoint-1008", | |
| "epoch": 10.0, | |
| "eval_steps": 63, | |
| "global_step": 1250, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.04, | |
| "grad_norm": 2.4841861724853516, | |
| "learning_rate": 1.6000000000000001e-06, | |
| "loss": 15.2081, | |
| "num_input_tokens_seen": 2144, | |
| "step": 5 | |
| }, | |
| { | |
| "epoch": 0.08, | |
| "grad_norm": 2.424849510192871, | |
| "learning_rate": 3.6e-06, | |
| "loss": 15.3148, | |
| "num_input_tokens_seen": 4128, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.12, | |
| "grad_norm": 2.6069109439849854, | |
| "learning_rate": 5.600000000000001e-06, | |
| "loss": 15.3875, | |
| "num_input_tokens_seen": 6240, | |
| "step": 15 | |
| }, | |
| { | |
| "epoch": 0.16, | |
| "grad_norm": 2.666384696960449, | |
| "learning_rate": 7.6e-06, | |
| "loss": 15.1604, | |
| "num_input_tokens_seen": 8096, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.2, | |
| "grad_norm": 2.383948564529419, | |
| "learning_rate": 9.600000000000001e-06, | |
| "loss": 15.358, | |
| "num_input_tokens_seen": 10112, | |
| "step": 25 | |
| }, | |
| { | |
| "epoch": 0.24, | |
| "grad_norm": 2.555549383163452, | |
| "learning_rate": 1.16e-05, | |
| "loss": 14.8947, | |
| "num_input_tokens_seen": 12032, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.28, | |
| "grad_norm": 2.3365485668182373, | |
| "learning_rate": 1.3600000000000002e-05, | |
| "loss": 14.9376, | |
| "num_input_tokens_seen": 13824, | |
| "step": 35 | |
| }, | |
| { | |
| "epoch": 0.32, | |
| "grad_norm": 2.37636137008667, | |
| "learning_rate": 1.56e-05, | |
| "loss": 15.0763, | |
| "num_input_tokens_seen": 15840, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.36, | |
| "grad_norm": 2.863443374633789, | |
| "learning_rate": 1.76e-05, | |
| "loss": 14.7281, | |
| "num_input_tokens_seen": 17920, | |
| "step": 45 | |
| }, | |
| { | |
| "epoch": 0.4, | |
| "grad_norm": 2.697338819503784, | |
| "learning_rate": 1.9600000000000002e-05, | |
| "loss": 14.723, | |
| "num_input_tokens_seen": 19712, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.44, | |
| "grad_norm": 2.416774272918701, | |
| "learning_rate": 2.16e-05, | |
| "loss": 14.5953, | |
| "num_input_tokens_seen": 21952, | |
| "step": 55 | |
| }, | |
| { | |
| "epoch": 0.48, | |
| "grad_norm": 2.4722342491149902, | |
| "learning_rate": 2.36e-05, | |
| "loss": 13.9838, | |
| "num_input_tokens_seen": 24160, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.504, | |
| "eval_loss": 13.871087074279785, | |
| "eval_runtime": 0.8458, | |
| "eval_samples_per_second": 66.213, | |
| "eval_steps_per_second": 16.553, | |
| "num_input_tokens_seen": 25504, | |
| "step": 63 | |
| }, | |
| { | |
| "epoch": 0.52, | |
| "grad_norm": 2.531996250152588, | |
| "learning_rate": 2.5600000000000002e-05, | |
| "loss": 13.8098, | |
| "num_input_tokens_seen": 26112, | |
| "step": 65 | |
| }, | |
| { | |
| "epoch": 0.56, | |
| "grad_norm": 2.4052157402038574, | |
| "learning_rate": 2.7600000000000003e-05, | |
| "loss": 13.7342, | |
| "num_input_tokens_seen": 28064, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 0.6, | |
| "grad_norm": 2.380267381668091, | |
| "learning_rate": 2.96e-05, | |
| "loss": 13.545, | |
| "num_input_tokens_seen": 29824, | |
| "step": 75 | |
| }, | |
| { | |
| "epoch": 0.64, | |
| "grad_norm": 2.2633209228515625, | |
| "learning_rate": 3.16e-05, | |
| "loss": 13.168, | |
| "num_input_tokens_seen": 31904, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 0.68, | |
| "grad_norm": 2.5283195972442627, | |
| "learning_rate": 3.3600000000000004e-05, | |
| "loss": 13.0662, | |
| "num_input_tokens_seen": 33984, | |
| "step": 85 | |
| }, | |
| { | |
| "epoch": 0.72, | |
| "grad_norm": 2.547339916229248, | |
| "learning_rate": 3.56e-05, | |
| "loss": 12.3617, | |
| "num_input_tokens_seen": 35776, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 0.76, | |
| "grad_norm": 2.478236675262451, | |
| "learning_rate": 3.76e-05, | |
| "loss": 12.2163, | |
| "num_input_tokens_seen": 37472, | |
| "step": 95 | |
| }, | |
| { | |
| "epoch": 0.8, | |
| "grad_norm": 2.3510525226593018, | |
| "learning_rate": 3.960000000000001e-05, | |
| "loss": 12.0549, | |
| "num_input_tokens_seen": 39328, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.84, | |
| "grad_norm": 2.5149877071380615, | |
| "learning_rate": 4.16e-05, | |
| "loss": 11.9157, | |
| "num_input_tokens_seen": 41280, | |
| "step": 105 | |
| }, | |
| { | |
| "epoch": 0.88, | |
| "grad_norm": 2.451765537261963, | |
| "learning_rate": 4.36e-05, | |
| "loss": 11.5939, | |
| "num_input_tokens_seen": 43552, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 0.92, | |
| "grad_norm": 2.4347667694091797, | |
| "learning_rate": 4.5600000000000004e-05, | |
| "loss": 10.6461, | |
| "num_input_tokens_seen": 45216, | |
| "step": 115 | |
| }, | |
| { | |
| "epoch": 0.96, | |
| "grad_norm": 2.257194995880127, | |
| "learning_rate": 4.76e-05, | |
| "loss": 10.384, | |
| "num_input_tokens_seen": 47360, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 1.0, | |
| "grad_norm": 2.9155919551849365, | |
| "learning_rate": 4.96e-05, | |
| "loss": 9.9251, | |
| "num_input_tokens_seen": 49376, | |
| "step": 125 | |
| }, | |
| { | |
| "epoch": 1.008, | |
| "eval_loss": 9.737223625183105, | |
| "eval_runtime": 0.8536, | |
| "eval_samples_per_second": 65.608, | |
| "eval_steps_per_second": 16.402, | |
| "num_input_tokens_seen": 49696, | |
| "step": 126 | |
| }, | |
| { | |
| "epoch": 1.04, | |
| "grad_norm": 2.394456386566162, | |
| "learning_rate": 4.9998440375027166e-05, | |
| "loss": 9.5781, | |
| "num_input_tokens_seen": 51200, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 1.08, | |
| "grad_norm": 2.3213348388671875, | |
| "learning_rate": 4.99921047320825e-05, | |
| "loss": 8.9157, | |
| "num_input_tokens_seen": 53216, | |
| "step": 135 | |
| }, | |
| { | |
| "epoch": 1.12, | |
| "grad_norm": 2.2454257011413574, | |
| "learning_rate": 4.998089682880117e-05, | |
| "loss": 8.0389, | |
| "num_input_tokens_seen": 55168, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 1.16, | |
| "grad_norm": 2.323469400405884, | |
| "learning_rate": 4.9964818850186135e-05, | |
| "loss": 8.6052, | |
| "num_input_tokens_seen": 56960, | |
| "step": 145 | |
| }, | |
| { | |
| "epoch": 1.2, | |
| "grad_norm": 2.213332414627075, | |
| "learning_rate": 4.994387393067117e-05, | |
| "loss": 7.8179, | |
| "num_input_tokens_seen": 58880, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 1.24, | |
| "grad_norm": 1.9772940874099731, | |
| "learning_rate": 4.9918066153509834e-05, | |
| "loss": 7.3245, | |
| "num_input_tokens_seen": 60672, | |
| "step": 155 | |
| }, | |
| { | |
| "epoch": 1.28, | |
| "grad_norm": 2.1031250953674316, | |
| "learning_rate": 4.988740054997943e-05, | |
| "loss": 7.6592, | |
| "num_input_tokens_seen": 62848, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 1.32, | |
| "grad_norm": 1.5588639974594116, | |
| "learning_rate": 4.985188309840012e-05, | |
| "loss": 6.5144, | |
| "num_input_tokens_seen": 64448, | |
| "step": 165 | |
| }, | |
| { | |
| "epoch": 1.3599999999999999, | |
| "grad_norm": 1.6640610694885254, | |
| "learning_rate": 4.9811520722969465e-05, | |
| "loss": 6.9579, | |
| "num_input_tokens_seen": 66368, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 1.4, | |
| "grad_norm": 1.4669989347457886, | |
| "learning_rate": 4.976632129241252e-05, | |
| "loss": 5.7934, | |
| "num_input_tokens_seen": 68128, | |
| "step": 175 | |
| }, | |
| { | |
| "epoch": 1.44, | |
| "grad_norm": 1.5640122890472412, | |
| "learning_rate": 4.971629361844785e-05, | |
| "loss": 6.0098, | |
| "num_input_tokens_seen": 70112, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 1.48, | |
| "grad_norm": 1.9767147302627563, | |
| "learning_rate": 4.966144745406961e-05, | |
| "loss": 6.0466, | |
| "num_input_tokens_seen": 72384, | |
| "step": 185 | |
| }, | |
| { | |
| "epoch": 1.512, | |
| "eval_loss": 5.308993816375732, | |
| "eval_runtime": 0.8536, | |
| "eval_samples_per_second": 65.601, | |
| "eval_steps_per_second": 16.4, | |
| "num_input_tokens_seen": 74112, | |
| "step": 189 | |
| }, | |
| { | |
| "epoch": 1.52, | |
| "grad_norm": 1.4375022649765015, | |
| "learning_rate": 4.960179349164621e-05, | |
| "loss": 5.7175, | |
| "num_input_tokens_seen": 74752, | |
| "step": 190 | |
| }, | |
| { | |
| "epoch": 1.56, | |
| "grad_norm": 1.8162568807601929, | |
| "learning_rate": 4.953734336083583e-05, | |
| "loss": 5.0706, | |
| "num_input_tokens_seen": 76640, | |
| "step": 195 | |
| }, | |
| { | |
| "epoch": 1.6, | |
| "grad_norm": 2.0760273933410645, | |
| "learning_rate": 4.946810962631916e-05, | |
| "loss": 5.3219, | |
| "num_input_tokens_seen": 78784, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 1.6400000000000001, | |
| "grad_norm": 1.7913469076156616, | |
| "learning_rate": 4.9394105785349944e-05, | |
| "loss": 4.6854, | |
| "num_input_tokens_seen": 80768, | |
| "step": 205 | |
| }, | |
| { | |
| "epoch": 1.6800000000000002, | |
| "grad_norm": 1.471130609512329, | |
| "learning_rate": 4.9315346265123594e-05, | |
| "loss": 4.0843, | |
| "num_input_tokens_seen": 82848, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 1.72, | |
| "grad_norm": 1.8904441595077515, | |
| "learning_rate": 4.923184641996463e-05, | |
| "loss": 4.0373, | |
| "num_input_tokens_seen": 84768, | |
| "step": 215 | |
| }, | |
| { | |
| "epoch": 1.76, | |
| "grad_norm": 2.13264536857605, | |
| "learning_rate": 4.914362252833332e-05, | |
| "loss": 3.6419, | |
| "num_input_tokens_seen": 86848, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 1.8, | |
| "grad_norm": 1.7601622343063354, | |
| "learning_rate": 4.905069178965215e-05, | |
| "loss": 3.3145, | |
| "num_input_tokens_seen": 88736, | |
| "step": 225 | |
| }, | |
| { | |
| "epoch": 1.8399999999999999, | |
| "grad_norm": 1.8561395406723022, | |
| "learning_rate": 4.8953072320952745e-05, | |
| "loss": 3.4327, | |
| "num_input_tokens_seen": 90848, | |
| "step": 230 | |
| }, | |
| { | |
| "epoch": 1.88, | |
| "grad_norm": 1.9273947477340698, | |
| "learning_rate": 4.885078315334395e-05, | |
| "loss": 3.2526, | |
| "num_input_tokens_seen": 92992, | |
| "step": 235 | |
| }, | |
| { | |
| "epoch": 1.92, | |
| "grad_norm": 2.0955724716186523, | |
| "learning_rate": 4.874384422830167e-05, | |
| "loss": 2.791, | |
| "num_input_tokens_seen": 94880, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 1.96, | |
| "grad_norm": 1.2840852737426758, | |
| "learning_rate": 4.863227639378124e-05, | |
| "loss": 2.2217, | |
| "num_input_tokens_seen": 96704, | |
| "step": 245 | |
| }, | |
| { | |
| "epoch": 2.0, | |
| "grad_norm": 1.7399126291275024, | |
| "learning_rate": 4.851610140015304e-05, | |
| "loss": 1.6788, | |
| "num_input_tokens_seen": 98240, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 2.016, | |
| "eval_loss": 1.980229139328003, | |
| "eval_runtime": 0.8532, | |
| "eval_samples_per_second": 65.632, | |
| "eval_steps_per_second": 16.408, | |
| "num_input_tokens_seen": 99136, | |
| "step": 252 | |
| }, | |
| { | |
| "epoch": 2.04, | |
| "grad_norm": 1.2386776208877563, | |
| "learning_rate": 4.839534189596228e-05, | |
| "loss": 2.0365, | |
| "num_input_tokens_seen": 100224, | |
| "step": 255 | |
| }, | |
| { | |
| "epoch": 2.08, | |
| "grad_norm": 1.1297599077224731, | |
| "learning_rate": 4.8270021423513554e-05, | |
| "loss": 1.4581, | |
| "num_input_tokens_seen": 101920, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 2.12, | |
| "grad_norm": 1.090529203414917, | |
| "learning_rate": 4.8140164414281306e-05, | |
| "loss": 1.2941, | |
| "num_input_tokens_seen": 103808, | |
| "step": 265 | |
| }, | |
| { | |
| "epoch": 2.16, | |
| "grad_norm": 1.0772465467453003, | |
| "learning_rate": 4.800579618414676e-05, | |
| "loss": 1.4016, | |
| "num_input_tokens_seen": 105920, | |
| "step": 270 | |
| }, | |
| { | |
| "epoch": 2.2, | |
| "grad_norm": 1.179438591003418, | |
| "learning_rate": 4.7866942928462625e-05, | |
| "loss": 1.5631, | |
| "num_input_tokens_seen": 108160, | |
| "step": 275 | |
| }, | |
| { | |
| "epoch": 2.24, | |
| "grad_norm": 1.1663882732391357, | |
| "learning_rate": 4.772363171694622e-05, | |
| "loss": 0.856, | |
| "num_input_tokens_seen": 109920, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 2.2800000000000002, | |
| "grad_norm": 1.3647843599319458, | |
| "learning_rate": 4.7575890488402185e-05, | |
| "loss": 1.1216, | |
| "num_input_tokens_seen": 111904, | |
| "step": 285 | |
| }, | |
| { | |
| "epoch": 2.32, | |
| "grad_norm": 1.4680769443511963, | |
| "learning_rate": 4.742374804527575e-05, | |
| "loss": 0.776, | |
| "num_input_tokens_seen": 113632, | |
| "step": 290 | |
| }, | |
| { | |
| "epoch": 2.36, | |
| "grad_norm": 0.9575800895690918, | |
| "learning_rate": 4.7267234048037664e-05, | |
| "loss": 1.0421, | |
| "num_input_tokens_seen": 115616, | |
| "step": 295 | |
| }, | |
| { | |
| "epoch": 2.4, | |
| "grad_norm": 0.7263527512550354, | |
| "learning_rate": 4.710637900940181e-05, | |
| "loss": 0.7994, | |
| "num_input_tokens_seen": 117472, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 2.44, | |
| "grad_norm": 1.1039828062057495, | |
| "learning_rate": 4.694121428837668e-05, | |
| "loss": 1.0609, | |
| "num_input_tokens_seen": 119616, | |
| "step": 305 | |
| }, | |
| { | |
| "epoch": 2.48, | |
| "grad_norm": 0.9919302463531494, | |
| "learning_rate": 4.6771772084151885e-05, | |
| "loss": 0.8039, | |
| "num_input_tokens_seen": 121568, | |
| "step": 310 | |
| }, | |
| { | |
| "epoch": 2.52, | |
| "grad_norm": 1.072923183441162, | |
| "learning_rate": 4.659808542982088e-05, | |
| "loss": 1.0818, | |
| "num_input_tokens_seen": 123904, | |
| "step": 315 | |
| }, | |
| { | |
| "epoch": 2.52, | |
| "eval_loss": 0.7967647314071655, | |
| "eval_runtime": 0.8522, | |
| "eval_samples_per_second": 65.715, | |
| "eval_steps_per_second": 16.429, | |
| "num_input_tokens_seen": 123904, | |
| "step": 315 | |
| }, | |
| { | |
| "epoch": 2.56, | |
| "grad_norm": 0.6072083711624146, | |
| "learning_rate": 4.642018818594107e-05, | |
| "loss": 0.5327, | |
| "num_input_tokens_seen": 125696, | |
| "step": 320 | |
| }, | |
| { | |
| "epoch": 2.6, | |
| "grad_norm": 0.9770075678825378, | |
| "learning_rate": 4.6238115033932636e-05, | |
| "loss": 0.6088, | |
| "num_input_tokens_seen": 127488, | |
| "step": 325 | |
| }, | |
| { | |
| "epoch": 2.64, | |
| "grad_norm": 1.5782830715179443, | |
| "learning_rate": 4.605190146931731e-05, | |
| "loss": 0.7891, | |
| "num_input_tokens_seen": 129632, | |
| "step": 330 | |
| }, | |
| { | |
| "epoch": 2.68, | |
| "grad_norm": 1.686546802520752, | |
| "learning_rate": 4.586158379479848e-05, | |
| "loss": 0.7432, | |
| "num_input_tokens_seen": 131680, | |
| "step": 335 | |
| }, | |
| { | |
| "epoch": 2.7199999999999998, | |
| "grad_norm": 0.8748849630355835, | |
| "learning_rate": 4.566719911318389e-05, | |
| "loss": 0.5091, | |
| "num_input_tokens_seen": 133472, | |
| "step": 340 | |
| }, | |
| { | |
| "epoch": 2.76, | |
| "grad_norm": 0.690437912940979, | |
| "learning_rate": 4.5468785320152365e-05, | |
| "loss": 0.5338, | |
| "num_input_tokens_seen": 135200, | |
| "step": 345 | |
| }, | |
| { | |
| "epoch": 2.8, | |
| "grad_norm": 1.1224329471588135, | |
| "learning_rate": 4.5266381096866e-05, | |
| "loss": 0.9588, | |
| "num_input_tokens_seen": 137536, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 2.84, | |
| "grad_norm": 0.998359203338623, | |
| "learning_rate": 4.5060025902429174e-05, | |
| "loss": 0.7235, | |
| "num_input_tokens_seen": 139744, | |
| "step": 355 | |
| }, | |
| { | |
| "epoch": 2.88, | |
| "grad_norm": 1.1624242067337036, | |
| "learning_rate": 4.484975996619589e-05, | |
| "loss": 0.703, | |
| "num_input_tokens_seen": 141760, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 2.92, | |
| "grad_norm": 1.1226049661636353, | |
| "learning_rate": 4.4635624279927044e-05, | |
| "loss": 0.6324, | |
| "num_input_tokens_seen": 143872, | |
| "step": 365 | |
| }, | |
| { | |
| "epoch": 2.96, | |
| "grad_norm": 0.6500603556632996, | |
| "learning_rate": 4.441766058979898e-05, | |
| "loss": 0.5377, | |
| "num_input_tokens_seen": 145856, | |
| "step": 370 | |
| }, | |
| { | |
| "epoch": 3.0, | |
| "grad_norm": 1.9589425325393677, | |
| "learning_rate": 4.4195911388264946e-05, | |
| "loss": 0.8394, | |
| "num_input_tokens_seen": 147648, | |
| "step": 375 | |
| }, | |
| { | |
| "epoch": 3.024, | |
| "eval_loss": 0.5601035952568054, | |
| "eval_runtime": 0.8544, | |
| "eval_samples_per_second": 65.545, | |
| "eval_steps_per_second": 16.386, | |
| "num_input_tokens_seen": 148736, | |
| "step": 378 | |
| }, | |
| { | |
| "epoch": 3.04, | |
| "grad_norm": 1.4394139051437378, | |
| "learning_rate": 4.3970419905771145e-05, | |
| "loss": 0.4311, | |
| "num_input_tokens_seen": 149472, | |
| "step": 380 | |
| }, | |
| { | |
| "epoch": 3.08, | |
| "grad_norm": 0.8933354020118713, | |
| "learning_rate": 4.374123010232888e-05, | |
| "loss": 0.6238, | |
| "num_input_tokens_seen": 151552, | |
| "step": 385 | |
| }, | |
| { | |
| "epoch": 3.12, | |
| "grad_norm": 0.8065023422241211, | |
| "learning_rate": 4.350838665894446e-05, | |
| "loss": 0.5635, | |
| "num_input_tokens_seen": 153568, | |
| "step": 390 | |
| }, | |
| { | |
| "epoch": 3.16, | |
| "grad_norm": 0.6302071213722229, | |
| "learning_rate": 4.3271934968908514e-05, | |
| "loss": 0.4754, | |
| "num_input_tokens_seen": 155616, | |
| "step": 395 | |
| }, | |
| { | |
| "epoch": 3.2, | |
| "grad_norm": 0.5202668309211731, | |
| "learning_rate": 4.303192112894652e-05, | |
| "loss": 0.6315, | |
| "num_input_tokens_seen": 157728, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 3.24, | |
| "grad_norm": 0.9103642702102661, | |
| "learning_rate": 4.278839193023214e-05, | |
| "loss": 0.4821, | |
| "num_input_tokens_seen": 159488, | |
| "step": 405 | |
| }, | |
| { | |
| "epoch": 3.2800000000000002, | |
| "grad_norm": 0.6605204939842224, | |
| "learning_rate": 4.254139484926519e-05, | |
| "loss": 0.5577, | |
| "num_input_tokens_seen": 161600, | |
| "step": 410 | |
| }, | |
| { | |
| "epoch": 3.32, | |
| "grad_norm": 0.8729458451271057, | |
| "learning_rate": 4.2290978038616e-05, | |
| "loss": 0.5065, | |
| "num_input_tokens_seen": 163712, | |
| "step": 415 | |
| }, | |
| { | |
| "epoch": 3.36, | |
| "grad_norm": 0.7586544752120972, | |
| "learning_rate": 4.2037190317538e-05, | |
| "loss": 0.4285, | |
| "num_input_tokens_seen": 165536, | |
| "step": 420 | |
| }, | |
| { | |
| "epoch": 3.4, | |
| "grad_norm": 1.765929102897644, | |
| "learning_rate": 4.178008116245024e-05, | |
| "loss": 0.5842, | |
| "num_input_tokens_seen": 167872, | |
| "step": 425 | |
| }, | |
| { | |
| "epoch": 3.44, | |
| "grad_norm": 0.9758070111274719, | |
| "learning_rate": 4.1519700697291944e-05, | |
| "loss": 0.6883, | |
| "num_input_tokens_seen": 170112, | |
| "step": 430 | |
| }, | |
| { | |
| "epoch": 3.48, | |
| "grad_norm": 1.6534411907196045, | |
| "learning_rate": 4.125609968375072e-05, | |
| "loss": 0.4686, | |
| "num_input_tokens_seen": 172000, | |
| "step": 435 | |
| }, | |
| { | |
| "epoch": 3.52, | |
| "grad_norm": 0.6654028296470642, | |
| "learning_rate": 4.098932951136645e-05, | |
| "loss": 0.5184, | |
| "num_input_tokens_seen": 174016, | |
| "step": 440 | |
| }, | |
| { | |
| "epoch": 3.528, | |
| "eval_loss": 0.4781542420387268, | |
| "eval_runtime": 0.8555, | |
| "eval_samples_per_second": 65.46, | |
| "eval_steps_per_second": 16.365, | |
| "num_input_tokens_seen": 174432, | |
| "step": 441 | |
| }, | |
| { | |
| "epoch": 3.56, | |
| "grad_norm": 0.7983071208000183, | |
| "learning_rate": 4.071944218751282e-05, | |
| "loss": 0.4466, | |
| "num_input_tokens_seen": 175776, | |
| "step": 445 | |
| }, | |
| { | |
| "epoch": 3.6, | |
| "grad_norm": 0.9768652319908142, | |
| "learning_rate": 4.044649032725836e-05, | |
| "loss": 0.5084, | |
| "num_input_tokens_seen": 177952, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 3.64, | |
| "grad_norm": 1.1632089614868164, | |
| "learning_rate": 4.017052714310906e-05, | |
| "loss": 0.3983, | |
| "num_input_tokens_seen": 179968, | |
| "step": 455 | |
| }, | |
| { | |
| "epoch": 3.68, | |
| "grad_norm": 1.7327042818069458, | |
| "learning_rate": 3.989160643463445e-05, | |
| "loss": 0.4557, | |
| "num_input_tokens_seen": 181952, | |
| "step": 460 | |
| }, | |
| { | |
| "epoch": 3.7199999999999998, | |
| "grad_norm": 1.0546882152557373, | |
| "learning_rate": 3.960978257797931e-05, | |
| "loss": 0.3364, | |
| "num_input_tokens_seen": 183680, | |
| "step": 465 | |
| }, | |
| { | |
| "epoch": 3.76, | |
| "grad_norm": 0.5163419246673584, | |
| "learning_rate": 3.932511051526289e-05, | |
| "loss": 0.4238, | |
| "num_input_tokens_seen": 185632, | |
| "step": 470 | |
| }, | |
| { | |
| "epoch": 3.8, | |
| "grad_norm": 1.192272663116455, | |
| "learning_rate": 3.903764574386786e-05, | |
| "loss": 0.46, | |
| "num_input_tokens_seen": 187552, | |
| "step": 475 | |
| }, | |
| { | |
| "epoch": 3.84, | |
| "grad_norm": 1.180910348892212, | |
| "learning_rate": 3.8747444305621e-05, | |
| "loss": 0.4533, | |
| "num_input_tokens_seen": 189408, | |
| "step": 480 | |
| }, | |
| { | |
| "epoch": 3.88, | |
| "grad_norm": 0.7053818702697754, | |
| "learning_rate": 3.8454562775867684e-05, | |
| "loss": 0.4832, | |
| "num_input_tokens_seen": 191488, | |
| "step": 485 | |
| }, | |
| { | |
| "epoch": 3.92, | |
| "grad_norm": 1.434988260269165, | |
| "learning_rate": 3.8159058252442446e-05, | |
| "loss": 0.4162, | |
| "num_input_tokens_seen": 193312, | |
| "step": 490 | |
| }, | |
| { | |
| "epoch": 3.96, | |
| "grad_norm": 0.8368187546730042, | |
| "learning_rate": 3.786098834453766e-05, | |
| "loss": 0.4683, | |
| "num_input_tokens_seen": 195424, | |
| "step": 495 | |
| }, | |
| { | |
| "epoch": 4.0, | |
| "grad_norm": 1.403336763381958, | |
| "learning_rate": 3.7560411161472456e-05, | |
| "loss": 0.3853, | |
| "num_input_tokens_seen": 197024, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 4.032, | |
| "eval_loss": 0.46134254336357117, | |
| "eval_runtime": 0.8491, | |
| "eval_samples_per_second": 65.955, | |
| "eval_steps_per_second": 16.489, | |
| "num_input_tokens_seen": 198656, | |
| "step": 504 | |
| }, | |
| { | |
| "epoch": 4.04, | |
| "grad_norm": 0.6058657169342041, | |
| "learning_rate": 3.725738530136422e-05, | |
| "loss": 0.5412, | |
| "num_input_tokens_seen": 199040, | |
| "step": 505 | |
| }, | |
| { | |
| "epoch": 4.08, | |
| "grad_norm": 1.2126246690750122, | |
| "learning_rate": 3.695196983970481e-05, | |
| "loss": 0.4867, | |
| "num_input_tokens_seen": 200960, | |
| "step": 510 | |
| }, | |
| { | |
| "epoch": 4.12, | |
| "grad_norm": 0.5591365098953247, | |
| "learning_rate": 3.664422431784361e-05, | |
| "loss": 0.3728, | |
| "num_input_tokens_seen": 203008, | |
| "step": 515 | |
| }, | |
| { | |
| "epoch": 4.16, | |
| "grad_norm": 1.040564775466919, | |
| "learning_rate": 3.633420873137988e-05, | |
| "loss": 0.378, | |
| "num_input_tokens_seen": 204672, | |
| "step": 520 | |
| }, | |
| { | |
| "epoch": 4.2, | |
| "grad_norm": 0.4747677743434906, | |
| "learning_rate": 3.602198351846647e-05, | |
| "loss": 0.4497, | |
| "num_input_tokens_seen": 206784, | |
| "step": 525 | |
| }, | |
| { | |
| "epoch": 4.24, | |
| "grad_norm": 0.8027376532554626, | |
| "learning_rate": 3.570760954802726e-05, | |
| "loss": 0.3772, | |
| "num_input_tokens_seen": 208672, | |
| "step": 530 | |
| }, | |
| { | |
| "epoch": 4.28, | |
| "grad_norm": 0.7639246582984924, | |
| "learning_rate": 3.53911481078907e-05, | |
| "loss": 0.4375, | |
| "num_input_tokens_seen": 210752, | |
| "step": 535 | |
| }, | |
| { | |
| "epoch": 4.32, | |
| "grad_norm": 0.7363210320472717, | |
| "learning_rate": 3.507266089284157e-05, | |
| "loss": 0.5944, | |
| "num_input_tokens_seen": 213472, | |
| "step": 540 | |
| }, | |
| { | |
| "epoch": 4.36, | |
| "grad_norm": 1.636357307434082, | |
| "learning_rate": 3.475220999259349e-05, | |
| "loss": 0.4546, | |
| "num_input_tokens_seen": 215616, | |
| "step": 545 | |
| }, | |
| { | |
| "epoch": 4.4, | |
| "grad_norm": 0.7487366795539856, | |
| "learning_rate": 3.442985787968442e-05, | |
| "loss": 0.4207, | |
| "num_input_tokens_seen": 217664, | |
| "step": 550 | |
| }, | |
| { | |
| "epoch": 4.44, | |
| "grad_norm": 0.5726836323738098, | |
| "learning_rate": 3.410566739729746e-05, | |
| "loss": 0.4204, | |
| "num_input_tokens_seen": 219584, | |
| "step": 555 | |
| }, | |
| { | |
| "epoch": 4.48, | |
| "grad_norm": 0.7676336169242859, | |
| "learning_rate": 3.3779701747009504e-05, | |
| "loss": 0.4381, | |
| "num_input_tokens_seen": 221504, | |
| "step": 560 | |
| }, | |
| { | |
| "epoch": 4.52, | |
| "grad_norm": 0.8204706907272339, | |
| "learning_rate": 3.3452024476469934e-05, | |
| "loss": 0.4549, | |
| "num_input_tokens_seen": 223424, | |
| "step": 565 | |
| }, | |
| { | |
| "epoch": 4.536, | |
| "eval_loss": 0.43884095549583435, | |
| "eval_runtime": 0.8563, | |
| "eval_samples_per_second": 65.4, | |
| "eval_steps_per_second": 16.35, | |
| "num_input_tokens_seen": 224032, | |
| "step": 567 | |
| }, | |
| { | |
| "epoch": 4.5600000000000005, | |
| "grad_norm": 1.2742042541503906, | |
| "learning_rate": 3.312269946701191e-05, | |
| "loss": 0.4388, | |
| "num_input_tokens_seen": 225216, | |
| "step": 570 | |
| }, | |
| { | |
| "epoch": 4.6, | |
| "grad_norm": 1.1421749591827393, | |
| "learning_rate": 3.279179092119855e-05, | |
| "loss": 0.3681, | |
| "num_input_tokens_seen": 227008, | |
| "step": 575 | |
| }, | |
| { | |
| "epoch": 4.64, | |
| "grad_norm": 1.359496831893921, | |
| "learning_rate": 3.245936335030651e-05, | |
| "loss": 0.4424, | |
| "num_input_tokens_seen": 228736, | |
| "step": 580 | |
| }, | |
| { | |
| "epoch": 4.68, | |
| "grad_norm": 0.7598556280136108, | |
| "learning_rate": 3.21254815617494e-05, | |
| "loss": 0.3985, | |
| "num_input_tokens_seen": 230240, | |
| "step": 585 | |
| }, | |
| { | |
| "epoch": 4.72, | |
| "grad_norm": 0.4738464057445526, | |
| "learning_rate": 3.179021064644347e-05, | |
| "loss": 0.428, | |
| "num_input_tokens_seen": 232192, | |
| "step": 590 | |
| }, | |
| { | |
| "epoch": 4.76, | |
| "grad_norm": 0.8662070035934448, | |
| "learning_rate": 3.145361596611795e-05, | |
| "loss": 0.416, | |
| "num_input_tokens_seen": 234368, | |
| "step": 595 | |
| }, | |
| { | |
| "epoch": 4.8, | |
| "grad_norm": 0.7484014630317688, | |
| "learning_rate": 3.111576314057268e-05, | |
| "loss": 0.395, | |
| "num_input_tokens_seen": 236032, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 4.84, | |
| "grad_norm": 0.9487155079841614, | |
| "learning_rate": 3.0776718034885454e-05, | |
| "loss": 0.3691, | |
| "num_input_tokens_seen": 237920, | |
| "step": 605 | |
| }, | |
| { | |
| "epoch": 4.88, | |
| "grad_norm": 0.6988774538040161, | |
| "learning_rate": 3.0436546746571372e-05, | |
| "loss": 0.3724, | |
| "num_input_tokens_seen": 239680, | |
| "step": 610 | |
| }, | |
| { | |
| "epoch": 4.92, | |
| "grad_norm": 0.8400317430496216, | |
| "learning_rate": 3.0095315592697126e-05, | |
| "loss": 0.3814, | |
| "num_input_tokens_seen": 241504, | |
| "step": 615 | |
| }, | |
| { | |
| "epoch": 4.96, | |
| "grad_norm": 0.9846633672714233, | |
| "learning_rate": 2.9753091096952255e-05, | |
| "loss": 0.4676, | |
| "num_input_tokens_seen": 243584, | |
| "step": 620 | |
| }, | |
| { | |
| "epoch": 5.0, | |
| "grad_norm": 0.6307002902030945, | |
| "learning_rate": 2.9409939976680313e-05, | |
| "loss": 0.4232, | |
| "num_input_tokens_seen": 245472, | |
| "step": 625 | |
| }, | |
| { | |
| "epoch": 5.04, | |
| "grad_norm": 1.2824044227600098, | |
| "learning_rate": 2.9065929129872094e-05, | |
| "loss": 0.4193, | |
| "num_input_tokens_seen": 247424, | |
| "step": 630 | |
| }, | |
| { | |
| "epoch": 5.04, | |
| "eval_loss": 0.42154452204704285, | |
| "eval_runtime": 0.8526, | |
| "eval_samples_per_second": 65.682, | |
| "eval_steps_per_second": 16.42, | |
| "num_input_tokens_seen": 247424, | |
| "step": 630 | |
| }, | |
| { | |
| "epoch": 5.08, | |
| "grad_norm": 0.6836050152778625, | |
| "learning_rate": 2.8721125622123806e-05, | |
| "loss": 0.3778, | |
| "num_input_tokens_seen": 249472, | |
| "step": 635 | |
| }, | |
| { | |
| "epoch": 5.12, | |
| "grad_norm": 1.2711609601974487, | |
| "learning_rate": 2.8375596673562482e-05, | |
| "loss": 0.3189, | |
| "num_input_tokens_seen": 251296, | |
| "step": 640 | |
| }, | |
| { | |
| "epoch": 5.16, | |
| "grad_norm": 1.1848688125610352, | |
| "learning_rate": 2.8029409645741267e-05, | |
| "loss": 0.3988, | |
| "num_input_tokens_seen": 253344, | |
| "step": 645 | |
| }, | |
| { | |
| "epoch": 5.2, | |
| "grad_norm": 0.43571653962135315, | |
| "learning_rate": 2.7682632028507167e-05, | |
| "loss": 0.3687, | |
| "num_input_tokens_seen": 255104, | |
| "step": 650 | |
| }, | |
| { | |
| "epoch": 5.24, | |
| "grad_norm": 2.1147029399871826, | |
| "learning_rate": 2.733533142684377e-05, | |
| "loss": 0.3907, | |
| "num_input_tokens_seen": 256832, | |
| "step": 655 | |
| }, | |
| { | |
| "epoch": 5.28, | |
| "grad_norm": 0.5585914850234985, | |
| "learning_rate": 2.6987575547691497e-05, | |
| "loss": 0.4176, | |
| "num_input_tokens_seen": 258720, | |
| "step": 660 | |
| }, | |
| { | |
| "epoch": 5.32, | |
| "grad_norm": 1.013206124305725, | |
| "learning_rate": 2.6639432186748043e-05, | |
| "loss": 0.4127, | |
| "num_input_tokens_seen": 260576, | |
| "step": 665 | |
| }, | |
| { | |
| "epoch": 5.36, | |
| "grad_norm": 0.6060461401939392, | |
| "learning_rate": 2.6290969215251416e-05, | |
| "loss": 0.3817, | |
| "num_input_tokens_seen": 262368, | |
| "step": 670 | |
| }, | |
| { | |
| "epoch": 5.4, | |
| "grad_norm": 0.9579009413719177, | |
| "learning_rate": 2.594225456674837e-05, | |
| "loss": 0.3826, | |
| "num_input_tokens_seen": 264320, | |
| "step": 675 | |
| }, | |
| { | |
| "epoch": 5.44, | |
| "grad_norm": 1.1483041048049927, | |
| "learning_rate": 2.559335622385055e-05, | |
| "loss": 0.3825, | |
| "num_input_tokens_seen": 266304, | |
| "step": 680 | |
| }, | |
| { | |
| "epoch": 5.48, | |
| "grad_norm": 0.8167299628257751, | |
| "learning_rate": 2.524434220498123e-05, | |
| "loss": 0.3944, | |
| "num_input_tokens_seen": 268384, | |
| "step": 685 | |
| }, | |
| { | |
| "epoch": 5.52, | |
| "grad_norm": 0.5977395176887512, | |
| "learning_rate": 2.4895280551114907e-05, | |
| "loss": 0.3691, | |
| "num_input_tokens_seen": 270208, | |
| "step": 690 | |
| }, | |
| { | |
| "epoch": 5.5440000000000005, | |
| "eval_loss": 0.4073176383972168, | |
| "eval_runtime": 0.8504, | |
| "eval_samples_per_second": 65.848, | |
| "eval_steps_per_second": 16.462, | |
| "num_input_tokens_seen": 271232, | |
| "step": 693 | |
| }, | |
| { | |
| "epoch": 5.5600000000000005, | |
| "grad_norm": 1.8195998668670654, | |
| "learning_rate": 2.4546239312512635e-05, | |
| "loss": 0.4443, | |
| "num_input_tokens_seen": 271840, | |
| "step": 695 | |
| }, | |
| { | |
| "epoch": 5.6, | |
| "grad_norm": 1.045985460281372, | |
| "learning_rate": 2.4197286535455464e-05, | |
| "loss": 0.502, | |
| "num_input_tokens_seen": 273888, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 5.64, | |
| "grad_norm": 1.0764570236206055, | |
| "learning_rate": 2.384849024897869e-05, | |
| "loss": 0.4579, | |
| "num_input_tokens_seen": 275904, | |
| "step": 705 | |
| }, | |
| { | |
| "epoch": 5.68, | |
| "grad_norm": 0.2045237421989441, | |
| "learning_rate": 2.349991845160949e-05, | |
| "loss": 0.4459, | |
| "num_input_tokens_seen": 277888, | |
| "step": 710 | |
| }, | |
| { | |
| "epoch": 5.72, | |
| "grad_norm": 1.0971755981445312, | |
| "learning_rate": 2.3151639098110377e-05, | |
| "loss": 0.4206, | |
| "num_input_tokens_seen": 279872, | |
| "step": 715 | |
| }, | |
| { | |
| "epoch": 5.76, | |
| "grad_norm": 0.39166244864463806, | |
| "learning_rate": 2.280372008623142e-05, | |
| "loss": 0.3705, | |
| "num_input_tokens_seen": 281664, | |
| "step": 720 | |
| }, | |
| { | |
| "epoch": 5.8, | |
| "grad_norm": 0.7744928002357483, | |
| "learning_rate": 2.2456229243473345e-05, | |
| "loss": 0.3991, | |
| "num_input_tokens_seen": 283776, | |
| "step": 725 | |
| }, | |
| { | |
| "epoch": 5.84, | |
| "grad_norm": 0.4687577486038208, | |
| "learning_rate": 2.2109234313864465e-05, | |
| "loss": 0.3788, | |
| "num_input_tokens_seen": 285568, | |
| "step": 730 | |
| }, | |
| { | |
| "epoch": 5.88, | |
| "grad_norm": 0.439230352640152, | |
| "learning_rate": 2.176280294475383e-05, | |
| "loss": 0.3795, | |
| "num_input_tokens_seen": 287360, | |
| "step": 735 | |
| }, | |
| { | |
| "epoch": 5.92, | |
| "grad_norm": 0.6617324948310852, | |
| "learning_rate": 2.1417002673623264e-05, | |
| "loss": 0.4557, | |
| "num_input_tokens_seen": 289632, | |
| "step": 740 | |
| }, | |
| { | |
| "epoch": 5.96, | |
| "grad_norm": 0.32485124468803406, | |
| "learning_rate": 2.1071900914920816e-05, | |
| "loss": 0.4056, | |
| "num_input_tokens_seen": 291552, | |
| "step": 745 | |
| }, | |
| { | |
| "epoch": 6.0, | |
| "grad_norm": 1.3294658660888672, | |
| "learning_rate": 2.0727564946918087e-05, | |
| "loss": 0.4079, | |
| "num_input_tokens_seen": 293616, | |
| "step": 750 | |
| }, | |
| { | |
| "epoch": 6.04, | |
| "grad_norm": 0.7842015027999878, | |
| "learning_rate": 2.038406189859433e-05, | |
| "loss": 0.3746, | |
| "num_input_tokens_seen": 295440, | |
| "step": 755 | |
| }, | |
| { | |
| "epoch": 6.048, | |
| "eval_loss": 0.40050727128982544, | |
| "eval_runtime": 0.8608, | |
| "eval_samples_per_second": 65.059, | |
| "eval_steps_per_second": 16.265, | |
| "num_input_tokens_seen": 295728, | |
| "step": 756 | |
| }, | |
| { | |
| "epoch": 6.08, | |
| "grad_norm": 0.7505316734313965, | |
| "learning_rate": 2.004145873654942e-05, | |
| "loss": 0.4006, | |
| "num_input_tokens_seen": 297360, | |
| "step": 760 | |
| }, | |
| { | |
| "epoch": 6.12, | |
| "grad_norm": 0.787607729434967, | |
| "learning_rate": 1.969982225194864e-05, | |
| "loss": 0.3928, | |
| "num_input_tokens_seen": 299312, | |
| "step": 765 | |
| }, | |
| { | |
| "epoch": 6.16, | |
| "grad_norm": 0.8527250289916992, | |
| "learning_rate": 1.9359219047501565e-05, | |
| "loss": 0.3839, | |
| "num_input_tokens_seen": 301488, | |
| "step": 770 | |
| }, | |
| { | |
| "epoch": 6.2, | |
| "grad_norm": 0.8097345232963562, | |
| "learning_rate": 1.9019715524477767e-05, | |
| "loss": 0.4211, | |
| "num_input_tokens_seen": 303696, | |
| "step": 775 | |
| }, | |
| { | |
| "epoch": 6.24, | |
| "grad_norm": 1.303476095199585, | |
| "learning_rate": 1.868137786976177e-05, | |
| "loss": 0.3858, | |
| "num_input_tokens_seen": 305360, | |
| "step": 780 | |
| }, | |
| { | |
| "epoch": 6.28, | |
| "grad_norm": 0.7829755544662476, | |
| "learning_rate": 1.8344272042949724e-05, | |
| "loss": 0.3779, | |
| "num_input_tokens_seen": 307408, | |
| "step": 785 | |
| }, | |
| { | |
| "epoch": 6.32, | |
| "grad_norm": 0.659096360206604, | |
| "learning_rate": 1.800846376349051e-05, | |
| "loss": 0.4222, | |
| "num_input_tokens_seen": 309232, | |
| "step": 790 | |
| }, | |
| { | |
| "epoch": 6.36, | |
| "grad_norm": 0.6200412511825562, | |
| "learning_rate": 1.767401849787357e-05, | |
| "loss": 0.3675, | |
| "num_input_tokens_seen": 311184, | |
| "step": 795 | |
| }, | |
| { | |
| "epoch": 6.4, | |
| "grad_norm": 0.5834780335426331, | |
| "learning_rate": 1.73410014468661e-05, | |
| "loss": 0.3727, | |
| "num_input_tokens_seen": 313072, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 6.44, | |
| "grad_norm": 1.6976076364517212, | |
| "learning_rate": 1.7009477532802054e-05, | |
| "loss": 0.3823, | |
| "num_input_tokens_seen": 314832, | |
| "step": 805 | |
| }, | |
| { | |
| "epoch": 6.48, | |
| "grad_norm": 1.1492793560028076, | |
| "learning_rate": 1.6679511386925337e-05, | |
| "loss": 0.3356, | |
| "num_input_tokens_seen": 316560, | |
| "step": 810 | |
| }, | |
| { | |
| "epoch": 6.52, | |
| "grad_norm": 0.6304459571838379, | |
| "learning_rate": 1.635116733678988e-05, | |
| "loss": 0.427, | |
| "num_input_tokens_seen": 318960, | |
| "step": 815 | |
| }, | |
| { | |
| "epoch": 6.552, | |
| "eval_loss": 0.40054336190223694, | |
| "eval_runtime": 0.8527, | |
| "eval_samples_per_second": 65.674, | |
| "eval_steps_per_second": 16.418, | |
| "num_input_tokens_seen": 320464, | |
| "step": 819 | |
| }, | |
| { | |
| "epoch": 6.5600000000000005, | |
| "grad_norm": 0.825368344783783, | |
| "learning_rate": 1.6024509393718844e-05, | |
| "loss": 0.3747, | |
| "num_input_tokens_seen": 320880, | |
| "step": 820 | |
| }, | |
| { | |
| "epoch": 6.6, | |
| "grad_norm": 0.5931684970855713, | |
| "learning_rate": 1.5699601240325474e-05, | |
| "loss": 0.3936, | |
| "num_input_tokens_seen": 323184, | |
| "step": 825 | |
| }, | |
| { | |
| "epoch": 6.64, | |
| "grad_norm": 0.8659843802452087, | |
| "learning_rate": 1.5376506218098015e-05, | |
| "loss": 0.371, | |
| "num_input_tokens_seen": 325168, | |
| "step": 830 | |
| }, | |
| { | |
| "epoch": 6.68, | |
| "grad_norm": 0.6564781069755554, | |
| "learning_rate": 1.505528731505126e-05, | |
| "loss": 0.3795, | |
| "num_input_tokens_seen": 326992, | |
| "step": 835 | |
| }, | |
| { | |
| "epoch": 6.72, | |
| "grad_norm": 0.9912712574005127, | |
| "learning_rate": 1.4736007153446801e-05, | |
| "loss": 0.4107, | |
| "num_input_tokens_seen": 329104, | |
| "step": 840 | |
| }, | |
| { | |
| "epoch": 6.76, | |
| "grad_norm": 0.7603425979614258, | |
| "learning_rate": 1.4418727977584774e-05, | |
| "loss": 0.3653, | |
| "num_input_tokens_seen": 331088, | |
| "step": 845 | |
| }, | |
| { | |
| "epoch": 6.8, | |
| "grad_norm": 0.5329681038856506, | |
| "learning_rate": 1.4103511641669152e-05, | |
| "loss": 0.3939, | |
| "num_input_tokens_seen": 333008, | |
| "step": 850 | |
| }, | |
| { | |
| "epoch": 6.84, | |
| "grad_norm": 0.5622363686561584, | |
| "learning_rate": 1.3790419597749199e-05, | |
| "loss": 0.3725, | |
| "num_input_tokens_seen": 335024, | |
| "step": 855 | |
| }, | |
| { | |
| "epoch": 6.88, | |
| "grad_norm": 0.9950863718986511, | |
| "learning_rate": 1.3479512883739232e-05, | |
| "loss": 0.4179, | |
| "num_input_tokens_seen": 337104, | |
| "step": 860 | |
| }, | |
| { | |
| "epoch": 6.92, | |
| "grad_norm": 0.7129436135292053, | |
| "learning_rate": 1.3170852111519175e-05, | |
| "loss": 0.3773, | |
| "num_input_tokens_seen": 338960, | |
| "step": 865 | |
| }, | |
| { | |
| "epoch": 6.96, | |
| "grad_norm": 1.585489273071289, | |
| "learning_rate": 1.2864497455118152e-05, | |
| "loss": 0.3702, | |
| "num_input_tokens_seen": 340848, | |
| "step": 870 | |
| }, | |
| { | |
| "epoch": 7.0, | |
| "grad_norm": 1.2564765214920044, | |
| "learning_rate": 1.2560508638983437e-05, | |
| "loss": 0.3594, | |
| "num_input_tokens_seen": 343040, | |
| "step": 875 | |
| }, | |
| { | |
| "epoch": 7.04, | |
| "grad_norm": 0.6992194056510925, | |
| "learning_rate": 1.2258944926337057e-05, | |
| "loss": 0.3347, | |
| "num_input_tokens_seen": 345056, | |
| "step": 880 | |
| }, | |
| { | |
| "epoch": 7.056, | |
| "eval_loss": 0.41065576672554016, | |
| "eval_runtime": 0.8558, | |
| "eval_samples_per_second": 65.435, | |
| "eval_steps_per_second": 16.359, | |
| "num_input_tokens_seen": 345856, | |
| "step": 882 | |
| }, | |
| { | |
| "epoch": 7.08, | |
| "grad_norm": 0.7301174402236938, | |
| "learning_rate": 1.1959865107622307e-05, | |
| "loss": 0.3597, | |
| "num_input_tokens_seen": 347232, | |
| "step": 885 | |
| }, | |
| { | |
| "epoch": 7.12, | |
| "grad_norm": 1.776132345199585, | |
| "learning_rate": 1.1663327489042435e-05, | |
| "loss": 0.4383, | |
| "num_input_tokens_seen": 349504, | |
| "step": 890 | |
| }, | |
| { | |
| "epoch": 7.16, | |
| "grad_norm": 0.7216264605522156, | |
| "learning_rate": 1.1369389881193749e-05, | |
| "loss": 0.4164, | |
| "num_input_tokens_seen": 351296, | |
| "step": 895 | |
| }, | |
| { | |
| "epoch": 7.2, | |
| "grad_norm": 1.4257539510726929, | |
| "learning_rate": 1.107810958779531e-05, | |
| "loss": 0.3858, | |
| "num_input_tokens_seen": 353248, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 7.24, | |
| "grad_norm": 1.8053635358810425, | |
| "learning_rate": 1.0789543394517435e-05, | |
| "loss": 0.4069, | |
| "num_input_tokens_seen": 355232, | |
| "step": 905 | |
| }, | |
| { | |
| "epoch": 7.28, | |
| "grad_norm": 0.6325796246528625, | |
| "learning_rate": 1.050374755791127e-05, | |
| "loss": 0.3591, | |
| "num_input_tokens_seen": 357376, | |
| "step": 910 | |
| }, | |
| { | |
| "epoch": 7.32, | |
| "grad_norm": 1.1207585334777832, | |
| "learning_rate": 1.022077779444145e-05, | |
| "loss": 0.3539, | |
| "num_input_tokens_seen": 359232, | |
| "step": 915 | |
| }, | |
| { | |
| "epoch": 7.36, | |
| "grad_norm": 1.4070286750793457, | |
| "learning_rate": 9.94068926962404e-06, | |
| "loss": 0.3527, | |
| "num_input_tokens_seen": 361024, | |
| "step": 920 | |
| }, | |
| { | |
| "epoch": 7.4, | |
| "grad_norm": 0.627964973449707, | |
| "learning_rate": 9.663536587271902e-06, | |
| "loss": 0.376, | |
| "num_input_tokens_seen": 363200, | |
| "step": 925 | |
| }, | |
| { | |
| "epoch": 7.44, | |
| "grad_norm": 0.6566188335418701, | |
| "learning_rate": 9.389373778849612e-06, | |
| "loss": 0.3517, | |
| "num_input_tokens_seen": 365152, | |
| "step": 930 | |
| }, | |
| { | |
| "epoch": 7.48, | |
| "grad_norm": 0.6392218470573425, | |
| "learning_rate": 9.11825429293989e-06, | |
| "loss": 0.4175, | |
| "num_input_tokens_seen": 367328, | |
| "step": 935 | |
| }, | |
| { | |
| "epoch": 7.52, | |
| "grad_norm": 0.5262883901596069, | |
| "learning_rate": 8.850230984823735e-06, | |
| "loss": 0.358, | |
| "num_input_tokens_seen": 369248, | |
| "step": 940 | |
| }, | |
| { | |
| "epoch": 7.5600000000000005, | |
| "grad_norm": 0.9610680937767029, | |
| "learning_rate": 8.585356106176094e-06, | |
| "loss": 0.331, | |
| "num_input_tokens_seen": 371040, | |
| "step": 945 | |
| }, | |
| { | |
| "epoch": 7.5600000000000005, | |
| "eval_loss": 0.4088888466358185, | |
| "eval_runtime": 0.8481, | |
| "eval_samples_per_second": 66.028, | |
| "eval_steps_per_second": 16.507, | |
| "num_input_tokens_seen": 371040, | |
| "step": 945 | |
| }, | |
| { | |
| "epoch": 7.6, | |
| "grad_norm": 0.7290608882904053, | |
| "learning_rate": 8.323681294879394e-06, | |
| "loss": 0.4139, | |
| "num_input_tokens_seen": 372928, | |
| "step": 950 | |
| }, | |
| { | |
| "epoch": 7.64, | |
| "grad_norm": 0.532922089099884, | |
| "learning_rate": 8.06525756495657e-06, | |
| "loss": 0.4047, | |
| "num_input_tokens_seen": 374816, | |
| "step": 955 | |
| }, | |
| { | |
| "epoch": 7.68, | |
| "grad_norm": 0.77403724193573, | |
| "learning_rate": 7.810135296625818e-06, | |
| "loss": 0.3748, | |
| "num_input_tokens_seen": 376704, | |
| "step": 960 | |
| }, | |
| { | |
| "epoch": 7.72, | |
| "grad_norm": 0.5218887329101562, | |
| "learning_rate": 7.558364226478842e-06, | |
| "loss": 0.3622, | |
| "num_input_tokens_seen": 378624, | |
| "step": 965 | |
| }, | |
| { | |
| "epoch": 7.76, | |
| "grad_norm": 0.5533618927001953, | |
| "learning_rate": 7.309993437784624e-06, | |
| "loss": 0.3991, | |
| "num_input_tokens_seen": 380480, | |
| "step": 970 | |
| }, | |
| { | |
| "epoch": 7.8, | |
| "grad_norm": 0.6465098261833191, | |
| "learning_rate": 7.065071350920538e-06, | |
| "loss": 0.3731, | |
| "num_input_tokens_seen": 382144, | |
| "step": 975 | |
| }, | |
| { | |
| "epoch": 7.84, | |
| "grad_norm": 0.6888776421546936, | |
| "learning_rate": 6.823645713932708e-06, | |
| "loss": 0.4033, | |
| "num_input_tokens_seen": 384320, | |
| "step": 980 | |
| }, | |
| { | |
| "epoch": 7.88, | |
| "grad_norm": 0.6575106382369995, | |
| "learning_rate": 6.58576359322742e-06, | |
| "loss": 0.4272, | |
| "num_input_tokens_seen": 386656, | |
| "step": 985 | |
| }, | |
| { | |
| "epoch": 7.92, | |
| "grad_norm": 0.4115797281265259, | |
| "learning_rate": 6.3514713643954475e-06, | |
| "loss": 0.3612, | |
| "num_input_tokens_seen": 388672, | |
| "step": 990 | |
| }, | |
| { | |
| "epoch": 7.96, | |
| "grad_norm": 1.37602698802948, | |
| "learning_rate": 6.120814703171024e-06, | |
| "loss": 0.3805, | |
| "num_input_tokens_seen": 390400, | |
| "step": 995 | |
| }, | |
| { | |
| "epoch": 8.0, | |
| "grad_norm": 0.3773898780345917, | |
| "learning_rate": 5.893838576527275e-06, | |
| "loss": 0.3604, | |
| "num_input_tokens_seen": 392080, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 8.04, | |
| "grad_norm": 0.7060356140136719, | |
| "learning_rate": 5.6705872339098186e-06, | |
| "loss": 0.4144, | |
| "num_input_tokens_seen": 394160, | |
| "step": 1005 | |
| }, | |
| { | |
| "epoch": 8.064, | |
| "eval_loss": 0.38490504026412964, | |
| "eval_runtime": 0.8524, | |
| "eval_samples_per_second": 65.693, | |
| "eval_steps_per_second": 16.423, | |
| "num_input_tokens_seen": 395216, | |
| "step": 1008 | |
| }, | |
| { | |
| "epoch": 8.08, | |
| "grad_norm": 0.7593961954116821, | |
| "learning_rate": 5.451104198610249e-06, | |
| "loss": 0.3568, | |
| "num_input_tokens_seen": 395888, | |
| "step": 1010 | |
| }, | |
| { | |
| "epoch": 8.12, | |
| "grad_norm": 1.2459512948989868, | |
| "learning_rate": 5.235432259281175e-06, | |
| "loss": 0.3882, | |
| "num_input_tokens_seen": 398032, | |
| "step": 1015 | |
| }, | |
| { | |
| "epoch": 8.16, | |
| "grad_norm": 1.0449175834655762, | |
| "learning_rate": 5.023613461594512e-06, | |
| "loss": 0.3798, | |
| "num_input_tokens_seen": 399856, | |
| "step": 1020 | |
| }, | |
| { | |
| "epoch": 8.2, | |
| "grad_norm": 0.7643663287162781, | |
| "learning_rate": 4.8156891000445406e-06, | |
| "loss": 0.3718, | |
| "num_input_tokens_seen": 401616, | |
| "step": 1025 | |
| }, | |
| { | |
| "epoch": 8.24, | |
| "grad_norm": 0.7826510071754456, | |
| "learning_rate": 4.6116997098975465e-06, | |
| "loss": 0.3458, | |
| "num_input_tokens_seen": 403568, | |
| "step": 1030 | |
| }, | |
| { | |
| "epoch": 8.28, | |
| "grad_norm": 0.6359825134277344, | |
| "learning_rate": 4.411685059289314e-06, | |
| "loss": 0.3725, | |
| "num_input_tokens_seen": 405712, | |
| "step": 1035 | |
| }, | |
| { | |
| "epoch": 8.32, | |
| "grad_norm": 0.4394562244415283, | |
| "learning_rate": 4.215684141472292e-06, | |
| "loss": 0.3745, | |
| "num_input_tokens_seen": 407888, | |
| "step": 1040 | |
| }, | |
| { | |
| "epoch": 8.36, | |
| "grad_norm": 0.6596431732177734, | |
| "learning_rate": 4.023735167213752e-06, | |
| "loss": 0.3587, | |
| "num_input_tokens_seen": 409712, | |
| "step": 1045 | |
| }, | |
| { | |
| "epoch": 8.4, | |
| "grad_norm": 0.9594627618789673, | |
| "learning_rate": 3.835875557346552e-06, | |
| "loss": 0.3633, | |
| "num_input_tokens_seen": 411504, | |
| "step": 1050 | |
| }, | |
| { | |
| "epoch": 8.44, | |
| "grad_norm": 0.5970214009284973, | |
| "learning_rate": 3.6521419354738738e-06, | |
| "loss": 0.368, | |
| "num_input_tokens_seen": 413168, | |
| "step": 1055 | |
| }, | |
| { | |
| "epoch": 8.48, | |
| "grad_norm": 0.4645940959453583, | |
| "learning_rate": 3.4725701208293435e-06, | |
| "loss": 0.3561, | |
| "num_input_tokens_seen": 414960, | |
| "step": 1060 | |
| }, | |
| { | |
| "epoch": 8.52, | |
| "grad_norm": 0.6897419691085815, | |
| "learning_rate": 3.297195121294022e-06, | |
| "loss": 0.3644, | |
| "num_input_tokens_seen": 416880, | |
| "step": 1065 | |
| }, | |
| { | |
| "epoch": 8.56, | |
| "grad_norm": 0.8192169666290283, | |
| "learning_rate": 3.126051126571561e-06, | |
| "loss": 0.3779, | |
| "num_input_tokens_seen": 418768, | |
| "step": 1070 | |
| }, | |
| { | |
| "epoch": 8.568, | |
| "eval_loss": 0.38693496584892273, | |
| "eval_runtime": 0.8505, | |
| "eval_samples_per_second": 65.843, | |
| "eval_steps_per_second": 16.461, | |
| "num_input_tokens_seen": 419184, | |
| "step": 1071 | |
| }, | |
| { | |
| "epoch": 8.6, | |
| "grad_norm": 0.9851546287536621, | |
| "learning_rate": 2.9591715015228284e-06, | |
| "loss": 0.4053, | |
| "num_input_tokens_seen": 421008, | |
| "step": 1075 | |
| }, | |
| { | |
| "epoch": 8.64, | |
| "grad_norm": 0.5261719226837158, | |
| "learning_rate": 2.7965887796613884e-06, | |
| "loss": 0.3589, | |
| "num_input_tokens_seen": 422864, | |
| "step": 1080 | |
| }, | |
| { | |
| "epoch": 8.68, | |
| "grad_norm": 0.7083816528320312, | |
| "learning_rate": 2.6383346568110062e-06, | |
| "loss": 0.3792, | |
| "num_input_tokens_seen": 424976, | |
| "step": 1085 | |
| }, | |
| { | |
| "epoch": 8.72, | |
| "grad_norm": 0.8124401569366455, | |
| "learning_rate": 2.4844399849264928e-06, | |
| "loss": 0.3899, | |
| "num_input_tokens_seen": 427120, | |
| "step": 1090 | |
| }, | |
| { | |
| "epoch": 8.76, | |
| "grad_norm": 0.7263258695602417, | |
| "learning_rate": 2.3349347660790582e-06, | |
| "loss": 0.4017, | |
| "num_input_tokens_seen": 429264, | |
| "step": 1095 | |
| }, | |
| { | |
| "epoch": 8.8, | |
| "grad_norm": 0.6860739588737488, | |
| "learning_rate": 2.189848146607348e-06, | |
| "loss": 0.3718, | |
| "num_input_tokens_seen": 431056, | |
| "step": 1100 | |
| }, | |
| { | |
| "epoch": 8.84, | |
| "grad_norm": 0.49945640563964844, | |
| "learning_rate": 2.0492084114352965e-06, | |
| "loss": 0.3644, | |
| "num_input_tokens_seen": 433136, | |
| "step": 1105 | |
| }, | |
| { | |
| "epoch": 8.88, | |
| "grad_norm": 0.8145077228546143, | |
| "learning_rate": 1.913042978557944e-06, | |
| "loss": 0.3859, | |
| "num_input_tokens_seen": 435216, | |
| "step": 1110 | |
| }, | |
| { | |
| "epoch": 8.92, | |
| "grad_norm": 1.1362378597259521, | |
| "learning_rate": 1.7813783936962258e-06, | |
| "loss": 0.3521, | |
| "num_input_tokens_seen": 437040, | |
| "step": 1115 | |
| }, | |
| { | |
| "epoch": 8.96, | |
| "grad_norm": 1.012280821800232, | |
| "learning_rate": 1.654240325121831e-06, | |
| "loss": 0.3873, | |
| "num_input_tokens_seen": 439088, | |
| "step": 1120 | |
| }, | |
| { | |
| "epoch": 9.0, | |
| "grad_norm": 1.0121548175811768, | |
| "learning_rate": 1.5316535586531483e-06, | |
| "loss": 0.4015, | |
| "num_input_tokens_seen": 440848, | |
| "step": 1125 | |
| }, | |
| { | |
| "epoch": 9.04, | |
| "grad_norm": 0.9955313205718994, | |
| "learning_rate": 1.4136419928231892e-06, | |
| "loss": 0.3714, | |
| "num_input_tokens_seen": 442864, | |
| "step": 1130 | |
| }, | |
| { | |
| "epoch": 9.072, | |
| "eval_loss": 0.38989585638046265, | |
| "eval_runtime": 0.8537, | |
| "eval_samples_per_second": 65.599, | |
| "eval_steps_per_second": 16.4, | |
| "num_input_tokens_seen": 444560, | |
| "step": 1134 | |
| }, | |
| { | |
| "epoch": 9.08, | |
| "grad_norm": 0.6308397054672241, | |
| "learning_rate": 1.3002286342205462e-06, | |
| "loss": 0.3985, | |
| "num_input_tokens_seen": 445040, | |
| "step": 1135 | |
| }, | |
| { | |
| "epoch": 9.12, | |
| "grad_norm": 0.438340961933136, | |
| "learning_rate": 1.1914355930041837e-06, | |
| "loss": 0.3583, | |
| "num_input_tokens_seen": 446864, | |
| "step": 1140 | |
| }, | |
| { | |
| "epoch": 9.16, | |
| "grad_norm": 0.80867600440979, | |
| "learning_rate": 1.087284078593051e-06, | |
| "loss": 0.3824, | |
| "num_input_tokens_seen": 449008, | |
| "step": 1145 | |
| }, | |
| { | |
| "epoch": 9.2, | |
| "grad_norm": 1.2045706510543823, | |
| "learning_rate": 9.877943955312552e-07, | |
| "loss": 0.3471, | |
| "num_input_tokens_seen": 450832, | |
| "step": 1150 | |
| }, | |
| { | |
| "epoch": 9.24, | |
| "grad_norm": 1.406896948814392, | |
| "learning_rate": 8.929859395296364e-07, | |
| "loss": 0.3737, | |
| "num_input_tokens_seen": 452784, | |
| "step": 1155 | |
| }, | |
| { | |
| "epoch": 9.28, | |
| "grad_norm": 0.849940836429596, | |
| "learning_rate": 8.028771936845342e-07, | |
| "loss": 0.3719, | |
| "num_input_tokens_seen": 454992, | |
| "step": 1160 | |
| }, | |
| { | |
| "epoch": 9.32, | |
| "grad_norm": 0.889966607093811, | |
| "learning_rate": 7.174857248745004e-07, | |
| "loss": 0.3571, | |
| "num_input_tokens_seen": 456528, | |
| "step": 1165 | |
| }, | |
| { | |
| "epoch": 9.36, | |
| "grad_norm": 1.1385633945465088, | |
| "learning_rate": 6.368281803355691e-07, | |
| "loss": 0.3921, | |
| "num_input_tokens_seen": 458768, | |
| "step": 1170 | |
| }, | |
| { | |
| "epoch": 9.4, | |
| "grad_norm": 0.6559231877326965, | |
| "learning_rate": 5.609202844158723e-07, | |
| "loss": 0.3935, | |
| "num_input_tokens_seen": 460816, | |
| "step": 1175 | |
| }, | |
| { | |
| "epoch": 9.44, | |
| "grad_norm": 0.8425350785255432, | |
| "learning_rate": 4.897768355101084e-07, | |
| "loss": 0.3657, | |
| "num_input_tokens_seen": 462384, | |
| "step": 1180 | |
| }, | |
| { | |
| "epoch": 9.48, | |
| "grad_norm": 0.9185330271720886, | |
| "learning_rate": 4.234117031746143e-07, | |
| "loss": 0.3923, | |
| "num_input_tokens_seen": 464304, | |
| "step": 1185 | |
| }, | |
| { | |
| "epoch": 9.52, | |
| "grad_norm": 0.5886545777320862, | |
| "learning_rate": 3.6183782542343057e-07, | |
| "loss": 0.3341, | |
| "num_input_tokens_seen": 466384, | |
| "step": 1190 | |
| }, | |
| { | |
| "epoch": 9.56, | |
| "grad_norm": 0.7718898057937622, | |
| "learning_rate": 3.050672062060278e-07, | |
| "loss": 0.3858, | |
| "num_input_tokens_seen": 468368, | |
| "step": 1195 | |
| }, | |
| { | |
| "epoch": 9.576, | |
| "eval_loss": 0.3861676752567291, | |
| "eval_runtime": 0.8505, | |
| "eval_samples_per_second": 65.845, | |
| "eval_steps_per_second": 16.461, | |
| "num_input_tokens_seen": 469104, | |
| "step": 1197 | |
| }, | |
| { | |
| "epoch": 9.6, | |
| "grad_norm": 1.3115769624710083, | |
| "learning_rate": 2.531109130671061e-07, | |
| "loss": 0.3761, | |
| "num_input_tokens_seen": 470192, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 9.64, | |
| "grad_norm": 0.625157356262207, | |
| "learning_rate": 2.0597907498896007e-07, | |
| "loss": 0.3878, | |
| "num_input_tokens_seen": 472240, | |
| "step": 1205 | |
| }, | |
| { | |
| "epoch": 9.68, | |
| "grad_norm": 1.4318360090255737, | |
| "learning_rate": 1.6368088041681108e-07, | |
| "loss": 0.3748, | |
| "num_input_tokens_seen": 474160, | |
| "step": 1210 | |
| }, | |
| { | |
| "epoch": 9.72, | |
| "grad_norm": 0.62953782081604, | |
| "learning_rate": 1.2622457546749567e-07, | |
| "loss": 0.4067, | |
| "num_input_tokens_seen": 476784, | |
| "step": 1215 | |
| }, | |
| { | |
| "epoch": 9.76, | |
| "grad_norm": 1.3049143552780151, | |
| "learning_rate": 9.361746232188495e-08, | |
| "loss": 0.3679, | |
| "num_input_tokens_seen": 478864, | |
| "step": 1220 | |
| }, | |
| { | |
| "epoch": 9.8, | |
| "grad_norm": 0.5870345234870911, | |
| "learning_rate": 6.586589780128716e-08, | |
| "loss": 0.3603, | |
| "num_input_tokens_seen": 480816, | |
| "step": 1225 | |
| }, | |
| { | |
| "epoch": 9.84, | |
| "grad_norm": 0.5798351764678955, | |
| "learning_rate": 4.2975292128200064e-08, | |
| "loss": 0.384, | |
| "num_input_tokens_seen": 482832, | |
| "step": 1230 | |
| }, | |
| { | |
| "epoch": 9.88, | |
| "grad_norm": 0.6793395280838013, | |
| "learning_rate": 2.4950107871549167e-08, | |
| "loss": 0.3827, | |
| "num_input_tokens_seen": 484912, | |
| "step": 1235 | |
| }, | |
| { | |
| "epoch": 9.92, | |
| "grad_norm": 0.9757604002952576, | |
| "learning_rate": 1.179385907672248e-08, | |
| "loss": 0.3863, | |
| "num_input_tokens_seen": 486640, | |
| "step": 1240 | |
| }, | |
| { | |
| "epoch": 9.96, | |
| "grad_norm": 0.7342190742492676, | |
| "learning_rate": 3.5091105804907487e-09, | |
| "loss": 0.3537, | |
| "num_input_tokens_seen": 488336, | |
| "step": 1245 | |
| }, | |
| { | |
| "epoch": 10.0, | |
| "grad_norm": 0.7796341180801392, | |
| "learning_rate": 9.747751098521107e-11, | |
| "loss": 0.3453, | |
| "num_input_tokens_seen": 490000, | |
| "step": 1250 | |
| }, | |
| { | |
| "epoch": 10.0, | |
| "num_input_tokens_seen": 490000, | |
| "step": 1250, | |
| "total_flos": 2.206447853568e+16, | |
| "train_loss": 2.282332957649231, | |
| "train_runtime": 172.1756, | |
| "train_samples_per_second": 28.924, | |
| "train_steps_per_second": 7.26 | |
| } | |
| ], | |
| "logging_steps": 5, | |
| "max_steps": 1250, | |
| "num_input_tokens_seen": 490000, | |
| "num_train_epochs": 10, | |
| "save_steps": 63, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 2.206447853568e+16, | |
| "train_batch_size": 4, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |