{
  "best_global_step": 1080,
  "best_metric": 0.2312491238117218,
  "best_model_checkpoint": "saves_stability/prefix-tuning/llama-3-8b-instruct/train_copa_1757340203/checkpoint-1080",
  "epoch": 20.0,
  "eval_steps": 180,
  "global_step": 3600,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.027777777777777776,
      "grad_norm": 167.9774627685547,
      "learning_rate": 5.555555555555556e-07,
      "loss": 8.8223,
      "num_input_tokens_seen": 752,
      "step": 5
    },
    {
      "epoch": 0.05555555555555555,
      "grad_norm": 152.64581298828125,
      "learning_rate": 1.25e-06,
      "loss": 8.0585,
      "num_input_tokens_seen": 1520,
      "step": 10
    },
    {
      "epoch": 0.08333333333333333,
      "grad_norm": 116.32727813720703,
      "learning_rate": 1.9444444444444444e-06,
      "loss": 6.6675,
      "num_input_tokens_seen": 2320,
      "step": 15
    },
    {
      "epoch": 0.1111111111111111,
      "grad_norm": 86.08872985839844,
      "learning_rate": 2.638888888888889e-06,
      "loss": 5.3834,
      "num_input_tokens_seen": 3072,
      "step": 20
    },
    {
      "epoch": 0.1388888888888889,
      "grad_norm": 68.7329330444336,
      "learning_rate": 3.3333333333333333e-06,
      "loss": 3.9719,
      "num_input_tokens_seen": 3840,
      "step": 25
    },
    {
      "epoch": 0.16666666666666666,
      "grad_norm": 48.6358642578125,
      "learning_rate": 4.027777777777779e-06,
      "loss": 2.8677,
      "num_input_tokens_seen": 4576,
      "step": 30
    },
    {
      "epoch": 0.19444444444444445,
      "grad_norm": 35.65837097167969,
      "learning_rate": 4.722222222222222e-06,
      "loss": 1.9241,
      "num_input_tokens_seen": 5328,
      "step": 35
    },
    {
      "epoch": 0.2222222222222222,
      "grad_norm": 26.759178161621094,
      "learning_rate": 5.416666666666667e-06,
      "loss": 1.0376,
      "num_input_tokens_seen": 6112,
      "step": 40
    },
    {
      "epoch": 0.25,
      "grad_norm": 32.65984344482422,
      "learning_rate": 6.111111111111111e-06,
      "loss": 0.52,
      "num_input_tokens_seen": 6848,
      "step": 45
    },
    {
      "epoch": 0.2777777777777778,
      "grad_norm": 12.905600547790527,
      "learning_rate": 6.805555555555556e-06,
      "loss": 0.4276,
      "num_input_tokens_seen": 7600,
      "step": 50
    },
    {
      "epoch": 0.3055555555555556,
      "grad_norm": 21.74563980102539,
      "learning_rate": 7.5e-06,
      "loss": 0.2801,
      "num_input_tokens_seen": 8368,
      "step": 55
    },
    {
      "epoch": 0.3333333333333333,
      "grad_norm": 12.704620361328125,
      "learning_rate": 8.194444444444445e-06,
      "loss": 0.1757,
      "num_input_tokens_seen": 9152,
      "step": 60
    },
    {
      "epoch": 0.3611111111111111,
      "grad_norm": 54.7520751953125,
      "learning_rate": 8.88888888888889e-06,
      "loss": 0.4626,
      "num_input_tokens_seen": 9888,
      "step": 65
    },
    {
      "epoch": 0.3888888888888889,
      "grad_norm": 36.79613494873047,
      "learning_rate": 9.583333333333334e-06,
      "loss": 0.5824,
      "num_input_tokens_seen": 10656,
      "step": 70
    },
    {
      "epoch": 0.4166666666666667,
      "grad_norm": 10.10336685180664,
      "learning_rate": 1.0277777777777777e-05,
      "loss": 0.3283,
      "num_input_tokens_seen": 11408,
      "step": 75
    },
    {
      "epoch": 0.4444444444444444,
      "grad_norm": 25.05152130126953,
      "learning_rate": 1.0972222222222223e-05,
      "loss": 0.3072,
      "num_input_tokens_seen": 12144,
      "step": 80
    },
    {
      "epoch": 0.4722222222222222,
      "grad_norm": 16.299089431762695,
      "learning_rate": 1.1666666666666668e-05,
      "loss": 0.2421,
      "num_input_tokens_seen": 12880,
      "step": 85
    },
    {
      "epoch": 0.5,
      "grad_norm": 7.419747352600098,
      "learning_rate": 1.2361111111111112e-05,
      "loss": 0.2412,
      "num_input_tokens_seen": 13664,
      "step": 90
    },
    {
      "epoch": 0.5277777777777778,
      "grad_norm": 15.009756088256836,
      "learning_rate": 1.3055555555555557e-05,
      "loss": 0.4789,
      "num_input_tokens_seen": 14464,
      "step": 95
    },
    {
      "epoch": 0.5555555555555556,
      "grad_norm": 8.924155235290527,
      "learning_rate": 1.3750000000000002e-05,
      "loss": 0.2502,
      "num_input_tokens_seen": 15216,
      "step": 100
    },
    {
      "epoch": 0.5833333333333334,
      "grad_norm": 10.291241645812988,
      "learning_rate": 1.4444444444444444e-05,
      "loss": 0.2588,
      "num_input_tokens_seen": 15984,
      "step": 105
    },
    {
      "epoch": 0.6111111111111112,
      "grad_norm": 16.27149200439453,
      "learning_rate": 1.5138888888888888e-05,
      "loss": 0.3088,
      "num_input_tokens_seen": 16768,
      "step": 110
    },
    {
      "epoch": 0.6388888888888888,
      "grad_norm": 8.211831092834473,
      "learning_rate": 1.5833333333333333e-05,
      "loss": 0.2734,
      "num_input_tokens_seen": 17552,
      "step": 115
    },
    {
      "epoch": 0.6666666666666666,
      "grad_norm": 4.888490676879883,
      "learning_rate": 1.6527777777777777e-05,
      "loss": 0.2656,
      "num_input_tokens_seen": 18304,
      "step": 120
    },
    {
      "epoch": 0.6944444444444444,
      "grad_norm": 5.311173915863037,
      "learning_rate": 1.7222222222222224e-05,
      "loss": 0.23,
      "num_input_tokens_seen": 19072,
      "step": 125
    },
    {
      "epoch": 0.7222222222222222,
      "grad_norm": 3.224381446838379,
      "learning_rate": 1.7916666666666667e-05,
      "loss": 0.2338,
      "num_input_tokens_seen": 19840,
      "step": 130
    },
    {
      "epoch": 0.75,
      "grad_norm": 10.3880615234375,
      "learning_rate": 1.861111111111111e-05,
      "loss": 0.3357,
      "num_input_tokens_seen": 20640,
      "step": 135
    },
    {
      "epoch": 0.7777777777777778,
      "grad_norm": 7.747908115386963,
      "learning_rate": 1.9305555555555558e-05,
      "loss": 0.3484,
      "num_input_tokens_seen": 21408,
      "step": 140
    },
    {
      "epoch": 0.8055555555555556,
      "grad_norm": 5.8082990646362305,
      "learning_rate": 2e-05,
      "loss": 0.2401,
      "num_input_tokens_seen": 22128,
      "step": 145
    },
    {
      "epoch": 0.8333333333333334,
      "grad_norm": 3.7531211376190186,
      "learning_rate": 2.0694444444444445e-05,
      "loss": 0.2147,
      "num_input_tokens_seen": 22880,
      "step": 150
    },
    {
      "epoch": 0.8611111111111112,
      "grad_norm": 6.251461505889893,
      "learning_rate": 2.138888888888889e-05,
      "loss": 0.2482,
      "num_input_tokens_seen": 23664,
      "step": 155
    },
    {
      "epoch": 0.8888888888888888,
      "grad_norm": 1.8583431243896484,
      "learning_rate": 2.2083333333333333e-05,
      "loss": 0.2706,
      "num_input_tokens_seen": 24432,
      "step": 160
    },
    {
      "epoch": 0.9166666666666666,
      "grad_norm": 1.6401225328445435,
      "learning_rate": 2.277777777777778e-05,
      "loss": 0.2638,
      "num_input_tokens_seen": 25184,
      "step": 165
    },
    {
      "epoch": 0.9444444444444444,
      "grad_norm": 4.828391075134277,
      "learning_rate": 2.3472222222222223e-05,
      "loss": 0.2324,
      "num_input_tokens_seen": 25920,
      "step": 170
    },
    {
      "epoch": 0.9722222222222222,
      "grad_norm": 2.8311290740966797,
      "learning_rate": 2.4166666666666667e-05,
      "loss": 0.2318,
      "num_input_tokens_seen": 26672,
      "step": 175
    },
    {
      "epoch": 1.0,
      "grad_norm": 0.7510238885879517,
      "learning_rate": 2.4861111111111114e-05,
      "loss": 0.2307,
      "num_input_tokens_seen": 27408,
      "step": 180
    },
    {
      "epoch": 1.0,
      "eval_loss": 0.24429556727409363,
      "eval_runtime": 0.841,
      "eval_samples_per_second": 47.562,
      "eval_steps_per_second": 23.781,
      "num_input_tokens_seen": 27408,
      "step": 180
    },
    {
      "epoch": 1.0277777777777777,
      "grad_norm": 1.5177123546600342,
      "learning_rate": 2.5555555555555554e-05,
      "loss": 0.2288,
      "num_input_tokens_seen": 28176,
      "step": 185
    },
    {
      "epoch": 1.0555555555555556,
      "grad_norm": 1.2902367115020752,
      "learning_rate": 2.625e-05,
      "loss": 0.2356,
      "num_input_tokens_seen": 28928,
      "step": 190
    },
    {
      "epoch": 1.0833333333333333,
      "grad_norm": 4.072321891784668,
      "learning_rate": 2.6944444444444445e-05,
      "loss": 0.2577,
      "num_input_tokens_seen": 29696,
      "step": 195
    },
    {
      "epoch": 1.1111111111111112,
      "grad_norm": 0.4522273540496826,
      "learning_rate": 2.7638888888888892e-05,
      "loss": 0.2403,
      "num_input_tokens_seen": 30448,
      "step": 200
    },
    {
      "epoch": 1.1388888888888888,
      "grad_norm": 0.927115797996521,
      "learning_rate": 2.8333333333333335e-05,
      "loss": 0.257,
      "num_input_tokens_seen": 31200,
      "step": 205
    },
    {
      "epoch": 1.1666666666666667,
      "grad_norm": 2.620466470718384,
      "learning_rate": 2.9027777777777782e-05,
      "loss": 0.2328,
      "num_input_tokens_seen": 31936,
      "step": 210
    },
    {
      "epoch": 1.1944444444444444,
      "grad_norm": 1.051540493965149,
      "learning_rate": 2.9722222222222223e-05,
      "loss": 0.2362,
      "num_input_tokens_seen": 32704,
      "step": 215
    },
    {
      "epoch": 1.2222222222222223,
      "grad_norm": 0.6837607622146606,
      "learning_rate": 3.0416666666666666e-05,
      "loss": 0.2375,
      "num_input_tokens_seen": 33440,
      "step": 220
    },
    {
      "epoch": 1.25,
      "grad_norm": 2.1056437492370605,
      "learning_rate": 3.111111111111111e-05,
      "loss": 0.2384,
      "num_input_tokens_seen": 34192,
      "step": 225
    },
    {
      "epoch": 1.2777777777777777,
      "grad_norm": 1.122992992401123,
      "learning_rate": 3.180555555555556e-05,
      "loss": 0.2388,
      "num_input_tokens_seen": 34960,
      "step": 230
    },
    {
      "epoch": 1.3055555555555556,
      "grad_norm": 0.9644664525985718,
      "learning_rate": 3.2500000000000004e-05,
      "loss": 0.2736,
      "num_input_tokens_seen": 35728,
      "step": 235
    },
    {
      "epoch": 1.3333333333333333,
      "grad_norm": 0.8712486028671265,
      "learning_rate": 3.3194444444444444e-05,
      "loss": 0.2352,
      "num_input_tokens_seen": 36464,
      "step": 240
    },
    {
      "epoch": 1.3611111111111112,
      "grad_norm": 2.6176774501800537,
      "learning_rate": 3.388888888888889e-05,
      "loss": 0.2393,
      "num_input_tokens_seen": 37216,
      "step": 245
    },
    {
      "epoch": 1.3888888888888888,
      "grad_norm": 1.305676817893982,
      "learning_rate": 3.458333333333333e-05,
      "loss": 0.2299,
      "num_input_tokens_seen": 38000,
      "step": 250
    },
    {
      "epoch": 1.4166666666666667,
      "grad_norm": 9.111144065856934,
      "learning_rate": 3.527777777777778e-05,
      "loss": 0.275,
      "num_input_tokens_seen": 38800,
      "step": 255
    },
    {
      "epoch": 1.4444444444444444,
      "grad_norm": 2.4773318767547607,
      "learning_rate": 3.5972222222222225e-05,
      "loss": 0.2884,
      "num_input_tokens_seen": 39584,
      "step": 260
    },
    {
      "epoch": 1.4722222222222223,
      "grad_norm": 3.8086376190185547,
      "learning_rate": 3.6666666666666666e-05,
      "loss": 0.2382,
      "num_input_tokens_seen": 40352,
      "step": 265
    },
    {
      "epoch": 1.5,
      "grad_norm": 3.7353453636169434,
      "learning_rate": 3.736111111111111e-05,
      "loss": 0.2257,
      "num_input_tokens_seen": 41120,
      "step": 270
    },
    {
      "epoch": 1.5277777777777777,
      "grad_norm": 25.111865997314453,
      "learning_rate": 3.805555555555555e-05,
      "loss": 0.2836,
      "num_input_tokens_seen": 41856,
      "step": 275
    },
    {
      "epoch": 1.5555555555555556,
      "grad_norm": 13.982370376586914,
      "learning_rate": 3.875e-05,
      "loss": 0.8554,
      "num_input_tokens_seen": 42640,
      "step": 280
    },
    {
      "epoch": 1.5833333333333335,
      "grad_norm": 154.56797790527344,
      "learning_rate": 3.944444444444445e-05,
      "loss": 3.143,
      "num_input_tokens_seen": 43392,
      "step": 285
    },
    {
      "epoch": 1.6111111111111112,
      "grad_norm": 3.4631268978118896,
      "learning_rate": 4.0138888888888894e-05,
      "loss": 0.3615,
      "num_input_tokens_seen": 44176,
      "step": 290
    },
    {
      "epoch": 1.6388888888888888,
      "grad_norm": 0.45719239115715027,
      "learning_rate": 4.0833333333333334e-05,
      "loss": 0.2289,
      "num_input_tokens_seen": 44928,
      "step": 295
    },
    {
      "epoch": 1.6666666666666665,
      "grad_norm": 1.5930590629577637,
      "learning_rate": 4.152777777777778e-05,
      "loss": 0.2271,
      "num_input_tokens_seen": 45696,
      "step": 300
    },
    {
      "epoch": 1.6944444444444444,
      "grad_norm": 0.19013027846813202,
      "learning_rate": 4.222222222222222e-05,
      "loss": 0.2477,
      "num_input_tokens_seen": 46464,
      "step": 305
    },
    {
      "epoch": 1.7222222222222223,
      "grad_norm": 1.5907601118087769,
      "learning_rate": 4.291666666666667e-05,
      "loss": 0.2669,
      "num_input_tokens_seen": 47216,
      "step": 310
    },
    {
      "epoch": 1.75,
      "grad_norm": 0.35706278681755066,
      "learning_rate": 4.3611111111111116e-05,
      "loss": 0.2423,
      "num_input_tokens_seen": 47984,
      "step": 315
    },
    {
      "epoch": 1.7777777777777777,
      "grad_norm": 1.6776552200317383,
      "learning_rate": 4.4305555555555556e-05,
      "loss": 0.2228,
      "num_input_tokens_seen": 48720,
      "step": 320
    },
    {
      "epoch": 1.8055555555555556,
      "grad_norm": 1.1364836692810059,
      "learning_rate": 4.5e-05,
      "loss": 0.2698,
      "num_input_tokens_seen": 49472,
      "step": 325
    },
    {
      "epoch": 1.8333333333333335,
      "grad_norm": 0.17125339806079865,
      "learning_rate": 4.569444444444444e-05,
      "loss": 0.2317,
      "num_input_tokens_seen": 50224,
      "step": 330
    },
    {
      "epoch": 1.8611111111111112,
      "grad_norm": 1.3546942472457886,
      "learning_rate": 4.638888888888889e-05,
      "loss": 0.2387,
      "num_input_tokens_seen": 50944,
      "step": 335
    },
    {
      "epoch": 1.8888888888888888,
      "grad_norm": 5.500971794128418,
      "learning_rate": 4.708333333333334e-05,
      "loss": 0.264,
      "num_input_tokens_seen": 51696,
      "step": 340
    },
    {
      "epoch": 1.9166666666666665,
      "grad_norm": 3.2084038257598877,
      "learning_rate": 4.7777777777777784e-05,
      "loss": 0.3239,
      "num_input_tokens_seen": 52480,
      "step": 345
    },
    {
      "epoch": 1.9444444444444444,
      "grad_norm": 1.4998213052749634,
      "learning_rate": 4.8472222222222224e-05,
      "loss": 0.2497,
      "num_input_tokens_seen": 53232,
      "step": 350
    },
    {
      "epoch": 1.9722222222222223,
      "grad_norm": 0.29920241236686707,
      "learning_rate": 4.9166666666666665e-05,
      "loss": 0.2366,
      "num_input_tokens_seen": 54016,
      "step": 355
    },
    {
      "epoch": 2.0,
      "grad_norm": 0.9520807862281799,
      "learning_rate": 4.986111111111111e-05,
      "loss": 0.2399,
      "num_input_tokens_seen": 54752,
      "step": 360
    },
    {
      "epoch": 2.0,
      "eval_loss": 0.23347768187522888,
      "eval_runtime": 0.9935,
      "eval_samples_per_second": 40.261,
      "eval_steps_per_second": 20.131,
      "num_input_tokens_seen": 54752,
      "step": 360
    },
    {
      "epoch": 2.0277777777777777,
      "grad_norm": 0.16839726269245148,
      "learning_rate": 4.99998119647914e-05,
      "loss": 0.2378,
      "num_input_tokens_seen": 55520,
      "step": 365
    },
    {
      "epoch": 2.0555555555555554,
      "grad_norm": 0.8747950792312622,
      "learning_rate": 4.999904807660428e-05,
      "loss": 0.2401,
      "num_input_tokens_seen": 56288,
      "step": 370
    },
    {
      "epoch": 2.0833333333333335,
      "grad_norm": 0.27838072180747986,
      "learning_rate": 4.999769660117901e-05,
      "loss": 0.2284,
      "num_input_tokens_seen": 57040,
      "step": 375
    },
    {
      "epoch": 2.111111111111111,
      "grad_norm": 0.28668636083602905,
      "learning_rate": 4.999575757028119e-05,
      "loss": 0.2262,
      "num_input_tokens_seen": 57792,
      "step": 380
    },
    {
      "epoch": 2.138888888888889,
      "grad_norm": 0.7376558780670166,
      "learning_rate": 4.9993231029486544e-05,
      "loss": 0.2531,
      "num_input_tokens_seen": 58576,
      "step": 385
    },
    {
      "epoch": 2.1666666666666665,
      "grad_norm": 7.420198440551758,
      "learning_rate": 4.999011703817986e-05,
      "loss": 0.2609,
      "num_input_tokens_seen": 59344,
      "step": 390
    },
    {
      "epoch": 2.1944444444444446,
      "grad_norm": 3.8974297046661377,
      "learning_rate": 4.9986415669553586e-05,
      "loss": 0.2575,
      "num_input_tokens_seen": 60112,
      "step": 395
    },
    {
      "epoch": 2.2222222222222223,
      "grad_norm": 0.4262005090713501,
      "learning_rate": 4.998212701060612e-05,
      "loss": 0.215,
      "num_input_tokens_seen": 60896,
      "step": 400
    },
    {
      "epoch": 2.25,
      "grad_norm": 0.2879176735877991,
      "learning_rate": 4.997725116213973e-05,
      "loss": 0.2506,
      "num_input_tokens_seen": 61648,
      "step": 405
    },
    {
      "epoch": 2.2777777777777777,
      "grad_norm": 1.6084895133972168,
      "learning_rate": 4.997178823875826e-05,
      "loss": 0.241,
      "num_input_tokens_seen": 62400,
      "step": 410
    },
    {
      "epoch": 2.3055555555555554,
      "grad_norm": 0.1998496651649475,
      "learning_rate": 4.996573836886435e-05,
      "loss": 0.2412,
      "num_input_tokens_seen": 63136,
      "step": 415
    },
    {
      "epoch": 2.3333333333333335,
      "grad_norm": 0.7503790259361267,
      "learning_rate": 4.995910169465646e-05,
      "loss": 0.23,
      "num_input_tokens_seen": 63888,
      "step": 420
    },
    {
      "epoch": 2.361111111111111,
      "grad_norm": 1.1558177471160889,
      "learning_rate": 4.9951878372125547e-05,
      "loss": 0.2064,
      "num_input_tokens_seen": 64624,
      "step": 425
    },
    {
      "epoch": 2.388888888888889,
      "grad_norm": 0.29164397716522217,
      "learning_rate": 4.994406857105136e-05,
      "loss": 0.2645,
      "num_input_tokens_seen": 65376,
      "step": 430
    },
    {
      "epoch": 2.4166666666666665,
      "grad_norm": 0.4840473532676697,
      "learning_rate": 4.993567247499845e-05,
      "loss": 0.2689,
      "num_input_tokens_seen": 66112,
      "step": 435
    },
    {
      "epoch": 2.4444444444444446,
      "grad_norm": 0.6077887415885925,
      "learning_rate": 4.9926690281311904e-05,
      "loss": 0.2355,
      "num_input_tokens_seen": 66896,
      "step": 440
    },
    {
      "epoch": 2.4722222222222223,
      "grad_norm": 0.37077003717422485,
      "learning_rate": 4.9917122201112656e-05,
      "loss": 0.2409,
      "num_input_tokens_seen": 67664,
      "step": 445
    },
    {
      "epoch": 2.5,
      "grad_norm": 0.4490332305431366,
      "learning_rate": 4.9906968459292524e-05,
      "loss": 0.204,
      "num_input_tokens_seen": 68432,
      "step": 450
    },
    {
      "epoch": 2.5277777777777777,
      "grad_norm": 0.24682901799678802,
      "learning_rate": 4.9896229294508976e-05,
      "loss": 0.2514,
      "num_input_tokens_seen": 69152,
      "step": 455
    },
    {
      "epoch": 2.5555555555555554,
      "grad_norm": 0.17960534989833832,
      "learning_rate": 4.988490495917947e-05,
      "loss": 0.2615,
      "num_input_tokens_seen": 69936,
      "step": 460
    },
    {
      "epoch": 2.5833333333333335,
      "grad_norm": 1.6768314838409424,
      "learning_rate": 4.987299571947553e-05,
      "loss": 0.2344,
      "num_input_tokens_seen": 70704,
      "step": 465
    },
    {
      "epoch": 2.611111111111111,
      "grad_norm": 1.0035189390182495,
      "learning_rate": 4.9860501855316514e-05,
      "loss": 0.2223,
      "num_input_tokens_seen": 71488,
      "step": 470
    },
    {
      "epoch": 2.638888888888889,
      "grad_norm": 0.2373342365026474,
      "learning_rate": 4.9847423660363e-05,
      "loss": 0.2689,
      "num_input_tokens_seen": 72224,
      "step": 475
    },
    {
      "epoch": 2.6666666666666665,
      "grad_norm": 0.534795343875885,
      "learning_rate": 4.983376144200992e-05,
      "loss": 0.242,
      "num_input_tokens_seen": 72976,
      "step": 480
    },
    {
      "epoch": 2.6944444444444446,
      "grad_norm": 0.2486744225025177,
      "learning_rate": 4.981951552137929e-05,
      "loss": 0.233,
      "num_input_tokens_seen": 73792,
      "step": 485
    },
    {
      "epoch": 2.7222222222222223,
      "grad_norm": 0.1451578289270401,
      "learning_rate": 4.980468623331273e-05,
      "loss": 0.2351,
      "num_input_tokens_seen": 74560,
      "step": 490
    },
    {
      "epoch": 2.75,
      "grad_norm": 0.7469542026519775,
      "learning_rate": 4.978927392636351e-05,
      "loss": 0.2396,
      "num_input_tokens_seen": 75328,
      "step": 495
    },
    {
      "epoch": 2.7777777777777777,
      "grad_norm": 0.9310773611068726,
      "learning_rate": 4.9773278962788436e-05,
      "loss": 0.2568,
      "num_input_tokens_seen": 76080,
      "step": 500
    },
    {
      "epoch": 2.8055555555555554,
      "grad_norm": 0.08243211358785629,
      "learning_rate": 4.975670171853926e-05,
      "loss": 0.2442,
      "num_input_tokens_seen": 76848,
      "step": 505
    },
    {
      "epoch": 2.8333333333333335,
      "grad_norm": 0.07204469293355942,
      "learning_rate": 4.973954258325392e-05,
      "loss": 0.2304,
      "num_input_tokens_seen": 77616,
      "step": 510
    },
    {
      "epoch": 2.861111111111111,
      "grad_norm": 0.09893473237752914,
      "learning_rate": 4.972180196024733e-05,
      "loss": 0.222,
      "num_input_tokens_seen": 78384,
      "step": 515
    },
    {
      "epoch": 2.888888888888889,
      "grad_norm": 0.10142597556114197,
      "learning_rate": 4.97034802665019e-05,
      "loss": 0.2374,
      "num_input_tokens_seen": 79136,
      "step": 520
    },
    {
      "epoch": 2.9166666666666665,
      "grad_norm": 0.5532053112983704,
      "learning_rate": 4.9684577932657786e-05,
      "loss": 0.2284,
      "num_input_tokens_seen": 79920,
      "step": 525
    },
    {
      "epoch": 2.9444444444444446,
      "grad_norm": 0.6459735631942749,
      "learning_rate": 4.966509540300269e-05,
      "loss": 0.2185,
      "num_input_tokens_seen": 80672,
      "step": 530
    },
    {
      "epoch": 2.9722222222222223,
      "grad_norm": 0.07756423205137253,
      "learning_rate": 4.9645033135461494e-05,
      "loss": 0.2737,
      "num_input_tokens_seen": 81440,
      "step": 535
    },
    {
      "epoch": 3.0,
      "grad_norm": 0.3974766135215759,
      "learning_rate": 4.962439160158544e-05,
      "loss": 0.2437,
      "num_input_tokens_seen": 82176,
      "step": 540
    },
    {
      "epoch": 3.0,
      "eval_loss": 0.23567190766334534,
      "eval_runtime": 0.8517,
      "eval_samples_per_second": 46.965,
      "eval_steps_per_second": 23.483,
      "num_input_tokens_seen": 82176,
      "step": 540
    },
    {
      "epoch": 3.0277777777777777,
      "grad_norm": 0.35840246081352234,
      "learning_rate": 4.960317128654108e-05,
      "loss": 0.2396,
      "num_input_tokens_seen": 82944,
      "step": 545
    },
    {
      "epoch": 3.0555555555555554,
      "grad_norm": 0.06530583649873734,
      "learning_rate": 4.958137268909887e-05,
      "loss": 0.2521,
      "num_input_tokens_seen": 83712,
      "step": 550
    },
    {
      "epoch": 3.0833333333333335,
      "grad_norm": 0.10089149326086044,
      "learning_rate": 4.9558996321621405e-05,
      "loss": 0.2297,
      "num_input_tokens_seen": 84496,
      "step": 555
    },
    {
      "epoch": 3.111111111111111,
      "grad_norm": 0.4209873676300049,
      "learning_rate": 4.953604271005144e-05,
      "loss": 0.2335,
      "num_input_tokens_seen": 85280,
      "step": 560
    },
    {
      "epoch": 3.138888888888889,
      "grad_norm": 0.298819363117218,
      "learning_rate": 4.951251239389948e-05,
      "loss": 0.2434,
      "num_input_tokens_seen": 86016,
      "step": 565
    },
    {
      "epoch": 3.1666666666666665,
      "grad_norm": 0.051832061260938644,
      "learning_rate": 4.9488405926231144e-05,
      "loss": 0.2308,
      "num_input_tokens_seen": 86800,
      "step": 570
    },
    {
      "epoch": 3.1944444444444446,
      "grad_norm": 0.04287365451455116,
      "learning_rate": 4.946372387365409e-05,
      "loss": 0.2427,
      "num_input_tokens_seen": 87568,
      "step": 575
    },
    {
      "epoch": 3.2222222222222223,
      "grad_norm": 0.29626473784446716,
      "learning_rate": 4.943846681630479e-05,
      "loss": 0.2317,
      "num_input_tokens_seen": 88320,
      "step": 580
    },
    {
      "epoch": 3.25,
      "grad_norm": 0.06258111447095871,
      "learning_rate": 4.941263534783482e-05,
      "loss": 0.2285,
      "num_input_tokens_seen": 89072,
      "step": 585
    },
    {
      "epoch": 3.2777777777777777,
      "grad_norm": 0.13033808767795563,
      "learning_rate": 4.9386230075396964e-05,
      "loss": 0.2165,
      "num_input_tokens_seen": 89792,
      "step": 590
    },
    {
      "epoch": 3.3055555555555554,
      "grad_norm": 0.3183232247829437,
      "learning_rate": 4.9359251619630886e-05,
      "loss": 0.2417,
      "num_input_tokens_seen": 90576,
      "step": 595
    },
    {
      "epoch": 3.3333333333333335,
      "grad_norm": 0.2652042508125305,
      "learning_rate": 4.933170061464858e-05,
      "loss": 0.2425,
      "num_input_tokens_seen": 91360,
      "step": 600
    },
    {
      "epoch": 3.361111111111111,
      "grad_norm": 0.2555042803287506,
      "learning_rate": 4.930357770801947e-05,
      "loss": 0.1958,
      "num_input_tokens_seen": 92128,
      "step": 605
    },
    {
      "epoch": 3.388888888888889,
      "grad_norm": 0.4280713200569153,
      "learning_rate": 4.9274883560755156e-05,
      "loss": 0.2538,
      "num_input_tokens_seen": 92912,
      "step": 610
    },
    {
      "epoch": 3.4166666666666665,
      "grad_norm": 0.23289966583251953,
      "learning_rate": 4.924561884729391e-05,
      "loss": 0.2223,
      "num_input_tokens_seen": 93648,
      "step": 615
    },
    {
      "epoch": 3.4444444444444446,
      "grad_norm": 0.06474464386701584,
      "learning_rate": 4.921578425548482e-05,
      "loss": 0.2504,
      "num_input_tokens_seen": 94416,
      "step": 620
    },
    {
      "epoch": 3.4722222222222223,
      "grad_norm": 0.3162301480770111,
      "learning_rate": 4.9185380486571595e-05,
      "loss": 0.2311,
      "num_input_tokens_seen": 95168,
      "step": 625
    },
    {
      "epoch": 3.5,
      "grad_norm": 0.04434465989470482,
      "learning_rate": 4.915440825517612e-05,
      "loss": 0.2216,
      "num_input_tokens_seen": 95936,
      "step": 630
    },
    {
      "epoch": 3.5277777777777777,
      "grad_norm": 0.2360685020685196,
      "learning_rate": 4.912286828928162e-05,
      "loss": 0.2187,
      "num_input_tokens_seen": 96688,
      "step": 635
    },
    {
      "epoch": 3.5555555555555554,
      "grad_norm": 0.04857361316680908,
      "learning_rate": 4.909076133021557e-05,
      "loss": 0.2402,
      "num_input_tokens_seen": 97456,
      "step": 640
    },
    {
      "epoch": 3.5833333333333335,
      "grad_norm": 0.03909294307231903,
      "learning_rate": 4.9058088132632306e-05,
      "loss": 0.2395,
      "num_input_tokens_seen": 98208,
      "step": 645
    },
    {
      "epoch": 3.611111111111111,
      "grad_norm": 0.2648538053035736,
      "learning_rate": 4.9024849464495215e-05,
      "loss": 0.2456,
      "num_input_tokens_seen": 98944,
      "step": 650
    },
    {
      "epoch": 3.638888888888889,
      "grad_norm": 0.035087645053863525,
      "learning_rate": 4.8991046107058735e-05,
      "loss": 0.2295,
      "num_input_tokens_seen": 99728,
      "step": 655
    },
    {
      "epoch": 3.6666666666666665,
      "grad_norm": 0.2587370276451111,
      "learning_rate": 4.895667885484997e-05,
      "loss": 0.2403,
      "num_input_tokens_seen": 100496,
      "step": 660
    },
    {
      "epoch": 3.6944444444444446,
      "grad_norm": 0.26684993505477905,
      "learning_rate": 4.892174851565004e-05,
      "loss": 0.2246,
      "num_input_tokens_seen": 101264,
      "step": 665
    },
    {
      "epoch": 3.7222222222222223,
      "grad_norm": 0.2731635868549347,
      "learning_rate": 4.8886255910475054e-05,
      "loss": 0.2396,
      "num_input_tokens_seen": 102016,
      "step": 670
    },
    {
      "epoch": 3.75,
      "grad_norm": 0.25409963726997375,
      "learning_rate": 4.885020187355687e-05,
      "loss": 0.2517,
      "num_input_tokens_seen": 102768,
      "step": 675
    },
    {
      "epoch": 3.7777777777777777,
      "grad_norm": 0.21939261257648468,
      "learning_rate": 4.881358725232342e-05,
      "loss": 0.2297,
      "num_input_tokens_seen": 103520,
      "step": 680
    },
    {
      "epoch": 3.8055555555555554,
      "grad_norm": 0.2201327532529831,
      "learning_rate": 4.877641290737884e-05,
      "loss": 0.2379,
      "num_input_tokens_seen": 104288,
      "step": 685
    },
    {
      "epoch": 3.8333333333333335,
      "grad_norm": 0.2317037433385849,
      "learning_rate": 4.873867971248324e-05,
      "loss": 0.2403,
      "num_input_tokens_seen": 105056,
      "step": 690
    },
    {
      "epoch": 3.861111111111111,
      "grad_norm": 0.20789989829063416,
      "learning_rate": 4.870038855453213e-05,
      "loss": 0.2258,
      "num_input_tokens_seen": 105792,
      "step": 695
    },
    {
      "epoch": 3.888888888888889,
      "grad_norm": 0.20359978079795837,
      "learning_rate": 4.866154033353561e-05,
      "loss": 0.23,
      "num_input_tokens_seen": 106544,
      "step": 700
    },
    {
      "epoch": 3.9166666666666665,
      "grad_norm": 0.044127654284238815,
      "learning_rate": 4.86221359625972e-05,
      "loss": 0.23,
      "num_input_tokens_seen": 107312,
      "step": 705
    },
    {
      "epoch": 3.9444444444444446,
      "grad_norm": 0.04580540582537651,
      "learning_rate": 4.858217636789241e-05,
      "loss": 0.2158,
      "num_input_tokens_seen": 108064,
      "step": 710
    },
    {
      "epoch": 3.9722222222222223,
      "grad_norm": 0.04189879074692726,
      "learning_rate": 4.854166248864689e-05,
      "loss": 0.2468,
      "num_input_tokens_seen": 108848,
      "step": 715
    },
    {
      "epoch": 4.0,
      "grad_norm": 0.22962501645088196,
      "learning_rate": 4.850059527711444e-05,
      "loss": 0.2275,
      "num_input_tokens_seen": 109584,
      "step": 720
    },
    {
      "epoch": 4.0,
      "eval_loss": 0.23304803669452667,
      "eval_runtime": 0.8386,
      "eval_samples_per_second": 47.699,
      "eval_steps_per_second": 23.849,
      "num_input_tokens_seen": 109584,
      "step": 720
    },
    {
      "epoch": 4.027777777777778,
      "grad_norm": 0.21242158114910126,
      "learning_rate": 4.84589756985546e-05,
      "loss": 0.2294,
      "num_input_tokens_seen": 110336,
      "step": 725
    },
    {
      "epoch": 4.055555555555555,
      "grad_norm": 0.03922082111239433,
      "learning_rate": 4.8416804731209945e-05,
      "loss": 0.2314,
      "num_input_tokens_seen": 111104,
      "step": 730
    },
    {
      "epoch": 4.083333333333333,
      "grad_norm": 0.04513590782880783,
      "learning_rate": 4.8374083366283096e-05,
      "loss": 0.2337,
      "num_input_tokens_seen": 111856,
      "step": 735
    },
    {
      "epoch": 4.111111111111111,
      "grad_norm": 0.041459400206804276,
      "learning_rate": 4.833081260791345e-05,
      "loss": 0.2443,
      "num_input_tokens_seen": 112624,
      "step": 740
    },
    {
      "epoch": 4.138888888888889,
      "grad_norm": 0.22475330531597137,
      "learning_rate": 4.828699347315356e-05,
      "loss": 0.2314,
      "num_input_tokens_seen": 113376,
      "step": 745
    },
    {
      "epoch": 4.166666666666667,
      "grad_norm": 0.04301132634282112,
      "learning_rate": 4.82426269919452e-05,
      "loss": 0.2316,
      "num_input_tokens_seen": 114096,
      "step": 750
    },
    {
      "epoch": 4.194444444444445,
      "grad_norm": 0.021374447271227837,
      "learning_rate": 4.8197714207095205e-05,
      "loss": 0.2339,
      "num_input_tokens_seen": 114832,
      "step": 755
    },
    {
      "epoch": 4.222222222222222,
      "grad_norm": 0.2153686285018921,
      "learning_rate": 4.815225617425095e-05,
      "loss": 0.2317,
      "num_input_tokens_seen": 115632,
      "step": 760
    },
    {
      "epoch": 4.25,
      "grad_norm": 0.03661501035094261,
      "learning_rate": 4.8106253961875506e-05,
      "loss": 0.2277,
      "num_input_tokens_seen": 116352,
      "step": 765
    },
    {
      "epoch": 4.277777777777778,
      "grad_norm": 0.22066719830036163,
      "learning_rate": 4.805970865122257e-05,
      "loss": 0.2379,
      "num_input_tokens_seen": 117152,
      "step": 770
    },
    {
      "epoch": 4.305555555555555,
      "grad_norm": 0.22886253893375397,
      "learning_rate": 4.8012621336311016e-05,
      "loss": 0.2341,
      "num_input_tokens_seen": 117904,
      "step": 775
    },
    {
      "epoch": 4.333333333333333,
      "grad_norm": 0.2830849289894104,
      "learning_rate": 4.7964993123899195e-05,
      "loss": 0.2498,
      "num_input_tokens_seen": 118672,
      "step": 780
    },
    {
      "epoch": 4.361111111111111,
      "grad_norm": 0.030636107549071312,
      "learning_rate": 4.791682513345892e-05,
      "loss": 0.2342,
      "num_input_tokens_seen": 119424,
      "step": 785
    },
    {
      "epoch": 4.388888888888889,
      "grad_norm": 0.041791874915361404,
      "learning_rate": 4.786811849714918e-05,
      "loss": 0.2317,
      "num_input_tokens_seen": 120176,
      "step": 790
    },
    {
      "epoch": 4.416666666666667,
      "grad_norm": 0.2024121731519699,
      "learning_rate": 4.781887435978947e-05,
      "loss": 0.2275,
      "num_input_tokens_seen": 120960,
      "step": 795
    },
    {
      "epoch": 4.444444444444445,
      "grad_norm": 0.19425232708454132,
      "learning_rate": 4.776909387883292e-05,
      "loss": 0.2274,
      "num_input_tokens_seen": 121712,
      "step": 800
    },
    {
      "epoch": 4.472222222222222,
      "grad_norm": 0.30294886231422424,
      "learning_rate": 4.771877822433911e-05,
      "loss": 0.227,
      "num_input_tokens_seen": 122464,
      "step": 805
    },
    {
      "epoch": 4.5,
      "grad_norm": 0.20341075956821442,
      "learning_rate": 4.766792857894652e-05,
      "loss": 0.2314,
      "num_input_tokens_seen": 123232,
      "step": 810
    },
    {
      "epoch": 4.527777777777778,
      "grad_norm": 0.07046259939670563,
      "learning_rate": 4.761654613784477e-05,
      "loss": 0.2583,
      "num_input_tokens_seen": 124000,
      "step": 815
    },
    {
      "epoch": 4.555555555555555,
      "grad_norm": 0.06253518909215927,
      "learning_rate": 4.756463210874652e-05,
      "loss": 0.2518,
      "num_input_tokens_seen": 124768,
      "step": 820
    },
    {
      "epoch": 4.583333333333333,
      "grad_norm": 0.062459491193294525,
      "learning_rate": 4.751218771185906e-05,
      "loss": 0.2418,
      "num_input_tokens_seen": 125520,
      "step": 825
    },
    {
      "epoch": 4.611111111111111,
      "grad_norm": 0.1863209456205368,
      "learning_rate": 4.745921417985566e-05,
      "loss": 0.2181,
      "num_input_tokens_seen": 126256,
      "step": 830
    },
    {
      "epoch": 4.638888888888889,
      "grad_norm": 0.1782846450805664,
      "learning_rate": 4.740571275784659e-05,
      "loss": 0.232,
      "num_input_tokens_seen": 127024,
      "step": 835
    },
    {
      "epoch": 4.666666666666667,
      "grad_norm": 0.1876181811094284,
      "learning_rate": 4.735168470334984e-05,
      "loss": 0.2327,
      "num_input_tokens_seen": 127792,
      "step": 840
    },
    {
      "epoch": 4.694444444444445,
      "grad_norm": 0.21437183022499084,
      "learning_rate": 4.729713128626158e-05,
      "loss": 0.2405,
      "num_input_tokens_seen": 128544,
      "step": 845
    },
    {
      "epoch": 4.722222222222222,
      "grad_norm": 0.2045063078403473,
      "learning_rate": 4.72420537888263e-05,
      "loss": 0.234,
      "num_input_tokens_seen": 129312,
      "step": 850
    },
    {
      "epoch": 4.75,
      "grad_norm": 0.023555081337690353,
      "learning_rate": 4.7186453505606676e-05,
      "loss": 0.2316,
      "num_input_tokens_seen": 130112,
      "step": 855
    },
    {
      "epoch": 4.777777777777778,
      "grad_norm": 0.038099661469459534,
      "learning_rate": 4.713033174345314e-05,
      "loss": 0.236,
      "num_input_tokens_seen": 130864,
      "step": 860
    },
    {
      "epoch": 4.805555555555555,
      "grad_norm": 0.20526079833507538,
      "learning_rate": 4.707368982147318e-05,
      "loss": 0.2155,
      "num_input_tokens_seen": 131632,
      "step": 865
    },
    {
      "epoch": 4.833333333333333,
      "grad_norm": 0.07691137492656708,
      "learning_rate": 4.701652907100029e-05,
      "loss": 0.2335,
      "num_input_tokens_seen": 132400,
      "step": 870
    },
    {
      "epoch": 4.861111111111111,
      "grad_norm": 0.2254783809185028,
      "learning_rate": 4.695885083556275e-05,
      "loss": 0.2363,
      "num_input_tokens_seen": 133152,
      "step": 875
    },
    {
      "epoch": 4.888888888888889,
      "grad_norm": 0.08800611644983292,
      "learning_rate": 4.6900656470851964e-05,
      "loss": 0.2614,
      "num_input_tokens_seen": 133920,
      "step": 880
    },
    {
      "epoch": 4.916666666666667,
      "grad_norm": 0.044184520840644836,
      "learning_rate": 4.684194734469067e-05,
      "loss": 0.2221,
      "num_input_tokens_seen": 134688,
      "step": 885
    },
    {
      "epoch": 4.944444444444445,
      "grad_norm": 0.21772927045822144,
      "learning_rate": 4.678272483700074e-05,
      "loss": 0.2317,
      "num_input_tokens_seen": 135456,
      "step": 890
    },
    {
      "epoch": 4.972222222222222,
      "grad_norm": 0.06680847704410553,
      "learning_rate": 4.672299033977076e-05,
      "loss": 0.2498,
      "num_input_tokens_seen": 136240,
      "step": 895
    },
    {
      "epoch": 5.0,
      "grad_norm": 0.029849985614418983,
      "learning_rate": 4.6662745257023325e-05,
      "loss": 0.2341,
      "num_input_tokens_seen": 137008,
      "step": 900
    },
    {
      "epoch": 5.0,
      "eval_loss": 0.23569175601005554,
      "eval_runtime": 0.8416,
      "eval_samples_per_second": 47.528,
      "eval_steps_per_second": 23.764,
      "num_input_tokens_seen": 137008,
      "step": 900
    },
    {
      "epoch": 5.027777777777778,
      "grad_norm": 0.190761536359787,
      "learning_rate": 4.660199100478202e-05,
      "loss": 0.2336,
      "num_input_tokens_seen": 137776,
      "step": 905
    },
    {
      "epoch": 5.055555555555555,
      "grad_norm": 0.19231367111206055,
      "learning_rate": 4.6540729011038146e-05,
      "loss": 0.2295,
      "num_input_tokens_seen": 138496,
      "step": 910
    },
    {
      "epoch": 5.083333333333333,
      "grad_norm": 0.1897604763507843,
      "learning_rate": 4.6478960715717176e-05,
      "loss": 0.2253,
      "num_input_tokens_seen": 139280,
      "step": 915
    },
    {
      "epoch": 5.111111111111111,
      "grad_norm": 0.22185863554477692,
      "learning_rate": 4.641668757064486e-05,
      "loss": 0.2338,
      "num_input_tokens_seen": 140080,
      "step": 920
    },
    {
      "epoch": 5.138888888888889,
      "grad_norm": 0.0547848604619503,
      "learning_rate": 4.6353911039513145e-05,
      "loss": 0.2493,
      "num_input_tokens_seen": 140848,
      "step": 925
    },
    {
      "epoch": 5.166666666666667,
      "grad_norm": 0.19223369657993317,
      "learning_rate": 4.6290632597845755e-05,
      "loss": 0.2273,
      "num_input_tokens_seen": 141632,
      "step": 930
    },
    {
      "epoch": 5.194444444444445,
      "grad_norm": 0.04600772261619568,
      "learning_rate": 4.622685373296353e-05,
      "loss": 0.2319,
      "num_input_tokens_seen": 142368,
      "step": 935
    },
    {
      "epoch": 5.222222222222222,
      "grad_norm": 0.18761615455150604,
      "learning_rate": 4.61625759439494e-05,
      "loss": 0.2338,
      "num_input_tokens_seen": 143120,
      "step": 940
    },
    {
      "epoch": 5.25,
      "grad_norm": 0.06209826469421387,
      "learning_rate": 4.609780074161327e-05,
      "loss": 0.2296,
      "num_input_tokens_seen": 143904,
      "step": 945
    },
    {
      "epoch": 5.277777777777778,
      "grad_norm": 0.05093017593026161,
      "learning_rate": 4.603252964845638e-05,
      "loss": 0.2296,
      "num_input_tokens_seen": 144640,
      "step": 950
    },
    {
      "epoch": 5.305555555555555,
      "grad_norm": 0.03405000641942024,
      "learning_rate": 4.5966764198635606e-05,
      "loss": 0.2294,
      "num_input_tokens_seen": 145376,
      "step": 955
    },
    {
      "epoch": 5.333333333333333,
      "grad_norm": 0.17441076040267944,
      "learning_rate": 4.590050593792736e-05,
      "loss": 0.2235,
      "num_input_tokens_seen": 146144,
      "step": 960
    },
    {
      "epoch": 5.361111111111111,
      "grad_norm": 0.048055585473775864,
      "learning_rate": 4.583375642369129e-05,
      "loss": 0.2386,
      "num_input_tokens_seen": 146912,
      "step": 965
    },
    {
      "epoch": 5.388888888888889,
      "grad_norm": 0.06384623795747757,
      "learning_rate": 4.5766517224833637e-05,
      "loss": 0.2377,
      "num_input_tokens_seen": 147664,
      "step": 970
    },
    {
      "epoch": 5.416666666666667,
      "grad_norm": 0.2151617407798767,
      "learning_rate": 4.569878992177039e-05,
      "loss": 0.2251,
      "num_input_tokens_seen": 148448,
      "step": 975
    },
    {
      "epoch": 5.444444444444445,
      "grad_norm": 0.17460361123085022,
      "learning_rate": 4.5630576106390114e-05,
      "loss": 0.2462,
      "num_input_tokens_seen": 149200,
      "step": 980
    },
    {
      "epoch": 5.472222222222222,
      "grad_norm": 0.17640213668346405,
      "learning_rate": 4.556187738201656e-05,
      "loss": 0.2139,
      "num_input_tokens_seen": 149936,
      "step": 985
    },
    {
      "epoch": 5.5,
      "grad_norm": 0.18245279788970947,
      "learning_rate": 4.549269536337095e-05,
      "loss": 0.2209,
      "num_input_tokens_seen": 150672,
      "step": 990
    },
    {
      "epoch": 5.527777777777778,
      "grad_norm": 0.05563855543732643,
      "learning_rate": 4.5423031676534065e-05,
      "loss": 0.2216,
      "num_input_tokens_seen": 151440,
      "step": 995
    },
    {
      "epoch": 5.555555555555555,
      "grad_norm": 0.24899983406066895,
      "learning_rate": 4.535288795890798e-05,
      "loss": 0.24,
      "num_input_tokens_seen": 152224,
      "step": 1000
    },
    {
      "epoch": 5.583333333333333,
      "grad_norm": 0.24502120912075043,
      "learning_rate": 4.528226585917761e-05,
      "loss": 0.2357,
      "num_input_tokens_seen": 152976,
      "step": 1005
    },
    {
      "epoch": 5.611111111111111,
      "grad_norm": 0.12031486630439758,
      "learning_rate": 4.521116703727193e-05,
      "loss": 0.2383,
      "num_input_tokens_seen": 153744,
      "step": 1010
    },
    {
      "epoch": 5.638888888888889,
      "grad_norm": 0.25375932455062866,
      "learning_rate": 4.5139593164324986e-05,
      "loss": 0.2352,
      "num_input_tokens_seen": 154512,
      "step": 1015
    },
    {
      "epoch": 5.666666666666667,
      "grad_norm": 0.09390950202941895,
      "learning_rate": 4.506754592263662e-05,
      "loss": 0.2343,
      "num_input_tokens_seen": 155280,
      "step": 1020
    },
    {
      "epoch": 5.694444444444445,
      "grad_norm": 0.2544936239719391,
      "learning_rate": 4.49950270056329e-05,
      "loss": 0.2422,
      "num_input_tokens_seen": 156032,
      "step": 1025
    },
    {
      "epoch": 5.722222222222222,
      "grad_norm": 0.06608662009239197,
      "learning_rate": 4.4922038117826334e-05,
      "loss": 0.2336,
      "num_input_tokens_seen": 156848,
      "step": 1030
    },
    {
      "epoch": 5.75,
      "grad_norm": 0.20967911183834076,
      "learning_rate": 4.48485809747758e-05,
      "loss": 0.2347,
      "num_input_tokens_seen": 157600,
      "step": 1035
    },
    {
      "epoch": 5.777777777777778,
      "grad_norm": 0.1931481808423996,
      "learning_rate": 4.477465730304624e-05,
      "loss": 0.2333,
      "num_input_tokens_seen": 158352,
      "step": 1040
    },
    {
      "epoch": 5.805555555555555,
      "grad_norm": 0.0319942943751812,
      "learning_rate": 4.4700268840168045e-05,
      "loss": 0.2372,
      "num_input_tokens_seen": 159104,
      "step": 1045
    },
    {
      "epoch": 5.833333333333333,
      "grad_norm": 0.18197861313819885,
      "learning_rate": 4.462541733459628e-05,
      "loss": 0.2318,
      "num_input_tokens_seen": 159856,
      "step": 1050
    },
    {
      "epoch": 5.861111111111111,
      "grad_norm": 0.18240846693515778,
      "learning_rate": 4.455010454566947e-05,
      "loss": 0.2337,
      "num_input_tokens_seen": 160624,
      "step": 1055
    },
    {
      "epoch": 5.888888888888889,
      "grad_norm": 0.19988971948623657,
      "learning_rate": 4.447433224356839e-05,
      "loss": 0.2385,
      "num_input_tokens_seen": 161360,
      "step": 1060
    },
    {
      "epoch": 5.916666666666667,
      "grad_norm": 0.03416143357753754,
      "learning_rate": 4.439810220927436e-05,
      "loss": 0.2285,
      "num_input_tokens_seen": 162080,
      "step": 1065
    },
    {
      "epoch": 5.944444444444445,
      "grad_norm": 0.19043967127799988,
      "learning_rate": 4.432141623452743e-05,
      "loss": 0.2425,
      "num_input_tokens_seen": 162832,
      "step": 1070
    },
    {
      "epoch": 5.972222222222222,
      "grad_norm": 0.04088395833969116,
      "learning_rate": 4.4244276121784195e-05,
      "loss": 0.2258,
      "num_input_tokens_seen": 163568,
      "step": 1075
    },
    {
      "epoch": 6.0,
      "grad_norm": 0.1749011129140854,
      "learning_rate": 4.416668368417556e-05,
      "loss": 0.236,
      "num_input_tokens_seen": 164336,
      "step": 1080
    },
    {
      "epoch": 6.0,
      "eval_loss": 0.2312491238117218,
      "eval_runtime": 0.845,
      "eval_samples_per_second": 47.336,
      "eval_steps_per_second": 23.668,
      "num_input_tokens_seen": 164336,
      "step": 1080
    },
    {
      "epoch": 6.027777777777778,
      "grad_norm": 0.18159140646457672,
      "learning_rate": 4.408864074546401e-05,
      "loss": 0.2276,
      "num_input_tokens_seen": 165072,
      "step": 1085
    },
    {
      "epoch": 6.055555555555555,
      "grad_norm": 0.06449268013238907,
      "learning_rate": 4.401014914000078e-05,
      "loss": 0.2238,
      "num_input_tokens_seen": 165824,
      "step": 1090
    },
    {
      "epoch": 6.083333333333333,
      "grad_norm": 0.17080135643482208,
      "learning_rate": 4.393121071268274e-05,
      "loss": 0.2221,
      "num_input_tokens_seen": 166608,
      "step": 1095
    },
    {
      "epoch": 6.111111111111111,
      "grad_norm": 0.19969283044338226,
      "learning_rate": 4.3851827318909036e-05,
      "loss": 0.2433,
      "num_input_tokens_seen": 167360,
      "step": 1100
    },
    {
      "epoch": 6.138888888888889,
      "grad_norm": 0.0702192559838295,
      "learning_rate": 4.377200082453749e-05,
      "loss": 0.2346,
      "num_input_tokens_seen": 168128,
      "step": 1105
    },
    {
      "epoch": 6.166666666666667,
      "grad_norm": 0.08154265582561493,
      "learning_rate": 4.36917331058407e-05,
      "loss": 0.2325,
      "num_input_tokens_seen": 168896,
      "step": 1110
    },
    {
      "epoch": 6.194444444444445,
      "grad_norm": 0.05283205211162567,
      "learning_rate": 4.361102604946201e-05,
      "loss": 0.2294,
      "num_input_tokens_seen": 169648,
      "step": 1115
    },
    {
      "epoch": 6.222222222222222,
      "grad_norm": 0.19671730697155,
      "learning_rate": 4.3529881552371096e-05,
      "loss": 0.2294,
      "num_input_tokens_seen": 170416,
      "step": 1120
    },
    {
      "epoch": 6.25,
      "grad_norm": 0.1893257349729538,
      "learning_rate": 4.344830152181941e-05,
      "loss": 0.2337,
      "num_input_tokens_seen": 171184,
      "step": 1125
    },
    {
      "epoch": 6.277777777777778,
      "grad_norm": 0.046719856560230255,
      "learning_rate": 4.336628787529538e-05,
      "loss": 0.2317,
      "num_input_tokens_seen": 171984,
      "step": 1130
    },
    {
      "epoch": 6.305555555555555,
      "grad_norm": 0.1931154429912567,
      "learning_rate": 4.3283842540479264e-05,
      "loss": 0.2233,
      "num_input_tokens_seen": 172768,
      "step": 1135
    },
    {
      "epoch": 6.333333333333333,
      "grad_norm": 0.19740572571754456,
      "learning_rate": 4.320096745519793e-05,
      "loss": 0.2317,
      "num_input_tokens_seen": 173520,
      "step": 1140
    },
    {
      "epoch": 6.361111111111111,
      "grad_norm": 0.0646887719631195,
      "learning_rate": 4.3117664567379237e-05,
      "loss": 0.2278,
      "num_input_tokens_seen": 174304,
      "step": 1145
    },
    {
      "epoch": 6.388888888888889,
      "grad_norm": 0.18534712493419647,
      "learning_rate": 4.303393583500628e-05,
      "loss": 0.2358,
      "num_input_tokens_seen": 175040,
      "step": 1150
    },
    {
      "epoch": 6.416666666666667,
      "grad_norm": 0.04154275357723236,
      "learning_rate": 4.2949783226071406e-05,
      "loss": 0.2325,
      "num_input_tokens_seen": 175776,
      "step": 1155
    },
    {
      "epoch": 6.444444444444445,
      "grad_norm": 0.1973147988319397,
      "learning_rate": 4.286520871852987e-05,
      "loss": 0.2291,
      "num_input_tokens_seen": 176512,
      "step": 1160
    },
    {
      "epoch": 6.472222222222222,
      "grad_norm": 0.22881700098514557,
      "learning_rate": 4.278021430025343e-05,
      "loss": 0.2456,
      "num_input_tokens_seen": 177280,
      "step": 1165
    },
    {
      "epoch": 6.5,
      "grad_norm": 0.10801947116851807,
      "learning_rate": 4.2694801968983566e-05,
      "loss": 0.2344,
      "num_input_tokens_seen": 178032,
      "step": 1170
    },
    {
      "epoch": 6.527777777777778,
      "grad_norm": 0.05663140118122101,
      "learning_rate": 4.260897373228456e-05,
      "loss": 0.2234,
      "num_input_tokens_seen": 178784,
      "step": 1175
    },
    {
      "epoch": 6.555555555555555,
      "grad_norm": 0.08504916727542877,
      "learning_rate": 4.2522731607496275e-05,
      "loss": 0.2243,
      "num_input_tokens_seen": 179568,
      "step": 1180
    },
    {
      "epoch": 6.583333333333333,
      "grad_norm": 0.21337293088436127,
      "learning_rate": 4.2436077621686786e-05,
      "loss": 0.236,
      "num_input_tokens_seen": 180336,
      "step": 1185
    },
    {
      "epoch": 6.611111111111111,
      "grad_norm": 0.2441207319498062,
      "learning_rate": 4.234901381160469e-05,
      "loss": 0.2335,
      "num_input_tokens_seen": 181056,
      "step": 1190
    },
    {
      "epoch": 6.638888888888889,
      "grad_norm": 0.16203121840953827,
      "learning_rate": 4.226154222363124e-05,
      "loss": 0.2302,
      "num_input_tokens_seen": 181840,
      "step": 1195
    },
    {
      "epoch": 6.666666666666667,
      "grad_norm": 0.11439399421215057,
      "learning_rate": 4.21736649137323e-05,
      "loss": 0.2272,
      "num_input_tokens_seen": 182592,
      "step": 1200
    },
    {
      "epoch": 6.694444444444445,
      "grad_norm": 0.30583301186561584,
      "learning_rate": 4.208538394740993e-05,
      "loss": 0.2556,
      "num_input_tokens_seen": 183344,
      "step": 1205
    },
    {
      "epoch": 6.722222222222222,
      "grad_norm": 0.18746714293956757,
      "learning_rate": 4.199670139965393e-05,
      "loss": 0.2201,
      "num_input_tokens_seen": 184112,
      "step": 1210
    },
    {
      "epoch": 6.75,
      "grad_norm": 0.07710317522287369,
      "learning_rate": 4.1907619354892965e-05,
      "loss": 0.2304,
      "num_input_tokens_seen": 184864,
      "step": 1215
    },
    {
      "epoch": 6.777777777777778,
      "grad_norm": 0.21295295655727386,
      "learning_rate": 4.1818139906945694e-05,
      "loss": 0.2436,
      "num_input_tokens_seen": 185632,
      "step": 1220
    },
    {
      "epoch": 6.805555555555555,
      "grad_norm": 0.1958753913640976,
      "learning_rate": 4.172826515897146e-05,
      "loss": 0.2305,
      "num_input_tokens_seen": 186368,
      "step": 1225
    },
    {
      "epoch": 6.833333333333333,
      "grad_norm": 0.21225431561470032,
      "learning_rate": 4.163799722342089e-05,
      "loss": 0.2342,
      "num_input_tokens_seen": 187120,
      "step": 1230
    },
    {
      "epoch": 6.861111111111111,
      "grad_norm": 0.2011038064956665,
      "learning_rate": 4.1547338221986266e-05,
      "loss": 0.2402,
      "num_input_tokens_seen": 187888,
      "step": 1235
    },
    {
      "epoch": 6.888888888888889,
      "grad_norm": 0.21114513278007507,
      "learning_rate": 4.1456290285551596e-05,
      "loss": 0.2297,
      "num_input_tokens_seen": 188672,
      "step": 1240
    },
    {
      "epoch": 6.916666666666667,
      "grad_norm": 0.1804109662771225,
      "learning_rate": 4.13648555541426e-05,
      "loss": 0.2256,
      "num_input_tokens_seen": 189424,
      "step": 1245
    },
    {
      "epoch": 6.944444444444445,
      "grad_norm": 0.08288107067346573,
      "learning_rate": 4.127303617687636e-05,
      "loss": 0.232,
      "num_input_tokens_seen": 190176,
      "step": 1250
    },
    {
      "epoch": 6.972222222222222,
      "grad_norm": 0.13274434208869934,
      "learning_rate": 4.118083431191081e-05,
      "loss": 0.2197,
      "num_input_tokens_seen": 190928,
      "step": 1255
    },
    {
      "epoch": 7.0,
      "grad_norm": 0.10268542170524597,
      "learning_rate": 4.108825212639405e-05,
      "loss": 0.2103,
      "num_input_tokens_seen": 191712,
      "step": 1260
    },
    {
      "epoch": 7.0,
      "eval_loss": 0.23792143166065216,
      "eval_runtime": 0.8418,
      "eval_samples_per_second": 47.518,
      "eval_steps_per_second": 23.759,
      "num_input_tokens_seen": 191712,
      "step": 1260
    },
    {
      "epoch": 7.027777777777778,
      "grad_norm": 0.3135641813278198,
      "learning_rate": 4.099529179641337e-05,
      "loss": 0.2294,
      "num_input_tokens_seen": 192480,
      "step": 1265
    },
    {
      "epoch": 7.055555555555555,
      "grad_norm": 0.36921995878219604,
      "learning_rate": 4.09019555069441e-05,
      "loss": 0.2162,
      "num_input_tokens_seen": 193248,
      "step": 1270
    },
    {
      "epoch": 7.083333333333333,
      "grad_norm": 0.17098984122276306,
      "learning_rate": 4.080824545179828e-05,
      "loss": 0.2273,
      "num_input_tokens_seen": 194000,
      "step": 1275
    },
    {
      "epoch": 7.111111111111111,
      "grad_norm": 0.15590733289718628,
      "learning_rate": 4.071416383357307e-05,
      "loss": 0.2495,
      "num_input_tokens_seen": 194752,
      "step": 1280
    },
    {
      "epoch": 7.138888888888889,
      "grad_norm": 0.23567894101142883,
      "learning_rate": 4.0619712863599e-05,
      "loss": 0.2551,
      "num_input_tokens_seen": 195504,
      "step": 1285
    },
    {
      "epoch": 7.166666666666667,
      "grad_norm": 0.1218881830573082,
      "learning_rate": 4.0524894761888e-05,
      "loss": 0.2465,
      "num_input_tokens_seen": 196224,
      "step": 1290
    },
    {
      "epoch": 7.194444444444445,
      "grad_norm": 0.07201294600963593,
      "learning_rate": 4.042971175708118e-05,
      "loss": 0.2236,
      "num_input_tokens_seen": 196976,
      "step": 1295
    },
    {
      "epoch": 7.222222222222222,
      "grad_norm": 0.26333221793174744,
      "learning_rate": 4.0334166086396484e-05,
      "loss": 0.234,
      "num_input_tokens_seen": 197776,
      "step": 1300
    },
    {
      "epoch": 7.25,
      "grad_norm": 0.2225353866815567,
      "learning_rate": 4.0238259995576084e-05,
      "loss": 0.234,
      "num_input_tokens_seen": 198512,
      "step": 1305
    },
    {
      "epoch": 7.277777777777778,
      "grad_norm": 0.23296818137168884,
      "learning_rate": 4.0141995738833625e-05,
      "loss": 0.2295,
      "num_input_tokens_seen": 199280,
      "step": 1310
    },
    {
      "epoch": 7.305555555555555,
      "grad_norm": 0.10952712595462799,
      "learning_rate": 4.0045375578801214e-05,
      "loss": 0.238,
      "num_input_tokens_seen": 200032,
      "step": 1315
    },
    {
      "epoch": 7.333333333333333,
      "grad_norm": 0.21747390925884247,
      "learning_rate": 3.994840178647623e-05,
      "loss": 0.2259,
      "num_input_tokens_seen": 200768,
      "step": 1320
    },
    {
      "epoch": 7.361111111111111,
      "grad_norm": 0.10875441133975983,
      "learning_rate": 3.985107664116798e-05,
      "loss": 0.2207,
      "num_input_tokens_seen": 201552,
      "step": 1325
    },
    {
      "epoch": 7.388888888888889,
      "grad_norm": 0.1415949910879135,
      "learning_rate": 3.9753402430444116e-05,
      "loss": 0.224,
      "num_input_tokens_seen": 202320,
      "step": 1330
    },
    {
      "epoch": 7.416666666666667,
      "grad_norm": 0.36580806970596313,
      "learning_rate": 3.9655381450076826e-05,
      "loss": 0.2026,
      "num_input_tokens_seen": 203088,
      "step": 1335
    },
    {
      "epoch": 7.444444444444445,
      "grad_norm": 0.5669692754745483,
      "learning_rate": 3.955701600398892e-05,
      "loss": 0.2945,
      "num_input_tokens_seen": 203824,
      "step": 1340
    },
    {
      "epoch": 7.472222222222222,
      "grad_norm": 0.2120898813009262,
      "learning_rate": 3.945830840419966e-05,
      "loss": 0.2271,
      "num_input_tokens_seen": 204576,
      "step": 1345
    },
    {
      "epoch": 7.5,
      "grad_norm": 0.19381973147392273,
      "learning_rate": 3.935926097077045e-05,
      "loss": 0.2358,
      "num_input_tokens_seen": 205312,
      "step": 1350
    },
    {
      "epoch": 7.527777777777778,
      "grad_norm": 0.07539873570203781,
      "learning_rate": 3.925987603175023e-05,
      "loss": 0.2252,
      "num_input_tokens_seen": 206080,
      "step": 1355
    },
    {
      "epoch": 7.555555555555555,
      "grad_norm": 0.09191757440567017,
      "learning_rate": 3.916015592312082e-05,
      "loss": 0.2275,
      "num_input_tokens_seen": 206848,
      "step": 1360
    },
    {
      "epoch": 7.583333333333333,
      "grad_norm": 0.24951884150505066,
      "learning_rate": 3.9060102988742e-05,
      "loss": 0.2218,
      "num_input_tokens_seen": 207616,
| "step": 1365 | |
| }, | |
| { | |
| "epoch": 7.611111111111111, | |
| "grad_norm": 0.08662799745798111, | |
| "learning_rate": 3.8959719580296415e-05, | |
| "loss": 0.2411, | |
| "num_input_tokens_seen": 208352, | |
| "step": 1370 | |
| }, | |
| { | |
| "epoch": 7.638888888888889, | |
| "grad_norm": 0.0688060075044632, | |
| "learning_rate": 3.885900805723429e-05, | |
| "loss": 0.2321, | |
| "num_input_tokens_seen": 209104, | |
| "step": 1375 | |
| }, | |
| { | |
| "epoch": 7.666666666666667, | |
| "grad_norm": 0.0747101828455925, | |
| "learning_rate": 3.875797078671798e-05, | |
| "loss": 0.2197, | |
| "num_input_tokens_seen": 209824, | |
| "step": 1380 | |
| }, | |
| { | |
| "epoch": 7.694444444444445, | |
| "grad_norm": 0.284817636013031, | |
| "learning_rate": 3.865661014356635e-05, | |
| "loss": 0.2272, | |
| "num_input_tokens_seen": 210576, | |
| "step": 1385 | |
| }, | |
| { | |
| "epoch": 7.722222222222222, | |
| "grad_norm": 0.09095483273267746, | |
| "learning_rate": 3.855492851019893e-05, | |
| "loss": 0.2434, | |
| "num_input_tokens_seen": 211376, | |
| "step": 1390 | |
| }, | |
| { | |
| "epoch": 7.75, | |
| "grad_norm": 0.08101051300764084, | |
| "learning_rate": 3.8452928276579916e-05, | |
| "loss": 0.2328, | |
| "num_input_tokens_seen": 212128, | |
| "step": 1395 | |
| }, | |
| { | |
| "epoch": 7.777777777777778, | |
| "grad_norm": 0.2549460828304291, | |
| "learning_rate": 3.835061184016203e-05, | |
| "loss": 0.236, | |
| "num_input_tokens_seen": 212928, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 7.805555555555555, | |
| "grad_norm": 0.20550718903541565, | |
| "learning_rate": 3.824798160583012e-05, | |
| "loss": 0.2321, | |
| "num_input_tokens_seen": 213696, | |
| "step": 1405 | |
| }, | |
| { | |
| "epoch": 7.833333333333333, | |
| "grad_norm": 0.11653517186641693, | |
| "learning_rate": 3.814503998584471e-05, | |
| "loss": 0.2256, | |
| "num_input_tokens_seen": 214464, | |
| "step": 1410 | |
| }, | |
| { | |
| "epoch": 7.861111111111111, | |
| "grad_norm": 0.19850236177444458, | |
| "learning_rate": 3.804178939978517e-05, | |
| "loss": 0.2198, | |
| "num_input_tokens_seen": 215248, | |
| "step": 1415 | |
| }, | |
| { | |
| "epoch": 7.888888888888889, | |
| "grad_norm": 0.11180128902196884, | |
| "learning_rate": 3.7938232274493e-05, | |
| "loss": 0.2279, | |
| "num_input_tokens_seen": 216016, | |
| "step": 1420 | |
| }, | |
| { | |
| "epoch": 7.916666666666667, | |
| "grad_norm": 0.35185161232948303, | |
| "learning_rate": 3.783437104401469e-05, | |
| "loss": 0.2001, | |
| "num_input_tokens_seen": 216784, | |
| "step": 1425 | |
| }, | |
| { | |
| "epoch": 7.944444444444445, | |
| "grad_norm": 0.12019939720630646, | |
| "learning_rate": 3.773020814954453e-05, | |
| "loss": 0.2461, | |
| "num_input_tokens_seen": 217552, | |
| "step": 1430 | |
| }, | |
| { | |
| "epoch": 7.972222222222222, | |
| "grad_norm": 0.23168745636940002, | |
| "learning_rate": 3.762574603936725e-05, | |
| "loss": 0.2246, | |
| "num_input_tokens_seen": 218320, | |
| "step": 1435 | |
| }, | |
| { | |
| "epoch": 8.0, | |
| "grad_norm": 0.20853398740291595, | |
| "learning_rate": 3.752098716880045e-05, | |
| "loss": 0.24, | |
| "num_input_tokens_seen": 219072, | |
| "step": 1440 | |
| }, | |
| { | |
| "epoch": 8.0, | |
| "eval_loss": 0.24177499115467072, | |
| "eval_runtime": 0.8427, | |
| "eval_samples_per_second": 47.465, | |
| "eval_steps_per_second": 23.732, | |
| "num_input_tokens_seen": 219072, | |
| "step": 1440 | |
| }, | |
| { | |
| "epoch": 8.027777777777779, | |
| "grad_norm": 0.15344388782978058, | |
| "learning_rate": 3.74159340001369e-05, | |
| "loss": 0.2449, | |
| "num_input_tokens_seen": 219824, | |
| "step": 1445 | |
| }, | |
| { | |
| "epoch": 8.055555555555555, | |
| "grad_norm": 0.14319944381713867, | |
| "learning_rate": 3.731058900258668e-05, | |
| "loss": 0.2346, | |
| "num_input_tokens_seen": 220608, | |
| "step": 1450 | |
| }, | |
| { | |
| "epoch": 8.083333333333334, | |
| "grad_norm": 0.24495133757591248, | |
| "learning_rate": 3.7204954652219104e-05, | |
| "loss": 0.2317, | |
| "num_input_tokens_seen": 221344, | |
| "step": 1455 | |
| }, | |
| { | |
| "epoch": 8.11111111111111, | |
| "grad_norm": 0.23096558451652527, | |
| "learning_rate": 3.7099033431904575e-05, | |
| "loss": 0.2173, | |
| "num_input_tokens_seen": 222096, | |
| "step": 1460 | |
| }, | |
| { | |
| "epoch": 8.13888888888889, | |
| "grad_norm": 0.15797853469848633, | |
| "learning_rate": 3.699282783125616e-05, | |
| "loss": 0.2158, | |
| "num_input_tokens_seen": 222864, | |
| "step": 1465 | |
| }, | |
| { | |
| "epoch": 8.166666666666666, | |
| "grad_norm": 0.5462031364440918, | |
| "learning_rate": 3.688634034657115e-05, | |
| "loss": 0.2098, | |
| "num_input_tokens_seen": 223648, | |
| "step": 1470 | |
| }, | |
| { | |
| "epoch": 8.194444444444445, | |
| "grad_norm": 0.5156317949295044, | |
| "learning_rate": 3.6779573480772325e-05, | |
| "loss": 0.2215, | |
| "num_input_tokens_seen": 224448, | |
| "step": 1475 | |
| }, | |
| { | |
| "epoch": 8.222222222222221, | |
| "grad_norm": 1.479132056236267, | |
| "learning_rate": 3.6672529743349146e-05, | |
| "loss": 0.255, | |
| "num_input_tokens_seen": 225184, | |
| "step": 1480 | |
| }, | |
| { | |
| "epoch": 8.25, | |
| "grad_norm": 1.7528244256973267, | |
| "learning_rate": 3.656521165029879e-05, | |
| "loss": 0.2673, | |
| "num_input_tokens_seen": 225936, | |
| "step": 1485 | |
| }, | |
| { | |
| "epoch": 8.277777777777779, | |
| "grad_norm": 0.23426023125648499, | |
| "learning_rate": 3.6457621724066964e-05, | |
| "loss": 0.2224, | |
| "num_input_tokens_seen": 226672, | |
| "step": 1490 | |
| }, | |
| { | |
| "epoch": 8.305555555555555, | |
| "grad_norm": 0.4958864152431488, | |
| "learning_rate": 3.634976249348867e-05, | |
| "loss": 0.2324, | |
| "num_input_tokens_seen": 227424, | |
| "step": 1495 | |
| }, | |
| { | |
| "epoch": 8.333333333333334, | |
| "grad_norm": 0.5171009302139282, | |
| "learning_rate": 3.6241636493728736e-05, | |
| "loss": 0.2379, | |
| "num_input_tokens_seen": 228208, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 8.36111111111111, | |
| "grad_norm": 0.10480030626058578, | |
| "learning_rate": 3.613324626622224e-05, | |
| "loss": 0.2496, | |
| "num_input_tokens_seen": 228992, | |
| "step": 1505 | |
| }, | |
| { | |
| "epoch": 8.38888888888889, | |
| "grad_norm": 0.29571622610092163, | |
| "learning_rate": 3.602459435861475e-05, | |
| "loss": 0.2361, | |
| "num_input_tokens_seen": 229744, | |
| "step": 1510 | |
| }, | |
| { | |
| "epoch": 8.416666666666666, | |
| "grad_norm": 0.1226486787199974, | |
| "learning_rate": 3.591568332470249e-05, | |
| "loss": 0.2283, | |
| "num_input_tokens_seen": 230496, | |
| "step": 1515 | |
| }, | |
| { | |
| "epoch": 8.444444444444445, | |
| "grad_norm": 0.48671409487724304, | |
| "learning_rate": 3.5806515724372274e-05, | |
| "loss": 0.2484, | |
| "num_input_tokens_seen": 231248, | |
| "step": 1520 | |
| }, | |
| { | |
| "epoch": 8.472222222222221, | |
| "grad_norm": 0.09926868230104446, | |
| "learning_rate": 3.569709412354136e-05, | |
| "loss": 0.2262, | |
| "num_input_tokens_seen": 232000, | |
| "step": 1525 | |
| }, | |
| { | |
| "epoch": 8.5, | |
| "grad_norm": 0.10085448622703552, | |
| "learning_rate": 3.5587421094097115e-05, | |
| "loss": 0.2362, | |
| "num_input_tokens_seen": 232768, | |
| "step": 1530 | |
| }, | |
| { | |
| "epoch": 8.527777777777779, | |
| "grad_norm": 0.24544471502304077, | |
| "learning_rate": 3.5477499213836616e-05, | |
| "loss": 0.2216, | |
| "num_input_tokens_seen": 233568, | |
| "step": 1535 | |
| }, | |
| { | |
| "epoch": 8.555555555555555, | |
| "grad_norm": 0.23776550590991974, | |
| "learning_rate": 3.536733106640598e-05, | |
| "loss": 0.2283, | |
| "num_input_tokens_seen": 234320, | |
| "step": 1540 | |
| }, | |
| { | |
| "epoch": 8.583333333333334, | |
| "grad_norm": 0.34067851305007935, | |
| "learning_rate": 3.525691924123971e-05, | |
| "loss": 0.2336, | |
| "num_input_tokens_seen": 235040, | |
| "step": 1545 | |
| }, | |
| { | |
| "epoch": 8.61111111111111, | |
| "grad_norm": 0.35513660311698914, | |
| "learning_rate": 3.5146266333499795e-05, | |
| "loss": 0.2289, | |
| "num_input_tokens_seen": 235824, | |
| "step": 1550 | |
| }, | |
| { | |
| "epoch": 8.63888888888889, | |
| "grad_norm": 0.16709186136722565, | |
| "learning_rate": 3.503537494401473e-05, | |
| "loss": 0.248, | |
| "num_input_tokens_seen": 236592, | |
| "step": 1555 | |
| }, | |
| { | |
| "epoch": 8.666666666666666, | |
| "grad_norm": 0.1513429880142212, | |
| "learning_rate": 3.4924247679218375e-05, | |
| "loss": 0.2297, | |
| "num_input_tokens_seen": 237344, | |
| "step": 1560 | |
| }, | |
| { | |
| "epoch": 8.694444444444445, | |
| "grad_norm": 0.13434197008609772, | |
| "learning_rate": 3.481288715108868e-05, | |
| "loss": 0.2316, | |
| "num_input_tokens_seen": 238080, | |
| "step": 1565 | |
| }, | |
| { | |
| "epoch": 8.722222222222221, | |
| "grad_norm": 0.15651777386665344, | |
| "learning_rate": 3.4701295977086324e-05, | |
| "loss": 0.2211, | |
| "num_input_tokens_seen": 238816, | |
| "step": 1570 | |
| }, | |
| { | |
| "epoch": 8.75, | |
| "grad_norm": 0.14915582537651062, | |
| "learning_rate": 3.4589476780093166e-05, | |
| "loss": 0.2264, | |
| "num_input_tokens_seen": 239568, | |
| "step": 1575 | |
| }, | |
| { | |
| "epoch": 8.777777777777779, | |
| "grad_norm": 0.16734644770622253, | |
| "learning_rate": 3.44774321883506e-05, | |
| "loss": 0.2365, | |
| "num_input_tokens_seen": 240352, | |
| "step": 1580 | |
| }, | |
| { | |
| "epoch": 8.805555555555555, | |
| "grad_norm": 0.1950577050447464, | |
| "learning_rate": 3.436516483539781e-05, | |
| "loss": 0.2298, | |
| "num_input_tokens_seen": 241120, | |
| "step": 1585 | |
| }, | |
| { | |
| "epoch": 8.833333333333334, | |
| "grad_norm": 0.23239564895629883, | |
| "learning_rate": 3.42526773600098e-05, | |
| "loss": 0.2276, | |
| "num_input_tokens_seen": 241856, | |
| "step": 1590 | |
| }, | |
| { | |
| "epoch": 8.86111111111111, | |
| "grad_norm": 0.23152494430541992, | |
| "learning_rate": 3.4139972406135464e-05, | |
| "loss": 0.2219, | |
| "num_input_tokens_seen": 242608, | |
| "step": 1595 | |
| }, | |
| { | |
| "epoch": 8.88888888888889, | |
| "grad_norm": 0.2428346872329712, | |
| "learning_rate": 3.402705262283537e-05, | |
| "loss": 0.2305, | |
| "num_input_tokens_seen": 243360, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 8.916666666666666, | |
| "grad_norm": 0.21203842759132385, | |
| "learning_rate": 3.39139206642195e-05, | |
| "loss": 0.2253, | |
| "num_input_tokens_seen": 244128, | |
| "step": 1605 | |
| }, | |
| { | |
| "epoch": 8.944444444444445, | |
| "grad_norm": 0.29292061924934387, | |
| "learning_rate": 3.3800579189384944e-05, | |
| "loss": 0.2334, | |
| "num_input_tokens_seen": 244896, | |
| "step": 1610 | |
| }, | |
| { | |
| "epoch": 8.972222222222221, | |
| "grad_norm": 0.14690163731575012, | |
| "learning_rate": 3.3687030862353286e-05, | |
| "loss": 0.232, | |
| "num_input_tokens_seen": 245664, | |
| "step": 1615 | |
| }, | |
| { | |
| "epoch": 9.0, | |
| "grad_norm": 0.25889158248901367, | |
| "learning_rate": 3.357327835200807e-05, | |
| "loss": 0.231, | |
| "num_input_tokens_seen": 246416, | |
| "step": 1620 | |
| }, | |
| { | |
| "epoch": 9.0, | |
| "eval_loss": 0.2354724407196045, | |
| "eval_runtime": 0.8507, | |
| "eval_samples_per_second": 47.02, | |
| "eval_steps_per_second": 23.51, | |
| "num_input_tokens_seen": 246416, | |
| "step": 1620 | |
| }, | |
| { | |
| "epoch": 9.027777777777779, | |
| "grad_norm": 0.15429404377937317, | |
| "learning_rate": 3.3459324332032035e-05, | |
| "loss": 0.2321, | |
| "num_input_tokens_seen": 247184, | |
| "step": 1625 | |
| }, | |
| { | |
| "epoch": 9.055555555555555, | |
| "grad_norm": 0.21661336719989777, | |
| "learning_rate": 3.3345171480844275e-05, | |
| "loss": 0.212, | |
| "num_input_tokens_seen": 247936, | |
| "step": 1630 | |
| }, | |
| { | |
| "epoch": 9.083333333333334, | |
| "grad_norm": 0.36724531650543213, | |
| "learning_rate": 3.32308224815373e-05, | |
| "loss": 0.2297, | |
| "num_input_tokens_seen": 248688, | |
| "step": 1635 | |
| }, | |
| { | |
| "epoch": 9.11111111111111, | |
| "grad_norm": 0.2442227602005005, | |
| "learning_rate": 3.311628002181398e-05, | |
| "loss": 0.2371, | |
| "num_input_tokens_seen": 249472, | |
| "step": 1640 | |
| }, | |
| { | |
| "epoch": 9.13888888888889, | |
| "grad_norm": 0.3292847275733948, | |
| "learning_rate": 3.3001546793924285e-05, | |
| "loss": 0.2321, | |
| "num_input_tokens_seen": 250224, | |
| "step": 1645 | |
| }, | |
| { | |
| "epoch": 9.166666666666666, | |
| "grad_norm": 0.22451095283031464, | |
| "learning_rate": 3.288662549460216e-05, | |
| "loss": 0.2218, | |
| "num_input_tokens_seen": 250960, | |
| "step": 1650 | |
| }, | |
| { | |
| "epoch": 9.194444444444445, | |
| "grad_norm": 0.578140914440155, | |
| "learning_rate": 3.277151882500199e-05, | |
| "loss": 0.2258, | |
| "num_input_tokens_seen": 251728, | |
| "step": 1655 | |
| }, | |
| { | |
| "epoch": 9.222222222222221, | |
| "grad_norm": 0.15085914731025696, | |
| "learning_rate": 3.26562294906352e-05, | |
| "loss": 0.2476, | |
| "num_input_tokens_seen": 252512, | |
| "step": 1660 | |
| }, | |
| { | |
| "epoch": 9.25, | |
| "grad_norm": 0.3858492076396942, | |
| "learning_rate": 3.254076020130664e-05, | |
| "loss": 0.2142, | |
| "num_input_tokens_seen": 253280, | |
| "step": 1665 | |
| }, | |
| { | |
| "epoch": 9.277777777777779, | |
| "grad_norm": 0.41480836272239685, | |
| "learning_rate": 3.242511367105087e-05, | |
| "loss": 0.2091, | |
| "num_input_tokens_seen": 254032, | |
| "step": 1670 | |
| }, | |
| { | |
| "epoch": 9.305555555555555, | |
| "grad_norm": 0.6945786476135254, | |
| "learning_rate": 3.230929261806842e-05, | |
| "loss": 0.1549, | |
| "num_input_tokens_seen": 254800, | |
| "step": 1675 | |
| }, | |
| { | |
| "epoch": 9.333333333333334, | |
| "grad_norm": 3.4180619716644287, | |
| "learning_rate": 3.2193299764661845e-05, | |
| "loss": 0.2614, | |
| "num_input_tokens_seen": 255584, | |
| "step": 1680 | |
| }, | |
| { | |
| "epoch": 9.36111111111111, | |
| "grad_norm": 2.3858699798583984, | |
| "learning_rate": 3.207713783717176e-05, | |
| "loss": 0.3004, | |
| "num_input_tokens_seen": 256368, | |
| "step": 1685 | |
| }, | |
| { | |
| "epoch": 9.38888888888889, | |
| "grad_norm": 13.90650463104248, | |
| "learning_rate": 3.1960809565912794e-05, | |
| "loss": 0.2912, | |
| "num_input_tokens_seen": 257104, | |
| "step": 1690 | |
| }, | |
| { | |
| "epoch": 9.416666666666666, | |
| "grad_norm": 23.31499671936035, | |
| "learning_rate": 3.1844317685109354e-05, | |
| "loss": 0.4081, | |
| "num_input_tokens_seen": 257856, | |
| "step": 1695 | |
| }, | |
| { | |
| "epoch": 9.444444444444445, | |
| "grad_norm": 3.4871859550476074, | |
| "learning_rate": 3.1727664932831394e-05, | |
| "loss": 0.3059, | |
| "num_input_tokens_seen": 258608, | |
| "step": 1700 | |
| }, | |
| { | |
| "epoch": 9.472222222222221, | |
| "grad_norm": 5.127751350402832, | |
| "learning_rate": 3.161085405093006e-05, | |
| "loss": 0.2625, | |
| "num_input_tokens_seen": 259344, | |
| "step": 1705 | |
| }, | |
| { | |
| "epoch": 9.5, | |
| "grad_norm": 1.413158655166626, | |
| "learning_rate": 3.149388778497323e-05, | |
| "loss": 0.2651, | |
| "num_input_tokens_seen": 260112, | |
| "step": 1710 | |
| }, | |
| { | |
| "epoch": 9.527777777777779, | |
| "grad_norm": 0.344842791557312, | |
| "learning_rate": 3.137676888418099e-05, | |
| "loss": 0.2377, | |
| "num_input_tokens_seen": 260864, | |
| "step": 1715 | |
| }, | |
| { | |
| "epoch": 9.555555555555555, | |
| "grad_norm": 0.6449137330055237, | |
| "learning_rate": 3.125950010136104e-05, | |
| "loss": 0.2376, | |
| "num_input_tokens_seen": 261632, | |
| "step": 1720 | |
| }, | |
| { | |
| "epoch": 9.583333333333334, | |
| "grad_norm": 0.5378340482711792, | |
| "learning_rate": 3.114208419284391e-05, | |
| "loss": 0.2311, | |
| "num_input_tokens_seen": 262368, | |
| "step": 1725 | |
| }, | |
| { | |
| "epoch": 9.61111111111111, | |
| "grad_norm": 0.503693699836731, | |
| "learning_rate": 3.102452391841828e-05, | |
| "loss": 0.2083, | |
| "num_input_tokens_seen": 263136, | |
| "step": 1730 | |
| }, | |
| { | |
| "epoch": 9.63888888888889, | |
| "grad_norm": 0.7619789838790894, | |
| "learning_rate": 3.090682204126604e-05, | |
| "loss": 0.2502, | |
| "num_input_tokens_seen": 263872, | |
| "step": 1735 | |
| }, | |
| { | |
| "epoch": 9.666666666666666, | |
| "grad_norm": 0.6181730031967163, | |
| "learning_rate": 3.078898132789735e-05, | |
| "loss": 0.2408, | |
| "num_input_tokens_seen": 264608, | |
| "step": 1740 | |
| }, | |
| { | |
| "epoch": 9.694444444444445, | |
| "grad_norm": 0.521176278591156, | |
| "learning_rate": 3.0671004548085675e-05, | |
| "loss": 0.2263, | |
| "num_input_tokens_seen": 265328, | |
| "step": 1745 | |
| }, | |
| { | |
| "epoch": 9.722222222222221, | |
| "grad_norm": 0.3363834321498871, | |
| "learning_rate": 3.0552894474802584e-05, | |
| "loss": 0.2311, | |
| "num_input_tokens_seen": 266112, | |
| "step": 1750 | |
| }, | |
| { | |
| "epoch": 9.75, | |
| "grad_norm": 0.22221645712852478, | |
| "learning_rate": 3.043465388415267e-05, | |
| "loss": 0.2469, | |
| "num_input_tokens_seen": 266864, | |
| "step": 1755 | |
| }, | |
| { | |
| "epoch": 9.777777777777779, | |
| "grad_norm": 0.19677917659282684, | |
| "learning_rate": 3.0316285555308233e-05, | |
| "loss": 0.2179, | |
| "num_input_tokens_seen": 267600, | |
| "step": 1760 | |
| }, | |
| { | |
| "epoch": 9.805555555555555, | |
| "grad_norm": 0.3606593608856201, | |
| "learning_rate": 3.0197792270443982e-05, | |
| "loss": 0.2245, | |
| "num_input_tokens_seen": 268384, | |
| "step": 1765 | |
| }, | |
| { | |
| "epoch": 9.833333333333334, | |
| "grad_norm": 0.25018107891082764, | |
| "learning_rate": 3.0079176814671656e-05, | |
| "loss": 0.2253, | |
| "num_input_tokens_seen": 269168, | |
| "step": 1770 | |
| }, | |
| { | |
| "epoch": 9.86111111111111, | |
| "grad_norm": 0.2878740727901459, | |
| "learning_rate": 2.9960441975974534e-05, | |
| "loss": 0.2276, | |
| "num_input_tokens_seen": 269904, | |
| "step": 1775 | |
| }, | |
| { | |
| "epoch": 9.88888888888889, | |
| "grad_norm": 0.3176515996456146, | |
| "learning_rate": 2.9841590545141906e-05, | |
| "loss": 0.2348, | |
| "num_input_tokens_seen": 270656, | |
| "step": 1780 | |
| }, | |
| { | |
| "epoch": 9.916666666666666, | |
| "grad_norm": 0.26501137018203735, | |
| "learning_rate": 2.9722625315703512e-05, | |
| "loss": 0.2349, | |
| "num_input_tokens_seen": 271408, | |
| "step": 1785 | |
| }, | |
| { | |
| "epoch": 9.944444444444445, | |
| "grad_norm": 0.13339616358280182, | |
| "learning_rate": 2.9603549083863847e-05, | |
| "loss": 0.2326, | |
| "num_input_tokens_seen": 272192, | |
| "step": 1790 | |
| }, | |
| { | |
| "epoch": 9.972222222222221, | |
| "grad_norm": 0.09521733969449997, | |
| "learning_rate": 2.9484364648436437e-05, | |
| "loss": 0.2394, | |
| "num_input_tokens_seen": 272960, | |
| "step": 1795 | |
| }, | |
| { | |
| "epoch": 10.0, | |
| "grad_norm": 0.13899917900562286, | |
| "learning_rate": 2.9365074810778094e-05, | |
| "loss": 0.2325, | |
| "num_input_tokens_seen": 273712, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 10.0, | |
| "eval_loss": 0.23395749926567078, | |
| "eval_runtime": 0.8436, | |
| "eval_samples_per_second": 47.418, | |
| "eval_steps_per_second": 23.709, | |
| "num_input_tokens_seen": 273712, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 10.027777777777779, | |
| "grad_norm": 0.37314531207084656, | |
| "learning_rate": 2.9245682374723016e-05, | |
| "loss": 0.2183, | |
| "num_input_tokens_seen": 274480, | |
| "step": 1805 | |
| }, | |
| { | |
| "epoch": 10.055555555555555, | |
| "grad_norm": 0.40587741136550903, | |
| "learning_rate": 2.9126190146516942e-05, | |
| "loss": 0.2281, | |
| "num_input_tokens_seen": 275264, | |
| "step": 1810 | |
| }, | |
| { | |
| "epoch": 10.083333333333334, | |
| "grad_norm": 0.2202170491218567, | |
| "learning_rate": 2.9006600934751145e-05, | |
| "loss": 0.2193, | |
| "num_input_tokens_seen": 276016, | |
| "step": 1815 | |
| }, | |
| { | |
| "epoch": 10.11111111111111, | |
| "grad_norm": 0.1918032318353653, | |
| "learning_rate": 2.888691755029642e-05, | |
| "loss": 0.2351, | |
| "num_input_tokens_seen": 276752, | |
| "step": 1820 | |
| }, | |
| { | |
| "epoch": 10.13888888888889, | |
| "grad_norm": 0.22308309376239777, | |
| "learning_rate": 2.876714280623708e-05, | |
| "loss": 0.2296, | |
| "num_input_tokens_seen": 277520, | |
| "step": 1825 | |
| }, | |
| { | |
| "epoch": 10.166666666666666, | |
| "grad_norm": 0.17808394134044647, | |
| "learning_rate": 2.8647279517804754e-05, | |
| "loss": 0.2205, | |
| "num_input_tokens_seen": 278272, | |
| "step": 1830 | |
| }, | |
| { | |
| "epoch": 10.194444444444445, | |
| "grad_norm": 0.287969172000885, | |
| "learning_rate": 2.8527330502312248e-05, | |
| "loss": 0.2259, | |
| "num_input_tokens_seen": 279040, | |
| "step": 1835 | |
| }, | |
| { | |
| "epoch": 10.222222222222221, | |
| "grad_norm": 0.1247008889913559, | |
| "learning_rate": 2.8407298579087365e-05, | |
| "loss": 0.2343, | |
| "num_input_tokens_seen": 279792, | |
| "step": 1840 | |
| }, | |
| { | |
| "epoch": 10.25, | |
| "grad_norm": 0.18362240493297577, | |
| "learning_rate": 2.8287186569406566e-05, | |
| "loss": 0.2281, | |
| "num_input_tokens_seen": 280560, | |
| "step": 1845 | |
| }, | |
| { | |
| "epoch": 10.277777777777779, | |
| "grad_norm": 0.3308933675289154, | |
| "learning_rate": 2.816699729642871e-05, | |
| "loss": 0.2175, | |
| "num_input_tokens_seen": 281328, | |
| "step": 1850 | |
| }, | |
| { | |
| "epoch": 10.305555555555555, | |
| "grad_norm": 0.19184178113937378, | |
| "learning_rate": 2.8046733585128687e-05, | |
| "loss": 0.2199, | |
| "num_input_tokens_seen": 282112, | |
| "step": 1855 | |
| }, | |
| { | |
| "epoch": 10.333333333333334, | |
| "grad_norm": 0.32785189151763916, | |
| "learning_rate": 2.792639826223101e-05, | |
| "loss": 0.223, | |
| "num_input_tokens_seen": 282880, | |
| "step": 1860 | |
| }, | |
| { | |
| "epoch": 10.36111111111111, | |
| "grad_norm": 0.304923415184021, | |
| "learning_rate": 2.7805994156143376e-05, | |
| "loss": 0.2083, | |
| "num_input_tokens_seen": 283648, | |
| "step": 1865 | |
| }, | |
| { | |
| "epoch": 10.38888888888889, | |
| "grad_norm": 0.4556541442871094, | |
| "learning_rate": 2.7685524096890185e-05, | |
| "loss": 0.2172, | |
| "num_input_tokens_seen": 284464, | |
| "step": 1870 | |
| }, | |
| { | |
| "epoch": 10.416666666666666, | |
| "grad_norm": 0.4782843291759491, | |
| "learning_rate": 2.756499091604603e-05, | |
| "loss": 0.2526, | |
| "num_input_tokens_seen": 285232, | |
| "step": 1875 | |
| }, | |
| { | |
| "epoch": 10.444444444444445, | |
| "grad_norm": 0.4656970798969269, | |
| "learning_rate": 2.744439744666915e-05, | |
| "loss": 0.2165, | |
| "num_input_tokens_seen": 285984, | |
| "step": 1880 | |
| }, | |
| { | |
| "epoch": 10.472222222222221, | |
| "grad_norm": 0.7466927766799927, | |
| "learning_rate": 2.732374652323481e-05, | |
| "loss": 0.2424, | |
| "num_input_tokens_seen": 286752, | |
| "step": 1885 | |
| }, | |
| { | |
| "epoch": 10.5, | |
| "grad_norm": 0.5435920357704163, | |
| "learning_rate": 2.72030409815687e-05, | |
| "loss": 0.2334, | |
| "num_input_tokens_seen": 287520, | |
| "step": 1890 | |
| }, | |
| { | |
| "epoch": 10.527777777777779, | |
| "grad_norm": 0.746811032295227, | |
| "learning_rate": 2.7082283658780288e-05, | |
| "loss": 0.2073, | |
| "num_input_tokens_seen": 288240, | |
| "step": 1895 | |
| }, | |
| { | |
| "epoch": 10.555555555555555, | |
| "grad_norm": 0.601383626461029, | |
| "learning_rate": 2.6961477393196126e-05, | |
| "loss": 0.2345, | |
| "num_input_tokens_seen": 289008, | |
| "step": 1900 | |
| }, | |
| { | |
| "epoch": 10.583333333333334, | |
| "grad_norm": 0.4319969415664673, | |
| "learning_rate": 2.684062502429312e-05, | |
| "loss": 0.2182, | |
| "num_input_tokens_seen": 289776, | |
| "step": 1905 | |
| }, | |
| { | |
| "epoch": 10.61111111111111, | |
| "grad_norm": 0.7417342662811279, | |
| "learning_rate": 2.6719729392631826e-05, | |
| "loss": 0.2547, | |
| "num_input_tokens_seen": 290560, | |
| "step": 1910 | |
| }, | |
| { | |
| "epoch": 10.63888888888889, | |
| "grad_norm": 0.4038422703742981, | |
| "learning_rate": 2.659879333978964e-05, | |
| "loss": 0.217, | |
| "num_input_tokens_seen": 291296, | |
| "step": 1915 | |
| }, | |
| { | |
| "epoch": 10.666666666666666, | |
| "grad_norm": 0.3393801152706146, | |
| "learning_rate": 2.6477819708294064e-05, | |
| "loss": 0.2522, | |
| "num_input_tokens_seen": 292032, | |
| "step": 1920 | |
| }, | |
| { | |
| "epoch": 10.694444444444445, | |
| "grad_norm": 0.3926829397678375, | |
| "learning_rate": 2.635681134155585e-05, | |
| "loss": 0.2324, | |
| "num_input_tokens_seen": 292784, | |
| "step": 1925 | |
| }, | |
| { | |
| "epoch": 10.722222222222221, | |
| "grad_norm": 0.1447238177061081, | |
| "learning_rate": 2.623577108380215e-05, | |
| "loss": 0.2305, | |
| "num_input_tokens_seen": 293520, | |
| "step": 1930 | |
| }, | |
| { | |
| "epoch": 10.75, | |
| "grad_norm": 0.3792881667613983, | |
| "learning_rate": 2.6114701780009753e-05, | |
| "loss": 0.2424, | |
| "num_input_tokens_seen": 294272, | |
| "step": 1935 | |
| }, | |
| { | |
| "epoch": 10.777777777777779, | |
| "grad_norm": 0.1470859944820404, | |
| "learning_rate": 2.5993606275838117e-05, | |
| "loss": 0.2427, | |
| "num_input_tokens_seen": 295008, | |
| "step": 1940 | |
| }, | |
| { | |
| "epoch": 10.805555555555555, | |
| "grad_norm": 0.290236234664917, | |
| "learning_rate": 2.587248741756253e-05, | |
| "loss": 0.2342, | |
| "num_input_tokens_seen": 295776, | |
| "step": 1945 | |
| }, | |
| { | |
| "epoch": 10.833333333333334, | |
| "grad_norm": 0.1666247695684433, | |
| "learning_rate": 2.5751348052007206e-05, | |
| "loss": 0.2315, | |
| "num_input_tokens_seen": 296512, | |
| "step": 1950 | |
| }, | |
| { | |
| "epoch": 10.86111111111111, | |
| "grad_norm": 0.26337382197380066, | |
| "learning_rate": 2.5630191026478368e-05, | |
| "loss": 0.2293, | |
| "num_input_tokens_seen": 297248, | |
| "step": 1955 | |
| }, | |
| { | |
| "epoch": 10.88888888888889, | |
| "grad_norm": 0.3205804228782654, | |
| "learning_rate": 2.5509019188697343e-05, | |
| "loss": 0.2214, | |
| "num_input_tokens_seen": 298032, | |
| "step": 1960 | |
| }, | |
| { | |
| "epoch": 10.916666666666666, | |
| "grad_norm": 0.3166457414627075, | |
| "learning_rate": 2.5387835386733584e-05, | |
| "loss": 0.2314, | |
| "num_input_tokens_seen": 298800, | |
| "step": 1965 | |
| }, | |
| { | |
| "epoch": 10.944444444444445, | |
| "grad_norm": 0.22426152229309082, | |
| "learning_rate": 2.5266642468937766e-05, | |
| "loss": 0.2428, | |
| "num_input_tokens_seen": 299552, | |
| "step": 1970 | |
| }, | |
| { | |
| "epoch": 10.972222222222221, | |
| "grad_norm": 0.20033201575279236, | |
| "learning_rate": 2.5145443283874848e-05, | |
| "loss": 0.2221, | |
| "num_input_tokens_seen": 300320, | |
| "step": 1975 | |
| }, | |
| { | |
| "epoch": 11.0, | |
| "grad_norm": 0.29905927181243896, | |
| "learning_rate": 2.5024240680257055e-05, | |
| "loss": 0.2153, | |
| "num_input_tokens_seen": 301088, | |
| "step": 1980 | |
| }, | |
| { | |
| "epoch": 11.0, | |
| "eval_loss": 0.23569095134735107, | |
| "eval_runtime": 0.8449, | |
| "eval_samples_per_second": 47.345, | |
| "eval_steps_per_second": 23.673, | |
| "num_input_tokens_seen": 301088, | |
| "step": 1980 | |
| }, | |
| { | |
| "epoch": 11.027777777777779, | |
| "grad_norm": 0.23592987656593323, | |
| "learning_rate": 2.4903037506876997e-05, | |
| "loss": 0.2065, | |
| "num_input_tokens_seen": 301856, | |
| "step": 1985 | |
| }, | |
| { | |
| "epoch": 11.055555555555555, | |
| "grad_norm": 0.3055577576160431, | |
| "learning_rate": 2.4781836612540657e-05, | |
| "loss": 0.239, | |
| "num_input_tokens_seen": 302592, | |
| "step": 1990 | |
| }, | |
| { | |
| "epoch": 11.083333333333334, | |
| "grad_norm": 0.3388107120990753, | |
| "learning_rate": 2.4660640846000453e-05, | |
| "loss": 0.2296, | |
| "num_input_tokens_seen": 303360, | |
| "step": 1995 | |
| }, | |
| { | |
| "epoch": 11.11111111111111, | |
| "grad_norm": 0.3578348755836487, | |
| "learning_rate": 2.4539453055888297e-05, | |
| "loss": 0.2151, | |
| "num_input_tokens_seen": 304096, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 11.13888888888889, | |
| "grad_norm": 0.2506900131702423, | |
| "learning_rate": 2.4418276090648596e-05, | |
| "loss": 0.2218, | |
| "num_input_tokens_seen": 304880, | |
| "step": 2005 | |
| }, | |
| { | |
| "epoch": 11.166666666666666, | |
| "grad_norm": 0.4147559106349945, | |
| "learning_rate": 2.4297112798471326e-05, | |
| "loss": 0.2321, | |
| "num_input_tokens_seen": 305664, | |
| "step": 2010 | |
| }, | |
| { | |
| "epoch": 11.194444444444445, | |
| "grad_norm": 0.47687992453575134, | |
| "learning_rate": 2.4175966027225107e-05, | |
| "loss": 0.2202, | |
| "num_input_tokens_seen": 306448, | |
| "step": 2015 | |
| }, | |
| { | |
| "epoch": 11.222222222222221, | |
| "grad_norm": 0.36644211411476135, | |
| "learning_rate": 2.405483862439023e-05, | |
| "loss": 0.2241, | |
| "num_input_tokens_seen": 307216, | |
| "step": 2020 | |
| }, | |
| { | |
| "epoch": 11.25, | |
| "grad_norm": 0.45731303095817566, | |
| "learning_rate": 2.3933733436991732e-05, | |
| "loss": 0.2306, | |
| "num_input_tokens_seen": 307968, | |
| "step": 2025 | |
| }, | |
| { | |
| "epoch": 11.277777777777779, | |
| "grad_norm": 0.5980037450790405, | |
| "learning_rate": 2.381265331153252e-05, | |
| "loss": 0.2391, | |
| "num_input_tokens_seen": 308720, | |
| "step": 2030 | |
| }, | |
| { | |
| "epoch": 11.305555555555555, | |
| "grad_norm": 0.9406579732894897, | |
| "learning_rate": 2.3691601093926404e-05, | |
| "loss": 0.2059, | |
| "num_input_tokens_seen": 309472, | |
| "step": 2035 | |
| }, | |
| { | |
| "epoch": 11.333333333333334, | |
| "grad_norm": 0.5597310662269592, | |
| "learning_rate": 2.3570579629431267e-05, | |
| "loss": 0.2382, | |
| "num_input_tokens_seen": 310224, | |
| "step": 2040 | |
| }, | |
| { | |
| "epoch": 11.36111111111111, | |
| "grad_norm": 0.5758081674575806, | |
| "learning_rate": 2.344959176258212e-05, | |
| "loss": 0.222, | |
| "num_input_tokens_seen": 311008, | |
| "step": 2045 | |
| }, | |
| { | |
| "epoch": 11.38888888888889, | |
| "grad_norm": 0.6426092386245728, | |
| "learning_rate": 2.3328640337124326e-05, | |
| "loss": 0.211, | |
| "num_input_tokens_seen": 311744, | |
| "step": 2050 | |
| }, | |
| { | |
| "epoch": 11.416666666666666, | |
| "grad_norm": 0.5062636137008667, | |
| "learning_rate": 2.3207728195946688e-05, | |
| "loss": 0.2242, | |
| "num_input_tokens_seen": 312512, | |
| "step": 2055 | |
| }, | |
| { | |
| "epoch": 11.444444444444445, | |
| "grad_norm": 0.6146106123924255, | |
| "learning_rate": 2.3086858181014653e-05, | |
| "loss": 0.2537, | |
| "num_input_tokens_seen": 313248, | |
| "step": 2060 | |
| }, | |
| { | |
| "epoch": 11.472222222222221, | |
| "grad_norm": 0.6537105441093445, | |
| "learning_rate": 2.2966033133303545e-05, | |
| "loss": 0.2294, | |
| "num_input_tokens_seen": 314032, | |
| "step": 2065 | |
| }, | |
| { | |
| "epoch": 11.5, | |
| "grad_norm": 0.6333838105201721, | |
| "learning_rate": 2.2845255892731733e-05, | |
| "loss": 0.2345, | |
| "num_input_tokens_seen": 314784, | |
| "step": 2070 | |
| }, | |
| { | |
| "epoch": 11.527777777777779, | |
| "grad_norm": 0.5479741096496582, | |
| "learning_rate": 2.2724529298093915e-05, | |
| "loss": 0.2169, | |
| "num_input_tokens_seen": 315520, | |
| "step": 2075 | |
| }, | |
| { | |
| "epoch": 11.555555555555555, | |
| "grad_norm": 0.49262702465057373, | |
| "learning_rate": 2.26038561869944e-05, | |
| "loss": 0.2138, | |
| "num_input_tokens_seen": 316288, | |
| "step": 2080 | |
| }, | |
| { | |
| "epoch": 11.583333333333334, | |
| "grad_norm": 0.4937076270580292, | |
| "learning_rate": 2.248323939578039e-05, | |
| "loss": 0.2234, | |
| "num_input_tokens_seen": 317040, | |
| "step": 2085 | |
| }, | |
| { | |
| "epoch": 11.61111111111111, | |
| "grad_norm": 0.5979186296463013, | |
| "learning_rate": 2.2362681759475307e-05, | |
| "loss": 0.2273, | |
| "num_input_tokens_seen": 317776, | |
| "step": 2090 | |
| }, | |
| { | |
| "epoch": 11.63888888888889, | |
| "grad_norm": 0.47289130091667175, | |
| "learning_rate": 2.2242186111712208e-05, | |
| "loss": 0.2244, | |
| "num_input_tokens_seen": 318560, | |
| "step": 2095 | |
| }, | |
| { | |
| "epoch": 11.666666666666666, | |
| "grad_norm": 0.5286301970481873, | |
| "learning_rate": 2.212175528466712e-05, | |
| "loss": 0.2486, | |
| "num_input_tokens_seen": 319296, | |
| "step": 2100 | |
| }, | |
| { | |
| "epoch": 11.694444444444445, | |
| "grad_norm": 0.6446691751480103, | |
| "learning_rate": 2.2001392108992504e-05, | |
| "loss": 0.1969, | |
| "num_input_tokens_seen": 320064, | |
| "step": 2105 | |
| }, | |
| { | |
| "epoch": 11.722222222222221, | |
| "grad_norm": 0.9104120135307312, | |
| "learning_rate": 2.1881099413750733e-05, | |
| "loss": 0.2327, | |
| "num_input_tokens_seen": 320800, | |
| "step": 2110 | |
| }, | |
| { | |
| "epoch": 11.75, | |
| "grad_norm": 1.2983386516571045, | |
| "learning_rate": 2.1760880026347562e-05, | |
| "loss": 0.2108, | |
| "num_input_tokens_seen": 321536, | |
| "step": 2115 | |
| }, | |
| { | |
| "epoch": 11.777777777777779, | |
| "grad_norm": 0.8108537793159485, | |
| "learning_rate": 2.16407367724657e-05, | |
| "loss": 0.2259, | |
| "num_input_tokens_seen": 322320, | |
| "step": 2120 | |
| }, | |
| { | |
| "epoch": 11.805555555555555, | |
| "grad_norm": 0.9631186723709106, | |
| "learning_rate": 2.1520672475998373e-05, | |
| "loss": 0.2064, | |
| "num_input_tokens_seen": 323056, | |
| "step": 2125 | |
| }, | |
| { | |
| "epoch": 11.833333333333334, | |
| "grad_norm": 0.7661670446395874, | |
| "learning_rate": 2.140068995898297e-05, | |
| "loss": 0.2108, | |
| "num_input_tokens_seen": 323824, | |
| "step": 2130 | |
| }, | |
| { | |
| "epoch": 11.86111111111111, | |
| "grad_norm": 0.737259566783905, | |
| "learning_rate": 2.1280792041534714e-05, | |
| "loss": 0.2186, | |
| "num_input_tokens_seen": 324624, | |
| "step": 2135 | |
| }, | |
| { | |
| "epoch": 11.88888888888889, | |
| "grad_norm": 1.0927965641021729, | |
| "learning_rate": 2.116098154178035e-05, | |
| "loss": 0.2016, | |
| "num_input_tokens_seen": 325392, | |
| "step": 2140 | |
| }, | |
| { | |
| "epoch": 11.916666666666666, | |
| "grad_norm": 1.9153894186019897, | |
| "learning_rate": 2.1041261275791933e-05, | |
| "loss": 0.248, | |
| "num_input_tokens_seen": 326144, | |
| "step": 2145 | |
| }, | |
| { | |
| "epoch": 11.944444444444445, | |
| "grad_norm": 2.076587438583374, | |
| "learning_rate": 2.092163405752063e-05, | |
| "loss": 0.201, | |
| "num_input_tokens_seen": 326880, | |
| "step": 2150 | |
| }, | |
| { | |
| "epoch": 11.972222222222221, | |
| "grad_norm": 1.3827784061431885, | |
| "learning_rate": 2.0802102698730574e-05, | |
| "loss": 0.2192, | |
| "num_input_tokens_seen": 327648, | |
| "step": 2155 | |
| }, | |
| { | |
| "epoch": 12.0, | |
| "grad_norm": 6.211891174316406, | |
| "learning_rate": 2.0682670008932785e-05, | |
| "loss": 0.3359, | |
| "num_input_tokens_seen": 328384, | |
| "step": 2160 | |
| }, | |
| { | |
| "epoch": 12.0, | |
| "eval_loss": 0.3373684883117676, | |
| "eval_runtime": 0.8514, | |
| "eval_samples_per_second": 46.983, | |
| "eval_steps_per_second": 23.491, | |
| "num_input_tokens_seen": 328384, | |
| "step": 2160 | |
| }, | |
| { | |
| "epoch": 12.027777777777779, | |
| "grad_norm": 2.2284414768218994, | |
| "learning_rate": 2.0563338795319123e-05, | |
| "loss": 0.2025, | |
| "num_input_tokens_seen": 329136, | |
| "step": 2165 | |
| }, | |
| { | |
| "epoch": 12.055555555555555, | |
| "grad_norm": 1.8171899318695068, | |
| "learning_rate": 2.0444111862696314e-05, | |
| "loss": 0.2236, | |
| "num_input_tokens_seen": 329904, | |
| "step": 2170 | |
| }, | |
| { | |
| "epoch": 12.083333333333334, | |
| "grad_norm": 4.0419111251831055, | |
| "learning_rate": 2.032499201342003e-05, | |
| "loss": 0.222, | |
| "num_input_tokens_seen": 330640, | |
| "step": 2175 | |
| }, | |
| { | |
| "epoch": 12.11111111111111, | |
| "grad_norm": 1.5563691854476929, | |
| "learning_rate": 2.020598204732901e-05, | |
| "loss": 0.2336, | |
| "num_input_tokens_seen": 331360, | |
| "step": 2180 | |
| }, | |
| { | |
| "epoch": 12.13888888888889, | |
| "grad_norm": 1.042913556098938, | |
| "learning_rate": 2.0087084761679245e-05, | |
| "loss": 0.2341, | |
| "num_input_tokens_seen": 332112, | |
| "step": 2185 | |
| }, | |
| { | |
| "epoch": 12.166666666666666, | |
| "grad_norm": 1.2564104795455933, | |
| "learning_rate": 1.996830295107827e-05, | |
| "loss": 0.2747, | |
| "num_input_tokens_seen": 332864, | |
| "step": 2190 | |
| }, | |
| { | |
| "epoch": 12.194444444444445, | |
| "grad_norm": 0.921697735786438, | |
| "learning_rate": 1.9849639407419423e-05, | |
| "loss": 0.2155, | |
| "num_input_tokens_seen": 333648, | |
| "step": 2195 | |
| }, | |
| { | |
| "epoch": 12.222222222222221, | |
| "grad_norm": 1.2880674600601196, | |
| "learning_rate": 1.973109691981627e-05, | |
| "loss": 0.2432, | |
| "num_input_tokens_seen": 334416, | |
| "step": 2200 | |
| }, | |
| { | |
| "epoch": 12.25, | |
| "grad_norm": 1.4810035228729248, | |
| "learning_rate": 1.9612678274537005e-05, | |
| "loss": 0.2355, | |
| "num_input_tokens_seen": 335184, | |
| "step": 2205 | |
| }, | |
| { | |
| "epoch": 12.277777777777779, | |
| "grad_norm": 1.5859031677246094, | |
| "learning_rate": 1.9494386254939e-05, | |
| "loss": 0.197, | |
| "num_input_tokens_seen": 335968, | |
| "step": 2210 | |
| }, | |
| { | |
| "epoch": 12.305555555555555, | |
| "grad_norm": 0.9962775707244873, | |
| "learning_rate": 1.937622364140338e-05, | |
| "loss": 0.2029, | |
| "num_input_tokens_seen": 336736, | |
| "step": 2215 | |
| }, | |
| { | |
| "epoch": 12.333333333333334, | |
| "grad_norm": 0.7494038939476013, | |
| "learning_rate": 1.925819321126964e-05, | |
| "loss": 0.2067, | |
| "num_input_tokens_seen": 337488, | |
| "step": 2220 | |
| }, | |
| { | |
| "epoch": 12.36111111111111, | |
| "grad_norm": 0.8293570876121521, | |
| "learning_rate": 1.9140297738770385e-05, | |
| "loss": 0.2677, | |
| "num_input_tokens_seen": 338240, | |
| "step": 2225 | |
| }, | |
| { | |
| "epoch": 12.38888888888889, | |
| "grad_norm": 1.0475269556045532, | |
| "learning_rate": 1.9022539994966147e-05, | |
| "loss": 0.2074, | |
| "num_input_tokens_seen": 338976, | |
| "step": 2230 | |
| }, | |
| { | |
| "epoch": 12.416666666666666, | |
| "grad_norm": 0.9049837589263916, | |
| "learning_rate": 1.8904922747680204e-05, | |
| "loss": 0.2153, | |
| "num_input_tokens_seen": 339760, | |
| "step": 2235 | |
| }, | |
| { | |
| "epoch": 12.444444444444445, | |
| "grad_norm": 0.732437252998352, | |
| "learning_rate": 1.8787448761433556e-05, | |
| "loss": 0.2247, | |
| "num_input_tokens_seen": 340528, | |
| "step": 2240 | |
| }, | |
| { | |
| "epoch": 12.472222222222221, | |
| "grad_norm": 1.3903310298919678, | |
| "learning_rate": 1.8670120797379958e-05, | |
| "loss": 0.2214, | |
| "num_input_tokens_seen": 341232, | |
| "step": 2245 | |
| }, | |
| { | |
| "epoch": 12.5, | |
| "grad_norm": 0.7470361590385437, | |
| "learning_rate": 1.8552941613240983e-05, | |
| "loss": 0.2235, | |
| "num_input_tokens_seen": 342000, | |
| "step": 2250 | |
| }, | |
| { | |
| "epoch": 12.527777777777779, | |
| "grad_norm": 1.040972352027893, | |
| "learning_rate": 1.8435913963241226e-05, | |
| "loss": 0.197, | |
| "num_input_tokens_seen": 342768, | |
| "step": 2255 | |
| }, | |
| { | |
| "epoch": 12.555555555555555, | |
| "grad_norm": 1.4287666082382202, | |
| "learning_rate": 1.831904059804358e-05, | |
| "loss": 0.2268, | |
| "num_input_tokens_seen": 343568, | |
| "step": 2260 | |
| }, | |
| { | |
| "epoch": 12.583333333333334, | |
| "grad_norm": 0.8281350135803223, | |
| "learning_rate": 1.8202324264684544e-05, | |
| "loss": 0.2185, | |
| "num_input_tokens_seen": 344304, | |
| "step": 2265 | |
| }, | |
| { | |
| "epoch": 12.61111111111111, | |
| "grad_norm": 1.3658063411712646, | |
| "learning_rate": 1.8085767706509712e-05, | |
| "loss": 0.1767, | |
| "num_input_tokens_seen": 345088, | |
| "step": 2270 | |
| }, | |
| { | |
| "epoch": 12.63888888888889, | |
| "grad_norm": 0.9685779809951782, | |
| "learning_rate": 1.7969373663109234e-05, | |
| "loss": 0.2127, | |
| "num_input_tokens_seen": 345856, | |
| "step": 2275 | |
| }, | |
| { | |
| "epoch": 12.666666666666666, | |
| "grad_norm": 1.1758708953857422, | |
| "learning_rate": 1.7853144870253458e-05, | |
| "loss": 0.2382, | |
| "num_input_tokens_seen": 346608, | |
| "step": 2280 | |
| }, | |
| { | |
| "epoch": 12.694444444444445, | |
| "grad_norm": 0.8447101712226868, | |
| "learning_rate": 1.7737084059828637e-05, | |
| "loss": 0.194, | |
| "num_input_tokens_seen": 347376, | |
| "step": 2285 | |
| }, | |
| { | |
| "epoch": 12.722222222222221, | |
| "grad_norm": 2.4415032863616943, | |
| "learning_rate": 1.7621193959772657e-05, | |
| "loss": 0.1908, | |
| "num_input_tokens_seen": 348144, | |
| "step": 2290 | |
| }, | |
| { | |
| "epoch": 12.75, | |
| "grad_norm": 1.2303696870803833, | |
| "learning_rate": 1.750547729401101e-05, | |
| "loss": 0.1831, | |
| "num_input_tokens_seen": 348912, | |
| "step": 2295 | |
| }, | |
| { | |
| "epoch": 12.777777777777779, | |
| "grad_norm": 1.4564770460128784, | |
| "learning_rate": 1.7389936782392695e-05, | |
| "loss": 0.2239, | |
| "num_input_tokens_seen": 349664, | |
| "step": 2300 | |
| }, | |
| { | |
| "epoch": 12.805555555555555, | |
| "grad_norm": 0.8448835611343384, | |
| "learning_rate": 1.7274575140626318e-05, | |
| "loss": 0.1875, | |
| "num_input_tokens_seen": 350432, | |
| "step": 2305 | |
| }, | |
| { | |
| "epoch": 12.833333333333334, | |
| "grad_norm": 9.424522399902344, | |
| "learning_rate": 1.7159395080216273e-05, | |
| "loss": 0.344, | |
| "num_input_tokens_seen": 351200, | |
| "step": 2310 | |
| }, | |
| { | |
| "epoch": 12.86111111111111, | |
| "grad_norm": 1.3844144344329834, | |
| "learning_rate": 1.7044399308398983e-05, | |
| "loss": 0.3025, | |
| "num_input_tokens_seen": 351936, | |
| "step": 2315 | |
| }, | |
| { | |
| "epoch": 12.88888888888889, | |
| "grad_norm": 1.3061507940292358, | |
| "learning_rate": 1.692959052807928e-05, | |
| "loss": 0.1906, | |
| "num_input_tokens_seen": 352720, | |
| "step": 2320 | |
| }, | |
| { | |
| "epoch": 12.916666666666666, | |
| "grad_norm": 1.519792079925537, | |
| "learning_rate": 1.681497143776689e-05, | |
| "loss": 0.2825, | |
| "num_input_tokens_seen": 353488, | |
| "step": 2325 | |
| }, | |
| { | |
| "epoch": 12.944444444444445, | |
| "grad_norm": 1.082457423210144, | |
| "learning_rate": 1.670054473151298e-05, | |
| "loss": 0.1878, | |
| "num_input_tokens_seen": 354256, | |
| "step": 2330 | |
| }, | |
| { | |
| "epoch": 12.972222222222221, | |
| "grad_norm": 1.1723442077636719, | |
| "learning_rate": 1.658631309884684e-05, | |
| "loss": 0.2078, | |
| "num_input_tokens_seen": 355008, | |
| "step": 2335 | |
| }, | |
| { | |
| "epoch": 13.0, | |
| "grad_norm": 1.186935544013977, | |
| "learning_rate": 1.6472279224712702e-05, | |
| "loss": 0.2397, | |
| "num_input_tokens_seen": 355760, | |
| "step": 2340 | |
| }, | |
| { | |
| "epoch": 13.0, | |
| "eval_loss": 0.24227285385131836, | |
| "eval_runtime": 0.8489, | |
| "eval_samples_per_second": 47.122, | |
| "eval_steps_per_second": 23.561, | |
| "num_input_tokens_seen": 355760, | |
| "step": 2340 | |
| }, | |
| { | |
| "epoch": 13.027777777777779, | |
| "grad_norm": 2.641793966293335, | |
| "learning_rate": 1.6358445789406584e-05, | |
| "loss": 0.2267, | |
| "num_input_tokens_seen": 356528, | |
| "step": 2345 | |
| }, | |
| { | |
| "epoch": 13.055555555555555, | |
| "grad_norm": 3.405515193939209, | |
| "learning_rate": 1.6244815468513315e-05, | |
| "loss": 0.1854, | |
| "num_input_tokens_seen": 357296, | |
| "step": 2350 | |
| }, | |
| { | |
| "epoch": 13.083333333333334, | |
| "grad_norm": 3.2545127868652344, | |
| "learning_rate": 1.6131390932843648e-05, | |
| "loss": 0.182, | |
| "num_input_tokens_seen": 358048, | |
| "step": 2355 | |
| }, | |
| { | |
| "epoch": 13.11111111111111, | |
| "grad_norm": 1.9090595245361328, | |
| "learning_rate": 1.6018174848371494e-05, | |
| "loss": 0.2446, | |
| "num_input_tokens_seen": 358816, | |
| "step": 2360 | |
| }, | |
| { | |
| "epoch": 13.13888888888889, | |
| "grad_norm": 2.3876793384552, | |
| "learning_rate": 1.5905169876171223e-05, | |
| "loss": 0.1739, | |
| "num_input_tokens_seen": 359568, | |
| "step": 2365 | |
| }, | |
| { | |
| "epoch": 13.166666666666666, | |
| "grad_norm": 9.157187461853027, | |
| "learning_rate": 1.579237867235514e-05, | |
| "loss": 0.2171, | |
| "num_input_tokens_seen": 360336, | |
| "step": 2370 | |
| }, | |
| { | |
| "epoch": 13.194444444444445, | |
| "grad_norm": 2.800305128097534, | |
| "learning_rate": 1.567980388801109e-05, | |
| "loss": 0.2339, | |
| "num_input_tokens_seen": 361056, | |
| "step": 2375 | |
| }, | |
| { | |
| "epoch": 13.222222222222221, | |
| "grad_norm": 2.117903470993042, | |
| "learning_rate": 1.556744816914008e-05, | |
| "loss": 0.2172, | |
| "num_input_tokens_seen": 361792, | |
| "step": 2380 | |
| }, | |
| { | |
| "epoch": 13.25, | |
| "grad_norm": 2.7015206813812256, | |
| "learning_rate": 1.5455314156594124e-05, | |
| "loss": 0.1971, | |
| "num_input_tokens_seen": 362576, | |
| "step": 2385 | |
| }, | |
| { | |
| "epoch": 13.277777777777779, | |
| "grad_norm": 1.168046474456787, | |
| "learning_rate": 1.534340448601418e-05, | |
| "loss": 0.2087, | |
| "num_input_tokens_seen": 363344, | |
| "step": 2390 | |
| }, | |
| { | |
| "epoch": 13.305555555555555, | |
| "grad_norm": 1.9067094326019287, | |
| "learning_rate": 1.523172178776816e-05, | |
| "loss": 0.2058, | |
| "num_input_tokens_seen": 364096, | |
| "step": 2395 | |
| }, | |
| { | |
| "epoch": 13.333333333333334, | |
| "grad_norm": 1.733651041984558, | |
| "learning_rate": 1.512026868688915e-05, | |
| "loss": 0.1853, | |
| "num_input_tokens_seen": 364848, | |
| "step": 2400 | |
| }, | |
| { | |
| "epoch": 13.36111111111111, | |
| "grad_norm": 2.2120025157928467, | |
| "learning_rate": 1.5009047803013699e-05, | |
| "loss": 0.2178, | |
| "num_input_tokens_seen": 365568, | |
| "step": 2405 | |
| }, | |
| { | |
| "epoch": 13.38888888888889, | |
| "grad_norm": 2.4871954917907715, | |
| "learning_rate": 1.4898061750320212e-05, | |
| "loss": 0.1482, | |
| "num_input_tokens_seen": 366368, | |
| "step": 2410 | |
| }, | |
| { | |
| "epoch": 13.416666666666666, | |
| "grad_norm": 2.5109379291534424, | |
| "learning_rate": 1.4787313137467546e-05, | |
| "loss": 0.1657, | |
| "num_input_tokens_seen": 367168, | |
| "step": 2415 | |
| }, | |
| { | |
| "epoch": 13.444444444444445, | |
| "grad_norm": 3.289309501647949, | |
| "learning_rate": 1.4676804567533687e-05, | |
| "loss": 0.2238, | |
| "num_input_tokens_seen": 367904, | |
| "step": 2420 | |
| }, | |
| { | |
| "epoch": 13.472222222222221, | |
| "grad_norm": 13.250516891479492, | |
| "learning_rate": 1.4566538637954554e-05, | |
| "loss": 0.1961, | |
| "num_input_tokens_seen": 368672, | |
| "step": 2425 | |
| }, | |
| { | |
| "epoch": 13.5, | |
| "grad_norm": 4.981893539428711, | |
| "learning_rate": 1.4456517940462949e-05, | |
| "loss": 0.2555, | |
| "num_input_tokens_seen": 369424, | |
| "step": 2430 | |
| }, | |
| { | |
| "epoch": 13.527777777777779, | |
| "grad_norm": 3.6648523807525635, | |
| "learning_rate": 1.4346745061027644e-05, | |
| "loss": 0.1898, | |
| "num_input_tokens_seen": 370192, | |
| "step": 2435 | |
| }, | |
| { | |
| "epoch": 13.555555555555555, | |
| "grad_norm": 4.466832160949707, | |
| "learning_rate": 1.4237222579792618e-05, | |
| "loss": 0.2684, | |
| "num_input_tokens_seen": 370928, | |
| "step": 2440 | |
| }, | |
| { | |
| "epoch": 13.583333333333334, | |
| "grad_norm": 0.43382784724235535, | |
| "learning_rate": 1.4127953071016383e-05, | |
| "loss": 0.1869, | |
| "num_input_tokens_seen": 371664, | |
| "step": 2445 | |
| }, | |
| { | |
| "epoch": 13.61111111111111, | |
| "grad_norm": 7.605555534362793, | |
| "learning_rate": 1.4018939103011472e-05, | |
| "loss": 0.2325, | |
| "num_input_tokens_seen": 372448, | |
| "step": 2450 | |
| }, | |
| { | |
| "epoch": 13.63888888888889, | |
| "grad_norm": 4.352413177490234, | |
| "learning_rate": 1.3910183238084112e-05, | |
| "loss": 0.2535, | |
| "num_input_tokens_seen": 373232, | |
| "step": 2455 | |
| }, | |
| { | |
| "epoch": 13.666666666666666, | |
| "grad_norm": 5.196812629699707, | |
| "learning_rate": 1.3801688032473958e-05, | |
| "loss": 0.3524, | |
| "num_input_tokens_seen": 374000, | |
| "step": 2460 | |
| }, | |
| { | |
| "epoch": 13.694444444444445, | |
| "grad_norm": 6.573197841644287, | |
| "learning_rate": 1.369345603629406e-05, | |
| "loss": 0.3079, | |
| "num_input_tokens_seen": 374768, | |
| "step": 2465 | |
| }, | |
| { | |
| "epoch": 13.722222222222221, | |
| "grad_norm": 2.1059021949768066, | |
| "learning_rate": 1.3585489793470862e-05, | |
| "loss": 0.1759, | |
| "num_input_tokens_seen": 375552, | |
| "step": 2470 | |
| }, | |
| { | |
| "epoch": 13.75, | |
| "grad_norm": 1.5465530157089233, | |
| "learning_rate": 1.3477791841684451e-05, | |
| "loss": 0.1818, | |
| "num_input_tokens_seen": 376320, | |
| "step": 2475 | |
| }, | |
| { | |
| "epoch": 13.777777777777779, | |
| "grad_norm": 2.8227381706237793, | |
| "learning_rate": 1.337036471230889e-05, | |
| "loss": 0.2174, | |
| "num_input_tokens_seen": 377104, | |
| "step": 2480 | |
| }, | |
| { | |
| "epoch": 13.805555555555555, | |
| "grad_norm": 2.1553452014923096, | |
| "learning_rate": 1.3263210930352737e-05, | |
| "loss": 0.1612, | |
| "num_input_tokens_seen": 377872, | |
| "step": 2485 | |
| }, | |
| { | |
| "epoch": 13.833333333333334, | |
| "grad_norm": 1.7421936988830566, | |
| "learning_rate": 1.3156333014399674e-05, | |
| "loss": 0.1359, | |
| "num_input_tokens_seen": 378656, | |
| "step": 2490 | |
| }, | |
| { | |
| "epoch": 13.86111111111111, | |
| "grad_norm": 1.0619189739227295, | |
| "learning_rate": 1.3049733476549352e-05, | |
| "loss": 0.1586, | |
| "num_input_tokens_seen": 379408, | |
| "step": 2495 | |
| }, | |
| { | |
| "epoch": 13.88888888888889, | |
| "grad_norm": 2.5276153087615967, | |
| "learning_rate": 1.2943414822358285e-05, | |
| "loss": 0.14, | |
| "num_input_tokens_seen": 380144, | |
| "step": 2500 | |
| }, | |
| { | |
| "epoch": 13.916666666666666, | |
| "grad_norm": 2.2656829357147217, | |
| "learning_rate": 1.2837379550781003e-05, | |
| "loss": 0.1432, | |
| "num_input_tokens_seen": 380880, | |
| "step": 2505 | |
| }, | |
| { | |
| "epoch": 13.944444444444445, | |
| "grad_norm": 3.669142723083496, | |
| "learning_rate": 1.2731630154111296e-05, | |
| "loss": 0.2479, | |
| "num_input_tokens_seen": 381632, | |
| "step": 2510 | |
| }, | |
| { | |
| "epoch": 13.972222222222221, | |
| "grad_norm": 3.9385387897491455, | |
| "learning_rate": 1.262616911792365e-05, | |
| "loss": 0.1723, | |
| "num_input_tokens_seen": 382368, | |
| "step": 2515 | |
| }, | |
| { | |
| "epoch": 14.0, | |
| "grad_norm": 5.834008693695068, | |
| "learning_rate": 1.2520998921014792e-05, | |
| "loss": 0.1748, | |
| "num_input_tokens_seen": 383088, | |
| "step": 2520 | |
| }, | |
| { | |
| "epoch": 14.0, | |
| "eval_loss": 0.2950591444969177, | |
| "eval_runtime": 0.8591, | |
| "eval_samples_per_second": 46.563, | |
| "eval_steps_per_second": 23.282, | |
| "num_input_tokens_seen": 383088, | |
| "step": 2520 | |
| }, | |
| { | |
| "epoch": 14.027777777777779, | |
| "grad_norm": 0.713314414024353, | |
| "learning_rate": 1.2416122035345507e-05, | |
| "loss": 0.1308, | |
| "num_input_tokens_seen": 383840, | |
| "step": 2525 | |
| }, | |
| { | |
| "epoch": 14.055555555555555, | |
| "grad_norm": 4.134765625, | |
| "learning_rate": 1.2311540925982403e-05, | |
| "loss": 0.251, | |
| "num_input_tokens_seen": 384624, | |
| "step": 2530 | |
| }, | |
| { | |
| "epoch": 14.083333333333334, | |
| "grad_norm": 4.927321910858154, | |
| "learning_rate": 1.2207258051040099e-05, | |
| "loss": 0.209, | |
| "num_input_tokens_seen": 385392, | |
| "step": 2535 | |
| }, | |
| { | |
| "epoch": 14.11111111111111, | |
| "grad_norm": 3.297750949859619, | |
| "learning_rate": 1.2103275861623378e-05, | |
| "loss": 0.1824, | |
| "num_input_tokens_seen": 386176, | |
| "step": 2540 | |
| }, | |
| { | |
| "epoch": 14.13888888888889, | |
| "grad_norm": 3.990372657775879, | |
| "learning_rate": 1.1999596801769616e-05, | |
| "loss": 0.1749, | |
| "num_input_tokens_seen": 386944, | |
| "step": 2545 | |
| }, | |
| { | |
| "epoch": 14.166666666666666, | |
| "grad_norm": 3.1698896884918213, | |
| "learning_rate": 1.189622330839129e-05, | |
| "loss": 0.1024, | |
| "num_input_tokens_seen": 387696, | |
| "step": 2550 | |
| }, | |
| { | |
| "epoch": 14.194444444444445, | |
| "grad_norm": 1.4032636880874634, | |
| "learning_rate": 1.179315781121874e-05, | |
| "loss": 0.1795, | |
| "num_input_tokens_seen": 388464, | |
| "step": 2555 | |
| }, | |
| { | |
| "epoch": 14.222222222222221, | |
| "grad_norm": 5.369940280914307, | |
| "learning_rate": 1.1690402732743042e-05, | |
| "loss": 0.151, | |
| "num_input_tokens_seen": 389232, | |
| "step": 2560 | |
| }, | |
| { | |
| "epoch": 14.25, | |
| "grad_norm": 4.481320381164551, | |
| "learning_rate": 1.158796048815906e-05, | |
| "loss": 0.2119, | |
| "num_input_tokens_seen": 390000, | |
| "step": 2565 | |
| }, | |
| { | |
| "epoch": 14.277777777777779, | |
| "grad_norm": 1.490330696105957, | |
| "learning_rate": 1.1485833485308702e-05, | |
| "loss": 0.1076, | |
| "num_input_tokens_seen": 390768, | |
| "step": 2570 | |
| }, | |
| { | |
| "epoch": 14.305555555555555, | |
| "grad_norm": 6.507543087005615, | |
| "learning_rate": 1.1384024124624324e-05, | |
| "loss": 0.14, | |
| "num_input_tokens_seen": 391568, | |
| "step": 2575 | |
| }, | |
| { | |
| "epoch": 14.333333333333334, | |
| "grad_norm": 6.145547866821289, | |
| "learning_rate": 1.1282534799072272e-05, | |
| "loss": 0.197, | |
| "num_input_tokens_seen": 392320, | |
| "step": 2580 | |
| }, | |
| { | |
| "epoch": 14.36111111111111, | |
| "grad_norm": 18.988567352294922, | |
| "learning_rate": 1.1181367894096684e-05, | |
| "loss": 0.3318, | |
| "num_input_tokens_seen": 393136, | |
| "step": 2585 | |
| }, | |
| { | |
| "epoch": 14.38888888888889, | |
| "grad_norm": 19.283180236816406, | |
| "learning_rate": 1.1080525787563393e-05, | |
| "loss": 0.2645, | |
| "num_input_tokens_seen": 393936, | |
| "step": 2590 | |
| }, | |
| { | |
| "epoch": 14.416666666666666, | |
| "grad_norm": 10.825935363769531, | |
| "learning_rate": 1.0980010849704036e-05, | |
| "loss": 0.1461, | |
| "num_input_tokens_seen": 394688, | |
| "step": 2595 | |
| }, | |
| { | |
| "epoch": 14.444444444444445, | |
| "grad_norm": 31.148212432861328, | |
| "learning_rate": 1.0879825443060362e-05, | |
| "loss": 0.3501, | |
| "num_input_tokens_seen": 395456, | |
| "step": 2600 | |
| }, | |
| { | |
| "epoch": 14.472222222222221, | |
| "grad_norm": 9.394278526306152, | |
| "learning_rate": 1.0779971922428711e-05, | |
| "loss": 0.155, | |
| "num_input_tokens_seen": 396224, | |
| "step": 2605 | |
| }, | |
| { | |
| "epoch": 14.5, | |
| "grad_norm": 14.434137344360352, | |
| "learning_rate": 1.0680452634804603e-05, | |
| "loss": 0.2158, | |
| "num_input_tokens_seen": 396960, | |
| "step": 2610 | |
| }, | |
| { | |
| "epoch": 14.527777777777779, | |
| "grad_norm": 2.4239039421081543, | |
| "learning_rate": 1.0581269919327643e-05, | |
| "loss": 0.0862, | |
| "num_input_tokens_seen": 397712, | |
| "step": 2615 | |
| }, | |
| { | |
| "epoch": 14.555555555555555, | |
| "grad_norm": 2.1457433700561523, | |
| "learning_rate": 1.0482426107226507e-05, | |
| "loss": 0.0899, | |
| "num_input_tokens_seen": 398448, | |
| "step": 2620 | |
| }, | |
| { | |
| "epoch": 14.583333333333334, | |
| "grad_norm": 0.8032980561256409, | |
| "learning_rate": 1.0383923521764174e-05, | |
| "loss": 0.1994, | |
| "num_input_tokens_seen": 399200, | |
| "step": 2625 | |
| }, | |
| { | |
| "epoch": 14.61111111111111, | |
| "grad_norm": 12.607194900512695, | |
| "learning_rate": 1.0285764478183284e-05, | |
| "loss": 0.2465, | |
| "num_input_tokens_seen": 399952, | |
| "step": 2630 | |
| }, | |
| { | |
| "epoch": 14.63888888888889, | |
| "grad_norm": 12.285235404968262, | |
| "learning_rate": 1.0187951283651736e-05, | |
| "loss": 0.1971, | |
| "num_input_tokens_seen": 400688, | |
| "step": 2635 | |
| }, | |
| { | |
| "epoch": 14.666666666666666, | |
| "grad_norm": 9.90485668182373, | |
| "learning_rate": 1.0090486237208463e-05, | |
| "loss": 0.2033, | |
| "num_input_tokens_seen": 401424, | |
| "step": 2640 | |
| }, | |
| { | |
| "epoch": 14.694444444444445, | |
| "grad_norm": 7.427250385284424, | |
| "learning_rate": 9.993371629709391e-06, | |
| "loss": 0.0746, | |
| "num_input_tokens_seen": 402176, | |
| "step": 2645 | |
| }, | |
| { | |
| "epoch": 14.722222222222221, | |
| "grad_norm": 6.782092571258545, | |
| "learning_rate": 9.89660974377359e-06, | |
| "loss": 0.1266, | |
| "num_input_tokens_seen": 402944, | |
| "step": 2650 | |
| }, | |
| { | |
| "epoch": 14.75, | |
| "grad_norm": 1.4690288305282593, | |
| "learning_rate": 9.800202853729651e-06, | |
| "loss": 0.142, | |
| "num_input_tokens_seen": 403696, | |
| "step": 2655 | |
| }, | |
| { | |
| "epoch": 14.777777777777779, | |
| "grad_norm": 25.32921600341797, | |
| "learning_rate": 9.704153225562171e-06, | |
| "loss": 0.4962, | |
| "num_input_tokens_seen": 404464, | |
| "step": 2660 | |
| }, | |
| { | |
| "epoch": 14.805555555555555, | |
| "grad_norm": 1.8058884143829346, | |
| "learning_rate": 9.608463116858542e-06, | |
| "loss": 0.1042, | |
| "num_input_tokens_seen": 405200, | |
| "step": 2665 | |
| }, | |
| { | |
| "epoch": 14.833333333333334, | |
| "grad_norm": 3.9442355632781982, | |
| "learning_rate": 9.51313477675588e-06, | |
| "loss": 0.1045, | |
| "num_input_tokens_seen": 405936, | |
| "step": 2670 | |
| }, | |
| { | |
| "epoch": 14.86111111111111, | |
| "grad_norm": 20.509428024291992, | |
| "learning_rate": 9.418170445888139e-06, | |
| "loss": 0.3351, | |
| "num_input_tokens_seen": 406688, | |
| "step": 2675 | |
| }, | |
| { | |
| "epoch": 14.88888888888889, | |
| "grad_norm": 0.28635165095329285, | |
| "learning_rate": 9.323572356333454e-06, | |
| "loss": 0.1427, | |
| "num_input_tokens_seen": 407424, | |
| "step": 2680 | |
| }, | |
| { | |
| "epoch": 14.916666666666666, | |
| "grad_norm": 15.302724838256836, | |
| "learning_rate": 9.22934273156172e-06, | |
| "loss": 0.1361, | |
| "num_input_tokens_seen": 408160, | |
| "step": 2685 | |
| }, | |
| { | |
| "epoch": 14.944444444444445, | |
| "grad_norm": 9.12713623046875, | |
| "learning_rate": 9.135483786382262e-06, | |
| "loss": 0.2055, | |
| "num_input_tokens_seen": 408912, | |
| "step": 2690 | |
| }, | |
| { | |
| "epoch": 14.972222222222221, | |
| "grad_norm": 9.957405090332031, | |
| "learning_rate": 9.0419977268918e-06, | |
| "loss": 0.1449, | |
| "num_input_tokens_seen": 409696, | |
| "step": 2695 | |
| }, | |
| { | |
| "epoch": 15.0, | |
| "grad_norm": 6.853147983551025, | |
| "learning_rate": 8.948886750422636e-06, | |
| "loss": 0.0885, | |
| "num_input_tokens_seen": 410448, | |
| "step": 2700 | |
| }, | |
| { | |
| "epoch": 15.0, | |
| "eval_loss": 0.38744935393333435, | |
| "eval_runtime": 0.8554, | |
| "eval_samples_per_second": 46.759, | |
| "eval_steps_per_second": 23.38, | |
| "num_input_tokens_seen": 410448, | |
| "step": 2700 | |
| }, | |
| { | |
| "epoch": 15.027777777777779, | |
| "grad_norm": 0.3130747675895691, | |
| "learning_rate": 8.856153045490948e-06, | |
| "loss": 0.0314, | |
| "num_input_tokens_seen": 411184, | |
| "step": 2705 | |
| }, | |
| { | |
| "epoch": 15.055555555555555, | |
| "grad_norm": 0.21885517239570618, | |
| "learning_rate": 8.763798791745411e-06, | |
| "loss": 0.0263, | |
| "num_input_tokens_seen": 411936, | |
| "step": 2710 | |
| }, | |
| { | |
| "epoch": 15.083333333333334, | |
| "grad_norm": 0.5549080967903137, | |
| "learning_rate": 8.671826159915907e-06, | |
| "loss": 0.1901, | |
| "num_input_tokens_seen": 412720, | |
| "step": 2715 | |
| }, | |
| { | |
| "epoch": 15.11111111111111, | |
| "grad_norm": 0.5954522490501404, | |
| "learning_rate": 8.58023731176254e-06, | |
| "loss": 0.2449, | |
| "num_input_tokens_seen": 413472, | |
| "step": 2720 | |
| }, | |
| { | |
| "epoch": 15.13888888888889, | |
| "grad_norm": 0.868930459022522, | |
| "learning_rate": 8.489034400024812e-06, | |
| "loss": 0.0795, | |
| "num_input_tokens_seen": 414208, | |
| "step": 2725 | |
| }, | |
| { | |
| "epoch": 15.166666666666666, | |
| "grad_norm": 17.721887588500977, | |
| "learning_rate": 8.39821956837102e-06, | |
| "loss": 0.1313, | |
| "num_input_tokens_seen": 414944, | |
| "step": 2730 | |
| }, | |
| { | |
| "epoch": 15.194444444444445, | |
| "grad_norm": 11.335846900939941, | |
| "learning_rate": 8.3077949513479e-06, | |
| "loss": 0.1054, | |
| "num_input_tokens_seen": 415712, | |
| "step": 2735 | |
| }, | |
| { | |
| "epoch": 15.222222222222221, | |
| "grad_norm": 3.936131238937378, | |
| "learning_rate": 8.217762674330413e-06, | |
| "loss": 0.1153, | |
| "num_input_tokens_seen": 416448, | |
| "step": 2740 | |
| }, | |
| { | |
| "epoch": 15.25, | |
| "grad_norm": 23.35863494873047, | |
| "learning_rate": 8.128124853471814e-06, | |
| "loss": 0.3075, | |
| "num_input_tokens_seen": 417184, | |
| "step": 2745 | |
| }, | |
| { | |
| "epoch": 15.277777777777779, | |
| "grad_norm": 21.123655319213867, | |
| "learning_rate": 8.03888359565391e-06, | |
| "loss": 0.1342, | |
| "num_input_tokens_seen": 417920, | |
| "step": 2750 | |
| }, | |
| { | |
| "epoch": 15.305555555555555, | |
| "grad_norm": 19.830791473388672, | |
| "learning_rate": 7.950040998437542e-06, | |
| "loss": 0.2187, | |
| "num_input_tokens_seen": 418688, | |
| "step": 2755 | |
| }, | |
| { | |
| "epoch": 15.333333333333334, | |
| "grad_norm": 1.0024510622024536, | |
| "learning_rate": 7.86159915001326e-06, | |
| "loss": 0.0683, | |
| "num_input_tokens_seen": 419424, | |
| "step": 2760 | |
| }, | |
| { | |
| "epoch": 15.36111111111111, | |
| "grad_norm": 1.5306583642959595, | |
| "learning_rate": 7.7735601291523e-06, | |
| "loss": 0.0755, | |
| "num_input_tokens_seen": 420224, | |
| "step": 2765 | |
| }, | |
| { | |
| "epoch": 15.38888888888889, | |
| "grad_norm": 2.4585580825805664, | |
| "learning_rate": 7.685926005157651e-06, | |
| "loss": 0.0281, | |
| "num_input_tokens_seen": 420992, | |
| "step": 2770 | |
| }, | |
| { | |
| "epoch": 15.416666666666666, | |
| "grad_norm": 23.670074462890625, | |
| "learning_rate": 7.598698837815449e-06, | |
| "loss": 0.1672, | |
| "num_input_tokens_seen": 421744, | |
| "step": 2775 | |
| }, | |
| { | |
| "epoch": 15.444444444444445, | |
| "grad_norm": 10.413293838500977, | |
| "learning_rate": 7.511880677346578e-06, | |
| "loss": 0.3337, | |
| "num_input_tokens_seen": 422496, | |
| "step": 2780 | |
| }, | |
| { | |
| "epoch": 15.472222222222221, | |
| "grad_norm": 0.5710373520851135, | |
| "learning_rate": 7.4254735643584564e-06, | |
| "loss": 0.0069, | |
| "num_input_tokens_seen": 423264, | |
| "step": 2785 | |
| }, | |
| { | |
| "epoch": 15.5, | |
| "grad_norm": 5.425608158111572, | |
| "learning_rate": 7.339479529797111e-06, | |
| "loss": 0.1213, | |
| "num_input_tokens_seen": 424032, | |
| "step": 2790 | |
| }, | |
| { | |
| "epoch": 15.527777777777779, | |
| "grad_norm": 0.2852936387062073, | |
| "learning_rate": 7.2539005948993825e-06, | |
| "loss": 0.0621, | |
| "num_input_tokens_seen": 424816, | |
| "step": 2795 | |
| }, | |
| { | |
| "epoch": 15.555555555555555, | |
| "grad_norm": 0.4949374496936798, | |
| "learning_rate": 7.168738771145464e-06, | |
| "loss": 0.1208, | |
| "num_input_tokens_seen": 425584, | |
| "step": 2800 | |
| }, | |
| { | |
| "epoch": 15.583333333333334, | |
| "grad_norm": 2.448948860168457, | |
| "learning_rate": 7.083996060211607e-06, | |
| "loss": 0.1817, | |
| "num_input_tokens_seen": 426336, | |
| "step": 2805 | |
| }, | |
| { | |
| "epoch": 15.61111111111111, | |
| "grad_norm": 0.2703257203102112, | |
| "learning_rate": 6.9996744539230665e-06, | |
| "loss": 0.1328, | |
| "num_input_tokens_seen": 427120, | |
| "step": 2810 | |
| }, | |
| { | |
| "epoch": 15.63888888888889, | |
| "grad_norm": 5.2962422370910645, | |
| "learning_rate": 6.9157759342072995e-06, | |
| "loss": 0.2257, | |
| "num_input_tokens_seen": 427904, | |
| "step": 2815 | |
| }, | |
| { | |
| "epoch": 15.666666666666666, | |
| "grad_norm": 2.691361665725708, | |
| "learning_rate": 6.832302473047384e-06, | |
| "loss": 0.0523, | |
| "num_input_tokens_seen": 428672, | |
| "step": 2820 | |
| }, | |
| { | |
| "epoch": 15.694444444444445, | |
| "grad_norm": 18.699260711669922, | |
| "learning_rate": 6.7492560324356355e-06, | |
| "loss": 0.1134, | |
| "num_input_tokens_seen": 429456, | |
| "step": 2825 | |
| }, | |
| { | |
| "epoch": 15.722222222222221, | |
| "grad_norm": 9.735588073730469, | |
| "learning_rate": 6.666638564327532e-06, | |
| "loss": 0.0496, | |
| "num_input_tokens_seen": 430192, | |
| "step": 2830 | |
| }, | |
| { | |
| "epoch": 15.75, | |
| "grad_norm": 1.238945722579956, | |
| "learning_rate": 6.584452010595807e-06, | |
| "loss": 0.2287, | |
| "num_input_tokens_seen": 430944, | |
| "step": 2835 | |
| }, | |
| { | |
| "epoch": 15.777777777777779, | |
| "grad_norm": 7.4446563720703125, | |
| "learning_rate": 6.502698302984811e-06, | |
| "loss": 0.0264, | |
| "num_input_tokens_seen": 431680, | |
| "step": 2840 | |
| }, | |
| { | |
| "epoch": 15.805555555555555, | |
| "grad_norm": 14.15257740020752, | |
| "learning_rate": 6.421379363065142e-06, | |
| "loss": 0.1039, | |
| "num_input_tokens_seen": 432480, | |
| "step": 2845 | |
| }, | |
| { | |
| "epoch": 15.833333333333334, | |
| "grad_norm": 14.9490385055542, | |
| "learning_rate": 6.340497102188425e-06, | |
| "loss": 0.135, | |
| "num_input_tokens_seen": 433248, | |
| "step": 2850 | |
| }, | |
| { | |
| "epoch": 15.86111111111111, | |
| "grad_norm": 0.2764110267162323, | |
| "learning_rate": 6.26005342144241e-06, | |
| "loss": 0.3508, | |
| "num_input_tokens_seen": 434016, | |
| "step": 2855 | |
| }, | |
| { | |
| "epoch": 15.88888888888889, | |
| "grad_norm": 0.11058162897825241, | |
| "learning_rate": 6.180050211606303e-06, | |
| "loss": 0.0026, | |
| "num_input_tokens_seen": 434768, | |
| "step": 2860 | |
| }, | |
| { | |
| "epoch": 15.916666666666666, | |
| "grad_norm": 22.15040397644043, | |
| "learning_rate": 6.100489353106304e-06, | |
| "loss": 0.3825, | |
| "num_input_tokens_seen": 435504, | |
| "step": 2865 | |
| }, | |
| { | |
| "epoch": 15.944444444444445, | |
| "grad_norm": 11.71772289276123, | |
| "learning_rate": 6.021372715971437e-06, | |
| "loss": 0.2274, | |
| "num_input_tokens_seen": 436256, | |
| "step": 2870 | |
| }, | |
| { | |
| "epoch": 15.972222222222221, | |
| "grad_norm": 0.5031313896179199, | |
| "learning_rate": 5.942702159789554e-06, | |
| "loss": 0.0297, | |
| "num_input_tokens_seen": 437024, | |
| "step": 2875 | |
| }, | |
| { | |
| "epoch": 16.0, | |
| "grad_norm": 21.48788833618164, | |
| "learning_rate": 5.864479533663655e-06, | |
| "loss": 0.1848, | |
| "num_input_tokens_seen": 437776, | |
| "step": 2880 | |
| }, | |
| { | |
| "epoch": 16.0, | |
| "eval_loss": 0.6435995697975159, | |
| "eval_runtime": 0.8611, | |
| "eval_samples_per_second": 46.454, | |
| "eval_steps_per_second": 23.227, | |
| "num_input_tokens_seen": 437776, | |
| "step": 2880 | |
| }, | |
| { | |
| "epoch": 16.02777777777778, | |
| "grad_norm": 0.4101243019104004, | |
| "learning_rate": 5.786706676168424e-06, | |
| "loss": 0.0116, | |
| "num_input_tokens_seen": 438576, | |
| "step": 2885 | |
| }, | |
| { | |
| "epoch": 16.055555555555557, | |
| "grad_norm": 5.98344612121582, | |
| "learning_rate": 5.709385415307006e-06, | |
| "loss": 0.0365, | |
| "num_input_tokens_seen": 439360, | |
| "step": 2890 | |
| }, | |
| { | |
| "epoch": 16.083333333333332, | |
| "grad_norm": 7.092445373535156, | |
| "learning_rate": 5.6325175684680374e-06, | |
| "loss": 0.1803, | |
| "num_input_tokens_seen": 440096, | |
| "step": 2895 | |
| }, | |
| { | |
| "epoch": 16.11111111111111, | |
| "grad_norm": 9.738005638122559, | |
| "learning_rate": 5.556104942382964e-06, | |
| "loss": 0.0186, | |
| "num_input_tokens_seen": 440848, | |
| "step": 2900 | |
| }, | |
| { | |
| "epoch": 16.13888888888889, | |
| "grad_norm": 0.38426294922828674, | |
| "learning_rate": 5.48014933308352e-06, | |
| "loss": 0.1468, | |
| "num_input_tokens_seen": 441616, | |
| "step": 2905 | |
| }, | |
| { | |
| "epoch": 16.166666666666668, | |
| "grad_norm": 0.8579995036125183, | |
| "learning_rate": 5.404652525859552e-06, | |
| "loss": 0.1638, | |
| "num_input_tokens_seen": 442352, | |
| "step": 2910 | |
| }, | |
| { | |
| "epoch": 16.194444444444443, | |
| "grad_norm": 0.12119382619857788, | |
| "learning_rate": 5.329616295217046e-06, | |
| "loss": 0.0028, | |
| "num_input_tokens_seen": 443088, | |
| "step": 2915 | |
| }, | |
| { | |
| "epoch": 16.22222222222222, | |
| "grad_norm": 0.04318145290017128, | |
| "learning_rate": 5.2550424048364185e-06, | |
| "loss": 0.0059, | |
| "num_input_tokens_seen": 443856, | |
| "step": 2920 | |
| }, | |
| { | |
| "epoch": 16.25, | |
| "grad_norm": 25.193330764770508, | |
| "learning_rate": 5.180932607531056e-06, | |
| "loss": 0.3279, | |
| "num_input_tokens_seen": 444608, | |
| "step": 2925 | |
| }, | |
| { | |
| "epoch": 16.27777777777778, | |
| "grad_norm": 0.30459845066070557, | |
| "learning_rate": 5.107288645206149e-06, | |
| "loss": 0.0014, | |
| "num_input_tokens_seen": 445344, | |
| "step": 2930 | |
| }, | |
| { | |
| "epoch": 16.305555555555557, | |
| "grad_norm": 0.07173694670200348, | |
| "learning_rate": 5.034112248817685e-06, | |
| "loss": 0.0828, | |
| "num_input_tokens_seen": 446128, | |
| "step": 2935 | |
| }, | |
| { | |
| "epoch": 16.333333333333332, | |
| "grad_norm": 1.1420878171920776, | |
| "learning_rate": 4.961405138331826e-06, | |
| "loss": 0.0058, | |
| "num_input_tokens_seen": 446912, | |
| "step": 2940 | |
| }, | |
| { | |
| "epoch": 16.36111111111111, | |
| "grad_norm": 0.09082265198230743, | |
| "learning_rate": 4.88916902268445e-06, | |
| "loss": 0.1174, | |
| "num_input_tokens_seen": 447664, | |
| "step": 2945 | |
| }, | |
| { | |
| "epoch": 16.38888888888889, | |
| "grad_norm": 0.08622191846370697, | |
| "learning_rate": 4.817405599741004e-06, | |
| "loss": 0.0612, | |
| "num_input_tokens_seen": 448416, | |
| "step": 2950 | |
| }, | |
| { | |
| "epoch": 16.416666666666668, | |
| "grad_norm": 12.944429397583008, | |
| "learning_rate": 4.746116556256569e-06, | |
| "loss": 0.366, | |
| "num_input_tokens_seen": 449184, | |
| "step": 2955 | |
| }, | |
| { | |
| "epoch": 16.444444444444443, | |
| "grad_norm": 22.423507690429688, | |
| "learning_rate": 4.6753035678362314e-06, | |
| "loss": 0.0248, | |
| "num_input_tokens_seen": 449936, | |
| "step": 2960 | |
| }, | |
| { | |
| "epoch": 16.47222222222222, | |
| "grad_norm": 0.026965582743287086, | |
| "learning_rate": 4.604968298895703e-06, | |
| "loss": 0.0326, | |
| "num_input_tokens_seen": 450688, | |
| "step": 2965 | |
| }, | |
| { | |
| "epoch": 16.5, | |
| "grad_norm": 4.680528163909912, | |
| "learning_rate": 4.535112402622185e-06, | |
| "loss": 0.0039, | |
| "num_input_tokens_seen": 451440, | |
| "step": 2970 | |
| }, | |
| { | |
| "epoch": 16.52777777777778, | |
| "grad_norm": 0.01207332219928503, | |
| "learning_rate": 4.465737520935517e-06, | |
| "loss": 0.0057, | |
| "num_input_tokens_seen": 452160, | |
| "step": 2975 | |
| }, | |
| { | |
| "epoch": 16.555555555555557, | |
| "grad_norm": 0.024040911346673965, | |
| "learning_rate": 4.396845284449608e-06, | |
| "loss": 0.002, | |
| "num_input_tokens_seen": 452944, | |
| "step": 2980 | |
| }, | |
| { | |
| "epoch": 16.583333333333332, | |
| "grad_norm": 39.016632080078125, | |
| "learning_rate": 4.328437312434067e-06, | |
| "loss": 0.3633, | |
| "num_input_tokens_seen": 453680, | |
| "step": 2985 | |
| }, | |
| { | |
| "epoch": 16.61111111111111, | |
| "grad_norm": 0.005136567167937756, | |
| "learning_rate": 4.2605152127761675e-06, | |
| "loss": 0.0115, | |
| "num_input_tokens_seen": 454432, | |
| "step": 2990 | |
| }, | |
| { | |
| "epoch": 16.63888888888889, | |
| "grad_norm": 0.10608967393636703, | |
| "learning_rate": 4.19308058194306e-06, | |
| "loss": 0.0943, | |
| "num_input_tokens_seen": 455232, | |
| "step": 2995 | |
| }, | |
| { | |
| "epoch": 16.666666666666668, | |
| "grad_norm": 1.2691866159439087, | |
| "learning_rate": 4.126135004944231e-06, | |
| "loss": 0.0029, | |
| "num_input_tokens_seen": 455984, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 16.694444444444443, | |
| "grad_norm": 0.4157579243183136, | |
| "learning_rate": 4.059680055294266e-06, | |
| "loss": 0.0577, | |
| "num_input_tokens_seen": 456736, | |
| "step": 3005 | |
| }, | |
| { | |
| "epoch": 16.72222222222222, | |
| "grad_norm": 39.1358642578125, | |
| "learning_rate": 3.993717294975863e-06, | |
| "loss": 0.1053, | |
| "num_input_tokens_seen": 457520, | |
| "step": 3010 | |
| }, | |
| { | |
| "epoch": 16.75, | |
| "grad_norm": 43.38942337036133, | |
| "learning_rate": 3.92824827440309e-06, | |
| "loss": 0.2089, | |
| "num_input_tokens_seen": 458256, | |
| "step": 3015 | |
| }, | |
| { | |
| "epoch": 16.77777777777778, | |
| "grad_norm": 1.4325363636016846, | |
| "learning_rate": 3.863274532384981e-06, | |
| "loss": 0.003, | |
| "num_input_tokens_seen": 459008, | |
| "step": 3020 | |
| }, | |
| { | |
| "epoch": 16.805555555555557, | |
| "grad_norm": 34.902889251708984, | |
| "learning_rate": 3.798797596089351e-06, | |
| "loss": 0.0759, | |
| "num_input_tokens_seen": 459808, | |
| "step": 3025 | |
| }, | |
| { | |
| "epoch": 16.833333333333332, | |
| "grad_norm": 1.068337321281433, | |
| "learning_rate": 3.73481898100691e-06, | |
| "loss": 0.0139, | |
| "num_input_tokens_seen": 460576, | |
| "step": 3030 | |
| }, | |
| { | |
| "epoch": 16.86111111111111, | |
| "grad_norm": 0.5029184818267822, | |
| "learning_rate": 3.6713401909156204e-06, | |
| "loss": 0.0528, | |
| "num_input_tokens_seen": 461328, | |
| "step": 3035 | |
| }, | |
| { | |
| "epoch": 16.88888888888889, | |
| "grad_norm": 0.1493435800075531, | |
| "learning_rate": 3.608362717845376e-06, | |
| "loss": 0.0237, | |
| "num_input_tokens_seen": 462096, | |
| "step": 3040 | |
| }, | |
| { | |
| "epoch": 16.916666666666668, | |
| "grad_norm": 25.79793930053711, | |
| "learning_rate": 3.5458880420429135e-06, | |
| "loss": 0.1684, | |
| "num_input_tokens_seen": 462848, | |
| "step": 3045 | |
| }, | |
| { | |
| "epoch": 16.944444444444443, | |
| "grad_norm": 0.8746198415756226, | |
| "learning_rate": 3.4839176319370394e-06, | |
| "loss": 0.0018, | |
| "num_input_tokens_seen": 463616, | |
| "step": 3050 | |
| }, | |
| { | |
| "epoch": 16.97222222222222, | |
| "grad_norm": 0.01299325842410326, | |
| "learning_rate": 3.4224529441040904e-06, | |
| "loss": 0.1967, | |
| "num_input_tokens_seen": 464384, | |
| "step": 3055 | |
| }, | |
| { | |
| "epoch": 17.0, | |
| "grad_norm": 0.3657170236110687, | |
| "learning_rate": 3.3614954232337374e-06, | |
| "loss": 0.0092, | |
| "num_input_tokens_seen": 465168, | |
| "step": 3060 | |
| }, | |
| { | |
| "epoch": 17.0, | |
| "eval_loss": 0.8914836645126343, | |
| "eval_runtime": 0.8427, | |
| "eval_samples_per_second": 47.467, | |
| "eval_steps_per_second": 23.734, | |
| "num_input_tokens_seen": 465168, | |
| "step": 3060 | |
| }, | |
| { | |
| "epoch": 17.02777777777778, | |
| "grad_norm": 0.033842090517282486, | |
| "learning_rate": 3.3010465020949818e-06, | |
| "loss": 0.0159, | |
| "num_input_tokens_seen": 465920, | |
| "step": 3065 | |
| }, | |
| { | |
| "epoch": 17.055555555555557, | |
| "grad_norm": 0.09151696413755417, | |
| "learning_rate": 3.2411076015025075e-06, | |
| "loss": 0.217, | |
| "num_input_tokens_seen": 466688, | |
| "step": 3070 | |
| }, | |
| { | |
| "epoch": 17.083333333333332, | |
| "grad_norm": 0.13220135867595673, | |
| "learning_rate": 3.1816801302832848e-06, | |
| "loss": 0.0058, | |
| "num_input_tokens_seen": 467488, | |
| "step": 3075 | |
| }, | |
| { | |
| "epoch": 17.11111111111111, | |
| "grad_norm": 0.2489822804927826, | |
| "learning_rate": 3.1227654852434454e-06, | |
| "loss": 0.001, | |
| "num_input_tokens_seen": 468256, | |
| "step": 3080 | |
| }, | |
| { | |
| "epoch": 17.13888888888889, | |
| "grad_norm": 8.796856880187988, | |
| "learning_rate": 3.0643650511354484e-06, | |
| "loss": 0.0073, | |
| "num_input_tokens_seen": 469008, | |
| "step": 3085 | |
| }, | |
| { | |
| "epoch": 17.166666666666668, | |
| "grad_norm": 32.82636260986328, | |
| "learning_rate": 3.006480200625572e-06, | |
| "loss": 0.2167, | |
| "num_input_tokens_seen": 469808, | |
| "step": 3090 | |
| }, | |
| { | |
| "epoch": 17.194444444444443, | |
| "grad_norm": 0.05829642340540886, | |
| "learning_rate": 2.949112294261591e-06, | |
| "loss": 0.0127, | |
| "num_input_tokens_seen": 470560, | |
| "step": 3095 | |
| }, | |
| { | |
| "epoch": 17.22222222222222, | |
| "grad_norm": 0.014127260074019432, | |
| "learning_rate": 2.89226268044083e-06, | |
| "loss": 0.1617, | |
| "num_input_tokens_seen": 471328, | |
| "step": 3100 | |
| }, | |
| { | |
| "epoch": 17.25, | |
| "grad_norm": 0.004986308049410582, | |
| "learning_rate": 2.8359326953784737e-06, | |
| "loss": 0.0014, | |
| "num_input_tokens_seen": 472064, | |
| "step": 3105 | |
| }, | |
| { | |
| "epoch": 17.27777777777778, | |
| "grad_norm": 0.09535623341798782, | |
| "learning_rate": 2.780123663076142e-06, | |
| "loss": 0.0004, | |
| "num_input_tokens_seen": 472832, | |
| "step": 3110 | |
| }, | |
| { | |
| "epoch": 17.305555555555557, | |
| "grad_norm": 10.574342727661133, | |
| "learning_rate": 2.7248368952908053e-06, | |
| "loss": 0.0079, | |
| "num_input_tokens_seen": 473600, | |
| "step": 3115 | |
| }, | |
| { | |
| "epoch": 17.333333333333332, | |
| "grad_norm": 0.5229879021644592, | |
| "learning_rate": 2.670073691503902e-06, | |
| "loss": 0.0046, | |
| "num_input_tokens_seen": 474352, | |
| "step": 3120 | |
| }, | |
| { | |
| "epoch": 17.36111111111111, | |
| "grad_norm": 0.13766419887542725, | |
| "learning_rate": 2.6158353388908293e-06, | |
| "loss": 0.0058, | |
| "num_input_tokens_seen": 475104, | |
| "step": 3125 | |
| }, | |
| { | |
| "epoch": 17.38888888888889, | |
| "grad_norm": 0.2758001387119293, | |
| "learning_rate": 2.5621231122906873e-06, | |
| "loss": 0.1262, | |
| "num_input_tokens_seen": 475856, | |
| "step": 3130 | |
| }, | |
| { | |
| "epoch": 17.416666666666668, | |
| "grad_norm": 0.24915482103824615, | |
| "learning_rate": 2.5089382741762925e-06, | |
| "loss": 0.0091, | |
| "num_input_tokens_seen": 476640, | |
| "step": 3135 | |
| }, | |
| { | |
| "epoch": 17.444444444444443, | |
| "grad_norm": 0.6348158121109009, | |
| "learning_rate": 2.4562820746245386e-06, | |
| "loss": 0.002, | |
| "num_input_tokens_seen": 477408, | |
| "step": 3140 | |
| }, | |
| { | |
| "epoch": 17.47222222222222, | |
| "grad_norm": 0.009842537343502045, | |
| "learning_rate": 2.4041557512869878e-06, | |
| "loss": 0.0016, | |
| "num_input_tokens_seen": 478160, | |
| "step": 3145 | |
| }, | |
| { | |
| "epoch": 17.5, | |
| "grad_norm": 0.12595131993293762, | |
| "learning_rate": 2.3525605293607784e-06, | |
| "loss": 0.0007, | |
| "num_input_tokens_seen": 478928, | |
| "step": 3150 | |
| }, | |
| { | |
| "epoch": 17.52777777777778, | |
| "grad_norm": 0.013150378130376339, | |
| "learning_rate": 2.3014976215598503e-06, | |
| "loss": 0.016, | |
| "num_input_tokens_seen": 479696, | |
| "step": 3155 | |
| }, | |
| { | |
| "epoch": 17.555555555555557, | |
| "grad_norm": 0.0263340026140213, | |
| "learning_rate": 2.2509682280864224e-06, | |
| "loss": 0.0076, | |
| "num_input_tokens_seen": 480464, | |
| "step": 3160 | |
| }, | |
| { | |
| "epoch": 17.583333333333332, | |
| "grad_norm": 54.26401138305664, | |
| "learning_rate": 2.2009735366027795e-06, | |
| "loss": 0.1149, | |
| "num_input_tokens_seen": 481232, | |
| "step": 3165 | |
| }, | |
| { | |
| "epoch": 17.61111111111111, | |
| "grad_norm": 34.03044891357422, | |
| "learning_rate": 2.151514722203385e-06, | |
| "loss": 0.097, | |
| "num_input_tokens_seen": 481984, | |
| "step": 3170 | |
| }, | |
| { | |
| "epoch": 17.63888888888889, | |
| "grad_norm": 1.2055052518844604, | |
| "learning_rate": 2.1025929473872274e-06, | |
| "loss": 0.0034, | |
| "num_input_tokens_seen": 482768, | |
| "step": 3175 | |
| }, | |
| { | |
| "epoch": 17.666666666666668, | |
| "grad_norm": 0.01310269720852375, | |
| "learning_rate": 2.0542093620305042e-06, | |
| "loss": 0.0002, | |
| "num_input_tokens_seen": 483536, | |
| "step": 3180 | |
| }, | |
| { | |
| "epoch": 17.694444444444443, | |
| "grad_norm": 0.12006166577339172, | |
| "learning_rate": 2.0063651033596143e-06, | |
| "loss": 0.0124, | |
| "num_input_tokens_seen": 484320, | |
| "step": 3185 | |
| }, | |
| { | |
| "epoch": 17.72222222222222, | |
| "grad_norm": 3.16572642326355, | |
| "learning_rate": 1.9590612959244055e-06, | |
| "loss": 0.0128, | |
| "num_input_tokens_seen": 485056, | |
| "step": 3190 | |
| }, | |
| { | |
| "epoch": 17.75, | |
| "grad_norm": 0.02956937998533249, | |
| "learning_rate": 1.912299051571764e-06, | |
| "loss": 0.0008, | |
| "num_input_tokens_seen": 485808, | |
| "step": 3195 | |
| }, | |
| { | |
| "epoch": 17.77777777777778, | |
| "grad_norm": 0.007258450146764517, | |
| "learning_rate": 1.8660794694194573e-06, | |
| "loss": 0.0161, | |
| "num_input_tokens_seen": 486560, | |
| "step": 3200 | |
| }, | |
| { | |
| "epoch": 17.805555555555557, | |
| "grad_norm": 0.009658108465373516, | |
| "learning_rate": 1.8204036358303173e-06, | |
| "loss": 0.1385, | |
| "num_input_tokens_seen": 487312, | |
| "step": 3205 | |
| }, | |
| { | |
| "epoch": 17.833333333333332, | |
| "grad_norm": 0.5556612014770508, | |
| "learning_rate": 1.775272624386695e-06, | |
| "loss": 0.0007, | |
| "num_input_tokens_seen": 488080, | |
| "step": 3210 | |
| }, | |
| { | |
| "epoch": 17.86111111111111, | |
| "grad_norm": 0.012735828757286072, | |
| "learning_rate": 1.7306874958652408e-06, | |
| "loss": 0.0409, | |
| "num_input_tokens_seen": 488832, | |
| "step": 3215 | |
| }, | |
| { | |
| "epoch": 17.88888888888889, | |
| "grad_norm": 0.5004767179489136, | |
| "learning_rate": 1.686649298211951e-06, | |
| "loss": 0.0264, | |
| "num_input_tokens_seen": 489600, | |
| "step": 3220 | |
| }, | |
| { | |
| "epoch": 17.916666666666668, | |
| "grad_norm": 38.82133865356445, | |
| "learning_rate": 1.643159066517566e-06, | |
| "loss": 0.0869, | |
| "num_input_tokens_seen": 490336, | |
| "step": 3225 | |
| }, | |
| { | |
| "epoch": 17.944444444444443, | |
| "grad_norm": 0.010125933215022087, | |
| "learning_rate": 1.6002178229932107e-06, | |
| "loss": 0.003, | |
| "num_input_tokens_seen": 491056, | |
| "step": 3230 | |
| }, | |
| { | |
| "epoch": 17.97222222222222, | |
| "grad_norm": 66.36505126953125, | |
| "learning_rate": 1.5578265769463806e-06, | |
| "loss": 0.1735, | |
| "num_input_tokens_seen": 491792, | |
| "step": 3235 | |
| }, | |
| { | |
| "epoch": 18.0, | |
| "grad_norm": 0.29106247425079346, | |
| "learning_rate": 1.5159863247572236e-06, | |
| "loss": 0.0543, | |
| "num_input_tokens_seen": 492560, | |
| "step": 3240 | |
| }, | |
| { | |
| "epoch": 18.0, | |
| "eval_loss": 1.0005823373794556, | |
| "eval_runtime": 0.8454, | |
| "eval_samples_per_second": 47.318, | |
| "eval_steps_per_second": 23.659, | |
| "num_input_tokens_seen": 492560, | |
| "step": 3240 | |
| }, | |
| { | |
| "epoch": 18.02777777777778, | |
| "grad_norm": 0.005374926142394543, | |
| "learning_rate": 1.4746980498551112e-06, | |
| "loss": 0.0001, | |
| "num_input_tokens_seen": 493280, | |
| "step": 3245 | |
| }, | |
| { | |
| "epoch": 18.055555555555557, | |
| "grad_norm": 0.06802839785814285, | |
| "learning_rate": 1.4339627226955392e-06, | |
| "loss": 0.0002, | |
| "num_input_tokens_seen": 494048, | |
| "step": 3250 | |
| }, | |
| { | |
| "epoch": 18.083333333333332, | |
| "grad_norm": 0.2004125714302063, | |
| "learning_rate": 1.3937813007373013e-06, | |
| "loss": 0.0007, | |
| "num_input_tokens_seen": 494784, | |
| "step": 3255 | |
| }, | |
| { | |
| "epoch": 18.11111111111111, | |
| "grad_norm": 0.2468828707933426, | |
| "learning_rate": 1.354154728419979e-06, | |
| "loss": 0.1658, | |
| "num_input_tokens_seen": 495552, | |
| "step": 3260 | |
| }, | |
| { | |
| "epoch": 18.13888888888889, | |
| "grad_norm": 4.114477157592773, | |
| "learning_rate": 1.31508393714177e-06, | |
| "loss": 0.0021, | |
| "num_input_tokens_seen": 496320, | |
| "step": 3265 | |
| }, | |
| { | |
| "epoch": 18.166666666666668, | |
| "grad_norm": 0.005297894589602947, | |
| "learning_rate": 1.276569845237574e-06, | |
| "loss": 0.0193, | |
| "num_input_tokens_seen": 497072, | |
| "step": 3270 | |
| }, | |
| { | |
| "epoch": 18.194444444444443, | |
| "grad_norm": 1.2201541662216187, | |
| "learning_rate": 1.2386133579574189e-06, | |
| "loss": 0.0078, | |
| "num_input_tokens_seen": 497808, | |
| "step": 3275 | |
| }, | |
| { | |
| "epoch": 18.22222222222222, | |
| "grad_norm": 0.28695741295814514, | |
| "learning_rate": 1.2012153674451715e-06, | |
| "loss": 0.0005, | |
| "num_input_tokens_seen": 498528, | |
| "step": 3280 | |
| }, | |
| { | |
| "epoch": 18.25, | |
| "grad_norm": 0.01707194373011589, | |
| "learning_rate": 1.1643767527175857e-06, | |
| "loss": 0.0034, | |
| "num_input_tokens_seen": 499296, | |
| "step": 3285 | |
| }, | |
| { | |
| "epoch": 18.27777777777778, | |
| "grad_norm": 0.05932219699025154, | |
| "learning_rate": 1.1280983796436245e-06, | |
| "loss": 0.0233, | |
| "num_input_tokens_seen": 500064, | |
| "step": 3290 | |
| }, | |
| { | |
| "epoch": 18.305555555555557, | |
| "grad_norm": 0.0543174110352993, | |
| "learning_rate": 1.0923811009241142e-06, | |
| "loss": 0.0002, | |
| "num_input_tokens_seen": 500848, | |
| "step": 3295 | |
| }, | |
| { | |
| "epoch": 18.333333333333332, | |
| "grad_norm": 0.005716706160455942, | |
| "learning_rate": 1.0572257560717086e-06, | |
| "loss": 0.0861, | |
| "num_input_tokens_seen": 501600, | |
| "step": 3300 | |
| }, | |
| { | |
| "epoch": 18.36111111111111, | |
| "grad_norm": 0.004924902692437172, | |
| "learning_rate": 1.0226331713911546e-06, | |
| "loss": 0.0012, | |
| "num_input_tokens_seen": 502352, | |
| "step": 3305 | |
| }, | |
| { | |
| "epoch": 18.38888888888889, | |
| "grad_norm": 0.22492168843746185, | |
| "learning_rate": 9.886041599598606e-07, | |
| "loss": 0.0049, | |
| "num_input_tokens_seen": 503120, | |
| "step": 3310 | |
| }, | |
| { | |
| "epoch": 18.416666666666668, | |
| "grad_norm": 0.012456899508833885, | |
| "learning_rate": 9.551395216087944e-07, | |
| "loss": 0.0011, | |
| "num_input_tokens_seen": 503872, | |
| "step": 3315 | |
| }, | |
| { | |
| "epoch": 18.444444444444443, | |
| "grad_norm": 51.44154739379883, | |
| "learning_rate": 9.222400429036854e-07, | |
| "loss": 0.0504, | |
| "num_input_tokens_seen": 504624, | |
| "step": 3320 | |
| }, | |
| { | |
| "epoch": 18.47222222222222, | |
| "grad_norm": 2.0177342891693115, | |
| "learning_rate": 8.899064971265276e-07, | |
| "loss": 0.0048, | |
| "num_input_tokens_seen": 505392, | |
| "step": 3325 | |
| }, | |
| { | |
| "epoch": 18.5, | |
| "grad_norm": 0.26857876777648926, | |
| "learning_rate": 8.581396442574135e-07, | |
| "loss": 0.1322, | |
| "num_input_tokens_seen": 506160, | |
| "step": 3330 | |
| }, | |
| { | |
| "epoch": 18.52777777777778, | |
| "grad_norm": 0.036319322884082794, | |
| "learning_rate": 8.269402309566743e-07, | |
| "loss": 0.0058, | |
| "num_input_tokens_seen": 506880, | |
| "step": 3335 | |
| }, | |
| { | |
| "epoch": 18.555555555555557, | |
| "grad_norm": 0.32240647077560425, | |
| "learning_rate": 7.963089905473092e-07, | |
| "loss": 0.0184, | |
| "num_input_tokens_seen": 507616, | |
| "step": 3340 | |
| }, | |
| { | |
| "epoch": 18.583333333333332, | |
| "grad_norm": 0.011376683600246906, | |
| "learning_rate": 7.662466429977699e-07, | |
| "loss": 0.0009, | |
| "num_input_tokens_seen": 508368, | |
| "step": 3345 | |
| }, | |
| { | |
| "epoch": 18.61111111111111, | |
| "grad_norm": 0.20107612013816833, | |
| "learning_rate": 7.367538949050345e-07, | |
| "loss": 0.0825, | |
| "num_input_tokens_seen": 509152, | |
| "step": 3350 | |
| }, | |
| { | |
| "epoch": 18.63888888888889, | |
| "grad_norm": 0.08009302616119385, | |
| "learning_rate": 7.078314394779961e-07, | |
| "loss": 0.01, | |
| "num_input_tokens_seen": 509936, | |
| "step": 3355 | |
| }, | |
| { | |
| "epoch": 18.666666666666668, | |
| "grad_norm": 0.00780840078368783, | |
| "learning_rate": 6.794799565211646e-07, | |
| "loss": 0.0002, | |
| "num_input_tokens_seen": 510688, | |
| "step": 3360 | |
| }, | |
| { | |
| "epoch": 18.694444444444443, | |
| "grad_norm": 0.014461885206401348, | |
| "learning_rate": 6.517001124186989e-07, | |
| "loss": 0.0017, | |
| "num_input_tokens_seen": 511440, | |
| "step": 3365 | |
| }, | |
| { | |
| "epoch": 18.72222222222222, | |
| "grad_norm": 0.02738901786506176, | |
| "learning_rate": 6.244925601187363e-07, | |
| "loss": 0.1572, | |
| "num_input_tokens_seen": 512192, | |
| "step": 3370 | |
| }, | |
| { | |
| "epoch": 18.75, | |
| "grad_norm": 0.2982403337955475, | |
| "learning_rate": 5.978579391180461e-07, | |
| "loss": 0.0347, | |
| "num_input_tokens_seen": 512960, | |
| "step": 3375 | |
| }, | |
| { | |
| "epoch": 18.77777777777778, | |
| "grad_norm": 0.006348731461912394, | |
| "learning_rate": 5.717968754469977e-07, | |
| "loss": 0.0024, | |
| "num_input_tokens_seen": 513712, | |
| "step": 3380 | |
| }, | |
| { | |
| "epoch": 18.805555555555557, | |
| "grad_norm": 0.012313781306147575, | |
| "learning_rate": 5.463099816548579e-07, | |
| "loss": 0.0162, | |
| "num_input_tokens_seen": 514496, | |
| "step": 3385 | |
| }, | |
| { | |
| "epoch": 18.833333333333332, | |
| "grad_norm": 0.025961650535464287, | |
| "learning_rate": 5.213978567953775e-07, | |
| "loss": 0.0261, | |
| "num_input_tokens_seen": 515296, | |
| "step": 3390 | |
| }, | |
| { | |
| "epoch": 18.86111111111111, | |
| "grad_norm": 0.02834870107471943, | |
| "learning_rate": 4.970610864127173e-07, | |
| "loss": 0.0021, | |
| "num_input_tokens_seen": 516032, | |
| "step": 3395 | |
| }, | |
| { | |
| "epoch": 18.88888888888889, | |
| "grad_norm": 0.058260053396224976, | |
| "learning_rate": 4.7330024252768555e-07, | |
| "loss": 0.0003, | |
| "num_input_tokens_seen": 516816, | |
| "step": 3400 | |
| }, | |
| { | |
| "epoch": 18.916666666666668, | |
| "grad_norm": 3.5997354984283447, | |
| "learning_rate": 4.5011588362429134e-07, | |
| "loss": 0.0022, | |
| "num_input_tokens_seen": 517584, | |
| "step": 3405 | |
| }, | |
| { | |
| "epoch": 18.944444444444443, | |
| "grad_norm": 0.009501924738287926, | |
| "learning_rate": 4.2750855463662143e-07, | |
| "loss": 0.0024, | |
| "num_input_tokens_seen": 518336, | |
| "step": 3410 | |
| }, | |
| { | |
| "epoch": 18.97222222222222, | |
| "grad_norm": 0.0839788019657135, | |
| "learning_rate": 4.05478786936031e-07, | |
| "loss": 0.0029, | |
| "num_input_tokens_seen": 519088, | |
| "step": 3415 | |
| }, | |
| { | |
| "epoch": 19.0, | |
| "grad_norm": 0.05105578154325485, | |
| "learning_rate": 3.8402709831865113e-07, | |
| "loss": 0.1114, | |
| "num_input_tokens_seen": 519840, | |
| "step": 3420 | |
| }, | |
| { | |
| "epoch": 19.0, | |
| "eval_loss": 1.036169409751892, | |
| "eval_runtime": 0.854, | |
| "eval_samples_per_second": 46.839, | |
| "eval_steps_per_second": 23.419, | |
| "num_input_tokens_seen": 519840, | |
| "step": 3420 | |
| }, | |
| { | |
| "epoch": 19.02777777777778, | |
| "grad_norm": 0.12579289078712463, | |
| "learning_rate": 3.6315399299321484e-07, | |
| "loss": 0.0218, | |
| "num_input_tokens_seen": 520624, | |
| "step": 3425 | |
| }, | |
| { | |
| "epoch": 19.055555555555557, | |
| "grad_norm": 0.00517948716878891, | |
| "learning_rate": 3.428599615692141e-07, | |
| "loss": 0.0019, | |
| "num_input_tokens_seen": 521392, | |
| "step": 3430 | |
| }, | |
| { | |
| "epoch": 19.083333333333332, | |
| "grad_norm": 7.694216251373291, | |
| "learning_rate": 3.2314548104537545e-07, | |
| "loss": 0.0078, | |
| "num_input_tokens_seen": 522144, | |
| "step": 3435 | |
| }, | |
| { | |
| "epoch": 19.11111111111111, | |
| "grad_norm": 16.037460327148438, | |
| "learning_rate": 3.040110147984221e-07, | |
| "loss": 0.0194, | |
| "num_input_tokens_seen": 522896, | |
| "step": 3440 | |
| }, | |
| { | |
| "epoch": 19.13888888888889, | |
| "grad_norm": 0.8977673649787903, | |
| "learning_rate": 2.8545701257221e-07, | |
| "loss": 0.0008, | |
| "num_input_tokens_seen": 523632, | |
| "step": 3445 | |
| }, | |
| { | |
| "epoch": 19.166666666666668, | |
| "grad_norm": 0.025021294131875038, | |
| "learning_rate": 2.674839104671367e-07, | |
| "loss": 0.0007, | |
| "num_input_tokens_seen": 524400, | |
| "step": 3450 | |
| }, | |
| { | |
| "epoch": 19.194444444444443, | |
| "grad_norm": 43.39835739135742, | |
| "learning_rate": 2.5009213092991034e-07, | |
| "loss": 0.0131, | |
| "num_input_tokens_seen": 525200, | |
| "step": 3455 | |
| }, | |
| { | |
| "epoch": 19.22222222222222, | |
| "grad_norm": 0.020788980647921562, | |
| "learning_rate": 2.3328208274359942e-07, | |
| "loss": 0.0003, | |
| "num_input_tokens_seen": 525952, | |
| "step": 3460 | |
| }, | |
| { | |
| "epoch": 19.25, | |
| "grad_norm": 0.006578272208571434, | |
| "learning_rate": 2.170541610180432e-07, | |
| "loss": 0.0003, | |
| "num_input_tokens_seen": 526704, | |
| "step": 3465 | |
| }, | |
| { | |
| "epoch": 19.27777777777778, | |
| "grad_norm": 0.020482726395130157, | |
| "learning_rate": 2.014087471805509e-07, | |
| "loss": 0.0009, | |
| "num_input_tokens_seen": 527440, | |
| "step": 3470 | |
| }, | |
| { | |
| "epoch": 19.305555555555557, | |
| "grad_norm": 0.029707515612244606, | |
| "learning_rate": 1.8634620896695043e-07, | |
| "loss": 0.0005, | |
| "num_input_tokens_seen": 528208, | |
| "step": 3475 | |
| }, | |
| { | |
| "epoch": 19.333333333333332, | |
| "grad_norm": 0.011560129933059216, | |
| "learning_rate": 1.7186690041292586e-07, | |
| "loss": 0.0009, | |
| "num_input_tokens_seen": 529008, | |
| "step": 3480 | |
| }, | |
| { | |
| "epoch": 19.36111111111111, | |
| "grad_norm": 0.9206594228744507, | |
| "learning_rate": 1.5797116184571304e-07, | |
| "loss": 0.0011, | |
| "num_input_tokens_seen": 529760, | |
| "step": 3485 | |
| }, | |
| { | |
| "epoch": 19.38888888888889, | |
| "grad_norm": 0.096881203353405, | |
| "learning_rate": 1.4465931987609482e-07, | |
| "loss": 0.0019, | |
| "num_input_tokens_seen": 530480, | |
| "step": 3490 | |
| }, | |
| { | |
| "epoch": 19.416666666666668, | |
| "grad_norm": 0.055456917732954025, | |
| "learning_rate": 1.319316873907267e-07, | |
| "loss": 0.0017, | |
| "num_input_tokens_seen": 531216, | |
| "step": 3495 | |
| }, | |
| { | |
| "epoch": 19.444444444444443, | |
| "grad_norm": 0.10053224116563797, | |
| "learning_rate": 1.1978856354477595e-07, | |
| "loss": 0.0654, | |
| "num_input_tokens_seen": 531968, | |
| "step": 3500 | |
| }, | |
| { | |
| "epoch": 19.47222222222222, | |
| "grad_norm": 3.4088387489318848, | |
| "learning_rate": 1.0823023375489127e-07, | |
| "loss": 0.0026, | |
| "num_input_tokens_seen": 532736, | |
| "step": 3505 | |
| }, | |
| { | |
| "epoch": 19.5, | |
| "grad_norm": 0.00926142930984497, | |
| "learning_rate": 9.725696969249965e-08, | |
| "loss": 0.0137, | |
| "num_input_tokens_seen": 533504, | |
| "step": 3510 | |
| }, | |
| { | |
| "epoch": 19.52777777777778, | |
| "grad_norm": 0.005964045878499746, | |
| "learning_rate": 8.686902927741991e-08, | |
| "loss": 0.0004, | |
| "num_input_tokens_seen": 534256, | |
| "step": 3515 | |
| }, | |
| { | |
| "epoch": 19.555555555555557, | |
| "grad_norm": 0.28770729899406433, | |
| "learning_rate": 7.706665667180091e-08, | |
| "loss": 0.0004, | |
| "num_input_tokens_seen": 535008, | |
| "step": 3520 | |
| }, | |
| { | |
| "epoch": 19.583333333333332, | |
| "grad_norm": 0.0930958241224289, | |
| "learning_rate": 6.785008227437329e-08, | |
| "loss": 0.0438, | |
| "num_input_tokens_seen": 535792, | |
| "step": 3525 | |
| }, | |
| { | |
| "epoch": 19.61111111111111, | |
| "grad_norm": 0.028239967301487923, | |
| "learning_rate": 5.921952271504827e-08, | |
| "loss": 0.0086, | |
| "num_input_tokens_seen": 536560, | |
| "step": 3530 | |
| }, | |
| { | |
| "epoch": 19.63888888888889, | |
| "grad_norm": 0.22114162147045135, | |
| "learning_rate": 5.117518084981621e-08, | |
| "loss": 0.0005, | |
| "num_input_tokens_seen": 537328, | |
| "step": 3535 | |
| }, | |
| { | |
| "epoch": 19.666666666666668, | |
| "grad_norm": 0.10239258408546448, | |
| "learning_rate": 4.371724575597535e-08, | |
| "loss": 0.0002, | |
| "num_input_tokens_seen": 538064, | |
| "step": 3540 | |
| }, | |
| { | |
| "epoch": 19.694444444444443, | |
| "grad_norm": 35.42240905761719, | |
| "learning_rate": 3.684589272771044e-08, | |
| "loss": 0.1792, | |
| "num_input_tokens_seen": 538848, | |
| "step": 3545 | |
| }, | |
| { | |
| "epoch": 19.72222222222222, | |
| "grad_norm": 0.09444889426231384, | |
| "learning_rate": 3.056128327193486e-08, | |
| "loss": 0.0761, | |
| "num_input_tokens_seen": 539616, | |
| "step": 3550 | |
| }, | |
| { | |
| "epoch": 19.75, | |
| "grad_norm": 0.004997141659259796, | |
| "learning_rate": 2.486356510453258e-08, | |
| "loss": 0.001, | |
| "num_input_tokens_seen": 540352, | |
| "step": 3555 | |
| }, | |
| { | |
| "epoch": 19.77777777777778, | |
| "grad_norm": 0.003712383331730962, | |
| "learning_rate": 1.975287214685817e-08, | |
| "loss": 0.0004, | |
| "num_input_tokens_seen": 541104, | |
| "step": 3560 | |
| }, | |
| { | |
| "epoch": 19.805555555555557, | |
| "grad_norm": 0.10159067809581757, | |
| "learning_rate": 1.522932452260595e-08, | |
| "loss": 0.0166, | |
| "num_input_tokens_seen": 541856, | |
| "step": 3565 | |
| }, | |
| { | |
| "epoch": 19.833333333333332, | |
| "grad_norm": 1.4221373796463013, | |
| "learning_rate": 1.1293028554978935e-08, | |
| "loss": 0.001, | |
| "num_input_tokens_seen": 542640, | |
| "step": 3570 | |
| }, | |
| { | |
| "epoch": 19.86111111111111, | |
| "grad_norm": 0.006227497011423111, | |
| "learning_rate": 7.944076764190845e-09, | |
| "loss": 0.0002, | |
| "num_input_tokens_seen": 543392, | |
| "step": 3575 | |
| }, | |
| { | |
| "epoch": 19.88888888888889, | |
| "grad_norm": 0.003915317822247744, | |
| "learning_rate": 5.182547865290044e-09, | |
| "loss": 0.0512, | |
| "num_input_tokens_seen": 544128, | |
| "step": 3580 | |
| }, | |
| { | |
| "epoch": 19.916666666666668, | |
| "grad_norm": 6.322549819946289, | |
| "learning_rate": 3.008506766313812e-09, | |
| "loss": 0.155, | |
| "num_input_tokens_seen": 544880, | |
| "step": 3585 | |
| }, | |
| { | |
| "epoch": 19.944444444444443, | |
| "grad_norm": 0.006800191942602396, | |
| "learning_rate": 1.4220045667645566e-09, | |
| "loss": 0.0003, | |
| "num_input_tokens_seen": 545600, | |
| "step": 3590 | |
| }, | |
| { | |
| "epoch": 19.97222222222222, | |
| "grad_norm": 0.013187541626393795, | |
| "learning_rate": 4.2307855639411865e-10, | |
| "loss": 0.006, | |
| "num_input_tokens_seen": 546384, | |
| "step": 3595 | |
| }, | |
| { | |
| "epoch": 20.0, | |
| "grad_norm": 0.07555617392063141, | |
| "learning_rate": 1.1752214348903501e-11, | |
| "loss": 0.0002, | |
| "num_input_tokens_seen": 547136, | |
| "step": 3600 | |
| }, | |
| { | |
| "epoch": 20.0, | |
| "eval_loss": 1.036588430404663, | |
| "eval_runtime": 0.8689, | |
| "eval_samples_per_second": 46.033, | |
| "eval_steps_per_second": 23.016, | |
| "num_input_tokens_seen": 547136, | |
| "step": 3600 | |
| }, | |
| { | |
| "epoch": 20.0, | |
| "num_input_tokens_seen": 547136, | |
| "step": 3600, | |
| "total_flos": 2.463728679203635e+16, | |
| "train_loss": 0.24590962828558985, | |
| "train_runtime": 353.9588, | |
| "train_samples_per_second": 20.341, | |
| "train_steps_per_second": 10.171 | |
| } | |
| ], | |
| "logging_steps": 5, | |
| "max_steps": 3600, | |
| "num_input_tokens_seen": 547136, | |
| "num_train_epochs": 20, | |
| "save_steps": 180, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 2.463728679203635e+16, | |
| "train_batch_size": 2, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |
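
The log above follows the Hugging Face Trainer `trainer_state.json` layout: `log_history` interleaves training records (keyed by `loss`, every 5 steps) with evaluation records (keyed by `eval_loss`, every 180 steps), followed by a final summary entry. As a minimal sketch of how the two series can be pulled apart, assuming the log has been saved locally as `trainer_state.json` (the path is hypothetical; the field names are the ones visible in the log itself):

```python
import json

# Hypothetical local copy of the trainer state shown above.
with open("trainer_state.json") as f:
    state = json.load(f)

# Training records carry a "loss" key; evaluation records carry "eval_loss".
# The final summary entry has neither, so both filters skip it.
train = [(e["step"], e["loss"]) for e in state["log_history"] if "loss" in e]
evals = [(e["step"], e["eval_loss"]) for e in state["log_history"] if "eval_loss" in e]

print(f"last train loss: {train[-1][1]:.4f} at step {train[-1][0]}")
for step, loss in evals:
    print(f"step {step:4d}: eval_loss = {loss:.4f}")
```

Printed side by side, the two series make the overfitting in this run easy to see: training loss drifts toward zero over the final epochs while `eval_loss` climbs steadily, from 0.2951 at step 2520 to 1.0366 at step 3600, so the checkpoint worth keeping is an earlier one rather than the last.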