{
"best_global_step": 1080,
"best_metric": 0.2312491238117218,
"best_model_checkpoint": "saves_stability/prefix-tuning/llama-3-8b-instruct/train_copa_1757340203/checkpoint-1080",
"epoch": 20.0,
"eval_steps": 180,
"global_step": 3600,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.027777777777777776,
"grad_norm": 167.9774627685547,
"learning_rate": 5.555555555555556e-07,
"loss": 8.8223,
"num_input_tokens_seen": 752,
"step": 5
},
{
"epoch": 0.05555555555555555,
"grad_norm": 152.64581298828125,
"learning_rate": 1.25e-06,
"loss": 8.0585,
"num_input_tokens_seen": 1520,
"step": 10
},
{
"epoch": 0.08333333333333333,
"grad_norm": 116.32727813720703,
"learning_rate": 1.9444444444444444e-06,
"loss": 6.6675,
"num_input_tokens_seen": 2320,
"step": 15
},
{
"epoch": 0.1111111111111111,
"grad_norm": 86.08872985839844,
"learning_rate": 2.638888888888889e-06,
"loss": 5.3834,
"num_input_tokens_seen": 3072,
"step": 20
},
{
"epoch": 0.1388888888888889,
"grad_norm": 68.7329330444336,
"learning_rate": 3.3333333333333333e-06,
"loss": 3.9719,
"num_input_tokens_seen": 3840,
"step": 25
},
{
"epoch": 0.16666666666666666,
"grad_norm": 48.6358642578125,
"learning_rate": 4.027777777777779e-06,
"loss": 2.8677,
"num_input_tokens_seen": 4576,
"step": 30
},
{
"epoch": 0.19444444444444445,
"grad_norm": 35.65837097167969,
"learning_rate": 4.722222222222222e-06,
"loss": 1.9241,
"num_input_tokens_seen": 5328,
"step": 35
},
{
"epoch": 0.2222222222222222,
"grad_norm": 26.759178161621094,
"learning_rate": 5.416666666666667e-06,
"loss": 1.0376,
"num_input_tokens_seen": 6112,
"step": 40
},
{
"epoch": 0.25,
"grad_norm": 32.65984344482422,
"learning_rate": 6.111111111111111e-06,
"loss": 0.52,
"num_input_tokens_seen": 6848,
"step": 45
},
{
"epoch": 0.2777777777777778,
"grad_norm": 12.905600547790527,
"learning_rate": 6.805555555555556e-06,
"loss": 0.4276,
"num_input_tokens_seen": 7600,
"step": 50
},
{
"epoch": 0.3055555555555556,
"grad_norm": 21.74563980102539,
"learning_rate": 7.5e-06,
"loss": 0.2801,
"num_input_tokens_seen": 8368,
"step": 55
},
{
"epoch": 0.3333333333333333,
"grad_norm": 12.704620361328125,
"learning_rate": 8.194444444444445e-06,
"loss": 0.1757,
"num_input_tokens_seen": 9152,
"step": 60
},
{
"epoch": 0.3611111111111111,
"grad_norm": 54.7520751953125,
"learning_rate": 8.88888888888889e-06,
"loss": 0.4626,
"num_input_tokens_seen": 9888,
"step": 65
},
{
"epoch": 0.3888888888888889,
"grad_norm": 36.79613494873047,
"learning_rate": 9.583333333333334e-06,
"loss": 0.5824,
"num_input_tokens_seen": 10656,
"step": 70
},
{
"epoch": 0.4166666666666667,
"grad_norm": 10.10336685180664,
"learning_rate": 1.0277777777777777e-05,
"loss": 0.3283,
"num_input_tokens_seen": 11408,
"step": 75
},
{
"epoch": 0.4444444444444444,
"grad_norm": 25.05152130126953,
"learning_rate": 1.0972222222222223e-05,
"loss": 0.3072,
"num_input_tokens_seen": 12144,
"step": 80
},
{
"epoch": 0.4722222222222222,
"grad_norm": 16.299089431762695,
"learning_rate": 1.1666666666666668e-05,
"loss": 0.2421,
"num_input_tokens_seen": 12880,
"step": 85
},
{
"epoch": 0.5,
"grad_norm": 7.419747352600098,
"learning_rate": 1.2361111111111112e-05,
"loss": 0.2412,
"num_input_tokens_seen": 13664,
"step": 90
},
{
"epoch": 0.5277777777777778,
"grad_norm": 15.009756088256836,
"learning_rate": 1.3055555555555557e-05,
"loss": 0.4789,
"num_input_tokens_seen": 14464,
"step": 95
},
{
"epoch": 0.5555555555555556,
"grad_norm": 8.924155235290527,
"learning_rate": 1.3750000000000002e-05,
"loss": 0.2502,
"num_input_tokens_seen": 15216,
"step": 100
},
{
"epoch": 0.5833333333333334,
"grad_norm": 10.291241645812988,
"learning_rate": 1.4444444444444444e-05,
"loss": 0.2588,
"num_input_tokens_seen": 15984,
"step": 105
},
{
"epoch": 0.6111111111111112,
"grad_norm": 16.27149200439453,
"learning_rate": 1.5138888888888888e-05,
"loss": 0.3088,
"num_input_tokens_seen": 16768,
"step": 110
},
{
"epoch": 0.6388888888888888,
"grad_norm": 8.211831092834473,
"learning_rate": 1.5833333333333333e-05,
"loss": 0.2734,
"num_input_tokens_seen": 17552,
"step": 115
},
{
"epoch": 0.6666666666666666,
"grad_norm": 4.888490676879883,
"learning_rate": 1.6527777777777777e-05,
"loss": 0.2656,
"num_input_tokens_seen": 18304,
"step": 120
},
{
"epoch": 0.6944444444444444,
"grad_norm": 5.311173915863037,
"learning_rate": 1.7222222222222224e-05,
"loss": 0.23,
"num_input_tokens_seen": 19072,
"step": 125
},
{
"epoch": 0.7222222222222222,
"grad_norm": 3.224381446838379,
"learning_rate": 1.7916666666666667e-05,
"loss": 0.2338,
"num_input_tokens_seen": 19840,
"step": 130
},
{
"epoch": 0.75,
"grad_norm": 10.3880615234375,
"learning_rate": 1.861111111111111e-05,
"loss": 0.3357,
"num_input_tokens_seen": 20640,
"step": 135
},
{
"epoch": 0.7777777777777778,
"grad_norm": 7.747908115386963,
"learning_rate": 1.9305555555555558e-05,
"loss": 0.3484,
"num_input_tokens_seen": 21408,
"step": 140
},
{
"epoch": 0.8055555555555556,
"grad_norm": 5.8082990646362305,
"learning_rate": 2e-05,
"loss": 0.2401,
"num_input_tokens_seen": 22128,
"step": 145
},
{
"epoch": 0.8333333333333334,
"grad_norm": 3.7531211376190186,
"learning_rate": 2.0694444444444445e-05,
"loss": 0.2147,
"num_input_tokens_seen": 22880,
"step": 150
},
{
"epoch": 0.8611111111111112,
"grad_norm": 6.251461505889893,
"learning_rate": 2.138888888888889e-05,
"loss": 0.2482,
"num_input_tokens_seen": 23664,
"step": 155
},
{
"epoch": 0.8888888888888888,
"grad_norm": 1.8583431243896484,
"learning_rate": 2.2083333333333333e-05,
"loss": 0.2706,
"num_input_tokens_seen": 24432,
"step": 160
},
{
"epoch": 0.9166666666666666,
"grad_norm": 1.6401225328445435,
"learning_rate": 2.277777777777778e-05,
"loss": 0.2638,
"num_input_tokens_seen": 25184,
"step": 165
},
{
"epoch": 0.9444444444444444,
"grad_norm": 4.828391075134277,
"learning_rate": 2.3472222222222223e-05,
"loss": 0.2324,
"num_input_tokens_seen": 25920,
"step": 170
},
{
"epoch": 0.9722222222222222,
"grad_norm": 2.8311290740966797,
"learning_rate": 2.4166666666666667e-05,
"loss": 0.2318,
"num_input_tokens_seen": 26672,
"step": 175
},
{
"epoch": 1.0,
"grad_norm": 0.7510238885879517,
"learning_rate": 2.4861111111111114e-05,
"loss": 0.2307,
"num_input_tokens_seen": 27408,
"step": 180
},
{
"epoch": 1.0,
"eval_loss": 0.24429556727409363,
"eval_runtime": 0.841,
"eval_samples_per_second": 47.562,
"eval_steps_per_second": 23.781,
"num_input_tokens_seen": 27408,
"step": 180
},
{
"epoch": 1.0277777777777777,
"grad_norm": 1.5177123546600342,
"learning_rate": 2.5555555555555554e-05,
"loss": 0.2288,
"num_input_tokens_seen": 28176,
"step": 185
},
{
"epoch": 1.0555555555555556,
"grad_norm": 1.2902367115020752,
"learning_rate": 2.625e-05,
"loss": 0.2356,
"num_input_tokens_seen": 28928,
"step": 190
},
{
"epoch": 1.0833333333333333,
"grad_norm": 4.072321891784668,
"learning_rate": 2.6944444444444445e-05,
"loss": 0.2577,
"num_input_tokens_seen": 29696,
"step": 195
},
{
"epoch": 1.1111111111111112,
"grad_norm": 0.4522273540496826,
"learning_rate": 2.7638888888888892e-05,
"loss": 0.2403,
"num_input_tokens_seen": 30448,
"step": 200
},
{
"epoch": 1.1388888888888888,
"grad_norm": 0.927115797996521,
"learning_rate": 2.8333333333333335e-05,
"loss": 0.257,
"num_input_tokens_seen": 31200,
"step": 205
},
{
"epoch": 1.1666666666666667,
"grad_norm": 2.620466470718384,
"learning_rate": 2.9027777777777782e-05,
"loss": 0.2328,
"num_input_tokens_seen": 31936,
"step": 210
},
{
"epoch": 1.1944444444444444,
"grad_norm": 1.051540493965149,
"learning_rate": 2.9722222222222223e-05,
"loss": 0.2362,
"num_input_tokens_seen": 32704,
"step": 215
},
{
"epoch": 1.2222222222222223,
"grad_norm": 0.6837607622146606,
"learning_rate": 3.0416666666666666e-05,
"loss": 0.2375,
"num_input_tokens_seen": 33440,
"step": 220
},
{
"epoch": 1.25,
"grad_norm": 2.1056437492370605,
"learning_rate": 3.111111111111111e-05,
"loss": 0.2384,
"num_input_tokens_seen": 34192,
"step": 225
},
{
"epoch": 1.2777777777777777,
"grad_norm": 1.122992992401123,
"learning_rate": 3.180555555555556e-05,
"loss": 0.2388,
"num_input_tokens_seen": 34960,
"step": 230
},
{
"epoch": 1.3055555555555556,
"grad_norm": 0.9644664525985718,
"learning_rate": 3.2500000000000004e-05,
"loss": 0.2736,
"num_input_tokens_seen": 35728,
"step": 235
},
{
"epoch": 1.3333333333333333,
"grad_norm": 0.8712486028671265,
"learning_rate": 3.3194444444444444e-05,
"loss": 0.2352,
"num_input_tokens_seen": 36464,
"step": 240
},
{
"epoch": 1.3611111111111112,
"grad_norm": 2.6176774501800537,
"learning_rate": 3.388888888888889e-05,
"loss": 0.2393,
"num_input_tokens_seen": 37216,
"step": 245
},
{
"epoch": 1.3888888888888888,
"grad_norm": 1.305676817893982,
"learning_rate": 3.458333333333333e-05,
"loss": 0.2299,
"num_input_tokens_seen": 38000,
"step": 250
},
{
"epoch": 1.4166666666666667,
"grad_norm": 9.111144065856934,
"learning_rate": 3.527777777777778e-05,
"loss": 0.275,
"num_input_tokens_seen": 38800,
"step": 255
},
{
"epoch": 1.4444444444444444,
"grad_norm": 2.4773318767547607,
"learning_rate": 3.5972222222222225e-05,
"loss": 0.2884,
"num_input_tokens_seen": 39584,
"step": 260
},
{
"epoch": 1.4722222222222223,
"grad_norm": 3.8086376190185547,
"learning_rate": 3.6666666666666666e-05,
"loss": 0.2382,
"num_input_tokens_seen": 40352,
"step": 265
},
{
"epoch": 1.5,
"grad_norm": 3.7353453636169434,
"learning_rate": 3.736111111111111e-05,
"loss": 0.2257,
"num_input_tokens_seen": 41120,
"step": 270
},
{
"epoch": 1.5277777777777777,
"grad_norm": 25.111865997314453,
"learning_rate": 3.805555555555555e-05,
"loss": 0.2836,
"num_input_tokens_seen": 41856,
"step": 275
},
{
"epoch": 1.5555555555555556,
"grad_norm": 13.982370376586914,
"learning_rate": 3.875e-05,
"loss": 0.8554,
"num_input_tokens_seen": 42640,
"step": 280
},
{
"epoch": 1.5833333333333335,
"grad_norm": 154.56797790527344,
"learning_rate": 3.944444444444445e-05,
"loss": 3.143,
"num_input_tokens_seen": 43392,
"step": 285
},
{
"epoch": 1.6111111111111112,
"grad_norm": 3.4631268978118896,
"learning_rate": 4.0138888888888894e-05,
"loss": 0.3615,
"num_input_tokens_seen": 44176,
"step": 290
},
{
"epoch": 1.6388888888888888,
"grad_norm": 0.45719239115715027,
"learning_rate": 4.0833333333333334e-05,
"loss": 0.2289,
"num_input_tokens_seen": 44928,
"step": 295
},
{
"epoch": 1.6666666666666665,
"grad_norm": 1.5930590629577637,
"learning_rate": 4.152777777777778e-05,
"loss": 0.2271,
"num_input_tokens_seen": 45696,
"step": 300
},
{
"epoch": 1.6944444444444444,
"grad_norm": 0.19013027846813202,
"learning_rate": 4.222222222222222e-05,
"loss": 0.2477,
"num_input_tokens_seen": 46464,
"step": 305
},
{
"epoch": 1.7222222222222223,
"grad_norm": 1.5907601118087769,
"learning_rate": 4.291666666666667e-05,
"loss": 0.2669,
"num_input_tokens_seen": 47216,
"step": 310
},
{
"epoch": 1.75,
"grad_norm": 0.35706278681755066,
"learning_rate": 4.3611111111111116e-05,
"loss": 0.2423,
"num_input_tokens_seen": 47984,
"step": 315
},
{
"epoch": 1.7777777777777777,
"grad_norm": 1.6776552200317383,
"learning_rate": 4.4305555555555556e-05,
"loss": 0.2228,
"num_input_tokens_seen": 48720,
"step": 320
},
{
"epoch": 1.8055555555555556,
"grad_norm": 1.1364836692810059,
"learning_rate": 4.5e-05,
"loss": 0.2698,
"num_input_tokens_seen": 49472,
"step": 325
},
{
"epoch": 1.8333333333333335,
"grad_norm": 0.17125339806079865,
"learning_rate": 4.569444444444444e-05,
"loss": 0.2317,
"num_input_tokens_seen": 50224,
"step": 330
},
{
"epoch": 1.8611111111111112,
"grad_norm": 1.3546942472457886,
"learning_rate": 4.638888888888889e-05,
"loss": 0.2387,
"num_input_tokens_seen": 50944,
"step": 335
},
{
"epoch": 1.8888888888888888,
"grad_norm": 5.500971794128418,
"learning_rate": 4.708333333333334e-05,
"loss": 0.264,
"num_input_tokens_seen": 51696,
"step": 340
},
{
"epoch": 1.9166666666666665,
"grad_norm": 3.2084038257598877,
"learning_rate": 4.7777777777777784e-05,
"loss": 0.3239,
"num_input_tokens_seen": 52480,
"step": 345
},
{
"epoch": 1.9444444444444444,
"grad_norm": 1.4998213052749634,
"learning_rate": 4.8472222222222224e-05,
"loss": 0.2497,
"num_input_tokens_seen": 53232,
"step": 350
},
{
"epoch": 1.9722222222222223,
"grad_norm": 0.29920241236686707,
"learning_rate": 4.9166666666666665e-05,
"loss": 0.2366,
"num_input_tokens_seen": 54016,
"step": 355
},
{
"epoch": 2.0,
"grad_norm": 0.9520807862281799,
"learning_rate": 4.986111111111111e-05,
"loss": 0.2399,
"num_input_tokens_seen": 54752,
"step": 360
},
{
"epoch": 2.0,
"eval_loss": 0.23347768187522888,
"eval_runtime": 0.9935,
"eval_samples_per_second": 40.261,
"eval_steps_per_second": 20.131,
"num_input_tokens_seen": 54752,
"step": 360
},
{
"epoch": 2.0277777777777777,
"grad_norm": 0.16839726269245148,
"learning_rate": 4.99998119647914e-05,
"loss": 0.2378,
"num_input_tokens_seen": 55520,
"step": 365
},
{
"epoch": 2.0555555555555554,
"grad_norm": 0.8747950792312622,
"learning_rate": 4.999904807660428e-05,
"loss": 0.2401,
"num_input_tokens_seen": 56288,
"step": 370
},
{
"epoch": 2.0833333333333335,
"grad_norm": 0.27838072180747986,
"learning_rate": 4.999769660117901e-05,
"loss": 0.2284,
"num_input_tokens_seen": 57040,
"step": 375
},
{
"epoch": 2.111111111111111,
"grad_norm": 0.28668636083602905,
"learning_rate": 4.999575757028119e-05,
"loss": 0.2262,
"num_input_tokens_seen": 57792,
"step": 380
},
{
"epoch": 2.138888888888889,
"grad_norm": 0.7376558780670166,
"learning_rate": 4.9993231029486544e-05,
"loss": 0.2531,
"num_input_tokens_seen": 58576,
"step": 385
},
{
"epoch": 2.1666666666666665,
"grad_norm": 7.420198440551758,
"learning_rate": 4.999011703817986e-05,
"loss": 0.2609,
"num_input_tokens_seen": 59344,
"step": 390
},
{
"epoch": 2.1944444444444446,
"grad_norm": 3.8974297046661377,
"learning_rate": 4.9986415669553586e-05,
"loss": 0.2575,
"num_input_tokens_seen": 60112,
"step": 395
},
{
"epoch": 2.2222222222222223,
"grad_norm": 0.4262005090713501,
"learning_rate": 4.998212701060612e-05,
"loss": 0.215,
"num_input_tokens_seen": 60896,
"step": 400
},
{
"epoch": 2.25,
"grad_norm": 0.2879176735877991,
"learning_rate": 4.997725116213973e-05,
"loss": 0.2506,
"num_input_tokens_seen": 61648,
"step": 405
},
{
"epoch": 2.2777777777777777,
"grad_norm": 1.6084895133972168,
"learning_rate": 4.997178823875826e-05,
"loss": 0.241,
"num_input_tokens_seen": 62400,
"step": 410
},
{
"epoch": 2.3055555555555554,
"grad_norm": 0.1998496651649475,
"learning_rate": 4.996573836886435e-05,
"loss": 0.2412,
"num_input_tokens_seen": 63136,
"step": 415
},
{
"epoch": 2.3333333333333335,
"grad_norm": 0.7503790259361267,
"learning_rate": 4.995910169465646e-05,
"loss": 0.23,
"num_input_tokens_seen": 63888,
"step": 420
},
{
"epoch": 2.361111111111111,
"grad_norm": 1.1558177471160889,
"learning_rate": 4.9951878372125547e-05,
"loss": 0.2064,
"num_input_tokens_seen": 64624,
"step": 425
},
{
"epoch": 2.388888888888889,
"grad_norm": 0.29164397716522217,
"learning_rate": 4.994406857105136e-05,
"loss": 0.2645,
"num_input_tokens_seen": 65376,
"step": 430
},
{
"epoch": 2.4166666666666665,
"grad_norm": 0.4840473532676697,
"learning_rate": 4.993567247499845e-05,
"loss": 0.2689,
"num_input_tokens_seen": 66112,
"step": 435
},
{
"epoch": 2.4444444444444446,
"grad_norm": 0.6077887415885925,
"learning_rate": 4.9926690281311904e-05,
"loss": 0.2355,
"num_input_tokens_seen": 66896,
"step": 440
},
{
"epoch": 2.4722222222222223,
"grad_norm": 0.37077003717422485,
"learning_rate": 4.9917122201112656e-05,
"loss": 0.2409,
"num_input_tokens_seen": 67664,
"step": 445
},
{
"epoch": 2.5,
"grad_norm": 0.4490332305431366,
"learning_rate": 4.9906968459292524e-05,
"loss": 0.204,
"num_input_tokens_seen": 68432,
"step": 450
},
{
"epoch": 2.5277777777777777,
"grad_norm": 0.24682901799678802,
"learning_rate": 4.9896229294508976e-05,
"loss": 0.2514,
"num_input_tokens_seen": 69152,
"step": 455
},
{
"epoch": 2.5555555555555554,
"grad_norm": 0.17960534989833832,
"learning_rate": 4.988490495917947e-05,
"loss": 0.2615,
"num_input_tokens_seen": 69936,
"step": 460
},
{
"epoch": 2.5833333333333335,
"grad_norm": 1.6768314838409424,
"learning_rate": 4.987299571947553e-05,
"loss": 0.2344,
"num_input_tokens_seen": 70704,
"step": 465
},
{
"epoch": 2.611111111111111,
"grad_norm": 1.0035189390182495,
"learning_rate": 4.9860501855316514e-05,
"loss": 0.2223,
"num_input_tokens_seen": 71488,
"step": 470
},
{
"epoch": 2.638888888888889,
"grad_norm": 0.2373342365026474,
"learning_rate": 4.9847423660363e-05,
"loss": 0.2689,
"num_input_tokens_seen": 72224,
"step": 475
},
{
"epoch": 2.6666666666666665,
"grad_norm": 0.534795343875885,
"learning_rate": 4.983376144200992e-05,
"loss": 0.242,
"num_input_tokens_seen": 72976,
"step": 480
},
{
"epoch": 2.6944444444444446,
"grad_norm": 0.2486744225025177,
"learning_rate": 4.981951552137929e-05,
"loss": 0.233,
"num_input_tokens_seen": 73792,
"step": 485
},
{
"epoch": 2.7222222222222223,
"grad_norm": 0.1451578289270401,
"learning_rate": 4.980468623331273e-05,
"loss": 0.2351,
"num_input_tokens_seen": 74560,
"step": 490
},
{
"epoch": 2.75,
"grad_norm": 0.7469542026519775,
"learning_rate": 4.978927392636351e-05,
"loss": 0.2396,
"num_input_tokens_seen": 75328,
"step": 495
},
{
"epoch": 2.7777777777777777,
"grad_norm": 0.9310773611068726,
"learning_rate": 4.9773278962788436e-05,
"loss": 0.2568,
"num_input_tokens_seen": 76080,
"step": 500
},
{
"epoch": 2.8055555555555554,
"grad_norm": 0.08243211358785629,
"learning_rate": 4.975670171853926e-05,
"loss": 0.2442,
"num_input_tokens_seen": 76848,
"step": 505
},
{
"epoch": 2.8333333333333335,
"grad_norm": 0.07204469293355942,
"learning_rate": 4.973954258325392e-05,
"loss": 0.2304,
"num_input_tokens_seen": 77616,
"step": 510
},
{
"epoch": 2.861111111111111,
"grad_norm": 0.09893473237752914,
"learning_rate": 4.972180196024733e-05,
"loss": 0.222,
"num_input_tokens_seen": 78384,
"step": 515
},
{
"epoch": 2.888888888888889,
"grad_norm": 0.10142597556114197,
"learning_rate": 4.97034802665019e-05,
"loss": 0.2374,
"num_input_tokens_seen": 79136,
"step": 520
},
{
"epoch": 2.9166666666666665,
"grad_norm": 0.5532053112983704,
"learning_rate": 4.9684577932657786e-05,
"loss": 0.2284,
"num_input_tokens_seen": 79920,
"step": 525
},
{
"epoch": 2.9444444444444446,
"grad_norm": 0.6459735631942749,
"learning_rate": 4.966509540300269e-05,
"loss": 0.2185,
"num_input_tokens_seen": 80672,
"step": 530
},
{
"epoch": 2.9722222222222223,
"grad_norm": 0.07756423205137253,
"learning_rate": 4.9645033135461494e-05,
"loss": 0.2737,
"num_input_tokens_seen": 81440,
"step": 535
},
{
"epoch": 3.0,
"grad_norm": 0.3974766135215759,
"learning_rate": 4.962439160158544e-05,
"loss": 0.2437,
"num_input_tokens_seen": 82176,
"step": 540
},
{
"epoch": 3.0,
"eval_loss": 0.23567190766334534,
"eval_runtime": 0.8517,
"eval_samples_per_second": 46.965,
"eval_steps_per_second": 23.483,
"num_input_tokens_seen": 82176,
"step": 540
},
{
"epoch": 3.0277777777777777,
"grad_norm": 0.35840246081352234,
"learning_rate": 4.960317128654108e-05,
"loss": 0.2396,
"num_input_tokens_seen": 82944,
"step": 545
},
{
"epoch": 3.0555555555555554,
"grad_norm": 0.06530583649873734,
"learning_rate": 4.958137268909887e-05,
"loss": 0.2521,
"num_input_tokens_seen": 83712,
"step": 550
},
{
"epoch": 3.0833333333333335,
"grad_norm": 0.10089149326086044,
"learning_rate": 4.9558996321621405e-05,
"loss": 0.2297,
"num_input_tokens_seen": 84496,
"step": 555
},
{
"epoch": 3.111111111111111,
"grad_norm": 0.4209873676300049,
"learning_rate": 4.953604271005144e-05,
"loss": 0.2335,
"num_input_tokens_seen": 85280,
"step": 560
},
{
"epoch": 3.138888888888889,
"grad_norm": 0.298819363117218,
"learning_rate": 4.951251239389948e-05,
"loss": 0.2434,
"num_input_tokens_seen": 86016,
"step": 565
},
{
"epoch": 3.1666666666666665,
"grad_norm": 0.051832061260938644,
"learning_rate": 4.9488405926231144e-05,
"loss": 0.2308,
"num_input_tokens_seen": 86800,
"step": 570
},
{
"epoch": 3.1944444444444446,
"grad_norm": 0.04287365451455116,
"learning_rate": 4.946372387365409e-05,
"loss": 0.2427,
"num_input_tokens_seen": 87568,
"step": 575
},
{
"epoch": 3.2222222222222223,
"grad_norm": 0.29626473784446716,
"learning_rate": 4.943846681630479e-05,
"loss": 0.2317,
"num_input_tokens_seen": 88320,
"step": 580
},
{
"epoch": 3.25,
"grad_norm": 0.06258111447095871,
"learning_rate": 4.941263534783482e-05,
"loss": 0.2285,
"num_input_tokens_seen": 89072,
"step": 585
},
{
"epoch": 3.2777777777777777,
"grad_norm": 0.13033808767795563,
"learning_rate": 4.9386230075396964e-05,
"loss": 0.2165,
"num_input_tokens_seen": 89792,
"step": 590
},
{
"epoch": 3.3055555555555554,
"grad_norm": 0.3183232247829437,
"learning_rate": 4.9359251619630886e-05,
"loss": 0.2417,
"num_input_tokens_seen": 90576,
"step": 595
},
{
"epoch": 3.3333333333333335,
"grad_norm": 0.2652042508125305,
"learning_rate": 4.933170061464858e-05,
"loss": 0.2425,
"num_input_tokens_seen": 91360,
"step": 600
},
{
"epoch": 3.361111111111111,
"grad_norm": 0.2555042803287506,
"learning_rate": 4.930357770801947e-05,
"loss": 0.1958,
"num_input_tokens_seen": 92128,
"step": 605
},
{
"epoch": 3.388888888888889,
"grad_norm": 0.4280713200569153,
"learning_rate": 4.9274883560755156e-05,
"loss": 0.2538,
"num_input_tokens_seen": 92912,
"step": 610
},
{
"epoch": 3.4166666666666665,
"grad_norm": 0.23289966583251953,
"learning_rate": 4.924561884729391e-05,
"loss": 0.2223,
"num_input_tokens_seen": 93648,
"step": 615
},
{
"epoch": 3.4444444444444446,
"grad_norm": 0.06474464386701584,
"learning_rate": 4.921578425548482e-05,
"loss": 0.2504,
"num_input_tokens_seen": 94416,
"step": 620
},
{
"epoch": 3.4722222222222223,
"grad_norm": 0.3162301480770111,
"learning_rate": 4.9185380486571595e-05,
"loss": 0.2311,
"num_input_tokens_seen": 95168,
"step": 625
},
{
"epoch": 3.5,
"grad_norm": 0.04434465989470482,
"learning_rate": 4.915440825517612e-05,
"loss": 0.2216,
"num_input_tokens_seen": 95936,
"step": 630
},
{
"epoch": 3.5277777777777777,
"grad_norm": 0.2360685020685196,
"learning_rate": 4.912286828928162e-05,
"loss": 0.2187,
"num_input_tokens_seen": 96688,
"step": 635
},
{
"epoch": 3.5555555555555554,
"grad_norm": 0.04857361316680908,
"learning_rate": 4.909076133021557e-05,
"loss": 0.2402,
"num_input_tokens_seen": 97456,
"step": 640
},
{
"epoch": 3.5833333333333335,
"grad_norm": 0.03909294307231903,
"learning_rate": 4.9058088132632306e-05,
"loss": 0.2395,
"num_input_tokens_seen": 98208,
"step": 645
},
{
"epoch": 3.611111111111111,
"grad_norm": 0.2648538053035736,
"learning_rate": 4.9024849464495215e-05,
"loss": 0.2456,
"num_input_tokens_seen": 98944,
"step": 650
},
{
"epoch": 3.638888888888889,
"grad_norm": 0.035087645053863525,
"learning_rate": 4.8991046107058735e-05,
"loss": 0.2295,
"num_input_tokens_seen": 99728,
"step": 655
},
{
"epoch": 3.6666666666666665,
"grad_norm": 0.2587370276451111,
"learning_rate": 4.895667885484997e-05,
"loss": 0.2403,
"num_input_tokens_seen": 100496,
"step": 660
},
{
"epoch": 3.6944444444444446,
"grad_norm": 0.26684993505477905,
"learning_rate": 4.892174851565004e-05,
"loss": 0.2246,
"num_input_tokens_seen": 101264,
"step": 665
},
{
"epoch": 3.7222222222222223,
"grad_norm": 0.2731635868549347,
"learning_rate": 4.8886255910475054e-05,
"loss": 0.2396,
"num_input_tokens_seen": 102016,
"step": 670
},
{
"epoch": 3.75,
"grad_norm": 0.25409963726997375,
"learning_rate": 4.885020187355687e-05,
"loss": 0.2517,
"num_input_tokens_seen": 102768,
"step": 675
},
{
"epoch": 3.7777777777777777,
"grad_norm": 0.21939261257648468,
"learning_rate": 4.881358725232342e-05,
"loss": 0.2297,
"num_input_tokens_seen": 103520,
"step": 680
},
{
"epoch": 3.8055555555555554,
"grad_norm": 0.2201327532529831,
"learning_rate": 4.877641290737884e-05,
"loss": 0.2379,
"num_input_tokens_seen": 104288,
"step": 685
},
{
"epoch": 3.8333333333333335,
"grad_norm": 0.2317037433385849,
"learning_rate": 4.873867971248324e-05,
"loss": 0.2403,
"num_input_tokens_seen": 105056,
"step": 690
},
{
"epoch": 3.861111111111111,
"grad_norm": 0.20789989829063416,
"learning_rate": 4.870038855453213e-05,
"loss": 0.2258,
"num_input_tokens_seen": 105792,
"step": 695
},
{
"epoch": 3.888888888888889,
"grad_norm": 0.20359978079795837,
"learning_rate": 4.866154033353561e-05,
"loss": 0.23,
"num_input_tokens_seen": 106544,
"step": 700
},
{
"epoch": 3.9166666666666665,
"grad_norm": 0.044127654284238815,
"learning_rate": 4.86221359625972e-05,
"loss": 0.23,
"num_input_tokens_seen": 107312,
"step": 705
},
{
"epoch": 3.9444444444444446,
"grad_norm": 0.04580540582537651,
"learning_rate": 4.858217636789241e-05,
"loss": 0.2158,
"num_input_tokens_seen": 108064,
"step": 710
},
{
"epoch": 3.9722222222222223,
"grad_norm": 0.04189879074692726,
"learning_rate": 4.854166248864689e-05,
"loss": 0.2468,
"num_input_tokens_seen": 108848,
"step": 715
},
{
"epoch": 4.0,
"grad_norm": 0.22962501645088196,
"learning_rate": 4.850059527711444e-05,
"loss": 0.2275,
"num_input_tokens_seen": 109584,
"step": 720
},
{
"epoch": 4.0,
"eval_loss": 0.23304803669452667,
"eval_runtime": 0.8386,
"eval_samples_per_second": 47.699,
"eval_steps_per_second": 23.849,
"num_input_tokens_seen": 109584,
"step": 720
},
{
"epoch": 4.027777777777778,
"grad_norm": 0.21242158114910126,
"learning_rate": 4.84589756985546e-05,
"loss": 0.2294,
"num_input_tokens_seen": 110336,
"step": 725
},
{
"epoch": 4.055555555555555,
"grad_norm": 0.03922082111239433,
"learning_rate": 4.8416804731209945e-05,
"loss": 0.2314,
"num_input_tokens_seen": 111104,
"step": 730
},
{
"epoch": 4.083333333333333,
"grad_norm": 0.04513590782880783,
"learning_rate": 4.8374083366283096e-05,
"loss": 0.2337,
"num_input_tokens_seen": 111856,
"step": 735
},
{
"epoch": 4.111111111111111,
"grad_norm": 0.041459400206804276,
"learning_rate": 4.833081260791345e-05,
"loss": 0.2443,
"num_input_tokens_seen": 112624,
"step": 740
},
{
"epoch": 4.138888888888889,
"grad_norm": 0.22475330531597137,
"learning_rate": 4.828699347315356e-05,
"loss": 0.2314,
"num_input_tokens_seen": 113376,
"step": 745
},
{
"epoch": 4.166666666666667,
"grad_norm": 0.04301132634282112,
"learning_rate": 4.82426269919452e-05,
"loss": 0.2316,
"num_input_tokens_seen": 114096,
"step": 750
},
{
"epoch": 4.194444444444445,
"grad_norm": 0.021374447271227837,
"learning_rate": 4.8197714207095205e-05,
"loss": 0.2339,
"num_input_tokens_seen": 114832,
"step": 755
},
{
"epoch": 4.222222222222222,
"grad_norm": 0.2153686285018921,
"learning_rate": 4.815225617425095e-05,
"loss": 0.2317,
"num_input_tokens_seen": 115632,
"step": 760
},
{
"epoch": 4.25,
"grad_norm": 0.03661501035094261,
"learning_rate": 4.8106253961875506e-05,
"loss": 0.2277,
"num_input_tokens_seen": 116352,
"step": 765
},
{
"epoch": 4.277777777777778,
"grad_norm": 0.22066719830036163,
"learning_rate": 4.805970865122257e-05,
"loss": 0.2379,
"num_input_tokens_seen": 117152,
"step": 770
},
{
"epoch": 4.305555555555555,
"grad_norm": 0.22886253893375397,
"learning_rate": 4.8012621336311016e-05,
"loss": 0.2341,
"num_input_tokens_seen": 117904,
"step": 775
},
{
"epoch": 4.333333333333333,
"grad_norm": 0.2830849289894104,
"learning_rate": 4.7964993123899195e-05,
"loss": 0.2498,
"num_input_tokens_seen": 118672,
"step": 780
},
{
"epoch": 4.361111111111111,
"grad_norm": 0.030636107549071312,
"learning_rate": 4.791682513345892e-05,
"loss": 0.2342,
"num_input_tokens_seen": 119424,
"step": 785
},
{
"epoch": 4.388888888888889,
"grad_norm": 0.041791874915361404,
"learning_rate": 4.786811849714918e-05,
"loss": 0.2317,
"num_input_tokens_seen": 120176,
"step": 790
},
{
"epoch": 4.416666666666667,
"grad_norm": 0.2024121731519699,
"learning_rate": 4.781887435978947e-05,
"loss": 0.2275,
"num_input_tokens_seen": 120960,
"step": 795
},
{
"epoch": 4.444444444444445,
"grad_norm": 0.19425232708454132,
"learning_rate": 4.776909387883292e-05,
"loss": 0.2274,
"num_input_tokens_seen": 121712,
"step": 800
},
{
"epoch": 4.472222222222222,
"grad_norm": 0.30294886231422424,
"learning_rate": 4.771877822433911e-05,
"loss": 0.227,
"num_input_tokens_seen": 122464,
"step": 805
},
{
"epoch": 4.5,
"grad_norm": 0.20341075956821442,
"learning_rate": 4.766792857894652e-05,
"loss": 0.2314,
"num_input_tokens_seen": 123232,
"step": 810
},
{
"epoch": 4.527777777777778,
"grad_norm": 0.07046259939670563,
"learning_rate": 4.761654613784477e-05,
"loss": 0.2583,
"num_input_tokens_seen": 124000,
"step": 815
},
{
"epoch": 4.555555555555555,
"grad_norm": 0.06253518909215927,
"learning_rate": 4.756463210874652e-05,
"loss": 0.2518,
"num_input_tokens_seen": 124768,
"step": 820
},
{
"epoch": 4.583333333333333,
"grad_norm": 0.062459491193294525,
"learning_rate": 4.751218771185906e-05,
"loss": 0.2418,
"num_input_tokens_seen": 125520,
"step": 825
},
{
"epoch": 4.611111111111111,
"grad_norm": 0.1863209456205368,
"learning_rate": 4.745921417985566e-05,
"loss": 0.2181,
"num_input_tokens_seen": 126256,
"step": 830
},
{
"epoch": 4.638888888888889,
"grad_norm": 0.1782846450805664,
"learning_rate": 4.740571275784659e-05,
"loss": 0.232,
"num_input_tokens_seen": 127024,
"step": 835
},
{
"epoch": 4.666666666666667,
"grad_norm": 0.1876181811094284,
"learning_rate": 4.735168470334984e-05,
"loss": 0.2327,
"num_input_tokens_seen": 127792,
"step": 840
},
{
"epoch": 4.694444444444445,
"grad_norm": 0.21437183022499084,
"learning_rate": 4.729713128626158e-05,
"loss": 0.2405,
"num_input_tokens_seen": 128544,
"step": 845
},
{
"epoch": 4.722222222222222,
"grad_norm": 0.2045063078403473,
"learning_rate": 4.72420537888263e-05,
"loss": 0.234,
"num_input_tokens_seen": 129312,
"step": 850
},
{
"epoch": 4.75,
"grad_norm": 0.023555081337690353,
"learning_rate": 4.7186453505606676e-05,
"loss": 0.2316,
"num_input_tokens_seen": 130112,
"step": 855
},
{
"epoch": 4.777777777777778,
"grad_norm": 0.038099661469459534,
"learning_rate": 4.713033174345314e-05,
"loss": 0.236,
"num_input_tokens_seen": 130864,
"step": 860
},
{
"epoch": 4.805555555555555,
"grad_norm": 0.20526079833507538,
"learning_rate": 4.707368982147318e-05,
"loss": 0.2155,
"num_input_tokens_seen": 131632,
"step": 865
},
{
"epoch": 4.833333333333333,
"grad_norm": 0.07691137492656708,
"learning_rate": 4.701652907100029e-05,
"loss": 0.2335,
"num_input_tokens_seen": 132400,
"step": 870
},
{
"epoch": 4.861111111111111,
"grad_norm": 0.2254783809185028,
"learning_rate": 4.695885083556275e-05,
"loss": 0.2363,
"num_input_tokens_seen": 133152,
"step": 875
},
{
"epoch": 4.888888888888889,
"grad_norm": 0.08800611644983292,
"learning_rate": 4.6900656470851964e-05,
"loss": 0.2614,
"num_input_tokens_seen": 133920,
"step": 880
},
{
"epoch": 4.916666666666667,
"grad_norm": 0.044184520840644836,
"learning_rate": 4.684194734469067e-05,
"loss": 0.2221,
"num_input_tokens_seen": 134688,
"step": 885
},
{
"epoch": 4.944444444444445,
"grad_norm": 0.21772927045822144,
"learning_rate": 4.678272483700074e-05,
"loss": 0.2317,
"num_input_tokens_seen": 135456,
"step": 890
},
{
"epoch": 4.972222222222222,
"grad_norm": 0.06680847704410553,
"learning_rate": 4.672299033977076e-05,
"loss": 0.2498,
"num_input_tokens_seen": 136240,
"step": 895
},
{
"epoch": 5.0,
"grad_norm": 0.029849985614418983,
"learning_rate": 4.6662745257023325e-05,
"loss": 0.2341,
"num_input_tokens_seen": 137008,
"step": 900
},
{
"epoch": 5.0,
"eval_loss": 0.23569175601005554,
"eval_runtime": 0.8416,
"eval_samples_per_second": 47.528,
"eval_steps_per_second": 23.764,
"num_input_tokens_seen": 137008,
"step": 900
},
{
"epoch": 5.027777777777778,
"grad_norm": 0.190761536359787,
"learning_rate": 4.660199100478202e-05,
"loss": 0.2336,
"num_input_tokens_seen": 137776,
"step": 905
},
{
"epoch": 5.055555555555555,
"grad_norm": 0.19231367111206055,
"learning_rate": 4.6540729011038146e-05,
"loss": 0.2295,
"num_input_tokens_seen": 138496,
"step": 910
},
{
"epoch": 5.083333333333333,
"grad_norm": 0.1897604763507843,
"learning_rate": 4.6478960715717176e-05,
"loss": 0.2253,
"num_input_tokens_seen": 139280,
"step": 915
},
{
"epoch": 5.111111111111111,
"grad_norm": 0.22185863554477692,
"learning_rate": 4.641668757064486e-05,
"loss": 0.2338,
"num_input_tokens_seen": 140080,
"step": 920
},
{
"epoch": 5.138888888888889,
"grad_norm": 0.0547848604619503,
"learning_rate": 4.6353911039513145e-05,
"loss": 0.2493,
"num_input_tokens_seen": 140848,
"step": 925
},
{
"epoch": 5.166666666666667,
"grad_norm": 0.19223369657993317,
"learning_rate": 4.6290632597845755e-05,
"loss": 0.2273,
"num_input_tokens_seen": 141632,
"step": 930
},
{
"epoch": 5.194444444444445,
"grad_norm": 0.04600772261619568,
"learning_rate": 4.622685373296353e-05,
"loss": 0.2319,
"num_input_tokens_seen": 142368,
"step": 935
},
{
"epoch": 5.222222222222222,
"grad_norm": 0.18761615455150604,
"learning_rate": 4.61625759439494e-05,
"loss": 0.2338,
"num_input_tokens_seen": 143120,
"step": 940
},
{
"epoch": 5.25,
"grad_norm": 0.06209826469421387,
"learning_rate": 4.609780074161327e-05,
"loss": 0.2296,
"num_input_tokens_seen": 143904,
"step": 945
},
{
"epoch": 5.277777777777778,
"grad_norm": 0.05093017593026161,
"learning_rate": 4.603252964845638e-05,
"loss": 0.2296,
"num_input_tokens_seen": 144640,
"step": 950
},
{
"epoch": 5.305555555555555,
"grad_norm": 0.03405000641942024,
"learning_rate": 4.5966764198635606e-05,
"loss": 0.2294,
"num_input_tokens_seen": 145376,
"step": 955
},
{
"epoch": 5.333333333333333,
"grad_norm": 0.17441076040267944,
"learning_rate": 4.590050593792736e-05,
"loss": 0.2235,
"num_input_tokens_seen": 146144,
"step": 960
},
{
"epoch": 5.361111111111111,
"grad_norm": 0.048055585473775864,
"learning_rate": 4.583375642369129e-05,
"loss": 0.2386,
"num_input_tokens_seen": 146912,
"step": 965
},
{
"epoch": 5.388888888888889,
"grad_norm": 0.06384623795747757,
"learning_rate": 4.5766517224833637e-05,
"loss": 0.2377,
"num_input_tokens_seen": 147664,
"step": 970
},
{
"epoch": 5.416666666666667,
"grad_norm": 0.2151617407798767,
"learning_rate": 4.569878992177039e-05,
"loss": 0.2251,
"num_input_tokens_seen": 148448,
"step": 975
},
{
"epoch": 5.444444444444445,
"grad_norm": 0.17460361123085022,
"learning_rate": 4.5630576106390114e-05,
"loss": 0.2462,
"num_input_tokens_seen": 149200,
"step": 980
},
{
"epoch": 5.472222222222222,
"grad_norm": 0.17640213668346405,
"learning_rate": 4.556187738201656e-05,
"loss": 0.2139,
"num_input_tokens_seen": 149936,
"step": 985
},
{
"epoch": 5.5,
"grad_norm": 0.18245279788970947,
"learning_rate": 4.549269536337095e-05,
"loss": 0.2209,
"num_input_tokens_seen": 150672,
"step": 990
},
{
"epoch": 5.527777777777778,
"grad_norm": 0.05563855543732643,
"learning_rate": 4.5423031676534065e-05,
"loss": 0.2216,
"num_input_tokens_seen": 151440,
"step": 995
},
{
"epoch": 5.555555555555555,
"grad_norm": 0.24899983406066895,
"learning_rate": 4.535288795890798e-05,
"loss": 0.24,
"num_input_tokens_seen": 152224,
"step": 1000
},
{
"epoch": 5.583333333333333,
"grad_norm": 0.24502120912075043,
"learning_rate": 4.528226585917761e-05,
"loss": 0.2357,
"num_input_tokens_seen": 152976,
"step": 1005
},
{
"epoch": 5.611111111111111,
"grad_norm": 0.12031486630439758,
"learning_rate": 4.521116703727193e-05,
"loss": 0.2383,
"num_input_tokens_seen": 153744,
"step": 1010
},
{
"epoch": 5.638888888888889,
"grad_norm": 0.25375932455062866,
"learning_rate": 4.5139593164324986e-05,
"loss": 0.2352,
"num_input_tokens_seen": 154512,
"step": 1015
},
{
"epoch": 5.666666666666667,
"grad_norm": 0.09390950202941895,
"learning_rate": 4.506754592263662e-05,
"loss": 0.2343,
"num_input_tokens_seen": 155280,
"step": 1020
},
{
"epoch": 5.694444444444445,
"grad_norm": 0.2544936239719391,
"learning_rate": 4.49950270056329e-05,
"loss": 0.2422,
"num_input_tokens_seen": 156032,
"step": 1025
},
{
"epoch": 5.722222222222222,
"grad_norm": 0.06608662009239197,
"learning_rate": 4.4922038117826334e-05,
"loss": 0.2336,
"num_input_tokens_seen": 156848,
"step": 1030
},
{
"epoch": 5.75,
"grad_norm": 0.20967911183834076,
"learning_rate": 4.48485809747758e-05,
"loss": 0.2347,
"num_input_tokens_seen": 157600,
"step": 1035
},
{
"epoch": 5.777777777777778,
"grad_norm": 0.1931481808423996,
"learning_rate": 4.477465730304624e-05,
"loss": 0.2333,
"num_input_tokens_seen": 158352,
"step": 1040
},
{
"epoch": 5.805555555555555,
"grad_norm": 0.0319942943751812,
"learning_rate": 4.4700268840168045e-05,
"loss": 0.2372,
"num_input_tokens_seen": 159104,
"step": 1045
},
{
"epoch": 5.833333333333333,
"grad_norm": 0.18197861313819885,
"learning_rate": 4.462541733459628e-05,
"loss": 0.2318,
"num_input_tokens_seen": 159856,
"step": 1050
},
{
"epoch": 5.861111111111111,
"grad_norm": 0.18240846693515778,
"learning_rate": 4.455010454566947e-05,
"loss": 0.2337,
"num_input_tokens_seen": 160624,
"step": 1055
},
{
"epoch": 5.888888888888889,
"grad_norm": 0.19988971948623657,
"learning_rate": 4.447433224356839e-05,
"loss": 0.2385,
"num_input_tokens_seen": 161360,
"step": 1060
},
{
"epoch": 5.916666666666667,
"grad_norm": 0.03416143357753754,
"learning_rate": 4.439810220927436e-05,
"loss": 0.2285,
"num_input_tokens_seen": 162080,
"step": 1065
},
{
"epoch": 5.944444444444445,
"grad_norm": 0.19043967127799988,
"learning_rate": 4.432141623452743e-05,
"loss": 0.2425,
"num_input_tokens_seen": 162832,
"step": 1070
},
{
"epoch": 5.972222222222222,
"grad_norm": 0.04088395833969116,
"learning_rate": 4.4244276121784195e-05,
"loss": 0.2258,
"num_input_tokens_seen": 163568,
"step": 1075
},
{
"epoch": 6.0,
"grad_norm": 0.1749011129140854,
"learning_rate": 4.416668368417556e-05,
"loss": 0.236,
"num_input_tokens_seen": 164336,
"step": 1080
},
{
"epoch": 6.0,
"eval_loss": 0.2312491238117218,
"eval_runtime": 0.845,
"eval_samples_per_second": 47.336,
"eval_steps_per_second": 23.668,
"num_input_tokens_seen": 164336,
"step": 1080
},
{
"epoch": 6.027777777777778,
"grad_norm": 0.18159140646457672,
"learning_rate": 4.408864074546401e-05,
"loss": 0.2276,
"num_input_tokens_seen": 165072,
"step": 1085
},
{
"epoch": 6.055555555555555,
"grad_norm": 0.06449268013238907,
"learning_rate": 4.401014914000078e-05,
"loss": 0.2238,
"num_input_tokens_seen": 165824,
"step": 1090
},
{
"epoch": 6.083333333333333,
"grad_norm": 0.17080135643482208,
"learning_rate": 4.393121071268274e-05,
"loss": 0.2221,
"num_input_tokens_seen": 166608,
"step": 1095
},
{
"epoch": 6.111111111111111,
"grad_norm": 0.19969283044338226,
"learning_rate": 4.3851827318909036e-05,
"loss": 0.2433,
"num_input_tokens_seen": 167360,
"step": 1100
},
{
"epoch": 6.138888888888889,
"grad_norm": 0.0702192559838295,
"learning_rate": 4.377200082453749e-05,
"loss": 0.2346,
"num_input_tokens_seen": 168128,
"step": 1105
},
{
"epoch": 6.166666666666667,
"grad_norm": 0.08154265582561493,
"learning_rate": 4.36917331058407e-05,
"loss": 0.2325,
"num_input_tokens_seen": 168896,
"step": 1110
},
{
"epoch": 6.194444444444445,
"grad_norm": 0.05283205211162567,
"learning_rate": 4.361102604946201e-05,
"loss": 0.2294,
"num_input_tokens_seen": 169648,
"step": 1115
},
{
"epoch": 6.222222222222222,
"grad_norm": 0.19671730697155,
"learning_rate": 4.3529881552371096e-05,
"loss": 0.2294,
"num_input_tokens_seen": 170416,
"step": 1120
},
{
"epoch": 6.25,
"grad_norm": 0.1893257349729538,
"learning_rate": 4.344830152181941e-05,
"loss": 0.2337,
"num_input_tokens_seen": 171184,
"step": 1125
},
{
"epoch": 6.277777777777778,
"grad_norm": 0.046719856560230255,
"learning_rate": 4.336628787529538e-05,
"loss": 0.2317,
"num_input_tokens_seen": 171984,
"step": 1130
},
{
"epoch": 6.305555555555555,
"grad_norm": 0.1931154429912567,
"learning_rate": 4.3283842540479264e-05,
"loss": 0.2233,
"num_input_tokens_seen": 172768,
"step": 1135
},
{
"epoch": 6.333333333333333,
"grad_norm": 0.19740572571754456,
"learning_rate": 4.320096745519793e-05,
"loss": 0.2317,
"num_input_tokens_seen": 173520,
"step": 1140
},
{
"epoch": 6.361111111111111,
"grad_norm": 0.0646887719631195,
"learning_rate": 4.3117664567379237e-05,
"loss": 0.2278,
"num_input_tokens_seen": 174304,
"step": 1145
},
{
"epoch": 6.388888888888889,
"grad_norm": 0.18534712493419647,
"learning_rate": 4.303393583500628e-05,
"loss": 0.2358,
"num_input_tokens_seen": 175040,
"step": 1150
},
{
"epoch": 6.416666666666667,
"grad_norm": 0.04154275357723236,
"learning_rate": 4.2949783226071406e-05,
"loss": 0.2325,
"num_input_tokens_seen": 175776,
"step": 1155
},
{
"epoch": 6.444444444444445,
"grad_norm": 0.1973147988319397,
"learning_rate": 4.286520871852987e-05,
"loss": 0.2291,
"num_input_tokens_seen": 176512,
"step": 1160
},
{
"epoch": 6.472222222222222,
"grad_norm": 0.22881700098514557,
"learning_rate": 4.278021430025343e-05,
"loss": 0.2456,
"num_input_tokens_seen": 177280,
"step": 1165
},
{
"epoch": 6.5,
"grad_norm": 0.10801947116851807,
"learning_rate": 4.2694801968983566e-05,
"loss": 0.2344,
"num_input_tokens_seen": 178032,
"step": 1170
},
{
"epoch": 6.527777777777778,
"grad_norm": 0.05663140118122101,
"learning_rate": 4.260897373228456e-05,
"loss": 0.2234,
"num_input_tokens_seen": 178784,
"step": 1175
},
{
"epoch": 6.555555555555555,
"grad_norm": 0.08504916727542877,
"learning_rate": 4.2522731607496275e-05,
"loss": 0.2243,
"num_input_tokens_seen": 179568,
"step": 1180
},
{
"epoch": 6.583333333333333,
"grad_norm": 0.21337293088436127,
"learning_rate": 4.2436077621686786e-05,
"loss": 0.236,
"num_input_tokens_seen": 180336,
"step": 1185
},
{
"epoch": 6.611111111111111,
"grad_norm": 0.2441207319498062,
"learning_rate": 4.234901381160469e-05,
"loss": 0.2335,
"num_input_tokens_seen": 181056,
"step": 1190
},
{
"epoch": 6.638888888888889,
"grad_norm": 0.16203121840953827,
"learning_rate": 4.226154222363124e-05,
"loss": 0.2302,
"num_input_tokens_seen": 181840,
"step": 1195
},
{
"epoch": 6.666666666666667,
"grad_norm": 0.11439399421215057,
"learning_rate": 4.21736649137323e-05,
"loss": 0.2272,
"num_input_tokens_seen": 182592,
"step": 1200
},
{
"epoch": 6.694444444444445,
"grad_norm": 0.30583301186561584,
"learning_rate": 4.208538394740993e-05,
"loss": 0.2556,
"num_input_tokens_seen": 183344,
"step": 1205
},
{
"epoch": 6.722222222222222,
"grad_norm": 0.18746714293956757,
"learning_rate": 4.199670139965393e-05,
"loss": 0.2201,
"num_input_tokens_seen": 184112,
"step": 1210
},
{
"epoch": 6.75,
"grad_norm": 0.07710317522287369,
"learning_rate": 4.1907619354892965e-05,
"loss": 0.2304,
"num_input_tokens_seen": 184864,
"step": 1215
},
{
"epoch": 6.777777777777778,
"grad_norm": 0.21295295655727386,
"learning_rate": 4.1818139906945694e-05,
"loss": 0.2436,
"num_input_tokens_seen": 185632,
"step": 1220
},
{
"epoch": 6.805555555555555,
"grad_norm": 0.1958753913640976,
"learning_rate": 4.172826515897146e-05,
"loss": 0.2305,
"num_input_tokens_seen": 186368,
"step": 1225
},
{
"epoch": 6.833333333333333,
"grad_norm": 0.21225431561470032,
"learning_rate": 4.163799722342089e-05,
"loss": 0.2342,
"num_input_tokens_seen": 187120,
"step": 1230
},
{
"epoch": 6.861111111111111,
"grad_norm": 0.2011038064956665,
"learning_rate": 4.1547338221986266e-05,
"loss": 0.2402,
"num_input_tokens_seen": 187888,
"step": 1235
},
{
"epoch": 6.888888888888889,
"grad_norm": 0.21114513278007507,
"learning_rate": 4.1456290285551596e-05,
"loss": 0.2297,
"num_input_tokens_seen": 188672,
"step": 1240
},
{
"epoch": 6.916666666666667,
"grad_norm": 0.1804109662771225,
"learning_rate": 4.13648555541426e-05,
"loss": 0.2256,
"num_input_tokens_seen": 189424,
"step": 1245
},
{
"epoch": 6.944444444444445,
"grad_norm": 0.08288107067346573,
"learning_rate": 4.127303617687636e-05,
"loss": 0.232,
"num_input_tokens_seen": 190176,
"step": 1250
},
{
"epoch": 6.972222222222222,
"grad_norm": 0.13274434208869934,
"learning_rate": 4.118083431191081e-05,
"loss": 0.2197,
"num_input_tokens_seen": 190928,
"step": 1255
},
{
"epoch": 7.0,
"grad_norm": 0.10268542170524597,
"learning_rate": 4.108825212639405e-05,
"loss": 0.2103,
"num_input_tokens_seen": 191712,
"step": 1260
},
{
"epoch": 7.0,
"eval_loss": 0.23792143166065216,
"eval_runtime": 0.8418,
"eval_samples_per_second": 47.518,
"eval_steps_per_second": 23.759,
"num_input_tokens_seen": 191712,
"step": 1260
},
{
"epoch": 7.027777777777778,
"grad_norm": 0.3135641813278198,
"learning_rate": 4.099529179641337e-05,
"loss": 0.2294,
"num_input_tokens_seen": 192480,
"step": 1265
},
{
"epoch": 7.055555555555555,
"grad_norm": 0.36921995878219604,
"learning_rate": 4.09019555069441e-05,
"loss": 0.2162,
"num_input_tokens_seen": 193248,
"step": 1270
},
{
"epoch": 7.083333333333333,
"grad_norm": 0.17098984122276306,
"learning_rate": 4.080824545179828e-05,
"loss": 0.2273,
"num_input_tokens_seen": 194000,
"step": 1275
},
{
"epoch": 7.111111111111111,
"grad_norm": 0.15590733289718628,
"learning_rate": 4.071416383357307e-05,
"loss": 0.2495,
"num_input_tokens_seen": 194752,
"step": 1280
},
{
"epoch": 7.138888888888889,
"grad_norm": 0.23567894101142883,
"learning_rate": 4.0619712863599e-05,
"loss": 0.2551,
"num_input_tokens_seen": 195504,
"step": 1285
},
{
"epoch": 7.166666666666667,
"grad_norm": 0.1218881830573082,
"learning_rate": 4.0524894761888e-05,
"loss": 0.2465,
"num_input_tokens_seen": 196224,
"step": 1290
},
{
"epoch": 7.194444444444445,
"grad_norm": 0.07201294600963593,
"learning_rate": 4.042971175708118e-05,
"loss": 0.2236,
"num_input_tokens_seen": 196976,
"step": 1295
},
{
"epoch": 7.222222222222222,
"grad_norm": 0.26333221793174744,
"learning_rate": 4.0334166086396484e-05,
"loss": 0.234,
"num_input_tokens_seen": 197776,
"step": 1300
},
{
"epoch": 7.25,
"grad_norm": 0.2225353866815567,
"learning_rate": 4.0238259995576084e-05,
"loss": 0.234,
"num_input_tokens_seen": 198512,
"step": 1305
},
{
"epoch": 7.277777777777778,
"grad_norm": 0.23296818137168884,
"learning_rate": 4.0141995738833625e-05,
"loss": 0.2295,
"num_input_tokens_seen": 199280,
"step": 1310
},
{
"epoch": 7.305555555555555,
"grad_norm": 0.10952712595462799,
"learning_rate": 4.0045375578801214e-05,
"loss": 0.238,
"num_input_tokens_seen": 200032,
"step": 1315
},
{
"epoch": 7.333333333333333,
"grad_norm": 0.21747390925884247,
"learning_rate": 3.994840178647623e-05,
"loss": 0.2259,
"num_input_tokens_seen": 200768,
"step": 1320
},
{
"epoch": 7.361111111111111,
"grad_norm": 0.10875441133975983,
"learning_rate": 3.985107664116798e-05,
"loss": 0.2207,
"num_input_tokens_seen": 201552,
"step": 1325
},
{
"epoch": 7.388888888888889,
"grad_norm": 0.1415949910879135,
"learning_rate": 3.9753402430444116e-05,
"loss": 0.224,
"num_input_tokens_seen": 202320,
"step": 1330
},
{
"epoch": 7.416666666666667,
"grad_norm": 0.36580806970596313,
"learning_rate": 3.9655381450076826e-05,
"loss": 0.2026,
"num_input_tokens_seen": 203088,
"step": 1335
},
{
"epoch": 7.444444444444445,
"grad_norm": 0.5669692754745483,
"learning_rate": 3.955701600398892e-05,
"loss": 0.2945,
"num_input_tokens_seen": 203824,
"step": 1340
},
{
"epoch": 7.472222222222222,
"grad_norm": 0.2120898813009262,
"learning_rate": 3.945830840419966e-05,
"loss": 0.2271,
"num_input_tokens_seen": 204576,
"step": 1345
},
{
"epoch": 7.5,
"grad_norm": 0.19381973147392273,
"learning_rate": 3.935926097077045e-05,
"loss": 0.2358,
"num_input_tokens_seen": 205312,
"step": 1350
},
{
"epoch": 7.527777777777778,
"grad_norm": 0.07539873570203781,
"learning_rate": 3.925987603175023e-05,
"loss": 0.2252,
"num_input_tokens_seen": 206080,
"step": 1355
},
{
"epoch": 7.555555555555555,
"grad_norm": 0.09191757440567017,
"learning_rate": 3.916015592312082e-05,
"loss": 0.2275,
"num_input_tokens_seen": 206848,
"step": 1360
},
{
"epoch": 7.583333333333333,
"grad_norm": 0.24951884150505066,
"learning_rate": 3.9060102988742e-05,
"loss": 0.2218,
"num_input_tokens_seen": 207616,
"step": 1365
},
{
"epoch": 7.611111111111111,
"grad_norm": 0.08662799745798111,
"learning_rate": 3.8959719580296415e-05,
"loss": 0.2411,
"num_input_tokens_seen": 208352,
"step": 1370
},
{
"epoch": 7.638888888888889,
"grad_norm": 0.0688060075044632,
"learning_rate": 3.885900805723429e-05,
"loss": 0.2321,
"num_input_tokens_seen": 209104,
"step": 1375
},
{
"epoch": 7.666666666666667,
"grad_norm": 0.0747101828455925,
"learning_rate": 3.875797078671798e-05,
"loss": 0.2197,
"num_input_tokens_seen": 209824,
"step": 1380
},
{
"epoch": 7.694444444444445,
"grad_norm": 0.284817636013031,
"learning_rate": 3.865661014356635e-05,
"loss": 0.2272,
"num_input_tokens_seen": 210576,
"step": 1385
},
{
"epoch": 7.722222222222222,
"grad_norm": 0.09095483273267746,
"learning_rate": 3.855492851019893e-05,
"loss": 0.2434,
"num_input_tokens_seen": 211376,
"step": 1390
},
{
"epoch": 7.75,
"grad_norm": 0.08101051300764084,
"learning_rate": 3.8452928276579916e-05,
"loss": 0.2328,
"num_input_tokens_seen": 212128,
"step": 1395
},
{
"epoch": 7.777777777777778,
"grad_norm": 0.2549460828304291,
"learning_rate": 3.835061184016203e-05,
"loss": 0.236,
"num_input_tokens_seen": 212928,
"step": 1400
},
{
"epoch": 7.805555555555555,
"grad_norm": 0.20550718903541565,
"learning_rate": 3.824798160583012e-05,
"loss": 0.2321,
"num_input_tokens_seen": 213696,
"step": 1405
},
{
"epoch": 7.833333333333333,
"grad_norm": 0.11653517186641693,
"learning_rate": 3.814503998584471e-05,
"loss": 0.2256,
"num_input_tokens_seen": 214464,
"step": 1410
},
{
"epoch": 7.861111111111111,
"grad_norm": 0.19850236177444458,
"learning_rate": 3.804178939978517e-05,
"loss": 0.2198,
"num_input_tokens_seen": 215248,
"step": 1415
},
{
"epoch": 7.888888888888889,
"grad_norm": 0.11180128902196884,
"learning_rate": 3.7938232274493e-05,
"loss": 0.2279,
"num_input_tokens_seen": 216016,
"step": 1420
},
{
"epoch": 7.916666666666667,
"grad_norm": 0.35185161232948303,
"learning_rate": 3.783437104401469e-05,
"loss": 0.2001,
"num_input_tokens_seen": 216784,
"step": 1425
},
{
"epoch": 7.944444444444445,
"grad_norm": 0.12019939720630646,
"learning_rate": 3.773020814954453e-05,
"loss": 0.2461,
"num_input_tokens_seen": 217552,
"step": 1430
},
{
"epoch": 7.972222222222222,
"grad_norm": 0.23168745636940002,
"learning_rate": 3.762574603936725e-05,
"loss": 0.2246,
"num_input_tokens_seen": 218320,
"step": 1435
},
{
"epoch": 8.0,
"grad_norm": 0.20853398740291595,
"learning_rate": 3.752098716880045e-05,
"loss": 0.24,
"num_input_tokens_seen": 219072,
"step": 1440
},
{
"epoch": 8.0,
"eval_loss": 0.24177499115467072,
"eval_runtime": 0.8427,
"eval_samples_per_second": 47.465,
"eval_steps_per_second": 23.732,
"num_input_tokens_seen": 219072,
"step": 1440
},
{
"epoch": 8.027777777777779,
"grad_norm": 0.15344388782978058,
"learning_rate": 3.74159340001369e-05,
"loss": 0.2449,
"num_input_tokens_seen": 219824,
"step": 1445
},
{
"epoch": 8.055555555555555,
"grad_norm": 0.14319944381713867,
"learning_rate": 3.731058900258668e-05,
"loss": 0.2346,
"num_input_tokens_seen": 220608,
"step": 1450
},
{
"epoch": 8.083333333333334,
"grad_norm": 0.24495133757591248,
"learning_rate": 3.7204954652219104e-05,
"loss": 0.2317,
"num_input_tokens_seen": 221344,
"step": 1455
},
{
"epoch": 8.11111111111111,
"grad_norm": 0.23096558451652527,
"learning_rate": 3.7099033431904575e-05,
"loss": 0.2173,
"num_input_tokens_seen": 222096,
"step": 1460
},
{
"epoch": 8.13888888888889,
"grad_norm": 0.15797853469848633,
"learning_rate": 3.699282783125616e-05,
"loss": 0.2158,
"num_input_tokens_seen": 222864,
"step": 1465
},
{
"epoch": 8.166666666666666,
"grad_norm": 0.5462031364440918,
"learning_rate": 3.688634034657115e-05,
"loss": 0.2098,
"num_input_tokens_seen": 223648,
"step": 1470
},
{
"epoch": 8.194444444444445,
"grad_norm": 0.5156317949295044,
"learning_rate": 3.6779573480772325e-05,
"loss": 0.2215,
"num_input_tokens_seen": 224448,
"step": 1475
},
{
"epoch": 8.222222222222221,
"grad_norm": 1.479132056236267,
"learning_rate": 3.6672529743349146e-05,
"loss": 0.255,
"num_input_tokens_seen": 225184,
"step": 1480
},
{
"epoch": 8.25,
"grad_norm": 1.7528244256973267,
"learning_rate": 3.656521165029879e-05,
"loss": 0.2673,
"num_input_tokens_seen": 225936,
"step": 1485
},
{
"epoch": 8.277777777777779,
"grad_norm": 0.23426023125648499,
"learning_rate": 3.6457621724066964e-05,
"loss": 0.2224,
"num_input_tokens_seen": 226672,
"step": 1490
},
{
"epoch": 8.305555555555555,
"grad_norm": 0.4958864152431488,
"learning_rate": 3.634976249348867e-05,
"loss": 0.2324,
"num_input_tokens_seen": 227424,
"step": 1495
},
{
"epoch": 8.333333333333334,
"grad_norm": 0.5171009302139282,
"learning_rate": 3.6241636493728736e-05,
"loss": 0.2379,
"num_input_tokens_seen": 228208,
"step": 1500
},
{
"epoch": 8.36111111111111,
"grad_norm": 0.10480030626058578,
"learning_rate": 3.613324626622224e-05,
"loss": 0.2496,
"num_input_tokens_seen": 228992,
"step": 1505
},
{
"epoch": 8.38888888888889,
"grad_norm": 0.29571622610092163,
"learning_rate": 3.602459435861475e-05,
"loss": 0.2361,
"num_input_tokens_seen": 229744,
"step": 1510
},
{
"epoch": 8.416666666666666,
"grad_norm": 0.1226486787199974,
"learning_rate": 3.591568332470249e-05,
"loss": 0.2283,
"num_input_tokens_seen": 230496,
"step": 1515
},
{
"epoch": 8.444444444444445,
"grad_norm": 0.48671409487724304,
"learning_rate": 3.5806515724372274e-05,
"loss": 0.2484,
"num_input_tokens_seen": 231248,
"step": 1520
},
{
"epoch": 8.472222222222221,
"grad_norm": 0.09926868230104446,
"learning_rate": 3.569709412354136e-05,
"loss": 0.2262,
"num_input_tokens_seen": 232000,
"step": 1525
},
{
"epoch": 8.5,
"grad_norm": 0.10085448622703552,
"learning_rate": 3.5587421094097115e-05,
"loss": 0.2362,
"num_input_tokens_seen": 232768,
"step": 1530
},
{
"epoch": 8.527777777777779,
"grad_norm": 0.24544471502304077,
"learning_rate": 3.5477499213836616e-05,
"loss": 0.2216,
"num_input_tokens_seen": 233568,
"step": 1535
},
{
"epoch": 8.555555555555555,
"grad_norm": 0.23776550590991974,
"learning_rate": 3.536733106640598e-05,
"loss": 0.2283,
"num_input_tokens_seen": 234320,
"step": 1540
},
{
"epoch": 8.583333333333334,
"grad_norm": 0.34067851305007935,
"learning_rate": 3.525691924123971e-05,
"loss": 0.2336,
"num_input_tokens_seen": 235040,
"step": 1545
},
{
"epoch": 8.61111111111111,
"grad_norm": 0.35513660311698914,
"learning_rate": 3.5146266333499795e-05,
"loss": 0.2289,
"num_input_tokens_seen": 235824,
"step": 1550
},
{
"epoch": 8.63888888888889,
"grad_norm": 0.16709186136722565,
"learning_rate": 3.503537494401473e-05,
"loss": 0.248,
"num_input_tokens_seen": 236592,
"step": 1555
},
{
"epoch": 8.666666666666666,
"grad_norm": 0.1513429880142212,
"learning_rate": 3.4924247679218375e-05,
"loss": 0.2297,
"num_input_tokens_seen": 237344,
"step": 1560
},
{
"epoch": 8.694444444444445,
"grad_norm": 0.13434197008609772,
"learning_rate": 3.481288715108868e-05,
"loss": 0.2316,
"num_input_tokens_seen": 238080,
"step": 1565
},
{
"epoch": 8.722222222222221,
"grad_norm": 0.15651777386665344,
"learning_rate": 3.4701295977086324e-05,
"loss": 0.2211,
"num_input_tokens_seen": 238816,
"step": 1570
},
{
"epoch": 8.75,
"grad_norm": 0.14915582537651062,
"learning_rate": 3.4589476780093166e-05,
"loss": 0.2264,
"num_input_tokens_seen": 239568,
"step": 1575
},
{
"epoch": 8.777777777777779,
"grad_norm": 0.16734644770622253,
"learning_rate": 3.44774321883506e-05,
"loss": 0.2365,
"num_input_tokens_seen": 240352,
"step": 1580
},
{
"epoch": 8.805555555555555,
"grad_norm": 0.1950577050447464,
"learning_rate": 3.436516483539781e-05,
"loss": 0.2298,
"num_input_tokens_seen": 241120,
"step": 1585
},
{
"epoch": 8.833333333333334,
"grad_norm": 0.23239564895629883,
"learning_rate": 3.42526773600098e-05,
"loss": 0.2276,
"num_input_tokens_seen": 241856,
"step": 1590
},
{
"epoch": 8.86111111111111,
"grad_norm": 0.23152494430541992,
"learning_rate": 3.4139972406135464e-05,
"loss": 0.2219,
"num_input_tokens_seen": 242608,
"step": 1595
},
{
"epoch": 8.88888888888889,
"grad_norm": 0.2428346872329712,
"learning_rate": 3.402705262283537e-05,
"loss": 0.2305,
"num_input_tokens_seen": 243360,
"step": 1600
},
{
"epoch": 8.916666666666666,
"grad_norm": 0.21203842759132385,
"learning_rate": 3.39139206642195e-05,
"loss": 0.2253,
"num_input_tokens_seen": 244128,
"step": 1605
},
{
"epoch": 8.944444444444445,
"grad_norm": 0.29292061924934387,
"learning_rate": 3.3800579189384944e-05,
"loss": 0.2334,
"num_input_tokens_seen": 244896,
"step": 1610
},
{
"epoch": 8.972222222222221,
"grad_norm": 0.14690163731575012,
"learning_rate": 3.3687030862353286e-05,
"loss": 0.232,
"num_input_tokens_seen": 245664,
"step": 1615
},
{
"epoch": 9.0,
"grad_norm": 0.25889158248901367,
"learning_rate": 3.357327835200807e-05,
"loss": 0.231,
"num_input_tokens_seen": 246416,
"step": 1620
},
{
"epoch": 9.0,
"eval_loss": 0.2354724407196045,
"eval_runtime": 0.8507,
"eval_samples_per_second": 47.02,
"eval_steps_per_second": 23.51,
"num_input_tokens_seen": 246416,
"step": 1620
},
{
"epoch": 9.027777777777779,
"grad_norm": 0.15429404377937317,
"learning_rate": 3.3459324332032035e-05,
"loss": 0.2321,
"num_input_tokens_seen": 247184,
"step": 1625
},
{
"epoch": 9.055555555555555,
"grad_norm": 0.21661336719989777,
"learning_rate": 3.3345171480844275e-05,
"loss": 0.212,
"num_input_tokens_seen": 247936,
"step": 1630
},
{
"epoch": 9.083333333333334,
"grad_norm": 0.36724531650543213,
"learning_rate": 3.32308224815373e-05,
"loss": 0.2297,
"num_input_tokens_seen": 248688,
"step": 1635
},
{
"epoch": 9.11111111111111,
"grad_norm": 0.2442227602005005,
"learning_rate": 3.311628002181398e-05,
"loss": 0.2371,
"num_input_tokens_seen": 249472,
"step": 1640
},
{
"epoch": 9.13888888888889,
"grad_norm": 0.3292847275733948,
"learning_rate": 3.3001546793924285e-05,
"loss": 0.2321,
"num_input_tokens_seen": 250224,
"step": 1645
},
{
"epoch": 9.166666666666666,
"grad_norm": 0.22451095283031464,
"learning_rate": 3.288662549460216e-05,
"loss": 0.2218,
"num_input_tokens_seen": 250960,
"step": 1650
},
{
"epoch": 9.194444444444445,
"grad_norm": 0.578140914440155,
"learning_rate": 3.277151882500199e-05,
"loss": 0.2258,
"num_input_tokens_seen": 251728,
"step": 1655
},
{
"epoch": 9.222222222222221,
"grad_norm": 0.15085914731025696,
"learning_rate": 3.26562294906352e-05,
"loss": 0.2476,
"num_input_tokens_seen": 252512,
"step": 1660
},
{
"epoch": 9.25,
"grad_norm": 0.3858492076396942,
"learning_rate": 3.254076020130664e-05,
"loss": 0.2142,
"num_input_tokens_seen": 253280,
"step": 1665
},
{
"epoch": 9.277777777777779,
"grad_norm": 0.41480836272239685,
"learning_rate": 3.242511367105087e-05,
"loss": 0.2091,
"num_input_tokens_seen": 254032,
"step": 1670
},
{
"epoch": 9.305555555555555,
"grad_norm": 0.6945786476135254,
"learning_rate": 3.230929261806842e-05,
"loss": 0.1549,
"num_input_tokens_seen": 254800,
"step": 1675
},
{
"epoch": 9.333333333333334,
"grad_norm": 3.4180619716644287,
"learning_rate": 3.2193299764661845e-05,
"loss": 0.2614,
"num_input_tokens_seen": 255584,
"step": 1680
},
{
"epoch": 9.36111111111111,
"grad_norm": 2.3858699798583984,
"learning_rate": 3.207713783717176e-05,
"loss": 0.3004,
"num_input_tokens_seen": 256368,
"step": 1685
},
{
"epoch": 9.38888888888889,
"grad_norm": 13.90650463104248,
"learning_rate": 3.1960809565912794e-05,
"loss": 0.2912,
"num_input_tokens_seen": 257104,
"step": 1690
},
{
"epoch": 9.416666666666666,
"grad_norm": 23.31499671936035,
"learning_rate": 3.1844317685109354e-05,
"loss": 0.4081,
"num_input_tokens_seen": 257856,
"step": 1695
},
{
"epoch": 9.444444444444445,
"grad_norm": 3.4871859550476074,
"learning_rate": 3.1727664932831394e-05,
"loss": 0.3059,
"num_input_tokens_seen": 258608,
"step": 1700
},
{
"epoch": 9.472222222222221,
"grad_norm": 5.127751350402832,
"learning_rate": 3.161085405093006e-05,
"loss": 0.2625,
"num_input_tokens_seen": 259344,
"step": 1705
},
{
"epoch": 9.5,
"grad_norm": 1.413158655166626,
"learning_rate": 3.149388778497323e-05,
"loss": 0.2651,
"num_input_tokens_seen": 260112,
"step": 1710
},
{
"epoch": 9.527777777777779,
"grad_norm": 0.344842791557312,
"learning_rate": 3.137676888418099e-05,
"loss": 0.2377,
"num_input_tokens_seen": 260864,
"step": 1715
},
{
"epoch": 9.555555555555555,
"grad_norm": 0.6449137330055237,
"learning_rate": 3.125950010136104e-05,
"loss": 0.2376,
"num_input_tokens_seen": 261632,
"step": 1720
},
{
"epoch": 9.583333333333334,
"grad_norm": 0.5378340482711792,
"learning_rate": 3.114208419284391e-05,
"loss": 0.2311,
"num_input_tokens_seen": 262368,
"step": 1725
},
{
"epoch": 9.61111111111111,
"grad_norm": 0.503693699836731,
"learning_rate": 3.102452391841828e-05,
"loss": 0.2083,
"num_input_tokens_seen": 263136,
"step": 1730
},
{
"epoch": 9.63888888888889,
"grad_norm": 0.7619789838790894,
"learning_rate": 3.090682204126604e-05,
"loss": 0.2502,
"num_input_tokens_seen": 263872,
"step": 1735
},
{
"epoch": 9.666666666666666,
"grad_norm": 0.6181730031967163,
"learning_rate": 3.078898132789735e-05,
"loss": 0.2408,
"num_input_tokens_seen": 264608,
"step": 1740
},
{
"epoch": 9.694444444444445,
"grad_norm": 0.521176278591156,
"learning_rate": 3.0671004548085675e-05,
"loss": 0.2263,
"num_input_tokens_seen": 265328,
"step": 1745
},
{
"epoch": 9.722222222222221,
"grad_norm": 0.3363834321498871,
"learning_rate": 3.0552894474802584e-05,
"loss": 0.2311,
"num_input_tokens_seen": 266112,
"step": 1750
},
{
"epoch": 9.75,
"grad_norm": 0.22221645712852478,
"learning_rate": 3.043465388415267e-05,
"loss": 0.2469,
"num_input_tokens_seen": 266864,
"step": 1755
},
{
"epoch": 9.777777777777779,
"grad_norm": 0.19677917659282684,
"learning_rate": 3.0316285555308233e-05,
"loss": 0.2179,
"num_input_tokens_seen": 267600,
"step": 1760
},
{
"epoch": 9.805555555555555,
"grad_norm": 0.3606593608856201,
"learning_rate": 3.0197792270443982e-05,
"loss": 0.2245,
"num_input_tokens_seen": 268384,
"step": 1765
},
{
"epoch": 9.833333333333334,
"grad_norm": 0.25018107891082764,
"learning_rate": 3.0079176814671656e-05,
"loss": 0.2253,
"num_input_tokens_seen": 269168,
"step": 1770
},
{
"epoch": 9.86111111111111,
"grad_norm": 0.2878740727901459,
"learning_rate": 2.9960441975974534e-05,
"loss": 0.2276,
"num_input_tokens_seen": 269904,
"step": 1775
},
{
"epoch": 9.88888888888889,
"grad_norm": 0.3176515996456146,
"learning_rate": 2.9841590545141906e-05,
"loss": 0.2348,
"num_input_tokens_seen": 270656,
"step": 1780
},
{
"epoch": 9.916666666666666,
"grad_norm": 0.26501137018203735,
"learning_rate": 2.9722625315703512e-05,
"loss": 0.2349,
"num_input_tokens_seen": 271408,
"step": 1785
},
{
"epoch": 9.944444444444445,
"grad_norm": 0.13339616358280182,
"learning_rate": 2.9603549083863847e-05,
"loss": 0.2326,
"num_input_tokens_seen": 272192,
"step": 1790
},
{
"epoch": 9.972222222222221,
"grad_norm": 0.09521733969449997,
"learning_rate": 2.9484364648436437e-05,
"loss": 0.2394,
"num_input_tokens_seen": 272960,
"step": 1795
},
{
"epoch": 10.0,
"grad_norm": 0.13899917900562286,
"learning_rate": 2.9365074810778094e-05,
"loss": 0.2325,
"num_input_tokens_seen": 273712,
"step": 1800
},
{
"epoch": 10.0,
"eval_loss": 0.23395749926567078,
"eval_runtime": 0.8436,
"eval_samples_per_second": 47.418,
"eval_steps_per_second": 23.709,
"num_input_tokens_seen": 273712,
"step": 1800
},
{
"epoch": 10.027777777777779,
"grad_norm": 0.37314531207084656,
"learning_rate": 2.9245682374723016e-05,
"loss": 0.2183,
"num_input_tokens_seen": 274480,
"step": 1805
},
{
"epoch": 10.055555555555555,
"grad_norm": 0.40587741136550903,
"learning_rate": 2.9126190146516942e-05,
"loss": 0.2281,
"num_input_tokens_seen": 275264,
"step": 1810
},
{
"epoch": 10.083333333333334,
"grad_norm": 0.2202170491218567,
"learning_rate": 2.9006600934751145e-05,
"loss": 0.2193,
"num_input_tokens_seen": 276016,
"step": 1815
},
{
"epoch": 10.11111111111111,
"grad_norm": 0.1918032318353653,
"learning_rate": 2.888691755029642e-05,
"loss": 0.2351,
"num_input_tokens_seen": 276752,
"step": 1820
},
{
"epoch": 10.13888888888889,
"grad_norm": 0.22308309376239777,
"learning_rate": 2.876714280623708e-05,
"loss": 0.2296,
"num_input_tokens_seen": 277520,
"step": 1825
},
{
"epoch": 10.166666666666666,
"grad_norm": 0.17808394134044647,
"learning_rate": 2.8647279517804754e-05,
"loss": 0.2205,
"num_input_tokens_seen": 278272,
"step": 1830
},
{
"epoch": 10.194444444444445,
"grad_norm": 0.287969172000885,
"learning_rate": 2.8527330502312248e-05,
"loss": 0.2259,
"num_input_tokens_seen": 279040,
"step": 1835
},
{
"epoch": 10.222222222222221,
"grad_norm": 0.1247008889913559,
"learning_rate": 2.8407298579087365e-05,
"loss": 0.2343,
"num_input_tokens_seen": 279792,
"step": 1840
},
{
"epoch": 10.25,
"grad_norm": 0.18362240493297577,
"learning_rate": 2.8287186569406566e-05,
"loss": 0.2281,
"num_input_tokens_seen": 280560,
"step": 1845
},
{
"epoch": 10.277777777777779,
"grad_norm": 0.3308933675289154,
"learning_rate": 2.816699729642871e-05,
"loss": 0.2175,
"num_input_tokens_seen": 281328,
"step": 1850
},
{
"epoch": 10.305555555555555,
"grad_norm": 0.19184178113937378,
"learning_rate": 2.8046733585128687e-05,
"loss": 0.2199,
"num_input_tokens_seen": 282112,
"step": 1855
},
{
"epoch": 10.333333333333334,
"grad_norm": 0.32785189151763916,
"learning_rate": 2.792639826223101e-05,
"loss": 0.223,
"num_input_tokens_seen": 282880,
"step": 1860
},
{
"epoch": 10.36111111111111,
"grad_norm": 0.304923415184021,
"learning_rate": 2.7805994156143376e-05,
"loss": 0.2083,
"num_input_tokens_seen": 283648,
"step": 1865
},
{
"epoch": 10.38888888888889,
"grad_norm": 0.4556541442871094,
"learning_rate": 2.7685524096890185e-05,
"loss": 0.2172,
"num_input_tokens_seen": 284464,
"step": 1870
},
{
"epoch": 10.416666666666666,
"grad_norm": 0.4782843291759491,
"learning_rate": 2.756499091604603e-05,
"loss": 0.2526,
"num_input_tokens_seen": 285232,
"step": 1875
},
{
"epoch": 10.444444444444445,
"grad_norm": 0.4656970798969269,
"learning_rate": 2.744439744666915e-05,
"loss": 0.2165,
"num_input_tokens_seen": 285984,
"step": 1880
},
{
"epoch": 10.472222222222221,
"grad_norm": 0.7466927766799927,
"learning_rate": 2.732374652323481e-05,
"loss": 0.2424,
"num_input_tokens_seen": 286752,
"step": 1885
},
{
"epoch": 10.5,
"grad_norm": 0.5435920357704163,
"learning_rate": 2.72030409815687e-05,
"loss": 0.2334,
"num_input_tokens_seen": 287520,
"step": 1890
},
{
"epoch": 10.527777777777779,
"grad_norm": 0.746811032295227,
"learning_rate": 2.7082283658780288e-05,
"loss": 0.2073,
"num_input_tokens_seen": 288240,
"step": 1895
},
{
"epoch": 10.555555555555555,
"grad_norm": 0.601383626461029,
"learning_rate": 2.6961477393196126e-05,
"loss": 0.2345,
"num_input_tokens_seen": 289008,
"step": 1900
},
{
"epoch": 10.583333333333334,
"grad_norm": 0.4319969415664673,
"learning_rate": 2.684062502429312e-05,
"loss": 0.2182,
"num_input_tokens_seen": 289776,
"step": 1905
},
{
"epoch": 10.61111111111111,
"grad_norm": 0.7417342662811279,
"learning_rate": 2.6719729392631826e-05,
"loss": 0.2547,
"num_input_tokens_seen": 290560,
"step": 1910
},
{
"epoch": 10.63888888888889,
"grad_norm": 0.4038422703742981,
"learning_rate": 2.659879333978964e-05,
"loss": 0.217,
"num_input_tokens_seen": 291296,
"step": 1915
},
{
"epoch": 10.666666666666666,
"grad_norm": 0.3393801152706146,
"learning_rate": 2.6477819708294064e-05,
"loss": 0.2522,
"num_input_tokens_seen": 292032,
"step": 1920
},
{
"epoch": 10.694444444444445,
"grad_norm": 0.3926829397678375,
"learning_rate": 2.635681134155585e-05,
"loss": 0.2324,
"num_input_tokens_seen": 292784,
"step": 1925
},
{
"epoch": 10.722222222222221,
"grad_norm": 0.1447238177061081,
"learning_rate": 2.623577108380215e-05,
"loss": 0.2305,
"num_input_tokens_seen": 293520,
"step": 1930
},
{
"epoch": 10.75,
"grad_norm": 0.3792881667613983,
"learning_rate": 2.6114701780009753e-05,
"loss": 0.2424,
"num_input_tokens_seen": 294272,
"step": 1935
},
{
"epoch": 10.777777777777779,
"grad_norm": 0.1470859944820404,
"learning_rate": 2.5993606275838117e-05,
"loss": 0.2427,
"num_input_tokens_seen": 295008,
"step": 1940
},
{
"epoch": 10.805555555555555,
"grad_norm": 0.290236234664917,
"learning_rate": 2.587248741756253e-05,
"loss": 0.2342,
"num_input_tokens_seen": 295776,
"step": 1945
},
{
"epoch": 10.833333333333334,
"grad_norm": 0.1666247695684433,
"learning_rate": 2.5751348052007206e-05,
"loss": 0.2315,
"num_input_tokens_seen": 296512,
"step": 1950
},
{
"epoch": 10.86111111111111,
"grad_norm": 0.26337382197380066,
"learning_rate": 2.5630191026478368e-05,
"loss": 0.2293,
"num_input_tokens_seen": 297248,
"step": 1955
},
{
"epoch": 10.88888888888889,
"grad_norm": 0.3205804228782654,
"learning_rate": 2.5509019188697343e-05,
"loss": 0.2214,
"num_input_tokens_seen": 298032,
"step": 1960
},
{
"epoch": 10.916666666666666,
"grad_norm": 0.3166457414627075,
"learning_rate": 2.5387835386733584e-05,
"loss": 0.2314,
"num_input_tokens_seen": 298800,
"step": 1965
},
{
"epoch": 10.944444444444445,
"grad_norm": 0.22426152229309082,
"learning_rate": 2.5266642468937766e-05,
"loss": 0.2428,
"num_input_tokens_seen": 299552,
"step": 1970
},
{
"epoch": 10.972222222222221,
"grad_norm": 0.20033201575279236,
"learning_rate": 2.5145443283874848e-05,
"loss": 0.2221,
"num_input_tokens_seen": 300320,
"step": 1975
},
{
"epoch": 11.0,
"grad_norm": 0.29905927181243896,
"learning_rate": 2.5024240680257055e-05,
"loss": 0.2153,
"num_input_tokens_seen": 301088,
"step": 1980
},
{
"epoch": 11.0,
"eval_loss": 0.23569095134735107,
"eval_runtime": 0.8449,
"eval_samples_per_second": 47.345,
"eval_steps_per_second": 23.673,
"num_input_tokens_seen": 301088,
"step": 1980
},
{
"epoch": 11.027777777777779,
"grad_norm": 0.23592987656593323,
"learning_rate": 2.4903037506876997e-05,
"loss": 0.2065,
"num_input_tokens_seen": 301856,
"step": 1985
},
{
"epoch": 11.055555555555555,
"grad_norm": 0.3055577576160431,
"learning_rate": 2.4781836612540657e-05,
"loss": 0.239,
"num_input_tokens_seen": 302592,
"step": 1990
},
{
"epoch": 11.083333333333334,
"grad_norm": 0.3388107120990753,
"learning_rate": 2.4660640846000453e-05,
"loss": 0.2296,
"num_input_tokens_seen": 303360,
"step": 1995
},
{
"epoch": 11.11111111111111,
"grad_norm": 0.3578348755836487,
"learning_rate": 2.4539453055888297e-05,
"loss": 0.2151,
"num_input_tokens_seen": 304096,
"step": 2000
},
{
"epoch": 11.13888888888889,
"grad_norm": 0.2506900131702423,
"learning_rate": 2.4418276090648596e-05,
"loss": 0.2218,
"num_input_tokens_seen": 304880,
"step": 2005
},
{
"epoch": 11.166666666666666,
"grad_norm": 0.4147559106349945,
"learning_rate": 2.4297112798471326e-05,
"loss": 0.2321,
"num_input_tokens_seen": 305664,
"step": 2010
},
{
"epoch": 11.194444444444445,
"grad_norm": 0.47687992453575134,
"learning_rate": 2.4175966027225107e-05,
"loss": 0.2202,
"num_input_tokens_seen": 306448,
"step": 2015
},
{
"epoch": 11.222222222222221,
"grad_norm": 0.36644211411476135,
"learning_rate": 2.405483862439023e-05,
"loss": 0.2241,
"num_input_tokens_seen": 307216,
"step": 2020
},
{
"epoch": 11.25,
"grad_norm": 0.45731303095817566,
"learning_rate": 2.3933733436991732e-05,
"loss": 0.2306,
"num_input_tokens_seen": 307968,
"step": 2025
},
{
"epoch": 11.277777777777779,
"grad_norm": 0.5980037450790405,
"learning_rate": 2.381265331153252e-05,
"loss": 0.2391,
"num_input_tokens_seen": 308720,
"step": 2030
},
{
"epoch": 11.305555555555555,
"grad_norm": 0.9406579732894897,
"learning_rate": 2.3691601093926404e-05,
"loss": 0.2059,
"num_input_tokens_seen": 309472,
"step": 2035
},
{
"epoch": 11.333333333333334,
"grad_norm": 0.5597310662269592,
"learning_rate": 2.3570579629431267e-05,
"loss": 0.2382,
"num_input_tokens_seen": 310224,
"step": 2040
},
{
"epoch": 11.36111111111111,
"grad_norm": 0.5758081674575806,
"learning_rate": 2.344959176258212e-05,
"loss": 0.222,
"num_input_tokens_seen": 311008,
"step": 2045
},
{
"epoch": 11.38888888888889,
"grad_norm": 0.6426092386245728,
"learning_rate": 2.3328640337124326e-05,
"loss": 0.211,
"num_input_tokens_seen": 311744,
"step": 2050
},
{
"epoch": 11.416666666666666,
"grad_norm": 0.5062636137008667,
"learning_rate": 2.3207728195946688e-05,
"loss": 0.2242,
"num_input_tokens_seen": 312512,
"step": 2055
},
{
"epoch": 11.444444444444445,
"grad_norm": 0.6146106123924255,
"learning_rate": 2.3086858181014653e-05,
"loss": 0.2537,
"num_input_tokens_seen": 313248,
"step": 2060
},
{
"epoch": 11.472222222222221,
"grad_norm": 0.6537105441093445,
"learning_rate": 2.2966033133303545e-05,
"loss": 0.2294,
"num_input_tokens_seen": 314032,
"step": 2065
},
{
"epoch": 11.5,
"grad_norm": 0.6333838105201721,
"learning_rate": 2.2845255892731733e-05,
"loss": 0.2345,
"num_input_tokens_seen": 314784,
"step": 2070
},
{
"epoch": 11.527777777777779,
"grad_norm": 0.5479741096496582,
"learning_rate": 2.2724529298093915e-05,
"loss": 0.2169,
"num_input_tokens_seen": 315520,
"step": 2075
},
{
"epoch": 11.555555555555555,
"grad_norm": 0.49262702465057373,
"learning_rate": 2.26038561869944e-05,
"loss": 0.2138,
"num_input_tokens_seen": 316288,
"step": 2080
},
{
"epoch": 11.583333333333334,
"grad_norm": 0.4937076270580292,
"learning_rate": 2.248323939578039e-05,
"loss": 0.2234,
"num_input_tokens_seen": 317040,
"step": 2085
},
{
"epoch": 11.61111111111111,
"grad_norm": 0.5979186296463013,
"learning_rate": 2.2362681759475307e-05,
"loss": 0.2273,
"num_input_tokens_seen": 317776,
"step": 2090
},
{
"epoch": 11.63888888888889,
"grad_norm": 0.47289130091667175,
"learning_rate": 2.2242186111712208e-05,
"loss": 0.2244,
"num_input_tokens_seen": 318560,
"step": 2095
},
{
"epoch": 11.666666666666666,
"grad_norm": 0.5286301970481873,
"learning_rate": 2.212175528466712e-05,
"loss": 0.2486,
"num_input_tokens_seen": 319296,
"step": 2100
},
{
"epoch": 11.694444444444445,
"grad_norm": 0.6446691751480103,
"learning_rate": 2.2001392108992504e-05,
"loss": 0.1969,
"num_input_tokens_seen": 320064,
"step": 2105
},
{
"epoch": 11.722222222222221,
"grad_norm": 0.9104120135307312,
"learning_rate": 2.1881099413750733e-05,
"loss": 0.2327,
"num_input_tokens_seen": 320800,
"step": 2110
},
{
"epoch": 11.75,
"grad_norm": 1.2983386516571045,
"learning_rate": 2.1760880026347562e-05,
"loss": 0.2108,
"num_input_tokens_seen": 321536,
"step": 2115
},
{
"epoch": 11.777777777777779,
"grad_norm": 0.8108537793159485,
"learning_rate": 2.16407367724657e-05,
"loss": 0.2259,
"num_input_tokens_seen": 322320,
"step": 2120
},
{
"epoch": 11.805555555555555,
"grad_norm": 0.9631186723709106,
"learning_rate": 2.1520672475998373e-05,
"loss": 0.2064,
"num_input_tokens_seen": 323056,
"step": 2125
},
{
"epoch": 11.833333333333334,
"grad_norm": 0.7661670446395874,
"learning_rate": 2.140068995898297e-05,
"loss": 0.2108,
"num_input_tokens_seen": 323824,
"step": 2130
},
{
"epoch": 11.86111111111111,
"grad_norm": 0.737259566783905,
"learning_rate": 2.1280792041534714e-05,
"loss": 0.2186,
"num_input_tokens_seen": 324624,
"step": 2135
},
{
"epoch": 11.88888888888889,
"grad_norm": 1.0927965641021729,
"learning_rate": 2.116098154178035e-05,
"loss": 0.2016,
"num_input_tokens_seen": 325392,
"step": 2140
},
{
"epoch": 11.916666666666666,
"grad_norm": 1.9153894186019897,
"learning_rate": 2.1041261275791933e-05,
"loss": 0.248,
"num_input_tokens_seen": 326144,
"step": 2145
},
{
"epoch": 11.944444444444445,
"grad_norm": 2.076587438583374,
"learning_rate": 2.092163405752063e-05,
"loss": 0.201,
"num_input_tokens_seen": 326880,
"step": 2150
},
{
"epoch": 11.972222222222221,
"grad_norm": 1.3827784061431885,
"learning_rate": 2.0802102698730574e-05,
"loss": 0.2192,
"num_input_tokens_seen": 327648,
"step": 2155
},
{
"epoch": 12.0,
"grad_norm": 6.211891174316406,
"learning_rate": 2.0682670008932785e-05,
"loss": 0.3359,
"num_input_tokens_seen": 328384,
"step": 2160
},
{
"epoch": 12.0,
"eval_loss": 0.3373684883117676,
"eval_runtime": 0.8514,
"eval_samples_per_second": 46.983,
"eval_steps_per_second": 23.491,
"num_input_tokens_seen": 328384,
"step": 2160
},
{
"epoch": 12.027777777777779,
"grad_norm": 2.2284414768218994,
"learning_rate": 2.0563338795319123e-05,
"loss": 0.2025,
"num_input_tokens_seen": 329136,
"step": 2165
},
{
"epoch": 12.055555555555555,
"grad_norm": 1.8171899318695068,
"learning_rate": 2.0444111862696314e-05,
"loss": 0.2236,
"num_input_tokens_seen": 329904,
"step": 2170
},
{
"epoch": 12.083333333333334,
"grad_norm": 4.0419111251831055,
"learning_rate": 2.032499201342003e-05,
"loss": 0.222,
"num_input_tokens_seen": 330640,
"step": 2175
},
{
"epoch": 12.11111111111111,
"grad_norm": 1.5563691854476929,
"learning_rate": 2.020598204732901e-05,
"loss": 0.2336,
"num_input_tokens_seen": 331360,
"step": 2180
},
{
"epoch": 12.13888888888889,
"grad_norm": 1.042913556098938,
"learning_rate": 2.0087084761679245e-05,
"loss": 0.2341,
"num_input_tokens_seen": 332112,
"step": 2185
},
{
"epoch": 12.166666666666666,
"grad_norm": 1.2564104795455933,
"learning_rate": 1.996830295107827e-05,
"loss": 0.2747,
"num_input_tokens_seen": 332864,
"step": 2190
},
{
"epoch": 12.194444444444445,
"grad_norm": 0.921697735786438,
"learning_rate": 1.9849639407419423e-05,
"loss": 0.2155,
"num_input_tokens_seen": 333648,
"step": 2195
},
{
"epoch": 12.222222222222221,
"grad_norm": 1.2880674600601196,
"learning_rate": 1.973109691981627e-05,
"loss": 0.2432,
"num_input_tokens_seen": 334416,
"step": 2200
},
{
"epoch": 12.25,
"grad_norm": 1.4810035228729248,
"learning_rate": 1.9612678274537005e-05,
"loss": 0.2355,
"num_input_tokens_seen": 335184,
"step": 2205
},
{
"epoch": 12.277777777777779,
"grad_norm": 1.5859031677246094,
"learning_rate": 1.9494386254939e-05,
"loss": 0.197,
"num_input_tokens_seen": 335968,
"step": 2210
},
{
"epoch": 12.305555555555555,
"grad_norm": 0.9962775707244873,
"learning_rate": 1.937622364140338e-05,
"loss": 0.2029,
"num_input_tokens_seen": 336736,
"step": 2215
},
{
"epoch": 12.333333333333334,
"grad_norm": 0.7494038939476013,
"learning_rate": 1.925819321126964e-05,
"loss": 0.2067,
"num_input_tokens_seen": 337488,
"step": 2220
},
{
"epoch": 12.36111111111111,
"grad_norm": 0.8293570876121521,
"learning_rate": 1.9140297738770385e-05,
"loss": 0.2677,
"num_input_tokens_seen": 338240,
"step": 2225
},
{
"epoch": 12.38888888888889,
"grad_norm": 1.0475269556045532,
"learning_rate": 1.9022539994966147e-05,
"loss": 0.2074,
"num_input_tokens_seen": 338976,
"step": 2230
},
{
"epoch": 12.416666666666666,
"grad_norm": 0.9049837589263916,
"learning_rate": 1.8904922747680204e-05,
"loss": 0.2153,
"num_input_tokens_seen": 339760,
"step": 2235
},
{
"epoch": 12.444444444444445,
"grad_norm": 0.732437252998352,
"learning_rate": 1.8787448761433556e-05,
"loss": 0.2247,
"num_input_tokens_seen": 340528,
"step": 2240
},
{
"epoch": 12.472222222222221,
"grad_norm": 1.3903310298919678,
"learning_rate": 1.8670120797379958e-05,
"loss": 0.2214,
"num_input_tokens_seen": 341232,
"step": 2245
},
{
"epoch": 12.5,
"grad_norm": 0.7470361590385437,
"learning_rate": 1.8552941613240983e-05,
"loss": 0.2235,
"num_input_tokens_seen": 342000,
"step": 2250
},
{
"epoch": 12.527777777777779,
"grad_norm": 1.040972352027893,
"learning_rate": 1.8435913963241226e-05,
"loss": 0.197,
"num_input_tokens_seen": 342768,
"step": 2255
},
{
"epoch": 12.555555555555555,
"grad_norm": 1.4287666082382202,
"learning_rate": 1.831904059804358e-05,
"loss": 0.2268,
"num_input_tokens_seen": 343568,
"step": 2260
},
{
"epoch": 12.583333333333334,
"grad_norm": 0.8281350135803223,
"learning_rate": 1.8202324264684544e-05,
"loss": 0.2185,
"num_input_tokens_seen": 344304,
"step": 2265
},
{
"epoch": 12.61111111111111,
"grad_norm": 1.3658063411712646,
"learning_rate": 1.8085767706509712e-05,
"loss": 0.1767,
"num_input_tokens_seen": 345088,
"step": 2270
},
{
"epoch": 12.63888888888889,
"grad_norm": 0.9685779809951782,
"learning_rate": 1.7969373663109234e-05,
"loss": 0.2127,
"num_input_tokens_seen": 345856,
"step": 2275
},
{
"epoch": 12.666666666666666,
"grad_norm": 1.1758708953857422,
"learning_rate": 1.7853144870253458e-05,
"loss": 0.2382,
"num_input_tokens_seen": 346608,
"step": 2280
},
{
"epoch": 12.694444444444445,
"grad_norm": 0.8447101712226868,
"learning_rate": 1.7737084059828637e-05,
"loss": 0.194,
"num_input_tokens_seen": 347376,
"step": 2285
},
{
"epoch": 12.722222222222221,
"grad_norm": 2.4415032863616943,
"learning_rate": 1.7621193959772657e-05,
"loss": 0.1908,
"num_input_tokens_seen": 348144,
"step": 2290
},
{
"epoch": 12.75,
"grad_norm": 1.2303696870803833,
"learning_rate": 1.750547729401101e-05,
"loss": 0.1831,
"num_input_tokens_seen": 348912,
"step": 2295
},
{
"epoch": 12.777777777777779,
"grad_norm": 1.4564770460128784,
"learning_rate": 1.7389936782392695e-05,
"loss": 0.2239,
"num_input_tokens_seen": 349664,
"step": 2300
},
{
"epoch": 12.805555555555555,
"grad_norm": 0.8448835611343384,
"learning_rate": 1.7274575140626318e-05,
"loss": 0.1875,
"num_input_tokens_seen": 350432,
"step": 2305
},
{
"epoch": 12.833333333333334,
"grad_norm": 9.424522399902344,
"learning_rate": 1.7159395080216273e-05,
"loss": 0.344,
"num_input_tokens_seen": 351200,
"step": 2310
},
{
"epoch": 12.86111111111111,
"grad_norm": 1.3844144344329834,
"learning_rate": 1.7044399308398983e-05,
"loss": 0.3025,
"num_input_tokens_seen": 351936,
"step": 2315
},
{
"epoch": 12.88888888888889,
"grad_norm": 1.3061507940292358,
"learning_rate": 1.692959052807928e-05,
"loss": 0.1906,
"num_input_tokens_seen": 352720,
"step": 2320
},
{
"epoch": 12.916666666666666,
"grad_norm": 1.519792079925537,
"learning_rate": 1.681497143776689e-05,
"loss": 0.2825,
"num_input_tokens_seen": 353488,
"step": 2325
},
{
"epoch": 12.944444444444445,
"grad_norm": 1.082457423210144,
"learning_rate": 1.670054473151298e-05,
"loss": 0.1878,
"num_input_tokens_seen": 354256,
"step": 2330
},
{
"epoch": 12.972222222222221,
"grad_norm": 1.1723442077636719,
"learning_rate": 1.658631309884684e-05,
"loss": 0.2078,
"num_input_tokens_seen": 355008,
"step": 2335
},
{
"epoch": 13.0,
"grad_norm": 1.186935544013977,
"learning_rate": 1.6472279224712702e-05,
"loss": 0.2397,
"num_input_tokens_seen": 355760,
"step": 2340
},
{
"epoch": 13.0,
"eval_loss": 0.24227285385131836,
"eval_runtime": 0.8489,
"eval_samples_per_second": 47.122,
"eval_steps_per_second": 23.561,
"num_input_tokens_seen": 355760,
"step": 2340
},
{
"epoch": 13.027777777777779,
"grad_norm": 2.641793966293335,
"learning_rate": 1.6358445789406584e-05,
"loss": 0.2267,
"num_input_tokens_seen": 356528,
"step": 2345
},
{
"epoch": 13.055555555555555,
"grad_norm": 3.405515193939209,
"learning_rate": 1.6244815468513315e-05,
"loss": 0.1854,
"num_input_tokens_seen": 357296,
"step": 2350
},
{
"epoch": 13.083333333333334,
"grad_norm": 3.2545127868652344,
"learning_rate": 1.6131390932843648e-05,
"loss": 0.182,
"num_input_tokens_seen": 358048,
"step": 2355
},
{
"epoch": 13.11111111111111,
"grad_norm": 1.9090595245361328,
"learning_rate": 1.6018174848371494e-05,
"loss": 0.2446,
"num_input_tokens_seen": 358816,
"step": 2360
},
{
"epoch": 13.13888888888889,
"grad_norm": 2.3876793384552,
"learning_rate": 1.5905169876171223e-05,
"loss": 0.1739,
"num_input_tokens_seen": 359568,
"step": 2365
},
{
"epoch": 13.166666666666666,
"grad_norm": 9.157187461853027,
"learning_rate": 1.579237867235514e-05,
"loss": 0.2171,
"num_input_tokens_seen": 360336,
"step": 2370
},
{
"epoch": 13.194444444444445,
"grad_norm": 2.800305128097534,
"learning_rate": 1.567980388801109e-05,
"loss": 0.2339,
"num_input_tokens_seen": 361056,
"step": 2375
},
{
"epoch": 13.222222222222221,
"grad_norm": 2.117903470993042,
"learning_rate": 1.556744816914008e-05,
"loss": 0.2172,
"num_input_tokens_seen": 361792,
"step": 2380
},
{
"epoch": 13.25,
"grad_norm": 2.7015206813812256,
"learning_rate": 1.5455314156594124e-05,
"loss": 0.1971,
"num_input_tokens_seen": 362576,
"step": 2385
},
{
"epoch": 13.277777777777779,
"grad_norm": 1.168046474456787,
"learning_rate": 1.534340448601418e-05,
"loss": 0.2087,
"num_input_tokens_seen": 363344,
"step": 2390
},
{
"epoch": 13.305555555555555,
"grad_norm": 1.9067094326019287,
"learning_rate": 1.523172178776816e-05,
"loss": 0.2058,
"num_input_tokens_seen": 364096,
"step": 2395
},
{
"epoch": 13.333333333333334,
"grad_norm": 1.733651041984558,
"learning_rate": 1.512026868688915e-05,
"loss": 0.1853,
"num_input_tokens_seen": 364848,
"step": 2400
},
{
"epoch": 13.36111111111111,
"grad_norm": 2.2120025157928467,
"learning_rate": 1.5009047803013699e-05,
"loss": 0.2178,
"num_input_tokens_seen": 365568,
"step": 2405
},
{
"epoch": 13.38888888888889,
"grad_norm": 2.4871954917907715,
"learning_rate": 1.4898061750320212e-05,
"loss": 0.1482,
"num_input_tokens_seen": 366368,
"step": 2410
},
{
"epoch": 13.416666666666666,
"grad_norm": 2.5109379291534424,
"learning_rate": 1.4787313137467546e-05,
"loss": 0.1657,
"num_input_tokens_seen": 367168,
"step": 2415
},
{
"epoch": 13.444444444444445,
"grad_norm": 3.289309501647949,
"learning_rate": 1.4676804567533687e-05,
"loss": 0.2238,
"num_input_tokens_seen": 367904,
"step": 2420
},
{
"epoch": 13.472222222222221,
"grad_norm": 13.250516891479492,
"learning_rate": 1.4566538637954554e-05,
"loss": 0.1961,
"num_input_tokens_seen": 368672,
"step": 2425
},
{
"epoch": 13.5,
"grad_norm": 4.981893539428711,
"learning_rate": 1.4456517940462949e-05,
"loss": 0.2555,
"num_input_tokens_seen": 369424,
"step": 2430
},
{
"epoch": 13.527777777777779,
"grad_norm": 3.6648523807525635,
"learning_rate": 1.4346745061027644e-05,
"loss": 0.1898,
"num_input_tokens_seen": 370192,
"step": 2435
},
{
"epoch": 13.555555555555555,
"grad_norm": 4.466832160949707,
"learning_rate": 1.4237222579792618e-05,
"loss": 0.2684,
"num_input_tokens_seen": 370928,
"step": 2440
},
{
"epoch": 13.583333333333334,
"grad_norm": 0.43382784724235535,
"learning_rate": 1.4127953071016383e-05,
"loss": 0.1869,
"num_input_tokens_seen": 371664,
"step": 2445
},
{
"epoch": 13.61111111111111,
"grad_norm": 7.605555534362793,
"learning_rate": 1.4018939103011472e-05,
"loss": 0.2325,
"num_input_tokens_seen": 372448,
"step": 2450
},
{
"epoch": 13.63888888888889,
"grad_norm": 4.352413177490234,
"learning_rate": 1.3910183238084112e-05,
"loss": 0.2535,
"num_input_tokens_seen": 373232,
"step": 2455
},
{
"epoch": 13.666666666666666,
"grad_norm": 5.196812629699707,
"learning_rate": 1.3801688032473958e-05,
"loss": 0.3524,
"num_input_tokens_seen": 374000,
"step": 2460
},
{
"epoch": 13.694444444444445,
"grad_norm": 6.573197841644287,
"learning_rate": 1.369345603629406e-05,
"loss": 0.3079,
"num_input_tokens_seen": 374768,
"step": 2465
},
{
"epoch": 13.722222222222221,
"grad_norm": 2.1059021949768066,
"learning_rate": 1.3585489793470862e-05,
"loss": 0.1759,
"num_input_tokens_seen": 375552,
"step": 2470
},
{
"epoch": 13.75,
"grad_norm": 1.5465530157089233,
"learning_rate": 1.3477791841684451e-05,
"loss": 0.1818,
"num_input_tokens_seen": 376320,
"step": 2475
},
{
"epoch": 13.777777777777779,
"grad_norm": 2.8227381706237793,
"learning_rate": 1.337036471230889e-05,
"loss": 0.2174,
"num_input_tokens_seen": 377104,
"step": 2480
},
{
"epoch": 13.805555555555555,
"grad_norm": 2.1553452014923096,
"learning_rate": 1.3263210930352737e-05,
"loss": 0.1612,
"num_input_tokens_seen": 377872,
"step": 2485
},
{
"epoch": 13.833333333333334,
"grad_norm": 1.7421936988830566,
"learning_rate": 1.3156333014399674e-05,
"loss": 0.1359,
"num_input_tokens_seen": 378656,
"step": 2490
},
{
"epoch": 13.86111111111111,
"grad_norm": 1.0619189739227295,
"learning_rate": 1.3049733476549352e-05,
"loss": 0.1586,
"num_input_tokens_seen": 379408,
"step": 2495
},
{
"epoch": 13.88888888888889,
"grad_norm": 2.5276153087615967,
"learning_rate": 1.2943414822358285e-05,
"loss": 0.14,
"num_input_tokens_seen": 380144,
"step": 2500
},
{
"epoch": 13.916666666666666,
"grad_norm": 2.2656829357147217,
"learning_rate": 1.2837379550781003e-05,
"loss": 0.1432,
"num_input_tokens_seen": 380880,
"step": 2505
},
{
"epoch": 13.944444444444445,
"grad_norm": 3.669142723083496,
"learning_rate": 1.2731630154111296e-05,
"loss": 0.2479,
"num_input_tokens_seen": 381632,
"step": 2510
},
{
"epoch": 13.972222222222221,
"grad_norm": 3.9385387897491455,
"learning_rate": 1.262616911792365e-05,
"loss": 0.1723,
"num_input_tokens_seen": 382368,
"step": 2515
},
{
"epoch": 14.0,
"grad_norm": 5.834008693695068,
"learning_rate": 1.2520998921014792e-05,
"loss": 0.1748,
"num_input_tokens_seen": 383088,
"step": 2520
},
{
"epoch": 14.0,
"eval_loss": 0.2950591444969177,
"eval_runtime": 0.8591,
"eval_samples_per_second": 46.563,
"eval_steps_per_second": 23.282,
"num_input_tokens_seen": 383088,
"step": 2520
},
{
"epoch": 14.027777777777779,
"grad_norm": 0.713314414024353,
"learning_rate": 1.2416122035345507e-05,
"loss": 0.1308,
"num_input_tokens_seen": 383840,
"step": 2525
},
{
"epoch": 14.055555555555555,
"grad_norm": 4.134765625,
"learning_rate": 1.2311540925982403e-05,
"loss": 0.251,
"num_input_tokens_seen": 384624,
"step": 2530
},
{
"epoch": 14.083333333333334,
"grad_norm": 4.927321910858154,
"learning_rate": 1.2207258051040099e-05,
"loss": 0.209,
"num_input_tokens_seen": 385392,
"step": 2535
},
{
"epoch": 14.11111111111111,
"grad_norm": 3.297750949859619,
"learning_rate": 1.2103275861623378e-05,
"loss": 0.1824,
"num_input_tokens_seen": 386176,
"step": 2540
},
{
"epoch": 14.13888888888889,
"grad_norm": 3.990372657775879,
"learning_rate": 1.1999596801769616e-05,
"loss": 0.1749,
"num_input_tokens_seen": 386944,
"step": 2545
},
{
"epoch": 14.166666666666666,
"grad_norm": 3.1698896884918213,
"learning_rate": 1.189622330839129e-05,
"loss": 0.1024,
"num_input_tokens_seen": 387696,
"step": 2550
},
{
"epoch": 14.194444444444445,
"grad_norm": 1.4032636880874634,
"learning_rate": 1.179315781121874e-05,
"loss": 0.1795,
"num_input_tokens_seen": 388464,
"step": 2555
},
{
"epoch": 14.222222222222221,
"grad_norm": 5.369940280914307,
"learning_rate": 1.1690402732743042e-05,
"loss": 0.151,
"num_input_tokens_seen": 389232,
"step": 2560
},
{
"epoch": 14.25,
"grad_norm": 4.481320381164551,
"learning_rate": 1.158796048815906e-05,
"loss": 0.2119,
"num_input_tokens_seen": 390000,
"step": 2565
},
{
"epoch": 14.277777777777779,
"grad_norm": 1.490330696105957,
"learning_rate": 1.1485833485308702e-05,
"loss": 0.1076,
"num_input_tokens_seen": 390768,
"step": 2570
},
{
"epoch": 14.305555555555555,
"grad_norm": 6.507543087005615,
"learning_rate": 1.1384024124624324e-05,
"loss": 0.14,
"num_input_tokens_seen": 391568,
"step": 2575
},
{
"epoch": 14.333333333333334,
"grad_norm": 6.145547866821289,
"learning_rate": 1.1282534799072272e-05,
"loss": 0.197,
"num_input_tokens_seen": 392320,
"step": 2580
},
{
"epoch": 14.36111111111111,
"grad_norm": 18.988567352294922,
"learning_rate": 1.1181367894096684e-05,
"loss": 0.3318,
"num_input_tokens_seen": 393136,
"step": 2585
},
{
"epoch": 14.38888888888889,
"grad_norm": 19.283180236816406,
"learning_rate": 1.1080525787563393e-05,
"loss": 0.2645,
"num_input_tokens_seen": 393936,
"step": 2590
},
{
"epoch": 14.416666666666666,
"grad_norm": 10.825935363769531,
"learning_rate": 1.0980010849704036e-05,
"loss": 0.1461,
"num_input_tokens_seen": 394688,
"step": 2595
},
{
"epoch": 14.444444444444445,
"grad_norm": 31.148212432861328,
"learning_rate": 1.0879825443060362e-05,
"loss": 0.3501,
"num_input_tokens_seen": 395456,
"step": 2600
},
{
"epoch": 14.472222222222221,
"grad_norm": 9.394278526306152,
"learning_rate": 1.0779971922428711e-05,
"loss": 0.155,
"num_input_tokens_seen": 396224,
"step": 2605
},
{
"epoch": 14.5,
"grad_norm": 14.434137344360352,
"learning_rate": 1.0680452634804603e-05,
"loss": 0.2158,
"num_input_tokens_seen": 396960,
"step": 2610
},
{
"epoch": 14.527777777777779,
"grad_norm": 2.4239039421081543,
"learning_rate": 1.0581269919327643e-05,
"loss": 0.0862,
"num_input_tokens_seen": 397712,
"step": 2615
},
{
"epoch": 14.555555555555555,
"grad_norm": 2.1457433700561523,
"learning_rate": 1.0482426107226507e-05,
"loss": 0.0899,
"num_input_tokens_seen": 398448,
"step": 2620
},
{
"epoch": 14.583333333333334,
"grad_norm": 0.8032980561256409,
"learning_rate": 1.0383923521764174e-05,
"loss": 0.1994,
"num_input_tokens_seen": 399200,
"step": 2625
},
{
"epoch": 14.61111111111111,
"grad_norm": 12.607194900512695,
"learning_rate": 1.0285764478183284e-05,
"loss": 0.2465,
"num_input_tokens_seen": 399952,
"step": 2630
},
{
"epoch": 14.63888888888889,
"grad_norm": 12.285235404968262,
"learning_rate": 1.0187951283651736e-05,
"loss": 0.1971,
"num_input_tokens_seen": 400688,
"step": 2635
},
{
"epoch": 14.666666666666666,
"grad_norm": 9.90485668182373,
"learning_rate": 1.0090486237208463e-05,
"loss": 0.2033,
"num_input_tokens_seen": 401424,
"step": 2640
},
{
"epoch": 14.694444444444445,
"grad_norm": 7.427250385284424,
"learning_rate": 9.993371629709391e-06,
"loss": 0.0746,
"num_input_tokens_seen": 402176,
"step": 2645
},
{
"epoch": 14.722222222222221,
"grad_norm": 6.782092571258545,
"learning_rate": 9.89660974377359e-06,
"loss": 0.1266,
"num_input_tokens_seen": 402944,
"step": 2650
},
{
"epoch": 14.75,
"grad_norm": 1.4690288305282593,
"learning_rate": 9.800202853729651e-06,
"loss": 0.142,
"num_input_tokens_seen": 403696,
"step": 2655
},
{
"epoch": 14.777777777777779,
"grad_norm": 25.32921600341797,
"learning_rate": 9.704153225562171e-06,
"loss": 0.4962,
"num_input_tokens_seen": 404464,
"step": 2660
},
{
"epoch": 14.805555555555555,
"grad_norm": 1.8058884143829346,
"learning_rate": 9.608463116858542e-06,
"loss": 0.1042,
"num_input_tokens_seen": 405200,
"step": 2665
},
{
"epoch": 14.833333333333334,
"grad_norm": 3.9442355632781982,
"learning_rate": 9.51313477675588e-06,
"loss": 0.1045,
"num_input_tokens_seen": 405936,
"step": 2670
},
{
"epoch": 14.86111111111111,
"grad_norm": 20.509428024291992,
"learning_rate": 9.418170445888139e-06,
"loss": 0.3351,
"num_input_tokens_seen": 406688,
"step": 2675
},
{
"epoch": 14.88888888888889,
"grad_norm": 0.28635165095329285,
"learning_rate": 9.323572356333454e-06,
"loss": 0.1427,
"num_input_tokens_seen": 407424,
"step": 2680
},
{
"epoch": 14.916666666666666,
"grad_norm": 15.302724838256836,
"learning_rate": 9.22934273156172e-06,
"loss": 0.1361,
"num_input_tokens_seen": 408160,
"step": 2685
},
{
"epoch": 14.944444444444445,
"grad_norm": 9.12713623046875,
"learning_rate": 9.135483786382262e-06,
"loss": 0.2055,
"num_input_tokens_seen": 408912,
"step": 2690
},
{
"epoch": 14.972222222222221,
"grad_norm": 9.957405090332031,
"learning_rate": 9.0419977268918e-06,
"loss": 0.1449,
"num_input_tokens_seen": 409696,
"step": 2695
},
{
"epoch": 15.0,
"grad_norm": 6.853147983551025,
"learning_rate": 8.948886750422636e-06,
"loss": 0.0885,
"num_input_tokens_seen": 410448,
"step": 2700
},
{
"epoch": 15.0,
"eval_loss": 0.38744935393333435,
"eval_runtime": 0.8554,
"eval_samples_per_second": 46.759,
"eval_steps_per_second": 23.38,
"num_input_tokens_seen": 410448,
"step": 2700
},
{
"epoch": 15.027777777777779,
"grad_norm": 0.3130747675895691,
"learning_rate": 8.856153045490948e-06,
"loss": 0.0314,
"num_input_tokens_seen": 411184,
"step": 2705
},
{
"epoch": 15.055555555555555,
"grad_norm": 0.21885517239570618,
"learning_rate": 8.763798791745411e-06,
"loss": 0.0263,
"num_input_tokens_seen": 411936,
"step": 2710
},
{
"epoch": 15.083333333333334,
"grad_norm": 0.5549080967903137,
"learning_rate": 8.671826159915907e-06,
"loss": 0.1901,
"num_input_tokens_seen": 412720,
"step": 2715
},
{
"epoch": 15.11111111111111,
"grad_norm": 0.5954522490501404,
"learning_rate": 8.58023731176254e-06,
"loss": 0.2449,
"num_input_tokens_seen": 413472,
"step": 2720
},
{
"epoch": 15.13888888888889,
"grad_norm": 0.868930459022522,
"learning_rate": 8.489034400024812e-06,
"loss": 0.0795,
"num_input_tokens_seen": 414208,
"step": 2725
},
{
"epoch": 15.166666666666666,
"grad_norm": 17.721887588500977,
"learning_rate": 8.39821956837102e-06,
"loss": 0.1313,
"num_input_tokens_seen": 414944,
"step": 2730
},
{
"epoch": 15.194444444444445,
"grad_norm": 11.335846900939941,
"learning_rate": 8.3077949513479e-06,
"loss": 0.1054,
"num_input_tokens_seen": 415712,
"step": 2735
},
{
"epoch": 15.222222222222221,
"grad_norm": 3.936131238937378,
"learning_rate": 8.217762674330413e-06,
"loss": 0.1153,
"num_input_tokens_seen": 416448,
"step": 2740
},
{
"epoch": 15.25,
"grad_norm": 23.35863494873047,
"learning_rate": 8.128124853471814e-06,
"loss": 0.3075,
"num_input_tokens_seen": 417184,
"step": 2745
},
{
"epoch": 15.277777777777779,
"grad_norm": 21.123655319213867,
"learning_rate": 8.03888359565391e-06,
"loss": 0.1342,
"num_input_tokens_seen": 417920,
"step": 2750
},
{
"epoch": 15.305555555555555,
"grad_norm": 19.830791473388672,
"learning_rate": 7.950040998437542e-06,
"loss": 0.2187,
"num_input_tokens_seen": 418688,
"step": 2755
},
{
"epoch": 15.333333333333334,
"grad_norm": 1.0024510622024536,
"learning_rate": 7.86159915001326e-06,
"loss": 0.0683,
"num_input_tokens_seen": 419424,
"step": 2760
},
{
"epoch": 15.36111111111111,
"grad_norm": 1.5306583642959595,
"learning_rate": 7.7735601291523e-06,
"loss": 0.0755,
"num_input_tokens_seen": 420224,
"step": 2765
},
{
"epoch": 15.38888888888889,
"grad_norm": 2.4585580825805664,
"learning_rate": 7.685926005157651e-06,
"loss": 0.0281,
"num_input_tokens_seen": 420992,
"step": 2770
},
{
"epoch": 15.416666666666666,
"grad_norm": 23.670074462890625,
"learning_rate": 7.598698837815449e-06,
"loss": 0.1672,
"num_input_tokens_seen": 421744,
"step": 2775
},
{
"epoch": 15.444444444444445,
"grad_norm": 10.413293838500977,
"learning_rate": 7.511880677346578e-06,
"loss": 0.3337,
"num_input_tokens_seen": 422496,
"step": 2780
},
{
"epoch": 15.472222222222221,
"grad_norm": 0.5710373520851135,
"learning_rate": 7.4254735643584564e-06,
"loss": 0.0069,
"num_input_tokens_seen": 423264,
"step": 2785
},
{
"epoch": 15.5,
"grad_norm": 5.425608158111572,
"learning_rate": 7.339479529797111e-06,
"loss": 0.1213,
"num_input_tokens_seen": 424032,
"step": 2790
},
{
"epoch": 15.527777777777779,
"grad_norm": 0.2852936387062073,
"learning_rate": 7.2539005948993825e-06,
"loss": 0.0621,
"num_input_tokens_seen": 424816,
"step": 2795
},
{
"epoch": 15.555555555555555,
"grad_norm": 0.4949374496936798,
"learning_rate": 7.168738771145464e-06,
"loss": 0.1208,
"num_input_tokens_seen": 425584,
"step": 2800
},
{
"epoch": 15.583333333333334,
"grad_norm": 2.448948860168457,
"learning_rate": 7.083996060211607e-06,
"loss": 0.1817,
"num_input_tokens_seen": 426336,
"step": 2805
},
{
"epoch": 15.61111111111111,
"grad_norm": 0.2703257203102112,
"learning_rate": 6.9996744539230665e-06,
"loss": 0.1328,
"num_input_tokens_seen": 427120,
"step": 2810
},
{
"epoch": 15.63888888888889,
"grad_norm": 5.2962422370910645,
"learning_rate": 6.9157759342072995e-06,
"loss": 0.2257,
"num_input_tokens_seen": 427904,
"step": 2815
},
{
"epoch": 15.666666666666666,
"grad_norm": 2.691361665725708,
"learning_rate": 6.832302473047384e-06,
"loss": 0.0523,
"num_input_tokens_seen": 428672,
"step": 2820
},
{
"epoch": 15.694444444444445,
"grad_norm": 18.699260711669922,
"learning_rate": 6.7492560324356355e-06,
"loss": 0.1134,
"num_input_tokens_seen": 429456,
"step": 2825
},
{
"epoch": 15.722222222222221,
"grad_norm": 9.735588073730469,
"learning_rate": 6.666638564327532e-06,
"loss": 0.0496,
"num_input_tokens_seen": 430192,
"step": 2830
},
{
"epoch": 15.75,
"grad_norm": 1.238945722579956,
"learning_rate": 6.584452010595807e-06,
"loss": 0.2287,
"num_input_tokens_seen": 430944,
"step": 2835
},
{
"epoch": 15.777777777777779,
"grad_norm": 7.4446563720703125,
"learning_rate": 6.502698302984811e-06,
"loss": 0.0264,
"num_input_tokens_seen": 431680,
"step": 2840
},
{
"epoch": 15.805555555555555,
"grad_norm": 14.15257740020752,
"learning_rate": 6.421379363065142e-06,
"loss": 0.1039,
"num_input_tokens_seen": 432480,
"step": 2845
},
{
"epoch": 15.833333333333334,
"grad_norm": 14.9490385055542,
"learning_rate": 6.340497102188425e-06,
"loss": 0.135,
"num_input_tokens_seen": 433248,
"step": 2850
},
{
"epoch": 15.86111111111111,
"grad_norm": 0.2764110267162323,
"learning_rate": 6.26005342144241e-06,
"loss": 0.3508,
"num_input_tokens_seen": 434016,
"step": 2855
},
{
"epoch": 15.88888888888889,
"grad_norm": 0.11058162897825241,
"learning_rate": 6.180050211606303e-06,
"loss": 0.0026,
"num_input_tokens_seen": 434768,
"step": 2860
},
{
"epoch": 15.916666666666666,
"grad_norm": 22.15040397644043,
"learning_rate": 6.100489353106304e-06,
"loss": 0.3825,
"num_input_tokens_seen": 435504,
"step": 2865
},
{
"epoch": 15.944444444444445,
"grad_norm": 11.71772289276123,
"learning_rate": 6.021372715971437e-06,
"loss": 0.2274,
"num_input_tokens_seen": 436256,
"step": 2870
},
{
"epoch": 15.972222222222221,
"grad_norm": 0.5031313896179199,
"learning_rate": 5.942702159789554e-06,
"loss": 0.0297,
"num_input_tokens_seen": 437024,
"step": 2875
},
{
"epoch": 16.0,
"grad_norm": 21.48788833618164,
"learning_rate": 5.864479533663655e-06,
"loss": 0.1848,
"num_input_tokens_seen": 437776,
"step": 2880
},
{
"epoch": 16.0,
"eval_loss": 0.6435995697975159,
"eval_runtime": 0.8611,
"eval_samples_per_second": 46.454,
"eval_steps_per_second": 23.227,
"num_input_tokens_seen": 437776,
"step": 2880
},
{
"epoch": 16.02777777777778,
"grad_norm": 0.4101243019104004,
"learning_rate": 5.786706676168424e-06,
"loss": 0.0116,
"num_input_tokens_seen": 438576,
"step": 2885
},
{
"epoch": 16.055555555555557,
"grad_norm": 5.98344612121582,
"learning_rate": 5.709385415307006e-06,
"loss": 0.0365,
"num_input_tokens_seen": 439360,
"step": 2890
},
{
"epoch": 16.083333333333332,
"grad_norm": 7.092445373535156,
"learning_rate": 5.6325175684680374e-06,
"loss": 0.1803,
"num_input_tokens_seen": 440096,
"step": 2895
},
{
"epoch": 16.11111111111111,
"grad_norm": 9.738005638122559,
"learning_rate": 5.556104942382964e-06,
"loss": 0.0186,
"num_input_tokens_seen": 440848,
"step": 2900
},
{
"epoch": 16.13888888888889,
"grad_norm": 0.38426294922828674,
"learning_rate": 5.48014933308352e-06,
"loss": 0.1468,
"num_input_tokens_seen": 441616,
"step": 2905
},
{
"epoch": 16.166666666666668,
"grad_norm": 0.8579995036125183,
"learning_rate": 5.404652525859552e-06,
"loss": 0.1638,
"num_input_tokens_seen": 442352,
"step": 2910
},
{
"epoch": 16.194444444444443,
"grad_norm": 0.12119382619857788,
"learning_rate": 5.329616295217046e-06,
"loss": 0.0028,
"num_input_tokens_seen": 443088,
"step": 2915
},
{
"epoch": 16.22222222222222,
"grad_norm": 0.04318145290017128,
"learning_rate": 5.2550424048364185e-06,
"loss": 0.0059,
"num_input_tokens_seen": 443856,
"step": 2920
},
{
"epoch": 16.25,
"grad_norm": 25.193330764770508,
"learning_rate": 5.180932607531056e-06,
"loss": 0.3279,
"num_input_tokens_seen": 444608,
"step": 2925
},
{
"epoch": 16.27777777777778,
"grad_norm": 0.30459845066070557,
"learning_rate": 5.107288645206149e-06,
"loss": 0.0014,
"num_input_tokens_seen": 445344,
"step": 2930
},
{
"epoch": 16.305555555555557,
"grad_norm": 0.07173694670200348,
"learning_rate": 5.034112248817685e-06,
"loss": 0.0828,
"num_input_tokens_seen": 446128,
"step": 2935
},
{
"epoch": 16.333333333333332,
"grad_norm": 1.1420878171920776,
"learning_rate": 4.961405138331826e-06,
"loss": 0.0058,
"num_input_tokens_seen": 446912,
"step": 2940
},
{
"epoch": 16.36111111111111,
"grad_norm": 0.09082265198230743,
"learning_rate": 4.88916902268445e-06,
"loss": 0.1174,
"num_input_tokens_seen": 447664,
"step": 2945
},
{
"epoch": 16.38888888888889,
"grad_norm": 0.08622191846370697,
"learning_rate": 4.817405599741004e-06,
"loss": 0.0612,
"num_input_tokens_seen": 448416,
"step": 2950
},
{
"epoch": 16.416666666666668,
"grad_norm": 12.944429397583008,
"learning_rate": 4.746116556256569e-06,
"loss": 0.366,
"num_input_tokens_seen": 449184,
"step": 2955
},
{
"epoch": 16.444444444444443,
"grad_norm": 22.423507690429688,
"learning_rate": 4.6753035678362314e-06,
"loss": 0.0248,
"num_input_tokens_seen": 449936,
"step": 2960
},
{
"epoch": 16.47222222222222,
"grad_norm": 0.026965582743287086,
"learning_rate": 4.604968298895703e-06,
"loss": 0.0326,
"num_input_tokens_seen": 450688,
"step": 2965
},
{
"epoch": 16.5,
"grad_norm": 4.680528163909912,
"learning_rate": 4.535112402622185e-06,
"loss": 0.0039,
"num_input_tokens_seen": 451440,
"step": 2970
},
{
"epoch": 16.52777777777778,
"grad_norm": 0.01207332219928503,
"learning_rate": 4.465737520935517e-06,
"loss": 0.0057,
"num_input_tokens_seen": 452160,
"step": 2975
},
{
"epoch": 16.555555555555557,
"grad_norm": 0.024040911346673965,
"learning_rate": 4.396845284449608e-06,
"loss": 0.002,
"num_input_tokens_seen": 452944,
"step": 2980
},
{
"epoch": 16.583333333333332,
"grad_norm": 39.016632080078125,
"learning_rate": 4.328437312434067e-06,
"loss": 0.3633,
"num_input_tokens_seen": 453680,
"step": 2985
},
{
"epoch": 16.61111111111111,
"grad_norm": 0.005136567167937756,
"learning_rate": 4.2605152127761675e-06,
"loss": 0.0115,
"num_input_tokens_seen": 454432,
"step": 2990
},
{
"epoch": 16.63888888888889,
"grad_norm": 0.10608967393636703,
"learning_rate": 4.19308058194306e-06,
"loss": 0.0943,
"num_input_tokens_seen": 455232,
"step": 2995
},
{
"epoch": 16.666666666666668,
"grad_norm": 1.2691866159439087,
"learning_rate": 4.126135004944231e-06,
"loss": 0.0029,
"num_input_tokens_seen": 455984,
"step": 3000
},
{
"epoch": 16.694444444444443,
"grad_norm": 0.4157579243183136,
"learning_rate": 4.059680055294266e-06,
"loss": 0.0577,
"num_input_tokens_seen": 456736,
"step": 3005
},
{
"epoch": 16.72222222222222,
"grad_norm": 39.1358642578125,
"learning_rate": 3.993717294975863e-06,
"loss": 0.1053,
"num_input_tokens_seen": 457520,
"step": 3010
},
{
"epoch": 16.75,
"grad_norm": 43.38942337036133,
"learning_rate": 3.92824827440309e-06,
"loss": 0.2089,
"num_input_tokens_seen": 458256,
"step": 3015
},
{
"epoch": 16.77777777777778,
"grad_norm": 1.4325363636016846,
"learning_rate": 3.863274532384981e-06,
"loss": 0.003,
"num_input_tokens_seen": 459008,
"step": 3020
},
{
"epoch": 16.805555555555557,
"grad_norm": 34.902889251708984,
"learning_rate": 3.798797596089351e-06,
"loss": 0.0759,
"num_input_tokens_seen": 459808,
"step": 3025
},
{
"epoch": 16.833333333333332,
"grad_norm": 1.068337321281433,
"learning_rate": 3.73481898100691e-06,
"loss": 0.0139,
"num_input_tokens_seen": 460576,
"step": 3030
},
{
"epoch": 16.86111111111111,
"grad_norm": 0.5029184818267822,
"learning_rate": 3.6713401909156204e-06,
"loss": 0.0528,
"num_input_tokens_seen": 461328,
"step": 3035
},
{
"epoch": 16.88888888888889,
"grad_norm": 0.1493435800075531,
"learning_rate": 3.608362717845376e-06,
"loss": 0.0237,
"num_input_tokens_seen": 462096,
"step": 3040
},
{
"epoch": 16.916666666666668,
"grad_norm": 25.79793930053711,
"learning_rate": 3.5458880420429135e-06,
"loss": 0.1684,
"num_input_tokens_seen": 462848,
"step": 3045
},
{
"epoch": 16.944444444444443,
"grad_norm": 0.8746198415756226,
"learning_rate": 3.4839176319370394e-06,
"loss": 0.0018,
"num_input_tokens_seen": 463616,
"step": 3050
},
{
"epoch": 16.97222222222222,
"grad_norm": 0.01299325842410326,
"learning_rate": 3.4224529441040904e-06,
"loss": 0.1967,
"num_input_tokens_seen": 464384,
"step": 3055
},
{
"epoch": 17.0,
"grad_norm": 0.3657170236110687,
"learning_rate": 3.3614954232337374e-06,
"loss": 0.0092,
"num_input_tokens_seen": 465168,
"step": 3060
},
{
"epoch": 17.0,
"eval_loss": 0.8914836645126343,
"eval_runtime": 0.8427,
"eval_samples_per_second": 47.467,
"eval_steps_per_second": 23.734,
"num_input_tokens_seen": 465168,
"step": 3060
},
{
"epoch": 17.02777777777778,
"grad_norm": 0.033842090517282486,
"learning_rate": 3.3010465020949818e-06,
"loss": 0.0159,
"num_input_tokens_seen": 465920,
"step": 3065
},
{
"epoch": 17.055555555555557,
"grad_norm": 0.09151696413755417,
"learning_rate": 3.2411076015025075e-06,
"loss": 0.217,
"num_input_tokens_seen": 466688,
"step": 3070
},
{
"epoch": 17.083333333333332,
"grad_norm": 0.13220135867595673,
"learning_rate": 3.1816801302832848e-06,
"loss": 0.0058,
"num_input_tokens_seen": 467488,
"step": 3075
},
{
"epoch": 17.11111111111111,
"grad_norm": 0.2489822804927826,
"learning_rate": 3.1227654852434454e-06,
"loss": 0.001,
"num_input_tokens_seen": 468256,
"step": 3080
},
{
"epoch": 17.13888888888889,
"grad_norm": 8.796856880187988,
"learning_rate": 3.0643650511354484e-06,
"loss": 0.0073,
"num_input_tokens_seen": 469008,
"step": 3085
},
{
"epoch": 17.166666666666668,
"grad_norm": 32.82636260986328,
"learning_rate": 3.006480200625572e-06,
"loss": 0.2167,
"num_input_tokens_seen": 469808,
"step": 3090
},
{
"epoch": 17.194444444444443,
"grad_norm": 0.05829642340540886,
"learning_rate": 2.949112294261591e-06,
"loss": 0.0127,
"num_input_tokens_seen": 470560,
"step": 3095
},
{
"epoch": 17.22222222222222,
"grad_norm": 0.014127260074019432,
"learning_rate": 2.89226268044083e-06,
"loss": 0.1617,
"num_input_tokens_seen": 471328,
"step": 3100
},
{
"epoch": 17.25,
"grad_norm": 0.004986308049410582,
"learning_rate": 2.8359326953784737e-06,
"loss": 0.0014,
"num_input_tokens_seen": 472064,
"step": 3105
},
{
"epoch": 17.27777777777778,
"grad_norm": 0.09535623341798782,
"learning_rate": 2.780123663076142e-06,
"loss": 0.0004,
"num_input_tokens_seen": 472832,
"step": 3110
},
{
"epoch": 17.305555555555557,
"grad_norm": 10.574342727661133,
"learning_rate": 2.7248368952908053e-06,
"loss": 0.0079,
"num_input_tokens_seen": 473600,
"step": 3115
},
{
"epoch": 17.333333333333332,
"grad_norm": 0.5229879021644592,
"learning_rate": 2.670073691503902e-06,
"loss": 0.0046,
"num_input_tokens_seen": 474352,
"step": 3120
},
{
"epoch": 17.36111111111111,
"grad_norm": 0.13766419887542725,
"learning_rate": 2.6158353388908293e-06,
"loss": 0.0058,
"num_input_tokens_seen": 475104,
"step": 3125
},
{
"epoch": 17.38888888888889,
"grad_norm": 0.2758001387119293,
"learning_rate": 2.5621231122906873e-06,
"loss": 0.1262,
"num_input_tokens_seen": 475856,
"step": 3130
},
{
"epoch": 17.416666666666668,
"grad_norm": 0.24915482103824615,
"learning_rate": 2.5089382741762925e-06,
"loss": 0.0091,
"num_input_tokens_seen": 476640,
"step": 3135
},
{
"epoch": 17.444444444444443,
"grad_norm": 0.6348158121109009,
"learning_rate": 2.4562820746245386e-06,
"loss": 0.002,
"num_input_tokens_seen": 477408,
"step": 3140
},
{
"epoch": 17.47222222222222,
"grad_norm": 0.009842537343502045,
"learning_rate": 2.4041557512869878e-06,
"loss": 0.0016,
"num_input_tokens_seen": 478160,
"step": 3145
},
{
"epoch": 17.5,
"grad_norm": 0.12595131993293762,
"learning_rate": 2.3525605293607784e-06,
"loss": 0.0007,
"num_input_tokens_seen": 478928,
"step": 3150
},
{
"epoch": 17.52777777777778,
"grad_norm": 0.013150378130376339,
"learning_rate": 2.3014976215598503e-06,
"loss": 0.016,
"num_input_tokens_seen": 479696,
"step": 3155
},
{
"epoch": 17.555555555555557,
"grad_norm": 0.0263340026140213,
"learning_rate": 2.2509682280864224e-06,
"loss": 0.0076,
"num_input_tokens_seen": 480464,
"step": 3160
},
{
"epoch": 17.583333333333332,
"grad_norm": 54.26401138305664,
"learning_rate": 2.2009735366027795e-06,
"loss": 0.1149,
"num_input_tokens_seen": 481232,
"step": 3165
},
{
"epoch": 17.61111111111111,
"grad_norm": 34.03044891357422,
"learning_rate": 2.151514722203385e-06,
"loss": 0.097,
"num_input_tokens_seen": 481984,
"step": 3170
},
{
"epoch": 17.63888888888889,
"grad_norm": 1.2055052518844604,
"learning_rate": 2.1025929473872274e-06,
"loss": 0.0034,
"num_input_tokens_seen": 482768,
"step": 3175
},
{
"epoch": 17.666666666666668,
"grad_norm": 0.01310269720852375,
"learning_rate": 2.0542093620305042e-06,
"loss": 0.0002,
"num_input_tokens_seen": 483536,
"step": 3180
},
{
"epoch": 17.694444444444443,
"grad_norm": 0.12006166577339172,
"learning_rate": 2.0063651033596143e-06,
"loss": 0.0124,
"num_input_tokens_seen": 484320,
"step": 3185
},
{
"epoch": 17.72222222222222,
"grad_norm": 3.16572642326355,
"learning_rate": 1.9590612959244055e-06,
"loss": 0.0128,
"num_input_tokens_seen": 485056,
"step": 3190
},
{
"epoch": 17.75,
"grad_norm": 0.02956937998533249,
"learning_rate": 1.912299051571764e-06,
"loss": 0.0008,
"num_input_tokens_seen": 485808,
"step": 3195
},
{
"epoch": 17.77777777777778,
"grad_norm": 0.007258450146764517,
"learning_rate": 1.8660794694194573e-06,
"loss": 0.0161,
"num_input_tokens_seen": 486560,
"step": 3200
},
{
"epoch": 17.805555555555557,
"grad_norm": 0.009658108465373516,
"learning_rate": 1.8204036358303173e-06,
"loss": 0.1385,
"num_input_tokens_seen": 487312,
"step": 3205
},
{
"epoch": 17.833333333333332,
"grad_norm": 0.5556612014770508,
"learning_rate": 1.775272624386695e-06,
"loss": 0.0007,
"num_input_tokens_seen": 488080,
"step": 3210
},
{
"epoch": 17.86111111111111,
"grad_norm": 0.012735828757286072,
"learning_rate": 1.7306874958652408e-06,
"loss": 0.0409,
"num_input_tokens_seen": 488832,
"step": 3215
},
{
"epoch": 17.88888888888889,
"grad_norm": 0.5004767179489136,
"learning_rate": 1.686649298211951e-06,
"loss": 0.0264,
"num_input_tokens_seen": 489600,
"step": 3220
},
{
"epoch": 17.916666666666668,
"grad_norm": 38.82133865356445,
"learning_rate": 1.643159066517566e-06,
"loss": 0.0869,
"num_input_tokens_seen": 490336,
"step": 3225
},
{
"epoch": 17.944444444444443,
"grad_norm": 0.010125933215022087,
"learning_rate": 1.6002178229932107e-06,
"loss": 0.003,
"num_input_tokens_seen": 491056,
"step": 3230
},
{
"epoch": 17.97222222222222,
"grad_norm": 66.36505126953125,
"learning_rate": 1.5578265769463806e-06,
"loss": 0.1735,
"num_input_tokens_seen": 491792,
"step": 3235
},
{
"epoch": 18.0,
"grad_norm": 0.29106247425079346,
"learning_rate": 1.5159863247572236e-06,
"loss": 0.0543,
"num_input_tokens_seen": 492560,
"step": 3240
},
{
"epoch": 18.0,
"eval_loss": 1.0005823373794556,
"eval_runtime": 0.8454,
"eval_samples_per_second": 47.318,
"eval_steps_per_second": 23.659,
"num_input_tokens_seen": 492560,
"step": 3240
},
{
"epoch": 18.02777777777778,
"grad_norm": 0.005374926142394543,
"learning_rate": 1.4746980498551112e-06,
"loss": 0.0001,
"num_input_tokens_seen": 493280,
"step": 3245
},
{
"epoch": 18.055555555555557,
"grad_norm": 0.06802839785814285,
"learning_rate": 1.4339627226955392e-06,
"loss": 0.0002,
"num_input_tokens_seen": 494048,
"step": 3250
},
{
"epoch": 18.083333333333332,
"grad_norm": 0.2004125714302063,
"learning_rate": 1.3937813007373013e-06,
"loss": 0.0007,
"num_input_tokens_seen": 494784,
"step": 3255
},
{
"epoch": 18.11111111111111,
"grad_norm": 0.2468828707933426,
"learning_rate": 1.354154728419979e-06,
"loss": 0.1658,
"num_input_tokens_seen": 495552,
"step": 3260
},
{
"epoch": 18.13888888888889,
"grad_norm": 4.114477157592773,
"learning_rate": 1.31508393714177e-06,
"loss": 0.0021,
"num_input_tokens_seen": 496320,
"step": 3265
},
{
"epoch": 18.166666666666668,
"grad_norm": 0.005297894589602947,
"learning_rate": 1.276569845237574e-06,
"loss": 0.0193,
"num_input_tokens_seen": 497072,
"step": 3270
},
{
"epoch": 18.194444444444443,
"grad_norm": 1.2201541662216187,
"learning_rate": 1.2386133579574189e-06,
"loss": 0.0078,
"num_input_tokens_seen": 497808,
"step": 3275
},
{
"epoch": 18.22222222222222,
"grad_norm": 0.28695741295814514,
"learning_rate": 1.2012153674451715e-06,
"loss": 0.0005,
"num_input_tokens_seen": 498528,
"step": 3280
},
{
"epoch": 18.25,
"grad_norm": 0.01707194373011589,
"learning_rate": 1.1643767527175857e-06,
"loss": 0.0034,
"num_input_tokens_seen": 499296,
"step": 3285
},
{
"epoch": 18.27777777777778,
"grad_norm": 0.05932219699025154,
"learning_rate": 1.1280983796436245e-06,
"loss": 0.0233,
"num_input_tokens_seen": 500064,
"step": 3290
},
{
"epoch": 18.305555555555557,
"grad_norm": 0.0543174110352993,
"learning_rate": 1.0923811009241142e-06,
"loss": 0.0002,
"num_input_tokens_seen": 500848,
"step": 3295
},
{
"epoch": 18.333333333333332,
"grad_norm": 0.005716706160455942,
"learning_rate": 1.0572257560717086e-06,
"loss": 0.0861,
"num_input_tokens_seen": 501600,
"step": 3300
},
{
"epoch": 18.36111111111111,
"grad_norm": 0.004924902692437172,
"learning_rate": 1.0226331713911546e-06,
"loss": 0.0012,
"num_input_tokens_seen": 502352,
"step": 3305
},
{
"epoch": 18.38888888888889,
"grad_norm": 0.22492168843746185,
"learning_rate": 9.886041599598606e-07,
"loss": 0.0049,
"num_input_tokens_seen": 503120,
"step": 3310
},
{
"epoch": 18.416666666666668,
"grad_norm": 0.012456899508833885,
"learning_rate": 9.551395216087944e-07,
"loss": 0.0011,
"num_input_tokens_seen": 503872,
"step": 3315
},
{
"epoch": 18.444444444444443,
"grad_norm": 51.44154739379883,
"learning_rate": 9.222400429036854e-07,
"loss": 0.0504,
"num_input_tokens_seen": 504624,
"step": 3320
},
{
"epoch": 18.47222222222222,
"grad_norm": 2.0177342891693115,
"learning_rate": 8.899064971265276e-07,
"loss": 0.0048,
"num_input_tokens_seen": 505392,
"step": 3325
},
{
"epoch": 18.5,
"grad_norm": 0.26857876777648926,
"learning_rate": 8.581396442574135e-07,
"loss": 0.1322,
"num_input_tokens_seen": 506160,
"step": 3330
},
{
"epoch": 18.52777777777778,
"grad_norm": 0.036319322884082794,
"learning_rate": 8.269402309566743e-07,
"loss": 0.0058,
"num_input_tokens_seen": 506880,
"step": 3335
},
{
"epoch": 18.555555555555557,
"grad_norm": 0.32240647077560425,
"learning_rate": 7.963089905473092e-07,
"loss": 0.0184,
"num_input_tokens_seen": 507616,
"step": 3340
},
{
"epoch": 18.583333333333332,
"grad_norm": 0.011376683600246906,
"learning_rate": 7.662466429977699e-07,
"loss": 0.0009,
"num_input_tokens_seen": 508368,
"step": 3345
},
{
"epoch": 18.61111111111111,
"grad_norm": 0.20107612013816833,
"learning_rate": 7.367538949050345e-07,
"loss": 0.0825,
"num_input_tokens_seen": 509152,
"step": 3350
},
{
"epoch": 18.63888888888889,
"grad_norm": 0.08009302616119385,
"learning_rate": 7.078314394779961e-07,
"loss": 0.01,
"num_input_tokens_seen": 509936,
"step": 3355
},
{
"epoch": 18.666666666666668,
"grad_norm": 0.00780840078368783,
"learning_rate": 6.794799565211646e-07,
"loss": 0.0002,
"num_input_tokens_seen": 510688,
"step": 3360
},
{
"epoch": 18.694444444444443,
"grad_norm": 0.014461885206401348,
"learning_rate": 6.517001124186989e-07,
"loss": 0.0017,
"num_input_tokens_seen": 511440,
"step": 3365
},
{
"epoch": 18.72222222222222,
"grad_norm": 0.02738901786506176,
"learning_rate": 6.244925601187363e-07,
"loss": 0.1572,
"num_input_tokens_seen": 512192,
"step": 3370
},
{
"epoch": 18.75,
"grad_norm": 0.2982403337955475,
"learning_rate": 5.978579391180461e-07,
"loss": 0.0347,
"num_input_tokens_seen": 512960,
"step": 3375
},
{
"epoch": 18.77777777777778,
"grad_norm": 0.006348731461912394,
"learning_rate": 5.717968754469977e-07,
"loss": 0.0024,
"num_input_tokens_seen": 513712,
"step": 3380
},
{
"epoch": 18.805555555555557,
"grad_norm": 0.012313781306147575,
"learning_rate": 5.463099816548579e-07,
"loss": 0.0162,
"num_input_tokens_seen": 514496,
"step": 3385
},
{
"epoch": 18.833333333333332,
"grad_norm": 0.025961650535464287,
"learning_rate": 5.213978567953775e-07,
"loss": 0.0261,
"num_input_tokens_seen": 515296,
"step": 3390
},
{
"epoch": 18.86111111111111,
"grad_norm": 0.02834870107471943,
"learning_rate": 4.970610864127173e-07,
"loss": 0.0021,
"num_input_tokens_seen": 516032,
"step": 3395
},
{
"epoch": 18.88888888888889,
"grad_norm": 0.058260053396224976,
"learning_rate": 4.7330024252768555e-07,
"loss": 0.0003,
"num_input_tokens_seen": 516816,
"step": 3400
},
{
"epoch": 18.916666666666668,
"grad_norm": 3.5997354984283447,
"learning_rate": 4.5011588362429134e-07,
"loss": 0.0022,
"num_input_tokens_seen": 517584,
"step": 3405
},
{
"epoch": 18.944444444444443,
"grad_norm": 0.009501924738287926,
"learning_rate": 4.2750855463662143e-07,
"loss": 0.0024,
"num_input_tokens_seen": 518336,
"step": 3410
},
{
"epoch": 18.97222222222222,
"grad_norm": 0.0839788019657135,
"learning_rate": 4.05478786936031e-07,
"loss": 0.0029,
"num_input_tokens_seen": 519088,
"step": 3415
},
{
"epoch": 19.0,
"grad_norm": 0.05105578154325485,
"learning_rate": 3.8402709831865113e-07,
"loss": 0.1114,
"num_input_tokens_seen": 519840,
"step": 3420
},
{
"epoch": 19.0,
"eval_loss": 1.036169409751892,
"eval_runtime": 0.854,
"eval_samples_per_second": 46.839,
"eval_steps_per_second": 23.419,
"num_input_tokens_seen": 519840,
"step": 3420
},
{
"epoch": 19.02777777777778,
"grad_norm": 0.12579289078712463,
"learning_rate": 3.6315399299321484e-07,
"loss": 0.0218,
"num_input_tokens_seen": 520624,
"step": 3425
},
{
"epoch": 19.055555555555557,
"grad_norm": 0.00517948716878891,
"learning_rate": 3.428599615692141e-07,
"loss": 0.0019,
"num_input_tokens_seen": 521392,
"step": 3430
},
{
"epoch": 19.083333333333332,
"grad_norm": 7.694216251373291,
"learning_rate": 3.2314548104537545e-07,
"loss": 0.0078,
"num_input_tokens_seen": 522144,
"step": 3435
},
{
"epoch": 19.11111111111111,
"grad_norm": 16.037460327148438,
"learning_rate": 3.040110147984221e-07,
"loss": 0.0194,
"num_input_tokens_seen": 522896,
"step": 3440
},
{
"epoch": 19.13888888888889,
"grad_norm": 0.8977673649787903,
"learning_rate": 2.8545701257221e-07,
"loss": 0.0008,
"num_input_tokens_seen": 523632,
"step": 3445
},
{
"epoch": 19.166666666666668,
"grad_norm": 0.025021294131875038,
"learning_rate": 2.674839104671367e-07,
"loss": 0.0007,
"num_input_tokens_seen": 524400,
"step": 3450
},
{
"epoch": 19.194444444444443,
"grad_norm": 43.39835739135742,
"learning_rate": 2.5009213092991034e-07,
"loss": 0.0131,
"num_input_tokens_seen": 525200,
"step": 3455
},
{
"epoch": 19.22222222222222,
"grad_norm": 0.020788980647921562,
"learning_rate": 2.3328208274359942e-07,
"loss": 0.0003,
"num_input_tokens_seen": 525952,
"step": 3460
},
{
"epoch": 19.25,
"grad_norm": 0.006578272208571434,
"learning_rate": 2.170541610180432e-07,
"loss": 0.0003,
"num_input_tokens_seen": 526704,
"step": 3465
},
{
"epoch": 19.27777777777778,
"grad_norm": 0.020482726395130157,
"learning_rate": 2.014087471805509e-07,
"loss": 0.0009,
"num_input_tokens_seen": 527440,
"step": 3470
},
{
"epoch": 19.305555555555557,
"grad_norm": 0.029707515612244606,
"learning_rate": 1.8634620896695043e-07,
"loss": 0.0005,
"num_input_tokens_seen": 528208,
"step": 3475
},
{
"epoch": 19.333333333333332,
"grad_norm": 0.011560129933059216,
"learning_rate": 1.7186690041292586e-07,
"loss": 0.0009,
"num_input_tokens_seen": 529008,
"step": 3480
},
{
"epoch": 19.36111111111111,
"grad_norm": 0.9206594228744507,
"learning_rate": 1.5797116184571304e-07,
"loss": 0.0011,
"num_input_tokens_seen": 529760,
"step": 3485
},
{
"epoch": 19.38888888888889,
"grad_norm": 0.096881203353405,
"learning_rate": 1.4465931987609482e-07,
"loss": 0.0019,
"num_input_tokens_seen": 530480,
"step": 3490
},
{
"epoch": 19.416666666666668,
"grad_norm": 0.055456917732954025,
"learning_rate": 1.319316873907267e-07,
"loss": 0.0017,
"num_input_tokens_seen": 531216,
"step": 3495
},
{
"epoch": 19.444444444444443,
"grad_norm": 0.10053224116563797,
"learning_rate": 1.1978856354477595e-07,
"loss": 0.0654,
"num_input_tokens_seen": 531968,
"step": 3500
},
{
"epoch": 19.47222222222222,
"grad_norm": 3.4088387489318848,
"learning_rate": 1.0823023375489127e-07,
"loss": 0.0026,
"num_input_tokens_seen": 532736,
"step": 3505
},
{
"epoch": 19.5,
"grad_norm": 0.00926142930984497,
"learning_rate": 9.725696969249965e-08,
"loss": 0.0137,
"num_input_tokens_seen": 533504,
"step": 3510
},
{
"epoch": 19.52777777777778,
"grad_norm": 0.005964045878499746,
"learning_rate": 8.686902927741991e-08,
"loss": 0.0004,
"num_input_tokens_seen": 534256,
"step": 3515
},
{
"epoch": 19.555555555555557,
"grad_norm": 0.28770729899406433,
"learning_rate": 7.706665667180091e-08,
"loss": 0.0004,
"num_input_tokens_seen": 535008,
"step": 3520
},
{
"epoch": 19.583333333333332,
"grad_norm": 0.0930958241224289,
"learning_rate": 6.785008227437329e-08,
"loss": 0.0438,
"num_input_tokens_seen": 535792,
"step": 3525
},
{
"epoch": 19.61111111111111,
"grad_norm": 0.028239967301487923,
"learning_rate": 5.921952271504827e-08,
"loss": 0.0086,
"num_input_tokens_seen": 536560,
"step": 3530
},
{
"epoch": 19.63888888888889,
"grad_norm": 0.22114162147045135,
"learning_rate": 5.117518084981621e-08,
"loss": 0.0005,
"num_input_tokens_seen": 537328,
"step": 3535
},
{
"epoch": 19.666666666666668,
"grad_norm": 0.10239258408546448,
"learning_rate": 4.371724575597535e-08,
"loss": 0.0002,
"num_input_tokens_seen": 538064,
"step": 3540
},
{
"epoch": 19.694444444444443,
"grad_norm": 35.42240905761719,
"learning_rate": 3.684589272771044e-08,
"loss": 0.1792,
"num_input_tokens_seen": 538848,
"step": 3545
},
{
"epoch": 19.72222222222222,
"grad_norm": 0.09444889426231384,
"learning_rate": 3.056128327193486e-08,
"loss": 0.0761,
"num_input_tokens_seen": 539616,
"step": 3550
},
{
"epoch": 19.75,
"grad_norm": 0.004997141659259796,
"learning_rate": 2.486356510453258e-08,
"loss": 0.001,
"num_input_tokens_seen": 540352,
"step": 3555
},
{
"epoch": 19.77777777777778,
"grad_norm": 0.003712383331730962,
"learning_rate": 1.975287214685817e-08,
"loss": 0.0004,
"num_input_tokens_seen": 541104,
"step": 3560
},
{
"epoch": 19.805555555555557,
"grad_norm": 0.10159067809581757,
"learning_rate": 1.522932452260595e-08,
"loss": 0.0166,
"num_input_tokens_seen": 541856,
"step": 3565
},
{
"epoch": 19.833333333333332,
"grad_norm": 1.4221373796463013,
"learning_rate": 1.1293028554978935e-08,
"loss": 0.001,
"num_input_tokens_seen": 542640,
"step": 3570
},
{
"epoch": 19.86111111111111,
"grad_norm": 0.006227497011423111,
"learning_rate": 7.944076764190845e-09,
"loss": 0.0002,
"num_input_tokens_seen": 543392,
"step": 3575
},
{
"epoch": 19.88888888888889,
"grad_norm": 0.003915317822247744,
"learning_rate": 5.182547865290044e-09,
"loss": 0.0512,
"num_input_tokens_seen": 544128,
"step": 3580
},
{
"epoch": 19.916666666666668,
"grad_norm": 6.322549819946289,
"learning_rate": 3.008506766313812e-09,
"loss": 0.155,
"num_input_tokens_seen": 544880,
"step": 3585
},
{
"epoch": 19.944444444444443,
"grad_norm": 0.006800191942602396,
"learning_rate": 1.4220045667645566e-09,
"loss": 0.0003,
"num_input_tokens_seen": 545600,
"step": 3590
},
{
"epoch": 19.97222222222222,
"grad_norm": 0.013187541626393795,
"learning_rate": 4.2307855639411865e-10,
"loss": 0.006,
"num_input_tokens_seen": 546384,
"step": 3595
},
{
"epoch": 20.0,
"grad_norm": 0.07555617392063141,
"learning_rate": 1.1752214348903501e-11,
"loss": 0.0002,
"num_input_tokens_seen": 547136,
"step": 3600
},
{
"epoch": 20.0,
"eval_loss": 1.036588430404663,
"eval_runtime": 0.8689,
"eval_samples_per_second": 46.033,
"eval_steps_per_second": 23.016,
"num_input_tokens_seen": 547136,
"step": 3600
},
{
"epoch": 20.0,
"num_input_tokens_seen": 547136,
"step": 3600,
"total_flos": 2.463728679203635e+16,
"train_loss": 0.24590962828558985,
"train_runtime": 353.9588,
"train_samples_per_second": 20.341,
"train_steps_per_second": 10.171
}
],
"logging_steps": 5,
"max_steps": 3600,
"num_input_tokens_seen": 547136,
"num_train_epochs": 20,
"save_steps": 180,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 2.463728679203635e+16,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}