{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.3473684210526315,
"eval_steps": 50,
"global_step": 100,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.013473684210526317,
"grad_norm": 10.973633766174316,
"learning_rate": 7.499999999999999e-06,
"loss": 2.1479,
"step": 1
},
{
"epoch": 0.013473684210526317,
"eval_loss": 2.5371525287628174,
"eval_runtime": 8.4173,
"eval_samples_per_second": 29.701,
"eval_steps_per_second": 14.85,
"step": 1
},
{
"epoch": 0.026947368421052633,
"grad_norm": 9.272090911865234,
"learning_rate": 1.4999999999999999e-05,
"loss": 2.0187,
"step": 2
},
{
"epoch": 0.04042105263157895,
"grad_norm": 7.878329753875732,
"learning_rate": 2.2499999999999998e-05,
"loss": 2.1563,
"step": 3
},
{
"epoch": 0.053894736842105266,
"grad_norm": 7.982365608215332,
"learning_rate": 2.9999999999999997e-05,
"loss": 2.282,
"step": 4
},
{
"epoch": 0.06736842105263158,
"grad_norm": 2.2053937911987305,
"learning_rate": 3.75e-05,
"loss": 2.1851,
"step": 5
},
{
"epoch": 0.0808421052631579,
"grad_norm": 1.8837133646011353,
"learning_rate": 4.4999999999999996e-05,
"loss": 2.112,
"step": 6
},
{
"epoch": 0.09431578947368421,
"grad_norm": 1.4526352882385254,
"learning_rate": 5.2499999999999995e-05,
"loss": 2.0328,
"step": 7
},
{
"epoch": 0.10778947368421053,
"grad_norm": 1.4846237897872925,
"learning_rate": 5.9999999999999995e-05,
"loss": 1.8783,
"step": 8
},
{
"epoch": 0.12126315789473684,
"grad_norm": 1.9902074337005615,
"learning_rate": 6.75e-05,
"loss": 1.9695,
"step": 9
},
{
"epoch": 0.13473684210526315,
"grad_norm": 1.404717206954956,
"learning_rate": 7.5e-05,
"loss": 1.9514,
"step": 10
},
{
"epoch": 0.1482105263157895,
"grad_norm": 1.4216639995574951,
"learning_rate": 8.25e-05,
"loss": 1.8557,
"step": 11
},
{
"epoch": 0.1616842105263158,
"grad_norm": 1.4013230800628662,
"learning_rate": 8.999999999999999e-05,
"loss": 1.8424,
"step": 12
},
{
"epoch": 0.1751578947368421,
"grad_norm": 1.1717782020568848,
"learning_rate": 9.75e-05,
"loss": 1.8625,
"step": 13
},
{
"epoch": 0.18863157894736843,
"grad_norm": 1.7480112314224243,
"learning_rate": 0.00010499999999999999,
"loss": 1.9016,
"step": 14
},
{
"epoch": 0.20210526315789473,
"grad_norm": 1.1352505683898926,
"learning_rate": 0.0001125,
"loss": 1.8889,
"step": 15
},
{
"epoch": 0.21557894736842106,
"grad_norm": 1.1477811336517334,
"learning_rate": 0.00011999999999999999,
"loss": 1.5794,
"step": 16
},
{
"epoch": 0.22905263157894737,
"grad_norm": 1.3586668968200684,
"learning_rate": 0.00012749999999999998,
"loss": 1.5011,
"step": 17
},
{
"epoch": 0.24252631578947367,
"grad_norm": 0.8325414657592773,
"learning_rate": 0.000135,
"loss": 1.6469,
"step": 18
},
{
"epoch": 0.256,
"grad_norm": 1.5747898817062378,
"learning_rate": 0.0001425,
"loss": 1.6895,
"step": 19
},
{
"epoch": 0.2694736842105263,
"grad_norm": 0.9997685551643372,
"learning_rate": 0.00015,
"loss": 1.5248,
"step": 20
},
{
"epoch": 0.2829473684210526,
"grad_norm": 1.195119857788086,
"learning_rate": 0.00014994217771805422,
"loss": 1.5649,
"step": 21
},
{
"epoch": 0.296421052631579,
"grad_norm": 0.8751718401908875,
"learning_rate": 0.00014976880002998458,
"loss": 1.5405,
"step": 22
},
{
"epoch": 0.3098947368421053,
"grad_norm": 0.8566117882728577,
"learning_rate": 0.00014948013427161947,
"loss": 1.5504,
"step": 23
},
{
"epoch": 0.3233684210526316,
"grad_norm": 0.7322584390640259,
"learning_rate": 0.00014907662554463532,
"loss": 1.5034,
"step": 24
},
{
"epoch": 0.3368421052631579,
"grad_norm": 0.9539948105812073,
"learning_rate": 0.00014855889603024227,
"loss": 1.4513,
"step": 25
},
{
"epoch": 0.3503157894736842,
"grad_norm": 0.7042058110237122,
"learning_rate": 0.00014792774402982574,
"loss": 1.5281,
"step": 26
},
{
"epoch": 0.36378947368421055,
"grad_norm": 0.6478146910667419,
"learning_rate": 0.0001471841427340235,
"loss": 1.5117,
"step": 27
},
{
"epoch": 0.37726315789473686,
"grad_norm": 0.6267299652099609,
"learning_rate": 0.00014632923872213652,
"loss": 1.383,
"step": 28
},
{
"epoch": 0.39073684210526316,
"grad_norm": 0.7713648676872253,
"learning_rate": 0.0001453643501941863,
"loss": 1.4844,
"step": 29
},
{
"epoch": 0.40421052631578946,
"grad_norm": 0.6838952898979187,
"learning_rate": 0.0001442909649383465,
"loss": 1.4825,
"step": 30
},
{
"epoch": 0.41768421052631577,
"grad_norm": 0.71690434217453,
"learning_rate": 0.0001431107380368811,
"loss": 1.4357,
"step": 31
},
{
"epoch": 0.43115789473684213,
"grad_norm": 0.6745509505271912,
"learning_rate": 0.00014182548931412757,
"loss": 1.4733,
"step": 32
},
{
"epoch": 0.44463157894736843,
"grad_norm": 0.7103040814399719,
"learning_rate": 0.0001404372005304598,
"loss": 1.3857,
"step": 33
},
{
"epoch": 0.45810526315789474,
"grad_norm": 0.6221896409988403,
"learning_rate": 0.0001389480123265569,
"loss": 1.2527,
"step": 34
},
{
"epoch": 0.47157894736842104,
"grad_norm": 0.562971293926239,
"learning_rate": 0.0001373602209226909,
"loss": 1.4486,
"step": 35
},
{
"epoch": 0.48505263157894735,
"grad_norm": 0.5778741240501404,
"learning_rate": 0.00013567627457812106,
"loss": 1.4134,
"step": 36
},
{
"epoch": 0.4985263157894737,
"grad_norm": 0.5704385042190552,
"learning_rate": 0.00013389876981605584,
"loss": 1.354,
"step": 37
},
{
"epoch": 0.512,
"grad_norm": 0.6227774024009705,
"learning_rate": 0.00013203044742000233,
"loss": 1.5718,
"step": 38
},
{
"epoch": 0.5254736842105263,
"grad_norm": 0.6505720615386963,
"learning_rate": 0.0001300741882076764,
"loss": 1.4278,
"step": 39
},
{
"epoch": 0.5389473684210526,
"grad_norm": 0.5715638995170593,
"learning_rate": 0.00012803300858899104,
"loss": 1.478,
"step": 40
},
{
"epoch": 0.5524210526315789,
"grad_norm": 0.6021521091461182,
"learning_rate": 0.00012591005591497064,
"loss": 1.3556,
"step": 41
},
{
"epoch": 0.5658947368421052,
"grad_norm": 0.6821895837783813,
"learning_rate": 0.00012370860362476374,
"loss": 1.4947,
"step": 42
},
{
"epoch": 0.5793684210526315,
"grad_norm": 0.5453934073448181,
"learning_rate": 0.00012143204619823755,
"loss": 1.2477,
"step": 43
},
{
"epoch": 0.592842105263158,
"grad_norm": 0.6702715754508972,
"learning_rate": 0.00011908389392193547,
"loss": 1.4835,
"step": 44
},
{
"epoch": 0.6063157894736843,
"grad_norm": 0.6350681185722351,
"learning_rate": 0.00011666776747647015,
"loss": 1.3993,
"step": 45
},
{
"epoch": 0.6197894736842106,
"grad_norm": 0.6066803336143494,
"learning_rate": 0.00011418739235369615,
"loss": 1.3974,
"step": 46
},
{
"epoch": 0.6332631578947369,
"grad_norm": 0.5659217238426208,
"learning_rate": 0.00011164659311227163,
"loss": 1.3268,
"step": 47
},
{
"epoch": 0.6467368421052632,
"grad_norm": 0.5807419419288635,
"learning_rate": 0.00010904928748046599,
"loss": 1.3408,
"step": 48
},
{
"epoch": 0.6602105263157895,
"grad_norm": 0.6375626921653748,
"learning_rate": 0.0001063994803153071,
"loss": 1.4854,
"step": 49
},
{
"epoch": 0.6736842105263158,
"grad_norm": 0.6418893933296204,
"learning_rate": 0.00010370125742738173,
"loss": 1.3679,
"step": 50
},
{
"epoch": 0.6736842105263158,
"eval_loss": 1.499872088432312,
"eval_runtime": 8.4304,
"eval_samples_per_second": 29.654,
"eval_steps_per_second": 14.827,
"step": 50
},
{
"epoch": 0.6871578947368421,
"grad_norm": 0.7283876538276672,
"learning_rate": 0.00010095877928081196,
"loss": 1.4891,
"step": 51
},
{
"epoch": 0.7006315789473684,
"grad_norm": 0.5404418706893921,
"learning_rate": 9.817627457812105e-05,
"loss": 1.0949,
"step": 52
},
{
"epoch": 0.7141052631578947,
"grad_norm": 0.6369166970252991,
"learning_rate": 9.535803373988056e-05,
"loss": 1.5966,
"step": 53
},
{
"epoch": 0.7275789473684211,
"grad_norm": 0.8547583818435669,
"learning_rate": 9.25084022891929e-05,
"loss": 1.3016,
"step": 54
},
{
"epoch": 0.7410526315789474,
"grad_norm": 0.6199703812599182,
"learning_rate": 8.963177415120962e-05,
"loss": 1.5274,
"step": 55
},
{
"epoch": 0.7545263157894737,
"grad_norm": 0.6474433541297913,
"learning_rate": 8.673258487801731e-05,
"loss": 1.3812,
"step": 56
},
{
"epoch": 0.768,
"grad_norm": 0.5689646601676941,
"learning_rate": 8.381530480933783e-05,
"loss": 1.4005,
"step": 57
},
{
"epoch": 0.7814736842105263,
"grad_norm": 0.5995835661888123,
"learning_rate": 8.088443217958837e-05,
"loss": 1.2694,
"step": 58
},
{
"epoch": 0.7949473684210526,
"grad_norm": 0.5566097497940063,
"learning_rate": 7.794448618193015e-05,
"loss": 1.4341,
"step": 59
},
{
"epoch": 0.8084210526315789,
"grad_norm": 0.6118280291557312,
"learning_rate": 7.5e-05,
"loss": 1.378,
"step": 60
},
{
"epoch": 0.8218947368421052,
"grad_norm": 0.5662732124328613,
"learning_rate": 7.205551381806987e-05,
"loss": 1.4138,
"step": 61
},
{
"epoch": 0.8353684210526315,
"grad_norm": 0.5312877893447876,
"learning_rate": 6.911556782041163e-05,
"loss": 1.4356,
"step": 62
},
{
"epoch": 0.8488421052631578,
"grad_norm": 0.5712493062019348,
"learning_rate": 6.618469519066217e-05,
"loss": 1.4667,
"step": 63
},
{
"epoch": 0.8623157894736843,
"grad_norm": 0.5462284684181213,
"learning_rate": 6.326741512198266e-05,
"loss": 1.4505,
"step": 64
},
{
"epoch": 0.8757894736842106,
"grad_norm": 0.6210593581199646,
"learning_rate": 6.036822584879038e-05,
"loss": 1.2947,
"step": 65
},
{
"epoch": 0.8892631578947369,
"grad_norm": 0.6453770399093628,
"learning_rate": 5.7491597710807114e-05,
"loss": 1.3575,
"step": 66
},
{
"epoch": 0.9027368421052632,
"grad_norm": 0.6173303127288818,
"learning_rate": 5.464196626011943e-05,
"loss": 1.3685,
"step": 67
},
{
"epoch": 0.9162105263157895,
"grad_norm": 0.6161783933639526,
"learning_rate": 5.182372542187895e-05,
"loss": 1.5084,
"step": 68
},
{
"epoch": 0.9296842105263158,
"grad_norm": 0.5926702618598938,
"learning_rate": 4.904122071918801e-05,
"loss": 1.5106,
"step": 69
},
{
"epoch": 0.9431578947368421,
"grad_norm": 0.7311588525772095,
"learning_rate": 4.6298742572618266e-05,
"loss": 1.3789,
"step": 70
},
{
"epoch": 0.9566315789473684,
"grad_norm": 0.5569392442703247,
"learning_rate": 4.360051968469291e-05,
"loss": 1.2037,
"step": 71
},
{
"epoch": 0.9701052631578947,
"grad_norm": 0.49740126729011536,
"learning_rate": 4.095071251953399e-05,
"loss": 1.3472,
"step": 72
},
{
"epoch": 0.983578947368421,
"grad_norm": 0.5706843733787537,
"learning_rate": 3.83534068877284e-05,
"loss": 1.4041,
"step": 73
},
{
"epoch": 0.9970526315789474,
"grad_norm": 0.5967234373092651,
"learning_rate": 3.5812607646303834e-05,
"loss": 1.31,
"step": 74
},
{
"epoch": 1.0105263157894737,
"grad_norm": 1.052331566810608,
"learning_rate": 3.333223252352985e-05,
"loss": 2.0664,
"step": 75
},
{
"epoch": 1.024,
"grad_norm": 0.6153193712234497,
"learning_rate": 3.091610607806452e-05,
"loss": 1.516,
"step": 76
},
{
"epoch": 1.0374736842105263,
"grad_norm": 0.5821354389190674,
"learning_rate": 2.856795380176244e-05,
"loss": 1.2732,
"step": 77
},
{
"epoch": 1.0509473684210526,
"grad_norm": 0.6261878609657288,
"learning_rate": 2.6291396375236232e-05,
"loss": 1.2817,
"step": 78
},
{
"epoch": 1.064421052631579,
"grad_norm": 0.5795064568519592,
"learning_rate": 2.4089944085029363e-05,
"loss": 1.3216,
"step": 79
},
{
"epoch": 1.0778947368421052,
"grad_norm": 0.5134410262107849,
"learning_rate": 2.1966991411008938e-05,
"loss": 1.2917,
"step": 80
},
{
"epoch": 1.0913684210526315,
"grad_norm": 0.6312588453292847,
"learning_rate": 1.99258117923236e-05,
"loss": 1.1945,
"step": 81
},
{
"epoch": 1.1048421052631578,
"grad_norm": 0.5317590832710266,
"learning_rate": 1.796955257999768e-05,
"loss": 1.2838,
"step": 82
},
{
"epoch": 1.1183157894736842,
"grad_norm": 0.6899747252464294,
"learning_rate": 1.6101230183944144e-05,
"loss": 1.3589,
"step": 83
},
{
"epoch": 1.1317894736842105,
"grad_norm": 0.5604771971702576,
"learning_rate": 1.4323725421878949e-05,
"loss": 1.1972,
"step": 84
},
{
"epoch": 1.1452631578947368,
"grad_norm": 0.6752080917358398,
"learning_rate": 1.2639779077309098e-05,
"loss": 1.5739,
"step": 85
},
{
"epoch": 1.158736842105263,
"grad_norm": 0.6165206432342529,
"learning_rate": 1.1051987673443085e-05,
"loss": 1.2631,
"step": 86
},
{
"epoch": 1.1722105263157894,
"grad_norm": 0.61894291639328,
"learning_rate": 9.56279946954021e-06,
"loss": 1.3167,
"step": 87
},
{
"epoch": 1.1856842105263157,
"grad_norm": 0.647057294845581,
"learning_rate": 8.174510685872415e-06,
"loss": 1.3185,
"step": 88
},
{
"epoch": 1.1991578947368422,
"grad_norm": 0.5210772752761841,
"learning_rate": 6.889261963118898e-06,
"loss": 1.2242,
"step": 89
},
{
"epoch": 1.2126315789473685,
"grad_norm": 0.5150516033172607,
"learning_rate": 5.709035061653494e-06,
"loss": 1.2346,
"step": 90
},
{
"epoch": 1.2261052631578948,
"grad_norm": 0.6364325284957886,
"learning_rate": 4.635649805813696e-06,
"loss": 1.3149,
"step": 91
},
{
"epoch": 1.2395789473684211,
"grad_norm": 0.6105322241783142,
"learning_rate": 3.670761277863485e-06,
"loss": 1.2594,
"step": 92
},
{
"epoch": 1.2530526315789474,
"grad_norm": 0.6162554621696472,
"learning_rate": 2.815857265976462e-06,
"loss": 1.277,
"step": 93
},
{
"epoch": 1.2665263157894737,
"grad_norm": 0.5331520438194275,
"learning_rate": 2.072255970174258e-06,
"loss": 1.2193,
"step": 94
},
{
"epoch": 1.28,
"grad_norm": 0.5716810822486877,
"learning_rate": 1.4411039697577175e-06,
"loss": 1.4455,
"step": 95
},
{
"epoch": 1.2934736842105263,
"grad_norm": 0.56926029920578,
"learning_rate": 9.233744553646754e-07,
"loss": 1.291,
"step": 96
},
{
"epoch": 1.3069473684210526,
"grad_norm": 0.5255064964294434,
"learning_rate": 5.198657283805279e-07,
"loss": 1.2986,
"step": 97
},
{
"epoch": 1.320421052631579,
"grad_norm": 0.5318115949630737,
"learning_rate": 2.311999700154027e-07,
"loss": 1.2152,
"step": 98
},
{
"epoch": 1.3338947368421052,
"grad_norm": 0.5788165330886841,
"learning_rate": 5.7822281945782424e-08,
"loss": 1.4851,
"step": 99
},
{
"epoch": 1.3473684210526315,
"grad_norm": 0.5943503379821777,
"learning_rate": 0.0,
"loss": 1.1959,
"step": 100
},
{
"epoch": 1.3473684210526315,
"eval_loss": 1.457924246788025,
"eval_runtime": 8.4236,
"eval_samples_per_second": 29.678,
"eval_steps_per_second": 14.839,
"step": 100
}
],
"logging_steps": 1,
"max_steps": 100,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 9,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 8.33416392081408e+16,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}