Training in progress, step 3000, checkpoint
{
"best_metric": 0.7818862795829773,
"best_model_checkpoint": "miner_id_24/checkpoint-2700",
"epoch": 4.452606195511037,
"eval_steps": 150,
"global_step": 3000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0014839547393804488,
"eval_loss": 1.1009923219680786,
"eval_runtime": 14.0062,
"eval_samples_per_second": 40.554,
"eval_steps_per_second": 20.277,
"step": 1
},
{
"epoch": 0.07419773696902245,
"grad_norm": 0.1295081079006195,
"learning_rate": 0.0002,
"loss": 1.0386,
"step": 50
},
{
"epoch": 0.1483954739380449,
"grad_norm": 0.1271422952413559,
"learning_rate": 0.0001998582695676762,
"loss": 0.9538,
"step": 100
},
{
"epoch": 0.22259321090706732,
"grad_norm": 0.12388234585523605,
"learning_rate": 0.00019943348002101371,
"loss": 0.9411,
"step": 150
},
{
"epoch": 0.22259321090706732,
"eval_loss": 0.9268936514854431,
"eval_runtime": 14.0765,
"eval_samples_per_second": 40.351,
"eval_steps_per_second": 20.175,
"step": 150
},
{
"epoch": 0.2967909478760898,
"grad_norm": 0.13258318603038788,
"learning_rate": 0.00019872683547213446,
"loss": 0.9297,
"step": 200
},
{
"epoch": 0.3709886848451122,
"grad_norm": 0.1404084414243698,
"learning_rate": 0.00019774033898178667,
"loss": 0.9078,
"step": 250
},
{
"epoch": 0.44518642181413465,
"grad_norm": 0.14087454974651337,
"learning_rate": 0.0001964767868814516,
"loss": 0.9052,
"step": 300
},
{
"epoch": 0.44518642181413465,
"eval_loss": 0.8921840786933899,
"eval_runtime": 14.2434,
"eval_samples_per_second": 39.878,
"eval_steps_per_second": 19.939,
"step": 300
},
{
"epoch": 0.5193841587831571,
"grad_norm": 0.14395581185817719,
"learning_rate": 0.00019493976084683813,
"loss": 0.8842,
"step": 350
},
{
"epoch": 0.5935818957521796,
"grad_norm": 0.13259702920913696,
"learning_rate": 0.00019313361774523385,
"loss": 0.8869,
"step": 400
},
{
"epoch": 0.667779632721202,
"grad_norm": 0.141865536570549,
"learning_rate": 0.00019106347728549135,
"loss": 0.8705,
"step": 450
},
{
"epoch": 0.667779632721202,
"eval_loss": 0.8677888512611389,
"eval_runtime": 14.1021,
"eval_samples_per_second": 40.278,
"eval_steps_per_second": 20.139,
"step": 450
},
{
"epoch": 0.7419773696902244,
"grad_norm": 0.1400303989648819,
"learning_rate": 0.00018873520750565718,
"loss": 0.8765,
"step": 500
},
{
"epoch": 0.8161751066592469,
"grad_norm": 0.1415148228406906,
"learning_rate": 0.0001861554081393806,
"loss": 0.8718,
"step": 550
},
{
"epoch": 0.8903728436282693,
"grad_norm": 0.1443643420934677,
"learning_rate": 0.0001833313919082515,
"loss": 0.8544,
"step": 600
},
{
"epoch": 0.8903728436282693,
"eval_loss": 0.8513836860656738,
"eval_runtime": 14.4561,
"eval_samples_per_second": 39.291,
"eval_steps_per_second": 19.646,
"step": 600
},
{
"epoch": 0.9645705805972918,
"grad_norm": 0.15358565747737885,
"learning_rate": 0.00018027116379309638,
"loss": 0.855,
"step": 650
},
{
"epoch": 1.0389538119087367,
"grad_norm": 0.14816033840179443,
"learning_rate": 0.00017698339834299061,
"loss": 0.8524,
"step": 700
},
{
"epoch": 1.1131515488777592,
"grad_norm": 0.1639455258846283,
"learning_rate": 0.00017347741508630672,
"loss": 0.8167,
"step": 750
},
{
"epoch": 1.1131515488777592,
"eval_loss": 0.8397712111473083,
"eval_runtime": 14.3467,
"eval_samples_per_second": 39.591,
"eval_steps_per_second": 19.796,
"step": 750
},
{
"epoch": 1.1873492858467816,
"grad_norm": 0.1472490429878235,
"learning_rate": 0.0001697631521134985,
"loss": 0.8086,
"step": 800
},
{
"epoch": 1.261547022815804,
"grad_norm": 0.16415859758853912,
"learning_rate": 0.00016585113790650388,
"loss": 0.802,
"step": 850
},
{
"epoch": 1.3357447597848267,
"grad_norm": 0.1668313443660736,
"learning_rate": 0.0001617524614946192,
"loss": 0.7961,
"step": 900
},
{
"epoch": 1.3357447597848267,
"eval_loss": 0.8281386494636536,
"eval_runtime": 13.996,
"eval_samples_per_second": 40.583,
"eval_steps_per_second": 20.292,
"step": 900
},
{
"epoch": 1.409942496753849,
"grad_norm": 0.1581781506538391,
"learning_rate": 0.0001574787410214407,
"loss": 0.8142,
"step": 950
},
{
"epoch": 1.4841402337228715,
"grad_norm": 0.16410471498966217,
"learning_rate": 0.00015304209081197425,
"loss": 0.8208,
"step": 1000
},
{
"epoch": 1.5583379706918938,
"grad_norm": 0.16278082132339478,
"learning_rate": 0.00014845508703326504,
"loss": 0.7965,
"step": 1050
},
{
"epoch": 1.5583379706918938,
"eval_loss": 0.8185766339302063,
"eval_runtime": 14.096,
"eval_samples_per_second": 40.295,
"eval_steps_per_second": 20.148,
"step": 1050
},
{
"epoch": 1.6325357076609164,
"grad_norm": 0.1645856499671936,
"learning_rate": 0.00014373073204588556,
"loss": 0.796,
"step": 1100
},
{
"epoch": 1.706733444629939,
"grad_norm": 0.16889511048793793,
"learning_rate": 0.00013888241754733208,
"loss": 0.7972,
"step": 1150
},
{
"epoch": 1.7809311815989612,
"grad_norm": 0.16280411183834076,
"learning_rate": 0.00013392388661180303,
"loss": 0.7986,
"step": 1200
},
{
"epoch": 1.7809311815989612,
"eval_loss": 0.8099638223648071,
"eval_runtime": 14.0445,
"eval_samples_per_second": 40.443,
"eval_steps_per_second": 20.221,
"step": 1200
},
{
"epoch": 1.8551289185679836,
"grad_norm": 0.16928894817829132,
"learning_rate": 0.0001288691947339621,
"loss": 0.7992,
"step": 1250
},
{
"epoch": 1.929326655537006,
"grad_norm": 0.16871026158332825,
"learning_rate": 0.0001237326699871115,
"loss": 0.8071,
"step": 1300
},
{
"epoch": 2.0037098868484513,
"grad_norm": 0.1630755364894867,
"learning_rate": 0.00011852887240871145,
"loss": 0.7548,
"step": 1350
},
{
"epoch": 2.0037098868484513,
"eval_loss": 0.8026086688041687,
"eval_runtime": 14.0517,
"eval_samples_per_second": 40.422,
"eval_steps_per_second": 20.211,
"step": 1350
},
{
"epoch": 2.0779076238174734,
"grad_norm": 0.16451841592788696,
"learning_rate": 0.00011327255272837221,
"loss": 0.7476,
"step": 1400
},
{
"epoch": 2.152105360786496,
"grad_norm": 0.18384945392608643,
"learning_rate": 0.00010797861055530831,
"loss": 0.7691,
"step": 1450
},
{
"epoch": 2.2263030977555185,
"grad_norm": 0.18934905529022217,
"learning_rate": 0.00010266205214377748,
"loss": 0.7409,
"step": 1500
},
{
"epoch": 2.2263030977555185,
"eval_loss": 0.7997989654541016,
"eval_runtime": 13.98,
"eval_samples_per_second": 40.629,
"eval_steps_per_second": 20.315,
"step": 1500
},
{
"epoch": 2.300500834724541,
"grad_norm": 0.18251581490039825,
"learning_rate": 9.733794785622253e-05,
"loss": 0.7668,
"step": 1550
},
{
"epoch": 2.374698571693563,
"grad_norm": 0.1793128401041031,
"learning_rate": 9.202138944469168e-05,
"loss": 0.7456,
"step": 1600
},
{
"epoch": 2.4488963086625857,
"grad_norm": 0.17612189054489136,
"learning_rate": 8.672744727162781e-05,
"loss": 0.7508,
"step": 1650
},
{
"epoch": 2.4488963086625857,
"eval_loss": 0.7954422831535339,
"eval_runtime": 14.3584,
"eval_samples_per_second": 39.559,
"eval_steps_per_second": 19.779,
"step": 1650
},
{
"epoch": 2.523094045631608,
"grad_norm": 0.19443200528621674,
"learning_rate": 8.147112759128859e-05,
"loss": 0.7682,
"step": 1700
},
{
"epoch": 2.5972917826006308,
"grad_norm": 0.1849536895751953,
"learning_rate": 7.626733001288851e-05,
"loss": 0.7461,
"step": 1750
},
{
"epoch": 2.6714895195696533,
"grad_norm": 0.203544482588768,
"learning_rate": 7.113080526603792e-05,
"loss": 0.7606,
"step": 1800
},
{
"epoch": 2.6714895195696533,
"eval_loss": 0.7908361554145813,
"eval_runtime": 14.0191,
"eval_samples_per_second": 40.516,
"eval_steps_per_second": 20.258,
"step": 1800
},
{
"epoch": 2.745687256538676,
"grad_norm": 0.18425534665584564,
"learning_rate": 6.607611338819697e-05,
"loss": 0.7399,
"step": 1850
},
{
"epoch": 2.819884993507698,
"grad_norm": 0.18944227695465088,
"learning_rate": 6.111758245266794e-05,
"loss": 0.7481,
"step": 1900
},
{
"epoch": 2.8940827304767205,
"grad_norm": 0.17936797440052032,
"learning_rate": 5.626926795411447e-05,
"loss": 0.7366,
"step": 1950
},
{
"epoch": 2.8940827304767205,
"eval_loss": 0.7873192429542542,
"eval_runtime": 14.1781,
"eval_samples_per_second": 40.062,
"eval_steps_per_second": 20.031,
"step": 1950
},
{
"epoch": 2.968280467445743,
"grad_norm": 0.19529031217098236,
"learning_rate": 5.1544912966734994e-05,
"loss": 0.7521,
"step": 2000
},
{
"epoch": 3.042663698757188,
"grad_norm": 0.1839251071214676,
"learning_rate": 4.695790918802576e-05,
"loss": 0.76,
"step": 2050
},
{
"epoch": 3.1168614357262103,
"grad_norm": 0.18065090477466583,
"learning_rate": 4.252125897855932e-05,
"loss": 0.7173,
"step": 2100
},
{
"epoch": 3.1168614357262103,
"eval_loss": 0.7872137427330017,
"eval_runtime": 14.1181,
"eval_samples_per_second": 40.232,
"eval_steps_per_second": 20.116,
"step": 2100
},
{
"epoch": 3.191059172695233,
"grad_norm": 0.18571385741233826,
"learning_rate": 3.824753850538082e-05,
"loss": 0.7305,
"step": 2150
},
{
"epoch": 3.2652569096642554,
"grad_norm": 0.182917520403862,
"learning_rate": 3.414886209349615e-05,
"loss": 0.7339,
"step": 2200
},
{
"epoch": 3.3394546466332775,
"grad_norm": 0.19343669712543488,
"learning_rate": 3.0236847886501542e-05,
"loss": 0.7294,
"step": 2250
},
{
"epoch": 3.3394546466332775,
"eval_loss": 0.7847491502761841,
"eval_runtime": 14.0352,
"eval_samples_per_second": 40.47,
"eval_steps_per_second": 20.235,
"step": 2250
},
{
"epoch": 3.4136523836023,
"grad_norm": 0.19025780260562897,
"learning_rate": 2.6522584913693294e-05,
"loss": 0.7251,
"step": 2300
},
{
"epoch": 3.4878501205713226,
"grad_norm": 0.19098390638828278,
"learning_rate": 2.301660165700936e-05,
"loss": 0.7128,
"step": 2350
},
{
"epoch": 3.562047857540345,
"grad_norm": 0.1902209222316742,
"learning_rate": 1.9728836206903656e-05,
"loss": 0.7282,
"step": 2400
},
{
"epoch": 3.562047857540345,
"eval_loss": 0.7834780216217041,
"eval_runtime": 13.9633,
"eval_samples_per_second": 40.678,
"eval_steps_per_second": 20.339,
"step": 2400
},
{
"epoch": 3.6362455945093677,
"grad_norm": 0.19502930343151093,
"learning_rate": 1.6668608091748495e-05,
"loss": 0.7328,
"step": 2450
},
{
"epoch": 3.71044333147839,
"grad_norm": 0.18348978459835052,
"learning_rate": 1.3844591860619383e-05,
"loss": 0.7293,
"step": 2500
},
{
"epoch": 3.7846410684474123,
"grad_norm": 0.18518677353858948,
"learning_rate": 1.1264792494342857e-05,
"loss": 0.72,
"step": 2550
},
{
"epoch": 3.7846410684474123,
"eval_loss": 0.7825514078140259,
"eval_runtime": 14.1713,
"eval_samples_per_second": 40.081,
"eval_steps_per_second": 20.041,
"step": 2550
},
{
"epoch": 3.858838805416435,
"grad_norm": 0.1809045374393463,
"learning_rate": 8.936522714508678e-06,
"loss": 0.7246,
"step": 2600
},
{
"epoch": 3.933036542385457,
"grad_norm": 0.18573115766048431,
"learning_rate": 6.866382254766157e-06,
"loss": 0.724,
"step": 2650
},
{
"epoch": 4.007419773696903,
"grad_norm": 0.1879529505968094,
"learning_rate": 5.060239153161872e-06,
"loss": 0.7085,
"step": 2700
},
{
"epoch": 4.007419773696903,
"eval_loss": 0.7818862795829773,
"eval_runtime": 14.0534,
"eval_samples_per_second": 40.417,
"eval_steps_per_second": 20.209,
"step": 2700
},
{
"epoch": 4.081617510665924,
"grad_norm": 0.18808983266353607,
"learning_rate": 3.5232131185484076e-06,
"loss": 0.7197,
"step": 2750
},
{
"epoch": 4.155815247634947,
"grad_norm": 0.1956692934036255,
"learning_rate": 2.259661018213333e-06,
"loss": 0.7152,
"step": 2800
},
{
"epoch": 4.230012984603969,
"grad_norm": 0.1999446451663971,
"learning_rate": 1.2731645278655445e-06,
"loss": 0.7127,
"step": 2850
},
{
"epoch": 4.230012984603969,
"eval_loss": 0.7822389006614685,
"eval_runtime": 14.2177,
"eval_samples_per_second": 39.95,
"eval_steps_per_second": 19.975,
"step": 2850
},
{
"epoch": 4.304210721572992,
"grad_norm": 0.1753046214580536,
"learning_rate": 5.665199789862907e-07,
"loss": 0.7241,
"step": 2900
},
{
"epoch": 4.378408458542014,
"grad_norm": 0.19707240164279938,
"learning_rate": 1.4173043232380557e-07,
"loss": 0.7269,
"step": 2950
},
{
"epoch": 4.452606195511037,
"grad_norm": 0.1791616678237915,
"learning_rate": 0.0,
"loss": 0.7119,
"step": 3000
},
{
"epoch": 4.452606195511037,
"eval_loss": 0.7823454141616821,
"eval_runtime": 14.0926,
"eval_samples_per_second": 40.305,
"eval_steps_per_second": 20.152,
"step": 3000
}
],
"logging_steps": 50,
"max_steps": 3000,
"num_input_tokens_seen": 0,
"num_train_epochs": 5,
"save_steps": 150,
"stateful_callbacks": {
"EarlyStoppingCallback": {
"args": {
"early_stopping_patience": 3,
"early_stopping_threshold": 0.0
},
"attributes": {
"early_stopping_patience_counter": 2
}
},
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 3.4201657147392e+16,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}
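
A trainer_state.json like the one above can be inspected offline without loading the model. The sketch below is illustrative, not part of the checkpoint: it assumes the file has been saved locally under the name "trainer_state.json" (the path is an assumption), splits log_history into training and evaluation records, and reports the best checkpoint, the final evaluation loss, and the early-stopping status recorded by the callbacks.

```python
# Minimal sketch for inspecting this checkpoint's trainer_state.json.
# The local path "trainer_state.json" is an assumption; point it at wherever
# the file shown above was downloaded.
import json

with open("trainer_state.json") as f:
    state = json.load(f)

# log_history mixes training records (keys "loss"/"learning_rate") and
# evaluation records (key "eval_loss"), so split them by key.
train_log = [e for e in state["log_history"] if "loss" in e]
eval_log = [e for e in state["log_history"] if "eval_loss" in e]

print(f"best checkpoint : {state['best_model_checkpoint']}")
print(f"best eval_loss  : {state['best_metric']:.4f}")
print(f"final eval_loss : {eval_log[-1]['eval_loss']:.4f} at step {eval_log[-1]['step']}")

# Early-stopping status as recorded: the patience counter sits at 2 of 3
# while global_step equals max_steps, i.e. the run hit its step limit.
es = state["stateful_callbacks"]["EarlyStoppingCallback"]
print(f"patience counter: {es['attributes']['early_stopping_patience_counter']}"
      f" / {es['args']['early_stopping_patience']}")
print(f"stopped at step : {state['global_step']} of {state['max_steps']}")
```

In this run the best eval_loss (0.7819) was reached at checkpoint-2700; the two later evaluations at steps 2850 and 3000 did not improve on it, which is why the early-stopping patience counter stands at 2 when training ended at the 3000-step limit.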