{
"best_global_step": 3000,
"best_metric": 3.8677153542659872,
"best_model_checkpoint": "checkpoints/gpt2_sparse_moe_wiki/checkpoint-3000",
"epoch": 0.45211788973974965,
"eval_steps": 300,
"global_step": 3000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.007535298162329161,
"grad_norm": 1.240992784500122,
"learning_rate": 8.166666666666667e-05,
"loss": 9.6019,
"step": 50
},
{
"epoch": 0.015070596324658321,
"grad_norm": 0.5122302770614624,
"learning_rate": 0.000165,
"loss": 7.4964,
"step": 100
},
{
"epoch": 0.022605894486987483,
"grad_norm": 0.39637282490730286,
"learning_rate": 0.0002483333333333333,
"loss": 6.708,
"step": 150
},
{
"epoch": 0.030141192649316643,
"grad_norm": 0.7033893465995789,
"learning_rate": 0.0003316666666666667,
"loss": 6.2189,
"step": 200
},
{
"epoch": 0.037676490811645806,
"grad_norm": 0.455392062664032,
"learning_rate": 0.000415,
"loss": 5.8864,
"step": 250
},
{
"epoch": 0.045211788973974966,
"grad_norm": 0.6971898674964905,
"learning_rate": 0.0004983333333333334,
"loss": 5.6098,
"step": 300
},
{
"epoch": 0.045211788973974966,
"eval_loss": 5.4563346231373995,
"eval_perplexity": 234.23728104322976,
"eval_runtime": 41.896,
"eval_samples_per_second": 39.598,
"eval_steps_per_second": 0.621,
"step": 300
},
{
"epoch": 0.052747087136304126,
"grad_norm": 0.5648893117904663,
"learning_rate": 0.0004909259259259259,
"loss": 5.3778,
"step": 350
},
{
"epoch": 0.060282385298633286,
"grad_norm": 0.37668126821517944,
"learning_rate": 0.0004816666666666667,
"loss": 5.193,
"step": 400
},
{
"epoch": 0.06781768346096245,
"grad_norm": 0.5065845251083374,
"learning_rate": 0.0004724074074074074,
"loss": 5.0518,
"step": 450
},
{
"epoch": 0.07535298162329161,
"grad_norm": 0.33823442459106445,
"learning_rate": 0.00046314814814814813,
"loss": 4.9444,
"step": 500
},
{
"epoch": 0.08288827978562077,
"grad_norm": 0.4519856870174408,
"learning_rate": 0.00045388888888888893,
"loss": 4.8523,
"step": 550
},
{
"epoch": 0.09042357794794993,
"grad_norm": 0.44410082697868347,
"learning_rate": 0.00044462962962962967,
"loss": 4.7631,
"step": 600
},
{
"epoch": 0.09042357794794993,
"eval_loss": 4.637702769248809,
"eval_perplexity": 103.30675533050186,
"eval_runtime": 41.6331,
"eval_samples_per_second": 39.848,
"eval_steps_per_second": 0.625,
"step": 600
},
{
"epoch": 0.09795887611027909,
"grad_norm": 0.6111795902252197,
"learning_rate": 0.00043537037037037036,
"loss": 4.663,
"step": 650
},
{
"epoch": 0.10549417427260825,
"grad_norm": 0.42752805352211,
"learning_rate": 0.0004261111111111111,
"loss": 4.5909,
"step": 700
},
{
"epoch": 0.11302947243493741,
"grad_norm": 0.43343669176101685,
"learning_rate": 0.00041685185185185184,
"loss": 4.5207,
"step": 750
},
{
"epoch": 0.12056477059726657,
"grad_norm": 0.3778446316719055,
"learning_rate": 0.00040759259259259264,
"loss": 4.4668,
"step": 800
},
{
"epoch": 0.12810006875959573,
"grad_norm": 0.3948766589164734,
"learning_rate": 0.00039833333333333333,
"loss": 4.4229,
"step": 850
},
{
"epoch": 0.1356353669219249,
"grad_norm": 0.4046098291873932,
"learning_rate": 0.00038907407407407407,
"loss": 4.3883,
"step": 900
},
{
"epoch": 0.1356353669219249,
"eval_loss": 4.284032106708454,
"eval_perplexity": 72.53230919853132,
"eval_runtime": 42.8892,
"eval_samples_per_second": 38.681,
"eval_steps_per_second": 0.606,
"step": 900
},
{
"epoch": 0.14317066508425405,
"grad_norm": 0.4698060154914856,
"learning_rate": 0.0003798148148148148,
"loss": 4.3496,
"step": 950
},
{
"epoch": 0.15070596324658322,
"grad_norm": 0.5078156590461731,
"learning_rate": 0.0003705555555555556,
"loss": 4.3199,
"step": 1000
},
{
"epoch": 0.15824126140891237,
"grad_norm": 0.3680674135684967,
"learning_rate": 0.0003612962962962963,
"loss": 4.2949,
"step": 1050
},
{
"epoch": 0.16577655957124154,
"grad_norm": 0.4388742446899414,
"learning_rate": 0.00035203703703703704,
"loss": 4.266,
"step": 1100
},
{
"epoch": 0.1733118577335707,
"grad_norm": 0.4389984905719757,
"learning_rate": 0.0003427777777777778,
"loss": 4.2434,
"step": 1150
},
{
"epoch": 0.18084715589589986,
"grad_norm": 0.403689980506897,
"learning_rate": 0.0003335185185185185,
"loss": 4.2214,
"step": 1200
},
{
"epoch": 0.18084715589589986,
"eval_loss": 4.133626098675609,
"eval_perplexity": 62.403795489360064,
"eval_runtime": 42.5402,
"eval_samples_per_second": 38.998,
"eval_steps_per_second": 0.611,
"step": 1200
},
{
"epoch": 0.188382454058229,
"grad_norm": 0.5109913349151611,
"learning_rate": 0.00032425925925925927,
"loss": 4.2004,
"step": 1250
},
{
"epoch": 0.19591775222055818,
"grad_norm": 0.3623785972595215,
"learning_rate": 0.000315,
"loss": 4.1899,
"step": 1300
},
{
"epoch": 0.20345305038288733,
"grad_norm": 0.4568081796169281,
"learning_rate": 0.00030574074074074076,
"loss": 4.1702,
"step": 1350
},
{
"epoch": 0.2109883485452165,
"grad_norm": 0.3451692461967468,
"learning_rate": 0.00029648148148148144,
"loss": 4.1573,
"step": 1400
},
{
"epoch": 0.21852364670754565,
"grad_norm": 0.3655514419078827,
"learning_rate": 0.00028722222222222224,
"loss": 4.1367,
"step": 1450
},
{
"epoch": 0.22605894486987482,
"grad_norm": 0.3625420928001404,
"learning_rate": 0.000277962962962963,
"loss": 4.1268,
"step": 1500
},
{
"epoch": 0.22605894486987482,
"eval_loss": 4.0427056376487265,
"eval_perplexity": 56.98030248363226,
"eval_runtime": 42.8702,
"eval_samples_per_second": 38.698,
"eval_steps_per_second": 0.606,
"step": 1500
},
{
"epoch": 0.23359424303220397,
"grad_norm": 0.4124140739440918,
"learning_rate": 0.0002687037037037037,
"loss": 4.1176,
"step": 1550
},
{
"epoch": 0.24112954119453314,
"grad_norm": 0.3373737037181854,
"learning_rate": 0.0002594444444444444,
"loss": 4.1092,
"step": 1600
},
{
"epoch": 0.2486648393568623,
"grad_norm": 0.3414556086063385,
"learning_rate": 0.00025018518518518516,
"loss": 4.092,
"step": 1650
},
{
"epoch": 0.25620013751919146,
"grad_norm": 0.30866968631744385,
"learning_rate": 0.00024092592592592593,
"loss": 4.0841,
"step": 1700
},
{
"epoch": 0.26373543568152064,
"grad_norm": 0.31764939427375793,
"learning_rate": 0.00023166666666666667,
"loss": 4.0764,
"step": 1750
},
{
"epoch": 0.2712707338438498,
"grad_norm": 0.42184340953826904,
"learning_rate": 0.0002224074074074074,
"loss": 4.0666,
"step": 1800
},
{
"epoch": 0.2712707338438498,
"eval_loss": 3.9816309624713564,
"eval_perplexity": 53.604389719294815,
"eval_runtime": 42.5475,
"eval_samples_per_second": 38.992,
"eval_steps_per_second": 0.611,
"step": 1800
},
{
"epoch": 0.2788060320061789,
"grad_norm": 0.3581269085407257,
"learning_rate": 0.00021314814814814815,
"loss": 4.0579,
"step": 1850
},
{
"epoch": 0.2863413301685081,
"grad_norm": 0.33631259202957153,
"learning_rate": 0.0002038888888888889,
"loss": 4.0557,
"step": 1900
},
{
"epoch": 0.2938766283308373,
"grad_norm": 0.32767254114151,
"learning_rate": 0.00019462962962962964,
"loss": 4.0429,
"step": 1950
},
{
"epoch": 0.30141192649316645,
"grad_norm": 0.3312157094478607,
"learning_rate": 0.00018537037037037038,
"loss": 4.0388,
"step": 2000
},
{
"epoch": 0.30894722465549557,
"grad_norm": 0.3220396339893341,
"learning_rate": 0.00017611111111111112,
"loss": 4.0294,
"step": 2050
},
{
"epoch": 0.31648252281782474,
"grad_norm": 0.3097658157348633,
"learning_rate": 0.00016685185185185187,
"loss": 4.0213,
"step": 2100
},
{
"epoch": 0.31648252281782474,
"eval_loss": 3.9343392453526693,
"eval_perplexity": 51.1283554944596,
"eval_runtime": 42.915,
"eval_samples_per_second": 38.658,
"eval_steps_per_second": 0.606,
"step": 2100
},
{
"epoch": 0.3240178209801539,
"grad_norm": 0.32263439893722534,
"learning_rate": 0.00015759259259259258,
"loss": 4.0182,
"step": 2150
},
{
"epoch": 0.3315531191424831,
"grad_norm": 0.3611871004104614,
"learning_rate": 0.00014833333333333335,
"loss": 4.0101,
"step": 2200
},
{
"epoch": 0.3390884173048122,
"grad_norm": 0.3431486189365387,
"learning_rate": 0.00013907407407407407,
"loss": 4.0014,
"step": 2250
},
{
"epoch": 0.3466237154671414,
"grad_norm": 0.3198365569114685,
"learning_rate": 0.00012981481481481484,
"loss": 3.9968,
"step": 2300
},
{
"epoch": 0.35415901362947055,
"grad_norm": 0.31708911061286926,
"learning_rate": 0.00012055555555555555,
"loss": 3.9873,
"step": 2350
},
{
"epoch": 0.3616943117917997,
"grad_norm": 0.3318186104297638,
"learning_rate": 0.0001112962962962963,
"loss": 3.9928,
"step": 2400
},
{
"epoch": 0.3616943117917997,
"eval_loss": 3.901613183105629,
"eval_perplexity": 49.4822086178559,
"eval_runtime": 42.7457,
"eval_samples_per_second": 38.811,
"eval_steps_per_second": 0.608,
"step": 2400
},
{
"epoch": 0.36922960995412885,
"grad_norm": 0.31225499510765076,
"learning_rate": 0.00010203703703703704,
"loss": 3.9794,
"step": 2450
},
{
"epoch": 0.376764908116458,
"grad_norm": 0.36178267002105713,
"learning_rate": 9.277777777777778e-05,
"loss": 3.9816,
"step": 2500
},
{
"epoch": 0.3843002062787872,
"grad_norm": 0.3168866038322449,
"learning_rate": 8.351851851851852e-05,
"loss": 3.9739,
"step": 2550
},
{
"epoch": 0.39183550444111637,
"grad_norm": 0.244058296084404,
"learning_rate": 7.425925925925927e-05,
"loss": 3.9716,
"step": 2600
},
{
"epoch": 0.39937080260344554,
"grad_norm": 0.25681817531585693,
"learning_rate": 6.500000000000001e-05,
"loss": 3.9694,
"step": 2650
},
{
"epoch": 0.40690610076577466,
"grad_norm": 0.24345779418945312,
"learning_rate": 5.5740740740740744e-05,
"loss": 3.9646,
"step": 2700
},
{
"epoch": 0.40690610076577466,
"eval_loss": 3.8781924804982686,
"eval_perplexity": 48.336766414632706,
"eval_runtime": 43.041,
"eval_samples_per_second": 38.545,
"eval_steps_per_second": 0.604,
"step": 2700
},
{
"epoch": 0.41444139892810383,
"grad_norm": 0.24125564098358154,
"learning_rate": 4.6481481481481486e-05,
"loss": 3.9617,
"step": 2750
},
{
"epoch": 0.421976697090433,
"grad_norm": 0.2191689908504486,
"learning_rate": 3.722222222222222e-05,
"loss": 3.9597,
"step": 2800
},
{
"epoch": 0.4295119952527622,
"grad_norm": 0.22993455827236176,
"learning_rate": 2.7962962962962965e-05,
"loss": 3.9603,
"step": 2850
},
{
"epoch": 0.4370472934150913,
"grad_norm": 0.2122599482536316,
"learning_rate": 1.8703703703703707e-05,
"loss": 3.956,
"step": 2900
},
{
"epoch": 0.44458259157742047,
"grad_norm": 0.2053080052137375,
"learning_rate": 9.444444444444445e-06,
"loss": 3.9582,
"step": 2950
},
{
"epoch": 0.45211788973974965,
"grad_norm": 0.19383326172828674,
"learning_rate": 1.8518518518518518e-07,
"loss": 3.9558,
"step": 3000
},
{
"epoch": 0.45211788973974965,
"eval_loss": 3.8677153542659872,
"eval_perplexity": 47.832979737936746,
"eval_runtime": 42.8133,
"eval_samples_per_second": 38.75,
"eval_steps_per_second": 0.607,
"step": 3000
}
],
"logging_steps": 50,
"max_steps": 3000,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 300,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 5.57669022695424e+17,
"train_batch_size": 32,
"trial_name": null,
"trial_params": null
}