{
  "best_global_step": 3000,
  "best_metric": 3.8677153542659872,
  "best_model_checkpoint": "checkpoints/gpt2_sparse_moe_wiki/checkpoint-3000",
  "epoch": 0.45211788973974965,
  "eval_steps": 300,
  "global_step": 3000,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.007535298162329161,
      "grad_norm": 1.240992784500122,
      "learning_rate": 8.166666666666667e-05,
      "loss": 9.6019,
      "step": 50
    },
    {
      "epoch": 0.015070596324658321,
      "grad_norm": 0.5122302770614624,
      "learning_rate": 0.000165,
      "loss": 7.4964,
      "step": 100
    },
    {
      "epoch": 0.022605894486987483,
      "grad_norm": 0.39637282490730286,
      "learning_rate": 0.0002483333333333333,
      "loss": 6.708,
      "step": 150
    },
    {
      "epoch": 0.030141192649316643,
      "grad_norm": 0.7033893465995789,
      "learning_rate": 0.0003316666666666667,
      "loss": 6.2189,
      "step": 200
    },
    {
      "epoch": 0.037676490811645806,
      "grad_norm": 0.455392062664032,
      "learning_rate": 0.000415,
      "loss": 5.8864,
      "step": 250
    },
    {
      "epoch": 0.045211788973974966,
      "grad_norm": 0.6971898674964905,
      "learning_rate": 0.0004983333333333334,
      "loss": 5.6098,
      "step": 300
    },
    {
      "epoch": 0.045211788973974966,
      "eval_loss": 5.4563346231373995,
      "eval_perplexity": 234.23728104322976,
      "eval_runtime": 41.896,
      "eval_samples_per_second": 39.598,
      "eval_steps_per_second": 0.621,
      "step": 300
    },
    {
      "epoch": 0.052747087136304126,
      "grad_norm": 0.5648893117904663,
      "learning_rate": 0.0004909259259259259,
      "loss": 5.3778,
      "step": 350
    },
    {
      "epoch": 0.060282385298633286,
      "grad_norm": 0.37668126821517944,
      "learning_rate": 0.0004816666666666667,
      "loss": 5.193,
      "step": 400
    },
    {
      "epoch": 0.06781768346096245,
      "grad_norm": 0.5065845251083374,
      "learning_rate": 0.0004724074074074074,
      "loss": 5.0518,
      "step": 450
    },
    {
      "epoch": 0.07535298162329161,
      "grad_norm": 0.33823442459106445,
      "learning_rate": 0.00046314814814814813,
      "loss": 4.9444,
      "step": 500
    },
    {
      "epoch": 0.08288827978562077,
      "grad_norm": 0.4519856870174408,
      "learning_rate": 0.00045388888888888893,
      "loss": 4.8523,
      "step": 550
    },
    {
      "epoch": 0.09042357794794993,
      "grad_norm": 0.44410082697868347,
      "learning_rate": 0.00044462962962962967,
      "loss": 4.7631,
      "step": 600
    },
    {
      "epoch": 0.09042357794794993,
      "eval_loss": 4.637702769248809,
      "eval_perplexity": 103.30675533050186,
      "eval_runtime": 41.6331,
      "eval_samples_per_second": 39.848,
      "eval_steps_per_second": 0.625,
      "step": 600
    },
    {
      "epoch": 0.09795887611027909,
      "grad_norm": 0.6111795902252197,
      "learning_rate": 0.00043537037037037036,
      "loss": 4.663,
      "step": 650
    },
    {
      "epoch": 0.10549417427260825,
      "grad_norm": 0.42752805352211,
      "learning_rate": 0.0004261111111111111,
      "loss": 4.5909,
      "step": 700
    },
    {
      "epoch": 0.11302947243493741,
      "grad_norm": 0.43343669176101685,
      "learning_rate": 0.00041685185185185184,
      "loss": 4.5207,
      "step": 750
    },
    {
      "epoch": 0.12056477059726657,
      "grad_norm": 0.3778446316719055,
      "learning_rate": 0.00040759259259259264,
      "loss": 4.4668,
      "step": 800
    },
    {
      "epoch": 0.12810006875959573,
      "grad_norm": 0.3948766589164734,
      "learning_rate": 0.00039833333333333333,
      "loss": 4.4229,
      "step": 850
    },
    {
      "epoch": 0.1356353669219249,
      "grad_norm": 0.4046098291873932,
      "learning_rate": 0.00038907407407407407,
      "loss": 4.3883,
      "step": 900
    },
    {
      "epoch": 0.1356353669219249,
      "eval_loss": 4.284032106708454,
      "eval_perplexity": 72.53230919853132,
      "eval_runtime": 42.8892,
      "eval_samples_per_second": 38.681,
      "eval_steps_per_second": 0.606,
      "step": 900
    },
    {
      "epoch": 0.14317066508425405,
      "grad_norm": 0.4698060154914856,
      "learning_rate": 0.0003798148148148148,
      "loss": 4.3496,
      "step": 950
    },
    {
      "epoch": 0.15070596324658322,
      "grad_norm": 0.5078156590461731,
      "learning_rate": 0.0003705555555555556,
      "loss": 4.3199,
      "step": 1000
    },
    {
      "epoch": 0.15824126140891237,
      "grad_norm": 0.3680674135684967,
      "learning_rate": 0.0003612962962962963,
      "loss": 4.2949,
      "step": 1050
    },
    {
      "epoch": 0.16577655957124154,
      "grad_norm": 0.4388742446899414,
      "learning_rate": 0.00035203703703703704,
      "loss": 4.266,
      "step": 1100
    },
    {
      "epoch": 0.1733118577335707,
      "grad_norm": 0.4389984905719757,
      "learning_rate": 0.0003427777777777778,
      "loss": 4.2434,
      "step": 1150
    },
    {
      "epoch": 0.18084715589589986,
      "grad_norm": 0.403689980506897,
      "learning_rate": 0.0003335185185185185,
      "loss": 4.2214,
      "step": 1200
    },
    {
      "epoch": 0.18084715589589986,
      "eval_loss": 4.133626098675609,
      "eval_perplexity": 62.403795489360064,
      "eval_runtime": 42.5402,
      "eval_samples_per_second": 38.998,
      "eval_steps_per_second": 0.611,
      "step": 1200
    },
    {
      "epoch": 0.188382454058229,
      "grad_norm": 0.5109913349151611,
      "learning_rate": 0.00032425925925925927,
      "loss": 4.2004,
      "step": 1250
    },
    {
      "epoch": 0.19591775222055818,
      "grad_norm": 0.3623785972595215,
      "learning_rate": 0.000315,
      "loss": 4.1899,
      "step": 1300
    },
    {
      "epoch": 0.20345305038288733,
      "grad_norm": 0.4568081796169281,
      "learning_rate": 0.00030574074074074076,
      "loss": 4.1702,
      "step": 1350
    },
    {
      "epoch": 0.2109883485452165,
      "grad_norm": 0.3451692461967468,
      "learning_rate": 0.00029648148148148144,
      "loss": 4.1573,
      "step": 1400
    },
    {
      "epoch": 0.21852364670754565,
      "grad_norm": 0.3655514419078827,
      "learning_rate": 0.00028722222222222224,
      "loss": 4.1367,
      "step": 1450
    },
    {
      "epoch": 0.22605894486987482,
      "grad_norm": 0.3625420928001404,
      "learning_rate": 0.000277962962962963,
      "loss": 4.1268,
      "step": 1500
    },
    {
      "epoch": 0.22605894486987482,
      "eval_loss": 4.0427056376487265,
      "eval_perplexity": 56.98030248363226,
      "eval_runtime": 42.8702,
      "eval_samples_per_second": 38.698,
      "eval_steps_per_second": 0.606,
      "step": 1500
    },
    {
      "epoch": 0.23359424303220397,
      "grad_norm": 0.4124140739440918,
      "learning_rate": 0.0002687037037037037,
      "loss": 4.1176,
      "step": 1550
    },
    {
      "epoch": 0.24112954119453314,
      "grad_norm": 0.3373737037181854,
      "learning_rate": 0.0002594444444444444,
      "loss": 4.1092,
      "step": 1600
    },
    {
      "epoch": 0.2486648393568623,
      "grad_norm": 0.3414556086063385,
      "learning_rate": 0.00025018518518518516,
      "loss": 4.092,
      "step": 1650
    },
    {
      "epoch": 0.25620013751919146,
      "grad_norm": 0.30866968631744385,
      "learning_rate": 0.00024092592592592593,
      "loss": 4.0841,
      "step": 1700
    },
    {
      "epoch": 0.26373543568152064,
      "grad_norm": 0.31764939427375793,
      "learning_rate": 0.00023166666666666667,
      "loss": 4.0764,
      "step": 1750
    },
    {
      "epoch": 0.2712707338438498,
      "grad_norm": 0.42184340953826904,
      "learning_rate": 0.0002224074074074074,
      "loss": 4.0666,
      "step": 1800
    },
    {
      "epoch": 0.2712707338438498,
      "eval_loss": 3.9816309624713564,
      "eval_perplexity": 53.604389719294815,
      "eval_runtime": 42.5475,
      "eval_samples_per_second": 38.992,
      "eval_steps_per_second": 0.611,
      "step": 1800
    },
    {
      "epoch": 0.2788060320061789,
      "grad_norm": 0.3581269085407257,
      "learning_rate": 0.00021314814814814815,
      "loss": 4.0579,
      "step": 1850
    },
    {
      "epoch": 0.2863413301685081,
      "grad_norm": 0.33631259202957153,
      "learning_rate": 0.0002038888888888889,
      "loss": 4.0557,
      "step": 1900
    },
    {
      "epoch": 0.2938766283308373,
      "grad_norm": 0.32767254114151,
      "learning_rate": 0.00019462962962962964,
      "loss": 4.0429,
      "step": 1950
    },
    {
      "epoch": 0.30141192649316645,
      "grad_norm": 0.3312157094478607,
      "learning_rate": 0.00018537037037037038,
      "loss": 4.0388,
      "step": 2000
    },
    {
      "epoch": 0.30894722465549557,
      "grad_norm": 0.3220396339893341,
      "learning_rate": 0.00017611111111111112,
      "loss": 4.0294,
      "step": 2050
    },
    {
      "epoch": 0.31648252281782474,
      "grad_norm": 0.3097658157348633,
      "learning_rate": 0.00016685185185185187,
      "loss": 4.0213,
      "step": 2100
    },
    {
      "epoch": 0.31648252281782474,
      "eval_loss": 3.9343392453526693,
      "eval_perplexity": 51.1283554944596,
      "eval_runtime": 42.915,
      "eval_samples_per_second": 38.658,
      "eval_steps_per_second": 0.606,
      "step": 2100
    },
    {
      "epoch": 0.3240178209801539,
      "grad_norm": 0.32263439893722534,
      "learning_rate": 0.00015759259259259258,
      "loss": 4.0182,
      "step": 2150
    },
    {
      "epoch": 0.3315531191424831,
      "grad_norm": 0.3611871004104614,
      "learning_rate": 0.00014833333333333335,
      "loss": 4.0101,
      "step": 2200
    },
    {
      "epoch": 0.3390884173048122,
      "grad_norm": 0.3431486189365387,
      "learning_rate": 0.00013907407407407407,
      "loss": 4.0014,
      "step": 2250
    },
    {
      "epoch": 0.3466237154671414,
      "grad_norm": 0.3198365569114685,
      "learning_rate": 0.00012981481481481484,
      "loss": 3.9968,
      "step": 2300
    },
    {
      "epoch": 0.35415901362947055,
      "grad_norm": 0.31708911061286926,
      "learning_rate": 0.00012055555555555555,
      "loss": 3.9873,
      "step": 2350
    },
    {
      "epoch": 0.3616943117917997,
      "grad_norm": 0.3318186104297638,
      "learning_rate": 0.0001112962962962963,
      "loss": 3.9928,
      "step": 2400
    },
    {
      "epoch": 0.3616943117917997,
      "eval_loss": 3.901613183105629,
      "eval_perplexity": 49.4822086178559,
      "eval_runtime": 42.7457,
      "eval_samples_per_second": 38.811,
      "eval_steps_per_second": 0.608,
      "step": 2400
    },
    {
      "epoch": 0.36922960995412885,
      "grad_norm": 0.31225499510765076,
      "learning_rate": 0.00010203703703703704,
      "loss": 3.9794,
      "step": 2450
    },
    {
      "epoch": 0.376764908116458,
      "grad_norm": 0.36178267002105713,
      "learning_rate": 9.277777777777778e-05,
      "loss": 3.9816,
      "step": 2500
    },
    {
      "epoch": 0.3843002062787872,
      "grad_norm": 0.3168866038322449,
      "learning_rate": 8.351851851851852e-05,
      "loss": 3.9739,
      "step": 2550
    },
    {
      "epoch": 0.39183550444111637,
      "grad_norm": 0.244058296084404,
      "learning_rate": 7.425925925925927e-05,
      "loss": 3.9716,
      "step": 2600
    },
    {
      "epoch": 0.39937080260344554,
      "grad_norm": 0.25681817531585693,
      "learning_rate": 6.500000000000001e-05,
      "loss": 3.9694,
      "step": 2650
    },
    {
      "epoch": 0.40690610076577466,
      "grad_norm": 0.24345779418945312,
      "learning_rate": 5.5740740740740744e-05,
      "loss": 3.9646,
      "step": 2700
    },
    {
      "epoch": 0.40690610076577466,
      "eval_loss": 3.8781924804982686,
      "eval_perplexity": 48.336766414632706,
      "eval_runtime": 43.041,
      "eval_samples_per_second": 38.545,
      "eval_steps_per_second": 0.604,
      "step": 2700
    },
    {
      "epoch": 0.41444139892810383,
      "grad_norm": 0.24125564098358154,
      "learning_rate": 4.6481481481481486e-05,
      "loss": 3.9617,
      "step": 2750
    },
    {
      "epoch": 0.421976697090433,
      "grad_norm": 0.2191689908504486,
      "learning_rate": 3.722222222222222e-05,
      "loss": 3.9597,
      "step": 2800
    },
    {
      "epoch": 0.4295119952527622,
      "grad_norm": 0.22993455827236176,
      "learning_rate": 2.7962962962962965e-05,
      "loss": 3.9603,
      "step": 2850
    },
    {
      "epoch": 0.4370472934150913,
      "grad_norm": 0.2122599482536316,
      "learning_rate": 1.8703703703703707e-05,
      "loss": 3.956,
      "step": 2900
    },
    {
      "epoch": 0.44458259157742047,
      "grad_norm": 0.2053080052137375,
      "learning_rate": 9.444444444444445e-06,
      "loss": 3.9582,
      "step": 2950
    },
    {
      "epoch": 0.45211788973974965,
      "grad_norm": 0.19383326172828674,
      "learning_rate": 1.8518518518518518e-07,
      "loss": 3.9558,
      "step": 3000
    },
    {
      "epoch": 0.45211788973974965,
      "eval_loss": 3.8677153542659872,
      "eval_perplexity": 47.832979737936746,
      "eval_runtime": 42.8133,
      "eval_samples_per_second": 38.75,
      "eval_steps_per_second": 0.607,
      "step": 3000
    }
  ],
  "logging_steps": 50,
  "max_steps": 3000,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 300,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 5.57669022695424e+17,
  "train_batch_size": 32,
  "trial_name": null,
  "trial_params": null
}