{ "best_metric": 0.042669691145420074, "best_model_checkpoint": "miner_id_24/checkpoint-1200", "epoch": 1.3592599584670568, "eval_steps": 200, "global_step": 1800, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0007551444213705871, "eval_loss": 0.3643783926963806, "eval_runtime": 87.2468, "eval_samples_per_second": 12.791, "eval_steps_per_second": 3.198, "step": 1 }, { "epoch": 0.037757221068529356, "grad_norm": 1.4100414514541626, "learning_rate": 5e-05, "loss": 0.1364, "step": 50 }, { "epoch": 0.07551444213705871, "grad_norm": 0.6465395092964172, "learning_rate": 0.0001, "loss": 0.0635, "step": 100 }, { "epoch": 0.11327166320558807, "grad_norm": 1.6140310764312744, "learning_rate": 9.98292246503335e-05, "loss": 0.047, "step": 150 }, { "epoch": 0.15102888427411743, "grad_norm": 2.0268421173095703, "learning_rate": 9.931806517013612e-05, "loss": 0.0477, "step": 200 }, { "epoch": 0.15102888427411743, "eval_loss": 0.05864099785685539, "eval_runtime": 88.7167, "eval_samples_per_second": 12.579, "eval_steps_per_second": 3.145, "step": 200 }, { "epoch": 0.18878610534264678, "grad_norm": 1.6122969388961792, "learning_rate": 9.847001329696653e-05, "loss": 0.0468, "step": 250 }, { "epoch": 0.22654332641117614, "grad_norm": 0.023569952696561813, "learning_rate": 9.729086208503174e-05, "loss": 0.0393, "step": 300 }, { "epoch": 0.2643005474797055, "grad_norm": 0.6391419172286987, "learning_rate": 9.578866633275288e-05, "loss": 0.044, "step": 350 }, { "epoch": 0.30205776854823485, "grad_norm": 0.2637801766395569, "learning_rate": 9.397368756032445e-05, "loss": 0.0349, "step": 400 }, { "epoch": 0.30205776854823485, "eval_loss": 0.048699330538511276, "eval_runtime": 88.8453, "eval_samples_per_second": 12.561, "eval_steps_per_second": 3.14, "step": 400 }, { "epoch": 0.3398149896167642, "grad_norm": 1.7430977821350098, "learning_rate": 9.185832391312644e-05, "loss": 0.0443, "step": 450 }, { "epoch": 0.37757221068529356, "grad_norm": 0.3382541835308075, "learning_rate": 8.945702546981969e-05, "loss": 0.0392, "step": 500 }, { "epoch": 0.4153294317538229, "grad_norm": 1.5577205419540405, "learning_rate": 8.678619553365659e-05, "loss": 0.0361, "step": 550 }, { "epoch": 0.4530866528223523, "grad_norm": 0.838959276676178, "learning_rate": 8.386407858128706e-05, "loss": 0.0334, "step": 600 }, { "epoch": 0.4530866528223523, "eval_loss": 0.05116293579339981, "eval_runtime": 88.8487, "eval_samples_per_second": 12.561, "eval_steps_per_second": 3.14, "step": 600 }, { "epoch": 0.49084387389088163, "grad_norm": 1.6610183715820312, "learning_rate": 8.07106356344834e-05, "loss": 0.0422, "step": 650 }, { "epoch": 0.528601094959411, "grad_norm": 1.1309680938720703, "learning_rate": 7.734740790612136e-05, "loss": 0.0348, "step": 700 }, { "epoch": 0.5663583160279403, "grad_norm": 0.7380091547966003, "learning_rate": 7.379736965185368e-05, "loss": 0.0404, "step": 750 }, { "epoch": 0.6041155370964697, "grad_norm": 0.2660733461380005, "learning_rate": 7.008477123264848e-05, "loss": 0.0444, "step": 800 }, { "epoch": 0.6041155370964697, "eval_loss": 0.04871406406164169, "eval_runtime": 88.864, "eval_samples_per_second": 12.559, "eval_steps_per_second": 3.14, "step": 800 }, { "epoch": 0.641872758164999, "grad_norm": 0.5725349187850952, "learning_rate": 6.623497346023418e-05, "loss": 0.0347, "step": 850 }, { "epoch": 0.6796299792335284, "grad_norm": 0.9907957911491394, "learning_rate": 6.227427435703997e-05, "loss": 0.035, "step": 900 }, { "epoch": 0.7173872003020577, "grad_norm": 0.7868975400924683, "learning_rate": 5.8229729514036705e-05, "loss": 0.0344, "step": 950 }, { "epoch": 0.7551444213705871, "grad_norm": 2.8851935863494873, "learning_rate": 5.4128967273616625e-05, "loss": 0.0365, "step": 1000 }, { "epoch": 0.7551444213705871, "eval_loss": 0.0462164580821991, "eval_runtime": 88.9886, "eval_samples_per_second": 12.541, "eval_steps_per_second": 3.135, "step": 1000 }, { "epoch": 0.7929016424391164, "grad_norm": 0.26174625754356384, "learning_rate": 5e-05, "loss": 0.0385, "step": 1050 }, { "epoch": 0.8306588635076458, "grad_norm": 0.055037304759025574, "learning_rate": 4.5871032726383386e-05, "loss": 0.039, "step": 1100 }, { "epoch": 0.8684160845761751, "grad_norm": 0.029556207358837128, "learning_rate": 4.17702704859633e-05, "loss": 0.0302, "step": 1150 }, { "epoch": 0.9061733056447046, "grad_norm": 0.44684645533561707, "learning_rate": 3.772572564296005e-05, "loss": 0.0365, "step": 1200 }, { "epoch": 0.9061733056447046, "eval_loss": 0.042669691145420074, "eval_runtime": 88.4572, "eval_samples_per_second": 12.616, "eval_steps_per_second": 3.154, "step": 1200 }, { "epoch": 0.9439305267132339, "grad_norm": 0.031862616539001465, "learning_rate": 3.3765026539765834e-05, "loss": 0.0312, "step": 1250 }, { "epoch": 0.9816877477817633, "grad_norm": 0.028305215761065483, "learning_rate": 2.991522876735154e-05, "loss": 0.0333, "step": 1300 }, { "epoch": 1.0194449688502927, "grad_norm": 0.16851945221424103, "learning_rate": 2.6202630348146324e-05, "loss": 0.0341, "step": 1350 }, { "epoch": 1.057202189918822, "grad_norm": 0.09734618663787842, "learning_rate": 2.2652592093878666e-05, "loss": 0.0196, "step": 1400 }, { "epoch": 1.057202189918822, "eval_loss": 0.04361049458384514, "eval_runtime": 88.6046, "eval_samples_per_second": 12.595, "eval_steps_per_second": 3.149, "step": 1400 }, { "epoch": 1.0949594109873513, "grad_norm": 0.022358063608407974, "learning_rate": 1.928936436551661e-05, "loss": 0.015, "step": 1450 }, { "epoch": 1.1327166320558808, "grad_norm": 0.23758022487163544, "learning_rate": 1.6135921418712956e-05, "loss": 0.0156, "step": 1500 }, { "epoch": 1.17047385312441, "grad_norm": 0.011663041077554226, "learning_rate": 1.3213804466343421e-05, "loss": 0.0158, "step": 1550 }, { "epoch": 1.2082310741929394, "grad_norm": 0.0888688713312149, "learning_rate": 1.0542974530180327e-05, "loss": 0.0192, "step": 1600 }, { "epoch": 1.2082310741929394, "eval_loss": 0.04343624785542488, "eval_runtime": 88.893, "eval_samples_per_second": 12.554, "eval_steps_per_second": 3.139, "step": 1600 }, { "epoch": 1.2459882952614687, "grad_norm": 0.28665322065353394, "learning_rate": 8.141676086873572e-06, "loss": 0.0139, "step": 1650 }, { "epoch": 1.283745516329998, "grad_norm": 0.026126619428396225, "learning_rate": 6.026312439675552e-06, "loss": 0.0178, "step": 1700 }, { "epoch": 1.3215027373985275, "grad_norm": 0.12513647973537445, "learning_rate": 4.2113336672471245e-06, "loss": 0.0157, "step": 1750 }, { "epoch": 1.3592599584670568, "grad_norm": 0.0799918845295906, "learning_rate": 2.7091379149682685e-06, "loss": 0.0138, "step": 1800 }, { "epoch": 1.3592599584670568, "eval_loss": 0.043415576219558716, "eval_runtime": 88.8821, "eval_samples_per_second": 12.556, "eval_steps_per_second": 3.139, "step": 1800 } ], "logging_steps": 50, "max_steps": 2000, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 200, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 3, "early_stopping_threshold": 0.0 }, "attributes": { "early_stopping_patience_counter": 3 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.2271277242822164e+18, "train_batch_size": 4, "trial_name": null, "trial_params": null }