{ "best_metric": 0.7818862795829773, "best_model_checkpoint": "miner_id_24/checkpoint-2700", "epoch": 4.452606195511037, "eval_steps": 150, "global_step": 3000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0014839547393804488, "eval_loss": 1.1009923219680786, "eval_runtime": 14.0062, "eval_samples_per_second": 40.554, "eval_steps_per_second": 20.277, "step": 1 }, { "epoch": 0.07419773696902245, "grad_norm": 0.1295081079006195, "learning_rate": 0.0002, "loss": 1.0386, "step": 50 }, { "epoch": 0.1483954739380449, "grad_norm": 0.1271422952413559, "learning_rate": 0.0001998582695676762, "loss": 0.9538, "step": 100 }, { "epoch": 0.22259321090706732, "grad_norm": 0.12388234585523605, "learning_rate": 0.00019943348002101371, "loss": 0.9411, "step": 150 }, { "epoch": 0.22259321090706732, "eval_loss": 0.9268936514854431, "eval_runtime": 14.0765, "eval_samples_per_second": 40.351, "eval_steps_per_second": 20.175, "step": 150 }, { "epoch": 0.2967909478760898, "grad_norm": 0.13258318603038788, "learning_rate": 0.00019872683547213446, "loss": 0.9297, "step": 200 }, { "epoch": 0.3709886848451122, "grad_norm": 0.1404084414243698, "learning_rate": 0.00019774033898178667, "loss": 0.9078, "step": 250 }, { "epoch": 0.44518642181413465, "grad_norm": 0.14087454974651337, "learning_rate": 0.0001964767868814516, "loss": 0.9052, "step": 300 }, { "epoch": 0.44518642181413465, "eval_loss": 0.8921840786933899, "eval_runtime": 14.2434, "eval_samples_per_second": 39.878, "eval_steps_per_second": 19.939, "step": 300 }, { "epoch": 0.5193841587831571, "grad_norm": 0.14395581185817719, "learning_rate": 0.00019493976084683813, "loss": 0.8842, "step": 350 }, { "epoch": 0.5935818957521796, "grad_norm": 0.13259702920913696, "learning_rate": 0.00019313361774523385, "loss": 0.8869, "step": 400 }, { "epoch": 0.667779632721202, "grad_norm": 0.141865536570549, "learning_rate": 0.00019106347728549135, "loss": 0.8705, "step": 450 }, { "epoch": 0.667779632721202, "eval_loss": 0.8677888512611389, "eval_runtime": 14.1021, "eval_samples_per_second": 40.278, "eval_steps_per_second": 20.139, "step": 450 }, { "epoch": 0.7419773696902244, "grad_norm": 0.1400303989648819, "learning_rate": 0.00018873520750565718, "loss": 0.8765, "step": 500 }, { "epoch": 0.8161751066592469, "grad_norm": 0.1415148228406906, "learning_rate": 0.0001861554081393806, "loss": 0.8718, "step": 550 }, { "epoch": 0.8903728436282693, "grad_norm": 0.1443643420934677, "learning_rate": 0.0001833313919082515, "loss": 0.8544, "step": 600 }, { "epoch": 0.8903728436282693, "eval_loss": 0.8513836860656738, "eval_runtime": 14.4561, "eval_samples_per_second": 39.291, "eval_steps_per_second": 19.646, "step": 600 }, { "epoch": 0.9645705805972918, "grad_norm": 0.15358565747737885, "learning_rate": 0.00018027116379309638, "loss": 0.855, "step": 650 }, { "epoch": 1.0389538119087367, "grad_norm": 0.14816033840179443, "learning_rate": 0.00017698339834299061, "loss": 0.8524, "step": 700 }, { "epoch": 1.1131515488777592, "grad_norm": 0.1639455258846283, "learning_rate": 0.00017347741508630672, "loss": 0.8167, "step": 750 }, { "epoch": 1.1131515488777592, "eval_loss": 0.8397712111473083, "eval_runtime": 14.3467, "eval_samples_per_second": 39.591, "eval_steps_per_second": 19.796, "step": 750 }, { "epoch": 1.1873492858467816, "grad_norm": 0.1472490429878235, "learning_rate": 0.0001697631521134985, "loss": 0.8086, "step": 800 }, { "epoch": 1.261547022815804, "grad_norm": 0.16415859758853912, "learning_rate": 0.00016585113790650388, "loss": 0.802, "step": 850 }, { "epoch": 1.3357447597848267, "grad_norm": 0.1668313443660736, "learning_rate": 0.0001617524614946192, "loss": 0.7961, "step": 900 }, { "epoch": 1.3357447597848267, "eval_loss": 0.8281386494636536, "eval_runtime": 13.996, "eval_samples_per_second": 40.583, "eval_steps_per_second": 20.292, "step": 900 }, { "epoch": 1.409942496753849, "grad_norm": 0.1581781506538391, "learning_rate": 0.0001574787410214407, "loss": 0.8142, "step": 950 }, { "epoch": 1.4841402337228715, "grad_norm": 0.16410471498966217, "learning_rate": 0.00015304209081197425, "loss": 0.8208, "step": 1000 }, { "epoch": 1.5583379706918938, "grad_norm": 0.16278082132339478, "learning_rate": 0.00014845508703326504, "loss": 0.7965, "step": 1050 }, { "epoch": 1.5583379706918938, "eval_loss": 0.8185766339302063, "eval_runtime": 14.096, "eval_samples_per_second": 40.295, "eval_steps_per_second": 20.148, "step": 1050 }, { "epoch": 1.6325357076609164, "grad_norm": 0.1645856499671936, "learning_rate": 0.00014373073204588556, "loss": 0.796, "step": 1100 }, { "epoch": 1.706733444629939, "grad_norm": 0.16889511048793793, "learning_rate": 0.00013888241754733208, "loss": 0.7972, "step": 1150 }, { "epoch": 1.7809311815989612, "grad_norm": 0.16280411183834076, "learning_rate": 0.00013392388661180303, "loss": 0.7986, "step": 1200 }, { "epoch": 1.7809311815989612, "eval_loss": 0.8099638223648071, "eval_runtime": 14.0445, "eval_samples_per_second": 40.443, "eval_steps_per_second": 20.221, "step": 1200 }, { "epoch": 1.8551289185679836, "grad_norm": 0.16928894817829132, "learning_rate": 0.0001288691947339621, "loss": 0.7992, "step": 1250 }, { "epoch": 1.929326655537006, "grad_norm": 0.16871026158332825, "learning_rate": 0.0001237326699871115, "loss": 0.8071, "step": 1300 }, { "epoch": 2.0037098868484513, "grad_norm": 0.1630755364894867, "learning_rate": 0.00011852887240871145, "loss": 0.7548, "step": 1350 }, { "epoch": 2.0037098868484513, "eval_loss": 0.8026086688041687, "eval_runtime": 14.0517, "eval_samples_per_second": 40.422, "eval_steps_per_second": 20.211, "step": 1350 }, { "epoch": 2.0779076238174734, "grad_norm": 0.16451841592788696, "learning_rate": 0.00011327255272837221, "loss": 0.7476, "step": 1400 }, { "epoch": 2.152105360786496, "grad_norm": 0.18384945392608643, "learning_rate": 0.00010797861055530831, "loss": 0.7691, "step": 1450 }, { "epoch": 2.2263030977555185, "grad_norm": 0.18934905529022217, "learning_rate": 0.00010266205214377748, "loss": 0.7409, "step": 1500 }, { "epoch": 2.2263030977555185, "eval_loss": 0.7997989654541016, "eval_runtime": 13.98, "eval_samples_per_second": 40.629, "eval_steps_per_second": 20.315, "step": 1500 }, { "epoch": 2.300500834724541, "grad_norm": 0.18251581490039825, "learning_rate": 9.733794785622253e-05, "loss": 0.7668, "step": 1550 }, { "epoch": 2.374698571693563, "grad_norm": 0.1793128401041031, "learning_rate": 9.202138944469168e-05, "loss": 0.7456, "step": 1600 }, { "epoch": 2.4488963086625857, "grad_norm": 0.17612189054489136, "learning_rate": 8.672744727162781e-05, "loss": 0.7508, "step": 1650 }, { "epoch": 2.4488963086625857, "eval_loss": 0.7954422831535339, "eval_runtime": 14.3584, "eval_samples_per_second": 39.559, "eval_steps_per_second": 19.779, "step": 1650 }, { "epoch": 2.523094045631608, "grad_norm": 0.19443200528621674, "learning_rate": 8.147112759128859e-05, "loss": 0.7682, "step": 1700 }, { "epoch": 2.5972917826006308, "grad_norm": 0.1849536895751953, "learning_rate": 7.626733001288851e-05, "loss": 0.7461, "step": 1750 }, { "epoch": 2.6714895195696533, "grad_norm": 0.203544482588768, "learning_rate": 7.113080526603792e-05, "loss": 0.7606, "step": 1800 }, { "epoch": 2.6714895195696533, "eval_loss": 0.7908361554145813, "eval_runtime": 14.0191, "eval_samples_per_second": 40.516, "eval_steps_per_second": 20.258, "step": 1800 }, { "epoch": 2.745687256538676, "grad_norm": 0.18425534665584564, "learning_rate": 6.607611338819697e-05, "loss": 0.7399, "step": 1850 }, { "epoch": 2.819884993507698, "grad_norm": 0.18944227695465088, "learning_rate": 6.111758245266794e-05, "loss": 0.7481, "step": 1900 }, { "epoch": 2.8940827304767205, "grad_norm": 0.17936797440052032, "learning_rate": 5.626926795411447e-05, "loss": 0.7366, "step": 1950 }, { "epoch": 2.8940827304767205, "eval_loss": 0.7873192429542542, "eval_runtime": 14.1781, "eval_samples_per_second": 40.062, "eval_steps_per_second": 20.031, "step": 1950 }, { "epoch": 2.968280467445743, "grad_norm": 0.19529031217098236, "learning_rate": 5.1544912966734994e-05, "loss": 0.7521, "step": 2000 }, { "epoch": 3.042663698757188, "grad_norm": 0.1839251071214676, "learning_rate": 4.695790918802576e-05, "loss": 0.76, "step": 2050 }, { "epoch": 3.1168614357262103, "grad_norm": 0.18065090477466583, "learning_rate": 4.252125897855932e-05, "loss": 0.7173, "step": 2100 }, { "epoch": 3.1168614357262103, "eval_loss": 0.7872137427330017, "eval_runtime": 14.1181, "eval_samples_per_second": 40.232, "eval_steps_per_second": 20.116, "step": 2100 }, { "epoch": 3.191059172695233, "grad_norm": 0.18571385741233826, "learning_rate": 3.824753850538082e-05, "loss": 0.7305, "step": 2150 }, { "epoch": 3.2652569096642554, "grad_norm": 0.182917520403862, "learning_rate": 3.414886209349615e-05, "loss": 0.7339, "step": 2200 }, { "epoch": 3.3394546466332775, "grad_norm": 0.19343669712543488, "learning_rate": 3.0236847886501542e-05, "loss": 0.7294, "step": 2250 }, { "epoch": 3.3394546466332775, "eval_loss": 0.7847491502761841, "eval_runtime": 14.0352, "eval_samples_per_second": 40.47, "eval_steps_per_second": 20.235, "step": 2250 }, { "epoch": 3.4136523836023, "grad_norm": 0.19025780260562897, "learning_rate": 2.6522584913693294e-05, "loss": 0.7251, "step": 2300 }, { "epoch": 3.4878501205713226, "grad_norm": 0.19098390638828278, "learning_rate": 2.301660165700936e-05, "loss": 0.7128, "step": 2350 }, { "epoch": 3.562047857540345, "grad_norm": 0.1902209222316742, "learning_rate": 1.9728836206903656e-05, "loss": 0.7282, "step": 2400 }, { "epoch": 3.562047857540345, "eval_loss": 0.7834780216217041, "eval_runtime": 13.9633, "eval_samples_per_second": 40.678, "eval_steps_per_second": 20.339, "step": 2400 }, { "epoch": 3.6362455945093677, "grad_norm": 0.19502930343151093, "learning_rate": 1.6668608091748495e-05, "loss": 0.7328, "step": 2450 }, { "epoch": 3.71044333147839, "grad_norm": 0.18348978459835052, "learning_rate": 1.3844591860619383e-05, "loss": 0.7293, "step": 2500 }, { "epoch": 3.7846410684474123, "grad_norm": 0.18518677353858948, "learning_rate": 1.1264792494342857e-05, "loss": 0.72, "step": 2550 }, { "epoch": 3.7846410684474123, "eval_loss": 0.7825514078140259, "eval_runtime": 14.1713, "eval_samples_per_second": 40.081, "eval_steps_per_second": 20.041, "step": 2550 }, { "epoch": 3.858838805416435, "grad_norm": 0.1809045374393463, "learning_rate": 8.936522714508678e-06, "loss": 0.7246, "step": 2600 }, { "epoch": 3.933036542385457, "grad_norm": 0.18573115766048431, "learning_rate": 6.866382254766157e-06, "loss": 0.724, "step": 2650 }, { "epoch": 4.007419773696903, "grad_norm": 0.1879529505968094, "learning_rate": 5.060239153161872e-06, "loss": 0.7085, "step": 2700 }, { "epoch": 4.007419773696903, "eval_loss": 0.7818862795829773, "eval_runtime": 14.0534, "eval_samples_per_second": 40.417, "eval_steps_per_second": 20.209, "step": 2700 }, { "epoch": 4.081617510665924, "grad_norm": 0.18808983266353607, "learning_rate": 3.5232131185484076e-06, "loss": 0.7197, "step": 2750 }, { "epoch": 4.155815247634947, "grad_norm": 0.1956692934036255, "learning_rate": 2.259661018213333e-06, "loss": 0.7152, "step": 2800 }, { "epoch": 4.230012984603969, "grad_norm": 0.1999446451663971, "learning_rate": 1.2731645278655445e-06, "loss": 0.7127, "step": 2850 }, { "epoch": 4.230012984603969, "eval_loss": 0.7822389006614685, "eval_runtime": 14.2177, "eval_samples_per_second": 39.95, "eval_steps_per_second": 19.975, "step": 2850 }, { "epoch": 4.304210721572992, "grad_norm": 0.1753046214580536, "learning_rate": 5.665199789862907e-07, "loss": 0.7241, "step": 2900 }, { "epoch": 4.378408458542014, "grad_norm": 0.19707240164279938, "learning_rate": 1.4173043232380557e-07, "loss": 0.7269, "step": 2950 }, { "epoch": 4.452606195511037, "grad_norm": 0.1791616678237915, "learning_rate": 0.0, "loss": 0.7119, "step": 3000 }, { "epoch": 4.452606195511037, "eval_loss": 0.7823454141616821, "eval_runtime": 14.0926, "eval_samples_per_second": 40.305, "eval_steps_per_second": 20.152, "step": 3000 } ], "logging_steps": 50, "max_steps": 3000, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 150, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 3, "early_stopping_threshold": 0.0 }, "attributes": { "early_stopping_patience_counter": 2 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 3.4201657147392e+16, "train_batch_size": 2, "trial_name": null, "trial_params": null }