{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.3473684210526315, "eval_steps": 50, "global_step": 100, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.013473684210526317, "grad_norm": 10.973633766174316, "learning_rate": 7.499999999999999e-06, "loss": 2.1479, "step": 1 }, { "epoch": 0.013473684210526317, "eval_loss": 2.5371525287628174, "eval_runtime": 8.4173, "eval_samples_per_second": 29.701, "eval_steps_per_second": 14.85, "step": 1 }, { "epoch": 0.026947368421052633, "grad_norm": 9.272090911865234, "learning_rate": 1.4999999999999999e-05, "loss": 2.0187, "step": 2 }, { "epoch": 0.04042105263157895, "grad_norm": 7.878329753875732, "learning_rate": 2.2499999999999998e-05, "loss": 2.1563, "step": 3 }, { "epoch": 0.053894736842105266, "grad_norm": 7.982365608215332, "learning_rate": 2.9999999999999997e-05, "loss": 2.282, "step": 4 }, { "epoch": 0.06736842105263158, "grad_norm": 2.2053937911987305, "learning_rate": 3.75e-05, "loss": 2.1851, "step": 5 }, { "epoch": 0.0808421052631579, "grad_norm": 1.8837133646011353, "learning_rate": 4.4999999999999996e-05, "loss": 2.112, "step": 6 }, { "epoch": 0.09431578947368421, "grad_norm": 1.4526352882385254, "learning_rate": 5.2499999999999995e-05, "loss": 2.0328, "step": 7 }, { "epoch": 0.10778947368421053, "grad_norm": 1.4846237897872925, "learning_rate": 5.9999999999999995e-05, "loss": 1.8783, "step": 8 }, { "epoch": 0.12126315789473684, "grad_norm": 1.9902074337005615, "learning_rate": 6.75e-05, "loss": 1.9695, "step": 9 }, { "epoch": 0.13473684210526315, "grad_norm": 1.404717206954956, "learning_rate": 7.5e-05, "loss": 1.9514, "step": 10 }, { "epoch": 0.1482105263157895, "grad_norm": 1.4216639995574951, "learning_rate": 8.25e-05, "loss": 1.8557, "step": 11 }, { "epoch": 0.1616842105263158, "grad_norm": 1.4013230800628662, "learning_rate": 8.999999999999999e-05, "loss": 1.8424, "step": 12 }, { "epoch": 0.1751578947368421, "grad_norm": 1.1717782020568848, "learning_rate": 9.75e-05, "loss": 1.8625, "step": 13 }, { "epoch": 0.18863157894736843, "grad_norm": 1.7480112314224243, "learning_rate": 0.00010499999999999999, "loss": 1.9016, "step": 14 }, { "epoch": 0.20210526315789473, "grad_norm": 1.1352505683898926, "learning_rate": 0.0001125, "loss": 1.8889, "step": 15 }, { "epoch": 0.21557894736842106, "grad_norm": 1.1477811336517334, "learning_rate": 0.00011999999999999999, "loss": 1.5794, "step": 16 }, { "epoch": 0.22905263157894737, "grad_norm": 1.3586668968200684, "learning_rate": 0.00012749999999999998, "loss": 1.5011, "step": 17 }, { "epoch": 0.24252631578947367, "grad_norm": 0.8325414657592773, "learning_rate": 0.000135, "loss": 1.6469, "step": 18 }, { "epoch": 0.256, "grad_norm": 1.5747898817062378, "learning_rate": 0.0001425, "loss": 1.6895, "step": 19 }, { "epoch": 0.2694736842105263, "grad_norm": 0.9997685551643372, "learning_rate": 0.00015, "loss": 1.5248, "step": 20 }, { "epoch": 0.2829473684210526, "grad_norm": 1.195119857788086, "learning_rate": 0.00014994217771805422, "loss": 1.5649, "step": 21 }, { "epoch": 0.296421052631579, "grad_norm": 0.8751718401908875, "learning_rate": 0.00014976880002998458, "loss": 1.5405, "step": 22 }, { "epoch": 0.3098947368421053, "grad_norm": 0.8566117882728577, "learning_rate": 0.00014948013427161947, "loss": 1.5504, "step": 23 }, { "epoch": 0.3233684210526316, "grad_norm": 0.7322584390640259, "learning_rate": 0.00014907662554463532, "loss": 1.5034, "step": 24 }, { "epoch": 0.3368421052631579, "grad_norm": 0.9539948105812073, "learning_rate": 0.00014855889603024227, "loss": 1.4513, "step": 25 }, { "epoch": 0.3503157894736842, "grad_norm": 0.7042058110237122, "learning_rate": 0.00014792774402982574, "loss": 1.5281, "step": 26 }, { "epoch": 0.36378947368421055, "grad_norm": 0.6478146910667419, "learning_rate": 0.0001471841427340235, "loss": 1.5117, "step": 27 }, { "epoch": 0.37726315789473686, "grad_norm": 0.6267299652099609, "learning_rate": 0.00014632923872213652, "loss": 1.383, "step": 28 }, { "epoch": 0.39073684210526316, "grad_norm": 0.7713648676872253, "learning_rate": 0.0001453643501941863, "loss": 1.4844, "step": 29 }, { "epoch": 0.40421052631578946, "grad_norm": 0.6838952898979187, "learning_rate": 0.0001442909649383465, "loss": 1.4825, "step": 30 }, { "epoch": 0.41768421052631577, "grad_norm": 0.71690434217453, "learning_rate": 0.0001431107380368811, "loss": 1.4357, "step": 31 }, { "epoch": 0.43115789473684213, "grad_norm": 0.6745509505271912, "learning_rate": 0.00014182548931412757, "loss": 1.4733, "step": 32 }, { "epoch": 0.44463157894736843, "grad_norm": 0.7103040814399719, "learning_rate": 0.0001404372005304598, "loss": 1.3857, "step": 33 }, { "epoch": 0.45810526315789474, "grad_norm": 0.6221896409988403, "learning_rate": 0.0001389480123265569, "loss": 1.2527, "step": 34 }, { "epoch": 0.47157894736842104, "grad_norm": 0.562971293926239, "learning_rate": 0.0001373602209226909, "loss": 1.4486, "step": 35 }, { "epoch": 0.48505263157894735, "grad_norm": 0.5778741240501404, "learning_rate": 0.00013567627457812106, "loss": 1.4134, "step": 36 }, { "epoch": 0.4985263157894737, "grad_norm": 0.5704385042190552, "learning_rate": 0.00013389876981605584, "loss": 1.354, "step": 37 }, { "epoch": 0.512, "grad_norm": 0.6227774024009705, "learning_rate": 0.00013203044742000233, "loss": 1.5718, "step": 38 }, { "epoch": 0.5254736842105263, "grad_norm": 0.6505720615386963, "learning_rate": 0.0001300741882076764, "loss": 1.4278, "step": 39 }, { "epoch": 0.5389473684210526, "grad_norm": 0.5715638995170593, "learning_rate": 0.00012803300858899104, "loss": 1.478, "step": 40 }, { "epoch": 0.5524210526315789, "grad_norm": 0.6021521091461182, "learning_rate": 0.00012591005591497064, "loss": 1.3556, "step": 41 }, { "epoch": 0.5658947368421052, "grad_norm": 0.6821895837783813, "learning_rate": 0.00012370860362476374, "loss": 1.4947, "step": 42 }, { "epoch": 0.5793684210526315, "grad_norm": 0.5453934073448181, "learning_rate": 0.00012143204619823755, "loss": 1.2477, "step": 43 }, { "epoch": 0.592842105263158, "grad_norm": 0.6702715754508972, "learning_rate": 0.00011908389392193547, "loss": 1.4835, "step": 44 }, { "epoch": 0.6063157894736843, "grad_norm": 0.6350681185722351, "learning_rate": 0.00011666776747647015, "loss": 1.3993, "step": 45 }, { "epoch": 0.6197894736842106, "grad_norm": 0.6066803336143494, "learning_rate": 0.00011418739235369615, "loss": 1.3974, "step": 46 }, { "epoch": 0.6332631578947369, "grad_norm": 0.5659217238426208, "learning_rate": 0.00011164659311227163, "loss": 1.3268, "step": 47 }, { "epoch": 0.6467368421052632, "grad_norm": 0.5807419419288635, "learning_rate": 0.00010904928748046599, "loss": 1.3408, "step": 48 }, { "epoch": 0.6602105263157895, "grad_norm": 0.6375626921653748, "learning_rate": 0.0001063994803153071, "loss": 1.4854, "step": 49 }, { "epoch": 0.6736842105263158, "grad_norm": 0.6418893933296204, "learning_rate": 0.00010370125742738173, "loss": 1.3679, "step": 50 }, { "epoch": 0.6736842105263158, "eval_loss": 1.499872088432312, "eval_runtime": 8.4304, "eval_samples_per_second": 29.654, "eval_steps_per_second": 14.827, "step": 50 }, { "epoch": 0.6871578947368421, "grad_norm": 0.7283876538276672, "learning_rate": 0.00010095877928081196, "loss": 1.4891, "step": 51 }, { "epoch": 0.7006315789473684, "grad_norm": 0.5404418706893921, "learning_rate": 9.817627457812105e-05, "loss": 1.0949, "step": 52 }, { "epoch": 0.7141052631578947, "grad_norm": 0.6369166970252991, "learning_rate": 9.535803373988056e-05, "loss": 1.5966, "step": 53 }, { "epoch": 0.7275789473684211, "grad_norm": 0.8547583818435669, "learning_rate": 9.25084022891929e-05, "loss": 1.3016, "step": 54 }, { "epoch": 0.7410526315789474, "grad_norm": 0.6199703812599182, "learning_rate": 8.963177415120962e-05, "loss": 1.5274, "step": 55 }, { "epoch": 0.7545263157894737, "grad_norm": 0.6474433541297913, "learning_rate": 8.673258487801731e-05, "loss": 1.3812, "step": 56 }, { "epoch": 0.768, "grad_norm": 0.5689646601676941, "learning_rate": 8.381530480933783e-05, "loss": 1.4005, "step": 57 }, { "epoch": 0.7814736842105263, "grad_norm": 0.5995835661888123, "learning_rate": 8.088443217958837e-05, "loss": 1.2694, "step": 58 }, { "epoch": 0.7949473684210526, "grad_norm": 0.5566097497940063, "learning_rate": 7.794448618193015e-05, "loss": 1.4341, "step": 59 }, { "epoch": 0.8084210526315789, "grad_norm": 0.6118280291557312, "learning_rate": 7.5e-05, "loss": 1.378, "step": 60 }, { "epoch": 0.8218947368421052, "grad_norm": 0.5662732124328613, "learning_rate": 7.205551381806987e-05, "loss": 1.4138, "step": 61 }, { "epoch": 0.8353684210526315, "grad_norm": 0.5312877893447876, "learning_rate": 6.911556782041163e-05, "loss": 1.4356, "step": 62 }, { "epoch": 0.8488421052631578, "grad_norm": 0.5712493062019348, "learning_rate": 6.618469519066217e-05, "loss": 1.4667, "step": 63 }, { "epoch": 0.8623157894736843, "grad_norm": 0.5462284684181213, "learning_rate": 6.326741512198266e-05, "loss": 1.4505, "step": 64 }, { "epoch": 0.8757894736842106, "grad_norm": 0.6210593581199646, "learning_rate": 6.036822584879038e-05, "loss": 1.2947, "step": 65 }, { "epoch": 0.8892631578947369, "grad_norm": 0.6453770399093628, "learning_rate": 5.7491597710807114e-05, "loss": 1.3575, "step": 66 }, { "epoch": 0.9027368421052632, "grad_norm": 0.6173303127288818, "learning_rate": 5.464196626011943e-05, "loss": 1.3685, "step": 67 }, { "epoch": 0.9162105263157895, "grad_norm": 0.6161783933639526, "learning_rate": 5.182372542187895e-05, "loss": 1.5084, "step": 68 }, { "epoch": 0.9296842105263158, "grad_norm": 0.5926702618598938, "learning_rate": 4.904122071918801e-05, "loss": 1.5106, "step": 69 }, { "epoch": 0.9431578947368421, "grad_norm": 0.7311588525772095, "learning_rate": 4.6298742572618266e-05, "loss": 1.3789, "step": 70 }, { "epoch": 0.9566315789473684, "grad_norm": 0.5569392442703247, "learning_rate": 4.360051968469291e-05, "loss": 1.2037, "step": 71 }, { "epoch": 0.9701052631578947, "grad_norm": 0.49740126729011536, "learning_rate": 4.095071251953399e-05, "loss": 1.3472, "step": 72 }, { "epoch": 0.983578947368421, "grad_norm": 0.5706843733787537, "learning_rate": 3.83534068877284e-05, "loss": 1.4041, "step": 73 }, { "epoch": 0.9970526315789474, "grad_norm": 0.5967234373092651, "learning_rate": 3.5812607646303834e-05, "loss": 1.31, "step": 74 }, { "epoch": 1.0105263157894737, "grad_norm": 1.052331566810608, "learning_rate": 3.333223252352985e-05, "loss": 2.0664, "step": 75 }, { "epoch": 1.024, "grad_norm": 0.6153193712234497, "learning_rate": 3.091610607806452e-05, "loss": 1.516, "step": 76 }, { "epoch": 1.0374736842105263, "grad_norm": 0.5821354389190674, "learning_rate": 2.856795380176244e-05, "loss": 1.2732, "step": 77 }, { "epoch": 1.0509473684210526, "grad_norm": 0.6261878609657288, "learning_rate": 2.6291396375236232e-05, "loss": 1.2817, "step": 78 }, { "epoch": 1.064421052631579, "grad_norm": 0.5795064568519592, "learning_rate": 2.4089944085029363e-05, "loss": 1.3216, "step": 79 }, { "epoch": 1.0778947368421052, "grad_norm": 0.5134410262107849, "learning_rate": 2.1966991411008938e-05, "loss": 1.2917, "step": 80 }, { "epoch": 1.0913684210526315, "grad_norm": 0.6312588453292847, "learning_rate": 1.99258117923236e-05, "loss": 1.1945, "step": 81 }, { "epoch": 1.1048421052631578, "grad_norm": 0.5317590832710266, "learning_rate": 1.796955257999768e-05, "loss": 1.2838, "step": 82 }, { "epoch": 1.1183157894736842, "grad_norm": 0.6899747252464294, "learning_rate": 1.6101230183944144e-05, "loss": 1.3589, "step": 83 }, { "epoch": 1.1317894736842105, "grad_norm": 0.5604771971702576, "learning_rate": 1.4323725421878949e-05, "loss": 1.1972, "step": 84 }, { "epoch": 1.1452631578947368, "grad_norm": 0.6752080917358398, "learning_rate": 1.2639779077309098e-05, "loss": 1.5739, "step": 85 }, { "epoch": 1.158736842105263, "grad_norm": 0.6165206432342529, "learning_rate": 1.1051987673443085e-05, "loss": 1.2631, "step": 86 }, { "epoch": 1.1722105263157894, "grad_norm": 0.61894291639328, "learning_rate": 9.56279946954021e-06, "loss": 1.3167, "step": 87 }, { "epoch": 1.1856842105263157, "grad_norm": 0.647057294845581, "learning_rate": 8.174510685872415e-06, "loss": 1.3185, "step": 88 }, { "epoch": 1.1991578947368422, "grad_norm": 0.5210772752761841, "learning_rate": 6.889261963118898e-06, "loss": 1.2242, "step": 89 }, { "epoch": 1.2126315789473685, "grad_norm": 0.5150516033172607, "learning_rate": 5.709035061653494e-06, "loss": 1.2346, "step": 90 }, { "epoch": 1.2261052631578948, "grad_norm": 0.6364325284957886, "learning_rate": 4.635649805813696e-06, "loss": 1.3149, "step": 91 }, { "epoch": 1.2395789473684211, "grad_norm": 0.6105322241783142, "learning_rate": 3.670761277863485e-06, "loss": 1.2594, "step": 92 }, { "epoch": 1.2530526315789474, "grad_norm": 0.6162554621696472, "learning_rate": 2.815857265976462e-06, "loss": 1.277, "step": 93 }, { "epoch": 1.2665263157894737, "grad_norm": 0.5331520438194275, "learning_rate": 2.072255970174258e-06, "loss": 1.2193, "step": 94 }, { "epoch": 1.28, "grad_norm": 0.5716810822486877, "learning_rate": 1.4411039697577175e-06, "loss": 1.4455, "step": 95 }, { "epoch": 1.2934736842105263, "grad_norm": 0.56926029920578, "learning_rate": 9.233744553646754e-07, "loss": 1.291, "step": 96 }, { "epoch": 1.3069473684210526, "grad_norm": 0.5255064964294434, "learning_rate": 5.198657283805279e-07, "loss": 1.2986, "step": 97 }, { "epoch": 1.320421052631579, "grad_norm": 0.5318115949630737, "learning_rate": 2.311999700154027e-07, "loss": 1.2152, "step": 98 }, { "epoch": 1.3338947368421052, "grad_norm": 0.5788165330886841, "learning_rate": 5.7822281945782424e-08, "loss": 1.4851, "step": 99 }, { "epoch": 1.3473684210526315, "grad_norm": 0.5943503379821777, "learning_rate": 0.0, "loss": 1.1959, "step": 100 }, { "epoch": 1.3473684210526315, "eval_loss": 1.457924246788025, "eval_runtime": 8.4236, "eval_samples_per_second": 29.678, "eval_steps_per_second": 14.839, "step": 100 } ], "logging_steps": 1, "max_steps": 100, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 9, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 8.33416392081408e+16, "train_batch_size": 2, "trial_name": null, "trial_params": null }