| { | |
| "best_global_step": 115, | |
| "best_metric": 0.09458151459693909, | |
| "best_model_checkpoint": "/content/drive/MyDrive/lora_model/outputs/task15_microsoft/Phi-4-mini-instruct/checkpoint-115", | |
| "epoch": 6.052631578947368, | |
| "eval_steps": 1, | |
| "global_step": 115, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.05263157894736842, | |
| "grad_norm": 9.795289039611816, | |
| "learning_rate": 0.0, | |
| "loss": 3.2204, | |
| "step": 1 | |
| }, | |
| { | |
| "epoch": 0.05263157894736842, | |
| "eval_loss": 3.1565215587615967, | |
| "eval_runtime": 0.9831, | |
| "eval_samples_per_second": 30.517, | |
| "eval_steps_per_second": 4.069, | |
| "step": 1 | |
| }, | |
| { | |
| "epoch": 0.10526315789473684, | |
| "grad_norm": 10.048436164855957, | |
| "learning_rate": 3.3333333333333335e-05, | |
| "loss": 3.1604, | |
| "step": 2 | |
| }, | |
| { | |
| "epoch": 0.10526315789473684, | |
| "eval_loss": 2.4775681495666504, | |
| "eval_runtime": 0.8971, | |
| "eval_samples_per_second": 33.44, | |
| "eval_steps_per_second": 4.459, | |
| "step": 2 | |
| }, | |
| { | |
| "epoch": 0.15789473684210525, | |
| "grad_norm": 5.148971080780029, | |
| "learning_rate": 6.666666666666667e-05, | |
| "loss": 2.3511, | |
| "step": 3 | |
| }, | |
| { | |
| "epoch": 0.15789473684210525, | |
| "eval_loss": 2.0030856132507324, | |
| "eval_runtime": 0.8926, | |
| "eval_samples_per_second": 33.611, | |
| "eval_steps_per_second": 4.481, | |
| "step": 3 | |
| }, | |
| { | |
| "epoch": 0.21052631578947367, | |
| "grad_norm": 4.8437819480896, | |
| "learning_rate": 0.0001, | |
| "loss": 2.0198, | |
| "step": 4 | |
| }, | |
| { | |
| "epoch": 0.21052631578947367, | |
| "eval_loss": 1.6053706407546997, | |
| "eval_runtime": 0.8924, | |
| "eval_samples_per_second": 33.618, | |
| "eval_steps_per_second": 4.482, | |
| "step": 4 | |
| }, | |
| { | |
| "epoch": 0.2631578947368421, | |
| "grad_norm": 4.386927604675293, | |
| "learning_rate": 0.00013333333333333334, | |
| "loss": 1.6969, | |
| "step": 5 | |
| }, | |
| { | |
| "epoch": 0.2631578947368421, | |
| "eval_loss": 1.4053733348846436, | |
| "eval_runtime": 0.8951, | |
| "eval_samples_per_second": 33.517, | |
| "eval_steps_per_second": 4.469, | |
| "step": 5 | |
| }, | |
| { | |
| "epoch": 0.3157894736842105, | |
| "grad_norm": 3.955519676208496, | |
| "learning_rate": 0.00016666666666666666, | |
| "loss": 1.4825, | |
| "step": 6 | |
| }, | |
| { | |
| "epoch": 0.3157894736842105, | |
| "eval_loss": 1.3105080127716064, | |
| "eval_runtime": 0.893, | |
| "eval_samples_per_second": 33.593, | |
| "eval_steps_per_second": 4.479, | |
| "step": 6 | |
| }, | |
| { | |
| "epoch": 0.3684210526315789, | |
| "grad_norm": 3.6086604595184326, | |
| "learning_rate": 0.0002, | |
| "loss": 1.3404, | |
| "step": 7 | |
| }, | |
| { | |
| "epoch": 0.3684210526315789, | |
| "eval_loss": 1.2445138692855835, | |
| "eval_runtime": 0.8942, | |
| "eval_samples_per_second": 33.549, | |
| "eval_steps_per_second": 4.473, | |
| "step": 7 | |
| }, | |
| { | |
| "epoch": 0.42105263157894735, | |
| "grad_norm": 2.691216230392456, | |
| "learning_rate": 0.00023333333333333333, | |
| "loss": 1.2627, | |
| "step": 8 | |
| }, | |
| { | |
| "epoch": 0.42105263157894735, | |
| "eval_loss": 1.1471664905548096, | |
| "eval_runtime": 0.8927, | |
| "eval_samples_per_second": 33.606, | |
| "eval_steps_per_second": 4.481, | |
| "step": 8 | |
| }, | |
| { | |
| "epoch": 0.47368421052631576, | |
| "grad_norm": 2.5174126625061035, | |
| "learning_rate": 0.0002666666666666667, | |
| "loss": 1.2037, | |
| "step": 9 | |
| }, | |
| { | |
| "epoch": 0.47368421052631576, | |
| "eval_loss": 1.1372406482696533, | |
| "eval_runtime": 0.8947, | |
| "eval_samples_per_second": 33.529, | |
| "eval_steps_per_second": 4.471, | |
| "step": 9 | |
| }, | |
| { | |
| "epoch": 0.5263157894736842, | |
| "grad_norm": 2.893831253051758, | |
| "learning_rate": 0.0003, | |
| "loss": 1.1793, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.5263157894736842, | |
| "eval_loss": 1.0686627626419067, | |
| "eval_runtime": 0.8921, | |
| "eval_samples_per_second": 33.628, | |
| "eval_steps_per_second": 4.484, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.5789473684210527, | |
| "grad_norm": 2.5055713653564453, | |
| "learning_rate": 0.0003333333333333333, | |
| "loss": 1.201, | |
| "step": 11 | |
| }, | |
| { | |
| "epoch": 0.5789473684210527, | |
| "eval_loss": 1.0994912385940552, | |
| "eval_runtime": 0.8951, | |
| "eval_samples_per_second": 33.517, | |
| "eval_steps_per_second": 4.469, | |
| "step": 11 | |
| }, | |
| { | |
| "epoch": 0.631578947368421, | |
| "grad_norm": 2.297982931137085, | |
| "learning_rate": 0.00036666666666666667, | |
| "loss": 1.177, | |
| "step": 12 | |
| }, | |
| { | |
| "epoch": 0.631578947368421, | |
| "eval_loss": 1.0981471538543701, | |
| "eval_runtime": 0.8926, | |
| "eval_samples_per_second": 33.608, | |
| "eval_steps_per_second": 4.481, | |
| "step": 12 | |
| }, | |
| { | |
| "epoch": 0.6842105263157895, | |
| "grad_norm": 2.8536081314086914, | |
| "learning_rate": 0.0004, | |
| "loss": 1.2106, | |
| "step": 13 | |
| }, | |
| { | |
| "epoch": 0.6842105263157895, | |
| "eval_loss": 1.0119823217391968, | |
| "eval_runtime": 0.8936, | |
| "eval_samples_per_second": 33.574, | |
| "eval_steps_per_second": 4.477, | |
| "step": 13 | |
| }, | |
| { | |
| "epoch": 0.7368421052631579, | |
| "grad_norm": 1.8637670278549194, | |
| "learning_rate": 0.00043333333333333337, | |
| "loss": 1.0688, | |
| "step": 14 | |
| }, | |
| { | |
| "epoch": 0.7368421052631579, | |
| "eval_loss": 1.0545283555984497, | |
| "eval_runtime": 0.893, | |
| "eval_samples_per_second": 33.596, | |
| "eval_steps_per_second": 4.479, | |
| "step": 14 | |
| }, | |
| { | |
| "epoch": 0.7894736842105263, | |
| "grad_norm": 2.6284332275390625, | |
| "learning_rate": 0.00046666666666666666, | |
| "loss": 1.1661, | |
| "step": 15 | |
| }, | |
| { | |
| "epoch": 0.7894736842105263, | |
| "eval_loss": 1.031855821609497, | |
| "eval_runtime": 0.8928, | |
| "eval_samples_per_second": 33.603, | |
| "eval_steps_per_second": 4.48, | |
| "step": 15 | |
| }, | |
| { | |
| "epoch": 0.8421052631578947, | |
| "grad_norm": 1.9439812898635864, | |
| "learning_rate": 0.0005, | |
| "loss": 1.1859, | |
| "step": 16 | |
| }, | |
| { | |
| "epoch": 0.8421052631578947, | |
| "eval_loss": 0.9958587884902954, | |
| "eval_runtime": 0.8982, | |
| "eval_samples_per_second": 33.401, | |
| "eval_steps_per_second": 4.453, | |
| "step": 16 | |
| }, | |
| { | |
| "epoch": 0.8947368421052632, | |
| "grad_norm": 1.7199311256408691, | |
| "learning_rate": 0.0004999776608025946, | |
| "loss": 1.1636, | |
| "step": 17 | |
| }, | |
| { | |
| "epoch": 0.8947368421052632, | |
| "eval_loss": 0.9928242564201355, | |
| "eval_runtime": 0.8982, | |
| "eval_samples_per_second": 33.398, | |
| "eval_steps_per_second": 4.453, | |
| "step": 17 | |
| }, | |
| { | |
| "epoch": 0.9473684210526315, | |
| "grad_norm": 5.788880825042725, | |
| "learning_rate": 0.000499910647202696, | |
| "loss": 1.0348, | |
| "step": 18 | |
| }, | |
| { | |
| "epoch": 0.9473684210526315, | |
| "eval_loss": 1.0362129211425781, | |
| "eval_runtime": 0.8954, | |
| "eval_samples_per_second": 33.504, | |
| "eval_steps_per_second": 4.467, | |
| "step": 18 | |
| }, | |
| { | |
| "epoch": 1.0, | |
| "grad_norm": 2.156782388687134, | |
| "learning_rate": 0.0004997989711765446, | |
| "loss": 1.201, | |
| "step": 19 | |
| }, | |
| { | |
| "epoch": 1.0, | |
| "eval_loss": 0.9807829260826111, | |
| "eval_runtime": 0.8926, | |
| "eval_samples_per_second": 33.611, | |
| "eval_steps_per_second": 4.481, | |
| "step": 19 | |
| }, | |
| { | |
| "epoch": 1.0526315789473684, | |
| "grad_norm": 1.9558554887771606, | |
| "learning_rate": 0.0004996426526821629, | |
| "loss": 0.8535, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 1.0526315789473684, | |
| "eval_loss": 0.9379722476005554, | |
| "eval_runtime": 0.8935, | |
| "eval_samples_per_second": 33.576, | |
| "eval_steps_per_second": 4.477, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 1.1052631578947367, | |
| "grad_norm": 1.772550106048584, | |
| "learning_rate": 0.0004994417196557883, | |
| "loss": 0.968, | |
| "step": 21 | |
| }, | |
| { | |
| "epoch": 1.1052631578947367, | |
| "eval_loss": 0.9845291376113892, | |
| "eval_runtime": 0.8928, | |
| "eval_samples_per_second": 33.603, | |
| "eval_steps_per_second": 4.48, | |
| "step": 21 | |
| }, | |
| { | |
| "epoch": 1.1578947368421053, | |
| "grad_norm": 2.108396291732788, | |
| "learning_rate": 0.0004991962080068813, | |
| "loss": 1.0552, | |
| "step": 22 | |
| }, | |
| { | |
| "epoch": 1.1578947368421053, | |
| "eval_loss": 0.9239175319671631, | |
| "eval_runtime": 0.893, | |
| "eval_samples_per_second": 33.594, | |
| "eval_steps_per_second": 4.479, | |
| "step": 22 | |
| }, | |
| { | |
| "epoch": 1.2105263157894737, | |
| "grad_norm": 1.8215439319610596, | |
| "learning_rate": 0.0004989061616117073, | |
| "loss": 0.9825, | |
| "step": 23 | |
| }, | |
| { | |
| "epoch": 1.2105263157894737, | |
| "eval_loss": 0.980516791343689, | |
| "eval_runtime": 0.8952, | |
| "eval_samples_per_second": 33.513, | |
| "eval_steps_per_second": 4.468, | |
| "step": 23 | |
| }, | |
| { | |
| "epoch": 1.263157894736842, | |
| "grad_norm": 20.904949188232422, | |
| "learning_rate": 0.0004985716323054959, | |
| "loss": 0.9563, | |
| "step": 24 | |
| }, | |
| { | |
| "epoch": 1.263157894736842, | |
| "eval_loss": 1.112138271331787, | |
| "eval_runtime": 0.8954, | |
| "eval_samples_per_second": 33.505, | |
| "eval_steps_per_second": 4.467, | |
| "step": 24 | |
| }, | |
| { | |
| "epoch": 1.3157894736842106, | |
| "grad_norm": 2.785473585128784, | |
| "learning_rate": 0.0004981926798731766, | |
| "loss": 1.048, | |
| "step": 25 | |
| }, | |
| { | |
| "epoch": 1.3157894736842106, | |
| "eval_loss": 0.9919915795326233, | |
| "eval_runtime": 0.8932, | |
| "eval_samples_per_second": 33.586, | |
| "eval_steps_per_second": 4.478, | |
| "step": 25 | |
| }, | |
| { | |
| "epoch": 1.368421052631579, | |
| "grad_norm": 1.8656138181686401, | |
| "learning_rate": 0.000497769372038695, | |
| "loss": 1.0315, | |
| "step": 26 | |
| }, | |
| { | |
| "epoch": 1.368421052631579, | |
| "eval_loss": 0.9384483098983765, | |
| "eval_runtime": 0.8931, | |
| "eval_samples_per_second": 33.589, | |
| "eval_steps_per_second": 4.479, | |
| "step": 26 | |
| }, | |
| { | |
| "epoch": 1.4210526315789473, | |
| "grad_norm": 1.697496771812439, | |
| "learning_rate": 0.0004973017844529094, | |
| "loss": 1.0063, | |
| "step": 27 | |
| }, | |
| { | |
| "epoch": 1.4210526315789473, | |
| "eval_loss": 0.904453694820404, | |
| "eval_runtime": 0.8918, | |
| "eval_samples_per_second": 33.64, | |
| "eval_steps_per_second": 4.485, | |
| "step": 27 | |
| }, | |
| { | |
| "epoch": 1.4736842105263157, | |
| "grad_norm": 1.7305934429168701, | |
| "learning_rate": 0.0004967900006800708, | |
| "loss": 0.8483, | |
| "step": 28 | |
| }, | |
| { | |
| "epoch": 1.4736842105263157, | |
| "eval_loss": 0.876754879951477, | |
| "eval_runtime": 0.8933, | |
| "eval_samples_per_second": 33.584, | |
| "eval_steps_per_second": 4.478, | |
| "step": 28 | |
| }, | |
| { | |
| "epoch": 1.526315789473684, | |
| "grad_norm": 1.7766728401184082, | |
| "learning_rate": 0.000496234112182889, | |
| "loss": 1.0118, | |
| "step": 29 | |
| }, | |
| { | |
| "epoch": 1.526315789473684, | |
| "eval_loss": 0.9041274785995483, | |
| "eval_runtime": 0.8949, | |
| "eval_samples_per_second": 33.524, | |
| "eval_steps_per_second": 4.47, | |
| "step": 29 | |
| }, | |
| { | |
| "epoch": 1.5789473684210527, | |
| "grad_norm": 1.9015165567398071, | |
| "learning_rate": 0.000495634218306187, | |
| "loss": 0.8917, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 1.5789473684210527, | |
| "eval_loss": 0.8897702693939209, | |
| "eval_runtime": 0.8926, | |
| "eval_samples_per_second": 33.611, | |
| "eval_steps_per_second": 4.481, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 1.631578947368421, | |
| "grad_norm": 1.4804080724716187, | |
| "learning_rate": 0.0004949904262591467, | |
| "loss": 1.0084, | |
| "step": 31 | |
| }, | |
| { | |
| "epoch": 1.631578947368421, | |
| "eval_loss": 0.885962188243866, | |
| "eval_runtime": 0.8988, | |
| "eval_samples_per_second": 33.378, | |
| "eval_steps_per_second": 4.45, | |
| "step": 31 | |
| }, | |
| { | |
| "epoch": 1.6842105263157894, | |
| "grad_norm": 1.819899320602417, | |
| "learning_rate": 0.0004943028510961491, | |
| "loss": 0.969, | |
| "step": 32 | |
| }, | |
| { | |
| "epoch": 1.6842105263157894, | |
| "eval_loss": 0.8608292937278748, | |
| "eval_runtime": 0.8958, | |
| "eval_samples_per_second": 33.489, | |
| "eval_steps_per_second": 4.465, | |
| "step": 32 | |
| }, | |
| { | |
| "epoch": 1.736842105263158, | |
| "grad_norm": 2.8180196285247803, | |
| "learning_rate": 0.0004935716156962127, | |
| "loss": 1.1318, | |
| "step": 33 | |
| }, | |
| { | |
| "epoch": 1.736842105263158, | |
| "eval_loss": 0.875141978263855, | |
| "eval_runtime": 0.8971, | |
| "eval_samples_per_second": 33.441, | |
| "eval_steps_per_second": 4.459, | |
| "step": 33 | |
| }, | |
| { | |
| "epoch": 1.7894736842105263, | |
| "grad_norm": 1.8047230243682861, | |
| "learning_rate": 0.000492796850741033, | |
| "loss": 1.0002, | |
| "step": 34 | |
| }, | |
| { | |
| "epoch": 1.7894736842105263, | |
| "eval_loss": 0.89467453956604, | |
| "eval_runtime": 0.8966, | |
| "eval_samples_per_second": 33.46, | |
| "eval_steps_per_second": 4.461, | |
| "step": 34 | |
| }, | |
| { | |
| "epoch": 1.8421052631578947, | |
| "grad_norm": 2.6305246353149414, | |
| "learning_rate": 0.0004919786946916281, | |
| "loss": 1.1024, | |
| "step": 35 | |
| }, | |
| { | |
| "epoch": 1.8421052631578947, | |
| "eval_loss": 0.8359136581420898, | |
| "eval_runtime": 0.8971, | |
| "eval_samples_per_second": 33.44, | |
| "eval_steps_per_second": 4.459, | |
| "step": 35 | |
| }, | |
| { | |
| "epoch": 1.8947368421052633, | |
| "grad_norm": 2.4130873680114746, | |
| "learning_rate": 0.0004911172937635942, | |
| "loss": 0.9314, | |
| "step": 36 | |
| }, | |
| { | |
| "epoch": 1.8947368421052633, | |
| "eval_loss": 0.8058050274848938, | |
| "eval_runtime": 0.8959, | |
| "eval_samples_per_second": 33.487, | |
| "eval_steps_per_second": 4.465, | |
| "step": 36 | |
| }, | |
| { | |
| "epoch": 1.9473684210526314, | |
| "grad_norm": 1.580320119857788, | |
| "learning_rate": 0.0004902128019009741, | |
| "loss": 1.0036, | |
| "step": 37 | |
| }, | |
| { | |
| "epoch": 1.9473684210526314, | |
| "eval_loss": 0.7546663880348206, | |
| "eval_runtime": 0.8967, | |
| "eval_samples_per_second": 33.457, | |
| "eval_steps_per_second": 4.461, | |
| "step": 37 | |
| }, | |
| { | |
| "epoch": 2.0, | |
| "grad_norm": 1.6066155433654785, | |
| "learning_rate": 0.000489265380748746, | |
| "loss": 1.094, | |
| "step": 38 | |
| }, | |
| { | |
| "epoch": 2.0, | |
| "eval_loss": 0.8417730331420898, | |
| "eval_runtime": 0.895, | |
| "eval_samples_per_second": 33.519, | |
| "eval_steps_per_second": 4.469, | |
| "step": 38 | |
| }, | |
| { | |
| "epoch": 2.0526315789473686, | |
| "grad_norm": 2.4847571849823, | |
| "learning_rate": 0.0004882751996239352, | |
| "loss": 0.9106, | |
| "step": 39 | |
| }, | |
| { | |
| "epoch": 2.0526315789473686, | |
| "eval_loss": 0.805930495262146, | |
| "eval_runtime": 0.8985, | |
| "eval_samples_per_second": 33.388, | |
| "eval_steps_per_second": 4.452, | |
| "step": 39 | |
| }, | |
| { | |
| "epoch": 2.1052631578947367, | |
| "grad_norm": 2.144543409347534, | |
| "learning_rate": 0.0004872424354853545, | |
| "loss": 0.8542, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 2.1052631578947367, | |
| "eval_loss": 0.7550076842308044, | |
| "eval_runtime": 0.8977, | |
| "eval_samples_per_second": 33.42, | |
| "eval_steps_per_second": 4.456, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 2.1578947368421053, | |
| "grad_norm": 1.2767819166183472, | |
| "learning_rate": 0.0004861672729019797, | |
| "loss": 0.7569, | |
| "step": 41 | |
| }, | |
| { | |
| "epoch": 2.1578947368421053, | |
| "eval_loss": 0.720465362071991, | |
| "eval_runtime": 0.9013, | |
| "eval_samples_per_second": 33.285, | |
| "eval_steps_per_second": 4.438, | |
| "step": 41 | |
| }, | |
| { | |
| "epoch": 2.2105263157894735, | |
| "grad_norm": 1.4606373310089111, | |
| "learning_rate": 0.0004850499040199643, | |
| "loss": 0.6198, | |
| "step": 42 | |
| }, | |
| { | |
| "epoch": 2.2105263157894735, | |
| "eval_loss": 0.7800072431564331, | |
| "eval_runtime": 0.8938, | |
| "eval_samples_per_second": 33.564, | |
| "eval_steps_per_second": 4.475, | |
| "step": 42 | |
| }, | |
| { | |
| "epoch": 2.263157894736842, | |
| "grad_norm": 4.208314895629883, | |
| "learning_rate": 0.0004838905285283005, | |
| "loss": 0.8454, | |
| "step": 43 | |
| }, | |
| { | |
| "epoch": 2.263157894736842, | |
| "eval_loss": 0.7882384657859802, | |
| "eval_runtime": 0.8955, | |
| "eval_samples_per_second": 33.502, | |
| "eval_steps_per_second": 4.467, | |
| "step": 43 | |
| }, | |
| { | |
| "epoch": 2.3157894736842106, | |
| "grad_norm": 2.8906519412994385, | |
| "learning_rate": 0.00048268935362313215, | |
| "loss": 0.8786, | |
| "step": 44 | |
| }, | |
| { | |
| "epoch": 2.3157894736842106, | |
| "eval_loss": 0.7504675388336182, | |
| "eval_runtime": 0.8973, | |
| "eval_samples_per_second": 33.435, | |
| "eval_steps_per_second": 4.458, | |
| "step": 44 | |
| }, | |
| { | |
| "epoch": 2.3684210526315788, | |
| "grad_norm": 2.5608749389648438, | |
| "learning_rate": 0.00048144659397072586, | |
| "loss": 0.7165, | |
| "step": 45 | |
| }, | |
| { | |
| "epoch": 2.3684210526315788, | |
| "eval_loss": 0.7160356640815735, | |
| "eval_runtime": 0.8985, | |
| "eval_samples_per_second": 33.389, | |
| "eval_steps_per_second": 4.452, | |
| "step": 45 | |
| }, | |
| { | |
| "epoch": 2.4210526315789473, | |
| "grad_norm": 2.237501621246338, | |
| "learning_rate": 0.0004801624716691072, | |
| "loss": 0.9232, | |
| "step": 46 | |
| }, | |
| { | |
| "epoch": 2.4210526315789473, | |
| "eval_loss": 0.7007637619972229, | |
| "eval_runtime": 0.8986, | |
| "eval_samples_per_second": 33.387, | |
| "eval_steps_per_second": 4.452, | |
| "step": 46 | |
| }, | |
| { | |
| "epoch": 2.473684210526316, | |
| "grad_norm": 2.166039228439331, | |
| "learning_rate": 0.00047883721620836894, | |
| "loss": 0.782, | |
| "step": 47 | |
| }, | |
| { | |
| "epoch": 2.473684210526316, | |
| "eval_loss": 0.6951841711997986, | |
| "eval_runtime": 0.9007, | |
| "eval_samples_per_second": 33.308, | |
| "eval_steps_per_second": 4.441, | |
| "step": 47 | |
| }, | |
| { | |
| "epoch": 2.526315789473684, | |
| "grad_norm": 1.6499485969543457, | |
| "learning_rate": 0.0004774710644296578, | |
| "loss": 0.7387, | |
| "step": 48 | |
| }, | |
| { | |
| "epoch": 2.526315789473684, | |
| "eval_loss": 0.7041357755661011, | |
| "eval_runtime": 0.8999, | |
| "eval_samples_per_second": 33.337, | |
| "eval_steps_per_second": 4.445, | |
| "step": 48 | |
| }, | |
| { | |
| "epoch": 2.5789473684210527, | |
| "grad_norm": 2.833061456680298, | |
| "learning_rate": 0.00047606426048284813, | |
| "loss": 0.8343, | |
| "step": 49 | |
| }, | |
| { | |
| "epoch": 2.5789473684210527, | |
| "eval_loss": 0.6822550296783447, | |
| "eval_runtime": 0.9005, | |
| "eval_samples_per_second": 33.316, | |
| "eval_steps_per_second": 4.442, | |
| "step": 49 | |
| }, | |
| { | |
| "epoch": 2.6315789473684212, | |
| "grad_norm": 2.0135650634765625, | |
| "learning_rate": 0.00047461705578290833, | |
| "loss": 0.7768, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 2.6315789473684212, | |
| "eval_loss": 0.6283606886863708, | |
| "eval_runtime": 0.8974, | |
| "eval_samples_per_second": 33.428, | |
| "eval_steps_per_second": 4.457, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 2.6842105263157894, | |
| "grad_norm": 1.5658601522445679, | |
| "learning_rate": 0.0004731297089649703, | |
| "loss": 0.7418, | |
| "step": 51 | |
| }, | |
| { | |
| "epoch": 2.6842105263157894, | |
| "eval_loss": 0.6374291181564331, | |
| "eval_runtime": 0.8918, | |
| "eval_samples_per_second": 33.641, | |
| "eval_steps_per_second": 4.485, | |
| "step": 51 | |
| }, | |
| { | |
| "epoch": 2.736842105263158, | |
| "grad_norm": 1.7403415441513062, | |
| "learning_rate": 0.0004716024858381075, | |
| "loss": 0.7866, | |
| "step": 52 | |
| }, | |
| { | |
| "epoch": 2.736842105263158, | |
| "eval_loss": 0.6586597561836243, | |
| "eval_runtime": 0.8957, | |
| "eval_samples_per_second": 33.495, | |
| "eval_steps_per_second": 4.466, | |
| "step": 52 | |
| }, | |
| { | |
| "epoch": 2.7894736842105265, | |
| "grad_norm": 1.519404411315918, | |
| "learning_rate": 0.00047003565933783123, | |
| "loss": 0.8354, | |
| "step": 53 | |
| }, | |
| { | |
| "epoch": 2.7894736842105265, | |
| "eval_loss": 0.691727340221405, | |
| "eval_runtime": 0.8923, | |
| "eval_samples_per_second": 33.62, | |
| "eval_steps_per_second": 4.483, | |
| "step": 53 | |
| }, | |
| { | |
| "epoch": 2.8421052631578947, | |
| "grad_norm": 1.5139788389205933, | |
| "learning_rate": 0.0004684295094773134, | |
| "loss": 0.7804, | |
| "step": 54 | |
| }, | |
| { | |
| "epoch": 2.8421052631578947, | |
| "eval_loss": 0.6508743762969971, | |
| "eval_runtime": 0.8929, | |
| "eval_samples_per_second": 33.598, | |
| "eval_steps_per_second": 4.48, | |
| "step": 54 | |
| }, | |
| { | |
| "epoch": 2.8947368421052633, | |
| "grad_norm": 1.5480479001998901, | |
| "learning_rate": 0.00046678432329734434, | |
| "loss": 0.7253, | |
| "step": 55 | |
| }, | |
| { | |
| "epoch": 2.8947368421052633, | |
| "eval_loss": 0.6439611911773682, | |
| "eval_runtime": 0.894, | |
| "eval_samples_per_second": 33.557, | |
| "eval_steps_per_second": 4.474, | |
| "step": 55 | |
| }, | |
| { | |
| "epoch": 2.9473684210526314, | |
| "grad_norm": 1.5994068384170532, | |
| "learning_rate": 0.00046510039481503486, | |
| "loss": 0.842, | |
| "step": 56 | |
| }, | |
| { | |
| "epoch": 2.9473684210526314, | |
| "eval_loss": 0.6327024698257446, | |
| "eval_runtime": 0.9041, | |
| "eval_samples_per_second": 33.184, | |
| "eval_steps_per_second": 4.424, | |
| "step": 56 | |
| }, | |
| { | |
| "epoch": 3.0, | |
| "grad_norm": 1.6054733991622925, | |
| "learning_rate": 0.00046337802497127117, | |
| "loss": 0.8073, | |
| "step": 57 | |
| }, | |
| { | |
| "epoch": 3.0, | |
| "eval_loss": 0.6213096976280212, | |
| "eval_runtime": 0.8992, | |
| "eval_samples_per_second": 33.362, | |
| "eval_steps_per_second": 4.448, | |
| "step": 57 | |
| }, | |
| { | |
| "epoch": 3.0526315789473686, | |
| "grad_norm": 2.5787405967712402, | |
| "learning_rate": 0.00046161752157693284, | |
| "loss": 0.6017, | |
| "step": 58 | |
| }, | |
| { | |
| "epoch": 3.0526315789473686, | |
| "eval_loss": 0.5892248749732971, | |
| "eval_runtime": 0.8922, | |
| "eval_samples_per_second": 33.624, | |
| "eval_steps_per_second": 4.483, | |
| "step": 58 | |
| }, | |
| { | |
| "epoch": 3.1052631578947367, | |
| "grad_norm": 1.7601501941680908, | |
| "learning_rate": 0.0004598191992578828, | |
| "loss": 0.6071, | |
| "step": 59 | |
| }, | |
| { | |
| "epoch": 3.1052631578947367, | |
| "eval_loss": 0.5735067129135132, | |
| "eval_runtime": 0.8924, | |
| "eval_samples_per_second": 33.618, | |
| "eval_steps_per_second": 4.482, | |
| "step": 59 | |
| }, | |
| { | |
| "epoch": 3.1578947368421053, | |
| "grad_norm": 1.7480543851852417, | |
| "learning_rate": 0.00045798337939873923, | |
| "loss": 0.6597, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 3.1578947368421053, | |
| "eval_loss": 0.5306870341300964, | |
| "eval_runtime": 0.8938, | |
| "eval_samples_per_second": 33.566, | |
| "eval_steps_per_second": 4.475, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 3.2105263157894735, | |
| "grad_norm": 2.3808937072753906, | |
| "learning_rate": 0.0004561103900854401, | |
| "loss": 0.5372, | |
| "step": 61 | |
| }, | |
| { | |
| "epoch": 3.2105263157894735, | |
| "eval_loss": 0.535223662853241, | |
| "eval_runtime": 0.8966, | |
| "eval_samples_per_second": 33.459, | |
| "eval_steps_per_second": 4.461, | |
| "step": 61 | |
| }, | |
| { | |
| "epoch": 3.263157894736842, | |
| "grad_norm": 1.8272178173065186, | |
| "learning_rate": 0.0004542005660466094, | |
| "loss": 0.5399, | |
| "step": 62 | |
| }, | |
| { | |
| "epoch": 3.263157894736842, | |
| "eval_loss": 0.5316082239151001, | |
| "eval_runtime": 0.8994, | |
| "eval_samples_per_second": 33.354, | |
| "eval_steps_per_second": 4.447, | |
| "step": 62 | |
| }, | |
| { | |
| "epoch": 3.3157894736842106, | |
| "grad_norm": 2.0635435581207275, | |
| "learning_rate": 0.0004522542485937369, | |
| "loss": 0.5531, | |
| "step": 63 | |
| }, | |
| { | |
| "epoch": 3.3157894736842106, | |
| "eval_loss": 0.5134085416793823, | |
| "eval_runtime": 0.8937, | |
| "eval_samples_per_second": 33.567, | |
| "eval_steps_per_second": 4.476, | |
| "step": 63 | |
| }, | |
| { | |
| "epoch": 3.3684210526315788, | |
| "grad_norm": 2.268183708190918, | |
| "learning_rate": 0.0004502717855601809, | |
| "loss": 0.5291, | |
| "step": 64 | |
| }, | |
| { | |
| "epoch": 3.3684210526315788, | |
| "eval_loss": 0.5419598817825317, | |
| "eval_runtime": 0.8959, | |
| "eval_samples_per_second": 33.486, | |
| "eval_steps_per_second": 4.465, | |
| "step": 64 | |
| }, | |
| { | |
| "epoch": 3.4210526315789473, | |
| "grad_norm": 1.8800358772277832, | |
| "learning_rate": 0.0004482535312390058, | |
| "loss": 0.5501, | |
| "step": 65 | |
| }, | |
| { | |
| "epoch": 3.4210526315789473, | |
| "eval_loss": 0.5209227800369263, | |
| "eval_runtime": 0.8927, | |
| "eval_samples_per_second": 33.606, | |
| "eval_steps_per_second": 4.481, | |
| "step": 65 | |
| }, | |
| { | |
| "epoch": 3.473684210526316, | |
| "grad_norm": 3.1507558822631836, | |
| "learning_rate": 0.00044619984631966527, | |
| "loss": 0.5309, | |
| "step": 66 | |
| }, | |
| { | |
| "epoch": 3.473684210526316, | |
| "eval_loss": 0.536996603012085, | |
| "eval_runtime": 0.8951, | |
| "eval_samples_per_second": 33.517, | |
| "eval_steps_per_second": 4.469, | |
| "step": 66 | |
| }, | |
| { | |
| "epoch": 3.526315789473684, | |
| "grad_norm": 3.5700478553771973, | |
| "learning_rate": 0.0004441110978235418, | |
| "loss": 0.7223, | |
| "step": 67 | |
| }, | |
| { | |
| "epoch": 3.526315789473684, | |
| "eval_loss": 0.5140640139579773, | |
| "eval_runtime": 0.8962, | |
| "eval_samples_per_second": 33.474, | |
| "eval_steps_per_second": 4.463, | |
| "step": 67 | |
| }, | |
| { | |
| "epoch": 3.5789473684210527, | |
| "grad_norm": 1.758971929550171, | |
| "learning_rate": 0.0004419876590383554, | |
| "loss": 0.6927, | |
| "step": 68 | |
| }, | |
| { | |
| "epoch": 3.5789473684210527, | |
| "eval_loss": 0.47072505950927734, | |
| "eval_runtime": 0.9127, | |
| "eval_samples_per_second": 32.87, | |
| "eval_steps_per_second": 4.383, | |
| "step": 68 | |
| }, | |
| { | |
| "epoch": 3.6315789473684212, | |
| "grad_norm": 1.5274709463119507, | |
| "learning_rate": 0.00043982990945145146, | |
| "loss": 0.4762, | |
| "step": 69 | |
| }, | |
| { | |
| "epoch": 3.6315789473684212, | |
| "eval_loss": 0.4518219828605652, | |
| "eval_runtime": 0.8967, | |
| "eval_samples_per_second": 33.456, | |
| "eval_steps_per_second": 4.461, | |
| "step": 69 | |
| }, | |
| { | |
| "epoch": 3.6842105263157894, | |
| "grad_norm": 1.7685797214508057, | |
| "learning_rate": 0.0004376382346819819, | |
| "loss": 0.5629, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 3.6842105263157894, | |
| "eval_loss": 0.40707579255104065, | |
| "eval_runtime": 0.8934, | |
| "eval_samples_per_second": 33.581, | |
| "eval_steps_per_second": 4.478, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 3.736842105263158, | |
| "grad_norm": 1.6618574857711792, | |
| "learning_rate": 0.00043541302641198946, | |
| "loss": 0.5877, | |
| "step": 71 | |
| }, | |
| { | |
| "epoch": 3.736842105263158, | |
| "eval_loss": 0.3780651390552521, | |
| "eval_runtime": 0.9024, | |
| "eval_samples_per_second": 33.246, | |
| "eval_steps_per_second": 4.433, | |
| "step": 71 | |
| }, | |
| { | |
| "epoch": 3.7894736842105265, | |
| "grad_norm": 1.542702317237854, | |
| "learning_rate": 0.00043315468231640834, | |
| "loss": 0.5222, | |
| "step": 72 | |
| }, | |
| { | |
| "epoch": 3.7894736842105265, | |
| "eval_loss": 0.3732970356941223, | |
| "eval_runtime": 0.9166, | |
| "eval_samples_per_second": 32.73, | |
| "eval_steps_per_second": 4.364, | |
| "step": 72 | |
| }, | |
| { | |
| "epoch": 3.8421052631578947, | |
| "grad_norm": 1.8039391040802002, | |
| "learning_rate": 0.00043086360599199516, | |
| "loss": 0.5238, | |
| "step": 73 | |
| }, | |
| { | |
| "epoch": 3.8421052631578947, | |
| "eval_loss": 0.3568810820579529, | |
| "eval_runtime": 0.9031, | |
| "eval_samples_per_second": 33.218, | |
| "eval_steps_per_second": 4.429, | |
| "step": 73 | |
| }, | |
| { | |
| "epoch": 3.8947368421052633, | |
| "grad_norm": 1.6215863227844238, | |
| "learning_rate": 0.0004285402068852002, | |
| "loss": 0.6504, | |
| "step": 74 | |
| }, | |
| { | |
| "epoch": 3.8947368421052633, | |
| "eval_loss": 0.3885921835899353, | |
| "eval_runtime": 0.896, | |
| "eval_samples_per_second": 33.483, | |
| "eval_steps_per_second": 4.464, | |
| "step": 74 | |
| }, | |
| { | |
| "epoch": 3.9473684210526314, | |
| "grad_norm": 1.5152952671051025, | |
| "learning_rate": 0.00042618490021899383, | |
| "loss": 0.5694, | |
| "step": 75 | |
| }, | |
| { | |
| "epoch": 3.9473684210526314, | |
| "eval_loss": 0.38745489716529846, | |
| "eval_runtime": 0.8939, | |
| "eval_samples_per_second": 33.562, | |
| "eval_steps_per_second": 4.475, | |
| "step": 75 | |
| }, | |
| { | |
| "epoch": 4.0, | |
| "grad_norm": 2.6989200115203857, | |
| "learning_rate": 0.00042379810691866064, | |
| "loss": 0.5849, | |
| "step": 76 | |
| }, | |
| { | |
| "epoch": 4.0, | |
| "eval_loss": 0.42535698413848877, | |
| "eval_runtime": 0.9073, | |
| "eval_samples_per_second": 33.066, | |
| "eval_steps_per_second": 4.409, | |
| "step": 76 | |
| }, | |
| { | |
| "epoch": 4.052631578947368, | |
| "grad_norm": 1.7381691932678223, | |
| "learning_rate": 0.00042138025353657407, | |
| "loss": 0.3779, | |
| "step": 77 | |
| }, | |
| { | |
| "epoch": 4.052631578947368, | |
| "eval_loss": 0.37115439772605896, | |
| "eval_runtime": 0.9112, | |
| "eval_samples_per_second": 32.922, | |
| "eval_steps_per_second": 4.39, | |
| "step": 77 | |
| }, | |
| { | |
| "epoch": 4.105263157894737, | |
| "grad_norm": 2.188385248184204, | |
| "learning_rate": 0.00041893177217596633, | |
| "loss": 0.44, | |
| "step": 78 | |
| }, | |
| { | |
| "epoch": 4.105263157894737, | |
| "eval_loss": 0.2926563322544098, | |
| "eval_runtime": 0.8982, | |
| "eval_samples_per_second": 33.401, | |
| "eval_steps_per_second": 4.453, | |
| "step": 78 | |
| }, | |
| { | |
| "epoch": 4.157894736842105, | |
| "grad_norm": 2.3652961254119873, | |
| "learning_rate": 0.0004164531004137049, | |
| "loss": 0.3639, | |
| "step": 79 | |
| }, | |
| { | |
| "epoch": 4.157894736842105, | |
| "eval_loss": 0.2751067876815796, | |
| "eval_runtime": 0.9146, | |
| "eval_samples_per_second": 32.8, | |
| "eval_steps_per_second": 4.373, | |
| "step": 79 | |
| }, | |
| { | |
| "epoch": 4.2105263157894735, | |
| "grad_norm": 2.165874719619751, | |
| "learning_rate": 0.0004139446812220924, | |
| "loss": 0.2683, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 4.2105263157894735, | |
| "eval_loss": 0.2685202360153198, | |
| "eval_runtime": 0.9124, | |
| "eval_samples_per_second": 32.881, | |
| "eval_steps_per_second": 4.384, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 4.2631578947368425, | |
| "grad_norm": 1.7391912937164307, | |
| "learning_rate": 0.0004114069628897006, | |
| "loss": 0.2993, | |
| "step": 81 | |
| }, | |
| { | |
| "epoch": 4.2631578947368425, | |
| "eval_loss": 0.33646491169929504, | |
| "eval_runtime": 0.8952, | |
| "eval_samples_per_second": 33.51, | |
| "eval_steps_per_second": 4.468, | |
| "step": 81 | |
| }, | |
| { | |
| "epoch": 4.315789473684211, | |
| "grad_norm": 3.65714693069458, | |
| "learning_rate": 0.0004088403989412559, | |
| "loss": 0.4252, | |
| "step": 82 | |
| }, | |
| { | |
| "epoch": 4.315789473684211, | |
| "eval_loss": 0.2839888632297516, | |
| "eval_runtime": 0.9057, | |
| "eval_samples_per_second": 33.123, | |
| "eval_steps_per_second": 4.416, | |
| "step": 82 | |
| }, | |
| { | |
| "epoch": 4.368421052631579, | |
| "grad_norm": 2.1762771606445312, | |
| "learning_rate": 0.00040624544805658794, | |
| "loss": 0.3304, | |
| "step": 83 | |
| }, | |
| { | |
| "epoch": 4.368421052631579, | |
| "eval_loss": 0.27002134919166565, | |
| "eval_runtime": 0.8939, | |
| "eval_samples_per_second": 33.562, | |
| "eval_steps_per_second": 4.475, | |
| "step": 83 | |
| }, | |
| { | |
| "epoch": 4.421052631578947, | |
| "grad_norm": 2.1018354892730713, | |
| "learning_rate": 0.00040362257398865713, | |
| "loss": 0.4506, | |
| "step": 84 | |
| }, | |
| { | |
| "epoch": 4.421052631578947, | |
| "eval_loss": 0.2557659149169922, | |
| "eval_runtime": 0.8969, | |
| "eval_samples_per_second": 33.45, | |
| "eval_steps_per_second": 4.46, | |
| "step": 84 | |
| }, | |
| { | |
| "epoch": 4.473684210526316, | |
| "grad_norm": 1.7509180307388306, | |
| "learning_rate": 0.00040097224548067613, | |
| "loss": 0.3731, | |
| "step": 85 | |
| }, | |
| { | |
| "epoch": 4.473684210526316, | |
| "eval_loss": 0.26859304308891296, | |
| "eval_runtime": 0.9009, | |
| "eval_samples_per_second": 33.299, | |
| "eval_steps_per_second": 4.44, | |
| "step": 85 | |
| }, | |
| { | |
| "epoch": 4.526315789473684, | |
| "grad_norm": 1.971816897392273, | |
| "learning_rate": 0.0003982949361823388, | |
| "loss": 0.38, | |
| "step": 86 | |
| }, | |
| { | |
| "epoch": 4.526315789473684, | |
| "eval_loss": 0.2624681293964386, | |
| "eval_runtime": 0.8949, | |
| "eval_samples_per_second": 33.524, | |
| "eval_steps_per_second": 4.47, | |
| "step": 86 | |
| }, | |
| { | |
| "epoch": 4.578947368421053, | |
| "grad_norm": 1.4714068174362183, | |
| "learning_rate": 0.0003955911245651726, | |
| "loss": 0.3944, | |
| "step": 87 | |
| }, | |
| { | |
| "epoch": 4.578947368421053, | |
| "eval_loss": 0.23652420938014984, | |
| "eval_runtime": 0.8952, | |
| "eval_samples_per_second": 33.511, | |
| "eval_steps_per_second": 4.468, | |
| "step": 87 | |
| }, | |
| { | |
| "epoch": 4.631578947368421, | |
| "grad_norm": 2.6970834732055664, | |
| "learning_rate": 0.0003928612938370292, | |
| "loss": 0.3374, | |
| "step": 88 | |
| }, | |
| { | |
| "epoch": 4.631578947368421, | |
| "eval_loss": 0.2716277241706848, | |
| "eval_runtime": 0.8932, | |
| "eval_samples_per_second": 33.588, | |
| "eval_steps_per_second": 4.478, | |
| "step": 88 | |
| }, | |
| { | |
| "epoch": 4.684210526315789, | |
| "grad_norm": 1.9066615104675293, | |
| "learning_rate": 0.00039010593185572867, | |
| "loss": 0.2442, | |
| "step": 89 | |
| }, | |
| { | |
| "epoch": 4.684210526315789, | |
| "eval_loss": 0.2999991476535797, | |
| "eval_runtime": 0.8939, | |
| "eval_samples_per_second": 33.559, | |
| "eval_steps_per_second": 4.475, | |
| "step": 89 | |
| }, | |
| { | |
| "epoch": 4.7368421052631575, | |
| "grad_norm": 2.6232354640960693, | |
| "learning_rate": 0.00038732553104187296, | |
| "loss": 0.2857, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 4.7368421052631575, | |
| "eval_loss": 0.2302989959716797, | |
| "eval_runtime": 0.8938, | |
| "eval_samples_per_second": 33.564, | |
| "eval_steps_per_second": 4.475, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 4.7894736842105265, | |
| "grad_norm": 2.0710129737854004, | |
| "learning_rate": 0.0003845205882908432, | |
| "loss": 0.4195, | |
| "step": 91 | |
| }, | |
| { | |
| "epoch": 4.7894736842105265, | |
| "eval_loss": 0.21816590428352356, | |
| "eval_runtime": 0.9251, | |
| "eval_samples_per_second": 32.429, | |
| "eval_steps_per_second": 4.324, | |
| "step": 91 | |
| }, | |
| { | |
| "epoch": 4.842105263157895, | |
| "grad_norm": 1.8006062507629395, | |
| "learning_rate": 0.0003816916048839979, | |
| "loss": 0.2859, | |
| "step": 92 | |
| }, | |
| { | |
| "epoch": 4.842105263157895, | |
| "eval_loss": 0.21071405708789825, | |
| "eval_runtime": 0.8965, | |
| "eval_samples_per_second": 33.462, | |
| "eval_steps_per_second": 4.462, | |
| "step": 92 | |
| }, | |
| { | |
| "epoch": 4.894736842105263, | |
| "grad_norm": 1.6352888345718384, | |
| "learning_rate": 0.0003788390863990875, | |
| "loss": 0.4275, | |
| "step": 93 | |
| }, | |
| { | |
| "epoch": 4.894736842105263, | |
| "eval_loss": 0.20206846296787262, | |
| "eval_runtime": 0.9052, | |
| "eval_samples_per_second": 33.144, | |
| "eval_steps_per_second": 4.419, | |
| "step": 93 | |
| }, | |
| { | |
| "epoch": 4.947368421052632, | |
| "grad_norm": 1.6399378776550293, | |
| "learning_rate": 0.00037596354261990007, | |
| "loss": 0.389, | |
| "step": 94 | |
| }, | |
| { | |
| "epoch": 4.947368421052632, | |
| "eval_loss": 0.19467315077781677, | |
| "eval_runtime": 0.8973, | |
| "eval_samples_per_second": 33.435, | |
| "eval_steps_per_second": 4.458, | |
| "step": 94 | |
| }, | |
| { | |
| "epoch": 5.0, | |
| "grad_norm": 1.5680173635482788, | |
| "learning_rate": 0.0003730654874451569, | |
| "loss": 0.395, | |
| "step": 95 | |
| }, | |
| { | |
| "epoch": 5.0, | |
| "eval_loss": 0.19546455144882202, | |
| "eval_runtime": 0.91, | |
| "eval_samples_per_second": 32.968, | |
| "eval_steps_per_second": 4.396, | |
| "step": 95 | |
| }, | |
| { | |
| "epoch": 5.052631578947368, | |
| "grad_norm": 1.0308386087417603, | |
| "learning_rate": 0.00037014543879667093, | |
| "loss": 0.1384, | |
| "step": 96 | |
| }, | |
| { | |
| "epoch": 5.052631578947368, | |
| "eval_loss": 0.18969732522964478, | |
| "eval_runtime": 0.9021, | |
| "eval_samples_per_second": 33.258, | |
| "eval_steps_per_second": 4.434, | |
| "step": 96 | |
| }, | |
| { | |
| "epoch": 5.105263157894737, | |
| "grad_norm": 1.4042502641677856, | |
| "learning_rate": 0.0003672039185267878, | |
| "loss": 0.2291, | |
| "step": 97 | |
| }, | |
| { | |
| "epoch": 5.105263157894737, | |
| "eval_loss": 0.16800740361213684, | |
| "eval_runtime": 0.8938, | |
| "eval_samples_per_second": 33.563, | |
| "eval_steps_per_second": 4.475, | |
| "step": 97 | |
| }, | |
| { | |
| "epoch": 5.157894736842105, | |
| "grad_norm": 1.6313552856445312, | |
| "learning_rate": 0.00036424145232512333, | |
| "loss": 0.1736, | |
| "step": 98 | |
| }, | |
| { | |
| "epoch": 5.157894736842105, | |
| "eval_loss": 0.16714099049568176, | |
| "eval_runtime": 0.9009, | |
| "eval_samples_per_second": 33.301, | |
| "eval_steps_per_second": 4.44, | |
| "step": 98 | |
| }, | |
| { | |
| "epoch": 5.2105263157894735, | |
| "grad_norm": 1.8922698497772217, | |
| "learning_rate": 0.0003612585696246158, | |
| "loss": 0.1677, | |
| "step": 99 | |
| }, | |
| { | |
| "epoch": 5.2105263157894735, | |
| "eval_loss": 0.179762065410614, | |
| "eval_runtime": 0.9039, | |
| "eval_samples_per_second": 33.188, | |
| "eval_steps_per_second": 4.425, | |
| "step": 99 | |
| }, | |
| { | |
| "epoch": 5.2631578947368425, | |
| "grad_norm": 2.409526824951172, | |
| "learning_rate": 0.0003582558035069091, | |
| "loss": 0.2379, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 5.2631578947368425, | |
| "eval_loss": 0.1902371197938919, | |
| "eval_runtime": 0.9097, | |
| "eval_samples_per_second": 32.98, | |
| "eval_steps_per_second": 4.397, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 5.315789473684211, | |
| "grad_norm": 2.084869146347046, | |
| "learning_rate": 0.0003552336906070838, | |
| "loss": 0.2165, | |
| "step": 101 | |
| }, | |
| { | |
| "epoch": 5.315789473684211, | |
| "eval_loss": 0.17252177000045776, | |
| "eval_runtime": 0.8948, | |
| "eval_samples_per_second": 33.528, | |
| "eval_steps_per_second": 4.47, | |
| "step": 101 | |
| }, | |
| { | |
| "epoch": 5.368421052631579, | |
| "grad_norm": 1.655718207359314, | |
| "learning_rate": 0.000352192771017753, | |
| "loss": 0.223, | |
| "step": 102 | |
| }, | |
| { | |
| "epoch": 5.368421052631579, | |
| "eval_loss": 0.18867380917072296, | |
| "eval_runtime": 0.8956, | |
| "eval_samples_per_second": 33.495, | |
| "eval_steps_per_second": 4.466, | |
| "step": 102 | |
| }, | |
| { | |
| "epoch": 5.421052631578947, | |
| "grad_norm": 2.672633409500122, | |
| "learning_rate": 0.0003491335881925407, | |
| "loss": 0.161, | |
| "step": 103 | |
| }, | |
| { | |
| "epoch": 5.421052631578947, | |
| "eval_loss": 0.1944020837545395, | |
| "eval_runtime": 0.8924, | |
| "eval_samples_per_second": 33.616, | |
| "eval_steps_per_second": 4.482, | |
| "step": 103 | |
| }, | |
| { | |
| "epoch": 5.473684210526316, | |
| "grad_norm": 1.9712008237838745, | |
| "learning_rate": 0.0003460566888489593, | |
| "loss": 0.2525, | |
| "step": 104 | |
| }, | |
| { | |
| "epoch": 5.473684210526316, | |
| "eval_loss": 0.17671068012714386, | |
| "eval_runtime": 0.897, | |
| "eval_samples_per_second": 33.446, | |
| "eval_steps_per_second": 4.459, | |
| "step": 104 | |
| }, | |
| { | |
| "epoch": 5.526315789473684, | |
| "grad_norm": 2.2153072357177734, | |
| "learning_rate": 0.00034296262287070335, | |
| "loss": 0.2105, | |
| "step": 105 | |
| }, | |
| { | |
| "epoch": 5.526315789473684, | |
| "eval_loss": 0.1715732216835022, | |
| "eval_runtime": 0.8951, | |
| "eval_samples_per_second": 33.514, | |
| "eval_steps_per_second": 4.469, | |
| "step": 105 | |
| }, | |
| { | |
| "epoch": 5.578947368421053, | |
| "grad_norm": 1.8106168508529663, | |
| "learning_rate": 0.0003398519432093782, | |
| "loss": 0.259, | |
| "step": 106 | |
| }, | |
| { | |
| "epoch": 5.578947368421053, | |
| "eval_loss": 0.1465868353843689, | |
| "eval_runtime": 0.9077, | |
| "eval_samples_per_second": 33.051, | |
| "eval_steps_per_second": 4.407, | |
| "step": 106 | |
| }, | |
| { | |
| "epoch": 5.631578947368421, | |
| "grad_norm": 2.1159439086914062, | |
| "learning_rate": 0.0003367252057856802, | |
| "loss": 0.2065, | |
| "step": 107 | |
| }, | |
| { | |
| "epoch": 5.631578947368421, | |
| "eval_loss": 0.14219093322753906, | |
| "eval_runtime": 0.9049, | |
| "eval_samples_per_second": 33.154, | |
| "eval_steps_per_second": 4.42, | |
| "step": 107 | |
| }, | |
| { | |
| "epoch": 5.684210526315789, | |
| "grad_norm": 1.4467761516571045, | |
| "learning_rate": 0.00033358296939004547, | |
| "loss": 0.2083, | |
| "step": 108 | |
| }, | |
| { | |
| "epoch": 5.684210526315789, | |
| "eval_loss": 0.1406753957271576, | |
| "eval_runtime": 0.8954, | |
| "eval_samples_per_second": 33.505, | |
| "eval_steps_per_second": 4.467, | |
| "step": 108 | |
| }, | |
| { | |
| "epoch": 5.7368421052631575, | |
| "grad_norm": 1.3671239614486694, | |
| "learning_rate": 0.00033042579558278717, | |
| "loss": 0.1825, | |
| "step": 109 | |
| }, | |
| { | |
| "epoch": 5.7368421052631575, | |
| "eval_loss": 0.13007155060768127, | |
| "eval_runtime": 0.8998, | |
| "eval_samples_per_second": 33.342, | |
| "eval_steps_per_second": 4.446, | |
| "step": 109 | |
| }, | |
| { | |
| "epoch": 5.7894736842105265, | |
| "grad_norm": 1.479944109916687, | |
| "learning_rate": 0.00032725424859373687, | |
| "loss": 0.2244, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 5.7894736842105265, | |
| "eval_loss": 0.12692232429981232, | |
| "eval_runtime": 0.901, | |
| "eval_samples_per_second": 33.298, | |
| "eval_steps_per_second": 4.44, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 5.842105263157895, | |
| "grad_norm": 1.5173969268798828, | |
| "learning_rate": 0.0003240688952214085, | |
| "loss": 0.2273, | |
| "step": 111 | |
| }, | |
| { | |
| "epoch": 5.842105263157895, | |
| "eval_loss": 0.12454597651958466, | |
| "eval_runtime": 0.8987, | |
| "eval_samples_per_second": 33.382, | |
| "eval_steps_per_second": 4.451, | |
| "step": 111 | |
| }, | |
| { | |
| "epoch": 5.894736842105263, | |
| "grad_norm": 2.7870988845825195, | |
| "learning_rate": 0.00032087030473170445, | |
| "loss": 0.2101, | |
| "step": 112 | |
| }, | |
| { | |
| "epoch": 5.894736842105263, | |
| "eval_loss": 0.12002909928560257, | |
| "eval_runtime": 0.893, | |
| "eval_samples_per_second": 33.593, | |
| "eval_steps_per_second": 4.479, | |
| "step": 112 | |
| }, | |
| { | |
| "epoch": 5.947368421052632, | |
| "grad_norm": 1.3659342527389526, | |
| "learning_rate": 0.00031765904875617973, | |
| "loss": 0.1882, | |
| "step": 113 | |
| }, | |
| { | |
| "epoch": 5.947368421052632, | |
| "eval_loss": 0.10573837906122208, | |
| "eval_runtime": 0.8956, | |
| "eval_samples_per_second": 33.496, | |
| "eval_steps_per_second": 4.466, | |
| "step": 113 | |
| }, | |
| { | |
| "epoch": 6.0, | |
| "grad_norm": 1.8464044332504272, | |
| "learning_rate": 0.00031443570118988356, | |
| "loss": 0.2285, | |
| "step": 114 | |
| }, | |
| { | |
| "epoch": 6.0, | |
| "eval_loss": 0.10221625119447708, | |
| "eval_runtime": 0.8955, | |
| "eval_samples_per_second": 33.501, | |
| "eval_steps_per_second": 4.467, | |
| "step": 114 | |
| }, | |
| { | |
| "epoch": 6.052631578947368, | |
| "grad_norm": 1.3894392251968384, | |
| "learning_rate": 0.00031120083808879663, | |
| "loss": 0.1115, | |
| "step": 115 | |
| }, | |
| { | |
| "epoch": 6.052631578947368, | |
| "eval_loss": 0.09458151459693909, | |
| "eval_runtime": 0.8981, | |
| "eval_samples_per_second": 33.405, | |
| "eval_steps_per_second": 4.454, | |
| "step": 115 | |
| } | |
| ], | |
| "logging_steps": 1, | |
| "max_steps": 250, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 14, | |
| "save_steps": 5, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": false | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 4901149662148608.0, | |
| "train_batch_size": 1, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |