{ "best_global_step": 55, "best_metric": 0.7241045236587524, "best_model_checkpoint": "/content/drive/MyDrive/lora_model/outputs/task15_microsoft/Phi-4-mini-instruct/checkpoint-55", "epoch": 2.8947368421052633, "eval_steps": 1, "global_step": 55, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.05263157894736842, "grad_norm": 0.7188231945037842, "learning_rate": 0.0, "loss": 3.2235, "step": 1 }, { "epoch": 0.05263157894736842, "eval_loss": 3.15524959564209, "eval_runtime": 3.3312, "eval_samples_per_second": 9.006, "eval_steps_per_second": 1.201, "step": 1 }, { "epoch": 0.10526315789473684, "grad_norm": 0.766629159450531, "learning_rate": 3.3333333333333335e-05, "loss": 3.165, "step": 2 }, { "epoch": 0.10526315789473684, "eval_loss": 3.1018595695495605, "eval_runtime": 3.28, "eval_samples_per_second": 9.146, "eval_steps_per_second": 1.219, "step": 2 }, { "epoch": 0.15789473684210525, "grad_norm": 0.6206756234169006, "learning_rate": 6.666666666666667e-05, "loss": 2.8628, "step": 3 }, { "epoch": 0.15789473684210525, "eval_loss": 2.97302508354187, "eval_runtime": 3.2899, "eval_samples_per_second": 9.119, "eval_steps_per_second": 1.216, "step": 3 }, { "epoch": 0.21052631578947367, "grad_norm": 0.6644885540008545, "learning_rate": 0.0001, "loss": 2.9711, "step": 4 }, { "epoch": 0.21052631578947367, "eval_loss": 2.762944221496582, "eval_runtime": 3.2987, "eval_samples_per_second": 9.095, "eval_steps_per_second": 1.213, "step": 4 }, { "epoch": 0.2631578947368421, "grad_norm": 0.6135285496711731, "learning_rate": 0.00013333333333333334, "loss": 2.7061, "step": 5 }, { "epoch": 0.2631578947368421, "eval_loss": 2.5087203979492188, "eval_runtime": 3.3091, "eval_samples_per_second": 9.066, "eval_steps_per_second": 1.209, "step": 5 }, { "epoch": 0.3157894736842105, "grad_norm": 0.5422775745391846, "learning_rate": 0.00016666666666666666, "loss": 2.4032, "step": 6 }, { "epoch": 0.3157894736842105, "eval_loss": 2.270092725753784, "eval_runtime": 3.3142, "eval_samples_per_second": 9.052, "eval_steps_per_second": 1.207, "step": 6 }, { "epoch": 0.3684210526315789, "grad_norm": 0.5579596161842346, "learning_rate": 0.0002, "loss": 2.272, "step": 7 }, { "epoch": 0.3684210526315789, "eval_loss": 2.0614399909973145, "eval_runtime": 3.3233, "eval_samples_per_second": 9.027, "eval_steps_per_second": 1.204, "step": 7 }, { "epoch": 0.42105263157894735, "grad_norm": 0.7365043759346008, "learning_rate": 0.00023333333333333333, "loss": 2.0297, "step": 8 }, { "epoch": 0.42105263157894735, "eval_loss": 1.8437634706497192, "eval_runtime": 3.3264, "eval_samples_per_second": 9.019, "eval_steps_per_second": 1.202, "step": 8 }, { "epoch": 0.47368421052631576, "grad_norm": 0.7677823901176453, "learning_rate": 0.0002666666666666667, "loss": 1.8911, "step": 9 }, { "epoch": 0.47368421052631576, "eval_loss": 1.615093469619751, "eval_runtime": 3.3357, "eval_samples_per_second": 8.994, "eval_steps_per_second": 1.199, "step": 9 }, { "epoch": 0.5263157894736842, "grad_norm": 0.7033586502075195, "learning_rate": 0.0003, "loss": 1.654, "step": 10 }, { "epoch": 0.5263157894736842, "eval_loss": 1.4461504220962524, "eval_runtime": 3.3549, "eval_samples_per_second": 8.942, "eval_steps_per_second": 1.192, "step": 10 }, { "epoch": 0.5789473684210527, "grad_norm": 0.721517026424408, "learning_rate": 0.0003333333333333333, "loss": 1.5364, "step": 11 }, { "epoch": 0.5789473684210527, "eval_loss": 1.3645799160003662, "eval_runtime": 3.361, "eval_samples_per_second": 8.926, "eval_steps_per_second": 1.19, "step": 11 }, { "epoch": 0.631578947368421, "grad_norm": 0.7304323315620422, "learning_rate": 0.00036666666666666667, "loss": 1.3689, "step": 12 }, { "epoch": 0.631578947368421, "eval_loss": 1.272360920906067, "eval_runtime": 3.3759, "eval_samples_per_second": 8.887, "eval_steps_per_second": 1.185, "step": 12 }, { "epoch": 0.6842105263157895, "grad_norm": 0.6370911002159119, "learning_rate": 0.0004, "loss": 1.329, "step": 13 }, { "epoch": 0.6842105263157895, "eval_loss": 1.19339120388031, "eval_runtime": 3.3835, "eval_samples_per_second": 8.867, "eval_steps_per_second": 1.182, "step": 13 }, { "epoch": 0.7368421052631579, "grad_norm": 0.5493318438529968, "learning_rate": 0.00043333333333333337, "loss": 1.1991, "step": 14 }, { "epoch": 0.7368421052631579, "eval_loss": 1.154818058013916, "eval_runtime": 3.3971, "eval_samples_per_second": 8.831, "eval_steps_per_second": 1.177, "step": 14 }, { "epoch": 0.7894736842105263, "grad_norm": 0.4599643051624298, "learning_rate": 0.00046666666666666666, "loss": 1.2358, "step": 15 }, { "epoch": 0.7894736842105263, "eval_loss": 1.1299824714660645, "eval_runtime": 3.4098, "eval_samples_per_second": 8.798, "eval_steps_per_second": 1.173, "step": 15 }, { "epoch": 0.8421052631578947, "grad_norm": 0.5700777173042297, "learning_rate": 0.0005, "loss": 1.206, "step": 16 }, { "epoch": 0.8421052631578947, "eval_loss": 1.1079914569854736, "eval_runtime": 3.4063, "eval_samples_per_second": 8.807, "eval_steps_per_second": 1.174, "step": 16 }, { "epoch": 0.8947368421052632, "grad_norm": 0.44451233744621277, "learning_rate": 0.0004993910125649561, "loss": 1.2374, "step": 17 }, { "epoch": 0.8947368421052632, "eval_loss": 1.076997995376587, "eval_runtime": 3.4099, "eval_samples_per_second": 8.798, "eval_steps_per_second": 1.173, "step": 17 }, { "epoch": 0.9473684210526315, "grad_norm": 0.382600337266922, "learning_rate": 0.0004975670171853926, "loss": 1.0959, "step": 18 }, { "epoch": 0.9473684210526315, "eval_loss": 1.0459389686584473, "eval_runtime": 3.4174, "eval_samples_per_second": 8.779, "eval_steps_per_second": 1.17, "step": 18 }, { "epoch": 1.0, "grad_norm": 0.3735465109348297, "learning_rate": 0.0004945369001834514, "loss": 1.1433, "step": 19 }, { "epoch": 1.0, "eval_loss": 1.0354558229446411, "eval_runtime": 3.41, "eval_samples_per_second": 8.798, "eval_steps_per_second": 1.173, "step": 19 }, { "epoch": 1.0526315789473684, "grad_norm": 0.36878153681755066, "learning_rate": 0.0004903154239845797, "loss": 1.0467, "step": 20 }, { "epoch": 1.0526315789473684, "eval_loss": 1.0118752717971802, "eval_runtime": 3.4023, "eval_samples_per_second": 8.818, "eval_steps_per_second": 1.176, "step": 20 }, { "epoch": 1.1052631578947367, "grad_norm": 0.3709339499473572, "learning_rate": 0.0004849231551964771, "loss": 1.0453, "step": 21 }, { "epoch": 1.1052631578947367, "eval_loss": 0.9837953448295593, "eval_runtime": 3.3826, "eval_samples_per_second": 8.869, "eval_steps_per_second": 1.183, "step": 21 }, { "epoch": 1.1578947368421053, "grad_norm": 0.32317909598350525, "learning_rate": 0.0004783863644106502, "loss": 1.0573, "step": 22 }, { "epoch": 1.1578947368421053, "eval_loss": 0.9650039076805115, "eval_runtime": 3.3888, "eval_samples_per_second": 8.853, "eval_steps_per_second": 1.18, "step": 22 }, { "epoch": 1.2105263157894737, "grad_norm": 0.3465510606765747, "learning_rate": 0.00047073689821473173, "loss": 0.9613, "step": 23 }, { "epoch": 1.2105263157894737, "eval_loss": 0.9524248838424683, "eval_runtime": 3.389, "eval_samples_per_second": 8.852, "eval_steps_per_second": 1.18, "step": 23 }, { "epoch": 1.263157894736842, "grad_norm": 0.341265469789505, "learning_rate": 0.00046201202403910646, "loss": 1.0765, "step": 24 }, { "epoch": 1.263157894736842, "eval_loss": 0.9478815197944641, "eval_runtime": 3.3934, "eval_samples_per_second": 8.841, "eval_steps_per_second": 1.179, "step": 24 }, { "epoch": 1.3157894736842106, "grad_norm": 0.32804617285728455, "learning_rate": 0.0004522542485937369, "loss": 0.9063, "step": 25 }, { "epoch": 1.3157894736842106, "eval_loss": 0.9379161596298218, "eval_runtime": 3.394, "eval_samples_per_second": 8.839, "eval_steps_per_second": 1.179, "step": 25 }, { "epoch": 1.368421052631579, "grad_norm": 0.31782791018486023, "learning_rate": 0.0004415111107797445, "loss": 0.9969, "step": 26 }, { "epoch": 1.368421052631579, "eval_loss": 0.9347817897796631, "eval_runtime": 3.3909, "eval_samples_per_second": 8.847, "eval_steps_per_second": 1.18, "step": 26 }, { "epoch": 1.4210526315789473, "grad_norm": 0.3140616714954376, "learning_rate": 0.0004298349500846628, "loss": 0.9423, "step": 27 }, { "epoch": 1.4210526315789473, "eval_loss": 0.9298030138015747, "eval_runtime": 3.4047, "eval_samples_per_second": 8.811, "eval_steps_per_second": 1.175, "step": 27 }, { "epoch": 1.4736842105263157, "grad_norm": 0.3035232126712799, "learning_rate": 0.0004172826515897146, "loss": 0.8544, "step": 28 }, { "epoch": 1.4736842105263157, "eval_loss": 0.920465350151062, "eval_runtime": 3.4152, "eval_samples_per_second": 8.784, "eval_steps_per_second": 1.171, "step": 28 }, { "epoch": 1.526315789473684, "grad_norm": 0.36378970742225647, "learning_rate": 0.00040391536883141455, "loss": 1.0175, "step": 29 }, { "epoch": 1.526315789473684, "eval_loss": 0.9069837331771851, "eval_runtime": 3.4214, "eval_samples_per_second": 8.768, "eval_steps_per_second": 1.169, "step": 29 }, { "epoch": 1.5789473684210527, "grad_norm": 0.3729051947593689, "learning_rate": 0.0003897982258676867, "loss": 0.9851, "step": 30 }, { "epoch": 1.5789473684210527, "eval_loss": 0.8988735675811768, "eval_runtime": 3.4109, "eval_samples_per_second": 8.795, "eval_steps_per_second": 1.173, "step": 30 }, { "epoch": 1.631578947368421, "grad_norm": 0.3581544756889343, "learning_rate": 0.000375, "loss": 0.9229, "step": 31 }, { "epoch": 1.631578947368421, "eval_loss": 0.8822915554046631, "eval_runtime": 3.3783, "eval_samples_per_second": 8.88, "eval_steps_per_second": 1.184, "step": 31 }, { "epoch": 1.6842105263157894, "grad_norm": 0.28150516748428345, "learning_rate": 0.00035959278669726934, "loss": 0.94, "step": 32 }, { "epoch": 1.6842105263157894, "eval_loss": 0.8713746666908264, "eval_runtime": 3.4041, "eval_samples_per_second": 8.813, "eval_steps_per_second": 1.175, "step": 32 }, { "epoch": 1.736842105263158, "grad_norm": 0.30831000208854675, "learning_rate": 0.00034365164835397803, "loss": 1.0407, "step": 33 }, { "epoch": 1.736842105263158, "eval_loss": 0.8603693842887878, "eval_runtime": 3.417, "eval_samples_per_second": 8.78, "eval_steps_per_second": 1.171, "step": 33 }, { "epoch": 1.7894736842105263, "grad_norm": 0.31896907091140747, "learning_rate": 0.00032725424859373687, "loss": 0.9185, "step": 34 }, { "epoch": 1.7894736842105263, "eval_loss": 0.849823534488678, "eval_runtime": 3.4154, "eval_samples_per_second": 8.784, "eval_steps_per_second": 1.171, "step": 34 }, { "epoch": 1.8421052631578947, "grad_norm": 0.29725414514541626, "learning_rate": 0.0003104804738999169, "loss": 0.978, "step": 35 }, { "epoch": 1.8421052631578947, "eval_loss": 0.8390634655952454, "eval_runtime": 3.4119, "eval_samples_per_second": 8.793, "eval_steps_per_second": 1.172, "step": 35 }, { "epoch": 1.8947368421052633, "grad_norm": 0.3137111961841583, "learning_rate": 0.00029341204441673266, "loss": 0.9221, "step": 36 }, { "epoch": 1.8947368421052633, "eval_loss": 0.8293085098266602, "eval_runtime": 3.3951, "eval_samples_per_second": 8.836, "eval_steps_per_second": 1.178, "step": 36 }, { "epoch": 1.9473684210526314, "grad_norm": 0.267716646194458, "learning_rate": 0.0002761321158169134, "loss": 1.0078, "step": 37 }, { "epoch": 1.9473684210526314, "eval_loss": 0.8227899670600891, "eval_runtime": 3.3926, "eval_samples_per_second": 8.843, "eval_steps_per_second": 1.179, "step": 37 }, { "epoch": 2.0, "grad_norm": 0.3097141683101654, "learning_rate": 0.0002587248741756253, "loss": 1.0386, "step": 38 }, { "epoch": 2.0, "eval_loss": 0.8196889758110046, "eval_runtime": 3.3913, "eval_samples_per_second": 8.846, "eval_steps_per_second": 1.179, "step": 38 }, { "epoch": 2.0526315789473686, "grad_norm": 0.29532116651535034, "learning_rate": 0.00024127512582437484, "loss": 0.9046, "step": 39 }, { "epoch": 2.0526315789473686, "eval_loss": 0.8109915852546692, "eval_runtime": 3.3856, "eval_samples_per_second": 8.861, "eval_steps_per_second": 1.181, "step": 39 }, { "epoch": 2.1052631578947367, "grad_norm": 0.3160407245159149, "learning_rate": 0.00022386788418308668, "loss": 0.8684, "step": 40 }, { "epoch": 2.1052631578947367, "eval_loss": 0.799045979976654, "eval_runtime": 3.3859, "eval_samples_per_second": 8.86, "eval_steps_per_second": 1.181, "step": 40 }, { "epoch": 2.1578947368421053, "grad_norm": 0.2594124674797058, "learning_rate": 0.00020658795558326743, "loss": 0.8051, "step": 41 }, { "epoch": 2.1578947368421053, "eval_loss": 0.7873298525810242, "eval_runtime": 3.3873, "eval_samples_per_second": 8.857, "eval_steps_per_second": 1.181, "step": 41 }, { "epoch": 2.2105263157894735, "grad_norm": 0.2573184370994568, "learning_rate": 0.0001895195261000831, "loss": 0.7542, "step": 42 }, { "epoch": 2.2105263157894735, "eval_loss": 0.7783879637718201, "eval_runtime": 3.3897, "eval_samples_per_second": 8.85, "eval_steps_per_second": 1.18, "step": 42 }, { "epoch": 2.263157894736842, "grad_norm": 0.3050247132778168, "learning_rate": 0.00017274575140626317, "loss": 0.8833, "step": 43 }, { "epoch": 2.263157894736842, "eval_loss": 0.7714616060256958, "eval_runtime": 3.4031, "eval_samples_per_second": 8.815, "eval_steps_per_second": 1.175, "step": 43 }, { "epoch": 2.3157894736842106, "grad_norm": 0.27206432819366455, "learning_rate": 0.00015634835164602198, "loss": 0.8176, "step": 44 }, { "epoch": 2.3157894736842106, "eval_loss": 0.7637041807174683, "eval_runtime": 3.4006, "eval_samples_per_second": 8.822, "eval_steps_per_second": 1.176, "step": 44 }, { "epoch": 2.3684210526315788, "grad_norm": 0.24384012818336487, "learning_rate": 0.00014040721330273062, "loss": 0.7616, "step": 45 }, { "epoch": 2.3684210526315788, "eval_loss": 0.7560217380523682, "eval_runtime": 3.4005, "eval_samples_per_second": 8.822, "eval_steps_per_second": 1.176, "step": 45 }, { "epoch": 2.4210526315789473, "grad_norm": 0.25645551085472107, "learning_rate": 0.00012500000000000006, "loss": 0.7888, "step": 46 }, { "epoch": 2.4210526315789473, "eval_loss": 0.7505295872688293, "eval_runtime": 3.3925, "eval_samples_per_second": 8.843, "eval_steps_per_second": 1.179, "step": 46 }, { "epoch": 2.473684210526316, "grad_norm": 0.27820125222206116, "learning_rate": 0.00011020177413231333, "loss": 0.7584, "step": 47 }, { "epoch": 2.473684210526316, "eval_loss": 0.7445800304412842, "eval_runtime": 3.3928, "eval_samples_per_second": 8.842, "eval_steps_per_second": 1.179, "step": 47 }, { "epoch": 2.526315789473684, "grad_norm": 0.23925091326236725, "learning_rate": 9.608463116858542e-05, "loss": 0.7504, "step": 48 }, { "epoch": 2.526315789473684, "eval_loss": 0.7403488755226135, "eval_runtime": 3.4026, "eval_samples_per_second": 8.817, "eval_steps_per_second": 1.176, "step": 48 }, { "epoch": 2.5789473684210527, "grad_norm": 0.32143712043762207, "learning_rate": 8.271734841028553e-05, "loss": 0.8269, "step": 49 }, { "epoch": 2.5789473684210527, "eval_loss": 0.7371814250946045, "eval_runtime": 3.3997, "eval_samples_per_second": 8.824, "eval_steps_per_second": 1.177, "step": 49 }, { "epoch": 2.6315789473684212, "grad_norm": 0.2628876864910126, "learning_rate": 7.016504991533726e-05, "loss": 0.7076, "step": 50 }, { "epoch": 2.6315789473684212, "eval_loss": 0.7335822582244873, "eval_runtime": 3.4029, "eval_samples_per_second": 8.816, "eval_steps_per_second": 1.175, "step": 50 }, { "epoch": 2.6842105263157894, "grad_norm": 0.30318617820739746, "learning_rate": 5.848888922025553e-05, "loss": 0.7792, "step": 51 }, { "epoch": 2.6842105263157894, "eval_loss": 0.7297669053077698, "eval_runtime": 3.3726, "eval_samples_per_second": 8.895, "eval_steps_per_second": 1.186, "step": 51 }, { "epoch": 2.736842105263158, "grad_norm": 0.3162338435649872, "learning_rate": 4.7745751406263163e-05, "loss": 0.7217, "step": 52 }, { "epoch": 2.736842105263158, "eval_loss": 0.728228747844696, "eval_runtime": 3.3989, "eval_samples_per_second": 8.827, "eval_steps_per_second": 1.177, "step": 52 }, { "epoch": 2.7894736842105265, "grad_norm": 0.2733875513076782, "learning_rate": 3.798797596089351e-05, "loss": 0.8098, "step": 53 }, { "epoch": 2.7894736842105265, "eval_loss": 0.7270908355712891, "eval_runtime": 3.4122, "eval_samples_per_second": 8.792, "eval_steps_per_second": 1.172, "step": 53 }, { "epoch": 2.8421052631578947, "grad_norm": 0.26100900769233704, "learning_rate": 2.9263101785268254e-05, "loss": 0.7631, "step": 54 }, { "epoch": 2.8421052631578947, "eval_loss": 0.7254647016525269, "eval_runtime": 3.4244, "eval_samples_per_second": 8.761, "eval_steps_per_second": 1.168, "step": 54 }, { "epoch": 2.8947368421052633, "grad_norm": 0.2827248275279999, "learning_rate": 2.1613635589349755e-05, "loss": 0.7716, "step": 55 }, { "epoch": 2.8947368421052633, "eval_loss": 0.7241045236587524, "eval_runtime": 3.4133, "eval_samples_per_second": 8.789, "eval_steps_per_second": 1.172, "step": 55 } ], "logging_steps": 1, "max_steps": 60, "num_input_tokens_seen": 0, "num_train_epochs": 4, "save_steps": 5, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 2315465393725440.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }