{ "best_metric": null, "best_model_checkpoint": null, "epoch": 10.0, "eval_steps": 1, "global_step": 80, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.125, "grad_norm": 21.28069305419922, "learning_rate": 2.5e-05, "loss": 1.2067, "step": 1 }, { "epoch": 0.125, "eval_accuracy": 0.3548387096774194, "eval_f1": 0.2145748987854251, "eval_loss": 1.2299962043762207, "eval_runtime": 1.8781, "eval_samples_per_second": 33.012, "eval_steps_per_second": 2.13, "step": 1 }, { "epoch": 0.25, "grad_norm": 16.937559127807617, "learning_rate": 5e-05, "loss": 1.3485, "step": 2 }, { "epoch": 0.25, "eval_accuracy": 0.3709677419354839, "eval_f1": 0.18930041152263377, "eval_loss": 1.1593687534332275, "eval_runtime": 1.874, "eval_samples_per_second": 33.084, "eval_steps_per_second": 2.134, "step": 2 }, { "epoch": 0.375, "grad_norm": 21.755922317504883, "learning_rate": 4.935897435897436e-05, "loss": 1.2559, "step": 3 }, { "epoch": 0.375, "eval_accuracy": 0.46774193548387094, "eval_f1": 0.23272633346248225, "eval_loss": 1.0302419662475586, "eval_runtime": 1.875, "eval_samples_per_second": 33.067, "eval_steps_per_second": 2.133, "step": 3 }, { "epoch": 0.5, "grad_norm": 10.690485954284668, "learning_rate": 4.871794871794872e-05, "loss": 1.0408, "step": 4 }, { "epoch": 0.5, "eval_accuracy": 0.46774193548387094, "eval_f1": 0.24990730441231, "eval_loss": 0.9755544066429138, "eval_runtime": 1.8757, "eval_samples_per_second": 33.055, "eval_steps_per_second": 2.133, "step": 4 }, { "epoch": 0.625, "grad_norm": 8.635258674621582, "learning_rate": 4.8076923076923084e-05, "loss": 0.8126, "step": 5 }, { "epoch": 0.625, "eval_accuracy": 0.46774193548387094, "eval_f1": 0.2485632183908046, "eval_loss": 0.9575037956237793, "eval_runtime": 1.9274, "eval_samples_per_second": 32.168, "eval_steps_per_second": 2.075, "step": 5 }, { "epoch": 0.75, "grad_norm": 11.213544845581055, "learning_rate": 4.7435897435897435e-05, "loss": 0.9848, "step": 6 }, { "epoch": 0.75, "eval_accuracy": 0.5161290322580645, "eval_f1": 0.28421309872922773, "eval_loss": 0.951880693435669, "eval_runtime": 1.9311, "eval_samples_per_second": 32.106, "eval_steps_per_second": 2.071, "step": 6 }, { "epoch": 0.875, "grad_norm": 7.83177375793457, "learning_rate": 4.67948717948718e-05, "loss": 0.8319, "step": 7 }, { "epoch": 0.875, "eval_accuracy": 0.532258064516129, "eval_f1": 0.3174603174603175, "eval_loss": 0.962181806564331, "eval_runtime": 1.8805, "eval_samples_per_second": 32.97, "eval_steps_per_second": 2.127, "step": 7 }, { "epoch": 1.0, "grad_norm": 12.740473747253418, "learning_rate": 4.615384615384616e-05, "loss": 1.0211, "step": 8 }, { "epoch": 1.0, "eval_accuracy": 0.5, "eval_f1": 0.3313632781717888, "eval_loss": 0.9630481600761414, "eval_runtime": 1.8802, "eval_samples_per_second": 32.975, "eval_steps_per_second": 2.127, "step": 8 }, { "epoch": 1.125, "grad_norm": 12.164949417114258, "learning_rate": 4.5512820512820516e-05, "loss": 0.8899, "step": 9 }, { "epoch": 1.125, "eval_accuracy": 0.5, "eval_f1": 0.3108974358974359, "eval_loss": 0.9681987166404724, "eval_runtime": 1.8773, "eval_samples_per_second": 33.026, "eval_steps_per_second": 2.131, "step": 9 }, { "epoch": 1.25, "grad_norm": 6.146948337554932, "learning_rate": 4.4871794871794874e-05, "loss": 0.7234, "step": 10 }, { "epoch": 1.25, "eval_accuracy": 0.5, "eval_f1": 0.3108974358974359, "eval_loss": 0.9844852685928345, "eval_runtime": 1.8795, "eval_samples_per_second": 32.988, "eval_steps_per_second": 2.128, "step": 10 }, { "epoch": 1.375, "grad_norm": 8.822149276733398, "learning_rate": 4.423076923076923e-05, "loss": 1.0579, "step": 11 }, { "epoch": 1.375, "eval_accuracy": 0.5, "eval_f1": 0.3108974358974359, "eval_loss": 0.9911479353904724, "eval_runtime": 1.8777, "eval_samples_per_second": 33.019, "eval_steps_per_second": 2.13, "step": 11 }, { "epoch": 1.5, "grad_norm": 9.482677459716797, "learning_rate": 4.358974358974359e-05, "loss": 0.8232, "step": 12 }, { "epoch": 1.5, "eval_accuracy": 0.5, "eval_f1": 0.3108974358974359, "eval_loss": 0.9866904020309448, "eval_runtime": 1.9302, "eval_samples_per_second": 32.121, "eval_steps_per_second": 2.072, "step": 12 }, { "epoch": 1.625, "grad_norm": 17.120052337646484, "learning_rate": 4.294871794871795e-05, "loss": 1.063, "step": 13 }, { "epoch": 1.625, "eval_accuracy": 0.5, "eval_f1": 0.3108974358974359, "eval_loss": 0.9797441959381104, "eval_runtime": 1.9281, "eval_samples_per_second": 32.157, "eval_steps_per_second": 2.075, "step": 13 }, { "epoch": 1.75, "grad_norm": 8.349448204040527, "learning_rate": 4.230769230769231e-05, "loss": 0.9684, "step": 14 }, { "epoch": 1.75, "eval_accuracy": 0.46774193548387094, "eval_f1": 0.2941421721909527, "eval_loss": 0.9708881974220276, "eval_runtime": 1.9314, "eval_samples_per_second": 32.1, "eval_steps_per_second": 2.071, "step": 14 }, { "epoch": 1.875, "grad_norm": 4.90641975402832, "learning_rate": 4.166666666666667e-05, "loss": 0.9342, "step": 15 }, { "epoch": 1.875, "eval_accuracy": 0.4838709677419355, "eval_f1": 0.31024531024531027, "eval_loss": 0.9690807461738586, "eval_runtime": 1.8766, "eval_samples_per_second": 33.038, "eval_steps_per_second": 2.131, "step": 15 }, { "epoch": 2.0, "grad_norm": 9.04134750366211, "learning_rate": 4.1025641025641023e-05, "loss": 0.7387, "step": 16 }, { "epoch": 2.0, "eval_accuracy": 0.4838709677419355, "eval_f1": 0.31024531024531027, "eval_loss": 0.9698801636695862, "eval_runtime": 1.8773, "eval_samples_per_second": 33.026, "eval_steps_per_second": 2.131, "step": 16 }, { "epoch": 2.125, "grad_norm": 5.919072151184082, "learning_rate": 4.038461538461539e-05, "loss": 0.8603, "step": 17 }, { "epoch": 2.125, "eval_accuracy": 0.4838709677419355, "eval_f1": 0.30253164556962026, "eval_loss": 0.9661668539047241, "eval_runtime": 1.928, "eval_samples_per_second": 32.158, "eval_steps_per_second": 2.075, "step": 17 }, { "epoch": 2.25, "grad_norm": 7.429778575897217, "learning_rate": 3.974358974358974e-05, "loss": 0.8114, "step": 18 }, { "epoch": 2.25, "eval_accuracy": 0.5, "eval_f1": 0.3189493433395872, "eval_loss": 0.9636289477348328, "eval_runtime": 1.9287, "eval_samples_per_second": 32.145, "eval_steps_per_second": 2.074, "step": 18 }, { "epoch": 2.375, "grad_norm": 6.09760046005249, "learning_rate": 3.9102564102564105e-05, "loss": 0.9508, "step": 19 }, { "epoch": 2.375, "eval_accuracy": 0.5, "eval_f1": 0.3189493433395872, "eval_loss": 0.9564090967178345, "eval_runtime": 1.8792, "eval_samples_per_second": 32.992, "eval_steps_per_second": 2.129, "step": 19 }, { "epoch": 2.5, "grad_norm": 4.588860988616943, "learning_rate": 3.846153846153846e-05, "loss": 0.7554, "step": 20 }, { "epoch": 2.5, "eval_accuracy": 0.5161290322580645, "eval_f1": 0.33455732354141166, "eval_loss": 0.9543712735176086, "eval_runtime": 1.9285, "eval_samples_per_second": 32.15, "eval_steps_per_second": 2.074, "step": 20 }, { "epoch": 2.625, "grad_norm": 4.015089988708496, "learning_rate": 3.782051282051282e-05, "loss": 0.8772, "step": 21 }, { "epoch": 2.625, "eval_accuracy": 0.5483870967741935, "eval_f1": 0.36436436436436437, "eval_loss": 0.9526702165603638, "eval_runtime": 1.9277, "eval_samples_per_second": 32.162, "eval_steps_per_second": 2.075, "step": 21 }, { "epoch": 2.75, "grad_norm": 7.522365570068359, "learning_rate": 3.717948717948718e-05, "loss": 0.848, "step": 22 }, { "epoch": 2.75, "eval_accuracy": 0.5806451612903226, "eval_f1": 0.3924349881796691, "eval_loss": 0.9473089575767517, "eval_runtime": 1.927, "eval_samples_per_second": 32.175, "eval_steps_per_second": 2.076, "step": 22 }, { "epoch": 2.875, "grad_norm": 10.20969295501709, "learning_rate": 3.653846153846154e-05, "loss": 0.6632, "step": 23 }, { "epoch": 2.875, "eval_accuracy": 0.6451612903225806, "eval_f1": 0.4470854555744199, "eval_loss": 0.9435995817184448, "eval_runtime": 1.9274, "eval_samples_per_second": 32.168, "eval_steps_per_second": 2.075, "step": 23 }, { "epoch": 3.0, "grad_norm": 4.576776027679443, "learning_rate": 3.58974358974359e-05, "loss": 0.7259, "step": 24 }, { "epoch": 3.0, "eval_accuracy": 0.6774193548387096, "eval_f1": 0.47088750706613897, "eval_loss": 0.9455369710922241, "eval_runtime": 1.9275, "eval_samples_per_second": 32.166, "eval_steps_per_second": 2.075, "step": 24 }, { "epoch": 3.125, "grad_norm": 8.878484725952148, "learning_rate": 3.525641025641026e-05, "loss": 0.7974, "step": 25 }, { "epoch": 3.125, "eval_accuracy": 0.6935483870967742, "eval_f1": 0.48200757575757575, "eval_loss": 0.9411699771881104, "eval_runtime": 1.9284, "eval_samples_per_second": 32.151, "eval_steps_per_second": 2.074, "step": 25 }, { "epoch": 3.25, "grad_norm": 7.3488450050354, "learning_rate": 3.461538461538462e-05, "loss": 0.7728, "step": 26 }, { "epoch": 3.25, "eval_accuracy": 0.6612903225806451, "eval_f1": 0.45350669818754924, "eval_loss": 0.9431821703910828, "eval_runtime": 1.928, "eval_samples_per_second": 32.157, "eval_steps_per_second": 2.075, "step": 26 }, { "epoch": 3.375, "grad_norm": 4.182816028594971, "learning_rate": 3.397435897435898e-05, "loss": 0.9005, "step": 27 }, { "epoch": 3.375, "eval_accuracy": 0.5806451612903226, "eval_f1": 0.3848238482384824, "eval_loss": 0.9425324201583862, "eval_runtime": 1.9307, "eval_samples_per_second": 32.112, "eval_steps_per_second": 2.072, "step": 27 }, { "epoch": 3.5, "grad_norm": 6.473806858062744, "learning_rate": 3.3333333333333335e-05, "loss": 0.8324, "step": 28 }, { "epoch": 3.5, "eval_accuracy": 0.5483870967741935, "eval_f1": 0.35341880341880344, "eval_loss": 0.9440051913261414, "eval_runtime": 1.9291, "eval_samples_per_second": 32.139, "eval_steps_per_second": 2.073, "step": 28 }, { "epoch": 3.625, "grad_norm": 7.45881986618042, "learning_rate": 3.269230769230769e-05, "loss": 0.7577, "step": 29 }, { "epoch": 3.625, "eval_accuracy": 0.532258064516129, "eval_f1": 0.33679878709118477, "eval_loss": 0.9446666836738586, "eval_runtime": 1.929, "eval_samples_per_second": 32.142, "eval_steps_per_second": 2.074, "step": 29 }, { "epoch": 3.75, "grad_norm": 5.7591633796691895, "learning_rate": 3.205128205128206e-05, "loss": 0.7887, "step": 30 }, { "epoch": 3.75, "eval_accuracy": 0.532258064516129, "eval_f1": 0.33679878709118477, "eval_loss": 0.9500220417976379, "eval_runtime": 1.9285, "eval_samples_per_second": 32.149, "eval_steps_per_second": 2.074, "step": 30 }, { "epoch": 3.875, "grad_norm": 11.719704627990723, "learning_rate": 3.141025641025641e-05, "loss": 0.6706, "step": 31 }, { "epoch": 3.875, "eval_accuracy": 0.5806451612903226, "eval_f1": 0.38984088127294986, "eval_loss": 0.9507701992988586, "eval_runtime": 1.9302, "eval_samples_per_second": 32.121, "eval_steps_per_second": 2.072, "step": 31 }, { "epoch": 4.0, "grad_norm": 6.659142017364502, "learning_rate": 3.0769230769230774e-05, "loss": 0.7348, "step": 32 }, { "epoch": 4.0, "eval_accuracy": 0.6774193548387096, "eval_f1": 0.4682125603864735, "eval_loss": 0.9422016739845276, "eval_runtime": 1.9309, "eval_samples_per_second": 32.109, "eval_steps_per_second": 2.072, "step": 32 }, { "epoch": 4.125, "grad_norm": 8.99232006072998, "learning_rate": 3.012820512820513e-05, "loss": 0.797, "step": 33 }, { "epoch": 4.125, "eval_accuracy": 0.6774193548387096, "eval_f1": 0.46957520091848454, "eval_loss": 0.9406993985176086, "eval_runtime": 1.8808, "eval_samples_per_second": 32.965, "eval_steps_per_second": 2.127, "step": 33 }, { "epoch": 4.25, "grad_norm": 15.037948608398438, "learning_rate": 2.948717948717949e-05, "loss": 0.7472, "step": 34 }, { "epoch": 4.25, "eval_accuracy": 0.6774193548387096, "eval_f1": 0.4708994708994709, "eval_loss": 0.9398488402366638, "eval_runtime": 1.9301, "eval_samples_per_second": 32.122, "eval_steps_per_second": 2.072, "step": 34 }, { "epoch": 4.375, "grad_norm": 9.858637809753418, "learning_rate": 2.8846153846153845e-05, "loss": 0.7184, "step": 35 }, { "epoch": 4.375, "eval_accuracy": 0.6612903225806451, "eval_f1": 0.45972458026787394, "eval_loss": 0.9320343732833862, "eval_runtime": 1.9306, "eval_samples_per_second": 32.115, "eval_steps_per_second": 2.072, "step": 35 }, { "epoch": 4.5, "grad_norm": 9.198653221130371, "learning_rate": 2.8205128205128207e-05, "loss": 0.8042, "step": 36 }, { "epoch": 4.5, "eval_accuracy": 0.6612903225806451, "eval_f1": 0.45972458026787394, "eval_loss": 0.9294925928115845, "eval_runtime": 1.9308, "eval_samples_per_second": 32.111, "eval_steps_per_second": 2.072, "step": 36 }, { "epoch": 4.625, "grad_norm": 5.223246097564697, "learning_rate": 2.756410256410257e-05, "loss": 0.7408, "step": 37 }, { "epoch": 4.625, "eval_accuracy": 0.6774193548387096, "eval_f1": 0.4708994708994709, "eval_loss": 0.9262655973434448, "eval_runtime": 1.9307, "eval_samples_per_second": 32.113, "eval_steps_per_second": 2.072, "step": 37 }, { "epoch": 4.75, "grad_norm": 6.609065532684326, "learning_rate": 2.6923076923076923e-05, "loss": 0.5936, "step": 38 }, { "epoch": 4.75, "eval_accuracy": 0.6935483870967742, "eval_f1": 0.48192090395480225, "eval_loss": 0.9254209399223328, "eval_runtime": 1.9339, "eval_samples_per_second": 32.059, "eval_steps_per_second": 2.068, "step": 38 }, { "epoch": 4.875, "grad_norm": 6.2056803703308105, "learning_rate": 2.6282051282051285e-05, "loss": 0.5937, "step": 39 }, { "epoch": 4.875, "eval_accuracy": 0.6935483870967742, "eval_f1": 0.48192090395480225, "eval_loss": 0.9212784171104431, "eval_runtime": 1.9303, "eval_samples_per_second": 32.12, "eval_steps_per_second": 2.072, "step": 39 }, { "epoch": 5.0, "grad_norm": 4.507566452026367, "learning_rate": 2.564102564102564e-05, "loss": 0.7497, "step": 40 }, { "epoch": 5.0, "eval_accuracy": 0.6774193548387096, "eval_f1": 0.4704331450094162, "eval_loss": 0.9186185002326965, "eval_runtime": 1.9335, "eval_samples_per_second": 32.066, "eval_steps_per_second": 2.069, "step": 40 }, { "epoch": 5.125, "grad_norm": 13.483494758605957, "learning_rate": 2.5e-05, "loss": 1.135, "step": 41 }, { "epoch": 5.125, "eval_accuracy": 0.6774193548387096, "eval_f1": 0.4704331450094162, "eval_loss": 0.9170433878898621, "eval_runtime": 1.9361, "eval_samples_per_second": 32.023, "eval_steps_per_second": 2.066, "step": 41 }, { "epoch": 5.25, "grad_norm": 8.313968658447266, "learning_rate": 2.435897435897436e-05, "loss": 0.6677, "step": 42 }, { "epoch": 5.25, "eval_accuracy": 0.6774193548387096, "eval_f1": 0.4704331450094162, "eval_loss": 0.9127059578895569, "eval_runtime": 1.9338, "eval_samples_per_second": 32.061, "eval_steps_per_second": 2.068, "step": 42 }, { "epoch": 5.375, "grad_norm": 7.541636943817139, "learning_rate": 2.3717948717948718e-05, "loss": 0.7515, "step": 43 }, { "epoch": 5.375, "eval_accuracy": 0.6774193548387096, "eval_f1": 0.4704331450094162, "eval_loss": 0.9163720011711121, "eval_runtime": 1.9345, "eval_samples_per_second": 32.049, "eval_steps_per_second": 2.068, "step": 43 }, { "epoch": 5.5, "grad_norm": 9.568167686462402, "learning_rate": 2.307692307692308e-05, "loss": 0.6488, "step": 44 }, { "epoch": 5.5, "eval_accuracy": 0.6774193548387096, "eval_f1": 0.4704331450094162, "eval_loss": 0.9121409058570862, "eval_runtime": 1.8843, "eval_samples_per_second": 32.904, "eval_steps_per_second": 2.123, "step": 44 }, { "epoch": 5.625, "grad_norm": 3.9321157932281494, "learning_rate": 2.2435897435897437e-05, "loss": 0.7113, "step": 45 }, { "epoch": 5.625, "eval_accuracy": 0.6774193548387096, "eval_f1": 0.4704331450094162, "eval_loss": 0.9119479060173035, "eval_runtime": 1.9363, "eval_samples_per_second": 32.02, "eval_steps_per_second": 2.066, "step": 45 }, { "epoch": 5.75, "grad_norm": 4.554017543792725, "learning_rate": 2.1794871794871795e-05, "loss": 0.6476, "step": 46 }, { "epoch": 5.75, "eval_accuracy": 0.6935483870967742, "eval_f1": 0.48192090395480225, "eval_loss": 0.9131627082824707, "eval_runtime": 1.8815, "eval_samples_per_second": 32.953, "eval_steps_per_second": 2.126, "step": 46 }, { "epoch": 5.875, "grad_norm": 4.490871429443359, "learning_rate": 2.1153846153846154e-05, "loss": 0.5748, "step": 47 }, { "epoch": 5.875, "eval_accuracy": 0.6935483870967742, "eval_f1": 0.48192090395480225, "eval_loss": 0.9163798689842224, "eval_runtime": 1.8811, "eval_samples_per_second": 32.959, "eval_steps_per_second": 2.126, "step": 47 }, { "epoch": 6.0, "grad_norm": 4.915560722351074, "learning_rate": 2.0512820512820512e-05, "loss": 0.7456, "step": 48 }, { "epoch": 6.0, "eval_accuracy": 0.6935483870967742, "eval_f1": 0.48192090395480225, "eval_loss": 0.9137986898422241, "eval_runtime": 1.9333, "eval_samples_per_second": 32.069, "eval_steps_per_second": 2.069, "step": 48 }, { "epoch": 6.125, "grad_norm": 5.666485786437988, "learning_rate": 1.987179487179487e-05, "loss": 0.7035, "step": 49 }, { "epoch": 6.125, "eval_accuracy": 0.6935483870967742, "eval_f1": 0.48192090395480225, "eval_loss": 0.9176655411720276, "eval_runtime": 1.8845, "eval_samples_per_second": 32.9, "eval_steps_per_second": 2.123, "step": 49 }, { "epoch": 6.25, "grad_norm": 4.38871431350708, "learning_rate": 1.923076923076923e-05, "loss": 0.6157, "step": 50 }, { "epoch": 6.25, "eval_accuracy": 0.6935483870967742, "eval_f1": 0.48192090395480225, "eval_loss": 0.9223140478134155, "eval_runtime": 1.8794, "eval_samples_per_second": 32.99, "eval_steps_per_second": 2.128, "step": 50 }, { "epoch": 6.375, "grad_norm": 11.442863464355469, "learning_rate": 1.858974358974359e-05, "loss": 0.561, "step": 51 }, { "epoch": 6.375, "eval_accuracy": 0.6935483870967742, "eval_f1": 0.48192090395480225, "eval_loss": 0.9202437996864319, "eval_runtime": 1.9299, "eval_samples_per_second": 32.125, "eval_steps_per_second": 2.073, "step": 51 }, { "epoch": 6.5, "grad_norm": 7.4318928718566895, "learning_rate": 1.794871794871795e-05, "loss": 0.7139, "step": 52 }, { "epoch": 6.5, "eval_accuracy": 0.6774193548387096, "eval_f1": 0.4704331450094162, "eval_loss": 0.9179559350013733, "eval_runtime": 1.8777, "eval_samples_per_second": 33.02, "eval_steps_per_second": 2.13, "step": 52 }, { "epoch": 6.625, "grad_norm": 5.853055477142334, "learning_rate": 1.730769230769231e-05, "loss": 0.6748, "step": 53 }, { "epoch": 6.625, "eval_accuracy": 0.6774193548387096, "eval_f1": 0.4704331450094162, "eval_loss": 0.9139246940612793, "eval_runtime": 1.9266, "eval_samples_per_second": 32.182, "eval_steps_per_second": 2.076, "step": 53 }, { "epoch": 6.75, "grad_norm": 5.003907680511475, "learning_rate": 1.6666666666666667e-05, "loss": 0.5338, "step": 54 }, { "epoch": 6.75, "eval_accuracy": 0.6774193548387096, "eval_f1": 0.4704331450094162, "eval_loss": 0.9166889786720276, "eval_runtime": 1.9286, "eval_samples_per_second": 32.148, "eval_steps_per_second": 2.074, "step": 54 }, { "epoch": 6.875, "grad_norm": 4.901527404785156, "learning_rate": 1.602564102564103e-05, "loss": 0.6889, "step": 55 }, { "epoch": 6.875, "eval_accuracy": 0.6774193548387096, "eval_f1": 0.4704331450094162, "eval_loss": 0.9157655835151672, "eval_runtime": 1.9276, "eval_samples_per_second": 32.164, "eval_steps_per_second": 2.075, "step": 55 }, { "epoch": 7.0, "grad_norm": 5.2783918380737305, "learning_rate": 1.5384615384615387e-05, "loss": 0.8556, "step": 56 }, { "epoch": 7.0, "eval_accuracy": 0.6774193548387096, "eval_f1": 0.4704331450094162, "eval_loss": 0.921044647693634, "eval_runtime": 1.9306, "eval_samples_per_second": 32.115, "eval_steps_per_second": 2.072, "step": 56 }, { "epoch": 7.125, "grad_norm": 10.69826602935791, "learning_rate": 1.4743589743589745e-05, "loss": 0.8448, "step": 57 }, { "epoch": 7.125, "eval_accuracy": 0.6774193548387096, "eval_f1": 0.4704331450094162, "eval_loss": 0.916961669921875, "eval_runtime": 1.9308, "eval_samples_per_second": 32.111, "eval_steps_per_second": 2.072, "step": 57 }, { "epoch": 7.25, "grad_norm": 6.701515197753906, "learning_rate": 1.4102564102564104e-05, "loss": 0.6535, "step": 58 }, { "epoch": 7.25, "eval_accuracy": 0.6774193548387096, "eval_f1": 0.4704331450094162, "eval_loss": 0.9177826642990112, "eval_runtime": 1.9295, "eval_samples_per_second": 32.133, "eval_steps_per_second": 2.073, "step": 58 }, { "epoch": 7.375, "grad_norm": 5.344858646392822, "learning_rate": 1.3461538461538462e-05, "loss": 0.6185, "step": 59 }, { "epoch": 7.375, "eval_accuracy": 0.6774193548387096, "eval_f1": 0.4704331450094162, "eval_loss": 0.9164468050003052, "eval_runtime": 1.9345, "eval_samples_per_second": 32.049, "eval_steps_per_second": 2.068, "step": 59 }, { "epoch": 7.5, "grad_norm": 5.061964511871338, "learning_rate": 1.282051282051282e-05, "loss": 0.7128, "step": 60 }, { "epoch": 7.5, "eval_accuracy": 0.6774193548387096, "eval_f1": 0.4704331450094162, "eval_loss": 0.9154215455055237, "eval_runtime": 1.9313, "eval_samples_per_second": 32.103, "eval_steps_per_second": 2.071, "step": 60 }, { "epoch": 7.625, "grad_norm": 6.167932033538818, "learning_rate": 1.217948717948718e-05, "loss": 0.5839, "step": 61 }, { "epoch": 7.625, "eval_accuracy": 0.6774193548387096, "eval_f1": 0.4704331450094162, "eval_loss": 0.9095370173454285, "eval_runtime": 1.9307, "eval_samples_per_second": 32.112, "eval_steps_per_second": 2.072, "step": 61 }, { "epoch": 7.75, "grad_norm": 2.6592514514923096, "learning_rate": 1.153846153846154e-05, "loss": 0.5118, "step": 62 }, { "epoch": 7.75, "eval_accuracy": 0.6774193548387096, "eval_f1": 0.4704331450094162, "eval_loss": 0.9109383821487427, "eval_runtime": 1.9289, "eval_samples_per_second": 32.143, "eval_steps_per_second": 2.074, "step": 62 }, { "epoch": 7.875, "grad_norm": 10.835193634033203, "learning_rate": 1.0897435897435898e-05, "loss": 0.6146, "step": 63 }, { "epoch": 7.875, "eval_accuracy": 0.6774193548387096, "eval_f1": 0.4704331450094162, "eval_loss": 0.9067746996879578, "eval_runtime": 1.9278, "eval_samples_per_second": 32.161, "eval_steps_per_second": 2.075, "step": 63 }, { "epoch": 8.0, "grad_norm": 11.179938316345215, "learning_rate": 1.0256410256410256e-05, "loss": 0.9724, "step": 64 }, { "epoch": 8.0, "eval_accuracy": 0.6774193548387096, "eval_f1": 0.4704331450094162, "eval_loss": 0.9018973112106323, "eval_runtime": 1.9274, "eval_samples_per_second": 32.168, "eval_steps_per_second": 2.075, "step": 64 }, { "epoch": 8.125, "grad_norm": 6.455613136291504, "learning_rate": 9.615384615384616e-06, "loss": 0.6459, "step": 65 }, { "epoch": 8.125, "eval_accuracy": 0.6774193548387096, "eval_f1": 0.4704331450094162, "eval_loss": 0.9056165218353271, "eval_runtime": 1.9292, "eval_samples_per_second": 32.138, "eval_steps_per_second": 2.073, "step": 65 }, { "epoch": 8.25, "grad_norm": 7.206265449523926, "learning_rate": 8.974358974358976e-06, "loss": 0.7838, "step": 66 }, { "epoch": 8.25, "eval_accuracy": 0.6612903225806451, "eval_f1": 0.4595103578154425, "eval_loss": 0.909091591835022, "eval_runtime": 1.9295, "eval_samples_per_second": 32.132, "eval_steps_per_second": 2.073, "step": 66 }, { "epoch": 8.375, "grad_norm": 5.377885341644287, "learning_rate": 8.333333333333334e-06, "loss": 0.7472, "step": 67 }, { "epoch": 8.375, "eval_accuracy": 0.6774193548387096, "eval_f1": 0.4704331450094162, "eval_loss": 0.9005382657051086, "eval_runtime": 1.8794, "eval_samples_per_second": 32.99, "eval_steps_per_second": 2.128, "step": 67 }, { "epoch": 8.5, "grad_norm": 5.025684356689453, "learning_rate": 7.692307692307694e-06, "loss": 0.4137, "step": 68 }, { "epoch": 8.5, "eval_accuracy": 0.6774193548387096, "eval_f1": 0.4704331450094162, "eval_loss": 0.9060821533203125, "eval_runtime": 1.8832, "eval_samples_per_second": 32.923, "eval_steps_per_second": 2.124, "step": 68 }, { "epoch": 8.625, "grad_norm": 6.750956058502197, "learning_rate": 7.051282051282052e-06, "loss": 0.5145, "step": 69 }, { "epoch": 8.625, "eval_accuracy": 0.6774193548387096, "eval_f1": 0.4704331450094162, "eval_loss": 0.8989046216011047, "eval_runtime": 1.9282, "eval_samples_per_second": 32.155, "eval_steps_per_second": 2.075, "step": 69 }, { "epoch": 8.75, "grad_norm": 5.707816123962402, "learning_rate": 6.41025641025641e-06, "loss": 0.7129, "step": 70 }, { "epoch": 8.75, "eval_accuracy": 0.6612903225806451, "eval_f1": 0.4595103578154425, "eval_loss": 0.8961211442947388, "eval_runtime": 1.929, "eval_samples_per_second": 32.14, "eval_steps_per_second": 2.074, "step": 70 }, { "epoch": 8.875, "grad_norm": 3.8523380756378174, "learning_rate": 5.76923076923077e-06, "loss": 0.554, "step": 71 }, { "epoch": 8.875, "eval_accuracy": 0.6612903225806451, "eval_f1": 0.4595103578154425, "eval_loss": 0.8937899470329285, "eval_runtime": 1.9304, "eval_samples_per_second": 32.117, "eval_steps_per_second": 2.072, "step": 71 }, { "epoch": 9.0, "grad_norm": 7.517840385437012, "learning_rate": 5.128205128205128e-06, "loss": 0.7014, "step": 72 }, { "epoch": 9.0, "eval_accuracy": 0.6612903225806451, "eval_f1": 0.4595103578154425, "eval_loss": 0.9007149934768677, "eval_runtime": 1.9291, "eval_samples_per_second": 32.139, "eval_steps_per_second": 2.073, "step": 72 }, { "epoch": 9.125, "grad_norm": 7.3192362785339355, "learning_rate": 4.487179487179488e-06, "loss": 0.7076, "step": 73 }, { "epoch": 9.125, "eval_accuracy": 0.6612903225806451, "eval_f1": 0.4595103578154425, "eval_loss": 0.8964553475379944, "eval_runtime": 1.9292, "eval_samples_per_second": 32.137, "eval_steps_per_second": 2.073, "step": 73 }, { "epoch": 9.25, "grad_norm": 6.544466495513916, "learning_rate": 3.846153846153847e-06, "loss": 0.6754, "step": 74 }, { "epoch": 9.25, "eval_accuracy": 0.6774193548387096, "eval_f1": 0.4704331450094162, "eval_loss": 0.9004920125007629, "eval_runtime": 1.9283, "eval_samples_per_second": 32.153, "eval_steps_per_second": 2.074, "step": 74 }, { "epoch": 9.375, "grad_norm": 5.344063758850098, "learning_rate": 3.205128205128205e-06, "loss": 0.591, "step": 75 }, { "epoch": 9.375, "eval_accuracy": 0.6612903225806451, "eval_f1": 0.4595103578154425, "eval_loss": 0.8975746631622314, "eval_runtime": 1.8785, "eval_samples_per_second": 33.006, "eval_steps_per_second": 2.129, "step": 75 }, { "epoch": 9.5, "grad_norm": 7.665821075439453, "learning_rate": 2.564102564102564e-06, "loss": 0.6753, "step": 76 }, { "epoch": 9.5, "eval_accuracy": 0.6612903225806451, "eval_f1": 0.4595103578154425, "eval_loss": 0.8977981209754944, "eval_runtime": 1.9303, "eval_samples_per_second": 32.119, "eval_steps_per_second": 2.072, "step": 76 }, { "epoch": 9.625, "grad_norm": 6.320114612579346, "learning_rate": 1.9230769230769234e-06, "loss": 0.6099, "step": 77 }, { "epoch": 9.625, "eval_accuracy": 0.6612903225806451, "eval_f1": 0.4595103578154425, "eval_loss": 0.8969810009002686, "eval_runtime": 1.9299, "eval_samples_per_second": 32.126, "eval_steps_per_second": 2.073, "step": 77 }, { "epoch": 9.75, "grad_norm": 7.137789249420166, "learning_rate": 1.282051282051282e-06, "loss": 0.7863, "step": 78 }, { "epoch": 9.75, "eval_accuracy": 0.6774193548387096, "eval_f1": 0.4704331450094162, "eval_loss": 0.8956599235534668, "eval_runtime": 1.9278, "eval_samples_per_second": 32.161, "eval_steps_per_second": 2.075, "step": 78 }, { "epoch": 9.875, "grad_norm": 5.855066299438477, "learning_rate": 6.41025641025641e-07, "loss": 0.5154, "step": 79 }, { "epoch": 9.875, "eval_accuracy": 0.6612903225806451, "eval_f1": 0.4595103578154425, "eval_loss": 0.8908558487892151, "eval_runtime": 1.8786, "eval_samples_per_second": 33.003, "eval_steps_per_second": 2.129, "step": 79 }, { "epoch": 10.0, "grad_norm": 6.291125774383545, "learning_rate": 0.0, "loss": 0.6067, "step": 80 }, { "epoch": 10.0, "eval_accuracy": 0.6612903225806451, "eval_f1": 0.4595103578154425, "eval_loss": 0.8924511075019836, "eval_runtime": 1.9287, "eval_samples_per_second": 32.145, "eval_steps_per_second": 2.074, "step": 80 }, { "epoch": 10.0, "step": 80, "total_flos": 1.4174270632493056e+16, "train_loss": 0.7646934121847153, "train_runtime": 412.0561, "train_samples_per_second": 5.922, "train_steps_per_second": 0.194 } ], "logging_steps": 1, "max_steps": 80, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 500, "total_flos": 1.4174270632493056e+16, "train_batch_size": 4, "trial_name": null, "trial_params": null }