{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 100, "global_step": 2700, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.037037037037037035, "grad_norm": 4.8827619552612305, "learning_rate": 3.7037037037037037e-06, "loss": 0.2228, "step": 100 }, { "epoch": 0.037037037037037035, "eval_all-nli-dev_cosine_accuracy": 0.9889583333333334, "eval_loss": 0.10640299320220947, "eval_runtime": 97.5038, "eval_samples_per_second": 147.687, "eval_steps_per_second": 9.23, "step": 100 }, { "epoch": 0.07407407407407407, "grad_norm": 2.8969228267669678, "learning_rate": 7.4074074074074075e-06, "loss": 0.1292, "step": 200 }, { "epoch": 0.07407407407407407, "eval_all-nli-dev_cosine_accuracy": 0.9938194444444445, "eval_loss": 0.05102457106113434, "eval_runtime": 99.0099, "eval_samples_per_second": 145.44, "eval_steps_per_second": 9.09, "step": 200 }, { "epoch": 0.1111111111111111, "grad_norm": 1.966164469718933, "learning_rate": 9.876543209876543e-06, "loss": 0.0785, "step": 300 }, { "epoch": 0.1111111111111111, "eval_all-nli-dev_cosine_accuracy": 0.9944444444444445, "eval_loss": 0.040028076618909836, "eval_runtime": 98.8666, "eval_samples_per_second": 145.651, "eval_steps_per_second": 9.103, "step": 300 }, { "epoch": 0.14814814814814814, "grad_norm": 1.080277919769287, "learning_rate": 9.465020576131688e-06, "loss": 0.0675, "step": 400 }, { "epoch": 0.14814814814814814, "eval_all-nli-dev_cosine_accuracy": 0.9954861111111111, "eval_loss": 0.03450320288538933, "eval_runtime": 99.5727, "eval_samples_per_second": 144.618, "eval_steps_per_second": 9.039, "step": 400 }, { "epoch": 0.18518518518518517, "grad_norm": 1.5480653047561646, "learning_rate": 9.053497942386832e-06, "loss": 0.0667, "step": 500 }, { "epoch": 0.18518518518518517, "eval_all-nli-dev_cosine_accuracy": 0.9952777777777778, "eval_loss": 0.031959593296051025, "eval_runtime": 97.8179, "eval_samples_per_second": 147.212, "eval_steps_per_second": 9.201, "step": 500 }, { "epoch": 0.2222222222222222, "grad_norm": 1.9074684381484985, "learning_rate": 8.641975308641975e-06, "loss": 0.0644, "step": 600 }, { "epoch": 0.2222222222222222, "eval_all-nli-dev_cosine_accuracy": 0.995625, "eval_loss": 0.030600089579820633, "eval_runtime": 98.4261, "eval_samples_per_second": 146.303, "eval_steps_per_second": 9.144, "step": 600 }, { "epoch": 0.25925925925925924, "grad_norm": 3.8694491386413574, "learning_rate": 8.23045267489712e-06, "loss": 0.067, "step": 700 }, { "epoch": 0.25925925925925924, "eval_all-nli-dev_cosine_accuracy": 0.9959027777777778, "eval_loss": 0.030407674610614777, "eval_runtime": 98.4984, "eval_samples_per_second": 146.195, "eval_steps_per_second": 9.137, "step": 700 }, { "epoch": 0.2962962962962963, "grad_norm": 1.7101613283157349, "learning_rate": 7.818930041152263e-06, "loss": 0.0568, "step": 800 }, { "epoch": 0.2962962962962963, "eval_all-nli-dev_cosine_accuracy": 0.995625, "eval_loss": 0.02961079403758049, "eval_runtime": 98.1294, "eval_samples_per_second": 146.745, "eval_steps_per_second": 9.172, "step": 800 }, { "epoch": 0.3333333333333333, "grad_norm": 2.4039230346679688, "learning_rate": 7.4074074074074075e-06, "loss": 0.0617, "step": 900 }, { "epoch": 0.3333333333333333, "eval_all-nli-dev_cosine_accuracy": 0.9957638888888889, "eval_loss": 0.0286862775683403, "eval_runtime": 97.8253, "eval_samples_per_second": 147.201, "eval_steps_per_second": 9.2, "step": 900 }, { "epoch": 0.37037037037037035, "grad_norm": 1.3637861013412476, "learning_rate": 6.9958847736625525e-06, "loss": 0.0556, "step": 1000 }, { "epoch": 0.37037037037037035, "eval_all-nli-dev_cosine_accuracy": 0.99625, "eval_loss": 0.027397217229008675, "eval_runtime": 98.5282, "eval_samples_per_second": 146.151, "eval_steps_per_second": 9.134, "step": 1000 }, { "epoch": 0.4074074074074074, "grad_norm": 2.049680709838867, "learning_rate": 6.584362139917696e-06, "loss": 0.0532, "step": 1100 }, { "epoch": 0.4074074074074074, "eval_all-nli-dev_cosine_accuracy": 0.99625, "eval_loss": 0.027111150324344635, "eval_runtime": 99.6305, "eval_samples_per_second": 144.534, "eval_steps_per_second": 9.033, "step": 1100 }, { "epoch": 0.4444444444444444, "grad_norm": 1.6650844812393188, "learning_rate": 6.17283950617284e-06, "loss": 0.0524, "step": 1200 }, { "epoch": 0.4444444444444444, "eval_all-nli-dev_cosine_accuracy": 0.9965972222222222, "eval_loss": 0.026169853284955025, "eval_runtime": 99.4848, "eval_samples_per_second": 144.746, "eval_steps_per_second": 9.047, "step": 1200 }, { "epoch": 0.48148148148148145, "grad_norm": 2.308643341064453, "learning_rate": 5.761316872427984e-06, "loss": 0.0529, "step": 1300 }, { "epoch": 0.48148148148148145, "eval_all-nli-dev_cosine_accuracy": 0.9961805555555555, "eval_loss": 0.026670673862099648, "eval_runtime": 105.3249, "eval_samples_per_second": 136.72, "eval_steps_per_second": 8.545, "step": 1300 }, { "epoch": 0.5185185185185185, "grad_norm": 1.1921712160110474, "learning_rate": 5.349794238683128e-06, "loss": 0.0527, "step": 1400 }, { "epoch": 0.5185185185185185, "eval_all-nli-dev_cosine_accuracy": 0.9961805555555555, "eval_loss": 0.025993267074227333, "eval_runtime": 101.3038, "eval_samples_per_second": 142.147, "eval_steps_per_second": 8.884, "step": 1400 }, { "epoch": 0.5555555555555556, "grad_norm": 2.8418076038360596, "learning_rate": 4.938271604938272e-06, "loss": 0.0479, "step": 1500 }, { "epoch": 0.5555555555555556, "eval_all-nli-dev_cosine_accuracy": 0.99625, "eval_loss": 0.025305895134806633, "eval_runtime": 101.7867, "eval_samples_per_second": 141.472, "eval_steps_per_second": 8.842, "step": 1500 }, { "epoch": 0.5925925925925926, "grad_norm": 3.0896897315979004, "learning_rate": 4.526748971193416e-06, "loss": 0.0515, "step": 1600 }, { "epoch": 0.5925925925925926, "eval_all-nli-dev_cosine_accuracy": 0.9966666666666667, "eval_loss": 0.024532195180654526, "eval_runtime": 101.8042, "eval_samples_per_second": 141.448, "eval_steps_per_second": 8.84, "step": 1600 }, { "epoch": 0.6296296296296297, "grad_norm": 2.7592620849609375, "learning_rate": 4.11522633744856e-06, "loss": 0.0512, "step": 1700 }, { "epoch": 0.6296296296296297, "eval_all-nli-dev_cosine_accuracy": 0.9961805555555555, "eval_loss": 0.025122441351413727, "eval_runtime": 102.3766, "eval_samples_per_second": 140.657, "eval_steps_per_second": 8.791, "step": 1700 }, { "epoch": 0.6666666666666666, "grad_norm": 0.41445350646972656, "learning_rate": 3.7037037037037037e-06, "loss": 0.0548, "step": 1800 }, { "epoch": 0.6666666666666666, "eval_all-nli-dev_cosine_accuracy": 0.9963194444444444, "eval_loss": 0.024524033069610596, "eval_runtime": 97.2602, "eval_samples_per_second": 148.056, "eval_steps_per_second": 9.254, "step": 1800 }, { "epoch": 0.7037037037037037, "grad_norm": 1.6982859373092651, "learning_rate": 3.292181069958848e-06, "loss": 0.0476, "step": 1900 }, { "epoch": 0.7037037037037037, "eval_all-nli-dev_cosine_accuracy": 0.9964583333333333, "eval_loss": 0.024558432400226593, "eval_runtime": 99.9108, "eval_samples_per_second": 144.129, "eval_steps_per_second": 9.008, "step": 1900 }, { "epoch": 0.7407407407407407, "grad_norm": 1.9297990798950195, "learning_rate": 2.880658436213992e-06, "loss": 0.0456, "step": 2000 }, { "epoch": 0.7407407407407407, "eval_all-nli-dev_cosine_accuracy": 0.9961111111111111, "eval_loss": 0.024668598547577858, "eval_runtime": 106.8633, "eval_samples_per_second": 134.752, "eval_steps_per_second": 8.422, "step": 2000 }, { "epoch": 0.7777777777777778, "grad_norm": 1.5807716846466064, "learning_rate": 2.469135802469136e-06, "loss": 0.0548, "step": 2100 }, { "epoch": 0.7777777777777778, "eval_all-nli-dev_cosine_accuracy": 0.9964583333333333, "eval_loss": 0.024200452491641045, "eval_runtime": 101.8908, "eval_samples_per_second": 141.328, "eval_steps_per_second": 8.833, "step": 2100 }, { "epoch": 0.8148148148148148, "grad_norm": 4.243816375732422, "learning_rate": 2.05761316872428e-06, "loss": 0.051, "step": 2200 }, { "epoch": 0.8148148148148148, "eval_all-nli-dev_cosine_accuracy": 0.9964583333333333, "eval_loss": 0.024141203612089157, "eval_runtime": 101.3185, "eval_samples_per_second": 142.126, "eval_steps_per_second": 8.883, "step": 2200 }, { "epoch": 0.8518518518518519, "grad_norm": 1.1512444019317627, "learning_rate": 1.646090534979424e-06, "loss": 0.0472, "step": 2300 }, { "epoch": 0.8518518518518519, "eval_all-nli-dev_cosine_accuracy": 0.9967361111111112, "eval_loss": 0.02424301952123642, "eval_runtime": 100.0984, "eval_samples_per_second": 143.858, "eval_steps_per_second": 8.991, "step": 2300 }, { "epoch": 0.8888888888888888, "grad_norm": 0.8177826404571533, "learning_rate": 1.234567901234568e-06, "loss": 0.0492, "step": 2400 }, { "epoch": 0.8888888888888888, "eval_all-nli-dev_cosine_accuracy": 0.9967361111111112, "eval_loss": 0.024101639166474342, "eval_runtime": 100.6902, "eval_samples_per_second": 143.013, "eval_steps_per_second": 8.938, "step": 2400 }, { "epoch": 0.9259259259259259, "grad_norm": 0.5140101909637451, "learning_rate": 8.23045267489712e-07, "loss": 0.0463, "step": 2500 }, { "epoch": 0.9259259259259259, "eval_all-nli-dev_cosine_accuracy": 0.9967361111111112, "eval_loss": 0.02386292815208435, "eval_runtime": 101.9918, "eval_samples_per_second": 141.188, "eval_steps_per_second": 8.824, "step": 2500 }, { "epoch": 0.9629629629629629, "grad_norm": 3.3629631996154785, "learning_rate": 4.11522633744856e-07, "loss": 0.0484, "step": 2600 }, { "epoch": 0.9629629629629629, "eval_all-nli-dev_cosine_accuracy": 0.9966666666666667, "eval_loss": 0.02382882498204708, "eval_runtime": 100.8961, "eval_samples_per_second": 142.721, "eval_steps_per_second": 8.92, "step": 2600 }, { "epoch": 1.0, "grad_norm": 2.4896204471588135, "learning_rate": 0.0, "loss": 0.0498, "step": 2700 }, { "epoch": 1.0, "eval_all-nli-dev_cosine_accuracy": 0.9967361111111112, "eval_loss": 0.023831075057387352, "eval_runtime": 100.2374, "eval_samples_per_second": 143.659, "eval_steps_per_second": 8.979, "step": 2700 } ], "logging_steps": 100, "max_steps": 2700, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 16, "trial_name": null, "trial_params": null }