{ "best_global_step": 1400, "best_metric": 1.6602575778961182, "best_model_checkpoint": "outputs/checkpoint-1400", "epoch": 1.0027855153203342, "eval_steps": 200, "global_step": 1800, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.027855153203342618, "grad_norm": 28.48779296875, "learning_rate": 9.090909090909091e-07, "loss": 6.8261, "step": 50 }, { "epoch": 0.055710306406685235, "grad_norm": 16.59324836730957, "learning_rate": 1.8367346938775512e-06, "loss": 5.5246, "step": 100 }, { "epoch": 0.08356545961002786, "grad_norm": 10.027810096740723, "learning_rate": 2.764378478664193e-06, "loss": 3.4, "step": 150 }, { "epoch": 0.11142061281337047, "grad_norm": 16.358600616455078, "learning_rate": 3.6920222634508353e-06, "loss": 2.6641, "step": 200 }, { "epoch": 0.11142061281337047, "eval_loss": 2.535517454147339, "eval_runtime": 58.8724, "eval_samples_per_second": 46.355, "eval_steps_per_second": 5.809, "step": 200 }, { "epoch": 0.1392757660167131, "grad_norm": 7.62955904006958, "learning_rate": 4.619666048237477e-06, "loss": 2.4208, "step": 250 }, { "epoch": 0.1671309192200557, "grad_norm": 5.676515579223633, "learning_rate": 5.547309833024119e-06, "loss": 2.2272, "step": 300 }, { "epoch": 0.19498607242339833, "grad_norm": 6.469155788421631, "learning_rate": 6.474953617810761e-06, "loss": 2.0871, "step": 350 }, { "epoch": 0.22284122562674094, "grad_norm": 6.341733455657959, "learning_rate": 7.402597402597404e-06, "loss": 1.9703, "step": 400 }, { "epoch": 0.22284122562674094, "eval_loss": 1.9659122228622437, "eval_runtime": 56.3654, "eval_samples_per_second": 48.416, "eval_steps_per_second": 6.068, "step": 400 }, { "epoch": 0.25069637883008355, "grad_norm": 7.506978511810303, "learning_rate": 8.330241187384045e-06, "loss": 1.8903, "step": 450 }, { "epoch": 0.2785515320334262, "grad_norm": 5.684309959411621, "learning_rate": 9.257884972170687e-06, "loss": 1.8074, "step": 500 }, { "epoch": 0.3064066852367688, "grad_norm": 4.537693977355957, "learning_rate": 9.999894931770648e-06, "loss": 1.7854, "step": 550 }, { "epoch": 0.3342618384401114, "grad_norm": 24.553997039794922, "learning_rate": 9.996218007374501e-06, "loss": 1.7252, "step": 600 }, { "epoch": 0.3342618384401114, "eval_loss": 1.74297034740448, "eval_runtime": 59.9228, "eval_samples_per_second": 45.542, "eval_steps_per_second": 5.707, "step": 600 }, { "epoch": 0.362116991643454, "grad_norm": 4.550091743469238, "learning_rate": 9.987292086409739e-06, "loss": 1.6874, "step": 650 }, { "epoch": 0.38997214484679665, "grad_norm": 4.49098014831543, "learning_rate": 9.973126546395205e-06, "loss": 1.6982, "step": 700 }, { "epoch": 0.4178272980501393, "grad_norm": 5.295725345611572, "learning_rate": 9.953736269561984e-06, "loss": 1.6394, "step": 750 }, { "epoch": 0.4456824512534819, "grad_norm": 5.552262306213379, "learning_rate": 9.929141627218212e-06, "loss": 1.6033, "step": 800 }, { "epoch": 0.4456824512534819, "eval_loss": 1.6921230554580688, "eval_runtime": 59.5066, "eval_samples_per_second": 45.86, "eval_steps_per_second": 5.747, "step": 800 }, { "epoch": 0.4735376044568245, "grad_norm": 7.223403453826904, "learning_rate": 9.899368458347117e-06, "loss": 1.6089, "step": 850 }, { "epoch": 0.5013927576601671, "grad_norm": 4.941762924194336, "learning_rate": 9.86444804246072e-06, "loss": 1.5874, "step": 900 }, { "epoch": 0.5292479108635098, "grad_norm": 6.057460784912109, "learning_rate": 9.824417066737782e-06, "loss": 1.5895, "step": 950 }, { "epoch": 0.5571030640668524, "grad_norm": 6.217402935028076, "learning_rate": 9.779317587480471e-06, "loss": 1.5377, "step": 1000 }, { "epoch": 0.5571030640668524, "eval_loss": 1.6874749660491943, "eval_runtime": 59.8971, "eval_samples_per_second": 45.561, "eval_steps_per_second": 5.71, "step": 1000 }, { "epoch": 0.584958217270195, "grad_norm": 6.518588542938232, "learning_rate": 9.729196985930261e-06, "loss": 1.5332, "step": 1050 }, { "epoch": 0.6128133704735376, "grad_norm": 5.452282905578613, "learning_rate": 9.674107918489489e-06, "loss": 1.515, "step": 1100 }, { "epoch": 0.6406685236768802, "grad_norm": 5.754498481750488, "learning_rate": 9.61410826140085e-06, "loss": 1.5108, "step": 1150 }, { "epoch": 0.6685236768802229, "grad_norm": 5.382887840270996, "learning_rate": 9.549261049942972e-06, "loss": 1.4732, "step": 1200 }, { "epoch": 0.6685236768802229, "eval_loss": 1.6714078187942505, "eval_runtime": 58.2937, "eval_samples_per_second": 46.815, "eval_steps_per_second": 5.867, "step": 1200 }, { "epoch": 0.6963788300835655, "grad_norm": 5.8490891456604, "learning_rate": 9.479634412205929e-06, "loss": 1.4491, "step": 1250 }, { "epoch": 0.724233983286908, "grad_norm": 6.567768096923828, "learning_rate": 9.405301497516274e-06, "loss": 1.4845, "step": 1300 }, { "epoch": 0.7520891364902507, "grad_norm": 6.346134662628174, "learning_rate": 9.326340399586805e-06, "loss": 1.4787, "step": 1350 }, { "epoch": 0.7799442896935933, "grad_norm": 6.748211860656738, "learning_rate": 9.24283407447178e-06, "loss": 1.47, "step": 1400 }, { "epoch": 0.7799442896935933, "eval_loss": 1.6602575778961182, "eval_runtime": 61.3099, "eval_samples_per_second": 44.512, "eval_steps_per_second": 5.578, "step": 1400 }, { "epoch": 0.807799442896936, "grad_norm": 6.448507308959961, "learning_rate": 9.154870253413776e-06, "loss": 1.3974, "step": 1450 }, { "epoch": 0.8356545961002786, "grad_norm": 6.654584884643555, "learning_rate": 9.062541350673782e-06, "loss": 1.3915, "step": 1500 }, { "epoch": 0.8635097493036211, "grad_norm": 7.59186315536499, "learning_rate": 8.965944366441332e-06, "loss": 1.3932, "step": 1550 }, { "epoch": 0.8913649025069638, "grad_norm": 7.587399005889893, "learning_rate": 8.86518078492669e-06, "loss": 1.3702, "step": 1600 }, { "epoch": 0.8913649025069638, "eval_loss": 1.6901545524597168, "eval_runtime": 58.7741, "eval_samples_per_second": 46.432, "eval_steps_per_second": 5.819, "step": 1600 }, { "epoch": 0.9192200557103064, "grad_norm": 8.80136489868164, "learning_rate": 8.760356467742144e-06, "loss": 1.3641, "step": 1650 }, { "epoch": 0.947075208913649, "grad_norm": 7.431648254394531, "learning_rate": 8.651581542684438e-06, "loss": 1.3336, "step": 1700 }, { "epoch": 0.9749303621169917, "grad_norm": 13.54917049407959, "learning_rate": 8.538970288035174e-06, "loss": 1.3335, "step": 1750 }, { "epoch": 1.0027855153203342, "grad_norm": 7.557823657989502, "learning_rate": 8.42264101250073e-06, "loss": 1.2826, "step": 1800 }, { "epoch": 1.0027855153203342, "eval_loss": 1.6702473163604736, "eval_runtime": 61.0186, "eval_samples_per_second": 44.724, "eval_steps_per_second": 5.605, "step": 1800 } ], "logging_steps": 50, "max_steps": 5385, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 200, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 2, "early_stopping_threshold": 0.01 }, "attributes": { "early_stopping_patience_counter": 2 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 3.19765988253696e+16, "train_batch_size": 8, "trial_name": null, "trial_params": null }