RigoBERTa-2.0 / trainer_state.json
GuillemGSubies's picture
Initial commit
8de8d3c verified
{
"best_metric": 1.4579006433486938,
"best_model_checkpoint": "/home/alejandro.vaca/new_checkpoints_xlm_roberta/checkpoint-78800",
"epoch": 0.22749530494715703,
"global_step": 141600,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0,
"learning_rate": 1.5191565642755143e-09,
"loss": 2.6373,
"step": 1
},
{
"epoch": 0.0,
"learning_rate": 6.076626257102056e-07,
"loss": 2.2331,
"step": 400
},
{
"epoch": 0.0,
"eval_loss": 1.9753497838974,
"eval_runtime": 146.0905,
"eval_samples_per_second": 136.901,
"eval_steps_per_second": 2.143,
"step": 400
},
{
"epoch": 0.01,
"learning_rate": 1.2153252514204113e-06,
"loss": 2.0391,
"step": 800
},
{
"epoch": 0.01,
"eval_loss": 1.9167841672897339,
"eval_runtime": 146.7121,
"eval_samples_per_second": 136.321,
"eval_steps_per_second": 2.133,
"step": 800
},
{
"epoch": 0.01,
"learning_rate": 1.822987877130617e-06,
"loss": 1.9963,
"step": 1200
},
{
"epoch": 0.01,
"eval_loss": 1.8917230367660522,
"eval_runtime": 144.3485,
"eval_samples_per_second": 138.554,
"eval_steps_per_second": 2.168,
"step": 1200
},
{
"epoch": 0.01,
"learning_rate": 2.4306505028408226e-06,
"loss": 1.9736,
"step": 1600
},
{
"epoch": 0.01,
"eval_loss": 1.8841623067855835,
"eval_runtime": 142.7005,
"eval_samples_per_second": 140.154,
"eval_steps_per_second": 2.193,
"step": 1600
},
{
"epoch": 0.01,
"learning_rate": 3.0383131285510288e-06,
"loss": 1.9579,
"step": 2000
},
{
"epoch": 0.01,
"eval_loss": 1.8723009824752808,
"eval_runtime": 144.148,
"eval_samples_per_second": 138.746,
"eval_steps_per_second": 2.171,
"step": 2000
},
{
"epoch": 0.02,
"learning_rate": 3.645975754261234e-06,
"loss": 1.9465,
"step": 2400
},
{
"epoch": 0.02,
"eval_loss": 1.8523199558258057,
"eval_runtime": 143.784,
"eval_samples_per_second": 139.098,
"eval_steps_per_second": 2.177,
"step": 2400
},
{
"epoch": 0.02,
"learning_rate": 4.25363837997144e-06,
"loss": 1.9352,
"step": 2800
},
{
"epoch": 0.02,
"eval_loss": 1.843543529510498,
"eval_runtime": 146.189,
"eval_samples_per_second": 136.809,
"eval_steps_per_second": 2.141,
"step": 2800
},
{
"epoch": 0.02,
"learning_rate": 4.861301005681645e-06,
"loss": 1.9244,
"step": 3200
},
{
"epoch": 0.02,
"eval_loss": 1.8356839418411255,
"eval_runtime": 143.4998,
"eval_samples_per_second": 139.373,
"eval_steps_per_second": 2.181,
"step": 3200
},
{
"epoch": 0.03,
"learning_rate": 5.468963631391851e-06,
"loss": 1.9143,
"step": 3600
},
{
"epoch": 0.03,
"eval_loss": 1.8183759450912476,
"eval_runtime": 144.8799,
"eval_samples_per_second": 138.045,
"eval_steps_per_second": 2.16,
"step": 3600
},
{
"epoch": 0.03,
"learning_rate": 6.0766262571020576e-06,
"loss": 1.9042,
"step": 4000
},
{
"epoch": 0.03,
"eval_loss": 1.8169724941253662,
"eval_runtime": 145.0112,
"eval_samples_per_second": 137.92,
"eval_steps_per_second": 2.158,
"step": 4000
},
{
"epoch": 0.03,
"learning_rate": 6.684288882812263e-06,
"loss": 1.8971,
"step": 4400
},
{
"epoch": 0.03,
"eval_loss": 1.808371901512146,
"eval_runtime": 147.4613,
"eval_samples_per_second": 135.629,
"eval_steps_per_second": 2.123,
"step": 4400
},
{
"epoch": 0.04,
"learning_rate": 7.291951508522468e-06,
"loss": 1.888,
"step": 4800
},
{
"epoch": 0.04,
"eval_loss": 1.8037678003311157,
"eval_runtime": 150.0493,
"eval_samples_per_second": 133.29,
"eval_steps_per_second": 2.086,
"step": 4800
},
{
"epoch": 0.04,
"learning_rate": 7.899614134232675e-06,
"loss": 1.8809,
"step": 5200
},
{
"epoch": 0.04,
"eval_loss": 1.7955741882324219,
"eval_runtime": 148.2714,
"eval_samples_per_second": 134.888,
"eval_steps_per_second": 2.111,
"step": 5200
},
{
"epoch": 0.04,
"learning_rate": 8.50727675994288e-06,
"loss": 1.8741,
"step": 5600
},
{
"epoch": 0.04,
"eval_loss": 1.7888526916503906,
"eval_runtime": 147.2993,
"eval_samples_per_second": 135.778,
"eval_steps_per_second": 2.125,
"step": 5600
},
{
"epoch": 0.04,
"learning_rate": 9.114939385653086e-06,
"loss": 1.8685,
"step": 6000
},
{
"epoch": 0.04,
"eval_loss": 1.785848617553711,
"eval_runtime": 143.6092,
"eval_samples_per_second": 139.267,
"eval_steps_per_second": 2.18,
"step": 6000
},
{
"epoch": 0.05,
"learning_rate": 9.72260201136329e-06,
"loss": 1.8589,
"step": 6400
},
{
"epoch": 0.05,
"eval_loss": 1.781029462814331,
"eval_runtime": 147.7751,
"eval_samples_per_second": 135.341,
"eval_steps_per_second": 2.118,
"step": 6400
},
{
"epoch": 0.05,
"learning_rate": 1.0330264637073497e-05,
"loss": 1.8544,
"step": 6800
},
{
"epoch": 0.05,
"eval_loss": 1.7769867181777954,
"eval_runtime": 147.2523,
"eval_samples_per_second": 135.821,
"eval_steps_per_second": 2.126,
"step": 6800
},
{
"epoch": 0.05,
"learning_rate": 1.0937927262783703e-05,
"loss": 1.8481,
"step": 7200
},
{
"epoch": 0.05,
"eval_loss": 1.7637484073638916,
"eval_runtime": 143.1769,
"eval_samples_per_second": 139.687,
"eval_steps_per_second": 2.186,
"step": 7200
},
{
"epoch": 0.06,
"learning_rate": 1.1545589888493909e-05,
"loss": 1.8428,
"step": 7600
},
{
"epoch": 0.06,
"eval_loss": 1.756960391998291,
"eval_runtime": 145.8319,
"eval_samples_per_second": 137.144,
"eval_steps_per_second": 2.146,
"step": 7600
},
{
"epoch": 0.06,
"learning_rate": 1.2153252514204115e-05,
"loss": 1.8373,
"step": 8000
},
{
"epoch": 0.06,
"eval_loss": 1.7565785646438599,
"eval_runtime": 144.3741,
"eval_samples_per_second": 138.529,
"eval_steps_per_second": 2.168,
"step": 8000
},
{
"epoch": 0.06,
"learning_rate": 1.276091513991432e-05,
"loss": 1.8304,
"step": 8400
},
{
"epoch": 0.06,
"eval_loss": 1.742794156074524,
"eval_runtime": 146.6168,
"eval_samples_per_second": 136.41,
"eval_steps_per_second": 2.135,
"step": 8400
},
{
"epoch": 0.06,
"learning_rate": 1.3368577765624526e-05,
"loss": 1.8259,
"step": 8800
},
{
"epoch": 0.06,
"eval_loss": 1.7337759733200073,
"eval_runtime": 145.9226,
"eval_samples_per_second": 137.059,
"eval_steps_per_second": 2.145,
"step": 8800
},
{
"epoch": 0.07,
"learning_rate": 1.3976240391334734e-05,
"loss": 1.8219,
"step": 9200
},
{
"epoch": 0.07,
"eval_loss": 1.7424650192260742,
"eval_runtime": 145.6453,
"eval_samples_per_second": 137.32,
"eval_steps_per_second": 2.149,
"step": 9200
},
{
"epoch": 0.07,
"learning_rate": 1.4583903017044936e-05,
"loss": 1.8162,
"step": 9600
},
{
"epoch": 0.07,
"eval_loss": 1.7316113710403442,
"eval_runtime": 145.2248,
"eval_samples_per_second": 137.718,
"eval_steps_per_second": 2.155,
"step": 9600
},
{
"epoch": 0.07,
"learning_rate": 1.5191565642755143e-05,
"loss": 1.8112,
"step": 10000
},
{
"epoch": 0.07,
"eval_loss": 1.7247357368469238,
"eval_runtime": 146.3969,
"eval_samples_per_second": 136.615,
"eval_steps_per_second": 2.138,
"step": 10000
},
{
"epoch": 0.08,
"learning_rate": 1.579922826846535e-05,
"loss": 1.807,
"step": 10400
},
{
"epoch": 0.08,
"eval_loss": 1.725953459739685,
"eval_runtime": 144.2953,
"eval_samples_per_second": 138.605,
"eval_steps_per_second": 2.169,
"step": 10400
},
{
"epoch": 0.08,
"learning_rate": 1.6406890894175555e-05,
"loss": 1.8034,
"step": 10800
},
{
"epoch": 0.08,
"eval_loss": 1.721238136291504,
"eval_runtime": 145.7833,
"eval_samples_per_second": 137.19,
"eval_steps_per_second": 2.147,
"step": 10800
},
{
"epoch": 0.08,
"learning_rate": 1.701455351988576e-05,
"loss": 1.7984,
"step": 11200
},
{
"epoch": 0.08,
"eval_loss": 1.7204127311706543,
"eval_runtime": 147.5961,
"eval_samples_per_second": 135.505,
"eval_steps_per_second": 2.121,
"step": 11200
},
{
"epoch": 0.08,
"learning_rate": 1.7622216145595964e-05,
"loss": 1.7944,
"step": 11600
},
{
"epoch": 0.08,
"eval_loss": 1.7186585664749146,
"eval_runtime": 143.1913,
"eval_samples_per_second": 139.673,
"eval_steps_per_second": 2.186,
"step": 11600
},
{
"epoch": 0.09,
"learning_rate": 1.8229878771306172e-05,
"loss": 1.7915,
"step": 12000
},
{
"epoch": 0.09,
"eval_loss": 1.7116312980651855,
"eval_runtime": 149.8115,
"eval_samples_per_second": 133.501,
"eval_steps_per_second": 2.089,
"step": 12000
},
{
"epoch": 0.09,
"learning_rate": 1.883754139701638e-05,
"loss": 1.7864,
"step": 12400
},
{
"epoch": 0.09,
"eval_loss": 1.705054521560669,
"eval_runtime": 147.1783,
"eval_samples_per_second": 135.89,
"eval_steps_per_second": 2.127,
"step": 12400
},
{
"epoch": 0.09,
"learning_rate": 1.944520402272658e-05,
"loss": 1.7819,
"step": 12800
},
{
"epoch": 0.09,
"eval_loss": 1.6974027156829834,
"eval_runtime": 151.8787,
"eval_samples_per_second": 131.684,
"eval_steps_per_second": 2.061,
"step": 12800
},
{
"epoch": 0.1,
"learning_rate": 2.005286664843679e-05,
"loss": 1.7751,
"step": 13200
},
{
"epoch": 0.1,
"eval_loss": 1.7015215158462524,
"eval_runtime": 166.3617,
"eval_samples_per_second": 120.22,
"eval_steps_per_second": 1.881,
"step": 13200
},
{
"epoch": 0.1,
"learning_rate": 2.0660529274146993e-05,
"loss": 1.774,
"step": 13600
},
{
"epoch": 0.1,
"eval_loss": 1.697357177734375,
"eval_runtime": 210.2219,
"eval_samples_per_second": 95.138,
"eval_steps_per_second": 1.489,
"step": 13600
},
{
"epoch": 0.1,
"learning_rate": 2.12681918998572e-05,
"loss": 1.7685,
"step": 14000
},
{
"epoch": 0.1,
"eval_loss": 1.7000294923782349,
"eval_runtime": 388.8288,
"eval_samples_per_second": 51.437,
"eval_steps_per_second": 0.805,
"step": 14000
},
{
"epoch": 0.11,
"learning_rate": 2.1875854525567406e-05,
"loss": 1.7656,
"step": 14400
},
{
"epoch": 0.11,
"eval_loss": 1.6891347169876099,
"eval_runtime": 149.9008,
"eval_samples_per_second": 133.422,
"eval_steps_per_second": 2.088,
"step": 14400
},
{
"epoch": 0.11,
"learning_rate": 2.248351715127761e-05,
"loss": 1.7601,
"step": 14800
},
{
"epoch": 0.11,
"eval_loss": 1.691114902496338,
"eval_runtime": 148.0615,
"eval_samples_per_second": 135.079,
"eval_steps_per_second": 2.114,
"step": 14800
},
{
"epoch": 0.11,
"learning_rate": 2.3091179776987818e-05,
"loss": 1.7574,
"step": 15200
},
{
"epoch": 0.11,
"eval_loss": 1.6803953647613525,
"eval_runtime": 162.6955,
"eval_samples_per_second": 122.929,
"eval_steps_per_second": 1.924,
"step": 15200
},
{
"epoch": 0.11,
"learning_rate": 2.3698842402698022e-05,
"loss": 1.7528,
"step": 15600
},
{
"epoch": 0.11,
"eval_loss": 1.6794207096099854,
"eval_runtime": 152.5086,
"eval_samples_per_second": 131.14,
"eval_steps_per_second": 2.052,
"step": 15600
},
{
"epoch": 0.12,
"learning_rate": 2.430650502840823e-05,
"loss": 1.7494,
"step": 16000
},
{
"epoch": 0.12,
"eval_loss": 1.6750398874282837,
"eval_runtime": 150.7356,
"eval_samples_per_second": 132.683,
"eval_steps_per_second": 2.076,
"step": 16000
},
{
"epoch": 0.12,
"learning_rate": 2.4914167654118435e-05,
"loss": 1.7441,
"step": 16400
},
{
"epoch": 0.12,
"eval_loss": 1.6635680198669434,
"eval_runtime": 158.1954,
"eval_samples_per_second": 126.426,
"eval_steps_per_second": 1.979,
"step": 16400
},
{
"epoch": 0.12,
"learning_rate": 2.552183027982864e-05,
"loss": 1.7405,
"step": 16800
},
{
"epoch": 0.12,
"eval_loss": 1.660568118095398,
"eval_runtime": 144.9472,
"eval_samples_per_second": 137.981,
"eval_steps_per_second": 2.159,
"step": 16800
},
{
"epoch": 0.13,
"learning_rate": 2.6129492905538844e-05,
"loss": 1.7373,
"step": 17200
},
{
"epoch": 0.13,
"eval_loss": 1.6654884815216064,
"eval_runtime": 161.3267,
"eval_samples_per_second": 123.972,
"eval_steps_per_second": 1.94,
"step": 17200
},
{
"epoch": 0.13,
"learning_rate": 2.673715553124905e-05,
"loss": 1.7336,
"step": 17600
},
{
"epoch": 0.13,
"eval_loss": 1.6575931310653687,
"eval_runtime": 248.169,
"eval_samples_per_second": 80.59,
"eval_steps_per_second": 1.261,
"step": 17600
},
{
"epoch": 0.13,
"learning_rate": 2.7344818156959256e-05,
"loss": 1.7291,
"step": 18000
},
{
"epoch": 0.13,
"eval_loss": 1.6604431867599487,
"eval_runtime": 145.1056,
"eval_samples_per_second": 137.831,
"eval_steps_per_second": 2.157,
"step": 18000
},
{
"epoch": 0.13,
"learning_rate": 2.7952480782669467e-05,
"loss": 1.7243,
"step": 18400
},
{
"epoch": 0.13,
"eval_loss": 1.6801910400390625,
"eval_runtime": 153.7947,
"eval_samples_per_second": 130.043,
"eval_steps_per_second": 2.035,
"step": 18400
},
{
"epoch": 0.14,
"learning_rate": 2.856014340837967e-05,
"loss": 1.7214,
"step": 18800
},
{
"epoch": 0.14,
"eval_loss": 1.6495254039764404,
"eval_runtime": 148.9903,
"eval_samples_per_second": 134.237,
"eval_steps_per_second": 2.101,
"step": 18800
},
{
"epoch": 0.14,
"learning_rate": 2.9167806034089873e-05,
"loss": 1.7178,
"step": 19200
},
{
"epoch": 0.14,
"eval_loss": 1.6446107625961304,
"eval_runtime": 149.3174,
"eval_samples_per_second": 133.943,
"eval_steps_per_second": 2.096,
"step": 19200
},
{
"epoch": 0.14,
"learning_rate": 2.977546865980008e-05,
"loss": 1.7146,
"step": 19600
},
{
"epoch": 0.14,
"eval_loss": 1.641605019569397,
"eval_runtime": 154.456,
"eval_samples_per_second": 129.487,
"eval_steps_per_second": 2.026,
"step": 19600
},
{
"epoch": 0.15,
"learning_rate": 3.0383131285510285e-05,
"loss": 1.7118,
"step": 20000
},
{
"epoch": 0.15,
"eval_loss": 1.6381052732467651,
"eval_runtime": 151.3863,
"eval_samples_per_second": 132.112,
"eval_steps_per_second": 2.068,
"step": 20000
},
{
"epoch": 0.15,
"learning_rate": 3.099079391122049e-05,
"loss": 1.7083,
"step": 20400
},
{
"epoch": 0.15,
"eval_loss": 1.6341092586517334,
"eval_runtime": 238.9093,
"eval_samples_per_second": 83.714,
"eval_steps_per_second": 1.31,
"step": 20400
},
{
"epoch": 0.15,
"learning_rate": 3.15984565369307e-05,
"loss": 1.7062,
"step": 20800
},
{
"epoch": 0.15,
"eval_loss": 1.6292831897735596,
"eval_runtime": 152.4932,
"eval_samples_per_second": 131.153,
"eval_steps_per_second": 2.053,
"step": 20800
},
{
"epoch": 0.15,
"learning_rate": 3.22061191626409e-05,
"loss": 1.7054,
"step": 21200
},
{
"epoch": 0.15,
"eval_loss": 1.6273330450057983,
"eval_runtime": 151.097,
"eval_samples_per_second": 132.365,
"eval_steps_per_second": 2.072,
"step": 21200
},
{
"epoch": 0.16,
"learning_rate": 3.281378178835111e-05,
"loss": 1.7012,
"step": 21600
},
{
"epoch": 0.16,
"eval_loss": 1.6267642974853516,
"eval_runtime": 149.0125,
"eval_samples_per_second": 134.217,
"eval_steps_per_second": 2.1,
"step": 21600
},
{
"epoch": 0.16,
"learning_rate": 3.3421444414061314e-05,
"loss": 1.6993,
"step": 22000
},
{
"epoch": 0.16,
"eval_loss": 1.6256201267242432,
"eval_runtime": 149.4973,
"eval_samples_per_second": 133.782,
"eval_steps_per_second": 2.094,
"step": 22000
},
{
"epoch": 0.16,
"learning_rate": 3.402910703977152e-05,
"loss": 1.697,
"step": 22400
},
{
"epoch": 0.16,
"eval_loss": 1.6158908605575562,
"eval_runtime": 152.4757,
"eval_samples_per_second": 131.168,
"eval_steps_per_second": 2.053,
"step": 22400
},
{
"epoch": 0.17,
"learning_rate": 3.463676966548173e-05,
"loss": 1.6938,
"step": 22800
},
{
"epoch": 0.17,
"eval_loss": 1.6134721040725708,
"eval_runtime": 152.3857,
"eval_samples_per_second": 131.246,
"eval_steps_per_second": 2.054,
"step": 22800
},
{
"epoch": 0.17,
"learning_rate": 3.524443229119193e-05,
"loss": 1.6923,
"step": 23200
},
{
"epoch": 0.17,
"eval_loss": 1.6194721460342407,
"eval_runtime": 149.9763,
"eval_samples_per_second": 133.354,
"eval_steps_per_second": 2.087,
"step": 23200
},
{
"epoch": 0.17,
"learning_rate": 3.585209491690214e-05,
"loss": 1.6888,
"step": 23600
},
{
"epoch": 0.17,
"eval_loss": 1.6149234771728516,
"eval_runtime": 150.7266,
"eval_samples_per_second": 132.691,
"eval_steps_per_second": 2.077,
"step": 23600
},
{
"epoch": 0.18,
"learning_rate": 3.6459757542612344e-05,
"loss": 1.687,
"step": 24000
},
{
"epoch": 0.18,
"eval_loss": 1.6148015260696411,
"eval_runtime": 152.1295,
"eval_samples_per_second": 131.467,
"eval_steps_per_second": 2.057,
"step": 24000
},
{
"epoch": 0.18,
"learning_rate": 3.706742016832255e-05,
"loss": 1.6886,
"step": 24400
},
{
"epoch": 0.18,
"eval_loss": 1.6169975996017456,
"eval_runtime": 152.146,
"eval_samples_per_second": 131.453,
"eval_steps_per_second": 2.057,
"step": 24400
},
{
"epoch": 0.18,
"learning_rate": 3.767508279403276e-05,
"loss": 1.6865,
"step": 24800
},
{
"epoch": 0.18,
"eval_loss": 1.6124180555343628,
"eval_runtime": 174.6369,
"eval_samples_per_second": 114.523,
"eval_steps_per_second": 1.792,
"step": 24800
},
{
"epoch": 0.18,
"learning_rate": 3.828274541974296e-05,
"loss": 1.6829,
"step": 25200
},
{
"epoch": 0.18,
"eval_loss": 1.6170154809951782,
"eval_runtime": 262.8027,
"eval_samples_per_second": 76.103,
"eval_steps_per_second": 1.191,
"step": 25200
},
{
"epoch": 0.19,
"learning_rate": 3.889040804545316e-05,
"loss": 1.6813,
"step": 25600
},
{
"epoch": 0.19,
"eval_loss": 1.6040676832199097,
"eval_runtime": 255.431,
"eval_samples_per_second": 78.299,
"eval_steps_per_second": 1.225,
"step": 25600
},
{
"epoch": 0.19,
"learning_rate": 3.949807067116337e-05,
"loss": 1.6806,
"step": 26000
},
{
"epoch": 0.19,
"eval_loss": 1.6070351600646973,
"eval_runtime": 151.6507,
"eval_samples_per_second": 131.882,
"eval_steps_per_second": 2.064,
"step": 26000
},
{
"epoch": 0.19,
"learning_rate": 4.010573329687358e-05,
"loss": 1.6763,
"step": 26400
},
{
"epoch": 0.19,
"eval_loss": 1.599661946296692,
"eval_runtime": 150.2287,
"eval_samples_per_second": 133.13,
"eval_steps_per_second": 2.083,
"step": 26400
},
{
"epoch": 0.2,
"learning_rate": 4.071339592258379e-05,
"loss": 1.6733,
"step": 26800
},
{
"epoch": 0.2,
"eval_loss": 1.6072720289230347,
"eval_runtime": 152.0466,
"eval_samples_per_second": 131.539,
"eval_steps_per_second": 2.059,
"step": 26800
},
{
"epoch": 0.2,
"learning_rate": 4.1321058548293986e-05,
"loss": 1.6695,
"step": 27200
},
{
"epoch": 0.2,
"eval_loss": 1.6115573644638062,
"eval_runtime": 148.9069,
"eval_samples_per_second": 134.312,
"eval_steps_per_second": 2.102,
"step": 27200
},
{
"epoch": 0.2,
"learning_rate": 4.192872117400419e-05,
"loss": 1.6687,
"step": 27600
},
{
"epoch": 0.2,
"eval_loss": 1.611473798751831,
"eval_runtime": 158.4393,
"eval_samples_per_second": 126.231,
"eval_steps_per_second": 1.976,
"step": 27600
},
{
"epoch": 0.2,
"learning_rate": 4.25363837997144e-05,
"loss": 1.6673,
"step": 28000
},
{
"epoch": 0.2,
"eval_loss": 1.606929898262024,
"eval_runtime": 159.9938,
"eval_samples_per_second": 125.005,
"eval_steps_per_second": 1.956,
"step": 28000
},
{
"epoch": 0.21,
"learning_rate": 4.3144046425424607e-05,
"loss": 1.6655,
"step": 28400
},
{
"epoch": 0.21,
"eval_loss": 1.5869165658950806,
"eval_runtime": 154.5639,
"eval_samples_per_second": 129.396,
"eval_steps_per_second": 2.025,
"step": 28400
},
{
"epoch": 0.21,
"learning_rate": 4.375170905113481e-05,
"loss": 1.6622,
"step": 28800
},
{
"epoch": 0.21,
"eval_loss": 1.6052591800689697,
"eval_runtime": 368.5722,
"eval_samples_per_second": 54.263,
"eval_steps_per_second": 0.849,
"step": 28800
},
{
"epoch": 0.21,
"learning_rate": 4.4359371676845016e-05,
"loss": 1.6598,
"step": 29200
},
{
"epoch": 0.21,
"eval_loss": 1.58935546875,
"eval_runtime": 152.6646,
"eval_samples_per_second": 131.006,
"eval_steps_per_second": 2.05,
"step": 29200
},
{
"epoch": 0.22,
"learning_rate": 4.496703430255522e-05,
"loss": 1.659,
"step": 29600
},
{
"epoch": 0.22,
"eval_loss": 1.5808852910995483,
"eval_runtime": 155.4857,
"eval_samples_per_second": 128.629,
"eval_steps_per_second": 2.013,
"step": 29600
},
{
"epoch": 0.22,
"learning_rate": 4.557469692826543e-05,
"loss": 1.6583,
"step": 30000
},
{
"epoch": 0.22,
"eval_loss": 1.588645100593567,
"eval_runtime": 277.2998,
"eval_samples_per_second": 72.124,
"eval_steps_per_second": 1.129,
"step": 30000
},
{
"epoch": 0.22,
"learning_rate": 4.6182359553975636e-05,
"loss": 1.6555,
"step": 30400
},
{
"epoch": 0.22,
"eval_loss": 1.5864471197128296,
"eval_runtime": 161.6086,
"eval_samples_per_second": 123.756,
"eval_steps_per_second": 1.937,
"step": 30400
},
{
"epoch": 0.22,
"learning_rate": 4.679002217968584e-05,
"loss": 1.6559,
"step": 30800
},
{
"epoch": 0.22,
"eval_loss": 1.583774209022522,
"eval_runtime": 346.1205,
"eval_samples_per_second": 57.783,
"eval_steps_per_second": 0.904,
"step": 30800
},
{
"epoch": 0.23,
"learning_rate": 4.7397684805396045e-05,
"loss": 1.6522,
"step": 31200
},
{
"epoch": 0.23,
"eval_loss": 1.5791034698486328,
"eval_runtime": 326.2692,
"eval_samples_per_second": 61.299,
"eval_steps_per_second": 0.959,
"step": 31200
},
{
"epoch": 0.23,
"learning_rate": 4.800534743110625e-05,
"loss": 1.6499,
"step": 31600
},
{
"epoch": 0.23,
"eval_loss": 1.5826290845870972,
"eval_runtime": 175.7208,
"eval_samples_per_second": 113.817,
"eval_steps_per_second": 1.781,
"step": 31600
},
{
"epoch": 0.23,
"learning_rate": 4.861301005681646e-05,
"loss": 1.6506,
"step": 32000
},
{
"epoch": 0.23,
"eval_loss": 1.5759295225143433,
"eval_runtime": 317.0144,
"eval_samples_per_second": 63.089,
"eval_steps_per_second": 0.987,
"step": 32000
},
{
"epoch": 0.24,
"learning_rate": 4.9220672682526665e-05,
"loss": 1.6498,
"step": 32400
},
{
"epoch": 0.24,
"eval_loss": 1.5828478336334229,
"eval_runtime": 219.1147,
"eval_samples_per_second": 91.276,
"eval_steps_per_second": 1.428,
"step": 32400
},
{
"epoch": 0.24,
"learning_rate": 4.982833530823687e-05,
"loss": 1.6473,
"step": 32800
},
{
"epoch": 0.24,
"eval_loss": 1.572839617729187,
"eval_runtime": 188.3492,
"eval_samples_per_second": 106.186,
"eval_steps_per_second": 1.662,
"step": 32800
},
{
"epoch": 0.24,
"learning_rate": 4.99405445046135e-05,
"loss": 1.644,
"step": 33200
},
{
"epoch": 0.24,
"eval_loss": 1.5747781991958618,
"eval_runtime": 261.2843,
"eval_samples_per_second": 76.545,
"eval_steps_per_second": 1.198,
"step": 33200
},
{
"epoch": 0.25,
"learning_rate": 4.9857679702681096e-05,
"loss": 1.6419,
"step": 33600
},
{
"epoch": 0.25,
"eval_loss": 1.569125771522522,
"eval_runtime": 238.4252,
"eval_samples_per_second": 83.884,
"eval_steps_per_second": 1.313,
"step": 33600
},
{
"epoch": 0.25,
"learning_rate": 4.977481490074868e-05,
"loss": 1.6416,
"step": 34000
},
{
"epoch": 0.25,
"eval_loss": 1.5649021863937378,
"eval_runtime": 403.1613,
"eval_samples_per_second": 49.608,
"eval_steps_per_second": 0.776,
"step": 34000
},
{
"epoch": 0.25,
"learning_rate": 4.969195009881628e-05,
"loss": 1.6365,
"step": 34400
},
{
"epoch": 0.25,
"eval_loss": 1.5665974617004395,
"eval_runtime": 154.8734,
"eval_samples_per_second": 129.138,
"eval_steps_per_second": 2.021,
"step": 34400
},
{
"epoch": 0.25,
"learning_rate": 4.9609085296883874e-05,
"loss": 1.6348,
"step": 34800
},
{
"epoch": 0.25,
"eval_loss": 1.5668097734451294,
"eval_runtime": 193.4192,
"eval_samples_per_second": 103.402,
"eval_steps_per_second": 1.618,
"step": 34800
},
{
"epoch": 0.26,
"learning_rate": 4.9526220494951466e-05,
"loss": 1.6342,
"step": 35200
},
{
"epoch": 0.26,
"eval_loss": 1.5644603967666626,
"eval_runtime": 525.4863,
"eval_samples_per_second": 38.06,
"eval_steps_per_second": 0.596,
"step": 35200
},
{
"epoch": 0.26,
"learning_rate": 4.944335569301905e-05,
"loss": 1.6319,
"step": 35600
},
{
"epoch": 0.26,
"eval_loss": 1.5583738088607788,
"eval_runtime": 152.5499,
"eval_samples_per_second": 131.105,
"eval_steps_per_second": 2.052,
"step": 35600
},
{
"epoch": 0.26,
"learning_rate": 4.936049089108665e-05,
"loss": 1.6304,
"step": 36000
},
{
"epoch": 0.26,
"eval_loss": 1.5624059438705444,
"eval_runtime": 195.0553,
"eval_samples_per_second": 102.535,
"eval_steps_per_second": 1.605,
"step": 36000
},
{
"epoch": 0.27,
"learning_rate": 4.9277626089154245e-05,
"loss": 1.6287,
"step": 36400
},
{
"epoch": 0.27,
"eval_loss": 1.5545308589935303,
"eval_runtime": 220.9752,
"eval_samples_per_second": 90.508,
"eval_steps_per_second": 1.416,
"step": 36400
},
{
"epoch": 0.27,
"learning_rate": 4.919476128722184e-05,
"loss": 1.6301,
"step": 36800
},
{
"epoch": 0.27,
"eval_loss": 1.5592070817947388,
"eval_runtime": 213.6052,
"eval_samples_per_second": 93.631,
"eval_steps_per_second": 1.465,
"step": 36800
},
{
"epoch": 0.27,
"learning_rate": 4.911189648528943e-05,
"loss": 1.6272,
"step": 37200
},
{
"epoch": 0.27,
"eval_loss": 1.5615522861480713,
"eval_runtime": 263.8607,
"eval_samples_per_second": 75.798,
"eval_steps_per_second": 1.186,
"step": 37200
},
{
"epoch": 0.27,
"learning_rate": 4.9029031683357016e-05,
"loss": 1.6267,
"step": 37600
},
{
"epoch": 0.27,
"eval_loss": 1.558023452758789,
"eval_runtime": 1163.3408,
"eval_samples_per_second": 17.192,
"eval_steps_per_second": 0.269,
"step": 37600
},
{
"epoch": 0.28,
"learning_rate": 4.8946166881424615e-05,
"loss": 1.624,
"step": 38000
},
{
"epoch": 0.28,
"eval_loss": 1.550244688987732,
"eval_runtime": 156.4776,
"eval_samples_per_second": 127.814,
"eval_steps_per_second": 2.0,
"step": 38000
},
{
"epoch": 0.28,
"learning_rate": 4.886330207949221e-05,
"loss": 1.6238,
"step": 38400
},
{
"epoch": 0.28,
"eval_loss": 1.5512545108795166,
"eval_runtime": 178.078,
"eval_samples_per_second": 112.31,
"eval_steps_per_second": 1.758,
"step": 38400
},
{
"epoch": 0.28,
"learning_rate": 4.87804372775598e-05,
"loss": 1.623,
"step": 38800
},
{
"epoch": 0.28,
"eval_loss": 1.5499157905578613,
"eval_runtime": 411.8408,
"eval_samples_per_second": 48.562,
"eval_steps_per_second": 0.76,
"step": 38800
},
{
"epoch": 0.29,
"learning_rate": 4.869757247562739e-05,
"loss": 1.6214,
"step": 39200
},
{
"epoch": 0.29,
"eval_loss": 1.554477572441101,
"eval_runtime": 152.8841,
"eval_samples_per_second": 130.818,
"eval_steps_per_second": 2.047,
"step": 39200
},
{
"epoch": 0.29,
"learning_rate": 4.8614707673694986e-05,
"loss": 1.6173,
"step": 39600
},
{
"epoch": 0.29,
"eval_loss": 1.5494047403335571,
"eval_runtime": 154.2296,
"eval_samples_per_second": 129.677,
"eval_steps_per_second": 2.029,
"step": 39600
},
{
"epoch": 0.29,
"learning_rate": 4.853184287176258e-05,
"loss": 1.6159,
"step": 40000
},
{
"epoch": 0.29,
"eval_loss": 1.5492123365402222,
"eval_runtime": 358.728,
"eval_samples_per_second": 55.753,
"eval_steps_per_second": 0.873,
"step": 40000
},
{
"epoch": 0.29,
"learning_rate": 4.844897806983017e-05,
"loss": 1.6131,
"step": 40400
},
{
"epoch": 0.29,
"eval_loss": 1.5435105562210083,
"eval_runtime": 424.1722,
"eval_samples_per_second": 47.151,
"eval_steps_per_second": 0.738,
"step": 40400
},
{
"epoch": 0.3,
"learning_rate": 4.8366113267897764e-05,
"loss": 1.6125,
"step": 40800
},
{
"epoch": 0.3,
"eval_loss": 1.5407049655914307,
"eval_runtime": 233.4949,
"eval_samples_per_second": 85.655,
"eval_steps_per_second": 1.341,
"step": 40800
},
{
"epoch": 0.3,
"learning_rate": 4.828324846596536e-05,
"loss": 1.6129,
"step": 41200
},
{
"epoch": 0.3,
"eval_loss": 1.5503147840499878,
"eval_runtime": 154.5487,
"eval_samples_per_second": 129.409,
"eval_steps_per_second": 2.025,
"step": 41200
},
{
"epoch": 0.3,
"learning_rate": 4.820038366403295e-05,
"loss": 1.61,
"step": 41600
},
{
"epoch": 0.3,
"eval_loss": 1.5319013595581055,
"eval_runtime": 229.3937,
"eval_samples_per_second": 87.186,
"eval_steps_per_second": 1.364,
"step": 41600
},
{
"epoch": 0.31,
"learning_rate": 4.811751886210054e-05,
"loss": 1.6083,
"step": 42000
},
{
"epoch": 0.31,
"eval_loss": 1.540002465248108,
"eval_runtime": 200.5559,
"eval_samples_per_second": 99.723,
"eval_steps_per_second": 1.561,
"step": 42000
},
{
"epoch": 0.31,
"learning_rate": 4.8034654060168135e-05,
"loss": 1.6049,
"step": 42400
},
{
"epoch": 0.31,
"eval_loss": 1.5374138355255127,
"eval_runtime": 353.7763,
"eval_samples_per_second": 56.533,
"eval_steps_per_second": 0.885,
"step": 42400
},
{
"epoch": 0.31,
"learning_rate": 4.795178925823573e-05,
"loss": 1.6048,
"step": 42800
},
{
"epoch": 0.31,
"eval_loss": 1.5372508764266968,
"eval_runtime": 306.8656,
"eval_samples_per_second": 65.175,
"eval_steps_per_second": 1.02,
"step": 42800
},
{
"epoch": 0.32,
"learning_rate": 4.786892445630332e-05,
"loss": 1.6036,
"step": 43200
},
{
"epoch": 0.32,
"eval_loss": 1.538548469543457,
"eval_runtime": 875.5249,
"eval_samples_per_second": 22.843,
"eval_steps_per_second": 0.357,
"step": 43200
},
{
"epoch": 0.32,
"learning_rate": 4.778605965437091e-05,
"loss": 1.6025,
"step": 43600
},
{
"epoch": 0.32,
"eval_loss": 1.5447686910629272,
"eval_runtime": 216.6802,
"eval_samples_per_second": 92.302,
"eval_steps_per_second": 1.445,
"step": 43600
},
{
"epoch": 0.32,
"learning_rate": 4.7703194852438506e-05,
"loss": 1.5987,
"step": 44000
},
{
"epoch": 0.32,
"eval_loss": 1.534464716911316,
"eval_runtime": 527.5707,
"eval_samples_per_second": 37.91,
"eval_steps_per_second": 0.593,
"step": 44000
},
{
"epoch": 0.32,
"learning_rate": 4.76203300505061e-05,
"loss": 1.5995,
"step": 44400
},
{
"epoch": 0.32,
"eval_loss": 1.537174105644226,
"eval_runtime": 157.1885,
"eval_samples_per_second": 127.236,
"eval_steps_per_second": 1.991,
"step": 44400
},
{
"epoch": 0.33,
"learning_rate": 4.753746524857369e-05,
"loss": 1.5995,
"step": 44800
},
{
"epoch": 0.33,
"eval_loss": 1.5312557220458984,
"eval_runtime": 590.8847,
"eval_samples_per_second": 33.848,
"eval_steps_per_second": 0.53,
"step": 44800
},
{
"epoch": 0.33,
"learning_rate": 4.7454600446641284e-05,
"loss": 1.6002,
"step": 45200
},
{
"epoch": 0.33,
"eval_loss": 1.5247910022735596,
"eval_runtime": 197.0178,
"eval_samples_per_second": 101.514,
"eval_steps_per_second": 1.589,
"step": 45200
},
{
"epoch": 0.33,
"learning_rate": 4.737173564470888e-05,
"loss": 1.5985,
"step": 45600
},
{
"epoch": 0.33,
"eval_loss": 1.5312753915786743,
"eval_runtime": 217.2767,
"eval_samples_per_second": 92.049,
"eval_steps_per_second": 1.441,
"step": 45600
},
{
"epoch": 0.34,
"learning_rate": 4.728887084277647e-05,
"loss": 1.5975,
"step": 46000
},
{
"epoch": 0.34,
"eval_loss": 1.5283282995224,
"eval_runtime": 247.4783,
"eval_samples_per_second": 80.815,
"eval_steps_per_second": 1.265,
"step": 46000
},
{
"epoch": 0.34,
"learning_rate": 4.720600604084406e-05,
"loss": 1.5942,
"step": 46400
},
{
"epoch": 0.34,
"eval_loss": 1.5262142419815063,
"eval_runtime": 943.4579,
"eval_samples_per_second": 21.199,
"eval_steps_per_second": 0.332,
"step": 46400
},
{
"epoch": 0.34,
"learning_rate": 4.7123141238911655e-05,
"loss": 1.5946,
"step": 46800
},
{
"epoch": 0.34,
"eval_loss": 1.5237544775009155,
"eval_runtime": 157.7499,
"eval_samples_per_second": 126.783,
"eval_steps_per_second": 1.984,
"step": 46800
},
{
"epoch": 0.34,
"learning_rate": 4.704027643697925e-05,
"loss": 1.592,
"step": 47200
},
{
"epoch": 0.34,
"eval_loss": 1.5289279222488403,
"eval_runtime": 380.6257,
"eval_samples_per_second": 52.545,
"eval_steps_per_second": 0.822,
"step": 47200
},
{
"epoch": 0.35,
"learning_rate": 4.695741163504685e-05,
"loss": 1.5924,
"step": 47600
},
{
"epoch": 0.35,
"eval_loss": 1.523956298828125,
"eval_runtime": 154.9689,
"eval_samples_per_second": 129.058,
"eval_steps_per_second": 2.02,
"step": 47600
},
{
"epoch": 0.35,
"learning_rate": 4.687454683311443e-05,
"loss": 1.5901,
"step": 48000
},
{
"epoch": 0.35,
"eval_loss": 1.5227019786834717,
"eval_runtime": 728.6822,
"eval_samples_per_second": 27.447,
"eval_steps_per_second": 0.43,
"step": 48000
},
{
"epoch": 0.35,
"learning_rate": 4.6791682031182026e-05,
"loss": 1.589,
"step": 48400
},
{
"epoch": 0.35,
"eval_loss": 1.5262556076049805,
"eval_runtime": 156.5557,
"eval_samples_per_second": 127.75,
"eval_steps_per_second": 1.999,
"step": 48400
},
{
"epoch": 0.36,
"learning_rate": 4.670881722924962e-05,
"loss": 1.5875,
"step": 48800
},
{
"epoch": 0.36,
"eval_loss": 1.5185788869857788,
"eval_runtime": 355.4244,
"eval_samples_per_second": 56.271,
"eval_steps_per_second": 0.881,
"step": 48800
},
{
"epoch": 0.36,
"learning_rate": 4.662595242731722e-05,
"loss": 1.5867,
"step": 49200
},
{
"epoch": 0.36,
"eval_loss": 1.51908278465271,
"eval_runtime": 729.5519,
"eval_samples_per_second": 27.414,
"eval_steps_per_second": 0.429,
"step": 49200
},
{
"epoch": 0.36,
"learning_rate": 4.6543087625384804e-05,
"loss": 1.5849,
"step": 49600
},
{
"epoch": 0.36,
"eval_loss": 1.5164732933044434,
"eval_runtime": 852.7453,
"eval_samples_per_second": 23.454,
"eval_steps_per_second": 0.367,
"step": 49600
},
{
"epoch": 0.36,
"learning_rate": 4.64602228234524e-05,
"loss": 1.5828,
"step": 50000
},
{
"epoch": 0.36,
"eval_loss": 1.5202162265777588,
"eval_runtime": 157.0148,
"eval_samples_per_second": 127.377,
"eval_steps_per_second": 1.993,
"step": 50000
},
{
"epoch": 0.37,
"learning_rate": 4.637735802151999e-05,
"loss": 1.5816,
"step": 50400
},
{
"epoch": 0.37,
"eval_loss": 1.5152881145477295,
"eval_runtime": 211.7581,
"eval_samples_per_second": 94.447,
"eval_steps_per_second": 1.478,
"step": 50400
},
{
"epoch": 0.37,
"learning_rate": 4.629449321958758e-05,
"loss": 1.5809,
"step": 50800
},
{
"epoch": 0.37,
"eval_loss": 1.5141160488128662,
"eval_runtime": 164.3824,
"eval_samples_per_second": 121.668,
"eval_steps_per_second": 1.904,
"step": 50800
},
{
"epoch": 0.37,
"learning_rate": 4.621162841765518e-05,
"loss": 1.5771,
"step": 51200
},
{
"epoch": 0.37,
"eval_loss": 1.5138821601867676,
"eval_runtime": 462.6007,
"eval_samples_per_second": 43.234,
"eval_steps_per_second": 0.677,
"step": 51200
},
{
"epoch": 0.38,
"learning_rate": 4.612876361572277e-05,
"loss": 1.5775,
"step": 51600
},
{
"epoch": 0.38,
"eval_loss": 1.509470820426941,
"eval_runtime": 775.3154,
"eval_samples_per_second": 25.796,
"eval_steps_per_second": 0.404,
"step": 51600
},
{
"epoch": 0.38,
"learning_rate": 4.604589881379036e-05,
"loss": 1.5767,
"step": 52000
},
{
"epoch": 0.38,
"eval_loss": 1.5092774629592896,
"eval_runtime": 186.3503,
"eval_samples_per_second": 107.325,
"eval_steps_per_second": 1.68,
"step": 52000
},
{
"epoch": 0.38,
"learning_rate": 4.596303401185795e-05,
"loss": 1.5757,
"step": 52400
},
{
"epoch": 0.38,
"eval_loss": 1.5057079792022705,
"eval_runtime": 159.2417,
"eval_samples_per_second": 125.595,
"eval_steps_per_second": 1.966,
"step": 52400
},
{
"epoch": 0.39,
"learning_rate": 4.588016920992555e-05,
"loss": 1.5752,
"step": 52800
},
{
"epoch": 0.39,
"eval_loss": 1.5144433975219727,
"eval_runtime": 159.6541,
"eval_samples_per_second": 125.271,
"eval_steps_per_second": 1.96,
"step": 52800
},
{
"epoch": 0.39,
"learning_rate": 4.579730440799314e-05,
"loss": 1.5752,
"step": 53200
},
{
"epoch": 0.39,
"eval_loss": 1.506042242050171,
"eval_runtime": 406.841,
"eval_samples_per_second": 49.159,
"eval_steps_per_second": 0.769,
"step": 53200
},
{
"epoch": 0.39,
"learning_rate": 4.571443960606073e-05,
"loss": 1.5759,
"step": 53600
},
{
"epoch": 0.39,
"eval_loss": 1.511734962463379,
"eval_runtime": 956.1026,
"eval_samples_per_second": 20.918,
"eval_steps_per_second": 0.327,
"step": 53600
},
{
"epoch": 0.39,
"learning_rate": 4.5631574804128324e-05,
"loss": 1.5749,
"step": 54000
},
{
"epoch": 0.39,
"eval_loss": 1.5020769834518433,
"eval_runtime": 261.4557,
"eval_samples_per_second": 76.495,
"eval_steps_per_second": 1.197,
"step": 54000
},
{
"epoch": 0.4,
"learning_rate": 4.554871000219592e-05,
"loss": 1.5732,
"step": 54400
},
{
"epoch": 0.4,
"eval_loss": 1.536434531211853,
"eval_runtime": 200.7371,
"eval_samples_per_second": 99.633,
"eval_steps_per_second": 1.559,
"step": 54400
},
{
"epoch": 0.4,
"learning_rate": 4.5465845200263516e-05,
"loss": 1.5728,
"step": 54800
},
{
"epoch": 0.4,
"eval_loss": 1.5178890228271484,
"eval_runtime": 188.6203,
"eval_samples_per_second": 106.033,
"eval_steps_per_second": 1.659,
"step": 54800
},
{
"epoch": 0.4,
"learning_rate": 4.53829803983311e-05,
"loss": 1.5742,
"step": 55200
},
{
"epoch": 0.4,
"eval_loss": 1.503977656364441,
"eval_runtime": 232.3531,
"eval_samples_per_second": 86.076,
"eval_steps_per_second": 1.347,
"step": 55200
},
{
"epoch": 0.41,
"learning_rate": 4.5300115596398695e-05,
"loss": 1.5701,
"step": 55600
},
{
"epoch": 0.41,
"eval_loss": 1.5044046640396118,
"eval_runtime": 248.7056,
"eval_samples_per_second": 80.416,
"eval_steps_per_second": 1.259,
"step": 55600
},
{
"epoch": 0.41,
"learning_rate": 4.5217250794466294e-05,
"loss": 1.569,
"step": 56000
},
{
"epoch": 0.41,
"eval_loss": 1.5002530813217163,
"eval_runtime": 155.8533,
"eval_samples_per_second": 128.326,
"eval_steps_per_second": 2.008,
"step": 56000
},
{
"epoch": 0.41,
"learning_rate": 4.513438599253389e-05,
"loss": 1.5671,
"step": 56400
},
{
"epoch": 0.41,
"eval_loss": 1.5035356283187866,
"eval_runtime": 591.7888,
"eval_samples_per_second": 33.796,
"eval_steps_per_second": 0.529,
"step": 56400
},
{
"epoch": 0.41,
"learning_rate": 4.505152119060147e-05,
"loss": 1.5663,
"step": 56800
},
{
"epoch": 0.41,
"eval_loss": 1.5083376169204712,
"eval_runtime": 235.8323,
"eval_samples_per_second": 84.806,
"eval_steps_per_second": 1.327,
"step": 56800
},
{
"epoch": 0.0,
"learning_rate": 4.4968656388669065e-05,
"loss": 1.566,
"step": 57200
},
{
"epoch": 0.0,
"eval_loss": 1.500092625617981,
"eval_runtime": 122.1502,
"eval_samples_per_second": 163.733,
"eval_steps_per_second": 2.562,
"step": 57200
},
{
"epoch": 0.01,
"learning_rate": 4.4885791586736665e-05,
"loss": 1.5667,
"step": 57600
},
{
"epoch": 0.01,
"eval_loss": 1.4975863695144653,
"eval_runtime": 122.6954,
"eval_samples_per_second": 163.005,
"eval_steps_per_second": 2.551,
"step": 57600
},
{
"epoch": 0.01,
"learning_rate": 4.480292678480426e-05,
"loss": 1.5657,
"step": 58000
},
{
"epoch": 0.01,
"eval_loss": 1.4931672811508179,
"eval_runtime": 123.0954,
"eval_samples_per_second": 162.476,
"eval_steps_per_second": 2.543,
"step": 58000
},
{
"epoch": 0.01,
"learning_rate": 4.472006198287185e-05,
"loss": 1.5642,
"step": 58400
},
{
"epoch": 0.01,
"eval_loss": 1.4972225427627563,
"eval_runtime": 123.364,
"eval_samples_per_second": 162.122,
"eval_steps_per_second": 2.537,
"step": 58400
},
{
"epoch": 0.01,
"learning_rate": 4.4637197180939436e-05,
"loss": 1.5622,
"step": 58800
},
{
"epoch": 0.01,
"eval_loss": 1.49701988697052,
"eval_runtime": 123.4986,
"eval_samples_per_second": 161.945,
"eval_steps_per_second": 2.534,
"step": 58800
},
{
"epoch": 0.02,
"learning_rate": 4.4554332379007036e-05,
"loss": 1.5607,
"step": 59200
},
{
"epoch": 0.02,
"eval_loss": 1.4874858856201172,
"eval_runtime": 123.9331,
"eval_samples_per_second": 161.377,
"eval_steps_per_second": 2.526,
"step": 59200
},
{
"epoch": 0.02,
"learning_rate": 4.447146757707463e-05,
"loss": 1.5607,
"step": 59600
},
{
"epoch": 0.02,
"eval_loss": 1.4898470640182495,
"eval_runtime": 120.8464,
"eval_samples_per_second": 165.499,
"eval_steps_per_second": 2.59,
"step": 59600
},
{
"epoch": 0.02,
"learning_rate": 4.438860277514222e-05,
"loss": 1.5586,
"step": 60000
},
{
"epoch": 0.02,
"eval_loss": 1.494850754737854,
"eval_runtime": 124.1267,
"eval_samples_per_second": 161.126,
"eval_steps_per_second": 2.522,
"step": 60000
},
{
"epoch": 0.03,
"learning_rate": 4.430573797320981e-05,
"loss": 1.5582,
"step": 60400
},
{
"epoch": 0.03,
"eval_loss": 1.4933040142059326,
"eval_runtime": 122.7367,
"eval_samples_per_second": 162.95,
"eval_steps_per_second": 2.55,
"step": 60400
},
{
"epoch": 0.03,
"learning_rate": 4.4222873171277407e-05,
"loss": 1.5579,
"step": 60800
},
{
"epoch": 0.03,
"eval_loss": 1.4987492561340332,
"eval_runtime": 123.7785,
"eval_samples_per_second": 161.579,
"eval_steps_per_second": 2.529,
"step": 60800
},
{
"epoch": 0.03,
"learning_rate": 4.4140008369345e-05,
"loss": 1.5577,
"step": 61200
},
{
"epoch": 0.03,
"eval_loss": 1.489683747291565,
"eval_runtime": 121.2724,
"eval_samples_per_second": 164.918,
"eval_steps_per_second": 2.581,
"step": 61200
},
{
"epoch": 0.04,
"learning_rate": 4.405714356741259e-05,
"loss": 1.5574,
"step": 61600
},
{
"epoch": 0.04,
"eval_loss": 1.4959229230880737,
"eval_runtime": 121.5786,
"eval_samples_per_second": 164.503,
"eval_steps_per_second": 2.574,
"step": 61600
},
{
"epoch": 0.04,
"learning_rate": 4.397427876548018e-05,
"loss": 1.5551,
"step": 62000
},
{
"epoch": 0.04,
"eval_loss": 1.496133804321289,
"eval_runtime": 121.3994,
"eval_samples_per_second": 164.746,
"eval_steps_per_second": 2.578,
"step": 62000
},
{
"epoch": 0.04,
"learning_rate": 4.389141396354778e-05,
"loss": 1.5549,
"step": 62400
},
{
"epoch": 0.04,
"eval_loss": 1.4901236295700073,
"eval_runtime": 122.3573,
"eval_samples_per_second": 163.456,
"eval_steps_per_second": 2.558,
"step": 62400
},
{
"epoch": 0.0,
"learning_rate": 4.380854916161537e-05,
"loss": 1.5535,
"step": 62800
},
{
"epoch": 0.0,
"eval_loss": 1.4875001907348633,
"eval_runtime": 105.5644,
"eval_samples_per_second": 189.458,
"eval_steps_per_second": 2.965,
"step": 62800
},
{
"epoch": 0.01,
"learning_rate": 4.372568435968296e-05,
"loss": 1.5542,
"step": 63200
},
{
"epoch": 0.01,
"eval_loss": 1.4935693740844727,
"eval_runtime": 112.1109,
"eval_samples_per_second": 178.395,
"eval_steps_per_second": 2.792,
"step": 63200
},
{
"epoch": 0.01,
"learning_rate": 4.3642819557750556e-05,
"loss": 1.5512,
"step": 63600
},
{
"epoch": 0.01,
"eval_loss": 1.4915146827697754,
"eval_runtime": 109.7302,
"eval_samples_per_second": 182.265,
"eval_steps_per_second": 2.852,
"step": 63600
},
{
"epoch": 0.01,
"learning_rate": 4.355995475581814e-05,
"loss": 1.5515,
"step": 64000
},
{
"epoch": 0.01,
"eval_loss": 1.4876190423965454,
"eval_runtime": 111.5094,
"eval_samples_per_second": 179.357,
"eval_steps_per_second": 2.807,
"step": 64000
},
{
"epoch": 0.01,
"learning_rate": 4.347708995388574e-05,
"loss": 1.549,
"step": 64400
},
{
"epoch": 0.01,
"eval_loss": 1.4836992025375366,
"eval_runtime": 112.3893,
"eval_samples_per_second": 177.953,
"eval_steps_per_second": 2.785,
"step": 64400
},
{
"epoch": 0.02,
"learning_rate": 4.3394225151953334e-05,
"loss": 1.5479,
"step": 64800
},
{
"epoch": 0.02,
"eval_loss": 1.4897727966308594,
"eval_runtime": 111.5923,
"eval_samples_per_second": 179.224,
"eval_steps_per_second": 2.805,
"step": 64800
},
{
"epoch": 0.02,
"learning_rate": 4.3311360350020926e-05,
"loss": 1.5492,
"step": 65200
},
{
"epoch": 0.02,
"eval_loss": 1.484372615814209,
"eval_runtime": 111.9819,
"eval_samples_per_second": 178.6,
"eval_steps_per_second": 2.795,
"step": 65200
},
{
"epoch": 0.02,
"learning_rate": 4.322849554808851e-05,
"loss": 1.5468,
"step": 65600
},
{
"epoch": 0.02,
"eval_loss": 1.4826014041900635,
"eval_runtime": 112.199,
"eval_samples_per_second": 178.255,
"eval_steps_per_second": 2.79,
"step": 65600
},
{
"epoch": 0.03,
"learning_rate": 4.314563074615611e-05,
"loss": 1.5476,
"step": 66000
},
{
"epoch": 0.03,
"eval_loss": 1.4857112169265747,
"eval_runtime": 112.9874,
"eval_samples_per_second": 177.011,
"eval_steps_per_second": 2.77,
"step": 66000
},
{
"epoch": 0.03,
"learning_rate": 4.3062765944223705e-05,
"loss": 1.5473,
"step": 66400
},
{
"epoch": 0.03,
"eval_loss": 1.487414836883545,
"eval_runtime": 114.6833,
"eval_samples_per_second": 174.393,
"eval_steps_per_second": 2.729,
"step": 66400
},
{
"epoch": 0.03,
"learning_rate": 4.29799011422913e-05,
"loss": 1.5487,
"step": 66800
},
{
"epoch": 0.03,
"eval_loss": 1.4894484281539917,
"eval_runtime": 109.3915,
"eval_samples_per_second": 182.83,
"eval_steps_per_second": 2.861,
"step": 66800
},
{
"epoch": 0.04,
"learning_rate": 4.289703634035889e-05,
"loss": 1.5476,
"step": 67200
},
{
"epoch": 0.04,
"eval_loss": 1.4839718341827393,
"eval_runtime": 114.1702,
"eval_samples_per_second": 175.177,
"eval_steps_per_second": 2.742,
"step": 67200
},
{
"epoch": 0.04,
"learning_rate": 4.281417153842648e-05,
"loss": 1.5459,
"step": 67600
},
{
"epoch": 0.04,
"eval_loss": 1.4786709547042847,
"eval_runtime": 112.7446,
"eval_samples_per_second": 177.392,
"eval_steps_per_second": 2.776,
"step": 67600
},
{
"epoch": 0.04,
"learning_rate": 4.2731306736494075e-05,
"loss": 1.5431,
"step": 68000
},
{
"epoch": 0.04,
"eval_loss": 1.48154616355896,
"eval_runtime": 113.692,
"eval_samples_per_second": 175.914,
"eval_steps_per_second": 2.753,
"step": 68000
},
{
"epoch": 0.04,
"learning_rate": 4.264844193456167e-05,
"loss": 1.544,
"step": 68400
},
{
"epoch": 0.04,
"eval_loss": 1.4801952838897705,
"eval_runtime": 111.6945,
"eval_samples_per_second": 179.06,
"eval_steps_per_second": 2.802,
"step": 68400
},
{
"epoch": 0.05,
"learning_rate": 4.256557713262926e-05,
"loss": 1.5436,
"step": 68800
},
{
"epoch": 0.05,
"eval_loss": 1.478300929069519,
"eval_runtime": 114.4135,
"eval_samples_per_second": 174.805,
"eval_steps_per_second": 2.736,
"step": 68800
},
{
"epoch": 0.05,
"learning_rate": 4.2482712330696853e-05,
"loss": 1.5411,
"step": 69200
},
{
"epoch": 0.05,
"eval_loss": 1.484221339225769,
"eval_runtime": 114.3599,
"eval_samples_per_second": 174.886,
"eval_steps_per_second": 2.737,
"step": 69200
},
{
"epoch": 0.05,
"learning_rate": 4.2399847528764446e-05,
"loss": 1.5446,
"step": 69600
},
{
"epoch": 0.05,
"eval_loss": 1.4805113077163696,
"eval_runtime": 115.5225,
"eval_samples_per_second": 173.126,
"eval_steps_per_second": 2.709,
"step": 69600
},
{
"epoch": 0.06,
"learning_rate": 4.231698272683204e-05,
"loss": 1.5441,
"step": 70000
},
{
"epoch": 0.06,
"eval_loss": 1.4875138998031616,
"eval_runtime": 114.7419,
"eval_samples_per_second": 174.304,
"eval_steps_per_second": 2.728,
"step": 70000
},
{
"epoch": 0.06,
"learning_rate": 4.223411792489963e-05,
"loss": 1.5446,
"step": 70400
},
{
"epoch": 0.06,
"eval_loss": 1.4801757335662842,
"eval_runtime": 119.231,
"eval_samples_per_second": 167.742,
"eval_steps_per_second": 2.625,
"step": 70400
},
{
"epoch": 0.06,
"learning_rate": 4.2151253122967224e-05,
"loss": 1.5443,
"step": 70800
},
{
"epoch": 0.06,
"eval_loss": 1.4772462844848633,
"eval_runtime": 115.591,
"eval_samples_per_second": 173.024,
"eval_steps_per_second": 2.708,
"step": 70800
},
{
"epoch": 0.06,
"learning_rate": 4.206838832103482e-05,
"loss": 1.5411,
"step": 71200
},
{
"epoch": 0.06,
"eval_loss": 1.4795691967010498,
"eval_runtime": 118.6003,
"eval_samples_per_second": 168.634,
"eval_steps_per_second": 2.639,
"step": 71200
},
{
"epoch": 0.07,
"learning_rate": 4.198552351910241e-05,
"loss": 1.5413,
"step": 71600
},
{
"epoch": 0.07,
"eval_loss": 1.4804329872131348,
"eval_runtime": 119.1285,
"eval_samples_per_second": 167.886,
"eval_steps_per_second": 2.627,
"step": 71600
},
{
"epoch": 0.07,
"learning_rate": 4.190265871717e-05,
"loss": 1.5415,
"step": 72000
},
{
"epoch": 0.07,
"eval_loss": 1.4793719053268433,
"eval_runtime": 117.4287,
"eval_samples_per_second": 170.316,
"eval_steps_per_second": 2.665,
"step": 72000
},
{
"epoch": 0.07,
"learning_rate": 4.1819793915237595e-05,
"loss": 1.5414,
"step": 72400
},
{
"epoch": 0.07,
"eval_loss": 1.4818830490112305,
"eval_runtime": 119.5626,
"eval_samples_per_second": 167.276,
"eval_steps_per_second": 2.618,
"step": 72400
},
{
"epoch": 0.08,
"learning_rate": 4.173692911330519e-05,
"loss": 1.5423,
"step": 72800
},
{
"epoch": 0.08,
"eval_loss": 1.4779819250106812,
"eval_runtime": 118.4406,
"eval_samples_per_second": 168.861,
"eval_steps_per_second": 2.643,
"step": 72800
},
{
"epoch": 0.08,
"learning_rate": 4.165406431137278e-05,
"loss": 1.5386,
"step": 73200
},
{
"epoch": 0.08,
"eval_loss": 1.4759750366210938,
"eval_runtime": 115.7547,
"eval_samples_per_second": 172.779,
"eval_steps_per_second": 2.704,
"step": 73200
},
{
"epoch": 0.08,
"learning_rate": 4.157119950944037e-05,
"loss": 1.5386,
"step": 73600
},
{
"epoch": 0.08,
"eval_loss": 1.4726980924606323,
"eval_runtime": 119.2625,
"eval_samples_per_second": 167.697,
"eval_steps_per_second": 2.624,
"step": 73600
},
{
"epoch": 0.08,
"learning_rate": 4.148833470750797e-05,
"loss": 1.5375,
"step": 74000
},
{
"epoch": 0.08,
"eval_loss": 1.4739803075790405,
"eval_runtime": 118.9815,
"eval_samples_per_second": 168.093,
"eval_steps_per_second": 2.631,
"step": 74000
},
{
"epoch": 0.09,
"learning_rate": 4.140546990557556e-05,
"loss": 1.5376,
"step": 74400
},
{
"epoch": 0.09,
"eval_loss": 1.4678592681884766,
"eval_runtime": 117.6181,
"eval_samples_per_second": 170.042,
"eval_steps_per_second": 2.661,
"step": 74400
},
{
"epoch": 0.09,
"learning_rate": 4.132260510364315e-05,
"loss": 1.5365,
"step": 74800
},
{
"epoch": 0.09,
"eval_loss": 1.4694132804870605,
"eval_runtime": 118.0975,
"eval_samples_per_second": 169.352,
"eval_steps_per_second": 2.65,
"step": 74800
},
{
"epoch": 0.09,
"learning_rate": 4.1239740301710744e-05,
"loss": 1.5356,
"step": 75200
},
{
"epoch": 0.09,
"eval_loss": 1.4689810276031494,
"eval_runtime": 119.48,
"eval_samples_per_second": 167.392,
"eval_steps_per_second": 2.62,
"step": 75200
},
{
"epoch": 0.1,
"learning_rate": 4.1156875499778344e-05,
"loss": 1.5353,
"step": 75600
},
{
"epoch": 0.1,
"eval_loss": 1.4731059074401855,
"eval_runtime": 117.5581,
"eval_samples_per_second": 170.129,
"eval_steps_per_second": 2.663,
"step": 75600
},
{
"epoch": 0.1,
"learning_rate": 4.107401069784593e-05,
"loss": 1.5348,
"step": 76000
},
{
"epoch": 0.1,
"eval_loss": 1.466073751449585,
"eval_runtime": 118.6436,
"eval_samples_per_second": 168.572,
"eval_steps_per_second": 2.638,
"step": 76000
},
{
"epoch": 0.1,
"learning_rate": 4.099114589591352e-05,
"loss": 1.5336,
"step": 76400
},
{
"epoch": 0.1,
"eval_loss": 1.4694697856903076,
"eval_runtime": 117.8705,
"eval_samples_per_second": 169.678,
"eval_steps_per_second": 2.655,
"step": 76400
},
{
"epoch": 0.11,
"learning_rate": 4.0908281093981115e-05,
"loss": 1.5331,
"step": 76800
},
{
"epoch": 0.11,
"eval_loss": 1.470395803451538,
"eval_runtime": 119.1567,
"eval_samples_per_second": 167.846,
"eval_steps_per_second": 2.627,
"step": 76800
},
{
"epoch": 0.11,
"learning_rate": 4.0825416292048714e-05,
"loss": 1.5336,
"step": 77200
},
{
"epoch": 0.11,
"eval_loss": 1.4707101583480835,
"eval_runtime": 217.6239,
"eval_samples_per_second": 91.902,
"eval_steps_per_second": 1.438,
"step": 77200
},
{
"epoch": 0.0,
"learning_rate": 4.074255149011631e-05,
"loss": 1.5303,
"step": 77600
},
{
"epoch": 0.0,
"eval_loss": 1.4677211046218872,
"eval_runtime": 111.0323,
"eval_samples_per_second": 180.128,
"eval_steps_per_second": 2.819,
"step": 77600
},
{
"epoch": 0.01,
"learning_rate": 4.065968668818389e-05,
"loss": 1.5302,
"step": 78000
},
{
"epoch": 0.01,
"eval_loss": 1.4664534330368042,
"eval_runtime": 111.6113,
"eval_samples_per_second": 179.193,
"eval_steps_per_second": 2.804,
"step": 78000
},
{
"epoch": 0.01,
"learning_rate": 4.0576821886251486e-05,
"loss": 1.5288,
"step": 78400
},
{
"epoch": 0.01,
"eval_loss": 1.4657336473464966,
"eval_runtime": 109.321,
"eval_samples_per_second": 182.947,
"eval_steps_per_second": 2.863,
"step": 78400
},
{
"epoch": 0.01,
"learning_rate": 4.049395708431908e-05,
"loss": 1.5284,
"step": 78800
},
{
"epoch": 0.01,
"eval_loss": 1.4579006433486938,
"eval_runtime": 109.1836,
"eval_samples_per_second": 183.178,
"eval_steps_per_second": 2.867,
"step": 78800
},
{
"epoch": 0.01,
"learning_rate": 4.041109228238668e-05,
"loss": 1.5277,
"step": 79200
},
{
"epoch": 0.01,
"eval_loss": 1.4642364978790283,
"eval_runtime": 108.8787,
"eval_samples_per_second": 183.691,
"eval_steps_per_second": 2.875,
"step": 79200
},
{
"epoch": 0.02,
"learning_rate": 4.0328227480454264e-05,
"loss": 1.5254,
"step": 79600
},
{
"epoch": 0.02,
"eval_loss": 1.4699641466140747,
"eval_runtime": 110.6507,
"eval_samples_per_second": 180.749,
"eval_steps_per_second": 2.829,
"step": 79600
},
{
"epoch": 0.02,
"learning_rate": 4.024536267852186e-05,
"loss": 1.526,
"step": 80000
},
{
"epoch": 0.02,
"eval_loss": 1.4663636684417725,
"eval_runtime": 108.472,
"eval_samples_per_second": 184.379,
"eval_steps_per_second": 2.886,
"step": 80000
},
{
"epoch": 0.02,
"learning_rate": 4.016249787658945e-05,
"loss": 1.5242,
"step": 80400
},
{
"epoch": 0.02,
"eval_loss": 1.4651668071746826,
"eval_runtime": 111.7826,
"eval_samples_per_second": 178.919,
"eval_steps_per_second": 2.8,
"step": 80400
},
{
"epoch": 0.03,
"learning_rate": 4.007963307465705e-05,
"loss": 1.523,
"step": 80800
},
{
"epoch": 0.03,
"eval_loss": 1.4634953737258911,
"eval_runtime": 110.7712,
"eval_samples_per_second": 180.552,
"eval_steps_per_second": 2.826,
"step": 80800
},
{
"epoch": 0.0,
"learning_rate": 3.9109053272894466e-05,
"loss": 1.524,
"step": 81200
},
{
"epoch": 0.0,
"eval_loss": 2.39056134223938,
"eval_runtime": 20.0958,
"eval_samples_per_second": 175.112,
"eval_steps_per_second": 5.474,
"step": 81200
},
{
"epoch": 0.01,
"learning_rate": 3.902803602027052e-05,
"loss": 1.5242,
"step": 81600
},
{
"epoch": 0.01,
"eval_loss": 2.3666460514068604,
"eval_runtime": 18.7692,
"eval_samples_per_second": 187.488,
"eval_steps_per_second": 5.861,
"step": 81600
},
{
"epoch": 0.01,
"learning_rate": 3.894701876764657e-05,
"loss": 1.5237,
"step": 82000
},
{
"epoch": 0.01,
"eval_loss": 2.385453939437866,
"eval_runtime": 19.4223,
"eval_samples_per_second": 181.184,
"eval_steps_per_second": 5.664,
"step": 82000
},
{
"epoch": 0.01,
"learning_rate": 3.886600151502263e-05,
"loss": 1.5226,
"step": 82400
},
{
"epoch": 0.01,
"eval_loss": 2.393972396850586,
"eval_runtime": 19.0938,
"eval_samples_per_second": 184.301,
"eval_steps_per_second": 5.761,
"step": 82400
},
{
"epoch": 0.01,
"learning_rate": 3.8784984262398676e-05,
"loss": 1.5218,
"step": 82800
},
{
"epoch": 0.01,
"eval_loss": 2.456040620803833,
"eval_runtime": 18.9643,
"eval_samples_per_second": 185.56,
"eval_steps_per_second": 5.8,
"step": 82800
},
{
"epoch": 0.02,
"learning_rate": 3.870396700977473e-05,
"loss": 1.5215,
"step": 83200
},
{
"epoch": 0.02,
"eval_loss": 2.395426034927368,
"eval_runtime": 19.2189,
"eval_samples_per_second": 183.101,
"eval_steps_per_second": 5.724,
"step": 83200
},
{
"epoch": 0.02,
"learning_rate": 3.862294975715079e-05,
"loss": 1.521,
"step": 83600
},
{
"epoch": 0.02,
"eval_loss": 2.4465413093566895,
"eval_runtime": 18.8719,
"eval_samples_per_second": 186.468,
"eval_steps_per_second": 5.829,
"step": 83600
},
{
"epoch": 0.02,
"learning_rate": 3.854193250452684e-05,
"loss": 1.5209,
"step": 84000
},
{
"epoch": 0.02,
"eval_loss": 2.396277904510498,
"eval_runtime": 18.9346,
"eval_samples_per_second": 185.85,
"eval_steps_per_second": 5.809,
"step": 84000
},
{
"epoch": 0.0,
"learning_rate": 3.846091525190289e-05,
"loss": 1.5188,
"step": 84400
},
{
"epoch": 0.0,
"eval_loss": 2.4277689456939697,
"eval_runtime": 20.9367,
"eval_samples_per_second": 168.078,
"eval_steps_per_second": 5.254,
"step": 84400
},
{
"epoch": 0.01,
"learning_rate": 3.837989799927895e-05,
"loss": 1.5177,
"step": 84800
},
{
"epoch": 0.01,
"eval_loss": 2.378986120223999,
"eval_runtime": 20.2239,
"eval_samples_per_second": 174.002,
"eval_steps_per_second": 5.439,
"step": 84800
},
{
"epoch": 0.01,
"learning_rate": 3.8298880746655e-05,
"loss": 1.5184,
"step": 85200
},
{
"epoch": 0.01,
"eval_loss": 2.395463705062866,
"eval_runtime": 19.3659,
"eval_samples_per_second": 181.711,
"eval_steps_per_second": 5.68,
"step": 85200
},
{
"epoch": 0.01,
"learning_rate": 3.8217863494031056e-05,
"loss": 1.5166,
"step": 85600
},
{
"epoch": 0.01,
"eval_loss": 2.421231269836426,
"eval_runtime": 20.4856,
"eval_samples_per_second": 171.779,
"eval_steps_per_second": 5.37,
"step": 85600
},
{
"epoch": 0.01,
"learning_rate": 3.813684624140711e-05,
"loss": 1.5158,
"step": 86000
},
{
"epoch": 0.01,
"eval_loss": 2.4270944595336914,
"eval_runtime": 19.2825,
"eval_samples_per_second": 182.497,
"eval_steps_per_second": 5.705,
"step": 86000
},
{
"epoch": 0.02,
"learning_rate": 3.8055828988783165e-05,
"loss": 1.5157,
"step": 86400
},
{
"epoch": 0.02,
"eval_loss": 2.4186675548553467,
"eval_runtime": 19.3721,
"eval_samples_per_second": 181.653,
"eval_steps_per_second": 5.678,
"step": 86400
},
{
"epoch": 0.02,
"learning_rate": 3.797481173615922e-05,
"loss": 1.5156,
"step": 86800
},
{
"epoch": 0.02,
"eval_loss": 2.4075629711151123,
"eval_runtime": 19.2183,
"eval_samples_per_second": 183.107,
"eval_steps_per_second": 5.724,
"step": 86800
},
{
"epoch": 0.02,
"learning_rate": 3.789379448353527e-05,
"loss": 1.5147,
"step": 87200
},
{
"epoch": 0.02,
"eval_loss": 2.471975803375244,
"eval_runtime": 20.3949,
"eval_samples_per_second": 172.543,
"eval_steps_per_second": 5.394,
"step": 87200
},
{
"epoch": 0.0,
"learning_rate": 3.781277723091132e-05,
"loss": 1.5127,
"step": 87600
},
{
"epoch": 0.0,
"eval_loss": 2.3385655879974365,
"eval_runtime": 20.157,
"eval_samples_per_second": 174.58,
"eval_steps_per_second": 5.457,
"step": 87600
},
{
"epoch": 0.01,
"learning_rate": 3.773175997828738e-05,
"loss": 1.5129,
"step": 88000
},
{
"epoch": 0.01,
"eval_loss": 2.381673574447632,
"eval_runtime": 20.5337,
"eval_samples_per_second": 171.377,
"eval_steps_per_second": 5.357,
"step": 88000
},
{
"epoch": 0.01,
"learning_rate": 3.765074272566343e-05,
"loss": 1.5123,
"step": 88400
},
{
"epoch": 0.01,
"eval_loss": 2.35689377784729,
"eval_runtime": 20.7328,
"eval_samples_per_second": 169.731,
"eval_steps_per_second": 5.306,
"step": 88400
},
{
"epoch": 0.01,
"learning_rate": 3.7569725473039484e-05,
"loss": 1.5121,
"step": 88800
},
{
"epoch": 0.01,
"eval_loss": 2.3643054962158203,
"eval_runtime": 19.1222,
"eval_samples_per_second": 184.026,
"eval_steps_per_second": 5.752,
"step": 88800
},
{
"epoch": 0.01,
"learning_rate": 3.7488708220415545e-05,
"loss": 1.5118,
"step": 89200
},
{
"epoch": 0.01,
"eval_loss": 2.334357261657715,
"eval_runtime": 19.3908,
"eval_samples_per_second": 181.478,
"eval_steps_per_second": 5.673,
"step": 89200
},
{
"epoch": 0.02,
"learning_rate": 3.740769096779159e-05,
"loss": 1.5102,
"step": 89600
},
{
"epoch": 0.02,
"eval_loss": 2.401927947998047,
"eval_runtime": 20.1285,
"eval_samples_per_second": 174.827,
"eval_steps_per_second": 5.465,
"step": 89600
},
{
"epoch": 0.02,
"learning_rate": 3.7326673715167647e-05,
"loss": 1.5097,
"step": 90000
},
{
"epoch": 0.02,
"eval_loss": 2.4241695404052734,
"eval_runtime": 20.5668,
"eval_samples_per_second": 171.101,
"eval_steps_per_second": 5.348,
"step": 90000
},
{
"epoch": 0.02,
"learning_rate": 3.72456564625437e-05,
"loss": 1.5103,
"step": 90400
},
{
"epoch": 0.02,
"eval_loss": 2.393686532974243,
"eval_runtime": 19.2168,
"eval_samples_per_second": 183.121,
"eval_steps_per_second": 5.724,
"step": 90400
},
{
"epoch": 0.03,
"learning_rate": 3.7164639209919755e-05,
"loss": 1.5112,
"step": 90800
},
{
"epoch": 0.03,
"eval_loss": 2.3694939613342285,
"eval_runtime": 20.1373,
"eval_samples_per_second": 174.751,
"eval_steps_per_second": 5.463,
"step": 90800
},
{
"epoch": 0.03,
"learning_rate": 3.70836219572958e-05,
"loss": 1.5108,
"step": 91200
},
{
"epoch": 0.03,
"eval_loss": 2.345815420150757,
"eval_runtime": 20.1959,
"eval_samples_per_second": 174.243,
"eval_steps_per_second": 5.447,
"step": 91200
},
{
"epoch": 0.03,
"learning_rate": 3.7002604704671864e-05,
"loss": 1.511,
"step": 91600
},
{
"epoch": 0.03,
"eval_loss": 2.3629839420318604,
"eval_runtime": 19.3875,
"eval_samples_per_second": 181.508,
"eval_steps_per_second": 5.674,
"step": 91600
},
{
"epoch": 0.03,
"learning_rate": 3.692158745204792e-05,
"loss": 1.5089,
"step": 92000
},
{
"epoch": 0.03,
"eval_loss": 2.385115385055542,
"eval_runtime": 20.4471,
"eval_samples_per_second": 172.103,
"eval_steps_per_second": 5.38,
"step": 92000
},
{
"epoch": 0.04,
"learning_rate": 3.6840570199423966e-05,
"loss": 1.5095,
"step": 92400
},
{
"epoch": 0.04,
"eval_loss": 2.319392442703247,
"eval_runtime": 19.8755,
"eval_samples_per_second": 177.052,
"eval_steps_per_second": 5.534,
"step": 92400
},
{
"epoch": 0.04,
"learning_rate": 3.6759552946800027e-05,
"loss": 1.5094,
"step": 92800
},
{
"epoch": 0.04,
"eval_loss": 2.3495166301727295,
"eval_runtime": 19.4501,
"eval_samples_per_second": 180.925,
"eval_steps_per_second": 5.656,
"step": 92800
},
{
"epoch": 0.04,
"learning_rate": 3.6678535694176074e-05,
"loss": 1.5101,
"step": 93200
},
{
"epoch": 0.04,
"eval_loss": 2.365245819091797,
"eval_runtime": 19.578,
"eval_samples_per_second": 179.743,
"eval_steps_per_second": 5.619,
"step": 93200
},
{
"epoch": 0.05,
"learning_rate": 3.659751844155213e-05,
"loss": 1.5089,
"step": 93600
},
{
"epoch": 0.05,
"eval_loss": 2.371981143951416,
"eval_runtime": 19.798,
"eval_samples_per_second": 177.745,
"eval_steps_per_second": 5.556,
"step": 93600
},
{
"epoch": 0.05,
"learning_rate": 3.651650118892818e-05,
"loss": 1.509,
"step": 94000
},
{
"epoch": 0.05,
"eval_loss": 2.332063913345337,
"eval_runtime": 19.3403,
"eval_samples_per_second": 181.952,
"eval_steps_per_second": 5.688,
"step": 94000
},
{
"epoch": 0.05,
"learning_rate": 3.643548393630424e-05,
"loss": 1.5096,
"step": 94400
},
{
"epoch": 0.05,
"eval_loss": 2.404459238052368,
"eval_runtime": 19.2128,
"eval_samples_per_second": 183.159,
"eval_steps_per_second": 5.725,
"step": 94400
},
{
"epoch": 0.06,
"learning_rate": 3.635446668368029e-05,
"loss": 1.5089,
"step": 94800
},
{
"epoch": 0.06,
"eval_loss": 2.3641324043273926,
"eval_runtime": 19.4859,
"eval_samples_per_second": 180.592,
"eval_steps_per_second": 5.645,
"step": 94800
},
{
"epoch": 0.06,
"learning_rate": 3.6273449431056346e-05,
"loss": 1.5084,
"step": 95200
},
{
"epoch": 0.06,
"eval_loss": 2.3842105865478516,
"eval_runtime": 19.764,
"eval_samples_per_second": 178.051,
"eval_steps_per_second": 5.566,
"step": 95200
},
{
"epoch": 0.06,
"learning_rate": 3.61924321784324e-05,
"loss": 1.5089,
"step": 95600
},
{
"epoch": 0.06,
"eval_loss": 2.3656747341156006,
"eval_runtime": 20.585,
"eval_samples_per_second": 170.949,
"eval_steps_per_second": 5.344,
"step": 95600
},
{
"epoch": 0.06,
"learning_rate": 3.6111414925808454e-05,
"loss": 1.5097,
"step": 96000
},
{
"epoch": 0.06,
"eval_loss": 2.374446153640747,
"eval_runtime": 19.4426,
"eval_samples_per_second": 180.994,
"eval_steps_per_second": 5.658,
"step": 96000
},
{
"epoch": 0.07,
"learning_rate": 3.603039767318451e-05,
"loss": 1.5072,
"step": 96400
},
{
"epoch": 0.07,
"eval_loss": 2.385554552078247,
"eval_runtime": 20.3681,
"eval_samples_per_second": 172.771,
"eval_steps_per_second": 5.401,
"step": 96400
},
{
"epoch": 0.0,
"learning_rate": 3.5949380420560556e-05,
"loss": 1.5041,
"step": 96800
},
{
"epoch": 0.0,
"eval_loss": 2.3629019260406494,
"eval_runtime": 18.0818,
"eval_samples_per_second": 194.616,
"eval_steps_per_second": 6.083,
"step": 96800
},
{
"epoch": 0.01,
"learning_rate": 3.586836316793662e-05,
"loss": 1.5036,
"step": 97200
},
{
"epoch": 0.01,
"eval_loss": 2.3723270893096924,
"eval_runtime": 17.4087,
"eval_samples_per_second": 202.14,
"eval_steps_per_second": 6.319,
"step": 97200
},
{
"epoch": 0.01,
"learning_rate": 3.578734591531267e-05,
"loss": 1.504,
"step": 97600
},
{
"epoch": 0.01,
"eval_loss": 2.390188217163086,
"eval_runtime": 17.5005,
"eval_samples_per_second": 201.081,
"eval_steps_per_second": 6.286,
"step": 97600
},
{
"epoch": 0.01,
"learning_rate": 3.570632866268872e-05,
"loss": 1.5034,
"step": 98000
},
{
"epoch": 0.01,
"eval_loss": 2.3117146492004395,
"eval_runtime": 17.3837,
"eval_samples_per_second": 202.431,
"eval_steps_per_second": 6.328,
"step": 98000
},
{
"epoch": 0.01,
"learning_rate": 3.562531141006478e-05,
"loss": 1.5021,
"step": 98400
},
{
"epoch": 0.01,
"eval_loss": 2.3584558963775635,
"eval_runtime": 18.524,
"eval_samples_per_second": 189.97,
"eval_steps_per_second": 5.938,
"step": 98400
},
{
"epoch": 0.02,
"learning_rate": 3.554429415744083e-05,
"loss": 1.501,
"step": 98800
},
{
"epoch": 0.02,
"eval_loss": 2.2931323051452637,
"eval_runtime": 17.3901,
"eval_samples_per_second": 202.357,
"eval_steps_per_second": 6.325,
"step": 98800
},
{
"epoch": 0.02,
"learning_rate": 3.546327690481688e-05,
"loss": 1.501,
"step": 99200
},
{
"epoch": 0.02,
"eval_loss": 2.3333306312561035,
"eval_runtime": 17.4003,
"eval_samples_per_second": 202.238,
"eval_steps_per_second": 6.322,
"step": 99200
},
{
"epoch": 0.02,
"learning_rate": 3.5382259652192936e-05,
"loss": 1.4992,
"step": 99600
},
{
"epoch": 0.02,
"eval_loss": 2.342263698577881,
"eval_runtime": 17.3606,
"eval_samples_per_second": 202.701,
"eval_steps_per_second": 6.336,
"step": 99600
},
{
"epoch": 0.03,
"learning_rate": 3.530124239956899e-05,
"loss": 1.5008,
"step": 100000
},
{
"epoch": 0.03,
"eval_loss": 2.336986541748047,
"eval_runtime": 17.0114,
"eval_samples_per_second": 206.861,
"eval_steps_per_second": 6.466,
"step": 100000
},
{
"epoch": 0.03,
"learning_rate": 3.5220225146945045e-05,
"loss": 1.5002,
"step": 100400
},
{
"epoch": 0.03,
"eval_loss": 2.3513643741607666,
"eval_runtime": 17.6104,
"eval_samples_per_second": 199.825,
"eval_steps_per_second": 6.246,
"step": 100400
},
{
"epoch": 0.03,
"learning_rate": 3.51392078943211e-05,
"loss": 1.5016,
"step": 100800
},
{
"epoch": 0.03,
"eval_loss": 2.3241846561431885,
"eval_runtime": 17.6475,
"eval_samples_per_second": 199.405,
"eval_steps_per_second": 6.233,
"step": 100800
},
{
"epoch": 0.03,
"learning_rate": 3.505819064169715e-05,
"loss": 1.4988,
"step": 101200
},
{
"epoch": 0.03,
"eval_loss": 2.359363317489624,
"eval_runtime": 17.067,
"eval_samples_per_second": 206.187,
"eval_steps_per_second": 6.445,
"step": 101200
},
{
"epoch": 0.04,
"learning_rate": 3.49771733890732e-05,
"loss": 1.4992,
"step": 101600
},
{
"epoch": 0.04,
"eval_loss": 2.348477363586426,
"eval_runtime": 17.779,
"eval_samples_per_second": 197.93,
"eval_steps_per_second": 6.187,
"step": 101600
},
{
"epoch": 0.04,
"learning_rate": 3.489615613644926e-05,
"loss": 1.5003,
"step": 102000
},
{
"epoch": 0.04,
"eval_loss": 2.4026684761047363,
"eval_runtime": 17.0398,
"eval_samples_per_second": 206.516,
"eval_steps_per_second": 6.455,
"step": 102000
},
{
"epoch": 0.04,
"learning_rate": 3.481513888382531e-05,
"loss": 1.4994,
"step": 102400
},
{
"epoch": 0.04,
"eval_loss": 2.365537643432617,
"eval_runtime": 17.5601,
"eval_samples_per_second": 200.397,
"eval_steps_per_second": 6.264,
"step": 102400
},
{
"epoch": 0.05,
"learning_rate": 3.4734121631201364e-05,
"loss": 1.499,
"step": 102800
},
{
"epoch": 0.05,
"eval_loss": 2.381800651550293,
"eval_runtime": 16.8498,
"eval_samples_per_second": 208.846,
"eval_steps_per_second": 6.528,
"step": 102800
},
{
"epoch": 0.05,
"learning_rate": 3.4653104378577425e-05,
"loss": 1.4996,
"step": 103200
},
{
"epoch": 0.05,
"eval_loss": 2.401005506515503,
"eval_runtime": 16.9826,
"eval_samples_per_second": 207.212,
"eval_steps_per_second": 6.477,
"step": 103200
},
{
"epoch": 0.05,
"learning_rate": 3.457208712595347e-05,
"loss": 1.4985,
"step": 103600
},
{
"epoch": 0.05,
"eval_loss": 2.399085283279419,
"eval_runtime": 17.0074,
"eval_samples_per_second": 206.91,
"eval_steps_per_second": 6.468,
"step": 103600
},
{
"epoch": 0.06,
"learning_rate": 3.4491069873329527e-05,
"loss": 1.4984,
"step": 104000
},
{
"epoch": 0.06,
"eval_loss": 2.3661704063415527,
"eval_runtime": 16.9552,
"eval_samples_per_second": 207.547,
"eval_steps_per_second": 6.488,
"step": 104000
},
{
"epoch": 0.06,
"learning_rate": 3.441005262070558e-05,
"loss": 1.4975,
"step": 104400
},
{
"epoch": 0.06,
"eval_loss": 2.4111948013305664,
"eval_runtime": 16.975,
"eval_samples_per_second": 207.304,
"eval_steps_per_second": 6.48,
"step": 104400
},
{
"epoch": 0.06,
"learning_rate": 3.4329035368081635e-05,
"loss": 1.4987,
"step": 104800
},
{
"epoch": 0.06,
"eval_loss": 2.3549654483795166,
"eval_runtime": 17.004,
"eval_samples_per_second": 206.951,
"eval_steps_per_second": 6.469,
"step": 104800
},
{
"epoch": 0.06,
"learning_rate": 3.424801811545769e-05,
"loss": 1.4975,
"step": 105200
},
{
"epoch": 0.06,
"eval_loss": 2.3696866035461426,
"eval_runtime": 16.9769,
"eval_samples_per_second": 207.282,
"eval_steps_per_second": 6.479,
"step": 105200
},
{
"epoch": 0.07,
"learning_rate": 3.4167000862833744e-05,
"loss": 1.4978,
"step": 105600
},
{
"epoch": 0.07,
"eval_loss": 2.4747281074523926,
"eval_runtime": 17.0304,
"eval_samples_per_second": 206.63,
"eval_steps_per_second": 6.459,
"step": 105600
},
{
"epoch": 0.07,
"learning_rate": 3.40859836102098e-05,
"loss": 1.4985,
"step": 106000
},
{
"epoch": 0.07,
"eval_loss": 2.3790531158447266,
"eval_runtime": 16.9847,
"eval_samples_per_second": 207.187,
"eval_steps_per_second": 6.476,
"step": 106000
},
{
"epoch": 0.07,
"learning_rate": 3.400496635758585e-05,
"loss": 1.4961,
"step": 106400
},
{
"epoch": 0.07,
"eval_loss": 2.390604019165039,
"eval_runtime": 17.3582,
"eval_samples_per_second": 202.729,
"eval_steps_per_second": 6.337,
"step": 106400
},
{
"epoch": 0.0,
"learning_rate": 3.392394910496191e-05,
"loss": 1.4959,
"step": 106800
},
{
"epoch": 0.0,
"eval_loss": 2.415346622467041,
"eval_runtime": 20.0907,
"eval_samples_per_second": 175.156,
"eval_steps_per_second": 5.475,
"step": 106800
},
{
"epoch": 0.01,
"learning_rate": 3.3842931852337954e-05,
"loss": 1.4956,
"step": 107200
},
{
"epoch": 0.01,
"eval_loss": 2.4299123287200928,
"eval_runtime": 18.8725,
"eval_samples_per_second": 186.462,
"eval_steps_per_second": 5.829,
"step": 107200
},
{
"epoch": 0.01,
"learning_rate": 3.376191459971401e-05,
"loss": 1.4964,
"step": 107600
},
{
"epoch": 0.01,
"eval_loss": 2.448073625564575,
"eval_runtime": 18.7704,
"eval_samples_per_second": 187.476,
"eval_steps_per_second": 5.86,
"step": 107600
},
{
"epoch": 0.01,
"learning_rate": 3.368089734709006e-05,
"loss": 1.497,
"step": 108000
},
{
"epoch": 0.01,
"eval_loss": 2.390690565109253,
"eval_runtime": 18.5096,
"eval_samples_per_second": 190.118,
"eval_steps_per_second": 5.943,
"step": 108000
},
{
"epoch": 0.01,
"learning_rate": 3.359988009446612e-05,
"loss": 1.4955,
"step": 108400
},
{
"epoch": 0.01,
"eval_loss": 2.383636713027954,
"eval_runtime": 18.4941,
"eval_samples_per_second": 190.277,
"eval_steps_per_second": 5.948,
"step": 108400
},
{
"epoch": 0.02,
"learning_rate": 3.351886284184217e-05,
"loss": 1.4953,
"step": 108800
},
{
"epoch": 0.02,
"eval_loss": 2.400592565536499,
"eval_runtime": 18.4735,
"eval_samples_per_second": 190.489,
"eval_steps_per_second": 5.954,
"step": 108800
},
{
"epoch": 0.02,
"learning_rate": 3.3437845589218226e-05,
"loss": 1.4939,
"step": 109200
},
{
"epoch": 0.02,
"eval_loss": 2.349822759628296,
"eval_runtime": 18.6128,
"eval_samples_per_second": 189.063,
"eval_steps_per_second": 5.91,
"step": 109200
},
{
"epoch": 0.02,
"learning_rate": 3.335682833659428e-05,
"loss": 1.4943,
"step": 109600
},
{
"epoch": 0.02,
"eval_loss": 2.3708629608154297,
"eval_runtime": 18.5009,
"eval_samples_per_second": 190.207,
"eval_steps_per_second": 5.946,
"step": 109600
},
{
"epoch": 0.03,
"learning_rate": 3.3275811083970334e-05,
"loss": 1.4942,
"step": 110000
},
{
"epoch": 0.03,
"eval_loss": 2.338743209838867,
"eval_runtime": 18.4865,
"eval_samples_per_second": 190.355,
"eval_steps_per_second": 5.95,
"step": 110000
},
{
"epoch": 0.03,
"learning_rate": 3.319479383134639e-05,
"loss": 1.4923,
"step": 110400
},
{
"epoch": 0.03,
"eval_loss": 2.4041731357574463,
"eval_runtime": 18.5038,
"eval_samples_per_second": 190.177,
"eval_steps_per_second": 5.945,
"step": 110400
},
{
"epoch": 0.0,
"learning_rate": 3.3113776578722436e-05,
"loss": 1.4934,
"step": 110800
},
{
"epoch": 0.0,
"eval_loss": 2.4086883068084717,
"eval_runtime": 17.8895,
"eval_samples_per_second": 196.707,
"eval_steps_per_second": 6.149,
"step": 110800
},
{
"epoch": 0.01,
"learning_rate": 3.30327593260985e-05,
"loss": 1.4917,
"step": 111200
},
{
"epoch": 0.01,
"eval_loss": 2.3683786392211914,
"eval_runtime": 17.4874,
"eval_samples_per_second": 201.23,
"eval_steps_per_second": 6.29,
"step": 111200
},
{
"epoch": 0.01,
"learning_rate": 3.2951742073474545e-05,
"loss": 1.4926,
"step": 111600
},
{
"epoch": 0.01,
"eval_loss": 2.3743233680725098,
"eval_runtime": 17.4669,
"eval_samples_per_second": 201.467,
"eval_steps_per_second": 6.298,
"step": 111600
},
{
"epoch": 0.01,
"learning_rate": 3.28707248208506e-05,
"loss": 1.4913,
"step": 112000
},
{
"epoch": 0.01,
"eval_loss": 2.3969030380249023,
"eval_runtime": 17.4406,
"eval_samples_per_second": 201.77,
"eval_steps_per_second": 6.307,
"step": 112000
},
{
"epoch": 0.01,
"learning_rate": 3.278970756822666e-05,
"loss": 1.4923,
"step": 112400
},
{
"epoch": 0.01,
"eval_loss": 2.373997688293457,
"eval_runtime": 17.6827,
"eval_samples_per_second": 199.008,
"eval_steps_per_second": 6.221,
"step": 112400
},
{
"epoch": 0.02,
"learning_rate": 3.270869031560271e-05,
"loss": 1.4913,
"step": 112800
},
{
"epoch": 0.02,
"eval_loss": 2.3612871170043945,
"eval_runtime": 17.4041,
"eval_samples_per_second": 202.193,
"eval_steps_per_second": 6.32,
"step": 112800
},
{
"epoch": 0.02,
"learning_rate": 3.262767306297876e-05,
"loss": 1.4909,
"step": 113200
},
{
"epoch": 0.02,
"eval_loss": 2.3404111862182617,
"eval_runtime": 17.5513,
"eval_samples_per_second": 200.498,
"eval_steps_per_second": 6.267,
"step": 113200
},
{
"epoch": 0.02,
"learning_rate": 3.2546655810354816e-05,
"loss": 1.491,
"step": 113600
},
{
"epoch": 0.02,
"eval_loss": 2.2388041019439697,
"eval_runtime": 17.6295,
"eval_samples_per_second": 199.609,
"eval_steps_per_second": 6.24,
"step": 113600
},
{
"epoch": 0.03,
"learning_rate": 3.246563855773087e-05,
"loss": 1.4896,
"step": 114000
},
{
"epoch": 0.03,
"eval_loss": 2.3492588996887207,
"eval_runtime": 17.3833,
"eval_samples_per_second": 202.436,
"eval_steps_per_second": 6.328,
"step": 114000
},
{
"epoch": 0.03,
"learning_rate": 3.238462130510692e-05,
"loss": 1.4899,
"step": 114400
},
{
"epoch": 0.03,
"eval_loss": 2.347364664077759,
"eval_runtime": 17.468,
"eval_samples_per_second": 201.454,
"eval_steps_per_second": 6.297,
"step": 114400
},
{
"epoch": 0.03,
"learning_rate": 3.230360405248298e-05,
"loss": 1.4881,
"step": 114800
},
{
"epoch": 0.03,
"eval_loss": 2.315025568008423,
"eval_runtime": 17.4,
"eval_samples_per_second": 202.242,
"eval_steps_per_second": 6.322,
"step": 114800
},
{
"epoch": 0.03,
"learning_rate": 3.2222586799859033e-05,
"loss": 1.4905,
"step": 115200
},
{
"epoch": 0.03,
"eval_loss": 2.344813346862793,
"eval_runtime": 17.3103,
"eval_samples_per_second": 203.29,
"eval_steps_per_second": 6.355,
"step": 115200
},
{
"epoch": 0.04,
"learning_rate": 3.214156954723508e-05,
"loss": 1.4894,
"step": 115600
},
{
"epoch": 0.04,
"eval_loss": 2.350853443145752,
"eval_runtime": 17.3476,
"eval_samples_per_second": 202.852,
"eval_steps_per_second": 6.341,
"step": 115600
},
{
"epoch": 0.04,
"learning_rate": 3.206055229461114e-05,
"loss": 1.4885,
"step": 116000
},
{
"epoch": 0.04,
"eval_loss": 2.273857355117798,
"eval_runtime": 17.3165,
"eval_samples_per_second": 203.217,
"eval_steps_per_second": 6.352,
"step": 116000
},
{
"epoch": 0.04,
"learning_rate": 3.197953504198719e-05,
"loss": 1.4895,
"step": 116400
},
{
"epoch": 0.04,
"eval_loss": 2.3339993953704834,
"eval_runtime": 17.3637,
"eval_samples_per_second": 202.664,
"eval_steps_per_second": 6.335,
"step": 116400
},
{
"epoch": 0.05,
"learning_rate": 3.1898517789363244e-05,
"loss": 1.4886,
"step": 116800
},
{
"epoch": 0.05,
"eval_loss": 2.3035190105438232,
"eval_runtime": 17.249,
"eval_samples_per_second": 204.011,
"eval_steps_per_second": 6.377,
"step": 116800
},
{
"epoch": 0.05,
"learning_rate": 3.18175005367393e-05,
"loss": 1.4867,
"step": 117200
},
{
"epoch": 0.05,
"eval_loss": 2.355330467224121,
"eval_runtime": 17.2592,
"eval_samples_per_second": 203.891,
"eval_steps_per_second": 6.373,
"step": 117200
},
{
"epoch": 0.05,
"learning_rate": 3.173648328411535e-05,
"loss": 1.4859,
"step": 117600
},
{
"epoch": 0.05,
"eval_loss": 2.3306944370269775,
"eval_runtime": 17.5199,
"eval_samples_per_second": 200.857,
"eval_steps_per_second": 6.279,
"step": 117600
},
{
"epoch": 0.06,
"learning_rate": 3.165546603149141e-05,
"loss": 1.4879,
"step": 118000
},
{
"epoch": 0.06,
"eval_loss": 2.3352627754211426,
"eval_runtime": 17.5475,
"eval_samples_per_second": 200.542,
"eval_steps_per_second": 6.269,
"step": 118000
},
{
"epoch": 0.06,
"learning_rate": 3.157444877886746e-05,
"loss": 1.4863,
"step": 118400
},
{
"epoch": 0.06,
"eval_loss": 2.357405662536621,
"eval_runtime": 17.7502,
"eval_samples_per_second": 198.252,
"eval_steps_per_second": 6.197,
"step": 118400
},
{
"epoch": 0.06,
"learning_rate": 3.1493431526243515e-05,
"loss": 1.4858,
"step": 118800
},
{
"epoch": 0.06,
"eval_loss": 2.3991518020629883,
"eval_runtime": 17.6792,
"eval_samples_per_second": 199.048,
"eval_steps_per_second": 6.222,
"step": 118800
},
{
"epoch": 0.06,
"learning_rate": 3.141241427361957e-05,
"loss": 1.4855,
"step": 119200
},
{
"epoch": 0.06,
"eval_loss": 2.353144884109497,
"eval_runtime": 17.7114,
"eval_samples_per_second": 198.685,
"eval_steps_per_second": 6.211,
"step": 119200
},
{
"epoch": 0.07,
"learning_rate": 3.1331397020995624e-05,
"loss": 1.4856,
"step": 119600
},
{
"epoch": 0.07,
"eval_loss": 2.409151315689087,
"eval_runtime": 17.7645,
"eval_samples_per_second": 198.092,
"eval_steps_per_second": 6.192,
"step": 119600
},
{
"epoch": 0.07,
"learning_rate": 3.125037976837167e-05,
"loss": 1.4876,
"step": 120000
},
{
"epoch": 0.07,
"eval_loss": 2.3355095386505127,
"eval_runtime": 17.7334,
"eval_samples_per_second": 198.439,
"eval_steps_per_second": 6.203,
"step": 120000
},
{
"epoch": 0.07,
"learning_rate": 3.116936251574773e-05,
"loss": 1.4874,
"step": 120400
},
{
"epoch": 0.07,
"eval_loss": 2.3579752445220947,
"eval_runtime": 17.7018,
"eval_samples_per_second": 198.793,
"eval_steps_per_second": 6.214,
"step": 120400
},
{
"epoch": 0.08,
"learning_rate": 3.108834526312379e-05,
"loss": 1.4867,
"step": 120800
},
{
"epoch": 0.08,
"eval_loss": 2.3405985832214355,
"eval_runtime": 17.7175,
"eval_samples_per_second": 198.617,
"eval_steps_per_second": 6.209,
"step": 120800
},
{
"epoch": 0.08,
"learning_rate": 3.1007328010499834e-05,
"loss": 1.4847,
"step": 121200
},
{
"epoch": 0.08,
"eval_loss": 2.321049213409424,
"eval_runtime": 17.748,
"eval_samples_per_second": 198.276,
"eval_steps_per_second": 6.198,
"step": 121200
},
{
"epoch": 0.08,
"learning_rate": 3.0926310757875895e-05,
"loss": 1.4842,
"step": 121600
},
{
"epoch": 0.08,
"eval_loss": 2.3495261669158936,
"eval_runtime": 17.6755,
"eval_samples_per_second": 199.09,
"eval_steps_per_second": 6.223,
"step": 121600
},
{
"epoch": 0.08,
"learning_rate": 3.084529350525194e-05,
"loss": 1.484,
"step": 122000
},
{
"epoch": 0.08,
"eval_loss": 2.3278751373291016,
"eval_runtime": 17.6587,
"eval_samples_per_second": 199.278,
"eval_steps_per_second": 6.229,
"step": 122000
},
{
"epoch": 0.09,
"learning_rate": 3.0764276252628e-05,
"loss": 1.4817,
"step": 122400
},
{
"epoch": 0.09,
"eval_loss": 2.352627754211426,
"eval_runtime": 17.7968,
"eval_samples_per_second": 197.732,
"eval_steps_per_second": 6.181,
"step": 122400
},
{
"epoch": 0.09,
"learning_rate": 3.068325900000405e-05,
"loss": 1.4823,
"step": 122800
},
{
"epoch": 0.09,
"eval_loss": 2.3326263427734375,
"eval_runtime": 17.8301,
"eval_samples_per_second": 197.363,
"eval_steps_per_second": 6.169,
"step": 122800
},
{
"epoch": 0.09,
"learning_rate": 3.0602241747380106e-05,
"loss": 1.4814,
"step": 123200
},
{
"epoch": 0.09,
"eval_loss": 2.4039418697357178,
"eval_runtime": 17.726,
"eval_samples_per_second": 198.522,
"eval_steps_per_second": 6.206,
"step": 123200
},
{
"epoch": 0.1,
"learning_rate": 3.052122449475616e-05,
"loss": 1.4802,
"step": 123600
},
{
"epoch": 0.1,
"eval_loss": 2.3534297943115234,
"eval_runtime": 18.0233,
"eval_samples_per_second": 195.247,
"eval_steps_per_second": 6.103,
"step": 123600
},
{
"epoch": 0.1,
"learning_rate": 3.044020724213221e-05,
"loss": 1.4823,
"step": 124000
},
{
"epoch": 0.1,
"eval_loss": 2.3589508533477783,
"eval_runtime": 18.0015,
"eval_samples_per_second": 195.484,
"eval_steps_per_second": 6.111,
"step": 124000
},
{
"epoch": 0.1,
"learning_rate": 3.035918998950827e-05,
"loss": 1.4806,
"step": 124400
},
{
"epoch": 0.1,
"eval_loss": 2.3476579189300537,
"eval_runtime": 18.054,
"eval_samples_per_second": 194.916,
"eval_steps_per_second": 6.093,
"step": 124400
},
{
"epoch": 0.1,
"learning_rate": 3.027817273688432e-05,
"loss": 1.481,
"step": 124800
},
{
"epoch": 0.1,
"eval_loss": 2.3086392879486084,
"eval_runtime": 18.0863,
"eval_samples_per_second": 194.567,
"eval_steps_per_second": 6.082,
"step": 124800
},
{
"epoch": 0.11,
"learning_rate": 3.0197155484260374e-05,
"loss": 1.4798,
"step": 125200
},
{
"epoch": 0.11,
"eval_loss": 2.331632375717163,
"eval_runtime": 18.0209,
"eval_samples_per_second": 195.274,
"eval_steps_per_second": 6.104,
"step": 125200
},
{
"epoch": 0.11,
"learning_rate": 3.0116138231636425e-05,
"loss": 1.481,
"step": 125600
},
{
"epoch": 0.11,
"eval_loss": 2.321038246154785,
"eval_runtime": 18.138,
"eval_samples_per_second": 194.012,
"eval_steps_per_second": 6.065,
"step": 125600
},
{
"epoch": 0.11,
"learning_rate": 3.0035120979012482e-05,
"loss": 1.4792,
"step": 126000
},
{
"epoch": 0.11,
"eval_loss": 2.3609230518341064,
"eval_runtime": 18.1227,
"eval_samples_per_second": 194.176,
"eval_steps_per_second": 6.07,
"step": 126000
},
{
"epoch": 0.12,
"learning_rate": 2.9954103726388537e-05,
"loss": 1.4783,
"step": 126400
},
{
"epoch": 0.12,
"eval_loss": 2.348484516143799,
"eval_runtime": 18.2068,
"eval_samples_per_second": 193.279,
"eval_steps_per_second": 6.042,
"step": 126400
},
{
"epoch": 0.12,
"learning_rate": 2.9873086473764588e-05,
"loss": 1.4783,
"step": 126800
},
{
"epoch": 0.12,
"eval_loss": 2.3550658226013184,
"eval_runtime": 18.1831,
"eval_samples_per_second": 193.532,
"eval_steps_per_second": 6.05,
"step": 126800
},
{
"epoch": 0.12,
"learning_rate": 2.9792069221140645e-05,
"loss": 1.478,
"step": 127200
},
{
"epoch": 0.12,
"eval_loss": 2.352349042892456,
"eval_runtime": 18.3773,
"eval_samples_per_second": 191.487,
"eval_steps_per_second": 5.986,
"step": 127200
},
{
"epoch": 0.13,
"learning_rate": 2.9711051968516696e-05,
"loss": 1.479,
"step": 127600
},
{
"epoch": 0.13,
"eval_loss": 2.3229057788848877,
"eval_runtime": 18.4727,
"eval_samples_per_second": 190.498,
"eval_steps_per_second": 5.955,
"step": 127600
},
{
"epoch": 0.13,
"learning_rate": 2.963003471589275e-05,
"loss": 1.4787,
"step": 128000
},
{
"epoch": 0.13,
"eval_loss": 2.3134686946868896,
"eval_runtime": 18.5086,
"eval_samples_per_second": 190.128,
"eval_steps_per_second": 5.943,
"step": 128000
},
{
"epoch": 0.13,
"learning_rate": 2.95490174632688e-05,
"loss": 1.4775,
"step": 128400
},
{
"epoch": 0.13,
"eval_loss": 2.27996826171875,
"eval_runtime": 18.3605,
"eval_samples_per_second": 191.661,
"eval_steps_per_second": 5.991,
"step": 128400
},
{
"epoch": 0.13,
"learning_rate": 2.946800021064486e-05,
"loss": 1.4766,
"step": 128800
},
{
"epoch": 0.13,
"eval_loss": 2.2963178157806396,
"eval_runtime": 18.3042,
"eval_samples_per_second": 192.251,
"eval_steps_per_second": 6.01,
"step": 128800
},
{
"epoch": 0.14,
"learning_rate": 2.9386982958020913e-05,
"loss": 1.4762,
"step": 129200
},
{
"epoch": 0.14,
"eval_loss": 2.3238120079040527,
"eval_runtime": 18.4678,
"eval_samples_per_second": 190.548,
"eval_steps_per_second": 5.956,
"step": 129200
},
{
"epoch": 0.14,
"learning_rate": 2.9305965705396964e-05,
"loss": 1.4769,
"step": 129600
},
{
"epoch": 0.14,
"eval_loss": 2.3036534786224365,
"eval_runtime": 18.3198,
"eval_samples_per_second": 192.087,
"eval_steps_per_second": 6.004,
"step": 129600
},
{
"epoch": 0.14,
"learning_rate": 2.9224948452773022e-05,
"loss": 1.4756,
"step": 130000
},
{
"epoch": 0.14,
"eval_loss": 2.3685128688812256,
"eval_runtime": 18.2275,
"eval_samples_per_second": 193.06,
"eval_steps_per_second": 6.035,
"step": 130000
},
{
"epoch": 0.15,
"learning_rate": 2.9143931200149073e-05,
"loss": 1.4752,
"step": 130400
},
{
"epoch": 0.15,
"eval_loss": 2.288372278213501,
"eval_runtime": 18.3274,
"eval_samples_per_second": 192.008,
"eval_steps_per_second": 6.002,
"step": 130400
},
{
"epoch": 0.15,
"learning_rate": 2.9062913947525127e-05,
"loss": 1.4747,
"step": 130800
},
{
"epoch": 0.15,
"eval_loss": 2.3392255306243896,
"eval_runtime": 18.2629,
"eval_samples_per_second": 192.686,
"eval_steps_per_second": 6.023,
"step": 130800
},
{
"epoch": 0.15,
"learning_rate": 2.8981896694901178e-05,
"loss": 1.4738,
"step": 131200
},
{
"epoch": 0.15,
"eval_loss": 2.3563013076782227,
"eval_runtime": 18.4362,
"eval_samples_per_second": 190.875,
"eval_steps_per_second": 5.967,
"step": 131200
},
{
"epoch": 0.15,
"learning_rate": 2.8900879442277236e-05,
"loss": 1.4749,
"step": 131600
},
{
"epoch": 0.15,
"eval_loss": 2.330927610397339,
"eval_runtime": 18.1578,
"eval_samples_per_second": 193.801,
"eval_steps_per_second": 6.058,
"step": 131600
},
{
"epoch": 0.16,
"learning_rate": 2.881986218965329e-05,
"loss": 1.4748,
"step": 132000
},
{
"epoch": 0.16,
"eval_loss": 2.33650279045105,
"eval_runtime": 18.3527,
"eval_samples_per_second": 191.743,
"eval_steps_per_second": 5.994,
"step": 132000
},
{
"epoch": 0.16,
"learning_rate": 2.873884493702934e-05,
"loss": 1.4737,
"step": 132400
},
{
"epoch": 0.16,
"eval_loss": 2.3835794925689697,
"eval_runtime": 18.2768,
"eval_samples_per_second": 192.539,
"eval_steps_per_second": 6.019,
"step": 132400
},
{
"epoch": 0.16,
"learning_rate": 2.86578276844054e-05,
"loss": 1.474,
"step": 132800
},
{
"epoch": 0.16,
"eval_loss": 2.4150733947753906,
"eval_runtime": 18.2593,
"eval_samples_per_second": 192.724,
"eval_steps_per_second": 6.024,
"step": 132800
},
{
"epoch": 0.17,
"learning_rate": 2.8576810431781446e-05,
"loss": 1.4743,
"step": 133200
},
{
"epoch": 0.17,
"eval_loss": 2.36186146736145,
"eval_runtime": 18.123,
"eval_samples_per_second": 194.173,
"eval_steps_per_second": 6.07,
"step": 133200
},
{
"epoch": 0.17,
"learning_rate": 2.8495793179157504e-05,
"loss": 1.4735,
"step": 133600
},
{
"epoch": 0.17,
"eval_loss": 2.356795310974121,
"eval_runtime": 18.2043,
"eval_samples_per_second": 193.306,
"eval_steps_per_second": 6.043,
"step": 133600
},
{
"epoch": 0.17,
"learning_rate": 2.8414775926533555e-05,
"loss": 1.4735,
"step": 134000
},
{
"epoch": 0.17,
"eval_loss": 2.3677237033843994,
"eval_runtime": 18.253,
"eval_samples_per_second": 192.791,
"eval_steps_per_second": 6.026,
"step": 134000
},
{
"epoch": 0.17,
"learning_rate": 2.833375867390961e-05,
"loss": 1.4715,
"step": 134400
},
{
"epoch": 0.17,
"eval_loss": 2.361776113510132,
"eval_runtime": 18.182,
"eval_samples_per_second": 193.543,
"eval_steps_per_second": 6.05,
"step": 134400
},
{
"epoch": 0.18,
"learning_rate": 2.8252741421285667e-05,
"loss": 1.4726,
"step": 134800
},
{
"epoch": 0.18,
"eval_loss": 2.3906137943267822,
"eval_runtime": 18.0913,
"eval_samples_per_second": 194.513,
"eval_steps_per_second": 6.08,
"step": 134800
},
{
"epoch": 0.18,
"learning_rate": 2.8171724168661718e-05,
"loss": 1.4716,
"step": 135200
},
{
"epoch": 0.18,
"eval_loss": 2.340426445007324,
"eval_runtime": 18.1553,
"eval_samples_per_second": 193.828,
"eval_steps_per_second": 6.059,
"step": 135200
},
{
"epoch": 0.18,
"learning_rate": 2.8090706916037772e-05,
"loss": 1.4719,
"step": 135600
},
{
"epoch": 0.18,
"eval_loss": 2.340381383895874,
"eval_runtime": 18.1363,
"eval_samples_per_second": 194.031,
"eval_steps_per_second": 6.065,
"step": 135600
},
{
"epoch": 0.19,
"learning_rate": 2.8009689663413823e-05,
"loss": 1.4725,
"step": 136000
},
{
"epoch": 0.19,
"eval_loss": 2.370542526245117,
"eval_runtime": 18.2157,
"eval_samples_per_second": 193.185,
"eval_steps_per_second": 6.039,
"step": 136000
},
{
"epoch": 0.19,
"learning_rate": 2.792867241078988e-05,
"loss": 1.4713,
"step": 136400
},
{
"epoch": 0.19,
"eval_loss": 2.360673189163208,
"eval_runtime": 18.2181,
"eval_samples_per_second": 193.159,
"eval_steps_per_second": 6.038,
"step": 136400
},
{
"epoch": 0.19,
"learning_rate": 2.784765515816593e-05,
"loss": 1.4714,
"step": 136800
},
{
"epoch": 0.19,
"eval_loss": 2.3657426834106445,
"eval_runtime": 18.2301,
"eval_samples_per_second": 193.032,
"eval_steps_per_second": 6.034,
"step": 136800
},
{
"epoch": 0.2,
"learning_rate": 2.7766637905541986e-05,
"loss": 1.4706,
"step": 137200
},
{
"epoch": 0.2,
"eval_loss": 2.3723626136779785,
"eval_runtime": 18.1723,
"eval_samples_per_second": 193.646,
"eval_steps_per_second": 6.053,
"step": 137200
},
{
"epoch": 0.2,
"learning_rate": 2.7685620652918044e-05,
"loss": 1.47,
"step": 137600
},
{
"epoch": 0.2,
"eval_loss": 2.3738961219787598,
"eval_runtime": 18.1983,
"eval_samples_per_second": 193.37,
"eval_steps_per_second": 6.045,
"step": 137600
},
{
"epoch": 0.2,
"learning_rate": 2.7604603400294094e-05,
"loss": 1.4686,
"step": 138000
},
{
"epoch": 0.2,
"eval_loss": 2.3388829231262207,
"eval_runtime": 18.2362,
"eval_samples_per_second": 192.968,
"eval_steps_per_second": 6.032,
"step": 138000
},
{
"epoch": 0.2,
"learning_rate": 2.752358614767015e-05,
"loss": 1.469,
"step": 138400
},
{
"epoch": 0.2,
"eval_loss": 2.3783812522888184,
"eval_runtime": 18.2567,
"eval_samples_per_second": 192.751,
"eval_steps_per_second": 6.025,
"step": 138400
},
{
"epoch": 0.21,
"learning_rate": 2.74425688950462e-05,
"loss": 1.4682,
"step": 138800
},
{
"epoch": 0.21,
"eval_loss": 2.3429505825042725,
"eval_runtime": 18.2164,
"eval_samples_per_second": 193.177,
"eval_steps_per_second": 6.039,
"step": 138800
},
{
"epoch": 0.21,
"learning_rate": 2.7361551642422257e-05,
"loss": 1.4698,
"step": 139200
},
{
"epoch": 0.21,
"eval_loss": 2.3579936027526855,
"eval_runtime": 18.1836,
"eval_samples_per_second": 193.526,
"eval_steps_per_second": 6.049,
"step": 139200
},
{
"epoch": 0.21,
"learning_rate": 2.7280534389798308e-05,
"loss": 1.4676,
"step": 139600
},
{
"epoch": 0.21,
"eval_loss": 2.3819713592529297,
"eval_runtime": 18.2677,
"eval_samples_per_second": 192.635,
"eval_steps_per_second": 6.022,
"step": 139600
},
{
"epoch": 0.22,
"learning_rate": 2.7199517137174363e-05,
"loss": 1.4683,
"step": 140000
},
{
"epoch": 0.22,
"eval_loss": 2.426044225692749,
"eval_runtime": 18.2225,
"eval_samples_per_second": 193.113,
"eval_steps_per_second": 6.036,
"step": 140000
},
{
"epoch": 0.22,
"learning_rate": 2.711849988455042e-05,
"loss": 1.4677,
"step": 140400
},
{
"epoch": 0.22,
"eval_loss": 2.3789823055267334,
"eval_runtime": 31.3826,
"eval_samples_per_second": 112.132,
"eval_steps_per_second": 3.505,
"step": 140400
},
{
"epoch": 0.22,
"learning_rate": 2.703748263192647e-05,
"loss": 1.4686,
"step": 140800
},
{
"epoch": 0.22,
"eval_loss": 2.329643487930298,
"eval_runtime": 18.3935,
"eval_samples_per_second": 191.317,
"eval_steps_per_second": 5.98,
"step": 140800
},
{
"epoch": 0.22,
"learning_rate": 2.6956465379302525e-05,
"loss": 1.4679,
"step": 141200
},
{
"epoch": 0.22,
"eval_loss": 2.4011151790618896,
"eval_runtime": 18.2288,
"eval_samples_per_second": 193.046,
"eval_steps_per_second": 6.034,
"step": 141200
},
{
"epoch": 0.23,
"learning_rate": 2.6875448126678576e-05,
"loss": 1.4676,
"step": 141600
},
{
"epoch": 0.23,
"eval_loss": 2.377561092376709,
"eval_runtime": 20.2447,
"eval_samples_per_second": 173.823,
"eval_steps_per_second": 5.434,
"step": 141600
}
],
"max_steps": 274290,
"num_train_epochs": 2,
"total_flos": 2.641163282310901e+20,
"trial_name": null,
"trial_params": null
}