{
"best_global_step": 3333,
"best_metric": 0.39062577,
"best_model_checkpoint": "/global/D1/homes/sushant/Kvasir-VQA-x1/output_vqa_x1/v0-20250521-005603/checkpoint-3333",
"epoch": 3.0,
"eval_steps": 500,
"global_step": 3333,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0009000900090009,
"grad_norm": 7.174169063568115,
"learning_rate": 2.0000000000000002e-07,
"loss": 3.304050922393799,
"memory(GiB)": 66.97,
"step": 1,
"token_acc": 0.4874715261958998,
"train_speed(iter/s)": 0.019902
},
{
"epoch": 0.009000900090009001,
"grad_norm": 6.479684829711914,
"learning_rate": 2.0000000000000003e-06,
"loss": 3.1309598286946616,
"memory(GiB)": 67.86,
"step": 10,
"token_acc": 0.4754664823773324,
"train_speed(iter/s)": 0.036367
},
{
"epoch": 0.018001800180018002,
"grad_norm": 7.254239559173584,
"learning_rate": 4.000000000000001e-06,
"loss": 3.1008956909179686,
"memory(GiB)": 67.86,
"step": 20,
"token_acc": 0.4788295278208823,
"train_speed(iter/s)": 0.038518
},
{
"epoch": 0.027002700270027002,
"grad_norm": 7.573488712310791,
"learning_rate": 6e-06,
"loss": 2.772838592529297,
"memory(GiB)": 67.86,
"step": 30,
"token_acc": 0.5009667024704618,
"train_speed(iter/s)": 0.039255
},
{
"epoch": 0.036003600360036005,
"grad_norm": 4.112278938293457,
"learning_rate": 8.000000000000001e-06,
"loss": 2.004438781738281,
"memory(GiB)": 67.86,
"step": 40,
"token_acc": 0.5291459557162224,
"train_speed(iter/s)": 0.03954
},
{
"epoch": 0.045004500450045004,
"grad_norm": 1.6541378498077393,
"learning_rate": 1e-05,
"loss": 1.8318557739257812,
"memory(GiB)": 67.86,
"step": 50,
"token_acc": 0.5645268034414295,
"train_speed(iter/s)": 0.039725
},
{
"epoch": 0.054005400540054004,
"grad_norm": 1.6028215885162354,
"learning_rate": 1.2e-05,
"loss": 1.5337283134460449,
"memory(GiB)": 67.86,
"step": 60,
"token_acc": 0.6214239621423963,
"train_speed(iter/s)": 0.039848
},
{
"epoch": 0.063006300630063,
"grad_norm": 1.5401345491409302,
"learning_rate": 1.4e-05,
"loss": 1.361149787902832,
"memory(GiB)": 68.03,
"step": 70,
"token_acc": 0.6230786366674093,
"train_speed(iter/s)": 0.040044
},
{
"epoch": 0.07200720072007201,
"grad_norm": 1.1455940008163452,
"learning_rate": 1.6000000000000003e-05,
"loss": 1.1177097320556642,
"memory(GiB)": 68.03,
"step": 80,
"token_acc": 0.6871520342612419,
"train_speed(iter/s)": 0.040185
},
{
"epoch": 0.081008100810081,
"grad_norm": 1.6210144758224487,
"learning_rate": 1.8e-05,
"loss": 0.9828067779541015,
"memory(GiB)": 68.03,
"step": 90,
"token_acc": 0.727211495285137,
"train_speed(iter/s)": 0.040239
},
{
"epoch": 0.09000900090009001,
"grad_norm": 1.810027837753296,
"learning_rate": 2e-05,
"loss": 0.9005414962768554,
"memory(GiB)": 68.08,
"step": 100,
"token_acc": 0.7328737613097802,
"train_speed(iter/s)": 0.040295
},
{
"epoch": 0.09900990099009901,
"grad_norm": 1.3532379865646362,
"learning_rate": 1.9999527877255423e-05,
"loss": 0.7943315982818604,
"memory(GiB)": 68.08,
"step": 110,
"token_acc": 0.7508290957329207,
"train_speed(iter/s)": 0.040336
},
{
"epoch": 0.10801080108010801,
"grad_norm": 1.267624855041504,
"learning_rate": 1.999811155360166e-05,
"loss": 0.7341960906982422,
"memory(GiB)": 68.08,
"step": 120,
"token_acc": 0.7815533980582524,
"train_speed(iter/s)": 0.040377
},
{
"epoch": 0.11701170117011701,
"grad_norm": 1.372360348701477,
"learning_rate": 1.9995751162774435e-05,
"loss": 0.7223796844482422,
"memory(GiB)": 68.08,
"step": 130,
"token_acc": 0.7679759605065465,
"train_speed(iter/s)": 0.040454
},
{
"epoch": 0.126012601260126,
"grad_norm": 1.3093749284744263,
"learning_rate": 1.9992446927652592e-05,
"loss": 0.6822004318237305,
"memory(GiB)": 68.08,
"step": 140,
"token_acc": 0.7690819178671253,
"train_speed(iter/s)": 0.04049
},
{
"epoch": 0.135013501350135,
"grad_norm": 1.5192912817001343,
"learning_rate": 1.9988199160237038e-05,
"loss": 0.6598445892333984,
"memory(GiB)": 68.08,
"step": 150,
"token_acc": 0.794921875,
"train_speed(iter/s)": 0.040521
},
{
"epoch": 0.14401440144014402,
"grad_norm": 1.4876294136047363,
"learning_rate": 1.9983008261621295e-05,
"loss": 0.6424094200134277,
"memory(GiB)": 68.08,
"step": 160,
"token_acc": 0.784965034965035,
"train_speed(iter/s)": 0.040569
},
{
"epoch": 0.15301530153015303,
"grad_norm": 1.7920929193496704,
"learning_rate": 1.9976874721953625e-05,
"loss": 0.6222011089324951,
"memory(GiB)": 68.08,
"step": 170,
"token_acc": 0.7873362445414848,
"train_speed(iter/s)": 0.040571
},
{
"epoch": 0.162016201620162,
"grad_norm": 1.496816873550415,
"learning_rate": 1.996979912039074e-05,
"loss": 0.6065957069396972,
"memory(GiB)": 68.08,
"step": 180,
"token_acc": 0.797979797979798,
"train_speed(iter/s)": 0.040601
},
{
"epoch": 0.171017101710171,
"grad_norm": 1.631809949874878,
"learning_rate": 1.9961782125043134e-05,
"loss": 0.6100308895111084,
"memory(GiB)": 68.08,
"step": 190,
"token_acc": 0.7921653971708379,
"train_speed(iter/s)": 0.040622
},
{
"epoch": 0.18001800180018002,
"grad_norm": 1.635206937789917,
"learning_rate": 1.9952824492911967e-05,
"loss": 0.597900390625,
"memory(GiB)": 68.08,
"step": 200,
"token_acc": 0.8024363233665559,
"train_speed(iter/s)": 0.040621
},
{
"epoch": 0.18901890189018902,
"grad_norm": 1.7094358205795288,
"learning_rate": 1.9942927069817618e-05,
"loss": 0.5765604972839355,
"memory(GiB)": 68.08,
"step": 210,
"token_acc": 0.8184182015167931,
"train_speed(iter/s)": 0.040647
},
{
"epoch": 0.19801980198019803,
"grad_norm": 1.7368491888046265,
"learning_rate": 1.99320907903198e-05,
"loss": 0.5700692176818848,
"memory(GiB)": 68.08,
"step": 220,
"token_acc": 0.8155997378195324,
"train_speed(iter/s)": 0.040667
},
{
"epoch": 0.207020702070207,
"grad_norm": 1.6970064640045166,
"learning_rate": 1.9920316677629312e-05,
"loss": 0.5586367607116699,
"memory(GiB)": 68.08,
"step": 230,
"token_acc": 0.8134110787172012,
"train_speed(iter/s)": 0.040669
},
{
"epoch": 0.21602160216021601,
"grad_norm": 1.6440980434417725,
"learning_rate": 1.9907605843511434e-05,
"loss": 0.5400181293487549,
"memory(GiB)": 68.08,
"step": 240,
"token_acc": 0.8248341625207297,
"train_speed(iter/s)": 0.040672
},
{
"epoch": 0.22502250225022502,
"grad_norm": 1.848779559135437,
"learning_rate": 1.9893959488180948e-05,
"loss": 0.5552643775939942,
"memory(GiB)": 68.08,
"step": 250,
"token_acc": 0.8090929154711984,
"train_speed(iter/s)": 0.040677
},
{
"epoch": 0.23402340234023403,
"grad_norm": 1.746717929840088,
"learning_rate": 1.9879378900188796e-05,
"loss": 0.5367072105407715,
"memory(GiB)": 68.08,
"step": 260,
"token_acc": 0.8096885813148789,
"train_speed(iter/s)": 0.040681
},
{
"epoch": 0.24302430243024303,
"grad_norm": 2.212620973587036,
"learning_rate": 1.9863865456300422e-05,
"loss": 0.5621134757995605,
"memory(GiB)": 68.08,
"step": 270,
"token_acc": 0.8111765989958525,
"train_speed(iter/s)": 0.040691
},
{
"epoch": 0.252025202520252,
"grad_norm": 1.815075159072876,
"learning_rate": 1.9847420621365773e-05,
"loss": 0.5444355964660644,
"memory(GiB)": 68.08,
"step": 280,
"token_acc": 0.8252319929297393,
"train_speed(iter/s)": 0.040694
},
{
"epoch": 0.26102610261026105,
"grad_norm": 1.6822190284729004,
"learning_rate": 1.983004594818096e-05,
"loss": 0.509169626235962,
"memory(GiB)": 68.08,
"step": 290,
"token_acc": 0.8245873889123995,
"train_speed(iter/s)": 0.040697
},
{
"epoch": 0.27002700270027,
"grad_norm": 1.7498018741607666,
"learning_rate": 1.981174307734167e-05,
"loss": 0.5199090480804444,
"memory(GiB)": 68.08,
"step": 300,
"token_acc": 0.8331916702082448,
"train_speed(iter/s)": 0.040678
},
{
"epoch": 0.279027902790279,
"grad_norm": 1.875012755393982,
"learning_rate": 1.9792513737088223e-05,
"loss": 0.5095804691314697,
"memory(GiB)": 68.08,
"step": 310,
"token_acc": 0.8261736049601417,
"train_speed(iter/s)": 0.040669
},
{
"epoch": 0.28802880288028804,
"grad_norm": 1.8016622066497803,
"learning_rate": 1.9772359743142396e-05,
"loss": 0.49691128730773926,
"memory(GiB)": 68.08,
"step": 320,
"token_acc": 0.8243214362043172,
"train_speed(iter/s)": 0.04068
},
{
"epoch": 0.297029702970297,
"grad_norm": 1.927909016609192,
"learning_rate": 1.975128299853598e-05,
"loss": 0.5156735897064209,
"memory(GiB)": 68.08,
"step": 330,
"token_acc": 0.8241394527802295,
"train_speed(iter/s)": 0.040684
},
{
"epoch": 0.30603060306030605,
"grad_norm": 1.7440602779388428,
"learning_rate": 1.9729285493431074e-05,
"loss": 0.5245149612426758,
"memory(GiB)": 68.24,
"step": 340,
"token_acc": 0.8179177837354781,
"train_speed(iter/s)": 0.040684
},
{
"epoch": 0.31503150315031503,
"grad_norm": 1.9903383255004883,
"learning_rate": 1.9706369304932176e-05,
"loss": 0.5069475173950195,
"memory(GiB)": 68.3,
"step": 350,
"token_acc": 0.8318876497315159,
"train_speed(iter/s)": 0.040686
},
{
"epoch": 0.324032403240324,
"grad_norm": 1.9196044206619263,
"learning_rate": 1.968253659689005e-05,
"loss": 0.5040374279022217,
"memory(GiB)": 68.3,
"step": 360,
"token_acc": 0.8283985303652475,
"train_speed(iter/s)": 0.040682
},
{
"epoch": 0.33303330333033304,
"grad_norm": 1.9835383892059326,
"learning_rate": 1.96577896196974e-05,
"loss": 0.5163045883178711,
"memory(GiB)": 68.3,
"step": 370,
"token_acc": 0.8187339406680683,
"train_speed(iter/s)": 0.040679
},
{
"epoch": 0.342034203420342,
"grad_norm": 2.098388195037842,
"learning_rate": 1.9632130710076383e-05,
"loss": 0.5065926074981689,
"memory(GiB)": 68.3,
"step": 380,
"token_acc": 0.8242616033755275,
"train_speed(iter/s)": 0.04068
},
{
"epoch": 0.35103510351035105,
"grad_norm": 1.8806556463241577,
"learning_rate": 1.960556229085797e-05,
"loss": 0.4967801094055176,
"memory(GiB)": 68.3,
"step": 390,
"token_acc": 0.8285966071821987,
"train_speed(iter/s)": 0.040692
},
{
"epoch": 0.36003600360036003,
"grad_norm": 2.0447497367858887,
"learning_rate": 1.9578086870753153e-05,
"loss": 0.5042286872863769,
"memory(GiB)": 68.3,
"step": 400,
"token_acc": 0.8263780406159339,
"train_speed(iter/s)": 0.040693
},
{
"epoch": 0.369036903690369,
"grad_norm": 1.947168231010437,
"learning_rate": 1.954970704411609e-05,
"loss": 0.5015206336975098,
"memory(GiB)": 68.3,
"step": 410,
"token_acc": 0.8200773860705073,
"train_speed(iter/s)": 0.04069
},
{
"epoch": 0.37803780378037805,
"grad_norm": 1.855016827583313,
"learning_rate": 1.9520425490699107e-05,
"loss": 0.4870131492614746,
"memory(GiB)": 68.3,
"step": 420,
"token_acc": 0.8407563025210084,
"train_speed(iter/s)": 0.040704
},
{
"epoch": 0.387038703870387,
"grad_norm": 1.8995352983474731,
"learning_rate": 1.9490244975399678e-05,
"loss": 0.48991098403930666,
"memory(GiB)": 68.3,
"step": 430,
"token_acc": 0.8367172472750588,
"train_speed(iter/s)": 0.040707
},
{
"epoch": 0.39603960396039606,
"grad_norm": 1.9746062755584717,
"learning_rate": 1.9459168347999343e-05,
"loss": 0.49413495063781737,
"memory(GiB)": 68.3,
"step": 440,
"token_acc": 0.8217993079584776,
"train_speed(iter/s)": 0.040722
},
{
"epoch": 0.40504050405040504,
"grad_norm": 1.9922826290130615,
"learning_rate": 1.9427198542894628e-05,
"loss": 0.478054141998291,
"memory(GiB)": 68.3,
"step": 450,
"token_acc": 0.8396687194733489,
"train_speed(iter/s)": 0.040729
},
{
"epoch": 0.414041404140414,
"grad_norm": 1.8262529373168945,
"learning_rate": 1.9394338578819957e-05,
"loss": 0.4965967178344727,
"memory(GiB)": 68.3,
"step": 460,
"token_acc": 0.8291083916083916,
"train_speed(iter/s)": 0.04073
},
{
"epoch": 0.42304230423042305,
"grad_norm": 1.6194044351577759,
"learning_rate": 1.936059155856262e-05,
"loss": 0.47453508377075193,
"memory(GiB)": 68.3,
"step": 470,
"token_acc": 0.8382074479276247,
"train_speed(iter/s)": 0.040729
},
{
"epoch": 0.43204320432043203,
"grad_norm": 1.9184072017669678,
"learning_rate": 1.932596066866978e-05,
"loss": 0.4665153980255127,
"memory(GiB)": 68.3,
"step": 480,
"token_acc": 0.8344993441189331,
"train_speed(iter/s)": 0.040723
},
{
"epoch": 0.44104410441044106,
"grad_norm": 1.7491145133972168,
"learning_rate": 1.929044917914759e-05,
"loss": 0.4606966972351074,
"memory(GiB)": 68.3,
"step": 490,
"token_acc": 0.84466817341278,
"train_speed(iter/s)": 0.040709
},
{
"epoch": 0.45004500450045004,
"grad_norm": 1.97507643699646,
"learning_rate": 1.9254060443152435e-05,
"loss": 0.47635550498962403,
"memory(GiB)": 68.3,
"step": 500,
"token_acc": 0.8395522388059702,
"train_speed(iter/s)": 0.040715
},
{
"epoch": 0.45004500450045004,
"eval_loss": 0.48444515466690063,
"eval_runtime": 117.4773,
"eval_samples_per_second": 12.215,
"eval_steps_per_second": 0.383,
"eval_token_acc": 0.8321749696233293,
"step": 500
},
{
"epoch": 0.459045904590459,
"grad_norm": 2.1366748809814453,
"learning_rate": 1.921679789667429e-05,
"loss": 0.4868021965026855,
"memory(GiB)": 74.54,
"step": 510,
"token_acc": 0.8326992287917738,
"train_speed(iter/s)": 0.040314
},
{
"epoch": 0.46804680468046805,
"grad_norm": 2.1436243057250977,
"learning_rate": 1.9178665058212306e-05,
"loss": 0.4831557273864746,
"memory(GiB)": 74.54,
"step": 520,
"token_acc": 0.8337397472844159,
"train_speed(iter/s)": 0.04031
},
{
"epoch": 0.47704770477047703,
"grad_norm": 1.887610673904419,
"learning_rate": 1.9139665528442544e-05,
"loss": 0.4900979995727539,
"memory(GiB)": 74.54,
"step": 530,
"token_acc": 0.8252338580880675,
"train_speed(iter/s)": 0.040315
},
{
"epoch": 0.48604860486048607,
"grad_norm": 1.778539776802063,
"learning_rate": 1.909980298987802e-05,
"loss": 0.4595688819885254,
"memory(GiB)": 74.54,
"step": 540,
"token_acc": 0.8390126692878986,
"train_speed(iter/s)": 0.040313
},
{
"epoch": 0.49504950495049505,
"grad_norm": 2.031074285507202,
"learning_rate": 1.9059081206520954e-05,
"loss": 0.47982397079467776,
"memory(GiB)": 74.54,
"step": 550,
"token_acc": 0.8332963374028857,
"train_speed(iter/s)": 0.040319
},
{
"epoch": 0.504050405040504,
"grad_norm": 1.7411119937896729,
"learning_rate": 1.9017504023507366e-05,
"loss": 0.47092242240905763,
"memory(GiB)": 74.54,
"step": 560,
"token_acc": 0.8331826401446655,
"train_speed(iter/s)": 0.040327
},
{
"epoch": 0.513051305130513,
"grad_norm": 1.9507403373718262,
"learning_rate": 1.897507536674401e-05,
"loss": 0.473051929473877,
"memory(GiB)": 74.54,
"step": 570,
"token_acc": 0.8324808184143222,
"train_speed(iter/s)": 0.040337
},
{
"epoch": 0.5220522052205221,
"grad_norm": 1.8194775581359863,
"learning_rate": 1.8931799242537664e-05,
"loss": 0.4804567813873291,
"memory(GiB)": 74.54,
"step": 580,
"token_acc": 0.8376344086021505,
"train_speed(iter/s)": 0.04034
},
{
"epoch": 0.5310531053105311,
"grad_norm": 1.663552165031433,
"learning_rate": 1.8887679737216835e-05,
"loss": 0.4625405311584473,
"memory(GiB)": 74.54,
"step": 590,
"token_acc": 0.8455850369725968,
"train_speed(iter/s)": 0.04034
},
{
"epoch": 0.54005400540054,
"grad_norm": 1.968461036682129,
"learning_rate": 1.8842721016745905e-05,
"loss": 0.4602372646331787,
"memory(GiB)": 74.54,
"step": 600,
"token_acc": 0.8317933641327173,
"train_speed(iter/s)": 0.040343
},
{
"epoch": 0.549054905490549,
"grad_norm": 1.9484490156173706,
"learning_rate": 1.8796927326331783e-05,
"loss": 0.45257129669189455,
"memory(GiB)": 74.54,
"step": 610,
"token_acc": 0.8373316498316499,
"train_speed(iter/s)": 0.040343
},
{
"epoch": 0.558055805580558,
"grad_norm": 2.0010809898376465,
"learning_rate": 1.8750302990023023e-05,
"loss": 0.4624796390533447,
"memory(GiB)": 74.54,
"step": 620,
"token_acc": 0.8330117899249732,
"train_speed(iter/s)": 0.04035
},
{
"epoch": 0.5670567056705671,
"grad_norm": 2.1292455196380615,
"learning_rate": 1.8702852410301556e-05,
"loss": 0.4666603565216064,
"memory(GiB)": 74.54,
"step": 630,
"token_acc": 0.8413180143073922,
"train_speed(iter/s)": 0.040354
},
{
"epoch": 0.5760576057605761,
"grad_norm": 1.8475803136825562,
"learning_rate": 1.865458006766696e-05,
"loss": 0.4536900520324707,
"memory(GiB)": 74.54,
"step": 640,
"token_acc": 0.8346206269877329,
"train_speed(iter/s)": 0.040359
},
{
"epoch": 0.585058505850585,
"grad_norm": 1.9390885829925537,
"learning_rate": 1.860549052021342e-05,
"loss": 0.4544112205505371,
"memory(GiB)": 74.54,
"step": 650,
"token_acc": 0.8367626886145405,
"train_speed(iter/s)": 0.040355
},
{
"epoch": 0.594059405940594,
"grad_norm": 1.7429540157318115,
"learning_rate": 1.8555588403199304e-05,
"loss": 0.4384955406188965,
"memory(GiB)": 74.54,
"step": 660,
"token_acc": 0.8417298261257244,
"train_speed(iter/s)": 0.04035
},
{
"epoch": 0.603060306030603,
"grad_norm": 2.0337181091308594,
"learning_rate": 1.8504878428609506e-05,
"loss": 0.46024494171142577,
"memory(GiB)": 74.54,
"step": 670,
"token_acc": 0.8392979256895373,
"train_speed(iter/s)": 0.040343
},
{
"epoch": 0.6120612061206121,
"grad_norm": 1.9363151788711548,
"learning_rate": 1.8453365384710506e-05,
"loss": 0.4446521759033203,
"memory(GiB)": 74.54,
"step": 680,
"token_acc": 0.8308807379749615,
"train_speed(iter/s)": 0.04034
},
{
"epoch": 0.6210621062106211,
"grad_norm": 1.9249675273895264,
"learning_rate": 1.8401054135598228e-05,
"loss": 0.44910879135131837,
"memory(GiB)": 74.54,
"step": 690,
"token_acc": 0.8436960276338514,
"train_speed(iter/s)": 0.040347
},
{
"epoch": 0.6300630063006301,
"grad_norm": 2.0293335914611816,
"learning_rate": 1.834794962073878e-05,
"loss": 0.4501783847808838,
"memory(GiB)": 74.54,
"step": 700,
"token_acc": 0.8366346742903819,
"train_speed(iter/s)": 0.040353
},
{
"epoch": 0.639063906390639,
"grad_norm": 2.1260316371917725,
"learning_rate": 1.829405685450202e-05,
"loss": 0.4506657600402832,
"memory(GiB)": 74.54,
"step": 710,
"token_acc": 0.8333333333333334,
"train_speed(iter/s)": 0.040362
},
{
"epoch": 0.648064806480648,
"grad_norm": 1.8729071617126465,
"learning_rate": 1.8239380925688087e-05,
"loss": 0.4430402755737305,
"memory(GiB)": 74.54,
"step": 720,
"token_acc": 0.8478399659502022,
"train_speed(iter/s)": 0.040365
},
{
"epoch": 0.6570657065706571,
"grad_norm": 1.9187947511672974,
"learning_rate": 1.8183926997046905e-05,
"loss": 0.4478912353515625,
"memory(GiB)": 74.54,
"step": 730,
"token_acc": 0.8519141775347077,
"train_speed(iter/s)": 0.040364
},
{
"epoch": 0.6660666066606661,
"grad_norm": 2.07631254196167,
"learning_rate": 1.812770030479066e-05,
"loss": 0.4402505397796631,
"memory(GiB)": 74.54,
"step": 740,
"token_acc": 0.8526605893576426,
"train_speed(iter/s)": 0.040366
},
{
"epoch": 0.6750675067506751,
"grad_norm": 1.8189442157745361,
"learning_rate": 1.8070706158099417e-05,
"loss": 0.4404914855957031,
"memory(GiB)": 74.54,
"step": 750,
"token_acc": 0.8409304511278195,
"train_speed(iter/s)": 0.040367
},
{
"epoch": 0.684068406840684,
"grad_norm": 1.9871678352355957,
"learning_rate": 1.8012949938619756e-05,
"loss": 0.4483049392700195,
"memory(GiB)": 74.54,
"step": 760,
"token_acc": 0.8431750106974754,
"train_speed(iter/s)": 0.040371
},
{
"epoch": 0.693069306930693,
"grad_norm": 1.8938976526260376,
"learning_rate": 1.7954437099956657e-05,
"loss": 0.44423818588256836,
"memory(GiB)": 74.54,
"step": 770,
"token_acc": 0.8477157360406091,
"train_speed(iter/s)": 0.040371
},
{
"epoch": 0.7020702070207021,
"grad_norm": 1.8947218656539917,
"learning_rate": 1.7895173167158514e-05,
"loss": 0.4492767333984375,
"memory(GiB)": 74.54,
"step": 780,
"token_acc": 0.837278737470676,
"train_speed(iter/s)": 0.040374
},
{
"epoch": 0.7110711071107111,
"grad_norm": 1.9695574045181274,
"learning_rate": 1.7835163736195447e-05,
"loss": 0.44904842376708987,
"memory(GiB)": 74.54,
"step": 790,
"token_acc": 0.8408003479773815,
"train_speed(iter/s)": 0.040375
},
{
"epoch": 0.7200720072007201,
"grad_norm": 2.00817608833313,
"learning_rate": 1.777441447343091e-05,
"loss": 0.45390868186950684,
"memory(GiB)": 74.54,
"step": 800,
"token_acc": 0.8411726099321811,
"train_speed(iter/s)": 0.040379
},
{
"epoch": 0.729072907290729,
"grad_norm": 2.0400583744049072,
"learning_rate": 1.7712931115086633e-05,
"loss": 0.4411576747894287,
"memory(GiB)": 74.54,
"step": 810,
"token_acc": 0.8399218071242398,
"train_speed(iter/s)": 0.04038
},
{
"epoch": 0.738073807380738,
"grad_norm": 2.0157155990600586,
"learning_rate": 1.7650719466700994e-05,
"loss": 0.44756488800048827,
"memory(GiB)": 74.54,
"step": 820,
"token_acc": 0.842788038698329,
"train_speed(iter/s)": 0.040376
},
{
"epoch": 0.7470747074707471,
"grad_norm": 1.7088335752487183,
"learning_rate": 1.7587785402580828e-05,
"loss": 0.43597002029418946,
"memory(GiB)": 74.54,
"step": 830,
"token_acc": 0.8466036887089519,
"train_speed(iter/s)": 0.040379
},
{
"epoch": 0.7560756075607561,
"grad_norm": 2.1911604404449463,
"learning_rate": 1.752413486524675e-05,
"loss": 0.44062347412109376,
"memory(GiB)": 74.54,
"step": 840,
"token_acc": 0.8505315822388994,
"train_speed(iter/s)": 0.040375
},
{
"epoch": 0.7650765076507651,
"grad_norm": 2.0149848461151123,
"learning_rate": 1.7459773864872042e-05,
"loss": 0.4424751281738281,
"memory(GiB)": 74.54,
"step": 850,
"token_acc": 0.8476879246110015,
"train_speed(iter/s)": 0.040376
},
{
"epoch": 0.774077407740774,
"grad_norm": 1.9014195203781128,
"learning_rate": 1.7394708478715127e-05,
"loss": 0.4621281623840332,
"memory(GiB)": 74.54,
"step": 860,
"token_acc": 0.8423601937472479,
"train_speed(iter/s)": 0.040378
},
{
"epoch": 0.783078307830783,
"grad_norm": 2.0565760135650635,
"learning_rate": 1.7328944850545745e-05,
"loss": 0.4593350410461426,
"memory(GiB)": 74.54,
"step": 870,
"token_acc": 0.8399521531100479,
"train_speed(iter/s)": 0.040378
},
{
"epoch": 0.7920792079207921,
"grad_norm": 2.0428307056427,
"learning_rate": 1.7262489190064818e-05,
"loss": 0.43943395614624026,
"memory(GiB)": 74.54,
"step": 880,
"token_acc": 0.8423470453121737,
"train_speed(iter/s)": 0.04038
},
{
"epoch": 0.8010801080108011,
"grad_norm": 2.316945791244507,
"learning_rate": 1.7195347772318116e-05,
"loss": 0.43985910415649415,
"memory(GiB)": 74.54,
"step": 890,
"token_acc": 0.8351231838281743,
"train_speed(iter/s)": 0.040379
},
{
"epoch": 0.8100810081008101,
"grad_norm": 2.041092872619629,
"learning_rate": 1.7127526937103713e-05,
"loss": 0.4424757957458496,
"memory(GiB)": 74.54,
"step": 900,
"token_acc": 0.841919080256467,
"train_speed(iter/s)": 0.04038
},
{
"epoch": 0.819081908190819,
"grad_norm": 2.2010583877563477,
"learning_rate": 1.705903308837339e-05,
"loss": 0.4423489570617676,
"memory(GiB)": 74.54,
"step": 910,
"token_acc": 0.8436460412508316,
"train_speed(iter/s)": 0.040384
},
{
"epoch": 0.828082808280828,
"grad_norm": 1.804849624633789,
"learning_rate": 1.6989872693627916e-05,
"loss": 0.43178791999816896,
"memory(GiB)": 74.54,
"step": 920,
"token_acc": 0.8569312169312169,
"train_speed(iter/s)": 0.040391
},
{
"epoch": 0.8370837083708371,
"grad_norm": 2.3260996341705322,
"learning_rate": 1.6920052283306364e-05,
"loss": 0.4507165431976318,
"memory(GiB)": 74.54,
"step": 930,
"token_acc": 0.8385640099345225,
"train_speed(iter/s)": 0.040389
},
{
"epoch": 0.8460846084608461,
"grad_norm": 2.029878616333008,
"learning_rate": 1.684957845016949e-05,
"loss": 0.423465633392334,
"memory(GiB)": 74.54,
"step": 940,
"token_acc": 0.8474983613720778,
"train_speed(iter/s)": 0.040396
},
{
"epoch": 0.8550855085508551,
"grad_norm": 2.0568127632141113,
"learning_rate": 1.677845784867719e-05,
"loss": 0.426534366607666,
"memory(GiB)": 74.54,
"step": 950,
"token_acc": 0.8443046506403056,
"train_speed(iter/s)": 0.040397
},
{
"epoch": 0.8640864086408641,
"grad_norm": 2.0447678565979004,
"learning_rate": 1.6706697194360186e-05,
"loss": 0.43904976844787597,
"memory(GiB)": 74.54,
"step": 960,
"token_acc": 0.843986543313709,
"train_speed(iter/s)": 0.040403
},
{
"epoch": 0.873087308730873,
"grad_norm": 1.8627592325210571,
"learning_rate": 1.6634303263185885e-05,
"loss": 0.4334832191467285,
"memory(GiB)": 74.54,
"step": 970,
"token_acc": 0.8500109003706126,
"train_speed(iter/s)": 0.040406
},
{
"epoch": 0.8820882088208821,
"grad_norm": 2.1592624187469482,
"learning_rate": 1.656128289091859e-05,
"loss": 0.43813695907592776,
"memory(GiB)": 74.54,
"step": 980,
"token_acc": 0.8389203308663474,
"train_speed(iter/s)": 0.040402
},
{
"epoch": 0.8910891089108911,
"grad_norm": 1.7612345218658447,
"learning_rate": 1.6487642972474006e-05,
"loss": 0.43879289627075196,
"memory(GiB)": 74.54,
"step": 990,
"token_acc": 0.8460222412318221,
"train_speed(iter/s)": 0.040402
},
{
"epoch": 0.9000900090009001,
"grad_norm": 2.0122318267822266,
"learning_rate": 1.641339046126822e-05,
"loss": 0.4455322265625,
"memory(GiB)": 74.54,
"step": 1000,
"token_acc": 0.8455068614431164,
"train_speed(iter/s)": 0.040397
},
{
"epoch": 0.9000900090009001,
"eval_loss": 0.43926388025283813,
"eval_runtime": 113.4684,
"eval_samples_per_second": 12.647,
"eval_steps_per_second": 0.397,
"eval_token_acc": 0.8410206561360875,
"step": 1000
},
{
"epoch": 0.9090909090909091,
"grad_norm": 2.066300630569458,
"learning_rate": 1.6338532368561105e-05,
"loss": 0.4375774383544922,
"memory(GiB)": 74.54,
"step": 1010,
"token_acc": 0.8390414378432351,
"train_speed(iter/s)": 0.040187
},
{
"epoch": 0.918091809180918,
"grad_norm": 2.2568578720092773,
"learning_rate": 1.62630757627943e-05,
"loss": 0.4385653495788574,
"memory(GiB)": 74.54,
"step": 1020,
"token_acc": 0.8342480790340285,
"train_speed(iter/s)": 0.040185
},
{
"epoch": 0.9270927092709271,
"grad_norm": 1.963052749633789,
"learning_rate": 1.6187027768923767e-05,
"loss": 0.43105306625366213,
"memory(GiB)": 74.54,
"step": 1030,
"token_acc": 0.8509454949944383,
"train_speed(iter/s)": 0.040187
},
{
"epoch": 0.9360936093609361,
"grad_norm": 1.902685523033142,
"learning_rate": 1.6110395567747025e-05,
"loss": 0.4382938385009766,
"memory(GiB)": 74.54,
"step": 1040,
"token_acc": 0.8346938775510204,
"train_speed(iter/s)": 0.040185
},
{
"epoch": 0.9450945094509451,
"grad_norm": 1.8732327222824097,
"learning_rate": 1.6033186395225095e-05,
"loss": 0.41572961807250974,
"memory(GiB)": 74.54,
"step": 1050,
"token_acc": 0.85475935828877,
"train_speed(iter/s)": 0.04019
},
{
"epoch": 0.9540954095409541,
"grad_norm": 1.869422197341919,
"learning_rate": 1.5955407541799274e-05,
"loss": 0.43001718521118165,
"memory(GiB)": 74.54,
"step": 1060,
"token_acc": 0.8342636324602833,
"train_speed(iter/s)": 0.040189
},
{
"epoch": 0.963096309630963,
"grad_norm": 2.065873861312866,
"learning_rate": 1.5877066351702707e-05,
"loss": 0.43995866775512693,
"memory(GiB)": 74.54,
"step": 1070,
"token_acc": 0.8477516059957173,
"train_speed(iter/s)": 0.040194
},
{
"epoch": 0.9720972097209721,
"grad_norm": 2.1846609115600586,
"learning_rate": 1.5798170222266933e-05,
"loss": 0.4312899589538574,
"memory(GiB)": 74.54,
"step": 1080,
"token_acc": 0.8568353067814855,
"train_speed(iter/s)": 0.040196
},
{
"epoch": 0.9810981098109811,
"grad_norm": 2.151474714279175,
"learning_rate": 1.571872660322338e-05,
"loss": 0.431905460357666,
"memory(GiB)": 74.54,
"step": 1090,
"token_acc": 0.8473539953615855,
"train_speed(iter/s)": 0.040202
},
{
"epoch": 0.9900990099009901,
"grad_norm": 2.0136258602142334,
"learning_rate": 1.563874299599995e-05,
"loss": 0.4207723140716553,
"memory(GiB)": 74.54,
"step": 1100,
"token_acc": 0.8404571428571429,
"train_speed(iter/s)": 0.040206
},
{
"epoch": 0.9990999099909991,
"grad_norm": 2.0286359786987305,
"learning_rate": 1.555822695301266e-05,
"loss": 0.41998815536499023,
"memory(GiB)": 74.54,
"step": 1110,
"token_acc": 0.8462002412545235,
"train_speed(iter/s)": 0.040207
},
{
"epoch": 1.008100810081008,
"grad_norm": 2.1229543685913086,
"learning_rate": 1.5477186076952567e-05,
"loss": 0.41786656379699705,
"memory(GiB)": 74.54,
"step": 1120,
"token_acc": 0.8457294195541823,
"train_speed(iter/s)": 0.040226
},
{
"epoch": 1.0171017101710171,
"grad_norm": 2.2496182918548584,
"learning_rate": 1.5395628020067825e-05,
"loss": 0.41992764472961425,
"memory(GiB)": 74.54,
"step": 1130,
"token_acc": 0.8452407614781635,
"train_speed(iter/s)": 0.040225
},
{
"epoch": 1.026102610261026,
"grad_norm": 2.0818288326263428,
"learning_rate": 1.531356048344117e-05,
"loss": 0.41519851684570314,
"memory(GiB)": 74.54,
"step": 1140,
"token_acc": 0.8480816145486804,
"train_speed(iter/s)": 0.040226
},
{
"epoch": 1.035103510351035,
"grad_norm": 1.9498157501220703,
"learning_rate": 1.523099121626273e-05,
"loss": 0.4007615089416504,
"memory(GiB)": 74.54,
"step": 1150,
"token_acc": 0.8642224012892828,
"train_speed(iter/s)": 0.040229
},
{
"epoch": 1.0441044104410442,
"grad_norm": 2.238085985183716,
"learning_rate": 1.5147928015098309e-05,
"loss": 0.416591739654541,
"memory(GiB)": 74.54,
"step": 1160,
"token_acc": 0.8449678800856532,
"train_speed(iter/s)": 0.040231
},
{
"epoch": 1.053105310531053,
"grad_norm": 1.884536862373352,
"learning_rate": 1.506437872315321e-05,
"loss": 0.4058389663696289,
"memory(GiB)": 74.54,
"step": 1170,
"token_acc": 0.8544316996871741,
"train_speed(iter/s)": 0.040234
},
{
"epoch": 1.0621062106210621,
"grad_norm": 2.506772041320801,
"learning_rate": 1.4980351229531642e-05,
"loss": 0.4066319465637207,
"memory(GiB)": 74.54,
"step": 1180,
"token_acc": 0.8476423487544484,
"train_speed(iter/s)": 0.040236
},
{
"epoch": 1.071107110711071,
"grad_norm": 2.208542823791504,
"learning_rate": 1.4895853468491779e-05,
"loss": 0.4183638572692871,
"memory(GiB)": 74.54,
"step": 1190,
"token_acc": 0.8479634066652145,
"train_speed(iter/s)": 0.040233
},
{
"epoch": 1.08010801080108,
"grad_norm": 2.0623791217803955,
"learning_rate": 1.4810893418696595e-05,
"loss": 0.4236001014709473,
"memory(GiB)": 74.54,
"step": 1200,
"token_acc": 0.8621627274628739,
"train_speed(iter/s)": 0.040231
},
{
"epoch": 1.0891089108910892,
"grad_norm": 1.9633852243423462,
"learning_rate": 1.4725479102460467e-05,
"loss": 0.4070269584655762,
"memory(GiB)": 74.54,
"step": 1210,
"token_acc": 0.8519945602901179,
"train_speed(iter/s)": 0.040233
},
{
"epoch": 1.098109810981098,
"grad_norm": 2.425140857696533,
"learning_rate": 1.4639618584991679e-05,
"loss": 0.4048626899719238,
"memory(GiB)": 74.54,
"step": 1220,
"token_acc": 0.8575699338031176,
"train_speed(iter/s)": 0.040237
},
{
"epoch": 1.1071107110711071,
"grad_norm": 1.9179662466049194,
"learning_rate": 1.455331997363086e-05,
"loss": 0.41301331520080564,
"memory(GiB)": 74.54,
"step": 1230,
"token_acc": 0.8553283100107643,
"train_speed(iter/s)": 0.040242
},
{
"epoch": 1.116111611161116,
"grad_norm": 2.332228660583496,
"learning_rate": 1.4466591417085462e-05,
"loss": 0.4197710037231445,
"memory(GiB)": 74.54,
"step": 1240,
"token_acc": 0.8447427293064877,
"train_speed(iter/s)": 0.040246
},
{
"epoch": 1.125112511251125,
"grad_norm": 2.093475580215454,
"learning_rate": 1.4379441104660313e-05,
"loss": 0.4093982696533203,
"memory(GiB)": 74.54,
"step": 1250,
"token_acc": 0.8562723261189326,
"train_speed(iter/s)": 0.040245
},
{
"epoch": 1.1341134113411342,
"grad_norm": 2.2746119499206543,
"learning_rate": 1.4291877265484352e-05,
"loss": 0.4102977752685547,
"memory(GiB)": 74.54,
"step": 1260,
"token_acc": 0.854287556415216,
"train_speed(iter/s)": 0.040249
},
{
"epoch": 1.143114311431143,
"grad_norm": 2.2232649326324463,
"learning_rate": 1.4203908167733596e-05,
"loss": 0.418546724319458,
"memory(GiB)": 74.54,
"step": 1270,
"token_acc": 0.8427280550774526,
"train_speed(iter/s)": 0.040255
},
{
"epoch": 1.1521152115211521,
"grad_norm": 1.9787334203720093,
"learning_rate": 1.4115542117850415e-05,
"loss": 0.410016393661499,
"memory(GiB)": 74.54,
"step": 1280,
"token_acc": 0.86048545812377,
"train_speed(iter/s)": 0.040258
},
{
"epoch": 1.161116111611161,
"grad_norm": 2.3660764694213867,
"learning_rate": 1.4026787459759215e-05,
"loss": 0.4094221591949463,
"memory(GiB)": 74.54,
"step": 1290,
"token_acc": 0.8500684618895481,
"train_speed(iter/s)": 0.040257
},
{
"epoch": 1.17011701170117,
"grad_norm": 2.0939202308654785,
"learning_rate": 1.3937652574078543e-05,
"loss": 0.40435123443603516,
"memory(GiB)": 74.54,
"step": 1300,
"token_acc": 0.8442178346712953,
"train_speed(iter/s)": 0.040258
},
{
"epoch": 1.1791179117911792,
"grad_norm": 2.3308207988739014,
"learning_rate": 1.3848145877329778e-05,
"loss": 0.4132570743560791,
"memory(GiB)": 74.54,
"step": 1310,
"token_acc": 0.8504208935894668,
"train_speed(iter/s)": 0.040261
},
{
"epoch": 1.188118811881188,
"grad_norm": 2.053710460662842,
"learning_rate": 1.3758275821142382e-05,
"loss": 0.39916296005249025,
"memory(GiB)": 74.54,
"step": 1320,
"token_acc": 0.8543060651845457,
"train_speed(iter/s)": 0.04026
},
{
"epoch": 1.1971197119711972,
"grad_norm": 2.4674737453460693,
"learning_rate": 1.3668050891455873e-05,
"loss": 0.3984804630279541,
"memory(GiB)": 74.54,
"step": 1330,
"token_acc": 0.8585640138408305,
"train_speed(iter/s)": 0.040259
},
{
"epoch": 1.206120612061206,
"grad_norm": 2.1947102546691895,
"learning_rate": 1.357747960771854e-05,
"loss": 0.42041912078857424,
"memory(GiB)": 74.54,
"step": 1340,
"token_acc": 0.8391608391608392,
"train_speed(iter/s)": 0.040262
},
{
"epoch": 1.215121512151215,
"grad_norm": 2.0035359859466553,
"learning_rate": 1.3486570522082989e-05,
"loss": 0.4119097709655762,
"memory(GiB)": 74.54,
"step": 1350,
"token_acc": 0.8620765508139023,
"train_speed(iter/s)": 0.040265
},
{
"epoch": 1.2241224122412242,
"grad_norm": 2.161275863647461,
"learning_rate": 1.3395332218598629e-05,
"loss": 0.4057816982269287,
"memory(GiB)": 74.54,
"step": 1360,
"token_acc": 0.8410107334525939,
"train_speed(iter/s)": 0.040268
},
{
"epoch": 1.233123312331233,
"grad_norm": 2.300550937652588,
"learning_rate": 1.3303773312401107e-05,
"loss": 0.40541529655456543,
"memory(GiB)": 74.54,
"step": 1370,
"token_acc": 0.8559489773477018,
"train_speed(iter/s)": 0.040269
},
{
"epoch": 1.2421242124212422,
"grad_norm": 2.306222915649414,
"learning_rate": 1.3211902448898841e-05,
"loss": 0.40516185760498047,
"memory(GiB)": 74.54,
"step": 1380,
"token_acc": 0.8569854561480829,
"train_speed(iter/s)": 0.04027
},
{
"epoch": 1.251125112511251,
"grad_norm": 2.1976640224456787,
"learning_rate": 1.3119728302956676e-05,
"loss": 0.4062767505645752,
"memory(GiB)": 74.54,
"step": 1390,
"token_acc": 0.8493668073761387,
"train_speed(iter/s)": 0.040273
},
{
"epoch": 1.2601260126012601,
"grad_norm": 2.333188056945801,
"learning_rate": 1.302725957807676e-05,
"loss": 0.39322872161865235,
"memory(GiB)": 74.54,
"step": 1400,
"token_acc": 0.860806663743972,
"train_speed(iter/s)": 0.040272
},
{
"epoch": 1.2691269126912692,
"grad_norm": 2.356128215789795,
"learning_rate": 1.2934505005576738e-05,
"loss": 0.39969046115875245,
"memory(GiB)": 74.54,
"step": 1410,
"token_acc": 0.8573583279465632,
"train_speed(iter/s)": 0.040268
},
{
"epoch": 1.278127812781278,
"grad_norm": 2.1411805152893066,
"learning_rate": 1.2841473343765269e-05,
"loss": 0.39504408836364746,
"memory(GiB)": 74.54,
"step": 1420,
"token_acc": 0.8612200435729848,
"train_speed(iter/s)": 0.040269
},
{
"epoch": 1.2871287128712872,
"grad_norm": 2.187964677810669,
"learning_rate": 1.274817337711506e-05,
"loss": 0.4120161056518555,
"memory(GiB)": 74.54,
"step": 1430,
"token_acc": 0.849435382685069,
"train_speed(iter/s)": 0.040272
},
{
"epoch": 1.296129612961296,
"grad_norm": 2.098618745803833,
"learning_rate": 1.2654613915433373e-05,
"loss": 0.39701004028320314,
"memory(GiB)": 74.54,
"step": 1440,
"token_acc": 0.8512253307308609,
"train_speed(iter/s)": 0.040274
},
{
"epoch": 1.3051305130513051,
"grad_norm": 2.000491142272949,
"learning_rate": 1.2560803793030179e-05,
"loss": 0.40303592681884765,
"memory(GiB)": 74.54,
"step": 1450,
"token_acc": 0.8583260680034873,
"train_speed(iter/s)": 0.040274
},
{
"epoch": 1.3141314131413142,
"grad_norm": 2.1380844116210938,
"learning_rate": 1.2466751867883959e-05,
"loss": 0.397491455078125,
"memory(GiB)": 74.54,
"step": 1460,
"token_acc": 0.8592755214050494,
"train_speed(iter/s)": 0.040276
},
{
"epoch": 1.323132313231323,
"grad_norm": 2.110633611679077,
"learning_rate": 1.2372467020805332e-05,
"loss": 0.4155548095703125,
"memory(GiB)": 74.54,
"step": 1470,
"token_acc": 0.8501522401043932,
"train_speed(iter/s)": 0.040278
},
{
"epoch": 1.3321332133213322,
"grad_norm": 2.1096761226654053,
"learning_rate": 1.2277958154598444e-05,
"loss": 0.41139373779296873,
"memory(GiB)": 74.54,
"step": 1480,
"token_acc": 0.8384369287020109,
"train_speed(iter/s)": 0.040279
},
{
"epoch": 1.341134113411341,
"grad_norm": 2.346917152404785,
"learning_rate": 1.2183234193220362e-05,
"loss": 0.3898932456970215,
"memory(GiB)": 74.54,
"step": 1490,
"token_acc": 0.8620309050772627,
"train_speed(iter/s)": 0.04028
},
{
"epoch": 1.3501350135013501,
"grad_norm": 2.1962385177612305,
"learning_rate": 1.2088304080938404e-05,
"loss": 0.3953920841217041,
"memory(GiB)": 74.54,
"step": 1500,
"token_acc": 0.8660930950805207,
"train_speed(iter/s)": 0.040278
},
{
"epoch": 1.3501350135013501,
"eval_loss": 0.42292386293411255,
"eval_runtime": 112.5032,
"eval_samples_per_second": 12.755,
"eval_steps_per_second": 0.4,
"eval_token_acc": 0.8482138517618469,
"step": 1500
},
{
"epoch": 1.3591359135913592,
"grad_norm": 2.1046359539031982,
"learning_rate": 1.1993176781485608e-05,
"loss": 0.4179078578948975,
"memory(GiB)": 74.54,
"step": 1510,
"token_acc": 0.8453704665904603,
"train_speed(iter/s)": 0.040153
},
{
"epoch": 1.368136813681368,
"grad_norm": 2.0981786251068115,
"learning_rate": 1.1897861277214304e-05,
"loss": 0.38443617820739745,
"memory(GiB)": 74.54,
"step": 1520,
"token_acc": 0.8514383855732074,
"train_speed(iter/s)": 0.040151
},
{
"epoch": 1.3771377137713772,
"grad_norm": 2.335702419281006,
"learning_rate": 1.1802366568247998e-05,
"loss": 0.39206039905548096,
"memory(GiB)": 74.54,
"step": 1530,
"token_acc": 0.8556973163220414,
"train_speed(iter/s)": 0.040152
},
{
"epoch": 1.386138613861386,
"grad_norm": 2.2659618854522705,
"learning_rate": 1.1706701671631504e-05,
"loss": 0.39416942596435545,
"memory(GiB)": 74.54,
"step": 1540,
"token_acc": 0.8575920934411501,
"train_speed(iter/s)": 0.040154
},
{
"epoch": 1.3951395139513951,
"grad_norm": 2.3435161113739014,
"learning_rate": 1.1610875620479531e-05,
"loss": 0.4044766426086426,
"memory(GiB)": 74.54,
"step": 1550,
"token_acc": 0.8510254676583277,
"train_speed(iter/s)": 0.040156
},
{
"epoch": 1.4041404140414042,
"grad_norm": 2.155761241912842,
"learning_rate": 1.1514897463123735e-05,
"loss": 0.39972786903381347,
"memory(GiB)": 74.54,
"step": 1560,
"token_acc": 0.858606101091071,
"train_speed(iter/s)": 0.040158
},
{
"epoch": 1.413141314131413,
"grad_norm": 2.231323719024658,
"learning_rate": 1.141877626225833e-05,
"loss": 0.4081737518310547,
"memory(GiB)": 74.54,
"step": 1570,
"token_acc": 0.8568965517241379,
"train_speed(iter/s)": 0.040158
},
{
"epoch": 1.4221422142214222,
"grad_norm": 2.0848968029022217,
"learning_rate": 1.1322521094084352e-05,
"loss": 0.4104423999786377,
"memory(GiB)": 74.54,
"step": 1580,
"token_acc": 0.8589771972548151,
"train_speed(iter/s)": 0.04016
},
{
"epoch": 1.431143114311431,
"grad_norm": 2.1602284908294678,
"learning_rate": 1.1226141047452628e-05,
"loss": 0.39746341705322263,
"memory(GiB)": 74.54,
"step": 1590,
"token_acc": 0.8528940745824755,
"train_speed(iter/s)": 0.040163
},
{
"epoch": 1.4401440144014401,
"grad_norm": 2.202800750732422,
"learning_rate": 1.1129645223005592e-05,
"loss": 0.3975072383880615,
"memory(GiB)": 74.54,
"step": 1600,
"token_acc": 0.85933056224021,
"train_speed(iter/s)": 0.040165
},
{
"epoch": 1.4491449144914492,
"grad_norm": 2.0750746726989746,
"learning_rate": 1.103304273231794e-05,
"loss": 0.4078987598419189,
"memory(GiB)": 74.54,
"step": 1610,
"token_acc": 0.8481820114820328,
"train_speed(iter/s)": 0.040169
},
{
"epoch": 1.458145814581458,
"grad_norm": 2.0705268383026123,
"learning_rate": 1.0936342697036276e-05,
"loss": 0.40749187469482423,
"memory(GiB)": 74.54,
"step": 1620,
"token_acc": 0.8431718061674008,
"train_speed(iter/s)": 0.04017
},
{
"epoch": 1.4671467146714672,
"grad_norm": 2.2939624786376953,
"learning_rate": 1.0839554248017816e-05,
"loss": 0.39917492866516113,
"memory(GiB)": 74.54,
"step": 1630,
"token_acc": 0.8533273981749387,
"train_speed(iter/s)": 0.040171
},
{
"epoch": 1.476147614761476,
"grad_norm": 2.232426166534424,
"learning_rate": 1.0742686524468193e-05,
"loss": 0.3895902156829834,
"memory(GiB)": 74.54,
"step": 1640,
"token_acc": 0.8666959964804224,
"train_speed(iter/s)": 0.040172
},
{
"epoch": 1.4851485148514851,
"grad_norm": 2.317064046859741,
"learning_rate": 1.0645748673078513e-05,
"loss": 0.4001925468444824,
"memory(GiB)": 74.54,
"step": 1650,
"token_acc": 0.8580047403576815,
"train_speed(iter/s)": 0.040177
},
{
"epoch": 1.4941494149414942,
"grad_norm": 2.4603018760681152,
"learning_rate": 1.0548749847161666e-05,
"loss": 0.4078868865966797,
"memory(GiB)": 74.54,
"step": 1660,
"token_acc": 0.8525682355469589,
"train_speed(iter/s)": 0.04018
},
{
"epoch": 1.5031503150315033,
"grad_norm": 2.2700588703155518,
"learning_rate": 1.0451699205788031e-05,
"loss": 0.3826925277709961,
"memory(GiB)": 74.54,
"step": 1670,
"token_acc": 0.8540529189416212,
"train_speed(iter/s)": 0.040177
},
{
"epoch": 1.5121512151215122,
"grad_norm": 2.1843454837799072,
"learning_rate": 1.0354605912920643e-05,
"loss": 0.39476428031921384,
"memory(GiB)": 74.54,
"step": 1680,
"token_acc": 0.8572723153602175,
"train_speed(iter/s)": 0.040177
},
{
"epoch": 1.521152115211521,
"grad_norm": 2.183195114135742,
"learning_rate": 1.0257479136549889e-05,
"loss": 0.4017205715179443,
"memory(GiB)": 74.54,
"step": 1690,
"token_acc": 0.858510389913612,
"train_speed(iter/s)": 0.040177
},
{
"epoch": 1.5301530153015301,
"grad_norm": 2.2219948768615723,
"learning_rate": 1.0160328047827805e-05,
"loss": 0.3950798988342285,
"memory(GiB)": 74.54,
"step": 1700,
"token_acc": 0.859968881973772,
"train_speed(iter/s)": 0.04018
},
{
"epoch": 1.5391539153915392,
"grad_norm": 2.1306684017181396,
"learning_rate": 1.006316182020213e-05,
"loss": 0.3851861238479614,
"memory(GiB)": 74.54,
"step": 1710,
"token_acc": 0.8605112384310268,
"train_speed(iter/s)": 0.040185
},
{
"epoch": 1.5481548154815483,
"grad_norm": 2.3634705543518066,
"learning_rate": 9.965989628550073e-06,
"loss": 0.3927136421203613,
"memory(GiB)": 74.54,
"step": 1720,
"token_acc": 0.8631741821396994,
"train_speed(iter/s)": 0.040185
},
{
"epoch": 1.5571557155715572,
"grad_norm": 2.1868417263031006,
"learning_rate": 9.868820648311998e-06,
"loss": 0.3937791585922241,
"memory(GiB)": 74.54,
"step": 1730,
"token_acc": 0.8506729331339458,
"train_speed(iter/s)": 0.04019
},
{
"epoch": 1.566156615661566,
"grad_norm": 2.058154344558716,
"learning_rate": 9.771664054625036e-06,
"loss": 0.4051863193511963,
"memory(GiB)": 74.54,
"step": 1740,
"token_acc": 0.8571127057830308,
"train_speed(iter/s)": 0.04019
},
{
"epoch": 1.5751575157515751,
"grad_norm": 2.278233051300049,
"learning_rate": 9.674529021456711e-06,
"loss": 0.3995014429092407,
"memory(GiB)": 74.54,
"step": 1750,
"token_acc": 0.8531134736385333,
"train_speed(iter/s)": 0.04019
},
{
"epoch": 1.5841584158415842,
"grad_norm": 2.4994163513183594,
"learning_rate": 9.577424720738725e-06,
"loss": 0.3964822769165039,
"memory(GiB)": 74.54,
"step": 1760,
"token_acc": 0.8614113159567705,
"train_speed(iter/s)": 0.040189
},
{
"epoch": 1.5931593159315933,
"grad_norm": 2.2877440452575684,
"learning_rate": 9.480360321500866e-06,
"loss": 0.3912468433380127,
"memory(GiB)": 74.54,
"step": 1770,
"token_acc": 0.8542329726288987,
"train_speed(iter/s)": 0.04019
},
{
"epoch": 1.6021602160216022,
"grad_norm": 2.2842419147491455,
"learning_rate": 9.38334498900525e-06,
"loss": 0.396860408782959,
"memory(GiB)": 74.54,
"step": 1780,
"token_acc": 0.8597612958226769,
"train_speed(iter/s)": 0.040193
},
{
"epoch": 1.611161116111611,
"grad_norm": 2.171830415725708,
"learning_rate": 9.28638788388088e-06,
"loss": 0.39132468700408934,
"memory(GiB)": 74.54,
"step": 1790,
"token_acc": 0.8446624087591241,
"train_speed(iter/s)": 0.040193
},
{
"epoch": 1.6201620162016201,
"grad_norm": 2.2504782676696777,
"learning_rate": 9.189498161258678e-06,
"loss": 0.39133219718933104,
"memory(GiB)": 74.54,
"step": 1800,
"token_acc": 0.8526747195858498,
"train_speed(iter/s)": 0.040193
},
{
"epoch": 1.6291629162916292,
"grad_norm": 2.2380685806274414,
"learning_rate": 9.092684969906994e-06,
"loss": 0.39520695209503176,
"memory(GiB)": 74.54,
"step": 1810,
"token_acc": 0.8510874389702618,
"train_speed(iter/s)": 0.040195
},
{
"epoch": 1.6381638163816383,
"grad_norm": 2.3991379737854004,
"learning_rate": 8.995957451367751e-06,
"loss": 0.39344358444213867,
"memory(GiB)": 74.54,
"step": 1820,
"token_acc": 0.8661971830985915,
"train_speed(iter/s)": 0.040196
},
{
"epoch": 1.6471647164716472,
"grad_norm": 2.167818307876587,
"learning_rate": 8.899324739093255e-06,
"loss": 0.38270139694213867,
"memory(GiB)": 74.54,
"step": 1830,
"token_acc": 0.8632143593975655,
"train_speed(iter/s)": 0.040195
},
{
"epoch": 1.656165616561656,
"grad_norm": 2.1482577323913574,
"learning_rate": 8.802795957583774e-06,
"loss": 0.38856942653656007,
"memory(GiB)": 74.54,
"step": 1840,
"token_acc": 0.8508108108108108,
"train_speed(iter/s)": 0.040197
},
{
"epoch": 1.6651665166516652,
"grad_norm": 2.223714828491211,
"learning_rate": 8.706380221525959e-06,
"loss": 0.3878568172454834,
"memory(GiB)": 74.54,
"step": 1850,
"token_acc": 0.8518351722585004,
"train_speed(iter/s)": 0.040198
},
{
"epoch": 1.6741674167416742,
"grad_norm": 2.1293275356292725,
"learning_rate": 8.610086634932195e-06,
"loss": 0.3860627174377441,
"memory(GiB)": 74.54,
"step": 1860,
"token_acc": 0.8636664460622104,
"train_speed(iter/s)": 0.0402
},
{
"epoch": 1.6831683168316833,
"grad_norm": 2.2796740531921387,
"learning_rate": 8.513924290280955e-06,
"loss": 0.4010897636413574,
"memory(GiB)": 74.54,
"step": 1870,
"token_acc": 0.8624,
"train_speed(iter/s)": 0.040198
},
{
"epoch": 1.6921692169216922,
"grad_norm": 2.063302516937256,
"learning_rate": 8.417902267658264e-06,
"loss": 0.3978671312332153,
"memory(GiB)": 74.54,
"step": 1880,
"token_acc": 0.8563941299790356,
"train_speed(iter/s)": 0.040199
},
{
"epoch": 1.701170117011701,
"grad_norm": 2.589029550552368,
"learning_rate": 8.322029633900293e-06,
"loss": 0.4007380485534668,
"memory(GiB)": 74.54,
"step": 1890,
"token_acc": 0.8558875219683656,
"train_speed(iter/s)": 0.040201
},
{
"epoch": 1.7101710171017102,
"grad_norm": 2.1972382068634033,
"learning_rate": 8.226315441737232e-06,
"loss": 0.39293272495269777,
"memory(GiB)": 74.54,
"step": 1900,
"token_acc": 0.8606382978723405,
"train_speed(iter/s)": 0.040201
},
{
"epoch": 1.7191719171917192,
"grad_norm": 2.1070621013641357,
"learning_rate": 8.130768728938503e-06,
"loss": 0.4030153274536133,
"memory(GiB)": 74.54,
"step": 1910,
"token_acc": 0.858612883309323,
"train_speed(iter/s)": 0.040199
},
{
"epoch": 1.7281728172817283,
"grad_norm": 2.4515891075134277,
"learning_rate": 8.035398517459367e-06,
"loss": 0.3846758842468262,
"memory(GiB)": 74.54,
"step": 1920,
"token_acc": 0.8604975587072774,
"train_speed(iter/s)": 0.040203
},
{
"epoch": 1.7371737173717372,
"grad_norm": 2.4625024795532227,
"learning_rate": 7.940213812589018e-06,
"loss": 0.3977564096450806,
"memory(GiB)": 74.54,
"step": 1930,
"token_acc": 0.8620689655172413,
"train_speed(iter/s)": 0.040207
},
{
"epoch": 1.746174617461746,
"grad_norm": 2.358564853668213,
"learning_rate": 7.84522360210028e-06,
"loss": 0.3818389415740967,
"memory(GiB)": 74.54,
"step": 1940,
"token_acc": 0.8622779519331244,
"train_speed(iter/s)": 0.040208
},
{
"epoch": 1.7551755175517552,
"grad_norm": 2.43326473236084,
"learning_rate": 7.750436855400924e-06,
"loss": 0.40569381713867186,
"memory(GiB)": 74.54,
"step": 1950,
"token_acc": 0.8431502316346791,
"train_speed(iter/s)": 0.040209
},
{
"epoch": 1.7641764176417642,
"grad_norm": 2.141272783279419,
"learning_rate": 7.655862522686759e-06,
"loss": 0.4061896324157715,
"memory(GiB)": 74.54,
"step": 1960,
"token_acc": 0.8561802484733628,
"train_speed(iter/s)": 0.040213
},
{
"epoch": 1.7731773177317733,
"grad_norm": 2.1799638271331787,
"learning_rate": 7.561509534096486e-06,
"loss": 0.3843768835067749,
"memory(GiB)": 74.54,
"step": 1970,
"token_acc": 0.8601476840456478,
"train_speed(iter/s)": 0.040213
},
{
"epoch": 1.7821782178217822,
"grad_norm": 2.2130813598632812,
"learning_rate": 7.467386798868492e-06,
"loss": 0.383782172203064,
"memory(GiB)": 74.54,
"step": 1980,
"token_acc": 0.8536738538831903,
"train_speed(iter/s)": 0.040213
},
{
"epoch": 1.791179117911791,
"grad_norm": 2.2999327182769775,
"learning_rate": 7.373503204499589e-06,
"loss": 0.3898015975952148,
"memory(GiB)": 74.54,
"step": 1990,
"token_acc": 0.8597833014659019,
"train_speed(iter/s)": 0.040213
},
{
"epoch": 1.8001800180018002,
"grad_norm": 2.0685296058654785,
"learning_rate": 7.279867615905836e-06,
"loss": 0.39383411407470703,
"memory(GiB)": 74.54,
"step": 2000,
"token_acc": 0.8522530329289428,
"train_speed(iter/s)": 0.040217
},
{
"epoch": 1.8001800180018002,
"eval_loss": 0.40739279985427856,
"eval_runtime": 113.0562,
"eval_samples_per_second": 12.693,
"eval_steps_per_second": 0.398,
"eval_token_acc": 0.8513244228432564,
"step": 2000
},
{
"epoch": 1.8091809180918093,
"grad_norm": 2.3695876598358154,
"learning_rate": 7.186488874585441e-06,
"loss": 0.38712072372436523,
"memory(GiB)": 76.18,
"step": 2010,
"token_acc": 0.8560460652591171,
"train_speed(iter/s)": 0.040111
},
{
"epoch": 1.8181818181818183,
"grad_norm": 2.2949750423431396,
"learning_rate": 7.093375797783935e-06,
"loss": 0.38932750225067136,
"memory(GiB)": 76.18,
"step": 2020,
"token_acc": 0.8515789473684211,
"train_speed(iter/s)": 0.040113
},
{
"epoch": 1.8271827182718272,
"grad_norm": 2.102889060974121,
"learning_rate": 7.0005371776615884e-06,
"loss": 0.3895460844039917,
"memory(GiB)": 76.18,
"step": 2030,
"token_acc": 0.8582169709989259,
"train_speed(iter/s)": 0.040117
},
{
"epoch": 1.836183618361836,
"grad_norm": 2.2533607482910156,
"learning_rate": 6.907981780463233e-06,
"loss": 0.3849326133728027,
"memory(GiB)": 76.18,
"step": 2040,
"token_acc": 0.8707364762111667,
"train_speed(iter/s)": 0.040118
},
{
"epoch": 1.8451845184518452,
"grad_norm": 2.058211326599121,
"learning_rate": 6.815718345690496e-06,
"loss": 0.38345019817352294,
"memory(GiB)": 76.18,
"step": 2050,
"token_acc": 0.85548358275631,
"train_speed(iter/s)": 0.040122
},
{
"epoch": 1.8541854185418543,
"grad_norm": 2.466780424118042,
"learning_rate": 6.72375558527659e-06,
"loss": 0.38396077156066893,
"memory(GiB)": 76.18,
"step": 2060,
"token_acc": 0.8563974591651543,
"train_speed(iter/s)": 0.040122
},
{
"epoch": 1.8631863186318633,
"grad_norm": 2.325998544692993,
"learning_rate": 6.632102182763681e-06,
"loss": 0.3884021759033203,
"memory(GiB)": 76.18,
"step": 2070,
"token_acc": 0.8589527027027027,
"train_speed(iter/s)": 0.040123
},
{
"epoch": 1.8721872187218722,
"grad_norm": 2.3079795837402344,
"learning_rate": 6.540766792482962e-06,
"loss": 0.4022721290588379,
"memory(GiB)": 76.18,
"step": 2080,
"token_acc": 0.8444188722669735,
"train_speed(iter/s)": 0.040126
},
{
"epoch": 1.881188118811881,
"grad_norm": 2.305443525314331,
"learning_rate": 6.449758038737458e-06,
"loss": 0.3774123668670654,
"memory(GiB)": 76.18,
"step": 2090,
"token_acc": 0.859161246916349,
"train_speed(iter/s)": 0.040128
},
{
"epoch": 1.8901890189018902,
"grad_norm": 2.306131362915039,
"learning_rate": 6.359084514987688e-06,
"loss": 0.38950314521789553,
"memory(GiB)": 76.18,
"step": 2100,
"token_acc": 0.8646680942184154,
"train_speed(iter/s)": 0.040128
},
{
"epoch": 1.8991899189918993,
"grad_norm": 2.5018227100372314,
"learning_rate": 6.268754783040228e-06,
"loss": 0.3790890693664551,
"memory(GiB)": 76.18,
"step": 2110,
"token_acc": 0.8660165359338563,
"train_speed(iter/s)": 0.040128
},
{
"epoch": 1.9081908190819084,
"grad_norm": 2.1461129188537598,
"learning_rate": 6.17877737223928e-06,
"loss": 0.37567844390869143,
"memory(GiB)": 76.18,
"step": 2120,
"token_acc": 0.8673469387755102,
"train_speed(iter/s)": 0.040129
},
{
"epoch": 1.9171917191719172,
"grad_norm": 2.1912460327148438,
"learning_rate": 6.089160778661262e-06,
"loss": 0.37552733421325685,
"memory(GiB)": 76.18,
"step": 2130,
"token_acc": 0.8715083798882681,
"train_speed(iter/s)": 0.040128
},
{
"epoch": 1.926192619261926,
"grad_norm": 2.2097115516662598,
"learning_rate": 5.999913464312606e-06,
"loss": 0.37886598110198977,
"memory(GiB)": 76.18,
"step": 2140,
"token_acc": 0.8663426488456865,
"train_speed(iter/s)": 0.040129
},
{
"epoch": 1.9351935193519352,
"grad_norm": 2.239027976989746,
"learning_rate": 5.911043856330701e-06,
"loss": 0.4021574020385742,
"memory(GiB)": 76.18,
"step": 2150,
"token_acc": 0.8618796662274923,
"train_speed(iter/s)": 0.040132
},
{
"epoch": 1.9441944194419443,
"grad_norm": 2.1112523078918457,
"learning_rate": 5.822560346188204e-06,
"loss": 0.3870594024658203,
"memory(GiB)": 76.18,
"step": 2160,
"token_acc": 0.8622662266226623,
"train_speed(iter/s)": 0.040134
},
{
"epoch": 1.9531953195319534,
"grad_norm": 2.1353354454040527,
"learning_rate": 5.7344712889006424e-06,
"loss": 0.38895013332366946,
"memory(GiB)": 76.18,
"step": 2170,
"token_acc": 0.8509840674789129,
"train_speed(iter/s)": 0.040134
},
{
"epoch": 1.9621962196219622,
"grad_norm": 2.064527988433838,
"learning_rate": 5.646785002237509e-06,
"loss": 0.3719027519226074,
"memory(GiB)": 76.18,
"step": 2180,
"token_acc": 0.8651858368154828,
"train_speed(iter/s)": 0.040134
},
{
"epoch": 1.971197119711971,
"grad_norm": 2.2494568824768066,
"learning_rate": 5.5595097659368765e-06,
"loss": 0.37720603942871095,
"memory(GiB)": 76.18,
"step": 2190,
"token_acc": 0.8660617844026788,
"train_speed(iter/s)": 0.040134
},
{
"epoch": 1.9801980198019802,
"grad_norm": 2.422858715057373,
"learning_rate": 5.472653820923564e-06,
"loss": 0.3978924036026001,
"memory(GiB)": 76.18,
"step": 2200,
"token_acc": 0.8567662565905096,
"train_speed(iter/s)": 0.040138
},
{
"epoch": 1.9891989198919893,
"grad_norm": 2.5676939487457275,
"learning_rate": 5.386225368530995e-06,
"loss": 0.39810938835144044,
"memory(GiB)": 76.18,
"step": 2210,
"token_acc": 0.8570179274158286,
"train_speed(iter/s)": 0.04014
},
{
"epoch": 1.9981998199819984,
"grad_norm": 2.2991700172424316,
"learning_rate": 5.300232569726805e-06,
"loss": 0.3851327657699585,
"memory(GiB)": 76.18,
"step": 2220,
"token_acc": 0.8624459120929173,
"train_speed(iter/s)": 0.040141
},
{
"epoch": 2.007200720072007,
"grad_norm": 2.1788246631622314,
"learning_rate": 5.2146835443422215e-06,
"loss": 0.3738105773925781,
"memory(GiB)": 76.18,
"step": 2230,
"token_acc": 0.8664259927797834,
"train_speed(iter/s)": 0.04015
},
{
"epoch": 2.016201620162016,
"grad_norm": 2.2583391666412354,
"learning_rate": 5.129586370305389e-06,
"loss": 0.37696280479431155,
"memory(GiB)": 76.18,
"step": 2240,
"token_acc": 0.8627628306579245,
"train_speed(iter/s)": 0.040149
},
{
"epoch": 2.025202520252025,
"grad_norm": 2.3937697410583496,
"learning_rate": 5.0449490828785745e-06,
"loss": 0.35777480602264405,
"memory(GiB)": 76.18,
"step": 2250,
"token_acc": 0.8723312486521457,
"train_speed(iter/s)": 0.040148
},
{
"epoch": 2.0342034203420343,
"grad_norm": 2.3122761249542236,
"learning_rate": 4.960779673899465e-06,
"loss": 0.3647487163543701,
"memory(GiB)": 76.18,
"step": 2260,
"token_acc": 0.8682050144220103,
"train_speed(iter/s)": 0.04015
},
{
"epoch": 2.0432043204320434,
"grad_norm": 2.3489394187927246,
"learning_rate": 4.8770860910265315e-06,
"loss": 0.3610623836517334,
"memory(GiB)": 76.18,
"step": 2270,
"token_acc": 0.8642826367944851,
"train_speed(iter/s)": 0.040151
},
{
"epoch": 2.052205220522052,
"grad_norm": 2.564075469970703,
"learning_rate": 4.793876236988593e-06,
"loss": 0.3656606674194336,
"memory(GiB)": 76.18,
"step": 2280,
"token_acc": 0.8674548848786559,
"train_speed(iter/s)": 0.040152
},
{
"epoch": 2.061206120612061,
"grad_norm": 2.3542511463165283,
"learning_rate": 4.711157968838577e-06,
"loss": 0.38109097480773924,
"memory(GiB)": 76.18,
"step": 2290,
"token_acc": 0.8542568542568543,
"train_speed(iter/s)": 0.040154
},
{
"epoch": 2.07020702070207,
"grad_norm": 2.5607492923736572,
"learning_rate": 4.628939097211641e-06,
"loss": 0.3731189966201782,
"memory(GiB)": 76.18,
"step": 2300,
"token_acc": 0.8808107512667989,
"train_speed(iter/s)": 0.040155
},
{
"epoch": 2.0792079207920793,
"grad_norm": 2.4762189388275146,
"learning_rate": 4.547227385587648e-06,
"loss": 0.3798922300338745,
"memory(GiB)": 76.18,
"step": 2310,
"token_acc": 0.8597145993413831,
"train_speed(iter/s)": 0.040157
},
{
"epoch": 2.0882088208820884,
"grad_norm": 2.485635280609131,
"learning_rate": 4.466030549558116e-06,
"loss": 0.3755971670150757,
"memory(GiB)": 76.18,
"step": 2320,
"token_acc": 0.8549968704360525,
"train_speed(iter/s)": 0.040157
},
{
"epoch": 2.097209720972097,
"grad_norm": 2.2108871936798096,
"learning_rate": 4.385356256097656e-06,
"loss": 0.35892772674560547,
"memory(GiB)": 76.18,
"step": 2330,
"token_acc": 0.8641063515509602,
"train_speed(iter/s)": 0.040157
},
{
"epoch": 2.106210621062106,
"grad_norm": 2.559431791305542,
"learning_rate": 4.305212122840038e-06,
"loss": 0.36676650047302245,
"memory(GiB)": 76.18,
"step": 2340,
"token_acc": 0.8685561258647624,
"train_speed(iter/s)": 0.040159
},
{
"epoch": 2.115211521152115,
"grad_norm": 2.3263328075408936,
"learning_rate": 4.22560571735889e-06,
"loss": 0.3723811149597168,
"memory(GiB)": 76.18,
"step": 2350,
"token_acc": 0.8562313908974905,
"train_speed(iter/s)": 0.04016
},
{
"epoch": 2.1242124212421243,
"grad_norm": 2.4957282543182373,
"learning_rate": 4.146544556453146e-06,
"loss": 0.3725306987762451,
"memory(GiB)": 76.18,
"step": 2360,
"token_acc": 0.8700726712177934,
"train_speed(iter/s)": 0.040162
},
{
"epoch": 2.1332133213321334,
"grad_norm": 2.5752525329589844,
"learning_rate": 4.068036105437259e-06,
"loss": 0.3709956884384155,
"memory(GiB)": 76.18,
"step": 2370,
"token_acc": 0.8635585970915313,
"train_speed(iter/s)": 0.040163
},
{
"epoch": 2.142214221422142,
"grad_norm": 2.509699583053589,
"learning_rate": 3.990087777436303e-06,
"loss": 0.37915217876434326,
"memory(GiB)": 76.18,
"step": 2380,
"token_acc": 0.8585365853658536,
"train_speed(iter/s)": 0.040161
},
{
"epoch": 2.151215121512151,
"grad_norm": 2.5639617443084717,
"learning_rate": 3.9127069326859815e-06,
"loss": 0.36791577339172366,
"memory(GiB)": 76.18,
"step": 2390,
"token_acc": 0.8695652173913043,
"train_speed(iter/s)": 0.040161
},
{
"epoch": 2.16021602160216,
"grad_norm": 2.5950934886932373,
"learning_rate": 3.835900877837665e-06,
"loss": 0.37401318550109863,
"memory(GiB)": 76.18,
"step": 2400,
"token_acc": 0.8627917026793431,
"train_speed(iter/s)": 0.04016
},
{
"epoch": 2.1692169216921693,
"grad_norm": 2.627086639404297,
"learning_rate": 3.7596768652684324e-06,
"loss": 0.37379937171936034,
"memory(GiB)": 76.18,
"step": 2410,
"token_acc": 0.8596715717637022,
"train_speed(iter/s)": 0.040162
},
{
"epoch": 2.1782178217821784,
"grad_norm": 3.0903186798095703,
"learning_rate": 3.6840420923962873e-06,
"loss": 0.36346681118011476,
"memory(GiB)": 76.18,
"step": 2420,
"token_acc": 0.8670668953687821,
"train_speed(iter/s)": 0.040164
},
{
"epoch": 2.187218721872187,
"grad_norm": 2.4955599308013916,
"learning_rate": 3.609003701000535e-06,
"loss": 0.35879087448120117,
"memory(GiB)": 76.18,
"step": 2430,
"token_acc": 0.8731778425655977,
"train_speed(iter/s)": 0.040165
},
{
"epoch": 2.196219621962196,
"grad_norm": 2.3009448051452637,
"learning_rate": 3.5345687765474444e-06,
"loss": 0.37301011085510255,
"memory(GiB)": 76.18,
"step": 2440,
"token_acc": 0.8637790332705587,
"train_speed(iter/s)": 0.040167
},
{
"epoch": 2.205220522052205,
"grad_norm": 2.5973548889160156,
"learning_rate": 3.4607443475211745e-06,
"loss": 0.37910096645355223,
"memory(GiB)": 76.18,
"step": 2450,
"token_acc": 0.862,
"train_speed(iter/s)": 0.040169
},
{
"epoch": 2.2142214221422143,
"grad_norm": 2.7337653636932373,
"learning_rate": 3.3875373847601365e-06,
"loss": 0.36832966804504397,
"memory(GiB)": 76.18,
"step": 2460,
"token_acc": 0.8709608843537415,
"train_speed(iter/s)": 0.040171
},
{
"epoch": 2.2232223222322234,
"grad_norm": 2.4979779720306396,
"learning_rate": 3.314954800798763e-06,
"loss": 0.35463604927062986,
"memory(GiB)": 76.18,
"step": 2470,
"token_acc": 0.8807906114885732,
"train_speed(iter/s)": 0.040173
},
{
"epoch": 2.232223222322232,
"grad_norm": 2.651418685913086,
"learning_rate": 3.24300344921481e-06,
"loss": 0.3576260805130005,
"memory(GiB)": 76.18,
"step": 2480,
"token_acc": 0.8673512154233026,
"train_speed(iter/s)": 0.040173
},
{
"epoch": 2.241224122412241,
"grad_norm": 2.2821831703186035,
"learning_rate": 3.1716901239821918e-06,
"loss": 0.3680659294128418,
"memory(GiB)": 76.18,
"step": 2490,
"token_acc": 0.8615550755939525,
"train_speed(iter/s)": 0.040176
},
{
"epoch": 2.25022502250225,
"grad_norm": 2.532939910888672,
"learning_rate": 3.1010215588294724e-06,
"loss": 0.3763418674468994,
"memory(GiB)": 76.18,
"step": 2500,
"token_acc": 0.8679738562091504,
"train_speed(iter/s)": 0.040176
},
{
"epoch": 2.25022502250225,
"eval_loss": 0.39449381828308105,
"eval_runtime": 112.8212,
"eval_samples_per_second": 12.719,
"eval_steps_per_second": 0.399,
"eval_token_acc": 0.8566221142162819,
"step": 2500
},
{
"epoch": 2.2592259225922593,
"grad_norm": 2.495901584625244,
"learning_rate": 3.031004426604044e-06,
"loss": 0.3614701271057129,
"memory(GiB)": 76.18,
"step": 2510,
"token_acc": 0.8576721210250077,
"train_speed(iter/s)": 0.040102
},
{
"epoch": 2.2682268226822684,
"grad_norm": 2.6652517318725586,
"learning_rate": 2.961645338642032e-06,
"loss": 0.3705326557159424,
"memory(GiB)": 76.18,
"step": 2520,
"token_acc": 0.8555579261787924,
"train_speed(iter/s)": 0.040101
},
{
"epoch": 2.2772277227722775,
"grad_norm": 2.2919044494628906,
"learning_rate": 2.892950844144028e-06,
"loss": 0.3567212581634521,
"memory(GiB)": 76.18,
"step": 2530,
"token_acc": 0.8672348060103162,
"train_speed(iter/s)": 0.0401
},
{
"epoch": 2.286228622862286,
"grad_norm": 2.7642829418182373,
"learning_rate": 2.8249274295566863e-06,
"loss": 0.3735655784606934,
"memory(GiB)": 76.18,
"step": 2540,
"token_acc": 0.8645260611392127,
"train_speed(iter/s)": 0.040102
},
{
"epoch": 2.295229522952295,
"grad_norm": 2.2890052795410156,
"learning_rate": 2.7575815179602527e-06,
"loss": 0.36810617446899413,
"memory(GiB)": 76.18,
"step": 2550,
"token_acc": 0.8708510638297873,
"train_speed(iter/s)": 0.040105
},
{
"epoch": 2.3042304230423043,
"grad_norm": 2.5169107913970947,
"learning_rate": 2.6909194684620453e-06,
"loss": 0.3683924674987793,
"memory(GiB)": 76.18,
"step": 2560,
"token_acc": 0.8675250982103885,
"train_speed(iter/s)": 0.040108
},
{
"epoch": 2.3132313231323134,
"grad_norm": 2.696864128112793,
"learning_rate": 2.6249475755960185e-06,
"loss": 0.3705678701400757,
"memory(GiB)": 76.18,
"step": 2570,
"token_acc": 0.8628597122302158,
"train_speed(iter/s)": 0.040109
},
{
"epoch": 2.322232223222322,
"grad_norm": 2.4484846591949463,
"learning_rate": 2.559672068728398e-06,
"loss": 0.36278524398803713,
"memory(GiB)": 76.18,
"step": 2580,
"token_acc": 0.8645696810834426,
"train_speed(iter/s)": 0.04011
},
{
"epoch": 2.331233123312331,
"grad_norm": 2.4576802253723145,
"learning_rate": 2.4950991114694755e-06,
"loss": 0.3606465578079224,
"memory(GiB)": 76.18,
"step": 2590,
"token_acc": 0.8734927015020097,
"train_speed(iter/s)": 0.040113
},
{
"epoch": 2.34023402340234,
"grad_norm": 2.6191623210906982,
"learning_rate": 2.4312348010916088e-06,
"loss": 0.36288201808929443,
"memory(GiB)": 76.18,
"step": 2600,
"token_acc": 0.8631202691337259,
"train_speed(iter/s)": 0.040113
},
{
"epoch": 2.3492349234923493,
"grad_norm": 2.6887686252593994,
"learning_rate": 2.3680851679535024e-06,
"loss": 0.3752190589904785,
"memory(GiB)": 76.18,
"step": 2610,
"token_acc": 0.8617521367521368,
"train_speed(iter/s)": 0.040114
},
{
"epoch": 2.3582358235823584,
"grad_norm": 2.481362819671631,
"learning_rate": 2.305656174930776e-06,
"loss": 0.36593198776245117,
"memory(GiB)": 76.18,
"step": 2620,
"token_acc": 0.8668838219326819,
"train_speed(iter/s)": 0.040116
},
{
"epoch": 2.3672367236723675,
"grad_norm": 2.629666328430176,
"learning_rate": 2.243953716852938e-06,
"loss": 0.3610795021057129,
"memory(GiB)": 76.18,
"step": 2630,
"token_acc": 0.8612348822406111,
"train_speed(iter/s)": 0.040117
},
{
"epoch": 2.376237623762376,
"grad_norm": 2.433375597000122,
"learning_rate": 2.1829836199467568e-06,
"loss": 0.3648895263671875,
"memory(GiB)": 76.18,
"step": 2640,
"token_acc": 0.8715654952076677,
"train_speed(iter/s)": 0.040119
},
{
"epoch": 2.385238523852385,
"grad_norm": 2.5231969356536865,
"learning_rate": 2.1227516412861303e-06,
"loss": 0.34891419410705565,
"memory(GiB)": 76.18,
"step": 2650,
"token_acc": 0.8747478822105688,
"train_speed(iter/s)": 0.040119
},
{
"epoch": 2.3942394239423943,
"grad_norm": 2.6941776275634766,
"learning_rate": 2.063263468248472e-06,
"loss": 0.35621964931488037,
"memory(GiB)": 76.18,
"step": 2660,
"token_acc": 0.8614357262103506,
"train_speed(iter/s)": 0.040119
},
{
"epoch": 2.4032403240324034,
"grad_norm": 2.4811367988586426,
"learning_rate": 2.0045247179776927e-06,
"loss": 0.36508636474609374,
"memory(GiB)": 76.18,
"step": 2670,
"token_acc": 0.865956984575277,
"train_speed(iter/s)": 0.040122
},
{
"epoch": 2.412241224122412,
"grad_norm": 2.5584983825683594,
"learning_rate": 1.946540936853787e-06,
"loss": 0.36142873764038086,
"memory(GiB)": 76.18,
"step": 2680,
"token_acc": 0.8618881118881119,
"train_speed(iter/s)": 0.040122
},
{
"epoch": 2.421242124212421,
"grad_norm": 2.639416217803955,
"learning_rate": 1.8893175999691315e-06,
"loss": 0.3669375658035278,
"memory(GiB)": 76.18,
"step": 2690,
"token_acc": 0.8706407137064072,
"train_speed(iter/s)": 0.040123
},
{
"epoch": 2.43024302430243,
"grad_norm": 2.526108980178833,
"learning_rate": 1.8328601106114974e-06,
"loss": 0.36782519817352294,
"memory(GiB)": 76.18,
"step": 2700,
"token_acc": 0.8681867535287731,
"train_speed(iter/s)": 0.040125
},
{
"epoch": 2.4392439243924393,
"grad_norm": 2.4853765964508057,
"learning_rate": 1.7771737997538551e-06,
"loss": 0.3661306858062744,
"memory(GiB)": 76.18,
"step": 2710,
"token_acc": 0.8591703056768559,
"train_speed(iter/s)": 0.040126
},
{
"epoch": 2.4482448244824484,
"grad_norm": 2.546694040298462,
"learning_rate": 1.7222639255509855e-06,
"loss": 0.3565016269683838,
"memory(GiB)": 76.18,
"step": 2720,
"token_acc": 0.8700276536907041,
"train_speed(iter/s)": 0.040126
},
{
"epoch": 2.4572457245724575,
"grad_norm": 2.6145668029785156,
"learning_rate": 1.6681356728429909e-06,
"loss": 0.3617668628692627,
"memory(GiB)": 76.18,
"step": 2730,
"token_acc": 0.8759859772129711,
"train_speed(iter/s)": 0.040127
},
{
"epoch": 2.466246624662466,
"grad_norm": 2.4962821006774902,
"learning_rate": 1.6147941526657151e-06,
"loss": 0.36135101318359375,
"memory(GiB)": 76.18,
"step": 2740,
"token_acc": 0.8689489751417357,
"train_speed(iter/s)": 0.040127
},
{
"epoch": 2.4752475247524752,
"grad_norm": 2.476327896118164,
"learning_rate": 1.5622444017681438e-06,
"loss": 0.3584137916564941,
"memory(GiB)": 76.18,
"step": 2750,
"token_acc": 0.8637279033340792,
"train_speed(iter/s)": 0.040128
},
{
"epoch": 2.4842484248424843,
"grad_norm": 2.5135715007781982,
"learning_rate": 1.5104913821367995e-06,
"loss": 0.352571439743042,
"memory(GiB)": 76.18,
"step": 2760,
"token_acc": 0.8638624119353502,
"train_speed(iter/s)": 0.040127
},
{
"epoch": 2.4932493249324934,
"grad_norm": 2.535942316055298,
"learning_rate": 1.4595399805272138e-06,
"loss": 0.35703449249267577,
"memory(GiB)": 76.18,
"step": 2770,
"token_acc": 0.8715143715143715,
"train_speed(iter/s)": 0.040129
},
{
"epoch": 2.502250225022502,
"grad_norm": 2.5901577472686768,
"learning_rate": 1.409395008002501e-06,
"loss": 0.3632636070251465,
"memory(GiB)": 76.18,
"step": 2780,
"token_acc": 0.8740141137401412,
"train_speed(iter/s)": 0.040131
},
{
"epoch": 2.511251125112511,
"grad_norm": 2.4865550994873047,
"learning_rate": 1.3600611994790737e-06,
"loss": 0.36820478439331056,
"memory(GiB)": 76.18,
"step": 2790,
"token_acc": 0.8674225904928042,
"train_speed(iter/s)": 0.040131
},
{
"epoch": 2.5202520252025202,
"grad_norm": 2.745784044265747,
"learning_rate": 1.311543213279548e-06,
"loss": 0.36357576847076417,
"memory(GiB)": 76.18,
"step": 2800,
"token_acc": 0.8688079619995476,
"train_speed(iter/s)": 0.040134
},
{
"epoch": 2.5292529252925293,
"grad_norm": 2.613213300704956,
"learning_rate": 1.2638456306928838e-06,
"loss": 0.35836281776428225,
"memory(GiB)": 76.18,
"step": 2810,
"token_acc": 0.8775203775203775,
"train_speed(iter/s)": 0.040135
},
{
"epoch": 2.5382538253825384,
"grad_norm": 2.856757879257202,
"learning_rate": 1.2169729555418008e-06,
"loss": 0.35776748657226565,
"memory(GiB)": 76.18,
"step": 2820,
"token_acc": 0.8681778169014085,
"train_speed(iter/s)": 0.040136
},
{
"epoch": 2.5472547254725475,
"grad_norm": 2.5222392082214355,
"learning_rate": 1.1709296137575088e-06,
"loss": 0.357517409324646,
"memory(GiB)": 76.18,
"step": 2830,
"token_acc": 0.8692437684833122,
"train_speed(iter/s)": 0.040138
},
{
"epoch": 2.556255625562556,
"grad_norm": 2.6644461154937744,
"learning_rate": 1.1257199529617846e-06,
"loss": 0.3525848388671875,
"memory(GiB)": 76.18,
"step": 2840,
"token_acc": 0.8726828274597678,
"train_speed(iter/s)": 0.04014
},
{
"epoch": 2.5652565256525652,
"grad_norm": 3.0361390113830566,
"learning_rate": 1.0813482420564569e-06,
"loss": 0.36429810523986816,
"memory(GiB)": 76.18,
"step": 2850,
"token_acc": 0.8605402909258831,
"train_speed(iter/s)": 0.040142
},
{
"epoch": 2.5742574257425743,
"grad_norm": 2.2939305305480957,
"learning_rate": 1.0378186708203097e-06,
"loss": 0.3595736026763916,
"memory(GiB)": 76.18,
"step": 2860,
"token_acc": 0.8699784017278618,
"train_speed(iter/s)": 0.040145
},
{
"epoch": 2.5832583258325834,
"grad_norm": 2.8929970264434814,
"learning_rate": 9.951353495134741e-07,
"loss": 0.3722720146179199,
"memory(GiB)": 76.18,
"step": 2870,
"token_acc": 0.8633415343323642,
"train_speed(iter/s)": 0.040147
},
{
"epoch": 2.592259225922592,
"grad_norm": 2.766711711883545,
"learning_rate": 9.533023084893112e-07,
"loss": 0.3628982067108154,
"memory(GiB)": 76.18,
"step": 2880,
"token_acc": 0.8731262220291115,
"train_speed(iter/s)": 0.040148
},
{
"epoch": 2.601260126012601,
"grad_norm": 2.6322643756866455,
"learning_rate": 9.123234978138485e-07,
"loss": 0.3563962459564209,
"memory(GiB)": 76.18,
"step": 2890,
"token_acc": 0.8709398007795582,
"train_speed(iter/s)": 0.040149
},
{
"epoch": 2.6102610261026102,
"grad_norm": 2.3969507217407227,
"learning_rate": 8.722027868927973e-07,
"loss": 0.3593640089035034,
"memory(GiB)": 76.18,
"step": 2900,
"token_acc": 0.8687513763488218,
"train_speed(iter/s)": 0.040149
},
{
"epoch": 2.6192619261926193,
"grad_norm": 2.662048101425171,
"learning_rate": 8.32943964106192e-07,
"loss": 0.36847290992736814,
"memory(GiB)": 76.18,
"step": 2910,
"token_acc": 0.8610752688172043,
"train_speed(iter/s)": 0.040152
},
{
"epoch": 2.6282628262826284,
"grad_norm": 2.6064634323120117,
"learning_rate": 7.945507364506632e-07,
"loss": 0.3641893625259399,
"memory(GiB)": 76.18,
"step": 2920,
"token_acc": 0.8610666056305791,
"train_speed(iter/s)": 0.040154
},
{
"epoch": 2.6372637263726375,
"grad_norm": 2.4192819595336914,
"learning_rate": 7.57026729189414e-07,
"loss": 0.3702700138092041,
"memory(GiB)": 76.18,
"step": 2930,
"token_acc": 0.8613074204946997,
"train_speed(iter/s)": 0.040157
},
{
"epoch": 2.646264626462646,
"grad_norm": 2.3483784198760986,
"learning_rate": 7.203754855099009e-07,
"loss": 0.36264016628265383,
"memory(GiB)": 76.18,
"step": 2940,
"token_acc": 0.8588575238941987,
"train_speed(iter/s)": 0.04016
},
{
"epoch": 2.6552655265526552,
"grad_norm": 2.5846633911132812,
"learning_rate": 6.846004661892813e-07,
"loss": 0.37308740615844727,
"memory(GiB)": 76.18,
"step": 2950,
"token_acc": 0.8615806304248516,
"train_speed(iter/s)": 0.040161
},
{
"epoch": 2.6642664266426643,
"grad_norm": 2.6962997913360596,
"learning_rate": 6.497050492676126e-07,
"loss": 0.36321473121643066,
"memory(GiB)": 76.18,
"step": 2960,
"token_acc": 0.8618261826182618,
"train_speed(iter/s)": 0.040163
},
{
"epoch": 2.6732673267326734,
"grad_norm": 2.416895627975464,
"learning_rate": 6.156925297288996e-07,
"loss": 0.34958364963531496,
"memory(GiB)": 76.18,
"step": 2970,
"token_acc": 0.8714713430282293,
"train_speed(iter/s)": 0.040164
},
{
"epoch": 2.682268226822682,
"grad_norm": 2.3380393981933594,
"learning_rate": 5.825661191899534e-07,
"loss": 0.36399097442626954,
"memory(GiB)": 76.18,
"step": 2980,
"token_acc": 0.8697334479793637,
"train_speed(iter/s)": 0.040165
},
{
"epoch": 2.691269126912691,
"grad_norm": 2.4997997283935547,
"learning_rate": 5.503289455971495e-07,
"loss": 0.3497540235519409,
"memory(GiB)": 76.18,
"step": 2990,
"token_acc": 0.8589799476896252,
"train_speed(iter/s)": 0.040167
},
{
"epoch": 2.7002700270027002,
"grad_norm": 2.7024405002593994,
"learning_rate": 5.18984052931063e-07,
"loss": 0.36266303062438965,
"memory(GiB)": 76.18,
"step": 3000,
"token_acc": 0.8634655532359081,
"train_speed(iter/s)": 0.040168
},
{
"epoch": 2.7002700270027002,
"eval_loss": 0.3909822702407837,
"eval_runtime": 113.741,
"eval_samples_per_second": 12.616,
"eval_steps_per_second": 0.396,
"eval_token_acc": 0.8578371810449574,
"step": 3000
},
{
"epoch": 2.7092709270927093,
"grad_norm": 2.7375988960266113,
"learning_rate": 4.885344009190429e-07,
"loss": 0.36505513191223143,
"memory(GiB)": 76.18,
"step": 3010,
"token_acc": 0.8647040722125346,
"train_speed(iter/s)": 0.040096
},
{
"epoch": 2.7182718271827184,
"grad_norm": 2.5784595012664795,
"learning_rate": 4.5898286475574483e-07,
"loss": 0.36314241886138915,
"memory(GiB)": 76.18,
"step": 3020,
"token_acc": 0.8750795334040297,
"train_speed(iter/s)": 0.040096
},
{
"epoch": 2.7272727272727275,
"grad_norm": 2.59897518157959,
"learning_rate": 4.30332234831643e-07,
"loss": 0.3617940664291382,
"memory(GiB)": 76.18,
"step": 3030,
"token_acc": 0.8697020562316408,
"train_speed(iter/s)": 0.040097
},
{
"epoch": 2.736273627362736,
"grad_norm": 2.331024646759033,
"learning_rate": 4.025852164695432e-07,
"loss": 0.35245676040649415,
"memory(GiB)": 76.18,
"step": 3040,
"token_acc": 0.8609855820959759,
"train_speed(iter/s)": 0.040098
},
{
"epoch": 2.7452745274527453,
"grad_norm": 2.9060468673706055,
"learning_rate": 3.7574442966913816e-07,
"loss": 0.37049217224121095,
"memory(GiB)": 76.18,
"step": 3050,
"token_acc": 0.8594235033259423,
"train_speed(iter/s)": 0.040099
},
{
"epoch": 2.7542754275427543,
"grad_norm": 2.7476565837860107,
"learning_rate": 3.498124088596133e-07,
"loss": 0.35335454940795896,
"memory(GiB)": 76.18,
"step": 3060,
"token_acc": 0.8769035532994924,
"train_speed(iter/s)": 0.040098
},
{
"epoch": 2.7632763276327634,
"grad_norm": 2.47446346282959,
"learning_rate": 3.2479160266033595e-07,
"loss": 0.3646056652069092,
"memory(GiB)": 76.18,
"step": 3070,
"token_acc": 0.8609637488947833,
"train_speed(iter/s)": 0.040099
},
{
"epoch": 2.772277227722772,
"grad_norm": 2.518899440765381,
"learning_rate": 3.0068437364964563e-07,
"loss": 0.36437718868255614,
"memory(GiB)": 76.18,
"step": 3080,
"token_acc": 0.8751534997953336,
"train_speed(iter/s)": 0.040101
},
{
"epoch": 2.781278127812781,
"grad_norm": 2.4832963943481445,
"learning_rate": 2.774929981417662e-07,
"loss": 0.36618633270263673,
"memory(GiB)": 76.18,
"step": 3090,
"token_acc": 0.8648288128056915,
"train_speed(iter/s)": 0.040101
},
{
"epoch": 2.7902790279027903,
"grad_norm": 2.6481244564056396,
"learning_rate": 2.5521966597186976e-07,
"loss": 0.3651879787445068,
"memory(GiB)": 76.18,
"step": 3100,
"token_acc": 0.8597326082030364,
"train_speed(iter/s)": 0.040102
},
{
"epoch": 2.7992799279927993,
"grad_norm": 2.6947715282440186,
"learning_rate": 2.3386648028930093e-07,
"loss": 0.35363340377807617,
"memory(GiB)": 76.18,
"step": 3110,
"token_acc": 0.8761429758935994,
"train_speed(iter/s)": 0.040104
},
{
"epoch": 2.8082808280828084,
"grad_norm": 2.7126548290252686,
"learning_rate": 2.134354573589825e-07,
"loss": 0.3739881753921509,
"memory(GiB)": 76.18,
"step": 3120,
"token_acc": 0.8569641367806505,
"train_speed(iter/s)": 0.040106
},
{
"epoch": 2.8172817281728175,
"grad_norm": 2.6334176063537598,
"learning_rate": 1.939285263710411e-07,
"loss": 0.37378754615783694,
"memory(GiB)": 76.18,
"step": 3130,
"token_acc": 0.8621212121212121,
"train_speed(iter/s)": 0.040109
},
{
"epoch": 2.826282628262826,
"grad_norm": 2.6771504878997803,
"learning_rate": 1.7534752925863264e-07,
"loss": 0.3727731227874756,
"memory(GiB)": 76.18,
"step": 3140,
"token_acc": 0.8573262032085561,
"train_speed(iter/s)": 0.040111
},
{
"epoch": 2.8352835283528353,
"grad_norm": 2.7885513305664062,
"learning_rate": 1.5769422052403172e-07,
"loss": 0.3634767770767212,
"memory(GiB)": 76.18,
"step": 3150,
"token_acc": 0.8657498362802881,
"train_speed(iter/s)": 0.040111
},
{
"epoch": 2.8442844284428443,
"grad_norm": 2.770448684692383,
"learning_rate": 1.409702670729518e-07,
"loss": 0.3641348123550415,
"memory(GiB)": 76.18,
"step": 3160,
"token_acc": 0.8695652173913043,
"train_speed(iter/s)": 0.040111
},
{
"epoch": 2.8532853285328534,
"grad_norm": 2.716731309890747,
"learning_rate": 1.2517724805715115e-07,
"loss": 0.36133828163146975,
"memory(GiB)": 76.18,
"step": 3170,
"token_acc": 0.8693168837103039,
"train_speed(iter/s)": 0.040112
},
{
"epoch": 2.862286228622862,
"grad_norm": 2.320976734161377,
"learning_rate": 1.1031665472532871e-07,
"loss": 0.3573209285736084,
"memory(GiB)": 76.18,
"step": 3180,
"token_acc": 0.8647353517752123,
"train_speed(iter/s)": 0.040115
},
{
"epoch": 2.871287128712871,
"grad_norm": 2.6834940910339355,
"learning_rate": 9.638989028230572e-08,
"loss": 0.3642300605773926,
"memory(GiB)": 76.18,
"step": 3190,
"token_acc": 0.8666237113402062,
"train_speed(iter/s)": 0.040116
},
{
"epoch": 2.8802880288028803,
"grad_norm": 2.8395378589630127,
"learning_rate": 8.339826975653165e-08,
"loss": 0.3668497562408447,
"memory(GiB)": 76.18,
"step": 3200,
"token_acc": 0.8565969880872106,
"train_speed(iter/s)": 0.040118
},
{
"epoch": 2.8892889288928894,
"grad_norm": 2.8500564098358154,
"learning_rate": 7.134301987591686e-08,
"loss": 0.35763015747070315,
"memory(GiB)": 76.18,
"step": 3210,
"token_acc": 0.8680448647459864,
"train_speed(iter/s)": 0.04012
},
{
"epoch": 2.8982898289828984,
"grad_norm": 2.391807794570923,
"learning_rate": 6.022527895198971e-08,
"loss": 0.3681647300720215,
"memory(GiB)": 76.18,
"step": 3220,
"token_acc": 0.8623626989464246,
"train_speed(iter/s)": 0.040122
},
{
"epoch": 2.9072907290729075,
"grad_norm": 2.870159149169922,
"learning_rate": 5.004609677242478e-08,
"loss": 0.3709531307220459,
"memory(GiB)": 76.18,
"step": 3230,
"token_acc": 0.8634751773049646,
"train_speed(iter/s)": 0.040123
},
{
"epoch": 2.916291629162916,
"grad_norm": 2.3860719203948975,
"learning_rate": 4.0806434501907686e-08,
"loss": 0.3573091745376587,
"memory(GiB)": 76.18,
"step": 3240,
"token_acc": 0.8636980108499096,
"train_speed(iter/s)": 0.040125
},
{
"epoch": 2.9252925292529253,
"grad_norm": 2.533841609954834,
"learning_rate": 3.2507164591378817e-08,
"loss": 0.35629446506500245,
"memory(GiB)": 76.18,
"step": 3250,
"token_acc": 0.8767689962987154,
"train_speed(iter/s)": 0.040126
},
{
"epoch": 2.9342934293429344,
"grad_norm": 2.7338736057281494,
"learning_rate": 2.5149070695656974e-08,
"loss": 0.36386995315551757,
"memory(GiB)": 76.18,
"step": 3260,
"token_acc": 0.8695070265447246,
"train_speed(iter/s)": 0.040129
},
{
"epoch": 2.9432943294329434,
"grad_norm": 2.5814294815063477,
"learning_rate": 1.873284759943861e-08,
"loss": 0.3609006881713867,
"memory(GiB)": 76.18,
"step": 3270,
"token_acc": 0.8714535137494543,
"train_speed(iter/s)": 0.040129
},
{
"epoch": 2.952295229522952,
"grad_norm": 2.6087794303894043,
"learning_rate": 1.325910115169471e-08,
"loss": 0.36290225982666013,
"memory(GiB)": 76.18,
"step": 3280,
"token_acc": 0.8663007683863886,
"train_speed(iter/s)": 0.04013
},
{
"epoch": 2.961296129612961,
"grad_norm": 2.4624176025390625,
"learning_rate": 8.728348208466575e-09,
"loss": 0.36122841835021974,
"memory(GiB)": 76.18,
"step": 3290,
"token_acc": 0.8651804670912951,
"train_speed(iter/s)": 0.040133
},
{
"epoch": 2.9702970297029703,
"grad_norm": 2.5794880390167236,
"learning_rate": 5.1410165840548586e-09,
"loss": 0.35005528926849366,
"memory(GiB)": 76.18,
"step": 3300,
"token_acc": 0.873643074250977,
"train_speed(iter/s)": 0.040135
},
{
"epoch": 2.9792979297929794,
"grad_norm": 2.730228900909424,
"learning_rate": 2.4974450106318715e-09,
"loss": 0.3484092473983765,
"memory(GiB)": 76.18,
"step": 3310,
"token_acc": 0.8741692512184316,
"train_speed(iter/s)": 0.040137
},
{
"epoch": 2.9882988298829884,
"grad_norm": 2.4973952770233154,
"learning_rate": 7.978831062493975e-10,
"loss": 0.360276198387146,
"memory(GiB)": 76.18,
"step": 3320,
"token_acc": 0.8717892425905598,
"train_speed(iter/s)": 0.040139
},
{
"epoch": 2.9972997299729975,
"grad_norm": 2.646111488342285,
"learning_rate": 4.249135127420978e-11,
"loss": 0.34623007774353026,
"memory(GiB)": 76.18,
"step": 3330,
"token_acc": 0.8752749670039596,
"train_speed(iter/s)": 0.040141
},
{
"epoch": 3.0,
"eval_loss": 0.3906257748603821,
"eval_runtime": 111.4189,
"eval_samples_per_second": 12.879,
"eval_steps_per_second": 0.404,
"eval_token_acc": 0.8578371810449574,
"step": 3333
}
],
"logging_steps": 10,
"max_steps": 3333,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 1000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.3350909053056844e+19,
"train_batch_size": 32,
"trial_name": null,
"trial_params": null
}