{ "best_global_step": 3333, "best_metric": 0.39062577, "best_model_checkpoint": "/global/D1/homes/sushant/Kvasir-VQA-x1/output_vqa_x1/v0-20250521-005603/checkpoint-3333", "epoch": 3.0, "eval_steps": 500, "global_step": 3333, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0009000900090009, "grad_norm": 7.174169063568115, "learning_rate": 2.0000000000000002e-07, "loss": 3.304050922393799, "memory(GiB)": 66.97, "step": 1, "token_acc": 0.4874715261958998, "train_speed(iter/s)": 0.019902 }, { "epoch": 0.009000900090009001, "grad_norm": 6.479684829711914, "learning_rate": 2.0000000000000003e-06, "loss": 3.1309598286946616, "memory(GiB)": 67.86, "step": 10, "token_acc": 0.4754664823773324, "train_speed(iter/s)": 0.036367 }, { "epoch": 0.018001800180018002, "grad_norm": 7.254239559173584, "learning_rate": 4.000000000000001e-06, "loss": 3.1008956909179686, "memory(GiB)": 67.86, "step": 20, "token_acc": 0.4788295278208823, "train_speed(iter/s)": 0.038518 }, { "epoch": 0.027002700270027002, "grad_norm": 7.573488712310791, "learning_rate": 6e-06, "loss": 2.772838592529297, "memory(GiB)": 67.86, "step": 30, "token_acc": 0.5009667024704618, "train_speed(iter/s)": 0.039255 }, { "epoch": 0.036003600360036005, "grad_norm": 4.112278938293457, "learning_rate": 8.000000000000001e-06, "loss": 2.004438781738281, "memory(GiB)": 67.86, "step": 40, "token_acc": 0.5291459557162224, "train_speed(iter/s)": 0.03954 }, { "epoch": 0.045004500450045004, "grad_norm": 1.6541378498077393, "learning_rate": 1e-05, "loss": 1.8318557739257812, "memory(GiB)": 67.86, "step": 50, "token_acc": 0.5645268034414295, "train_speed(iter/s)": 0.039725 }, { "epoch": 0.054005400540054004, "grad_norm": 1.6028215885162354, "learning_rate": 1.2e-05, "loss": 1.5337283134460449, "memory(GiB)": 67.86, "step": 60, "token_acc": 0.6214239621423963, "train_speed(iter/s)": 0.039848 }, { "epoch": 0.063006300630063, "grad_norm": 1.5401345491409302, "learning_rate": 1.4e-05, "loss": 1.361149787902832, "memory(GiB)": 68.03, "step": 70, "token_acc": 0.6230786366674093, "train_speed(iter/s)": 0.040044 }, { "epoch": 0.07200720072007201, "grad_norm": 1.1455940008163452, "learning_rate": 1.6000000000000003e-05, "loss": 1.1177097320556642, "memory(GiB)": 68.03, "step": 80, "token_acc": 0.6871520342612419, "train_speed(iter/s)": 0.040185 }, { "epoch": 0.081008100810081, "grad_norm": 1.6210144758224487, "learning_rate": 1.8e-05, "loss": 0.9828067779541015, "memory(GiB)": 68.03, "step": 90, "token_acc": 0.727211495285137, "train_speed(iter/s)": 0.040239 }, { "epoch": 0.09000900090009001, "grad_norm": 1.810027837753296, "learning_rate": 2e-05, "loss": 0.9005414962768554, "memory(GiB)": 68.08, "step": 100, "token_acc": 0.7328737613097802, "train_speed(iter/s)": 0.040295 }, { "epoch": 0.09900990099009901, "grad_norm": 1.3532379865646362, "learning_rate": 1.9999527877255423e-05, "loss": 0.7943315982818604, "memory(GiB)": 68.08, "step": 110, "token_acc": 0.7508290957329207, "train_speed(iter/s)": 0.040336 }, { "epoch": 0.10801080108010801, "grad_norm": 1.267624855041504, "learning_rate": 1.999811155360166e-05, "loss": 0.7341960906982422, "memory(GiB)": 68.08, "step": 120, "token_acc": 0.7815533980582524, "train_speed(iter/s)": 0.040377 }, { "epoch": 0.11701170117011701, "grad_norm": 1.372360348701477, "learning_rate": 1.9995751162774435e-05, "loss": 0.7223796844482422, "memory(GiB)": 68.08, "step": 130, "token_acc": 0.7679759605065465, "train_speed(iter/s)": 0.040454 }, { "epoch": 0.126012601260126, "grad_norm": 1.3093749284744263, "learning_rate": 1.9992446927652592e-05, "loss": 0.6822004318237305, "memory(GiB)": 68.08, "step": 140, "token_acc": 0.7690819178671253, "train_speed(iter/s)": 0.04049 }, { "epoch": 0.135013501350135, "grad_norm": 1.5192912817001343, "learning_rate": 1.9988199160237038e-05, "loss": 0.6598445892333984, "memory(GiB)": 68.08, "step": 150, "token_acc": 0.794921875, "train_speed(iter/s)": 0.040521 }, { "epoch": 0.14401440144014402, "grad_norm": 1.4876294136047363, "learning_rate": 1.9983008261621295e-05, "loss": 0.6424094200134277, "memory(GiB)": 68.08, "step": 160, "token_acc": 0.784965034965035, "train_speed(iter/s)": 0.040569 }, { "epoch": 0.15301530153015303, "grad_norm": 1.7920929193496704, "learning_rate": 1.9976874721953625e-05, "loss": 0.6222011089324951, "memory(GiB)": 68.08, "step": 170, "token_acc": 0.7873362445414848, "train_speed(iter/s)": 0.040571 }, { "epoch": 0.162016201620162, "grad_norm": 1.496816873550415, "learning_rate": 1.996979912039074e-05, "loss": 0.6065957069396972, "memory(GiB)": 68.08, "step": 180, "token_acc": 0.797979797979798, "train_speed(iter/s)": 0.040601 }, { "epoch": 0.171017101710171, "grad_norm": 1.631809949874878, "learning_rate": 1.9961782125043134e-05, "loss": 0.6100308895111084, "memory(GiB)": 68.08, "step": 190, "token_acc": 0.7921653971708379, "train_speed(iter/s)": 0.040622 }, { "epoch": 0.18001800180018002, "grad_norm": 1.635206937789917, "learning_rate": 1.9952824492911967e-05, "loss": 0.597900390625, "memory(GiB)": 68.08, "step": 200, "token_acc": 0.8024363233665559, "train_speed(iter/s)": 0.040621 }, { "epoch": 0.18901890189018902, "grad_norm": 1.7094358205795288, "learning_rate": 1.9942927069817618e-05, "loss": 0.5765604972839355, "memory(GiB)": 68.08, "step": 210, "token_acc": 0.8184182015167931, "train_speed(iter/s)": 0.040647 }, { "epoch": 0.19801980198019803, "grad_norm": 1.7368491888046265, "learning_rate": 1.99320907903198e-05, "loss": 0.5700692176818848, "memory(GiB)": 68.08, "step": 220, "token_acc": 0.8155997378195324, "train_speed(iter/s)": 0.040667 }, { "epoch": 0.207020702070207, "grad_norm": 1.6970064640045166, "learning_rate": 1.9920316677629312e-05, "loss": 0.5586367607116699, "memory(GiB)": 68.08, "step": 230, "token_acc": 0.8134110787172012, "train_speed(iter/s)": 0.040669 }, { "epoch": 0.21602160216021601, "grad_norm": 1.6440980434417725, "learning_rate": 1.9907605843511434e-05, "loss": 0.5400181293487549, "memory(GiB)": 68.08, "step": 240, "token_acc": 0.8248341625207297, "train_speed(iter/s)": 0.040672 }, { "epoch": 0.22502250225022502, "grad_norm": 1.848779559135437, "learning_rate": 1.9893959488180948e-05, "loss": 0.5552643775939942, "memory(GiB)": 68.08, "step": 250, "token_acc": 0.8090929154711984, "train_speed(iter/s)": 0.040677 }, { "epoch": 0.23402340234023403, "grad_norm": 1.746717929840088, "learning_rate": 1.9879378900188796e-05, "loss": 0.5367072105407715, "memory(GiB)": 68.08, "step": 260, "token_acc": 0.8096885813148789, "train_speed(iter/s)": 0.040681 }, { "epoch": 0.24302430243024303, "grad_norm": 2.212620973587036, "learning_rate": 1.9863865456300422e-05, "loss": 0.5621134757995605, "memory(GiB)": 68.08, "step": 270, "token_acc": 0.8111765989958525, "train_speed(iter/s)": 0.040691 }, { "epoch": 0.252025202520252, "grad_norm": 1.815075159072876, "learning_rate": 1.9847420621365773e-05, "loss": 0.5444355964660644, "memory(GiB)": 68.08, "step": 280, "token_acc": 0.8252319929297393, "train_speed(iter/s)": 0.040694 }, { "epoch": 0.26102610261026105, "grad_norm": 1.6822190284729004, "learning_rate": 1.983004594818096e-05, "loss": 0.509169626235962, "memory(GiB)": 68.08, "step": 290, "token_acc": 0.8245873889123995, "train_speed(iter/s)": 0.040697 }, { "epoch": 0.27002700270027, "grad_norm": 1.7498018741607666, "learning_rate": 1.981174307734167e-05, "loss": 0.5199090480804444, "memory(GiB)": 68.08, "step": 300, "token_acc": 0.8331916702082448, "train_speed(iter/s)": 0.040678 }, { "epoch": 0.279027902790279, "grad_norm": 1.875012755393982, "learning_rate": 1.9792513737088223e-05, "loss": 0.5095804691314697, "memory(GiB)": 68.08, "step": 310, "token_acc": 0.8261736049601417, "train_speed(iter/s)": 0.040669 }, { "epoch": 0.28802880288028804, "grad_norm": 1.8016622066497803, "learning_rate": 1.9772359743142396e-05, "loss": 0.49691128730773926, "memory(GiB)": 68.08, "step": 320, "token_acc": 0.8243214362043172, "train_speed(iter/s)": 0.04068 }, { "epoch": 0.297029702970297, "grad_norm": 1.927909016609192, "learning_rate": 1.975128299853598e-05, "loss": 0.5156735897064209, "memory(GiB)": 68.08, "step": 330, "token_acc": 0.8241394527802295, "train_speed(iter/s)": 0.040684 }, { "epoch": 0.30603060306030605, "grad_norm": 1.7440602779388428, "learning_rate": 1.9729285493431074e-05, "loss": 0.5245149612426758, "memory(GiB)": 68.24, "step": 340, "token_acc": 0.8179177837354781, "train_speed(iter/s)": 0.040684 }, { "epoch": 0.31503150315031503, "grad_norm": 1.9903383255004883, "learning_rate": 1.9706369304932176e-05, "loss": 0.5069475173950195, "memory(GiB)": 68.3, "step": 350, "token_acc": 0.8318876497315159, "train_speed(iter/s)": 0.040686 }, { "epoch": 0.324032403240324, "grad_norm": 1.9196044206619263, "learning_rate": 1.968253659689005e-05, "loss": 0.5040374279022217, "memory(GiB)": 68.3, "step": 360, "token_acc": 0.8283985303652475, "train_speed(iter/s)": 0.040682 }, { "epoch": 0.33303330333033304, "grad_norm": 1.9835383892059326, "learning_rate": 1.96577896196974e-05, "loss": 0.5163045883178711, "memory(GiB)": 68.3, "step": 370, "token_acc": 0.8187339406680683, "train_speed(iter/s)": 0.040679 }, { "epoch": 0.342034203420342, "grad_norm": 2.098388195037842, "learning_rate": 1.9632130710076383e-05, "loss": 0.5065926074981689, "memory(GiB)": 68.3, "step": 380, "token_acc": 0.8242616033755275, "train_speed(iter/s)": 0.04068 }, { "epoch": 0.35103510351035105, "grad_norm": 1.8806556463241577, "learning_rate": 1.960556229085797e-05, "loss": 0.4967801094055176, "memory(GiB)": 68.3, "step": 390, "token_acc": 0.8285966071821987, "train_speed(iter/s)": 0.040692 }, { "epoch": 0.36003600360036003, "grad_norm": 2.0447497367858887, "learning_rate": 1.9578086870753153e-05, "loss": 0.5042286872863769, "memory(GiB)": 68.3, "step": 400, "token_acc": 0.8263780406159339, "train_speed(iter/s)": 0.040693 }, { "epoch": 0.369036903690369, "grad_norm": 1.947168231010437, "learning_rate": 1.954970704411609e-05, "loss": 0.5015206336975098, "memory(GiB)": 68.3, "step": 410, "token_acc": 0.8200773860705073, "train_speed(iter/s)": 0.04069 }, { "epoch": 0.37803780378037805, "grad_norm": 1.855016827583313, "learning_rate": 1.9520425490699107e-05, "loss": 0.4870131492614746, "memory(GiB)": 68.3, "step": 420, "token_acc": 0.8407563025210084, "train_speed(iter/s)": 0.040704 }, { "epoch": 0.387038703870387, "grad_norm": 1.8995352983474731, "learning_rate": 1.9490244975399678e-05, "loss": 0.48991098403930666, "memory(GiB)": 68.3, "step": 430, "token_acc": 0.8367172472750588, "train_speed(iter/s)": 0.040707 }, { "epoch": 0.39603960396039606, "grad_norm": 1.9746062755584717, "learning_rate": 1.9459168347999343e-05, "loss": 0.49413495063781737, "memory(GiB)": 68.3, "step": 440, "token_acc": 0.8217993079584776, "train_speed(iter/s)": 0.040722 }, { "epoch": 0.40504050405040504, "grad_norm": 1.9922826290130615, "learning_rate": 1.9427198542894628e-05, "loss": 0.478054141998291, "memory(GiB)": 68.3, "step": 450, "token_acc": 0.8396687194733489, "train_speed(iter/s)": 0.040729 }, { "epoch": 0.414041404140414, "grad_norm": 1.8262529373168945, "learning_rate": 1.9394338578819957e-05, "loss": 0.4965967178344727, "memory(GiB)": 68.3, "step": 460, "token_acc": 0.8291083916083916, "train_speed(iter/s)": 0.04073 }, { "epoch": 0.42304230423042305, "grad_norm": 1.6194044351577759, "learning_rate": 1.936059155856262e-05, "loss": 0.47453508377075193, "memory(GiB)": 68.3, "step": 470, "token_acc": 0.8382074479276247, "train_speed(iter/s)": 0.040729 }, { "epoch": 0.43204320432043203, "grad_norm": 1.9184072017669678, "learning_rate": 1.932596066866978e-05, "loss": 0.4665153980255127, "memory(GiB)": 68.3, "step": 480, "token_acc": 0.8344993441189331, "train_speed(iter/s)": 0.040723 }, { "epoch": 0.44104410441044106, "grad_norm": 1.7491145133972168, "learning_rate": 1.929044917914759e-05, "loss": 0.4606966972351074, "memory(GiB)": 68.3, "step": 490, "token_acc": 0.84466817341278, "train_speed(iter/s)": 0.040709 }, { "epoch": 0.45004500450045004, "grad_norm": 1.97507643699646, "learning_rate": 1.9254060443152435e-05, "loss": 0.47635550498962403, "memory(GiB)": 68.3, "step": 500, "token_acc": 0.8395522388059702, "train_speed(iter/s)": 0.040715 }, { "epoch": 0.45004500450045004, "eval_loss": 0.48444515466690063, "eval_runtime": 117.4773, "eval_samples_per_second": 12.215, "eval_steps_per_second": 0.383, "eval_token_acc": 0.8321749696233293, "step": 500 }, { "epoch": 0.459045904590459, "grad_norm": 2.1366748809814453, "learning_rate": 1.921679789667429e-05, "loss": 0.4868021965026855, "memory(GiB)": 74.54, "step": 510, "token_acc": 0.8326992287917738, "train_speed(iter/s)": 0.040314 }, { "epoch": 0.46804680468046805, "grad_norm": 2.1436243057250977, "learning_rate": 1.9178665058212306e-05, "loss": 0.4831557273864746, "memory(GiB)": 74.54, "step": 520, "token_acc": 0.8337397472844159, "train_speed(iter/s)": 0.04031 }, { "epoch": 0.47704770477047703, "grad_norm": 1.887610673904419, "learning_rate": 1.9139665528442544e-05, "loss": 0.4900979995727539, "memory(GiB)": 74.54, "step": 530, "token_acc": 0.8252338580880675, "train_speed(iter/s)": 0.040315 }, { "epoch": 0.48604860486048607, "grad_norm": 1.778539776802063, "learning_rate": 1.909980298987802e-05, "loss": 0.4595688819885254, "memory(GiB)": 74.54, "step": 540, "token_acc": 0.8390126692878986, "train_speed(iter/s)": 0.040313 }, { "epoch": 0.49504950495049505, "grad_norm": 2.031074285507202, "learning_rate": 1.9059081206520954e-05, "loss": 0.47982397079467776, "memory(GiB)": 74.54, "step": 550, "token_acc": 0.8332963374028857, "train_speed(iter/s)": 0.040319 }, { "epoch": 0.504050405040504, "grad_norm": 1.7411119937896729, "learning_rate": 1.9017504023507366e-05, "loss": 0.47092242240905763, "memory(GiB)": 74.54, "step": 560, "token_acc": 0.8331826401446655, "train_speed(iter/s)": 0.040327 }, { "epoch": 0.513051305130513, "grad_norm": 1.9507403373718262, "learning_rate": 1.897507536674401e-05, "loss": 0.473051929473877, "memory(GiB)": 74.54, "step": 570, "token_acc": 0.8324808184143222, "train_speed(iter/s)": 0.040337 }, { "epoch": 0.5220522052205221, "grad_norm": 1.8194775581359863, "learning_rate": 1.8931799242537664e-05, "loss": 0.4804567813873291, "memory(GiB)": 74.54, "step": 580, "token_acc": 0.8376344086021505, "train_speed(iter/s)": 0.04034 }, { "epoch": 0.5310531053105311, "grad_norm": 1.663552165031433, "learning_rate": 1.8887679737216835e-05, "loss": 0.4625405311584473, "memory(GiB)": 74.54, "step": 590, "token_acc": 0.8455850369725968, "train_speed(iter/s)": 0.04034 }, { "epoch": 0.54005400540054, "grad_norm": 1.968461036682129, "learning_rate": 1.8842721016745905e-05, "loss": 0.4602372646331787, "memory(GiB)": 74.54, "step": 600, "token_acc": 0.8317933641327173, "train_speed(iter/s)": 0.040343 }, { "epoch": 0.549054905490549, "grad_norm": 1.9484490156173706, "learning_rate": 1.8796927326331783e-05, "loss": 0.45257129669189455, "memory(GiB)": 74.54, "step": 610, "token_acc": 0.8373316498316499, "train_speed(iter/s)": 0.040343 }, { "epoch": 0.558055805580558, "grad_norm": 2.0010809898376465, "learning_rate": 1.8750302990023023e-05, "loss": 0.4624796390533447, "memory(GiB)": 74.54, "step": 620, "token_acc": 0.8330117899249732, "train_speed(iter/s)": 0.04035 }, { "epoch": 0.5670567056705671, "grad_norm": 2.1292455196380615, "learning_rate": 1.8702852410301556e-05, "loss": 0.4666603565216064, "memory(GiB)": 74.54, "step": 630, "token_acc": 0.8413180143073922, "train_speed(iter/s)": 0.040354 }, { "epoch": 0.5760576057605761, "grad_norm": 1.8475803136825562, "learning_rate": 1.865458006766696e-05, "loss": 0.4536900520324707, "memory(GiB)": 74.54, "step": 640, "token_acc": 0.8346206269877329, "train_speed(iter/s)": 0.040359 }, { "epoch": 0.585058505850585, "grad_norm": 1.9390885829925537, "learning_rate": 1.860549052021342e-05, "loss": 0.4544112205505371, "memory(GiB)": 74.54, "step": 650, "token_acc": 0.8367626886145405, "train_speed(iter/s)": 0.040355 }, { "epoch": 0.594059405940594, "grad_norm": 1.7429540157318115, "learning_rate": 1.8555588403199304e-05, "loss": 0.4384955406188965, "memory(GiB)": 74.54, "step": 660, "token_acc": 0.8417298261257244, "train_speed(iter/s)": 0.04035 }, { "epoch": 0.603060306030603, "grad_norm": 2.0337181091308594, "learning_rate": 1.8504878428609506e-05, "loss": 0.46024494171142577, "memory(GiB)": 74.54, "step": 670, "token_acc": 0.8392979256895373, "train_speed(iter/s)": 0.040343 }, { "epoch": 0.6120612061206121, "grad_norm": 1.9363151788711548, "learning_rate": 1.8453365384710506e-05, "loss": 0.4446521759033203, "memory(GiB)": 74.54, "step": 680, "token_acc": 0.8308807379749615, "train_speed(iter/s)": 0.04034 }, { "epoch": 0.6210621062106211, "grad_norm": 1.9249675273895264, "learning_rate": 1.8401054135598228e-05, "loss": 0.44910879135131837, "memory(GiB)": 74.54, "step": 690, "token_acc": 0.8436960276338514, "train_speed(iter/s)": 0.040347 }, { "epoch": 0.6300630063006301, "grad_norm": 2.0293335914611816, "learning_rate": 1.834794962073878e-05, "loss": 0.4501783847808838, "memory(GiB)": 74.54, "step": 700, "token_acc": 0.8366346742903819, "train_speed(iter/s)": 0.040353 }, { "epoch": 0.639063906390639, "grad_norm": 2.1260316371917725, "learning_rate": 1.829405685450202e-05, "loss": 0.4506657600402832, "memory(GiB)": 74.54, "step": 710, "token_acc": 0.8333333333333334, "train_speed(iter/s)": 0.040362 }, { "epoch": 0.648064806480648, "grad_norm": 1.8729071617126465, "learning_rate": 1.8239380925688087e-05, "loss": 0.4430402755737305, "memory(GiB)": 74.54, "step": 720, "token_acc": 0.8478399659502022, "train_speed(iter/s)": 0.040365 }, { "epoch": 0.6570657065706571, "grad_norm": 1.9187947511672974, "learning_rate": 1.8183926997046905e-05, "loss": 0.4478912353515625, "memory(GiB)": 74.54, "step": 730, "token_acc": 0.8519141775347077, "train_speed(iter/s)": 0.040364 }, { "epoch": 0.6660666066606661, "grad_norm": 2.07631254196167, "learning_rate": 1.812770030479066e-05, "loss": 0.4402505397796631, "memory(GiB)": 74.54, "step": 740, "token_acc": 0.8526605893576426, "train_speed(iter/s)": 0.040366 }, { "epoch": 0.6750675067506751, "grad_norm": 1.8189442157745361, "learning_rate": 1.8070706158099417e-05, "loss": 0.4404914855957031, "memory(GiB)": 74.54, "step": 750, "token_acc": 0.8409304511278195, "train_speed(iter/s)": 0.040367 }, { "epoch": 0.684068406840684, "grad_norm": 1.9871678352355957, "learning_rate": 1.8012949938619756e-05, "loss": 0.4483049392700195, "memory(GiB)": 74.54, "step": 760, "token_acc": 0.8431750106974754, "train_speed(iter/s)": 0.040371 }, { "epoch": 0.693069306930693, "grad_norm": 1.8938976526260376, "learning_rate": 1.7954437099956657e-05, "loss": 0.44423818588256836, "memory(GiB)": 74.54, "step": 770, "token_acc": 0.8477157360406091, "train_speed(iter/s)": 0.040371 }, { "epoch": 0.7020702070207021, "grad_norm": 1.8947218656539917, "learning_rate": 1.7895173167158514e-05, "loss": 0.4492767333984375, "memory(GiB)": 74.54, "step": 780, "token_acc": 0.837278737470676, "train_speed(iter/s)": 0.040374 }, { "epoch": 0.7110711071107111, "grad_norm": 1.9695574045181274, "learning_rate": 1.7835163736195447e-05, "loss": 0.44904842376708987, "memory(GiB)": 74.54, "step": 790, "token_acc": 0.8408003479773815, "train_speed(iter/s)": 0.040375 }, { "epoch": 0.7200720072007201, "grad_norm": 2.00817608833313, "learning_rate": 1.777441447343091e-05, "loss": 0.45390868186950684, "memory(GiB)": 74.54, "step": 800, "token_acc": 0.8411726099321811, "train_speed(iter/s)": 0.040379 }, { "epoch": 0.729072907290729, "grad_norm": 2.0400583744049072, "learning_rate": 1.7712931115086633e-05, "loss": 0.4411576747894287, "memory(GiB)": 74.54, "step": 810, "token_acc": 0.8399218071242398, "train_speed(iter/s)": 0.04038 }, { "epoch": 0.738073807380738, "grad_norm": 2.0157155990600586, "learning_rate": 1.7650719466700994e-05, "loss": 0.44756488800048827, "memory(GiB)": 74.54, "step": 820, "token_acc": 0.842788038698329, "train_speed(iter/s)": 0.040376 }, { "epoch": 0.7470747074707471, "grad_norm": 1.7088335752487183, "learning_rate": 1.7587785402580828e-05, "loss": 0.43597002029418946, "memory(GiB)": 74.54, "step": 830, "token_acc": 0.8466036887089519, "train_speed(iter/s)": 0.040379 }, { "epoch": 0.7560756075607561, "grad_norm": 2.1911604404449463, "learning_rate": 1.752413486524675e-05, "loss": 0.44062347412109376, "memory(GiB)": 74.54, "step": 840, "token_acc": 0.8505315822388994, "train_speed(iter/s)": 0.040375 }, { "epoch": 0.7650765076507651, "grad_norm": 2.0149848461151123, "learning_rate": 1.7459773864872042e-05, "loss": 0.4424751281738281, "memory(GiB)": 74.54, "step": 850, "token_acc": 0.8476879246110015, "train_speed(iter/s)": 0.040376 }, { "epoch": 0.774077407740774, "grad_norm": 1.9014195203781128, "learning_rate": 1.7394708478715127e-05, "loss": 0.4621281623840332, "memory(GiB)": 74.54, "step": 860, "token_acc": 0.8423601937472479, "train_speed(iter/s)": 0.040378 }, { "epoch": 0.783078307830783, "grad_norm": 2.0565760135650635, "learning_rate": 1.7328944850545745e-05, "loss": 0.4593350410461426, "memory(GiB)": 74.54, "step": 870, "token_acc": 0.8399521531100479, "train_speed(iter/s)": 0.040378 }, { "epoch": 0.7920792079207921, "grad_norm": 2.0428307056427, "learning_rate": 1.7262489190064818e-05, "loss": 0.43943395614624026, "memory(GiB)": 74.54, "step": 880, "token_acc": 0.8423470453121737, "train_speed(iter/s)": 0.04038 }, { "epoch": 0.8010801080108011, "grad_norm": 2.316945791244507, "learning_rate": 1.7195347772318116e-05, "loss": 0.43985910415649415, "memory(GiB)": 74.54, "step": 890, "token_acc": 0.8351231838281743, "train_speed(iter/s)": 0.040379 }, { "epoch": 0.8100810081008101, "grad_norm": 2.041092872619629, "learning_rate": 1.7127526937103713e-05, "loss": 0.4424757957458496, "memory(GiB)": 74.54, "step": 900, "token_acc": 0.841919080256467, "train_speed(iter/s)": 0.04038 }, { "epoch": 0.819081908190819, "grad_norm": 2.2010583877563477, "learning_rate": 1.705903308837339e-05, "loss": 0.4423489570617676, "memory(GiB)": 74.54, "step": 910, "token_acc": 0.8436460412508316, "train_speed(iter/s)": 0.040384 }, { "epoch": 0.828082808280828, "grad_norm": 1.804849624633789, "learning_rate": 1.6989872693627916e-05, "loss": 0.43178791999816896, "memory(GiB)": 74.54, "step": 920, "token_acc": 0.8569312169312169, "train_speed(iter/s)": 0.040391 }, { "epoch": 0.8370837083708371, "grad_norm": 2.3260996341705322, "learning_rate": 1.6920052283306364e-05, "loss": 0.4507165431976318, "memory(GiB)": 74.54, "step": 930, "token_acc": 0.8385640099345225, "train_speed(iter/s)": 0.040389 }, { "epoch": 0.8460846084608461, "grad_norm": 2.029878616333008, "learning_rate": 1.684957845016949e-05, "loss": 0.423465633392334, "memory(GiB)": 74.54, "step": 940, "token_acc": 0.8474983613720778, "train_speed(iter/s)": 0.040396 }, { "epoch": 0.8550855085508551, "grad_norm": 2.0568127632141113, "learning_rate": 1.677845784867719e-05, "loss": 0.426534366607666, "memory(GiB)": 74.54, "step": 950, "token_acc": 0.8443046506403056, "train_speed(iter/s)": 0.040397 }, { "epoch": 0.8640864086408641, "grad_norm": 2.0447678565979004, "learning_rate": 1.6706697194360186e-05, "loss": 0.43904976844787597, "memory(GiB)": 74.54, "step": 960, "token_acc": 0.843986543313709, "train_speed(iter/s)": 0.040403 }, { "epoch": 0.873087308730873, "grad_norm": 1.8627592325210571, "learning_rate": 1.6634303263185885e-05, "loss": 0.4334832191467285, "memory(GiB)": 74.54, "step": 970, "token_acc": 0.8500109003706126, "train_speed(iter/s)": 0.040406 }, { "epoch": 0.8820882088208821, "grad_norm": 2.1592624187469482, "learning_rate": 1.656128289091859e-05, "loss": 0.43813695907592776, "memory(GiB)": 74.54, "step": 980, "token_acc": 0.8389203308663474, "train_speed(iter/s)": 0.040402 }, { "epoch": 0.8910891089108911, "grad_norm": 1.7612345218658447, "learning_rate": 1.6487642972474006e-05, "loss": 0.43879289627075196, "memory(GiB)": 74.54, "step": 990, "token_acc": 0.8460222412318221, "train_speed(iter/s)": 0.040402 }, { "epoch": 0.9000900090009001, "grad_norm": 2.0122318267822266, "learning_rate": 1.641339046126822e-05, "loss": 0.4455322265625, "memory(GiB)": 74.54, "step": 1000, "token_acc": 0.8455068614431164, "train_speed(iter/s)": 0.040397 }, { "epoch": 0.9000900090009001, "eval_loss": 0.43926388025283813, "eval_runtime": 113.4684, "eval_samples_per_second": 12.647, "eval_steps_per_second": 0.397, "eval_token_acc": 0.8410206561360875, "step": 1000 }, { "epoch": 0.9090909090909091, "grad_norm": 2.066300630569458, "learning_rate": 1.6338532368561105e-05, "loss": 0.4375774383544922, "memory(GiB)": 74.54, "step": 1010, "token_acc": 0.8390414378432351, "train_speed(iter/s)": 0.040187 }, { "epoch": 0.918091809180918, "grad_norm": 2.2568578720092773, "learning_rate": 1.62630757627943e-05, "loss": 0.4385653495788574, "memory(GiB)": 74.54, "step": 1020, "token_acc": 0.8342480790340285, "train_speed(iter/s)": 0.040185 }, { "epoch": 0.9270927092709271, "grad_norm": 1.963052749633789, "learning_rate": 1.6187027768923767e-05, "loss": 0.43105306625366213, "memory(GiB)": 74.54, "step": 1030, "token_acc": 0.8509454949944383, "train_speed(iter/s)": 0.040187 }, { "epoch": 0.9360936093609361, "grad_norm": 1.902685523033142, "learning_rate": 1.6110395567747025e-05, "loss": 0.4382938385009766, "memory(GiB)": 74.54, "step": 1040, "token_acc": 0.8346938775510204, "train_speed(iter/s)": 0.040185 }, { "epoch": 0.9450945094509451, "grad_norm": 1.8732327222824097, "learning_rate": 1.6033186395225095e-05, "loss": 0.41572961807250974, "memory(GiB)": 74.54, "step": 1050, "token_acc": 0.85475935828877, "train_speed(iter/s)": 0.04019 }, { "epoch": 0.9540954095409541, "grad_norm": 1.869422197341919, "learning_rate": 1.5955407541799274e-05, "loss": 0.43001718521118165, "memory(GiB)": 74.54, "step": 1060, "token_acc": 0.8342636324602833, "train_speed(iter/s)": 0.040189 }, { "epoch": 0.963096309630963, "grad_norm": 2.065873861312866, "learning_rate": 1.5877066351702707e-05, "loss": 0.43995866775512693, "memory(GiB)": 74.54, "step": 1070, "token_acc": 0.8477516059957173, "train_speed(iter/s)": 0.040194 }, { "epoch": 0.9720972097209721, "grad_norm": 2.1846609115600586, "learning_rate": 1.5798170222266933e-05, "loss": 0.4312899589538574, "memory(GiB)": 74.54, "step": 1080, "token_acc": 0.8568353067814855, "train_speed(iter/s)": 0.040196 }, { "epoch": 0.9810981098109811, "grad_norm": 2.151474714279175, "learning_rate": 1.571872660322338e-05, "loss": 0.431905460357666, "memory(GiB)": 74.54, "step": 1090, "token_acc": 0.8473539953615855, "train_speed(iter/s)": 0.040202 }, { "epoch": 0.9900990099009901, "grad_norm": 2.0136258602142334, "learning_rate": 1.563874299599995e-05, "loss": 0.4207723140716553, "memory(GiB)": 74.54, "step": 1100, "token_acc": 0.8404571428571429, "train_speed(iter/s)": 0.040206 }, { "epoch": 0.9990999099909991, "grad_norm": 2.0286359786987305, "learning_rate": 1.555822695301266e-05, "loss": 0.41998815536499023, "memory(GiB)": 74.54, "step": 1110, "token_acc": 0.8462002412545235, "train_speed(iter/s)": 0.040207 }, { "epoch": 1.008100810081008, "grad_norm": 2.1229543685913086, "learning_rate": 1.5477186076952567e-05, "loss": 0.41786656379699705, "memory(GiB)": 74.54, "step": 1120, "token_acc": 0.8457294195541823, "train_speed(iter/s)": 0.040226 }, { "epoch": 1.0171017101710171, "grad_norm": 2.2496182918548584, "learning_rate": 1.5395628020067825e-05, "loss": 0.41992764472961425, "memory(GiB)": 74.54, "step": 1130, "token_acc": 0.8452407614781635, "train_speed(iter/s)": 0.040225 }, { "epoch": 1.026102610261026, "grad_norm": 2.0818288326263428, "learning_rate": 1.531356048344117e-05, "loss": 0.41519851684570314, "memory(GiB)": 74.54, "step": 1140, "token_acc": 0.8480816145486804, "train_speed(iter/s)": 0.040226 }, { "epoch": 1.035103510351035, "grad_norm": 1.9498157501220703, "learning_rate": 1.523099121626273e-05, "loss": 0.4007615089416504, "memory(GiB)": 74.54, "step": 1150, "token_acc": 0.8642224012892828, "train_speed(iter/s)": 0.040229 }, { "epoch": 1.0441044104410442, "grad_norm": 2.238085985183716, "learning_rate": 1.5147928015098309e-05, "loss": 0.416591739654541, "memory(GiB)": 74.54, "step": 1160, "token_acc": 0.8449678800856532, "train_speed(iter/s)": 0.040231 }, { "epoch": 1.053105310531053, "grad_norm": 1.884536862373352, "learning_rate": 1.506437872315321e-05, "loss": 0.4058389663696289, "memory(GiB)": 74.54, "step": 1170, "token_acc": 0.8544316996871741, "train_speed(iter/s)": 0.040234 }, { "epoch": 1.0621062106210621, "grad_norm": 2.506772041320801, "learning_rate": 1.4980351229531642e-05, "loss": 0.4066319465637207, "memory(GiB)": 74.54, "step": 1180, "token_acc": 0.8476423487544484, "train_speed(iter/s)": 0.040236 }, { "epoch": 1.071107110711071, "grad_norm": 2.208542823791504, "learning_rate": 1.4895853468491779e-05, "loss": 0.4183638572692871, "memory(GiB)": 74.54, "step": 1190, "token_acc": 0.8479634066652145, "train_speed(iter/s)": 0.040233 }, { "epoch": 1.08010801080108, "grad_norm": 2.0623791217803955, "learning_rate": 1.4810893418696595e-05, "loss": 0.4236001014709473, "memory(GiB)": 74.54, "step": 1200, "token_acc": 0.8621627274628739, "train_speed(iter/s)": 0.040231 }, { "epoch": 1.0891089108910892, "grad_norm": 1.9633852243423462, "learning_rate": 1.4725479102460467e-05, "loss": 0.4070269584655762, "memory(GiB)": 74.54, "step": 1210, "token_acc": 0.8519945602901179, "train_speed(iter/s)": 0.040233 }, { "epoch": 1.098109810981098, "grad_norm": 2.425140857696533, "learning_rate": 1.4639618584991679e-05, "loss": 0.4048626899719238, "memory(GiB)": 74.54, "step": 1220, "token_acc": 0.8575699338031176, "train_speed(iter/s)": 0.040237 }, { "epoch": 1.1071107110711071, "grad_norm": 1.9179662466049194, "learning_rate": 1.455331997363086e-05, "loss": 0.41301331520080564, "memory(GiB)": 74.54, "step": 1230, "token_acc": 0.8553283100107643, "train_speed(iter/s)": 0.040242 }, { "epoch": 1.116111611161116, "grad_norm": 2.332228660583496, "learning_rate": 1.4466591417085462e-05, "loss": 0.4197710037231445, "memory(GiB)": 74.54, "step": 1240, "token_acc": 0.8447427293064877, "train_speed(iter/s)": 0.040246 }, { "epoch": 1.125112511251125, "grad_norm": 2.093475580215454, "learning_rate": 1.4379441104660313e-05, "loss": 0.4093982696533203, "memory(GiB)": 74.54, "step": 1250, "token_acc": 0.8562723261189326, "train_speed(iter/s)": 0.040245 }, { "epoch": 1.1341134113411342, "grad_norm": 2.2746119499206543, "learning_rate": 1.4291877265484352e-05, "loss": 0.4102977752685547, "memory(GiB)": 74.54, "step": 1260, "token_acc": 0.854287556415216, "train_speed(iter/s)": 0.040249 }, { "epoch": 1.143114311431143, "grad_norm": 2.2232649326324463, "learning_rate": 1.4203908167733596e-05, "loss": 0.418546724319458, "memory(GiB)": 74.54, "step": 1270, "token_acc": 0.8427280550774526, "train_speed(iter/s)": 0.040255 }, { "epoch": 1.1521152115211521, "grad_norm": 1.9787334203720093, "learning_rate": 1.4115542117850415e-05, "loss": 0.410016393661499, "memory(GiB)": 74.54, "step": 1280, "token_acc": 0.86048545812377, "train_speed(iter/s)": 0.040258 }, { "epoch": 1.161116111611161, "grad_norm": 2.3660764694213867, "learning_rate": 1.4026787459759215e-05, "loss": 0.4094221591949463, "memory(GiB)": 74.54, "step": 1290, "token_acc": 0.8500684618895481, "train_speed(iter/s)": 0.040257 }, { "epoch": 1.17011701170117, "grad_norm": 2.0939202308654785, "learning_rate": 1.3937652574078543e-05, "loss": 0.40435123443603516, "memory(GiB)": 74.54, "step": 1300, "token_acc": 0.8442178346712953, "train_speed(iter/s)": 0.040258 }, { "epoch": 1.1791179117911792, "grad_norm": 2.3308207988739014, "learning_rate": 1.3848145877329778e-05, "loss": 0.4132570743560791, "memory(GiB)": 74.54, "step": 1310, "token_acc": 0.8504208935894668, "train_speed(iter/s)": 0.040261 }, { "epoch": 1.188118811881188, "grad_norm": 2.053710460662842, "learning_rate": 1.3758275821142382e-05, "loss": 0.39916296005249025, "memory(GiB)": 74.54, "step": 1320, "token_acc": 0.8543060651845457, "train_speed(iter/s)": 0.04026 }, { "epoch": 1.1971197119711972, "grad_norm": 2.4674737453460693, "learning_rate": 1.3668050891455873e-05, "loss": 0.3984804630279541, "memory(GiB)": 74.54, "step": 1330, "token_acc": 0.8585640138408305, "train_speed(iter/s)": 0.040259 }, { "epoch": 1.206120612061206, "grad_norm": 2.1947102546691895, "learning_rate": 1.357747960771854e-05, "loss": 0.42041912078857424, "memory(GiB)": 74.54, "step": 1340, "token_acc": 0.8391608391608392, "train_speed(iter/s)": 0.040262 }, { "epoch": 1.215121512151215, "grad_norm": 2.0035359859466553, "learning_rate": 1.3486570522082989e-05, "loss": 0.4119097709655762, "memory(GiB)": 74.54, "step": 1350, "token_acc": 0.8620765508139023, "train_speed(iter/s)": 0.040265 }, { "epoch": 1.2241224122412242, "grad_norm": 2.161275863647461, "learning_rate": 1.3395332218598629e-05, "loss": 0.4057816982269287, "memory(GiB)": 74.54, "step": 1360, "token_acc": 0.8410107334525939, "train_speed(iter/s)": 0.040268 }, { "epoch": 1.233123312331233, "grad_norm": 2.300550937652588, "learning_rate": 1.3303773312401107e-05, "loss": 0.40541529655456543, "memory(GiB)": 74.54, "step": 1370, "token_acc": 0.8559489773477018, "train_speed(iter/s)": 0.040269 }, { "epoch": 1.2421242124212422, "grad_norm": 2.306222915649414, "learning_rate": 1.3211902448898841e-05, "loss": 0.40516185760498047, "memory(GiB)": 74.54, "step": 1380, "token_acc": 0.8569854561480829, "train_speed(iter/s)": 0.04027 }, { "epoch": 1.251125112511251, "grad_norm": 2.1976640224456787, "learning_rate": 1.3119728302956676e-05, "loss": 0.4062767505645752, "memory(GiB)": 74.54, "step": 1390, "token_acc": 0.8493668073761387, "train_speed(iter/s)": 0.040273 }, { "epoch": 1.2601260126012601, "grad_norm": 2.333188056945801, "learning_rate": 1.302725957807676e-05, "loss": 0.39322872161865235, "memory(GiB)": 74.54, "step": 1400, "token_acc": 0.860806663743972, "train_speed(iter/s)": 0.040272 }, { "epoch": 1.2691269126912692, "grad_norm": 2.356128215789795, "learning_rate": 1.2934505005576738e-05, "loss": 0.39969046115875245, "memory(GiB)": 74.54, "step": 1410, "token_acc": 0.8573583279465632, "train_speed(iter/s)": 0.040268 }, { "epoch": 1.278127812781278, "grad_norm": 2.1411805152893066, "learning_rate": 1.2841473343765269e-05, "loss": 0.39504408836364746, "memory(GiB)": 74.54, "step": 1420, "token_acc": 0.8612200435729848, "train_speed(iter/s)": 0.040269 }, { "epoch": 1.2871287128712872, "grad_norm": 2.187964677810669, "learning_rate": 1.274817337711506e-05, "loss": 0.4120161056518555, "memory(GiB)": 74.54, "step": 1430, "token_acc": 0.849435382685069, "train_speed(iter/s)": 0.040272 }, { "epoch": 1.296129612961296, "grad_norm": 2.098618745803833, "learning_rate": 1.2654613915433373e-05, "loss": 0.39701004028320314, "memory(GiB)": 74.54, "step": 1440, "token_acc": 0.8512253307308609, "train_speed(iter/s)": 0.040274 }, { "epoch": 1.3051305130513051, "grad_norm": 2.000491142272949, "learning_rate": 1.2560803793030179e-05, "loss": 0.40303592681884765, "memory(GiB)": 74.54, "step": 1450, "token_acc": 0.8583260680034873, "train_speed(iter/s)": 0.040274 }, { "epoch": 1.3141314131413142, "grad_norm": 2.1380844116210938, "learning_rate": 1.2466751867883959e-05, "loss": 0.397491455078125, "memory(GiB)": 74.54, "step": 1460, "token_acc": 0.8592755214050494, "train_speed(iter/s)": 0.040276 }, { "epoch": 1.323132313231323, "grad_norm": 2.110633611679077, "learning_rate": 1.2372467020805332e-05, "loss": 0.4155548095703125, "memory(GiB)": 74.54, "step": 1470, "token_acc": 0.8501522401043932, "train_speed(iter/s)": 0.040278 }, { "epoch": 1.3321332133213322, "grad_norm": 2.1096761226654053, "learning_rate": 1.2277958154598444e-05, "loss": 0.41139373779296873, "memory(GiB)": 74.54, "step": 1480, "token_acc": 0.8384369287020109, "train_speed(iter/s)": 0.040279 }, { "epoch": 1.341134113411341, "grad_norm": 2.346917152404785, "learning_rate": 1.2183234193220362e-05, "loss": 0.3898932456970215, "memory(GiB)": 74.54, "step": 1490, "token_acc": 0.8620309050772627, "train_speed(iter/s)": 0.04028 }, { "epoch": 1.3501350135013501, "grad_norm": 2.1962385177612305, "learning_rate": 1.2088304080938404e-05, "loss": 0.3953920841217041, "memory(GiB)": 74.54, "step": 1500, "token_acc": 0.8660930950805207, "train_speed(iter/s)": 0.040278 }, { "epoch": 1.3501350135013501, "eval_loss": 0.42292386293411255, "eval_runtime": 112.5032, "eval_samples_per_second": 12.755, "eval_steps_per_second": 0.4, "eval_token_acc": 0.8482138517618469, "step": 1500 }, { "epoch": 1.3591359135913592, "grad_norm": 2.1046359539031982, "learning_rate": 1.1993176781485608e-05, "loss": 0.4179078578948975, "memory(GiB)": 74.54, "step": 1510, "token_acc": 0.8453704665904603, "train_speed(iter/s)": 0.040153 }, { "epoch": 1.368136813681368, "grad_norm": 2.0981786251068115, "learning_rate": 1.1897861277214304e-05, "loss": 0.38443617820739745, "memory(GiB)": 74.54, "step": 1520, "token_acc": 0.8514383855732074, "train_speed(iter/s)": 0.040151 }, { "epoch": 1.3771377137713772, "grad_norm": 2.335702419281006, "learning_rate": 1.1802366568247998e-05, "loss": 0.39206039905548096, "memory(GiB)": 74.54, "step": 1530, "token_acc": 0.8556973163220414, "train_speed(iter/s)": 0.040152 }, { "epoch": 1.386138613861386, "grad_norm": 2.2659618854522705, "learning_rate": 1.1706701671631504e-05, "loss": 0.39416942596435545, "memory(GiB)": 74.54, "step": 1540, "token_acc": 0.8575920934411501, "train_speed(iter/s)": 0.040154 }, { "epoch": 1.3951395139513951, "grad_norm": 2.3435161113739014, "learning_rate": 1.1610875620479531e-05, "loss": 0.4044766426086426, "memory(GiB)": 74.54, "step": 1550, "token_acc": 0.8510254676583277, "train_speed(iter/s)": 0.040156 }, { "epoch": 1.4041404140414042, "grad_norm": 2.155761241912842, "learning_rate": 1.1514897463123735e-05, "loss": 0.39972786903381347, "memory(GiB)": 74.54, "step": 1560, "token_acc": 0.858606101091071, "train_speed(iter/s)": 0.040158 }, { "epoch": 1.413141314131413, "grad_norm": 2.231323719024658, "learning_rate": 1.141877626225833e-05, "loss": 0.4081737518310547, "memory(GiB)": 74.54, "step": 1570, "token_acc": 0.8568965517241379, "train_speed(iter/s)": 0.040158 }, { "epoch": 1.4221422142214222, "grad_norm": 2.0848968029022217, "learning_rate": 1.1322521094084352e-05, "loss": 0.4104423999786377, "memory(GiB)": 74.54, "step": 1580, "token_acc": 0.8589771972548151, "train_speed(iter/s)": 0.04016 }, { "epoch": 1.431143114311431, "grad_norm": 2.1602284908294678, "learning_rate": 1.1226141047452628e-05, "loss": 0.39746341705322263, "memory(GiB)": 74.54, "step": 1590, "token_acc": 0.8528940745824755, "train_speed(iter/s)": 0.040163 }, { "epoch": 1.4401440144014401, "grad_norm": 2.202800750732422, "learning_rate": 1.1129645223005592e-05, "loss": 0.3975072383880615, "memory(GiB)": 74.54, "step": 1600, "token_acc": 0.85933056224021, "train_speed(iter/s)": 0.040165 }, { "epoch": 1.4491449144914492, "grad_norm": 2.0750746726989746, "learning_rate": 1.103304273231794e-05, "loss": 0.4078987598419189, "memory(GiB)": 74.54, "step": 1610, "token_acc": 0.8481820114820328, "train_speed(iter/s)": 0.040169 }, { "epoch": 1.458145814581458, "grad_norm": 2.0705268383026123, "learning_rate": 1.0936342697036276e-05, "loss": 0.40749187469482423, "memory(GiB)": 74.54, "step": 1620, "token_acc": 0.8431718061674008, "train_speed(iter/s)": 0.04017 }, { "epoch": 1.4671467146714672, "grad_norm": 2.2939624786376953, "learning_rate": 1.0839554248017816e-05, "loss": 0.39917492866516113, "memory(GiB)": 74.54, "step": 1630, "token_acc": 0.8533273981749387, "train_speed(iter/s)": 0.040171 }, { "epoch": 1.476147614761476, "grad_norm": 2.232426166534424, "learning_rate": 1.0742686524468193e-05, "loss": 0.3895902156829834, "memory(GiB)": 74.54, "step": 1640, "token_acc": 0.8666959964804224, "train_speed(iter/s)": 0.040172 }, { "epoch": 1.4851485148514851, "grad_norm": 2.317064046859741, "learning_rate": 1.0645748673078513e-05, "loss": 0.4001925468444824, "memory(GiB)": 74.54, "step": 1650, "token_acc": 0.8580047403576815, "train_speed(iter/s)": 0.040177 }, { "epoch": 1.4941494149414942, "grad_norm": 2.4603018760681152, "learning_rate": 1.0548749847161666e-05, "loss": 0.4078868865966797, "memory(GiB)": 74.54, "step": 1660, "token_acc": 0.8525682355469589, "train_speed(iter/s)": 0.04018 }, { "epoch": 1.5031503150315033, "grad_norm": 2.2700588703155518, "learning_rate": 1.0451699205788031e-05, "loss": 0.3826925277709961, "memory(GiB)": 74.54, "step": 1670, "token_acc": 0.8540529189416212, "train_speed(iter/s)": 0.040177 }, { "epoch": 1.5121512151215122, "grad_norm": 2.1843454837799072, "learning_rate": 1.0354605912920643e-05, "loss": 0.39476428031921384, "memory(GiB)": 74.54, "step": 1680, "token_acc": 0.8572723153602175, "train_speed(iter/s)": 0.040177 }, { "epoch": 1.521152115211521, "grad_norm": 2.183195114135742, "learning_rate": 1.0257479136549889e-05, "loss": 0.4017205715179443, "memory(GiB)": 74.54, "step": 1690, "token_acc": 0.858510389913612, "train_speed(iter/s)": 0.040177 }, { "epoch": 1.5301530153015301, "grad_norm": 2.2219948768615723, "learning_rate": 1.0160328047827805e-05, "loss": 0.3950798988342285, "memory(GiB)": 74.54, "step": 1700, "token_acc": 0.859968881973772, "train_speed(iter/s)": 0.04018 }, { "epoch": 1.5391539153915392, "grad_norm": 2.1306684017181396, "learning_rate": 1.006316182020213e-05, "loss": 0.3851861238479614, "memory(GiB)": 74.54, "step": 1710, "token_acc": 0.8605112384310268, "train_speed(iter/s)": 0.040185 }, { "epoch": 1.5481548154815483, "grad_norm": 2.3634705543518066, "learning_rate": 9.965989628550073e-06, "loss": 0.3927136421203613, "memory(GiB)": 74.54, "step": 1720, "token_acc": 0.8631741821396994, "train_speed(iter/s)": 0.040185 }, { "epoch": 1.5571557155715572, "grad_norm": 2.1868417263031006, "learning_rate": 9.868820648311998e-06, "loss": 0.3937791585922241, "memory(GiB)": 74.54, "step": 1730, "token_acc": 0.8506729331339458, "train_speed(iter/s)": 0.04019 }, { "epoch": 1.566156615661566, "grad_norm": 2.058154344558716, "learning_rate": 9.771664054625036e-06, "loss": 0.4051863193511963, "memory(GiB)": 74.54, "step": 1740, "token_acc": 0.8571127057830308, "train_speed(iter/s)": 0.04019 }, { "epoch": 1.5751575157515751, "grad_norm": 2.278233051300049, "learning_rate": 9.674529021456711e-06, "loss": 0.3995014429092407, "memory(GiB)": 74.54, "step": 1750, "token_acc": 0.8531134736385333, "train_speed(iter/s)": 0.04019 }, { "epoch": 1.5841584158415842, "grad_norm": 2.4994163513183594, "learning_rate": 9.577424720738725e-06, "loss": 0.3964822769165039, "memory(GiB)": 74.54, "step": 1760, "token_acc": 0.8614113159567705, "train_speed(iter/s)": 0.040189 }, { "epoch": 1.5931593159315933, "grad_norm": 2.2877440452575684, "learning_rate": 9.480360321500866e-06, "loss": 0.3912468433380127, "memory(GiB)": 74.54, "step": 1770, "token_acc": 0.8542329726288987, "train_speed(iter/s)": 0.04019 }, { "epoch": 1.6021602160216022, "grad_norm": 2.2842419147491455, "learning_rate": 9.38334498900525e-06, "loss": 0.396860408782959, "memory(GiB)": 74.54, "step": 1780, "token_acc": 0.8597612958226769, "train_speed(iter/s)": 0.040193 }, { "epoch": 1.611161116111611, "grad_norm": 2.171830415725708, "learning_rate": 9.28638788388088e-06, "loss": 0.39132468700408934, "memory(GiB)": 74.54, "step": 1790, "token_acc": 0.8446624087591241, "train_speed(iter/s)": 0.040193 }, { "epoch": 1.6201620162016201, "grad_norm": 2.2504782676696777, "learning_rate": 9.189498161258678e-06, "loss": 0.39133219718933104, "memory(GiB)": 74.54, "step": 1800, "token_acc": 0.8526747195858498, "train_speed(iter/s)": 0.040193 }, { "epoch": 1.6291629162916292, "grad_norm": 2.2380685806274414, "learning_rate": 9.092684969906994e-06, "loss": 0.39520695209503176, "memory(GiB)": 74.54, "step": 1810, "token_acc": 0.8510874389702618, "train_speed(iter/s)": 0.040195 }, { "epoch": 1.6381638163816383, "grad_norm": 2.3991379737854004, "learning_rate": 8.995957451367751e-06, "loss": 0.39344358444213867, "memory(GiB)": 74.54, "step": 1820, "token_acc": 0.8661971830985915, "train_speed(iter/s)": 0.040196 }, { "epoch": 1.6471647164716472, "grad_norm": 2.167818307876587, "learning_rate": 8.899324739093255e-06, "loss": 0.38270139694213867, "memory(GiB)": 74.54, "step": 1830, "token_acc": 0.8632143593975655, "train_speed(iter/s)": 0.040195 }, { "epoch": 1.656165616561656, "grad_norm": 2.1482577323913574, "learning_rate": 8.802795957583774e-06, "loss": 0.38856942653656007, "memory(GiB)": 74.54, "step": 1840, "token_acc": 0.8508108108108108, "train_speed(iter/s)": 0.040197 }, { "epoch": 1.6651665166516652, "grad_norm": 2.223714828491211, "learning_rate": 8.706380221525959e-06, "loss": 0.3878568172454834, "memory(GiB)": 74.54, "step": 1850, "token_acc": 0.8518351722585004, "train_speed(iter/s)": 0.040198 }, { "epoch": 1.6741674167416742, "grad_norm": 2.1293275356292725, "learning_rate": 8.610086634932195e-06, "loss": 0.3860627174377441, "memory(GiB)": 74.54, "step": 1860, "token_acc": 0.8636664460622104, "train_speed(iter/s)": 0.0402 }, { "epoch": 1.6831683168316833, "grad_norm": 2.2796740531921387, "learning_rate": 8.513924290280955e-06, "loss": 0.4010897636413574, "memory(GiB)": 74.54, "step": 1870, "token_acc": 0.8624, "train_speed(iter/s)": 0.040198 }, { "epoch": 1.6921692169216922, "grad_norm": 2.063302516937256, "learning_rate": 8.417902267658264e-06, "loss": 0.3978671312332153, "memory(GiB)": 74.54, "step": 1880, "token_acc": 0.8563941299790356, "train_speed(iter/s)": 0.040199 }, { "epoch": 1.701170117011701, "grad_norm": 2.589029550552368, "learning_rate": 8.322029633900293e-06, "loss": 0.4007380485534668, "memory(GiB)": 74.54, "step": 1890, "token_acc": 0.8558875219683656, "train_speed(iter/s)": 0.040201 }, { "epoch": 1.7101710171017102, "grad_norm": 2.1972382068634033, "learning_rate": 8.226315441737232e-06, "loss": 0.39293272495269777, "memory(GiB)": 74.54, "step": 1900, "token_acc": 0.8606382978723405, "train_speed(iter/s)": 0.040201 }, { "epoch": 1.7191719171917192, "grad_norm": 2.1070621013641357, "learning_rate": 8.130768728938503e-06, "loss": 0.4030153274536133, "memory(GiB)": 74.54, "step": 1910, "token_acc": 0.858612883309323, "train_speed(iter/s)": 0.040199 }, { "epoch": 1.7281728172817283, "grad_norm": 2.4515891075134277, "learning_rate": 8.035398517459367e-06, "loss": 0.3846758842468262, "memory(GiB)": 74.54, "step": 1920, "token_acc": 0.8604975587072774, "train_speed(iter/s)": 0.040203 }, { "epoch": 1.7371737173717372, "grad_norm": 2.4625024795532227, "learning_rate": 7.940213812589018e-06, "loss": 0.3977564096450806, "memory(GiB)": 74.54, "step": 1930, "token_acc": 0.8620689655172413, "train_speed(iter/s)": 0.040207 }, { "epoch": 1.746174617461746, "grad_norm": 2.358564853668213, "learning_rate": 7.84522360210028e-06, "loss": 0.3818389415740967, "memory(GiB)": 74.54, "step": 1940, "token_acc": 0.8622779519331244, "train_speed(iter/s)": 0.040208 }, { "epoch": 1.7551755175517552, "grad_norm": 2.43326473236084, "learning_rate": 7.750436855400924e-06, "loss": 0.40569381713867186, "memory(GiB)": 74.54, "step": 1950, "token_acc": 0.8431502316346791, "train_speed(iter/s)": 0.040209 }, { "epoch": 1.7641764176417642, "grad_norm": 2.141272783279419, "learning_rate": 7.655862522686759e-06, "loss": 0.4061896324157715, "memory(GiB)": 74.54, "step": 1960, "token_acc": 0.8561802484733628, "train_speed(iter/s)": 0.040213 }, { "epoch": 1.7731773177317733, "grad_norm": 2.1799638271331787, "learning_rate": 7.561509534096486e-06, "loss": 0.3843768835067749, "memory(GiB)": 74.54, "step": 1970, "token_acc": 0.8601476840456478, "train_speed(iter/s)": 0.040213 }, { "epoch": 1.7821782178217822, "grad_norm": 2.2130813598632812, "learning_rate": 7.467386798868492e-06, "loss": 0.383782172203064, "memory(GiB)": 74.54, "step": 1980, "token_acc": 0.8536738538831903, "train_speed(iter/s)": 0.040213 }, { "epoch": 1.791179117911791, "grad_norm": 2.2999327182769775, "learning_rate": 7.373503204499589e-06, "loss": 0.3898015975952148, "memory(GiB)": 74.54, "step": 1990, "token_acc": 0.8597833014659019, "train_speed(iter/s)": 0.040213 }, { "epoch": 1.8001800180018002, "grad_norm": 2.0685296058654785, "learning_rate": 7.279867615905836e-06, "loss": 0.39383411407470703, "memory(GiB)": 74.54, "step": 2000, "token_acc": 0.8522530329289428, "train_speed(iter/s)": 0.040217 }, { "epoch": 1.8001800180018002, "eval_loss": 0.40739279985427856, "eval_runtime": 113.0562, "eval_samples_per_second": 12.693, "eval_steps_per_second": 0.398, "eval_token_acc": 0.8513244228432564, "step": 2000 }, { "epoch": 1.8091809180918093, "grad_norm": 2.3695876598358154, "learning_rate": 7.186488874585441e-06, "loss": 0.38712072372436523, "memory(GiB)": 76.18, "step": 2010, "token_acc": 0.8560460652591171, "train_speed(iter/s)": 0.040111 }, { "epoch": 1.8181818181818183, "grad_norm": 2.2949750423431396, "learning_rate": 7.093375797783935e-06, "loss": 0.38932750225067136, "memory(GiB)": 76.18, "step": 2020, "token_acc": 0.8515789473684211, "train_speed(iter/s)": 0.040113 }, { "epoch": 1.8271827182718272, "grad_norm": 2.102889060974121, "learning_rate": 7.0005371776615884e-06, "loss": 0.3895460844039917, "memory(GiB)": 76.18, "step": 2030, "token_acc": 0.8582169709989259, "train_speed(iter/s)": 0.040117 }, { "epoch": 1.836183618361836, "grad_norm": 2.2533607482910156, "learning_rate": 6.907981780463233e-06, "loss": 0.3849326133728027, "memory(GiB)": 76.18, "step": 2040, "token_acc": 0.8707364762111667, "train_speed(iter/s)": 0.040118 }, { "epoch": 1.8451845184518452, "grad_norm": 2.058211326599121, "learning_rate": 6.815718345690496e-06, "loss": 0.38345019817352294, "memory(GiB)": 76.18, "step": 2050, "token_acc": 0.85548358275631, "train_speed(iter/s)": 0.040122 }, { "epoch": 1.8541854185418543, "grad_norm": 2.466780424118042, "learning_rate": 6.72375558527659e-06, "loss": 0.38396077156066893, "memory(GiB)": 76.18, "step": 2060, "token_acc": 0.8563974591651543, "train_speed(iter/s)": 0.040122 }, { "epoch": 1.8631863186318633, "grad_norm": 2.325998544692993, "learning_rate": 6.632102182763681e-06, "loss": 0.3884021759033203, "memory(GiB)": 76.18, "step": 2070, "token_acc": 0.8589527027027027, "train_speed(iter/s)": 0.040123 }, { "epoch": 1.8721872187218722, "grad_norm": 2.3079795837402344, "learning_rate": 6.540766792482962e-06, "loss": 0.4022721290588379, "memory(GiB)": 76.18, "step": 2080, "token_acc": 0.8444188722669735, "train_speed(iter/s)": 0.040126 }, { "epoch": 1.881188118811881, "grad_norm": 2.305443525314331, "learning_rate": 6.449758038737458e-06, "loss": 0.3774123668670654, "memory(GiB)": 76.18, "step": 2090, "token_acc": 0.859161246916349, "train_speed(iter/s)": 0.040128 }, { "epoch": 1.8901890189018902, "grad_norm": 2.306131362915039, "learning_rate": 6.359084514987688e-06, "loss": 0.38950314521789553, "memory(GiB)": 76.18, "step": 2100, "token_acc": 0.8646680942184154, "train_speed(iter/s)": 0.040128 }, { "epoch": 1.8991899189918993, "grad_norm": 2.5018227100372314, "learning_rate": 6.268754783040228e-06, "loss": 0.3790890693664551, "memory(GiB)": 76.18, "step": 2110, "token_acc": 0.8660165359338563, "train_speed(iter/s)": 0.040128 }, { "epoch": 1.9081908190819084, "grad_norm": 2.1461129188537598, "learning_rate": 6.17877737223928e-06, "loss": 0.37567844390869143, "memory(GiB)": 76.18, "step": 2120, "token_acc": 0.8673469387755102, "train_speed(iter/s)": 0.040129 }, { "epoch": 1.9171917191719172, "grad_norm": 2.1912460327148438, "learning_rate": 6.089160778661262e-06, "loss": 0.37552733421325685, "memory(GiB)": 76.18, "step": 2130, "token_acc": 0.8715083798882681, "train_speed(iter/s)": 0.040128 }, { "epoch": 1.926192619261926, "grad_norm": 2.2097115516662598, "learning_rate": 5.999913464312606e-06, "loss": 0.37886598110198977, "memory(GiB)": 76.18, "step": 2140, "token_acc": 0.8663426488456865, "train_speed(iter/s)": 0.040129 }, { "epoch": 1.9351935193519352, "grad_norm": 2.239027976989746, "learning_rate": 5.911043856330701e-06, "loss": 0.4021574020385742, "memory(GiB)": 76.18, "step": 2150, "token_acc": 0.8618796662274923, "train_speed(iter/s)": 0.040132 }, { "epoch": 1.9441944194419443, "grad_norm": 2.1112523078918457, "learning_rate": 5.822560346188204e-06, "loss": 0.3870594024658203, "memory(GiB)": 76.18, "step": 2160, "token_acc": 0.8622662266226623, "train_speed(iter/s)": 0.040134 }, { "epoch": 1.9531953195319534, "grad_norm": 2.1353354454040527, "learning_rate": 5.7344712889006424e-06, "loss": 0.38895013332366946, "memory(GiB)": 76.18, "step": 2170, "token_acc": 0.8509840674789129, "train_speed(iter/s)": 0.040134 }, { "epoch": 1.9621962196219622, "grad_norm": 2.064527988433838, "learning_rate": 5.646785002237509e-06, "loss": 0.3719027519226074, "memory(GiB)": 76.18, "step": 2180, "token_acc": 0.8651858368154828, "train_speed(iter/s)": 0.040134 }, { "epoch": 1.971197119711971, "grad_norm": 2.2494568824768066, "learning_rate": 5.5595097659368765e-06, "loss": 0.37720603942871095, "memory(GiB)": 76.18, "step": 2190, "token_acc": 0.8660617844026788, "train_speed(iter/s)": 0.040134 }, { "epoch": 1.9801980198019802, "grad_norm": 2.422858715057373, "learning_rate": 5.472653820923564e-06, "loss": 0.3978924036026001, "memory(GiB)": 76.18, "step": 2200, "token_acc": 0.8567662565905096, "train_speed(iter/s)": 0.040138 }, { "epoch": 1.9891989198919893, "grad_norm": 2.5676939487457275, "learning_rate": 5.386225368530995e-06, "loss": 0.39810938835144044, "memory(GiB)": 76.18, "step": 2210, "token_acc": 0.8570179274158286, "train_speed(iter/s)": 0.04014 }, { "epoch": 1.9981998199819984, "grad_norm": 2.2991700172424316, "learning_rate": 5.300232569726805e-06, "loss": 0.3851327657699585, "memory(GiB)": 76.18, "step": 2220, "token_acc": 0.8624459120929173, "train_speed(iter/s)": 0.040141 }, { "epoch": 2.007200720072007, "grad_norm": 2.1788246631622314, "learning_rate": 5.2146835443422215e-06, "loss": 0.3738105773925781, "memory(GiB)": 76.18, "step": 2230, "token_acc": 0.8664259927797834, "train_speed(iter/s)": 0.04015 }, { "epoch": 2.016201620162016, "grad_norm": 2.2583391666412354, "learning_rate": 5.129586370305389e-06, "loss": 0.37696280479431155, "memory(GiB)": 76.18, "step": 2240, "token_acc": 0.8627628306579245, "train_speed(iter/s)": 0.040149 }, { "epoch": 2.025202520252025, "grad_norm": 2.3937697410583496, "learning_rate": 5.0449490828785745e-06, "loss": 0.35777480602264405, "memory(GiB)": 76.18, "step": 2250, "token_acc": 0.8723312486521457, "train_speed(iter/s)": 0.040148 }, { "epoch": 2.0342034203420343, "grad_norm": 2.3122761249542236, "learning_rate": 4.960779673899465e-06, "loss": 0.3647487163543701, "memory(GiB)": 76.18, "step": 2260, "token_acc": 0.8682050144220103, "train_speed(iter/s)": 0.04015 }, { "epoch": 2.0432043204320434, "grad_norm": 2.3489394187927246, "learning_rate": 4.8770860910265315e-06, "loss": 0.3610623836517334, "memory(GiB)": 76.18, "step": 2270, "token_acc": 0.8642826367944851, "train_speed(iter/s)": 0.040151 }, { "epoch": 2.052205220522052, "grad_norm": 2.564075469970703, "learning_rate": 4.793876236988593e-06, "loss": 0.3656606674194336, "memory(GiB)": 76.18, "step": 2280, "token_acc": 0.8674548848786559, "train_speed(iter/s)": 0.040152 }, { "epoch": 2.061206120612061, "grad_norm": 2.3542511463165283, "learning_rate": 4.711157968838577e-06, "loss": 0.38109097480773924, "memory(GiB)": 76.18, "step": 2290, "token_acc": 0.8542568542568543, "train_speed(iter/s)": 0.040154 }, { "epoch": 2.07020702070207, "grad_norm": 2.5607492923736572, "learning_rate": 4.628939097211641e-06, "loss": 0.3731189966201782, "memory(GiB)": 76.18, "step": 2300, "token_acc": 0.8808107512667989, "train_speed(iter/s)": 0.040155 }, { "epoch": 2.0792079207920793, "grad_norm": 2.4762189388275146, "learning_rate": 4.547227385587648e-06, "loss": 0.3798922300338745, "memory(GiB)": 76.18, "step": 2310, "token_acc": 0.8597145993413831, "train_speed(iter/s)": 0.040157 }, { "epoch": 2.0882088208820884, "grad_norm": 2.485635280609131, "learning_rate": 4.466030549558116e-06, "loss": 0.3755971670150757, "memory(GiB)": 76.18, "step": 2320, "token_acc": 0.8549968704360525, "train_speed(iter/s)": 0.040157 }, { "epoch": 2.097209720972097, "grad_norm": 2.2108871936798096, "learning_rate": 4.385356256097656e-06, "loss": 0.35892772674560547, "memory(GiB)": 76.18, "step": 2330, "token_acc": 0.8641063515509602, "train_speed(iter/s)": 0.040157 }, { "epoch": 2.106210621062106, "grad_norm": 2.559431791305542, "learning_rate": 4.305212122840038e-06, "loss": 0.36676650047302245, "memory(GiB)": 76.18, "step": 2340, "token_acc": 0.8685561258647624, "train_speed(iter/s)": 0.040159 }, { "epoch": 2.115211521152115, "grad_norm": 2.3263328075408936, "learning_rate": 4.22560571735889e-06, "loss": 0.3723811149597168, "memory(GiB)": 76.18, "step": 2350, "token_acc": 0.8562313908974905, "train_speed(iter/s)": 0.04016 }, { "epoch": 2.1242124212421243, "grad_norm": 2.4957282543182373, "learning_rate": 4.146544556453146e-06, "loss": 0.3725306987762451, "memory(GiB)": 76.18, "step": 2360, "token_acc": 0.8700726712177934, "train_speed(iter/s)": 0.040162 }, { "epoch": 2.1332133213321334, "grad_norm": 2.5752525329589844, "learning_rate": 4.068036105437259e-06, "loss": 0.3709956884384155, "memory(GiB)": 76.18, "step": 2370, "token_acc": 0.8635585970915313, "train_speed(iter/s)": 0.040163 }, { "epoch": 2.142214221422142, "grad_norm": 2.509699583053589, "learning_rate": 3.990087777436303e-06, "loss": 0.37915217876434326, "memory(GiB)": 76.18, "step": 2380, "token_acc": 0.8585365853658536, "train_speed(iter/s)": 0.040161 }, { "epoch": 2.151215121512151, "grad_norm": 2.5639617443084717, "learning_rate": 3.9127069326859815e-06, "loss": 0.36791577339172366, "memory(GiB)": 76.18, "step": 2390, "token_acc": 0.8695652173913043, "train_speed(iter/s)": 0.040161 }, { "epoch": 2.16021602160216, "grad_norm": 2.5950934886932373, "learning_rate": 3.835900877837665e-06, "loss": 0.37401318550109863, "memory(GiB)": 76.18, "step": 2400, "token_acc": 0.8627917026793431, "train_speed(iter/s)": 0.04016 }, { "epoch": 2.1692169216921693, "grad_norm": 2.627086639404297, "learning_rate": 3.7596768652684324e-06, "loss": 0.37379937171936034, "memory(GiB)": 76.18, "step": 2410, "token_acc": 0.8596715717637022, "train_speed(iter/s)": 0.040162 }, { "epoch": 2.1782178217821784, "grad_norm": 3.0903186798095703, "learning_rate": 3.6840420923962873e-06, "loss": 0.36346681118011476, "memory(GiB)": 76.18, "step": 2420, "token_acc": 0.8670668953687821, "train_speed(iter/s)": 0.040164 }, { "epoch": 2.187218721872187, "grad_norm": 2.4955599308013916, "learning_rate": 3.609003701000535e-06, "loss": 0.35879087448120117, "memory(GiB)": 76.18, "step": 2430, "token_acc": 0.8731778425655977, "train_speed(iter/s)": 0.040165 }, { "epoch": 2.196219621962196, "grad_norm": 2.3009448051452637, "learning_rate": 3.5345687765474444e-06, "loss": 0.37301011085510255, "memory(GiB)": 76.18, "step": 2440, "token_acc": 0.8637790332705587, "train_speed(iter/s)": 0.040167 }, { "epoch": 2.205220522052205, "grad_norm": 2.5973548889160156, "learning_rate": 3.4607443475211745e-06, "loss": 0.37910096645355223, "memory(GiB)": 76.18, "step": 2450, "token_acc": 0.862, "train_speed(iter/s)": 0.040169 }, { "epoch": 2.2142214221422143, "grad_norm": 2.7337653636932373, "learning_rate": 3.3875373847601365e-06, "loss": 0.36832966804504397, "memory(GiB)": 76.18, "step": 2460, "token_acc": 0.8709608843537415, "train_speed(iter/s)": 0.040171 }, { "epoch": 2.2232223222322234, "grad_norm": 2.4979779720306396, "learning_rate": 3.314954800798763e-06, "loss": 0.35463604927062986, "memory(GiB)": 76.18, "step": 2470, "token_acc": 0.8807906114885732, "train_speed(iter/s)": 0.040173 }, { "epoch": 2.232223222322232, "grad_norm": 2.651418685913086, "learning_rate": 3.24300344921481e-06, "loss": 0.3576260805130005, "memory(GiB)": 76.18, "step": 2480, "token_acc": 0.8673512154233026, "train_speed(iter/s)": 0.040173 }, { "epoch": 2.241224122412241, "grad_norm": 2.2821831703186035, "learning_rate": 3.1716901239821918e-06, "loss": 0.3680659294128418, "memory(GiB)": 76.18, "step": 2490, "token_acc": 0.8615550755939525, "train_speed(iter/s)": 0.040176 }, { "epoch": 2.25022502250225, "grad_norm": 2.532939910888672, "learning_rate": 3.1010215588294724e-06, "loss": 0.3763418674468994, "memory(GiB)": 76.18, "step": 2500, "token_acc": 0.8679738562091504, "train_speed(iter/s)": 0.040176 }, { "epoch": 2.25022502250225, "eval_loss": 0.39449381828308105, "eval_runtime": 112.8212, "eval_samples_per_second": 12.719, "eval_steps_per_second": 0.399, "eval_token_acc": 0.8566221142162819, "step": 2500 }, { "epoch": 2.2592259225922593, "grad_norm": 2.495901584625244, "learning_rate": 3.031004426604044e-06, "loss": 0.3614701271057129, "memory(GiB)": 76.18, "step": 2510, "token_acc": 0.8576721210250077, "train_speed(iter/s)": 0.040102 }, { "epoch": 2.2682268226822684, "grad_norm": 2.6652517318725586, "learning_rate": 2.961645338642032e-06, "loss": 0.3705326557159424, "memory(GiB)": 76.18, "step": 2520, "token_acc": 0.8555579261787924, "train_speed(iter/s)": 0.040101 }, { "epoch": 2.2772277227722775, "grad_norm": 2.2919044494628906, "learning_rate": 2.892950844144028e-06, "loss": 0.3567212581634521, "memory(GiB)": 76.18, "step": 2530, "token_acc": 0.8672348060103162, "train_speed(iter/s)": 0.0401 }, { "epoch": 2.286228622862286, "grad_norm": 2.7642829418182373, "learning_rate": 2.8249274295566863e-06, "loss": 0.3735655784606934, "memory(GiB)": 76.18, "step": 2540, "token_acc": 0.8645260611392127, "train_speed(iter/s)": 0.040102 }, { "epoch": 2.295229522952295, "grad_norm": 2.2890052795410156, "learning_rate": 2.7575815179602527e-06, "loss": 0.36810617446899413, "memory(GiB)": 76.18, "step": 2550, "token_acc": 0.8708510638297873, "train_speed(iter/s)": 0.040105 }, { "epoch": 2.3042304230423043, "grad_norm": 2.5169107913970947, "learning_rate": 2.6909194684620453e-06, "loss": 0.3683924674987793, "memory(GiB)": 76.18, "step": 2560, "token_acc": 0.8675250982103885, "train_speed(iter/s)": 0.040108 }, { "epoch": 2.3132313231323134, "grad_norm": 2.696864128112793, "learning_rate": 2.6249475755960185e-06, "loss": 0.3705678701400757, "memory(GiB)": 76.18, "step": 2570, "token_acc": 0.8628597122302158, "train_speed(iter/s)": 0.040109 }, { "epoch": 2.322232223222322, "grad_norm": 2.4484846591949463, "learning_rate": 2.559672068728398e-06, "loss": 0.36278524398803713, "memory(GiB)": 76.18, "step": 2580, "token_acc": 0.8645696810834426, "train_speed(iter/s)": 0.04011 }, { "epoch": 2.331233123312331, "grad_norm": 2.4576802253723145, "learning_rate": 2.4950991114694755e-06, "loss": 0.3606465578079224, "memory(GiB)": 76.18, "step": 2590, "token_acc": 0.8734927015020097, "train_speed(iter/s)": 0.040113 }, { "epoch": 2.34023402340234, "grad_norm": 2.6191623210906982, "learning_rate": 2.4312348010916088e-06, "loss": 0.36288201808929443, "memory(GiB)": 76.18, "step": 2600, "token_acc": 0.8631202691337259, "train_speed(iter/s)": 0.040113 }, { "epoch": 2.3492349234923493, "grad_norm": 2.6887686252593994, "learning_rate": 2.3680851679535024e-06, "loss": 0.3752190589904785, "memory(GiB)": 76.18, "step": 2610, "token_acc": 0.8617521367521368, "train_speed(iter/s)": 0.040114 }, { "epoch": 2.3582358235823584, "grad_norm": 2.481362819671631, "learning_rate": 2.305656174930776e-06, "loss": 0.36593198776245117, "memory(GiB)": 76.18, "step": 2620, "token_acc": 0.8668838219326819, "train_speed(iter/s)": 0.040116 }, { "epoch": 2.3672367236723675, "grad_norm": 2.629666328430176, "learning_rate": 2.243953716852938e-06, "loss": 0.3610795021057129, "memory(GiB)": 76.18, "step": 2630, "token_acc": 0.8612348822406111, "train_speed(iter/s)": 0.040117 }, { "epoch": 2.376237623762376, "grad_norm": 2.433375597000122, "learning_rate": 2.1829836199467568e-06, "loss": 0.3648895263671875, "memory(GiB)": 76.18, "step": 2640, "token_acc": 0.8715654952076677, "train_speed(iter/s)": 0.040119 }, { "epoch": 2.385238523852385, "grad_norm": 2.5231969356536865, "learning_rate": 2.1227516412861303e-06, "loss": 0.34891419410705565, "memory(GiB)": 76.18, "step": 2650, "token_acc": 0.8747478822105688, "train_speed(iter/s)": 0.040119 }, { "epoch": 2.3942394239423943, "grad_norm": 2.6941776275634766, "learning_rate": 2.063263468248472e-06, "loss": 0.35621964931488037, "memory(GiB)": 76.18, "step": 2660, "token_acc": 0.8614357262103506, "train_speed(iter/s)": 0.040119 }, { "epoch": 2.4032403240324034, "grad_norm": 2.4811367988586426, "learning_rate": 2.0045247179776927e-06, "loss": 0.36508636474609374, "memory(GiB)": 76.18, "step": 2670, "token_acc": 0.865956984575277, "train_speed(iter/s)": 0.040122 }, { "epoch": 2.412241224122412, "grad_norm": 2.5584983825683594, "learning_rate": 1.946540936853787e-06, "loss": 0.36142873764038086, "memory(GiB)": 76.18, "step": 2680, "token_acc": 0.8618881118881119, "train_speed(iter/s)": 0.040122 }, { "epoch": 2.421242124212421, "grad_norm": 2.639416217803955, "learning_rate": 1.8893175999691315e-06, "loss": 0.3669375658035278, "memory(GiB)": 76.18, "step": 2690, "token_acc": 0.8706407137064072, "train_speed(iter/s)": 0.040123 }, { "epoch": 2.43024302430243, "grad_norm": 2.526108980178833, "learning_rate": 1.8328601106114974e-06, "loss": 0.36782519817352294, "memory(GiB)": 76.18, "step": 2700, "token_acc": 0.8681867535287731, "train_speed(iter/s)": 0.040125 }, { "epoch": 2.4392439243924393, "grad_norm": 2.4853765964508057, "learning_rate": 1.7771737997538551e-06, "loss": 0.3661306858062744, "memory(GiB)": 76.18, "step": 2710, "token_acc": 0.8591703056768559, "train_speed(iter/s)": 0.040126 }, { "epoch": 2.4482448244824484, "grad_norm": 2.546694040298462, "learning_rate": 1.7222639255509855e-06, "loss": 0.3565016269683838, "memory(GiB)": 76.18, "step": 2720, "token_acc": 0.8700276536907041, "train_speed(iter/s)": 0.040126 }, { "epoch": 2.4572457245724575, "grad_norm": 2.6145668029785156, "learning_rate": 1.6681356728429909e-06, "loss": 0.3617668628692627, "memory(GiB)": 76.18, "step": 2730, "token_acc": 0.8759859772129711, "train_speed(iter/s)": 0.040127 }, { "epoch": 2.466246624662466, "grad_norm": 2.4962821006774902, "learning_rate": 1.6147941526657151e-06, "loss": 0.36135101318359375, "memory(GiB)": 76.18, "step": 2740, "token_acc": 0.8689489751417357, "train_speed(iter/s)": 0.040127 }, { "epoch": 2.4752475247524752, "grad_norm": 2.476327896118164, "learning_rate": 1.5622444017681438e-06, "loss": 0.3584137916564941, "memory(GiB)": 76.18, "step": 2750, "token_acc": 0.8637279033340792, "train_speed(iter/s)": 0.040128 }, { "epoch": 2.4842484248424843, "grad_norm": 2.5135715007781982, "learning_rate": 1.5104913821367995e-06, "loss": 0.352571439743042, "memory(GiB)": 76.18, "step": 2760, "token_acc": 0.8638624119353502, "train_speed(iter/s)": 0.040127 }, { "epoch": 2.4932493249324934, "grad_norm": 2.535942316055298, "learning_rate": 1.4595399805272138e-06, "loss": 0.35703449249267577, "memory(GiB)": 76.18, "step": 2770, "token_acc": 0.8715143715143715, "train_speed(iter/s)": 0.040129 }, { "epoch": 2.502250225022502, "grad_norm": 2.5901577472686768, "learning_rate": 1.409395008002501e-06, "loss": 0.3632636070251465, "memory(GiB)": 76.18, "step": 2780, "token_acc": 0.8740141137401412, "train_speed(iter/s)": 0.040131 }, { "epoch": 2.511251125112511, "grad_norm": 2.4865550994873047, "learning_rate": 1.3600611994790737e-06, "loss": 0.36820478439331056, "memory(GiB)": 76.18, "step": 2790, "token_acc": 0.8674225904928042, "train_speed(iter/s)": 0.040131 }, { "epoch": 2.5202520252025202, "grad_norm": 2.745784044265747, "learning_rate": 1.311543213279548e-06, "loss": 0.36357576847076417, "memory(GiB)": 76.18, "step": 2800, "token_acc": 0.8688079619995476, "train_speed(iter/s)": 0.040134 }, { "epoch": 2.5292529252925293, "grad_norm": 2.613213300704956, "learning_rate": 1.2638456306928838e-06, "loss": 0.35836281776428225, "memory(GiB)": 76.18, "step": 2810, "token_acc": 0.8775203775203775, "train_speed(iter/s)": 0.040135 }, { "epoch": 2.5382538253825384, "grad_norm": 2.856757879257202, "learning_rate": 1.2169729555418008e-06, "loss": 0.35776748657226565, "memory(GiB)": 76.18, "step": 2820, "token_acc": 0.8681778169014085, "train_speed(iter/s)": 0.040136 }, { "epoch": 2.5472547254725475, "grad_norm": 2.5222392082214355, "learning_rate": 1.1709296137575088e-06, "loss": 0.357517409324646, "memory(GiB)": 76.18, "step": 2830, "token_acc": 0.8692437684833122, "train_speed(iter/s)": 0.040138 }, { "epoch": 2.556255625562556, "grad_norm": 2.6644461154937744, "learning_rate": 1.1257199529617846e-06, "loss": 0.3525848388671875, "memory(GiB)": 76.18, "step": 2840, "token_acc": 0.8726828274597678, "train_speed(iter/s)": 0.04014 }, { "epoch": 2.5652565256525652, "grad_norm": 3.0361390113830566, "learning_rate": 1.0813482420564569e-06, "loss": 0.36429810523986816, "memory(GiB)": 76.18, "step": 2850, "token_acc": 0.8605402909258831, "train_speed(iter/s)": 0.040142 }, { "epoch": 2.5742574257425743, "grad_norm": 2.2939305305480957, "learning_rate": 1.0378186708203097e-06, "loss": 0.3595736026763916, "memory(GiB)": 76.18, "step": 2860, "token_acc": 0.8699784017278618, "train_speed(iter/s)": 0.040145 }, { "epoch": 2.5832583258325834, "grad_norm": 2.8929970264434814, "learning_rate": 9.951353495134741e-07, "loss": 0.3722720146179199, "memory(GiB)": 76.18, "step": 2870, "token_acc": 0.8633415343323642, "train_speed(iter/s)": 0.040147 }, { "epoch": 2.592259225922592, "grad_norm": 2.766711711883545, "learning_rate": 9.533023084893112e-07, "loss": 0.3628982067108154, "memory(GiB)": 76.18, "step": 2880, "token_acc": 0.8731262220291115, "train_speed(iter/s)": 0.040148 }, { "epoch": 2.601260126012601, "grad_norm": 2.6322643756866455, "learning_rate": 9.123234978138485e-07, "loss": 0.3563962459564209, "memory(GiB)": 76.18, "step": 2890, "token_acc": 0.8709398007795582, "train_speed(iter/s)": 0.040149 }, { "epoch": 2.6102610261026102, "grad_norm": 2.3969507217407227, "learning_rate": 8.722027868927973e-07, "loss": 0.3593640089035034, "memory(GiB)": 76.18, "step": 2900, "token_acc": 0.8687513763488218, "train_speed(iter/s)": 0.040149 }, { "epoch": 2.6192619261926193, "grad_norm": 2.662048101425171, "learning_rate": 8.32943964106192e-07, "loss": 0.36847290992736814, "memory(GiB)": 76.18, "step": 2910, "token_acc": 0.8610752688172043, "train_speed(iter/s)": 0.040152 }, { "epoch": 2.6282628262826284, "grad_norm": 2.6064634323120117, "learning_rate": 7.945507364506632e-07, "loss": 0.3641893625259399, "memory(GiB)": 76.18, "step": 2920, "token_acc": 0.8610666056305791, "train_speed(iter/s)": 0.040154 }, { "epoch": 2.6372637263726375, "grad_norm": 2.4192819595336914, "learning_rate": 7.57026729189414e-07, "loss": 0.3702700138092041, "memory(GiB)": 76.18, "step": 2930, "token_acc": 0.8613074204946997, "train_speed(iter/s)": 0.040157 }, { "epoch": 2.646264626462646, "grad_norm": 2.3483784198760986, "learning_rate": 7.203754855099009e-07, "loss": 0.36264016628265383, "memory(GiB)": 76.18, "step": 2940, "token_acc": 0.8588575238941987, "train_speed(iter/s)": 0.04016 }, { "epoch": 2.6552655265526552, "grad_norm": 2.5846633911132812, "learning_rate": 6.846004661892813e-07, "loss": 0.37308740615844727, "memory(GiB)": 76.18, "step": 2950, "token_acc": 0.8615806304248516, "train_speed(iter/s)": 0.040161 }, { "epoch": 2.6642664266426643, "grad_norm": 2.6962997913360596, "learning_rate": 6.497050492676126e-07, "loss": 0.36321473121643066, "memory(GiB)": 76.18, "step": 2960, "token_acc": 0.8618261826182618, "train_speed(iter/s)": 0.040163 }, { "epoch": 2.6732673267326734, "grad_norm": 2.416895627975464, "learning_rate": 6.156925297288996e-07, "loss": 0.34958364963531496, "memory(GiB)": 76.18, "step": 2970, "token_acc": 0.8714713430282293, "train_speed(iter/s)": 0.040164 }, { "epoch": 2.682268226822682, "grad_norm": 2.3380393981933594, "learning_rate": 5.825661191899534e-07, "loss": 0.36399097442626954, "memory(GiB)": 76.18, "step": 2980, "token_acc": 0.8697334479793637, "train_speed(iter/s)": 0.040165 }, { "epoch": 2.691269126912691, "grad_norm": 2.4997997283935547, "learning_rate": 5.503289455971495e-07, "loss": 0.3497540235519409, "memory(GiB)": 76.18, "step": 2990, "token_acc": 0.8589799476896252, "train_speed(iter/s)": 0.040167 }, { "epoch": 2.7002700270027002, "grad_norm": 2.7024405002593994, "learning_rate": 5.18984052931063e-07, "loss": 0.36266303062438965, "memory(GiB)": 76.18, "step": 3000, "token_acc": 0.8634655532359081, "train_speed(iter/s)": 0.040168 }, { "epoch": 2.7002700270027002, "eval_loss": 0.3909822702407837, "eval_runtime": 113.741, "eval_samples_per_second": 12.616, "eval_steps_per_second": 0.396, "eval_token_acc": 0.8578371810449574, "step": 3000 }, { "epoch": 2.7092709270927093, "grad_norm": 2.7375988960266113, "learning_rate": 4.885344009190429e-07, "loss": 0.36505513191223143, "memory(GiB)": 76.18, "step": 3010, "token_acc": 0.8647040722125346, "train_speed(iter/s)": 0.040096 }, { "epoch": 2.7182718271827184, "grad_norm": 2.5784595012664795, "learning_rate": 4.5898286475574483e-07, "loss": 0.36314241886138915, "memory(GiB)": 76.18, "step": 3020, "token_acc": 0.8750795334040297, "train_speed(iter/s)": 0.040096 }, { "epoch": 2.7272727272727275, "grad_norm": 2.59897518157959, "learning_rate": 4.30332234831643e-07, "loss": 0.3617940664291382, "memory(GiB)": 76.18, "step": 3030, "token_acc": 0.8697020562316408, "train_speed(iter/s)": 0.040097 }, { "epoch": 2.736273627362736, "grad_norm": 2.331024646759033, "learning_rate": 4.025852164695432e-07, "loss": 0.35245676040649415, "memory(GiB)": 76.18, "step": 3040, "token_acc": 0.8609855820959759, "train_speed(iter/s)": 0.040098 }, { "epoch": 2.7452745274527453, "grad_norm": 2.9060468673706055, "learning_rate": 3.7574442966913816e-07, "loss": 0.37049217224121095, "memory(GiB)": 76.18, "step": 3050, "token_acc": 0.8594235033259423, "train_speed(iter/s)": 0.040099 }, { "epoch": 2.7542754275427543, "grad_norm": 2.7476565837860107, "learning_rate": 3.498124088596133e-07, "loss": 0.35335454940795896, "memory(GiB)": 76.18, "step": 3060, "token_acc": 0.8769035532994924, "train_speed(iter/s)": 0.040098 }, { "epoch": 2.7632763276327634, "grad_norm": 2.47446346282959, "learning_rate": 3.2479160266033595e-07, "loss": 0.3646056652069092, "memory(GiB)": 76.18, "step": 3070, "token_acc": 0.8609637488947833, "train_speed(iter/s)": 0.040099 }, { "epoch": 2.772277227722772, "grad_norm": 2.518899440765381, "learning_rate": 3.0068437364964563e-07, "loss": 0.36437718868255614, "memory(GiB)": 76.18, "step": 3080, "token_acc": 0.8751534997953336, "train_speed(iter/s)": 0.040101 }, { "epoch": 2.781278127812781, "grad_norm": 2.4832963943481445, "learning_rate": 2.774929981417662e-07, "loss": 0.36618633270263673, "memory(GiB)": 76.18, "step": 3090, "token_acc": 0.8648288128056915, "train_speed(iter/s)": 0.040101 }, { "epoch": 2.7902790279027903, "grad_norm": 2.6481244564056396, "learning_rate": 2.5521966597186976e-07, "loss": 0.3651879787445068, "memory(GiB)": 76.18, "step": 3100, "token_acc": 0.8597326082030364, "train_speed(iter/s)": 0.040102 }, { "epoch": 2.7992799279927993, "grad_norm": 2.6947715282440186, "learning_rate": 2.3386648028930093e-07, "loss": 0.35363340377807617, "memory(GiB)": 76.18, "step": 3110, "token_acc": 0.8761429758935994, "train_speed(iter/s)": 0.040104 }, { "epoch": 2.8082808280828084, "grad_norm": 2.7126548290252686, "learning_rate": 2.134354573589825e-07, "loss": 0.3739881753921509, "memory(GiB)": 76.18, "step": 3120, "token_acc": 0.8569641367806505, "train_speed(iter/s)": 0.040106 }, { "epoch": 2.8172817281728175, "grad_norm": 2.6334176063537598, "learning_rate": 1.939285263710411e-07, "loss": 0.37378754615783694, "memory(GiB)": 76.18, "step": 3130, "token_acc": 0.8621212121212121, "train_speed(iter/s)": 0.040109 }, { "epoch": 2.826282628262826, "grad_norm": 2.6771504878997803, "learning_rate": 1.7534752925863264e-07, "loss": 0.3727731227874756, "memory(GiB)": 76.18, "step": 3140, "token_acc": 0.8573262032085561, "train_speed(iter/s)": 0.040111 }, { "epoch": 2.8352835283528353, "grad_norm": 2.7885513305664062, "learning_rate": 1.5769422052403172e-07, "loss": 0.3634767770767212, "memory(GiB)": 76.18, "step": 3150, "token_acc": 0.8657498362802881, "train_speed(iter/s)": 0.040111 }, { "epoch": 2.8442844284428443, "grad_norm": 2.770448684692383, "learning_rate": 1.409702670729518e-07, "loss": 0.3641348123550415, "memory(GiB)": 76.18, "step": 3160, "token_acc": 0.8695652173913043, "train_speed(iter/s)": 0.040111 }, { "epoch": 2.8532853285328534, "grad_norm": 2.716731309890747, "learning_rate": 1.2517724805715115e-07, "loss": 0.36133828163146975, "memory(GiB)": 76.18, "step": 3170, "token_acc": 0.8693168837103039, "train_speed(iter/s)": 0.040112 }, { "epoch": 2.862286228622862, "grad_norm": 2.320976734161377, "learning_rate": 1.1031665472532871e-07, "loss": 0.3573209285736084, "memory(GiB)": 76.18, "step": 3180, "token_acc": 0.8647353517752123, "train_speed(iter/s)": 0.040115 }, { "epoch": 2.871287128712871, "grad_norm": 2.6834940910339355, "learning_rate": 9.638989028230572e-08, "loss": 0.3642300605773926, "memory(GiB)": 76.18, "step": 3190, "token_acc": 0.8666237113402062, "train_speed(iter/s)": 0.040116 }, { "epoch": 2.8802880288028803, "grad_norm": 2.8395378589630127, "learning_rate": 8.339826975653165e-08, "loss": 0.3668497562408447, "memory(GiB)": 76.18, "step": 3200, "token_acc": 0.8565969880872106, "train_speed(iter/s)": 0.040118 }, { "epoch": 2.8892889288928894, "grad_norm": 2.8500564098358154, "learning_rate": 7.134301987591686e-08, "loss": 0.35763015747070315, "memory(GiB)": 76.18, "step": 3210, "token_acc": 0.8680448647459864, "train_speed(iter/s)": 0.04012 }, { "epoch": 2.8982898289828984, "grad_norm": 2.391807794570923, "learning_rate": 6.022527895198971e-08, "loss": 0.3681647300720215, "memory(GiB)": 76.18, "step": 3220, "token_acc": 0.8623626989464246, "train_speed(iter/s)": 0.040122 }, { "epoch": 2.9072907290729075, "grad_norm": 2.870159149169922, "learning_rate": 5.004609677242478e-08, "loss": 0.3709531307220459, "memory(GiB)": 76.18, "step": 3230, "token_acc": 0.8634751773049646, "train_speed(iter/s)": 0.040123 }, { "epoch": 2.916291629162916, "grad_norm": 2.3860719203948975, "learning_rate": 4.0806434501907686e-08, "loss": 0.3573091745376587, "memory(GiB)": 76.18, "step": 3240, "token_acc": 0.8636980108499096, "train_speed(iter/s)": 0.040125 }, { "epoch": 2.9252925292529253, "grad_norm": 2.533841609954834, "learning_rate": 3.2507164591378817e-08, "loss": 0.35629446506500245, "memory(GiB)": 76.18, "step": 3250, "token_acc": 0.8767689962987154, "train_speed(iter/s)": 0.040126 }, { "epoch": 2.9342934293429344, "grad_norm": 2.7338736057281494, "learning_rate": 2.5149070695656974e-08, "loss": 0.36386995315551757, "memory(GiB)": 76.18, "step": 3260, "token_acc": 0.8695070265447246, "train_speed(iter/s)": 0.040129 }, { "epoch": 2.9432943294329434, "grad_norm": 2.5814294815063477, "learning_rate": 1.873284759943861e-08, "loss": 0.3609006881713867, "memory(GiB)": 76.18, "step": 3270, "token_acc": 0.8714535137494543, "train_speed(iter/s)": 0.040129 }, { "epoch": 2.952295229522952, "grad_norm": 2.6087794303894043, "learning_rate": 1.325910115169471e-08, "loss": 0.36290225982666013, "memory(GiB)": 76.18, "step": 3280, "token_acc": 0.8663007683863886, "train_speed(iter/s)": 0.04013 }, { "epoch": 2.961296129612961, "grad_norm": 2.4624176025390625, "learning_rate": 8.728348208466575e-09, "loss": 0.36122841835021974, "memory(GiB)": 76.18, "step": 3290, "token_acc": 0.8651804670912951, "train_speed(iter/s)": 0.040133 }, { "epoch": 2.9702970297029703, "grad_norm": 2.5794880390167236, "learning_rate": 5.1410165840548586e-09, "loss": 0.35005528926849366, "memory(GiB)": 76.18, "step": 3300, "token_acc": 0.873643074250977, "train_speed(iter/s)": 0.040135 }, { "epoch": 2.9792979297929794, "grad_norm": 2.730228900909424, "learning_rate": 2.4974450106318715e-09, "loss": 0.3484092473983765, "memory(GiB)": 76.18, "step": 3310, "token_acc": 0.8741692512184316, "train_speed(iter/s)": 0.040137 }, { "epoch": 2.9882988298829884, "grad_norm": 2.4973952770233154, "learning_rate": 7.978831062493975e-10, "loss": 0.360276198387146, "memory(GiB)": 76.18, "step": 3320, "token_acc": 0.8717892425905598, "train_speed(iter/s)": 0.040139 }, { "epoch": 2.9972997299729975, "grad_norm": 2.646111488342285, "learning_rate": 4.249135127420978e-11, "loss": 0.34623007774353026, "memory(GiB)": 76.18, "step": 3330, "token_acc": 0.8752749670039596, "train_speed(iter/s)": 0.040141 }, { "epoch": 3.0, "eval_loss": 0.3906257748603821, "eval_runtime": 111.4189, "eval_samples_per_second": 12.879, "eval_steps_per_second": 0.404, "eval_token_acc": 0.8578371810449574, "step": 3333 } ], "logging_steps": 10, "max_steps": 3333, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 1000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.3350909053056844e+19, "train_batch_size": 32, "trial_name": null, "trial_params": null }