| { | |
| "best_global_step": 3333, | |
| "best_metric": 0.39062577, | |
| "best_model_checkpoint": "/global/D1/homes/sushant/Kvasir-VQA-x1/output_vqa_x1/v0-20250521-005603/checkpoint-3333", | |
| "epoch": 3.0, | |
| "eval_steps": 500, | |
| "global_step": 3333, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.0009000900090009, | |
| "grad_norm": 7.174169063568115, | |
| "learning_rate": 2.0000000000000002e-07, | |
| "loss": 3.304050922393799, | |
| "memory(GiB)": 66.97, | |
| "step": 1, | |
| "token_acc": 0.4874715261958998, | |
| "train_speed(iter/s)": 0.019902 | |
| }, | |
| { | |
| "epoch": 0.009000900090009001, | |
| "grad_norm": 6.479684829711914, | |
| "learning_rate": 2.0000000000000003e-06, | |
| "loss": 3.1309598286946616, | |
| "memory(GiB)": 67.86, | |
| "step": 10, | |
| "token_acc": 0.4754664823773324, | |
| "train_speed(iter/s)": 0.036367 | |
| }, | |
| { | |
| "epoch": 0.018001800180018002, | |
| "grad_norm": 7.254239559173584, | |
| "learning_rate": 4.000000000000001e-06, | |
| "loss": 3.1008956909179686, | |
| "memory(GiB)": 67.86, | |
| "step": 20, | |
| "token_acc": 0.4788295278208823, | |
| "train_speed(iter/s)": 0.038518 | |
| }, | |
| { | |
| "epoch": 0.027002700270027002, | |
| "grad_norm": 7.573488712310791, | |
| "learning_rate": 6e-06, | |
| "loss": 2.772838592529297, | |
| "memory(GiB)": 67.86, | |
| "step": 30, | |
| "token_acc": 0.5009667024704618, | |
| "train_speed(iter/s)": 0.039255 | |
| }, | |
| { | |
| "epoch": 0.036003600360036005, | |
| "grad_norm": 4.112278938293457, | |
| "learning_rate": 8.000000000000001e-06, | |
| "loss": 2.004438781738281, | |
| "memory(GiB)": 67.86, | |
| "step": 40, | |
| "token_acc": 0.5291459557162224, | |
| "train_speed(iter/s)": 0.03954 | |
| }, | |
| { | |
| "epoch": 0.045004500450045004, | |
| "grad_norm": 1.6541378498077393, | |
| "learning_rate": 1e-05, | |
| "loss": 1.8318557739257812, | |
| "memory(GiB)": 67.86, | |
| "step": 50, | |
| "token_acc": 0.5645268034414295, | |
| "train_speed(iter/s)": 0.039725 | |
| }, | |
| { | |
| "epoch": 0.054005400540054004, | |
| "grad_norm": 1.6028215885162354, | |
| "learning_rate": 1.2e-05, | |
| "loss": 1.5337283134460449, | |
| "memory(GiB)": 67.86, | |
| "step": 60, | |
| "token_acc": 0.6214239621423963, | |
| "train_speed(iter/s)": 0.039848 | |
| }, | |
| { | |
| "epoch": 0.063006300630063, | |
| "grad_norm": 1.5401345491409302, | |
| "learning_rate": 1.4e-05, | |
| "loss": 1.361149787902832, | |
| "memory(GiB)": 68.03, | |
| "step": 70, | |
| "token_acc": 0.6230786366674093, | |
| "train_speed(iter/s)": 0.040044 | |
| }, | |
| { | |
| "epoch": 0.07200720072007201, | |
| "grad_norm": 1.1455940008163452, | |
| "learning_rate": 1.6000000000000003e-05, | |
| "loss": 1.1177097320556642, | |
| "memory(GiB)": 68.03, | |
| "step": 80, | |
| "token_acc": 0.6871520342612419, | |
| "train_speed(iter/s)": 0.040185 | |
| }, | |
| { | |
| "epoch": 0.081008100810081, | |
| "grad_norm": 1.6210144758224487, | |
| "learning_rate": 1.8e-05, | |
| "loss": 0.9828067779541015, | |
| "memory(GiB)": 68.03, | |
| "step": 90, | |
| "token_acc": 0.727211495285137, | |
| "train_speed(iter/s)": 0.040239 | |
| }, | |
| { | |
| "epoch": 0.09000900090009001, | |
| "grad_norm": 1.810027837753296, | |
| "learning_rate": 2e-05, | |
| "loss": 0.9005414962768554, | |
| "memory(GiB)": 68.08, | |
| "step": 100, | |
| "token_acc": 0.7328737613097802, | |
| "train_speed(iter/s)": 0.040295 | |
| }, | |
| { | |
| "epoch": 0.09900990099009901, | |
| "grad_norm": 1.3532379865646362, | |
| "learning_rate": 1.9999527877255423e-05, | |
| "loss": 0.7943315982818604, | |
| "memory(GiB)": 68.08, | |
| "step": 110, | |
| "token_acc": 0.7508290957329207, | |
| "train_speed(iter/s)": 0.040336 | |
| }, | |
| { | |
| "epoch": 0.10801080108010801, | |
| "grad_norm": 1.267624855041504, | |
| "learning_rate": 1.999811155360166e-05, | |
| "loss": 0.7341960906982422, | |
| "memory(GiB)": 68.08, | |
| "step": 120, | |
| "token_acc": 0.7815533980582524, | |
| "train_speed(iter/s)": 0.040377 | |
| }, | |
| { | |
| "epoch": 0.11701170117011701, | |
| "grad_norm": 1.372360348701477, | |
| "learning_rate": 1.9995751162774435e-05, | |
| "loss": 0.7223796844482422, | |
| "memory(GiB)": 68.08, | |
| "step": 130, | |
| "token_acc": 0.7679759605065465, | |
| "train_speed(iter/s)": 0.040454 | |
| }, | |
| { | |
| "epoch": 0.126012601260126, | |
| "grad_norm": 1.3093749284744263, | |
| "learning_rate": 1.9992446927652592e-05, | |
| "loss": 0.6822004318237305, | |
| "memory(GiB)": 68.08, | |
| "step": 140, | |
| "token_acc": 0.7690819178671253, | |
| "train_speed(iter/s)": 0.04049 | |
| }, | |
| { | |
| "epoch": 0.135013501350135, | |
| "grad_norm": 1.5192912817001343, | |
| "learning_rate": 1.9988199160237038e-05, | |
| "loss": 0.6598445892333984, | |
| "memory(GiB)": 68.08, | |
| "step": 150, | |
| "token_acc": 0.794921875, | |
| "train_speed(iter/s)": 0.040521 | |
| }, | |
| { | |
| "epoch": 0.14401440144014402, | |
| "grad_norm": 1.4876294136047363, | |
| "learning_rate": 1.9983008261621295e-05, | |
| "loss": 0.6424094200134277, | |
| "memory(GiB)": 68.08, | |
| "step": 160, | |
| "token_acc": 0.784965034965035, | |
| "train_speed(iter/s)": 0.040569 | |
| }, | |
| { | |
| "epoch": 0.15301530153015303, | |
| "grad_norm": 1.7920929193496704, | |
| "learning_rate": 1.9976874721953625e-05, | |
| "loss": 0.6222011089324951, | |
| "memory(GiB)": 68.08, | |
| "step": 170, | |
| "token_acc": 0.7873362445414848, | |
| "train_speed(iter/s)": 0.040571 | |
| }, | |
| { | |
| "epoch": 0.162016201620162, | |
| "grad_norm": 1.496816873550415, | |
| "learning_rate": 1.996979912039074e-05, | |
| "loss": 0.6065957069396972, | |
| "memory(GiB)": 68.08, | |
| "step": 180, | |
| "token_acc": 0.797979797979798, | |
| "train_speed(iter/s)": 0.040601 | |
| }, | |
| { | |
| "epoch": 0.171017101710171, | |
| "grad_norm": 1.631809949874878, | |
| "learning_rate": 1.9961782125043134e-05, | |
| "loss": 0.6100308895111084, | |
| "memory(GiB)": 68.08, | |
| "step": 190, | |
| "token_acc": 0.7921653971708379, | |
| "train_speed(iter/s)": 0.040622 | |
| }, | |
| { | |
| "epoch": 0.18001800180018002, | |
| "grad_norm": 1.635206937789917, | |
| "learning_rate": 1.9952824492911967e-05, | |
| "loss": 0.597900390625, | |
| "memory(GiB)": 68.08, | |
| "step": 200, | |
| "token_acc": 0.8024363233665559, | |
| "train_speed(iter/s)": 0.040621 | |
| }, | |
| { | |
| "epoch": 0.18901890189018902, | |
| "grad_norm": 1.7094358205795288, | |
| "learning_rate": 1.9942927069817618e-05, | |
| "loss": 0.5765604972839355, | |
| "memory(GiB)": 68.08, | |
| "step": 210, | |
| "token_acc": 0.8184182015167931, | |
| "train_speed(iter/s)": 0.040647 | |
| }, | |
| { | |
| "epoch": 0.19801980198019803, | |
| "grad_norm": 1.7368491888046265, | |
| "learning_rate": 1.99320907903198e-05, | |
| "loss": 0.5700692176818848, | |
| "memory(GiB)": 68.08, | |
| "step": 220, | |
| "token_acc": 0.8155997378195324, | |
| "train_speed(iter/s)": 0.040667 | |
| }, | |
| { | |
| "epoch": 0.207020702070207, | |
| "grad_norm": 1.6970064640045166, | |
| "learning_rate": 1.9920316677629312e-05, | |
| "loss": 0.5586367607116699, | |
| "memory(GiB)": 68.08, | |
| "step": 230, | |
| "token_acc": 0.8134110787172012, | |
| "train_speed(iter/s)": 0.040669 | |
| }, | |
| { | |
| "epoch": 0.21602160216021601, | |
| "grad_norm": 1.6440980434417725, | |
| "learning_rate": 1.9907605843511434e-05, | |
| "loss": 0.5400181293487549, | |
| "memory(GiB)": 68.08, | |
| "step": 240, | |
| "token_acc": 0.8248341625207297, | |
| "train_speed(iter/s)": 0.040672 | |
| }, | |
| { | |
| "epoch": 0.22502250225022502, | |
| "grad_norm": 1.848779559135437, | |
| "learning_rate": 1.9893959488180948e-05, | |
| "loss": 0.5552643775939942, | |
| "memory(GiB)": 68.08, | |
| "step": 250, | |
| "token_acc": 0.8090929154711984, | |
| "train_speed(iter/s)": 0.040677 | |
| }, | |
| { | |
| "epoch": 0.23402340234023403, | |
| "grad_norm": 1.746717929840088, | |
| "learning_rate": 1.9879378900188796e-05, | |
| "loss": 0.5367072105407715, | |
| "memory(GiB)": 68.08, | |
| "step": 260, | |
| "token_acc": 0.8096885813148789, | |
| "train_speed(iter/s)": 0.040681 | |
| }, | |
| { | |
| "epoch": 0.24302430243024303, | |
| "grad_norm": 2.212620973587036, | |
| "learning_rate": 1.9863865456300422e-05, | |
| "loss": 0.5621134757995605, | |
| "memory(GiB)": 68.08, | |
| "step": 270, | |
| "token_acc": 0.8111765989958525, | |
| "train_speed(iter/s)": 0.040691 | |
| }, | |
| { | |
| "epoch": 0.252025202520252, | |
| "grad_norm": 1.815075159072876, | |
| "learning_rate": 1.9847420621365773e-05, | |
| "loss": 0.5444355964660644, | |
| "memory(GiB)": 68.08, | |
| "step": 280, | |
| "token_acc": 0.8252319929297393, | |
| "train_speed(iter/s)": 0.040694 | |
| }, | |
| { | |
| "epoch": 0.26102610261026105, | |
| "grad_norm": 1.6822190284729004, | |
| "learning_rate": 1.983004594818096e-05, | |
| "loss": 0.509169626235962, | |
| "memory(GiB)": 68.08, | |
| "step": 290, | |
| "token_acc": 0.8245873889123995, | |
| "train_speed(iter/s)": 0.040697 | |
| }, | |
| { | |
| "epoch": 0.27002700270027, | |
| "grad_norm": 1.7498018741607666, | |
| "learning_rate": 1.981174307734167e-05, | |
| "loss": 0.5199090480804444, | |
| "memory(GiB)": 68.08, | |
| "step": 300, | |
| "token_acc": 0.8331916702082448, | |
| "train_speed(iter/s)": 0.040678 | |
| }, | |
| { | |
| "epoch": 0.279027902790279, | |
| "grad_norm": 1.875012755393982, | |
| "learning_rate": 1.9792513737088223e-05, | |
| "loss": 0.5095804691314697, | |
| "memory(GiB)": 68.08, | |
| "step": 310, | |
| "token_acc": 0.8261736049601417, | |
| "train_speed(iter/s)": 0.040669 | |
| }, | |
| { | |
| "epoch": 0.28802880288028804, | |
| "grad_norm": 1.8016622066497803, | |
| "learning_rate": 1.9772359743142396e-05, | |
| "loss": 0.49691128730773926, | |
| "memory(GiB)": 68.08, | |
| "step": 320, | |
| "token_acc": 0.8243214362043172, | |
| "train_speed(iter/s)": 0.04068 | |
| }, | |
| { | |
| "epoch": 0.297029702970297, | |
| "grad_norm": 1.927909016609192, | |
| "learning_rate": 1.975128299853598e-05, | |
| "loss": 0.5156735897064209, | |
| "memory(GiB)": 68.08, | |
| "step": 330, | |
| "token_acc": 0.8241394527802295, | |
| "train_speed(iter/s)": 0.040684 | |
| }, | |
| { | |
| "epoch": 0.30603060306030605, | |
| "grad_norm": 1.7440602779388428, | |
| "learning_rate": 1.9729285493431074e-05, | |
| "loss": 0.5245149612426758, | |
| "memory(GiB)": 68.24, | |
| "step": 340, | |
| "token_acc": 0.8179177837354781, | |
| "train_speed(iter/s)": 0.040684 | |
| }, | |
| { | |
| "epoch": 0.31503150315031503, | |
| "grad_norm": 1.9903383255004883, | |
| "learning_rate": 1.9706369304932176e-05, | |
| "loss": 0.5069475173950195, | |
| "memory(GiB)": 68.3, | |
| "step": 350, | |
| "token_acc": 0.8318876497315159, | |
| "train_speed(iter/s)": 0.040686 | |
| }, | |
| { | |
| "epoch": 0.324032403240324, | |
| "grad_norm": 1.9196044206619263, | |
| "learning_rate": 1.968253659689005e-05, | |
| "loss": 0.5040374279022217, | |
| "memory(GiB)": 68.3, | |
| "step": 360, | |
| "token_acc": 0.8283985303652475, | |
| "train_speed(iter/s)": 0.040682 | |
| }, | |
| { | |
| "epoch": 0.33303330333033304, | |
| "grad_norm": 1.9835383892059326, | |
| "learning_rate": 1.96577896196974e-05, | |
| "loss": 0.5163045883178711, | |
| "memory(GiB)": 68.3, | |
| "step": 370, | |
| "token_acc": 0.8187339406680683, | |
| "train_speed(iter/s)": 0.040679 | |
| }, | |
| { | |
| "epoch": 0.342034203420342, | |
| "grad_norm": 2.098388195037842, | |
| "learning_rate": 1.9632130710076383e-05, | |
| "loss": 0.5065926074981689, | |
| "memory(GiB)": 68.3, | |
| "step": 380, | |
| "token_acc": 0.8242616033755275, | |
| "train_speed(iter/s)": 0.04068 | |
| }, | |
| { | |
| "epoch": 0.35103510351035105, | |
| "grad_norm": 1.8806556463241577, | |
| "learning_rate": 1.960556229085797e-05, | |
| "loss": 0.4967801094055176, | |
| "memory(GiB)": 68.3, | |
| "step": 390, | |
| "token_acc": 0.8285966071821987, | |
| "train_speed(iter/s)": 0.040692 | |
| }, | |
| { | |
| "epoch": 0.36003600360036003, | |
| "grad_norm": 2.0447497367858887, | |
| "learning_rate": 1.9578086870753153e-05, | |
| "loss": 0.5042286872863769, | |
| "memory(GiB)": 68.3, | |
| "step": 400, | |
| "token_acc": 0.8263780406159339, | |
| "train_speed(iter/s)": 0.040693 | |
| }, | |
| { | |
| "epoch": 0.369036903690369, | |
| "grad_norm": 1.947168231010437, | |
| "learning_rate": 1.954970704411609e-05, | |
| "loss": 0.5015206336975098, | |
| "memory(GiB)": 68.3, | |
| "step": 410, | |
| "token_acc": 0.8200773860705073, | |
| "train_speed(iter/s)": 0.04069 | |
| }, | |
| { | |
| "epoch": 0.37803780378037805, | |
| "grad_norm": 1.855016827583313, | |
| "learning_rate": 1.9520425490699107e-05, | |
| "loss": 0.4870131492614746, | |
| "memory(GiB)": 68.3, | |
| "step": 420, | |
| "token_acc": 0.8407563025210084, | |
| "train_speed(iter/s)": 0.040704 | |
| }, | |
| { | |
| "epoch": 0.387038703870387, | |
| "grad_norm": 1.8995352983474731, | |
| "learning_rate": 1.9490244975399678e-05, | |
| "loss": 0.48991098403930666, | |
| "memory(GiB)": 68.3, | |
| "step": 430, | |
| "token_acc": 0.8367172472750588, | |
| "train_speed(iter/s)": 0.040707 | |
| }, | |
| { | |
| "epoch": 0.39603960396039606, | |
| "grad_norm": 1.9746062755584717, | |
| "learning_rate": 1.9459168347999343e-05, | |
| "loss": 0.49413495063781737, | |
| "memory(GiB)": 68.3, | |
| "step": 440, | |
| "token_acc": 0.8217993079584776, | |
| "train_speed(iter/s)": 0.040722 | |
| }, | |
| { | |
| "epoch": 0.40504050405040504, | |
| "grad_norm": 1.9922826290130615, | |
| "learning_rate": 1.9427198542894628e-05, | |
| "loss": 0.478054141998291, | |
| "memory(GiB)": 68.3, | |
| "step": 450, | |
| "token_acc": 0.8396687194733489, | |
| "train_speed(iter/s)": 0.040729 | |
| }, | |
| { | |
| "epoch": 0.414041404140414, | |
| "grad_norm": 1.8262529373168945, | |
| "learning_rate": 1.9394338578819957e-05, | |
| "loss": 0.4965967178344727, | |
| "memory(GiB)": 68.3, | |
| "step": 460, | |
| "token_acc": 0.8291083916083916, | |
| "train_speed(iter/s)": 0.04073 | |
| }, | |
| { | |
| "epoch": 0.42304230423042305, | |
| "grad_norm": 1.6194044351577759, | |
| "learning_rate": 1.936059155856262e-05, | |
| "loss": 0.47453508377075193, | |
| "memory(GiB)": 68.3, | |
| "step": 470, | |
| "token_acc": 0.8382074479276247, | |
| "train_speed(iter/s)": 0.040729 | |
| }, | |
| { | |
| "epoch": 0.43204320432043203, | |
| "grad_norm": 1.9184072017669678, | |
| "learning_rate": 1.932596066866978e-05, | |
| "loss": 0.4665153980255127, | |
| "memory(GiB)": 68.3, | |
| "step": 480, | |
| "token_acc": 0.8344993441189331, | |
| "train_speed(iter/s)": 0.040723 | |
| }, | |
| { | |
| "epoch": 0.44104410441044106, | |
| "grad_norm": 1.7491145133972168, | |
| "learning_rate": 1.929044917914759e-05, | |
| "loss": 0.4606966972351074, | |
| "memory(GiB)": 68.3, | |
| "step": 490, | |
| "token_acc": 0.84466817341278, | |
| "train_speed(iter/s)": 0.040709 | |
| }, | |
| { | |
| "epoch": 0.45004500450045004, | |
| "grad_norm": 1.97507643699646, | |
| "learning_rate": 1.9254060443152435e-05, | |
| "loss": 0.47635550498962403, | |
| "memory(GiB)": 68.3, | |
| "step": 500, | |
| "token_acc": 0.8395522388059702, | |
| "train_speed(iter/s)": 0.040715 | |
| }, | |
| { | |
| "epoch": 0.45004500450045004, | |
| "eval_loss": 0.48444515466690063, | |
| "eval_runtime": 117.4773, | |
| "eval_samples_per_second": 12.215, | |
| "eval_steps_per_second": 0.383, | |
| "eval_token_acc": 0.8321749696233293, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.459045904590459, | |
| "grad_norm": 2.1366748809814453, | |
| "learning_rate": 1.921679789667429e-05, | |
| "loss": 0.4868021965026855, | |
| "memory(GiB)": 74.54, | |
| "step": 510, | |
| "token_acc": 0.8326992287917738, | |
| "train_speed(iter/s)": 0.040314 | |
| }, | |
| { | |
| "epoch": 0.46804680468046805, | |
| "grad_norm": 2.1436243057250977, | |
| "learning_rate": 1.9178665058212306e-05, | |
| "loss": 0.4831557273864746, | |
| "memory(GiB)": 74.54, | |
| "step": 520, | |
| "token_acc": 0.8337397472844159, | |
| "train_speed(iter/s)": 0.04031 | |
| }, | |
| { | |
| "epoch": 0.47704770477047703, | |
| "grad_norm": 1.887610673904419, | |
| "learning_rate": 1.9139665528442544e-05, | |
| "loss": 0.4900979995727539, | |
| "memory(GiB)": 74.54, | |
| "step": 530, | |
| "token_acc": 0.8252338580880675, | |
| "train_speed(iter/s)": 0.040315 | |
| }, | |
| { | |
| "epoch": 0.48604860486048607, | |
| "grad_norm": 1.778539776802063, | |
| "learning_rate": 1.909980298987802e-05, | |
| "loss": 0.4595688819885254, | |
| "memory(GiB)": 74.54, | |
| "step": 540, | |
| "token_acc": 0.8390126692878986, | |
| "train_speed(iter/s)": 0.040313 | |
| }, | |
| { | |
| "epoch": 0.49504950495049505, | |
| "grad_norm": 2.031074285507202, | |
| "learning_rate": 1.9059081206520954e-05, | |
| "loss": 0.47982397079467776, | |
| "memory(GiB)": 74.54, | |
| "step": 550, | |
| "token_acc": 0.8332963374028857, | |
| "train_speed(iter/s)": 0.040319 | |
| }, | |
| { | |
| "epoch": 0.504050405040504, | |
| "grad_norm": 1.7411119937896729, | |
| "learning_rate": 1.9017504023507366e-05, | |
| "loss": 0.47092242240905763, | |
| "memory(GiB)": 74.54, | |
| "step": 560, | |
| "token_acc": 0.8331826401446655, | |
| "train_speed(iter/s)": 0.040327 | |
| }, | |
| { | |
| "epoch": 0.513051305130513, | |
| "grad_norm": 1.9507403373718262, | |
| "learning_rate": 1.897507536674401e-05, | |
| "loss": 0.473051929473877, | |
| "memory(GiB)": 74.54, | |
| "step": 570, | |
| "token_acc": 0.8324808184143222, | |
| "train_speed(iter/s)": 0.040337 | |
| }, | |
| { | |
| "epoch": 0.5220522052205221, | |
| "grad_norm": 1.8194775581359863, | |
| "learning_rate": 1.8931799242537664e-05, | |
| "loss": 0.4804567813873291, | |
| "memory(GiB)": 74.54, | |
| "step": 580, | |
| "token_acc": 0.8376344086021505, | |
| "train_speed(iter/s)": 0.04034 | |
| }, | |
| { | |
| "epoch": 0.5310531053105311, | |
| "grad_norm": 1.663552165031433, | |
| "learning_rate": 1.8887679737216835e-05, | |
| "loss": 0.4625405311584473, | |
| "memory(GiB)": 74.54, | |
| "step": 590, | |
| "token_acc": 0.8455850369725968, | |
| "train_speed(iter/s)": 0.04034 | |
| }, | |
| { | |
| "epoch": 0.54005400540054, | |
| "grad_norm": 1.968461036682129, | |
| "learning_rate": 1.8842721016745905e-05, | |
| "loss": 0.4602372646331787, | |
| "memory(GiB)": 74.54, | |
| "step": 600, | |
| "token_acc": 0.8317933641327173, | |
| "train_speed(iter/s)": 0.040343 | |
| }, | |
| { | |
| "epoch": 0.549054905490549, | |
| "grad_norm": 1.9484490156173706, | |
| "learning_rate": 1.8796927326331783e-05, | |
| "loss": 0.45257129669189455, | |
| "memory(GiB)": 74.54, | |
| "step": 610, | |
| "token_acc": 0.8373316498316499, | |
| "train_speed(iter/s)": 0.040343 | |
| }, | |
| { | |
| "epoch": 0.558055805580558, | |
| "grad_norm": 2.0010809898376465, | |
| "learning_rate": 1.8750302990023023e-05, | |
| "loss": 0.4624796390533447, | |
| "memory(GiB)": 74.54, | |
| "step": 620, | |
| "token_acc": 0.8330117899249732, | |
| "train_speed(iter/s)": 0.04035 | |
| }, | |
| { | |
| "epoch": 0.5670567056705671, | |
| "grad_norm": 2.1292455196380615, | |
| "learning_rate": 1.8702852410301556e-05, | |
| "loss": 0.4666603565216064, | |
| "memory(GiB)": 74.54, | |
| "step": 630, | |
| "token_acc": 0.8413180143073922, | |
| "train_speed(iter/s)": 0.040354 | |
| }, | |
| { | |
| "epoch": 0.5760576057605761, | |
| "grad_norm": 1.8475803136825562, | |
| "learning_rate": 1.865458006766696e-05, | |
| "loss": 0.4536900520324707, | |
| "memory(GiB)": 74.54, | |
| "step": 640, | |
| "token_acc": 0.8346206269877329, | |
| "train_speed(iter/s)": 0.040359 | |
| }, | |
| { | |
| "epoch": 0.585058505850585, | |
| "grad_norm": 1.9390885829925537, | |
| "learning_rate": 1.860549052021342e-05, | |
| "loss": 0.4544112205505371, | |
| "memory(GiB)": 74.54, | |
| "step": 650, | |
| "token_acc": 0.8367626886145405, | |
| "train_speed(iter/s)": 0.040355 | |
| }, | |
| { | |
| "epoch": 0.594059405940594, | |
| "grad_norm": 1.7429540157318115, | |
| "learning_rate": 1.8555588403199304e-05, | |
| "loss": 0.4384955406188965, | |
| "memory(GiB)": 74.54, | |
| "step": 660, | |
| "token_acc": 0.8417298261257244, | |
| "train_speed(iter/s)": 0.04035 | |
| }, | |
| { | |
| "epoch": 0.603060306030603, | |
| "grad_norm": 2.0337181091308594, | |
| "learning_rate": 1.8504878428609506e-05, | |
| "loss": 0.46024494171142577, | |
| "memory(GiB)": 74.54, | |
| "step": 670, | |
| "token_acc": 0.8392979256895373, | |
| "train_speed(iter/s)": 0.040343 | |
| }, | |
| { | |
| "epoch": 0.6120612061206121, | |
| "grad_norm": 1.9363151788711548, | |
| "learning_rate": 1.8453365384710506e-05, | |
| "loss": 0.4446521759033203, | |
| "memory(GiB)": 74.54, | |
| "step": 680, | |
| "token_acc": 0.8308807379749615, | |
| "train_speed(iter/s)": 0.04034 | |
| }, | |
| { | |
| "epoch": 0.6210621062106211, | |
| "grad_norm": 1.9249675273895264, | |
| "learning_rate": 1.8401054135598228e-05, | |
| "loss": 0.44910879135131837, | |
| "memory(GiB)": 74.54, | |
| "step": 690, | |
| "token_acc": 0.8436960276338514, | |
| "train_speed(iter/s)": 0.040347 | |
| }, | |
| { | |
| "epoch": 0.6300630063006301, | |
| "grad_norm": 2.0293335914611816, | |
| "learning_rate": 1.834794962073878e-05, | |
| "loss": 0.4501783847808838, | |
| "memory(GiB)": 74.54, | |
| "step": 700, | |
| "token_acc": 0.8366346742903819, | |
| "train_speed(iter/s)": 0.040353 | |
| }, | |
| { | |
| "epoch": 0.639063906390639, | |
| "grad_norm": 2.1260316371917725, | |
| "learning_rate": 1.829405685450202e-05, | |
| "loss": 0.4506657600402832, | |
| "memory(GiB)": 74.54, | |
| "step": 710, | |
| "token_acc": 0.8333333333333334, | |
| "train_speed(iter/s)": 0.040362 | |
| }, | |
| { | |
| "epoch": 0.648064806480648, | |
| "grad_norm": 1.8729071617126465, | |
| "learning_rate": 1.8239380925688087e-05, | |
| "loss": 0.4430402755737305, | |
| "memory(GiB)": 74.54, | |
| "step": 720, | |
| "token_acc": 0.8478399659502022, | |
| "train_speed(iter/s)": 0.040365 | |
| }, | |
| { | |
| "epoch": 0.6570657065706571, | |
| "grad_norm": 1.9187947511672974, | |
| "learning_rate": 1.8183926997046905e-05, | |
| "loss": 0.4478912353515625, | |
| "memory(GiB)": 74.54, | |
| "step": 730, | |
| "token_acc": 0.8519141775347077, | |
| "train_speed(iter/s)": 0.040364 | |
| }, | |
| { | |
| "epoch": 0.6660666066606661, | |
| "grad_norm": 2.07631254196167, | |
| "learning_rate": 1.812770030479066e-05, | |
| "loss": 0.4402505397796631, | |
| "memory(GiB)": 74.54, | |
| "step": 740, | |
| "token_acc": 0.8526605893576426, | |
| "train_speed(iter/s)": 0.040366 | |
| }, | |
| { | |
| "epoch": 0.6750675067506751, | |
| "grad_norm": 1.8189442157745361, | |
| "learning_rate": 1.8070706158099417e-05, | |
| "loss": 0.4404914855957031, | |
| "memory(GiB)": 74.54, | |
| "step": 750, | |
| "token_acc": 0.8409304511278195, | |
| "train_speed(iter/s)": 0.040367 | |
| }, | |
| { | |
| "epoch": 0.684068406840684, | |
| "grad_norm": 1.9871678352355957, | |
| "learning_rate": 1.8012949938619756e-05, | |
| "loss": 0.4483049392700195, | |
| "memory(GiB)": 74.54, | |
| "step": 760, | |
| "token_acc": 0.8431750106974754, | |
| "train_speed(iter/s)": 0.040371 | |
| }, | |
| { | |
| "epoch": 0.693069306930693, | |
| "grad_norm": 1.8938976526260376, | |
| "learning_rate": 1.7954437099956657e-05, | |
| "loss": 0.44423818588256836, | |
| "memory(GiB)": 74.54, | |
| "step": 770, | |
| "token_acc": 0.8477157360406091, | |
| "train_speed(iter/s)": 0.040371 | |
| }, | |
| { | |
| "epoch": 0.7020702070207021, | |
| "grad_norm": 1.8947218656539917, | |
| "learning_rate": 1.7895173167158514e-05, | |
| "loss": 0.4492767333984375, | |
| "memory(GiB)": 74.54, | |
| "step": 780, | |
| "token_acc": 0.837278737470676, | |
| "train_speed(iter/s)": 0.040374 | |
| }, | |
| { | |
| "epoch": 0.7110711071107111, | |
| "grad_norm": 1.9695574045181274, | |
| "learning_rate": 1.7835163736195447e-05, | |
| "loss": 0.44904842376708987, | |
| "memory(GiB)": 74.54, | |
| "step": 790, | |
| "token_acc": 0.8408003479773815, | |
| "train_speed(iter/s)": 0.040375 | |
| }, | |
| { | |
| "epoch": 0.7200720072007201, | |
| "grad_norm": 2.00817608833313, | |
| "learning_rate": 1.777441447343091e-05, | |
| "loss": 0.45390868186950684, | |
| "memory(GiB)": 74.54, | |
| "step": 800, | |
| "token_acc": 0.8411726099321811, | |
| "train_speed(iter/s)": 0.040379 | |
| }, | |
| { | |
| "epoch": 0.729072907290729, | |
| "grad_norm": 2.0400583744049072, | |
| "learning_rate": 1.7712931115086633e-05, | |
| "loss": 0.4411576747894287, | |
| "memory(GiB)": 74.54, | |
| "step": 810, | |
| "token_acc": 0.8399218071242398, | |
| "train_speed(iter/s)": 0.04038 | |
| }, | |
| { | |
| "epoch": 0.738073807380738, | |
| "grad_norm": 2.0157155990600586, | |
| "learning_rate": 1.7650719466700994e-05, | |
| "loss": 0.44756488800048827, | |
| "memory(GiB)": 74.54, | |
| "step": 820, | |
| "token_acc": 0.842788038698329, | |
| "train_speed(iter/s)": 0.040376 | |
| }, | |
| { | |
| "epoch": 0.7470747074707471, | |
| "grad_norm": 1.7088335752487183, | |
| "learning_rate": 1.7587785402580828e-05, | |
| "loss": 0.43597002029418946, | |
| "memory(GiB)": 74.54, | |
| "step": 830, | |
| "token_acc": 0.8466036887089519, | |
| "train_speed(iter/s)": 0.040379 | |
| }, | |
| { | |
| "epoch": 0.7560756075607561, | |
| "grad_norm": 2.1911604404449463, | |
| "learning_rate": 1.752413486524675e-05, | |
| "loss": 0.44062347412109376, | |
| "memory(GiB)": 74.54, | |
| "step": 840, | |
| "token_acc": 0.8505315822388994, | |
| "train_speed(iter/s)": 0.040375 | |
| }, | |
| { | |
| "epoch": 0.7650765076507651, | |
| "grad_norm": 2.0149848461151123, | |
| "learning_rate": 1.7459773864872042e-05, | |
| "loss": 0.4424751281738281, | |
| "memory(GiB)": 74.54, | |
| "step": 850, | |
| "token_acc": 0.8476879246110015, | |
| "train_speed(iter/s)": 0.040376 | |
| }, | |
| { | |
| "epoch": 0.774077407740774, | |
| "grad_norm": 1.9014195203781128, | |
| "learning_rate": 1.7394708478715127e-05, | |
| "loss": 0.4621281623840332, | |
| "memory(GiB)": 74.54, | |
| "step": 860, | |
| "token_acc": 0.8423601937472479, | |
| "train_speed(iter/s)": 0.040378 | |
| }, | |
| { | |
| "epoch": 0.783078307830783, | |
| "grad_norm": 2.0565760135650635, | |
| "learning_rate": 1.7328944850545745e-05, | |
| "loss": 0.4593350410461426, | |
| "memory(GiB)": 74.54, | |
| "step": 870, | |
| "token_acc": 0.8399521531100479, | |
| "train_speed(iter/s)": 0.040378 | |
| }, | |
| { | |
| "epoch": 0.7920792079207921, | |
| "grad_norm": 2.0428307056427, | |
| "learning_rate": 1.7262489190064818e-05, | |
| "loss": 0.43943395614624026, | |
| "memory(GiB)": 74.54, | |
| "step": 880, | |
| "token_acc": 0.8423470453121737, | |
| "train_speed(iter/s)": 0.04038 | |
| }, | |
| { | |
| "epoch": 0.8010801080108011, | |
| "grad_norm": 2.316945791244507, | |
| "learning_rate": 1.7195347772318116e-05, | |
| "loss": 0.43985910415649415, | |
| "memory(GiB)": 74.54, | |
| "step": 890, | |
| "token_acc": 0.8351231838281743, | |
| "train_speed(iter/s)": 0.040379 | |
| }, | |
| { | |
| "epoch": 0.8100810081008101, | |
| "grad_norm": 2.041092872619629, | |
| "learning_rate": 1.7127526937103713e-05, | |
| "loss": 0.4424757957458496, | |
| "memory(GiB)": 74.54, | |
| "step": 900, | |
| "token_acc": 0.841919080256467, | |
| "train_speed(iter/s)": 0.04038 | |
| }, | |
| { | |
| "epoch": 0.819081908190819, | |
| "grad_norm": 2.2010583877563477, | |
| "learning_rate": 1.705903308837339e-05, | |
| "loss": 0.4423489570617676, | |
| "memory(GiB)": 74.54, | |
| "step": 910, | |
| "token_acc": 0.8436460412508316, | |
| "train_speed(iter/s)": 0.040384 | |
| }, | |
| { | |
| "epoch": 0.828082808280828, | |
| "grad_norm": 1.804849624633789, | |
| "learning_rate": 1.6989872693627916e-05, | |
| "loss": 0.43178791999816896, | |
| "memory(GiB)": 74.54, | |
| "step": 920, | |
| "token_acc": 0.8569312169312169, | |
| "train_speed(iter/s)": 0.040391 | |
| }, | |
| { | |
| "epoch": 0.8370837083708371, | |
| "grad_norm": 2.3260996341705322, | |
| "learning_rate": 1.6920052283306364e-05, | |
| "loss": 0.4507165431976318, | |
| "memory(GiB)": 74.54, | |
| "step": 930, | |
| "token_acc": 0.8385640099345225, | |
| "train_speed(iter/s)": 0.040389 | |
| }, | |
| { | |
| "epoch": 0.8460846084608461, | |
| "grad_norm": 2.029878616333008, | |
| "learning_rate": 1.684957845016949e-05, | |
| "loss": 0.423465633392334, | |
| "memory(GiB)": 74.54, | |
| "step": 940, | |
| "token_acc": 0.8474983613720778, | |
| "train_speed(iter/s)": 0.040396 | |
| }, | |
| { | |
| "epoch": 0.8550855085508551, | |
| "grad_norm": 2.0568127632141113, | |
| "learning_rate": 1.677845784867719e-05, | |
| "loss": 0.426534366607666, | |
| "memory(GiB)": 74.54, | |
| "step": 950, | |
| "token_acc": 0.8443046506403056, | |
| "train_speed(iter/s)": 0.040397 | |
| }, | |
| { | |
| "epoch": 0.8640864086408641, | |
| "grad_norm": 2.0447678565979004, | |
| "learning_rate": 1.6706697194360186e-05, | |
| "loss": 0.43904976844787597, | |
| "memory(GiB)": 74.54, | |
| "step": 960, | |
| "token_acc": 0.843986543313709, | |
| "train_speed(iter/s)": 0.040403 | |
| }, | |
| { | |
| "epoch": 0.873087308730873, | |
| "grad_norm": 1.8627592325210571, | |
| "learning_rate": 1.6634303263185885e-05, | |
| "loss": 0.4334832191467285, | |
| "memory(GiB)": 74.54, | |
| "step": 970, | |
| "token_acc": 0.8500109003706126, | |
| "train_speed(iter/s)": 0.040406 | |
| }, | |
| { | |
| "epoch": 0.8820882088208821, | |
| "grad_norm": 2.1592624187469482, | |
| "learning_rate": 1.656128289091859e-05, | |
| "loss": 0.43813695907592776, | |
| "memory(GiB)": 74.54, | |
| "step": 980, | |
| "token_acc": 0.8389203308663474, | |
| "train_speed(iter/s)": 0.040402 | |
| }, | |
| { | |
| "epoch": 0.8910891089108911, | |
| "grad_norm": 1.7612345218658447, | |
| "learning_rate": 1.6487642972474006e-05, | |
| "loss": 0.43879289627075196, | |
| "memory(GiB)": 74.54, | |
| "step": 990, | |
| "token_acc": 0.8460222412318221, | |
| "train_speed(iter/s)": 0.040402 | |
| }, | |
| { | |
| "epoch": 0.9000900090009001, | |
| "grad_norm": 2.0122318267822266, | |
| "learning_rate": 1.641339046126822e-05, | |
| "loss": 0.4455322265625, | |
| "memory(GiB)": 74.54, | |
| "step": 1000, | |
| "token_acc": 0.8455068614431164, | |
| "train_speed(iter/s)": 0.040397 | |
| }, | |
| { | |
| "epoch": 0.9000900090009001, | |
| "eval_loss": 0.43926388025283813, | |
| "eval_runtime": 113.4684, | |
| "eval_samples_per_second": 12.647, | |
| "eval_steps_per_second": 0.397, | |
| "eval_token_acc": 0.8410206561360875, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 0.9090909090909091, | |
| "grad_norm": 2.066300630569458, | |
| "learning_rate": 1.6338532368561105e-05, | |
| "loss": 0.4375774383544922, | |
| "memory(GiB)": 74.54, | |
| "step": 1010, | |
| "token_acc": 0.8390414378432351, | |
| "train_speed(iter/s)": 0.040187 | |
| }, | |
| { | |
| "epoch": 0.918091809180918, | |
| "grad_norm": 2.2568578720092773, | |
| "learning_rate": 1.62630757627943e-05, | |
| "loss": 0.4385653495788574, | |
| "memory(GiB)": 74.54, | |
| "step": 1020, | |
| "token_acc": 0.8342480790340285, | |
| "train_speed(iter/s)": 0.040185 | |
| }, | |
| { | |
| "epoch": 0.9270927092709271, | |
| "grad_norm": 1.963052749633789, | |
| "learning_rate": 1.6187027768923767e-05, | |
| "loss": 0.43105306625366213, | |
| "memory(GiB)": 74.54, | |
| "step": 1030, | |
| "token_acc": 0.8509454949944383, | |
| "train_speed(iter/s)": 0.040187 | |
| }, | |
| { | |
| "epoch": 0.9360936093609361, | |
| "grad_norm": 1.902685523033142, | |
| "learning_rate": 1.6110395567747025e-05, | |
| "loss": 0.4382938385009766, | |
| "memory(GiB)": 74.54, | |
| "step": 1040, | |
| "token_acc": 0.8346938775510204, | |
| "train_speed(iter/s)": 0.040185 | |
| }, | |
| { | |
| "epoch": 0.9450945094509451, | |
| "grad_norm": 1.8732327222824097, | |
| "learning_rate": 1.6033186395225095e-05, | |
| "loss": 0.41572961807250974, | |
| "memory(GiB)": 74.54, | |
| "step": 1050, | |
| "token_acc": 0.85475935828877, | |
| "train_speed(iter/s)": 0.04019 | |
| }, | |
| { | |
| "epoch": 0.9540954095409541, | |
| "grad_norm": 1.869422197341919, | |
| "learning_rate": 1.5955407541799274e-05, | |
| "loss": 0.43001718521118165, | |
| "memory(GiB)": 74.54, | |
| "step": 1060, | |
| "token_acc": 0.8342636324602833, | |
| "train_speed(iter/s)": 0.040189 | |
| }, | |
| { | |
| "epoch": 0.963096309630963, | |
| "grad_norm": 2.065873861312866, | |
| "learning_rate": 1.5877066351702707e-05, | |
| "loss": 0.43995866775512693, | |
| "memory(GiB)": 74.54, | |
| "step": 1070, | |
| "token_acc": 0.8477516059957173, | |
| "train_speed(iter/s)": 0.040194 | |
| }, | |
| { | |
| "epoch": 0.9720972097209721, | |
| "grad_norm": 2.1846609115600586, | |
| "learning_rate": 1.5798170222266933e-05, | |
| "loss": 0.4312899589538574, | |
| "memory(GiB)": 74.54, | |
| "step": 1080, | |
| "token_acc": 0.8568353067814855, | |
| "train_speed(iter/s)": 0.040196 | |
| }, | |
| { | |
| "epoch": 0.9810981098109811, | |
| "grad_norm": 2.151474714279175, | |
| "learning_rate": 1.571872660322338e-05, | |
| "loss": 0.431905460357666, | |
| "memory(GiB)": 74.54, | |
| "step": 1090, | |
| "token_acc": 0.8473539953615855, | |
| "train_speed(iter/s)": 0.040202 | |
| }, | |
| { | |
| "epoch": 0.9900990099009901, | |
| "grad_norm": 2.0136258602142334, | |
| "learning_rate": 1.563874299599995e-05, | |
| "loss": 0.4207723140716553, | |
| "memory(GiB)": 74.54, | |
| "step": 1100, | |
| "token_acc": 0.8404571428571429, | |
| "train_speed(iter/s)": 0.040206 | |
| }, | |
| { | |
| "epoch": 0.9990999099909991, | |
| "grad_norm": 2.0286359786987305, | |
| "learning_rate": 1.555822695301266e-05, | |
| "loss": 0.41998815536499023, | |
| "memory(GiB)": 74.54, | |
| "step": 1110, | |
| "token_acc": 0.8462002412545235, | |
| "train_speed(iter/s)": 0.040207 | |
| }, | |
| { | |
| "epoch": 1.008100810081008, | |
| "grad_norm": 2.1229543685913086, | |
| "learning_rate": 1.5477186076952567e-05, | |
| "loss": 0.41786656379699705, | |
| "memory(GiB)": 74.54, | |
| "step": 1120, | |
| "token_acc": 0.8457294195541823, | |
| "train_speed(iter/s)": 0.040226 | |
| }, | |
| { | |
| "epoch": 1.0171017101710171, | |
| "grad_norm": 2.2496182918548584, | |
| "learning_rate": 1.5395628020067825e-05, | |
| "loss": 0.41992764472961425, | |
| "memory(GiB)": 74.54, | |
| "step": 1130, | |
| "token_acc": 0.8452407614781635, | |
| "train_speed(iter/s)": 0.040225 | |
| }, | |
| { | |
| "epoch": 1.026102610261026, | |
| "grad_norm": 2.0818288326263428, | |
| "learning_rate": 1.531356048344117e-05, | |
| "loss": 0.41519851684570314, | |
| "memory(GiB)": 74.54, | |
| "step": 1140, | |
| "token_acc": 0.8480816145486804, | |
| "train_speed(iter/s)": 0.040226 | |
| }, | |
| { | |
| "epoch": 1.035103510351035, | |
| "grad_norm": 1.9498157501220703, | |
| "learning_rate": 1.523099121626273e-05, | |
| "loss": 0.4007615089416504, | |
| "memory(GiB)": 74.54, | |
| "step": 1150, | |
| "token_acc": 0.8642224012892828, | |
| "train_speed(iter/s)": 0.040229 | |
| }, | |
| { | |
| "epoch": 1.0441044104410442, | |
| "grad_norm": 2.238085985183716, | |
| "learning_rate": 1.5147928015098309e-05, | |
| "loss": 0.416591739654541, | |
| "memory(GiB)": 74.54, | |
| "step": 1160, | |
| "token_acc": 0.8449678800856532, | |
| "train_speed(iter/s)": 0.040231 | |
| }, | |
| { | |
| "epoch": 1.053105310531053, | |
| "grad_norm": 1.884536862373352, | |
| "learning_rate": 1.506437872315321e-05, | |
| "loss": 0.4058389663696289, | |
| "memory(GiB)": 74.54, | |
| "step": 1170, | |
| "token_acc": 0.8544316996871741, | |
| "train_speed(iter/s)": 0.040234 | |
| }, | |
| { | |
| "epoch": 1.0621062106210621, | |
| "grad_norm": 2.506772041320801, | |
| "learning_rate": 1.4980351229531642e-05, | |
| "loss": 0.4066319465637207, | |
| "memory(GiB)": 74.54, | |
| "step": 1180, | |
| "token_acc": 0.8476423487544484, | |
| "train_speed(iter/s)": 0.040236 | |
| }, | |
| { | |
| "epoch": 1.071107110711071, | |
| "grad_norm": 2.208542823791504, | |
| "learning_rate": 1.4895853468491779e-05, | |
| "loss": 0.4183638572692871, | |
| "memory(GiB)": 74.54, | |
| "step": 1190, | |
| "token_acc": 0.8479634066652145, | |
| "train_speed(iter/s)": 0.040233 | |
| }, | |
| { | |
| "epoch": 1.08010801080108, | |
| "grad_norm": 2.0623791217803955, | |
| "learning_rate": 1.4810893418696595e-05, | |
| "loss": 0.4236001014709473, | |
| "memory(GiB)": 74.54, | |
| "step": 1200, | |
| "token_acc": 0.8621627274628739, | |
| "train_speed(iter/s)": 0.040231 | |
| }, | |
| { | |
| "epoch": 1.0891089108910892, | |
| "grad_norm": 1.9633852243423462, | |
| "learning_rate": 1.4725479102460467e-05, | |
| "loss": 0.4070269584655762, | |
| "memory(GiB)": 74.54, | |
| "step": 1210, | |
| "token_acc": 0.8519945602901179, | |
| "train_speed(iter/s)": 0.040233 | |
| }, | |
| { | |
| "epoch": 1.098109810981098, | |
| "grad_norm": 2.425140857696533, | |
| "learning_rate": 1.4639618584991679e-05, | |
| "loss": 0.4048626899719238, | |
| "memory(GiB)": 74.54, | |
| "step": 1220, | |
| "token_acc": 0.8575699338031176, | |
| "train_speed(iter/s)": 0.040237 | |
| }, | |
| { | |
| "epoch": 1.1071107110711071, | |
| "grad_norm": 1.9179662466049194, | |
| "learning_rate": 1.455331997363086e-05, | |
| "loss": 0.41301331520080564, | |
| "memory(GiB)": 74.54, | |
| "step": 1230, | |
| "token_acc": 0.8553283100107643, | |
| "train_speed(iter/s)": 0.040242 | |
| }, | |
| { | |
| "epoch": 1.116111611161116, | |
| "grad_norm": 2.332228660583496, | |
| "learning_rate": 1.4466591417085462e-05, | |
| "loss": 0.4197710037231445, | |
| "memory(GiB)": 74.54, | |
| "step": 1240, | |
| "token_acc": 0.8447427293064877, | |
| "train_speed(iter/s)": 0.040246 | |
| }, | |
| { | |
| "epoch": 1.125112511251125, | |
| "grad_norm": 2.093475580215454, | |
| "learning_rate": 1.4379441104660313e-05, | |
| "loss": 0.4093982696533203, | |
| "memory(GiB)": 74.54, | |
| "step": 1250, | |
| "token_acc": 0.8562723261189326, | |
| "train_speed(iter/s)": 0.040245 | |
| }, | |
| { | |
| "epoch": 1.1341134113411342, | |
| "grad_norm": 2.2746119499206543, | |
| "learning_rate": 1.4291877265484352e-05, | |
| "loss": 0.4102977752685547, | |
| "memory(GiB)": 74.54, | |
| "step": 1260, | |
| "token_acc": 0.854287556415216, | |
| "train_speed(iter/s)": 0.040249 | |
| }, | |
| { | |
| "epoch": 1.143114311431143, | |
| "grad_norm": 2.2232649326324463, | |
| "learning_rate": 1.4203908167733596e-05, | |
| "loss": 0.418546724319458, | |
| "memory(GiB)": 74.54, | |
| "step": 1270, | |
| "token_acc": 0.8427280550774526, | |
| "train_speed(iter/s)": 0.040255 | |
| }, | |
| { | |
| "epoch": 1.1521152115211521, | |
| "grad_norm": 1.9787334203720093, | |
| "learning_rate": 1.4115542117850415e-05, | |
| "loss": 0.410016393661499, | |
| "memory(GiB)": 74.54, | |
| "step": 1280, | |
| "token_acc": 0.86048545812377, | |
| "train_speed(iter/s)": 0.040258 | |
| }, | |
| { | |
| "epoch": 1.161116111611161, | |
| "grad_norm": 2.3660764694213867, | |
| "learning_rate": 1.4026787459759215e-05, | |
| "loss": 0.4094221591949463, | |
| "memory(GiB)": 74.54, | |
| "step": 1290, | |
| "token_acc": 0.8500684618895481, | |
| "train_speed(iter/s)": 0.040257 | |
| }, | |
| { | |
| "epoch": 1.17011701170117, | |
| "grad_norm": 2.0939202308654785, | |
| "learning_rate": 1.3937652574078543e-05, | |
| "loss": 0.40435123443603516, | |
| "memory(GiB)": 74.54, | |
| "step": 1300, | |
| "token_acc": 0.8442178346712953, | |
| "train_speed(iter/s)": 0.040258 | |
| }, | |
| { | |
| "epoch": 1.1791179117911792, | |
| "grad_norm": 2.3308207988739014, | |
| "learning_rate": 1.3848145877329778e-05, | |
| "loss": 0.4132570743560791, | |
| "memory(GiB)": 74.54, | |
| "step": 1310, | |
| "token_acc": 0.8504208935894668, | |
| "train_speed(iter/s)": 0.040261 | |
| }, | |
| { | |
| "epoch": 1.188118811881188, | |
| "grad_norm": 2.053710460662842, | |
| "learning_rate": 1.3758275821142382e-05, | |
| "loss": 0.39916296005249025, | |
| "memory(GiB)": 74.54, | |
| "step": 1320, | |
| "token_acc": 0.8543060651845457, | |
| "train_speed(iter/s)": 0.04026 | |
| }, | |
| { | |
| "epoch": 1.1971197119711972, | |
| "grad_norm": 2.4674737453460693, | |
| "learning_rate": 1.3668050891455873e-05, | |
| "loss": 0.3984804630279541, | |
| "memory(GiB)": 74.54, | |
| "step": 1330, | |
| "token_acc": 0.8585640138408305, | |
| "train_speed(iter/s)": 0.040259 | |
| }, | |
| { | |
| "epoch": 1.206120612061206, | |
| "grad_norm": 2.1947102546691895, | |
| "learning_rate": 1.357747960771854e-05, | |
| "loss": 0.42041912078857424, | |
| "memory(GiB)": 74.54, | |
| "step": 1340, | |
| "token_acc": 0.8391608391608392, | |
| "train_speed(iter/s)": 0.040262 | |
| }, | |
| { | |
| "epoch": 1.215121512151215, | |
| "grad_norm": 2.0035359859466553, | |
| "learning_rate": 1.3486570522082989e-05, | |
| "loss": 0.4119097709655762, | |
| "memory(GiB)": 74.54, | |
| "step": 1350, | |
| "token_acc": 0.8620765508139023, | |
| "train_speed(iter/s)": 0.040265 | |
| }, | |
| { | |
| "epoch": 1.2241224122412242, | |
| "grad_norm": 2.161275863647461, | |
| "learning_rate": 1.3395332218598629e-05, | |
| "loss": 0.4057816982269287, | |
| "memory(GiB)": 74.54, | |
| "step": 1360, | |
| "token_acc": 0.8410107334525939, | |
| "train_speed(iter/s)": 0.040268 | |
| }, | |
| { | |
| "epoch": 1.233123312331233, | |
| "grad_norm": 2.300550937652588, | |
| "learning_rate": 1.3303773312401107e-05, | |
| "loss": 0.40541529655456543, | |
| "memory(GiB)": 74.54, | |
| "step": 1370, | |
| "token_acc": 0.8559489773477018, | |
| "train_speed(iter/s)": 0.040269 | |
| }, | |
| { | |
| "epoch": 1.2421242124212422, | |
| "grad_norm": 2.306222915649414, | |
| "learning_rate": 1.3211902448898841e-05, | |
| "loss": 0.40516185760498047, | |
| "memory(GiB)": 74.54, | |
| "step": 1380, | |
| "token_acc": 0.8569854561480829, | |
| "train_speed(iter/s)": 0.04027 | |
| }, | |
| { | |
| "epoch": 1.251125112511251, | |
| "grad_norm": 2.1976640224456787, | |
| "learning_rate": 1.3119728302956676e-05, | |
| "loss": 0.4062767505645752, | |
| "memory(GiB)": 74.54, | |
| "step": 1390, | |
| "token_acc": 0.8493668073761387, | |
| "train_speed(iter/s)": 0.040273 | |
| }, | |
| { | |
| "epoch": 1.2601260126012601, | |
| "grad_norm": 2.333188056945801, | |
| "learning_rate": 1.302725957807676e-05, | |
| "loss": 0.39322872161865235, | |
| "memory(GiB)": 74.54, | |
| "step": 1400, | |
| "token_acc": 0.860806663743972, | |
| "train_speed(iter/s)": 0.040272 | |
| }, | |
| { | |
| "epoch": 1.2691269126912692, | |
| "grad_norm": 2.356128215789795, | |
| "learning_rate": 1.2934505005576738e-05, | |
| "loss": 0.39969046115875245, | |
| "memory(GiB)": 74.54, | |
| "step": 1410, | |
| "token_acc": 0.8573583279465632, | |
| "train_speed(iter/s)": 0.040268 | |
| }, | |
| { | |
| "epoch": 1.278127812781278, | |
| "grad_norm": 2.1411805152893066, | |
| "learning_rate": 1.2841473343765269e-05, | |
| "loss": 0.39504408836364746, | |
| "memory(GiB)": 74.54, | |
| "step": 1420, | |
| "token_acc": 0.8612200435729848, | |
| "train_speed(iter/s)": 0.040269 | |
| }, | |
| { | |
| "epoch": 1.2871287128712872, | |
| "grad_norm": 2.187964677810669, | |
| "learning_rate": 1.274817337711506e-05, | |
| "loss": 0.4120161056518555, | |
| "memory(GiB)": 74.54, | |
| "step": 1430, | |
| "token_acc": 0.849435382685069, | |
| "train_speed(iter/s)": 0.040272 | |
| }, | |
| { | |
| "epoch": 1.296129612961296, | |
| "grad_norm": 2.098618745803833, | |
| "learning_rate": 1.2654613915433373e-05, | |
| "loss": 0.39701004028320314, | |
| "memory(GiB)": 74.54, | |
| "step": 1440, | |
| "token_acc": 0.8512253307308609, | |
| "train_speed(iter/s)": 0.040274 | |
| }, | |
| { | |
| "epoch": 1.3051305130513051, | |
| "grad_norm": 2.000491142272949, | |
| "learning_rate": 1.2560803793030179e-05, | |
| "loss": 0.40303592681884765, | |
| "memory(GiB)": 74.54, | |
| "step": 1450, | |
| "token_acc": 0.8583260680034873, | |
| "train_speed(iter/s)": 0.040274 | |
| }, | |
| { | |
| "epoch": 1.3141314131413142, | |
| "grad_norm": 2.1380844116210938, | |
| "learning_rate": 1.2466751867883959e-05, | |
| "loss": 0.397491455078125, | |
| "memory(GiB)": 74.54, | |
| "step": 1460, | |
| "token_acc": 0.8592755214050494, | |
| "train_speed(iter/s)": 0.040276 | |
| }, | |
| { | |
| "epoch": 1.323132313231323, | |
| "grad_norm": 2.110633611679077, | |
| "learning_rate": 1.2372467020805332e-05, | |
| "loss": 0.4155548095703125, | |
| "memory(GiB)": 74.54, | |
| "step": 1470, | |
| "token_acc": 0.8501522401043932, | |
| "train_speed(iter/s)": 0.040278 | |
| }, | |
| { | |
| "epoch": 1.3321332133213322, | |
| "grad_norm": 2.1096761226654053, | |
| "learning_rate": 1.2277958154598444e-05, | |
| "loss": 0.41139373779296873, | |
| "memory(GiB)": 74.54, | |
| "step": 1480, | |
| "token_acc": 0.8384369287020109, | |
| "train_speed(iter/s)": 0.040279 | |
| }, | |
| { | |
| "epoch": 1.341134113411341, | |
| "grad_norm": 2.346917152404785, | |
| "learning_rate": 1.2183234193220362e-05, | |
| "loss": 0.3898932456970215, | |
| "memory(GiB)": 74.54, | |
| "step": 1490, | |
| "token_acc": 0.8620309050772627, | |
| "train_speed(iter/s)": 0.04028 | |
| }, | |
| { | |
| "epoch": 1.3501350135013501, | |
| "grad_norm": 2.1962385177612305, | |
| "learning_rate": 1.2088304080938404e-05, | |
| "loss": 0.3953920841217041, | |
| "memory(GiB)": 74.54, | |
| "step": 1500, | |
| "token_acc": 0.8660930950805207, | |
| "train_speed(iter/s)": 0.040278 | |
| }, | |
| { | |
| "epoch": 1.3501350135013501, | |
| "eval_loss": 0.42292386293411255, | |
| "eval_runtime": 112.5032, | |
| "eval_samples_per_second": 12.755, | |
| "eval_steps_per_second": 0.4, | |
| "eval_token_acc": 0.8482138517618469, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 1.3591359135913592, | |
| "grad_norm": 2.1046359539031982, | |
| "learning_rate": 1.1993176781485608e-05, | |
| "loss": 0.4179078578948975, | |
| "memory(GiB)": 74.54, | |
| "step": 1510, | |
| "token_acc": 0.8453704665904603, | |
| "train_speed(iter/s)": 0.040153 | |
| }, | |
| { | |
| "epoch": 1.368136813681368, | |
| "grad_norm": 2.0981786251068115, | |
| "learning_rate": 1.1897861277214304e-05, | |
| "loss": 0.38443617820739745, | |
| "memory(GiB)": 74.54, | |
| "step": 1520, | |
| "token_acc": 0.8514383855732074, | |
| "train_speed(iter/s)": 0.040151 | |
| }, | |
| { | |
| "epoch": 1.3771377137713772, | |
| "grad_norm": 2.335702419281006, | |
| "learning_rate": 1.1802366568247998e-05, | |
| "loss": 0.39206039905548096, | |
| "memory(GiB)": 74.54, | |
| "step": 1530, | |
| "token_acc": 0.8556973163220414, | |
| "train_speed(iter/s)": 0.040152 | |
| }, | |
| { | |
| "epoch": 1.386138613861386, | |
| "grad_norm": 2.2659618854522705, | |
| "learning_rate": 1.1706701671631504e-05, | |
| "loss": 0.39416942596435545, | |
| "memory(GiB)": 74.54, | |
| "step": 1540, | |
| "token_acc": 0.8575920934411501, | |
| "train_speed(iter/s)": 0.040154 | |
| }, | |
| { | |
| "epoch": 1.3951395139513951, | |
| "grad_norm": 2.3435161113739014, | |
| "learning_rate": 1.1610875620479531e-05, | |
| "loss": 0.4044766426086426, | |
| "memory(GiB)": 74.54, | |
| "step": 1550, | |
| "token_acc": 0.8510254676583277, | |
| "train_speed(iter/s)": 0.040156 | |
| }, | |
| { | |
| "epoch": 1.4041404140414042, | |
| "grad_norm": 2.155761241912842, | |
| "learning_rate": 1.1514897463123735e-05, | |
| "loss": 0.39972786903381347, | |
| "memory(GiB)": 74.54, | |
| "step": 1560, | |
| "token_acc": 0.858606101091071, | |
| "train_speed(iter/s)": 0.040158 | |
| }, | |
| { | |
| "epoch": 1.413141314131413, | |
| "grad_norm": 2.231323719024658, | |
| "learning_rate": 1.141877626225833e-05, | |
| "loss": 0.4081737518310547, | |
| "memory(GiB)": 74.54, | |
| "step": 1570, | |
| "token_acc": 0.8568965517241379, | |
| "train_speed(iter/s)": 0.040158 | |
| }, | |
| { | |
| "epoch": 1.4221422142214222, | |
| "grad_norm": 2.0848968029022217, | |
| "learning_rate": 1.1322521094084352e-05, | |
| "loss": 0.4104423999786377, | |
| "memory(GiB)": 74.54, | |
| "step": 1580, | |
| "token_acc": 0.8589771972548151, | |
| "train_speed(iter/s)": 0.04016 | |
| }, | |
| { | |
| "epoch": 1.431143114311431, | |
| "grad_norm": 2.1602284908294678, | |
| "learning_rate": 1.1226141047452628e-05, | |
| "loss": 0.39746341705322263, | |
| "memory(GiB)": 74.54, | |
| "step": 1590, | |
| "token_acc": 0.8528940745824755, | |
| "train_speed(iter/s)": 0.040163 | |
| }, | |
| { | |
| "epoch": 1.4401440144014401, | |
| "grad_norm": 2.202800750732422, | |
| "learning_rate": 1.1129645223005592e-05, | |
| "loss": 0.3975072383880615, | |
| "memory(GiB)": 74.54, | |
| "step": 1600, | |
| "token_acc": 0.85933056224021, | |
| "train_speed(iter/s)": 0.040165 | |
| }, | |
| { | |
| "epoch": 1.4491449144914492, | |
| "grad_norm": 2.0750746726989746, | |
| "learning_rate": 1.103304273231794e-05, | |
| "loss": 0.4078987598419189, | |
| "memory(GiB)": 74.54, | |
| "step": 1610, | |
| "token_acc": 0.8481820114820328, | |
| "train_speed(iter/s)": 0.040169 | |
| }, | |
| { | |
| "epoch": 1.458145814581458, | |
| "grad_norm": 2.0705268383026123, | |
| "learning_rate": 1.0936342697036276e-05, | |
| "loss": 0.40749187469482423, | |
| "memory(GiB)": 74.54, | |
| "step": 1620, | |
| "token_acc": 0.8431718061674008, | |
| "train_speed(iter/s)": 0.04017 | |
| }, | |
| { | |
| "epoch": 1.4671467146714672, | |
| "grad_norm": 2.2939624786376953, | |
| "learning_rate": 1.0839554248017816e-05, | |
| "loss": 0.39917492866516113, | |
| "memory(GiB)": 74.54, | |
| "step": 1630, | |
| "token_acc": 0.8533273981749387, | |
| "train_speed(iter/s)": 0.040171 | |
| }, | |
| { | |
| "epoch": 1.476147614761476, | |
| "grad_norm": 2.232426166534424, | |
| "learning_rate": 1.0742686524468193e-05, | |
| "loss": 0.3895902156829834, | |
| "memory(GiB)": 74.54, | |
| "step": 1640, | |
| "token_acc": 0.8666959964804224, | |
| "train_speed(iter/s)": 0.040172 | |
| }, | |
| { | |
| "epoch": 1.4851485148514851, | |
| "grad_norm": 2.317064046859741, | |
| "learning_rate": 1.0645748673078513e-05, | |
| "loss": 0.4001925468444824, | |
| "memory(GiB)": 74.54, | |
| "step": 1650, | |
| "token_acc": 0.8580047403576815, | |
| "train_speed(iter/s)": 0.040177 | |
| }, | |
| { | |
| "epoch": 1.4941494149414942, | |
| "grad_norm": 2.4603018760681152, | |
| "learning_rate": 1.0548749847161666e-05, | |
| "loss": 0.4078868865966797, | |
| "memory(GiB)": 74.54, | |
| "step": 1660, | |
| "token_acc": 0.8525682355469589, | |
| "train_speed(iter/s)": 0.04018 | |
| }, | |
| { | |
| "epoch": 1.5031503150315033, | |
| "grad_norm": 2.2700588703155518, | |
| "learning_rate": 1.0451699205788031e-05, | |
| "loss": 0.3826925277709961, | |
| "memory(GiB)": 74.54, | |
| "step": 1670, | |
| "token_acc": 0.8540529189416212, | |
| "train_speed(iter/s)": 0.040177 | |
| }, | |
| { | |
| "epoch": 1.5121512151215122, | |
| "grad_norm": 2.1843454837799072, | |
| "learning_rate": 1.0354605912920643e-05, | |
| "loss": 0.39476428031921384, | |
| "memory(GiB)": 74.54, | |
| "step": 1680, | |
| "token_acc": 0.8572723153602175, | |
| "train_speed(iter/s)": 0.040177 | |
| }, | |
| { | |
| "epoch": 1.521152115211521, | |
| "grad_norm": 2.183195114135742, | |
| "learning_rate": 1.0257479136549889e-05, | |
| "loss": 0.4017205715179443, | |
| "memory(GiB)": 74.54, | |
| "step": 1690, | |
| "token_acc": 0.858510389913612, | |
| "train_speed(iter/s)": 0.040177 | |
| }, | |
| { | |
| "epoch": 1.5301530153015301, | |
| "grad_norm": 2.2219948768615723, | |
| "learning_rate": 1.0160328047827805e-05, | |
| "loss": 0.3950798988342285, | |
| "memory(GiB)": 74.54, | |
| "step": 1700, | |
| "token_acc": 0.859968881973772, | |
| "train_speed(iter/s)": 0.04018 | |
| }, | |
| { | |
| "epoch": 1.5391539153915392, | |
| "grad_norm": 2.1306684017181396, | |
| "learning_rate": 1.006316182020213e-05, | |
| "loss": 0.3851861238479614, | |
| "memory(GiB)": 74.54, | |
| "step": 1710, | |
| "token_acc": 0.8605112384310268, | |
| "train_speed(iter/s)": 0.040185 | |
| }, | |
| { | |
| "epoch": 1.5481548154815483, | |
| "grad_norm": 2.3634705543518066, | |
| "learning_rate": 9.965989628550073e-06, | |
| "loss": 0.3927136421203613, | |
| "memory(GiB)": 74.54, | |
| "step": 1720, | |
| "token_acc": 0.8631741821396994, | |
| "train_speed(iter/s)": 0.040185 | |
| }, | |
| { | |
| "epoch": 1.5571557155715572, | |
| "grad_norm": 2.1868417263031006, | |
| "learning_rate": 9.868820648311998e-06, | |
| "loss": 0.3937791585922241, | |
| "memory(GiB)": 74.54, | |
| "step": 1730, | |
| "token_acc": 0.8506729331339458, | |
| "train_speed(iter/s)": 0.04019 | |
| }, | |
| { | |
| "epoch": 1.566156615661566, | |
| "grad_norm": 2.058154344558716, | |
| "learning_rate": 9.771664054625036e-06, | |
| "loss": 0.4051863193511963, | |
| "memory(GiB)": 74.54, | |
| "step": 1740, | |
| "token_acc": 0.8571127057830308, | |
| "train_speed(iter/s)": 0.04019 | |
| }, | |
| { | |
| "epoch": 1.5751575157515751, | |
| "grad_norm": 2.278233051300049, | |
| "learning_rate": 9.674529021456711e-06, | |
| "loss": 0.3995014429092407, | |
| "memory(GiB)": 74.54, | |
| "step": 1750, | |
| "token_acc": 0.8531134736385333, | |
| "train_speed(iter/s)": 0.04019 | |
| }, | |
| { | |
| "epoch": 1.5841584158415842, | |
| "grad_norm": 2.4994163513183594, | |
| "learning_rate": 9.577424720738725e-06, | |
| "loss": 0.3964822769165039, | |
| "memory(GiB)": 74.54, | |
| "step": 1760, | |
| "token_acc": 0.8614113159567705, | |
| "train_speed(iter/s)": 0.040189 | |
| }, | |
| { | |
| "epoch": 1.5931593159315933, | |
| "grad_norm": 2.2877440452575684, | |
| "learning_rate": 9.480360321500866e-06, | |
| "loss": 0.3912468433380127, | |
| "memory(GiB)": 74.54, | |
| "step": 1770, | |
| "token_acc": 0.8542329726288987, | |
| "train_speed(iter/s)": 0.04019 | |
| }, | |
| { | |
| "epoch": 1.6021602160216022, | |
| "grad_norm": 2.2842419147491455, | |
| "learning_rate": 9.38334498900525e-06, | |
| "loss": 0.396860408782959, | |
| "memory(GiB)": 74.54, | |
| "step": 1780, | |
| "token_acc": 0.8597612958226769, | |
| "train_speed(iter/s)": 0.040193 | |
| }, | |
| { | |
| "epoch": 1.611161116111611, | |
| "grad_norm": 2.171830415725708, | |
| "learning_rate": 9.28638788388088e-06, | |
| "loss": 0.39132468700408934, | |
| "memory(GiB)": 74.54, | |
| "step": 1790, | |
| "token_acc": 0.8446624087591241, | |
| "train_speed(iter/s)": 0.040193 | |
| }, | |
| { | |
| "epoch": 1.6201620162016201, | |
| "grad_norm": 2.2504782676696777, | |
| "learning_rate": 9.189498161258678e-06, | |
| "loss": 0.39133219718933104, | |
| "memory(GiB)": 74.54, | |
| "step": 1800, | |
| "token_acc": 0.8526747195858498, | |
| "train_speed(iter/s)": 0.040193 | |
| }, | |
| { | |
| "epoch": 1.6291629162916292, | |
| "grad_norm": 2.2380685806274414, | |
| "learning_rate": 9.092684969906994e-06, | |
| "loss": 0.39520695209503176, | |
| "memory(GiB)": 74.54, | |
| "step": 1810, | |
| "token_acc": 0.8510874389702618, | |
| "train_speed(iter/s)": 0.040195 | |
| }, | |
| { | |
| "epoch": 1.6381638163816383, | |
| "grad_norm": 2.3991379737854004, | |
| "learning_rate": 8.995957451367751e-06, | |
| "loss": 0.39344358444213867, | |
| "memory(GiB)": 74.54, | |
| "step": 1820, | |
| "token_acc": 0.8661971830985915, | |
| "train_speed(iter/s)": 0.040196 | |
| }, | |
| { | |
| "epoch": 1.6471647164716472, | |
| "grad_norm": 2.167818307876587, | |
| "learning_rate": 8.899324739093255e-06, | |
| "loss": 0.38270139694213867, | |
| "memory(GiB)": 74.54, | |
| "step": 1830, | |
| "token_acc": 0.8632143593975655, | |
| "train_speed(iter/s)": 0.040195 | |
| }, | |
| { | |
| "epoch": 1.656165616561656, | |
| "grad_norm": 2.1482577323913574, | |
| "learning_rate": 8.802795957583774e-06, | |
| "loss": 0.38856942653656007, | |
| "memory(GiB)": 74.54, | |
| "step": 1840, | |
| "token_acc": 0.8508108108108108, | |
| "train_speed(iter/s)": 0.040197 | |
| }, | |
| { | |
| "epoch": 1.6651665166516652, | |
| "grad_norm": 2.223714828491211, | |
| "learning_rate": 8.706380221525959e-06, | |
| "loss": 0.3878568172454834, | |
| "memory(GiB)": 74.54, | |
| "step": 1850, | |
| "token_acc": 0.8518351722585004, | |
| "train_speed(iter/s)": 0.040198 | |
| }, | |
| { | |
| "epoch": 1.6741674167416742, | |
| "grad_norm": 2.1293275356292725, | |
| "learning_rate": 8.610086634932195e-06, | |
| "loss": 0.3860627174377441, | |
| "memory(GiB)": 74.54, | |
| "step": 1860, | |
| "token_acc": 0.8636664460622104, | |
| "train_speed(iter/s)": 0.0402 | |
| }, | |
| { | |
| "epoch": 1.6831683168316833, | |
| "grad_norm": 2.2796740531921387, | |
| "learning_rate": 8.513924290280955e-06, | |
| "loss": 0.4010897636413574, | |
| "memory(GiB)": 74.54, | |
| "step": 1870, | |
| "token_acc": 0.8624, | |
| "train_speed(iter/s)": 0.040198 | |
| }, | |
| { | |
| "epoch": 1.6921692169216922, | |
| "grad_norm": 2.063302516937256, | |
| "learning_rate": 8.417902267658264e-06, | |
| "loss": 0.3978671312332153, | |
| "memory(GiB)": 74.54, | |
| "step": 1880, | |
| "token_acc": 0.8563941299790356, | |
| "train_speed(iter/s)": 0.040199 | |
| }, | |
| { | |
| "epoch": 1.701170117011701, | |
| "grad_norm": 2.589029550552368, | |
| "learning_rate": 8.322029633900293e-06, | |
| "loss": 0.4007380485534668, | |
| "memory(GiB)": 74.54, | |
| "step": 1890, | |
| "token_acc": 0.8558875219683656, | |
| "train_speed(iter/s)": 0.040201 | |
| }, | |
| { | |
| "epoch": 1.7101710171017102, | |
| "grad_norm": 2.1972382068634033, | |
| "learning_rate": 8.226315441737232e-06, | |
| "loss": 0.39293272495269777, | |
| "memory(GiB)": 74.54, | |
| "step": 1900, | |
| "token_acc": 0.8606382978723405, | |
| "train_speed(iter/s)": 0.040201 | |
| }, | |
| { | |
| "epoch": 1.7191719171917192, | |
| "grad_norm": 2.1070621013641357, | |
| "learning_rate": 8.130768728938503e-06, | |
| "loss": 0.4030153274536133, | |
| "memory(GiB)": 74.54, | |
| "step": 1910, | |
| "token_acc": 0.858612883309323, | |
| "train_speed(iter/s)": 0.040199 | |
| }, | |
| { | |
| "epoch": 1.7281728172817283, | |
| "grad_norm": 2.4515891075134277, | |
| "learning_rate": 8.035398517459367e-06, | |
| "loss": 0.3846758842468262, | |
| "memory(GiB)": 74.54, | |
| "step": 1920, | |
| "token_acc": 0.8604975587072774, | |
| "train_speed(iter/s)": 0.040203 | |
| }, | |
| { | |
| "epoch": 1.7371737173717372, | |
| "grad_norm": 2.4625024795532227, | |
| "learning_rate": 7.940213812589018e-06, | |
| "loss": 0.3977564096450806, | |
| "memory(GiB)": 74.54, | |
| "step": 1930, | |
| "token_acc": 0.8620689655172413, | |
| "train_speed(iter/s)": 0.040207 | |
| }, | |
| { | |
| "epoch": 1.746174617461746, | |
| "grad_norm": 2.358564853668213, | |
| "learning_rate": 7.84522360210028e-06, | |
| "loss": 0.3818389415740967, | |
| "memory(GiB)": 74.54, | |
| "step": 1940, | |
| "token_acc": 0.8622779519331244, | |
| "train_speed(iter/s)": 0.040208 | |
| }, | |
| { | |
| "epoch": 1.7551755175517552, | |
| "grad_norm": 2.43326473236084, | |
| "learning_rate": 7.750436855400924e-06, | |
| "loss": 0.40569381713867186, | |
| "memory(GiB)": 74.54, | |
| "step": 1950, | |
| "token_acc": 0.8431502316346791, | |
| "train_speed(iter/s)": 0.040209 | |
| }, | |
| { | |
| "epoch": 1.7641764176417642, | |
| "grad_norm": 2.141272783279419, | |
| "learning_rate": 7.655862522686759e-06, | |
| "loss": 0.4061896324157715, | |
| "memory(GiB)": 74.54, | |
| "step": 1960, | |
| "token_acc": 0.8561802484733628, | |
| "train_speed(iter/s)": 0.040213 | |
| }, | |
| { | |
| "epoch": 1.7731773177317733, | |
| "grad_norm": 2.1799638271331787, | |
| "learning_rate": 7.561509534096486e-06, | |
| "loss": 0.3843768835067749, | |
| "memory(GiB)": 74.54, | |
| "step": 1970, | |
| "token_acc": 0.8601476840456478, | |
| "train_speed(iter/s)": 0.040213 | |
| }, | |
| { | |
| "epoch": 1.7821782178217822, | |
| "grad_norm": 2.2130813598632812, | |
| "learning_rate": 7.467386798868492e-06, | |
| "loss": 0.383782172203064, | |
| "memory(GiB)": 74.54, | |
| "step": 1980, | |
| "token_acc": 0.8536738538831903, | |
| "train_speed(iter/s)": 0.040213 | |
| }, | |
| { | |
| "epoch": 1.791179117911791, | |
| "grad_norm": 2.2999327182769775, | |
| "learning_rate": 7.373503204499589e-06, | |
| "loss": 0.3898015975952148, | |
| "memory(GiB)": 74.54, | |
| "step": 1990, | |
| "token_acc": 0.8597833014659019, | |
| "train_speed(iter/s)": 0.040213 | |
| }, | |
| { | |
| "epoch": 1.8001800180018002, | |
| "grad_norm": 2.0685296058654785, | |
| "learning_rate": 7.279867615905836e-06, | |
| "loss": 0.39383411407470703, | |
| "memory(GiB)": 74.54, | |
| "step": 2000, | |
| "token_acc": 0.8522530329289428, | |
| "train_speed(iter/s)": 0.040217 | |
| }, | |
| { | |
| "epoch": 1.8001800180018002, | |
| "eval_loss": 0.40739279985427856, | |
| "eval_runtime": 113.0562, | |
| "eval_samples_per_second": 12.693, | |
| "eval_steps_per_second": 0.398, | |
| "eval_token_acc": 0.8513244228432564, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 1.8091809180918093, | |
| "grad_norm": 2.3695876598358154, | |
| "learning_rate": 7.186488874585441e-06, | |
| "loss": 0.38712072372436523, | |
| "memory(GiB)": 76.18, | |
| "step": 2010, | |
| "token_acc": 0.8560460652591171, | |
| "train_speed(iter/s)": 0.040111 | |
| }, | |
| { | |
| "epoch": 1.8181818181818183, | |
| "grad_norm": 2.2949750423431396, | |
| "learning_rate": 7.093375797783935e-06, | |
| "loss": 0.38932750225067136, | |
| "memory(GiB)": 76.18, | |
| "step": 2020, | |
| "token_acc": 0.8515789473684211, | |
| "train_speed(iter/s)": 0.040113 | |
| }, | |
| { | |
| "epoch": 1.8271827182718272, | |
| "grad_norm": 2.102889060974121, | |
| "learning_rate": 7.0005371776615884e-06, | |
| "loss": 0.3895460844039917, | |
| "memory(GiB)": 76.18, | |
| "step": 2030, | |
| "token_acc": 0.8582169709989259, | |
| "train_speed(iter/s)": 0.040117 | |
| }, | |
| { | |
| "epoch": 1.836183618361836, | |
| "grad_norm": 2.2533607482910156, | |
| "learning_rate": 6.907981780463233e-06, | |
| "loss": 0.3849326133728027, | |
| "memory(GiB)": 76.18, | |
| "step": 2040, | |
| "token_acc": 0.8707364762111667, | |
| "train_speed(iter/s)": 0.040118 | |
| }, | |
| { | |
| "epoch": 1.8451845184518452, | |
| "grad_norm": 2.058211326599121, | |
| "learning_rate": 6.815718345690496e-06, | |
| "loss": 0.38345019817352294, | |
| "memory(GiB)": 76.18, | |
| "step": 2050, | |
| "token_acc": 0.85548358275631, | |
| "train_speed(iter/s)": 0.040122 | |
| }, | |
| { | |
| "epoch": 1.8541854185418543, | |
| "grad_norm": 2.466780424118042, | |
| "learning_rate": 6.72375558527659e-06, | |
| "loss": 0.38396077156066893, | |
| "memory(GiB)": 76.18, | |
| "step": 2060, | |
| "token_acc": 0.8563974591651543, | |
| "train_speed(iter/s)": 0.040122 | |
| }, | |
| { | |
| "epoch": 1.8631863186318633, | |
| "grad_norm": 2.325998544692993, | |
| "learning_rate": 6.632102182763681e-06, | |
| "loss": 0.3884021759033203, | |
| "memory(GiB)": 76.18, | |
| "step": 2070, | |
| "token_acc": 0.8589527027027027, | |
| "train_speed(iter/s)": 0.040123 | |
| }, | |
| { | |
| "epoch": 1.8721872187218722, | |
| "grad_norm": 2.3079795837402344, | |
| "learning_rate": 6.540766792482962e-06, | |
| "loss": 0.4022721290588379, | |
| "memory(GiB)": 76.18, | |
| "step": 2080, | |
| "token_acc": 0.8444188722669735, | |
| "train_speed(iter/s)": 0.040126 | |
| }, | |
| { | |
| "epoch": 1.881188118811881, | |
| "grad_norm": 2.305443525314331, | |
| "learning_rate": 6.449758038737458e-06, | |
| "loss": 0.3774123668670654, | |
| "memory(GiB)": 76.18, | |
| "step": 2090, | |
| "token_acc": 0.859161246916349, | |
| "train_speed(iter/s)": 0.040128 | |
| }, | |
| { | |
| "epoch": 1.8901890189018902, | |
| "grad_norm": 2.306131362915039, | |
| "learning_rate": 6.359084514987688e-06, | |
| "loss": 0.38950314521789553, | |
| "memory(GiB)": 76.18, | |
| "step": 2100, | |
| "token_acc": 0.8646680942184154, | |
| "train_speed(iter/s)": 0.040128 | |
| }, | |
| { | |
| "epoch": 1.8991899189918993, | |
| "grad_norm": 2.5018227100372314, | |
| "learning_rate": 6.268754783040228e-06, | |
| "loss": 0.3790890693664551, | |
| "memory(GiB)": 76.18, | |
| "step": 2110, | |
| "token_acc": 0.8660165359338563, | |
| "train_speed(iter/s)": 0.040128 | |
| }, | |
| { | |
| "epoch": 1.9081908190819084, | |
| "grad_norm": 2.1461129188537598, | |
| "learning_rate": 6.17877737223928e-06, | |
| "loss": 0.37567844390869143, | |
| "memory(GiB)": 76.18, | |
| "step": 2120, | |
| "token_acc": 0.8673469387755102, | |
| "train_speed(iter/s)": 0.040129 | |
| }, | |
| { | |
| "epoch": 1.9171917191719172, | |
| "grad_norm": 2.1912460327148438, | |
| "learning_rate": 6.089160778661262e-06, | |
| "loss": 0.37552733421325685, | |
| "memory(GiB)": 76.18, | |
| "step": 2130, | |
| "token_acc": 0.8715083798882681, | |
| "train_speed(iter/s)": 0.040128 | |
| }, | |
| { | |
| "epoch": 1.926192619261926, | |
| "grad_norm": 2.2097115516662598, | |
| "learning_rate": 5.999913464312606e-06, | |
| "loss": 0.37886598110198977, | |
| "memory(GiB)": 76.18, | |
| "step": 2140, | |
| "token_acc": 0.8663426488456865, | |
| "train_speed(iter/s)": 0.040129 | |
| }, | |
| { | |
| "epoch": 1.9351935193519352, | |
| "grad_norm": 2.239027976989746, | |
| "learning_rate": 5.911043856330701e-06, | |
| "loss": 0.4021574020385742, | |
| "memory(GiB)": 76.18, | |
| "step": 2150, | |
| "token_acc": 0.8618796662274923, | |
| "train_speed(iter/s)": 0.040132 | |
| }, | |
| { | |
| "epoch": 1.9441944194419443, | |
| "grad_norm": 2.1112523078918457, | |
| "learning_rate": 5.822560346188204e-06, | |
| "loss": 0.3870594024658203, | |
| "memory(GiB)": 76.18, | |
| "step": 2160, | |
| "token_acc": 0.8622662266226623, | |
| "train_speed(iter/s)": 0.040134 | |
| }, | |
| { | |
| "epoch": 1.9531953195319534, | |
| "grad_norm": 2.1353354454040527, | |
| "learning_rate": 5.7344712889006424e-06, | |
| "loss": 0.38895013332366946, | |
| "memory(GiB)": 76.18, | |
| "step": 2170, | |
| "token_acc": 0.8509840674789129, | |
| "train_speed(iter/s)": 0.040134 | |
| }, | |
| { | |
| "epoch": 1.9621962196219622, | |
| "grad_norm": 2.064527988433838, | |
| "learning_rate": 5.646785002237509e-06, | |
| "loss": 0.3719027519226074, | |
| "memory(GiB)": 76.18, | |
| "step": 2180, | |
| "token_acc": 0.8651858368154828, | |
| "train_speed(iter/s)": 0.040134 | |
| }, | |
| { | |
| "epoch": 1.971197119711971, | |
| "grad_norm": 2.2494568824768066, | |
| "learning_rate": 5.5595097659368765e-06, | |
| "loss": 0.37720603942871095, | |
| "memory(GiB)": 76.18, | |
| "step": 2190, | |
| "token_acc": 0.8660617844026788, | |
| "train_speed(iter/s)": 0.040134 | |
| }, | |
| { | |
| "epoch": 1.9801980198019802, | |
| "grad_norm": 2.422858715057373, | |
| "learning_rate": 5.472653820923564e-06, | |
| "loss": 0.3978924036026001, | |
| "memory(GiB)": 76.18, | |
| "step": 2200, | |
| "token_acc": 0.8567662565905096, | |
| "train_speed(iter/s)": 0.040138 | |
| }, | |
| { | |
| "epoch": 1.9891989198919893, | |
| "grad_norm": 2.5676939487457275, | |
| "learning_rate": 5.386225368530995e-06, | |
| "loss": 0.39810938835144044, | |
| "memory(GiB)": 76.18, | |
| "step": 2210, | |
| "token_acc": 0.8570179274158286, | |
| "train_speed(iter/s)": 0.04014 | |
| }, | |
| { | |
| "epoch": 1.9981998199819984, | |
| "grad_norm": 2.2991700172424316, | |
| "learning_rate": 5.300232569726805e-06, | |
| "loss": 0.3851327657699585, | |
| "memory(GiB)": 76.18, | |
| "step": 2220, | |
| "token_acc": 0.8624459120929173, | |
| "train_speed(iter/s)": 0.040141 | |
| }, | |
| { | |
| "epoch": 2.007200720072007, | |
| "grad_norm": 2.1788246631622314, | |
| "learning_rate": 5.2146835443422215e-06, | |
| "loss": 0.3738105773925781, | |
| "memory(GiB)": 76.18, | |
| "step": 2230, | |
| "token_acc": 0.8664259927797834, | |
| "train_speed(iter/s)": 0.04015 | |
| }, | |
| { | |
| "epoch": 2.016201620162016, | |
| "grad_norm": 2.2583391666412354, | |
| "learning_rate": 5.129586370305389e-06, | |
| "loss": 0.37696280479431155, | |
| "memory(GiB)": 76.18, | |
| "step": 2240, | |
| "token_acc": 0.8627628306579245, | |
| "train_speed(iter/s)": 0.040149 | |
| }, | |
| { | |
| "epoch": 2.025202520252025, | |
| "grad_norm": 2.3937697410583496, | |
| "learning_rate": 5.0449490828785745e-06, | |
| "loss": 0.35777480602264405, | |
| "memory(GiB)": 76.18, | |
| "step": 2250, | |
| "token_acc": 0.8723312486521457, | |
| "train_speed(iter/s)": 0.040148 | |
| }, | |
| { | |
| "epoch": 2.0342034203420343, | |
| "grad_norm": 2.3122761249542236, | |
| "learning_rate": 4.960779673899465e-06, | |
| "loss": 0.3647487163543701, | |
| "memory(GiB)": 76.18, | |
| "step": 2260, | |
| "token_acc": 0.8682050144220103, | |
| "train_speed(iter/s)": 0.04015 | |
| }, | |
| { | |
| "epoch": 2.0432043204320434, | |
| "grad_norm": 2.3489394187927246, | |
| "learning_rate": 4.8770860910265315e-06, | |
| "loss": 0.3610623836517334, | |
| "memory(GiB)": 76.18, | |
| "step": 2270, | |
| "token_acc": 0.8642826367944851, | |
| "train_speed(iter/s)": 0.040151 | |
| }, | |
| { | |
| "epoch": 2.052205220522052, | |
| "grad_norm": 2.564075469970703, | |
| "learning_rate": 4.793876236988593e-06, | |
| "loss": 0.3656606674194336, | |
| "memory(GiB)": 76.18, | |
| "step": 2280, | |
| "token_acc": 0.8674548848786559, | |
| "train_speed(iter/s)": 0.040152 | |
| }, | |
| { | |
| "epoch": 2.061206120612061, | |
| "grad_norm": 2.3542511463165283, | |
| "learning_rate": 4.711157968838577e-06, | |
| "loss": 0.38109097480773924, | |
| "memory(GiB)": 76.18, | |
| "step": 2290, | |
| "token_acc": 0.8542568542568543, | |
| "train_speed(iter/s)": 0.040154 | |
| }, | |
| { | |
| "epoch": 2.07020702070207, | |
| "grad_norm": 2.5607492923736572, | |
| "learning_rate": 4.628939097211641e-06, | |
| "loss": 0.3731189966201782, | |
| "memory(GiB)": 76.18, | |
| "step": 2300, | |
| "token_acc": 0.8808107512667989, | |
| "train_speed(iter/s)": 0.040155 | |
| }, | |
| { | |
| "epoch": 2.0792079207920793, | |
| "grad_norm": 2.4762189388275146, | |
| "learning_rate": 4.547227385587648e-06, | |
| "loss": 0.3798922300338745, | |
| "memory(GiB)": 76.18, | |
| "step": 2310, | |
| "token_acc": 0.8597145993413831, | |
| "train_speed(iter/s)": 0.040157 | |
| }, | |
| { | |
| "epoch": 2.0882088208820884, | |
| "grad_norm": 2.485635280609131, | |
| "learning_rate": 4.466030549558116e-06, | |
| "loss": 0.3755971670150757, | |
| "memory(GiB)": 76.18, | |
| "step": 2320, | |
| "token_acc": 0.8549968704360525, | |
| "train_speed(iter/s)": 0.040157 | |
| }, | |
| { | |
| "epoch": 2.097209720972097, | |
| "grad_norm": 2.2108871936798096, | |
| "learning_rate": 4.385356256097656e-06, | |
| "loss": 0.35892772674560547, | |
| "memory(GiB)": 76.18, | |
| "step": 2330, | |
| "token_acc": 0.8641063515509602, | |
| "train_speed(iter/s)": 0.040157 | |
| }, | |
| { | |
| "epoch": 2.106210621062106, | |
| "grad_norm": 2.559431791305542, | |
| "learning_rate": 4.305212122840038e-06, | |
| "loss": 0.36676650047302245, | |
| "memory(GiB)": 76.18, | |
| "step": 2340, | |
| "token_acc": 0.8685561258647624, | |
| "train_speed(iter/s)": 0.040159 | |
| }, | |
| { | |
| "epoch": 2.115211521152115, | |
| "grad_norm": 2.3263328075408936, | |
| "learning_rate": 4.22560571735889e-06, | |
| "loss": 0.3723811149597168, | |
| "memory(GiB)": 76.18, | |
| "step": 2350, | |
| "token_acc": 0.8562313908974905, | |
| "train_speed(iter/s)": 0.04016 | |
| }, | |
| { | |
| "epoch": 2.1242124212421243, | |
| "grad_norm": 2.4957282543182373, | |
| "learning_rate": 4.146544556453146e-06, | |
| "loss": 0.3725306987762451, | |
| "memory(GiB)": 76.18, | |
| "step": 2360, | |
| "token_acc": 0.8700726712177934, | |
| "train_speed(iter/s)": 0.040162 | |
| }, | |
| { | |
| "epoch": 2.1332133213321334, | |
| "grad_norm": 2.5752525329589844, | |
| "learning_rate": 4.068036105437259e-06, | |
| "loss": 0.3709956884384155, | |
| "memory(GiB)": 76.18, | |
| "step": 2370, | |
| "token_acc": 0.8635585970915313, | |
| "train_speed(iter/s)": 0.040163 | |
| }, | |
| { | |
| "epoch": 2.142214221422142, | |
| "grad_norm": 2.509699583053589, | |
| "learning_rate": 3.990087777436303e-06, | |
| "loss": 0.37915217876434326, | |
| "memory(GiB)": 76.18, | |
| "step": 2380, | |
| "token_acc": 0.8585365853658536, | |
| "train_speed(iter/s)": 0.040161 | |
| }, | |
| { | |
| "epoch": 2.151215121512151, | |
| "grad_norm": 2.5639617443084717, | |
| "learning_rate": 3.9127069326859815e-06, | |
| "loss": 0.36791577339172366, | |
| "memory(GiB)": 76.18, | |
| "step": 2390, | |
| "token_acc": 0.8695652173913043, | |
| "train_speed(iter/s)": 0.040161 | |
| }, | |
| { | |
| "epoch": 2.16021602160216, | |
| "grad_norm": 2.5950934886932373, | |
| "learning_rate": 3.835900877837665e-06, | |
| "loss": 0.37401318550109863, | |
| "memory(GiB)": 76.18, | |
| "step": 2400, | |
| "token_acc": 0.8627917026793431, | |
| "train_speed(iter/s)": 0.04016 | |
| }, | |
| { | |
| "epoch": 2.1692169216921693, | |
| "grad_norm": 2.627086639404297, | |
| "learning_rate": 3.7596768652684324e-06, | |
| "loss": 0.37379937171936034, | |
| "memory(GiB)": 76.18, | |
| "step": 2410, | |
| "token_acc": 0.8596715717637022, | |
| "train_speed(iter/s)": 0.040162 | |
| }, | |
| { | |
| "epoch": 2.1782178217821784, | |
| "grad_norm": 3.0903186798095703, | |
| "learning_rate": 3.6840420923962873e-06, | |
| "loss": 0.36346681118011476, | |
| "memory(GiB)": 76.18, | |
| "step": 2420, | |
| "token_acc": 0.8670668953687821, | |
| "train_speed(iter/s)": 0.040164 | |
| }, | |
| { | |
| "epoch": 2.187218721872187, | |
| "grad_norm": 2.4955599308013916, | |
| "learning_rate": 3.609003701000535e-06, | |
| "loss": 0.35879087448120117, | |
| "memory(GiB)": 76.18, | |
| "step": 2430, | |
| "token_acc": 0.8731778425655977, | |
| "train_speed(iter/s)": 0.040165 | |
| }, | |
| { | |
| "epoch": 2.196219621962196, | |
| "grad_norm": 2.3009448051452637, | |
| "learning_rate": 3.5345687765474444e-06, | |
| "loss": 0.37301011085510255, | |
| "memory(GiB)": 76.18, | |
| "step": 2440, | |
| "token_acc": 0.8637790332705587, | |
| "train_speed(iter/s)": 0.040167 | |
| }, | |
| { | |
| "epoch": 2.205220522052205, | |
| "grad_norm": 2.5973548889160156, | |
| "learning_rate": 3.4607443475211745e-06, | |
| "loss": 0.37910096645355223, | |
| "memory(GiB)": 76.18, | |
| "step": 2450, | |
| "token_acc": 0.862, | |
| "train_speed(iter/s)": 0.040169 | |
| }, | |
| { | |
| "epoch": 2.2142214221422143, | |
| "grad_norm": 2.7337653636932373, | |
| "learning_rate": 3.3875373847601365e-06, | |
| "loss": 0.36832966804504397, | |
| "memory(GiB)": 76.18, | |
| "step": 2460, | |
| "token_acc": 0.8709608843537415, | |
| "train_speed(iter/s)": 0.040171 | |
| }, | |
| { | |
| "epoch": 2.2232223222322234, | |
| "grad_norm": 2.4979779720306396, | |
| "learning_rate": 3.314954800798763e-06, | |
| "loss": 0.35463604927062986, | |
| "memory(GiB)": 76.18, | |
| "step": 2470, | |
| "token_acc": 0.8807906114885732, | |
| "train_speed(iter/s)": 0.040173 | |
| }, | |
| { | |
| "epoch": 2.232223222322232, | |
| "grad_norm": 2.651418685913086, | |
| "learning_rate": 3.24300344921481e-06, | |
| "loss": 0.3576260805130005, | |
| "memory(GiB)": 76.18, | |
| "step": 2480, | |
| "token_acc": 0.8673512154233026, | |
| "train_speed(iter/s)": 0.040173 | |
| }, | |
| { | |
| "epoch": 2.241224122412241, | |
| "grad_norm": 2.2821831703186035, | |
| "learning_rate": 3.1716901239821918e-06, | |
| "loss": 0.3680659294128418, | |
| "memory(GiB)": 76.18, | |
| "step": 2490, | |
| "token_acc": 0.8615550755939525, | |
| "train_speed(iter/s)": 0.040176 | |
| }, | |
| { | |
| "epoch": 2.25022502250225, | |
| "grad_norm": 2.532939910888672, | |
| "learning_rate": 3.1010215588294724e-06, | |
| "loss": 0.3763418674468994, | |
| "memory(GiB)": 76.18, | |
| "step": 2500, | |
| "token_acc": 0.8679738562091504, | |
| "train_speed(iter/s)": 0.040176 | |
| }, | |
| { | |
| "epoch": 2.25022502250225, | |
| "eval_loss": 0.39449381828308105, | |
| "eval_runtime": 112.8212, | |
| "eval_samples_per_second": 12.719, | |
| "eval_steps_per_second": 0.399, | |
| "eval_token_acc": 0.8566221142162819, | |
| "step": 2500 | |
| }, | |
| { | |
| "epoch": 2.2592259225922593, | |
| "grad_norm": 2.495901584625244, | |
| "learning_rate": 3.031004426604044e-06, | |
| "loss": 0.3614701271057129, | |
| "memory(GiB)": 76.18, | |
| "step": 2510, | |
| "token_acc": 0.8576721210250077, | |
| "train_speed(iter/s)": 0.040102 | |
| }, | |
| { | |
| "epoch": 2.2682268226822684, | |
| "grad_norm": 2.6652517318725586, | |
| "learning_rate": 2.961645338642032e-06, | |
| "loss": 0.3705326557159424, | |
| "memory(GiB)": 76.18, | |
| "step": 2520, | |
| "token_acc": 0.8555579261787924, | |
| "train_speed(iter/s)": 0.040101 | |
| }, | |
| { | |
| "epoch": 2.2772277227722775, | |
| "grad_norm": 2.2919044494628906, | |
| "learning_rate": 2.892950844144028e-06, | |
| "loss": 0.3567212581634521, | |
| "memory(GiB)": 76.18, | |
| "step": 2530, | |
| "token_acc": 0.8672348060103162, | |
| "train_speed(iter/s)": 0.0401 | |
| }, | |
| { | |
| "epoch": 2.286228622862286, | |
| "grad_norm": 2.7642829418182373, | |
| "learning_rate": 2.8249274295566863e-06, | |
| "loss": 0.3735655784606934, | |
| "memory(GiB)": 76.18, | |
| "step": 2540, | |
| "token_acc": 0.8645260611392127, | |
| "train_speed(iter/s)": 0.040102 | |
| }, | |
| { | |
| "epoch": 2.295229522952295, | |
| "grad_norm": 2.2890052795410156, | |
| "learning_rate": 2.7575815179602527e-06, | |
| "loss": 0.36810617446899413, | |
| "memory(GiB)": 76.18, | |
| "step": 2550, | |
| "token_acc": 0.8708510638297873, | |
| "train_speed(iter/s)": 0.040105 | |
| }, | |
| { | |
| "epoch": 2.3042304230423043, | |
| "grad_norm": 2.5169107913970947, | |
| "learning_rate": 2.6909194684620453e-06, | |
| "loss": 0.3683924674987793, | |
| "memory(GiB)": 76.18, | |
| "step": 2560, | |
| "token_acc": 0.8675250982103885, | |
| "train_speed(iter/s)": 0.040108 | |
| }, | |
| { | |
| "epoch": 2.3132313231323134, | |
| "grad_norm": 2.696864128112793, | |
| "learning_rate": 2.6249475755960185e-06, | |
| "loss": 0.3705678701400757, | |
| "memory(GiB)": 76.18, | |
| "step": 2570, | |
| "token_acc": 0.8628597122302158, | |
| "train_speed(iter/s)": 0.040109 | |
| }, | |
| { | |
| "epoch": 2.322232223222322, | |
| "grad_norm": 2.4484846591949463, | |
| "learning_rate": 2.559672068728398e-06, | |
| "loss": 0.36278524398803713, | |
| "memory(GiB)": 76.18, | |
| "step": 2580, | |
| "token_acc": 0.8645696810834426, | |
| "train_speed(iter/s)": 0.04011 | |
| }, | |
| { | |
| "epoch": 2.331233123312331, | |
| "grad_norm": 2.4576802253723145, | |
| "learning_rate": 2.4950991114694755e-06, | |
| "loss": 0.3606465578079224, | |
| "memory(GiB)": 76.18, | |
| "step": 2590, | |
| "token_acc": 0.8734927015020097, | |
| "train_speed(iter/s)": 0.040113 | |
| }, | |
| { | |
| "epoch": 2.34023402340234, | |
| "grad_norm": 2.6191623210906982, | |
| "learning_rate": 2.4312348010916088e-06, | |
| "loss": 0.36288201808929443, | |
| "memory(GiB)": 76.18, | |
| "step": 2600, | |
| "token_acc": 0.8631202691337259, | |
| "train_speed(iter/s)": 0.040113 | |
| }, | |
| { | |
| "epoch": 2.3492349234923493, | |
| "grad_norm": 2.6887686252593994, | |
| "learning_rate": 2.3680851679535024e-06, | |
| "loss": 0.3752190589904785, | |
| "memory(GiB)": 76.18, | |
| "step": 2610, | |
| "token_acc": 0.8617521367521368, | |
| "train_speed(iter/s)": 0.040114 | |
| }, | |
| { | |
| "epoch": 2.3582358235823584, | |
| "grad_norm": 2.481362819671631, | |
| "learning_rate": 2.305656174930776e-06, | |
| "loss": 0.36593198776245117, | |
| "memory(GiB)": 76.18, | |
| "step": 2620, | |
| "token_acc": 0.8668838219326819, | |
| "train_speed(iter/s)": 0.040116 | |
| }, | |
| { | |
| "epoch": 2.3672367236723675, | |
| "grad_norm": 2.629666328430176, | |
| "learning_rate": 2.243953716852938e-06, | |
| "loss": 0.3610795021057129, | |
| "memory(GiB)": 76.18, | |
| "step": 2630, | |
| "token_acc": 0.8612348822406111, | |
| "train_speed(iter/s)": 0.040117 | |
| }, | |
| { | |
| "epoch": 2.376237623762376, | |
| "grad_norm": 2.433375597000122, | |
| "learning_rate": 2.1829836199467568e-06, | |
| "loss": 0.3648895263671875, | |
| "memory(GiB)": 76.18, | |
| "step": 2640, | |
| "token_acc": 0.8715654952076677, | |
| "train_speed(iter/s)": 0.040119 | |
| }, | |
| { | |
| "epoch": 2.385238523852385, | |
| "grad_norm": 2.5231969356536865, | |
| "learning_rate": 2.1227516412861303e-06, | |
| "loss": 0.34891419410705565, | |
| "memory(GiB)": 76.18, | |
| "step": 2650, | |
| "token_acc": 0.8747478822105688, | |
| "train_speed(iter/s)": 0.040119 | |
| }, | |
| { | |
| "epoch": 2.3942394239423943, | |
| "grad_norm": 2.6941776275634766, | |
| "learning_rate": 2.063263468248472e-06, | |
| "loss": 0.35621964931488037, | |
| "memory(GiB)": 76.18, | |
| "step": 2660, | |
| "token_acc": 0.8614357262103506, | |
| "train_speed(iter/s)": 0.040119 | |
| }, | |
| { | |
| "epoch": 2.4032403240324034, | |
| "grad_norm": 2.4811367988586426, | |
| "learning_rate": 2.0045247179776927e-06, | |
| "loss": 0.36508636474609374, | |
| "memory(GiB)": 76.18, | |
| "step": 2670, | |
| "token_acc": 0.865956984575277, | |
| "train_speed(iter/s)": 0.040122 | |
| }, | |
| { | |
| "epoch": 2.412241224122412, | |
| "grad_norm": 2.5584983825683594, | |
| "learning_rate": 1.946540936853787e-06, | |
| "loss": 0.36142873764038086, | |
| "memory(GiB)": 76.18, | |
| "step": 2680, | |
| "token_acc": 0.8618881118881119, | |
| "train_speed(iter/s)": 0.040122 | |
| }, | |
| { | |
| "epoch": 2.421242124212421, | |
| "grad_norm": 2.639416217803955, | |
| "learning_rate": 1.8893175999691315e-06, | |
| "loss": 0.3669375658035278, | |
| "memory(GiB)": 76.18, | |
| "step": 2690, | |
| "token_acc": 0.8706407137064072, | |
| "train_speed(iter/s)": 0.040123 | |
| }, | |
| { | |
| "epoch": 2.43024302430243, | |
| "grad_norm": 2.526108980178833, | |
| "learning_rate": 1.8328601106114974e-06, | |
| "loss": 0.36782519817352294, | |
| "memory(GiB)": 76.18, | |
| "step": 2700, | |
| "token_acc": 0.8681867535287731, | |
| "train_speed(iter/s)": 0.040125 | |
| }, | |
| { | |
| "epoch": 2.4392439243924393, | |
| "grad_norm": 2.4853765964508057, | |
| "learning_rate": 1.7771737997538551e-06, | |
| "loss": 0.3661306858062744, | |
| "memory(GiB)": 76.18, | |
| "step": 2710, | |
| "token_acc": 0.8591703056768559, | |
| "train_speed(iter/s)": 0.040126 | |
| }, | |
| { | |
| "epoch": 2.4482448244824484, | |
| "grad_norm": 2.546694040298462, | |
| "learning_rate": 1.7222639255509855e-06, | |
| "loss": 0.3565016269683838, | |
| "memory(GiB)": 76.18, | |
| "step": 2720, | |
| "token_acc": 0.8700276536907041, | |
| "train_speed(iter/s)": 0.040126 | |
| }, | |
| { | |
| "epoch": 2.4572457245724575, | |
| "grad_norm": 2.6145668029785156, | |
| "learning_rate": 1.6681356728429909e-06, | |
| "loss": 0.3617668628692627, | |
| "memory(GiB)": 76.18, | |
| "step": 2730, | |
| "token_acc": 0.8759859772129711, | |
| "train_speed(iter/s)": 0.040127 | |
| }, | |
| { | |
| "epoch": 2.466246624662466, | |
| "grad_norm": 2.4962821006774902, | |
| "learning_rate": 1.6147941526657151e-06, | |
| "loss": 0.36135101318359375, | |
| "memory(GiB)": 76.18, | |
| "step": 2740, | |
| "token_acc": 0.8689489751417357, | |
| "train_speed(iter/s)": 0.040127 | |
| }, | |
| { | |
| "epoch": 2.4752475247524752, | |
| "grad_norm": 2.476327896118164, | |
| "learning_rate": 1.5622444017681438e-06, | |
| "loss": 0.3584137916564941, | |
| "memory(GiB)": 76.18, | |
| "step": 2750, | |
| "token_acc": 0.8637279033340792, | |
| "train_speed(iter/s)": 0.040128 | |
| }, | |
| { | |
| "epoch": 2.4842484248424843, | |
| "grad_norm": 2.5135715007781982, | |
| "learning_rate": 1.5104913821367995e-06, | |
| "loss": 0.352571439743042, | |
| "memory(GiB)": 76.18, | |
| "step": 2760, | |
| "token_acc": 0.8638624119353502, | |
| "train_speed(iter/s)": 0.040127 | |
| }, | |
| { | |
| "epoch": 2.4932493249324934, | |
| "grad_norm": 2.535942316055298, | |
| "learning_rate": 1.4595399805272138e-06, | |
| "loss": 0.35703449249267577, | |
| "memory(GiB)": 76.18, | |
| "step": 2770, | |
| "token_acc": 0.8715143715143715, | |
| "train_speed(iter/s)": 0.040129 | |
| }, | |
| { | |
| "epoch": 2.502250225022502, | |
| "grad_norm": 2.5901577472686768, | |
| "learning_rate": 1.409395008002501e-06, | |
| "loss": 0.3632636070251465, | |
| "memory(GiB)": 76.18, | |
| "step": 2780, | |
| "token_acc": 0.8740141137401412, | |
| "train_speed(iter/s)": 0.040131 | |
| }, | |
| { | |
| "epoch": 2.511251125112511, | |
| "grad_norm": 2.4865550994873047, | |
| "learning_rate": 1.3600611994790737e-06, | |
| "loss": 0.36820478439331056, | |
| "memory(GiB)": 76.18, | |
| "step": 2790, | |
| "token_acc": 0.8674225904928042, | |
| "train_speed(iter/s)": 0.040131 | |
| }, | |
| { | |
| "epoch": 2.5202520252025202, | |
| "grad_norm": 2.745784044265747, | |
| "learning_rate": 1.311543213279548e-06, | |
| "loss": 0.36357576847076417, | |
| "memory(GiB)": 76.18, | |
| "step": 2800, | |
| "token_acc": 0.8688079619995476, | |
| "train_speed(iter/s)": 0.040134 | |
| }, | |
| { | |
| "epoch": 2.5292529252925293, | |
| "grad_norm": 2.613213300704956, | |
| "learning_rate": 1.2638456306928838e-06, | |
| "loss": 0.35836281776428225, | |
| "memory(GiB)": 76.18, | |
| "step": 2810, | |
| "token_acc": 0.8775203775203775, | |
| "train_speed(iter/s)": 0.040135 | |
| }, | |
| { | |
| "epoch": 2.5382538253825384, | |
| "grad_norm": 2.856757879257202, | |
| "learning_rate": 1.2169729555418008e-06, | |
| "loss": 0.35776748657226565, | |
| "memory(GiB)": 76.18, | |
| "step": 2820, | |
| "token_acc": 0.8681778169014085, | |
| "train_speed(iter/s)": 0.040136 | |
| }, | |
| { | |
| "epoch": 2.5472547254725475, | |
| "grad_norm": 2.5222392082214355, | |
| "learning_rate": 1.1709296137575088e-06, | |
| "loss": 0.357517409324646, | |
| "memory(GiB)": 76.18, | |
| "step": 2830, | |
| "token_acc": 0.8692437684833122, | |
| "train_speed(iter/s)": 0.040138 | |
| }, | |
| { | |
| "epoch": 2.556255625562556, | |
| "grad_norm": 2.6644461154937744, | |
| "learning_rate": 1.1257199529617846e-06, | |
| "loss": 0.3525848388671875, | |
| "memory(GiB)": 76.18, | |
| "step": 2840, | |
| "token_acc": 0.8726828274597678, | |
| "train_speed(iter/s)": 0.04014 | |
| }, | |
| { | |
| "epoch": 2.5652565256525652, | |
| "grad_norm": 3.0361390113830566, | |
| "learning_rate": 1.0813482420564569e-06, | |
| "loss": 0.36429810523986816, | |
| "memory(GiB)": 76.18, | |
| "step": 2850, | |
| "token_acc": 0.8605402909258831, | |
| "train_speed(iter/s)": 0.040142 | |
| }, | |
| { | |
| "epoch": 2.5742574257425743, | |
| "grad_norm": 2.2939305305480957, | |
| "learning_rate": 1.0378186708203097e-06, | |
| "loss": 0.3595736026763916, | |
| "memory(GiB)": 76.18, | |
| "step": 2860, | |
| "token_acc": 0.8699784017278618, | |
| "train_speed(iter/s)": 0.040145 | |
| }, | |
| { | |
| "epoch": 2.5832583258325834, | |
| "grad_norm": 2.8929970264434814, | |
| "learning_rate": 9.951353495134741e-07, | |
| "loss": 0.3722720146179199, | |
| "memory(GiB)": 76.18, | |
| "step": 2870, | |
| "token_acc": 0.8633415343323642, | |
| "train_speed(iter/s)": 0.040147 | |
| }, | |
| { | |
| "epoch": 2.592259225922592, | |
| "grad_norm": 2.766711711883545, | |
| "learning_rate": 9.533023084893112e-07, | |
| "loss": 0.3628982067108154, | |
| "memory(GiB)": 76.18, | |
| "step": 2880, | |
| "token_acc": 0.8731262220291115, | |
| "train_speed(iter/s)": 0.040148 | |
| }, | |
| { | |
| "epoch": 2.601260126012601, | |
| "grad_norm": 2.6322643756866455, | |
| "learning_rate": 9.123234978138485e-07, | |
| "loss": 0.3563962459564209, | |
| "memory(GiB)": 76.18, | |
| "step": 2890, | |
| "token_acc": 0.8709398007795582, | |
| "train_speed(iter/s)": 0.040149 | |
| }, | |
| { | |
| "epoch": 2.6102610261026102, | |
| "grad_norm": 2.3969507217407227, | |
| "learning_rate": 8.722027868927973e-07, | |
| "loss": 0.3593640089035034, | |
| "memory(GiB)": 76.18, | |
| "step": 2900, | |
| "token_acc": 0.8687513763488218, | |
| "train_speed(iter/s)": 0.040149 | |
| }, | |
| { | |
| "epoch": 2.6192619261926193, | |
| "grad_norm": 2.662048101425171, | |
| "learning_rate": 8.32943964106192e-07, | |
| "loss": 0.36847290992736814, | |
| "memory(GiB)": 76.18, | |
| "step": 2910, | |
| "token_acc": 0.8610752688172043, | |
| "train_speed(iter/s)": 0.040152 | |
| }, | |
| { | |
| "epoch": 2.6282628262826284, | |
| "grad_norm": 2.6064634323120117, | |
| "learning_rate": 7.945507364506632e-07, | |
| "loss": 0.3641893625259399, | |
| "memory(GiB)": 76.18, | |
| "step": 2920, | |
| "token_acc": 0.8610666056305791, | |
| "train_speed(iter/s)": 0.040154 | |
| }, | |
| { | |
| "epoch": 2.6372637263726375, | |
| "grad_norm": 2.4192819595336914, | |
| "learning_rate": 7.57026729189414e-07, | |
| "loss": 0.3702700138092041, | |
| "memory(GiB)": 76.18, | |
| "step": 2930, | |
| "token_acc": 0.8613074204946997, | |
| "train_speed(iter/s)": 0.040157 | |
| }, | |
| { | |
| "epoch": 2.646264626462646, | |
| "grad_norm": 2.3483784198760986, | |
| "learning_rate": 7.203754855099009e-07, | |
| "loss": 0.36264016628265383, | |
| "memory(GiB)": 76.18, | |
| "step": 2940, | |
| "token_acc": 0.8588575238941987, | |
| "train_speed(iter/s)": 0.04016 | |
| }, | |
| { | |
| "epoch": 2.6552655265526552, | |
| "grad_norm": 2.5846633911132812, | |
| "learning_rate": 6.846004661892813e-07, | |
| "loss": 0.37308740615844727, | |
| "memory(GiB)": 76.18, | |
| "step": 2950, | |
| "token_acc": 0.8615806304248516, | |
| "train_speed(iter/s)": 0.040161 | |
| }, | |
| { | |
| "epoch": 2.6642664266426643, | |
| "grad_norm": 2.6962997913360596, | |
| "learning_rate": 6.497050492676126e-07, | |
| "loss": 0.36321473121643066, | |
| "memory(GiB)": 76.18, | |
| "step": 2960, | |
| "token_acc": 0.8618261826182618, | |
| "train_speed(iter/s)": 0.040163 | |
| }, | |
| { | |
| "epoch": 2.6732673267326734, | |
| "grad_norm": 2.416895627975464, | |
| "learning_rate": 6.156925297288996e-07, | |
| "loss": 0.34958364963531496, | |
| "memory(GiB)": 76.18, | |
| "step": 2970, | |
| "token_acc": 0.8714713430282293, | |
| "train_speed(iter/s)": 0.040164 | |
| }, | |
| { | |
| "epoch": 2.682268226822682, | |
| "grad_norm": 2.3380393981933594, | |
| "learning_rate": 5.825661191899534e-07, | |
| "loss": 0.36399097442626954, | |
| "memory(GiB)": 76.18, | |
| "step": 2980, | |
| "token_acc": 0.8697334479793637, | |
| "train_speed(iter/s)": 0.040165 | |
| }, | |
| { | |
| "epoch": 2.691269126912691, | |
| "grad_norm": 2.4997997283935547, | |
| "learning_rate": 5.503289455971495e-07, | |
| "loss": 0.3497540235519409, | |
| "memory(GiB)": 76.18, | |
| "step": 2990, | |
| "token_acc": 0.8589799476896252, | |
| "train_speed(iter/s)": 0.040167 | |
| }, | |
| { | |
| "epoch": 2.7002700270027002, | |
| "grad_norm": 2.7024405002593994, | |
| "learning_rate": 5.18984052931063e-07, | |
| "loss": 0.36266303062438965, | |
| "memory(GiB)": 76.18, | |
| "step": 3000, | |
| "token_acc": 0.8634655532359081, | |
| "train_speed(iter/s)": 0.040168 | |
| }, | |
| { | |
| "epoch": 2.7002700270027002, | |
| "eval_loss": 0.3909822702407837, | |
| "eval_runtime": 113.741, | |
| "eval_samples_per_second": 12.616, | |
| "eval_steps_per_second": 0.396, | |
| "eval_token_acc": 0.8578371810449574, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 2.7092709270927093, | |
| "grad_norm": 2.7375988960266113, | |
| "learning_rate": 4.885344009190429e-07, | |
| "loss": 0.36505513191223143, | |
| "memory(GiB)": 76.18, | |
| "step": 3010, | |
| "token_acc": 0.8647040722125346, | |
| "train_speed(iter/s)": 0.040096 | |
| }, | |
| { | |
| "epoch": 2.7182718271827184, | |
| "grad_norm": 2.5784595012664795, | |
| "learning_rate": 4.5898286475574483e-07, | |
| "loss": 0.36314241886138915, | |
| "memory(GiB)": 76.18, | |
| "step": 3020, | |
| "token_acc": 0.8750795334040297, | |
| "train_speed(iter/s)": 0.040096 | |
| }, | |
| { | |
| "epoch": 2.7272727272727275, | |
| "grad_norm": 2.59897518157959, | |
| "learning_rate": 4.30332234831643e-07, | |
| "loss": 0.3617940664291382, | |
| "memory(GiB)": 76.18, | |
| "step": 3030, | |
| "token_acc": 0.8697020562316408, | |
| "train_speed(iter/s)": 0.040097 | |
| }, | |
| { | |
| "epoch": 2.736273627362736, | |
| "grad_norm": 2.331024646759033, | |
| "learning_rate": 4.025852164695432e-07, | |
| "loss": 0.35245676040649415, | |
| "memory(GiB)": 76.18, | |
| "step": 3040, | |
| "token_acc": 0.8609855820959759, | |
| "train_speed(iter/s)": 0.040098 | |
| }, | |
| { | |
| "epoch": 2.7452745274527453, | |
| "grad_norm": 2.9060468673706055, | |
| "learning_rate": 3.7574442966913816e-07, | |
| "loss": 0.37049217224121095, | |
| "memory(GiB)": 76.18, | |
| "step": 3050, | |
| "token_acc": 0.8594235033259423, | |
| "train_speed(iter/s)": 0.040099 | |
| }, | |
| { | |
| "epoch": 2.7542754275427543, | |
| "grad_norm": 2.7476565837860107, | |
| "learning_rate": 3.498124088596133e-07, | |
| "loss": 0.35335454940795896, | |
| "memory(GiB)": 76.18, | |
| "step": 3060, | |
| "token_acc": 0.8769035532994924, | |
| "train_speed(iter/s)": 0.040098 | |
| }, | |
| { | |
| "epoch": 2.7632763276327634, | |
| "grad_norm": 2.47446346282959, | |
| "learning_rate": 3.2479160266033595e-07, | |
| "loss": 0.3646056652069092, | |
| "memory(GiB)": 76.18, | |
| "step": 3070, | |
| "token_acc": 0.8609637488947833, | |
| "train_speed(iter/s)": 0.040099 | |
| }, | |
| { | |
| "epoch": 2.772277227722772, | |
| "grad_norm": 2.518899440765381, | |
| "learning_rate": 3.0068437364964563e-07, | |
| "loss": 0.36437718868255614, | |
| "memory(GiB)": 76.18, | |
| "step": 3080, | |
| "token_acc": 0.8751534997953336, | |
| "train_speed(iter/s)": 0.040101 | |
| }, | |
| { | |
| "epoch": 2.781278127812781, | |
| "grad_norm": 2.4832963943481445, | |
| "learning_rate": 2.774929981417662e-07, | |
| "loss": 0.36618633270263673, | |
| "memory(GiB)": 76.18, | |
| "step": 3090, | |
| "token_acc": 0.8648288128056915, | |
| "train_speed(iter/s)": 0.040101 | |
| }, | |
| { | |
| "epoch": 2.7902790279027903, | |
| "grad_norm": 2.6481244564056396, | |
| "learning_rate": 2.5521966597186976e-07, | |
| "loss": 0.3651879787445068, | |
| "memory(GiB)": 76.18, | |
| "step": 3100, | |
| "token_acc": 0.8597326082030364, | |
| "train_speed(iter/s)": 0.040102 | |
| }, | |
| { | |
| "epoch": 2.7992799279927993, | |
| "grad_norm": 2.6947715282440186, | |
| "learning_rate": 2.3386648028930093e-07, | |
| "loss": 0.35363340377807617, | |
| "memory(GiB)": 76.18, | |
| "step": 3110, | |
| "token_acc": 0.8761429758935994, | |
| "train_speed(iter/s)": 0.040104 | |
| }, | |
| { | |
| "epoch": 2.8082808280828084, | |
| "grad_norm": 2.7126548290252686, | |
| "learning_rate": 2.134354573589825e-07, | |
| "loss": 0.3739881753921509, | |
| "memory(GiB)": 76.18, | |
| "step": 3120, | |
| "token_acc": 0.8569641367806505, | |
| "train_speed(iter/s)": 0.040106 | |
| }, | |
| { | |
| "epoch": 2.8172817281728175, | |
| "grad_norm": 2.6334176063537598, | |
| "learning_rate": 1.939285263710411e-07, | |
| "loss": 0.37378754615783694, | |
| "memory(GiB)": 76.18, | |
| "step": 3130, | |
| "token_acc": 0.8621212121212121, | |
| "train_speed(iter/s)": 0.040109 | |
| }, | |
| { | |
| "epoch": 2.826282628262826, | |
| "grad_norm": 2.6771504878997803, | |
| "learning_rate": 1.7534752925863264e-07, | |
| "loss": 0.3727731227874756, | |
| "memory(GiB)": 76.18, | |
| "step": 3140, | |
| "token_acc": 0.8573262032085561, | |
| "train_speed(iter/s)": 0.040111 | |
| }, | |
| { | |
| "epoch": 2.8352835283528353, | |
| "grad_norm": 2.7885513305664062, | |
| "learning_rate": 1.5769422052403172e-07, | |
| "loss": 0.3634767770767212, | |
| "memory(GiB)": 76.18, | |
| "step": 3150, | |
| "token_acc": 0.8657498362802881, | |
| "train_speed(iter/s)": 0.040111 | |
| }, | |
| { | |
| "epoch": 2.8442844284428443, | |
| "grad_norm": 2.770448684692383, | |
| "learning_rate": 1.409702670729518e-07, | |
| "loss": 0.3641348123550415, | |
| "memory(GiB)": 76.18, | |
| "step": 3160, | |
| "token_acc": 0.8695652173913043, | |
| "train_speed(iter/s)": 0.040111 | |
| }, | |
| { | |
| "epoch": 2.8532853285328534, | |
| "grad_norm": 2.716731309890747, | |
| "learning_rate": 1.2517724805715115e-07, | |
| "loss": 0.36133828163146975, | |
| "memory(GiB)": 76.18, | |
| "step": 3170, | |
| "token_acc": 0.8693168837103039, | |
| "train_speed(iter/s)": 0.040112 | |
| }, | |
| { | |
| "epoch": 2.862286228622862, | |
| "grad_norm": 2.320976734161377, | |
| "learning_rate": 1.1031665472532871e-07, | |
| "loss": 0.3573209285736084, | |
| "memory(GiB)": 76.18, | |
| "step": 3180, | |
| "token_acc": 0.8647353517752123, | |
| "train_speed(iter/s)": 0.040115 | |
| }, | |
| { | |
| "epoch": 2.871287128712871, | |
| "grad_norm": 2.6834940910339355, | |
| "learning_rate": 9.638989028230572e-08, | |
| "loss": 0.3642300605773926, | |
| "memory(GiB)": 76.18, | |
| "step": 3190, | |
| "token_acc": 0.8666237113402062, | |
| "train_speed(iter/s)": 0.040116 | |
| }, | |
| { | |
| "epoch": 2.8802880288028803, | |
| "grad_norm": 2.8395378589630127, | |
| "learning_rate": 8.339826975653165e-08, | |
| "loss": 0.3668497562408447, | |
| "memory(GiB)": 76.18, | |
| "step": 3200, | |
| "token_acc": 0.8565969880872106, | |
| "train_speed(iter/s)": 0.040118 | |
| }, | |
| { | |
| "epoch": 2.8892889288928894, | |
| "grad_norm": 2.8500564098358154, | |
| "learning_rate": 7.134301987591686e-08, | |
| "loss": 0.35763015747070315, | |
| "memory(GiB)": 76.18, | |
| "step": 3210, | |
| "token_acc": 0.8680448647459864, | |
| "train_speed(iter/s)": 0.04012 | |
| }, | |
| { | |
| "epoch": 2.8982898289828984, | |
| "grad_norm": 2.391807794570923, | |
| "learning_rate": 6.022527895198971e-08, | |
| "loss": 0.3681647300720215, | |
| "memory(GiB)": 76.18, | |
| "step": 3220, | |
| "token_acc": 0.8623626989464246, | |
| "train_speed(iter/s)": 0.040122 | |
| }, | |
| { | |
| "epoch": 2.9072907290729075, | |
| "grad_norm": 2.870159149169922, | |
| "learning_rate": 5.004609677242478e-08, | |
| "loss": 0.3709531307220459, | |
| "memory(GiB)": 76.18, | |
| "step": 3230, | |
| "token_acc": 0.8634751773049646, | |
| "train_speed(iter/s)": 0.040123 | |
| }, | |
| { | |
| "epoch": 2.916291629162916, | |
| "grad_norm": 2.3860719203948975, | |
| "learning_rate": 4.0806434501907686e-08, | |
| "loss": 0.3573091745376587, | |
| "memory(GiB)": 76.18, | |
| "step": 3240, | |
| "token_acc": 0.8636980108499096, | |
| "train_speed(iter/s)": 0.040125 | |
| }, | |
| { | |
| "epoch": 2.9252925292529253, | |
| "grad_norm": 2.533841609954834, | |
| "learning_rate": 3.2507164591378817e-08, | |
| "loss": 0.35629446506500245, | |
| "memory(GiB)": 76.18, | |
| "step": 3250, | |
| "token_acc": 0.8767689962987154, | |
| "train_speed(iter/s)": 0.040126 | |
| }, | |
| { | |
| "epoch": 2.9342934293429344, | |
| "grad_norm": 2.7338736057281494, | |
| "learning_rate": 2.5149070695656974e-08, | |
| "loss": 0.36386995315551757, | |
| "memory(GiB)": 76.18, | |
| "step": 3260, | |
| "token_acc": 0.8695070265447246, | |
| "train_speed(iter/s)": 0.040129 | |
| }, | |
| { | |
| "epoch": 2.9432943294329434, | |
| "grad_norm": 2.5814294815063477, | |
| "learning_rate": 1.873284759943861e-08, | |
| "loss": 0.3609006881713867, | |
| "memory(GiB)": 76.18, | |
| "step": 3270, | |
| "token_acc": 0.8714535137494543, | |
| "train_speed(iter/s)": 0.040129 | |
| }, | |
| { | |
| "epoch": 2.952295229522952, | |
| "grad_norm": 2.6087794303894043, | |
| "learning_rate": 1.325910115169471e-08, | |
| "loss": 0.36290225982666013, | |
| "memory(GiB)": 76.18, | |
| "step": 3280, | |
| "token_acc": 0.8663007683863886, | |
| "train_speed(iter/s)": 0.04013 | |
| }, | |
| { | |
| "epoch": 2.961296129612961, | |
| "grad_norm": 2.4624176025390625, | |
| "learning_rate": 8.728348208466575e-09, | |
| "loss": 0.36122841835021974, | |
| "memory(GiB)": 76.18, | |
| "step": 3290, | |
| "token_acc": 0.8651804670912951, | |
| "train_speed(iter/s)": 0.040133 | |
| }, | |
| { | |
| "epoch": 2.9702970297029703, | |
| "grad_norm": 2.5794880390167236, | |
| "learning_rate": 5.1410165840548586e-09, | |
| "loss": 0.35005528926849366, | |
| "memory(GiB)": 76.18, | |
| "step": 3300, | |
| "token_acc": 0.873643074250977, | |
| "train_speed(iter/s)": 0.040135 | |
| }, | |
| { | |
| "epoch": 2.9792979297929794, | |
| "grad_norm": 2.730228900909424, | |
| "learning_rate": 2.4974450106318715e-09, | |
| "loss": 0.3484092473983765, | |
| "memory(GiB)": 76.18, | |
| "step": 3310, | |
| "token_acc": 0.8741692512184316, | |
| "train_speed(iter/s)": 0.040137 | |
| }, | |
| { | |
| "epoch": 2.9882988298829884, | |
| "grad_norm": 2.4973952770233154, | |
| "learning_rate": 7.978831062493975e-10, | |
| "loss": 0.360276198387146, | |
| "memory(GiB)": 76.18, | |
| "step": 3320, | |
| "token_acc": 0.8717892425905598, | |
| "train_speed(iter/s)": 0.040139 | |
| }, | |
| { | |
| "epoch": 2.9972997299729975, | |
| "grad_norm": 2.646111488342285, | |
| "learning_rate": 4.249135127420978e-11, | |
| "loss": 0.34623007774353026, | |
| "memory(GiB)": 76.18, | |
| "step": 3330, | |
| "token_acc": 0.8752749670039596, | |
| "train_speed(iter/s)": 0.040141 | |
| }, | |
| { | |
| "epoch": 3.0, | |
| "eval_loss": 0.3906257748603821, | |
| "eval_runtime": 111.4189, | |
| "eval_samples_per_second": 12.879, | |
| "eval_steps_per_second": 0.404, | |
| "eval_token_acc": 0.8578371810449574, | |
| "step": 3333 | |
| } | |
| ], | |
| "logging_steps": 10, | |
| "max_steps": 3333, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 3, | |
| "save_steps": 1000, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 1.3350909053056844e+19, | |
| "train_batch_size": 32, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |