diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,180452 @@ +{ + "best_metric": 1.01177001, + "best_model_checkpoint": "/yldm0226/llm_pretrain_output/qwen2_5-14b/v1-20240919-083153/checkpoint-356000", + "epoch": 1.9999916112895266, + "eval_steps": 2000, + "global_step": 357622, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "acc": 0.61102152, + "epoch": 5.592473648963225e-06, + "grad_norm": 9.25, + "learning_rate": 5.592215635834919e-10, + "loss": 1.74166703, + "memory(GiB)": 197.09, + "step": 1, + "train_speed(iter/s)": 0.025006 + }, + { + "acc": 0.64123098, + "epoch": 0.00011184947297926451, + "grad_norm": 12.5, + "learning_rate": 1.1184431271669836e-08, + "loss": 1.70050491, + "memory(GiB)": 221.62, + "step": 20, + "train_speed(iter/s)": 0.157094 + }, + { + "acc": 0.62680359, + "epoch": 0.00022369894595852902, + "grad_norm": 14.25, + "learning_rate": 2.2368862543339672e-08, + "loss": 1.76243782, + "memory(GiB)": 221.62, + "step": 40, + "train_speed(iter/s)": 0.18438 + }, + { + "acc": 0.63444319, + "epoch": 0.0003355484189377935, + "grad_norm": 14.8125, + "learning_rate": 3.355329381500951e-08, + "loss": 1.71627274, + "memory(GiB)": 291.07, + "step": 60, + "train_speed(iter/s)": 0.193686 + }, + { + "acc": 0.61691194, + "epoch": 0.00044739789191705805, + "grad_norm": 11.5, + "learning_rate": 4.4737725086679345e-08, + "loss": 1.79340515, + "memory(GiB)": 291.07, + "step": 80, + "train_speed(iter/s)": 0.202839 + }, + { + "acc": 0.64025903, + "epoch": 0.0005592473648963225, + "grad_norm": 7.46875, + "learning_rate": 5.5922156358349186e-08, + "loss": 1.70019588, + "memory(GiB)": 291.07, + "step": 100, + "train_speed(iter/s)": 0.208029 + }, + { + "acc": 0.63357964, + "epoch": 0.000671096837875587, + "grad_norm": 18.875, + "learning_rate": 6.710658763001902e-08, + "loss": 1.72948875, + "memory(GiB)": 291.07, + "step": 120, + "train_speed(iter/s)": 0.212145 + }, + { + "acc": 0.63253183, + "epoch": 0.0007829463108548516, + "grad_norm": 10.1875, + "learning_rate": 7.829101890168886e-08, + "loss": 1.74023457, + "memory(GiB)": 291.07, + "step": 140, + "train_speed(iter/s)": 0.214859 + }, + { + "acc": 0.63258324, + "epoch": 0.0008947957838341161, + "grad_norm": 8.3125, + "learning_rate": 8.947545017335869e-08, + "loss": 1.69740257, + "memory(GiB)": 291.07, + "step": 160, + "train_speed(iter/s)": 0.218256 + }, + { + "acc": 0.63887515, + "epoch": 0.0010066452568133805, + "grad_norm": 10.875, + "learning_rate": 1.0065988144502852e-07, + "loss": 1.69588757, + "memory(GiB)": 291.07, + "step": 180, + "train_speed(iter/s)": 0.220464 + }, + { + "acc": 0.64367356, + "epoch": 0.001118494729792645, + "grad_norm": 9.75, + "learning_rate": 1.1184431271669837e-07, + "loss": 1.66351013, + "memory(GiB)": 291.07, + "step": 200, + "train_speed(iter/s)": 0.22197 + }, + { + "acc": 0.64214659, + "epoch": 0.0012303442027719095, + "grad_norm": 16.25, + "learning_rate": 1.230287439883682e-07, + "loss": 1.67754364, + "memory(GiB)": 291.07, + "step": 220, + "train_speed(iter/s)": 0.222636 + }, + { + "acc": 0.63121114, + "epoch": 0.001342193675751174, + "grad_norm": 7.71875, + "learning_rate": 1.3421317526003804e-07, + "loss": 1.75961475, + "memory(GiB)": 291.07, + "step": 240, + "train_speed(iter/s)": 0.222625 + }, + { + "acc": 0.63773336, + "epoch": 0.0014540431487304385, + "grad_norm": 11.375, + "learning_rate": 1.4539760653170788e-07, + "loss": 1.7173418, + "memory(GiB)": 291.07, + "step": 260, + "train_speed(iter/s)": 0.223177 + }, + { + "acc": 0.63509459, + "epoch": 0.0015658926217097032, + "grad_norm": 9.25, + "learning_rate": 1.565820378033777e-07, + "loss": 1.67170391, + "memory(GiB)": 291.07, + "step": 280, + "train_speed(iter/s)": 0.224206 + }, + { + "acc": 0.64230943, + "epoch": 0.0016777420946889677, + "grad_norm": 8.0625, + "learning_rate": 1.6776646907504757e-07, + "loss": 1.68659191, + "memory(GiB)": 291.07, + "step": 300, + "train_speed(iter/s)": 0.225299 + }, + { + "acc": 0.64408345, + "epoch": 0.0017895915676682322, + "grad_norm": 10.0625, + "learning_rate": 1.7895090034671738e-07, + "loss": 1.68645325, + "memory(GiB)": 291.07, + "step": 320, + "train_speed(iter/s)": 0.226053 + }, + { + "acc": 0.62676215, + "epoch": 0.0019014410406474967, + "grad_norm": 10.75, + "learning_rate": 1.9013533161838724e-07, + "loss": 1.73695507, + "memory(GiB)": 291.07, + "step": 340, + "train_speed(iter/s)": 0.226703 + }, + { + "acc": 0.64296484, + "epoch": 0.002013290513626761, + "grad_norm": 10.875, + "learning_rate": 2.0131976289005705e-07, + "loss": 1.69327259, + "memory(GiB)": 291.07, + "step": 360, + "train_speed(iter/s)": 0.226991 + }, + { + "acc": 0.63737688, + "epoch": 0.0021251399866060255, + "grad_norm": 11.625, + "learning_rate": 2.1250419416172688e-07, + "loss": 1.69941998, + "memory(GiB)": 291.07, + "step": 380, + "train_speed(iter/s)": 0.227795 + }, + { + "acc": 0.61969328, + "epoch": 0.00223698945958529, + "grad_norm": 8.75, + "learning_rate": 2.2368862543339674e-07, + "loss": 1.75694485, + "memory(GiB)": 291.07, + "step": 400, + "train_speed(iter/s)": 0.227922 + }, + { + "acc": 0.62480431, + "epoch": 0.0023488389325645545, + "grad_norm": 9.0, + "learning_rate": 2.3487305670506655e-07, + "loss": 1.75218048, + "memory(GiB)": 291.07, + "step": 420, + "train_speed(iter/s)": 0.228423 + }, + { + "acc": 0.62828465, + "epoch": 0.002460688405543819, + "grad_norm": 19.625, + "learning_rate": 2.460574879767364e-07, + "loss": 1.72060432, + "memory(GiB)": 291.07, + "step": 440, + "train_speed(iter/s)": 0.229381 + }, + { + "acc": 0.63121514, + "epoch": 0.0025725378785230834, + "grad_norm": 11.875, + "learning_rate": 2.5724191924840625e-07, + "loss": 1.74755402, + "memory(GiB)": 291.07, + "step": 460, + "train_speed(iter/s)": 0.229837 + }, + { + "acc": 0.63478308, + "epoch": 0.002684387351502348, + "grad_norm": 10.8125, + "learning_rate": 2.684263505200761e-07, + "loss": 1.70622864, + "memory(GiB)": 291.07, + "step": 480, + "train_speed(iter/s)": 0.230075 + }, + { + "acc": 0.64095759, + "epoch": 0.0027962368244816124, + "grad_norm": 17.375, + "learning_rate": 2.796107817917459e-07, + "loss": 1.70366688, + "memory(GiB)": 291.07, + "step": 500, + "train_speed(iter/s)": 0.230023 + }, + { + "acc": 0.63565893, + "epoch": 0.002908086297460877, + "grad_norm": 9.0625, + "learning_rate": 2.9079521306341575e-07, + "loss": 1.68935165, + "memory(GiB)": 291.07, + "step": 520, + "train_speed(iter/s)": 0.230185 + }, + { + "acc": 0.63270149, + "epoch": 0.003019935770440142, + "grad_norm": 11.5, + "learning_rate": 3.019796443350856e-07, + "loss": 1.74541321, + "memory(GiB)": 291.07, + "step": 540, + "train_speed(iter/s)": 0.230788 + }, + { + "acc": 0.6382484, + "epoch": 0.0031317852434194064, + "grad_norm": 8.9375, + "learning_rate": 3.131640756067554e-07, + "loss": 1.70573997, + "memory(GiB)": 291.07, + "step": 560, + "train_speed(iter/s)": 0.230935 + }, + { + "acc": 0.63324342, + "epoch": 0.003243634716398671, + "grad_norm": 9.5625, + "learning_rate": 3.2434850687842526e-07, + "loss": 1.70596504, + "memory(GiB)": 291.07, + "step": 580, + "train_speed(iter/s)": 0.231479 + }, + { + "acc": 0.64776673, + "epoch": 0.0033554841893779354, + "grad_norm": 9.3125, + "learning_rate": 3.3553293815009514e-07, + "loss": 1.6493845, + "memory(GiB)": 291.07, + "step": 600, + "train_speed(iter/s)": 0.231839 + }, + { + "acc": 0.63866568, + "epoch": 0.0034673336623572, + "grad_norm": 12.6875, + "learning_rate": 3.467173694217649e-07, + "loss": 1.67741337, + "memory(GiB)": 291.07, + "step": 620, + "train_speed(iter/s)": 0.232137 + }, + { + "acc": 0.63059068, + "epoch": 0.0035791831353364644, + "grad_norm": 18.75, + "learning_rate": 3.5790180069343476e-07, + "loss": 1.73800278, + "memory(GiB)": 291.07, + "step": 640, + "train_speed(iter/s)": 0.232408 + }, + { + "acc": 0.65209956, + "epoch": 0.003691032608315729, + "grad_norm": 13.3125, + "learning_rate": 3.690862319651046e-07, + "loss": 1.63899841, + "memory(GiB)": 291.07, + "step": 660, + "train_speed(iter/s)": 0.232325 + }, + { + "acc": 0.63443618, + "epoch": 0.0038028820812949934, + "grad_norm": 15.25, + "learning_rate": 3.802706632367745e-07, + "loss": 1.70280628, + "memory(GiB)": 291.07, + "step": 680, + "train_speed(iter/s)": 0.232792 + }, + { + "acc": 0.63637352, + "epoch": 0.003914731554274258, + "grad_norm": 10.0625, + "learning_rate": 3.914550945084443e-07, + "loss": 1.6928524, + "memory(GiB)": 291.07, + "step": 700, + "train_speed(iter/s)": 0.23308 + }, + { + "acc": 0.63931861, + "epoch": 0.004026581027253522, + "grad_norm": 14.1875, + "learning_rate": 4.026395257801141e-07, + "loss": 1.66194229, + "memory(GiB)": 291.07, + "step": 720, + "train_speed(iter/s)": 0.233414 + }, + { + "acc": 0.6478673, + "epoch": 0.004138430500232787, + "grad_norm": 12.125, + "learning_rate": 4.1382395705178393e-07, + "loss": 1.64278069, + "memory(GiB)": 291.07, + "step": 740, + "train_speed(iter/s)": 0.233578 + }, + { + "acc": 0.64705439, + "epoch": 0.004250279973212051, + "grad_norm": 14.1875, + "learning_rate": 4.2500838832345377e-07, + "loss": 1.64648113, + "memory(GiB)": 291.07, + "step": 760, + "train_speed(iter/s)": 0.233437 + }, + { + "acc": 0.63679724, + "epoch": 0.004362129446191316, + "grad_norm": 6.90625, + "learning_rate": 4.3619281959512365e-07, + "loss": 1.65260544, + "memory(GiB)": 291.07, + "step": 780, + "train_speed(iter/s)": 0.23359 + }, + { + "acc": 0.62998996, + "epoch": 0.00447397891917058, + "grad_norm": 12.0, + "learning_rate": 4.473772508667935e-07, + "loss": 1.74353733, + "memory(GiB)": 291.07, + "step": 800, + "train_speed(iter/s)": 0.233362 + }, + { + "acc": 0.64355078, + "epoch": 0.004585828392149845, + "grad_norm": 11.9375, + "learning_rate": 4.5856168213846327e-07, + "loss": 1.66514282, + "memory(GiB)": 291.07, + "step": 820, + "train_speed(iter/s)": 0.233429 + }, + { + "acc": 0.64263201, + "epoch": 0.004697677865129109, + "grad_norm": 9.5625, + "learning_rate": 4.697461134101331e-07, + "loss": 1.65674267, + "memory(GiB)": 291.07, + "step": 840, + "train_speed(iter/s)": 0.233452 + }, + { + "acc": 0.64543691, + "epoch": 0.004809527338108374, + "grad_norm": 10.4375, + "learning_rate": 4.80930544681803e-07, + "loss": 1.67221622, + "memory(GiB)": 291.07, + "step": 860, + "train_speed(iter/s)": 0.23371 + }, + { + "acc": 0.64395475, + "epoch": 0.004921376811087638, + "grad_norm": 10.125, + "learning_rate": 4.921149759534728e-07, + "loss": 1.65121174, + "memory(GiB)": 291.07, + "step": 880, + "train_speed(iter/s)": 0.233761 + }, + { + "acc": 0.65196605, + "epoch": 0.005033226284066903, + "grad_norm": 5.875, + "learning_rate": 5.032994072251426e-07, + "loss": 1.62765026, + "memory(GiB)": 291.07, + "step": 900, + "train_speed(iter/s)": 0.233941 + }, + { + "acc": 0.65300198, + "epoch": 0.005145075757046167, + "grad_norm": 12.75, + "learning_rate": 5.144838384968125e-07, + "loss": 1.62944374, + "memory(GiB)": 291.07, + "step": 920, + "train_speed(iter/s)": 0.234198 + }, + { + "acc": 0.62531118, + "epoch": 0.005256925230025432, + "grad_norm": 10.1875, + "learning_rate": 5.256682697684823e-07, + "loss": 1.7695837, + "memory(GiB)": 291.07, + "step": 940, + "train_speed(iter/s)": 0.234815 + }, + { + "acc": 0.65381331, + "epoch": 0.005368774703004696, + "grad_norm": 12.625, + "learning_rate": 5.368527010401522e-07, + "loss": 1.66151009, + "memory(GiB)": 291.07, + "step": 960, + "train_speed(iter/s)": 0.234875 + }, + { + "acc": 0.64093027, + "epoch": 0.005480624175983961, + "grad_norm": 10.0, + "learning_rate": 5.480371323118219e-07, + "loss": 1.67430325, + "memory(GiB)": 291.07, + "step": 980, + "train_speed(iter/s)": 0.234857 + }, + { + "acc": 0.6549592, + "epoch": 0.005592473648963225, + "grad_norm": 11.75, + "learning_rate": 5.592215635834918e-07, + "loss": 1.61554718, + "memory(GiB)": 291.07, + "step": 1000, + "train_speed(iter/s)": 0.235207 + }, + { + "acc": 0.64199123, + "epoch": 0.00570432312194249, + "grad_norm": 10.375, + "learning_rate": 5.704059948551616e-07, + "loss": 1.68908825, + "memory(GiB)": 291.07, + "step": 1020, + "train_speed(iter/s)": 0.235444 + }, + { + "acc": 0.64967904, + "epoch": 0.005816172594921754, + "grad_norm": 13.4375, + "learning_rate": 5.815904261268315e-07, + "loss": 1.6196167, + "memory(GiB)": 291.07, + "step": 1040, + "train_speed(iter/s)": 0.235804 + }, + { + "acc": 0.64457526, + "epoch": 0.005928022067901019, + "grad_norm": 16.125, + "learning_rate": 5.927748573985014e-07, + "loss": 1.66851654, + "memory(GiB)": 291.07, + "step": 1060, + "train_speed(iter/s)": 0.236034 + }, + { + "acc": 0.64158769, + "epoch": 0.006039871540880284, + "grad_norm": 8.25, + "learning_rate": 6.039592886701712e-07, + "loss": 1.65470428, + "memory(GiB)": 291.07, + "step": 1080, + "train_speed(iter/s)": 0.236144 + }, + { + "acc": 0.63168755, + "epoch": 0.006151721013859548, + "grad_norm": 8.5625, + "learning_rate": 6.15143719941841e-07, + "loss": 1.71881962, + "memory(GiB)": 291.07, + "step": 1100, + "train_speed(iter/s)": 0.236473 + }, + { + "acc": 0.64541779, + "epoch": 0.006263570486838813, + "grad_norm": 14.0, + "learning_rate": 6.263281512135108e-07, + "loss": 1.64231873, + "memory(GiB)": 291.07, + "step": 1120, + "train_speed(iter/s)": 0.236749 + }, + { + "acc": 0.63790832, + "epoch": 0.006375419959818077, + "grad_norm": 6.96875, + "learning_rate": 6.375125824851806e-07, + "loss": 1.67936134, + "memory(GiB)": 291.07, + "step": 1140, + "train_speed(iter/s)": 0.237033 + }, + { + "acc": 0.63751616, + "epoch": 0.006487269432797342, + "grad_norm": 10.1875, + "learning_rate": 6.486970137568505e-07, + "loss": 1.68400955, + "memory(GiB)": 291.07, + "step": 1160, + "train_speed(iter/s)": 0.237213 + }, + { + "acc": 0.64691434, + "epoch": 0.006599118905776606, + "grad_norm": 6.65625, + "learning_rate": 6.598814450285204e-07, + "loss": 1.6413208, + "memory(GiB)": 291.07, + "step": 1180, + "train_speed(iter/s)": 0.237341 + }, + { + "acc": 0.63845811, + "epoch": 0.006710968378755871, + "grad_norm": 16.75, + "learning_rate": 6.710658763001903e-07, + "loss": 1.69180183, + "memory(GiB)": 291.07, + "step": 1200, + "train_speed(iter/s)": 0.237004 + }, + { + "acc": 0.65169001, + "epoch": 0.006822817851735135, + "grad_norm": 10.625, + "learning_rate": 6.822503075718601e-07, + "loss": 1.61024513, + "memory(GiB)": 291.07, + "step": 1220, + "train_speed(iter/s)": 0.237243 + }, + { + "acc": 0.64055386, + "epoch": 0.0069346673247144, + "grad_norm": 11.625, + "learning_rate": 6.934347388435298e-07, + "loss": 1.67372513, + "memory(GiB)": 291.07, + "step": 1240, + "train_speed(iter/s)": 0.23747 + }, + { + "acc": 0.64481287, + "epoch": 0.007046516797693664, + "grad_norm": 11.125, + "learning_rate": 7.046191701151997e-07, + "loss": 1.61804752, + "memory(GiB)": 291.07, + "step": 1260, + "train_speed(iter/s)": 0.237269 + }, + { + "acc": 0.65572333, + "epoch": 0.007158366270672929, + "grad_norm": 11.625, + "learning_rate": 7.158036013868695e-07, + "loss": 1.59459133, + "memory(GiB)": 291.07, + "step": 1280, + "train_speed(iter/s)": 0.237342 + }, + { + "acc": 0.62851019, + "epoch": 0.007270215743652193, + "grad_norm": 10.125, + "learning_rate": 7.269880326585393e-07, + "loss": 1.70585937, + "memory(GiB)": 291.07, + "step": 1300, + "train_speed(iter/s)": 0.237381 + }, + { + "acc": 0.65246711, + "epoch": 0.007382065216631458, + "grad_norm": 9.1875, + "learning_rate": 7.381724639302092e-07, + "loss": 1.61941948, + "memory(GiB)": 291.07, + "step": 1320, + "train_speed(iter/s)": 0.237443 + }, + { + "acc": 0.65091176, + "epoch": 0.007493914689610722, + "grad_norm": 7.25, + "learning_rate": 7.49356895201879e-07, + "loss": 1.60550308, + "memory(GiB)": 291.07, + "step": 1340, + "train_speed(iter/s)": 0.237723 + }, + { + "acc": 0.65657873, + "epoch": 0.007605764162589987, + "grad_norm": 12.75, + "learning_rate": 7.60541326473549e-07, + "loss": 1.59343834, + "memory(GiB)": 291.07, + "step": 1360, + "train_speed(iter/s)": 0.237363 + }, + { + "acc": 0.6497911, + "epoch": 0.007717613635569251, + "grad_norm": 12.125, + "learning_rate": 7.717257577452187e-07, + "loss": 1.63567734, + "memory(GiB)": 291.07, + "step": 1380, + "train_speed(iter/s)": 0.237544 + }, + { + "acc": 0.6568399, + "epoch": 0.007829463108548516, + "grad_norm": 10.5625, + "learning_rate": 7.829101890168886e-07, + "loss": 1.60683441, + "memory(GiB)": 291.07, + "step": 1400, + "train_speed(iter/s)": 0.237407 + }, + { + "acc": 0.6468574, + "epoch": 0.00794131258152778, + "grad_norm": 9.8125, + "learning_rate": 7.940946202885584e-07, + "loss": 1.65684853, + "memory(GiB)": 291.07, + "step": 1420, + "train_speed(iter/s)": 0.237374 + }, + { + "acc": 0.652001, + "epoch": 0.008053162054507044, + "grad_norm": 10.5625, + "learning_rate": 8.052790515602282e-07, + "loss": 1.59492035, + "memory(GiB)": 291.07, + "step": 1440, + "train_speed(iter/s)": 0.237298 + }, + { + "acc": 0.64792352, + "epoch": 0.00816501152748631, + "grad_norm": 8.3125, + "learning_rate": 8.164634828318981e-07, + "loss": 1.62101498, + "memory(GiB)": 291.07, + "step": 1460, + "train_speed(iter/s)": 0.237282 + }, + { + "acc": 0.6596518, + "epoch": 0.008276861000465574, + "grad_norm": 9.125, + "learning_rate": 8.276479141035679e-07, + "loss": 1.54311638, + "memory(GiB)": 291.07, + "step": 1480, + "train_speed(iter/s)": 0.237203 + }, + { + "acc": 0.64482794, + "epoch": 0.008388710473444838, + "grad_norm": 10.625, + "learning_rate": 8.388323453752376e-07, + "loss": 1.65594578, + "memory(GiB)": 291.07, + "step": 1500, + "train_speed(iter/s)": 0.23749 + }, + { + "acc": 0.6567802, + "epoch": 0.008500559946424102, + "grad_norm": 8.75, + "learning_rate": 8.500167766469075e-07, + "loss": 1.59235125, + "memory(GiB)": 291.07, + "step": 1520, + "train_speed(iter/s)": 0.23729 + }, + { + "acc": 0.67078204, + "epoch": 0.008612409419403368, + "grad_norm": 9.625, + "learning_rate": 8.612012079185775e-07, + "loss": 1.53127193, + "memory(GiB)": 291.07, + "step": 1540, + "train_speed(iter/s)": 0.237475 + }, + { + "acc": 0.66963944, + "epoch": 0.008724258892382632, + "grad_norm": 7.96875, + "learning_rate": 8.723856391902473e-07, + "loss": 1.52032394, + "memory(GiB)": 291.07, + "step": 1560, + "train_speed(iter/s)": 0.237513 + }, + { + "acc": 0.64595714, + "epoch": 0.008836108365361896, + "grad_norm": 8.8125, + "learning_rate": 8.835700704619171e-07, + "loss": 1.59227762, + "memory(GiB)": 291.07, + "step": 1580, + "train_speed(iter/s)": 0.237489 + }, + { + "acc": 0.65024767, + "epoch": 0.00894795783834116, + "grad_norm": 12.1875, + "learning_rate": 8.94754501733587e-07, + "loss": 1.56494942, + "memory(GiB)": 291.07, + "step": 1600, + "train_speed(iter/s)": 0.237529 + }, + { + "acc": 0.65608115, + "epoch": 0.009059807311320426, + "grad_norm": 8.4375, + "learning_rate": 9.059389330052568e-07, + "loss": 1.56758718, + "memory(GiB)": 291.07, + "step": 1620, + "train_speed(iter/s)": 0.2374 + }, + { + "acc": 0.65251756, + "epoch": 0.00917165678429969, + "grad_norm": 9.0625, + "learning_rate": 9.171233642769265e-07, + "loss": 1.57921829, + "memory(GiB)": 291.07, + "step": 1640, + "train_speed(iter/s)": 0.2376 + }, + { + "acc": 0.65048575, + "epoch": 0.009283506257278954, + "grad_norm": 8.875, + "learning_rate": 9.283077955485964e-07, + "loss": 1.58402004, + "memory(GiB)": 291.07, + "step": 1660, + "train_speed(iter/s)": 0.237842 + }, + { + "acc": 0.6533699, + "epoch": 0.009395355730258218, + "grad_norm": 9.6875, + "learning_rate": 9.394922268202662e-07, + "loss": 1.58595514, + "memory(GiB)": 291.07, + "step": 1680, + "train_speed(iter/s)": 0.237642 + }, + { + "acc": 0.65895405, + "epoch": 0.009507205203237484, + "grad_norm": 6.5, + "learning_rate": 9.506766580919361e-07, + "loss": 1.56286201, + "memory(GiB)": 291.07, + "step": 1700, + "train_speed(iter/s)": 0.237822 + }, + { + "acc": 0.65462527, + "epoch": 0.009619054676216748, + "grad_norm": 8.6875, + "learning_rate": 9.61861089363606e-07, + "loss": 1.56426115, + "memory(GiB)": 291.07, + "step": 1720, + "train_speed(iter/s)": 0.237843 + }, + { + "acc": 0.6457274, + "epoch": 0.009730904149196012, + "grad_norm": 10.25, + "learning_rate": 9.730455206352758e-07, + "loss": 1.63330288, + "memory(GiB)": 291.07, + "step": 1740, + "train_speed(iter/s)": 0.237982 + }, + { + "acc": 0.65715117, + "epoch": 0.009842753622175276, + "grad_norm": 10.125, + "learning_rate": 9.842299519069457e-07, + "loss": 1.54167929, + "memory(GiB)": 291.07, + "step": 1760, + "train_speed(iter/s)": 0.238096 + }, + { + "acc": 0.67081084, + "epoch": 0.009954603095154542, + "grad_norm": 9.3125, + "learning_rate": 9.954143831786155e-07, + "loss": 1.48492632, + "memory(GiB)": 291.07, + "step": 1780, + "train_speed(iter/s)": 0.238175 + }, + { + "acc": 0.65525684, + "epoch": 0.010066452568133806, + "grad_norm": 9.1875, + "learning_rate": 1.0065988144502852e-06, + "loss": 1.56345234, + "memory(GiB)": 291.07, + "step": 1800, + "train_speed(iter/s)": 0.238191 + }, + { + "acc": 0.680372, + "epoch": 0.01017830204111307, + "grad_norm": 9.5, + "learning_rate": 1.017783245721955e-06, + "loss": 1.44015503, + "memory(GiB)": 291.07, + "step": 1820, + "train_speed(iter/s)": 0.238212 + }, + { + "acc": 0.63326936, + "epoch": 0.010290151514092334, + "grad_norm": 5.6875, + "learning_rate": 1.028967676993625e-06, + "loss": 1.64366703, + "memory(GiB)": 291.07, + "step": 1840, + "train_speed(iter/s)": 0.238317 + }, + { + "acc": 0.65707636, + "epoch": 0.0104020009870716, + "grad_norm": 8.75, + "learning_rate": 1.0401521082652947e-06, + "loss": 1.54034071, + "memory(GiB)": 291.07, + "step": 1860, + "train_speed(iter/s)": 0.238427 + }, + { + "acc": 0.65157447, + "epoch": 0.010513850460050864, + "grad_norm": 7.5, + "learning_rate": 1.0513365395369646e-06, + "loss": 1.55852509, + "memory(GiB)": 291.07, + "step": 1880, + "train_speed(iter/s)": 0.23857 + }, + { + "acc": 0.67330117, + "epoch": 0.010625699933030128, + "grad_norm": 7.21875, + "learning_rate": 1.0625209708086344e-06, + "loss": 1.48330345, + "memory(GiB)": 291.07, + "step": 1900, + "train_speed(iter/s)": 0.238535 + }, + { + "acc": 0.66796083, + "epoch": 0.010737549406009392, + "grad_norm": 8.8125, + "learning_rate": 1.0737054020803043e-06, + "loss": 1.55040407, + "memory(GiB)": 291.07, + "step": 1920, + "train_speed(iter/s)": 0.238689 + }, + { + "acc": 0.66711373, + "epoch": 0.010849398878988658, + "grad_norm": 10.875, + "learning_rate": 1.0848898333519742e-06, + "loss": 1.51402121, + "memory(GiB)": 291.07, + "step": 1940, + "train_speed(iter/s)": 0.238557 + }, + { + "acc": 0.6560504, + "epoch": 0.010961248351967922, + "grad_norm": 11.5, + "learning_rate": 1.0960742646236439e-06, + "loss": 1.56006899, + "memory(GiB)": 291.07, + "step": 1960, + "train_speed(iter/s)": 0.2385 + }, + { + "acc": 0.66213055, + "epoch": 0.011073097824947186, + "grad_norm": 8.5, + "learning_rate": 1.1072586958953138e-06, + "loss": 1.49625778, + "memory(GiB)": 291.07, + "step": 1980, + "train_speed(iter/s)": 0.238581 + }, + { + "acc": 0.65239739, + "epoch": 0.01118494729792645, + "grad_norm": 7.90625, + "learning_rate": 1.1184431271669837e-06, + "loss": 1.54334908, + "memory(GiB)": 291.07, + "step": 2000, + "train_speed(iter/s)": 0.238542 + }, + { + "epoch": 0.01118494729792645, + "eval_acc": 0.6341694364066034, + "eval_loss": 1.5315635204315186, + "eval_runtime": 7528.4674, + "eval_samples_per_second": 10.0, + "eval_steps_per_second": 10.0, + "step": 2000 + }, + { + "acc": 0.63842554, + "epoch": 0.011296796770905716, + "grad_norm": 13.3125, + "learning_rate": 1.1296275584386536e-06, + "loss": 1.62973213, + "memory(GiB)": 302.18, + "step": 2020, + "train_speed(iter/s)": 0.125289 + }, + { + "acc": 0.65215592, + "epoch": 0.01140864624388498, + "grad_norm": 11.5, + "learning_rate": 1.1408119897103232e-06, + "loss": 1.52876272, + "memory(GiB)": 290.29, + "step": 2040, + "train_speed(iter/s)": 0.125855 + }, + { + "acc": 0.64531407, + "epoch": 0.011520495716864244, + "grad_norm": 6.3125, + "learning_rate": 1.1519964209819931e-06, + "loss": 1.58410931, + "memory(GiB)": 290.29, + "step": 2060, + "train_speed(iter/s)": 0.12641 + }, + { + "acc": 0.66043873, + "epoch": 0.011632345189843508, + "grad_norm": 8.25, + "learning_rate": 1.163180852253663e-06, + "loss": 1.55432501, + "memory(GiB)": 290.29, + "step": 2080, + "train_speed(iter/s)": 0.126983 + }, + { + "acc": 0.66926241, + "epoch": 0.011744194662822774, + "grad_norm": 8.375, + "learning_rate": 1.1743652835253329e-06, + "loss": 1.48061371, + "memory(GiB)": 290.29, + "step": 2100, + "train_speed(iter/s)": 0.127546 + }, + { + "acc": 0.6615201, + "epoch": 0.011856044135802038, + "grad_norm": 8.8125, + "learning_rate": 1.1855497147970028e-06, + "loss": 1.53711472, + "memory(GiB)": 290.29, + "step": 2120, + "train_speed(iter/s)": 0.128093 + }, + { + "acc": 0.6689476, + "epoch": 0.011967893608781302, + "grad_norm": 9.6875, + "learning_rate": 1.1967341460686725e-06, + "loss": 1.50884609, + "memory(GiB)": 290.29, + "step": 2140, + "train_speed(iter/s)": 0.128618 + }, + { + "acc": 0.65925674, + "epoch": 0.012079743081760567, + "grad_norm": 8.625, + "learning_rate": 1.2079185773403423e-06, + "loss": 1.52253637, + "memory(GiB)": 290.29, + "step": 2160, + "train_speed(iter/s)": 0.12917 + }, + { + "acc": 0.65991755, + "epoch": 0.012191592554739832, + "grad_norm": 7.59375, + "learning_rate": 1.2191030086120122e-06, + "loss": 1.56255264, + "memory(GiB)": 290.29, + "step": 2180, + "train_speed(iter/s)": 0.129678 + }, + { + "acc": 0.6580112, + "epoch": 0.012303442027719096, + "grad_norm": 7.15625, + "learning_rate": 1.230287439883682e-06, + "loss": 1.52089005, + "memory(GiB)": 290.29, + "step": 2200, + "train_speed(iter/s)": 0.130213 + }, + { + "acc": 0.66133962, + "epoch": 0.01241529150069836, + "grad_norm": 8.5625, + "learning_rate": 1.2414718711553518e-06, + "loss": 1.51789923, + "memory(GiB)": 290.29, + "step": 2220, + "train_speed(iter/s)": 0.130742 + }, + { + "acc": 0.66048164, + "epoch": 0.012527140973677625, + "grad_norm": 8.4375, + "learning_rate": 1.2526563024270217e-06, + "loss": 1.52457466, + "memory(GiB)": 290.29, + "step": 2240, + "train_speed(iter/s)": 0.131253 + }, + { + "acc": 0.66996489, + "epoch": 0.01263899044665689, + "grad_norm": 6.03125, + "learning_rate": 1.2638407336986916e-06, + "loss": 1.4498086, + "memory(GiB)": 290.29, + "step": 2260, + "train_speed(iter/s)": 0.131713 + }, + { + "acc": 0.6611968, + "epoch": 0.012750839919636154, + "grad_norm": 7.375, + "learning_rate": 1.2750251649703612e-06, + "loss": 1.53405313, + "memory(GiB)": 290.29, + "step": 2280, + "train_speed(iter/s)": 0.132231 + }, + { + "acc": 0.65484037, + "epoch": 0.012862689392615418, + "grad_norm": 7.5625, + "learning_rate": 1.2862095962420311e-06, + "loss": 1.5536273, + "memory(GiB)": 290.29, + "step": 2300, + "train_speed(iter/s)": 0.132726 + }, + { + "acc": 0.67159944, + "epoch": 0.012974538865594683, + "grad_norm": 10.5, + "learning_rate": 1.297394027513701e-06, + "loss": 1.5093276, + "memory(GiB)": 290.29, + "step": 2320, + "train_speed(iter/s)": 0.133231 + }, + { + "acc": 0.66923757, + "epoch": 0.013086388338573948, + "grad_norm": 9.1875, + "learning_rate": 1.308578458785371e-06, + "loss": 1.48870592, + "memory(GiB)": 290.29, + "step": 2340, + "train_speed(iter/s)": 0.133715 + }, + { + "acc": 0.67032919, + "epoch": 0.013198237811553212, + "grad_norm": 8.5, + "learning_rate": 1.3197628900570408e-06, + "loss": 1.47919693, + "memory(GiB)": 290.29, + "step": 2360, + "train_speed(iter/s)": 0.134193 + }, + { + "acc": 0.66196389, + "epoch": 0.013310087284532476, + "grad_norm": 10.0, + "learning_rate": 1.3309473213287105e-06, + "loss": 1.55530033, + "memory(GiB)": 290.29, + "step": 2380, + "train_speed(iter/s)": 0.134653 + }, + { + "acc": 0.67962379, + "epoch": 0.013421936757511741, + "grad_norm": 9.125, + "learning_rate": 1.3421317526003806e-06, + "loss": 1.43614712, + "memory(GiB)": 290.29, + "step": 2400, + "train_speed(iter/s)": 0.135146 + }, + { + "acc": 0.65908408, + "epoch": 0.013533786230491006, + "grad_norm": 13.4375, + "learning_rate": 1.3533161838720502e-06, + "loss": 1.54567966, + "memory(GiB)": 290.29, + "step": 2420, + "train_speed(iter/s)": 0.135634 + }, + { + "acc": 0.67157311, + "epoch": 0.01364563570347027, + "grad_norm": 11.1875, + "learning_rate": 1.3645006151437201e-06, + "loss": 1.48233776, + "memory(GiB)": 290.29, + "step": 2440, + "train_speed(iter/s)": 0.136092 + }, + { + "acc": 0.66361537, + "epoch": 0.013757485176449534, + "grad_norm": 7.90625, + "learning_rate": 1.3756850464153898e-06, + "loss": 1.53558397, + "memory(GiB)": 290.29, + "step": 2460, + "train_speed(iter/s)": 0.136578 + }, + { + "acc": 0.66398292, + "epoch": 0.0138693346494288, + "grad_norm": 8.8125, + "learning_rate": 1.3868694776870597e-06, + "loss": 1.48752146, + "memory(GiB)": 290.29, + "step": 2480, + "train_speed(iter/s)": 0.137005 + }, + { + "acc": 0.65371103, + "epoch": 0.013981184122408064, + "grad_norm": 10.25, + "learning_rate": 1.3980539089587294e-06, + "loss": 1.52491989, + "memory(GiB)": 290.29, + "step": 2500, + "train_speed(iter/s)": 0.137447 + }, + { + "acc": 0.67681375, + "epoch": 0.014093033595387328, + "grad_norm": 9.9375, + "learning_rate": 1.4092383402303995e-06, + "loss": 1.43201151, + "memory(GiB)": 290.29, + "step": 2520, + "train_speed(iter/s)": 0.137884 + }, + { + "acc": 0.65838041, + "epoch": 0.014204883068366592, + "grad_norm": 5.71875, + "learning_rate": 1.4204227715020694e-06, + "loss": 1.51673841, + "memory(GiB)": 290.29, + "step": 2540, + "train_speed(iter/s)": 0.138299 + }, + { + "acc": 0.65422668, + "epoch": 0.014316732541345857, + "grad_norm": 8.5625, + "learning_rate": 1.431607202773739e-06, + "loss": 1.53608465, + "memory(GiB)": 290.29, + "step": 2560, + "train_speed(iter/s)": 0.138791 + }, + { + "acc": 0.66059403, + "epoch": 0.014428582014325122, + "grad_norm": 6.5625, + "learning_rate": 1.442791634045409e-06, + "loss": 1.50607786, + "memory(GiB)": 290.29, + "step": 2580, + "train_speed(iter/s)": 0.139252 + }, + { + "acc": 0.66698966, + "epoch": 0.014540431487304386, + "grad_norm": 4.75, + "learning_rate": 1.4539760653170786e-06, + "loss": 1.49883575, + "memory(GiB)": 290.29, + "step": 2600, + "train_speed(iter/s)": 0.139731 + }, + { + "acc": 0.6635632, + "epoch": 0.01465228096028365, + "grad_norm": 10.6875, + "learning_rate": 1.4651604965887487e-06, + "loss": 1.51966829, + "memory(GiB)": 290.29, + "step": 2620, + "train_speed(iter/s)": 0.14018 + }, + { + "acc": 0.66408463, + "epoch": 0.014764130433262915, + "grad_norm": 7.25, + "learning_rate": 1.4763449278604184e-06, + "loss": 1.46771545, + "memory(GiB)": 290.29, + "step": 2640, + "train_speed(iter/s)": 0.140597 + }, + { + "acc": 0.66273384, + "epoch": 0.01487597990624218, + "grad_norm": 11.5, + "learning_rate": 1.4875293591320883e-06, + "loss": 1.49068995, + "memory(GiB)": 290.29, + "step": 2660, + "train_speed(iter/s)": 0.141032 + }, + { + "acc": 0.68762832, + "epoch": 0.014987829379221444, + "grad_norm": 10.6875, + "learning_rate": 1.498713790403758e-06, + "loss": 1.4226882, + "memory(GiB)": 290.29, + "step": 2680, + "train_speed(iter/s)": 0.141438 + }, + { + "acc": 0.67053561, + "epoch": 0.015099678852200708, + "grad_norm": 11.625, + "learning_rate": 1.509898221675428e-06, + "loss": 1.42319317, + "memory(GiB)": 290.29, + "step": 2700, + "train_speed(iter/s)": 0.141828 + }, + { + "acc": 0.67706547, + "epoch": 0.015211528325179973, + "grad_norm": 13.5625, + "learning_rate": 1.521082652947098e-06, + "loss": 1.41895618, + "memory(GiB)": 290.29, + "step": 2720, + "train_speed(iter/s)": 0.142253 + }, + { + "acc": 0.67318821, + "epoch": 0.015323377798159237, + "grad_norm": 4.375, + "learning_rate": 1.5322670842187676e-06, + "loss": 1.4388978, + "memory(GiB)": 290.29, + "step": 2740, + "train_speed(iter/s)": 0.142663 + }, + { + "acc": 0.65617824, + "epoch": 0.015435227271138502, + "grad_norm": 6.03125, + "learning_rate": 1.5434515154904375e-06, + "loss": 1.50950785, + "memory(GiB)": 290.29, + "step": 2760, + "train_speed(iter/s)": 0.143077 + }, + { + "acc": 0.65403004, + "epoch": 0.015547076744117766, + "grad_norm": 8.0625, + "learning_rate": 1.5546359467621072e-06, + "loss": 1.54176178, + "memory(GiB)": 290.29, + "step": 2780, + "train_speed(iter/s)": 0.143465 + }, + { + "acc": 0.6858386, + "epoch": 0.01565892621709703, + "grad_norm": 9.875, + "learning_rate": 1.5658203780337773e-06, + "loss": 1.36167812, + "memory(GiB)": 290.29, + "step": 2800, + "train_speed(iter/s)": 0.14383 + }, + { + "acc": 0.66626329, + "epoch": 0.015770775690076295, + "grad_norm": 6.9375, + "learning_rate": 1.577004809305447e-06, + "loss": 1.4845253, + "memory(GiB)": 290.29, + "step": 2820, + "train_speed(iter/s)": 0.144225 + }, + { + "acc": 0.68138161, + "epoch": 0.01588262516305556, + "grad_norm": 6.5, + "learning_rate": 1.5881892405771168e-06, + "loss": 1.41192026, + "memory(GiB)": 290.29, + "step": 2840, + "train_speed(iter/s)": 0.144669 + }, + { + "acc": 0.66675529, + "epoch": 0.015994474636034824, + "grad_norm": 7.28125, + "learning_rate": 1.5993736718487865e-06, + "loss": 1.47938547, + "memory(GiB)": 290.29, + "step": 2860, + "train_speed(iter/s)": 0.145041 + }, + { + "acc": 0.67981963, + "epoch": 0.016106324109014088, + "grad_norm": 7.1875, + "learning_rate": 1.6105581031204564e-06, + "loss": 1.40247774, + "memory(GiB)": 290.29, + "step": 2880, + "train_speed(iter/s)": 0.145401 + }, + { + "acc": 0.68103447, + "epoch": 0.016218173581993352, + "grad_norm": 8.9375, + "learning_rate": 1.6217425343921265e-06, + "loss": 1.42412014, + "memory(GiB)": 290.29, + "step": 2900, + "train_speed(iter/s)": 0.145782 + }, + { + "acc": 0.66615567, + "epoch": 0.01633002305497262, + "grad_norm": 11.0625, + "learning_rate": 1.6329269656637962e-06, + "loss": 1.47094889, + "memory(GiB)": 290.29, + "step": 2920, + "train_speed(iter/s)": 0.146152 + }, + { + "acc": 0.68500438, + "epoch": 0.016441872527951883, + "grad_norm": 9.375, + "learning_rate": 1.644111396935466e-06, + "loss": 1.37408381, + "memory(GiB)": 290.29, + "step": 2940, + "train_speed(iter/s)": 0.146515 + }, + { + "acc": 0.69188027, + "epoch": 0.016553722000931147, + "grad_norm": 9.25, + "learning_rate": 1.6552958282071357e-06, + "loss": 1.3496314, + "memory(GiB)": 290.29, + "step": 2960, + "train_speed(iter/s)": 0.146909 + }, + { + "acc": 0.66608982, + "epoch": 0.01666557147391041, + "grad_norm": 9.625, + "learning_rate": 1.6664802594788056e-06, + "loss": 1.4525548, + "memory(GiB)": 290.29, + "step": 2980, + "train_speed(iter/s)": 0.147276 + }, + { + "acc": 0.65997128, + "epoch": 0.016777420946889676, + "grad_norm": 4.78125, + "learning_rate": 1.6776646907504753e-06, + "loss": 1.47405491, + "memory(GiB)": 290.29, + "step": 3000, + "train_speed(iter/s)": 0.147639 + }, + { + "acc": 0.67549748, + "epoch": 0.01688927041986894, + "grad_norm": 6.71875, + "learning_rate": 1.6888491220221454e-06, + "loss": 1.43836012, + "memory(GiB)": 290.29, + "step": 3020, + "train_speed(iter/s)": 0.147973 + }, + { + "acc": 0.67662816, + "epoch": 0.017001119892848204, + "grad_norm": 6.71875, + "learning_rate": 1.700033553293815e-06, + "loss": 1.41957006, + "memory(GiB)": 290.29, + "step": 3040, + "train_speed(iter/s)": 0.148318 + }, + { + "acc": 0.68334875, + "epoch": 0.017112969365827468, + "grad_norm": 6.71875, + "learning_rate": 1.711217984565485e-06, + "loss": 1.37433958, + "memory(GiB)": 290.29, + "step": 3060, + "train_speed(iter/s)": 0.148718 + }, + { + "acc": 0.66365714, + "epoch": 0.017224818838806735, + "grad_norm": 10.5625, + "learning_rate": 1.722402415837155e-06, + "loss": 1.48590469, + "memory(GiB)": 290.29, + "step": 3080, + "train_speed(iter/s)": 0.149101 + }, + { + "acc": 0.68702664, + "epoch": 0.017336668311786, + "grad_norm": 12.75, + "learning_rate": 1.7335868471088247e-06, + "loss": 1.3782671, + "memory(GiB)": 290.29, + "step": 3100, + "train_speed(iter/s)": 0.149468 + }, + { + "acc": 0.67254677, + "epoch": 0.017448517784765263, + "grad_norm": 6.0625, + "learning_rate": 1.7447712783804946e-06, + "loss": 1.4493083, + "memory(GiB)": 290.29, + "step": 3120, + "train_speed(iter/s)": 0.149803 + }, + { + "acc": 0.68112121, + "epoch": 0.017560367257744527, + "grad_norm": 6.375, + "learning_rate": 1.7559557096521643e-06, + "loss": 1.39084387, + "memory(GiB)": 290.29, + "step": 3140, + "train_speed(iter/s)": 0.15016 + }, + { + "acc": 0.66826301, + "epoch": 0.01767221673072379, + "grad_norm": 7.6875, + "learning_rate": 1.7671401409238342e-06, + "loss": 1.44456968, + "memory(GiB)": 290.29, + "step": 3160, + "train_speed(iter/s)": 0.15049 + }, + { + "acc": 0.66728535, + "epoch": 0.017784066203703056, + "grad_norm": 14.0625, + "learning_rate": 1.7783245721955039e-06, + "loss": 1.46878033, + "memory(GiB)": 290.29, + "step": 3180, + "train_speed(iter/s)": 0.150809 + }, + { + "acc": 0.66425257, + "epoch": 0.01789591567668232, + "grad_norm": 6.0, + "learning_rate": 1.789509003467174e-06, + "loss": 1.47839413, + "memory(GiB)": 290.29, + "step": 3200, + "train_speed(iter/s)": 0.151131 + }, + { + "acc": 0.67017713, + "epoch": 0.018007765149661587, + "grad_norm": 7.78125, + "learning_rate": 1.8006934347388436e-06, + "loss": 1.42613001, + "memory(GiB)": 290.29, + "step": 3220, + "train_speed(iter/s)": 0.151448 + }, + { + "acc": 0.6867455, + "epoch": 0.01811961462264085, + "grad_norm": 12.9375, + "learning_rate": 1.8118778660105135e-06, + "loss": 1.37087479, + "memory(GiB)": 290.29, + "step": 3240, + "train_speed(iter/s)": 0.151759 + }, + { + "acc": 0.67033343, + "epoch": 0.018231464095620115, + "grad_norm": 6.25, + "learning_rate": 1.8230622972821834e-06, + "loss": 1.42451515, + "memory(GiB)": 290.29, + "step": 3260, + "train_speed(iter/s)": 0.152049 + }, + { + "acc": 0.6635293, + "epoch": 0.01834331356859938, + "grad_norm": 12.0, + "learning_rate": 1.834246728553853e-06, + "loss": 1.44571238, + "memory(GiB)": 290.29, + "step": 3280, + "train_speed(iter/s)": 0.152356 + }, + { + "acc": 0.67704668, + "epoch": 0.018455163041578643, + "grad_norm": 6.375, + "learning_rate": 1.8454311598255232e-06, + "loss": 1.41338291, + "memory(GiB)": 290.29, + "step": 3300, + "train_speed(iter/s)": 0.152696 + }, + { + "acc": 0.66476521, + "epoch": 0.018567012514557907, + "grad_norm": 5.71875, + "learning_rate": 1.8566155910971929e-06, + "loss": 1.45074511, + "memory(GiB)": 290.29, + "step": 3320, + "train_speed(iter/s)": 0.153007 + }, + { + "acc": 0.687922, + "epoch": 0.01867886198753717, + "grad_norm": 10.375, + "learning_rate": 1.8678000223688627e-06, + "loss": 1.37456217, + "memory(GiB)": 290.29, + "step": 3340, + "train_speed(iter/s)": 0.153335 + }, + { + "acc": 0.69471083, + "epoch": 0.018790711460516436, + "grad_norm": 9.8125, + "learning_rate": 1.8789844536405324e-06, + "loss": 1.33320513, + "memory(GiB)": 290.29, + "step": 3360, + "train_speed(iter/s)": 0.153663 + }, + { + "acc": 0.68155198, + "epoch": 0.018902560933495703, + "grad_norm": 9.5625, + "learning_rate": 1.8901688849122023e-06, + "loss": 1.37779083, + "memory(GiB)": 290.29, + "step": 3380, + "train_speed(iter/s)": 0.153965 + }, + { + "acc": 0.66346803, + "epoch": 0.019014410406474967, + "grad_norm": 8.3125, + "learning_rate": 1.9013533161838722e-06, + "loss": 1.45506744, + "memory(GiB)": 299.96, + "step": 3400, + "train_speed(iter/s)": 0.154214 + }, + { + "acc": 0.67178092, + "epoch": 0.01912625987945423, + "grad_norm": 9.0, + "learning_rate": 1.912537747455542e-06, + "loss": 1.42029819, + "memory(GiB)": 299.96, + "step": 3420, + "train_speed(iter/s)": 0.15449 + }, + { + "acc": 0.66913314, + "epoch": 0.019238109352433495, + "grad_norm": 10.6875, + "learning_rate": 1.923722178727212e-06, + "loss": 1.45169621, + "memory(GiB)": 299.96, + "step": 3440, + "train_speed(iter/s)": 0.154773 + }, + { + "acc": 0.67692347, + "epoch": 0.01934995882541276, + "grad_norm": 9.5, + "learning_rate": 1.934906609998882e-06, + "loss": 1.41245909, + "memory(GiB)": 299.96, + "step": 3460, + "train_speed(iter/s)": 0.155072 + }, + { + "acc": 0.68862934, + "epoch": 0.019461808298392023, + "grad_norm": 6.4375, + "learning_rate": 1.9460910412705515e-06, + "loss": 1.36263428, + "memory(GiB)": 299.96, + "step": 3480, + "train_speed(iter/s)": 0.155377 + }, + { + "acc": 0.70098333, + "epoch": 0.019573657771371288, + "grad_norm": 7.875, + "learning_rate": 1.957275472542221e-06, + "loss": 1.30308514, + "memory(GiB)": 299.96, + "step": 3500, + "train_speed(iter/s)": 0.155677 + }, + { + "acc": 0.68238959, + "epoch": 0.01968550724435055, + "grad_norm": 8.9375, + "learning_rate": 1.9684599038138913e-06, + "loss": 1.36048393, + "memory(GiB)": 299.96, + "step": 3520, + "train_speed(iter/s)": 0.155939 + }, + { + "acc": 0.67958508, + "epoch": 0.01979735671732982, + "grad_norm": 7.40625, + "learning_rate": 1.979644335085561e-06, + "loss": 1.37644196, + "memory(GiB)": 299.96, + "step": 3540, + "train_speed(iter/s)": 0.156258 + }, + { + "acc": 0.68012323, + "epoch": 0.019909206190309083, + "grad_norm": 9.4375, + "learning_rate": 1.990828766357231e-06, + "loss": 1.39232159, + "memory(GiB)": 299.96, + "step": 3560, + "train_speed(iter/s)": 0.156527 + }, + { + "acc": 0.68527346, + "epoch": 0.020021055663288347, + "grad_norm": 5.96875, + "learning_rate": 2.0020131976289008e-06, + "loss": 1.36088076, + "memory(GiB)": 299.96, + "step": 3580, + "train_speed(iter/s)": 0.156828 + }, + { + "acc": 0.66753321, + "epoch": 0.02013290513626761, + "grad_norm": 4.90625, + "learning_rate": 2.0131976289005704e-06, + "loss": 1.41846199, + "memory(GiB)": 299.96, + "step": 3600, + "train_speed(iter/s)": 0.157128 + }, + { + "acc": 0.69892755, + "epoch": 0.020244754609246875, + "grad_norm": 6.78125, + "learning_rate": 2.0243820601722405e-06, + "loss": 1.2950489, + "memory(GiB)": 299.96, + "step": 3620, + "train_speed(iter/s)": 0.157415 + }, + { + "acc": 0.67916117, + "epoch": 0.02035660408222614, + "grad_norm": 27.375, + "learning_rate": 2.03556649144391e-06, + "loss": 1.3810874, + "memory(GiB)": 299.96, + "step": 3640, + "train_speed(iter/s)": 0.157717 + }, + { + "acc": 0.68130217, + "epoch": 0.020468453555205404, + "grad_norm": 7.34375, + "learning_rate": 2.0467509227155803e-06, + "loss": 1.40262394, + "memory(GiB)": 299.96, + "step": 3660, + "train_speed(iter/s)": 0.157993 + }, + { + "acc": 0.68750458, + "epoch": 0.020580303028184668, + "grad_norm": 10.875, + "learning_rate": 2.05793535398725e-06, + "loss": 1.33392563, + "memory(GiB)": 299.96, + "step": 3680, + "train_speed(iter/s)": 0.158257 + }, + { + "acc": 0.67534347, + "epoch": 0.020692152501163935, + "grad_norm": 8.0625, + "learning_rate": 2.0691197852589197e-06, + "loss": 1.41951828, + "memory(GiB)": 299.96, + "step": 3700, + "train_speed(iter/s)": 0.158534 + }, + { + "acc": 0.68633165, + "epoch": 0.0208040019741432, + "grad_norm": 9.5, + "learning_rate": 2.0803042165305893e-06, + "loss": 1.35614052, + "memory(GiB)": 299.96, + "step": 3720, + "train_speed(iter/s)": 0.158817 + }, + { + "acc": 0.67682786, + "epoch": 0.020915851447122463, + "grad_norm": 6.9375, + "learning_rate": 2.0914886478022594e-06, + "loss": 1.40566683, + "memory(GiB)": 299.96, + "step": 3740, + "train_speed(iter/s)": 0.159079 + }, + { + "acc": 0.67970886, + "epoch": 0.021027700920101727, + "grad_norm": 11.0, + "learning_rate": 2.102673079073929e-06, + "loss": 1.39759703, + "memory(GiB)": 299.96, + "step": 3760, + "train_speed(iter/s)": 0.159307 + }, + { + "acc": 0.68532929, + "epoch": 0.02113955039308099, + "grad_norm": 9.5, + "learning_rate": 2.113857510345599e-06, + "loss": 1.37196016, + "memory(GiB)": 299.96, + "step": 3780, + "train_speed(iter/s)": 0.159588 + }, + { + "acc": 0.67957702, + "epoch": 0.021251399866060255, + "grad_norm": 7.6875, + "learning_rate": 2.125041941617269e-06, + "loss": 1.39975967, + "memory(GiB)": 299.96, + "step": 3800, + "train_speed(iter/s)": 0.159852 + }, + { + "acc": 0.68124695, + "epoch": 0.02136324933903952, + "grad_norm": 4.5, + "learning_rate": 2.1362263728889386e-06, + "loss": 1.34626999, + "memory(GiB)": 299.96, + "step": 3820, + "train_speed(iter/s)": 0.160134 + }, + { + "acc": 0.68362536, + "epoch": 0.021475098812018784, + "grad_norm": 9.1875, + "learning_rate": 2.1474108041606087e-06, + "loss": 1.36006489, + "memory(GiB)": 299.96, + "step": 3840, + "train_speed(iter/s)": 0.160381 + }, + { + "acc": 0.68855357, + "epoch": 0.02158694828499805, + "grad_norm": 8.375, + "learning_rate": 2.1585952354322783e-06, + "loss": 1.35463476, + "memory(GiB)": 299.96, + "step": 3860, + "train_speed(iter/s)": 0.160634 + }, + { + "acc": 0.6894969, + "epoch": 0.021698797757977315, + "grad_norm": 4.90625, + "learning_rate": 2.1697796667039484e-06, + "loss": 1.35288305, + "memory(GiB)": 299.96, + "step": 3880, + "train_speed(iter/s)": 0.160874 + }, + { + "acc": 0.68387632, + "epoch": 0.02181064723095658, + "grad_norm": 8.4375, + "learning_rate": 2.180964097975618e-06, + "loss": 1.37347593, + "memory(GiB)": 299.96, + "step": 3900, + "train_speed(iter/s)": 0.161136 + }, + { + "acc": 0.67394943, + "epoch": 0.021922496703935843, + "grad_norm": 8.25, + "learning_rate": 2.1921485292472878e-06, + "loss": 1.41388807, + "memory(GiB)": 299.96, + "step": 3920, + "train_speed(iter/s)": 0.161397 + }, + { + "acc": 0.69448376, + "epoch": 0.022034346176915107, + "grad_norm": 12.75, + "learning_rate": 2.203332960518958e-06, + "loss": 1.305301, + "memory(GiB)": 299.96, + "step": 3940, + "train_speed(iter/s)": 0.161625 + }, + { + "acc": 0.67981653, + "epoch": 0.02214619564989437, + "grad_norm": 8.375, + "learning_rate": 2.2145173917906276e-06, + "loss": 1.39912653, + "memory(GiB)": 299.96, + "step": 3960, + "train_speed(iter/s)": 0.161903 + }, + { + "acc": 0.68650041, + "epoch": 0.022258045122873635, + "grad_norm": 9.5625, + "learning_rate": 2.2257018230622977e-06, + "loss": 1.33066559, + "memory(GiB)": 299.96, + "step": 3980, + "train_speed(iter/s)": 0.162136 + }, + { + "acc": 0.68774209, + "epoch": 0.0223698945958529, + "grad_norm": 8.6875, + "learning_rate": 2.2368862543339673e-06, + "loss": 1.35496454, + "memory(GiB)": 299.96, + "step": 4000, + "train_speed(iter/s)": 0.162411 + }, + { + "epoch": 0.0223698945958529, + "eval_acc": 0.6481784492394442, + "eval_loss": 1.3544570207595825, + "eval_runtime": 7550.6635, + "eval_samples_per_second": 9.97, + "eval_steps_per_second": 9.97, + "step": 4000 + }, + { + "acc": 0.68886814, + "epoch": 0.022481744068832167, + "grad_norm": 8.625, + "learning_rate": 2.2480706856056374e-06, + "loss": 1.34083099, + "memory(GiB)": 299.96, + "step": 4020, + "train_speed(iter/s)": 0.124174 + }, + { + "acc": 0.67941289, + "epoch": 0.02259359354181143, + "grad_norm": 7.4375, + "learning_rate": 2.259255116877307e-06, + "loss": 1.39886856, + "memory(GiB)": 299.96, + "step": 4040, + "train_speed(iter/s)": 0.124463 + }, + { + "acc": 0.68668785, + "epoch": 0.022705443014790695, + "grad_norm": 7.9375, + "learning_rate": 2.2704395481489768e-06, + "loss": 1.35355339, + "memory(GiB)": 299.96, + "step": 4060, + "train_speed(iter/s)": 0.12475 + }, + { + "acc": 0.69089704, + "epoch": 0.02281729248776996, + "grad_norm": 6.375, + "learning_rate": 2.2816239794206465e-06, + "loss": 1.33095722, + "memory(GiB)": 299.96, + "step": 4080, + "train_speed(iter/s)": 0.125028 + }, + { + "acc": 0.70504127, + "epoch": 0.022929141960749223, + "grad_norm": 9.375, + "learning_rate": 2.2928084106923166e-06, + "loss": 1.2874464, + "memory(GiB)": 299.96, + "step": 4100, + "train_speed(iter/s)": 0.125313 + }, + { + "acc": 0.69235506, + "epoch": 0.023040991433728487, + "grad_norm": 9.9375, + "learning_rate": 2.3039928419639862e-06, + "loss": 1.34391308, + "memory(GiB)": 299.96, + "step": 4120, + "train_speed(iter/s)": 0.125595 + }, + { + "acc": 0.69600945, + "epoch": 0.02315284090670775, + "grad_norm": 8.9375, + "learning_rate": 2.3151772732356563e-06, + "loss": 1.29547501, + "memory(GiB)": 299.96, + "step": 4140, + "train_speed(iter/s)": 0.125861 + }, + { + "acc": 0.69394193, + "epoch": 0.023264690379687016, + "grad_norm": 8.1875, + "learning_rate": 2.326361704507326e-06, + "loss": 1.29969301, + "memory(GiB)": 299.96, + "step": 4160, + "train_speed(iter/s)": 0.126151 + }, + { + "acc": 0.68654084, + "epoch": 0.023376539852666283, + "grad_norm": 7.96875, + "learning_rate": 2.3375461357789957e-06, + "loss": 1.32194796, + "memory(GiB)": 299.96, + "step": 4180, + "train_speed(iter/s)": 0.126422 + }, + { + "acc": 0.68294811, + "epoch": 0.023488389325645547, + "grad_norm": 8.125, + "learning_rate": 2.3487305670506658e-06, + "loss": 1.34275141, + "memory(GiB)": 299.96, + "step": 4200, + "train_speed(iter/s)": 0.126662 + }, + { + "acc": 0.68641458, + "epoch": 0.02360023879862481, + "grad_norm": 6.8125, + "learning_rate": 2.3599149983223355e-06, + "loss": 1.33121529, + "memory(GiB)": 299.96, + "step": 4220, + "train_speed(iter/s)": 0.126939 + }, + { + "acc": 0.69472761, + "epoch": 0.023712088271604075, + "grad_norm": 6.21875, + "learning_rate": 2.3710994295940056e-06, + "loss": 1.30999231, + "memory(GiB)": 299.96, + "step": 4240, + "train_speed(iter/s)": 0.127211 + }, + { + "acc": 0.69340644, + "epoch": 0.02382393774458334, + "grad_norm": 6.84375, + "learning_rate": 2.3822838608656752e-06, + "loss": 1.31115866, + "memory(GiB)": 299.96, + "step": 4260, + "train_speed(iter/s)": 0.127469 + }, + { + "acc": 0.7122817, + "epoch": 0.023935787217562603, + "grad_norm": 8.6875, + "learning_rate": 2.393468292137345e-06, + "loss": 1.24885988, + "memory(GiB)": 299.96, + "step": 4280, + "train_speed(iter/s)": 0.127738 + }, + { + "acc": 0.68979211, + "epoch": 0.024047636690541867, + "grad_norm": 5.09375, + "learning_rate": 2.4046527234090146e-06, + "loss": 1.35305738, + "memory(GiB)": 299.96, + "step": 4300, + "train_speed(iter/s)": 0.127969 + }, + { + "acc": 0.69826169, + "epoch": 0.024159486163521135, + "grad_norm": 8.8125, + "learning_rate": 2.4158371546806847e-06, + "loss": 1.28398743, + "memory(GiB)": 299.96, + "step": 4320, + "train_speed(iter/s)": 0.128236 + }, + { + "acc": 0.69100194, + "epoch": 0.0242713356365004, + "grad_norm": 7.71875, + "learning_rate": 2.4270215859523548e-06, + "loss": 1.32262383, + "memory(GiB)": 299.96, + "step": 4340, + "train_speed(iter/s)": 0.128501 + }, + { + "acc": 0.6821558, + "epoch": 0.024383185109479663, + "grad_norm": 5.21875, + "learning_rate": 2.4382060172240245e-06, + "loss": 1.34220972, + "memory(GiB)": 299.96, + "step": 4360, + "train_speed(iter/s)": 0.128749 + }, + { + "acc": 0.69088769, + "epoch": 0.024495034582458927, + "grad_norm": 11.875, + "learning_rate": 2.449390448495694e-06, + "loss": 1.31812057, + "memory(GiB)": 299.96, + "step": 4380, + "train_speed(iter/s)": 0.12903 + }, + { + "acc": 0.68420892, + "epoch": 0.02460688405543819, + "grad_norm": 5.53125, + "learning_rate": 2.460574879767364e-06, + "loss": 1.33080959, + "memory(GiB)": 299.96, + "step": 4400, + "train_speed(iter/s)": 0.129305 + }, + { + "acc": 0.69111071, + "epoch": 0.024718733528417455, + "grad_norm": 6.84375, + "learning_rate": 2.471759311039034e-06, + "loss": 1.31999044, + "memory(GiB)": 299.96, + "step": 4420, + "train_speed(iter/s)": 0.129567 + }, + { + "acc": 0.68334517, + "epoch": 0.02483058300139672, + "grad_norm": 8.0625, + "learning_rate": 2.4829437423107036e-06, + "loss": 1.34714146, + "memory(GiB)": 299.96, + "step": 4440, + "train_speed(iter/s)": 0.129825 + }, + { + "acc": 0.68805223, + "epoch": 0.024942432474375983, + "grad_norm": 9.3125, + "learning_rate": 2.4941281735823737e-06, + "loss": 1.34515553, + "memory(GiB)": 299.96, + "step": 4460, + "train_speed(iter/s)": 0.130098 + }, + { + "acc": 0.68447046, + "epoch": 0.02505428194735525, + "grad_norm": 10.125, + "learning_rate": 2.5053126048540434e-06, + "loss": 1.34630852, + "memory(GiB)": 299.96, + "step": 4480, + "train_speed(iter/s)": 0.130364 + }, + { + "acc": 0.70801988, + "epoch": 0.025166131420334515, + "grad_norm": 6.59375, + "learning_rate": 2.516497036125713e-06, + "loss": 1.22591887, + "memory(GiB)": 299.96, + "step": 4500, + "train_speed(iter/s)": 0.130627 + }, + { + "acc": 0.69342308, + "epoch": 0.02527798089331378, + "grad_norm": 9.5, + "learning_rate": 2.527681467397383e-06, + "loss": 1.28180914, + "memory(GiB)": 299.96, + "step": 4520, + "train_speed(iter/s)": 0.130893 + }, + { + "acc": 0.6876029, + "epoch": 0.025389830366293043, + "grad_norm": 7.28125, + "learning_rate": 2.538865898669053e-06, + "loss": 1.31932487, + "memory(GiB)": 299.96, + "step": 4540, + "train_speed(iter/s)": 0.131145 + }, + { + "acc": 0.68712096, + "epoch": 0.025501679839272307, + "grad_norm": 8.125, + "learning_rate": 2.5500503299407225e-06, + "loss": 1.31991224, + "memory(GiB)": 299.96, + "step": 4560, + "train_speed(iter/s)": 0.131404 + }, + { + "acc": 0.70164146, + "epoch": 0.02561352931225157, + "grad_norm": 8.125, + "learning_rate": 2.5612347612123926e-06, + "loss": 1.27022934, + "memory(GiB)": 299.96, + "step": 4580, + "train_speed(iter/s)": 0.13165 + }, + { + "acc": 0.67768741, + "epoch": 0.025725378785230835, + "grad_norm": 11.0, + "learning_rate": 2.5724191924840623e-06, + "loss": 1.35368414, + "memory(GiB)": 299.96, + "step": 4600, + "train_speed(iter/s)": 0.131882 + }, + { + "acc": 0.68467913, + "epoch": 0.0258372282582101, + "grad_norm": 9.125, + "learning_rate": 2.583603623755732e-06, + "loss": 1.35832968, + "memory(GiB)": 299.96, + "step": 4620, + "train_speed(iter/s)": 0.132143 + }, + { + "acc": 0.70328465, + "epoch": 0.025949077731189367, + "grad_norm": 10.25, + "learning_rate": 2.594788055027402e-06, + "loss": 1.27709141, + "memory(GiB)": 299.96, + "step": 4640, + "train_speed(iter/s)": 0.13239 + }, + { + "acc": 0.70355134, + "epoch": 0.02606092720416863, + "grad_norm": 7.3125, + "learning_rate": 2.605972486299072e-06, + "loss": 1.24102097, + "memory(GiB)": 299.96, + "step": 4660, + "train_speed(iter/s)": 0.132638 + }, + { + "acc": 0.69309363, + "epoch": 0.026172776677147895, + "grad_norm": 8.75, + "learning_rate": 2.617156917570742e-06, + "loss": 1.28119135, + "memory(GiB)": 299.96, + "step": 4680, + "train_speed(iter/s)": 0.132891 + }, + { + "acc": 0.67304955, + "epoch": 0.02628462615012716, + "grad_norm": 6.21875, + "learning_rate": 2.6283413488424115e-06, + "loss": 1.39861937, + "memory(GiB)": 299.96, + "step": 4700, + "train_speed(iter/s)": 0.133118 + }, + { + "acc": 0.67862487, + "epoch": 0.026396475623106423, + "grad_norm": 8.625, + "learning_rate": 2.6395257801140816e-06, + "loss": 1.36972752, + "memory(GiB)": 299.96, + "step": 4720, + "train_speed(iter/s)": 0.133346 + }, + { + "acc": 0.69055948, + "epoch": 0.026508325096085687, + "grad_norm": 10.1875, + "learning_rate": 2.6507102113857513e-06, + "loss": 1.3268652, + "memory(GiB)": 299.96, + "step": 4740, + "train_speed(iter/s)": 0.133574 + }, + { + "acc": 0.69883018, + "epoch": 0.02662017456906495, + "grad_norm": 8.1875, + "learning_rate": 2.661894642657421e-06, + "loss": 1.24371576, + "memory(GiB)": 299.96, + "step": 4760, + "train_speed(iter/s)": 0.133812 + }, + { + "acc": 0.68724952, + "epoch": 0.026732024042044215, + "grad_norm": 8.4375, + "learning_rate": 2.6730790739290906e-06, + "loss": 1.3078763, + "memory(GiB)": 299.96, + "step": 4780, + "train_speed(iter/s)": 0.134051 + }, + { + "acc": 0.68349452, + "epoch": 0.026843873515023483, + "grad_norm": 5.40625, + "learning_rate": 2.684263505200761e-06, + "loss": 1.32303047, + "memory(GiB)": 299.96, + "step": 4800, + "train_speed(iter/s)": 0.134299 + }, + { + "acc": 0.68524461, + "epoch": 0.026955722988002747, + "grad_norm": 10.5, + "learning_rate": 2.695447936472431e-06, + "loss": 1.34585638, + "memory(GiB)": 299.96, + "step": 4820, + "train_speed(iter/s)": 0.134524 + }, + { + "acc": 0.68065372, + "epoch": 0.02706757246098201, + "grad_norm": 13.4375, + "learning_rate": 2.7066323677441005e-06, + "loss": 1.34625864, + "memory(GiB)": 299.96, + "step": 4840, + "train_speed(iter/s)": 0.134766 + }, + { + "acc": 0.69705353, + "epoch": 0.027179421933961275, + "grad_norm": 6.25, + "learning_rate": 2.71781679901577e-06, + "loss": 1.27497568, + "memory(GiB)": 299.96, + "step": 4860, + "train_speed(iter/s)": 0.135005 + }, + { + "acc": 0.67712951, + "epoch": 0.02729127140694054, + "grad_norm": 10.6875, + "learning_rate": 2.7290012302874403e-06, + "loss": 1.37200508, + "memory(GiB)": 299.96, + "step": 4880, + "train_speed(iter/s)": 0.135235 + }, + { + "acc": 0.68595366, + "epoch": 0.027403120879919803, + "grad_norm": 7.5625, + "learning_rate": 2.74018566155911e-06, + "loss": 1.32296801, + "memory(GiB)": 299.96, + "step": 4900, + "train_speed(iter/s)": 0.135464 + }, + { + "acc": 0.68797412, + "epoch": 0.027514970352899067, + "grad_norm": 5.15625, + "learning_rate": 2.7513700928307796e-06, + "loss": 1.32502842, + "memory(GiB)": 299.96, + "step": 4920, + "train_speed(iter/s)": 0.13567 + }, + { + "acc": 0.68464012, + "epoch": 0.02762681982587833, + "grad_norm": 7.40625, + "learning_rate": 2.7625545241024497e-06, + "loss": 1.31793394, + "memory(GiB)": 299.96, + "step": 4940, + "train_speed(iter/s)": 0.135869 + }, + { + "acc": 0.69362102, + "epoch": 0.0277386692988576, + "grad_norm": 7.875, + "learning_rate": 2.7737389553741194e-06, + "loss": 1.28576975, + "memory(GiB)": 299.96, + "step": 4960, + "train_speed(iter/s)": 0.136099 + }, + { + "acc": 0.67565079, + "epoch": 0.027850518771836863, + "grad_norm": 7.75, + "learning_rate": 2.784923386645789e-06, + "loss": 1.35199518, + "memory(GiB)": 299.96, + "step": 4980, + "train_speed(iter/s)": 0.136322 + }, + { + "acc": 0.69477625, + "epoch": 0.027962368244816127, + "grad_norm": 10.625, + "learning_rate": 2.7961078179174587e-06, + "loss": 1.26597948, + "memory(GiB)": 299.96, + "step": 5000, + "train_speed(iter/s)": 0.136555 + }, + { + "acc": 0.70346498, + "epoch": 0.02807421771779539, + "grad_norm": 11.0, + "learning_rate": 2.8072922491891293e-06, + "loss": 1.22258205, + "memory(GiB)": 299.96, + "step": 5020, + "train_speed(iter/s)": 0.136766 + }, + { + "acc": 0.70328908, + "epoch": 0.028186067190774655, + "grad_norm": 6.34375, + "learning_rate": 2.818476680460799e-06, + "loss": 1.22252312, + "memory(GiB)": 299.96, + "step": 5040, + "train_speed(iter/s)": 0.136996 + }, + { + "acc": 0.67854705, + "epoch": 0.02829791666375392, + "grad_norm": 8.8125, + "learning_rate": 2.8296611117324686e-06, + "loss": 1.36602249, + "memory(GiB)": 299.96, + "step": 5060, + "train_speed(iter/s)": 0.137221 + }, + { + "acc": 0.6882278, + "epoch": 0.028409766136733183, + "grad_norm": 7.5625, + "learning_rate": 2.8408455430041387e-06, + "loss": 1.3138751, + "memory(GiB)": 299.96, + "step": 5080, + "train_speed(iter/s)": 0.137435 + }, + { + "acc": 0.69407358, + "epoch": 0.028521615609712447, + "grad_norm": 10.4375, + "learning_rate": 2.8520299742758084e-06, + "loss": 1.28161573, + "memory(GiB)": 299.96, + "step": 5100, + "train_speed(iter/s)": 0.137659 + }, + { + "acc": 0.68957224, + "epoch": 0.028633465082691715, + "grad_norm": 6.71875, + "learning_rate": 2.863214405547478e-06, + "loss": 1.31506119, + "memory(GiB)": 299.96, + "step": 5120, + "train_speed(iter/s)": 0.137867 + }, + { + "acc": 0.69116936, + "epoch": 0.02874531455567098, + "grad_norm": 9.25, + "learning_rate": 2.8743988368191477e-06, + "loss": 1.28844194, + "memory(GiB)": 299.96, + "step": 5140, + "train_speed(iter/s)": 0.138097 + }, + { + "acc": 0.68783331, + "epoch": 0.028857164028650243, + "grad_norm": 10.3125, + "learning_rate": 2.885583268090818e-06, + "loss": 1.3053071, + "memory(GiB)": 299.96, + "step": 5160, + "train_speed(iter/s)": 0.138319 + }, + { + "acc": 0.69163456, + "epoch": 0.028969013501629507, + "grad_norm": 9.0625, + "learning_rate": 2.8967676993624875e-06, + "loss": 1.30025587, + "memory(GiB)": 299.96, + "step": 5180, + "train_speed(iter/s)": 0.138545 + }, + { + "acc": 0.69504647, + "epoch": 0.02908086297460877, + "grad_norm": 7.65625, + "learning_rate": 2.907952130634157e-06, + "loss": 1.28130169, + "memory(GiB)": 299.96, + "step": 5200, + "train_speed(iter/s)": 0.138767 + }, + { + "acc": 0.6929781, + "epoch": 0.029192712447588035, + "grad_norm": 7.125, + "learning_rate": 2.919136561905827e-06, + "loss": 1.29453974, + "memory(GiB)": 299.96, + "step": 5220, + "train_speed(iter/s)": 0.138958 + }, + { + "acc": 0.67286849, + "epoch": 0.0293045619205673, + "grad_norm": 8.625, + "learning_rate": 2.9303209931774974e-06, + "loss": 1.36135206, + "memory(GiB)": 299.96, + "step": 5240, + "train_speed(iter/s)": 0.139184 + }, + { + "acc": 0.69861593, + "epoch": 0.029416411393546563, + "grad_norm": 8.375, + "learning_rate": 2.941505424449167e-06, + "loss": 1.24977732, + "memory(GiB)": 299.96, + "step": 5260, + "train_speed(iter/s)": 0.139395 + }, + { + "acc": 0.69084496, + "epoch": 0.02952826086652583, + "grad_norm": 7.9375, + "learning_rate": 2.9526898557208367e-06, + "loss": 1.28593378, + "memory(GiB)": 299.96, + "step": 5280, + "train_speed(iter/s)": 0.139594 + }, + { + "acc": 0.70396729, + "epoch": 0.029640110339505095, + "grad_norm": 7.28125, + "learning_rate": 2.963874286992507e-06, + "loss": 1.2426156, + "memory(GiB)": 299.96, + "step": 5300, + "train_speed(iter/s)": 0.139819 + }, + { + "acc": 0.69568729, + "epoch": 0.02975195981248436, + "grad_norm": 10.4375, + "learning_rate": 2.9750587182641765e-06, + "loss": 1.29894409, + "memory(GiB)": 299.96, + "step": 5320, + "train_speed(iter/s)": 0.140001 + }, + { + "acc": 0.69814277, + "epoch": 0.029863809285463623, + "grad_norm": 10.75, + "learning_rate": 2.986243149535846e-06, + "loss": 1.2781373, + "memory(GiB)": 299.96, + "step": 5340, + "train_speed(iter/s)": 0.140206 + }, + { + "acc": 0.68409343, + "epoch": 0.029975658758442887, + "grad_norm": 5.8125, + "learning_rate": 2.997427580807516e-06, + "loss": 1.33238964, + "memory(GiB)": 299.96, + "step": 5360, + "train_speed(iter/s)": 0.140414 + }, + { + "acc": 0.68315778, + "epoch": 0.03008750823142215, + "grad_norm": 6.21875, + "learning_rate": 3.0086120120791864e-06, + "loss": 1.33707972, + "memory(GiB)": 299.96, + "step": 5380, + "train_speed(iter/s)": 0.140621 + }, + { + "acc": 0.71432581, + "epoch": 0.030199357704401415, + "grad_norm": 9.875, + "learning_rate": 3.019796443350856e-06, + "loss": 1.18237848, + "memory(GiB)": 299.96, + "step": 5400, + "train_speed(iter/s)": 0.140809 + }, + { + "acc": 0.70025606, + "epoch": 0.030311207177380683, + "grad_norm": 10.25, + "learning_rate": 3.0309808746225257e-06, + "loss": 1.23965006, + "memory(GiB)": 299.96, + "step": 5420, + "train_speed(iter/s)": 0.141022 + }, + { + "acc": 0.69956613, + "epoch": 0.030423056650359947, + "grad_norm": 12.125, + "learning_rate": 3.042165305894196e-06, + "loss": 1.27348223, + "memory(GiB)": 299.96, + "step": 5440, + "train_speed(iter/s)": 0.141211 + }, + { + "acc": 0.69109888, + "epoch": 0.03053490612333921, + "grad_norm": 11.8125, + "learning_rate": 3.0533497371658655e-06, + "loss": 1.30505838, + "memory(GiB)": 299.96, + "step": 5460, + "train_speed(iter/s)": 0.141425 + }, + { + "acc": 0.69277043, + "epoch": 0.030646755596318475, + "grad_norm": 9.9375, + "learning_rate": 3.064534168437535e-06, + "loss": 1.26328497, + "memory(GiB)": 299.96, + "step": 5480, + "train_speed(iter/s)": 0.14163 + }, + { + "acc": 0.69358521, + "epoch": 0.03075860506929774, + "grad_norm": 8.5, + "learning_rate": 3.075718599709205e-06, + "loss": 1.27252808, + "memory(GiB)": 299.96, + "step": 5500, + "train_speed(iter/s)": 0.141826 + }, + { + "acc": 0.70457883, + "epoch": 0.030870454542277003, + "grad_norm": 7.8125, + "learning_rate": 3.086903030980875e-06, + "loss": 1.23339787, + "memory(GiB)": 299.96, + "step": 5520, + "train_speed(iter/s)": 0.142021 + }, + { + "acc": 0.69403229, + "epoch": 0.030982304015256267, + "grad_norm": 6.0625, + "learning_rate": 3.0980874622525446e-06, + "loss": 1.28979139, + "memory(GiB)": 299.96, + "step": 5540, + "train_speed(iter/s)": 0.142231 + }, + { + "acc": 0.68734226, + "epoch": 0.03109415348823553, + "grad_norm": 8.1875, + "learning_rate": 3.1092718935242143e-06, + "loss": 1.30503159, + "memory(GiB)": 299.96, + "step": 5560, + "train_speed(iter/s)": 0.142425 + }, + { + "acc": 0.68839374, + "epoch": 0.0312060029612148, + "grad_norm": 9.125, + "learning_rate": 3.120456324795884e-06, + "loss": 1.30096016, + "memory(GiB)": 299.96, + "step": 5580, + "train_speed(iter/s)": 0.142636 + }, + { + "acc": 0.68720975, + "epoch": 0.03131785243419406, + "grad_norm": 7.65625, + "learning_rate": 3.1316407560675545e-06, + "loss": 1.29481916, + "memory(GiB)": 299.96, + "step": 5600, + "train_speed(iter/s)": 0.142799 + }, + { + "acc": 0.70191712, + "epoch": 0.03142970190717333, + "grad_norm": 8.125, + "learning_rate": 3.142825187339224e-06, + "loss": 1.2349349, + "memory(GiB)": 299.96, + "step": 5620, + "train_speed(iter/s)": 0.142995 + }, + { + "acc": 0.68026571, + "epoch": 0.03154155138015259, + "grad_norm": 6.21875, + "learning_rate": 3.154009618610894e-06, + "loss": 1.33640976, + "memory(GiB)": 299.96, + "step": 5640, + "train_speed(iter/s)": 0.143197 + }, + { + "acc": 0.68841114, + "epoch": 0.031653400853131855, + "grad_norm": 6.5, + "learning_rate": 3.165194049882564e-06, + "loss": 1.31951389, + "memory(GiB)": 299.96, + "step": 5660, + "train_speed(iter/s)": 0.143397 + }, + { + "acc": 0.68958673, + "epoch": 0.03176525032611112, + "grad_norm": 8.875, + "learning_rate": 3.1763784811542336e-06, + "loss": 1.29531746, + "memory(GiB)": 299.96, + "step": 5680, + "train_speed(iter/s)": 0.143587 + }, + { + "acc": 0.69374647, + "epoch": 0.03187709979909038, + "grad_norm": 12.9375, + "learning_rate": 3.1875629124259033e-06, + "loss": 1.2677372, + "memory(GiB)": 299.96, + "step": 5700, + "train_speed(iter/s)": 0.143784 + }, + { + "acc": 0.688868, + "epoch": 0.03198894927206965, + "grad_norm": 6.90625, + "learning_rate": 3.198747343697573e-06, + "loss": 1.29217758, + "memory(GiB)": 299.96, + "step": 5720, + "train_speed(iter/s)": 0.143986 + }, + { + "acc": 0.6966608, + "epoch": 0.03210079874504891, + "grad_norm": 8.9375, + "learning_rate": 3.209931774969243e-06, + "loss": 1.25201511, + "memory(GiB)": 299.96, + "step": 5740, + "train_speed(iter/s)": 0.144167 + }, + { + "acc": 0.68761301, + "epoch": 0.032212648218028175, + "grad_norm": 8.6875, + "learning_rate": 3.2211162062409128e-06, + "loss": 1.3108551, + "memory(GiB)": 299.96, + "step": 5760, + "train_speed(iter/s)": 0.144372 + }, + { + "acc": 0.67681975, + "epoch": 0.03232449769100744, + "grad_norm": 5.0625, + "learning_rate": 3.2323006375125825e-06, + "loss": 1.3617589, + "memory(GiB)": 299.96, + "step": 5780, + "train_speed(iter/s)": 0.144569 + }, + { + "acc": 0.69641619, + "epoch": 0.032436347163986703, + "grad_norm": 8.0625, + "learning_rate": 3.243485068784253e-06, + "loss": 1.24260159, + "memory(GiB)": 299.96, + "step": 5800, + "train_speed(iter/s)": 0.144739 + }, + { + "acc": 0.70628524, + "epoch": 0.032548196636965974, + "grad_norm": 7.75, + "learning_rate": 3.2546695000559226e-06, + "loss": 1.21430006, + "memory(GiB)": 299.96, + "step": 5820, + "train_speed(iter/s)": 0.144934 + }, + { + "acc": 0.69151616, + "epoch": 0.03266004610994524, + "grad_norm": 12.625, + "learning_rate": 3.2658539313275923e-06, + "loss": 1.29387016, + "memory(GiB)": 299.96, + "step": 5840, + "train_speed(iter/s)": 0.145119 + }, + { + "acc": 0.68770604, + "epoch": 0.0327718955829245, + "grad_norm": 17.125, + "learning_rate": 3.277038362599262e-06, + "loss": 1.32930212, + "memory(GiB)": 299.96, + "step": 5860, + "train_speed(iter/s)": 0.145309 + }, + { + "acc": 0.68294587, + "epoch": 0.03288374505590377, + "grad_norm": 9.0, + "learning_rate": 3.288222793870932e-06, + "loss": 1.31583271, + "memory(GiB)": 299.96, + "step": 5880, + "train_speed(iter/s)": 0.145496 + }, + { + "acc": 0.68233027, + "epoch": 0.03299559452888303, + "grad_norm": 8.0, + "learning_rate": 3.2994072251426018e-06, + "loss": 1.3198287, + "memory(GiB)": 299.96, + "step": 5900, + "train_speed(iter/s)": 0.145696 + }, + { + "acc": 0.70167365, + "epoch": 0.033107444001862295, + "grad_norm": 5.8125, + "learning_rate": 3.3105916564142715e-06, + "loss": 1.25117893, + "memory(GiB)": 299.96, + "step": 5920, + "train_speed(iter/s)": 0.145878 + }, + { + "acc": 0.70082707, + "epoch": 0.03321929347484156, + "grad_norm": 25.75, + "learning_rate": 3.321776087685941e-06, + "loss": 1.24856768, + "memory(GiB)": 299.96, + "step": 5940, + "train_speed(iter/s)": 0.146078 + }, + { + "acc": 0.68880157, + "epoch": 0.03333114294782082, + "grad_norm": 7.5625, + "learning_rate": 3.3329605189576112e-06, + "loss": 1.3083787, + "memory(GiB)": 299.96, + "step": 5960, + "train_speed(iter/s)": 0.146269 + }, + { + "acc": 0.69487114, + "epoch": 0.03344299242080009, + "grad_norm": 8.1875, + "learning_rate": 3.344144950229281e-06, + "loss": 1.27169971, + "memory(GiB)": 299.96, + "step": 5980, + "train_speed(iter/s)": 0.146455 + }, + { + "acc": 0.69404774, + "epoch": 0.03355484189377935, + "grad_norm": 11.0625, + "learning_rate": 3.3553293815009506e-06, + "loss": 1.25364704, + "memory(GiB)": 299.96, + "step": 6000, + "train_speed(iter/s)": 0.14665 + }, + { + "epoch": 0.03355484189377935, + "eval_acc": 0.6573367075997602, + "eval_loss": 1.2642945051193237, + "eval_runtime": 7538.1093, + "eval_samples_per_second": 9.987, + "eval_steps_per_second": 9.987, + "step": 6000 + }, + { + "acc": 0.69745555, + "epoch": 0.033666691366758615, + "grad_norm": 6.84375, + "learning_rate": 3.366513812772621e-06, + "loss": 1.27758036, + "memory(GiB)": 299.96, + "step": 6020, + "train_speed(iter/s)": 0.123699 + }, + { + "acc": 0.69113073, + "epoch": 0.03377854083973788, + "grad_norm": 7.3125, + "learning_rate": 3.3776982440442908e-06, + "loss": 1.28778305, + "memory(GiB)": 299.96, + "step": 6040, + "train_speed(iter/s)": 0.12389 + }, + { + "acc": 0.70142093, + "epoch": 0.03389039031271714, + "grad_norm": 9.625, + "learning_rate": 3.3888826753159605e-06, + "loss": 1.22980804, + "memory(GiB)": 299.96, + "step": 6060, + "train_speed(iter/s)": 0.124096 + }, + { + "acc": 0.70942097, + "epoch": 0.03400223978569641, + "grad_norm": 11.5, + "learning_rate": 3.40006710658763e-06, + "loss": 1.23368292, + "memory(GiB)": 299.96, + "step": 6080, + "train_speed(iter/s)": 0.124282 + }, + { + "acc": 0.69330921, + "epoch": 0.03411408925867567, + "grad_norm": 5.40625, + "learning_rate": 3.4112515378593002e-06, + "loss": 1.27146235, + "memory(GiB)": 299.96, + "step": 6100, + "train_speed(iter/s)": 0.124463 + }, + { + "acc": 0.71228552, + "epoch": 0.034225938731654935, + "grad_norm": 8.5, + "learning_rate": 3.42243596913097e-06, + "loss": 1.2062891, + "memory(GiB)": 299.96, + "step": 6120, + "train_speed(iter/s)": 0.124655 + }, + { + "acc": 0.69759626, + "epoch": 0.034337788204634206, + "grad_norm": 6.375, + "learning_rate": 3.4336204004026396e-06, + "loss": 1.26463671, + "memory(GiB)": 299.96, + "step": 6140, + "train_speed(iter/s)": 0.124837 + }, + { + "acc": 0.7009747, + "epoch": 0.03444963767761347, + "grad_norm": 9.625, + "learning_rate": 3.44480483167431e-06, + "loss": 1.22737312, + "memory(GiB)": 299.96, + "step": 6160, + "train_speed(iter/s)": 0.125043 + }, + { + "acc": 0.70773563, + "epoch": 0.034561487150592735, + "grad_norm": 6.53125, + "learning_rate": 3.4559892629459798e-06, + "loss": 1.20572586, + "memory(GiB)": 299.96, + "step": 6180, + "train_speed(iter/s)": 0.125237 + }, + { + "acc": 0.67832236, + "epoch": 0.034673336623572, + "grad_norm": 11.0, + "learning_rate": 3.4671736942176495e-06, + "loss": 1.37432384, + "memory(GiB)": 299.96, + "step": 6200, + "train_speed(iter/s)": 0.125431 + }, + { + "acc": 0.67474532, + "epoch": 0.03478518609655126, + "grad_norm": 8.0625, + "learning_rate": 3.478358125489319e-06, + "loss": 1.3766242, + "memory(GiB)": 299.96, + "step": 6220, + "train_speed(iter/s)": 0.125617 + }, + { + "acc": 0.71049042, + "epoch": 0.03489703556953053, + "grad_norm": 5.40625, + "learning_rate": 3.4895425567609892e-06, + "loss": 1.2075511, + "memory(GiB)": 299.96, + "step": 6240, + "train_speed(iter/s)": 0.125808 + }, + { + "acc": 0.69929047, + "epoch": 0.03500888504250979, + "grad_norm": 8.8125, + "learning_rate": 3.500726988032659e-06, + "loss": 1.2579236, + "memory(GiB)": 299.96, + "step": 6260, + "train_speed(iter/s)": 0.125988 + }, + { + "acc": 0.70747099, + "epoch": 0.035120734515489055, + "grad_norm": 7.6875, + "learning_rate": 3.5119114193043286e-06, + "loss": 1.22723188, + "memory(GiB)": 299.96, + "step": 6280, + "train_speed(iter/s)": 0.126181 + }, + { + "acc": 0.70400677, + "epoch": 0.03523258398846832, + "grad_norm": 5.84375, + "learning_rate": 3.5230958505759983e-06, + "loss": 1.21356316, + "memory(GiB)": 299.96, + "step": 6300, + "train_speed(iter/s)": 0.126371 + }, + { + "acc": 0.70155888, + "epoch": 0.03534443346144758, + "grad_norm": 8.6875, + "learning_rate": 3.5342802818476684e-06, + "loss": 1.25314245, + "memory(GiB)": 299.96, + "step": 6320, + "train_speed(iter/s)": 0.126562 + }, + { + "acc": 0.68176646, + "epoch": 0.03545628293442685, + "grad_norm": 12.1875, + "learning_rate": 3.545464713119338e-06, + "loss": 1.35281706, + "memory(GiB)": 299.96, + "step": 6340, + "train_speed(iter/s)": 0.12675 + }, + { + "acc": 0.69240775, + "epoch": 0.03556813240740611, + "grad_norm": 8.4375, + "learning_rate": 3.5566491443910077e-06, + "loss": 1.27116394, + "memory(GiB)": 299.96, + "step": 6360, + "train_speed(iter/s)": 0.126931 + }, + { + "acc": 0.71172829, + "epoch": 0.035679981880385375, + "grad_norm": 9.75, + "learning_rate": 3.5678335756626782e-06, + "loss": 1.20649509, + "memory(GiB)": 299.96, + "step": 6380, + "train_speed(iter/s)": 0.127114 + }, + { + "acc": 0.69041657, + "epoch": 0.03579183135336464, + "grad_norm": 10.3125, + "learning_rate": 3.579018006934348e-06, + "loss": 1.27706575, + "memory(GiB)": 299.96, + "step": 6400, + "train_speed(iter/s)": 0.127303 + }, + { + "acc": 0.70414524, + "epoch": 0.0359036808263439, + "grad_norm": 7.53125, + "learning_rate": 3.5902024382060176e-06, + "loss": 1.23561878, + "memory(GiB)": 299.96, + "step": 6420, + "train_speed(iter/s)": 0.127482 + }, + { + "acc": 0.70626245, + "epoch": 0.036015530299323174, + "grad_norm": 13.1875, + "learning_rate": 3.6013868694776873e-06, + "loss": 1.22725172, + "memory(GiB)": 299.96, + "step": 6440, + "train_speed(iter/s)": 0.127659 + }, + { + "acc": 0.69713902, + "epoch": 0.03612737977230244, + "grad_norm": 7.1875, + "learning_rate": 3.6125713007493574e-06, + "loss": 1.25829725, + "memory(GiB)": 299.96, + "step": 6460, + "train_speed(iter/s)": 0.127843 + }, + { + "acc": 0.715731, + "epoch": 0.0362392292452817, + "grad_norm": 8.875, + "learning_rate": 3.623755732021027e-06, + "loss": 1.16742182, + "memory(GiB)": 299.96, + "step": 6480, + "train_speed(iter/s)": 0.128028 + }, + { + "acc": 0.69602847, + "epoch": 0.03635107871826097, + "grad_norm": 9.3125, + "learning_rate": 3.6349401632926967e-06, + "loss": 1.26894941, + "memory(GiB)": 299.96, + "step": 6500, + "train_speed(iter/s)": 0.128204 + }, + { + "acc": 0.69042954, + "epoch": 0.03646292819124023, + "grad_norm": 6.96875, + "learning_rate": 3.646124594564367e-06, + "loss": 1.28149567, + "memory(GiB)": 299.96, + "step": 6520, + "train_speed(iter/s)": 0.128391 + }, + { + "acc": 0.69896331, + "epoch": 0.036574777664219495, + "grad_norm": 7.25, + "learning_rate": 3.6573090258360365e-06, + "loss": 1.25114012, + "memory(GiB)": 299.96, + "step": 6540, + "train_speed(iter/s)": 0.128565 + }, + { + "acc": 0.70565891, + "epoch": 0.03668662713719876, + "grad_norm": 5.5, + "learning_rate": 3.668493457107706e-06, + "loss": 1.21310558, + "memory(GiB)": 299.96, + "step": 6560, + "train_speed(iter/s)": 0.12875 + }, + { + "acc": 0.71111445, + "epoch": 0.03679847661017802, + "grad_norm": 9.5625, + "learning_rate": 3.679677888379376e-06, + "loss": 1.18430653, + "memory(GiB)": 299.96, + "step": 6580, + "train_speed(iter/s)": 0.128944 + }, + { + "acc": 0.70745111, + "epoch": 0.03691032608315729, + "grad_norm": 6.78125, + "learning_rate": 3.6908623196510464e-06, + "loss": 1.20464001, + "memory(GiB)": 299.96, + "step": 6600, + "train_speed(iter/s)": 0.129122 + }, + { + "acc": 0.67487931, + "epoch": 0.03702217555613655, + "grad_norm": 7.28125, + "learning_rate": 3.702046750922716e-06, + "loss": 1.34559088, + "memory(GiB)": 299.96, + "step": 6620, + "train_speed(iter/s)": 0.129299 + }, + { + "acc": 0.71356711, + "epoch": 0.037134025029115815, + "grad_norm": 5.34375, + "learning_rate": 3.7132311821943857e-06, + "loss": 1.18074837, + "memory(GiB)": 299.96, + "step": 6640, + "train_speed(iter/s)": 0.129467 + }, + { + "acc": 0.6973103, + "epoch": 0.03724587450209508, + "grad_norm": 9.0625, + "learning_rate": 3.7244156134660554e-06, + "loss": 1.25229635, + "memory(GiB)": 299.96, + "step": 6660, + "train_speed(iter/s)": 0.129638 + }, + { + "acc": 0.69925103, + "epoch": 0.03735772397507434, + "grad_norm": 9.3125, + "learning_rate": 3.7356000447377255e-06, + "loss": 1.24358654, + "memory(GiB)": 299.96, + "step": 6680, + "train_speed(iter/s)": 0.129814 + }, + { + "acc": 0.70522757, + "epoch": 0.03746957344805361, + "grad_norm": 11.0625, + "learning_rate": 3.746784476009395e-06, + "loss": 1.22373991, + "memory(GiB)": 299.96, + "step": 6700, + "train_speed(iter/s)": 0.129988 + }, + { + "acc": 0.70148463, + "epoch": 0.03758142292103287, + "grad_norm": 7.125, + "learning_rate": 3.757968907281065e-06, + "loss": 1.24187565, + "memory(GiB)": 299.96, + "step": 6720, + "train_speed(iter/s)": 0.130169 + }, + { + "acc": 0.70574355, + "epoch": 0.037693272394012135, + "grad_norm": 5.71875, + "learning_rate": 3.769153338552735e-06, + "loss": 1.22177219, + "memory(GiB)": 299.96, + "step": 6740, + "train_speed(iter/s)": 0.130337 + }, + { + "acc": 0.69209089, + "epoch": 0.037805121866991406, + "grad_norm": 9.9375, + "learning_rate": 3.7803377698244046e-06, + "loss": 1.2703784, + "memory(GiB)": 299.96, + "step": 6760, + "train_speed(iter/s)": 0.13051 + }, + { + "acc": 0.69865832, + "epoch": 0.03791697133997067, + "grad_norm": 7.65625, + "learning_rate": 3.7915222010960743e-06, + "loss": 1.22150545, + "memory(GiB)": 299.96, + "step": 6780, + "train_speed(iter/s)": 0.130692 + }, + { + "acc": 0.71764379, + "epoch": 0.038028820812949934, + "grad_norm": 7.34375, + "learning_rate": 3.8027066323677444e-06, + "loss": 1.17465353, + "memory(GiB)": 299.96, + "step": 6800, + "train_speed(iter/s)": 0.130859 + }, + { + "acc": 0.69969592, + "epoch": 0.0381406702859292, + "grad_norm": 7.5, + "learning_rate": 3.8138910636394145e-06, + "loss": 1.22767534, + "memory(GiB)": 299.96, + "step": 6820, + "train_speed(iter/s)": 0.131031 + }, + { + "acc": 0.70163269, + "epoch": 0.03825251975890846, + "grad_norm": 5.9375, + "learning_rate": 3.825075494911084e-06, + "loss": 1.23657169, + "memory(GiB)": 299.96, + "step": 6840, + "train_speed(iter/s)": 0.131202 + }, + { + "acc": 0.69279647, + "epoch": 0.03836436923188773, + "grad_norm": 8.375, + "learning_rate": 3.836259926182754e-06, + "loss": 1.26947632, + "memory(GiB)": 299.96, + "step": 6860, + "train_speed(iter/s)": 0.131358 + }, + { + "acc": 0.69216809, + "epoch": 0.03847621870486699, + "grad_norm": 8.3125, + "learning_rate": 3.847444357454424e-06, + "loss": 1.29820795, + "memory(GiB)": 299.96, + "step": 6880, + "train_speed(iter/s)": 0.13152 + }, + { + "acc": 0.68960514, + "epoch": 0.038588068177846255, + "grad_norm": 6.0, + "learning_rate": 3.858628788726094e-06, + "loss": 1.28499689, + "memory(GiB)": 299.96, + "step": 6900, + "train_speed(iter/s)": 0.131696 + }, + { + "acc": 0.69815893, + "epoch": 0.03869991765082552, + "grad_norm": 8.25, + "learning_rate": 3.869813219997764e-06, + "loss": 1.2242775, + "memory(GiB)": 299.96, + "step": 6920, + "train_speed(iter/s)": 0.13187 + }, + { + "acc": 0.70575776, + "epoch": 0.03881176712380478, + "grad_norm": 7.5, + "learning_rate": 3.880997651269433e-06, + "loss": 1.21684628, + "memory(GiB)": 299.96, + "step": 6940, + "train_speed(iter/s)": 0.132042 + }, + { + "acc": 0.6945087, + "epoch": 0.03892361659678405, + "grad_norm": 8.4375, + "learning_rate": 3.892182082541103e-06, + "loss": 1.26287527, + "memory(GiB)": 299.96, + "step": 6960, + "train_speed(iter/s)": 0.132194 + }, + { + "acc": 0.70593686, + "epoch": 0.03903546606976331, + "grad_norm": 6.5, + "learning_rate": 3.903366513812773e-06, + "loss": 1.19278955, + "memory(GiB)": 299.96, + "step": 6980, + "train_speed(iter/s)": 0.132357 + }, + { + "acc": 0.70223999, + "epoch": 0.039147315542742575, + "grad_norm": 4.625, + "learning_rate": 3.914550945084442e-06, + "loss": 1.21508904, + "memory(GiB)": 299.96, + "step": 7000, + "train_speed(iter/s)": 0.132528 + }, + { + "acc": 0.71035171, + "epoch": 0.03925916501572184, + "grad_norm": 12.25, + "learning_rate": 3.925735376356112e-06, + "loss": 1.19803991, + "memory(GiB)": 299.96, + "step": 7020, + "train_speed(iter/s)": 0.132689 + }, + { + "acc": 0.70247474, + "epoch": 0.0393710144887011, + "grad_norm": 7.71875, + "learning_rate": 3.936919807627783e-06, + "loss": 1.22758875, + "memory(GiB)": 299.96, + "step": 7040, + "train_speed(iter/s)": 0.132857 + }, + { + "acc": 0.71605873, + "epoch": 0.03948286396168037, + "grad_norm": 6.3125, + "learning_rate": 3.948104238899452e-06, + "loss": 1.16495066, + "memory(GiB)": 299.96, + "step": 7060, + "train_speed(iter/s)": 0.133005 + }, + { + "acc": 0.7124846, + "epoch": 0.03959471343465964, + "grad_norm": 7.65625, + "learning_rate": 3.959288670171122e-06, + "loss": 1.19125128, + "memory(GiB)": 299.96, + "step": 7080, + "train_speed(iter/s)": 0.133165 + }, + { + "acc": 0.70222058, + "epoch": 0.0397065629076389, + "grad_norm": 9.5625, + "learning_rate": 3.9704731014427925e-06, + "loss": 1.19918232, + "memory(GiB)": 299.96, + "step": 7100, + "train_speed(iter/s)": 0.133338 + }, + { + "acc": 0.68964567, + "epoch": 0.039818412380618166, + "grad_norm": 5.5, + "learning_rate": 3.981657532714462e-06, + "loss": 1.2980463, + "memory(GiB)": 299.96, + "step": 7120, + "train_speed(iter/s)": 0.133502 + }, + { + "acc": 0.70630131, + "epoch": 0.03993026185359743, + "grad_norm": 9.4375, + "learning_rate": 3.992841963986132e-06, + "loss": 1.21218805, + "memory(GiB)": 299.96, + "step": 7140, + "train_speed(iter/s)": 0.133668 + }, + { + "acc": 0.71587505, + "epoch": 0.040042111326576695, + "grad_norm": 5.53125, + "learning_rate": 4.0040263952578015e-06, + "loss": 1.15985918, + "memory(GiB)": 299.96, + "step": 7160, + "train_speed(iter/s)": 0.133831 + }, + { + "acc": 0.70518961, + "epoch": 0.04015396079955596, + "grad_norm": 7.21875, + "learning_rate": 4.015210826529471e-06, + "loss": 1.18980179, + "memory(GiB)": 299.96, + "step": 7180, + "train_speed(iter/s)": 0.133991 + }, + { + "acc": 0.69957795, + "epoch": 0.04026581027253522, + "grad_norm": 7.53125, + "learning_rate": 4.026395257801141e-06, + "loss": 1.23768797, + "memory(GiB)": 299.96, + "step": 7200, + "train_speed(iter/s)": 0.134156 + }, + { + "acc": 0.68764291, + "epoch": 0.04037765974551449, + "grad_norm": 11.875, + "learning_rate": 4.0375796890728105e-06, + "loss": 1.3108676, + "memory(GiB)": 299.96, + "step": 7220, + "train_speed(iter/s)": 0.134314 + }, + { + "acc": 0.70144787, + "epoch": 0.04048950921849375, + "grad_norm": 9.9375, + "learning_rate": 4.048764120344481e-06, + "loss": 1.21935062, + "memory(GiB)": 299.96, + "step": 7240, + "train_speed(iter/s)": 0.134469 + }, + { + "acc": 0.68923512, + "epoch": 0.040601358691473015, + "grad_norm": 6.9375, + "learning_rate": 4.059948551616151e-06, + "loss": 1.29810095, + "memory(GiB)": 299.96, + "step": 7260, + "train_speed(iter/s)": 0.134632 + }, + { + "acc": 0.70795674, + "epoch": 0.04071320816445228, + "grad_norm": 7.28125, + "learning_rate": 4.07113298288782e-06, + "loss": 1.18594685, + "memory(GiB)": 299.96, + "step": 7280, + "train_speed(iter/s)": 0.134778 + }, + { + "acc": 0.70635767, + "epoch": 0.04082505763743154, + "grad_norm": 9.0, + "learning_rate": 4.08231741415949e-06, + "loss": 1.18763676, + "memory(GiB)": 299.96, + "step": 7300, + "train_speed(iter/s)": 0.134939 + }, + { + "acc": 0.68808603, + "epoch": 0.04093690711041081, + "grad_norm": 12.5, + "learning_rate": 4.093501845431161e-06, + "loss": 1.29522591, + "memory(GiB)": 299.96, + "step": 7320, + "train_speed(iter/s)": 0.135082 + }, + { + "acc": 0.71375742, + "epoch": 0.04104875658339007, + "grad_norm": 12.125, + "learning_rate": 4.10468627670283e-06, + "loss": 1.17727642, + "memory(GiB)": 299.96, + "step": 7340, + "train_speed(iter/s)": 0.135225 + }, + { + "acc": 0.71107769, + "epoch": 0.041160606056369335, + "grad_norm": 7.78125, + "learning_rate": 4.1158707079745e-06, + "loss": 1.18100548, + "memory(GiB)": 299.96, + "step": 7360, + "train_speed(iter/s)": 0.13539 + }, + { + "acc": 0.71181645, + "epoch": 0.0412724555293486, + "grad_norm": 7.78125, + "learning_rate": 4.12705513924617e-06, + "loss": 1.1692873, + "memory(GiB)": 299.96, + "step": 7380, + "train_speed(iter/s)": 0.135544 + }, + { + "acc": 0.69877038, + "epoch": 0.04138430500232787, + "grad_norm": 9.125, + "learning_rate": 4.138239570517839e-06, + "loss": 1.20091486, + "memory(GiB)": 299.96, + "step": 7400, + "train_speed(iter/s)": 0.135704 + }, + { + "acc": 0.70792713, + "epoch": 0.041496154475307134, + "grad_norm": 11.625, + "learning_rate": 4.149424001789509e-06, + "loss": 1.18862925, + "memory(GiB)": 299.96, + "step": 7420, + "train_speed(iter/s)": 0.135863 + }, + { + "acc": 0.69706702, + "epoch": 0.0416080039482864, + "grad_norm": 6.6875, + "learning_rate": 4.160608433061179e-06, + "loss": 1.23282757, + "memory(GiB)": 299.96, + "step": 7440, + "train_speed(iter/s)": 0.136016 + }, + { + "acc": 0.72261968, + "epoch": 0.04171985342126566, + "grad_norm": 7.90625, + "learning_rate": 4.171792864332849e-06, + "loss": 1.13858299, + "memory(GiB)": 299.96, + "step": 7460, + "train_speed(iter/s)": 0.136168 + }, + { + "acc": 0.71092887, + "epoch": 0.041831702894244926, + "grad_norm": 6.15625, + "learning_rate": 4.182977295604519e-06, + "loss": 1.18183117, + "memory(GiB)": 299.96, + "step": 7480, + "train_speed(iter/s)": 0.136307 + }, + { + "acc": 0.69046779, + "epoch": 0.04194355236722419, + "grad_norm": 6.8125, + "learning_rate": 4.1941617268761885e-06, + "loss": 1.27194643, + "memory(GiB)": 299.96, + "step": 7500, + "train_speed(iter/s)": 0.136459 + }, + { + "acc": 0.7155684, + "epoch": 0.042055401840203455, + "grad_norm": 4.78125, + "learning_rate": 4.205346158147858e-06, + "loss": 1.16250811, + "memory(GiB)": 299.96, + "step": 7520, + "train_speed(iter/s)": 0.136623 + }, + { + "acc": 0.70854707, + "epoch": 0.04216725131318272, + "grad_norm": 9.25, + "learning_rate": 4.216530589419529e-06, + "loss": 1.18692608, + "memory(GiB)": 299.96, + "step": 7540, + "train_speed(iter/s)": 0.136771 + }, + { + "acc": 0.69293294, + "epoch": 0.04227910078616198, + "grad_norm": 6.46875, + "learning_rate": 4.227715020691198e-06, + "loss": 1.27415323, + "memory(GiB)": 299.96, + "step": 7560, + "train_speed(iter/s)": 0.136922 + }, + { + "acc": 0.70375018, + "epoch": 0.04239095025914125, + "grad_norm": 4.375, + "learning_rate": 4.238899451962868e-06, + "loss": 1.21499357, + "memory(GiB)": 299.96, + "step": 7580, + "train_speed(iter/s)": 0.137072 + }, + { + "acc": 0.72877827, + "epoch": 0.04250279973212051, + "grad_norm": 9.375, + "learning_rate": 4.250083883234538e-06, + "loss": 1.10786896, + "memory(GiB)": 299.96, + "step": 7600, + "train_speed(iter/s)": 0.137237 + }, + { + "acc": 0.69303865, + "epoch": 0.042614649205099775, + "grad_norm": 6.875, + "learning_rate": 4.2612683145062074e-06, + "loss": 1.24345512, + "memory(GiB)": 299.96, + "step": 7620, + "train_speed(iter/s)": 0.137386 + }, + { + "acc": 0.69093533, + "epoch": 0.04272649867807904, + "grad_norm": 6.78125, + "learning_rate": 4.272452745777877e-06, + "loss": 1.24031868, + "memory(GiB)": 299.96, + "step": 7640, + "train_speed(iter/s)": 0.137531 + }, + { + "acc": 0.71431093, + "epoch": 0.0428383481510583, + "grad_norm": 7.8125, + "learning_rate": 4.283637177049547e-06, + "loss": 1.15177011, + "memory(GiB)": 299.96, + "step": 7660, + "train_speed(iter/s)": 0.137689 + }, + { + "acc": 0.70323915, + "epoch": 0.04295019762403757, + "grad_norm": 10.25, + "learning_rate": 4.294821608321217e-06, + "loss": 1.22024612, + "memory(GiB)": 299.96, + "step": 7680, + "train_speed(iter/s)": 0.137826 + }, + { + "acc": 0.7012301, + "epoch": 0.04306204709701684, + "grad_norm": 9.6875, + "learning_rate": 4.306006039592887e-06, + "loss": 1.22580414, + "memory(GiB)": 299.96, + "step": 7700, + "train_speed(iter/s)": 0.137978 + }, + { + "acc": 0.72469444, + "epoch": 0.0431738965699961, + "grad_norm": 7.90625, + "learning_rate": 4.317190470864557e-06, + "loss": 1.12455044, + "memory(GiB)": 299.96, + "step": 7720, + "train_speed(iter/s)": 0.138131 + }, + { + "acc": 0.711377, + "epoch": 0.043285746042975366, + "grad_norm": 7.59375, + "learning_rate": 4.328374902136226e-06, + "loss": 1.18950481, + "memory(GiB)": 299.96, + "step": 7740, + "train_speed(iter/s)": 0.138279 + }, + { + "acc": 0.7028501, + "epoch": 0.04339759551595463, + "grad_norm": 5.3125, + "learning_rate": 4.339559333407897e-06, + "loss": 1.21593523, + "memory(GiB)": 299.96, + "step": 7760, + "train_speed(iter/s)": 0.138415 + }, + { + "acc": 0.71457825, + "epoch": 0.043509444988933894, + "grad_norm": 6.90625, + "learning_rate": 4.3507437646795665e-06, + "loss": 1.15739098, + "memory(GiB)": 299.96, + "step": 7780, + "train_speed(iter/s)": 0.138556 + }, + { + "acc": 0.69856639, + "epoch": 0.04362129446191316, + "grad_norm": 8.125, + "learning_rate": 4.361928195951236e-06, + "loss": 1.22739077, + "memory(GiB)": 299.96, + "step": 7800, + "train_speed(iter/s)": 0.138709 + }, + { + "acc": 0.69641247, + "epoch": 0.04373314393489242, + "grad_norm": 7.15625, + "learning_rate": 4.373112627222906e-06, + "loss": 1.22867804, + "memory(GiB)": 299.96, + "step": 7820, + "train_speed(iter/s)": 0.13886 + }, + { + "acc": 0.7070868, + "epoch": 0.04384499340787169, + "grad_norm": 8.5625, + "learning_rate": 4.3842970584945756e-06, + "loss": 1.18715067, + "memory(GiB)": 299.96, + "step": 7840, + "train_speed(iter/s)": 0.139002 + }, + { + "acc": 0.70544319, + "epoch": 0.04395684288085095, + "grad_norm": 5.75, + "learning_rate": 4.395481489766245e-06, + "loss": 1.18339024, + "memory(GiB)": 299.96, + "step": 7860, + "train_speed(iter/s)": 0.139152 + }, + { + "acc": 0.69881463, + "epoch": 0.044068692353830215, + "grad_norm": 4.5, + "learning_rate": 4.406665921037916e-06, + "loss": 1.21598816, + "memory(GiB)": 299.96, + "step": 7880, + "train_speed(iter/s)": 0.139298 + }, + { + "acc": 0.70186267, + "epoch": 0.04418054182680948, + "grad_norm": 3.84375, + "learning_rate": 4.4178503523095854e-06, + "loss": 1.22739248, + "memory(GiB)": 299.96, + "step": 7900, + "train_speed(iter/s)": 0.139429 + }, + { + "acc": 0.71705661, + "epoch": 0.04429239129978874, + "grad_norm": 9.25, + "learning_rate": 4.429034783581255e-06, + "loss": 1.13609715, + "memory(GiB)": 299.96, + "step": 7920, + "train_speed(iter/s)": 0.139584 + }, + { + "acc": 0.70096426, + "epoch": 0.04440424077276801, + "grad_norm": 6.78125, + "learning_rate": 4.440219214852925e-06, + "loss": 1.21562319, + "memory(GiB)": 299.96, + "step": 7940, + "train_speed(iter/s)": 0.139705 + }, + { + "acc": 0.70926747, + "epoch": 0.04451609024574727, + "grad_norm": 7.3125, + "learning_rate": 4.451403646124595e-06, + "loss": 1.17608509, + "memory(GiB)": 299.96, + "step": 7960, + "train_speed(iter/s)": 0.139855 + }, + { + "acc": 0.70355439, + "epoch": 0.044627939718726535, + "grad_norm": 6.40625, + "learning_rate": 4.462588077396265e-06, + "loss": 1.22489872, + "memory(GiB)": 299.96, + "step": 7980, + "train_speed(iter/s)": 0.139995 + }, + { + "acc": 0.70355358, + "epoch": 0.0447397891917058, + "grad_norm": 6.5, + "learning_rate": 4.473772508667935e-06, + "loss": 1.21390238, + "memory(GiB)": 299.96, + "step": 8000, + "train_speed(iter/s)": 0.140136 + }, + { + "epoch": 0.0447397891917058, + "eval_acc": 0.6658882704114083, + "eval_loss": 1.184854507446289, + "eval_runtime": 7569.6761, + "eval_samples_per_second": 9.945, + "eval_steps_per_second": 9.945, + "step": 8000 + }, + { + "acc": 0.7265492, + "epoch": 0.04485163866468507, + "grad_norm": 5.5, + "learning_rate": 4.484956939939604e-06, + "loss": 1.10444517, + "memory(GiB)": 299.96, + "step": 8020, + "train_speed(iter/s)": 0.123626 + }, + { + "acc": 0.71580286, + "epoch": 0.044963488137664334, + "grad_norm": 7.46875, + "learning_rate": 4.496141371211275e-06, + "loss": 1.14411573, + "memory(GiB)": 299.96, + "step": 8040, + "train_speed(iter/s)": 0.12377 + }, + { + "acc": 0.71480856, + "epoch": 0.0450753376106436, + "grad_norm": 9.5625, + "learning_rate": 4.5073258024829445e-06, + "loss": 1.1627923, + "memory(GiB)": 299.96, + "step": 8060, + "train_speed(iter/s)": 0.123901 + }, + { + "acc": 0.71156192, + "epoch": 0.04518718708362286, + "grad_norm": 7.375, + "learning_rate": 4.518510233754614e-06, + "loss": 1.16428194, + "memory(GiB)": 299.96, + "step": 8080, + "train_speed(iter/s)": 0.124043 + }, + { + "acc": 0.72391062, + "epoch": 0.045299036556602126, + "grad_norm": 6.46875, + "learning_rate": 4.529694665026284e-06, + "loss": 1.10136986, + "memory(GiB)": 299.96, + "step": 8100, + "train_speed(iter/s)": 0.124187 + }, + { + "acc": 0.71176424, + "epoch": 0.04541088602958139, + "grad_norm": 9.9375, + "learning_rate": 4.5408790962979536e-06, + "loss": 1.1815136, + "memory(GiB)": 299.96, + "step": 8120, + "train_speed(iter/s)": 0.124323 + }, + { + "acc": 0.70186563, + "epoch": 0.045522735502560654, + "grad_norm": 8.8125, + "learning_rate": 4.552063527569623e-06, + "loss": 1.21250572, + "memory(GiB)": 299.96, + "step": 8140, + "train_speed(iter/s)": 0.124459 + }, + { + "acc": 0.71728415, + "epoch": 0.04563458497553992, + "grad_norm": 5.6875, + "learning_rate": 4.563247958841293e-06, + "loss": 1.12473421, + "memory(GiB)": 299.96, + "step": 8160, + "train_speed(iter/s)": 0.124603 + }, + { + "acc": 0.71562667, + "epoch": 0.04574643444851918, + "grad_norm": 9.625, + "learning_rate": 4.5744323901129634e-06, + "loss": 1.14854412, + "memory(GiB)": 299.96, + "step": 8180, + "train_speed(iter/s)": 0.124743 + }, + { + "acc": 0.72000227, + "epoch": 0.04585828392149845, + "grad_norm": 9.4375, + "learning_rate": 4.585616821384633e-06, + "loss": 1.12738686, + "memory(GiB)": 299.96, + "step": 8200, + "train_speed(iter/s)": 0.124886 + }, + { + "acc": 0.70442591, + "epoch": 0.04597013339447771, + "grad_norm": 4.40625, + "learning_rate": 4.596801252656303e-06, + "loss": 1.20427332, + "memory(GiB)": 299.96, + "step": 8220, + "train_speed(iter/s)": 0.125021 + }, + { + "acc": 0.7223166, + "epoch": 0.046081982867456975, + "grad_norm": 5.0, + "learning_rate": 4.6079856839279725e-06, + "loss": 1.11271152, + "memory(GiB)": 299.96, + "step": 8240, + "train_speed(iter/s)": 0.125165 + }, + { + "acc": 0.70923843, + "epoch": 0.04619383234043624, + "grad_norm": 7.6875, + "learning_rate": 4.619170115199643e-06, + "loss": 1.16711712, + "memory(GiB)": 299.96, + "step": 8260, + "train_speed(iter/s)": 0.125309 + }, + { + "acc": 0.72162576, + "epoch": 0.0463056818134155, + "grad_norm": 7.625, + "learning_rate": 4.630354546471313e-06, + "loss": 1.13841963, + "memory(GiB)": 299.96, + "step": 8280, + "train_speed(iter/s)": 0.125438 + }, + { + "acc": 0.70153084, + "epoch": 0.04641753128639477, + "grad_norm": 5.40625, + "learning_rate": 4.641538977742982e-06, + "loss": 1.22743244, + "memory(GiB)": 299.96, + "step": 8300, + "train_speed(iter/s)": 0.125573 + }, + { + "acc": 0.72693429, + "epoch": 0.04652938075937403, + "grad_norm": 10.5, + "learning_rate": 4.652723409014652e-06, + "loss": 1.0751524, + "memory(GiB)": 299.96, + "step": 8320, + "train_speed(iter/s)": 0.125704 + }, + { + "acc": 0.71548591, + "epoch": 0.0466412302323533, + "grad_norm": 7.6875, + "learning_rate": 4.663907840286322e-06, + "loss": 1.12775517, + "memory(GiB)": 299.96, + "step": 8340, + "train_speed(iter/s)": 0.125841 + }, + { + "acc": 0.69200506, + "epoch": 0.046753079705332566, + "grad_norm": 9.875, + "learning_rate": 4.675092271557991e-06, + "loss": 1.25838022, + "memory(GiB)": 299.96, + "step": 8360, + "train_speed(iter/s)": 0.125977 + }, + { + "acc": 0.70475807, + "epoch": 0.04686492917831183, + "grad_norm": 6.53125, + "learning_rate": 4.686276702829661e-06, + "loss": 1.21798868, + "memory(GiB)": 299.96, + "step": 8380, + "train_speed(iter/s)": 0.126122 + }, + { + "acc": 0.70152016, + "epoch": 0.046976778651291094, + "grad_norm": 7.5625, + "learning_rate": 4.6974611341013316e-06, + "loss": 1.21419878, + "memory(GiB)": 299.96, + "step": 8400, + "train_speed(iter/s)": 0.126262 + }, + { + "acc": 0.70790968, + "epoch": 0.04708862812427036, + "grad_norm": 10.5625, + "learning_rate": 4.708645565373001e-06, + "loss": 1.16316605, + "memory(GiB)": 299.96, + "step": 8420, + "train_speed(iter/s)": 0.126401 + }, + { + "acc": 0.71770396, + "epoch": 0.04720047759724962, + "grad_norm": 10.75, + "learning_rate": 4.719829996644671e-06, + "loss": 1.12028332, + "memory(GiB)": 299.96, + "step": 8440, + "train_speed(iter/s)": 0.126543 + }, + { + "acc": 0.7141974, + "epoch": 0.047312327070228886, + "grad_norm": 10.125, + "learning_rate": 4.731014427916341e-06, + "loss": 1.13812943, + "memory(GiB)": 299.96, + "step": 8460, + "train_speed(iter/s)": 0.126686 + }, + { + "acc": 0.7204987, + "epoch": 0.04742417654320815, + "grad_norm": 6.09375, + "learning_rate": 4.742198859188011e-06, + "loss": 1.11063251, + "memory(GiB)": 299.96, + "step": 8480, + "train_speed(iter/s)": 0.126833 + }, + { + "acc": 0.71245494, + "epoch": 0.047536026016187415, + "grad_norm": 9.1875, + "learning_rate": 4.753383290459681e-06, + "loss": 1.17724104, + "memory(GiB)": 299.96, + "step": 8500, + "train_speed(iter/s)": 0.126966 + }, + { + "acc": 0.70417562, + "epoch": 0.04764787548916668, + "grad_norm": 4.84375, + "learning_rate": 4.7645677217313505e-06, + "loss": 1.19397373, + "memory(GiB)": 299.96, + "step": 8520, + "train_speed(iter/s)": 0.127101 + }, + { + "acc": 0.69817128, + "epoch": 0.04775972496214594, + "grad_norm": 6.0, + "learning_rate": 4.77575215300302e-06, + "loss": 1.24930525, + "memory(GiB)": 299.96, + "step": 8540, + "train_speed(iter/s)": 0.127243 + }, + { + "acc": 0.70625525, + "epoch": 0.04787157443512521, + "grad_norm": 12.3125, + "learning_rate": 4.78693658427469e-06, + "loss": 1.19688358, + "memory(GiB)": 299.96, + "step": 8560, + "train_speed(iter/s)": 0.12738 + }, + { + "acc": 0.70244207, + "epoch": 0.04798342390810447, + "grad_norm": 6.5625, + "learning_rate": 4.7981210155463595e-06, + "loss": 1.19333096, + "memory(GiB)": 299.96, + "step": 8580, + "train_speed(iter/s)": 0.12751 + }, + { + "acc": 0.71265764, + "epoch": 0.048095273381083735, + "grad_norm": 7.9375, + "learning_rate": 4.809305446818029e-06, + "loss": 1.17247057, + "memory(GiB)": 299.96, + "step": 8600, + "train_speed(iter/s)": 0.127637 + }, + { + "acc": 0.69840655, + "epoch": 0.048207122854063, + "grad_norm": 9.4375, + "learning_rate": 4.8204898780897e-06, + "loss": 1.23635654, + "memory(GiB)": 299.96, + "step": 8620, + "train_speed(iter/s)": 0.12777 + }, + { + "acc": 0.69401207, + "epoch": 0.04831897232704227, + "grad_norm": 8.75, + "learning_rate": 4.831674309361369e-06, + "loss": 1.24210711, + "memory(GiB)": 299.96, + "step": 8640, + "train_speed(iter/s)": 0.1279 + }, + { + "acc": 0.70990448, + "epoch": 0.048430821800021534, + "grad_norm": 8.625, + "learning_rate": 4.842858740633039e-06, + "loss": 1.17969246, + "memory(GiB)": 299.96, + "step": 8660, + "train_speed(iter/s)": 0.128032 + }, + { + "acc": 0.72505236, + "epoch": 0.0485426712730008, + "grad_norm": 9.4375, + "learning_rate": 4.8540431719047096e-06, + "loss": 1.09325104, + "memory(GiB)": 299.96, + "step": 8680, + "train_speed(iter/s)": 0.128177 + }, + { + "acc": 0.70079322, + "epoch": 0.04865452074598006, + "grad_norm": 4.5625, + "learning_rate": 4.865227603176379e-06, + "loss": 1.22333364, + "memory(GiB)": 299.96, + "step": 8700, + "train_speed(iter/s)": 0.128311 + }, + { + "acc": 0.709232, + "epoch": 0.048766370218959326, + "grad_norm": 6.71875, + "learning_rate": 4.876412034448049e-06, + "loss": 1.17358265, + "memory(GiB)": 299.96, + "step": 8720, + "train_speed(iter/s)": 0.128442 + }, + { + "acc": 0.70477204, + "epoch": 0.04887821969193859, + "grad_norm": 9.75, + "learning_rate": 4.887596465719719e-06, + "loss": 1.18707151, + "memory(GiB)": 299.96, + "step": 8740, + "train_speed(iter/s)": 0.128586 + }, + { + "acc": 0.70427413, + "epoch": 0.048990069164917854, + "grad_norm": 5.90625, + "learning_rate": 4.898780896991388e-06, + "loss": 1.19930954, + "memory(GiB)": 299.96, + "step": 8760, + "train_speed(iter/s)": 0.128711 + }, + { + "acc": 0.72945166, + "epoch": 0.04910191863789712, + "grad_norm": 7.65625, + "learning_rate": 4.909965328263058e-06, + "loss": 1.09345999, + "memory(GiB)": 299.96, + "step": 8780, + "train_speed(iter/s)": 0.128836 + }, + { + "acc": 0.70682173, + "epoch": 0.04921376811087638, + "grad_norm": 8.1875, + "learning_rate": 4.921149759534728e-06, + "loss": 1.19012861, + "memory(GiB)": 299.96, + "step": 8800, + "train_speed(iter/s)": 0.128975 + }, + { + "acc": 0.70946302, + "epoch": 0.04932561758385565, + "grad_norm": 6.125, + "learning_rate": 4.932334190806397e-06, + "loss": 1.16786041, + "memory(GiB)": 299.96, + "step": 8820, + "train_speed(iter/s)": 0.129099 + }, + { + "acc": 0.7223022, + "epoch": 0.04943746705683491, + "grad_norm": 4.8125, + "learning_rate": 4.943518622078068e-06, + "loss": 1.11797819, + "memory(GiB)": 299.96, + "step": 8840, + "train_speed(iter/s)": 0.129214 + }, + { + "acc": 0.70740409, + "epoch": 0.049549316529814175, + "grad_norm": 10.3125, + "learning_rate": 4.9547030533497375e-06, + "loss": 1.19851341, + "memory(GiB)": 299.96, + "step": 8860, + "train_speed(iter/s)": 0.129338 + }, + { + "acc": 0.70422297, + "epoch": 0.04966116600279344, + "grad_norm": 6.46875, + "learning_rate": 4.965887484621407e-06, + "loss": 1.21236019, + "memory(GiB)": 299.96, + "step": 8880, + "train_speed(iter/s)": 0.129465 + }, + { + "acc": 0.69122219, + "epoch": 0.0497730154757727, + "grad_norm": 12.0, + "learning_rate": 4.977071915893078e-06, + "loss": 1.26360512, + "memory(GiB)": 299.96, + "step": 8900, + "train_speed(iter/s)": 0.129602 + }, + { + "acc": 0.69969778, + "epoch": 0.04988486494875197, + "grad_norm": 6.28125, + "learning_rate": 4.988256347164747e-06, + "loss": 1.21888866, + "memory(GiB)": 299.96, + "step": 8920, + "train_speed(iter/s)": 0.129737 + }, + { + "acc": 0.72713814, + "epoch": 0.04999671442173123, + "grad_norm": 5.5, + "learning_rate": 4.999440778436417e-06, + "loss": 1.07195892, + "memory(GiB)": 299.96, + "step": 8940, + "train_speed(iter/s)": 0.12987 + }, + { + "acc": 0.70497022, + "epoch": 0.0501085638947105, + "grad_norm": 10.625, + "learning_rate": 5.010625209708087e-06, + "loss": 1.19557695, + "memory(GiB)": 299.96, + "step": 8960, + "train_speed(iter/s)": 0.129995 + }, + { + "acc": 0.69554772, + "epoch": 0.050220413367689766, + "grad_norm": 8.875, + "learning_rate": 5.021809640979756e-06, + "loss": 1.22863503, + "memory(GiB)": 299.96, + "step": 8980, + "train_speed(iter/s)": 0.130121 + }, + { + "acc": 0.71280522, + "epoch": 0.05033226284066903, + "grad_norm": 5.59375, + "learning_rate": 5.032994072251426e-06, + "loss": 1.16610546, + "memory(GiB)": 299.96, + "step": 9000, + "train_speed(iter/s)": 0.130247 + }, + { + "acc": 0.71536026, + "epoch": 0.050444112313648294, + "grad_norm": 7.5625, + "learning_rate": 5.044178503523096e-06, + "loss": 1.16557169, + "memory(GiB)": 299.96, + "step": 9020, + "train_speed(iter/s)": 0.130368 + }, + { + "acc": 0.71387568, + "epoch": 0.05055596178662756, + "grad_norm": 8.25, + "learning_rate": 5.055362934794766e-06, + "loss": 1.15323029, + "memory(GiB)": 299.96, + "step": 9040, + "train_speed(iter/s)": 0.130495 + }, + { + "acc": 0.71153097, + "epoch": 0.05066781125960682, + "grad_norm": 5.90625, + "learning_rate": 5.066547366066435e-06, + "loss": 1.18382444, + "memory(GiB)": 299.96, + "step": 9060, + "train_speed(iter/s)": 0.130611 + }, + { + "acc": 0.70371294, + "epoch": 0.050779660732586086, + "grad_norm": 7.9375, + "learning_rate": 5.077731797338106e-06, + "loss": 1.22373562, + "memory(GiB)": 299.96, + "step": 9080, + "train_speed(iter/s)": 0.130737 + }, + { + "acc": 0.71678371, + "epoch": 0.05089151020556535, + "grad_norm": 6.71875, + "learning_rate": 5.088916228609776e-06, + "loss": 1.15099764, + "memory(GiB)": 299.96, + "step": 9100, + "train_speed(iter/s)": 0.130867 + }, + { + "acc": 0.71786375, + "epoch": 0.051003359678544614, + "grad_norm": 10.5625, + "learning_rate": 5.100100659881445e-06, + "loss": 1.1326128, + "memory(GiB)": 299.96, + "step": 9120, + "train_speed(iter/s)": 0.130977 + }, + { + "acc": 0.6979527, + "epoch": 0.05111520915152388, + "grad_norm": 6.8125, + "learning_rate": 5.1112850911531155e-06, + "loss": 1.22531328, + "memory(GiB)": 299.96, + "step": 9140, + "train_speed(iter/s)": 0.131104 + }, + { + "acc": 0.71337256, + "epoch": 0.05122705862450314, + "grad_norm": 9.5, + "learning_rate": 5.122469522424785e-06, + "loss": 1.14899397, + "memory(GiB)": 299.96, + "step": 9160, + "train_speed(iter/s)": 0.131227 + }, + { + "acc": 0.71599259, + "epoch": 0.05133890809748241, + "grad_norm": 9.0625, + "learning_rate": 5.133653953696455e-06, + "loss": 1.15710392, + "memory(GiB)": 299.96, + "step": 9180, + "train_speed(iter/s)": 0.131356 + }, + { + "acc": 0.7069881, + "epoch": 0.05145075757046167, + "grad_norm": 4.90625, + "learning_rate": 5.1448383849681245e-06, + "loss": 1.16104679, + "memory(GiB)": 299.96, + "step": 9200, + "train_speed(iter/s)": 0.131479 + }, + { + "acc": 0.71886282, + "epoch": 0.051562607043440935, + "grad_norm": 7.15625, + "learning_rate": 5.156022816239795e-06, + "loss": 1.12356787, + "memory(GiB)": 299.96, + "step": 9220, + "train_speed(iter/s)": 0.131609 + }, + { + "acc": 0.72133718, + "epoch": 0.0516744565164202, + "grad_norm": 6.25, + "learning_rate": 5.167207247511464e-06, + "loss": 1.11381378, + "memory(GiB)": 299.96, + "step": 9240, + "train_speed(iter/s)": 0.131739 + }, + { + "acc": 0.69137135, + "epoch": 0.05178630598939946, + "grad_norm": 9.4375, + "learning_rate": 5.178391678783134e-06, + "loss": 1.25587215, + "memory(GiB)": 299.96, + "step": 9260, + "train_speed(iter/s)": 0.131863 + }, + { + "acc": 0.70421348, + "epoch": 0.051898155462378734, + "grad_norm": 9.625, + "learning_rate": 5.189576110054804e-06, + "loss": 1.1985384, + "memory(GiB)": 299.96, + "step": 9280, + "train_speed(iter/s)": 0.131981 + }, + { + "acc": 0.70958829, + "epoch": 0.052010004935358, + "grad_norm": 5.15625, + "learning_rate": 5.200760541326474e-06, + "loss": 1.19264317, + "memory(GiB)": 299.96, + "step": 9300, + "train_speed(iter/s)": 0.132104 + }, + { + "acc": 0.70784302, + "epoch": 0.05212185440833726, + "grad_norm": 5.375, + "learning_rate": 5.211944972598144e-06, + "loss": 1.18570957, + "memory(GiB)": 299.96, + "step": 9320, + "train_speed(iter/s)": 0.132231 + }, + { + "acc": 0.69248877, + "epoch": 0.052233703881316526, + "grad_norm": 5.34375, + "learning_rate": 5.223129403869813e-06, + "loss": 1.25045004, + "memory(GiB)": 299.96, + "step": 9340, + "train_speed(iter/s)": 0.132341 + }, + { + "acc": 0.71304617, + "epoch": 0.05234555335429579, + "grad_norm": 6.21875, + "learning_rate": 5.234313835141484e-06, + "loss": 1.16476479, + "memory(GiB)": 299.96, + "step": 9360, + "train_speed(iter/s)": 0.132462 + }, + { + "acc": 0.72526808, + "epoch": 0.052457402827275054, + "grad_norm": 7.0625, + "learning_rate": 5.245498266413154e-06, + "loss": 1.10406342, + "memory(GiB)": 299.96, + "step": 9380, + "train_speed(iter/s)": 0.132579 + }, + { + "acc": 0.6963306, + "epoch": 0.05256925230025432, + "grad_norm": 7.3125, + "learning_rate": 5.256682697684823e-06, + "loss": 1.21857586, + "memory(GiB)": 299.96, + "step": 9400, + "train_speed(iter/s)": 0.132709 + }, + { + "acc": 0.72719784, + "epoch": 0.05268110177323358, + "grad_norm": 8.25, + "learning_rate": 5.2678671289564935e-06, + "loss": 1.07270069, + "memory(GiB)": 299.96, + "step": 9420, + "train_speed(iter/s)": 0.132824 + }, + { + "acc": 0.70962424, + "epoch": 0.052792951246212846, + "grad_norm": 7.28125, + "learning_rate": 5.279051560228163e-06, + "loss": 1.17336626, + "memory(GiB)": 299.96, + "step": 9440, + "train_speed(iter/s)": 0.132929 + }, + { + "acc": 0.73624983, + "epoch": 0.05290480071919211, + "grad_norm": 8.875, + "learning_rate": 5.290235991499833e-06, + "loss": 1.072999, + "memory(GiB)": 299.96, + "step": 9460, + "train_speed(iter/s)": 0.133051 + }, + { + "acc": 0.70687284, + "epoch": 0.053016650192171375, + "grad_norm": 7.125, + "learning_rate": 5.3014204227715025e-06, + "loss": 1.19799271, + "memory(GiB)": 299.96, + "step": 9480, + "train_speed(iter/s)": 0.133162 + }, + { + "acc": 0.71825976, + "epoch": 0.05312849966515064, + "grad_norm": 7.34375, + "learning_rate": 5.312604854043172e-06, + "loss": 1.12469406, + "memory(GiB)": 299.96, + "step": 9500, + "train_speed(iter/s)": 0.133288 + }, + { + "acc": 0.7305407, + "epoch": 0.0532403491381299, + "grad_norm": 7.09375, + "learning_rate": 5.323789285314842e-06, + "loss": 1.06359444, + "memory(GiB)": 299.96, + "step": 9520, + "train_speed(iter/s)": 0.133401 + }, + { + "acc": 0.70872116, + "epoch": 0.05335219861110917, + "grad_norm": 6.9375, + "learning_rate": 5.334973716586512e-06, + "loss": 1.19059315, + "memory(GiB)": 299.96, + "step": 9540, + "train_speed(iter/s)": 0.13352 + }, + { + "acc": 0.69863009, + "epoch": 0.05346404808408843, + "grad_norm": 12.125, + "learning_rate": 5.346158147858181e-06, + "loss": 1.22009735, + "memory(GiB)": 299.96, + "step": 9560, + "train_speed(iter/s)": 0.133637 + }, + { + "acc": 0.70176463, + "epoch": 0.053575897557067695, + "grad_norm": 10.5, + "learning_rate": 5.357342579129852e-06, + "loss": 1.22134657, + "memory(GiB)": 299.96, + "step": 9580, + "train_speed(iter/s)": 0.133759 + }, + { + "acc": 0.71072907, + "epoch": 0.053687747030046966, + "grad_norm": 6.25, + "learning_rate": 5.368527010401522e-06, + "loss": 1.1751194, + "memory(GiB)": 299.96, + "step": 9600, + "train_speed(iter/s)": 0.133882 + }, + { + "acc": 0.69468837, + "epoch": 0.05379959650302623, + "grad_norm": 9.875, + "learning_rate": 5.379711441673191e-06, + "loss": 1.22086496, + "memory(GiB)": 299.96, + "step": 9620, + "train_speed(iter/s)": 0.133995 + }, + { + "acc": 0.71877322, + "epoch": 0.053911445976005494, + "grad_norm": 9.875, + "learning_rate": 5.390895872944862e-06, + "loss": 1.15759592, + "memory(GiB)": 299.96, + "step": 9640, + "train_speed(iter/s)": 0.134111 + }, + { + "acc": 0.72436094, + "epoch": 0.05402329544898476, + "grad_norm": 8.4375, + "learning_rate": 5.402080304216531e-06, + "loss": 1.09483414, + "memory(GiB)": 299.96, + "step": 9660, + "train_speed(iter/s)": 0.134225 + }, + { + "acc": 0.72363629, + "epoch": 0.05413514492196402, + "grad_norm": 6.46875, + "learning_rate": 5.413264735488201e-06, + "loss": 1.11703243, + "memory(GiB)": 299.96, + "step": 9680, + "train_speed(iter/s)": 0.134341 + }, + { + "acc": 0.71038847, + "epoch": 0.054246994394943286, + "grad_norm": 6.3125, + "learning_rate": 5.424449166759871e-06, + "loss": 1.18120356, + "memory(GiB)": 299.96, + "step": 9700, + "train_speed(iter/s)": 0.134462 + }, + { + "acc": 0.70523334, + "epoch": 0.05435884386792255, + "grad_norm": 5.96875, + "learning_rate": 5.43563359803154e-06, + "loss": 1.22427483, + "memory(GiB)": 299.96, + "step": 9720, + "train_speed(iter/s)": 0.134568 + }, + { + "acc": 0.71382289, + "epoch": 0.054470693340901814, + "grad_norm": 10.0, + "learning_rate": 5.44681802930321e-06, + "loss": 1.1579318, + "memory(GiB)": 299.96, + "step": 9740, + "train_speed(iter/s)": 0.134689 + }, + { + "acc": 0.72146707, + "epoch": 0.05458254281388108, + "grad_norm": 7.71875, + "learning_rate": 5.4580024605748805e-06, + "loss": 1.11492147, + "memory(GiB)": 299.96, + "step": 9760, + "train_speed(iter/s)": 0.13481 + }, + { + "acc": 0.70014243, + "epoch": 0.05469439228686034, + "grad_norm": 10.625, + "learning_rate": 5.469186891846549e-06, + "loss": 1.21574612, + "memory(GiB)": 299.96, + "step": 9780, + "train_speed(iter/s)": 0.134918 + }, + { + "acc": 0.70798054, + "epoch": 0.054806241759839606, + "grad_norm": 8.9375, + "learning_rate": 5.48037132311822e-06, + "loss": 1.1770628, + "memory(GiB)": 299.96, + "step": 9800, + "train_speed(iter/s)": 0.13504 + }, + { + "acc": 0.72976103, + "epoch": 0.05491809123281887, + "grad_norm": 8.0, + "learning_rate": 5.49155575438989e-06, + "loss": 1.1172205, + "memory(GiB)": 299.96, + "step": 9820, + "train_speed(iter/s)": 0.135157 + }, + { + "acc": 0.72152367, + "epoch": 0.055029940705798135, + "grad_norm": 6.96875, + "learning_rate": 5.502740185661559e-06, + "loss": 1.11710224, + "memory(GiB)": 299.96, + "step": 9840, + "train_speed(iter/s)": 0.135268 + }, + { + "acc": 0.71225104, + "epoch": 0.0551417901787774, + "grad_norm": 7.1875, + "learning_rate": 5.51392461693323e-06, + "loss": 1.20216808, + "memory(GiB)": 299.96, + "step": 9860, + "train_speed(iter/s)": 0.135385 + }, + { + "acc": 0.70701699, + "epoch": 0.05525363965175666, + "grad_norm": 8.375, + "learning_rate": 5.5251090482048994e-06, + "loss": 1.19581928, + "memory(GiB)": 299.96, + "step": 9880, + "train_speed(iter/s)": 0.135502 + }, + { + "acc": 0.72565565, + "epoch": 0.055365489124735934, + "grad_norm": 9.0625, + "learning_rate": 5.536293479476569e-06, + "loss": 1.11051903, + "memory(GiB)": 299.96, + "step": 9900, + "train_speed(iter/s)": 0.135625 + }, + { + "acc": 0.72691588, + "epoch": 0.0554773385977152, + "grad_norm": 9.0, + "learning_rate": 5.547477910748239e-06, + "loss": 1.07491875, + "memory(GiB)": 299.96, + "step": 9920, + "train_speed(iter/s)": 0.135745 + }, + { + "acc": 0.73065805, + "epoch": 0.05558918807069446, + "grad_norm": 7.0, + "learning_rate": 5.558662342019909e-06, + "loss": 1.05284004, + "memory(GiB)": 299.96, + "step": 9940, + "train_speed(iter/s)": 0.135855 + }, + { + "acc": 0.7144557, + "epoch": 0.055701037543673726, + "grad_norm": 9.125, + "learning_rate": 5.569846773291578e-06, + "loss": 1.16560268, + "memory(GiB)": 299.96, + "step": 9960, + "train_speed(iter/s)": 0.135962 + }, + { + "acc": 0.70777588, + "epoch": 0.05581288701665299, + "grad_norm": 10.375, + "learning_rate": 5.581031204563249e-06, + "loss": 1.19013681, + "memory(GiB)": 299.96, + "step": 9980, + "train_speed(iter/s)": 0.136078 + }, + { + "acc": 0.71017017, + "epoch": 0.055924736489632254, + "grad_norm": 6.25, + "learning_rate": 5.5922156358349175e-06, + "loss": 1.16805954, + "memory(GiB)": 299.96, + "step": 10000, + "train_speed(iter/s)": 0.136189 + }, + { + "epoch": 0.055924736489632254, + "eval_acc": 0.6712207037826114, + "eval_loss": 1.1582913398742676, + "eval_runtime": 7517.0166, + "eval_samples_per_second": 10.015, + "eval_steps_per_second": 10.015, + "step": 10000 + }, + { + "acc": 0.71664715, + "epoch": 0.05603658596261152, + "grad_norm": 6.9375, + "learning_rate": 5.603400067106588e-06, + "loss": 1.13323917, + "memory(GiB)": 299.96, + "step": 10020, + "train_speed(iter/s)": 0.123441 + }, + { + "acc": 0.72171082, + "epoch": 0.05614843543559078, + "grad_norm": 7.78125, + "learning_rate": 5.6145844983782585e-06, + "loss": 1.10008745, + "memory(GiB)": 299.96, + "step": 10040, + "train_speed(iter/s)": 0.123551 + }, + { + "acc": 0.70081844, + "epoch": 0.056260284908570046, + "grad_norm": 7.375, + "learning_rate": 5.625768929649927e-06, + "loss": 1.20919399, + "memory(GiB)": 299.96, + "step": 10060, + "train_speed(iter/s)": 0.123665 + }, + { + "acc": 0.69940624, + "epoch": 0.05637213438154931, + "grad_norm": 5.5, + "learning_rate": 5.636953360921598e-06, + "loss": 1.23210831, + "memory(GiB)": 299.96, + "step": 10080, + "train_speed(iter/s)": 0.123787 + }, + { + "acc": 0.72014136, + "epoch": 0.056483983854528574, + "grad_norm": 8.1875, + "learning_rate": 5.6481377921932676e-06, + "loss": 1.11277971, + "memory(GiB)": 299.96, + "step": 10100, + "train_speed(iter/s)": 0.123901 + }, + { + "acc": 0.72149382, + "epoch": 0.05659583332750784, + "grad_norm": 7.90625, + "learning_rate": 5.659322223464937e-06, + "loss": 1.13494768, + "memory(GiB)": 299.96, + "step": 10120, + "train_speed(iter/s)": 0.124015 + }, + { + "acc": 0.71404977, + "epoch": 0.0567076828004871, + "grad_norm": 12.625, + "learning_rate": 5.670506654736607e-06, + "loss": 1.15679636, + "memory(GiB)": 299.96, + "step": 10140, + "train_speed(iter/s)": 0.124134 + }, + { + "acc": 0.69529619, + "epoch": 0.05681953227346637, + "grad_norm": 6.5, + "learning_rate": 5.6816910860082774e-06, + "loss": 1.2415, + "memory(GiB)": 299.96, + "step": 10160, + "train_speed(iter/s)": 0.124245 + }, + { + "acc": 0.71230559, + "epoch": 0.05693138174644563, + "grad_norm": 6.59375, + "learning_rate": 5.692875517279946e-06, + "loss": 1.14807358, + "memory(GiB)": 299.96, + "step": 10180, + "train_speed(iter/s)": 0.124367 + }, + { + "acc": 0.72374377, + "epoch": 0.057043231219424895, + "grad_norm": 5.75, + "learning_rate": 5.704059948551617e-06, + "loss": 1.11200094, + "memory(GiB)": 299.96, + "step": 10200, + "train_speed(iter/s)": 0.124482 + }, + { + "acc": 0.71762099, + "epoch": 0.057155080692404166, + "grad_norm": 8.0625, + "learning_rate": 5.715244379823286e-06, + "loss": 1.1579071, + "memory(GiB)": 299.96, + "step": 10220, + "train_speed(iter/s)": 0.124583 + }, + { + "acc": 0.70851016, + "epoch": 0.05726693016538343, + "grad_norm": 7.78125, + "learning_rate": 5.726428811094956e-06, + "loss": 1.16826992, + "memory(GiB)": 299.96, + "step": 10240, + "train_speed(iter/s)": 0.124698 + }, + { + "acc": 0.7219636, + "epoch": 0.057378779638362694, + "grad_norm": 10.875, + "learning_rate": 5.737613242366627e-06, + "loss": 1.09053097, + "memory(GiB)": 299.96, + "step": 10260, + "train_speed(iter/s)": 0.124813 + }, + { + "acc": 0.71494932, + "epoch": 0.05749062911134196, + "grad_norm": 5.65625, + "learning_rate": 5.7487976736382955e-06, + "loss": 1.15278759, + "memory(GiB)": 299.96, + "step": 10280, + "train_speed(iter/s)": 0.124918 + }, + { + "acc": 0.72196231, + "epoch": 0.05760247858432122, + "grad_norm": 6.5, + "learning_rate": 5.759982104909966e-06, + "loss": 1.12844934, + "memory(GiB)": 299.96, + "step": 10300, + "train_speed(iter/s)": 0.125041 + }, + { + "acc": 0.70081997, + "epoch": 0.057714328057300486, + "grad_norm": 6.9375, + "learning_rate": 5.771166536181636e-06, + "loss": 1.22606668, + "memory(GiB)": 299.96, + "step": 10320, + "train_speed(iter/s)": 0.125154 + }, + { + "acc": 0.72489882, + "epoch": 0.05782617753027975, + "grad_norm": 8.1875, + "learning_rate": 5.782350967453305e-06, + "loss": 1.10529366, + "memory(GiB)": 299.96, + "step": 10340, + "train_speed(iter/s)": 0.125262 + }, + { + "acc": 0.72544928, + "epoch": 0.057938027003259014, + "grad_norm": 7.5, + "learning_rate": 5.793535398724975e-06, + "loss": 1.10018883, + "memory(GiB)": 299.96, + "step": 10360, + "train_speed(iter/s)": 0.125368 + }, + { + "acc": 0.70851727, + "epoch": 0.05804987647623828, + "grad_norm": 8.5, + "learning_rate": 5.8047198299966456e-06, + "loss": 1.1762248, + "memory(GiB)": 299.96, + "step": 10380, + "train_speed(iter/s)": 0.125477 + }, + { + "acc": 0.73058667, + "epoch": 0.05816172594921754, + "grad_norm": 6.40625, + "learning_rate": 5.815904261268314e-06, + "loss": 1.0732954, + "memory(GiB)": 299.96, + "step": 10400, + "train_speed(iter/s)": 0.12559 + }, + { + "acc": 0.72452736, + "epoch": 0.058273575422196806, + "grad_norm": 7.375, + "learning_rate": 5.827088692539985e-06, + "loss": 1.10327177, + "memory(GiB)": 299.96, + "step": 10420, + "train_speed(iter/s)": 0.1257 + }, + { + "acc": 0.70430465, + "epoch": 0.05838542489517607, + "grad_norm": 6.875, + "learning_rate": 5.838273123811654e-06, + "loss": 1.21283627, + "memory(GiB)": 299.96, + "step": 10440, + "train_speed(iter/s)": 0.125817 + }, + { + "acc": 0.72405324, + "epoch": 0.058497274368155334, + "grad_norm": 7.3125, + "learning_rate": 5.849457555083324e-06, + "loss": 1.10148153, + "memory(GiB)": 299.96, + "step": 10460, + "train_speed(iter/s)": 0.12593 + }, + { + "acc": 0.72567387, + "epoch": 0.0586091238411346, + "grad_norm": 8.5, + "learning_rate": 5.860641986354995e-06, + "loss": 1.09689226, + "memory(GiB)": 299.96, + "step": 10480, + "train_speed(iter/s)": 0.126047 + }, + { + "acc": 0.70673981, + "epoch": 0.05872097331411386, + "grad_norm": 6.96875, + "learning_rate": 5.871826417626664e-06, + "loss": 1.20399017, + "memory(GiB)": 299.96, + "step": 10500, + "train_speed(iter/s)": 0.126157 + }, + { + "acc": 0.71642456, + "epoch": 0.05883282278709313, + "grad_norm": 7.5, + "learning_rate": 5.883010848898334e-06, + "loss": 1.13472891, + "memory(GiB)": 299.96, + "step": 10520, + "train_speed(iter/s)": 0.126262 + }, + { + "acc": 0.71835055, + "epoch": 0.0589446722600724, + "grad_norm": 7.59375, + "learning_rate": 5.894195280170004e-06, + "loss": 1.13913631, + "memory(GiB)": 299.96, + "step": 10540, + "train_speed(iter/s)": 0.126372 + }, + { + "acc": 0.70861583, + "epoch": 0.05905652173305166, + "grad_norm": 5.4375, + "learning_rate": 5.9053797114416735e-06, + "loss": 1.16713648, + "memory(GiB)": 299.96, + "step": 10560, + "train_speed(iter/s)": 0.126477 + }, + { + "acc": 0.71312428, + "epoch": 0.059168371206030926, + "grad_norm": 9.4375, + "learning_rate": 5.916564142713343e-06, + "loss": 1.15231895, + "memory(GiB)": 299.96, + "step": 10580, + "train_speed(iter/s)": 0.126588 + }, + { + "acc": 0.69800992, + "epoch": 0.05928022067901019, + "grad_norm": 7.15625, + "learning_rate": 5.927748573985014e-06, + "loss": 1.23267593, + "memory(GiB)": 299.96, + "step": 10600, + "train_speed(iter/s)": 0.126697 + }, + { + "acc": 0.7122499, + "epoch": 0.059392070151989454, + "grad_norm": 6.8125, + "learning_rate": 5.9389330052566825e-06, + "loss": 1.15451059, + "memory(GiB)": 299.96, + "step": 10620, + "train_speed(iter/s)": 0.126808 + }, + { + "acc": 0.69946165, + "epoch": 0.05950391962496872, + "grad_norm": 8.0625, + "learning_rate": 5.950117436528353e-06, + "loss": 1.19401197, + "memory(GiB)": 299.96, + "step": 10640, + "train_speed(iter/s)": 0.126921 + }, + { + "acc": 0.70214896, + "epoch": 0.05961576909794798, + "grad_norm": 9.375, + "learning_rate": 5.9613018678000236e-06, + "loss": 1.19570103, + "memory(GiB)": 299.96, + "step": 10660, + "train_speed(iter/s)": 0.127026 + }, + { + "acc": 0.70910711, + "epoch": 0.059727618570927246, + "grad_norm": 8.25, + "learning_rate": 5.972486299071692e-06, + "loss": 1.17798615, + "memory(GiB)": 299.96, + "step": 10680, + "train_speed(iter/s)": 0.127135 + }, + { + "acc": 0.71458559, + "epoch": 0.05983946804390651, + "grad_norm": 9.375, + "learning_rate": 5.983670730343363e-06, + "loss": 1.12947721, + "memory(GiB)": 299.96, + "step": 10700, + "train_speed(iter/s)": 0.127245 + }, + { + "acc": 0.72588181, + "epoch": 0.059951317516885774, + "grad_norm": 8.0, + "learning_rate": 5.994855161615032e-06, + "loss": 1.06825294, + "memory(GiB)": 299.96, + "step": 10720, + "train_speed(iter/s)": 0.127348 + }, + { + "acc": 0.71644325, + "epoch": 0.06006316698986504, + "grad_norm": 7.09375, + "learning_rate": 6.006039592886702e-06, + "loss": 1.13753195, + "memory(GiB)": 299.96, + "step": 10740, + "train_speed(iter/s)": 0.127456 + }, + { + "acc": 0.71439009, + "epoch": 0.0601750164628443, + "grad_norm": 13.1875, + "learning_rate": 6.017224024158373e-06, + "loss": 1.1487052, + "memory(GiB)": 299.96, + "step": 10760, + "train_speed(iter/s)": 0.127562 + }, + { + "acc": 0.72015643, + "epoch": 0.060286865935823566, + "grad_norm": 6.28125, + "learning_rate": 6.028408455430042e-06, + "loss": 1.1129921, + "memory(GiB)": 299.96, + "step": 10780, + "train_speed(iter/s)": 0.127674 + }, + { + "acc": 0.70994692, + "epoch": 0.06039871540880283, + "grad_norm": 9.1875, + "learning_rate": 6.039592886701712e-06, + "loss": 1.1588232, + "memory(GiB)": 299.96, + "step": 10800, + "train_speed(iter/s)": 0.127785 + }, + { + "acc": 0.71006784, + "epoch": 0.060510564881782095, + "grad_norm": 6.25, + "learning_rate": 6.050777317973382e-06, + "loss": 1.15402174, + "memory(GiB)": 299.96, + "step": 10820, + "train_speed(iter/s)": 0.1279 + }, + { + "acc": 0.71408148, + "epoch": 0.060622414354761366, + "grad_norm": 4.3125, + "learning_rate": 6.0619617492450515e-06, + "loss": 1.16799107, + "memory(GiB)": 299.96, + "step": 10840, + "train_speed(iter/s)": 0.128004 + }, + { + "acc": 0.73140116, + "epoch": 0.06073426382774063, + "grad_norm": 9.3125, + "learning_rate": 6.073146180516721e-06, + "loss": 1.07642069, + "memory(GiB)": 299.96, + "step": 10860, + "train_speed(iter/s)": 0.128117 + }, + { + "acc": 0.69099669, + "epoch": 0.060846113300719894, + "grad_norm": 5.59375, + "learning_rate": 6.084330611788392e-06, + "loss": 1.23695374, + "memory(GiB)": 299.96, + "step": 10880, + "train_speed(iter/s)": 0.128229 + }, + { + "acc": 0.70815163, + "epoch": 0.06095796277369916, + "grad_norm": 6.96875, + "learning_rate": 6.0955150430600605e-06, + "loss": 1.19023066, + "memory(GiB)": 299.96, + "step": 10900, + "train_speed(iter/s)": 0.128334 + }, + { + "acc": 0.72178211, + "epoch": 0.06106981224667842, + "grad_norm": 10.5, + "learning_rate": 6.106699474331731e-06, + "loss": 1.09564171, + "memory(GiB)": 299.96, + "step": 10920, + "train_speed(iter/s)": 0.128433 + }, + { + "acc": 0.71405339, + "epoch": 0.061181661719657686, + "grad_norm": 7.65625, + "learning_rate": 6.1178839056034e-06, + "loss": 1.16855659, + "memory(GiB)": 299.96, + "step": 10940, + "train_speed(iter/s)": 0.128542 + }, + { + "acc": 0.72709937, + "epoch": 0.06129351119263695, + "grad_norm": 12.25, + "learning_rate": 6.12906833687507e-06, + "loss": 1.09898691, + "memory(GiB)": 299.96, + "step": 10960, + "train_speed(iter/s)": 0.128648 + }, + { + "acc": 0.71631103, + "epoch": 0.061405360665616214, + "grad_norm": 7.4375, + "learning_rate": 6.140252768146741e-06, + "loss": 1.15363579, + "memory(GiB)": 299.96, + "step": 10980, + "train_speed(iter/s)": 0.128749 + }, + { + "acc": 0.71039243, + "epoch": 0.06151721013859548, + "grad_norm": 5.90625, + "learning_rate": 6.15143719941841e-06, + "loss": 1.16717882, + "memory(GiB)": 299.96, + "step": 11000, + "train_speed(iter/s)": 0.128848 + }, + { + "acc": 0.70081944, + "epoch": 0.06162905961157474, + "grad_norm": 5.9375, + "learning_rate": 6.16262163069008e-06, + "loss": 1.19489174, + "memory(GiB)": 299.96, + "step": 11020, + "train_speed(iter/s)": 0.128955 + }, + { + "acc": 0.70096021, + "epoch": 0.061740909084554006, + "grad_norm": 8.375, + "learning_rate": 6.17380606196175e-06, + "loss": 1.22352867, + "memory(GiB)": 299.96, + "step": 11040, + "train_speed(iter/s)": 0.129062 + }, + { + "acc": 0.72692857, + "epoch": 0.06185275855753327, + "grad_norm": 9.0, + "learning_rate": 6.18499049323342e-06, + "loss": 1.0857914, + "memory(GiB)": 299.96, + "step": 11060, + "train_speed(iter/s)": 0.129165 + }, + { + "acc": 0.70098314, + "epoch": 0.061964608030512534, + "grad_norm": 8.8125, + "learning_rate": 6.196174924505089e-06, + "loss": 1.21892319, + "memory(GiB)": 299.96, + "step": 11080, + "train_speed(iter/s)": 0.129269 + }, + { + "acc": 0.71962981, + "epoch": 0.0620764575034918, + "grad_norm": 5.71875, + "learning_rate": 6.20735935577676e-06, + "loss": 1.1504302, + "memory(GiB)": 299.96, + "step": 11100, + "train_speed(iter/s)": 0.129366 + }, + { + "acc": 0.73015118, + "epoch": 0.06218830697647106, + "grad_norm": 6.1875, + "learning_rate": 6.218543787048429e-06, + "loss": 1.1067996, + "memory(GiB)": 299.96, + "step": 11120, + "train_speed(iter/s)": 0.12947 + }, + { + "acc": 0.70935383, + "epoch": 0.06230015644945033, + "grad_norm": 7.8125, + "learning_rate": 6.229728218320099e-06, + "loss": 1.1741642, + "memory(GiB)": 299.96, + "step": 11140, + "train_speed(iter/s)": 0.129576 + }, + { + "acc": 0.71987009, + "epoch": 0.0624120059224296, + "grad_norm": 8.5625, + "learning_rate": 6.240912649591768e-06, + "loss": 1.12512798, + "memory(GiB)": 299.96, + "step": 11160, + "train_speed(iter/s)": 0.12968 + }, + { + "acc": 0.7159874, + "epoch": 0.06252385539540886, + "grad_norm": 10.5, + "learning_rate": 6.2520970808634385e-06, + "loss": 1.16766644, + "memory(GiB)": 299.96, + "step": 11180, + "train_speed(iter/s)": 0.129779 + }, + { + "acc": 0.70228472, + "epoch": 0.06263570486838813, + "grad_norm": 9.75, + "learning_rate": 6.263281512135109e-06, + "loss": 1.19307117, + "memory(GiB)": 299.96, + "step": 11200, + "train_speed(iter/s)": 0.129882 + }, + { + "acc": 0.70983152, + "epoch": 0.06274755434136739, + "grad_norm": 7.625, + "learning_rate": 6.274465943406778e-06, + "loss": 1.16847944, + "memory(GiB)": 299.96, + "step": 11220, + "train_speed(iter/s)": 0.129979 + }, + { + "acc": 0.71312919, + "epoch": 0.06285940381434665, + "grad_norm": 9.0, + "learning_rate": 6.285650374678448e-06, + "loss": 1.17156963, + "memory(GiB)": 299.96, + "step": 11240, + "train_speed(iter/s)": 0.130083 + }, + { + "acc": 0.71692524, + "epoch": 0.06297125328732592, + "grad_norm": 7.53125, + "learning_rate": 6.296834805950118e-06, + "loss": 1.11528358, + "memory(GiB)": 299.96, + "step": 11260, + "train_speed(iter/s)": 0.130177 + }, + { + "acc": 0.71725311, + "epoch": 0.06308310276030518, + "grad_norm": 7.0625, + "learning_rate": 6.308019237221788e-06, + "loss": 1.12391834, + "memory(GiB)": 299.96, + "step": 11280, + "train_speed(iter/s)": 0.130283 + }, + { + "acc": 0.70768657, + "epoch": 0.06319495223328445, + "grad_norm": 7.65625, + "learning_rate": 6.3192036684934574e-06, + "loss": 1.1862442, + "memory(GiB)": 299.96, + "step": 11300, + "train_speed(iter/s)": 0.130388 + }, + { + "acc": 0.72440252, + "epoch": 0.06330680170626371, + "grad_norm": 10.5625, + "learning_rate": 6.330388099765128e-06, + "loss": 1.10716314, + "memory(GiB)": 299.96, + "step": 11320, + "train_speed(iter/s)": 0.13049 + }, + { + "acc": 0.7073832, + "epoch": 0.06341865117924297, + "grad_norm": 8.5, + "learning_rate": 6.341572531036797e-06, + "loss": 1.1615962, + "memory(GiB)": 299.96, + "step": 11340, + "train_speed(iter/s)": 0.130589 + }, + { + "acc": 0.71042318, + "epoch": 0.06353050065222224, + "grad_norm": 6.125, + "learning_rate": 6.352756962308467e-06, + "loss": 1.16559792, + "memory(GiB)": 299.96, + "step": 11360, + "train_speed(iter/s)": 0.130697 + }, + { + "acc": 0.70854487, + "epoch": 0.0636423501252015, + "grad_norm": 6.75, + "learning_rate": 6.363941393580138e-06, + "loss": 1.178339, + "memory(GiB)": 299.96, + "step": 11380, + "train_speed(iter/s)": 0.130793 + }, + { + "acc": 0.7151032, + "epoch": 0.06375419959818077, + "grad_norm": 10.0, + "learning_rate": 6.375125824851807e-06, + "loss": 1.13839788, + "memory(GiB)": 299.96, + "step": 11400, + "train_speed(iter/s)": 0.130899 + }, + { + "acc": 0.73237748, + "epoch": 0.06386604907116003, + "grad_norm": 8.1875, + "learning_rate": 6.386310256123477e-06, + "loss": 1.07622986, + "memory(GiB)": 299.96, + "step": 11420, + "train_speed(iter/s)": 0.130999 + }, + { + "acc": 0.71218681, + "epoch": 0.0639778985441393, + "grad_norm": 11.25, + "learning_rate": 6.397494687395146e-06, + "loss": 1.15625734, + "memory(GiB)": 299.96, + "step": 11440, + "train_speed(iter/s)": 0.131106 + }, + { + "acc": 0.71752467, + "epoch": 0.06408974801711856, + "grad_norm": 9.625, + "learning_rate": 6.4086791186668165e-06, + "loss": 1.15799599, + "memory(GiB)": 299.96, + "step": 11460, + "train_speed(iter/s)": 0.131208 + }, + { + "acc": 0.70850763, + "epoch": 0.06420159749009782, + "grad_norm": 5.78125, + "learning_rate": 6.419863549938486e-06, + "loss": 1.18324757, + "memory(GiB)": 299.96, + "step": 11480, + "train_speed(iter/s)": 0.131309 + }, + { + "acc": 0.71651173, + "epoch": 0.06431344696307709, + "grad_norm": 4.4375, + "learning_rate": 6.431047981210156e-06, + "loss": 1.14780064, + "memory(GiB)": 299.96, + "step": 11500, + "train_speed(iter/s)": 0.131402 + }, + { + "acc": 0.71440721, + "epoch": 0.06442529643605635, + "grad_norm": 8.0, + "learning_rate": 6.4422324124818256e-06, + "loss": 1.12281685, + "memory(GiB)": 299.96, + "step": 11520, + "train_speed(iter/s)": 0.131497 + }, + { + "acc": 0.71646895, + "epoch": 0.06453714590903561, + "grad_norm": 8.75, + "learning_rate": 6.453416843753496e-06, + "loss": 1.12349291, + "memory(GiB)": 299.96, + "step": 11540, + "train_speed(iter/s)": 0.131598 + }, + { + "acc": 0.70624204, + "epoch": 0.06464899538201488, + "grad_norm": 5.125, + "learning_rate": 6.464601275025165e-06, + "loss": 1.18249893, + "memory(GiB)": 299.96, + "step": 11560, + "train_speed(iter/s)": 0.131699 + }, + { + "acc": 0.71265478, + "epoch": 0.06476084485499414, + "grad_norm": 5.90625, + "learning_rate": 6.4757857062968354e-06, + "loss": 1.18179226, + "memory(GiB)": 299.96, + "step": 11580, + "train_speed(iter/s)": 0.131797 + }, + { + "acc": 0.70123019, + "epoch": 0.06487269432797341, + "grad_norm": 9.8125, + "learning_rate": 6.486970137568506e-06, + "loss": 1.23154554, + "memory(GiB)": 299.96, + "step": 11600, + "train_speed(iter/s)": 0.131897 + }, + { + "acc": 0.71899343, + "epoch": 0.06498454380095267, + "grad_norm": 4.90625, + "learning_rate": 6.498154568840175e-06, + "loss": 1.12830372, + "memory(GiB)": 299.96, + "step": 11620, + "train_speed(iter/s)": 0.131994 + }, + { + "acc": 0.72862072, + "epoch": 0.06509639327393195, + "grad_norm": 7.28125, + "learning_rate": 6.509339000111845e-06, + "loss": 1.09289246, + "memory(GiB)": 299.96, + "step": 11640, + "train_speed(iter/s)": 0.132083 + }, + { + "acc": 0.71350884, + "epoch": 0.06520824274691121, + "grad_norm": 7.40625, + "learning_rate": 6.520523431383514e-06, + "loss": 1.1507081, + "memory(GiB)": 299.96, + "step": 11660, + "train_speed(iter/s)": 0.132182 + }, + { + "acc": 0.72401295, + "epoch": 0.06532009221989048, + "grad_norm": 7.96875, + "learning_rate": 6.531707862655185e-06, + "loss": 1.10367393, + "memory(GiB)": 299.96, + "step": 11680, + "train_speed(iter/s)": 0.13228 + }, + { + "acc": 0.6985774, + "epoch": 0.06543194169286974, + "grad_norm": 7.71875, + "learning_rate": 6.542892293926854e-06, + "loss": 1.23056498, + "memory(GiB)": 299.96, + "step": 11700, + "train_speed(iter/s)": 0.132378 + }, + { + "acc": 0.72863889, + "epoch": 0.065543791165849, + "grad_norm": 10.375, + "learning_rate": 6.554076725198524e-06, + "loss": 1.08138771, + "memory(GiB)": 299.96, + "step": 11720, + "train_speed(iter/s)": 0.132476 + }, + { + "acc": 0.70010824, + "epoch": 0.06565564063882827, + "grad_norm": 5.9375, + "learning_rate": 6.565261156470194e-06, + "loss": 1.21416473, + "memory(GiB)": 299.96, + "step": 11740, + "train_speed(iter/s)": 0.132577 + }, + { + "acc": 0.70190639, + "epoch": 0.06576749011180753, + "grad_norm": 9.6875, + "learning_rate": 6.576445587741864e-06, + "loss": 1.20564222, + "memory(GiB)": 299.96, + "step": 11760, + "train_speed(iter/s)": 0.132681 + }, + { + "acc": 0.70250931, + "epoch": 0.0658793395847868, + "grad_norm": 5.15625, + "learning_rate": 6.587630019013533e-06, + "loss": 1.20683613, + "memory(GiB)": 299.96, + "step": 11780, + "train_speed(iter/s)": 0.132766 + }, + { + "acc": 0.71276517, + "epoch": 0.06599118905776606, + "grad_norm": 5.96875, + "learning_rate": 6.5988144502852036e-06, + "loss": 1.15012093, + "memory(GiB)": 299.96, + "step": 11800, + "train_speed(iter/s)": 0.132865 + }, + { + "acc": 0.71898246, + "epoch": 0.06610303853074533, + "grad_norm": 15.125, + "learning_rate": 6.609998881556874e-06, + "loss": 1.11834087, + "memory(GiB)": 299.96, + "step": 11820, + "train_speed(iter/s)": 0.132962 + }, + { + "acc": 0.71244721, + "epoch": 0.06621488800372459, + "grad_norm": 8.6875, + "learning_rate": 6.621183312828543e-06, + "loss": 1.15958748, + "memory(GiB)": 299.96, + "step": 11840, + "train_speed(iter/s)": 0.133056 + }, + { + "acc": 0.72902675, + "epoch": 0.06632673747670385, + "grad_norm": 10.6875, + "learning_rate": 6.6323677441002134e-06, + "loss": 1.07181244, + "memory(GiB)": 299.96, + "step": 11860, + "train_speed(iter/s)": 0.13315 + }, + { + "acc": 0.70223217, + "epoch": 0.06643858694968312, + "grad_norm": 7.03125, + "learning_rate": 6.643552175371882e-06, + "loss": 1.20503111, + "memory(GiB)": 299.96, + "step": 11880, + "train_speed(iter/s)": 0.133247 + }, + { + "acc": 0.71082134, + "epoch": 0.06655043642266238, + "grad_norm": 5.5, + "learning_rate": 6.654736606643553e-06, + "loss": 1.16356544, + "memory(GiB)": 299.96, + "step": 11900, + "train_speed(iter/s)": 0.133347 + }, + { + "acc": 0.72360325, + "epoch": 0.06666228589564165, + "grad_norm": 7.625, + "learning_rate": 6.6659210379152225e-06, + "loss": 1.10313644, + "memory(GiB)": 299.96, + "step": 11920, + "train_speed(iter/s)": 0.133443 + }, + { + "acc": 0.70242996, + "epoch": 0.06677413536862091, + "grad_norm": 7.53125, + "learning_rate": 6.677105469186892e-06, + "loss": 1.1872117, + "memory(GiB)": 299.96, + "step": 11940, + "train_speed(iter/s)": 0.133539 + }, + { + "acc": 0.71188979, + "epoch": 0.06688598484160017, + "grad_norm": 8.625, + "learning_rate": 6.688289900458562e-06, + "loss": 1.15408201, + "memory(GiB)": 299.96, + "step": 11960, + "train_speed(iter/s)": 0.133628 + }, + { + "acc": 0.71464329, + "epoch": 0.06699783431457944, + "grad_norm": 6.59375, + "learning_rate": 6.699474331730232e-06, + "loss": 1.13063154, + "memory(GiB)": 299.96, + "step": 11980, + "train_speed(iter/s)": 0.133722 + }, + { + "acc": 0.71073756, + "epoch": 0.0671096837875587, + "grad_norm": 5.625, + "learning_rate": 6.710658763001901e-06, + "loss": 1.17476521, + "memory(GiB)": 299.96, + "step": 12000, + "train_speed(iter/s)": 0.133817 + }, + { + "epoch": 0.0671096837875587, + "eval_acc": 0.6751667636334453, + "eval_loss": 1.1404706239700317, + "eval_runtime": 7551.8589, + "eval_samples_per_second": 9.969, + "eval_steps_per_second": 9.969, + "step": 12000 + }, + { + "acc": 0.73320475, + "epoch": 0.06722153326053797, + "grad_norm": 6.8125, + "learning_rate": 6.721843194273572e-06, + "loss": 1.08851213, + "memory(GiB)": 299.96, + "step": 12020, + "train_speed(iter/s)": 0.123378 + }, + { + "acc": 0.71152987, + "epoch": 0.06733338273351723, + "grad_norm": 10.625, + "learning_rate": 6.733027625545242e-06, + "loss": 1.16340666, + "memory(GiB)": 299.96, + "step": 12040, + "train_speed(iter/s)": 0.123476 + }, + { + "acc": 0.70825677, + "epoch": 0.0674452322064965, + "grad_norm": 6.1875, + "learning_rate": 6.744212056816911e-06, + "loss": 1.16081018, + "memory(GiB)": 299.96, + "step": 12060, + "train_speed(iter/s)": 0.123567 + }, + { + "acc": 0.7167563, + "epoch": 0.06755708167947576, + "grad_norm": 8.5, + "learning_rate": 6.7553964880885816e-06, + "loss": 1.14889927, + "memory(GiB)": 299.96, + "step": 12080, + "train_speed(iter/s)": 0.123666 + }, + { + "acc": 0.72207794, + "epoch": 0.06766893115245502, + "grad_norm": 7.46875, + "learning_rate": 6.766580919360251e-06, + "loss": 1.12344246, + "memory(GiB)": 299.96, + "step": 12100, + "train_speed(iter/s)": 0.123763 + }, + { + "acc": 0.7281352, + "epoch": 0.06778078062543429, + "grad_norm": 7.5625, + "learning_rate": 6.777765350631921e-06, + "loss": 1.10363531, + "memory(GiB)": 299.96, + "step": 12120, + "train_speed(iter/s)": 0.123855 + }, + { + "acc": 0.72078447, + "epoch": 0.06789263009841355, + "grad_norm": 6.75, + "learning_rate": 6.788949781903591e-06, + "loss": 1.10384207, + "memory(GiB)": 299.96, + "step": 12140, + "train_speed(iter/s)": 0.123952 + }, + { + "acc": 0.71179829, + "epoch": 0.06800447957139281, + "grad_norm": 9.25, + "learning_rate": 6.80013421317526e-06, + "loss": 1.18200016, + "memory(GiB)": 299.96, + "step": 12160, + "train_speed(iter/s)": 0.12405 + }, + { + "acc": 0.74023871, + "epoch": 0.06811632904437208, + "grad_norm": 12.25, + "learning_rate": 6.811318644446931e-06, + "loss": 1.04338083, + "memory(GiB)": 299.96, + "step": 12180, + "train_speed(iter/s)": 0.124149 + }, + { + "acc": 0.71563911, + "epoch": 0.06822817851735134, + "grad_norm": 7.71875, + "learning_rate": 6.8225030757186005e-06, + "loss": 1.14986258, + "memory(GiB)": 299.96, + "step": 12200, + "train_speed(iter/s)": 0.124242 + }, + { + "acc": 0.73181791, + "epoch": 0.0683400279903306, + "grad_norm": 8.375, + "learning_rate": 6.83368750699027e-06, + "loss": 1.08589115, + "memory(GiB)": 299.96, + "step": 12220, + "train_speed(iter/s)": 0.124332 + }, + { + "acc": 0.70764751, + "epoch": 0.06845187746330987, + "grad_norm": 7.75, + "learning_rate": 6.84487193826194e-06, + "loss": 1.16952963, + "memory(GiB)": 299.96, + "step": 12240, + "train_speed(iter/s)": 0.124422 + }, + { + "acc": 0.71411357, + "epoch": 0.06856372693628915, + "grad_norm": 7.0, + "learning_rate": 6.85605636953361e-06, + "loss": 1.15431566, + "memory(GiB)": 299.96, + "step": 12260, + "train_speed(iter/s)": 0.124515 + }, + { + "acc": 0.72959661, + "epoch": 0.06867557640926841, + "grad_norm": 8.9375, + "learning_rate": 6.867240800805279e-06, + "loss": 1.07685499, + "memory(GiB)": 299.96, + "step": 12280, + "train_speed(iter/s)": 0.124607 + }, + { + "acc": 0.70197077, + "epoch": 0.06878742588224768, + "grad_norm": 7.84375, + "learning_rate": 6.87842523207695e-06, + "loss": 1.19277563, + "memory(GiB)": 299.96, + "step": 12300, + "train_speed(iter/s)": 0.1247 + }, + { + "acc": 0.71003108, + "epoch": 0.06889927535522694, + "grad_norm": 10.4375, + "learning_rate": 6.88960966334862e-06, + "loss": 1.16509981, + "memory(GiB)": 299.96, + "step": 12320, + "train_speed(iter/s)": 0.124799 + }, + { + "acc": 0.71291914, + "epoch": 0.0690111248282062, + "grad_norm": 9.5625, + "learning_rate": 6.900794094620289e-06, + "loss": 1.15738039, + "memory(GiB)": 299.96, + "step": 12340, + "train_speed(iter/s)": 0.124893 + }, + { + "acc": 0.71470313, + "epoch": 0.06912297430118547, + "grad_norm": 16.25, + "learning_rate": 6.9119785258919596e-06, + "loss": 1.14719038, + "memory(GiB)": 299.96, + "step": 12360, + "train_speed(iter/s)": 0.124989 + }, + { + "acc": 0.7177846, + "epoch": 0.06923482377416473, + "grad_norm": 9.5, + "learning_rate": 6.923162957163628e-06, + "loss": 1.15968533, + "memory(GiB)": 299.96, + "step": 12380, + "train_speed(iter/s)": 0.125078 + }, + { + "acc": 0.71332154, + "epoch": 0.069346673247144, + "grad_norm": 7.0625, + "learning_rate": 6.934347388435299e-06, + "loss": 1.17880449, + "memory(GiB)": 299.96, + "step": 12400, + "train_speed(iter/s)": 0.125171 + }, + { + "acc": 0.71180959, + "epoch": 0.06945852272012326, + "grad_norm": 6.8125, + "learning_rate": 6.945531819706969e-06, + "loss": 1.16603479, + "memory(GiB)": 299.96, + "step": 12420, + "train_speed(iter/s)": 0.125253 + }, + { + "acc": 0.70927887, + "epoch": 0.06957037219310253, + "grad_norm": 6.3125, + "learning_rate": 6.956716250978638e-06, + "loss": 1.17299089, + "memory(GiB)": 299.96, + "step": 12440, + "train_speed(iter/s)": 0.125351 + }, + { + "acc": 0.72048593, + "epoch": 0.06968222166608179, + "grad_norm": 7.75, + "learning_rate": 6.967900682250308e-06, + "loss": 1.12981701, + "memory(GiB)": 299.96, + "step": 12460, + "train_speed(iter/s)": 0.125447 + }, + { + "acc": 0.71988411, + "epoch": 0.06979407113906105, + "grad_norm": 8.5625, + "learning_rate": 6.9790851135219785e-06, + "loss": 1.12315998, + "memory(GiB)": 299.96, + "step": 12480, + "train_speed(iter/s)": 0.125544 + }, + { + "acc": 0.719208, + "epoch": 0.06990592061204032, + "grad_norm": 9.5625, + "learning_rate": 6.990269544793647e-06, + "loss": 1.12494183, + "memory(GiB)": 299.96, + "step": 12500, + "train_speed(iter/s)": 0.125615 + }, + { + "acc": 0.73393965, + "epoch": 0.07001777008501958, + "grad_norm": 6.09375, + "learning_rate": 7.001453976065318e-06, + "loss": 1.05467482, + "memory(GiB)": 299.96, + "step": 12520, + "train_speed(iter/s)": 0.125704 + }, + { + "acc": 0.70583501, + "epoch": 0.07012961955799885, + "grad_norm": 9.25, + "learning_rate": 7.012638407336988e-06, + "loss": 1.1654047, + "memory(GiB)": 299.96, + "step": 12540, + "train_speed(iter/s)": 0.125797 + }, + { + "acc": 0.70698767, + "epoch": 0.07024146903097811, + "grad_norm": 5.1875, + "learning_rate": 7.023822838608657e-06, + "loss": 1.18894358, + "memory(GiB)": 299.96, + "step": 12560, + "train_speed(iter/s)": 0.125885 + }, + { + "acc": 0.7108088, + "epoch": 0.07035331850395737, + "grad_norm": 5.15625, + "learning_rate": 7.035007269880328e-06, + "loss": 1.16308594, + "memory(GiB)": 299.96, + "step": 12580, + "train_speed(iter/s)": 0.125978 + }, + { + "acc": 0.72122726, + "epoch": 0.07046516797693664, + "grad_norm": 8.0625, + "learning_rate": 7.0461917011519965e-06, + "loss": 1.11826715, + "memory(GiB)": 299.96, + "step": 12600, + "train_speed(iter/s)": 0.126071 + }, + { + "acc": 0.72792292, + "epoch": 0.0705770174499159, + "grad_norm": 6.53125, + "learning_rate": 7.057376132423667e-06, + "loss": 1.07041492, + "memory(GiB)": 299.96, + "step": 12620, + "train_speed(iter/s)": 0.126165 + }, + { + "acc": 0.71927404, + "epoch": 0.07068886692289517, + "grad_norm": 8.125, + "learning_rate": 7.068560563695337e-06, + "loss": 1.11399269, + "memory(GiB)": 299.96, + "step": 12640, + "train_speed(iter/s)": 0.126258 + }, + { + "acc": 0.71576858, + "epoch": 0.07080071639587443, + "grad_norm": 8.3125, + "learning_rate": 7.079744994967006e-06, + "loss": 1.16753311, + "memory(GiB)": 299.96, + "step": 12660, + "train_speed(iter/s)": 0.12635 + }, + { + "acc": 0.6992023, + "epoch": 0.0709125658688537, + "grad_norm": 8.5, + "learning_rate": 7.090929426238676e-06, + "loss": 1.20142384, + "memory(GiB)": 299.96, + "step": 12680, + "train_speed(iter/s)": 0.126436 + }, + { + "acc": 0.71143436, + "epoch": 0.07102441534183296, + "grad_norm": 6.875, + "learning_rate": 7.102113857510347e-06, + "loss": 1.15856619, + "memory(GiB)": 299.96, + "step": 12700, + "train_speed(iter/s)": 0.126528 + }, + { + "acc": 0.70860653, + "epoch": 0.07113626481481222, + "grad_norm": 5.5, + "learning_rate": 7.113298288782015e-06, + "loss": 1.14321499, + "memory(GiB)": 299.96, + "step": 12720, + "train_speed(iter/s)": 0.126617 + }, + { + "acc": 0.69891534, + "epoch": 0.07124811428779149, + "grad_norm": 5.78125, + "learning_rate": 7.124482720053686e-06, + "loss": 1.20678663, + "memory(GiB)": 299.96, + "step": 12740, + "train_speed(iter/s)": 0.126709 + }, + { + "acc": 0.71347013, + "epoch": 0.07135996376077075, + "grad_norm": 6.9375, + "learning_rate": 7.1356671513253565e-06, + "loss": 1.1703887, + "memory(GiB)": 299.96, + "step": 12760, + "train_speed(iter/s)": 0.126796 + }, + { + "acc": 0.71927524, + "epoch": 0.07147181323375001, + "grad_norm": 8.1875, + "learning_rate": 7.146851582597025e-06, + "loss": 1.14221592, + "memory(GiB)": 299.96, + "step": 12780, + "train_speed(iter/s)": 0.126881 + }, + { + "acc": 0.7314827, + "epoch": 0.07158366270672928, + "grad_norm": 6.96875, + "learning_rate": 7.158036013868696e-06, + "loss": 1.0470561, + "memory(GiB)": 299.96, + "step": 12800, + "train_speed(iter/s)": 0.126973 + }, + { + "acc": 0.71211772, + "epoch": 0.07169551217970854, + "grad_norm": 9.375, + "learning_rate": 7.1692204451403655e-06, + "loss": 1.16199188, + "memory(GiB)": 299.96, + "step": 12820, + "train_speed(iter/s)": 0.127067 + }, + { + "acc": 0.72934847, + "epoch": 0.0718073616526878, + "grad_norm": 8.625, + "learning_rate": 7.180404876412035e-06, + "loss": 1.06615114, + "memory(GiB)": 299.96, + "step": 12840, + "train_speed(iter/s)": 0.127148 + }, + { + "acc": 0.70182576, + "epoch": 0.07191921112566707, + "grad_norm": 8.5, + "learning_rate": 7.191589307683705e-06, + "loss": 1.18615828, + "memory(GiB)": 299.96, + "step": 12860, + "train_speed(iter/s)": 0.127234 + }, + { + "acc": 0.7215807, + "epoch": 0.07203106059864635, + "grad_norm": 9.0, + "learning_rate": 7.2027737389553745e-06, + "loss": 1.10710096, + "memory(GiB)": 299.96, + "step": 12880, + "train_speed(iter/s)": 0.127324 + }, + { + "acc": 0.72085857, + "epoch": 0.07214291007162561, + "grad_norm": 8.0625, + "learning_rate": 7.213958170227044e-06, + "loss": 1.09579945, + "memory(GiB)": 299.96, + "step": 12900, + "train_speed(iter/s)": 0.127416 + }, + { + "acc": 0.71866703, + "epoch": 0.07225475954460488, + "grad_norm": 6.125, + "learning_rate": 7.225142601498715e-06, + "loss": 1.13693438, + "memory(GiB)": 299.96, + "step": 12920, + "train_speed(iter/s)": 0.127507 + }, + { + "acc": 0.72043581, + "epoch": 0.07236660901758414, + "grad_norm": 6.0, + "learning_rate": 7.2363270327703835e-06, + "loss": 1.09540596, + "memory(GiB)": 299.96, + "step": 12940, + "train_speed(iter/s)": 0.127599 + }, + { + "acc": 0.73620472, + "epoch": 0.0724784584905634, + "grad_norm": 5.78125, + "learning_rate": 7.247511464042054e-06, + "loss": 1.04224148, + "memory(GiB)": 299.96, + "step": 12960, + "train_speed(iter/s)": 0.127688 + }, + { + "acc": 0.719452, + "epoch": 0.07259030796354267, + "grad_norm": 8.4375, + "learning_rate": 7.258695895313725e-06, + "loss": 1.13766394, + "memory(GiB)": 299.96, + "step": 12980, + "train_speed(iter/s)": 0.127776 + }, + { + "acc": 0.72257667, + "epoch": 0.07270215743652193, + "grad_norm": 9.8125, + "learning_rate": 7.269880326585393e-06, + "loss": 1.10970049, + "memory(GiB)": 299.96, + "step": 13000, + "train_speed(iter/s)": 0.127868 + }, + { + "acc": 0.71005564, + "epoch": 0.0728140069095012, + "grad_norm": 7.03125, + "learning_rate": 7.281064757857064e-06, + "loss": 1.17083368, + "memory(GiB)": 299.96, + "step": 13020, + "train_speed(iter/s)": 0.127962 + }, + { + "acc": 0.7118751, + "epoch": 0.07292585638248046, + "grad_norm": 7.53125, + "learning_rate": 7.292249189128734e-06, + "loss": 1.16900263, + "memory(GiB)": 299.96, + "step": 13040, + "train_speed(iter/s)": 0.128051 + }, + { + "acc": 0.72269063, + "epoch": 0.07303770585545973, + "grad_norm": 9.5625, + "learning_rate": 7.303433620400403e-06, + "loss": 1.10072756, + "memory(GiB)": 299.96, + "step": 13060, + "train_speed(iter/s)": 0.128139 + }, + { + "acc": 0.7112381, + "epoch": 0.07314955532843899, + "grad_norm": 9.0625, + "learning_rate": 7.314618051672073e-06, + "loss": 1.15140877, + "memory(GiB)": 299.96, + "step": 13080, + "train_speed(iter/s)": 0.128231 + }, + { + "acc": 0.73394585, + "epoch": 0.07326140480141825, + "grad_norm": 6.9375, + "learning_rate": 7.325802482943743e-06, + "loss": 1.05853596, + "memory(GiB)": 299.96, + "step": 13100, + "train_speed(iter/s)": 0.128323 + }, + { + "acc": 0.72067642, + "epoch": 0.07337325427439752, + "grad_norm": 7.3125, + "learning_rate": 7.336986914215412e-06, + "loss": 1.12345562, + "memory(GiB)": 299.96, + "step": 13120, + "train_speed(iter/s)": 0.128411 + }, + { + "acc": 0.72014093, + "epoch": 0.07348510374737678, + "grad_norm": 7.03125, + "learning_rate": 7.348171345487083e-06, + "loss": 1.12946339, + "memory(GiB)": 299.96, + "step": 13140, + "train_speed(iter/s)": 0.128505 + }, + { + "acc": 0.73488965, + "epoch": 0.07359695322035605, + "grad_norm": 5.21875, + "learning_rate": 7.359355776758752e-06, + "loss": 1.05136786, + "memory(GiB)": 299.96, + "step": 13160, + "train_speed(iter/s)": 0.128592 + }, + { + "acc": 0.71697998, + "epoch": 0.07370880269333531, + "grad_norm": 11.3125, + "learning_rate": 7.370540208030422e-06, + "loss": 1.15699654, + "memory(GiB)": 299.96, + "step": 13180, + "train_speed(iter/s)": 0.128674 + }, + { + "acc": 0.73048105, + "epoch": 0.07382065216631457, + "grad_norm": 6.625, + "learning_rate": 7.381724639302093e-06, + "loss": 1.08995342, + "memory(GiB)": 299.96, + "step": 13200, + "train_speed(iter/s)": 0.128768 + }, + { + "acc": 0.71543064, + "epoch": 0.07393250163929384, + "grad_norm": 6.9375, + "learning_rate": 7.3929090705737615e-06, + "loss": 1.15562963, + "memory(GiB)": 299.96, + "step": 13220, + "train_speed(iter/s)": 0.128861 + }, + { + "acc": 0.7140677, + "epoch": 0.0740443511122731, + "grad_norm": 4.78125, + "learning_rate": 7.404093501845432e-06, + "loss": 1.14009123, + "memory(GiB)": 299.96, + "step": 13240, + "train_speed(iter/s)": 0.128943 + }, + { + "acc": 0.72918878, + "epoch": 0.07415620058525237, + "grad_norm": 7.8125, + "learning_rate": 7.415277933117102e-06, + "loss": 1.08716183, + "memory(GiB)": 299.96, + "step": 13260, + "train_speed(iter/s)": 0.129028 + }, + { + "acc": 0.72013588, + "epoch": 0.07426805005823163, + "grad_norm": 13.8125, + "learning_rate": 7.426462364388771e-06, + "loss": 1.10681362, + "memory(GiB)": 299.96, + "step": 13280, + "train_speed(iter/s)": 0.129119 + }, + { + "acc": 0.72878904, + "epoch": 0.0743798995312109, + "grad_norm": 7.71875, + "learning_rate": 7.437646795660441e-06, + "loss": 1.11332664, + "memory(GiB)": 299.96, + "step": 13300, + "train_speed(iter/s)": 0.129206 + }, + { + "acc": 0.71884489, + "epoch": 0.07449174900419016, + "grad_norm": 6.9375, + "learning_rate": 7.448831226932111e-06, + "loss": 1.10544567, + "memory(GiB)": 299.96, + "step": 13320, + "train_speed(iter/s)": 0.129295 + }, + { + "acc": 0.71988254, + "epoch": 0.07460359847716942, + "grad_norm": 9.0, + "learning_rate": 7.4600156582037804e-06, + "loss": 1.12563887, + "memory(GiB)": 299.96, + "step": 13340, + "train_speed(iter/s)": 0.129382 + }, + { + "acc": 0.71389823, + "epoch": 0.07471544795014869, + "grad_norm": 5.5, + "learning_rate": 7.471200089475451e-06, + "loss": 1.1630208, + "memory(GiB)": 299.96, + "step": 13360, + "train_speed(iter/s)": 0.129457 + }, + { + "acc": 0.72000957, + "epoch": 0.07482729742312795, + "grad_norm": 8.9375, + "learning_rate": 7.48238452074712e-06, + "loss": 1.10553045, + "memory(GiB)": 299.96, + "step": 13380, + "train_speed(iter/s)": 0.129543 + }, + { + "acc": 0.72067237, + "epoch": 0.07493914689610721, + "grad_norm": 5.375, + "learning_rate": 7.49356895201879e-06, + "loss": 1.12431831, + "memory(GiB)": 299.96, + "step": 13400, + "train_speed(iter/s)": 0.129625 + }, + { + "acc": 0.735499, + "epoch": 0.07505099636908648, + "grad_norm": 6.6875, + "learning_rate": 7.504753383290461e-06, + "loss": 1.04852228, + "memory(GiB)": 299.96, + "step": 13420, + "train_speed(iter/s)": 0.129713 + }, + { + "acc": 0.72748532, + "epoch": 0.07516284584206574, + "grad_norm": 9.5, + "learning_rate": 7.51593781456213e-06, + "loss": 1.09055204, + "memory(GiB)": 299.96, + "step": 13440, + "train_speed(iter/s)": 0.129797 + }, + { + "acc": 0.72297673, + "epoch": 0.075274695315045, + "grad_norm": 7.25, + "learning_rate": 7.5271222458338e-06, + "loss": 1.08775005, + "memory(GiB)": 299.96, + "step": 13460, + "train_speed(iter/s)": 0.129886 + }, + { + "acc": 0.72882719, + "epoch": 0.07538654478802427, + "grad_norm": 9.125, + "learning_rate": 7.53830667710547e-06, + "loss": 1.06670494, + "memory(GiB)": 299.96, + "step": 13480, + "train_speed(iter/s)": 0.129973 + }, + { + "acc": 0.72202625, + "epoch": 0.07549839426100353, + "grad_norm": 8.8125, + "learning_rate": 7.5494911083771395e-06, + "loss": 1.11069765, + "memory(GiB)": 299.96, + "step": 13500, + "train_speed(iter/s)": 0.130052 + }, + { + "acc": 0.70265217, + "epoch": 0.07561024373398281, + "grad_norm": 7.5625, + "learning_rate": 7.560675539648809e-06, + "loss": 1.18559628, + "memory(GiB)": 299.96, + "step": 13520, + "train_speed(iter/s)": 0.130135 + }, + { + "acc": 0.72565856, + "epoch": 0.07572209320696208, + "grad_norm": 6.0, + "learning_rate": 7.57185997092048e-06, + "loss": 1.10519705, + "memory(GiB)": 299.96, + "step": 13540, + "train_speed(iter/s)": 0.13022 + }, + { + "acc": 0.7155601, + "epoch": 0.07583394267994134, + "grad_norm": 6.375, + "learning_rate": 7.5830444021921486e-06, + "loss": 1.13282661, + "memory(GiB)": 299.96, + "step": 13560, + "train_speed(iter/s)": 0.130303 + }, + { + "acc": 0.71096125, + "epoch": 0.0759457921529206, + "grad_norm": 6.46875, + "learning_rate": 7.594228833463819e-06, + "loss": 1.16910343, + "memory(GiB)": 299.96, + "step": 13580, + "train_speed(iter/s)": 0.130388 + }, + { + "acc": 0.72285824, + "epoch": 0.07605764162589987, + "grad_norm": 9.9375, + "learning_rate": 7.605413264735489e-06, + "loss": 1.11948538, + "memory(GiB)": 299.96, + "step": 13600, + "train_speed(iter/s)": 0.130479 + }, + { + "acc": 0.72891569, + "epoch": 0.07616949109887913, + "grad_norm": 6.375, + "learning_rate": 7.6165976960071584e-06, + "loss": 1.08795242, + "memory(GiB)": 299.96, + "step": 13620, + "train_speed(iter/s)": 0.130563 + }, + { + "acc": 0.72982469, + "epoch": 0.0762813405718584, + "grad_norm": 10.0625, + "learning_rate": 7.627782127278829e-06, + "loss": 1.05477695, + "memory(GiB)": 299.96, + "step": 13640, + "train_speed(iter/s)": 0.13065 + }, + { + "acc": 0.73584471, + "epoch": 0.07639319004483766, + "grad_norm": 7.8125, + "learning_rate": 7.638966558550498e-06, + "loss": 1.0453001, + "memory(GiB)": 299.96, + "step": 13660, + "train_speed(iter/s)": 0.130736 + }, + { + "acc": 0.72160897, + "epoch": 0.07650503951781693, + "grad_norm": 11.5, + "learning_rate": 7.650150989822168e-06, + "loss": 1.11282053, + "memory(GiB)": 299.96, + "step": 13680, + "train_speed(iter/s)": 0.130816 + }, + { + "acc": 0.72627187, + "epoch": 0.07661688899079619, + "grad_norm": 7.875, + "learning_rate": 7.661335421093839e-06, + "loss": 1.09102163, + "memory(GiB)": 299.96, + "step": 13700, + "train_speed(iter/s)": 0.130904 + }, + { + "acc": 0.70050597, + "epoch": 0.07672873846377545, + "grad_norm": 7.40625, + "learning_rate": 7.672519852365508e-06, + "loss": 1.21692333, + "memory(GiB)": 299.96, + "step": 13720, + "train_speed(iter/s)": 0.130989 + }, + { + "acc": 0.73602748, + "epoch": 0.07684058793675472, + "grad_norm": 9.4375, + "learning_rate": 7.683704283637178e-06, + "loss": 1.05643902, + "memory(GiB)": 299.96, + "step": 13740, + "train_speed(iter/s)": 0.131077 + }, + { + "acc": 0.71152887, + "epoch": 0.07695243740973398, + "grad_norm": 7.1875, + "learning_rate": 7.694888714908849e-06, + "loss": 1.15304899, + "memory(GiB)": 299.96, + "step": 13760, + "train_speed(iter/s)": 0.131157 + }, + { + "acc": 0.69116273, + "epoch": 0.07706428688271325, + "grad_norm": 6.96875, + "learning_rate": 7.706073146180518e-06, + "loss": 1.25027504, + "memory(GiB)": 299.96, + "step": 13780, + "train_speed(iter/s)": 0.131238 + }, + { + "acc": 0.7217988, + "epoch": 0.07717613635569251, + "grad_norm": 10.5, + "learning_rate": 7.717257577452188e-06, + "loss": 1.09320793, + "memory(GiB)": 299.96, + "step": 13800, + "train_speed(iter/s)": 0.131313 + }, + { + "acc": 0.72784657, + "epoch": 0.07728798582867177, + "grad_norm": 5.375, + "learning_rate": 7.728442008723857e-06, + "loss": 1.11974134, + "memory(GiB)": 299.96, + "step": 13820, + "train_speed(iter/s)": 0.131398 + }, + { + "acc": 0.71434736, + "epoch": 0.07739983530165104, + "grad_norm": 8.375, + "learning_rate": 7.739626439995527e-06, + "loss": 1.12553759, + "memory(GiB)": 299.96, + "step": 13840, + "train_speed(iter/s)": 0.131485 + }, + { + "acc": 0.71571827, + "epoch": 0.0775116847746303, + "grad_norm": 8.0625, + "learning_rate": 7.750810871267196e-06, + "loss": 1.13719101, + "memory(GiB)": 299.96, + "step": 13860, + "train_speed(iter/s)": 0.13157 + }, + { + "acc": 0.71877184, + "epoch": 0.07762353424760957, + "grad_norm": 9.375, + "learning_rate": 7.761995302538867e-06, + "loss": 1.11685209, + "memory(GiB)": 299.96, + "step": 13880, + "train_speed(iter/s)": 0.131654 + }, + { + "acc": 0.72853413, + "epoch": 0.07773538372058883, + "grad_norm": 8.0, + "learning_rate": 7.773179733810536e-06, + "loss": 1.0658493, + "memory(GiB)": 299.96, + "step": 13900, + "train_speed(iter/s)": 0.13174 + }, + { + "acc": 0.72110028, + "epoch": 0.0778472331935681, + "grad_norm": 12.125, + "learning_rate": 7.784364165082206e-06, + "loss": 1.13343983, + "memory(GiB)": 299.96, + "step": 13920, + "train_speed(iter/s)": 0.131807 + }, + { + "acc": 0.71897192, + "epoch": 0.07795908266654736, + "grad_norm": 7.21875, + "learning_rate": 7.795548596353875e-06, + "loss": 1.11369867, + "memory(GiB)": 299.96, + "step": 13940, + "train_speed(iter/s)": 0.131892 + }, + { + "acc": 0.72556615, + "epoch": 0.07807093213952662, + "grad_norm": 5.96875, + "learning_rate": 7.806733027625545e-06, + "loss": 1.08284416, + "memory(GiB)": 299.96, + "step": 13960, + "train_speed(iter/s)": 0.131976 + }, + { + "acc": 0.73107743, + "epoch": 0.07818278161250589, + "grad_norm": 7.125, + "learning_rate": 7.817917458897216e-06, + "loss": 1.05044518, + "memory(GiB)": 299.96, + "step": 13980, + "train_speed(iter/s)": 0.132055 + }, + { + "acc": 0.72434435, + "epoch": 0.07829463108548515, + "grad_norm": 7.40625, + "learning_rate": 7.829101890168885e-06, + "loss": 1.10251265, + "memory(GiB)": 299.96, + "step": 14000, + "train_speed(iter/s)": 0.132137 + }, + { + "epoch": 0.07829463108548515, + "eval_acc": 0.6785252550886854, + "eval_loss": 1.1261999607086182, + "eval_runtime": 7530.0347, + "eval_samples_per_second": 9.998, + "eval_steps_per_second": 9.998, + "step": 14000 + }, + { + "acc": 0.71372938, + "epoch": 0.07840648055846441, + "grad_norm": 7.125, + "learning_rate": 7.840286321440555e-06, + "loss": 1.16319408, + "memory(GiB)": 299.96, + "step": 14020, + "train_speed(iter/s)": 0.12334 + }, + { + "acc": 0.72111239, + "epoch": 0.07851833003144368, + "grad_norm": 4.96875, + "learning_rate": 7.851470752712224e-06, + "loss": 1.1148118, + "memory(GiB)": 299.96, + "step": 14040, + "train_speed(iter/s)": 0.123428 + }, + { + "acc": 0.70859618, + "epoch": 0.07863017950442294, + "grad_norm": 6.6875, + "learning_rate": 7.862655183983895e-06, + "loss": 1.15905619, + "memory(GiB)": 299.96, + "step": 14060, + "train_speed(iter/s)": 0.123511 + }, + { + "acc": 0.70089817, + "epoch": 0.0787420289774022, + "grad_norm": 7.59375, + "learning_rate": 7.873839615255565e-06, + "loss": 1.21567278, + "memory(GiB)": 299.96, + "step": 14080, + "train_speed(iter/s)": 0.12359 + }, + { + "acc": 0.71554856, + "epoch": 0.07885387845038147, + "grad_norm": 6.28125, + "learning_rate": 7.885024046527234e-06, + "loss": 1.14543257, + "memory(GiB)": 299.96, + "step": 14100, + "train_speed(iter/s)": 0.123675 + }, + { + "acc": 0.73004298, + "epoch": 0.07896572792336073, + "grad_norm": 7.84375, + "learning_rate": 7.896208477798905e-06, + "loss": 1.0746913, + "memory(GiB)": 299.96, + "step": 14120, + "train_speed(iter/s)": 0.12376 + }, + { + "acc": 0.71585212, + "epoch": 0.07907757739634001, + "grad_norm": 6.375, + "learning_rate": 7.907392909070575e-06, + "loss": 1.12623034, + "memory(GiB)": 299.96, + "step": 14140, + "train_speed(iter/s)": 0.123842 + }, + { + "acc": 0.72230544, + "epoch": 0.07918942686931928, + "grad_norm": 8.3125, + "learning_rate": 7.918577340342244e-06, + "loss": 1.12734022, + "memory(GiB)": 299.96, + "step": 14160, + "train_speed(iter/s)": 0.123924 + }, + { + "acc": 0.72393107, + "epoch": 0.07930127634229854, + "grad_norm": 7.375, + "learning_rate": 7.929761771613914e-06, + "loss": 1.11149139, + "memory(GiB)": 299.96, + "step": 14180, + "train_speed(iter/s)": 0.124002 + }, + { + "acc": 0.71570444, + "epoch": 0.0794131258152778, + "grad_norm": 9.375, + "learning_rate": 7.940946202885585e-06, + "loss": 1.14884701, + "memory(GiB)": 299.96, + "step": 14200, + "train_speed(iter/s)": 0.12408 + }, + { + "acc": 0.71533198, + "epoch": 0.07952497528825707, + "grad_norm": 6.84375, + "learning_rate": 7.952130634157254e-06, + "loss": 1.12676287, + "memory(GiB)": 299.96, + "step": 14220, + "train_speed(iter/s)": 0.124162 + }, + { + "acc": 0.71647749, + "epoch": 0.07963682476123633, + "grad_norm": 11.5, + "learning_rate": 7.963315065428924e-06, + "loss": 1.1297121, + "memory(GiB)": 299.96, + "step": 14240, + "train_speed(iter/s)": 0.124244 + }, + { + "acc": 0.7258872, + "epoch": 0.0797486742342156, + "grad_norm": 8.875, + "learning_rate": 7.974499496700593e-06, + "loss": 1.09186125, + "memory(GiB)": 299.96, + "step": 14260, + "train_speed(iter/s)": 0.12433 + }, + { + "acc": 0.72609582, + "epoch": 0.07986052370719486, + "grad_norm": 6.3125, + "learning_rate": 7.985683927972264e-06, + "loss": 1.07992964, + "memory(GiB)": 299.96, + "step": 14280, + "train_speed(iter/s)": 0.124407 + }, + { + "acc": 0.70103803, + "epoch": 0.07997237318017412, + "grad_norm": 5.09375, + "learning_rate": 7.996868359243933e-06, + "loss": 1.18996572, + "memory(GiB)": 299.96, + "step": 14300, + "train_speed(iter/s)": 0.124492 + }, + { + "acc": 0.71911659, + "epoch": 0.08008422265315339, + "grad_norm": 6.875, + "learning_rate": 8.008052790515603e-06, + "loss": 1.14281359, + "memory(GiB)": 299.96, + "step": 14320, + "train_speed(iter/s)": 0.124578 + }, + { + "acc": 0.72406898, + "epoch": 0.08019607212613265, + "grad_norm": 6.96875, + "learning_rate": 8.019237221787274e-06, + "loss": 1.09950714, + "memory(GiB)": 299.96, + "step": 14340, + "train_speed(iter/s)": 0.124658 + }, + { + "acc": 0.71265521, + "epoch": 0.08030792159911192, + "grad_norm": 8.1875, + "learning_rate": 8.030421653058942e-06, + "loss": 1.15389471, + "memory(GiB)": 299.96, + "step": 14360, + "train_speed(iter/s)": 0.12474 + }, + { + "acc": 0.73210182, + "epoch": 0.08041977107209118, + "grad_norm": 10.5625, + "learning_rate": 8.041606084330613e-06, + "loss": 1.05806293, + "memory(GiB)": 299.96, + "step": 14380, + "train_speed(iter/s)": 0.124821 + }, + { + "acc": 0.72704787, + "epoch": 0.08053162054507045, + "grad_norm": 8.1875, + "learning_rate": 8.052790515602282e-06, + "loss": 1.05650606, + "memory(GiB)": 299.96, + "step": 14400, + "train_speed(iter/s)": 0.124903 + }, + { + "acc": 0.74185038, + "epoch": 0.08064347001804971, + "grad_norm": 9.875, + "learning_rate": 8.063974946873952e-06, + "loss": 1.03100071, + "memory(GiB)": 299.96, + "step": 14420, + "train_speed(iter/s)": 0.124983 + }, + { + "acc": 0.7273756, + "epoch": 0.08075531949102897, + "grad_norm": 7.0625, + "learning_rate": 8.075159378145621e-06, + "loss": 1.07619267, + "memory(GiB)": 299.96, + "step": 14440, + "train_speed(iter/s)": 0.125067 + }, + { + "acc": 0.72888598, + "epoch": 0.08086716896400824, + "grad_norm": 9.875, + "learning_rate": 8.086343809417292e-06, + "loss": 1.06406326, + "memory(GiB)": 299.96, + "step": 14460, + "train_speed(iter/s)": 0.125151 + }, + { + "acc": 0.71566262, + "epoch": 0.0809790184369875, + "grad_norm": 6.0, + "learning_rate": 8.097528240688962e-06, + "loss": 1.14834814, + "memory(GiB)": 299.96, + "step": 14480, + "train_speed(iter/s)": 0.125228 + }, + { + "acc": 0.72459564, + "epoch": 0.08109086790996677, + "grad_norm": 8.5, + "learning_rate": 8.108712671960631e-06, + "loss": 1.10716896, + "memory(GiB)": 299.96, + "step": 14500, + "train_speed(iter/s)": 0.12531 + }, + { + "acc": 0.72091928, + "epoch": 0.08120271738294603, + "grad_norm": 6.3125, + "learning_rate": 8.119897103232301e-06, + "loss": 1.11735744, + "memory(GiB)": 299.96, + "step": 14520, + "train_speed(iter/s)": 0.125389 + }, + { + "acc": 0.72285762, + "epoch": 0.0813145668559253, + "grad_norm": 9.375, + "learning_rate": 8.13108153450397e-06, + "loss": 1.11511021, + "memory(GiB)": 299.96, + "step": 14540, + "train_speed(iter/s)": 0.125463 + }, + { + "acc": 0.71224494, + "epoch": 0.08142641632890456, + "grad_norm": 10.25, + "learning_rate": 8.14226596577564e-06, + "loss": 1.14703474, + "memory(GiB)": 299.96, + "step": 14560, + "train_speed(iter/s)": 0.125541 + }, + { + "acc": 0.7246933, + "epoch": 0.08153826580188382, + "grad_norm": 7.1875, + "learning_rate": 8.153450397047311e-06, + "loss": 1.09309654, + "memory(GiB)": 299.96, + "step": 14580, + "train_speed(iter/s)": 0.125621 + }, + { + "acc": 0.71305122, + "epoch": 0.08165011527486309, + "grad_norm": 4.78125, + "learning_rate": 8.16463482831898e-06, + "loss": 1.14956102, + "memory(GiB)": 299.96, + "step": 14600, + "train_speed(iter/s)": 0.125701 + }, + { + "acc": 0.72421379, + "epoch": 0.08176196474784235, + "grad_norm": 6.90625, + "learning_rate": 8.17581925959065e-06, + "loss": 1.09100389, + "memory(GiB)": 299.96, + "step": 14620, + "train_speed(iter/s)": 0.125781 + }, + { + "acc": 0.726438, + "epoch": 0.08187381422082161, + "grad_norm": 7.375, + "learning_rate": 8.187003690862321e-06, + "loss": 1.10120821, + "memory(GiB)": 299.96, + "step": 14640, + "train_speed(iter/s)": 0.125861 + }, + { + "acc": 0.71379356, + "epoch": 0.08198566369380088, + "grad_norm": 6.875, + "learning_rate": 8.19818812213399e-06, + "loss": 1.1374877, + "memory(GiB)": 299.96, + "step": 14660, + "train_speed(iter/s)": 0.125941 + }, + { + "acc": 0.71736422, + "epoch": 0.08209751316678014, + "grad_norm": 6.09375, + "learning_rate": 8.20937255340566e-06, + "loss": 1.13864965, + "memory(GiB)": 299.96, + "step": 14680, + "train_speed(iter/s)": 0.12602 + }, + { + "acc": 0.73167295, + "epoch": 0.0822093626397594, + "grad_norm": 11.5, + "learning_rate": 8.220556984677331e-06, + "loss": 1.06764441, + "memory(GiB)": 299.96, + "step": 14700, + "train_speed(iter/s)": 0.126101 + }, + { + "acc": 0.70931406, + "epoch": 0.08232121211273867, + "grad_norm": 7.40625, + "learning_rate": 8.231741415949e-06, + "loss": 1.17839241, + "memory(GiB)": 299.96, + "step": 14720, + "train_speed(iter/s)": 0.12618 + }, + { + "acc": 0.71851048, + "epoch": 0.08243306158571793, + "grad_norm": 5.53125, + "learning_rate": 8.24292584722067e-06, + "loss": 1.14332142, + "memory(GiB)": 299.96, + "step": 14740, + "train_speed(iter/s)": 0.126251 + }, + { + "acc": 0.71137023, + "epoch": 0.0825449110586972, + "grad_norm": 10.0625, + "learning_rate": 8.25411027849234e-06, + "loss": 1.15315361, + "memory(GiB)": 299.96, + "step": 14760, + "train_speed(iter/s)": 0.126329 + }, + { + "acc": 0.71221423, + "epoch": 0.08265676053167648, + "grad_norm": 6.96875, + "learning_rate": 8.26529470976401e-06, + "loss": 1.16082382, + "memory(GiB)": 299.96, + "step": 14780, + "train_speed(iter/s)": 0.126407 + }, + { + "acc": 0.71140294, + "epoch": 0.08276861000465574, + "grad_norm": 6.40625, + "learning_rate": 8.276479141035679e-06, + "loss": 1.14966726, + "memory(GiB)": 299.96, + "step": 14800, + "train_speed(iter/s)": 0.126487 + }, + { + "acc": 0.72899508, + "epoch": 0.082880459477635, + "grad_norm": 5.90625, + "learning_rate": 8.287663572307349e-06, + "loss": 1.07692499, + "memory(GiB)": 299.96, + "step": 14820, + "train_speed(iter/s)": 0.126563 + }, + { + "acc": 0.69936152, + "epoch": 0.08299230895061427, + "grad_norm": 6.34375, + "learning_rate": 8.298848003579018e-06, + "loss": 1.18731127, + "memory(GiB)": 299.96, + "step": 14840, + "train_speed(iter/s)": 0.126641 + }, + { + "acc": 0.70173173, + "epoch": 0.08310415842359353, + "grad_norm": 6.5625, + "learning_rate": 8.310032434850689e-06, + "loss": 1.19463453, + "memory(GiB)": 299.96, + "step": 14860, + "train_speed(iter/s)": 0.126723 + }, + { + "acc": 0.72426558, + "epoch": 0.0832160078965728, + "grad_norm": 5.75, + "learning_rate": 8.321216866122357e-06, + "loss": 1.09674282, + "memory(GiB)": 299.96, + "step": 14880, + "train_speed(iter/s)": 0.126801 + }, + { + "acc": 0.72899284, + "epoch": 0.08332785736955206, + "grad_norm": 8.625, + "learning_rate": 8.332401297394028e-06, + "loss": 1.10620289, + "memory(GiB)": 299.96, + "step": 14900, + "train_speed(iter/s)": 0.126879 + }, + { + "acc": 0.72005281, + "epoch": 0.08343970684253132, + "grad_norm": 7.53125, + "learning_rate": 8.343585728665698e-06, + "loss": 1.1132308, + "memory(GiB)": 299.96, + "step": 14920, + "train_speed(iter/s)": 0.126961 + }, + { + "acc": 0.72139492, + "epoch": 0.08355155631551059, + "grad_norm": 8.5, + "learning_rate": 8.354770159937367e-06, + "loss": 1.10732985, + "memory(GiB)": 299.96, + "step": 14940, + "train_speed(iter/s)": 0.127041 + }, + { + "acc": 0.70825353, + "epoch": 0.08366340578848985, + "grad_norm": 11.75, + "learning_rate": 8.365954591209038e-06, + "loss": 1.1731452, + "memory(GiB)": 299.96, + "step": 14960, + "train_speed(iter/s)": 0.12712 + }, + { + "acc": 0.72638187, + "epoch": 0.08377525526146912, + "grad_norm": 7.125, + "learning_rate": 8.377139022480708e-06, + "loss": 1.07427464, + "memory(GiB)": 299.96, + "step": 14980, + "train_speed(iter/s)": 0.127199 + }, + { + "acc": 0.73510451, + "epoch": 0.08388710473444838, + "grad_norm": 8.125, + "learning_rate": 8.388323453752377e-06, + "loss": 1.04545097, + "memory(GiB)": 299.96, + "step": 15000, + "train_speed(iter/s)": 0.127277 + }, + { + "acc": 0.72449846, + "epoch": 0.08399895420742765, + "grad_norm": 9.625, + "learning_rate": 8.399507885024048e-06, + "loss": 1.09780903, + "memory(GiB)": 299.96, + "step": 15020, + "train_speed(iter/s)": 0.127357 + }, + { + "acc": 0.71494284, + "epoch": 0.08411080368040691, + "grad_norm": 7.90625, + "learning_rate": 8.410692316295716e-06, + "loss": 1.137008, + "memory(GiB)": 299.96, + "step": 15040, + "train_speed(iter/s)": 0.127436 + }, + { + "acc": 0.71658249, + "epoch": 0.08422265315338617, + "grad_norm": 9.0625, + "learning_rate": 8.421876747567387e-06, + "loss": 1.10606384, + "memory(GiB)": 299.96, + "step": 15060, + "train_speed(iter/s)": 0.127508 + }, + { + "acc": 0.72326217, + "epoch": 0.08433450262636544, + "grad_norm": 8.5, + "learning_rate": 8.433061178839057e-06, + "loss": 1.09878759, + "memory(GiB)": 299.96, + "step": 15080, + "train_speed(iter/s)": 0.127581 + }, + { + "acc": 0.71872349, + "epoch": 0.0844463520993447, + "grad_norm": 7.28125, + "learning_rate": 8.444245610110726e-06, + "loss": 1.13200254, + "memory(GiB)": 299.96, + "step": 15100, + "train_speed(iter/s)": 0.127657 + }, + { + "acc": 0.70972772, + "epoch": 0.08455820157232397, + "grad_norm": 8.0625, + "learning_rate": 8.455430041382397e-06, + "loss": 1.169559, + "memory(GiB)": 299.96, + "step": 15120, + "train_speed(iter/s)": 0.127734 + }, + { + "acc": 0.71702828, + "epoch": 0.08467005104530323, + "grad_norm": 8.5, + "learning_rate": 8.466614472654067e-06, + "loss": 1.13463097, + "memory(GiB)": 299.96, + "step": 15140, + "train_speed(iter/s)": 0.12781 + }, + { + "acc": 0.71002851, + "epoch": 0.0847819005182825, + "grad_norm": 7.71875, + "learning_rate": 8.477798903925736e-06, + "loss": 1.16606226, + "memory(GiB)": 299.96, + "step": 15160, + "train_speed(iter/s)": 0.127888 + }, + { + "acc": 0.71450334, + "epoch": 0.08489374999126176, + "grad_norm": 7.90625, + "learning_rate": 8.488983335197407e-06, + "loss": 1.13542643, + "memory(GiB)": 299.96, + "step": 15180, + "train_speed(iter/s)": 0.127962 + }, + { + "acc": 0.72811384, + "epoch": 0.08500559946424102, + "grad_norm": 6.28125, + "learning_rate": 8.500167766469076e-06, + "loss": 1.05916615, + "memory(GiB)": 299.96, + "step": 15200, + "train_speed(iter/s)": 0.128033 + }, + { + "acc": 0.70773239, + "epoch": 0.08511744893722029, + "grad_norm": 6.90625, + "learning_rate": 8.511352197740746e-06, + "loss": 1.14004459, + "memory(GiB)": 299.96, + "step": 15220, + "train_speed(iter/s)": 0.128109 + }, + { + "acc": 0.71311922, + "epoch": 0.08522929841019955, + "grad_norm": 13.25, + "learning_rate": 8.522536629012415e-06, + "loss": 1.16433716, + "memory(GiB)": 299.96, + "step": 15240, + "train_speed(iter/s)": 0.128185 + }, + { + "acc": 0.72341747, + "epoch": 0.08534114788317881, + "grad_norm": 6.1875, + "learning_rate": 8.533721060284085e-06, + "loss": 1.12659245, + "memory(GiB)": 299.96, + "step": 15260, + "train_speed(iter/s)": 0.128253 + }, + { + "acc": 0.72805767, + "epoch": 0.08545299735615808, + "grad_norm": 10.25, + "learning_rate": 8.544905491555754e-06, + "loss": 1.07205744, + "memory(GiB)": 299.96, + "step": 15280, + "train_speed(iter/s)": 0.128314 + }, + { + "acc": 0.71228724, + "epoch": 0.08556484682913734, + "grad_norm": 7.53125, + "learning_rate": 8.556089922827425e-06, + "loss": 1.16384115, + "memory(GiB)": 299.96, + "step": 15300, + "train_speed(iter/s)": 0.128385 + }, + { + "acc": 0.72422848, + "epoch": 0.0856766963021166, + "grad_norm": 7.59375, + "learning_rate": 8.567274354099094e-06, + "loss": 1.07942228, + "memory(GiB)": 299.96, + "step": 15320, + "train_speed(iter/s)": 0.128461 + }, + { + "acc": 0.70650077, + "epoch": 0.08578854577509587, + "grad_norm": 6.25, + "learning_rate": 8.578458785370764e-06, + "loss": 1.17013273, + "memory(GiB)": 299.96, + "step": 15340, + "train_speed(iter/s)": 0.128532 + }, + { + "acc": 0.7333982, + "epoch": 0.08590039524807513, + "grad_norm": 9.25, + "learning_rate": 8.589643216642435e-06, + "loss": 1.05421715, + "memory(GiB)": 299.96, + "step": 15360, + "train_speed(iter/s)": 0.128606 + }, + { + "acc": 0.71431627, + "epoch": 0.0860122447210544, + "grad_norm": 10.75, + "learning_rate": 8.600827647914103e-06, + "loss": 1.14421864, + "memory(GiB)": 299.96, + "step": 15380, + "train_speed(iter/s)": 0.128684 + }, + { + "acc": 0.71660824, + "epoch": 0.08612409419403368, + "grad_norm": 6.625, + "learning_rate": 8.612012079185774e-06, + "loss": 1.11881771, + "memory(GiB)": 299.96, + "step": 15400, + "train_speed(iter/s)": 0.12875 + }, + { + "acc": 0.70655789, + "epoch": 0.08623594366701294, + "grad_norm": 7.8125, + "learning_rate": 8.623196510457445e-06, + "loss": 1.14725018, + "memory(GiB)": 299.96, + "step": 15420, + "train_speed(iter/s)": 0.128826 + }, + { + "acc": 0.73793497, + "epoch": 0.0863477931399922, + "grad_norm": 7.8125, + "learning_rate": 8.634380941729113e-06, + "loss": 1.06054935, + "memory(GiB)": 299.96, + "step": 15440, + "train_speed(iter/s)": 0.128904 + }, + { + "acc": 0.7203598, + "epoch": 0.08645964261297147, + "grad_norm": 7.0, + "learning_rate": 8.645565373000784e-06, + "loss": 1.11986656, + "memory(GiB)": 299.96, + "step": 15460, + "train_speed(iter/s)": 0.12898 + }, + { + "acc": 0.7218328, + "epoch": 0.08657149208595073, + "grad_norm": 9.0, + "learning_rate": 8.656749804272453e-06, + "loss": 1.10249891, + "memory(GiB)": 299.96, + "step": 15480, + "train_speed(iter/s)": 0.129054 + }, + { + "acc": 0.71918802, + "epoch": 0.08668334155893, + "grad_norm": 6.9375, + "learning_rate": 8.667934235544123e-06, + "loss": 1.12033682, + "memory(GiB)": 299.96, + "step": 15500, + "train_speed(iter/s)": 0.129126 + }, + { + "acc": 0.70987444, + "epoch": 0.08679519103190926, + "grad_norm": 7.15625, + "learning_rate": 8.679118666815794e-06, + "loss": 1.15769444, + "memory(GiB)": 299.96, + "step": 15520, + "train_speed(iter/s)": 0.129193 + }, + { + "acc": 0.71792817, + "epoch": 0.08690704050488852, + "grad_norm": 5.96875, + "learning_rate": 8.690303098087463e-06, + "loss": 1.13789692, + "memory(GiB)": 299.96, + "step": 15540, + "train_speed(iter/s)": 0.129267 + }, + { + "acc": 0.72702146, + "epoch": 0.08701888997786779, + "grad_norm": 8.25, + "learning_rate": 8.701487529359133e-06, + "loss": 1.09828196, + "memory(GiB)": 299.96, + "step": 15560, + "train_speed(iter/s)": 0.129336 + }, + { + "acc": 0.7359446, + "epoch": 0.08713073945084705, + "grad_norm": 8.6875, + "learning_rate": 8.712671960630804e-06, + "loss": 1.0351366, + "memory(GiB)": 299.96, + "step": 15580, + "train_speed(iter/s)": 0.129413 + }, + { + "acc": 0.71889844, + "epoch": 0.08724258892382632, + "grad_norm": 5.6875, + "learning_rate": 8.723856391902472e-06, + "loss": 1.12366371, + "memory(GiB)": 299.96, + "step": 15600, + "train_speed(iter/s)": 0.129488 + }, + { + "acc": 0.72204828, + "epoch": 0.08735443839680558, + "grad_norm": 9.0, + "learning_rate": 8.735040823174143e-06, + "loss": 1.1168541, + "memory(GiB)": 299.96, + "step": 15620, + "train_speed(iter/s)": 0.129563 + }, + { + "acc": 0.70113058, + "epoch": 0.08746628786978485, + "grad_norm": 6.875, + "learning_rate": 8.746225254445812e-06, + "loss": 1.2004714, + "memory(GiB)": 299.96, + "step": 15640, + "train_speed(iter/s)": 0.129637 + }, + { + "acc": 0.71872373, + "epoch": 0.08757813734276411, + "grad_norm": 16.375, + "learning_rate": 8.757409685717482e-06, + "loss": 1.1445137, + "memory(GiB)": 299.96, + "step": 15660, + "train_speed(iter/s)": 0.129708 + }, + { + "acc": 0.71964183, + "epoch": 0.08768998681574337, + "grad_norm": 6.03125, + "learning_rate": 8.768594116989151e-06, + "loss": 1.11300507, + "memory(GiB)": 299.96, + "step": 15680, + "train_speed(iter/s)": 0.129783 + }, + { + "acc": 0.72189817, + "epoch": 0.08780183628872264, + "grad_norm": 6.0625, + "learning_rate": 8.779778548260822e-06, + "loss": 1.11102085, + "memory(GiB)": 299.96, + "step": 15700, + "train_speed(iter/s)": 0.129851 + }, + { + "acc": 0.73142538, + "epoch": 0.0879136857617019, + "grad_norm": 8.1875, + "learning_rate": 8.79096297953249e-06, + "loss": 1.05415268, + "memory(GiB)": 299.96, + "step": 15720, + "train_speed(iter/s)": 0.129926 + }, + { + "acc": 0.71516218, + "epoch": 0.08802553523468117, + "grad_norm": 5.90625, + "learning_rate": 8.802147410804161e-06, + "loss": 1.13700647, + "memory(GiB)": 299.96, + "step": 15740, + "train_speed(iter/s)": 0.129993 + }, + { + "acc": 0.72839966, + "epoch": 0.08813738470766043, + "grad_norm": 4.90625, + "learning_rate": 8.813331842075832e-06, + "loss": 1.0780551, + "memory(GiB)": 299.96, + "step": 15760, + "train_speed(iter/s)": 0.130071 + }, + { + "acc": 0.71712637, + "epoch": 0.0882492341806397, + "grad_norm": 7.75, + "learning_rate": 8.8245162733475e-06, + "loss": 1.12541351, + "memory(GiB)": 299.96, + "step": 15780, + "train_speed(iter/s)": 0.130145 + }, + { + "acc": 0.72921314, + "epoch": 0.08836108365361896, + "grad_norm": 6.78125, + "learning_rate": 8.835700704619171e-06, + "loss": 1.08156853, + "memory(GiB)": 299.96, + "step": 15800, + "train_speed(iter/s)": 0.130218 + }, + { + "acc": 0.72598319, + "epoch": 0.08847293312659822, + "grad_norm": 9.875, + "learning_rate": 8.84688513589084e-06, + "loss": 1.09497938, + "memory(GiB)": 299.96, + "step": 15820, + "train_speed(iter/s)": 0.130291 + }, + { + "acc": 0.71232224, + "epoch": 0.08858478259957749, + "grad_norm": 7.25, + "learning_rate": 8.85806956716251e-06, + "loss": 1.14410877, + "memory(GiB)": 299.96, + "step": 15840, + "train_speed(iter/s)": 0.130368 + }, + { + "acc": 0.70128884, + "epoch": 0.08869663207255675, + "grad_norm": 6.34375, + "learning_rate": 8.86925399843418e-06, + "loss": 1.18689194, + "memory(GiB)": 299.96, + "step": 15860, + "train_speed(iter/s)": 0.130442 + }, + { + "acc": 0.7251863, + "epoch": 0.08880848154553601, + "grad_norm": 5.34375, + "learning_rate": 8.88043842970585e-06, + "loss": 1.10014687, + "memory(GiB)": 299.96, + "step": 15880, + "train_speed(iter/s)": 0.130518 + }, + { + "acc": 0.71755209, + "epoch": 0.08892033101851528, + "grad_norm": 6.65625, + "learning_rate": 8.89162286097752e-06, + "loss": 1.11049767, + "memory(GiB)": 299.96, + "step": 15900, + "train_speed(iter/s)": 0.130593 + }, + { + "acc": 0.72990346, + "epoch": 0.08903218049149454, + "grad_norm": 7.09375, + "learning_rate": 8.90280729224919e-06, + "loss": 1.07787294, + "memory(GiB)": 299.96, + "step": 15920, + "train_speed(iter/s)": 0.130659 + }, + { + "acc": 0.7052834, + "epoch": 0.0891440299644738, + "grad_norm": 6.96875, + "learning_rate": 8.91399172352086e-06, + "loss": 1.1785593, + "memory(GiB)": 299.96, + "step": 15940, + "train_speed(iter/s)": 0.130729 + }, + { + "acc": 0.72420111, + "epoch": 0.08925587943745307, + "grad_norm": 7.71875, + "learning_rate": 8.92517615479253e-06, + "loss": 1.09778662, + "memory(GiB)": 299.96, + "step": 15960, + "train_speed(iter/s)": 0.130797 + }, + { + "acc": 0.72146573, + "epoch": 0.08936772891043233, + "grad_norm": 5.65625, + "learning_rate": 8.936360586064199e-06, + "loss": 1.09686022, + "memory(GiB)": 299.96, + "step": 15980, + "train_speed(iter/s)": 0.130872 + }, + { + "acc": 0.73606639, + "epoch": 0.0894795783834116, + "grad_norm": 9.75, + "learning_rate": 8.94754501733587e-06, + "loss": 1.05164165, + "memory(GiB)": 299.96, + "step": 16000, + "train_speed(iter/s)": 0.130946 + }, + { + "epoch": 0.0894795783834116, + "eval_acc": 0.6817060809261959, + "eval_loss": 1.1132763624191284, + "eval_runtime": 7516.332, + "eval_samples_per_second": 10.016, + "eval_steps_per_second": 10.016, + "step": 16000 + }, + { + "acc": 0.71704493, + "epoch": 0.08959142785639086, + "grad_norm": 7.5625, + "learning_rate": 8.95872944860754e-06, + "loss": 1.1295702, + "memory(GiB)": 299.96, + "step": 16020, + "train_speed(iter/s)": 0.123337 + }, + { + "acc": 0.7423439, + "epoch": 0.08970327732937014, + "grad_norm": 7.28125, + "learning_rate": 8.969913879879209e-06, + "loss": 1.01161556, + "memory(GiB)": 299.96, + "step": 16040, + "train_speed(iter/s)": 0.123413 + }, + { + "acc": 0.71307049, + "epoch": 0.0898151268023494, + "grad_norm": 7.6875, + "learning_rate": 8.98109831115088e-06, + "loss": 1.15562334, + "memory(GiB)": 299.96, + "step": 16060, + "train_speed(iter/s)": 0.123486 + }, + { + "acc": 0.7151413, + "epoch": 0.08992697627532867, + "grad_norm": 5.28125, + "learning_rate": 8.99228274242255e-06, + "loss": 1.14060345, + "memory(GiB)": 299.96, + "step": 16080, + "train_speed(iter/s)": 0.123554 + }, + { + "acc": 0.72465525, + "epoch": 0.09003882574830793, + "grad_norm": 9.0, + "learning_rate": 9.003467173694219e-06, + "loss": 1.10316734, + "memory(GiB)": 299.96, + "step": 16100, + "train_speed(iter/s)": 0.123627 + }, + { + "acc": 0.7227675, + "epoch": 0.0901506752212872, + "grad_norm": 9.5, + "learning_rate": 9.014651604965889e-06, + "loss": 1.0954318, + "memory(GiB)": 299.96, + "step": 16120, + "train_speed(iter/s)": 0.123703 + }, + { + "acc": 0.71199646, + "epoch": 0.09026252469426646, + "grad_norm": 9.5, + "learning_rate": 9.025836036237558e-06, + "loss": 1.15796299, + "memory(GiB)": 299.96, + "step": 16140, + "train_speed(iter/s)": 0.123774 + }, + { + "acc": 0.716922, + "epoch": 0.09037437416724572, + "grad_norm": 7.90625, + "learning_rate": 9.037020467509228e-06, + "loss": 1.14076014, + "memory(GiB)": 299.96, + "step": 16160, + "train_speed(iter/s)": 0.123846 + }, + { + "acc": 0.7140439, + "epoch": 0.09048622364022499, + "grad_norm": 8.4375, + "learning_rate": 9.048204898780897e-06, + "loss": 1.16756878, + "memory(GiB)": 299.96, + "step": 16180, + "train_speed(iter/s)": 0.12392 + }, + { + "acc": 0.7108572, + "epoch": 0.09059807311320425, + "grad_norm": 6.84375, + "learning_rate": 9.059389330052568e-06, + "loss": 1.15692205, + "memory(GiB)": 299.96, + "step": 16200, + "train_speed(iter/s)": 0.123994 + }, + { + "acc": 0.69093671, + "epoch": 0.09070992258618352, + "grad_norm": 9.0, + "learning_rate": 9.070573761324237e-06, + "loss": 1.24735231, + "memory(GiB)": 299.96, + "step": 16220, + "train_speed(iter/s)": 0.124061 + }, + { + "acc": 0.7262773, + "epoch": 0.09082177205916278, + "grad_norm": 6.0, + "learning_rate": 9.081758192595907e-06, + "loss": 1.0943058, + "memory(GiB)": 299.96, + "step": 16240, + "train_speed(iter/s)": 0.124132 + }, + { + "acc": 0.72486143, + "epoch": 0.09093362153214204, + "grad_norm": 6.625, + "learning_rate": 9.092942623867576e-06, + "loss": 1.0947937, + "memory(GiB)": 299.96, + "step": 16260, + "train_speed(iter/s)": 0.124202 + }, + { + "acc": 0.72948265, + "epoch": 0.09104547100512131, + "grad_norm": 9.125, + "learning_rate": 9.104127055139246e-06, + "loss": 1.07614708, + "memory(GiB)": 299.96, + "step": 16280, + "train_speed(iter/s)": 0.124271 + }, + { + "acc": 0.72866535, + "epoch": 0.09115732047810057, + "grad_norm": 10.375, + "learning_rate": 9.115311486410917e-06, + "loss": 1.09043608, + "memory(GiB)": 299.96, + "step": 16300, + "train_speed(iter/s)": 0.124342 + }, + { + "acc": 0.69413266, + "epoch": 0.09126916995107984, + "grad_norm": 5.125, + "learning_rate": 9.126495917682586e-06, + "loss": 1.26303711, + "memory(GiB)": 299.96, + "step": 16320, + "train_speed(iter/s)": 0.124414 + }, + { + "acc": 0.72490511, + "epoch": 0.0913810194240591, + "grad_norm": 6.875, + "learning_rate": 9.137680348954256e-06, + "loss": 1.0947546, + "memory(GiB)": 299.96, + "step": 16340, + "train_speed(iter/s)": 0.124487 + }, + { + "acc": 0.71938581, + "epoch": 0.09149286889703837, + "grad_norm": 8.6875, + "learning_rate": 9.148864780225927e-06, + "loss": 1.12428665, + "memory(GiB)": 299.96, + "step": 16360, + "train_speed(iter/s)": 0.124559 + }, + { + "acc": 0.71482072, + "epoch": 0.09160471837001763, + "grad_norm": 7.0625, + "learning_rate": 9.160049211497596e-06, + "loss": 1.13491964, + "memory(GiB)": 299.96, + "step": 16380, + "train_speed(iter/s)": 0.124629 + }, + { + "acc": 0.72443457, + "epoch": 0.0917165678429969, + "grad_norm": 8.8125, + "learning_rate": 9.171233642769266e-06, + "loss": 1.09002066, + "memory(GiB)": 299.96, + "step": 16400, + "train_speed(iter/s)": 0.124699 + }, + { + "acc": 0.70894608, + "epoch": 0.09182841731597616, + "grad_norm": 8.875, + "learning_rate": 9.182418074040935e-06, + "loss": 1.15067329, + "memory(GiB)": 299.96, + "step": 16420, + "train_speed(iter/s)": 0.124768 + }, + { + "acc": 0.72681561, + "epoch": 0.09194026678895542, + "grad_norm": 6.71875, + "learning_rate": 9.193602505312606e-06, + "loss": 1.09110746, + "memory(GiB)": 299.96, + "step": 16440, + "train_speed(iter/s)": 0.12484 + }, + { + "acc": 0.72421813, + "epoch": 0.09205211626193469, + "grad_norm": 7.65625, + "learning_rate": 9.204786936584276e-06, + "loss": 1.06772823, + "memory(GiB)": 299.96, + "step": 16460, + "train_speed(iter/s)": 0.124912 + }, + { + "acc": 0.72164788, + "epoch": 0.09216396573491395, + "grad_norm": 8.8125, + "learning_rate": 9.215971367855945e-06, + "loss": 1.10342264, + "memory(GiB)": 299.96, + "step": 16480, + "train_speed(iter/s)": 0.124985 + }, + { + "acc": 0.7292057, + "epoch": 0.09227581520789321, + "grad_norm": 6.09375, + "learning_rate": 9.227155799127615e-06, + "loss": 1.07234306, + "memory(GiB)": 299.96, + "step": 16500, + "train_speed(iter/s)": 0.125049 + }, + { + "acc": 0.73908701, + "epoch": 0.09238766468087248, + "grad_norm": 5.40625, + "learning_rate": 9.238340230399286e-06, + "loss": 1.0243144, + "memory(GiB)": 299.96, + "step": 16520, + "train_speed(iter/s)": 0.12512 + }, + { + "acc": 0.73023481, + "epoch": 0.09249951415385174, + "grad_norm": 7.09375, + "learning_rate": 9.249524661670955e-06, + "loss": 1.07419157, + "memory(GiB)": 299.96, + "step": 16540, + "train_speed(iter/s)": 0.125181 + }, + { + "acc": 0.71332026, + "epoch": 0.092611363626831, + "grad_norm": 6.34375, + "learning_rate": 9.260709092942625e-06, + "loss": 1.13799505, + "memory(GiB)": 299.96, + "step": 16560, + "train_speed(iter/s)": 0.125249 + }, + { + "acc": 0.72833056, + "epoch": 0.09272321309981027, + "grad_norm": 5.3125, + "learning_rate": 9.271893524214294e-06, + "loss": 1.07251797, + "memory(GiB)": 299.96, + "step": 16580, + "train_speed(iter/s)": 0.125315 + }, + { + "acc": 0.7288641, + "epoch": 0.09283506257278953, + "grad_norm": 4.28125, + "learning_rate": 9.283077955485965e-06, + "loss": 1.0767211, + "memory(GiB)": 299.96, + "step": 16600, + "train_speed(iter/s)": 0.125382 + }, + { + "acc": 0.72053962, + "epoch": 0.0929469120457688, + "grad_norm": 7.46875, + "learning_rate": 9.294262386757634e-06, + "loss": 1.10957842, + "memory(GiB)": 299.96, + "step": 16620, + "train_speed(iter/s)": 0.125456 + }, + { + "acc": 0.72681012, + "epoch": 0.09305876151874806, + "grad_norm": 7.125, + "learning_rate": 9.305446818029304e-06, + "loss": 1.07857685, + "memory(GiB)": 299.96, + "step": 16640, + "train_speed(iter/s)": 0.125526 + }, + { + "acc": 0.72559705, + "epoch": 0.09317061099172734, + "grad_norm": 8.25, + "learning_rate": 9.316631249300973e-06, + "loss": 1.10166483, + "memory(GiB)": 299.96, + "step": 16660, + "train_speed(iter/s)": 0.125582 + }, + { + "acc": 0.74014821, + "epoch": 0.0932824604647066, + "grad_norm": 7.15625, + "learning_rate": 9.327815680572643e-06, + "loss": 1.02956276, + "memory(GiB)": 299.96, + "step": 16680, + "train_speed(iter/s)": 0.125652 + }, + { + "acc": 0.73677745, + "epoch": 0.09339430993768587, + "grad_norm": 7.4375, + "learning_rate": 9.339000111844312e-06, + "loss": 1.03858862, + "memory(GiB)": 299.96, + "step": 16700, + "train_speed(iter/s)": 0.12572 + }, + { + "acc": 0.708393, + "epoch": 0.09350615941066513, + "grad_norm": 6.59375, + "learning_rate": 9.350184543115983e-06, + "loss": 1.15902414, + "memory(GiB)": 299.96, + "step": 16720, + "train_speed(iter/s)": 0.125791 + }, + { + "acc": 0.71513047, + "epoch": 0.0936180088836444, + "grad_norm": 6.125, + "learning_rate": 9.361368974387653e-06, + "loss": 1.14423914, + "memory(GiB)": 299.96, + "step": 16740, + "train_speed(iter/s)": 0.125858 + }, + { + "acc": 0.72416916, + "epoch": 0.09372985835662366, + "grad_norm": 7.125, + "learning_rate": 9.372553405659322e-06, + "loss": 1.11437254, + "memory(GiB)": 299.96, + "step": 16760, + "train_speed(iter/s)": 0.125924 + }, + { + "acc": 0.71174936, + "epoch": 0.09384170782960292, + "grad_norm": 6.0625, + "learning_rate": 9.383737836930993e-06, + "loss": 1.17214909, + "memory(GiB)": 299.96, + "step": 16780, + "train_speed(iter/s)": 0.125991 + }, + { + "acc": 0.72935309, + "epoch": 0.09395355730258219, + "grad_norm": 8.3125, + "learning_rate": 9.394922268202663e-06, + "loss": 1.07082176, + "memory(GiB)": 299.96, + "step": 16800, + "train_speed(iter/s)": 0.126063 + }, + { + "acc": 0.72267942, + "epoch": 0.09406540677556145, + "grad_norm": 6.8125, + "learning_rate": 9.406106699474332e-06, + "loss": 1.12460651, + "memory(GiB)": 299.96, + "step": 16820, + "train_speed(iter/s)": 0.126127 + }, + { + "acc": 0.71761231, + "epoch": 0.09417725624854072, + "grad_norm": 5.125, + "learning_rate": 9.417291130746002e-06, + "loss": 1.13962469, + "memory(GiB)": 299.96, + "step": 16840, + "train_speed(iter/s)": 0.126194 + }, + { + "acc": 0.72825665, + "epoch": 0.09428910572151998, + "grad_norm": 6.75, + "learning_rate": 9.428475562017673e-06, + "loss": 1.09648714, + "memory(GiB)": 299.96, + "step": 16860, + "train_speed(iter/s)": 0.126263 + }, + { + "acc": 0.71442795, + "epoch": 0.09440095519449924, + "grad_norm": 7.5, + "learning_rate": 9.439659993289342e-06, + "loss": 1.13602829, + "memory(GiB)": 299.96, + "step": 16880, + "train_speed(iter/s)": 0.12633 + }, + { + "acc": 0.72130523, + "epoch": 0.09451280466747851, + "grad_norm": 6.0625, + "learning_rate": 9.450844424561012e-06, + "loss": 1.12324162, + "memory(GiB)": 299.96, + "step": 16900, + "train_speed(iter/s)": 0.126396 + }, + { + "acc": 0.72114391, + "epoch": 0.09462465414045777, + "grad_norm": 8.25, + "learning_rate": 9.462028855832681e-06, + "loss": 1.11265783, + "memory(GiB)": 299.96, + "step": 16920, + "train_speed(iter/s)": 0.126467 + }, + { + "acc": 0.72857332, + "epoch": 0.09473650361343704, + "grad_norm": 6.9375, + "learning_rate": 9.473213287104352e-06, + "loss": 1.08156099, + "memory(GiB)": 299.96, + "step": 16940, + "train_speed(iter/s)": 0.126534 + }, + { + "acc": 0.72883339, + "epoch": 0.0948483530864163, + "grad_norm": 6.0625, + "learning_rate": 9.484397718376022e-06, + "loss": 1.08091917, + "memory(GiB)": 299.96, + "step": 16960, + "train_speed(iter/s)": 0.126604 + }, + { + "acc": 0.73283963, + "epoch": 0.09496020255939557, + "grad_norm": 6.96875, + "learning_rate": 9.495582149647691e-06, + "loss": 1.06394186, + "memory(GiB)": 299.96, + "step": 16980, + "train_speed(iter/s)": 0.12667 + }, + { + "acc": 0.72861476, + "epoch": 0.09507205203237483, + "grad_norm": 9.8125, + "learning_rate": 9.506766580919362e-06, + "loss": 1.08709755, + "memory(GiB)": 299.96, + "step": 17000, + "train_speed(iter/s)": 0.126739 + }, + { + "acc": 0.72551761, + "epoch": 0.0951839015053541, + "grad_norm": 5.96875, + "learning_rate": 9.51795101219103e-06, + "loss": 1.06576786, + "memory(GiB)": 299.96, + "step": 17020, + "train_speed(iter/s)": 0.126807 + }, + { + "acc": 0.74114623, + "epoch": 0.09529575097833336, + "grad_norm": 8.3125, + "learning_rate": 9.529135443462701e-06, + "loss": 1.03927994, + "memory(GiB)": 299.96, + "step": 17040, + "train_speed(iter/s)": 0.126875 + }, + { + "acc": 0.72639518, + "epoch": 0.09540760045131262, + "grad_norm": 7.34375, + "learning_rate": 9.54031987473437e-06, + "loss": 1.09753418, + "memory(GiB)": 299.96, + "step": 17060, + "train_speed(iter/s)": 0.126941 + }, + { + "acc": 0.71105185, + "epoch": 0.09551944992429189, + "grad_norm": 5.4375, + "learning_rate": 9.55150430600604e-06, + "loss": 1.16014566, + "memory(GiB)": 299.96, + "step": 17080, + "train_speed(iter/s)": 0.127004 + }, + { + "acc": 0.73368635, + "epoch": 0.09563129939727115, + "grad_norm": 6.8125, + "learning_rate": 9.562688737277709e-06, + "loss": 1.05803003, + "memory(GiB)": 299.96, + "step": 17100, + "train_speed(iter/s)": 0.127073 + }, + { + "acc": 0.72311978, + "epoch": 0.09574314887025041, + "grad_norm": 8.0, + "learning_rate": 9.57387316854938e-06, + "loss": 1.08637905, + "memory(GiB)": 299.96, + "step": 17120, + "train_speed(iter/s)": 0.127138 + }, + { + "acc": 0.71983919, + "epoch": 0.09585499834322968, + "grad_norm": 8.125, + "learning_rate": 9.58505759982105e-06, + "loss": 1.11359406, + "memory(GiB)": 299.96, + "step": 17140, + "train_speed(iter/s)": 0.127206 + }, + { + "acc": 0.71856914, + "epoch": 0.09596684781620894, + "grad_norm": 5.1875, + "learning_rate": 9.596242031092719e-06, + "loss": 1.11776123, + "memory(GiB)": 299.96, + "step": 17160, + "train_speed(iter/s)": 0.127276 + }, + { + "acc": 0.72552519, + "epoch": 0.0960786972891882, + "grad_norm": 10.625, + "learning_rate": 9.60742646236439e-06, + "loss": 1.09170065, + "memory(GiB)": 299.96, + "step": 17180, + "train_speed(iter/s)": 0.127342 + }, + { + "acc": 0.72018323, + "epoch": 0.09619054676216747, + "grad_norm": 7.25, + "learning_rate": 9.618610893636058e-06, + "loss": 1.12457037, + "memory(GiB)": 299.96, + "step": 17200, + "train_speed(iter/s)": 0.127409 + }, + { + "acc": 0.72396293, + "epoch": 0.09630239623514673, + "grad_norm": 8.3125, + "learning_rate": 9.629795324907729e-06, + "loss": 1.11867476, + "memory(GiB)": 299.96, + "step": 17220, + "train_speed(iter/s)": 0.127476 + }, + { + "acc": 0.7280169, + "epoch": 0.096414245708126, + "grad_norm": 5.40625, + "learning_rate": 9.6409797561794e-06, + "loss": 1.07706642, + "memory(GiB)": 299.96, + "step": 17240, + "train_speed(iter/s)": 0.12754 + }, + { + "acc": 0.71871443, + "epoch": 0.09652609518110526, + "grad_norm": 13.4375, + "learning_rate": 9.652164187451068e-06, + "loss": 1.11550617, + "memory(GiB)": 299.96, + "step": 17260, + "train_speed(iter/s)": 0.127605 + }, + { + "acc": 0.7331409, + "epoch": 0.09663794465408454, + "grad_norm": 6.71875, + "learning_rate": 9.663348618722739e-06, + "loss": 1.06393023, + "memory(GiB)": 299.96, + "step": 17280, + "train_speed(iter/s)": 0.127673 + }, + { + "acc": 0.72648215, + "epoch": 0.0967497941270638, + "grad_norm": 9.0625, + "learning_rate": 9.67453304999441e-06, + "loss": 1.09042511, + "memory(GiB)": 299.96, + "step": 17300, + "train_speed(iter/s)": 0.127742 + }, + { + "acc": 0.71981673, + "epoch": 0.09686164360004307, + "grad_norm": 6.8125, + "learning_rate": 9.685717481266078e-06, + "loss": 1.12196732, + "memory(GiB)": 299.96, + "step": 17320, + "train_speed(iter/s)": 0.127804 + }, + { + "acc": 0.7200954, + "epoch": 0.09697349307302233, + "grad_norm": 6.71875, + "learning_rate": 9.696901912537749e-06, + "loss": 1.1281497, + "memory(GiB)": 299.96, + "step": 17340, + "train_speed(iter/s)": 0.127874 + }, + { + "acc": 0.71051702, + "epoch": 0.0970853425460016, + "grad_norm": 6.84375, + "learning_rate": 9.708086343809419e-06, + "loss": 1.14743605, + "memory(GiB)": 299.96, + "step": 17360, + "train_speed(iter/s)": 0.127943 + }, + { + "acc": 0.71100082, + "epoch": 0.09719719201898086, + "grad_norm": 7.125, + "learning_rate": 9.719270775081088e-06, + "loss": 1.17285099, + "memory(GiB)": 299.96, + "step": 17380, + "train_speed(iter/s)": 0.128009 + }, + { + "acc": 0.71264734, + "epoch": 0.09730904149196012, + "grad_norm": 8.1875, + "learning_rate": 9.730455206352758e-06, + "loss": 1.14696331, + "memory(GiB)": 299.96, + "step": 17400, + "train_speed(iter/s)": 0.128078 + }, + { + "acc": 0.72590961, + "epoch": 0.09742089096493939, + "grad_norm": 10.8125, + "learning_rate": 9.741639637624427e-06, + "loss": 1.1001235, + "memory(GiB)": 299.96, + "step": 17420, + "train_speed(iter/s)": 0.128143 + }, + { + "acc": 0.72539649, + "epoch": 0.09753274043791865, + "grad_norm": 7.03125, + "learning_rate": 9.752824068896098e-06, + "loss": 1.07794752, + "memory(GiB)": 299.96, + "step": 17440, + "train_speed(iter/s)": 0.128212 + }, + { + "acc": 0.72414131, + "epoch": 0.09764458991089792, + "grad_norm": 6.0625, + "learning_rate": 9.764008500167768e-06, + "loss": 1.09788971, + "memory(GiB)": 299.96, + "step": 17460, + "train_speed(iter/s)": 0.128278 + }, + { + "acc": 0.73099637, + "epoch": 0.09775643938387718, + "grad_norm": 6.46875, + "learning_rate": 9.775192931439437e-06, + "loss": 1.05839434, + "memory(GiB)": 299.96, + "step": 17480, + "train_speed(iter/s)": 0.128338 + }, + { + "acc": 0.7122509, + "epoch": 0.09786828885685644, + "grad_norm": 7.9375, + "learning_rate": 9.786377362711108e-06, + "loss": 1.13932285, + "memory(GiB)": 302.72, + "step": 17500, + "train_speed(iter/s)": 0.128392 + }, + { + "acc": 0.73341045, + "epoch": 0.09798013832983571, + "grad_norm": 8.6875, + "learning_rate": 9.797561793982777e-06, + "loss": 1.07822866, + "memory(GiB)": 290.33, + "step": 17520, + "train_speed(iter/s)": 0.128457 + }, + { + "acc": 0.72032781, + "epoch": 0.09809198780281497, + "grad_norm": 6.71875, + "learning_rate": 9.808746225254447e-06, + "loss": 1.12667627, + "memory(GiB)": 290.33, + "step": 17540, + "train_speed(iter/s)": 0.128521 + }, + { + "acc": 0.71771021, + "epoch": 0.09820383727579424, + "grad_norm": 10.1875, + "learning_rate": 9.819930656526116e-06, + "loss": 1.13545513, + "memory(GiB)": 290.33, + "step": 17560, + "train_speed(iter/s)": 0.128584 + }, + { + "acc": 0.72208009, + "epoch": 0.0983156867487735, + "grad_norm": 10.3125, + "learning_rate": 9.831115087797786e-06, + "loss": 1.12051191, + "memory(GiB)": 290.33, + "step": 17580, + "train_speed(iter/s)": 0.128651 + }, + { + "acc": 0.71217799, + "epoch": 0.09842753622175276, + "grad_norm": 6.34375, + "learning_rate": 9.842299519069455e-06, + "loss": 1.13581219, + "memory(GiB)": 290.33, + "step": 17600, + "train_speed(iter/s)": 0.128714 + }, + { + "acc": 0.73082805, + "epoch": 0.09853938569473203, + "grad_norm": 9.0625, + "learning_rate": 9.853483950341126e-06, + "loss": 1.07756195, + "memory(GiB)": 290.33, + "step": 17620, + "train_speed(iter/s)": 0.128778 + }, + { + "acc": 0.71349568, + "epoch": 0.0986512351677113, + "grad_norm": 6.5625, + "learning_rate": 9.864668381612795e-06, + "loss": 1.16499586, + "memory(GiB)": 290.33, + "step": 17640, + "train_speed(iter/s)": 0.128844 + }, + { + "acc": 0.73066463, + "epoch": 0.09876308464069056, + "grad_norm": 6.78125, + "learning_rate": 9.875852812884465e-06, + "loss": 1.06336269, + "memory(GiB)": 290.33, + "step": 17660, + "train_speed(iter/s)": 0.128907 + }, + { + "acc": 0.7292078, + "epoch": 0.09887493411366982, + "grad_norm": 6.96875, + "learning_rate": 9.887037244156136e-06, + "loss": 1.0569273, + "memory(GiB)": 290.33, + "step": 17680, + "train_speed(iter/s)": 0.128973 + }, + { + "acc": 0.71783953, + "epoch": 0.09898678358664909, + "grad_norm": 7.59375, + "learning_rate": 9.898221675427804e-06, + "loss": 1.12079163, + "memory(GiB)": 290.33, + "step": 17700, + "train_speed(iter/s)": 0.129037 + }, + { + "acc": 0.72045369, + "epoch": 0.09909863305962835, + "grad_norm": 5.90625, + "learning_rate": 9.909406106699475e-06, + "loss": 1.11412687, + "memory(GiB)": 290.33, + "step": 17720, + "train_speed(iter/s)": 0.129105 + }, + { + "acc": 0.72596021, + "epoch": 0.09921048253260761, + "grad_norm": 7.84375, + "learning_rate": 9.920590537971146e-06, + "loss": 1.11080103, + "memory(GiB)": 290.33, + "step": 17740, + "train_speed(iter/s)": 0.129166 + }, + { + "acc": 0.72401824, + "epoch": 0.09932233200558688, + "grad_norm": 8.5, + "learning_rate": 9.931774969242814e-06, + "loss": 1.09646721, + "memory(GiB)": 290.33, + "step": 17760, + "train_speed(iter/s)": 0.129234 + }, + { + "acc": 0.71832457, + "epoch": 0.09943418147856614, + "grad_norm": 6.5625, + "learning_rate": 9.942959400514485e-06, + "loss": 1.13320322, + "memory(GiB)": 292.61, + "step": 17780, + "train_speed(iter/s)": 0.129292 + }, + { + "acc": 0.72234554, + "epoch": 0.0995460309515454, + "grad_norm": 4.375, + "learning_rate": 9.954143831786155e-06, + "loss": 1.11102724, + "memory(GiB)": 292.61, + "step": 17800, + "train_speed(iter/s)": 0.129359 + }, + { + "acc": 0.71296992, + "epoch": 0.09965788042452467, + "grad_norm": 9.9375, + "learning_rate": 9.965328263057824e-06, + "loss": 1.1390873, + "memory(GiB)": 292.61, + "step": 17820, + "train_speed(iter/s)": 0.129427 + }, + { + "acc": 0.71328254, + "epoch": 0.09976972989750393, + "grad_norm": 8.8125, + "learning_rate": 9.976512694329495e-06, + "loss": 1.13989468, + "memory(GiB)": 292.61, + "step": 17840, + "train_speed(iter/s)": 0.129492 + }, + { + "acc": 0.71784725, + "epoch": 0.0998815793704832, + "grad_norm": 9.25, + "learning_rate": 9.987697125601164e-06, + "loss": 1.12515078, + "memory(GiB)": 292.61, + "step": 17860, + "train_speed(iter/s)": 0.129558 + }, + { + "acc": 0.7117116, + "epoch": 0.09999342884346246, + "grad_norm": 4.84375, + "learning_rate": 9.998881556872834e-06, + "loss": 1.15236521, + "memory(GiB)": 292.61, + "step": 17880, + "train_speed(iter/s)": 0.129612 + }, + { + "acc": 0.70667, + "epoch": 0.10010527831644173, + "grad_norm": 10.125, + "learning_rate": 9.99999993073858e-06, + "loss": 1.17316666, + "memory(GiB)": 292.61, + "step": 17900, + "train_speed(iter/s)": 0.129673 + }, + { + "acc": 0.7236095, + "epoch": 0.100217127789421, + "grad_norm": 4.6875, + "learning_rate": 9.999999691316385e-06, + "loss": 1.1090354, + "memory(GiB)": 292.61, + "step": 17920, + "train_speed(iter/s)": 0.129732 + }, + { + "acc": 0.72033877, + "epoch": 0.10032897726240027, + "grad_norm": 6.09375, + "learning_rate": 9.999999280878348e-06, + "loss": 1.12972784, + "memory(GiB)": 292.61, + "step": 17940, + "train_speed(iter/s)": 0.129794 + }, + { + "acc": 0.7162396, + "epoch": 0.10044082673537953, + "grad_norm": 9.5625, + "learning_rate": 9.99999869942448e-06, + "loss": 1.14600916, + "memory(GiB)": 292.61, + "step": 17960, + "train_speed(iter/s)": 0.129859 + }, + { + "acc": 0.73458815, + "epoch": 0.1005526762083588, + "grad_norm": 9.5625, + "learning_rate": 9.999997946954801e-06, + "loss": 1.0607542, + "memory(GiB)": 292.61, + "step": 17980, + "train_speed(iter/s)": 0.129917 + }, + { + "acc": 0.73404837, + "epoch": 0.10066452568133806, + "grad_norm": 8.625, + "learning_rate": 9.999997023469339e-06, + "loss": 1.06472454, + "memory(GiB)": 292.61, + "step": 18000, + "train_speed(iter/s)": 0.129981 + }, + { + "epoch": 0.10066452568133806, + "eval_acc": 0.6849300414516957, + "eval_loss": 1.1027286052703857, + "eval_runtime": 7541.4213, + "eval_samples_per_second": 9.983, + "eval_steps_per_second": 9.983, + "step": 18000 + }, + { + "acc": 0.72111154, + "epoch": 0.10077637515431732, + "grad_norm": 6.1875, + "learning_rate": 9.999995928968124e-06, + "loss": 1.09433746, + "memory(GiB)": 292.61, + "step": 18020, + "train_speed(iter/s)": 0.123248 + }, + { + "acc": 0.73573613, + "epoch": 0.10088822462729659, + "grad_norm": 8.25, + "learning_rate": 9.999994663451193e-06, + "loss": 1.06778669, + "memory(GiB)": 292.61, + "step": 18040, + "train_speed(iter/s)": 0.123311 + }, + { + "acc": 0.71155367, + "epoch": 0.10100007410027585, + "grad_norm": 5.15625, + "learning_rate": 9.99999322691859e-06, + "loss": 1.13388186, + "memory(GiB)": 292.61, + "step": 18060, + "train_speed(iter/s)": 0.123367 + }, + { + "acc": 0.70391951, + "epoch": 0.10111192357325512, + "grad_norm": 8.6875, + "learning_rate": 9.999991619370365e-06, + "loss": 1.18522062, + "memory(GiB)": 292.61, + "step": 18080, + "train_speed(iter/s)": 0.123429 + }, + { + "acc": 0.7223063, + "epoch": 0.10122377304623438, + "grad_norm": 10.0, + "learning_rate": 9.999989840806571e-06, + "loss": 1.11631212, + "memory(GiB)": 292.61, + "step": 18100, + "train_speed(iter/s)": 0.123493 + }, + { + "acc": 0.73714681, + "epoch": 0.10133562251921364, + "grad_norm": 11.5625, + "learning_rate": 9.999987891227271e-06, + "loss": 1.05443783, + "memory(GiB)": 292.61, + "step": 18120, + "train_speed(iter/s)": 0.123559 + }, + { + "acc": 0.7265336, + "epoch": 0.10144747199219291, + "grad_norm": 7.1875, + "learning_rate": 9.99998577063253e-06, + "loss": 1.10557394, + "memory(GiB)": 292.61, + "step": 18140, + "train_speed(iter/s)": 0.123619 + }, + { + "acc": 0.7312315, + "epoch": 0.10155932146517217, + "grad_norm": 6.9375, + "learning_rate": 9.999983479022422e-06, + "loss": 1.06585588, + "memory(GiB)": 292.61, + "step": 18160, + "train_speed(iter/s)": 0.123677 + }, + { + "acc": 0.72146506, + "epoch": 0.10167117093815144, + "grad_norm": 6.28125, + "learning_rate": 9.999981016397022e-06, + "loss": 1.09936256, + "memory(GiB)": 292.61, + "step": 18180, + "train_speed(iter/s)": 0.123741 + }, + { + "acc": 0.72193375, + "epoch": 0.1017830204111307, + "grad_norm": 9.5625, + "learning_rate": 9.999978382756419e-06, + "loss": 1.11324053, + "memory(GiB)": 292.61, + "step": 18200, + "train_speed(iter/s)": 0.123802 + }, + { + "acc": 0.70704656, + "epoch": 0.10189486988410996, + "grad_norm": 6.84375, + "learning_rate": 9.9999755781007e-06, + "loss": 1.18851566, + "memory(GiB)": 292.61, + "step": 18220, + "train_speed(iter/s)": 0.123867 + }, + { + "acc": 0.72582669, + "epoch": 0.10200671935708923, + "grad_norm": 9.0, + "learning_rate": 9.999972602429962e-06, + "loss": 1.09913807, + "memory(GiB)": 292.61, + "step": 18240, + "train_speed(iter/s)": 0.123931 + }, + { + "acc": 0.72400699, + "epoch": 0.10211856883006849, + "grad_norm": 9.75, + "learning_rate": 9.999969455744308e-06, + "loss": 1.12425938, + "memory(GiB)": 292.61, + "step": 18260, + "train_speed(iter/s)": 0.123996 + }, + { + "acc": 0.71754365, + "epoch": 0.10223041830304776, + "grad_norm": 6.375, + "learning_rate": 9.999966138043842e-06, + "loss": 1.15009127, + "memory(GiB)": 292.61, + "step": 18280, + "train_speed(iter/s)": 0.124059 + }, + { + "acc": 0.70396481, + "epoch": 0.10234226777602702, + "grad_norm": 6.625, + "learning_rate": 9.99996264932868e-06, + "loss": 1.19613075, + "memory(GiB)": 292.61, + "step": 18300, + "train_speed(iter/s)": 0.124121 + }, + { + "acc": 0.7124877, + "epoch": 0.10245411724900629, + "grad_norm": 8.3125, + "learning_rate": 9.99995898959894e-06, + "loss": 1.14980793, + "memory(GiB)": 292.61, + "step": 18320, + "train_speed(iter/s)": 0.124184 + }, + { + "acc": 0.72433157, + "epoch": 0.10256596672198555, + "grad_norm": 5.96875, + "learning_rate": 9.99995515885475e-06, + "loss": 1.11510668, + "memory(GiB)": 292.61, + "step": 18340, + "train_speed(iter/s)": 0.124245 + }, + { + "acc": 0.72899733, + "epoch": 0.10267781619496481, + "grad_norm": 5.375, + "learning_rate": 9.999951157096238e-06, + "loss": 1.08000975, + "memory(GiB)": 292.61, + "step": 18360, + "train_speed(iter/s)": 0.124312 + }, + { + "acc": 0.70521474, + "epoch": 0.10278966566794408, + "grad_norm": 6.59375, + "learning_rate": 9.999946984323544e-06, + "loss": 1.18645763, + "memory(GiB)": 292.61, + "step": 18380, + "train_speed(iter/s)": 0.124376 + }, + { + "acc": 0.73080034, + "epoch": 0.10290151514092334, + "grad_norm": 6.34375, + "learning_rate": 9.999942640536807e-06, + "loss": 1.07313347, + "memory(GiB)": 292.61, + "step": 18400, + "train_speed(iter/s)": 0.12444 + }, + { + "acc": 0.70391846, + "epoch": 0.1030133646139026, + "grad_norm": 4.9375, + "learning_rate": 9.999938125736176e-06, + "loss": 1.19684496, + "memory(GiB)": 292.61, + "step": 18420, + "train_speed(iter/s)": 0.124504 + }, + { + "acc": 0.71547818, + "epoch": 0.10312521408688187, + "grad_norm": 7.03125, + "learning_rate": 9.999933439921809e-06, + "loss": 1.15688915, + "memory(GiB)": 292.61, + "step": 18440, + "train_speed(iter/s)": 0.124566 + }, + { + "acc": 0.72817616, + "epoch": 0.10323706355986113, + "grad_norm": 7.40625, + "learning_rate": 9.999928583093863e-06, + "loss": 1.04998665, + "memory(GiB)": 292.61, + "step": 18460, + "train_speed(iter/s)": 0.124633 + }, + { + "acc": 0.72207789, + "epoch": 0.1033489130328404, + "grad_norm": 5.59375, + "learning_rate": 9.999923555252505e-06, + "loss": 1.12532949, + "memory(GiB)": 292.61, + "step": 18480, + "train_speed(iter/s)": 0.124696 + }, + { + "acc": 0.72418675, + "epoch": 0.10346076250581966, + "grad_norm": 7.3125, + "learning_rate": 9.999918356397908e-06, + "loss": 1.08689985, + "memory(GiB)": 292.61, + "step": 18500, + "train_speed(iter/s)": 0.12476 + }, + { + "acc": 0.71406889, + "epoch": 0.10357261197879893, + "grad_norm": 8.125, + "learning_rate": 9.999912986530246e-06, + "loss": 1.12321882, + "memory(GiB)": 292.61, + "step": 18520, + "train_speed(iter/s)": 0.124821 + }, + { + "acc": 0.70914249, + "epoch": 0.1036844614517782, + "grad_norm": 7.625, + "learning_rate": 9.999907445649709e-06, + "loss": 1.165133, + "memory(GiB)": 292.61, + "step": 18540, + "train_speed(iter/s)": 0.124885 + }, + { + "acc": 0.72174249, + "epoch": 0.10379631092475747, + "grad_norm": 9.0625, + "learning_rate": 9.999901733756483e-06, + "loss": 1.10602741, + "memory(GiB)": 292.61, + "step": 18560, + "train_speed(iter/s)": 0.124948 + }, + { + "acc": 0.73620353, + "epoch": 0.10390816039773673, + "grad_norm": 8.75, + "learning_rate": 9.999895850850762e-06, + "loss": 1.03604479, + "memory(GiB)": 292.61, + "step": 18580, + "train_speed(iter/s)": 0.125013 + }, + { + "acc": 0.72393899, + "epoch": 0.104020009870716, + "grad_norm": 5.96875, + "learning_rate": 9.99988979693275e-06, + "loss": 1.08961611, + "memory(GiB)": 292.61, + "step": 18600, + "train_speed(iter/s)": 0.125074 + }, + { + "acc": 0.73534465, + "epoch": 0.10413185934369526, + "grad_norm": 8.3125, + "learning_rate": 9.999883572002651e-06, + "loss": 1.05662327, + "memory(GiB)": 292.61, + "step": 18620, + "train_speed(iter/s)": 0.125138 + }, + { + "acc": 0.72632847, + "epoch": 0.10424370881667452, + "grad_norm": 6.4375, + "learning_rate": 9.99987717606068e-06, + "loss": 1.09465036, + "memory(GiB)": 292.61, + "step": 18640, + "train_speed(iter/s)": 0.125201 + }, + { + "acc": 0.7361649, + "epoch": 0.10435555828965379, + "grad_norm": 7.3125, + "learning_rate": 9.999870609107056e-06, + "loss": 1.04258804, + "memory(GiB)": 292.61, + "step": 18660, + "train_speed(iter/s)": 0.125266 + }, + { + "acc": 0.73041959, + "epoch": 0.10446740776263305, + "grad_norm": 8.1875, + "learning_rate": 9.999863871142001e-06, + "loss": 1.05037813, + "memory(GiB)": 292.61, + "step": 18680, + "train_speed(iter/s)": 0.12533 + }, + { + "acc": 0.71656566, + "epoch": 0.10457925723561232, + "grad_norm": 10.5625, + "learning_rate": 9.99985696216575e-06, + "loss": 1.13960457, + "memory(GiB)": 292.61, + "step": 18700, + "train_speed(iter/s)": 0.12539 + }, + { + "acc": 0.72559319, + "epoch": 0.10469110670859158, + "grad_norm": 8.5625, + "learning_rate": 9.999849882178536e-06, + "loss": 1.08627949, + "memory(GiB)": 292.61, + "step": 18720, + "train_speed(iter/s)": 0.125452 + }, + { + "acc": 0.73074336, + "epoch": 0.10480295618157084, + "grad_norm": 9.8125, + "learning_rate": 9.999842631180602e-06, + "loss": 1.04473286, + "memory(GiB)": 292.61, + "step": 18740, + "train_speed(iter/s)": 0.125513 + }, + { + "acc": 0.74175253, + "epoch": 0.10491480565455011, + "grad_norm": 7.71875, + "learning_rate": 9.999835209172196e-06, + "loss": 1.00249863, + "memory(GiB)": 292.61, + "step": 18760, + "train_speed(iter/s)": 0.125576 + }, + { + "acc": 0.72067947, + "epoch": 0.10502665512752937, + "grad_norm": 5.125, + "learning_rate": 9.999827616153573e-06, + "loss": 1.10767384, + "memory(GiB)": 292.61, + "step": 18780, + "train_speed(iter/s)": 0.125638 + }, + { + "acc": 0.71604729, + "epoch": 0.10513850460050864, + "grad_norm": 6.96875, + "learning_rate": 9.99981985212499e-06, + "loss": 1.12932043, + "memory(GiB)": 292.61, + "step": 18800, + "train_speed(iter/s)": 0.125701 + }, + { + "acc": 0.74551711, + "epoch": 0.1052503540734879, + "grad_norm": 8.0625, + "learning_rate": 9.999811917086714e-06, + "loss": 1.01273041, + "memory(GiB)": 292.61, + "step": 18820, + "train_speed(iter/s)": 0.125756 + }, + { + "acc": 0.72577014, + "epoch": 0.10536220354646716, + "grad_norm": 8.8125, + "learning_rate": 9.999803811039016e-06, + "loss": 1.07742901, + "memory(GiB)": 292.61, + "step": 18840, + "train_speed(iter/s)": 0.12582 + }, + { + "acc": 0.72730393, + "epoch": 0.10547405301944643, + "grad_norm": 9.0625, + "learning_rate": 9.999795533982176e-06, + "loss": 1.08693571, + "memory(GiB)": 292.61, + "step": 18860, + "train_speed(iter/s)": 0.125881 + }, + { + "acc": 0.72906475, + "epoch": 0.10558590249242569, + "grad_norm": 7.53125, + "learning_rate": 9.999787085916473e-06, + "loss": 1.08149157, + "memory(GiB)": 292.61, + "step": 18880, + "train_speed(iter/s)": 0.12594 + }, + { + "acc": 0.72771416, + "epoch": 0.10569775196540496, + "grad_norm": 7.90625, + "learning_rate": 9.999778466842197e-06, + "loss": 1.08172998, + "memory(GiB)": 292.61, + "step": 18900, + "train_speed(iter/s)": 0.125998 + }, + { + "acc": 0.72599535, + "epoch": 0.10580960143838422, + "grad_norm": 10.75, + "learning_rate": 9.999769676759645e-06, + "loss": 1.08581924, + "memory(GiB)": 292.61, + "step": 18920, + "train_speed(iter/s)": 0.12606 + }, + { + "acc": 0.7288301, + "epoch": 0.10592145091136349, + "grad_norm": 8.0625, + "learning_rate": 9.999760715669116e-06, + "loss": 1.07567358, + "memory(GiB)": 292.61, + "step": 18940, + "train_speed(iter/s)": 0.126117 + }, + { + "acc": 0.70768213, + "epoch": 0.10603330038434275, + "grad_norm": 6.125, + "learning_rate": 9.999751583570916e-06, + "loss": 1.15933275, + "memory(GiB)": 292.61, + "step": 18960, + "train_speed(iter/s)": 0.126174 + }, + { + "acc": 0.72822418, + "epoch": 0.10614514985732201, + "grad_norm": 8.8125, + "learning_rate": 9.999742280465358e-06, + "loss": 1.05804596, + "memory(GiB)": 292.61, + "step": 18980, + "train_speed(iter/s)": 0.126237 + }, + { + "acc": 0.72777743, + "epoch": 0.10625699933030128, + "grad_norm": 5.84375, + "learning_rate": 9.999732806352763e-06, + "loss": 1.08537655, + "memory(GiB)": 292.61, + "step": 19000, + "train_speed(iter/s)": 0.126299 + }, + { + "acc": 0.73035131, + "epoch": 0.10636884880328054, + "grad_norm": 7.0625, + "learning_rate": 9.999723161233448e-06, + "loss": 1.04785213, + "memory(GiB)": 292.61, + "step": 19020, + "train_speed(iter/s)": 0.126361 + }, + { + "acc": 0.72158065, + "epoch": 0.1064806982762598, + "grad_norm": 6.15625, + "learning_rate": 9.999713345107749e-06, + "loss": 1.11052151, + "memory(GiB)": 292.61, + "step": 19040, + "train_speed(iter/s)": 0.126424 + }, + { + "acc": 0.71841168, + "epoch": 0.10659254774923907, + "grad_norm": 5.09375, + "learning_rate": 9.999703357976e-06, + "loss": 1.12424107, + "memory(GiB)": 292.61, + "step": 19060, + "train_speed(iter/s)": 0.126484 + }, + { + "acc": 0.73887711, + "epoch": 0.10670439722221833, + "grad_norm": 6.46875, + "learning_rate": 9.999693199838545e-06, + "loss": 1.01365232, + "memory(GiB)": 292.61, + "step": 19080, + "train_speed(iter/s)": 0.126544 + }, + { + "acc": 0.74061494, + "epoch": 0.1068162466951976, + "grad_norm": 7.6875, + "learning_rate": 9.999682870695727e-06, + "loss": 1.01619816, + "memory(GiB)": 292.61, + "step": 19100, + "train_speed(iter/s)": 0.126606 + }, + { + "acc": 0.71443219, + "epoch": 0.10692809616817686, + "grad_norm": 5.5625, + "learning_rate": 9.999672370547899e-06, + "loss": 1.17180586, + "memory(GiB)": 292.61, + "step": 19120, + "train_speed(iter/s)": 0.12666 + }, + { + "acc": 0.72333875, + "epoch": 0.10703994564115613, + "grad_norm": 8.625, + "learning_rate": 9.999661699395425e-06, + "loss": 1.10750303, + "memory(GiB)": 292.61, + "step": 19140, + "train_speed(iter/s)": 0.126722 + }, + { + "acc": 0.7219873, + "epoch": 0.10715179511413539, + "grad_norm": 9.3125, + "learning_rate": 9.999650857238668e-06, + "loss": 1.08437757, + "memory(GiB)": 292.61, + "step": 19160, + "train_speed(iter/s)": 0.126784 + }, + { + "acc": 0.7208467, + "epoch": 0.10726364458711467, + "grad_norm": 6.4375, + "learning_rate": 9.999639844077995e-06, + "loss": 1.08999701, + "memory(GiB)": 292.61, + "step": 19180, + "train_speed(iter/s)": 0.126844 + }, + { + "acc": 0.71695118, + "epoch": 0.10737549406009393, + "grad_norm": 5.5, + "learning_rate": 9.999628659913789e-06, + "loss": 1.12376003, + "memory(GiB)": 292.61, + "step": 19200, + "train_speed(iter/s)": 0.126905 + }, + { + "acc": 0.71963544, + "epoch": 0.1074873435330732, + "grad_norm": 10.9375, + "learning_rate": 9.999617304746427e-06, + "loss": 1.10607958, + "memory(GiB)": 292.61, + "step": 19220, + "train_speed(iter/s)": 0.126963 + }, + { + "acc": 0.74244857, + "epoch": 0.10759919300605246, + "grad_norm": 10.75, + "learning_rate": 9.999605778576302e-06, + "loss": 1.00813084, + "memory(GiB)": 292.61, + "step": 19240, + "train_speed(iter/s)": 0.127022 + }, + { + "acc": 0.74413838, + "epoch": 0.10771104247903172, + "grad_norm": 5.84375, + "learning_rate": 9.999594081403805e-06, + "loss": 0.98402777, + "memory(GiB)": 292.61, + "step": 19260, + "train_speed(iter/s)": 0.127086 + }, + { + "acc": 0.72521563, + "epoch": 0.10782289195201099, + "grad_norm": 6.84375, + "learning_rate": 9.999582213229337e-06, + "loss": 1.0860672, + "memory(GiB)": 292.61, + "step": 19280, + "train_speed(iter/s)": 0.127149 + }, + { + "acc": 0.72431045, + "epoch": 0.10793474142499025, + "grad_norm": 6.15625, + "learning_rate": 9.999570174053305e-06, + "loss": 1.10059977, + "memory(GiB)": 292.61, + "step": 19300, + "train_speed(iter/s)": 0.127208 + }, + { + "acc": 0.72614665, + "epoch": 0.10804659089796952, + "grad_norm": 7.5625, + "learning_rate": 9.99955796387612e-06, + "loss": 1.08971834, + "memory(GiB)": 292.61, + "step": 19320, + "train_speed(iter/s)": 0.127268 + }, + { + "acc": 0.73592758, + "epoch": 0.10815844037094878, + "grad_norm": 8.25, + "learning_rate": 9.999545582698198e-06, + "loss": 1.05862608, + "memory(GiB)": 292.61, + "step": 19340, + "train_speed(iter/s)": 0.127328 + }, + { + "acc": 0.70820255, + "epoch": 0.10827028984392804, + "grad_norm": 8.4375, + "learning_rate": 9.999533030519966e-06, + "loss": 1.16586838, + "memory(GiB)": 292.61, + "step": 19360, + "train_speed(iter/s)": 0.127388 + }, + { + "acc": 0.72915597, + "epoch": 0.10838213931690731, + "grad_norm": 7.78125, + "learning_rate": 9.999520307341849e-06, + "loss": 1.06437731, + "memory(GiB)": 292.62, + "step": 19380, + "train_speed(iter/s)": 0.127444 + }, + { + "acc": 0.73891664, + "epoch": 0.10849398878988657, + "grad_norm": 7.875, + "learning_rate": 9.999507413164285e-06, + "loss": 1.02943764, + "memory(GiB)": 292.62, + "step": 19400, + "train_speed(iter/s)": 0.127504 + }, + { + "acc": 0.72423806, + "epoch": 0.10860583826286584, + "grad_norm": 8.8125, + "learning_rate": 9.999494347987715e-06, + "loss": 1.08460627, + "memory(GiB)": 292.62, + "step": 19420, + "train_speed(iter/s)": 0.127556 + }, + { + "acc": 0.73308992, + "epoch": 0.1087176877358451, + "grad_norm": 6.46875, + "learning_rate": 9.999481111812585e-06, + "loss": 1.04634209, + "memory(GiB)": 292.62, + "step": 19440, + "train_speed(iter/s)": 0.127615 + }, + { + "acc": 0.72934923, + "epoch": 0.10882953720882436, + "grad_norm": 6.625, + "learning_rate": 9.99946770463935e-06, + "loss": 1.0796505, + "memory(GiB)": 292.62, + "step": 19460, + "train_speed(iter/s)": 0.127668 + }, + { + "acc": 0.72528582, + "epoch": 0.10894138668180363, + "grad_norm": 7.375, + "learning_rate": 9.999454126468464e-06, + "loss": 1.08876467, + "memory(GiB)": 292.62, + "step": 19480, + "train_speed(iter/s)": 0.127725 + }, + { + "acc": 0.74221907, + "epoch": 0.10905323615478289, + "grad_norm": 8.5, + "learning_rate": 9.999440377300393e-06, + "loss": 0.99594355, + "memory(GiB)": 292.62, + "step": 19500, + "train_speed(iter/s)": 0.127784 + }, + { + "acc": 0.71806221, + "epoch": 0.10916508562776216, + "grad_norm": 7.84375, + "learning_rate": 9.999426457135612e-06, + "loss": 1.12128448, + "memory(GiB)": 292.62, + "step": 19520, + "train_speed(iter/s)": 0.127847 + }, + { + "acc": 0.71921229, + "epoch": 0.10927693510074142, + "grad_norm": 6.0625, + "learning_rate": 9.99941236597459e-06, + "loss": 1.13487711, + "memory(GiB)": 292.62, + "step": 19540, + "train_speed(iter/s)": 0.127909 + }, + { + "acc": 0.70617518, + "epoch": 0.10938878457372068, + "grad_norm": 5.25, + "learning_rate": 9.999398103817815e-06, + "loss": 1.17734222, + "memory(GiB)": 292.62, + "step": 19560, + "train_speed(iter/s)": 0.127968 + }, + { + "acc": 0.72813134, + "epoch": 0.10950063404669995, + "grad_norm": 6.9375, + "learning_rate": 9.999383670665771e-06, + "loss": 1.09064379, + "memory(GiB)": 292.62, + "step": 19580, + "train_speed(iter/s)": 0.128027 + }, + { + "acc": 0.73030195, + "epoch": 0.10961248351967921, + "grad_norm": 7.59375, + "learning_rate": 9.999369066518954e-06, + "loss": 1.05738106, + "memory(GiB)": 292.62, + "step": 19600, + "train_speed(iter/s)": 0.128088 + }, + { + "acc": 0.72480702, + "epoch": 0.10972433299265848, + "grad_norm": 8.0625, + "learning_rate": 9.99935429137786e-06, + "loss": 1.09459333, + "memory(GiB)": 292.62, + "step": 19620, + "train_speed(iter/s)": 0.128147 + }, + { + "acc": 0.72349305, + "epoch": 0.10983618246563774, + "grad_norm": 7.84375, + "learning_rate": 9.999339345242999e-06, + "loss": 1.10850105, + "memory(GiB)": 292.62, + "step": 19640, + "train_speed(iter/s)": 0.128203 + }, + { + "acc": 0.71777277, + "epoch": 0.109948031938617, + "grad_norm": 5.59375, + "learning_rate": 9.999324228114878e-06, + "loss": 1.09614887, + "memory(GiB)": 292.62, + "step": 19660, + "train_speed(iter/s)": 0.128266 + }, + { + "acc": 0.7353055, + "epoch": 0.11005988141159627, + "grad_norm": 6.78125, + "learning_rate": 9.999308939994017e-06, + "loss": 1.03981771, + "memory(GiB)": 292.62, + "step": 19680, + "train_speed(iter/s)": 0.128319 + }, + { + "acc": 0.72273703, + "epoch": 0.11017173088457553, + "grad_norm": 6.4375, + "learning_rate": 9.999293480880938e-06, + "loss": 1.09715757, + "memory(GiB)": 292.62, + "step": 19700, + "train_speed(iter/s)": 0.128376 + }, + { + "acc": 0.7112391, + "epoch": 0.1102835803575548, + "grad_norm": 6.5625, + "learning_rate": 9.99927785077617e-06, + "loss": 1.14311028, + "memory(GiB)": 292.62, + "step": 19720, + "train_speed(iter/s)": 0.128432 + }, + { + "acc": 0.73506298, + "epoch": 0.11039542983053406, + "grad_norm": 6.71875, + "learning_rate": 9.999262049680246e-06, + "loss": 1.05039396, + "memory(GiB)": 292.62, + "step": 19740, + "train_speed(iter/s)": 0.12849 + }, + { + "acc": 0.71426935, + "epoch": 0.11050727930351333, + "grad_norm": 8.5625, + "learning_rate": 9.99924607759371e-06, + "loss": 1.14107447, + "memory(GiB)": 292.62, + "step": 19760, + "train_speed(iter/s)": 0.128548 + }, + { + "acc": 0.73022299, + "epoch": 0.11061912877649259, + "grad_norm": 7.9375, + "learning_rate": 9.999229934517102e-06, + "loss": 1.0950654, + "memory(GiB)": 292.62, + "step": 19780, + "train_speed(iter/s)": 0.128607 + }, + { + "acc": 0.71540799, + "epoch": 0.11073097824947187, + "grad_norm": 7.0625, + "learning_rate": 9.999213620450981e-06, + "loss": 1.14153528, + "memory(GiB)": 292.62, + "step": 19800, + "train_speed(iter/s)": 0.128665 + }, + { + "acc": 0.73836899, + "epoch": 0.11084282772245113, + "grad_norm": 6.53125, + "learning_rate": 9.999197135395902e-06, + "loss": 1.01670055, + "memory(GiB)": 292.62, + "step": 19820, + "train_speed(iter/s)": 0.128722 + }, + { + "acc": 0.72954493, + "epoch": 0.1109546771954304, + "grad_norm": 6.15625, + "learning_rate": 9.999180479352427e-06, + "loss": 1.08195848, + "memory(GiB)": 292.62, + "step": 19840, + "train_speed(iter/s)": 0.128779 + }, + { + "acc": 0.72856536, + "epoch": 0.11106652666840966, + "grad_norm": 11.1875, + "learning_rate": 9.999163652321131e-06, + "loss": 1.09642906, + "memory(GiB)": 292.62, + "step": 19860, + "train_speed(iter/s)": 0.128838 + }, + { + "acc": 0.71914077, + "epoch": 0.11117837614138892, + "grad_norm": 11.9375, + "learning_rate": 9.999146654302584e-06, + "loss": 1.11176758, + "memory(GiB)": 292.62, + "step": 19880, + "train_speed(iter/s)": 0.128896 + }, + { + "acc": 0.71891251, + "epoch": 0.11129022561436819, + "grad_norm": 4.34375, + "learning_rate": 9.99912948529737e-06, + "loss": 1.11495743, + "memory(GiB)": 292.62, + "step": 19900, + "train_speed(iter/s)": 0.128953 + }, + { + "acc": 0.72730675, + "epoch": 0.11140207508734745, + "grad_norm": 5.21875, + "learning_rate": 9.999112145306076e-06, + "loss": 1.07054853, + "memory(GiB)": 292.62, + "step": 19920, + "train_speed(iter/s)": 0.129009 + }, + { + "acc": 0.72306981, + "epoch": 0.11151392456032672, + "grad_norm": 7.0625, + "learning_rate": 9.999094634329294e-06, + "loss": 1.09369688, + "memory(GiB)": 292.62, + "step": 19940, + "train_speed(iter/s)": 0.129065 + }, + { + "acc": 0.71443028, + "epoch": 0.11162577403330598, + "grad_norm": 6.84375, + "learning_rate": 9.999076952367623e-06, + "loss": 1.13636065, + "memory(GiB)": 292.62, + "step": 19960, + "train_speed(iter/s)": 0.129123 + }, + { + "acc": 0.72918277, + "epoch": 0.11173762350628524, + "grad_norm": 7.5625, + "learning_rate": 9.999059099421671e-06, + "loss": 1.06918592, + "memory(GiB)": 292.62, + "step": 19980, + "train_speed(iter/s)": 0.129183 + }, + { + "acc": 0.71666465, + "epoch": 0.11184947297926451, + "grad_norm": 12.1875, + "learning_rate": 9.999041075492046e-06, + "loss": 1.1362958, + "memory(GiB)": 292.62, + "step": 20000, + "train_speed(iter/s)": 0.12924 + }, + { + "epoch": 0.11184947297926451, + "eval_acc": 0.6871132975221759, + "eval_loss": 1.094447374343872, + "eval_runtime": 7514.0365, + "eval_samples_per_second": 10.019, + "eval_steps_per_second": 10.019, + "step": 20000 + }, + { + "acc": 0.72089758, + "epoch": 0.11196132245224377, + "grad_norm": 8.5, + "learning_rate": 9.999022880579362e-06, + "loss": 1.10325136, + "memory(GiB)": 292.62, + "step": 20020, + "train_speed(iter/s)": 0.123243 + }, + { + "acc": 0.7297111, + "epoch": 0.11207317192522304, + "grad_norm": 9.75, + "learning_rate": 9.999004514684247e-06, + "loss": 1.07730064, + "memory(GiB)": 292.62, + "step": 20040, + "train_speed(iter/s)": 0.123299 + }, + { + "acc": 0.72696552, + "epoch": 0.1121850213982023, + "grad_norm": 7.53125, + "learning_rate": 9.998985977807325e-06, + "loss": 1.06685247, + "memory(GiB)": 292.62, + "step": 20060, + "train_speed(iter/s)": 0.123358 + }, + { + "acc": 0.70813417, + "epoch": 0.11229687087118156, + "grad_norm": 6.5625, + "learning_rate": 9.998967269949231e-06, + "loss": 1.18429031, + "memory(GiB)": 292.62, + "step": 20080, + "train_speed(iter/s)": 0.123414 + }, + { + "acc": 0.71234527, + "epoch": 0.11240872034416083, + "grad_norm": 5.21875, + "learning_rate": 9.998948391110606e-06, + "loss": 1.1523241, + "memory(GiB)": 292.62, + "step": 20100, + "train_speed(iter/s)": 0.123473 + }, + { + "acc": 0.72705803, + "epoch": 0.11252056981714009, + "grad_norm": 7.03125, + "learning_rate": 9.998929341292095e-06, + "loss": 1.07683592, + "memory(GiB)": 292.62, + "step": 20120, + "train_speed(iter/s)": 0.123533 + }, + { + "acc": 0.74272256, + "epoch": 0.11263241929011936, + "grad_norm": 5.65625, + "learning_rate": 9.998910120494347e-06, + "loss": 1.00326281, + "memory(GiB)": 292.62, + "step": 20140, + "train_speed(iter/s)": 0.123588 + }, + { + "acc": 0.70614076, + "epoch": 0.11274426876309862, + "grad_norm": 7.75, + "learning_rate": 9.998890728718023e-06, + "loss": 1.1643364, + "memory(GiB)": 292.62, + "step": 20160, + "train_speed(iter/s)": 0.123643 + }, + { + "acc": 0.73225493, + "epoch": 0.11285611823607788, + "grad_norm": 7.03125, + "learning_rate": 9.998871165963784e-06, + "loss": 1.04939642, + "memory(GiB)": 292.62, + "step": 20180, + "train_speed(iter/s)": 0.1237 + }, + { + "acc": 0.71696277, + "epoch": 0.11296796770905715, + "grad_norm": 8.4375, + "learning_rate": 9.998851432232303e-06, + "loss": 1.13280497, + "memory(GiB)": 292.62, + "step": 20200, + "train_speed(iter/s)": 0.123756 + }, + { + "acc": 0.72206035, + "epoch": 0.11307981718203641, + "grad_norm": 4.71875, + "learning_rate": 9.99883152752425e-06, + "loss": 1.1021471, + "memory(GiB)": 292.62, + "step": 20220, + "train_speed(iter/s)": 0.123809 + }, + { + "acc": 0.72968712, + "epoch": 0.11319166665501568, + "grad_norm": 6.96875, + "learning_rate": 9.998811451840308e-06, + "loss": 1.06817837, + "memory(GiB)": 292.62, + "step": 20240, + "train_speed(iter/s)": 0.123868 + }, + { + "acc": 0.72255349, + "epoch": 0.11330351612799494, + "grad_norm": 7.65625, + "learning_rate": 9.998791205181163e-06, + "loss": 1.09496307, + "memory(GiB)": 292.62, + "step": 20260, + "train_speed(iter/s)": 0.123927 + }, + { + "acc": 0.73102274, + "epoch": 0.1134153656009742, + "grad_norm": 7.375, + "learning_rate": 9.99877078754751e-06, + "loss": 1.06901388, + "memory(GiB)": 292.62, + "step": 20280, + "train_speed(iter/s)": 0.123981 + }, + { + "acc": 0.72576833, + "epoch": 0.11352721507395347, + "grad_norm": 8.125, + "learning_rate": 9.998750198940043e-06, + "loss": 1.10697174, + "memory(GiB)": 292.62, + "step": 20300, + "train_speed(iter/s)": 0.124037 + }, + { + "acc": 0.72972312, + "epoch": 0.11363906454693273, + "grad_norm": 6.21875, + "learning_rate": 9.998729439359472e-06, + "loss": 1.07537403, + "memory(GiB)": 292.62, + "step": 20320, + "train_speed(iter/s)": 0.124093 + }, + { + "acc": 0.71290822, + "epoch": 0.113750914019912, + "grad_norm": 7.21875, + "learning_rate": 9.998708508806501e-06, + "loss": 1.13357716, + "memory(GiB)": 292.62, + "step": 20340, + "train_speed(iter/s)": 0.124149 + }, + { + "acc": 0.72626848, + "epoch": 0.11386276349289126, + "grad_norm": 6.78125, + "learning_rate": 9.998687407281848e-06, + "loss": 1.07130718, + "memory(GiB)": 292.62, + "step": 20360, + "train_speed(iter/s)": 0.124209 + }, + { + "acc": 0.7219666, + "epoch": 0.11397461296587053, + "grad_norm": 7.71875, + "learning_rate": 9.998666134786237e-06, + "loss": 1.09573622, + "memory(GiB)": 292.62, + "step": 20380, + "train_speed(iter/s)": 0.124264 + }, + { + "acc": 0.72433939, + "epoch": 0.11408646243884979, + "grad_norm": 8.4375, + "learning_rate": 9.998644691320393e-06, + "loss": 1.11727238, + "memory(GiB)": 292.62, + "step": 20400, + "train_speed(iter/s)": 0.124319 + }, + { + "acc": 0.7268158, + "epoch": 0.11419831191182907, + "grad_norm": 8.125, + "learning_rate": 9.998623076885051e-06, + "loss": 1.09382296, + "memory(GiB)": 292.62, + "step": 20420, + "train_speed(iter/s)": 0.124378 + }, + { + "acc": 0.72627749, + "epoch": 0.11431016138480833, + "grad_norm": 7.25, + "learning_rate": 9.99860129148095e-06, + "loss": 1.09746332, + "memory(GiB)": 292.62, + "step": 20440, + "train_speed(iter/s)": 0.124438 + }, + { + "acc": 0.71173816, + "epoch": 0.1144220108577876, + "grad_norm": 5.71875, + "learning_rate": 9.998579335108833e-06, + "loss": 1.17425652, + "memory(GiB)": 292.62, + "step": 20460, + "train_speed(iter/s)": 0.124493 + }, + { + "acc": 0.73941364, + "epoch": 0.11453386033076686, + "grad_norm": 5.5, + "learning_rate": 9.998557207769452e-06, + "loss": 1.0534503, + "memory(GiB)": 292.62, + "step": 20480, + "train_speed(iter/s)": 0.124549 + }, + { + "acc": 0.73155756, + "epoch": 0.11464570980374612, + "grad_norm": 6.34375, + "learning_rate": 9.998534909463567e-06, + "loss": 1.07958202, + "memory(GiB)": 292.62, + "step": 20500, + "train_speed(iter/s)": 0.124606 + }, + { + "acc": 0.73027959, + "epoch": 0.11475755927672539, + "grad_norm": 9.3125, + "learning_rate": 9.998512440191937e-06, + "loss": 1.08433409, + "memory(GiB)": 292.62, + "step": 20520, + "train_speed(iter/s)": 0.12466 + }, + { + "acc": 0.73127213, + "epoch": 0.11486940874970465, + "grad_norm": 8.4375, + "learning_rate": 9.998489799955331e-06, + "loss": 1.05826969, + "memory(GiB)": 292.62, + "step": 20540, + "train_speed(iter/s)": 0.12472 + }, + { + "acc": 0.72248468, + "epoch": 0.11498125822268392, + "grad_norm": 12.625, + "learning_rate": 9.998466988754525e-06, + "loss": 1.11677027, + "memory(GiB)": 292.62, + "step": 20560, + "train_speed(iter/s)": 0.124777 + }, + { + "acc": 0.74082017, + "epoch": 0.11509310769566318, + "grad_norm": 7.75, + "learning_rate": 9.998444006590297e-06, + "loss": 1.02303123, + "memory(GiB)": 292.62, + "step": 20580, + "train_speed(iter/s)": 0.124832 + }, + { + "acc": 0.73034506, + "epoch": 0.11520495716864244, + "grad_norm": 8.5625, + "learning_rate": 9.998420853463437e-06, + "loss": 1.09792604, + "memory(GiB)": 292.62, + "step": 20600, + "train_speed(iter/s)": 0.12489 + }, + { + "acc": 0.72566972, + "epoch": 0.11531680664162171, + "grad_norm": 4.875, + "learning_rate": 9.998397529374732e-06, + "loss": 1.08195038, + "memory(GiB)": 292.62, + "step": 20620, + "train_speed(iter/s)": 0.124945 + }, + { + "acc": 0.7308579, + "epoch": 0.11542865611460097, + "grad_norm": 7.625, + "learning_rate": 9.998374034324983e-06, + "loss": 1.07199354, + "memory(GiB)": 292.62, + "step": 20640, + "train_speed(iter/s)": 0.125001 + }, + { + "acc": 0.70911283, + "epoch": 0.11554050558758024, + "grad_norm": 5.625, + "learning_rate": 9.998350368314993e-06, + "loss": 1.16908083, + "memory(GiB)": 292.62, + "step": 20660, + "train_speed(iter/s)": 0.125056 + }, + { + "acc": 0.71785378, + "epoch": 0.1156523550605595, + "grad_norm": 7.5625, + "learning_rate": 9.99832653134557e-06, + "loss": 1.12809753, + "memory(GiB)": 292.62, + "step": 20680, + "train_speed(iter/s)": 0.125116 + }, + { + "acc": 0.7217145, + "epoch": 0.11576420453353876, + "grad_norm": 6.46875, + "learning_rate": 9.998302523417531e-06, + "loss": 1.11155787, + "memory(GiB)": 292.62, + "step": 20700, + "train_speed(iter/s)": 0.125169 + }, + { + "acc": 0.7176033, + "epoch": 0.11587605400651803, + "grad_norm": 9.3125, + "learning_rate": 9.998278344531695e-06, + "loss": 1.11150455, + "memory(GiB)": 292.62, + "step": 20720, + "train_speed(iter/s)": 0.125228 + }, + { + "acc": 0.74227614, + "epoch": 0.11598790347949729, + "grad_norm": 6.90625, + "learning_rate": 9.998253994688893e-06, + "loss": 1.00140638, + "memory(GiB)": 292.62, + "step": 20740, + "train_speed(iter/s)": 0.125286 + }, + { + "acc": 0.71985583, + "epoch": 0.11609975295247656, + "grad_norm": 8.875, + "learning_rate": 9.998229473889953e-06, + "loss": 1.12052813, + "memory(GiB)": 292.62, + "step": 20760, + "train_speed(iter/s)": 0.125344 + }, + { + "acc": 0.73632522, + "epoch": 0.11621160242545582, + "grad_norm": 8.3125, + "learning_rate": 9.998204782135717e-06, + "loss": 1.04050303, + "memory(GiB)": 292.62, + "step": 20780, + "train_speed(iter/s)": 0.125401 + }, + { + "acc": 0.71329675, + "epoch": 0.11632345189843508, + "grad_norm": 6.8125, + "learning_rate": 9.998179919427028e-06, + "loss": 1.11335344, + "memory(GiB)": 292.62, + "step": 20800, + "train_speed(iter/s)": 0.125455 + }, + { + "acc": 0.71552382, + "epoch": 0.11643530137141435, + "grad_norm": 7.0625, + "learning_rate": 9.99815488576474e-06, + "loss": 1.13291492, + "memory(GiB)": 292.62, + "step": 20820, + "train_speed(iter/s)": 0.125504 + }, + { + "acc": 0.72835188, + "epoch": 0.11654715084439361, + "grad_norm": 8.0625, + "learning_rate": 9.998129681149703e-06, + "loss": 1.0835947, + "memory(GiB)": 292.62, + "step": 20840, + "train_speed(iter/s)": 0.125555 + }, + { + "acc": 0.74379611, + "epoch": 0.11665900031737288, + "grad_norm": 7.46875, + "learning_rate": 9.998104305582783e-06, + "loss": 0.98243275, + "memory(GiB)": 292.62, + "step": 20860, + "train_speed(iter/s)": 0.125614 + }, + { + "acc": 0.73032813, + "epoch": 0.11677084979035214, + "grad_norm": 5.0625, + "learning_rate": 9.998078759064847e-06, + "loss": 1.0525754, + "memory(GiB)": 292.62, + "step": 20880, + "train_speed(iter/s)": 0.125667 + }, + { + "acc": 0.73322458, + "epoch": 0.1168826992633314, + "grad_norm": 8.75, + "learning_rate": 9.99805304159677e-06, + "loss": 1.06751928, + "memory(GiB)": 292.62, + "step": 20900, + "train_speed(iter/s)": 0.125724 + }, + { + "acc": 0.70802307, + "epoch": 0.11699454873631067, + "grad_norm": 6.875, + "learning_rate": 9.998027153179429e-06, + "loss": 1.17513218, + "memory(GiB)": 292.62, + "step": 20920, + "train_speed(iter/s)": 0.125782 + }, + { + "acc": 0.72750063, + "epoch": 0.11710639820928993, + "grad_norm": 6.53125, + "learning_rate": 9.998001093813711e-06, + "loss": 1.06972399, + "memory(GiB)": 292.62, + "step": 20940, + "train_speed(iter/s)": 0.125834 + }, + { + "acc": 0.73110032, + "epoch": 0.1172182476822692, + "grad_norm": 8.4375, + "learning_rate": 9.99797486350051e-06, + "loss": 1.04290848, + "memory(GiB)": 292.62, + "step": 20960, + "train_speed(iter/s)": 0.125889 + }, + { + "acc": 0.73128514, + "epoch": 0.11733009715524846, + "grad_norm": 6.6875, + "learning_rate": 9.997948462240719e-06, + "loss": 1.06044817, + "memory(GiB)": 292.62, + "step": 20980, + "train_speed(iter/s)": 0.125943 + }, + { + "acc": 0.73972383, + "epoch": 0.11744194662822773, + "grad_norm": 6.46875, + "learning_rate": 9.997921890035242e-06, + "loss": 1.02021942, + "memory(GiB)": 292.62, + "step": 21000, + "train_speed(iter/s)": 0.126001 + }, + { + "acc": 0.73590593, + "epoch": 0.11755379610120699, + "grad_norm": 8.625, + "learning_rate": 9.997895146884989e-06, + "loss": 1.04388876, + "memory(GiB)": 292.62, + "step": 21020, + "train_speed(iter/s)": 0.126057 + }, + { + "acc": 0.73455138, + "epoch": 0.11766564557418625, + "grad_norm": 5.125, + "learning_rate": 9.997868232790873e-06, + "loss": 1.05915976, + "memory(GiB)": 292.62, + "step": 21040, + "train_speed(iter/s)": 0.126115 + }, + { + "acc": 0.71491008, + "epoch": 0.11777749504716553, + "grad_norm": 6.21875, + "learning_rate": 9.997841147753817e-06, + "loss": 1.11302414, + "memory(GiB)": 292.62, + "step": 21060, + "train_speed(iter/s)": 0.126168 + }, + { + "acc": 0.72262483, + "epoch": 0.1178893445201448, + "grad_norm": 8.0625, + "learning_rate": 9.997813891774745e-06, + "loss": 1.09465265, + "memory(GiB)": 292.62, + "step": 21080, + "train_speed(iter/s)": 0.126223 + }, + { + "acc": 0.71182652, + "epoch": 0.11800119399312406, + "grad_norm": 7.65625, + "learning_rate": 9.997786464854591e-06, + "loss": 1.15425415, + "memory(GiB)": 292.62, + "step": 21100, + "train_speed(iter/s)": 0.126274 + }, + { + "acc": 0.72672172, + "epoch": 0.11811304346610332, + "grad_norm": 4.75, + "learning_rate": 9.997758866994291e-06, + "loss": 1.09219618, + "memory(GiB)": 292.62, + "step": 21120, + "train_speed(iter/s)": 0.126329 + }, + { + "acc": 0.72027249, + "epoch": 0.11822489293908259, + "grad_norm": 7.59375, + "learning_rate": 9.997731098194792e-06, + "loss": 1.09901896, + "memory(GiB)": 292.62, + "step": 21140, + "train_speed(iter/s)": 0.126387 + }, + { + "acc": 0.72672133, + "epoch": 0.11833674241206185, + "grad_norm": 6.15625, + "learning_rate": 9.997703158457041e-06, + "loss": 1.07539597, + "memory(GiB)": 292.62, + "step": 21160, + "train_speed(iter/s)": 0.126443 + }, + { + "acc": 0.71939044, + "epoch": 0.11844859188504112, + "grad_norm": 9.0625, + "learning_rate": 9.997675047781995e-06, + "loss": 1.1254652, + "memory(GiB)": 292.62, + "step": 21180, + "train_speed(iter/s)": 0.126499 + }, + { + "acc": 0.70922904, + "epoch": 0.11856044135802038, + "grad_norm": 4.71875, + "learning_rate": 9.997646766170615e-06, + "loss": 1.15458422, + "memory(GiB)": 292.62, + "step": 21200, + "train_speed(iter/s)": 0.126552 + }, + { + "acc": 0.7180685, + "epoch": 0.11867229083099964, + "grad_norm": 8.625, + "learning_rate": 9.997618313623869e-06, + "loss": 1.15262575, + "memory(GiB)": 292.62, + "step": 21220, + "train_speed(iter/s)": 0.126604 + }, + { + "acc": 0.72332678, + "epoch": 0.11878414030397891, + "grad_norm": 9.125, + "learning_rate": 9.997589690142729e-06, + "loss": 1.11728888, + "memory(GiB)": 292.62, + "step": 21240, + "train_speed(iter/s)": 0.12666 + }, + { + "acc": 0.7457078, + "epoch": 0.11889598977695817, + "grad_norm": 6.625, + "learning_rate": 9.997560895728176e-06, + "loss": 0.99195366, + "memory(GiB)": 292.62, + "step": 21260, + "train_speed(iter/s)": 0.126718 + }, + { + "acc": 0.71052184, + "epoch": 0.11900783924993744, + "grad_norm": 6.65625, + "learning_rate": 9.997531930381193e-06, + "loss": 1.14640274, + "memory(GiB)": 292.62, + "step": 21280, + "train_speed(iter/s)": 0.126773 + }, + { + "acc": 0.71749778, + "epoch": 0.1191196887229167, + "grad_norm": 5.15625, + "learning_rate": 9.997502794102769e-06, + "loss": 1.10682325, + "memory(GiB)": 292.62, + "step": 21300, + "train_speed(iter/s)": 0.126829 + }, + { + "acc": 0.74870796, + "epoch": 0.11923153819589596, + "grad_norm": 13.75, + "learning_rate": 9.997473486893905e-06, + "loss": 0.98126345, + "memory(GiB)": 292.62, + "step": 21320, + "train_speed(iter/s)": 0.126887 + }, + { + "acc": 0.71928101, + "epoch": 0.11934338766887523, + "grad_norm": 8.125, + "learning_rate": 9.9974440087556e-06, + "loss": 1.12439489, + "memory(GiB)": 292.62, + "step": 21340, + "train_speed(iter/s)": 0.126942 + }, + { + "acc": 0.72968097, + "epoch": 0.11945523714185449, + "grad_norm": 7.625, + "learning_rate": 9.997414359688865e-06, + "loss": 1.12605562, + "memory(GiB)": 292.62, + "step": 21360, + "train_speed(iter/s)": 0.126996 + }, + { + "acc": 0.73420959, + "epoch": 0.11956708661483376, + "grad_norm": 7.71875, + "learning_rate": 9.997384539694713e-06, + "loss": 1.04062757, + "memory(GiB)": 292.62, + "step": 21380, + "train_speed(iter/s)": 0.12705 + }, + { + "acc": 0.72652726, + "epoch": 0.11967893608781302, + "grad_norm": 8.25, + "learning_rate": 9.99735454877416e-06, + "loss": 1.06612759, + "memory(GiB)": 292.62, + "step": 21400, + "train_speed(iter/s)": 0.127104 + }, + { + "acc": 0.74090075, + "epoch": 0.11979078556079228, + "grad_norm": 10.1875, + "learning_rate": 9.997324386928237e-06, + "loss": 1.02083035, + "memory(GiB)": 292.62, + "step": 21420, + "train_speed(iter/s)": 0.127159 + }, + { + "acc": 0.71662068, + "epoch": 0.11990263503377155, + "grad_norm": 9.9375, + "learning_rate": 9.997294054157976e-06, + "loss": 1.14034147, + "memory(GiB)": 292.62, + "step": 21440, + "train_speed(iter/s)": 0.127214 + }, + { + "acc": 0.72583108, + "epoch": 0.12001448450675081, + "grad_norm": 5.625, + "learning_rate": 9.99726355046441e-06, + "loss": 1.09060736, + "memory(GiB)": 292.62, + "step": 21460, + "train_speed(iter/s)": 0.127268 + }, + { + "acc": 0.73059883, + "epoch": 0.12012633397973008, + "grad_norm": 6.78125, + "learning_rate": 9.997232875848585e-06, + "loss": 1.04875011, + "memory(GiB)": 292.62, + "step": 21480, + "train_speed(iter/s)": 0.127325 + }, + { + "acc": 0.72585807, + "epoch": 0.12023818345270934, + "grad_norm": 4.5, + "learning_rate": 9.997202030311552e-06, + "loss": 1.08171892, + "memory(GiB)": 292.62, + "step": 21500, + "train_speed(iter/s)": 0.127376 + }, + { + "acc": 0.73978343, + "epoch": 0.1203500329256886, + "grad_norm": 7.21875, + "learning_rate": 9.997171013854361e-06, + "loss": 1.01603413, + "memory(GiB)": 292.62, + "step": 21520, + "train_speed(iter/s)": 0.127432 + }, + { + "acc": 0.71898098, + "epoch": 0.12046188239866787, + "grad_norm": 7.21875, + "learning_rate": 9.997139826478078e-06, + "loss": 1.09452114, + "memory(GiB)": 292.62, + "step": 21540, + "train_speed(iter/s)": 0.127488 + }, + { + "acc": 0.74159307, + "epoch": 0.12057373187164713, + "grad_norm": 8.9375, + "learning_rate": 9.997108468183765e-06, + "loss": 1.03791409, + "memory(GiB)": 292.62, + "step": 21560, + "train_speed(iter/s)": 0.127541 + }, + { + "acc": 0.72674279, + "epoch": 0.1206855813446264, + "grad_norm": 10.0625, + "learning_rate": 9.9970769389725e-06, + "loss": 1.08836832, + "memory(GiB)": 292.62, + "step": 21580, + "train_speed(iter/s)": 0.127593 + }, + { + "acc": 0.73143158, + "epoch": 0.12079743081760566, + "grad_norm": 8.1875, + "learning_rate": 9.997045238845355e-06, + "loss": 1.07227354, + "memory(GiB)": 292.62, + "step": 21600, + "train_speed(iter/s)": 0.127645 + }, + { + "acc": 0.712391, + "epoch": 0.12090928029058493, + "grad_norm": 8.3125, + "learning_rate": 9.997013367803418e-06, + "loss": 1.15864563, + "memory(GiB)": 292.62, + "step": 21620, + "train_speed(iter/s)": 0.127693 + }, + { + "acc": 0.74424496, + "epoch": 0.12102112976356419, + "grad_norm": 7.4375, + "learning_rate": 9.996981325847782e-06, + "loss": 1.01399641, + "memory(GiB)": 292.62, + "step": 21640, + "train_speed(iter/s)": 0.12775 + }, + { + "acc": 0.72802324, + "epoch": 0.12113297923654345, + "grad_norm": 7.46875, + "learning_rate": 9.996949112979536e-06, + "loss": 1.08214664, + "memory(GiB)": 292.62, + "step": 21660, + "train_speed(iter/s)": 0.127805 + }, + { + "acc": 0.71377583, + "epoch": 0.12124482870952273, + "grad_norm": 6.65625, + "learning_rate": 9.996916729199788e-06, + "loss": 1.13137283, + "memory(GiB)": 292.62, + "step": 21680, + "train_speed(iter/s)": 0.127856 + }, + { + "acc": 0.72622356, + "epoch": 0.121356678182502, + "grad_norm": 6.375, + "learning_rate": 9.996884174509643e-06, + "loss": 1.10339069, + "memory(GiB)": 292.62, + "step": 21700, + "train_speed(iter/s)": 0.127908 + }, + { + "acc": 0.73175545, + "epoch": 0.12146852765548126, + "grad_norm": 7.0, + "learning_rate": 9.996851448910211e-06, + "loss": 1.05416622, + "memory(GiB)": 292.62, + "step": 21720, + "train_speed(iter/s)": 0.127965 + }, + { + "acc": 0.71254444, + "epoch": 0.12158037712846052, + "grad_norm": 7.25, + "learning_rate": 9.99681855240262e-06, + "loss": 1.1349843, + "memory(GiB)": 292.62, + "step": 21740, + "train_speed(iter/s)": 0.128016 + }, + { + "acc": 0.73194318, + "epoch": 0.12169222660143979, + "grad_norm": 5.59375, + "learning_rate": 9.996785484987987e-06, + "loss": 1.05536919, + "memory(GiB)": 292.62, + "step": 21760, + "train_speed(iter/s)": 0.128069 + }, + { + "acc": 0.72158289, + "epoch": 0.12180407607441905, + "grad_norm": 7.6875, + "learning_rate": 9.996752246667447e-06, + "loss": 1.07363882, + "memory(GiB)": 292.62, + "step": 21780, + "train_speed(iter/s)": 0.128121 + }, + { + "acc": 0.719526, + "epoch": 0.12191592554739832, + "grad_norm": 4.03125, + "learning_rate": 9.996718837442133e-06, + "loss": 1.10256338, + "memory(GiB)": 292.62, + "step": 21800, + "train_speed(iter/s)": 0.128176 + }, + { + "acc": 0.70267582, + "epoch": 0.12202777502037758, + "grad_norm": 6.0, + "learning_rate": 9.996685257313194e-06, + "loss": 1.20111389, + "memory(GiB)": 292.62, + "step": 21820, + "train_speed(iter/s)": 0.12823 + }, + { + "acc": 0.74763684, + "epoch": 0.12213962449335684, + "grad_norm": 6.34375, + "learning_rate": 9.996651506281775e-06, + "loss": 0.9843647, + "memory(GiB)": 292.62, + "step": 21840, + "train_speed(iter/s)": 0.128283 + }, + { + "acc": 0.72107592, + "epoch": 0.12225147396633611, + "grad_norm": 6.3125, + "learning_rate": 9.99661758434903e-06, + "loss": 1.10158796, + "memory(GiB)": 292.62, + "step": 21860, + "train_speed(iter/s)": 0.128337 + }, + { + "acc": 0.73859921, + "epoch": 0.12236332343931537, + "grad_norm": 8.0625, + "learning_rate": 9.99658349151612e-06, + "loss": 1.0535161, + "memory(GiB)": 292.62, + "step": 21880, + "train_speed(iter/s)": 0.128392 + }, + { + "acc": 0.73655896, + "epoch": 0.12247517291229464, + "grad_norm": 5.1875, + "learning_rate": 9.996549227784211e-06, + "loss": 1.02855482, + "memory(GiB)": 292.62, + "step": 21900, + "train_speed(iter/s)": 0.128445 + }, + { + "acc": 0.7233851, + "epoch": 0.1225870223852739, + "grad_norm": 8.0625, + "learning_rate": 9.996514793154472e-06, + "loss": 1.09362698, + "memory(GiB)": 292.62, + "step": 21920, + "train_speed(iter/s)": 0.128497 + }, + { + "acc": 0.72619085, + "epoch": 0.12269887185825316, + "grad_norm": 6.84375, + "learning_rate": 9.996480187628088e-06, + "loss": 1.08374739, + "memory(GiB)": 292.62, + "step": 21940, + "train_speed(iter/s)": 0.128547 + }, + { + "acc": 0.72854986, + "epoch": 0.12281072133123243, + "grad_norm": 8.1875, + "learning_rate": 9.996445411206235e-06, + "loss": 1.07908134, + "memory(GiB)": 292.62, + "step": 21960, + "train_speed(iter/s)": 0.128597 + }, + { + "acc": 0.72453818, + "epoch": 0.12292257080421169, + "grad_norm": 7.875, + "learning_rate": 9.996410463890106e-06, + "loss": 1.08027058, + "memory(GiB)": 292.62, + "step": 21980, + "train_speed(iter/s)": 0.128649 + }, + { + "acc": 0.72146091, + "epoch": 0.12303442027719096, + "grad_norm": 6.375, + "learning_rate": 9.996375345680896e-06, + "loss": 1.0813282, + "memory(GiB)": 292.62, + "step": 22000, + "train_speed(iter/s)": 0.128702 + }, + { + "epoch": 0.12303442027719096, + "eval_acc": 0.6889262854309559, + "eval_loss": 1.0878078937530518, + "eval_runtime": 7576.9078, + "eval_samples_per_second": 9.936, + "eval_steps_per_second": 9.936, + "step": 22000 + }, + { + "acc": 0.71959963, + "epoch": 0.12314626975017022, + "grad_norm": 7.71875, + "learning_rate": 9.996340056579808e-06, + "loss": 1.13602314, + "memory(GiB)": 292.62, + "step": 22020, + "train_speed(iter/s)": 0.123224 + }, + { + "acc": 0.71939034, + "epoch": 0.12325811922314948, + "grad_norm": 6.0625, + "learning_rate": 9.996304596588045e-06, + "loss": 1.11342049, + "memory(GiB)": 292.62, + "step": 22040, + "train_speed(iter/s)": 0.123278 + }, + { + "acc": 0.72497797, + "epoch": 0.12336996869612875, + "grad_norm": 9.75, + "learning_rate": 9.996268965706822e-06, + "loss": 1.10512333, + "memory(GiB)": 292.62, + "step": 22060, + "train_speed(iter/s)": 0.123329 + }, + { + "acc": 0.70260911, + "epoch": 0.12348181816910801, + "grad_norm": 6.9375, + "learning_rate": 9.996233163937358e-06, + "loss": 1.19064493, + "memory(GiB)": 292.62, + "step": 22080, + "train_speed(iter/s)": 0.123382 + }, + { + "acc": 0.71528687, + "epoch": 0.12359366764208728, + "grad_norm": 7.96875, + "learning_rate": 9.99619719128088e-06, + "loss": 1.15483303, + "memory(GiB)": 292.62, + "step": 22100, + "train_speed(iter/s)": 0.123434 + }, + { + "acc": 0.71101522, + "epoch": 0.12370551711506654, + "grad_norm": 6.71875, + "learning_rate": 9.99616104773861e-06, + "loss": 1.15291891, + "memory(GiB)": 292.62, + "step": 22120, + "train_speed(iter/s)": 0.123486 + }, + { + "acc": 0.74693775, + "epoch": 0.1238173665880458, + "grad_norm": 12.1875, + "learning_rate": 9.996124733311794e-06, + "loss": 1.00897846, + "memory(GiB)": 292.62, + "step": 22140, + "train_speed(iter/s)": 0.123541 + }, + { + "acc": 0.73523798, + "epoch": 0.12392921606102507, + "grad_norm": 4.34375, + "learning_rate": 9.996088248001668e-06, + "loss": 1.03627605, + "memory(GiB)": 292.62, + "step": 22160, + "train_speed(iter/s)": 0.123592 + }, + { + "acc": 0.72825203, + "epoch": 0.12404106553400433, + "grad_norm": 5.15625, + "learning_rate": 9.996051591809484e-06, + "loss": 1.0671093, + "memory(GiB)": 292.62, + "step": 22180, + "train_speed(iter/s)": 0.123643 + }, + { + "acc": 0.72875428, + "epoch": 0.1241529150069836, + "grad_norm": 7.625, + "learning_rate": 9.99601476473649e-06, + "loss": 1.06964293, + "memory(GiB)": 292.62, + "step": 22200, + "train_speed(iter/s)": 0.123696 + }, + { + "acc": 0.71200991, + "epoch": 0.12426476447996286, + "grad_norm": 6.90625, + "learning_rate": 9.995977766783952e-06, + "loss": 1.14497709, + "memory(GiB)": 292.62, + "step": 22220, + "train_speed(iter/s)": 0.123739 + }, + { + "acc": 0.72271285, + "epoch": 0.12437661395294212, + "grad_norm": 5.71875, + "learning_rate": 9.99594059795313e-06, + "loss": 1.09162016, + "memory(GiB)": 292.62, + "step": 22240, + "train_speed(iter/s)": 0.123793 + }, + { + "acc": 0.73706064, + "epoch": 0.12448846342592139, + "grad_norm": 5.78125, + "learning_rate": 9.9959032582453e-06, + "loss": 1.05037651, + "memory(GiB)": 292.62, + "step": 22260, + "train_speed(iter/s)": 0.12384 + }, + { + "acc": 0.69927473, + "epoch": 0.12460031289890065, + "grad_norm": 6.84375, + "learning_rate": 9.995865747661737e-06, + "loss": 1.18953781, + "memory(GiB)": 292.62, + "step": 22280, + "train_speed(iter/s)": 0.12389 + }, + { + "acc": 0.71833825, + "epoch": 0.12471216237187992, + "grad_norm": 5.28125, + "learning_rate": 9.995828066203722e-06, + "loss": 1.11914101, + "memory(GiB)": 292.62, + "step": 22300, + "train_speed(iter/s)": 0.123941 + }, + { + "acc": 0.72441406, + "epoch": 0.1248240118448592, + "grad_norm": 4.84375, + "learning_rate": 9.995790213872549e-06, + "loss": 1.07347422, + "memory(GiB)": 292.62, + "step": 22320, + "train_speed(iter/s)": 0.123995 + }, + { + "acc": 0.73335166, + "epoch": 0.12493586131783846, + "grad_norm": 7.28125, + "learning_rate": 9.995752190669507e-06, + "loss": 1.07075415, + "memory(GiB)": 292.62, + "step": 22340, + "train_speed(iter/s)": 0.124047 + }, + { + "acc": 0.72454081, + "epoch": 0.12504771079081772, + "grad_norm": 9.0, + "learning_rate": 9.9957139965959e-06, + "loss": 1.10695829, + "memory(GiB)": 292.62, + "step": 22360, + "train_speed(iter/s)": 0.1241 + }, + { + "acc": 0.72195873, + "epoch": 0.125159560263797, + "grad_norm": 4.0, + "learning_rate": 9.995675631653033e-06, + "loss": 1.09130602, + "memory(GiB)": 292.62, + "step": 22380, + "train_speed(iter/s)": 0.124151 + }, + { + "acc": 0.73430457, + "epoch": 0.12527140973677625, + "grad_norm": 7.84375, + "learning_rate": 9.995637095842218e-06, + "loss": 1.06327, + "memory(GiB)": 292.62, + "step": 22400, + "train_speed(iter/s)": 0.124202 + }, + { + "acc": 0.72590785, + "epoch": 0.12538325920975552, + "grad_norm": 8.3125, + "learning_rate": 9.995598389164774e-06, + "loss": 1.08214016, + "memory(GiB)": 292.62, + "step": 22420, + "train_speed(iter/s)": 0.124252 + }, + { + "acc": 0.74431157, + "epoch": 0.12549510868273478, + "grad_norm": 9.6875, + "learning_rate": 9.995559511622024e-06, + "loss": 0.99778605, + "memory(GiB)": 292.62, + "step": 22440, + "train_speed(iter/s)": 0.124305 + }, + { + "acc": 0.71444354, + "epoch": 0.12560695815571404, + "grad_norm": 7.5, + "learning_rate": 9.9955204632153e-06, + "loss": 1.1692421, + "memory(GiB)": 292.62, + "step": 22460, + "train_speed(iter/s)": 0.124354 + }, + { + "acc": 0.73378706, + "epoch": 0.1257188076286933, + "grad_norm": 6.1875, + "learning_rate": 9.995481243945933e-06, + "loss": 1.05077429, + "memory(GiB)": 292.62, + "step": 22480, + "train_speed(iter/s)": 0.124401 + }, + { + "acc": 0.70545011, + "epoch": 0.12583065710167257, + "grad_norm": 8.625, + "learning_rate": 9.995441853815269e-06, + "loss": 1.19495792, + "memory(GiB)": 292.62, + "step": 22500, + "train_speed(iter/s)": 0.12445 + }, + { + "acc": 0.72791815, + "epoch": 0.12594250657465184, + "grad_norm": 6.5, + "learning_rate": 9.995402292824655e-06, + "loss": 1.07703743, + "memory(GiB)": 292.62, + "step": 22520, + "train_speed(iter/s)": 0.124503 + }, + { + "acc": 0.74065843, + "epoch": 0.1260543560476311, + "grad_norm": 6.9375, + "learning_rate": 9.995362560975439e-06, + "loss": 1.03020716, + "memory(GiB)": 292.62, + "step": 22540, + "train_speed(iter/s)": 0.124553 + }, + { + "acc": 0.73188839, + "epoch": 0.12616620552061036, + "grad_norm": 5.96875, + "learning_rate": 9.995322658268986e-06, + "loss": 1.04310131, + "memory(GiB)": 292.62, + "step": 22560, + "train_speed(iter/s)": 0.124604 + }, + { + "acc": 0.73161564, + "epoch": 0.12627805499358963, + "grad_norm": 7.65625, + "learning_rate": 9.995282584706657e-06, + "loss": 1.05977573, + "memory(GiB)": 292.62, + "step": 22580, + "train_speed(iter/s)": 0.124653 + }, + { + "acc": 0.72152605, + "epoch": 0.1263899044665689, + "grad_norm": 7.46875, + "learning_rate": 9.995242340289825e-06, + "loss": 1.10203619, + "memory(GiB)": 292.62, + "step": 22600, + "train_speed(iter/s)": 0.124704 + }, + { + "acc": 0.75418835, + "epoch": 0.12650175393954816, + "grad_norm": 7.625, + "learning_rate": 9.995201925019865e-06, + "loss": 0.96710472, + "memory(GiB)": 292.62, + "step": 22620, + "train_speed(iter/s)": 0.124756 + }, + { + "acc": 0.72914329, + "epoch": 0.12661360341252742, + "grad_norm": 6.59375, + "learning_rate": 9.99516133889816e-06, + "loss": 1.0716629, + "memory(GiB)": 292.62, + "step": 22640, + "train_speed(iter/s)": 0.124807 + }, + { + "acc": 0.72858644, + "epoch": 0.12672545288550668, + "grad_norm": 6.90625, + "learning_rate": 9.995120581926099e-06, + "loss": 1.08244181, + "memory(GiB)": 292.62, + "step": 22660, + "train_speed(iter/s)": 0.124857 + }, + { + "acc": 0.73611174, + "epoch": 0.12683730235848595, + "grad_norm": 6.25, + "learning_rate": 9.995079654105074e-06, + "loss": 1.02694693, + "memory(GiB)": 292.62, + "step": 22680, + "train_speed(iter/s)": 0.124907 + }, + { + "acc": 0.71637373, + "epoch": 0.1269491518314652, + "grad_norm": 8.0, + "learning_rate": 9.995038555436486e-06, + "loss": 1.13811932, + "memory(GiB)": 292.62, + "step": 22700, + "train_speed(iter/s)": 0.124956 + }, + { + "acc": 0.72786236, + "epoch": 0.12706100130444448, + "grad_norm": 6.5, + "learning_rate": 9.99499728592174e-06, + "loss": 1.06342554, + "memory(GiB)": 292.62, + "step": 22720, + "train_speed(iter/s)": 0.125005 + }, + { + "acc": 0.7247025, + "epoch": 0.12717285077742374, + "grad_norm": 6.625, + "learning_rate": 9.994955845562248e-06, + "loss": 1.06850939, + "memory(GiB)": 292.62, + "step": 22740, + "train_speed(iter/s)": 0.125057 + }, + { + "acc": 0.72060442, + "epoch": 0.127284700250403, + "grad_norm": 9.25, + "learning_rate": 9.994914234359427e-06, + "loss": 1.09609079, + "memory(GiB)": 292.62, + "step": 22760, + "train_speed(iter/s)": 0.125106 + }, + { + "acc": 0.73427796, + "epoch": 0.12739654972338227, + "grad_norm": 8.3125, + "learning_rate": 9.994872452314703e-06, + "loss": 1.05043478, + "memory(GiB)": 292.62, + "step": 22780, + "train_speed(iter/s)": 0.125157 + }, + { + "acc": 0.74919424, + "epoch": 0.12750839919636153, + "grad_norm": 7.21875, + "learning_rate": 9.994830499429501e-06, + "loss": 0.98276873, + "memory(GiB)": 292.62, + "step": 22800, + "train_speed(iter/s)": 0.125206 + }, + { + "acc": 0.71844845, + "epoch": 0.1276202486693408, + "grad_norm": 9.25, + "learning_rate": 9.994788375705259e-06, + "loss": 1.12388277, + "memory(GiB)": 292.62, + "step": 22820, + "train_speed(iter/s)": 0.125256 + }, + { + "acc": 0.72052612, + "epoch": 0.12773209814232006, + "grad_norm": 7.625, + "learning_rate": 9.994746081143415e-06, + "loss": 1.08277721, + "memory(GiB)": 292.62, + "step": 22840, + "train_speed(iter/s)": 0.125307 + }, + { + "acc": 0.71812172, + "epoch": 0.12784394761529932, + "grad_norm": 7.78125, + "learning_rate": 9.994703615745418e-06, + "loss": 1.11310635, + "memory(GiB)": 292.62, + "step": 22860, + "train_speed(iter/s)": 0.125359 + }, + { + "acc": 0.71790862, + "epoch": 0.1279557970882786, + "grad_norm": 5.5625, + "learning_rate": 9.994660979512719e-06, + "loss": 1.12081385, + "memory(GiB)": 292.62, + "step": 22880, + "train_speed(iter/s)": 0.12541 + }, + { + "acc": 0.72333021, + "epoch": 0.12806764656125785, + "grad_norm": 8.125, + "learning_rate": 9.994618172446777e-06, + "loss": 1.10092249, + "memory(GiB)": 292.62, + "step": 22900, + "train_speed(iter/s)": 0.125462 + }, + { + "acc": 0.72578979, + "epoch": 0.12817949603423712, + "grad_norm": 5.375, + "learning_rate": 9.994575194549057e-06, + "loss": 1.06589661, + "memory(GiB)": 292.62, + "step": 22920, + "train_speed(iter/s)": 0.125514 + }, + { + "acc": 0.72318649, + "epoch": 0.12829134550721638, + "grad_norm": 7.40625, + "learning_rate": 9.994532045821028e-06, + "loss": 1.09465199, + "memory(GiB)": 292.62, + "step": 22940, + "train_speed(iter/s)": 0.125563 + }, + { + "acc": 0.71729822, + "epoch": 0.12840319498019565, + "grad_norm": 7.46875, + "learning_rate": 9.994488726264165e-06, + "loss": 1.13838835, + "memory(GiB)": 292.62, + "step": 22960, + "train_speed(iter/s)": 0.12561 + }, + { + "acc": 0.73348989, + "epoch": 0.1285150444531749, + "grad_norm": 6.6875, + "learning_rate": 9.99444523587995e-06, + "loss": 1.07517281, + "memory(GiB)": 292.62, + "step": 22980, + "train_speed(iter/s)": 0.125661 + }, + { + "acc": 0.71150041, + "epoch": 0.12862689392615417, + "grad_norm": 5.625, + "learning_rate": 9.994401574669872e-06, + "loss": 1.17367859, + "memory(GiB)": 292.62, + "step": 23000, + "train_speed(iter/s)": 0.12571 + }, + { + "acc": 0.72685413, + "epoch": 0.12873874339913344, + "grad_norm": 5.34375, + "learning_rate": 9.994357742635423e-06, + "loss": 1.07592802, + "memory(GiB)": 292.62, + "step": 23020, + "train_speed(iter/s)": 0.125761 + }, + { + "acc": 0.7326334, + "epoch": 0.1288505928721127, + "grad_norm": 8.625, + "learning_rate": 9.994313739778102e-06, + "loss": 1.04683313, + "memory(GiB)": 292.62, + "step": 23040, + "train_speed(iter/s)": 0.125814 + }, + { + "acc": 0.74148197, + "epoch": 0.12896244234509197, + "grad_norm": 5.4375, + "learning_rate": 9.994269566099416e-06, + "loss": 1.01993771, + "memory(GiB)": 292.62, + "step": 23060, + "train_speed(iter/s)": 0.125865 + }, + { + "acc": 0.7395359, + "epoch": 0.12907429181807123, + "grad_norm": 5.3125, + "learning_rate": 9.994225221600873e-06, + "loss": 1.02131691, + "memory(GiB)": 292.62, + "step": 23080, + "train_speed(iter/s)": 0.125912 + }, + { + "acc": 0.71029816, + "epoch": 0.1291861412910505, + "grad_norm": 5.71875, + "learning_rate": 9.994180706283992e-06, + "loss": 1.14865065, + "memory(GiB)": 292.62, + "step": 23100, + "train_speed(iter/s)": 0.125961 + }, + { + "acc": 0.72094655, + "epoch": 0.12929799076402976, + "grad_norm": 6.96875, + "learning_rate": 9.994136020150294e-06, + "loss": 1.10474968, + "memory(GiB)": 292.62, + "step": 23120, + "train_speed(iter/s)": 0.126015 + }, + { + "acc": 0.73908086, + "epoch": 0.12940984023700902, + "grad_norm": 7.25, + "learning_rate": 9.994091163201308e-06, + "loss": 1.00954046, + "memory(GiB)": 292.62, + "step": 23140, + "train_speed(iter/s)": 0.126064 + }, + { + "acc": 0.72828388, + "epoch": 0.12952168970998829, + "grad_norm": 9.1875, + "learning_rate": 9.994046135438568e-06, + "loss": 1.09900579, + "memory(GiB)": 292.62, + "step": 23160, + "train_speed(iter/s)": 0.126115 + }, + { + "acc": 0.71433816, + "epoch": 0.12963353918296755, + "grad_norm": 5.9375, + "learning_rate": 9.994000936863617e-06, + "loss": 1.15823545, + "memory(GiB)": 292.62, + "step": 23180, + "train_speed(iter/s)": 0.126168 + }, + { + "acc": 0.73216834, + "epoch": 0.12974538865594681, + "grad_norm": 5.75, + "learning_rate": 9.993955567477995e-06, + "loss": 1.05882311, + "memory(GiB)": 292.62, + "step": 23200, + "train_speed(iter/s)": 0.126217 + }, + { + "acc": 0.71665797, + "epoch": 0.12985723812892608, + "grad_norm": 7.53125, + "learning_rate": 9.99391002728326e-06, + "loss": 1.11982718, + "memory(GiB)": 292.62, + "step": 23220, + "train_speed(iter/s)": 0.126267 + }, + { + "acc": 0.73857584, + "epoch": 0.12996908760190534, + "grad_norm": 5.96875, + "learning_rate": 9.993864316280965e-06, + "loss": 1.02844763, + "memory(GiB)": 292.62, + "step": 23240, + "train_speed(iter/s)": 0.126315 + }, + { + "acc": 0.72754216, + "epoch": 0.13008093707488463, + "grad_norm": 7.59375, + "learning_rate": 9.993818434472677e-06, + "loss": 1.08616142, + "memory(GiB)": 292.62, + "step": 23260, + "train_speed(iter/s)": 0.126363 + }, + { + "acc": 0.72321353, + "epoch": 0.1301927865478639, + "grad_norm": 6.71875, + "learning_rate": 9.99377238185996e-06, + "loss": 1.09583597, + "memory(GiB)": 292.62, + "step": 23280, + "train_speed(iter/s)": 0.126415 + }, + { + "acc": 0.74475846, + "epoch": 0.13030463602084316, + "grad_norm": 5.25, + "learning_rate": 9.993726158444397e-06, + "loss": 0.99282026, + "memory(GiB)": 292.62, + "step": 23300, + "train_speed(iter/s)": 0.126465 + }, + { + "acc": 0.71364279, + "epoch": 0.13041648549382243, + "grad_norm": 6.53125, + "learning_rate": 9.993679764227563e-06, + "loss": 1.13940077, + "memory(GiB)": 292.62, + "step": 23320, + "train_speed(iter/s)": 0.126512 + }, + { + "acc": 0.7157557, + "epoch": 0.1305283349668017, + "grad_norm": 7.0, + "learning_rate": 9.993633199211045e-06, + "loss": 1.12647133, + "memory(GiB)": 292.62, + "step": 23340, + "train_speed(iter/s)": 0.126564 + }, + { + "acc": 0.72247863, + "epoch": 0.13064018443978095, + "grad_norm": 7.59375, + "learning_rate": 9.99358646339644e-06, + "loss": 1.11831722, + "memory(GiB)": 292.62, + "step": 23360, + "train_speed(iter/s)": 0.126615 + }, + { + "acc": 0.73270311, + "epoch": 0.13075203391276022, + "grad_norm": 9.375, + "learning_rate": 9.993539556785343e-06, + "loss": 1.08194695, + "memory(GiB)": 292.62, + "step": 23380, + "train_speed(iter/s)": 0.126665 + }, + { + "acc": 0.71860523, + "epoch": 0.13086388338573948, + "grad_norm": 4.40625, + "learning_rate": 9.993492479379359e-06, + "loss": 1.1295289, + "memory(GiB)": 292.62, + "step": 23400, + "train_speed(iter/s)": 0.126714 + }, + { + "acc": 0.73514524, + "epoch": 0.13097573285871875, + "grad_norm": 7.875, + "learning_rate": 9.993445231180097e-06, + "loss": 1.03860359, + "memory(GiB)": 292.62, + "step": 23420, + "train_speed(iter/s)": 0.126764 + }, + { + "acc": 0.71094203, + "epoch": 0.131087582331698, + "grad_norm": 9.3125, + "learning_rate": 9.993397812189175e-06, + "loss": 1.18654299, + "memory(GiB)": 292.62, + "step": 23440, + "train_speed(iter/s)": 0.126811 + }, + { + "acc": 0.70985818, + "epoch": 0.13119943180467727, + "grad_norm": 6.03125, + "learning_rate": 9.993350222408214e-06, + "loss": 1.15504427, + "memory(GiB)": 292.62, + "step": 23460, + "train_speed(iter/s)": 0.126859 + }, + { + "acc": 0.71934915, + "epoch": 0.13131128127765654, + "grad_norm": 7.0, + "learning_rate": 9.993302461838843e-06, + "loss": 1.10240135, + "memory(GiB)": 292.62, + "step": 23480, + "train_speed(iter/s)": 0.126906 + }, + { + "acc": 0.71898127, + "epoch": 0.1314231307506358, + "grad_norm": 5.15625, + "learning_rate": 9.993254530482692e-06, + "loss": 1.10552149, + "memory(GiB)": 292.62, + "step": 23500, + "train_speed(iter/s)": 0.126958 + }, + { + "acc": 0.71389527, + "epoch": 0.13153498022361507, + "grad_norm": 8.4375, + "learning_rate": 9.993206428341405e-06, + "loss": 1.16295481, + "memory(GiB)": 292.62, + "step": 23520, + "train_speed(iter/s)": 0.127008 + }, + { + "acc": 0.7339417, + "epoch": 0.13164682969659433, + "grad_norm": 6.1875, + "learning_rate": 9.993158155416625e-06, + "loss": 1.05249567, + "memory(GiB)": 292.62, + "step": 23540, + "train_speed(iter/s)": 0.127054 + }, + { + "acc": 0.72172608, + "epoch": 0.1317586791695736, + "grad_norm": 8.0, + "learning_rate": 9.993109711710004e-06, + "loss": 1.11167116, + "memory(GiB)": 292.62, + "step": 23560, + "train_speed(iter/s)": 0.127104 + }, + { + "acc": 0.72181649, + "epoch": 0.13187052864255286, + "grad_norm": 7.90625, + "learning_rate": 9.993061097223196e-06, + "loss": 1.1192812, + "memory(GiB)": 292.62, + "step": 23580, + "train_speed(iter/s)": 0.127147 + }, + { + "acc": 0.72491202, + "epoch": 0.13198237811553212, + "grad_norm": 7.65625, + "learning_rate": 9.993012311957867e-06, + "loss": 1.08381758, + "memory(GiB)": 292.62, + "step": 23600, + "train_speed(iter/s)": 0.127195 + }, + { + "acc": 0.72971396, + "epoch": 0.1320942275885114, + "grad_norm": 10.25, + "learning_rate": 9.992963355915683e-06, + "loss": 1.06661406, + "memory(GiB)": 292.62, + "step": 23620, + "train_speed(iter/s)": 0.12724 + }, + { + "acc": 0.7124198, + "epoch": 0.13220607706149065, + "grad_norm": 9.0625, + "learning_rate": 9.992914229098323e-06, + "loss": 1.15926456, + "memory(GiB)": 292.62, + "step": 23640, + "train_speed(iter/s)": 0.127287 + }, + { + "acc": 0.73398156, + "epoch": 0.13231792653446992, + "grad_norm": 4.71875, + "learning_rate": 9.992864931507462e-06, + "loss": 1.04668169, + "memory(GiB)": 292.62, + "step": 23660, + "train_speed(iter/s)": 0.127334 + }, + { + "acc": 0.72147374, + "epoch": 0.13242977600744918, + "grad_norm": 7.53125, + "learning_rate": 9.99281546314479e-06, + "loss": 1.1262475, + "memory(GiB)": 292.62, + "step": 23680, + "train_speed(iter/s)": 0.127381 + }, + { + "acc": 0.72843175, + "epoch": 0.13254162548042844, + "grad_norm": 7.21875, + "learning_rate": 9.992765824011996e-06, + "loss": 1.07747307, + "memory(GiB)": 292.62, + "step": 23700, + "train_speed(iter/s)": 0.127428 + }, + { + "acc": 0.7467782, + "epoch": 0.1326534749534077, + "grad_norm": 6.65625, + "learning_rate": 9.99271601411078e-06, + "loss": 1.03505783, + "memory(GiB)": 292.62, + "step": 23720, + "train_speed(iter/s)": 0.127478 + }, + { + "acc": 0.72641697, + "epoch": 0.13276532442638697, + "grad_norm": 7.0625, + "learning_rate": 9.992666033442843e-06, + "loss": 1.07920361, + "memory(GiB)": 292.62, + "step": 23740, + "train_speed(iter/s)": 0.127529 + }, + { + "acc": 0.74036832, + "epoch": 0.13287717389936624, + "grad_norm": 7.5625, + "learning_rate": 9.992615882009898e-06, + "loss": 1.02420216, + "memory(GiB)": 292.62, + "step": 23760, + "train_speed(iter/s)": 0.127577 + }, + { + "acc": 0.72240939, + "epoch": 0.1329890233723455, + "grad_norm": 4.59375, + "learning_rate": 9.992565559813659e-06, + "loss": 1.09648666, + "memory(GiB)": 292.62, + "step": 23780, + "train_speed(iter/s)": 0.127626 + }, + { + "acc": 0.73374171, + "epoch": 0.13310087284532476, + "grad_norm": 6.71875, + "learning_rate": 9.992515066855845e-06, + "loss": 1.03072014, + "memory(GiB)": 292.62, + "step": 23800, + "train_speed(iter/s)": 0.127675 + }, + { + "acc": 0.73133845, + "epoch": 0.13321272231830403, + "grad_norm": 5.4375, + "learning_rate": 9.992464403138187e-06, + "loss": 1.05573797, + "memory(GiB)": 292.62, + "step": 23820, + "train_speed(iter/s)": 0.127718 + }, + { + "acc": 0.72996631, + "epoch": 0.1333245717912833, + "grad_norm": 6.21875, + "learning_rate": 9.992413568662417e-06, + "loss": 1.06975965, + "memory(GiB)": 292.62, + "step": 23840, + "train_speed(iter/s)": 0.127765 + }, + { + "acc": 0.73541131, + "epoch": 0.13343642126426256, + "grad_norm": 9.0625, + "learning_rate": 9.992362563430269e-06, + "loss": 1.03971643, + "memory(GiB)": 292.62, + "step": 23860, + "train_speed(iter/s)": 0.127814 + }, + { + "acc": 0.72466874, + "epoch": 0.13354827073724182, + "grad_norm": 6.28125, + "learning_rate": 9.992311387443491e-06, + "loss": 1.11408787, + "memory(GiB)": 292.62, + "step": 23880, + "train_speed(iter/s)": 0.127862 + }, + { + "acc": 0.73125167, + "epoch": 0.13366012021022108, + "grad_norm": 7.5, + "learning_rate": 9.992260040703836e-06, + "loss": 1.07046537, + "memory(GiB)": 292.62, + "step": 23900, + "train_speed(iter/s)": 0.12791 + }, + { + "acc": 0.72252712, + "epoch": 0.13377196968320035, + "grad_norm": 9.3125, + "learning_rate": 9.992208523213055e-06, + "loss": 1.10318022, + "memory(GiB)": 292.62, + "step": 23920, + "train_speed(iter/s)": 0.127959 + }, + { + "acc": 0.71578007, + "epoch": 0.1338838191561796, + "grad_norm": 9.0, + "learning_rate": 9.992156834972914e-06, + "loss": 1.12227068, + "memory(GiB)": 292.62, + "step": 23940, + "train_speed(iter/s)": 0.128008 + }, + { + "acc": 0.73644557, + "epoch": 0.13399566862915888, + "grad_norm": 6.25, + "learning_rate": 9.99210497598518e-06, + "loss": 1.050776, + "memory(GiB)": 292.62, + "step": 23960, + "train_speed(iter/s)": 0.128057 + }, + { + "acc": 0.71721492, + "epoch": 0.13410751810213814, + "grad_norm": 6.09375, + "learning_rate": 9.992052946251626e-06, + "loss": 1.13406572, + "memory(GiB)": 292.62, + "step": 23980, + "train_speed(iter/s)": 0.128106 + }, + { + "acc": 0.72245846, + "epoch": 0.1342193675751174, + "grad_norm": 7.5, + "learning_rate": 9.992000745774031e-06, + "loss": 1.13571301, + "memory(GiB)": 292.62, + "step": 24000, + "train_speed(iter/s)": 0.128155 + }, + { + "epoch": 0.1342193675751174, + "eval_acc": 0.6902301361311034, + "eval_loss": 1.082785725593567, + "eval_runtime": 7543.3305, + "eval_samples_per_second": 9.98, + "eval_steps_per_second": 9.98, + "step": 24000 + }, + { + "acc": 0.73184867, + "epoch": 0.13433121704809667, + "grad_norm": 7.5625, + "learning_rate": 9.991948374554183e-06, + "loss": 1.08117685, + "memory(GiB)": 292.62, + "step": 24020, + "train_speed(iter/s)": 0.12318 + }, + { + "acc": 0.7419848, + "epoch": 0.13444306652107593, + "grad_norm": 6.78125, + "learning_rate": 9.99189583259387e-06, + "loss": 1.01968927, + "memory(GiB)": 292.62, + "step": 24040, + "train_speed(iter/s)": 0.12323 + }, + { + "acc": 0.74483509, + "epoch": 0.1345549159940552, + "grad_norm": 6.875, + "learning_rate": 9.991843119894892e-06, + "loss": 0.99742298, + "memory(GiB)": 292.62, + "step": 24060, + "train_speed(iter/s)": 0.123279 + }, + { + "acc": 0.71700349, + "epoch": 0.13466676546703446, + "grad_norm": 6.1875, + "learning_rate": 9.99179023645905e-06, + "loss": 1.13560019, + "memory(GiB)": 292.62, + "step": 24080, + "train_speed(iter/s)": 0.123329 + }, + { + "acc": 0.74433203, + "epoch": 0.13477861494001372, + "grad_norm": 8.0625, + "learning_rate": 9.991737182288152e-06, + "loss": 1.00916576, + "memory(GiB)": 292.62, + "step": 24100, + "train_speed(iter/s)": 0.123377 + }, + { + "acc": 0.74464602, + "epoch": 0.134890464412993, + "grad_norm": 9.9375, + "learning_rate": 9.991683957384015e-06, + "loss": 0.99593592, + "memory(GiB)": 292.62, + "step": 24120, + "train_speed(iter/s)": 0.123428 + }, + { + "acc": 0.71355872, + "epoch": 0.13500231388597225, + "grad_norm": 8.25, + "learning_rate": 9.991630561748459e-06, + "loss": 1.14787674, + "memory(GiB)": 292.62, + "step": 24140, + "train_speed(iter/s)": 0.123478 + }, + { + "acc": 0.73258276, + "epoch": 0.13511416335895152, + "grad_norm": 9.375, + "learning_rate": 9.99157699538331e-06, + "loss": 1.0519042, + "memory(GiB)": 292.62, + "step": 24160, + "train_speed(iter/s)": 0.123528 + }, + { + "acc": 0.73705549, + "epoch": 0.13522601283193078, + "grad_norm": 6.0, + "learning_rate": 9.991523258290399e-06, + "loss": 1.03214207, + "memory(GiB)": 292.62, + "step": 24180, + "train_speed(iter/s)": 0.123579 + }, + { + "acc": 0.73009028, + "epoch": 0.13533786230491004, + "grad_norm": 6.65625, + "learning_rate": 9.991469350471565e-06, + "loss": 1.07933826, + "memory(GiB)": 292.62, + "step": 24200, + "train_speed(iter/s)": 0.123625 + }, + { + "acc": 0.71146345, + "epoch": 0.1354497117778893, + "grad_norm": 6.3125, + "learning_rate": 9.99141527192865e-06, + "loss": 1.13653698, + "memory(GiB)": 292.62, + "step": 24220, + "train_speed(iter/s)": 0.123671 + }, + { + "acc": 0.71992087, + "epoch": 0.13556156125086857, + "grad_norm": 5.5625, + "learning_rate": 9.991361022663509e-06, + "loss": 1.10739975, + "memory(GiB)": 292.62, + "step": 24240, + "train_speed(iter/s)": 0.123717 + }, + { + "acc": 0.72726469, + "epoch": 0.13567341072384784, + "grad_norm": 7.9375, + "learning_rate": 9.991306602677991e-06, + "loss": 1.09080038, + "memory(GiB)": 292.62, + "step": 24260, + "train_speed(iter/s)": 0.123761 + }, + { + "acc": 0.7204195, + "epoch": 0.1357852601968271, + "grad_norm": 5.78125, + "learning_rate": 9.991252011973962e-06, + "loss": 1.11691961, + "memory(GiB)": 292.62, + "step": 24280, + "train_speed(iter/s)": 0.123811 + }, + { + "acc": 0.74277229, + "epoch": 0.13589710966980637, + "grad_norm": 4.84375, + "learning_rate": 9.991197250553286e-06, + "loss": 1.02108583, + "memory(GiB)": 292.62, + "step": 24300, + "train_speed(iter/s)": 0.123856 + }, + { + "acc": 0.72692676, + "epoch": 0.13600895914278563, + "grad_norm": 9.0625, + "learning_rate": 9.991142318417838e-06, + "loss": 1.07186861, + "memory(GiB)": 292.62, + "step": 24320, + "train_speed(iter/s)": 0.123903 + }, + { + "acc": 0.73027782, + "epoch": 0.1361208086157649, + "grad_norm": 5.90625, + "learning_rate": 9.991087215569497e-06, + "loss": 1.05343475, + "memory(GiB)": 292.62, + "step": 24340, + "train_speed(iter/s)": 0.123951 + }, + { + "acc": 0.736482, + "epoch": 0.13623265808874416, + "grad_norm": 9.3125, + "learning_rate": 9.991031942010145e-06, + "loss": 1.05257673, + "memory(GiB)": 292.62, + "step": 24360, + "train_speed(iter/s)": 0.123997 + }, + { + "acc": 0.73235688, + "epoch": 0.13634450756172342, + "grad_norm": 7.78125, + "learning_rate": 9.990976497741675e-06, + "loss": 1.08422327, + "memory(GiB)": 292.62, + "step": 24380, + "train_speed(iter/s)": 0.124046 + }, + { + "acc": 0.73417444, + "epoch": 0.13645635703470269, + "grad_norm": 6.90625, + "learning_rate": 9.990920882765985e-06, + "loss": 1.04637337, + "memory(GiB)": 292.62, + "step": 24400, + "train_speed(iter/s)": 0.124091 + }, + { + "acc": 0.72770667, + "epoch": 0.13656820650768195, + "grad_norm": 5.6875, + "learning_rate": 9.990865097084972e-06, + "loss": 1.09044962, + "memory(GiB)": 292.62, + "step": 24420, + "train_speed(iter/s)": 0.12414 + }, + { + "acc": 0.71938996, + "epoch": 0.1366800559806612, + "grad_norm": 4.84375, + "learning_rate": 9.990809140700549e-06, + "loss": 1.1124958, + "memory(GiB)": 292.62, + "step": 24440, + "train_speed(iter/s)": 0.124186 + }, + { + "acc": 0.73314977, + "epoch": 0.13679190545364048, + "grad_norm": 6.78125, + "learning_rate": 9.990753013614627e-06, + "loss": 1.04612265, + "memory(GiB)": 292.62, + "step": 24460, + "train_speed(iter/s)": 0.124232 + }, + { + "acc": 0.74389815, + "epoch": 0.13690375492661974, + "grad_norm": 6.875, + "learning_rate": 9.990696715829127e-06, + "loss": 1.00397129, + "memory(GiB)": 292.62, + "step": 24480, + "train_speed(iter/s)": 0.124281 + }, + { + "acc": 0.71129646, + "epoch": 0.137015604399599, + "grad_norm": 7.53125, + "learning_rate": 9.990640247345975e-06, + "loss": 1.16178703, + "memory(GiB)": 292.62, + "step": 24500, + "train_speed(iter/s)": 0.124328 + }, + { + "acc": 0.72729526, + "epoch": 0.1371274538725783, + "grad_norm": 5.78125, + "learning_rate": 9.9905836081671e-06, + "loss": 1.07978277, + "memory(GiB)": 292.62, + "step": 24520, + "train_speed(iter/s)": 0.124377 + }, + { + "acc": 0.7261652, + "epoch": 0.13723930334555756, + "grad_norm": 6.90625, + "learning_rate": 9.990526798294443e-06, + "loss": 1.10138302, + "memory(GiB)": 292.62, + "step": 24540, + "train_speed(iter/s)": 0.124422 + }, + { + "acc": 0.73769369, + "epoch": 0.13735115281853683, + "grad_norm": 4.5, + "learning_rate": 9.990469817729943e-06, + "loss": 1.05488501, + "memory(GiB)": 292.62, + "step": 24560, + "train_speed(iter/s)": 0.124469 + }, + { + "acc": 0.75137491, + "epoch": 0.1374630022915161, + "grad_norm": 6.6875, + "learning_rate": 9.990412666475551e-06, + "loss": 0.96524391, + "memory(GiB)": 292.62, + "step": 24580, + "train_speed(iter/s)": 0.124518 + }, + { + "acc": 0.74366465, + "epoch": 0.13757485176449535, + "grad_norm": 4.5625, + "learning_rate": 9.990355344533223e-06, + "loss": 1.01758413, + "memory(GiB)": 292.62, + "step": 24600, + "train_speed(iter/s)": 0.124563 + }, + { + "acc": 0.74050756, + "epoch": 0.13768670123747462, + "grad_norm": 6.09375, + "learning_rate": 9.990297851904918e-06, + "loss": 1.02760973, + "memory(GiB)": 292.62, + "step": 24620, + "train_speed(iter/s)": 0.124612 + }, + { + "acc": 0.73627419, + "epoch": 0.13779855071045388, + "grad_norm": 7.125, + "learning_rate": 9.990240188592601e-06, + "loss": 1.04200182, + "memory(GiB)": 292.62, + "step": 24640, + "train_speed(iter/s)": 0.124659 + }, + { + "acc": 0.72739201, + "epoch": 0.13791040018343315, + "grad_norm": 8.4375, + "learning_rate": 9.990182354598248e-06, + "loss": 1.09117508, + "memory(GiB)": 292.62, + "step": 24660, + "train_speed(iter/s)": 0.124706 + }, + { + "acc": 0.72869191, + "epoch": 0.1380222496564124, + "grad_norm": 10.8125, + "learning_rate": 9.990124349923834e-06, + "loss": 1.06069078, + "memory(GiB)": 292.62, + "step": 24680, + "train_speed(iter/s)": 0.124755 + }, + { + "acc": 0.72976894, + "epoch": 0.13813409912939167, + "grad_norm": 6.03125, + "learning_rate": 9.990066174571344e-06, + "loss": 1.08644753, + "memory(GiB)": 292.62, + "step": 24700, + "train_speed(iter/s)": 0.124804 + }, + { + "acc": 0.72739263, + "epoch": 0.13824594860237094, + "grad_norm": 7.78125, + "learning_rate": 9.990007828542769e-06, + "loss": 1.08134394, + "memory(GiB)": 292.62, + "step": 24720, + "train_speed(iter/s)": 0.124853 + }, + { + "acc": 0.73282213, + "epoch": 0.1383577980753502, + "grad_norm": 6.90625, + "learning_rate": 9.989949311840102e-06, + "loss": 1.04219141, + "memory(GiB)": 292.62, + "step": 24740, + "train_speed(iter/s)": 0.124901 + }, + { + "acc": 0.7412816, + "epoch": 0.13846964754832947, + "grad_norm": 9.0625, + "learning_rate": 9.989890624465345e-06, + "loss": 1.02477407, + "memory(GiB)": 292.62, + "step": 24760, + "train_speed(iter/s)": 0.124946 + }, + { + "acc": 0.71920562, + "epoch": 0.13858149702130873, + "grad_norm": 7.21875, + "learning_rate": 9.989831766420509e-06, + "loss": 1.12770767, + "memory(GiB)": 292.62, + "step": 24780, + "train_speed(iter/s)": 0.124993 + }, + { + "acc": 0.73853898, + "epoch": 0.138693346494288, + "grad_norm": 5.71875, + "learning_rate": 9.989772737707601e-06, + "loss": 1.02786484, + "memory(GiB)": 292.62, + "step": 24800, + "train_speed(iter/s)": 0.125038 + }, + { + "acc": 0.72360425, + "epoch": 0.13880519596726726, + "grad_norm": 5.6875, + "learning_rate": 9.989713538328645e-06, + "loss": 1.12305479, + "memory(GiB)": 292.62, + "step": 24820, + "train_speed(iter/s)": 0.125087 + }, + { + "acc": 0.72300234, + "epoch": 0.13891704544024652, + "grad_norm": 5.34375, + "learning_rate": 9.989654168285664e-06, + "loss": 1.09317589, + "memory(GiB)": 292.62, + "step": 24840, + "train_speed(iter/s)": 0.125131 + }, + { + "acc": 0.720856, + "epoch": 0.1390288949132258, + "grad_norm": 7.375, + "learning_rate": 9.989594627580687e-06, + "loss": 1.12400894, + "memory(GiB)": 292.62, + "step": 24860, + "train_speed(iter/s)": 0.125173 + }, + { + "acc": 0.7281713, + "epoch": 0.13914074438620505, + "grad_norm": 10.4375, + "learning_rate": 9.989534916215755e-06, + "loss": 1.07065182, + "memory(GiB)": 292.62, + "step": 24880, + "train_speed(iter/s)": 0.125221 + }, + { + "acc": 0.72442303, + "epoch": 0.13925259385918431, + "grad_norm": 6.90625, + "learning_rate": 9.989475034192905e-06, + "loss": 1.10539112, + "memory(GiB)": 292.62, + "step": 24900, + "train_speed(iter/s)": 0.125266 + }, + { + "acc": 0.72851596, + "epoch": 0.13936444333216358, + "grad_norm": 5.3125, + "learning_rate": 9.98941498151419e-06, + "loss": 1.04295988, + "memory(GiB)": 292.62, + "step": 24920, + "train_speed(iter/s)": 0.125315 + }, + { + "acc": 0.7235086, + "epoch": 0.13947629280514284, + "grad_norm": 4.3125, + "learning_rate": 9.98935475818166e-06, + "loss": 1.08443203, + "memory(GiB)": 292.62, + "step": 24940, + "train_speed(iter/s)": 0.125362 + }, + { + "acc": 0.71044049, + "epoch": 0.1395881422781221, + "grad_norm": 6.5, + "learning_rate": 9.989294364197377e-06, + "loss": 1.1457531, + "memory(GiB)": 292.62, + "step": 24960, + "train_speed(iter/s)": 0.125404 + }, + { + "acc": 0.74111395, + "epoch": 0.13969999175110137, + "grad_norm": 8.8125, + "learning_rate": 9.989233799563406e-06, + "loss": 1.01969528, + "memory(GiB)": 292.62, + "step": 24980, + "train_speed(iter/s)": 0.125452 + }, + { + "acc": 0.72639122, + "epoch": 0.13981184122408064, + "grad_norm": 9.0625, + "learning_rate": 9.989173064281819e-06, + "loss": 1.09476786, + "memory(GiB)": 292.62, + "step": 25000, + "train_speed(iter/s)": 0.125497 + }, + { + "acc": 0.72700443, + "epoch": 0.1399236906970599, + "grad_norm": 8.625, + "learning_rate": 9.989112158354692e-06, + "loss": 1.0895133, + "memory(GiB)": 292.62, + "step": 25020, + "train_speed(iter/s)": 0.12554 + }, + { + "acc": 0.73499289, + "epoch": 0.14003554017003916, + "grad_norm": 9.5625, + "learning_rate": 9.989051081784109e-06, + "loss": 1.04550982, + "memory(GiB)": 292.62, + "step": 25040, + "train_speed(iter/s)": 0.125589 + }, + { + "acc": 0.73636308, + "epoch": 0.14014738964301843, + "grad_norm": 7.96875, + "learning_rate": 9.98898983457216e-06, + "loss": 1.0452919, + "memory(GiB)": 292.62, + "step": 25060, + "train_speed(iter/s)": 0.125634 + }, + { + "acc": 0.71650701, + "epoch": 0.1402592391159977, + "grad_norm": 6.03125, + "learning_rate": 9.98892841672094e-06, + "loss": 1.1279664, + "memory(GiB)": 292.62, + "step": 25080, + "train_speed(iter/s)": 0.125677 + }, + { + "acc": 0.7243557, + "epoch": 0.14037108858897696, + "grad_norm": 7.75, + "learning_rate": 9.988866828232547e-06, + "loss": 1.08320589, + "memory(GiB)": 292.62, + "step": 25100, + "train_speed(iter/s)": 0.125722 + }, + { + "acc": 0.73911834, + "epoch": 0.14048293806195622, + "grad_norm": 8.9375, + "learning_rate": 9.988805069109088e-06, + "loss": 1.0279418, + "memory(GiB)": 292.62, + "step": 25120, + "train_speed(iter/s)": 0.12577 + }, + { + "acc": 0.72511377, + "epoch": 0.14059478753493548, + "grad_norm": 8.3125, + "learning_rate": 9.988743139352679e-06, + "loss": 1.09552031, + "memory(GiB)": 292.62, + "step": 25140, + "train_speed(iter/s)": 0.125815 + }, + { + "acc": 0.737149, + "epoch": 0.14070663700791475, + "grad_norm": 8.1875, + "learning_rate": 9.988681038965436e-06, + "loss": 1.02963858, + "memory(GiB)": 292.62, + "step": 25160, + "train_speed(iter/s)": 0.125863 + }, + { + "acc": 0.7279912, + "epoch": 0.140818486480894, + "grad_norm": 10.9375, + "learning_rate": 9.988618767949481e-06, + "loss": 1.09080648, + "memory(GiB)": 292.62, + "step": 25180, + "train_speed(iter/s)": 0.125907 + }, + { + "acc": 0.71972227, + "epoch": 0.14093033595387328, + "grad_norm": 7.625, + "learning_rate": 9.988556326306947e-06, + "loss": 1.10633421, + "memory(GiB)": 292.62, + "step": 25200, + "train_speed(iter/s)": 0.125953 + }, + { + "acc": 0.7193759, + "epoch": 0.14104218542685254, + "grad_norm": 9.1875, + "learning_rate": 9.988493714039969e-06, + "loss": 1.08589296, + "memory(GiB)": 292.62, + "step": 25220, + "train_speed(iter/s)": 0.125999 + }, + { + "acc": 0.72491965, + "epoch": 0.1411540348998318, + "grad_norm": 7.53125, + "learning_rate": 9.988430931150686e-06, + "loss": 1.07095432, + "memory(GiB)": 292.62, + "step": 25240, + "train_speed(iter/s)": 0.126047 + }, + { + "acc": 0.72422009, + "epoch": 0.14126588437281107, + "grad_norm": 5.28125, + "learning_rate": 9.988367977641247e-06, + "loss": 1.09066448, + "memory(GiB)": 292.62, + "step": 25260, + "train_speed(iter/s)": 0.126093 + }, + { + "acc": 0.73718634, + "epoch": 0.14137773384579033, + "grad_norm": 6.03125, + "learning_rate": 9.988304853513806e-06, + "loss": 1.04390278, + "memory(GiB)": 292.62, + "step": 25280, + "train_speed(iter/s)": 0.126138 + }, + { + "acc": 0.71845827, + "epoch": 0.1414895833187696, + "grad_norm": 5.25, + "learning_rate": 9.988241558770523e-06, + "loss": 1.13119268, + "memory(GiB)": 292.62, + "step": 25300, + "train_speed(iter/s)": 0.126183 + }, + { + "acc": 0.73067493, + "epoch": 0.14160143279174886, + "grad_norm": 6.53125, + "learning_rate": 9.98817809341356e-06, + "loss": 1.05909643, + "memory(GiB)": 292.62, + "step": 25320, + "train_speed(iter/s)": 0.126227 + }, + { + "acc": 0.73558455, + "epoch": 0.14171328226472812, + "grad_norm": 5.875, + "learning_rate": 9.98811445744509e-06, + "loss": 1.03915377, + "memory(GiB)": 292.62, + "step": 25340, + "train_speed(iter/s)": 0.126276 + }, + { + "acc": 0.72913179, + "epoch": 0.1418251317377074, + "grad_norm": 9.375, + "learning_rate": 9.988050650867288e-06, + "loss": 1.06754389, + "memory(GiB)": 292.62, + "step": 25360, + "train_speed(iter/s)": 0.126322 + }, + { + "acc": 0.72860899, + "epoch": 0.14193698121068665, + "grad_norm": 7.875, + "learning_rate": 9.987986673682337e-06, + "loss": 1.0904521, + "memory(GiB)": 292.62, + "step": 25380, + "train_speed(iter/s)": 0.126367 + }, + { + "acc": 0.74145627, + "epoch": 0.14204883068366592, + "grad_norm": 5.875, + "learning_rate": 9.987922525892426e-06, + "loss": 1.00413017, + "memory(GiB)": 292.62, + "step": 25400, + "train_speed(iter/s)": 0.126414 + }, + { + "acc": 0.72958736, + "epoch": 0.14216068015664518, + "grad_norm": 8.25, + "learning_rate": 9.987858207499748e-06, + "loss": 1.07467613, + "memory(GiB)": 292.62, + "step": 25420, + "train_speed(iter/s)": 0.126459 + }, + { + "acc": 0.73155146, + "epoch": 0.14227252962962444, + "grad_norm": 6.875, + "learning_rate": 9.987793718506503e-06, + "loss": 1.03218203, + "memory(GiB)": 292.62, + "step": 25440, + "train_speed(iter/s)": 0.126505 + }, + { + "acc": 0.71898961, + "epoch": 0.1423843791026037, + "grad_norm": 7.84375, + "learning_rate": 9.987729058914897e-06, + "loss": 1.13877687, + "memory(GiB)": 292.62, + "step": 25460, + "train_speed(iter/s)": 0.126549 + }, + { + "acc": 0.75302777, + "epoch": 0.14249622857558297, + "grad_norm": 9.75, + "learning_rate": 9.98766422872714e-06, + "loss": 0.95342093, + "memory(GiB)": 292.62, + "step": 25480, + "train_speed(iter/s)": 0.126596 + }, + { + "acc": 0.73962908, + "epoch": 0.14260807804856224, + "grad_norm": 6.0, + "learning_rate": 9.987599227945453e-06, + "loss": 1.0153573, + "memory(GiB)": 292.62, + "step": 25500, + "train_speed(iter/s)": 0.126636 + }, + { + "acc": 0.73304667, + "epoch": 0.1427199275215415, + "grad_norm": 12.0, + "learning_rate": 9.987534056572057e-06, + "loss": 1.06001358, + "memory(GiB)": 292.62, + "step": 25520, + "train_speed(iter/s)": 0.126682 + }, + { + "acc": 0.73217349, + "epoch": 0.14283177699452076, + "grad_norm": 5.71875, + "learning_rate": 9.987468714609181e-06, + "loss": 1.06032591, + "memory(GiB)": 292.62, + "step": 25540, + "train_speed(iter/s)": 0.126727 + }, + { + "acc": 0.71954098, + "epoch": 0.14294362646750003, + "grad_norm": 6.5, + "learning_rate": 9.987403202059061e-06, + "loss": 1.11835995, + "memory(GiB)": 292.62, + "step": 25560, + "train_speed(iter/s)": 0.126772 + }, + { + "acc": 0.7088119, + "epoch": 0.1430554759404793, + "grad_norm": 5.84375, + "learning_rate": 9.987337518923935e-06, + "loss": 1.1828434, + "memory(GiB)": 292.62, + "step": 25580, + "train_speed(iter/s)": 0.126818 + }, + { + "acc": 0.72275071, + "epoch": 0.14316732541345856, + "grad_norm": 5.53125, + "learning_rate": 9.987271665206054e-06, + "loss": 1.10663223, + "memory(GiB)": 292.62, + "step": 25600, + "train_speed(iter/s)": 0.126862 + }, + { + "acc": 0.71863365, + "epoch": 0.14327917488643782, + "grad_norm": 6.9375, + "learning_rate": 9.987205640907664e-06, + "loss": 1.12317457, + "memory(GiB)": 292.62, + "step": 25620, + "train_speed(iter/s)": 0.126907 + }, + { + "acc": 0.72217317, + "epoch": 0.14339102435941709, + "grad_norm": 4.90625, + "learning_rate": 9.98713944603103e-06, + "loss": 1.09447136, + "memory(GiB)": 292.62, + "step": 25640, + "train_speed(iter/s)": 0.126948 + }, + { + "acc": 0.72985492, + "epoch": 0.14350287383239635, + "grad_norm": 8.5625, + "learning_rate": 9.987073080578413e-06, + "loss": 1.04569921, + "memory(GiB)": 292.62, + "step": 25660, + "train_speed(iter/s)": 0.126993 + }, + { + "acc": 0.72276435, + "epoch": 0.1436147233053756, + "grad_norm": 4.875, + "learning_rate": 9.987006544552082e-06, + "loss": 1.11529999, + "memory(GiB)": 292.62, + "step": 25680, + "train_speed(iter/s)": 0.127038 + }, + { + "acc": 0.73365684, + "epoch": 0.14372657277835488, + "grad_norm": 8.3125, + "learning_rate": 9.986939837954315e-06, + "loss": 1.03660936, + "memory(GiB)": 292.62, + "step": 25700, + "train_speed(iter/s)": 0.127085 + }, + { + "acc": 0.71667562, + "epoch": 0.14383842225133414, + "grad_norm": 6.0625, + "learning_rate": 9.986872960787391e-06, + "loss": 1.14185905, + "memory(GiB)": 292.62, + "step": 25720, + "train_speed(iter/s)": 0.127128 + }, + { + "acc": 0.73105316, + "epoch": 0.1439502717243134, + "grad_norm": 8.75, + "learning_rate": 9.986805913053598e-06, + "loss": 1.07913227, + "memory(GiB)": 292.62, + "step": 25740, + "train_speed(iter/s)": 0.127171 + }, + { + "acc": 0.70640817, + "epoch": 0.1440621211972927, + "grad_norm": 6.28125, + "learning_rate": 9.986738694755232e-06, + "loss": 1.1779932, + "memory(GiB)": 292.62, + "step": 25760, + "train_speed(iter/s)": 0.127217 + }, + { + "acc": 0.71603174, + "epoch": 0.14417397067027196, + "grad_norm": 6.6875, + "learning_rate": 9.986671305894591e-06, + "loss": 1.14926224, + "memory(GiB)": 292.62, + "step": 25780, + "train_speed(iter/s)": 0.127262 + }, + { + "acc": 0.72079411, + "epoch": 0.14428582014325123, + "grad_norm": 7.25, + "learning_rate": 9.986603746473977e-06, + "loss": 1.11228333, + "memory(GiB)": 292.62, + "step": 25800, + "train_speed(iter/s)": 0.127306 + }, + { + "acc": 0.71608071, + "epoch": 0.1443976696162305, + "grad_norm": 7.25, + "learning_rate": 9.986536016495703e-06, + "loss": 1.14229832, + "memory(GiB)": 292.62, + "step": 25820, + "train_speed(iter/s)": 0.127348 + }, + { + "acc": 0.72059717, + "epoch": 0.14450951908920975, + "grad_norm": 5.59375, + "learning_rate": 9.986468115962088e-06, + "loss": 1.11926975, + "memory(GiB)": 292.62, + "step": 25840, + "train_speed(iter/s)": 0.127394 + }, + { + "acc": 0.74266405, + "epoch": 0.14462136856218902, + "grad_norm": 9.4375, + "learning_rate": 9.986400044875448e-06, + "loss": 0.98846455, + "memory(GiB)": 292.62, + "step": 25860, + "train_speed(iter/s)": 0.127439 + }, + { + "acc": 0.72436504, + "epoch": 0.14473321803516828, + "grad_norm": 8.875, + "learning_rate": 9.986331803238116e-06, + "loss": 1.09317913, + "memory(GiB)": 292.62, + "step": 25880, + "train_speed(iter/s)": 0.127483 + }, + { + "acc": 0.72734284, + "epoch": 0.14484506750814755, + "grad_norm": 6.5625, + "learning_rate": 9.986263391052427e-06, + "loss": 1.08694153, + "memory(GiB)": 292.62, + "step": 25900, + "train_speed(iter/s)": 0.127524 + }, + { + "acc": 0.71844454, + "epoch": 0.1449569169811268, + "grad_norm": 7.03125, + "learning_rate": 9.986194808320717e-06, + "loss": 1.13306026, + "memory(GiB)": 292.62, + "step": 25920, + "train_speed(iter/s)": 0.127569 + }, + { + "acc": 0.72494097, + "epoch": 0.14506876645410607, + "grad_norm": 6.96875, + "learning_rate": 9.986126055045333e-06, + "loss": 1.06998577, + "memory(GiB)": 292.62, + "step": 25940, + "train_speed(iter/s)": 0.127614 + }, + { + "acc": 0.70980859, + "epoch": 0.14518061592708534, + "grad_norm": 6.8125, + "learning_rate": 9.98605713122863e-06, + "loss": 1.15335264, + "memory(GiB)": 292.62, + "step": 25960, + "train_speed(iter/s)": 0.127659 + }, + { + "acc": 0.73258638, + "epoch": 0.1452924654000646, + "grad_norm": 7.375, + "learning_rate": 9.985988036872961e-06, + "loss": 1.04620733, + "memory(GiB)": 292.62, + "step": 25980, + "train_speed(iter/s)": 0.127702 + }, + { + "acc": 0.73500648, + "epoch": 0.14540431487304387, + "grad_norm": 8.625, + "learning_rate": 9.985918771980691e-06, + "loss": 1.05528946, + "memory(GiB)": 292.62, + "step": 26000, + "train_speed(iter/s)": 0.127747 + }, + { + "epoch": 0.14540431487304387, + "eval_acc": 0.6912122267072438, + "eval_loss": 1.0786824226379395, + "eval_runtime": 7509.3122, + "eval_samples_per_second": 10.025, + "eval_steps_per_second": 10.025, + "step": 26000 + }, + { + "acc": 0.72084489, + "epoch": 0.14551616434602313, + "grad_norm": 9.4375, + "learning_rate": 9.98584933655419e-06, + "loss": 1.10885019, + "memory(GiB)": 292.62, + "step": 26020, + "train_speed(iter/s)": 0.123189 + }, + { + "acc": 0.72073913, + "epoch": 0.1456280138190024, + "grad_norm": 5.96875, + "learning_rate": 9.985779730595832e-06, + "loss": 1.09350061, + "memory(GiB)": 292.62, + "step": 26040, + "train_speed(iter/s)": 0.123234 + }, + { + "acc": 0.7253509, + "epoch": 0.14573986329198166, + "grad_norm": 8.9375, + "learning_rate": 9.985709954107998e-06, + "loss": 1.07839031, + "memory(GiB)": 292.62, + "step": 26060, + "train_speed(iter/s)": 0.12328 + }, + { + "acc": 0.71791563, + "epoch": 0.14585171276496092, + "grad_norm": 7.78125, + "learning_rate": 9.985640007093073e-06, + "loss": 1.12225647, + "memory(GiB)": 292.62, + "step": 26080, + "train_speed(iter/s)": 0.123326 + }, + { + "acc": 0.73193154, + "epoch": 0.1459635622379402, + "grad_norm": 7.75, + "learning_rate": 9.98556988955345e-06, + "loss": 1.06091366, + "memory(GiB)": 292.62, + "step": 26100, + "train_speed(iter/s)": 0.123372 + }, + { + "acc": 0.73125691, + "epoch": 0.14607541171091945, + "grad_norm": 8.5, + "learning_rate": 9.98549960149153e-06, + "loss": 1.07124052, + "memory(GiB)": 292.62, + "step": 26120, + "train_speed(iter/s)": 0.123416 + }, + { + "acc": 0.72677083, + "epoch": 0.14618726118389871, + "grad_norm": 6.03125, + "learning_rate": 9.985429142909715e-06, + "loss": 1.07001801, + "memory(GiB)": 292.62, + "step": 26140, + "train_speed(iter/s)": 0.123462 + }, + { + "acc": 0.73199434, + "epoch": 0.14629911065687798, + "grad_norm": 5.09375, + "learning_rate": 9.985358513810416e-06, + "loss": 1.05176239, + "memory(GiB)": 292.62, + "step": 26160, + "train_speed(iter/s)": 0.123508 + }, + { + "acc": 0.71569734, + "epoch": 0.14641096012985724, + "grad_norm": 4.84375, + "learning_rate": 9.985287714196043e-06, + "loss": 1.12589436, + "memory(GiB)": 292.62, + "step": 26180, + "train_speed(iter/s)": 0.12355 + }, + { + "acc": 0.7309679, + "epoch": 0.1465228096028365, + "grad_norm": 7.59375, + "learning_rate": 9.985216744069026e-06, + "loss": 1.05921679, + "memory(GiB)": 292.62, + "step": 26200, + "train_speed(iter/s)": 0.123592 + }, + { + "acc": 0.73795657, + "epoch": 0.14663465907581577, + "grad_norm": 9.3125, + "learning_rate": 9.985145603431788e-06, + "loss": 1.04059496, + "memory(GiB)": 292.62, + "step": 26220, + "train_speed(iter/s)": 0.123634 + }, + { + "acc": 0.73010397, + "epoch": 0.14674650854879503, + "grad_norm": 6.84375, + "learning_rate": 9.985074292286763e-06, + "loss": 1.04703636, + "memory(GiB)": 292.62, + "step": 26240, + "train_speed(iter/s)": 0.123675 + }, + { + "acc": 0.73399973, + "epoch": 0.1468583580217743, + "grad_norm": 9.1875, + "learning_rate": 9.985002810636391e-06, + "loss": 1.04502659, + "memory(GiB)": 292.62, + "step": 26260, + "train_speed(iter/s)": 0.123722 + }, + { + "acc": 0.72192307, + "epoch": 0.14697020749475356, + "grad_norm": 5.21875, + "learning_rate": 9.984931158483113e-06, + "loss": 1.09485788, + "memory(GiB)": 292.62, + "step": 26280, + "train_speed(iter/s)": 0.123767 + }, + { + "acc": 0.73065143, + "epoch": 0.14708205696773283, + "grad_norm": 8.875, + "learning_rate": 9.984859335829384e-06, + "loss": 1.06654329, + "memory(GiB)": 292.62, + "step": 26300, + "train_speed(iter/s)": 0.12381 + }, + { + "acc": 0.72905197, + "epoch": 0.1471939064407121, + "grad_norm": 7.71875, + "learning_rate": 9.984787342677659e-06, + "loss": 1.06827259, + "memory(GiB)": 292.62, + "step": 26320, + "train_speed(iter/s)": 0.123852 + }, + { + "acc": 0.72678065, + "epoch": 0.14730575591369136, + "grad_norm": 7.15625, + "learning_rate": 9.984715179030402e-06, + "loss": 1.07301521, + "memory(GiB)": 292.62, + "step": 26340, + "train_speed(iter/s)": 0.123896 + }, + { + "acc": 0.7500174, + "epoch": 0.14741760538667062, + "grad_norm": 8.1875, + "learning_rate": 9.984642844890078e-06, + "loss": 0.98393173, + "memory(GiB)": 292.62, + "step": 26360, + "train_speed(iter/s)": 0.123941 + }, + { + "acc": 0.73371582, + "epoch": 0.14752945485964988, + "grad_norm": 7.53125, + "learning_rate": 9.984570340259165e-06, + "loss": 1.05858383, + "memory(GiB)": 292.62, + "step": 26380, + "train_speed(iter/s)": 0.123984 + }, + { + "acc": 0.72427983, + "epoch": 0.14764130433262915, + "grad_norm": 7.0625, + "learning_rate": 9.984497665140138e-06, + "loss": 1.08982706, + "memory(GiB)": 292.62, + "step": 26400, + "train_speed(iter/s)": 0.124029 + }, + { + "acc": 0.73295684, + "epoch": 0.1477531538056084, + "grad_norm": 5.6875, + "learning_rate": 9.984424819535487e-06, + "loss": 1.04633617, + "memory(GiB)": 292.62, + "step": 26420, + "train_speed(iter/s)": 0.124071 + }, + { + "acc": 0.72581987, + "epoch": 0.14786500327858768, + "grad_norm": 6.25, + "learning_rate": 9.984351803447703e-06, + "loss": 1.06975431, + "memory(GiB)": 292.62, + "step": 26440, + "train_speed(iter/s)": 0.124116 + }, + { + "acc": 0.72518797, + "epoch": 0.14797685275156694, + "grad_norm": 6.6875, + "learning_rate": 9.98427861687928e-06, + "loss": 1.07539253, + "memory(GiB)": 292.62, + "step": 26460, + "train_speed(iter/s)": 0.124156 + }, + { + "acc": 0.72770863, + "epoch": 0.1480887022245462, + "grad_norm": 5.875, + "learning_rate": 9.984205259832724e-06, + "loss": 1.09877043, + "memory(GiB)": 292.62, + "step": 26480, + "train_speed(iter/s)": 0.124192 + }, + { + "acc": 0.72766495, + "epoch": 0.14820055169752547, + "grad_norm": 7.40625, + "learning_rate": 9.984131732310545e-06, + "loss": 1.09156075, + "memory(GiB)": 292.62, + "step": 26500, + "train_speed(iter/s)": 0.124233 + }, + { + "acc": 0.73650117, + "epoch": 0.14831240117050473, + "grad_norm": 5.9375, + "learning_rate": 9.984058034315256e-06, + "loss": 1.04170771, + "memory(GiB)": 292.62, + "step": 26520, + "train_speed(iter/s)": 0.124277 + }, + { + "acc": 0.71384802, + "epoch": 0.148424250643484, + "grad_norm": 6.0, + "learning_rate": 9.983984165849377e-06, + "loss": 1.13753242, + "memory(GiB)": 292.62, + "step": 26540, + "train_speed(iter/s)": 0.12432 + }, + { + "acc": 0.73162398, + "epoch": 0.14853610011646326, + "grad_norm": 5.9375, + "learning_rate": 9.983910126915438e-06, + "loss": 1.04942408, + "memory(GiB)": 292.62, + "step": 26560, + "train_speed(iter/s)": 0.124364 + }, + { + "acc": 0.72837806, + "epoch": 0.14864794958944252, + "grad_norm": 6.5, + "learning_rate": 9.983835917515966e-06, + "loss": 1.10356197, + "memory(GiB)": 292.62, + "step": 26580, + "train_speed(iter/s)": 0.124406 + }, + { + "acc": 0.73699942, + "epoch": 0.1487597990624218, + "grad_norm": 7.09375, + "learning_rate": 9.983761537653505e-06, + "loss": 1.03515015, + "memory(GiB)": 292.62, + "step": 26600, + "train_speed(iter/s)": 0.12445 + }, + { + "acc": 0.724333, + "epoch": 0.14887164853540105, + "grad_norm": 5.625, + "learning_rate": 9.983686987330596e-06, + "loss": 1.0821619, + "memory(GiB)": 292.62, + "step": 26620, + "train_speed(iter/s)": 0.124493 + }, + { + "acc": 0.73522229, + "epoch": 0.14898349800838032, + "grad_norm": 5.15625, + "learning_rate": 9.983612266549788e-06, + "loss": 1.03210878, + "memory(GiB)": 292.62, + "step": 26640, + "train_speed(iter/s)": 0.124536 + }, + { + "acc": 0.72581182, + "epoch": 0.14909534748135958, + "grad_norm": 6.9375, + "learning_rate": 9.98353737531364e-06, + "loss": 1.05628977, + "memory(GiB)": 292.62, + "step": 26660, + "train_speed(iter/s)": 0.124582 + }, + { + "acc": 0.72528176, + "epoch": 0.14920719695433884, + "grad_norm": 6.96875, + "learning_rate": 9.983462313624709e-06, + "loss": 1.075846, + "memory(GiB)": 292.62, + "step": 26680, + "train_speed(iter/s)": 0.124625 + }, + { + "acc": 0.73490133, + "epoch": 0.1493190464273181, + "grad_norm": 7.78125, + "learning_rate": 9.983387081485565e-06, + "loss": 1.04032278, + "memory(GiB)": 292.62, + "step": 26700, + "train_speed(iter/s)": 0.124666 + }, + { + "acc": 0.7190877, + "epoch": 0.14943089590029737, + "grad_norm": 7.625, + "learning_rate": 9.983311678898782e-06, + "loss": 1.12389364, + "memory(GiB)": 292.62, + "step": 26720, + "train_speed(iter/s)": 0.124707 + }, + { + "acc": 0.73702836, + "epoch": 0.14954274537327664, + "grad_norm": 9.8125, + "learning_rate": 9.983236105866938e-06, + "loss": 1.04667549, + "memory(GiB)": 292.62, + "step": 26740, + "train_speed(iter/s)": 0.124751 + }, + { + "acc": 0.73056784, + "epoch": 0.1496545948462559, + "grad_norm": 6.0, + "learning_rate": 9.983160362392616e-06, + "loss": 1.06142826, + "memory(GiB)": 292.62, + "step": 26760, + "train_speed(iter/s)": 0.124798 + }, + { + "acc": 0.73593521, + "epoch": 0.14976644431923516, + "grad_norm": 6.59375, + "learning_rate": 9.98308444847841e-06, + "loss": 1.01769829, + "memory(GiB)": 292.62, + "step": 26780, + "train_speed(iter/s)": 0.124841 + }, + { + "acc": 0.72310734, + "epoch": 0.14987829379221443, + "grad_norm": 4.46875, + "learning_rate": 9.983008364126915e-06, + "loss": 1.10596762, + "memory(GiB)": 292.62, + "step": 26800, + "train_speed(iter/s)": 0.124885 + }, + { + "acc": 0.721346, + "epoch": 0.1499901432651937, + "grad_norm": 7.46875, + "learning_rate": 9.982932109340733e-06, + "loss": 1.1105381, + "memory(GiB)": 292.62, + "step": 26820, + "train_speed(iter/s)": 0.124929 + }, + { + "acc": 0.72439027, + "epoch": 0.15010199273817296, + "grad_norm": 7.5625, + "learning_rate": 9.982855684122473e-06, + "loss": 1.08031807, + "memory(GiB)": 292.62, + "step": 26840, + "train_speed(iter/s)": 0.124974 + }, + { + "acc": 0.73338652, + "epoch": 0.15021384221115222, + "grad_norm": 6.5625, + "learning_rate": 9.982779088474747e-06, + "loss": 1.04428778, + "memory(GiB)": 292.62, + "step": 26860, + "train_speed(iter/s)": 0.125018 + }, + { + "acc": 0.73286061, + "epoch": 0.15032569168413148, + "grad_norm": 7.15625, + "learning_rate": 9.982702322400178e-06, + "loss": 1.07942762, + "memory(GiB)": 292.62, + "step": 26880, + "train_speed(iter/s)": 0.125062 + }, + { + "acc": 0.74023194, + "epoch": 0.15043754115711075, + "grad_norm": 9.0625, + "learning_rate": 9.982625385901387e-06, + "loss": 1.03237228, + "memory(GiB)": 292.62, + "step": 26900, + "train_speed(iter/s)": 0.125105 + }, + { + "acc": 0.74086657, + "epoch": 0.15054939063009, + "grad_norm": 5.75, + "learning_rate": 9.98254827898101e-06, + "loss": 1.03389969, + "memory(GiB)": 292.62, + "step": 26920, + "train_speed(iter/s)": 0.125145 + }, + { + "acc": 0.744945, + "epoch": 0.15066124010306928, + "grad_norm": 7.21875, + "learning_rate": 9.982471001641682e-06, + "loss": 1.01823816, + "memory(GiB)": 292.62, + "step": 26940, + "train_speed(iter/s)": 0.12519 + }, + { + "acc": 0.73420191, + "epoch": 0.15077308957604854, + "grad_norm": 6.4375, + "learning_rate": 9.982393553886049e-06, + "loss": 1.03613958, + "memory(GiB)": 292.62, + "step": 26960, + "train_speed(iter/s)": 0.12523 + }, + { + "acc": 0.72712507, + "epoch": 0.1508849390490278, + "grad_norm": 9.25, + "learning_rate": 9.982315935716755e-06, + "loss": 1.07859077, + "memory(GiB)": 292.62, + "step": 26980, + "train_speed(iter/s)": 0.125271 + }, + { + "acc": 0.72178068, + "epoch": 0.15099678852200707, + "grad_norm": 8.0, + "learning_rate": 9.98223814713646e-06, + "loss": 1.1060462, + "memory(GiB)": 292.62, + "step": 27000, + "train_speed(iter/s)": 0.125315 + }, + { + "acc": 0.73131337, + "epoch": 0.15110863799498636, + "grad_norm": 8.125, + "learning_rate": 9.98216018814782e-06, + "loss": 1.04951601, + "memory(GiB)": 292.62, + "step": 27020, + "train_speed(iter/s)": 0.125356 + }, + { + "acc": 0.72336187, + "epoch": 0.15122048746796563, + "grad_norm": 8.0625, + "learning_rate": 9.982082058753505e-06, + "loss": 1.09884567, + "memory(GiB)": 292.62, + "step": 27040, + "train_speed(iter/s)": 0.125398 + }, + { + "acc": 0.75007157, + "epoch": 0.1513323369409449, + "grad_norm": 5.15625, + "learning_rate": 9.982003758956185e-06, + "loss": 0.97874994, + "memory(GiB)": 292.62, + "step": 27060, + "train_speed(iter/s)": 0.12544 + }, + { + "acc": 0.71228762, + "epoch": 0.15144418641392415, + "grad_norm": 5.84375, + "learning_rate": 9.98192528875854e-06, + "loss": 1.17026987, + "memory(GiB)": 292.62, + "step": 27080, + "train_speed(iter/s)": 0.125482 + }, + { + "acc": 0.72211294, + "epoch": 0.15155603588690342, + "grad_norm": 4.65625, + "learning_rate": 9.981846648163251e-06, + "loss": 1.10478935, + "memory(GiB)": 292.62, + "step": 27100, + "train_speed(iter/s)": 0.125527 + }, + { + "acc": 0.72483525, + "epoch": 0.15166788535988268, + "grad_norm": 10.0, + "learning_rate": 9.98176783717301e-06, + "loss": 1.11290646, + "memory(GiB)": 292.62, + "step": 27120, + "train_speed(iter/s)": 0.125569 + }, + { + "acc": 0.719279, + "epoch": 0.15177973483286195, + "grad_norm": 5.34375, + "learning_rate": 9.981688855790514e-06, + "loss": 1.1253499, + "memory(GiB)": 292.62, + "step": 27140, + "train_speed(iter/s)": 0.125611 + }, + { + "acc": 0.71653113, + "epoch": 0.1518915843058412, + "grad_norm": 9.5625, + "learning_rate": 9.981609704018462e-06, + "loss": 1.1375742, + "memory(GiB)": 292.62, + "step": 27160, + "train_speed(iter/s)": 0.125655 + }, + { + "acc": 0.74298496, + "epoch": 0.15200343377882047, + "grad_norm": 9.1875, + "learning_rate": 9.98153038185956e-06, + "loss": 1.02477579, + "memory(GiB)": 292.62, + "step": 27180, + "train_speed(iter/s)": 0.125698 + }, + { + "acc": 0.7391047, + "epoch": 0.15211528325179974, + "grad_norm": 6.375, + "learning_rate": 9.981450889316524e-06, + "loss": 1.02063589, + "memory(GiB)": 292.62, + "step": 27200, + "train_speed(iter/s)": 0.125742 + }, + { + "acc": 0.72662544, + "epoch": 0.152227132724779, + "grad_norm": 8.4375, + "learning_rate": 9.981371226392074e-06, + "loss": 1.0825408, + "memory(GiB)": 292.62, + "step": 27220, + "train_speed(iter/s)": 0.125782 + }, + { + "acc": 0.72346559, + "epoch": 0.15233898219775827, + "grad_norm": 7.5, + "learning_rate": 9.98129139308893e-06, + "loss": 1.09624815, + "memory(GiB)": 292.62, + "step": 27240, + "train_speed(iter/s)": 0.125826 + }, + { + "acc": 0.72140627, + "epoch": 0.15245083167073753, + "grad_norm": 8.875, + "learning_rate": 9.981211389409825e-06, + "loss": 1.10797081, + "memory(GiB)": 292.62, + "step": 27260, + "train_speed(iter/s)": 0.12587 + }, + { + "acc": 0.73111858, + "epoch": 0.1525626811437168, + "grad_norm": 8.0625, + "learning_rate": 9.981131215357496e-06, + "loss": 1.05362158, + "memory(GiB)": 292.62, + "step": 27280, + "train_speed(iter/s)": 0.125912 + }, + { + "acc": 0.74557142, + "epoch": 0.15267453061669606, + "grad_norm": 8.0625, + "learning_rate": 9.981050870934686e-06, + "loss": 1.00397806, + "memory(GiB)": 292.62, + "step": 27300, + "train_speed(iter/s)": 0.125953 + }, + { + "acc": 0.72752013, + "epoch": 0.15278638008967532, + "grad_norm": 8.0625, + "learning_rate": 9.98097035614414e-06, + "loss": 1.06230106, + "memory(GiB)": 292.62, + "step": 27320, + "train_speed(iter/s)": 0.125995 + }, + { + "acc": 0.73388805, + "epoch": 0.1528982295626546, + "grad_norm": 5.375, + "learning_rate": 9.980889670988614e-06, + "loss": 1.03774691, + "memory(GiB)": 292.62, + "step": 27340, + "train_speed(iter/s)": 0.126039 + }, + { + "acc": 0.72953968, + "epoch": 0.15301007903563385, + "grad_norm": 8.625, + "learning_rate": 9.980808815470868e-06, + "loss": 1.07516956, + "memory(GiB)": 292.62, + "step": 27360, + "train_speed(iter/s)": 0.126081 + }, + { + "acc": 0.7435555, + "epoch": 0.15312192850861311, + "grad_norm": 7.625, + "learning_rate": 9.980727789593668e-06, + "loss": 1.00767469, + "memory(GiB)": 292.62, + "step": 27380, + "train_speed(iter/s)": 0.126126 + }, + { + "acc": 0.73469648, + "epoch": 0.15323377798159238, + "grad_norm": 8.0, + "learning_rate": 9.980646593359781e-06, + "loss": 1.04750271, + "memory(GiB)": 292.62, + "step": 27400, + "train_speed(iter/s)": 0.12617 + }, + { + "acc": 0.72347293, + "epoch": 0.15334562745457164, + "grad_norm": 6.9375, + "learning_rate": 9.980565226771989e-06, + "loss": 1.108181, + "memory(GiB)": 292.62, + "step": 27420, + "train_speed(iter/s)": 0.126213 + }, + { + "acc": 0.71233454, + "epoch": 0.1534574769275509, + "grad_norm": 6.71875, + "learning_rate": 9.980483689833072e-06, + "loss": 1.13307142, + "memory(GiB)": 292.62, + "step": 27440, + "train_speed(iter/s)": 0.126257 + }, + { + "acc": 0.73599839, + "epoch": 0.15356932640053017, + "grad_norm": 7.21875, + "learning_rate": 9.980401982545821e-06, + "loss": 1.05293989, + "memory(GiB)": 292.62, + "step": 27460, + "train_speed(iter/s)": 0.126293 + }, + { + "acc": 0.73097486, + "epoch": 0.15368117587350943, + "grad_norm": 8.0, + "learning_rate": 9.980320104913031e-06, + "loss": 1.0630271, + "memory(GiB)": 292.62, + "step": 27480, + "train_speed(iter/s)": 0.126337 + }, + { + "acc": 0.73312802, + "epoch": 0.1537930253464887, + "grad_norm": 6.65625, + "learning_rate": 9.980238056937501e-06, + "loss": 1.05818043, + "memory(GiB)": 292.62, + "step": 27500, + "train_speed(iter/s)": 0.126374 + }, + { + "acc": 0.72440906, + "epoch": 0.15390487481946796, + "grad_norm": 6.21875, + "learning_rate": 9.980155838622037e-06, + "loss": 1.10980129, + "memory(GiB)": 292.62, + "step": 27520, + "train_speed(iter/s)": 0.126417 + }, + { + "acc": 0.74359221, + "epoch": 0.15401672429244723, + "grad_norm": 6.75, + "learning_rate": 9.980073449969449e-06, + "loss": 1.009126, + "memory(GiB)": 292.62, + "step": 27540, + "train_speed(iter/s)": 0.126458 + }, + { + "acc": 0.71641774, + "epoch": 0.1541285737654265, + "grad_norm": 6.15625, + "learning_rate": 9.979990890982562e-06, + "loss": 1.13147011, + "memory(GiB)": 292.62, + "step": 27560, + "train_speed(iter/s)": 0.126498 + }, + { + "acc": 0.73398261, + "epoch": 0.15424042323840575, + "grad_norm": 6.34375, + "learning_rate": 9.979908161664194e-06, + "loss": 1.05415649, + "memory(GiB)": 292.62, + "step": 27580, + "train_speed(iter/s)": 0.126542 + }, + { + "acc": 0.7091588, + "epoch": 0.15435227271138502, + "grad_norm": 6.65625, + "learning_rate": 9.979825262017175e-06, + "loss": 1.15630188, + "memory(GiB)": 292.62, + "step": 27600, + "train_speed(iter/s)": 0.126582 + }, + { + "acc": 0.72635417, + "epoch": 0.15446412218436428, + "grad_norm": 9.4375, + "learning_rate": 9.979742192044341e-06, + "loss": 1.08239603, + "memory(GiB)": 292.62, + "step": 27620, + "train_speed(iter/s)": 0.12662 + }, + { + "acc": 0.73309865, + "epoch": 0.15457597165734355, + "grad_norm": 12.3125, + "learning_rate": 9.979658951748534e-06, + "loss": 1.05035973, + "memory(GiB)": 292.62, + "step": 27640, + "train_speed(iter/s)": 0.126663 + }, + { + "acc": 0.73858314, + "epoch": 0.1546878211303228, + "grad_norm": 5.3125, + "learning_rate": 9.979575541132602e-06, + "loss": 1.02287807, + "memory(GiB)": 292.62, + "step": 27660, + "train_speed(iter/s)": 0.126705 + }, + { + "acc": 0.74313836, + "epoch": 0.15479967060330208, + "grad_norm": 7.375, + "learning_rate": 9.979491960199397e-06, + "loss": 1.01466455, + "memory(GiB)": 292.62, + "step": 27680, + "train_speed(iter/s)": 0.126745 + }, + { + "acc": 0.71055765, + "epoch": 0.15491152007628134, + "grad_norm": 5.28125, + "learning_rate": 9.979408208951776e-06, + "loss": 1.16315708, + "memory(GiB)": 292.62, + "step": 27700, + "train_speed(iter/s)": 0.126787 + }, + { + "acc": 0.72405591, + "epoch": 0.1550233695492606, + "grad_norm": 5.4375, + "learning_rate": 9.979324287392606e-06, + "loss": 1.07318487, + "memory(GiB)": 292.62, + "step": 27720, + "train_speed(iter/s)": 0.126832 + }, + { + "acc": 0.72514396, + "epoch": 0.15513521902223987, + "grad_norm": 9.1875, + "learning_rate": 9.979240195524756e-06, + "loss": 1.07708845, + "memory(GiB)": 292.62, + "step": 27740, + "train_speed(iter/s)": 0.126872 + }, + { + "acc": 0.72871108, + "epoch": 0.15524706849521913, + "grad_norm": 11.5, + "learning_rate": 9.979155933351101e-06, + "loss": 1.06921978, + "memory(GiB)": 292.62, + "step": 27760, + "train_speed(iter/s)": 0.126916 + }, + { + "acc": 0.7158854, + "epoch": 0.1553589179681984, + "grad_norm": 8.4375, + "learning_rate": 9.979071500874527e-06, + "loss": 1.1192378, + "memory(GiB)": 292.62, + "step": 27780, + "train_speed(iter/s)": 0.126955 + }, + { + "acc": 0.71680841, + "epoch": 0.15547076744117766, + "grad_norm": 8.8125, + "learning_rate": 9.978986898097918e-06, + "loss": 1.11261492, + "memory(GiB)": 292.62, + "step": 27800, + "train_speed(iter/s)": 0.126997 + }, + { + "acc": 0.73307137, + "epoch": 0.15558261691415692, + "grad_norm": 6.96875, + "learning_rate": 9.97890212502417e-06, + "loss": 1.07885733, + "memory(GiB)": 292.62, + "step": 27820, + "train_speed(iter/s)": 0.127039 + }, + { + "acc": 0.71999574, + "epoch": 0.1556944663871362, + "grad_norm": 7.15625, + "learning_rate": 9.978817181656182e-06, + "loss": 1.10510235, + "memory(GiB)": 292.62, + "step": 27840, + "train_speed(iter/s)": 0.127078 + }, + { + "acc": 0.73881464, + "epoch": 0.15580631586011545, + "grad_norm": 8.25, + "learning_rate": 9.978732067996856e-06, + "loss": 1.05755548, + "memory(GiB)": 292.62, + "step": 27860, + "train_speed(iter/s)": 0.127118 + }, + { + "acc": 0.71432276, + "epoch": 0.15591816533309472, + "grad_norm": 9.1875, + "learning_rate": 9.97864678404911e-06, + "loss": 1.13935728, + "memory(GiB)": 292.62, + "step": 27880, + "train_speed(iter/s)": 0.12716 + }, + { + "acc": 0.73880582, + "epoch": 0.15603001480607398, + "grad_norm": 6.46875, + "learning_rate": 9.978561329815854e-06, + "loss": 1.03280735, + "memory(GiB)": 292.62, + "step": 27900, + "train_speed(iter/s)": 0.127202 + }, + { + "acc": 0.73302712, + "epoch": 0.15614186427905324, + "grad_norm": 6.34375, + "learning_rate": 9.978475705300016e-06, + "loss": 1.06358166, + "memory(GiB)": 292.62, + "step": 27920, + "train_speed(iter/s)": 0.127241 + }, + { + "acc": 0.72691722, + "epoch": 0.1562537137520325, + "grad_norm": 7.28125, + "learning_rate": 9.978389910504522e-06, + "loss": 1.09270802, + "memory(GiB)": 292.62, + "step": 27940, + "train_speed(iter/s)": 0.12728 + }, + { + "acc": 0.72177234, + "epoch": 0.15636556322501177, + "grad_norm": 5.5, + "learning_rate": 9.978303945432308e-06, + "loss": 1.09209013, + "memory(GiB)": 292.62, + "step": 27960, + "train_speed(iter/s)": 0.127321 + }, + { + "acc": 0.72657161, + "epoch": 0.15647741269799104, + "grad_norm": 5.09375, + "learning_rate": 9.978217810086312e-06, + "loss": 1.09034815, + "memory(GiB)": 292.62, + "step": 27980, + "train_speed(iter/s)": 0.127362 + }, + { + "acc": 0.73694205, + "epoch": 0.1565892621709703, + "grad_norm": 9.6875, + "learning_rate": 9.978131504469481e-06, + "loss": 1.03586159, + "memory(GiB)": 292.62, + "step": 28000, + "train_speed(iter/s)": 0.127401 + }, + { + "epoch": 0.1565892621709703, + "eval_acc": 0.692150098066097, + "eval_loss": 1.0751924514770508, + "eval_runtime": 7534.6793, + "eval_samples_per_second": 9.992, + "eval_steps_per_second": 9.992, + "step": 28000 + }, + { + "acc": 0.74816909, + "epoch": 0.15670111164394956, + "grad_norm": 6.40625, + "learning_rate": 9.978045028584768e-06, + "loss": 0.99965734, + "memory(GiB)": 292.62, + "step": 28020, + "train_speed(iter/s)": 0.123168 + }, + { + "acc": 0.72851529, + "epoch": 0.15681296111692883, + "grad_norm": 3.828125, + "learning_rate": 9.977958382435132e-06, + "loss": 1.07852468, + "memory(GiB)": 292.62, + "step": 28040, + "train_speed(iter/s)": 0.12321 + }, + { + "acc": 0.73454947, + "epoch": 0.1569248105899081, + "grad_norm": 6.59375, + "learning_rate": 9.977871566023532e-06, + "loss": 1.04590836, + "memory(GiB)": 292.62, + "step": 28060, + "train_speed(iter/s)": 0.123251 + }, + { + "acc": 0.72112885, + "epoch": 0.15703666006288736, + "grad_norm": 6.1875, + "learning_rate": 9.97778457935294e-06, + "loss": 1.13138828, + "memory(GiB)": 292.62, + "step": 28080, + "train_speed(iter/s)": 0.123289 + }, + { + "acc": 0.71519251, + "epoch": 0.15714850953586662, + "grad_norm": 6.0625, + "learning_rate": 9.977697422426333e-06, + "loss": 1.13031254, + "memory(GiB)": 292.62, + "step": 28100, + "train_speed(iter/s)": 0.123329 + }, + { + "acc": 0.74105816, + "epoch": 0.15726035900884588, + "grad_norm": 7.46875, + "learning_rate": 9.97761009524669e-06, + "loss": 1.02277985, + "memory(GiB)": 292.62, + "step": 28120, + "train_speed(iter/s)": 0.123367 + }, + { + "acc": 0.72445865, + "epoch": 0.15737220848182515, + "grad_norm": 7.84375, + "learning_rate": 9.977522597816997e-06, + "loss": 1.12664232, + "memory(GiB)": 292.62, + "step": 28140, + "train_speed(iter/s)": 0.123408 + }, + { + "acc": 0.73453779, + "epoch": 0.1574840579548044, + "grad_norm": 6.96875, + "learning_rate": 9.977434930140249e-06, + "loss": 1.0230052, + "memory(GiB)": 292.62, + "step": 28160, + "train_speed(iter/s)": 0.123448 + }, + { + "acc": 0.72518091, + "epoch": 0.15759590742778368, + "grad_norm": 8.3125, + "learning_rate": 9.977347092219442e-06, + "loss": 1.09447527, + "memory(GiB)": 292.62, + "step": 28180, + "train_speed(iter/s)": 0.123488 + }, + { + "acc": 0.72726178, + "epoch": 0.15770775690076294, + "grad_norm": 6.1875, + "learning_rate": 9.977259084057583e-06, + "loss": 1.06519289, + "memory(GiB)": 292.62, + "step": 28200, + "train_speed(iter/s)": 0.12353 + }, + { + "acc": 0.74370356, + "epoch": 0.1578196063737422, + "grad_norm": 5.15625, + "learning_rate": 9.97717090565768e-06, + "loss": 1.00835743, + "memory(GiB)": 292.62, + "step": 28220, + "train_speed(iter/s)": 0.12357 + }, + { + "acc": 0.72323909, + "epoch": 0.15793145584672147, + "grad_norm": 5.25, + "learning_rate": 9.97708255702275e-06, + "loss": 1.10535517, + "memory(GiB)": 292.62, + "step": 28240, + "train_speed(iter/s)": 0.123609 + }, + { + "acc": 0.73338675, + "epoch": 0.15804330531970073, + "grad_norm": 7.875, + "learning_rate": 9.976994038155814e-06, + "loss": 1.03724108, + "memory(GiB)": 292.62, + "step": 28260, + "train_speed(iter/s)": 0.12365 + }, + { + "acc": 0.73015141, + "epoch": 0.15815515479268002, + "grad_norm": 6.28125, + "learning_rate": 9.976905349059902e-06, + "loss": 1.07626162, + "memory(GiB)": 292.62, + "step": 28280, + "train_speed(iter/s)": 0.123692 + }, + { + "acc": 0.71785817, + "epoch": 0.1582670042656593, + "grad_norm": 8.0, + "learning_rate": 9.976816489738044e-06, + "loss": 1.13902311, + "memory(GiB)": 292.62, + "step": 28300, + "train_speed(iter/s)": 0.123732 + }, + { + "acc": 0.70759869, + "epoch": 0.15837885373863855, + "grad_norm": 8.625, + "learning_rate": 9.976727460193283e-06, + "loss": 1.16571341, + "memory(GiB)": 292.62, + "step": 28320, + "train_speed(iter/s)": 0.123774 + }, + { + "acc": 0.72694464, + "epoch": 0.15849070321161782, + "grad_norm": 7.0, + "learning_rate": 9.976638260428661e-06, + "loss": 1.08681526, + "memory(GiB)": 292.62, + "step": 28340, + "train_speed(iter/s)": 0.123814 + }, + { + "acc": 0.71917486, + "epoch": 0.15860255268459708, + "grad_norm": 8.3125, + "learning_rate": 9.976548890447228e-06, + "loss": 1.12124872, + "memory(GiB)": 292.62, + "step": 28360, + "train_speed(iter/s)": 0.123855 + }, + { + "acc": 0.70579739, + "epoch": 0.15871440215757635, + "grad_norm": 5.4375, + "learning_rate": 9.976459350252045e-06, + "loss": 1.20240669, + "memory(GiB)": 292.62, + "step": 28380, + "train_speed(iter/s)": 0.123896 + }, + { + "acc": 0.74132576, + "epoch": 0.1588262516305556, + "grad_norm": 6.6875, + "learning_rate": 9.976369639846173e-06, + "loss": 1.0394742, + "memory(GiB)": 292.62, + "step": 28400, + "train_speed(iter/s)": 0.123938 + }, + { + "acc": 0.72608733, + "epoch": 0.15893810110353487, + "grad_norm": 4.96875, + "learning_rate": 9.976279759232679e-06, + "loss": 1.06721659, + "memory(GiB)": 292.62, + "step": 28420, + "train_speed(iter/s)": 0.123979 + }, + { + "acc": 0.7236855, + "epoch": 0.15904995057651414, + "grad_norm": 7.15625, + "learning_rate": 9.976189708414636e-06, + "loss": 1.12271299, + "memory(GiB)": 292.62, + "step": 28440, + "train_speed(iter/s)": 0.124017 + }, + { + "acc": 0.72912107, + "epoch": 0.1591618000494934, + "grad_norm": 9.0, + "learning_rate": 9.976099487395128e-06, + "loss": 1.06991758, + "memory(GiB)": 292.62, + "step": 28460, + "train_speed(iter/s)": 0.124057 + }, + { + "acc": 0.72778063, + "epoch": 0.15927364952247267, + "grad_norm": 9.3125, + "learning_rate": 9.976009096177239e-06, + "loss": 1.07282839, + "memory(GiB)": 292.62, + "step": 28480, + "train_speed(iter/s)": 0.124098 + }, + { + "acc": 0.72532105, + "epoch": 0.15938549899545193, + "grad_norm": 7.3125, + "learning_rate": 9.97591853476406e-06, + "loss": 1.10548725, + "memory(GiB)": 292.62, + "step": 28500, + "train_speed(iter/s)": 0.124139 + }, + { + "acc": 0.71870432, + "epoch": 0.1594973484684312, + "grad_norm": 4.59375, + "learning_rate": 9.975827803158688e-06, + "loss": 1.11787395, + "memory(GiB)": 292.62, + "step": 28520, + "train_speed(iter/s)": 0.124181 + }, + { + "acc": 0.72062597, + "epoch": 0.15960919794141046, + "grad_norm": 4.15625, + "learning_rate": 9.975736901364228e-06, + "loss": 1.10863323, + "memory(GiB)": 292.62, + "step": 28540, + "train_speed(iter/s)": 0.124223 + }, + { + "acc": 0.72773352, + "epoch": 0.15972104741438972, + "grad_norm": 6.25, + "learning_rate": 9.975645829383787e-06, + "loss": 1.07875443, + "memory(GiB)": 292.62, + "step": 28560, + "train_speed(iter/s)": 0.124264 + }, + { + "acc": 0.73709602, + "epoch": 0.15983289688736899, + "grad_norm": 7.75, + "learning_rate": 9.975554587220482e-06, + "loss": 1.03440647, + "memory(GiB)": 292.62, + "step": 28580, + "train_speed(iter/s)": 0.124302 + }, + { + "acc": 0.73756437, + "epoch": 0.15994474636034825, + "grad_norm": 9.3125, + "learning_rate": 9.975463174877433e-06, + "loss": 1.02557144, + "memory(GiB)": 292.62, + "step": 28600, + "train_speed(iter/s)": 0.124343 + }, + { + "acc": 0.72551899, + "epoch": 0.16005659583332751, + "grad_norm": 8.8125, + "learning_rate": 9.975371592357767e-06, + "loss": 1.0792182, + "memory(GiB)": 292.62, + "step": 28620, + "train_speed(iter/s)": 0.124385 + }, + { + "acc": 0.74312463, + "epoch": 0.16016844530630678, + "grad_norm": 8.0, + "learning_rate": 9.975279839664616e-06, + "loss": 0.98533201, + "memory(GiB)": 292.62, + "step": 28640, + "train_speed(iter/s)": 0.124426 + }, + { + "acc": 0.72421646, + "epoch": 0.16028029477928604, + "grad_norm": 6.1875, + "learning_rate": 9.975187916801118e-06, + "loss": 1.09871035, + "memory(GiB)": 292.62, + "step": 28660, + "train_speed(iter/s)": 0.124468 + }, + { + "acc": 0.71370454, + "epoch": 0.1603921442522653, + "grad_norm": 8.0625, + "learning_rate": 9.975095823770419e-06, + "loss": 1.12161942, + "memory(GiB)": 292.62, + "step": 28680, + "train_speed(iter/s)": 0.124507 + }, + { + "acc": 0.72746105, + "epoch": 0.16050399372524457, + "grad_norm": 7.1875, + "learning_rate": 9.975003560575666e-06, + "loss": 1.06070137, + "memory(GiB)": 292.62, + "step": 28700, + "train_speed(iter/s)": 0.124549 + }, + { + "acc": 0.72226467, + "epoch": 0.16061584319822383, + "grad_norm": 8.3125, + "learning_rate": 9.974911127220015e-06, + "loss": 1.12795382, + "memory(GiB)": 292.62, + "step": 28720, + "train_speed(iter/s)": 0.124591 + }, + { + "acc": 0.73342185, + "epoch": 0.1607276926712031, + "grad_norm": 7.65625, + "learning_rate": 9.97481852370663e-06, + "loss": 1.04583263, + "memory(GiB)": 292.62, + "step": 28740, + "train_speed(iter/s)": 0.124634 + }, + { + "acc": 0.74214039, + "epoch": 0.16083954214418236, + "grad_norm": 6.375, + "learning_rate": 9.974725750038676e-06, + "loss": 1.02665539, + "memory(GiB)": 292.62, + "step": 28760, + "train_speed(iter/s)": 0.124674 + }, + { + "acc": 0.73282919, + "epoch": 0.16095139161716163, + "grad_norm": 6.09375, + "learning_rate": 9.974632806219327e-06, + "loss": 1.05398493, + "memory(GiB)": 292.62, + "step": 28780, + "train_speed(iter/s)": 0.124715 + }, + { + "acc": 0.73437157, + "epoch": 0.1610632410901409, + "grad_norm": 8.3125, + "learning_rate": 9.974539692251761e-06, + "loss": 1.065275, + "memory(GiB)": 292.62, + "step": 28800, + "train_speed(iter/s)": 0.124753 + }, + { + "acc": 0.7233839, + "epoch": 0.16117509056312015, + "grad_norm": 6.0625, + "learning_rate": 9.974446408139166e-06, + "loss": 1.09859247, + "memory(GiB)": 292.62, + "step": 28820, + "train_speed(iter/s)": 0.124794 + }, + { + "acc": 0.7231864, + "epoch": 0.16128694003609942, + "grad_norm": 4.78125, + "learning_rate": 9.974352953884728e-06, + "loss": 1.0917654, + "memory(GiB)": 292.62, + "step": 28840, + "train_speed(iter/s)": 0.124835 + }, + { + "acc": 0.73090768, + "epoch": 0.16139878950907868, + "grad_norm": 6.40625, + "learning_rate": 9.974259329491647e-06, + "loss": 1.06003256, + "memory(GiB)": 292.62, + "step": 28860, + "train_speed(iter/s)": 0.124875 + }, + { + "acc": 0.72928314, + "epoch": 0.16151063898205795, + "grad_norm": 10.5625, + "learning_rate": 9.974165534963122e-06, + "loss": 1.05503416, + "memory(GiB)": 292.62, + "step": 28880, + "train_speed(iter/s)": 0.124915 + }, + { + "acc": 0.73060908, + "epoch": 0.1616224884550372, + "grad_norm": 7.625, + "learning_rate": 9.974071570302366e-06, + "loss": 1.07808552, + "memory(GiB)": 292.62, + "step": 28900, + "train_speed(iter/s)": 0.124955 + }, + { + "acc": 0.72718854, + "epoch": 0.16173433792801648, + "grad_norm": 5.78125, + "learning_rate": 9.97397743551259e-06, + "loss": 1.07460117, + "memory(GiB)": 292.62, + "step": 28920, + "train_speed(iter/s)": 0.124995 + }, + { + "acc": 0.72159061, + "epoch": 0.16184618740099574, + "grad_norm": 7.125, + "learning_rate": 9.97388313059701e-06, + "loss": 1.0962781, + "memory(GiB)": 292.62, + "step": 28940, + "train_speed(iter/s)": 0.125036 + }, + { + "acc": 0.72781353, + "epoch": 0.161958036873975, + "grad_norm": 6.75, + "learning_rate": 9.973788655558859e-06, + "loss": 1.08723555, + "memory(GiB)": 292.62, + "step": 28960, + "train_speed(iter/s)": 0.125077 + }, + { + "acc": 0.73702002, + "epoch": 0.16206988634695427, + "grad_norm": 6.8125, + "learning_rate": 9.973694010401362e-06, + "loss": 1.03633556, + "memory(GiB)": 292.62, + "step": 28980, + "train_speed(iter/s)": 0.125116 + }, + { + "acc": 0.72998953, + "epoch": 0.16218173581993353, + "grad_norm": 5.6875, + "learning_rate": 9.97359919512776e-06, + "loss": 1.0636014, + "memory(GiB)": 292.62, + "step": 29000, + "train_speed(iter/s)": 0.125155 + }, + { + "acc": 0.73349695, + "epoch": 0.1622935852929128, + "grad_norm": 8.5625, + "learning_rate": 9.973504209741293e-06, + "loss": 1.05129585, + "memory(GiB)": 292.62, + "step": 29020, + "train_speed(iter/s)": 0.125195 + }, + { + "acc": 0.7249361, + "epoch": 0.16240543476589206, + "grad_norm": 6.9375, + "learning_rate": 9.973409054245213e-06, + "loss": 1.0950942, + "memory(GiB)": 292.62, + "step": 29040, + "train_speed(iter/s)": 0.125235 + }, + { + "acc": 0.72011485, + "epoch": 0.16251728423887132, + "grad_norm": 6.40625, + "learning_rate": 9.973313728642773e-06, + "loss": 1.09191971, + "memory(GiB)": 292.62, + "step": 29060, + "train_speed(iter/s)": 0.125275 + }, + { + "acc": 0.75158124, + "epoch": 0.1626291337118506, + "grad_norm": 5.3125, + "learning_rate": 9.973218232937234e-06, + "loss": 1.00275545, + "memory(GiB)": 292.62, + "step": 29080, + "train_speed(iter/s)": 0.125316 + }, + { + "acc": 0.73061018, + "epoch": 0.16274098318482985, + "grad_norm": 8.6875, + "learning_rate": 9.973122567131862e-06, + "loss": 1.04017134, + "memory(GiB)": 292.62, + "step": 29100, + "train_speed(iter/s)": 0.125358 + }, + { + "acc": 0.73053493, + "epoch": 0.16285283265780912, + "grad_norm": 6.25, + "learning_rate": 9.973026731229927e-06, + "loss": 1.05043259, + "memory(GiB)": 292.62, + "step": 29120, + "train_speed(iter/s)": 0.125397 + }, + { + "acc": 0.72530093, + "epoch": 0.16296468213078838, + "grad_norm": 7.625, + "learning_rate": 9.972930725234711e-06, + "loss": 1.09229403, + "memory(GiB)": 292.62, + "step": 29140, + "train_speed(iter/s)": 0.125436 + }, + { + "acc": 0.72524443, + "epoch": 0.16307653160376764, + "grad_norm": 7.125, + "learning_rate": 9.972834549149493e-06, + "loss": 1.07025995, + "memory(GiB)": 292.62, + "step": 29160, + "train_speed(iter/s)": 0.125475 + }, + { + "acc": 0.73255363, + "epoch": 0.1631883810767469, + "grad_norm": 6.0, + "learning_rate": 9.972738202977567e-06, + "loss": 1.05377522, + "memory(GiB)": 292.62, + "step": 29180, + "train_speed(iter/s)": 0.125512 + }, + { + "acc": 0.73297663, + "epoch": 0.16330023054972617, + "grad_norm": 9.375, + "learning_rate": 9.972641686722225e-06, + "loss": 1.06823387, + "memory(GiB)": 292.62, + "step": 29200, + "train_speed(iter/s)": 0.125554 + }, + { + "acc": 0.72063222, + "epoch": 0.16341208002270544, + "grad_norm": 8.8125, + "learning_rate": 9.97254500038677e-06, + "loss": 1.11448145, + "memory(GiB)": 292.62, + "step": 29220, + "train_speed(iter/s)": 0.125594 + }, + { + "acc": 0.71976223, + "epoch": 0.1635239294956847, + "grad_norm": 6.6875, + "learning_rate": 9.972448143974509e-06, + "loss": 1.13818655, + "memory(GiB)": 292.62, + "step": 29240, + "train_speed(iter/s)": 0.12563 + }, + { + "acc": 0.74667158, + "epoch": 0.16363577896866396, + "grad_norm": 7.40625, + "learning_rate": 9.972351117488754e-06, + "loss": 0.98029232, + "memory(GiB)": 292.62, + "step": 29260, + "train_speed(iter/s)": 0.12567 + }, + { + "acc": 0.71951165, + "epoch": 0.16374762844164323, + "grad_norm": 8.75, + "learning_rate": 9.972253920932824e-06, + "loss": 1.10415249, + "memory(GiB)": 292.62, + "step": 29280, + "train_speed(iter/s)": 0.12571 + }, + { + "acc": 0.73535037, + "epoch": 0.1638594779146225, + "grad_norm": 6.375, + "learning_rate": 9.972156554310042e-06, + "loss": 1.04272547, + "memory(GiB)": 292.62, + "step": 29300, + "train_speed(iter/s)": 0.125752 + }, + { + "acc": 0.72369304, + "epoch": 0.16397132738760176, + "grad_norm": 7.09375, + "learning_rate": 9.97205901762374e-06, + "loss": 1.11328611, + "memory(GiB)": 292.62, + "step": 29320, + "train_speed(iter/s)": 0.125793 + }, + { + "acc": 0.72323976, + "epoch": 0.16408317686058102, + "grad_norm": 8.125, + "learning_rate": 9.971961310877255e-06, + "loss": 1.09165945, + "memory(GiB)": 292.62, + "step": 29340, + "train_speed(iter/s)": 0.125825 + }, + { + "acc": 0.71964993, + "epoch": 0.16419502633356028, + "grad_norm": 5.65625, + "learning_rate": 9.971863434073927e-06, + "loss": 1.09069128, + "memory(GiB)": 292.62, + "step": 29360, + "train_speed(iter/s)": 0.125865 + }, + { + "acc": 0.72492075, + "epoch": 0.16430687580653955, + "grad_norm": 6.0, + "learning_rate": 9.971765387217105e-06, + "loss": 1.09723969, + "memory(GiB)": 292.62, + "step": 29380, + "train_speed(iter/s)": 0.125906 + }, + { + "acc": 0.74180651, + "epoch": 0.1644187252795188, + "grad_norm": 4.84375, + "learning_rate": 9.97166717031014e-06, + "loss": 0.99873753, + "memory(GiB)": 292.62, + "step": 29400, + "train_speed(iter/s)": 0.125943 + }, + { + "acc": 0.71783695, + "epoch": 0.16453057475249808, + "grad_norm": 7.34375, + "learning_rate": 9.971568783356394e-06, + "loss": 1.10924082, + "memory(GiB)": 292.62, + "step": 29420, + "train_speed(iter/s)": 0.125982 + }, + { + "acc": 0.70567012, + "epoch": 0.16464242422547734, + "grad_norm": 9.75, + "learning_rate": 9.97147022635923e-06, + "loss": 1.19048042, + "memory(GiB)": 292.62, + "step": 29440, + "train_speed(iter/s)": 0.126019 + }, + { + "acc": 0.75467806, + "epoch": 0.1647542736984566, + "grad_norm": 5.9375, + "learning_rate": 9.97137149932202e-06, + "loss": 0.92929564, + "memory(GiB)": 292.62, + "step": 29460, + "train_speed(iter/s)": 0.126058 + }, + { + "acc": 0.74030371, + "epoch": 0.16486612317143587, + "grad_norm": 9.875, + "learning_rate": 9.971272602248143e-06, + "loss": 1.02163363, + "memory(GiB)": 292.62, + "step": 29480, + "train_speed(iter/s)": 0.126098 + }, + { + "acc": 0.7212214, + "epoch": 0.16497797264441513, + "grad_norm": 6.96875, + "learning_rate": 9.971173535140979e-06, + "loss": 1.10325937, + "memory(GiB)": 292.62, + "step": 29500, + "train_speed(iter/s)": 0.126134 + }, + { + "acc": 0.71982932, + "epoch": 0.1650898221173944, + "grad_norm": 7.46875, + "learning_rate": 9.971074298003915e-06, + "loss": 1.126085, + "memory(GiB)": 292.62, + "step": 29520, + "train_speed(iter/s)": 0.126172 + }, + { + "acc": 0.74801712, + "epoch": 0.1652016715903737, + "grad_norm": 9.8125, + "learning_rate": 9.97097489084035e-06, + "loss": 0.99912701, + "memory(GiB)": 292.62, + "step": 29540, + "train_speed(iter/s)": 0.126212 + }, + { + "acc": 0.73213181, + "epoch": 0.16531352106335295, + "grad_norm": 4.40625, + "learning_rate": 9.97087531365368e-06, + "loss": 1.06055174, + "memory(GiB)": 292.62, + "step": 29560, + "train_speed(iter/s)": 0.126252 + }, + { + "acc": 0.73093386, + "epoch": 0.16542537053633222, + "grad_norm": 5.3125, + "learning_rate": 9.970775566447311e-06, + "loss": 1.06920881, + "memory(GiB)": 292.62, + "step": 29580, + "train_speed(iter/s)": 0.126293 + }, + { + "acc": 0.72488399, + "epoch": 0.16553722000931148, + "grad_norm": 7.75, + "learning_rate": 9.970675649224656e-06, + "loss": 1.07713757, + "memory(GiB)": 292.62, + "step": 29600, + "train_speed(iter/s)": 0.12633 + }, + { + "acc": 0.7177917, + "epoch": 0.16564906948229075, + "grad_norm": 7.28125, + "learning_rate": 9.970575561989133e-06, + "loss": 1.12144461, + "memory(GiB)": 292.62, + "step": 29620, + "train_speed(iter/s)": 0.126368 + }, + { + "acc": 0.74162426, + "epoch": 0.16576091895527, + "grad_norm": 8.1875, + "learning_rate": 9.970475304744166e-06, + "loss": 0.99945316, + "memory(GiB)": 292.62, + "step": 29640, + "train_speed(iter/s)": 0.126409 + }, + { + "acc": 0.71802917, + "epoch": 0.16587276842824927, + "grad_norm": 5.5625, + "learning_rate": 9.970374877493178e-06, + "loss": 1.10194044, + "memory(GiB)": 292.62, + "step": 29660, + "train_speed(iter/s)": 0.126448 + }, + { + "acc": 0.73079696, + "epoch": 0.16598461790122854, + "grad_norm": 6.40625, + "learning_rate": 9.970274280239613e-06, + "loss": 1.09559498, + "memory(GiB)": 292.62, + "step": 29680, + "train_speed(iter/s)": 0.126484 + }, + { + "acc": 0.71724939, + "epoch": 0.1660964673742078, + "grad_norm": 6.3125, + "learning_rate": 9.970173512986905e-06, + "loss": 1.1105978, + "memory(GiB)": 292.62, + "step": 29700, + "train_speed(iter/s)": 0.126521 + }, + { + "acc": 0.739817, + "epoch": 0.16620831684718707, + "grad_norm": 7.0625, + "learning_rate": 9.970072575738504e-06, + "loss": 1.0205061, + "memory(GiB)": 292.62, + "step": 29720, + "train_speed(iter/s)": 0.126559 + }, + { + "acc": 0.73680487, + "epoch": 0.16632016632016633, + "grad_norm": 9.3125, + "learning_rate": 9.96997146849786e-06, + "loss": 1.01980505, + "memory(GiB)": 292.62, + "step": 29740, + "train_speed(iter/s)": 0.126599 + }, + { + "acc": 0.73091226, + "epoch": 0.1664320157931456, + "grad_norm": 8.0625, + "learning_rate": 9.969870191268434e-06, + "loss": 1.06683455, + "memory(GiB)": 292.62, + "step": 29760, + "train_speed(iter/s)": 0.126636 + }, + { + "acc": 0.73355465, + "epoch": 0.16654386526612486, + "grad_norm": 7.5, + "learning_rate": 9.969768744053687e-06, + "loss": 1.04288359, + "memory(GiB)": 292.62, + "step": 29780, + "train_speed(iter/s)": 0.126676 + }, + { + "acc": 0.73188796, + "epoch": 0.16665571473910412, + "grad_norm": 5.71875, + "learning_rate": 9.969667126857092e-06, + "loss": 1.05337048, + "memory(GiB)": 292.62, + "step": 29800, + "train_speed(iter/s)": 0.126715 + }, + { + "acc": 0.72823453, + "epoch": 0.16676756421208339, + "grad_norm": 8.125, + "learning_rate": 9.96956533968212e-06, + "loss": 1.06268473, + "memory(GiB)": 292.62, + "step": 29820, + "train_speed(iter/s)": 0.126754 + }, + { + "acc": 0.71724782, + "epoch": 0.16687941368506265, + "grad_norm": 4.78125, + "learning_rate": 9.969463382532258e-06, + "loss": 1.1040884, + "memory(GiB)": 292.62, + "step": 29840, + "train_speed(iter/s)": 0.126791 + }, + { + "acc": 0.71739936, + "epoch": 0.1669912631580419, + "grad_norm": 7.21875, + "learning_rate": 9.96936125541099e-06, + "loss": 1.11446228, + "memory(GiB)": 292.62, + "step": 29860, + "train_speed(iter/s)": 0.126831 + }, + { + "acc": 0.72116213, + "epoch": 0.16710311263102118, + "grad_norm": 10.9375, + "learning_rate": 9.96925895832181e-06, + "loss": 1.10937176, + "memory(GiB)": 292.62, + "step": 29880, + "train_speed(iter/s)": 0.126868 + }, + { + "acc": 0.73075223, + "epoch": 0.16721496210400044, + "grad_norm": 6.46875, + "learning_rate": 9.969156491268216e-06, + "loss": 1.05924816, + "memory(GiB)": 292.62, + "step": 29900, + "train_speed(iter/s)": 0.126906 + }, + { + "acc": 0.72724719, + "epoch": 0.1673268115769797, + "grad_norm": 6.3125, + "learning_rate": 9.969053854253712e-06, + "loss": 1.07727985, + "memory(GiB)": 292.62, + "step": 29920, + "train_speed(iter/s)": 0.126945 + }, + { + "acc": 0.72752013, + "epoch": 0.16743866104995897, + "grad_norm": 5.25, + "learning_rate": 9.968951047281813e-06, + "loss": 1.05894966, + "memory(GiB)": 292.62, + "step": 29940, + "train_speed(iter/s)": 0.126984 + }, + { + "acc": 0.73092103, + "epoch": 0.16755051052293823, + "grad_norm": 4.53125, + "learning_rate": 9.968848070356029e-06, + "loss": 1.07635536, + "memory(GiB)": 292.62, + "step": 29960, + "train_speed(iter/s)": 0.127022 + }, + { + "acc": 0.72937965, + "epoch": 0.1676623599959175, + "grad_norm": 8.0625, + "learning_rate": 9.968744923479885e-06, + "loss": 1.05805368, + "memory(GiB)": 292.62, + "step": 29980, + "train_speed(iter/s)": 0.127061 + }, + { + "acc": 0.7304184, + "epoch": 0.16777420946889676, + "grad_norm": 8.1875, + "learning_rate": 9.96864160665691e-06, + "loss": 1.07268391, + "memory(GiB)": 292.62, + "step": 30000, + "train_speed(iter/s)": 0.127098 + }, + { + "epoch": 0.16777420946889676, + "eval_acc": 0.693001995829689, + "eval_loss": 1.0719707012176514, + "eval_runtime": 7500.7073, + "eval_samples_per_second": 10.037, + "eval_steps_per_second": 10.037, + "step": 30000 + }, + { + "acc": 0.72153711, + "epoch": 0.16788605894187603, + "grad_norm": 6.4375, + "learning_rate": 9.968538119890638e-06, + "loss": 1.12588749, + "memory(GiB)": 292.62, + "step": 30020, + "train_speed(iter/s)": 0.123174 + }, + { + "acc": 0.74925647, + "epoch": 0.1679979084148553, + "grad_norm": 6.40625, + "learning_rate": 9.968434463184606e-06, + "loss": 0.98538361, + "memory(GiB)": 292.62, + "step": 30040, + "train_speed(iter/s)": 0.12321 + }, + { + "acc": 0.72381701, + "epoch": 0.16810975788783455, + "grad_norm": 6.96875, + "learning_rate": 9.968330636542362e-06, + "loss": 1.10419817, + "memory(GiB)": 292.62, + "step": 30060, + "train_speed(iter/s)": 0.12325 + }, + { + "acc": 0.73007107, + "epoch": 0.16822160736081382, + "grad_norm": 9.9375, + "learning_rate": 9.968226639967456e-06, + "loss": 1.06743593, + "memory(GiB)": 292.62, + "step": 30080, + "train_speed(iter/s)": 0.123287 + }, + { + "acc": 0.73040581, + "epoch": 0.16833345683379308, + "grad_norm": 6.25, + "learning_rate": 9.968122473463444e-06, + "loss": 1.07927675, + "memory(GiB)": 292.62, + "step": 30100, + "train_speed(iter/s)": 0.123326 + }, + { + "acc": 0.72947693, + "epoch": 0.16844530630677235, + "grad_norm": 8.125, + "learning_rate": 9.96801813703389e-06, + "loss": 1.07096481, + "memory(GiB)": 292.62, + "step": 30120, + "train_speed(iter/s)": 0.123365 + }, + { + "acc": 0.72034578, + "epoch": 0.1685571557797516, + "grad_norm": 6.4375, + "learning_rate": 9.967913630682364e-06, + "loss": 1.09476023, + "memory(GiB)": 292.62, + "step": 30140, + "train_speed(iter/s)": 0.123404 + }, + { + "acc": 0.72983685, + "epoch": 0.16866900525273087, + "grad_norm": 7.71875, + "learning_rate": 9.967808954412439e-06, + "loss": 1.07882004, + "memory(GiB)": 292.62, + "step": 30160, + "train_speed(iter/s)": 0.123443 + }, + { + "acc": 0.7394927, + "epoch": 0.16878085472571014, + "grad_norm": 5.34375, + "learning_rate": 9.967704108227694e-06, + "loss": 1.02036362, + "memory(GiB)": 292.62, + "step": 30180, + "train_speed(iter/s)": 0.123483 + }, + { + "acc": 0.71885962, + "epoch": 0.1688927041986894, + "grad_norm": 7.40625, + "learning_rate": 9.967599092131716e-06, + "loss": 1.11446085, + "memory(GiB)": 292.62, + "step": 30200, + "train_speed(iter/s)": 0.123523 + }, + { + "acc": 0.72191482, + "epoch": 0.16900455367166867, + "grad_norm": 8.5, + "learning_rate": 9.967493906128098e-06, + "loss": 1.0924551, + "memory(GiB)": 292.62, + "step": 30220, + "train_speed(iter/s)": 0.123562 + }, + { + "acc": 0.72390685, + "epoch": 0.16911640314464793, + "grad_norm": 7.28125, + "learning_rate": 9.967388550220438e-06, + "loss": 1.07540016, + "memory(GiB)": 292.62, + "step": 30240, + "train_speed(iter/s)": 0.123599 + }, + { + "acc": 0.72575169, + "epoch": 0.1692282526176272, + "grad_norm": 9.3125, + "learning_rate": 9.967283024412337e-06, + "loss": 1.10655499, + "memory(GiB)": 292.62, + "step": 30260, + "train_speed(iter/s)": 0.123637 + }, + { + "acc": 0.73424973, + "epoch": 0.16934010209060646, + "grad_norm": 7.90625, + "learning_rate": 9.967177328707407e-06, + "loss": 1.03284378, + "memory(GiB)": 292.62, + "step": 30280, + "train_speed(iter/s)": 0.123679 + }, + { + "acc": 0.7453856, + "epoch": 0.16945195156358572, + "grad_norm": 6.625, + "learning_rate": 9.96707146310926e-06, + "loss": 0.9938282, + "memory(GiB)": 292.62, + "step": 30300, + "train_speed(iter/s)": 0.123711 + }, + { + "acc": 0.70400014, + "epoch": 0.169563801036565, + "grad_norm": 6.8125, + "learning_rate": 9.96696542762152e-06, + "loss": 1.20310984, + "memory(GiB)": 292.62, + "step": 30320, + "train_speed(iter/s)": 0.123746 + }, + { + "acc": 0.72204599, + "epoch": 0.16967565050954425, + "grad_norm": 6.3125, + "learning_rate": 9.966859222247814e-06, + "loss": 1.10185261, + "memory(GiB)": 292.62, + "step": 30340, + "train_speed(iter/s)": 0.123784 + }, + { + "acc": 0.72046733, + "epoch": 0.16978749998252352, + "grad_norm": 6.125, + "learning_rate": 9.966752846991769e-06, + "loss": 1.09194689, + "memory(GiB)": 292.62, + "step": 30360, + "train_speed(iter/s)": 0.123823 + }, + { + "acc": 0.72383084, + "epoch": 0.16989934945550278, + "grad_norm": 8.75, + "learning_rate": 9.966646301857031e-06, + "loss": 1.10976467, + "memory(GiB)": 292.62, + "step": 30380, + "train_speed(iter/s)": 0.123863 + }, + { + "acc": 0.72601929, + "epoch": 0.17001119892848204, + "grad_norm": 4.78125, + "learning_rate": 9.966539586847238e-06, + "loss": 1.08134146, + "memory(GiB)": 292.62, + "step": 30400, + "train_speed(iter/s)": 0.123901 + }, + { + "acc": 0.72239747, + "epoch": 0.1701230484014613, + "grad_norm": 7.59375, + "learning_rate": 9.966432701966047e-06, + "loss": 1.10798473, + "memory(GiB)": 292.62, + "step": 30420, + "train_speed(iter/s)": 0.123938 + }, + { + "acc": 0.71883383, + "epoch": 0.17023489787444057, + "grad_norm": 8.0625, + "learning_rate": 9.966325647217106e-06, + "loss": 1.1095336, + "memory(GiB)": 292.62, + "step": 30440, + "train_speed(iter/s)": 0.123974 + }, + { + "acc": 0.72082653, + "epoch": 0.17034674734741984, + "grad_norm": 7.625, + "learning_rate": 9.96621842260408e-06, + "loss": 1.09462366, + "memory(GiB)": 292.62, + "step": 30460, + "train_speed(iter/s)": 0.124011 + }, + { + "acc": 0.72553158, + "epoch": 0.1704585968203991, + "grad_norm": 7.09375, + "learning_rate": 9.966111028130638e-06, + "loss": 1.08458767, + "memory(GiB)": 292.62, + "step": 30480, + "train_speed(iter/s)": 0.12405 + }, + { + "acc": 0.73208752, + "epoch": 0.17057044629337836, + "grad_norm": 8.375, + "learning_rate": 9.966003463800452e-06, + "loss": 1.05018797, + "memory(GiB)": 292.62, + "step": 30500, + "train_speed(iter/s)": 0.124086 + }, + { + "acc": 0.72135158, + "epoch": 0.17068229576635763, + "grad_norm": 5.0625, + "learning_rate": 9.965895729617198e-06, + "loss": 1.09628906, + "memory(GiB)": 292.62, + "step": 30520, + "train_speed(iter/s)": 0.124126 + }, + { + "acc": 0.73285999, + "epoch": 0.1707941452393369, + "grad_norm": 8.3125, + "learning_rate": 9.965787825584567e-06, + "loss": 1.05762844, + "memory(GiB)": 292.62, + "step": 30540, + "train_speed(iter/s)": 0.124163 + }, + { + "acc": 0.72183156, + "epoch": 0.17090599471231616, + "grad_norm": 6.28125, + "learning_rate": 9.965679751706243e-06, + "loss": 1.10346737, + "memory(GiB)": 292.62, + "step": 30560, + "train_speed(iter/s)": 0.124199 + }, + { + "acc": 0.73480387, + "epoch": 0.17101784418529542, + "grad_norm": 9.4375, + "learning_rate": 9.965571507985929e-06, + "loss": 1.04177465, + "memory(GiB)": 292.62, + "step": 30580, + "train_speed(iter/s)": 0.124238 + }, + { + "acc": 0.72723322, + "epoch": 0.17112969365827468, + "grad_norm": 5.5, + "learning_rate": 9.965463094427321e-06, + "loss": 1.11134033, + "memory(GiB)": 292.62, + "step": 30600, + "train_speed(iter/s)": 0.124275 + }, + { + "acc": 0.73820829, + "epoch": 0.17124154313125395, + "grad_norm": 5.65625, + "learning_rate": 9.965354511034132e-06, + "loss": 1.02600393, + "memory(GiB)": 292.62, + "step": 30620, + "train_speed(iter/s)": 0.124314 + }, + { + "acc": 0.72524948, + "epoch": 0.1713533926042332, + "grad_norm": 7.1875, + "learning_rate": 9.965245757810073e-06, + "loss": 1.11529264, + "memory(GiB)": 292.62, + "step": 30640, + "train_speed(iter/s)": 0.124352 + }, + { + "acc": 0.73169284, + "epoch": 0.17146524207721248, + "grad_norm": 6.125, + "learning_rate": 9.965136834758866e-06, + "loss": 1.08749561, + "memory(GiB)": 292.62, + "step": 30660, + "train_speed(iter/s)": 0.124391 + }, + { + "acc": 0.73524103, + "epoch": 0.17157709155019174, + "grad_norm": 5.6875, + "learning_rate": 9.965027741884235e-06, + "loss": 1.04207439, + "memory(GiB)": 292.62, + "step": 30680, + "train_speed(iter/s)": 0.124429 + }, + { + "acc": 0.71507201, + "epoch": 0.171688941023171, + "grad_norm": 5.125, + "learning_rate": 9.96491847918991e-06, + "loss": 1.14870348, + "memory(GiB)": 292.62, + "step": 30700, + "train_speed(iter/s)": 0.124467 + }, + { + "acc": 0.71864815, + "epoch": 0.17180079049615027, + "grad_norm": 8.0625, + "learning_rate": 9.964809046679631e-06, + "loss": 1.12811403, + "memory(GiB)": 292.62, + "step": 30720, + "train_speed(iter/s)": 0.1245 + }, + { + "acc": 0.7278214, + "epoch": 0.17191263996912953, + "grad_norm": 10.375, + "learning_rate": 9.96469944435714e-06, + "loss": 1.05725555, + "memory(GiB)": 292.62, + "step": 30740, + "train_speed(iter/s)": 0.124534 + }, + { + "acc": 0.73083816, + "epoch": 0.1720244894421088, + "grad_norm": 5.1875, + "learning_rate": 9.964589672226185e-06, + "loss": 1.06313467, + "memory(GiB)": 292.62, + "step": 30760, + "train_speed(iter/s)": 0.12457 + }, + { + "acc": 0.73135428, + "epoch": 0.17213633891508806, + "grad_norm": 10.375, + "learning_rate": 9.964479730290521e-06, + "loss": 1.05761957, + "memory(GiB)": 292.62, + "step": 30780, + "train_speed(iter/s)": 0.124606 + }, + { + "acc": 0.73596563, + "epoch": 0.17224818838806735, + "grad_norm": 8.4375, + "learning_rate": 9.964369618553907e-06, + "loss": 1.07069902, + "memory(GiB)": 292.62, + "step": 30800, + "train_speed(iter/s)": 0.124643 + }, + { + "acc": 0.69786396, + "epoch": 0.17236003786104662, + "grad_norm": 5.3125, + "learning_rate": 9.964259337020112e-06, + "loss": 1.23125305, + "memory(GiB)": 292.62, + "step": 30820, + "train_speed(iter/s)": 0.124681 + }, + { + "acc": 0.7295608, + "epoch": 0.17247188733402588, + "grad_norm": 9.0625, + "learning_rate": 9.964148885692906e-06, + "loss": 1.09162064, + "memory(GiB)": 292.62, + "step": 30840, + "train_speed(iter/s)": 0.124721 + }, + { + "acc": 0.71481366, + "epoch": 0.17258373680700514, + "grad_norm": 6.625, + "learning_rate": 9.964038264576066e-06, + "loss": 1.12721605, + "memory(GiB)": 292.62, + "step": 30860, + "train_speed(iter/s)": 0.12476 + }, + { + "acc": 0.74765534, + "epoch": 0.1726955862799844, + "grad_norm": 8.625, + "learning_rate": 9.963927473673379e-06, + "loss": 0.97372599, + "memory(GiB)": 292.62, + "step": 30880, + "train_speed(iter/s)": 0.124798 + }, + { + "acc": 0.71707387, + "epoch": 0.17280743575296367, + "grad_norm": 5.59375, + "learning_rate": 9.963816512988631e-06, + "loss": 1.13160524, + "memory(GiB)": 292.62, + "step": 30900, + "train_speed(iter/s)": 0.124837 + }, + { + "acc": 0.74814782, + "epoch": 0.17291928522594294, + "grad_norm": 5.71875, + "learning_rate": 9.963705382525618e-06, + "loss": 0.996346, + "memory(GiB)": 292.62, + "step": 30920, + "train_speed(iter/s)": 0.124876 + }, + { + "acc": 0.73960099, + "epoch": 0.1730311346989222, + "grad_norm": 5.5, + "learning_rate": 9.963594082288142e-06, + "loss": 1.01463804, + "memory(GiB)": 292.62, + "step": 30940, + "train_speed(iter/s)": 0.124914 + }, + { + "acc": 0.70308647, + "epoch": 0.17314298417190147, + "grad_norm": 6.1875, + "learning_rate": 9.963482612280008e-06, + "loss": 1.18071709, + "memory(GiB)": 292.62, + "step": 30960, + "train_speed(iter/s)": 0.124954 + }, + { + "acc": 0.70856676, + "epoch": 0.17325483364488073, + "grad_norm": 6.28125, + "learning_rate": 9.963370972505034e-06, + "loss": 1.15806065, + "memory(GiB)": 292.62, + "step": 30980, + "train_speed(iter/s)": 0.124992 + }, + { + "acc": 0.72894115, + "epoch": 0.17336668311786, + "grad_norm": 6.40625, + "learning_rate": 9.96325916296703e-06, + "loss": 1.08360348, + "memory(GiB)": 292.62, + "step": 31000, + "train_speed(iter/s)": 0.125031 + }, + { + "acc": 0.70340161, + "epoch": 0.17347853259083926, + "grad_norm": 6.75, + "learning_rate": 9.963147183669826e-06, + "loss": 1.1917655, + "memory(GiB)": 292.62, + "step": 31020, + "train_speed(iter/s)": 0.125067 + }, + { + "acc": 0.72315831, + "epoch": 0.17359038206381852, + "grad_norm": 6.21875, + "learning_rate": 9.96303503461725e-06, + "loss": 1.09136181, + "memory(GiB)": 292.62, + "step": 31040, + "train_speed(iter/s)": 0.125106 + }, + { + "acc": 0.7237093, + "epoch": 0.17370223153679779, + "grad_norm": 7.65625, + "learning_rate": 9.962922715813138e-06, + "loss": 1.07931442, + "memory(GiB)": 292.62, + "step": 31060, + "train_speed(iter/s)": 0.125145 + }, + { + "acc": 0.73512611, + "epoch": 0.17381408100977705, + "grad_norm": 8.125, + "learning_rate": 9.962810227261335e-06, + "loss": 1.03629427, + "memory(GiB)": 292.62, + "step": 31080, + "train_speed(iter/s)": 0.125183 + }, + { + "acc": 0.73304925, + "epoch": 0.1739259304827563, + "grad_norm": 6.0625, + "learning_rate": 9.962697568965684e-06, + "loss": 1.04332228, + "memory(GiB)": 292.62, + "step": 31100, + "train_speed(iter/s)": 0.12522 + }, + { + "acc": 0.72536688, + "epoch": 0.17403777995573558, + "grad_norm": 8.3125, + "learning_rate": 9.962584740930039e-06, + "loss": 1.08702145, + "memory(GiB)": 292.62, + "step": 31120, + "train_speed(iter/s)": 0.125258 + }, + { + "acc": 0.717981, + "epoch": 0.17414962942871484, + "grad_norm": 7.125, + "learning_rate": 9.96247174315826e-06, + "loss": 1.10280514, + "memory(GiB)": 292.62, + "step": 31140, + "train_speed(iter/s)": 0.125296 + }, + { + "acc": 0.72036724, + "epoch": 0.1742614789016941, + "grad_norm": 8.6875, + "learning_rate": 9.962358575654213e-06, + "loss": 1.11973114, + "memory(GiB)": 292.62, + "step": 31160, + "train_speed(iter/s)": 0.125332 + }, + { + "acc": 0.73002243, + "epoch": 0.17437332837467337, + "grad_norm": 5.71875, + "learning_rate": 9.962245238421766e-06, + "loss": 1.08619862, + "memory(GiB)": 292.62, + "step": 31180, + "train_speed(iter/s)": 0.125369 + }, + { + "acc": 0.72708879, + "epoch": 0.17448517784765263, + "grad_norm": 8.875, + "learning_rate": 9.962131731464797e-06, + "loss": 1.06535044, + "memory(GiB)": 292.62, + "step": 31200, + "train_speed(iter/s)": 0.125408 + }, + { + "acc": 0.73815565, + "epoch": 0.1745970273206319, + "grad_norm": 7.46875, + "learning_rate": 9.96201805478719e-06, + "loss": 1.03424253, + "memory(GiB)": 292.62, + "step": 31220, + "train_speed(iter/s)": 0.125444 + }, + { + "acc": 0.71630983, + "epoch": 0.17470887679361116, + "grad_norm": 7.40625, + "learning_rate": 9.96190420839283e-06, + "loss": 1.12705326, + "memory(GiB)": 292.62, + "step": 31240, + "train_speed(iter/s)": 0.125482 + }, + { + "acc": 0.72571211, + "epoch": 0.17482072626659043, + "grad_norm": 5.625, + "learning_rate": 9.961790192285614e-06, + "loss": 1.09505548, + "memory(GiB)": 292.62, + "step": 31260, + "train_speed(iter/s)": 0.12552 + }, + { + "acc": 0.72810388, + "epoch": 0.1749325757395697, + "grad_norm": 6.25, + "learning_rate": 9.961676006469439e-06, + "loss": 1.07882166, + "memory(GiB)": 292.62, + "step": 31280, + "train_speed(iter/s)": 0.125559 + }, + { + "acc": 0.72859349, + "epoch": 0.17504442521254895, + "grad_norm": 8.125, + "learning_rate": 9.961561650948211e-06, + "loss": 1.07436457, + "memory(GiB)": 292.62, + "step": 31300, + "train_speed(iter/s)": 0.125597 + }, + { + "acc": 0.7277741, + "epoch": 0.17515627468552822, + "grad_norm": 8.875, + "learning_rate": 9.961447125725843e-06, + "loss": 1.07688055, + "memory(GiB)": 292.62, + "step": 31320, + "train_speed(iter/s)": 0.125634 + }, + { + "acc": 0.72702594, + "epoch": 0.17526812415850748, + "grad_norm": 9.6875, + "learning_rate": 9.961332430806249e-06, + "loss": 1.08549452, + "memory(GiB)": 292.62, + "step": 31340, + "train_speed(iter/s)": 0.125672 + }, + { + "acc": 0.73108578, + "epoch": 0.17537997363148675, + "grad_norm": 6.5, + "learning_rate": 9.961217566193355e-06, + "loss": 1.05131693, + "memory(GiB)": 292.62, + "step": 31360, + "train_speed(iter/s)": 0.12571 + }, + { + "acc": 0.72346983, + "epoch": 0.175491823104466, + "grad_norm": 6.1875, + "learning_rate": 9.961102531891089e-06, + "loss": 1.08320684, + "memory(GiB)": 292.62, + "step": 31380, + "train_speed(iter/s)": 0.125746 + }, + { + "acc": 0.74386125, + "epoch": 0.17560367257744527, + "grad_norm": 8.6875, + "learning_rate": 9.960987327903384e-06, + "loss": 1.00476809, + "memory(GiB)": 292.62, + "step": 31400, + "train_speed(iter/s)": 0.125786 + }, + { + "acc": 0.73640323, + "epoch": 0.17571552205042454, + "grad_norm": 6.46875, + "learning_rate": 9.960871954234183e-06, + "loss": 1.0417592, + "memory(GiB)": 292.62, + "step": 31420, + "train_speed(iter/s)": 0.125823 + }, + { + "acc": 0.73332992, + "epoch": 0.1758273715234038, + "grad_norm": 6.75, + "learning_rate": 9.960756410887432e-06, + "loss": 1.05939589, + "memory(GiB)": 292.62, + "step": 31440, + "train_speed(iter/s)": 0.125859 + }, + { + "acc": 0.7256856, + "epoch": 0.17593922099638307, + "grad_norm": 7.59375, + "learning_rate": 9.96064069786708e-06, + "loss": 1.09316206, + "memory(GiB)": 292.62, + "step": 31460, + "train_speed(iter/s)": 0.125897 + }, + { + "acc": 0.717207, + "epoch": 0.17605107046936233, + "grad_norm": 5.78125, + "learning_rate": 9.960524815177085e-06, + "loss": 1.13310976, + "memory(GiB)": 292.62, + "step": 31480, + "train_speed(iter/s)": 0.125935 + }, + { + "acc": 0.71371446, + "epoch": 0.1761629199423416, + "grad_norm": 7.625, + "learning_rate": 9.960408762821412e-06, + "loss": 1.11374083, + "memory(GiB)": 292.62, + "step": 31500, + "train_speed(iter/s)": 0.125972 + }, + { + "acc": 0.73790913, + "epoch": 0.17627476941532086, + "grad_norm": 6.9375, + "learning_rate": 9.960292540804031e-06, + "loss": 1.02863874, + "memory(GiB)": 292.62, + "step": 31520, + "train_speed(iter/s)": 0.126007 + }, + { + "acc": 0.71522121, + "epoch": 0.17638661888830012, + "grad_norm": 6.625, + "learning_rate": 9.960176149128916e-06, + "loss": 1.15412197, + "memory(GiB)": 292.62, + "step": 31540, + "train_speed(iter/s)": 0.126043 + }, + { + "acc": 0.74441681, + "epoch": 0.1764984683612794, + "grad_norm": 6.90625, + "learning_rate": 9.960059587800051e-06, + "loss": 1.01483164, + "memory(GiB)": 292.62, + "step": 31560, + "train_speed(iter/s)": 0.126081 + }, + { + "acc": 0.72479281, + "epoch": 0.17661031783425865, + "grad_norm": 5.65625, + "learning_rate": 9.959942856821418e-06, + "loss": 1.10903692, + "memory(GiB)": 292.62, + "step": 31580, + "train_speed(iter/s)": 0.126115 + }, + { + "acc": 0.72318506, + "epoch": 0.17672216730723792, + "grad_norm": 4.53125, + "learning_rate": 9.959825956197011e-06, + "loss": 1.08180294, + "memory(GiB)": 292.62, + "step": 31600, + "train_speed(iter/s)": 0.126153 + }, + { + "acc": 0.72044291, + "epoch": 0.17683401678021718, + "grad_norm": 7.78125, + "learning_rate": 9.95970888593083e-06, + "loss": 1.10455484, + "memory(GiB)": 292.62, + "step": 31620, + "train_speed(iter/s)": 0.12619 + }, + { + "acc": 0.71302438, + "epoch": 0.17694586625319644, + "grad_norm": 11.75, + "learning_rate": 9.95959164602688e-06, + "loss": 1.12646799, + "memory(GiB)": 292.62, + "step": 31640, + "train_speed(iter/s)": 0.126228 + }, + { + "acc": 0.70936441, + "epoch": 0.1770577157261757, + "grad_norm": 6.3125, + "learning_rate": 9.959474236489167e-06, + "loss": 1.18172665, + "memory(GiB)": 292.62, + "step": 31660, + "train_speed(iter/s)": 0.126264 + }, + { + "acc": 0.72733011, + "epoch": 0.17716956519915497, + "grad_norm": 6.03125, + "learning_rate": 9.95935665732171e-06, + "loss": 1.07902031, + "memory(GiB)": 292.62, + "step": 31680, + "train_speed(iter/s)": 0.126303 + }, + { + "acc": 0.73473291, + "epoch": 0.17728141467213424, + "grad_norm": 6.15625, + "learning_rate": 9.959238908528529e-06, + "loss": 1.04732113, + "memory(GiB)": 292.62, + "step": 31700, + "train_speed(iter/s)": 0.126339 + }, + { + "acc": 0.72961383, + "epoch": 0.1773932641451135, + "grad_norm": 7.90625, + "learning_rate": 9.959120990113653e-06, + "loss": 1.07104692, + "memory(GiB)": 292.62, + "step": 31720, + "train_speed(iter/s)": 0.126376 + }, + { + "acc": 0.72392759, + "epoch": 0.17750511361809276, + "grad_norm": 14.4375, + "learning_rate": 9.959002902081115e-06, + "loss": 1.06610622, + "memory(GiB)": 292.62, + "step": 31740, + "train_speed(iter/s)": 0.126415 + }, + { + "acc": 0.71988049, + "epoch": 0.17761696309107203, + "grad_norm": 4.9375, + "learning_rate": 9.95888464443495e-06, + "loss": 1.104706, + "memory(GiB)": 292.62, + "step": 31760, + "train_speed(iter/s)": 0.126451 + }, + { + "acc": 0.7382544, + "epoch": 0.1777288125640513, + "grad_norm": 8.8125, + "learning_rate": 9.958766217179208e-06, + "loss": 1.02813044, + "memory(GiB)": 292.62, + "step": 31780, + "train_speed(iter/s)": 0.126489 + }, + { + "acc": 0.72033715, + "epoch": 0.17784066203703056, + "grad_norm": 7.34375, + "learning_rate": 9.958647620317937e-06, + "loss": 1.10985622, + "memory(GiB)": 292.62, + "step": 31800, + "train_speed(iter/s)": 0.126527 + }, + { + "acc": 0.73087907, + "epoch": 0.17795251151000982, + "grad_norm": 4.65625, + "learning_rate": 9.958528853855194e-06, + "loss": 1.04853916, + "memory(GiB)": 292.62, + "step": 31820, + "train_speed(iter/s)": 0.126563 + }, + { + "acc": 0.73950267, + "epoch": 0.17806436098298908, + "grad_norm": 5.15625, + "learning_rate": 9.958409917795043e-06, + "loss": 1.02001877, + "memory(GiB)": 292.62, + "step": 31840, + "train_speed(iter/s)": 0.126599 + }, + { + "acc": 0.73816619, + "epoch": 0.17817621045596835, + "grad_norm": 8.8125, + "learning_rate": 9.958290812141546e-06, + "loss": 1.04510212, + "memory(GiB)": 292.62, + "step": 31860, + "train_speed(iter/s)": 0.126637 + }, + { + "acc": 0.72333579, + "epoch": 0.1782880599289476, + "grad_norm": 7.5, + "learning_rate": 9.958171536898784e-06, + "loss": 1.09188795, + "memory(GiB)": 292.62, + "step": 31880, + "train_speed(iter/s)": 0.126675 + }, + { + "acc": 0.74126225, + "epoch": 0.17839990940192688, + "grad_norm": 5.5, + "learning_rate": 9.958052092070834e-06, + "loss": 1.02334747, + "memory(GiB)": 292.62, + "step": 31900, + "train_speed(iter/s)": 0.126712 + }, + { + "acc": 0.73274784, + "epoch": 0.17851175887490614, + "grad_norm": 5.15625, + "learning_rate": 9.95793247766178e-06, + "loss": 1.05639715, + "memory(GiB)": 292.62, + "step": 31920, + "train_speed(iter/s)": 0.126748 + }, + { + "acc": 0.74101796, + "epoch": 0.1786236083478854, + "grad_norm": 9.8125, + "learning_rate": 9.957812693675712e-06, + "loss": 1.0141367, + "memory(GiB)": 292.62, + "step": 31940, + "train_speed(iter/s)": 0.126783 + }, + { + "acc": 0.73817153, + "epoch": 0.17873545782086467, + "grad_norm": 5.875, + "learning_rate": 9.957692740116733e-06, + "loss": 1.02071524, + "memory(GiB)": 292.62, + "step": 31960, + "train_speed(iter/s)": 0.126819 + }, + { + "acc": 0.73068805, + "epoch": 0.17884730729384393, + "grad_norm": 4.0, + "learning_rate": 9.957572616988939e-06, + "loss": 1.05498295, + "memory(GiB)": 292.62, + "step": 31980, + "train_speed(iter/s)": 0.126856 + }, + { + "acc": 0.72906866, + "epoch": 0.1789591567668232, + "grad_norm": 9.9375, + "learning_rate": 9.957452324296441e-06, + "loss": 1.08290634, + "memory(GiB)": 292.62, + "step": 32000, + "train_speed(iter/s)": 0.12689 + }, + { + "epoch": 0.1789591567668232, + "eval_acc": 0.6935276474617232, + "eval_loss": 1.0692013502120972, + "eval_runtime": 7497.3042, + "eval_samples_per_second": 10.041, + "eval_steps_per_second": 10.041, + "step": 32000 + }, + { + "acc": 0.71565623, + "epoch": 0.17907100623980246, + "grad_norm": 7.25, + "learning_rate": 9.957331862043355e-06, + "loss": 1.14485178, + "memory(GiB)": 292.62, + "step": 32020, + "train_speed(iter/s)": 0.123214 + }, + { + "acc": 0.73640356, + "epoch": 0.17918285571278172, + "grad_norm": 7.3125, + "learning_rate": 9.957211230233801e-06, + "loss": 1.0481823, + "memory(GiB)": 292.62, + "step": 32040, + "train_speed(iter/s)": 0.123251 + }, + { + "acc": 0.74369674, + "epoch": 0.17929470518576102, + "grad_norm": 7.09375, + "learning_rate": 9.957090428871905e-06, + "loss": 1.00044031, + "memory(GiB)": 292.62, + "step": 32060, + "train_speed(iter/s)": 0.123289 + }, + { + "acc": 0.72316604, + "epoch": 0.17940655465874028, + "grad_norm": 8.6875, + "learning_rate": 9.956969457961795e-06, + "loss": 1.07456865, + "memory(GiB)": 292.62, + "step": 32080, + "train_speed(iter/s)": 0.123326 + }, + { + "acc": 0.72469616, + "epoch": 0.17951840413171954, + "grad_norm": 5.5625, + "learning_rate": 9.956848317507612e-06, + "loss": 1.098419, + "memory(GiB)": 292.62, + "step": 32100, + "train_speed(iter/s)": 0.123358 + }, + { + "acc": 0.73840737, + "epoch": 0.1796302536046988, + "grad_norm": 4.8125, + "learning_rate": 9.956727007513499e-06, + "loss": 1.01311073, + "memory(GiB)": 292.62, + "step": 32120, + "train_speed(iter/s)": 0.123395 + }, + { + "acc": 0.72318196, + "epoch": 0.17974210307767807, + "grad_norm": 4.65625, + "learning_rate": 9.956605527983604e-06, + "loss": 1.0862112, + "memory(GiB)": 292.62, + "step": 32140, + "train_speed(iter/s)": 0.123431 + }, + { + "acc": 0.73618197, + "epoch": 0.17985395255065734, + "grad_norm": 7.0625, + "learning_rate": 9.956483878922085e-06, + "loss": 1.0514554, + "memory(GiB)": 292.62, + "step": 32160, + "train_speed(iter/s)": 0.123465 + }, + { + "acc": 0.72344637, + "epoch": 0.1799658020236366, + "grad_norm": 7.71875, + "learning_rate": 9.956362060333101e-06, + "loss": 1.09622059, + "memory(GiB)": 292.62, + "step": 32180, + "train_speed(iter/s)": 0.1235 + }, + { + "acc": 0.74212613, + "epoch": 0.18007765149661586, + "grad_norm": 6.78125, + "learning_rate": 9.956240072220818e-06, + "loss": 1.01305275, + "memory(GiB)": 292.62, + "step": 32200, + "train_speed(iter/s)": 0.123536 + }, + { + "acc": 0.73057261, + "epoch": 0.18018950096959513, + "grad_norm": 6.46875, + "learning_rate": 9.956117914589408e-06, + "loss": 1.07214756, + "memory(GiB)": 292.62, + "step": 32220, + "train_speed(iter/s)": 0.123573 + }, + { + "acc": 0.70509958, + "epoch": 0.1803013504425744, + "grad_norm": 7.125, + "learning_rate": 9.955995587443048e-06, + "loss": 1.17289391, + "memory(GiB)": 292.62, + "step": 32240, + "train_speed(iter/s)": 0.12361 + }, + { + "acc": 0.72135592, + "epoch": 0.18041319991555366, + "grad_norm": 6.90625, + "learning_rate": 9.955873090785927e-06, + "loss": 1.10146542, + "memory(GiB)": 292.62, + "step": 32260, + "train_speed(iter/s)": 0.123647 + }, + { + "acc": 0.74204297, + "epoch": 0.18052504938853292, + "grad_norm": 6.5625, + "learning_rate": 9.95575042462223e-06, + "loss": 1.02738724, + "memory(GiB)": 292.62, + "step": 32280, + "train_speed(iter/s)": 0.123685 + }, + { + "acc": 0.73466878, + "epoch": 0.18063689886151219, + "grad_norm": 10.625, + "learning_rate": 9.955627588956154e-06, + "loss": 1.02401552, + "memory(GiB)": 292.62, + "step": 32300, + "train_speed(iter/s)": 0.123722 + }, + { + "acc": 0.72067871, + "epoch": 0.18074874833449145, + "grad_norm": 7.875, + "learning_rate": 9.955504583791901e-06, + "loss": 1.10466671, + "memory(GiB)": 292.62, + "step": 32320, + "train_speed(iter/s)": 0.123755 + }, + { + "acc": 0.72573729, + "epoch": 0.1808605978074707, + "grad_norm": 4.53125, + "learning_rate": 9.955381409133679e-06, + "loss": 1.09401665, + "memory(GiB)": 292.62, + "step": 32340, + "train_speed(iter/s)": 0.123791 + }, + { + "acc": 0.72706914, + "epoch": 0.18097244728044998, + "grad_norm": 7.4375, + "learning_rate": 9.955258064985699e-06, + "loss": 1.0933424, + "memory(GiB)": 292.62, + "step": 32360, + "train_speed(iter/s)": 0.123825 + }, + { + "acc": 0.72606587, + "epoch": 0.18108429675342924, + "grad_norm": 7.15625, + "learning_rate": 9.95513455135218e-06, + "loss": 1.06875935, + "memory(GiB)": 292.62, + "step": 32380, + "train_speed(iter/s)": 0.123858 + }, + { + "acc": 0.72424259, + "epoch": 0.1811961462264085, + "grad_norm": 8.1875, + "learning_rate": 9.955010868237348e-06, + "loss": 1.10510874, + "memory(GiB)": 292.62, + "step": 32400, + "train_speed(iter/s)": 0.123894 + }, + { + "acc": 0.73345003, + "epoch": 0.18130799569938777, + "grad_norm": 7.21875, + "learning_rate": 9.954887015645432e-06, + "loss": 1.06716385, + "memory(GiB)": 292.62, + "step": 32420, + "train_speed(iter/s)": 0.12393 + }, + { + "acc": 0.72454848, + "epoch": 0.18141984517236703, + "grad_norm": 8.75, + "learning_rate": 9.954762993580666e-06, + "loss": 1.09555416, + "memory(GiB)": 292.62, + "step": 32440, + "train_speed(iter/s)": 0.123966 + }, + { + "acc": 0.73472347, + "epoch": 0.1815316946453463, + "grad_norm": 8.5625, + "learning_rate": 9.954638802047299e-06, + "loss": 1.03321857, + "memory(GiB)": 292.62, + "step": 32460, + "train_speed(iter/s)": 0.124003 + }, + { + "acc": 0.72352514, + "epoch": 0.18164354411832556, + "grad_norm": 6.46875, + "learning_rate": 9.95451444104957e-06, + "loss": 1.07863932, + "memory(GiB)": 292.62, + "step": 32480, + "train_speed(iter/s)": 0.124039 + }, + { + "acc": 0.71655731, + "epoch": 0.18175539359130483, + "grad_norm": 6.34375, + "learning_rate": 9.954389910591739e-06, + "loss": 1.1229598, + "memory(GiB)": 292.62, + "step": 32500, + "train_speed(iter/s)": 0.124074 + }, + { + "acc": 0.72256837, + "epoch": 0.1818672430642841, + "grad_norm": 7.03125, + "learning_rate": 9.954265210678063e-06, + "loss": 1.10244074, + "memory(GiB)": 292.62, + "step": 32520, + "train_speed(iter/s)": 0.12411 + }, + { + "acc": 0.73339925, + "epoch": 0.18197909253726335, + "grad_norm": 8.125, + "learning_rate": 9.954140341312808e-06, + "loss": 1.05620718, + "memory(GiB)": 292.62, + "step": 32540, + "train_speed(iter/s)": 0.124146 + }, + { + "acc": 0.73214097, + "epoch": 0.18209094201024262, + "grad_norm": 8.9375, + "learning_rate": 9.954015302500242e-06, + "loss": 1.06631727, + "memory(GiB)": 292.62, + "step": 32560, + "train_speed(iter/s)": 0.124182 + }, + { + "acc": 0.74198055, + "epoch": 0.18220279148322188, + "grad_norm": 6.21875, + "learning_rate": 9.953890094244646e-06, + "loss": 0.9982605, + "memory(GiB)": 292.62, + "step": 32580, + "train_speed(iter/s)": 0.124219 + }, + { + "acc": 0.74430475, + "epoch": 0.18231464095620115, + "grad_norm": 9.1875, + "learning_rate": 9.9537647165503e-06, + "loss": 1.00561676, + "memory(GiB)": 292.62, + "step": 32600, + "train_speed(iter/s)": 0.124255 + }, + { + "acc": 0.74116654, + "epoch": 0.1824264904291804, + "grad_norm": 7.46875, + "learning_rate": 9.953639169421491e-06, + "loss": 1.02312412, + "memory(GiB)": 292.62, + "step": 32620, + "train_speed(iter/s)": 0.124287 + }, + { + "acc": 0.73751497, + "epoch": 0.18253833990215967, + "grad_norm": 5.9375, + "learning_rate": 9.953513452862517e-06, + "loss": 1.06369095, + "memory(GiB)": 292.62, + "step": 32640, + "train_speed(iter/s)": 0.124319 + }, + { + "acc": 0.73181124, + "epoch": 0.18265018937513894, + "grad_norm": 5.15625, + "learning_rate": 9.953387566877675e-06, + "loss": 1.06480799, + "memory(GiB)": 292.62, + "step": 32660, + "train_speed(iter/s)": 0.124352 + }, + { + "acc": 0.73177099, + "epoch": 0.1827620388481182, + "grad_norm": 9.0625, + "learning_rate": 9.95326151147127e-06, + "loss": 1.08471394, + "memory(GiB)": 292.62, + "step": 32680, + "train_speed(iter/s)": 0.124387 + }, + { + "acc": 0.74863601, + "epoch": 0.18287388832109747, + "grad_norm": 7.96875, + "learning_rate": 9.953135286647617e-06, + "loss": 1.00314951, + "memory(GiB)": 292.62, + "step": 32700, + "train_speed(iter/s)": 0.12442 + }, + { + "acc": 0.70430508, + "epoch": 0.18298573779407673, + "grad_norm": 4.71875, + "learning_rate": 9.95300889241103e-06, + "loss": 1.18287649, + "memory(GiB)": 292.62, + "step": 32720, + "train_speed(iter/s)": 0.124457 + }, + { + "acc": 0.73472528, + "epoch": 0.183097587267056, + "grad_norm": 5.78125, + "learning_rate": 9.952882328765833e-06, + "loss": 1.03605261, + "memory(GiB)": 292.62, + "step": 32740, + "train_speed(iter/s)": 0.124492 + }, + { + "acc": 0.7397264, + "epoch": 0.18320943674003526, + "grad_norm": 6.75, + "learning_rate": 9.952755595716357e-06, + "loss": 1.02443924, + "memory(GiB)": 292.62, + "step": 32760, + "train_speed(iter/s)": 0.124529 + }, + { + "acc": 0.72344551, + "epoch": 0.18332128621301452, + "grad_norm": 7.6875, + "learning_rate": 9.952628693266935e-06, + "loss": 1.11402168, + "memory(GiB)": 292.62, + "step": 32780, + "train_speed(iter/s)": 0.124564 + }, + { + "acc": 0.7296648, + "epoch": 0.1834331356859938, + "grad_norm": 5.875, + "learning_rate": 9.952501621421904e-06, + "loss": 1.07959499, + "memory(GiB)": 292.62, + "step": 32800, + "train_speed(iter/s)": 0.124601 + }, + { + "acc": 0.73865237, + "epoch": 0.18354498515897305, + "grad_norm": 9.0, + "learning_rate": 9.952374380185616e-06, + "loss": 1.02328796, + "memory(GiB)": 292.62, + "step": 32820, + "train_speed(iter/s)": 0.124634 + }, + { + "acc": 0.74325647, + "epoch": 0.18365683463195231, + "grad_norm": 8.0625, + "learning_rate": 9.952246969562421e-06, + "loss": 1.01278238, + "memory(GiB)": 292.62, + "step": 32840, + "train_speed(iter/s)": 0.12467 + }, + { + "acc": 0.73474264, + "epoch": 0.18376868410493158, + "grad_norm": 10.375, + "learning_rate": 9.952119389556676e-06, + "loss": 1.03586836, + "memory(GiB)": 292.62, + "step": 32860, + "train_speed(iter/s)": 0.124706 + }, + { + "acc": 0.72134819, + "epoch": 0.18388053357791084, + "grad_norm": 4.875, + "learning_rate": 9.951991640172744e-06, + "loss": 1.10217867, + "memory(GiB)": 292.62, + "step": 32880, + "train_speed(iter/s)": 0.124739 + }, + { + "acc": 0.72137723, + "epoch": 0.1839923830508901, + "grad_norm": 7.375, + "learning_rate": 9.951863721414997e-06, + "loss": 1.11994009, + "memory(GiB)": 292.62, + "step": 32900, + "train_speed(iter/s)": 0.124775 + }, + { + "acc": 0.73045144, + "epoch": 0.18410423252386937, + "grad_norm": 7.09375, + "learning_rate": 9.951735633287809e-06, + "loss": 1.06350756, + "memory(GiB)": 292.62, + "step": 32920, + "train_speed(iter/s)": 0.12481 + }, + { + "acc": 0.75266676, + "epoch": 0.18421608199684864, + "grad_norm": 7.96875, + "learning_rate": 9.95160737579556e-06, + "loss": 0.96462803, + "memory(GiB)": 292.62, + "step": 32940, + "train_speed(iter/s)": 0.124845 + }, + { + "acc": 0.73849654, + "epoch": 0.1843279314698279, + "grad_norm": 5.9375, + "learning_rate": 9.951478948942638e-06, + "loss": 1.01290007, + "memory(GiB)": 292.62, + "step": 32960, + "train_speed(iter/s)": 0.12488 + }, + { + "acc": 0.72860394, + "epoch": 0.18443978094280716, + "grad_norm": 7.09375, + "learning_rate": 9.951350352733436e-06, + "loss": 1.07427998, + "memory(GiB)": 292.62, + "step": 32980, + "train_speed(iter/s)": 0.124916 + }, + { + "acc": 0.73853521, + "epoch": 0.18455163041578643, + "grad_norm": 7.5, + "learning_rate": 9.951221587172351e-06, + "loss": 1.03332624, + "memory(GiB)": 292.62, + "step": 33000, + "train_speed(iter/s)": 0.124952 + }, + { + "acc": 0.73915124, + "epoch": 0.1846634798887657, + "grad_norm": 9.1875, + "learning_rate": 9.951092652263786e-06, + "loss": 1.01877203, + "memory(GiB)": 292.62, + "step": 33020, + "train_speed(iter/s)": 0.124989 + }, + { + "acc": 0.72886715, + "epoch": 0.18477532936174496, + "grad_norm": 6.5, + "learning_rate": 9.950963548012155e-06, + "loss": 1.07662487, + "memory(GiB)": 292.62, + "step": 33040, + "train_speed(iter/s)": 0.125022 + }, + { + "acc": 0.74177613, + "epoch": 0.18488717883472422, + "grad_norm": 5.40625, + "learning_rate": 9.95083427442187e-06, + "loss": 1.02445822, + "memory(GiB)": 292.62, + "step": 33060, + "train_speed(iter/s)": 0.125057 + }, + { + "acc": 0.73118653, + "epoch": 0.18499902830770348, + "grad_norm": 5.8125, + "learning_rate": 9.950704831497355e-06, + "loss": 1.05934172, + "memory(GiB)": 292.62, + "step": 33080, + "train_speed(iter/s)": 0.125091 + }, + { + "acc": 0.73149428, + "epoch": 0.18511087778068275, + "grad_norm": 7.65625, + "learning_rate": 9.950575219243038e-06, + "loss": 1.08311749, + "memory(GiB)": 292.62, + "step": 33100, + "train_speed(iter/s)": 0.125126 + }, + { + "acc": 0.7316679, + "epoch": 0.185222727253662, + "grad_norm": 6.75, + "learning_rate": 9.950445437663349e-06, + "loss": 1.050033, + "memory(GiB)": 292.62, + "step": 33120, + "train_speed(iter/s)": 0.12516 + }, + { + "acc": 0.72929325, + "epoch": 0.18533457672664128, + "grad_norm": 7.625, + "learning_rate": 9.950315486762728e-06, + "loss": 1.09462452, + "memory(GiB)": 292.62, + "step": 33140, + "train_speed(iter/s)": 0.125195 + }, + { + "acc": 0.71788597, + "epoch": 0.18544642619962054, + "grad_norm": 6.15625, + "learning_rate": 9.950185366545621e-06, + "loss": 1.10991077, + "memory(GiB)": 292.62, + "step": 33160, + "train_speed(iter/s)": 0.125226 + }, + { + "acc": 0.74522381, + "epoch": 0.1855582756725998, + "grad_norm": 5.90625, + "learning_rate": 9.950055077016478e-06, + "loss": 1.00419016, + "memory(GiB)": 292.62, + "step": 33180, + "train_speed(iter/s)": 0.125262 + }, + { + "acc": 0.74534068, + "epoch": 0.18567012514557907, + "grad_norm": 5.84375, + "learning_rate": 9.949924618179756e-06, + "loss": 1.00168667, + "memory(GiB)": 292.62, + "step": 33200, + "train_speed(iter/s)": 0.125298 + }, + { + "acc": 0.74985981, + "epoch": 0.18578197461855833, + "grad_norm": 7.125, + "learning_rate": 9.949793990039915e-06, + "loss": 0.96667423, + "memory(GiB)": 292.62, + "step": 33220, + "train_speed(iter/s)": 0.125332 + }, + { + "acc": 0.72508607, + "epoch": 0.1858938240915376, + "grad_norm": 9.0, + "learning_rate": 9.949663192601424e-06, + "loss": 1.0846343, + "memory(GiB)": 292.62, + "step": 33240, + "train_speed(iter/s)": 0.125369 + }, + { + "acc": 0.71804643, + "epoch": 0.18600567356451686, + "grad_norm": 8.3125, + "learning_rate": 9.949532225868758e-06, + "loss": 1.13247414, + "memory(GiB)": 292.62, + "step": 33260, + "train_speed(iter/s)": 0.125403 + }, + { + "acc": 0.71137791, + "epoch": 0.18611752303749612, + "grad_norm": 6.40625, + "learning_rate": 9.949401089846395e-06, + "loss": 1.13845816, + "memory(GiB)": 292.62, + "step": 33280, + "train_speed(iter/s)": 0.12544 + }, + { + "acc": 0.70703244, + "epoch": 0.18622937251047542, + "grad_norm": 6.0625, + "learning_rate": 9.949269784538819e-06, + "loss": 1.17020483, + "memory(GiB)": 292.62, + "step": 33300, + "train_speed(iter/s)": 0.125475 + }, + { + "acc": 0.71887097, + "epoch": 0.18634122198345468, + "grad_norm": 6.40625, + "learning_rate": 9.949138309950526e-06, + "loss": 1.10168867, + "memory(GiB)": 292.62, + "step": 33320, + "train_speed(iter/s)": 0.125508 + }, + { + "acc": 0.73259501, + "epoch": 0.18645307145643394, + "grad_norm": 6.3125, + "learning_rate": 9.949006666086007e-06, + "loss": 1.05920458, + "memory(GiB)": 292.62, + "step": 33340, + "train_speed(iter/s)": 0.125543 + }, + { + "acc": 0.74391994, + "epoch": 0.1865649209294132, + "grad_norm": 7.125, + "learning_rate": 9.948874852949767e-06, + "loss": 0.98577185, + "memory(GiB)": 292.62, + "step": 33360, + "train_speed(iter/s)": 0.125578 + }, + { + "acc": 0.71586623, + "epoch": 0.18667677040239247, + "grad_norm": 4.96875, + "learning_rate": 9.948742870546316e-06, + "loss": 1.11604862, + "memory(GiB)": 292.62, + "step": 33380, + "train_speed(iter/s)": 0.125612 + }, + { + "acc": 0.73087716, + "epoch": 0.18678861987537174, + "grad_norm": 8.8125, + "learning_rate": 9.948610718880165e-06, + "loss": 1.07815113, + "memory(GiB)": 292.62, + "step": 33400, + "train_speed(iter/s)": 0.125647 + }, + { + "acc": 0.72298126, + "epoch": 0.186900469348351, + "grad_norm": 7.40625, + "learning_rate": 9.948478397955838e-06, + "loss": 1.08117952, + "memory(GiB)": 292.62, + "step": 33420, + "train_speed(iter/s)": 0.125683 + }, + { + "acc": 0.74065876, + "epoch": 0.18701231882133026, + "grad_norm": 8.0, + "learning_rate": 9.948345907777856e-06, + "loss": 1.02644396, + "memory(GiB)": 292.62, + "step": 33440, + "train_speed(iter/s)": 0.125719 + }, + { + "acc": 0.72936845, + "epoch": 0.18712416829430953, + "grad_norm": 7.0625, + "learning_rate": 9.948213248350756e-06, + "loss": 1.06598272, + "memory(GiB)": 292.62, + "step": 33460, + "train_speed(iter/s)": 0.12575 + }, + { + "acc": 0.73356709, + "epoch": 0.1872360177672888, + "grad_norm": 6.8125, + "learning_rate": 9.94808041967907e-06, + "loss": 1.06270857, + "memory(GiB)": 292.62, + "step": 33480, + "train_speed(iter/s)": 0.125785 + }, + { + "acc": 0.73922281, + "epoch": 0.18734786724026806, + "grad_norm": 7.5, + "learning_rate": 9.947947421767346e-06, + "loss": 1.00829725, + "memory(GiB)": 292.62, + "step": 33500, + "train_speed(iter/s)": 0.125815 + }, + { + "acc": 0.73097744, + "epoch": 0.18745971671324732, + "grad_norm": 8.5, + "learning_rate": 9.94781425462013e-06, + "loss": 1.07251215, + "memory(GiB)": 292.62, + "step": 33520, + "train_speed(iter/s)": 0.125849 + }, + { + "acc": 0.72638254, + "epoch": 0.18757156618622658, + "grad_norm": 6.09375, + "learning_rate": 9.947680918241975e-06, + "loss": 1.06856613, + "memory(GiB)": 292.62, + "step": 33540, + "train_speed(iter/s)": 0.125884 + }, + { + "acc": 0.72489929, + "epoch": 0.18768341565920585, + "grad_norm": 5.75, + "learning_rate": 9.947547412637446e-06, + "loss": 1.06520395, + "memory(GiB)": 292.62, + "step": 33560, + "train_speed(iter/s)": 0.125918 + }, + { + "acc": 0.72354894, + "epoch": 0.1877952651321851, + "grad_norm": 6.5625, + "learning_rate": 9.947413737811106e-06, + "loss": 1.09992237, + "memory(GiB)": 292.62, + "step": 33580, + "train_speed(iter/s)": 0.125953 + }, + { + "acc": 0.74057784, + "epoch": 0.18790711460516438, + "grad_norm": 8.9375, + "learning_rate": 9.947279893767532e-06, + "loss": 1.01095028, + "memory(GiB)": 292.62, + "step": 33600, + "train_speed(iter/s)": 0.125986 + }, + { + "acc": 0.71902122, + "epoch": 0.18801896407814364, + "grad_norm": 9.875, + "learning_rate": 9.947145880511296e-06, + "loss": 1.13044415, + "memory(GiB)": 292.62, + "step": 33620, + "train_speed(iter/s)": 0.12602 + }, + { + "acc": 0.72730165, + "epoch": 0.1881308135511229, + "grad_norm": 5.6875, + "learning_rate": 9.947011698046982e-06, + "loss": 1.06361923, + "memory(GiB)": 292.62, + "step": 33640, + "train_speed(iter/s)": 0.126056 + }, + { + "acc": 0.72613025, + "epoch": 0.18824266302410217, + "grad_norm": 6.9375, + "learning_rate": 9.946877346379184e-06, + "loss": 1.08143864, + "memory(GiB)": 292.62, + "step": 33660, + "train_speed(iter/s)": 0.126088 + }, + { + "acc": 0.71722574, + "epoch": 0.18835451249708143, + "grad_norm": 6.5, + "learning_rate": 9.946742825512492e-06, + "loss": 1.14383945, + "memory(GiB)": 292.62, + "step": 33680, + "train_speed(iter/s)": 0.126125 + }, + { + "acc": 0.73186474, + "epoch": 0.1884663619700607, + "grad_norm": 6.3125, + "learning_rate": 9.946608135451513e-06, + "loss": 1.05379219, + "memory(GiB)": 292.62, + "step": 33700, + "train_speed(iter/s)": 0.126161 + }, + { + "acc": 0.73269386, + "epoch": 0.18857821144303996, + "grad_norm": 4.5, + "learning_rate": 9.946473276200847e-06, + "loss": 1.03627377, + "memory(GiB)": 292.62, + "step": 33720, + "train_speed(iter/s)": 0.126194 + }, + { + "acc": 0.73144898, + "epoch": 0.18869006091601923, + "grad_norm": 5.40625, + "learning_rate": 9.946338247765113e-06, + "loss": 1.07137709, + "memory(GiB)": 292.62, + "step": 33740, + "train_speed(iter/s)": 0.12623 + }, + { + "acc": 0.72810616, + "epoch": 0.1888019103889985, + "grad_norm": 8.875, + "learning_rate": 9.946203050148924e-06, + "loss": 1.0843483, + "memory(GiB)": 292.62, + "step": 33760, + "train_speed(iter/s)": 0.126265 + }, + { + "acc": 0.73920422, + "epoch": 0.18891375986197775, + "grad_norm": 5.71875, + "learning_rate": 9.94606768335691e-06, + "loss": 1.02806797, + "memory(GiB)": 292.62, + "step": 33780, + "train_speed(iter/s)": 0.126301 + }, + { + "acc": 0.72311974, + "epoch": 0.18902560933495702, + "grad_norm": 7.0625, + "learning_rate": 9.945932147393696e-06, + "loss": 1.11004286, + "memory(GiB)": 292.62, + "step": 33800, + "train_speed(iter/s)": 0.126334 + }, + { + "acc": 0.73135147, + "epoch": 0.18913745880793628, + "grad_norm": 5.1875, + "learning_rate": 9.945796442263918e-06, + "loss": 1.06218119, + "memory(GiB)": 292.62, + "step": 33820, + "train_speed(iter/s)": 0.126367 + }, + { + "acc": 0.71140022, + "epoch": 0.18924930828091555, + "grad_norm": 5.90625, + "learning_rate": 9.94566056797222e-06, + "loss": 1.16563301, + "memory(GiB)": 292.62, + "step": 33840, + "train_speed(iter/s)": 0.126403 + }, + { + "acc": 0.73661218, + "epoch": 0.1893611577538948, + "grad_norm": 7.53125, + "learning_rate": 9.945524524523249e-06, + "loss": 1.03425426, + "memory(GiB)": 292.62, + "step": 33860, + "train_speed(iter/s)": 0.126434 + }, + { + "acc": 0.73111038, + "epoch": 0.18947300722687407, + "grad_norm": 7.09375, + "learning_rate": 9.945388311921657e-06, + "loss": 1.04543619, + "memory(GiB)": 292.62, + "step": 33880, + "train_speed(iter/s)": 0.126468 + }, + { + "acc": 0.71591372, + "epoch": 0.18958485669985334, + "grad_norm": 7.5625, + "learning_rate": 9.945251930172103e-06, + "loss": 1.14441118, + "memory(GiB)": 292.62, + "step": 33900, + "train_speed(iter/s)": 0.126503 + }, + { + "acc": 0.72695689, + "epoch": 0.1896967061728326, + "grad_norm": 7.25, + "learning_rate": 9.945115379279251e-06, + "loss": 1.09482641, + "memory(GiB)": 292.62, + "step": 33920, + "train_speed(iter/s)": 0.126538 + }, + { + "acc": 0.71375403, + "epoch": 0.18980855564581187, + "grad_norm": 7.90625, + "learning_rate": 9.944978659247774e-06, + "loss": 1.16230679, + "memory(GiB)": 292.62, + "step": 33940, + "train_speed(iter/s)": 0.126572 + }, + { + "acc": 0.71396089, + "epoch": 0.18992040511879113, + "grad_norm": 7.75, + "learning_rate": 9.944841770082347e-06, + "loss": 1.15117245, + "memory(GiB)": 292.62, + "step": 33960, + "train_speed(iter/s)": 0.126606 + }, + { + "acc": 0.73901258, + "epoch": 0.1900322545917704, + "grad_norm": 7.71875, + "learning_rate": 9.944704711787651e-06, + "loss": 1.03146038, + "memory(GiB)": 292.62, + "step": 33980, + "train_speed(iter/s)": 0.12664 + }, + { + "acc": 0.73047886, + "epoch": 0.19014410406474966, + "grad_norm": 6.96875, + "learning_rate": 9.944567484368375e-06, + "loss": 1.06553354, + "memory(GiB)": 292.62, + "step": 34000, + "train_speed(iter/s)": 0.126674 + }, + { + "epoch": 0.19014410406474966, + "eval_acc": 0.6943105790213188, + "eval_loss": 1.0666879415512085, + "eval_runtime": 7496.964, + "eval_samples_per_second": 10.042, + "eval_steps_per_second": 10.042, + "step": 34000 + }, + { + "acc": 0.72229681, + "epoch": 0.19025595353772892, + "grad_norm": 8.125, + "learning_rate": 9.944430087829213e-06, + "loss": 1.09108953, + "memory(GiB)": 292.62, + "step": 34020, + "train_speed(iter/s)": 0.123223 + }, + { + "acc": 0.72085671, + "epoch": 0.1903678030107082, + "grad_norm": 7.5625, + "learning_rate": 9.944292522174862e-06, + "loss": 1.07756252, + "memory(GiB)": 292.62, + "step": 34040, + "train_speed(iter/s)": 0.123257 + }, + { + "acc": 0.72899399, + "epoch": 0.19047965248368745, + "grad_norm": 8.0625, + "learning_rate": 9.94415478741003e-06, + "loss": 1.07529984, + "memory(GiB)": 292.62, + "step": 34060, + "train_speed(iter/s)": 0.123291 + }, + { + "acc": 0.71524076, + "epoch": 0.19059150195666671, + "grad_norm": 8.9375, + "learning_rate": 9.944016883539427e-06, + "loss": 1.13347139, + "memory(GiB)": 292.62, + "step": 34080, + "train_speed(iter/s)": 0.123324 + }, + { + "acc": 0.73838916, + "epoch": 0.19070335142964598, + "grad_norm": 6.46875, + "learning_rate": 9.943878810567769e-06, + "loss": 1.02786808, + "memory(GiB)": 292.62, + "step": 34100, + "train_speed(iter/s)": 0.123357 + }, + { + "acc": 0.72717791, + "epoch": 0.19081520090262524, + "grad_norm": 6.59375, + "learning_rate": 9.943740568499778e-06, + "loss": 1.07823553, + "memory(GiB)": 292.62, + "step": 34120, + "train_speed(iter/s)": 0.12339 + }, + { + "acc": 0.73128982, + "epoch": 0.1909270503756045, + "grad_norm": 8.5, + "learning_rate": 9.943602157340185e-06, + "loss": 1.05292521, + "memory(GiB)": 292.62, + "step": 34140, + "train_speed(iter/s)": 0.123423 + }, + { + "acc": 0.72732215, + "epoch": 0.19103889984858377, + "grad_norm": 5.46875, + "learning_rate": 9.943463577093721e-06, + "loss": 1.09166784, + "memory(GiB)": 292.62, + "step": 34160, + "train_speed(iter/s)": 0.123455 + }, + { + "acc": 0.72794609, + "epoch": 0.19115074932156303, + "grad_norm": 5.75, + "learning_rate": 9.943324827765129e-06, + "loss": 1.07745819, + "memory(GiB)": 292.62, + "step": 34180, + "train_speed(iter/s)": 0.123489 + }, + { + "acc": 0.73834534, + "epoch": 0.1912625987945423, + "grad_norm": 6.84375, + "learning_rate": 9.943185909359152e-06, + "loss": 1.02128534, + "memory(GiB)": 292.62, + "step": 34200, + "train_speed(iter/s)": 0.123523 + }, + { + "acc": 0.73525276, + "epoch": 0.19137444826752156, + "grad_norm": 11.125, + "learning_rate": 9.943046821880541e-06, + "loss": 1.04621439, + "memory(GiB)": 292.62, + "step": 34220, + "train_speed(iter/s)": 0.123556 + }, + { + "acc": 0.72636995, + "epoch": 0.19148629774050083, + "grad_norm": 6.0625, + "learning_rate": 9.942907565334058e-06, + "loss": 1.07030306, + "memory(GiB)": 292.62, + "step": 34240, + "train_speed(iter/s)": 0.123591 + }, + { + "acc": 0.74647861, + "epoch": 0.1915981472134801, + "grad_norm": 9.3125, + "learning_rate": 9.942768139724461e-06, + "loss": 0.99474325, + "memory(GiB)": 292.62, + "step": 34260, + "train_speed(iter/s)": 0.123625 + }, + { + "acc": 0.72834754, + "epoch": 0.19170999668645936, + "grad_norm": 7.03125, + "learning_rate": 9.94262854505652e-06, + "loss": 1.05157652, + "memory(GiB)": 292.62, + "step": 34280, + "train_speed(iter/s)": 0.123658 + }, + { + "acc": 0.7370667, + "epoch": 0.19182184615943862, + "grad_norm": 5.84375, + "learning_rate": 9.94248878133501e-06, + "loss": 1.03098059, + "memory(GiB)": 292.62, + "step": 34300, + "train_speed(iter/s)": 0.123691 + }, + { + "acc": 0.72749691, + "epoch": 0.19193369563241788, + "grad_norm": 4.96875, + "learning_rate": 9.942348848564714e-06, + "loss": 1.07607918, + "memory(GiB)": 292.62, + "step": 34320, + "train_speed(iter/s)": 0.123727 + }, + { + "acc": 0.72184882, + "epoch": 0.19204554510539715, + "grad_norm": 7.25, + "learning_rate": 9.942208746750414e-06, + "loss": 1.11319208, + "memory(GiB)": 292.62, + "step": 34340, + "train_speed(iter/s)": 0.123761 + }, + { + "acc": 0.73765945, + "epoch": 0.1921573945783764, + "grad_norm": 6.46875, + "learning_rate": 9.942068475896903e-06, + "loss": 1.02800856, + "memory(GiB)": 292.62, + "step": 34360, + "train_speed(iter/s)": 0.123793 + }, + { + "acc": 0.72997947, + "epoch": 0.19226924405135568, + "grad_norm": 5.8125, + "learning_rate": 9.94192803600898e-06, + "loss": 1.05278225, + "memory(GiB)": 292.62, + "step": 34380, + "train_speed(iter/s)": 0.123827 + }, + { + "acc": 0.73257003, + "epoch": 0.19238109352433494, + "grad_norm": 9.0, + "learning_rate": 9.941787427091447e-06, + "loss": 1.0455204, + "memory(GiB)": 292.62, + "step": 34400, + "train_speed(iter/s)": 0.123862 + }, + { + "acc": 0.73445225, + "epoch": 0.1924929429973142, + "grad_norm": 9.3125, + "learning_rate": 9.941646649149114e-06, + "loss": 1.05964994, + "memory(GiB)": 292.62, + "step": 34420, + "train_speed(iter/s)": 0.123895 + }, + { + "acc": 0.74193449, + "epoch": 0.19260479247029347, + "grad_norm": 6.9375, + "learning_rate": 9.941505702186798e-06, + "loss": 1.01645908, + "memory(GiB)": 292.62, + "step": 34440, + "train_speed(iter/s)": 0.12393 + }, + { + "acc": 0.73094397, + "epoch": 0.19271664194327273, + "grad_norm": 10.375, + "learning_rate": 9.941364586209315e-06, + "loss": 1.06928883, + "memory(GiB)": 292.62, + "step": 34460, + "train_speed(iter/s)": 0.123963 + }, + { + "acc": 0.71750574, + "epoch": 0.192828491416252, + "grad_norm": 5.59375, + "learning_rate": 9.941223301221496e-06, + "loss": 1.08965216, + "memory(GiB)": 292.62, + "step": 34480, + "train_speed(iter/s)": 0.123998 + }, + { + "acc": 0.7243258, + "epoch": 0.19294034088923126, + "grad_norm": 5.96875, + "learning_rate": 9.941081847228172e-06, + "loss": 1.08736706, + "memory(GiB)": 292.62, + "step": 34500, + "train_speed(iter/s)": 0.124032 + }, + { + "acc": 0.72638454, + "epoch": 0.19305219036221052, + "grad_norm": 6.28125, + "learning_rate": 9.940940224234181e-06, + "loss": 1.06632166, + "memory(GiB)": 292.62, + "step": 34520, + "train_speed(iter/s)": 0.124067 + }, + { + "acc": 0.7232923, + "epoch": 0.1931640398351898, + "grad_norm": 6.53125, + "learning_rate": 9.940798432244368e-06, + "loss": 1.09001637, + "memory(GiB)": 292.62, + "step": 34540, + "train_speed(iter/s)": 0.124101 + }, + { + "acc": 0.72374296, + "epoch": 0.19327588930816908, + "grad_norm": 4.9375, + "learning_rate": 9.94065647126358e-06, + "loss": 1.10443697, + "memory(GiB)": 292.62, + "step": 34560, + "train_speed(iter/s)": 0.124134 + }, + { + "acc": 0.73780856, + "epoch": 0.19338773878114834, + "grad_norm": 6.0, + "learning_rate": 9.940514341296673e-06, + "loss": 1.04106274, + "memory(GiB)": 292.62, + "step": 34580, + "train_speed(iter/s)": 0.124168 + }, + { + "acc": 0.71960831, + "epoch": 0.1934995882541276, + "grad_norm": 7.6875, + "learning_rate": 9.940372042348512e-06, + "loss": 1.13772554, + "memory(GiB)": 292.62, + "step": 34600, + "train_speed(iter/s)": 0.124202 + }, + { + "acc": 0.74470878, + "epoch": 0.19361143772710687, + "grad_norm": 7.03125, + "learning_rate": 9.940229574423963e-06, + "loss": 0.99519529, + "memory(GiB)": 292.62, + "step": 34620, + "train_speed(iter/s)": 0.124237 + }, + { + "acc": 0.71843777, + "epoch": 0.19372328720008614, + "grad_norm": 7.375, + "learning_rate": 9.940086937527896e-06, + "loss": 1.12092381, + "memory(GiB)": 292.62, + "step": 34640, + "train_speed(iter/s)": 0.12427 + }, + { + "acc": 0.72613101, + "epoch": 0.1938351366730654, + "grad_norm": 8.4375, + "learning_rate": 9.93994413166519e-06, + "loss": 1.07533121, + "memory(GiB)": 292.62, + "step": 34660, + "train_speed(iter/s)": 0.124304 + }, + { + "acc": 0.72573767, + "epoch": 0.19394698614604466, + "grad_norm": 6.28125, + "learning_rate": 9.939801156840732e-06, + "loss": 1.07003117, + "memory(GiB)": 292.62, + "step": 34680, + "train_speed(iter/s)": 0.124336 + }, + { + "acc": 0.73623695, + "epoch": 0.19405883561902393, + "grad_norm": 6.5625, + "learning_rate": 9.93965801305941e-06, + "loss": 1.02511292, + "memory(GiB)": 292.62, + "step": 34700, + "train_speed(iter/s)": 0.12437 + }, + { + "acc": 0.72488928, + "epoch": 0.1941706850920032, + "grad_norm": 7.25, + "learning_rate": 9.939514700326121e-06, + "loss": 1.09473286, + "memory(GiB)": 292.62, + "step": 34720, + "train_speed(iter/s)": 0.124405 + }, + { + "acc": 0.73750262, + "epoch": 0.19428253456498246, + "grad_norm": 5.875, + "learning_rate": 9.939371218645766e-06, + "loss": 1.03376284, + "memory(GiB)": 292.62, + "step": 34740, + "train_speed(iter/s)": 0.124439 + }, + { + "acc": 0.73781061, + "epoch": 0.19439438403796172, + "grad_norm": 7.84375, + "learning_rate": 9.939227568023252e-06, + "loss": 1.04365301, + "memory(GiB)": 292.62, + "step": 34760, + "train_speed(iter/s)": 0.124472 + }, + { + "acc": 0.73475733, + "epoch": 0.19450623351094098, + "grad_norm": 6.90625, + "learning_rate": 9.939083748463496e-06, + "loss": 1.05877495, + "memory(GiB)": 292.62, + "step": 34780, + "train_speed(iter/s)": 0.124507 + }, + { + "acc": 0.7283586, + "epoch": 0.19461808298392025, + "grad_norm": 7.84375, + "learning_rate": 9.938939759971414e-06, + "loss": 1.08689861, + "memory(GiB)": 292.62, + "step": 34800, + "train_speed(iter/s)": 0.124541 + }, + { + "acc": 0.72371292, + "epoch": 0.1947299324568995, + "grad_norm": 6.9375, + "learning_rate": 9.938795602551929e-06, + "loss": 1.07954683, + "memory(GiB)": 292.62, + "step": 34820, + "train_speed(iter/s)": 0.124573 + }, + { + "acc": 0.73517332, + "epoch": 0.19484178192987878, + "grad_norm": 7.6875, + "learning_rate": 9.938651276209974e-06, + "loss": 1.05803185, + "memory(GiB)": 292.62, + "step": 34840, + "train_speed(iter/s)": 0.124607 + }, + { + "acc": 0.75217395, + "epoch": 0.19495363140285804, + "grad_norm": 10.3125, + "learning_rate": 9.938506780950487e-06, + "loss": 0.96853065, + "memory(GiB)": 292.62, + "step": 34860, + "train_speed(iter/s)": 0.124641 + }, + { + "acc": 0.7280745, + "epoch": 0.1950654808758373, + "grad_norm": 7.53125, + "learning_rate": 9.938362116778409e-06, + "loss": 1.05355749, + "memory(GiB)": 292.62, + "step": 34880, + "train_speed(iter/s)": 0.124676 + }, + { + "acc": 0.71921725, + "epoch": 0.19517733034881657, + "grad_norm": 8.125, + "learning_rate": 9.938217283698685e-06, + "loss": 1.11507816, + "memory(GiB)": 292.62, + "step": 34900, + "train_speed(iter/s)": 0.124711 + }, + { + "acc": 0.7352797, + "epoch": 0.19528917982179583, + "grad_norm": 8.375, + "learning_rate": 9.938072281716273e-06, + "loss": 1.01661015, + "memory(GiB)": 292.62, + "step": 34920, + "train_speed(iter/s)": 0.124743 + }, + { + "acc": 0.73085608, + "epoch": 0.1954010292947751, + "grad_norm": 7.75, + "learning_rate": 9.937927110836132e-06, + "loss": 1.05822935, + "memory(GiB)": 292.62, + "step": 34940, + "train_speed(iter/s)": 0.124774 + }, + { + "acc": 0.72385368, + "epoch": 0.19551287876775436, + "grad_norm": 7.09375, + "learning_rate": 9.937781771063224e-06, + "loss": 1.09725533, + "memory(GiB)": 292.62, + "step": 34960, + "train_speed(iter/s)": 0.124807 + }, + { + "acc": 0.72485805, + "epoch": 0.19562472824073363, + "grad_norm": 9.5, + "learning_rate": 9.937636262402523e-06, + "loss": 1.09380703, + "memory(GiB)": 292.62, + "step": 34980, + "train_speed(iter/s)": 0.12484 + }, + { + "acc": 0.7305685, + "epoch": 0.1957365777137129, + "grad_norm": 5.9375, + "learning_rate": 9.937490584859005e-06, + "loss": 1.0387249, + "memory(GiB)": 292.62, + "step": 35000, + "train_speed(iter/s)": 0.124875 + }, + { + "acc": 0.73045783, + "epoch": 0.19584842718669215, + "grad_norm": 10.5625, + "learning_rate": 9.937344738437653e-06, + "loss": 1.06174335, + "memory(GiB)": 292.62, + "step": 35020, + "train_speed(iter/s)": 0.124909 + }, + { + "acc": 0.74186072, + "epoch": 0.19596027665967142, + "grad_norm": 7.1875, + "learning_rate": 9.937198723143454e-06, + "loss": 1.01491957, + "memory(GiB)": 292.62, + "step": 35040, + "train_speed(iter/s)": 0.124942 + }, + { + "acc": 0.73860521, + "epoch": 0.19607212613265068, + "grad_norm": 9.6875, + "learning_rate": 9.937052538981402e-06, + "loss": 1.03515749, + "memory(GiB)": 292.62, + "step": 35060, + "train_speed(iter/s)": 0.124976 + }, + { + "acc": 0.71970358, + "epoch": 0.19618397560562995, + "grad_norm": 5.75, + "learning_rate": 9.936906185956501e-06, + "loss": 1.11814938, + "memory(GiB)": 292.62, + "step": 35080, + "train_speed(iter/s)": 0.12501 + }, + { + "acc": 0.72225738, + "epoch": 0.1962958250786092, + "grad_norm": 5.78125, + "learning_rate": 9.936759664073752e-06, + "loss": 1.10061932, + "memory(GiB)": 292.62, + "step": 35100, + "train_speed(iter/s)": 0.125041 + }, + { + "acc": 0.71883016, + "epoch": 0.19640767455158847, + "grad_norm": 8.0625, + "learning_rate": 9.93661297333817e-06, + "loss": 1.12891188, + "memory(GiB)": 292.62, + "step": 35120, + "train_speed(iter/s)": 0.125076 + }, + { + "acc": 0.71983128, + "epoch": 0.19651952402456774, + "grad_norm": 7.84375, + "learning_rate": 9.93646611375477e-06, + "loss": 1.10002394, + "memory(GiB)": 292.62, + "step": 35140, + "train_speed(iter/s)": 0.125108 + }, + { + "acc": 0.72679811, + "epoch": 0.196631373497547, + "grad_norm": 6.375, + "learning_rate": 9.936319085328577e-06, + "loss": 1.07026615, + "memory(GiB)": 292.62, + "step": 35160, + "train_speed(iter/s)": 0.125143 + }, + { + "acc": 0.740762, + "epoch": 0.19674322297052627, + "grad_norm": 5.9375, + "learning_rate": 9.936171888064616e-06, + "loss": 1.01768179, + "memory(GiB)": 292.62, + "step": 35180, + "train_speed(iter/s)": 0.125176 + }, + { + "acc": 0.73360877, + "epoch": 0.19685507244350553, + "grad_norm": 5.28125, + "learning_rate": 9.936024521967926e-06, + "loss": 1.03420534, + "memory(GiB)": 292.62, + "step": 35200, + "train_speed(iter/s)": 0.125209 + }, + { + "acc": 0.7386198, + "epoch": 0.1969669219164848, + "grad_norm": 7.21875, + "learning_rate": 9.935876987043545e-06, + "loss": 1.03792982, + "memory(GiB)": 292.62, + "step": 35220, + "train_speed(iter/s)": 0.125242 + }, + { + "acc": 0.72217102, + "epoch": 0.19707877138946406, + "grad_norm": 8.75, + "learning_rate": 9.93572928329652e-06, + "loss": 1.08483734, + "memory(GiB)": 292.62, + "step": 35240, + "train_speed(iter/s)": 0.125276 + }, + { + "acc": 0.72860813, + "epoch": 0.19719062086244332, + "grad_norm": 8.4375, + "learning_rate": 9.935581410731902e-06, + "loss": 1.06498137, + "memory(GiB)": 292.62, + "step": 35260, + "train_speed(iter/s)": 0.125311 + }, + { + "acc": 0.730058, + "epoch": 0.1973024703354226, + "grad_norm": 7.15625, + "learning_rate": 9.935433369354751e-06, + "loss": 1.06765099, + "memory(GiB)": 292.62, + "step": 35280, + "train_speed(iter/s)": 0.125342 + }, + { + "acc": 0.749227, + "epoch": 0.19741431980840185, + "grad_norm": 7.1875, + "learning_rate": 9.935285159170129e-06, + "loss": 0.98815536, + "memory(GiB)": 292.62, + "step": 35300, + "train_speed(iter/s)": 0.125376 + }, + { + "acc": 0.74103851, + "epoch": 0.19752616928138111, + "grad_norm": 8.8125, + "learning_rate": 9.935136780183104e-06, + "loss": 1.0286129, + "memory(GiB)": 292.62, + "step": 35320, + "train_speed(iter/s)": 0.12541 + }, + { + "acc": 0.751753, + "epoch": 0.19763801875436038, + "grad_norm": 7.84375, + "learning_rate": 9.934988232398752e-06, + "loss": 0.97176132, + "memory(GiB)": 292.62, + "step": 35340, + "train_speed(iter/s)": 0.125443 + }, + { + "acc": 0.73528552, + "epoch": 0.19774986822733964, + "grad_norm": 6.0625, + "learning_rate": 9.934839515822155e-06, + "loss": 1.04512844, + "memory(GiB)": 292.62, + "step": 35360, + "train_speed(iter/s)": 0.125475 + }, + { + "acc": 0.72735162, + "epoch": 0.1978617177003189, + "grad_norm": 7.375, + "learning_rate": 9.934690630458399e-06, + "loss": 1.09983501, + "memory(GiB)": 292.62, + "step": 35380, + "train_speed(iter/s)": 0.125508 + }, + { + "acc": 0.72125869, + "epoch": 0.19797356717329817, + "grad_norm": 10.0, + "learning_rate": 9.934541576312576e-06, + "loss": 1.10026674, + "memory(GiB)": 292.62, + "step": 35400, + "train_speed(iter/s)": 0.125539 + }, + { + "acc": 0.74574218, + "epoch": 0.19808541664627743, + "grad_norm": 7.1875, + "learning_rate": 9.934392353389782e-06, + "loss": 1.00551739, + "memory(GiB)": 292.62, + "step": 35420, + "train_speed(iter/s)": 0.12557 + }, + { + "acc": 0.73325591, + "epoch": 0.1981972661192567, + "grad_norm": 7.96875, + "learning_rate": 9.934242961695124e-06, + "loss": 1.06120443, + "memory(GiB)": 292.62, + "step": 35440, + "train_speed(iter/s)": 0.125603 + }, + { + "acc": 0.71640496, + "epoch": 0.19830911559223596, + "grad_norm": 11.8125, + "learning_rate": 9.934093401233711e-06, + "loss": 1.13173981, + "memory(GiB)": 292.62, + "step": 35460, + "train_speed(iter/s)": 0.125635 + }, + { + "acc": 0.72095232, + "epoch": 0.19842096506521523, + "grad_norm": 9.1875, + "learning_rate": 9.933943672010657e-06, + "loss": 1.10546684, + "memory(GiB)": 292.62, + "step": 35480, + "train_speed(iter/s)": 0.125667 + }, + { + "acc": 0.72474065, + "epoch": 0.1985328145381945, + "grad_norm": 7.5, + "learning_rate": 9.933793774031086e-06, + "loss": 1.09442253, + "memory(GiB)": 292.62, + "step": 35500, + "train_speed(iter/s)": 0.125701 + }, + { + "acc": 0.72696252, + "epoch": 0.19864466401117375, + "grad_norm": 6.75, + "learning_rate": 9.933643707300123e-06, + "loss": 1.09186516, + "memory(GiB)": 292.62, + "step": 35520, + "train_speed(iter/s)": 0.125733 + }, + { + "acc": 0.73547482, + "epoch": 0.19875651348415302, + "grad_norm": 5.21875, + "learning_rate": 9.933493471822901e-06, + "loss": 1.0469636, + "memory(GiB)": 292.62, + "step": 35540, + "train_speed(iter/s)": 0.125765 + }, + { + "acc": 0.72770734, + "epoch": 0.19886836295713228, + "grad_norm": 6.53125, + "learning_rate": 9.933343067604559e-06, + "loss": 1.05081959, + "memory(GiB)": 292.62, + "step": 35560, + "train_speed(iter/s)": 0.125798 + }, + { + "acc": 0.72778893, + "epoch": 0.19898021243011155, + "grad_norm": 6.09375, + "learning_rate": 9.93319249465024e-06, + "loss": 1.08496513, + "memory(GiB)": 292.62, + "step": 35580, + "train_speed(iter/s)": 0.125827 + }, + { + "acc": 0.7163249, + "epoch": 0.1990920619030908, + "grad_norm": 7.3125, + "learning_rate": 9.933041752965094e-06, + "loss": 1.12631931, + "memory(GiB)": 292.62, + "step": 35600, + "train_speed(iter/s)": 0.12586 + }, + { + "acc": 0.71689973, + "epoch": 0.19920391137607008, + "grad_norm": 7.125, + "learning_rate": 9.93289084255428e-06, + "loss": 1.11585617, + "memory(GiB)": 292.62, + "step": 35620, + "train_speed(iter/s)": 0.125891 + }, + { + "acc": 0.73153296, + "epoch": 0.19931576084904934, + "grad_norm": 10.125, + "learning_rate": 9.932739763422956e-06, + "loss": 1.06252937, + "memory(GiB)": 292.62, + "step": 35640, + "train_speed(iter/s)": 0.125924 + }, + { + "acc": 0.71912766, + "epoch": 0.1994276103220286, + "grad_norm": 4.875, + "learning_rate": 9.932588515576291e-06, + "loss": 1.1244482, + "memory(GiB)": 292.62, + "step": 35660, + "train_speed(iter/s)": 0.125957 + }, + { + "acc": 0.73449965, + "epoch": 0.19953945979500787, + "grad_norm": 8.3125, + "learning_rate": 9.932437099019459e-06, + "loss": 1.05021515, + "memory(GiB)": 292.62, + "step": 35680, + "train_speed(iter/s)": 0.12599 + }, + { + "acc": 0.71933627, + "epoch": 0.19965130926798713, + "grad_norm": 8.8125, + "learning_rate": 9.932285513757636e-06, + "loss": 1.12380571, + "memory(GiB)": 292.62, + "step": 35700, + "train_speed(iter/s)": 0.126022 + }, + { + "acc": 0.73789778, + "epoch": 0.1997631587409664, + "grad_norm": 10.1875, + "learning_rate": 9.93213375979601e-06, + "loss": 1.03607435, + "memory(GiB)": 292.62, + "step": 35720, + "train_speed(iter/s)": 0.126054 + }, + { + "acc": 0.73706684, + "epoch": 0.19987500821394566, + "grad_norm": 8.5625, + "learning_rate": 9.931981837139767e-06, + "loss": 1.03457584, + "memory(GiB)": 292.62, + "step": 35740, + "train_speed(iter/s)": 0.126087 + }, + { + "acc": 0.72807531, + "epoch": 0.19998685768692492, + "grad_norm": 4.875, + "learning_rate": 9.93182974579411e-06, + "loss": 1.07055244, + "memory(GiB)": 292.62, + "step": 35760, + "train_speed(iter/s)": 0.12612 + }, + { + "acc": 0.74605417, + "epoch": 0.2000987071599042, + "grad_norm": 5.21875, + "learning_rate": 9.931677485764238e-06, + "loss": 1.00748291, + "memory(GiB)": 292.62, + "step": 35780, + "train_speed(iter/s)": 0.126153 + }, + { + "acc": 0.74407864, + "epoch": 0.20021055663288345, + "grad_norm": 6.59375, + "learning_rate": 9.931525057055356e-06, + "loss": 1.00521784, + "memory(GiB)": 292.62, + "step": 35800, + "train_speed(iter/s)": 0.126185 + }, + { + "acc": 0.72722907, + "epoch": 0.20032240610586274, + "grad_norm": 7.4375, + "learning_rate": 9.93137245967268e-06, + "loss": 1.09799633, + "memory(GiB)": 292.62, + "step": 35820, + "train_speed(iter/s)": 0.126216 + }, + { + "acc": 0.73737307, + "epoch": 0.200434255578842, + "grad_norm": 4.0625, + "learning_rate": 9.931219693621427e-06, + "loss": 1.05108318, + "memory(GiB)": 292.62, + "step": 35840, + "train_speed(iter/s)": 0.126247 + }, + { + "acc": 0.71777291, + "epoch": 0.20054610505182127, + "grad_norm": 7.65625, + "learning_rate": 9.931066758906827e-06, + "loss": 1.12550392, + "memory(GiB)": 292.62, + "step": 35860, + "train_speed(iter/s)": 0.126279 + }, + { + "acc": 0.72295794, + "epoch": 0.20065795452480054, + "grad_norm": 7.4375, + "learning_rate": 9.930913655534106e-06, + "loss": 1.09244671, + "memory(GiB)": 292.62, + "step": 35880, + "train_speed(iter/s)": 0.12631 + }, + { + "acc": 0.73785934, + "epoch": 0.2007698039977798, + "grad_norm": 6.3125, + "learning_rate": 9.930760383508502e-06, + "loss": 1.04915304, + "memory(GiB)": 292.62, + "step": 35900, + "train_speed(iter/s)": 0.126342 + }, + { + "acc": 0.71939511, + "epoch": 0.20088165347075906, + "grad_norm": 6.78125, + "learning_rate": 9.93060694283526e-06, + "loss": 1.08806906, + "memory(GiB)": 292.62, + "step": 35920, + "train_speed(iter/s)": 0.126375 + }, + { + "acc": 0.70017457, + "epoch": 0.20099350294373833, + "grad_norm": 8.5625, + "learning_rate": 9.930453333519625e-06, + "loss": 1.21328106, + "memory(GiB)": 292.62, + "step": 35940, + "train_speed(iter/s)": 0.126407 + }, + { + "acc": 0.72505121, + "epoch": 0.2011053524167176, + "grad_norm": 7.6875, + "learning_rate": 9.930299555566851e-06, + "loss": 1.09463902, + "memory(GiB)": 292.62, + "step": 35960, + "train_speed(iter/s)": 0.126439 + }, + { + "acc": 0.72071681, + "epoch": 0.20121720188969686, + "grad_norm": 5.46875, + "learning_rate": 9.9301456089822e-06, + "loss": 1.11387882, + "memory(GiB)": 292.62, + "step": 35980, + "train_speed(iter/s)": 0.126469 + }, + { + "acc": 0.71511631, + "epoch": 0.20132905136267612, + "grad_norm": 5.40625, + "learning_rate": 9.929991493770937e-06, + "loss": 1.13221512, + "memory(GiB)": 292.62, + "step": 36000, + "train_speed(iter/s)": 0.126502 + }, + { + "epoch": 0.20132905136267612, + "eval_acc": 0.6947627784418055, + "eval_loss": 1.0640994310379028, + "eval_runtime": 7504.9746, + "eval_samples_per_second": 10.031, + "eval_steps_per_second": 10.031, + "step": 36000 + }, + { + "acc": 0.72528038, + "epoch": 0.20144090083565538, + "grad_norm": 6.28125, + "learning_rate": 9.92983720993833e-06, + "loss": 1.08362875, + "memory(GiB)": 292.62, + "step": 36020, + "train_speed(iter/s)": 0.12324 + }, + { + "acc": 0.74909573, + "epoch": 0.20155275030863465, + "grad_norm": 8.6875, + "learning_rate": 9.929682757489659e-06, + "loss": 0.98451357, + "memory(GiB)": 292.62, + "step": 36040, + "train_speed(iter/s)": 0.123272 + }, + { + "acc": 0.74019957, + "epoch": 0.2016645997816139, + "grad_norm": 10.75, + "learning_rate": 9.929528136430206e-06, + "loss": 1.0139637, + "memory(GiB)": 292.62, + "step": 36060, + "train_speed(iter/s)": 0.123305 + }, + { + "acc": 0.71950426, + "epoch": 0.20177644925459318, + "grad_norm": 6.125, + "learning_rate": 9.929373346765261e-06, + "loss": 1.13088989, + "memory(GiB)": 292.62, + "step": 36080, + "train_speed(iter/s)": 0.123337 + }, + { + "acc": 0.73272839, + "epoch": 0.20188829872757244, + "grad_norm": 5.09375, + "learning_rate": 9.929218388500115e-06, + "loss": 1.05126009, + "memory(GiB)": 292.62, + "step": 36100, + "train_speed(iter/s)": 0.12337 + }, + { + "acc": 0.7296978, + "epoch": 0.2020001482005517, + "grad_norm": 6.125, + "learning_rate": 9.929063261640071e-06, + "loss": 1.09263115, + "memory(GiB)": 292.62, + "step": 36120, + "train_speed(iter/s)": 0.123403 + }, + { + "acc": 0.73002186, + "epoch": 0.20211199767353097, + "grad_norm": 6.3125, + "learning_rate": 9.928907966190434e-06, + "loss": 1.05923595, + "memory(GiB)": 292.62, + "step": 36140, + "train_speed(iter/s)": 0.123434 + }, + { + "acc": 0.73053093, + "epoch": 0.20222384714651023, + "grad_norm": 7.25, + "learning_rate": 9.928752502156516e-06, + "loss": 1.05871038, + "memory(GiB)": 292.62, + "step": 36160, + "train_speed(iter/s)": 0.123468 + }, + { + "acc": 0.73554606, + "epoch": 0.2023356966194895, + "grad_norm": 6.15625, + "learning_rate": 9.928596869543631e-06, + "loss": 1.02133379, + "memory(GiB)": 292.62, + "step": 36180, + "train_speed(iter/s)": 0.123502 + }, + { + "acc": 0.71657538, + "epoch": 0.20244754609246876, + "grad_norm": 7.15625, + "learning_rate": 9.928441068357107e-06, + "loss": 1.1336627, + "memory(GiB)": 292.62, + "step": 36200, + "train_speed(iter/s)": 0.123534 + }, + { + "acc": 0.7409719, + "epoch": 0.20255939556544802, + "grad_norm": 9.1875, + "learning_rate": 9.92828509860227e-06, + "loss": 1.01435251, + "memory(GiB)": 292.62, + "step": 36220, + "train_speed(iter/s)": 0.123565 + }, + { + "acc": 0.740416, + "epoch": 0.2026712450384273, + "grad_norm": 7.28125, + "learning_rate": 9.928128960284458e-06, + "loss": 1.01972475, + "memory(GiB)": 292.62, + "step": 36240, + "train_speed(iter/s)": 0.123598 + }, + { + "acc": 0.73114934, + "epoch": 0.20278309451140655, + "grad_norm": 4.375, + "learning_rate": 9.927972653409005e-06, + "loss": 1.07634888, + "memory(GiB)": 292.62, + "step": 36260, + "train_speed(iter/s)": 0.123629 + }, + { + "acc": 0.71443906, + "epoch": 0.20289494398438582, + "grad_norm": 5.28125, + "learning_rate": 9.927816177981264e-06, + "loss": 1.12906675, + "memory(GiB)": 302.58, + "step": 36280, + "train_speed(iter/s)": 0.123657 + }, + { + "acc": 0.7285955, + "epoch": 0.20300679345736508, + "grad_norm": 4.5, + "learning_rate": 9.927659534006585e-06, + "loss": 1.08396473, + "memory(GiB)": 302.58, + "step": 36300, + "train_speed(iter/s)": 0.123689 + }, + { + "acc": 0.73454528, + "epoch": 0.20311864293034435, + "grad_norm": 5.5, + "learning_rate": 9.927502721490321e-06, + "loss": 1.05233564, + "memory(GiB)": 302.58, + "step": 36320, + "train_speed(iter/s)": 0.123719 + }, + { + "acc": 0.72299676, + "epoch": 0.2032304924033236, + "grad_norm": 9.0, + "learning_rate": 9.92734574043784e-06, + "loss": 1.08630047, + "memory(GiB)": 302.58, + "step": 36340, + "train_speed(iter/s)": 0.123749 + }, + { + "acc": 0.74191155, + "epoch": 0.20334234187630287, + "grad_norm": 9.0, + "learning_rate": 9.927188590854513e-06, + "loss": 1.02212114, + "memory(GiB)": 302.58, + "step": 36360, + "train_speed(iter/s)": 0.123781 + }, + { + "acc": 0.70997276, + "epoch": 0.20345419134928214, + "grad_norm": 10.375, + "learning_rate": 9.92703127274571e-06, + "loss": 1.14818897, + "memory(GiB)": 302.58, + "step": 36380, + "train_speed(iter/s)": 0.123813 + }, + { + "acc": 0.72859001, + "epoch": 0.2035660408222614, + "grad_norm": 7.25, + "learning_rate": 9.926873786116817e-06, + "loss": 1.07237835, + "memory(GiB)": 302.58, + "step": 36400, + "train_speed(iter/s)": 0.123844 + }, + { + "acc": 0.73447542, + "epoch": 0.20367789029524067, + "grad_norm": 7.15625, + "learning_rate": 9.926716130973215e-06, + "loss": 1.03777151, + "memory(GiB)": 302.58, + "step": 36420, + "train_speed(iter/s)": 0.123877 + }, + { + "acc": 0.73935866, + "epoch": 0.20378973976821993, + "grad_norm": 9.25, + "learning_rate": 9.926558307320302e-06, + "loss": 1.01136227, + "memory(GiB)": 302.58, + "step": 36440, + "train_speed(iter/s)": 0.12391 + }, + { + "acc": 0.73298516, + "epoch": 0.2039015892411992, + "grad_norm": 5.875, + "learning_rate": 9.92640031516347e-06, + "loss": 1.04988966, + "memory(GiB)": 302.58, + "step": 36460, + "train_speed(iter/s)": 0.123942 + }, + { + "acc": 0.71470714, + "epoch": 0.20401343871417846, + "grad_norm": 7.65625, + "learning_rate": 9.926242154508128e-06, + "loss": 1.13869715, + "memory(GiB)": 302.58, + "step": 36480, + "train_speed(iter/s)": 0.123974 + }, + { + "acc": 0.73720446, + "epoch": 0.20412528818715772, + "grad_norm": 6.8125, + "learning_rate": 9.926083825359683e-06, + "loss": 1.02049179, + "memory(GiB)": 302.58, + "step": 36500, + "train_speed(iter/s)": 0.124006 + }, + { + "acc": 0.72402277, + "epoch": 0.20423713766013699, + "grad_norm": 8.4375, + "learning_rate": 9.925925327723552e-06, + "loss": 1.09104013, + "memory(GiB)": 302.58, + "step": 36520, + "train_speed(iter/s)": 0.124035 + }, + { + "acc": 0.72694788, + "epoch": 0.20434898713311625, + "grad_norm": 7.75, + "learning_rate": 9.925766661605154e-06, + "loss": 1.07047892, + "memory(GiB)": 302.58, + "step": 36540, + "train_speed(iter/s)": 0.124067 + }, + { + "acc": 0.71943178, + "epoch": 0.20446083660609551, + "grad_norm": 6.3125, + "learning_rate": 9.925607827009916e-06, + "loss": 1.09464588, + "memory(GiB)": 302.58, + "step": 36560, + "train_speed(iter/s)": 0.124098 + }, + { + "acc": 0.72098284, + "epoch": 0.20457268607907478, + "grad_norm": 7.75, + "learning_rate": 9.925448823943274e-06, + "loss": 1.11805906, + "memory(GiB)": 302.58, + "step": 36580, + "train_speed(iter/s)": 0.124128 + }, + { + "acc": 0.73729124, + "epoch": 0.20468453555205404, + "grad_norm": 6.59375, + "learning_rate": 9.925289652410663e-06, + "loss": 1.05609655, + "memory(GiB)": 302.58, + "step": 36600, + "train_speed(iter/s)": 0.124159 + }, + { + "acc": 0.72714677, + "epoch": 0.2047963850250333, + "grad_norm": 8.6875, + "learning_rate": 9.925130312417528e-06, + "loss": 1.07555723, + "memory(GiB)": 302.58, + "step": 36620, + "train_speed(iter/s)": 0.124192 + }, + { + "acc": 0.7349227, + "epoch": 0.20490823449801257, + "grad_norm": 10.8125, + "learning_rate": 9.924970803969318e-06, + "loss": 1.03370924, + "memory(GiB)": 302.58, + "step": 36640, + "train_speed(iter/s)": 0.124222 + }, + { + "acc": 0.73593745, + "epoch": 0.20502008397099183, + "grad_norm": 7.5, + "learning_rate": 9.924811127071492e-06, + "loss": 1.05241737, + "memory(GiB)": 302.58, + "step": 36660, + "train_speed(iter/s)": 0.124252 + }, + { + "acc": 0.73933578, + "epoch": 0.2051319334439711, + "grad_norm": 8.3125, + "learning_rate": 9.924651281729509e-06, + "loss": 1.0254673, + "memory(GiB)": 302.58, + "step": 36680, + "train_speed(iter/s)": 0.124285 + }, + { + "acc": 0.70746541, + "epoch": 0.20524378291695036, + "grad_norm": 6.53125, + "learning_rate": 9.924491267948836e-06, + "loss": 1.17335205, + "memory(GiB)": 302.58, + "step": 36700, + "train_speed(iter/s)": 0.124317 + }, + { + "acc": 0.70673327, + "epoch": 0.20535563238992963, + "grad_norm": 6.875, + "learning_rate": 9.924331085734946e-06, + "loss": 1.17096653, + "memory(GiB)": 302.58, + "step": 36720, + "train_speed(iter/s)": 0.124349 + }, + { + "acc": 0.74159555, + "epoch": 0.2054674818629089, + "grad_norm": 4.65625, + "learning_rate": 9.924170735093318e-06, + "loss": 1.01307468, + "memory(GiB)": 302.58, + "step": 36740, + "train_speed(iter/s)": 0.124381 + }, + { + "acc": 0.72431068, + "epoch": 0.20557933133588815, + "grad_norm": 7.125, + "learning_rate": 9.924010216029439e-06, + "loss": 1.08896856, + "memory(GiB)": 302.58, + "step": 36760, + "train_speed(iter/s)": 0.124412 + }, + { + "acc": 0.73318472, + "epoch": 0.20569118080886742, + "grad_norm": 7.65625, + "learning_rate": 9.923849528548795e-06, + "loss": 1.04260864, + "memory(GiB)": 302.58, + "step": 36780, + "train_speed(iter/s)": 0.124443 + }, + { + "acc": 0.7388896, + "epoch": 0.20580303028184668, + "grad_norm": 9.1875, + "learning_rate": 9.923688672656883e-06, + "loss": 1.02929144, + "memory(GiB)": 302.58, + "step": 36800, + "train_speed(iter/s)": 0.124473 + }, + { + "acc": 0.7172668, + "epoch": 0.20591487975482595, + "grad_norm": 5.84375, + "learning_rate": 9.923527648359207e-06, + "loss": 1.12054691, + "memory(GiB)": 302.58, + "step": 36820, + "train_speed(iter/s)": 0.124504 + }, + { + "acc": 0.75364728, + "epoch": 0.2060267292278052, + "grad_norm": 5.71875, + "learning_rate": 9.923366455661275e-06, + "loss": 0.95606718, + "memory(GiB)": 302.58, + "step": 36840, + "train_speed(iter/s)": 0.124535 + }, + { + "acc": 0.72174006, + "epoch": 0.20613857870078447, + "grad_norm": 4.0, + "learning_rate": 9.923205094568596e-06, + "loss": 1.1222537, + "memory(GiB)": 302.58, + "step": 36860, + "train_speed(iter/s)": 0.124563 + }, + { + "acc": 0.74438648, + "epoch": 0.20625042817376374, + "grad_norm": 9.4375, + "learning_rate": 9.923043565086695e-06, + "loss": 1.02054958, + "memory(GiB)": 302.58, + "step": 36880, + "train_speed(iter/s)": 0.124595 + }, + { + "acc": 0.7336246, + "epoch": 0.206362277646743, + "grad_norm": 7.0625, + "learning_rate": 9.92288186722109e-06, + "loss": 1.08010244, + "memory(GiB)": 302.58, + "step": 36900, + "train_speed(iter/s)": 0.124628 + }, + { + "acc": 0.72391944, + "epoch": 0.20647412711972227, + "grad_norm": 8.375, + "learning_rate": 9.922720000977316e-06, + "loss": 1.10592213, + "memory(GiB)": 302.58, + "step": 36920, + "train_speed(iter/s)": 0.124659 + }, + { + "acc": 0.72545538, + "epoch": 0.20658597659270153, + "grad_norm": 7.46875, + "learning_rate": 9.922557966360909e-06, + "loss": 1.09204063, + "memory(GiB)": 302.58, + "step": 36940, + "train_speed(iter/s)": 0.124686 + }, + { + "acc": 0.73942437, + "epoch": 0.2066978260656808, + "grad_norm": 6.90625, + "learning_rate": 9.92239576337741e-06, + "loss": 1.02128992, + "memory(GiB)": 302.58, + "step": 36960, + "train_speed(iter/s)": 0.124718 + }, + { + "acc": 0.73939767, + "epoch": 0.20680967553866006, + "grad_norm": 6.46875, + "learning_rate": 9.922233392032369e-06, + "loss": 1.02538548, + "memory(GiB)": 302.58, + "step": 36980, + "train_speed(iter/s)": 0.124748 + }, + { + "acc": 0.71728773, + "epoch": 0.20692152501163932, + "grad_norm": 8.375, + "learning_rate": 9.922070852331337e-06, + "loss": 1.1119175, + "memory(GiB)": 302.58, + "step": 37000, + "train_speed(iter/s)": 0.124776 + }, + { + "acc": 0.73326759, + "epoch": 0.2070333744846186, + "grad_norm": 8.5625, + "learning_rate": 9.921908144279875e-06, + "loss": 1.03788738, + "memory(GiB)": 302.58, + "step": 37020, + "train_speed(iter/s)": 0.124808 + }, + { + "acc": 0.71790552, + "epoch": 0.20714522395759785, + "grad_norm": 9.25, + "learning_rate": 9.921745267883546e-06, + "loss": 1.14412317, + "memory(GiB)": 302.58, + "step": 37040, + "train_speed(iter/s)": 0.124839 + }, + { + "acc": 0.73500938, + "epoch": 0.20725707343057712, + "grad_norm": 4.53125, + "learning_rate": 9.921582223147924e-06, + "loss": 1.03399467, + "memory(GiB)": 302.58, + "step": 37060, + "train_speed(iter/s)": 0.124869 + }, + { + "acc": 0.72911105, + "epoch": 0.2073689229035564, + "grad_norm": 6.6875, + "learning_rate": 9.921419010078582e-06, + "loss": 1.06778917, + "memory(GiB)": 302.58, + "step": 37080, + "train_speed(iter/s)": 0.124902 + }, + { + "acc": 0.73922887, + "epoch": 0.20748077237653567, + "grad_norm": 8.9375, + "learning_rate": 9.921255628681106e-06, + "loss": 1.02770538, + "memory(GiB)": 302.58, + "step": 37100, + "train_speed(iter/s)": 0.124933 + }, + { + "acc": 0.75508609, + "epoch": 0.20759262184951494, + "grad_norm": 5.84375, + "learning_rate": 9.921092078961083e-06, + "loss": 0.95294065, + "memory(GiB)": 302.58, + "step": 37120, + "train_speed(iter/s)": 0.124966 + }, + { + "acc": 0.7405581, + "epoch": 0.2077044713224942, + "grad_norm": 9.875, + "learning_rate": 9.920928360924105e-06, + "loss": 1.01025686, + "memory(GiB)": 302.58, + "step": 37140, + "train_speed(iter/s)": 0.124998 + }, + { + "acc": 0.72006068, + "epoch": 0.20781632079547346, + "grad_norm": 4.875, + "learning_rate": 9.920764474575775e-06, + "loss": 1.10947952, + "memory(GiB)": 302.58, + "step": 37160, + "train_speed(iter/s)": 0.12503 + }, + { + "acc": 0.73082943, + "epoch": 0.20792817026845273, + "grad_norm": 5.6875, + "learning_rate": 9.920600419921696e-06, + "loss": 1.04113111, + "memory(GiB)": 302.58, + "step": 37180, + "train_speed(iter/s)": 0.12506 + }, + { + "acc": 0.72235208, + "epoch": 0.208040019741432, + "grad_norm": 7.78125, + "learning_rate": 9.920436196967479e-06, + "loss": 1.09768972, + "memory(GiB)": 302.58, + "step": 37200, + "train_speed(iter/s)": 0.125091 + }, + { + "acc": 0.7281487, + "epoch": 0.20815186921441126, + "grad_norm": 6.78125, + "learning_rate": 9.920271805718742e-06, + "loss": 1.05534382, + "memory(GiB)": 302.58, + "step": 37220, + "train_speed(iter/s)": 0.125122 + }, + { + "acc": 0.7342999, + "epoch": 0.20826371868739052, + "grad_norm": 7.75, + "learning_rate": 9.92010724618111e-06, + "loss": 1.0354763, + "memory(GiB)": 302.58, + "step": 37240, + "train_speed(iter/s)": 0.125153 + }, + { + "acc": 0.72999954, + "epoch": 0.20837556816036978, + "grad_norm": 6.5, + "learning_rate": 9.919942518360206e-06, + "loss": 1.06377068, + "memory(GiB)": 302.58, + "step": 37260, + "train_speed(iter/s)": 0.125186 + }, + { + "acc": 0.7171598, + "epoch": 0.20848741763334905, + "grad_norm": 8.125, + "learning_rate": 9.919777622261667e-06, + "loss": 1.14568233, + "memory(GiB)": 302.58, + "step": 37280, + "train_speed(iter/s)": 0.125217 + }, + { + "acc": 0.73484883, + "epoch": 0.2085992671063283, + "grad_norm": 7.71875, + "learning_rate": 9.919612557891134e-06, + "loss": 1.02681389, + "memory(GiB)": 302.58, + "step": 37300, + "train_speed(iter/s)": 0.125247 + }, + { + "acc": 0.73193903, + "epoch": 0.20871111657930758, + "grad_norm": 8.875, + "learning_rate": 9.919447325254252e-06, + "loss": 1.0425271, + "memory(GiB)": 302.58, + "step": 37320, + "train_speed(iter/s)": 0.125277 + }, + { + "acc": 0.72661633, + "epoch": 0.20882296605228684, + "grad_norm": 8.3125, + "learning_rate": 9.919281924356672e-06, + "loss": 1.07011108, + "memory(GiB)": 302.58, + "step": 37340, + "train_speed(iter/s)": 0.125309 + }, + { + "acc": 0.73950415, + "epoch": 0.2089348155252661, + "grad_norm": 9.3125, + "learning_rate": 9.919116355204053e-06, + "loss": 1.02495213, + "memory(GiB)": 302.58, + "step": 37360, + "train_speed(iter/s)": 0.125341 + }, + { + "acc": 0.7174839, + "epoch": 0.20904666499824537, + "grad_norm": 7.25, + "learning_rate": 9.918950617802055e-06, + "loss": 1.10758171, + "memory(GiB)": 302.58, + "step": 37380, + "train_speed(iter/s)": 0.125372 + }, + { + "acc": 0.72207546, + "epoch": 0.20915851447122463, + "grad_norm": 6.625, + "learning_rate": 9.918784712156349e-06, + "loss": 1.09948759, + "memory(GiB)": 302.58, + "step": 37400, + "train_speed(iter/s)": 0.125404 + }, + { + "acc": 0.73809972, + "epoch": 0.2092703639442039, + "grad_norm": 7.21875, + "learning_rate": 9.918618638272609e-06, + "loss": 1.03296528, + "memory(GiB)": 302.58, + "step": 37420, + "train_speed(iter/s)": 0.125435 + }, + { + "acc": 0.71335449, + "epoch": 0.20938221341718316, + "grad_norm": 8.4375, + "learning_rate": 9.918452396156514e-06, + "loss": 1.13535252, + "memory(GiB)": 302.58, + "step": 37440, + "train_speed(iter/s)": 0.125466 + }, + { + "acc": 0.75486722, + "epoch": 0.20949406289016242, + "grad_norm": 6.6875, + "learning_rate": 9.918285985813751e-06, + "loss": 0.95838575, + "memory(GiB)": 302.58, + "step": 37460, + "train_speed(iter/s)": 0.125498 + }, + { + "acc": 0.73636255, + "epoch": 0.2096059123631417, + "grad_norm": 8.0625, + "learning_rate": 9.918119407250013e-06, + "loss": 1.0359931, + "memory(GiB)": 302.58, + "step": 37480, + "train_speed(iter/s)": 0.125529 + }, + { + "acc": 0.73335962, + "epoch": 0.20971776183612095, + "grad_norm": 8.625, + "learning_rate": 9.917952660470996e-06, + "loss": 1.05187054, + "memory(GiB)": 302.58, + "step": 37500, + "train_speed(iter/s)": 0.125559 + }, + { + "acc": 0.73209476, + "epoch": 0.20982961130910022, + "grad_norm": 5.875, + "learning_rate": 9.917785745482403e-06, + "loss": 1.06972151, + "memory(GiB)": 302.58, + "step": 37520, + "train_speed(iter/s)": 0.12559 + }, + { + "acc": 0.73595185, + "epoch": 0.20994146078207948, + "grad_norm": 9.6875, + "learning_rate": 9.917618662289944e-06, + "loss": 1.03642979, + "memory(GiB)": 302.58, + "step": 37540, + "train_speed(iter/s)": 0.125621 + }, + { + "acc": 0.74010386, + "epoch": 0.21005331025505874, + "grad_norm": 11.25, + "learning_rate": 9.917451410899334e-06, + "loss": 1.02974701, + "memory(GiB)": 302.58, + "step": 37560, + "train_speed(iter/s)": 0.125652 + }, + { + "acc": 0.72182155, + "epoch": 0.210165159728038, + "grad_norm": 7.65625, + "learning_rate": 9.917283991316291e-06, + "loss": 1.11201582, + "memory(GiB)": 302.58, + "step": 37580, + "train_speed(iter/s)": 0.125684 + }, + { + "acc": 0.72659845, + "epoch": 0.21027700920101727, + "grad_norm": 5.125, + "learning_rate": 9.917116403546544e-06, + "loss": 1.09007521, + "memory(GiB)": 302.58, + "step": 37600, + "train_speed(iter/s)": 0.125716 + }, + { + "acc": 0.72698951, + "epoch": 0.21038885867399654, + "grad_norm": 6.3125, + "learning_rate": 9.916948647595827e-06, + "loss": 1.08806219, + "memory(GiB)": 302.58, + "step": 37620, + "train_speed(iter/s)": 0.125747 + }, + { + "acc": 0.73278456, + "epoch": 0.2105007081469758, + "grad_norm": 8.75, + "learning_rate": 9.916780723469871e-06, + "loss": 1.04789619, + "memory(GiB)": 302.58, + "step": 37640, + "train_speed(iter/s)": 0.125776 + }, + { + "acc": 0.73043485, + "epoch": 0.21061255761995507, + "grad_norm": 7.1875, + "learning_rate": 9.916612631174428e-06, + "loss": 1.08459396, + "memory(GiB)": 302.58, + "step": 37660, + "train_speed(iter/s)": 0.125805 + }, + { + "acc": 0.72354803, + "epoch": 0.21072440709293433, + "grad_norm": 6.0, + "learning_rate": 9.916444370715239e-06, + "loss": 1.09203844, + "memory(GiB)": 302.58, + "step": 37680, + "train_speed(iter/s)": 0.125833 + }, + { + "acc": 0.72925863, + "epoch": 0.2108362565659136, + "grad_norm": 4.34375, + "learning_rate": 9.916275942098064e-06, + "loss": 1.05970802, + "memory(GiB)": 302.58, + "step": 37700, + "train_speed(iter/s)": 0.125864 + }, + { + "acc": 0.73044782, + "epoch": 0.21094810603889286, + "grad_norm": 6.78125, + "learning_rate": 9.916107345328662e-06, + "loss": 1.05033064, + "memory(GiB)": 302.58, + "step": 37720, + "train_speed(iter/s)": 0.125896 + }, + { + "acc": 0.73164229, + "epoch": 0.21105995551187212, + "grad_norm": 5.6875, + "learning_rate": 9.915938580412803e-06, + "loss": 1.07659178, + "memory(GiB)": 302.58, + "step": 37740, + "train_speed(iter/s)": 0.125923 + }, + { + "acc": 0.73835168, + "epoch": 0.21117180498485139, + "grad_norm": 6.53125, + "learning_rate": 9.915769647356254e-06, + "loss": 1.01889048, + "memory(GiB)": 302.58, + "step": 37760, + "train_speed(iter/s)": 0.125954 + }, + { + "acc": 0.72140551, + "epoch": 0.21128365445783065, + "grad_norm": 7.625, + "learning_rate": 9.915600546164797e-06, + "loss": 1.0875227, + "memory(GiB)": 302.58, + "step": 37780, + "train_speed(iter/s)": 0.125985 + }, + { + "acc": 0.72716904, + "epoch": 0.2113955039308099, + "grad_norm": 8.875, + "learning_rate": 9.915431276844213e-06, + "loss": 1.08194647, + "memory(GiB)": 302.58, + "step": 37800, + "train_speed(iter/s)": 0.126016 + }, + { + "acc": 0.72999511, + "epoch": 0.21150735340378918, + "grad_norm": 6.71875, + "learning_rate": 9.915261839400294e-06, + "loss": 1.04747362, + "memory(GiB)": 302.58, + "step": 37820, + "train_speed(iter/s)": 0.126045 + }, + { + "acc": 0.73767753, + "epoch": 0.21161920287676844, + "grad_norm": 5.96875, + "learning_rate": 9.915092233838833e-06, + "loss": 1.0367198, + "memory(GiB)": 302.58, + "step": 37840, + "train_speed(iter/s)": 0.126076 + }, + { + "acc": 0.75057106, + "epoch": 0.2117310523497477, + "grad_norm": 7.21875, + "learning_rate": 9.914922460165632e-06, + "loss": 0.98381863, + "memory(GiB)": 302.58, + "step": 37860, + "train_speed(iter/s)": 0.126107 + }, + { + "acc": 0.71698442, + "epoch": 0.21184290182272697, + "grad_norm": 4.96875, + "learning_rate": 9.9147525183865e-06, + "loss": 1.1217782, + "memory(GiB)": 302.58, + "step": 37880, + "train_speed(iter/s)": 0.12614 + }, + { + "acc": 0.71291995, + "epoch": 0.21195475129570623, + "grad_norm": 8.6875, + "learning_rate": 9.914582408507247e-06, + "loss": 1.13492632, + "memory(GiB)": 302.58, + "step": 37900, + "train_speed(iter/s)": 0.12617 + }, + { + "acc": 0.7449614, + "epoch": 0.2120666007686855, + "grad_norm": 8.75, + "learning_rate": 9.914412130533692e-06, + "loss": 0.99398613, + "memory(GiB)": 302.58, + "step": 37920, + "train_speed(iter/s)": 0.1262 + }, + { + "acc": 0.74223423, + "epoch": 0.21217845024166476, + "grad_norm": 7.5, + "learning_rate": 9.914241684471657e-06, + "loss": 1.00411711, + "memory(GiB)": 302.58, + "step": 37940, + "train_speed(iter/s)": 0.126233 + }, + { + "acc": 0.71896672, + "epoch": 0.21229029971464403, + "grad_norm": 7.53125, + "learning_rate": 9.914071070326977e-06, + "loss": 1.12678308, + "memory(GiB)": 302.58, + "step": 37960, + "train_speed(iter/s)": 0.126263 + }, + { + "acc": 0.7320652, + "epoch": 0.2124021491876233, + "grad_norm": 5.25, + "learning_rate": 9.913900288105482e-06, + "loss": 1.0548995, + "memory(GiB)": 302.58, + "step": 37980, + "train_speed(iter/s)": 0.126292 + }, + { + "acc": 0.73223372, + "epoch": 0.21251399866060255, + "grad_norm": 5.875, + "learning_rate": 9.913729337813017e-06, + "loss": 1.04463234, + "memory(GiB)": 302.58, + "step": 38000, + "train_speed(iter/s)": 0.12632 + }, + { + "epoch": 0.21251399866060255, + "eval_acc": 0.6953544877674461, + "eval_loss": 1.0618358850479126, + "eval_runtime": 7494.9271, + "eval_samples_per_second": 10.045, + "eval_steps_per_second": 10.045, + "step": 38000 + }, + { + "acc": 0.71869764, + "epoch": 0.21262584813358182, + "grad_norm": 8.0, + "learning_rate": 9.913558219455426e-06, + "loss": 1.13609018, + "memory(GiB)": 302.58, + "step": 38020, + "train_speed(iter/s)": 0.123239 + }, + { + "acc": 0.71843548, + "epoch": 0.21273769760656108, + "grad_norm": 9.125, + "learning_rate": 9.913386933038564e-06, + "loss": 1.08939333, + "memory(GiB)": 302.58, + "step": 38040, + "train_speed(iter/s)": 0.12327 + }, + { + "acc": 0.72653155, + "epoch": 0.21284954707954035, + "grad_norm": 5.6875, + "learning_rate": 9.91321547856829e-06, + "loss": 1.08518457, + "memory(GiB)": 302.58, + "step": 38060, + "train_speed(iter/s)": 0.123301 + }, + { + "acc": 0.73691559, + "epoch": 0.2129613965525196, + "grad_norm": 7.15625, + "learning_rate": 9.913043856050466e-06, + "loss": 1.05928192, + "memory(GiB)": 302.58, + "step": 38080, + "train_speed(iter/s)": 0.12333 + }, + { + "acc": 0.7221117, + "epoch": 0.21307324602549887, + "grad_norm": 4.96875, + "learning_rate": 9.912872065490964e-06, + "loss": 1.09908514, + "memory(GiB)": 302.58, + "step": 38100, + "train_speed(iter/s)": 0.123361 + }, + { + "acc": 0.74437809, + "epoch": 0.21318509549847814, + "grad_norm": 5.25, + "learning_rate": 9.912700106895659e-06, + "loss": 1.00068283, + "memory(GiB)": 302.58, + "step": 38120, + "train_speed(iter/s)": 0.123391 + }, + { + "acc": 0.71950011, + "epoch": 0.2132969449714574, + "grad_norm": 6.0, + "learning_rate": 9.912527980270433e-06, + "loss": 1.11683149, + "memory(GiB)": 302.58, + "step": 38140, + "train_speed(iter/s)": 0.123421 + }, + { + "acc": 0.72506938, + "epoch": 0.21340879444443667, + "grad_norm": 4.84375, + "learning_rate": 9.912355685621172e-06, + "loss": 1.07127209, + "memory(GiB)": 302.58, + "step": 38160, + "train_speed(iter/s)": 0.123453 + }, + { + "acc": 0.72152553, + "epoch": 0.21352064391741593, + "grad_norm": 10.4375, + "learning_rate": 9.91218322295377e-06, + "loss": 1.10407057, + "memory(GiB)": 302.58, + "step": 38180, + "train_speed(iter/s)": 0.123482 + }, + { + "acc": 0.72161689, + "epoch": 0.2136324933903952, + "grad_norm": 6.09375, + "learning_rate": 9.912010592274126e-06, + "loss": 1.09699287, + "memory(GiB)": 302.58, + "step": 38200, + "train_speed(iter/s)": 0.123514 + }, + { + "acc": 0.7221581, + "epoch": 0.21374434286337446, + "grad_norm": 7.375, + "learning_rate": 9.911837793588143e-06, + "loss": 1.08389006, + "memory(GiB)": 302.58, + "step": 38220, + "train_speed(iter/s)": 0.123545 + }, + { + "acc": 0.7184495, + "epoch": 0.21385619233635372, + "grad_norm": 8.75, + "learning_rate": 9.911664826901734e-06, + "loss": 1.10787525, + "memory(GiB)": 302.58, + "step": 38240, + "train_speed(iter/s)": 0.123576 + }, + { + "acc": 0.73029385, + "epoch": 0.213968041809333, + "grad_norm": 8.625, + "learning_rate": 9.911491692220812e-06, + "loss": 1.04819336, + "memory(GiB)": 302.58, + "step": 38260, + "train_speed(iter/s)": 0.123607 + }, + { + "acc": 0.73435779, + "epoch": 0.21407989128231225, + "grad_norm": 6.625, + "learning_rate": 9.911318389551302e-06, + "loss": 1.05797901, + "memory(GiB)": 302.58, + "step": 38280, + "train_speed(iter/s)": 0.123635 + }, + { + "acc": 0.71953959, + "epoch": 0.21419174075529152, + "grad_norm": 6.71875, + "learning_rate": 9.91114491889913e-06, + "loss": 1.12074528, + "memory(GiB)": 302.58, + "step": 38300, + "train_speed(iter/s)": 0.123666 + }, + { + "acc": 0.73459001, + "epoch": 0.21430359022827078, + "grad_norm": 7.84375, + "learning_rate": 9.910971280270227e-06, + "loss": 1.03729963, + "memory(GiB)": 302.58, + "step": 38320, + "train_speed(iter/s)": 0.123698 + }, + { + "acc": 0.72953982, + "epoch": 0.21441543970125007, + "grad_norm": 7.46875, + "learning_rate": 9.910797473670535e-06, + "loss": 1.08432407, + "memory(GiB)": 302.58, + "step": 38340, + "train_speed(iter/s)": 0.123727 + }, + { + "acc": 0.73686328, + "epoch": 0.21452728917422934, + "grad_norm": 5.96875, + "learning_rate": 9.910623499105996e-06, + "loss": 1.05304966, + "memory(GiB)": 302.58, + "step": 38360, + "train_speed(iter/s)": 0.123758 + }, + { + "acc": 0.7272191, + "epoch": 0.2146391386472086, + "grad_norm": 9.125, + "learning_rate": 9.910449356582565e-06, + "loss": 1.08926744, + "memory(GiB)": 302.58, + "step": 38380, + "train_speed(iter/s)": 0.123786 + }, + { + "acc": 0.71110201, + "epoch": 0.21475098812018786, + "grad_norm": 4.84375, + "learning_rate": 9.910275046106193e-06, + "loss": 1.15953131, + "memory(GiB)": 302.58, + "step": 38400, + "train_speed(iter/s)": 0.123816 + }, + { + "acc": 0.74148531, + "epoch": 0.21486283759316713, + "grad_norm": 6.75, + "learning_rate": 9.910100567682845e-06, + "loss": 1.00962601, + "memory(GiB)": 302.58, + "step": 38420, + "train_speed(iter/s)": 0.123845 + }, + { + "acc": 0.73510294, + "epoch": 0.2149746870661464, + "grad_norm": 6.4375, + "learning_rate": 9.909925921318489e-06, + "loss": 1.04815016, + "memory(GiB)": 302.58, + "step": 38440, + "train_speed(iter/s)": 0.123876 + }, + { + "acc": 0.7432374, + "epoch": 0.21508653653912566, + "grad_norm": 5.8125, + "learning_rate": 9.909751107019097e-06, + "loss": 0.98173847, + "memory(GiB)": 302.58, + "step": 38460, + "train_speed(iter/s)": 0.123907 + }, + { + "acc": 0.72325149, + "epoch": 0.21519838601210492, + "grad_norm": 7.0, + "learning_rate": 9.909576124790649e-06, + "loss": 1.084342, + "memory(GiB)": 302.58, + "step": 38480, + "train_speed(iter/s)": 0.123938 + }, + { + "acc": 0.73723702, + "epoch": 0.21531023548508418, + "grad_norm": 7.1875, + "learning_rate": 9.90940097463913e-06, + "loss": 1.03332949, + "memory(GiB)": 302.58, + "step": 38500, + "train_speed(iter/s)": 0.123966 + }, + { + "acc": 0.72432756, + "epoch": 0.21542208495806345, + "grad_norm": 8.3125, + "learning_rate": 9.90922565657053e-06, + "loss": 1.06775713, + "memory(GiB)": 302.58, + "step": 38520, + "train_speed(iter/s)": 0.123996 + }, + { + "acc": 0.72856007, + "epoch": 0.2155339344310427, + "grad_norm": 5.59375, + "learning_rate": 9.909050170590844e-06, + "loss": 1.06986561, + "memory(GiB)": 302.58, + "step": 38540, + "train_speed(iter/s)": 0.124028 + }, + { + "acc": 0.72171483, + "epoch": 0.21564578390402198, + "grad_norm": 8.375, + "learning_rate": 9.908874516706078e-06, + "loss": 1.09105263, + "memory(GiB)": 302.58, + "step": 38560, + "train_speed(iter/s)": 0.124057 + }, + { + "acc": 0.72484732, + "epoch": 0.21575763337700124, + "grad_norm": 5.875, + "learning_rate": 9.908698694922237e-06, + "loss": 1.08239756, + "memory(GiB)": 302.58, + "step": 38580, + "train_speed(iter/s)": 0.124086 + }, + { + "acc": 0.73220992, + "epoch": 0.2158694828499805, + "grad_norm": 5.875, + "learning_rate": 9.908522705245336e-06, + "loss": 1.0330719, + "memory(GiB)": 302.58, + "step": 38600, + "train_speed(iter/s)": 0.124116 + }, + { + "acc": 0.7310842, + "epoch": 0.21598133232295977, + "grad_norm": 8.375, + "learning_rate": 9.908346547681394e-06, + "loss": 1.06412916, + "memory(GiB)": 302.58, + "step": 38620, + "train_speed(iter/s)": 0.124147 + }, + { + "acc": 0.74567013, + "epoch": 0.21609318179593903, + "grad_norm": 6.75, + "learning_rate": 9.908170222236437e-06, + "loss": 1.02565422, + "memory(GiB)": 302.58, + "step": 38640, + "train_speed(iter/s)": 0.124177 + }, + { + "acc": 0.73744059, + "epoch": 0.2162050312689183, + "grad_norm": 7.59375, + "learning_rate": 9.907993728916495e-06, + "loss": 1.05398464, + "memory(GiB)": 302.58, + "step": 38660, + "train_speed(iter/s)": 0.124208 + }, + { + "acc": 0.73544183, + "epoch": 0.21631688074189756, + "grad_norm": 6.40625, + "learning_rate": 9.907817067727603e-06, + "loss": 1.04041462, + "memory(GiB)": 302.58, + "step": 38680, + "train_speed(iter/s)": 0.124234 + }, + { + "acc": 0.71414838, + "epoch": 0.21642873021487682, + "grad_norm": 9.375, + "learning_rate": 9.907640238675805e-06, + "loss": 1.12817984, + "memory(GiB)": 302.58, + "step": 38700, + "train_speed(iter/s)": 0.124262 + }, + { + "acc": 0.74432983, + "epoch": 0.2165405796878561, + "grad_norm": 6.40625, + "learning_rate": 9.90746324176715e-06, + "loss": 1.01085854, + "memory(GiB)": 302.58, + "step": 38720, + "train_speed(iter/s)": 0.124293 + }, + { + "acc": 0.73703918, + "epoch": 0.21665242916083535, + "grad_norm": 5.53125, + "learning_rate": 9.907286077007691e-06, + "loss": 1.03809414, + "memory(GiB)": 302.58, + "step": 38740, + "train_speed(iter/s)": 0.124323 + }, + { + "acc": 0.72371716, + "epoch": 0.21676427863381462, + "grad_norm": 8.75, + "learning_rate": 9.907108744403488e-06, + "loss": 1.09492226, + "memory(GiB)": 302.58, + "step": 38760, + "train_speed(iter/s)": 0.124352 + }, + { + "acc": 0.72294865, + "epoch": 0.21687612810679388, + "grad_norm": 5.78125, + "learning_rate": 9.906931243960605e-06, + "loss": 1.09478092, + "memory(GiB)": 302.58, + "step": 38780, + "train_speed(iter/s)": 0.124381 + }, + { + "acc": 0.7295177, + "epoch": 0.21698797757977314, + "grad_norm": 7.3125, + "learning_rate": 9.906753575685115e-06, + "loss": 1.04990807, + "memory(GiB)": 302.58, + "step": 38800, + "train_speed(iter/s)": 0.124411 + }, + { + "acc": 0.73651018, + "epoch": 0.2170998270527524, + "grad_norm": 4.8125, + "learning_rate": 9.906575739583094e-06, + "loss": 1.03513126, + "memory(GiB)": 302.58, + "step": 38820, + "train_speed(iter/s)": 0.124438 + }, + { + "acc": 0.72207241, + "epoch": 0.21721167652573167, + "grad_norm": 6.0625, + "learning_rate": 9.906397735660623e-06, + "loss": 1.09496918, + "memory(GiB)": 302.58, + "step": 38840, + "train_speed(iter/s)": 0.124468 + }, + { + "acc": 0.73152876, + "epoch": 0.21732352599871094, + "grad_norm": 4.9375, + "learning_rate": 9.906219563923791e-06, + "loss": 1.05583353, + "memory(GiB)": 302.58, + "step": 38860, + "train_speed(iter/s)": 0.124498 + }, + { + "acc": 0.7277884, + "epoch": 0.2174353754716902, + "grad_norm": 5.90625, + "learning_rate": 9.906041224378696e-06, + "loss": 1.07805405, + "memory(GiB)": 302.58, + "step": 38880, + "train_speed(iter/s)": 0.124528 + }, + { + "acc": 0.71963191, + "epoch": 0.21754722494466947, + "grad_norm": 6.59375, + "learning_rate": 9.905862717031432e-06, + "loss": 1.12375593, + "memory(GiB)": 302.58, + "step": 38900, + "train_speed(iter/s)": 0.124559 + }, + { + "acc": 0.7532331, + "epoch": 0.21765907441764873, + "grad_norm": 6.28125, + "learning_rate": 9.905684041888107e-06, + "loss": 0.95753765, + "memory(GiB)": 302.58, + "step": 38920, + "train_speed(iter/s)": 0.124589 + }, + { + "acc": 0.74122715, + "epoch": 0.217770923890628, + "grad_norm": 5.6875, + "learning_rate": 9.905505198954833e-06, + "loss": 1.0022543, + "memory(GiB)": 302.58, + "step": 38940, + "train_speed(iter/s)": 0.12462 + }, + { + "acc": 0.71419005, + "epoch": 0.21788277336360726, + "grad_norm": 5.28125, + "learning_rate": 9.905326188237725e-06, + "loss": 1.15461979, + "memory(GiB)": 302.58, + "step": 38960, + "train_speed(iter/s)": 0.124649 + }, + { + "acc": 0.72919378, + "epoch": 0.21799462283658652, + "grad_norm": 9.0, + "learning_rate": 9.90514700974291e-06, + "loss": 1.08184624, + "memory(GiB)": 302.58, + "step": 38980, + "train_speed(iter/s)": 0.124677 + }, + { + "acc": 0.72012506, + "epoch": 0.21810647230956579, + "grad_norm": 8.0, + "learning_rate": 9.90496766347651e-06, + "loss": 1.13402281, + "memory(GiB)": 302.58, + "step": 39000, + "train_speed(iter/s)": 0.124707 + }, + { + "acc": 0.73774905, + "epoch": 0.21821832178254505, + "grad_norm": 7.15625, + "learning_rate": 9.904788149444665e-06, + "loss": 1.04528675, + "memory(GiB)": 302.58, + "step": 39020, + "train_speed(iter/s)": 0.124738 + }, + { + "acc": 0.75310383, + "epoch": 0.2183301712555243, + "grad_norm": 5.90625, + "learning_rate": 9.90460846765351e-06, + "loss": 0.97114391, + "memory(GiB)": 302.58, + "step": 39040, + "train_speed(iter/s)": 0.124768 + }, + { + "acc": 0.70483613, + "epoch": 0.21844202072850358, + "grad_norm": 6.84375, + "learning_rate": 9.904428618109196e-06, + "loss": 1.17611122, + "memory(GiB)": 302.58, + "step": 39060, + "train_speed(iter/s)": 0.124799 + }, + { + "acc": 0.72702861, + "epoch": 0.21855387020148284, + "grad_norm": 7.15625, + "learning_rate": 9.904248600817871e-06, + "loss": 1.11266985, + "memory(GiB)": 302.58, + "step": 39080, + "train_speed(iter/s)": 0.124827 + }, + { + "acc": 0.72817001, + "epoch": 0.2186657196744621, + "grad_norm": 7.65625, + "learning_rate": 9.904068415785692e-06, + "loss": 1.07949238, + "memory(GiB)": 302.58, + "step": 39100, + "train_speed(iter/s)": 0.124858 + }, + { + "acc": 0.72888193, + "epoch": 0.21877756914744137, + "grad_norm": 6.75, + "learning_rate": 9.903888063018825e-06, + "loss": 1.07629547, + "memory(GiB)": 302.58, + "step": 39120, + "train_speed(iter/s)": 0.124886 + }, + { + "acc": 0.74840069, + "epoch": 0.21888941862042063, + "grad_norm": 8.4375, + "learning_rate": 9.903707542523436e-06, + "loss": 0.98854561, + "memory(GiB)": 302.58, + "step": 39140, + "train_speed(iter/s)": 0.124914 + }, + { + "acc": 0.72203403, + "epoch": 0.2190012680933999, + "grad_norm": 6.125, + "learning_rate": 9.903526854305698e-06, + "loss": 1.11389399, + "memory(GiB)": 302.58, + "step": 39160, + "train_speed(iter/s)": 0.124944 + }, + { + "acc": 0.73812838, + "epoch": 0.21911311756637916, + "grad_norm": 9.625, + "learning_rate": 9.903345998371794e-06, + "loss": 1.03754444, + "memory(GiB)": 302.58, + "step": 39180, + "train_speed(iter/s)": 0.12497 + }, + { + "acc": 0.71325812, + "epoch": 0.21922496703935843, + "grad_norm": 4.8125, + "learning_rate": 9.903164974727908e-06, + "loss": 1.13848753, + "memory(GiB)": 302.58, + "step": 39200, + "train_speed(iter/s)": 0.124999 + }, + { + "acc": 0.73548751, + "epoch": 0.2193368165123377, + "grad_norm": 6.71875, + "learning_rate": 9.902983783380234e-06, + "loss": 1.03856983, + "memory(GiB)": 302.58, + "step": 39220, + "train_speed(iter/s)": 0.125026 + }, + { + "acc": 0.72904096, + "epoch": 0.21944866598531695, + "grad_norm": 6.8125, + "learning_rate": 9.902802424334965e-06, + "loss": 1.06209335, + "memory(GiB)": 302.58, + "step": 39240, + "train_speed(iter/s)": 0.125056 + }, + { + "acc": 0.73382659, + "epoch": 0.21956051545829622, + "grad_norm": 4.53125, + "learning_rate": 9.902620897598309e-06, + "loss": 1.04974537, + "memory(GiB)": 302.58, + "step": 39260, + "train_speed(iter/s)": 0.125087 + }, + { + "acc": 0.72835741, + "epoch": 0.21967236493127548, + "grad_norm": 7.3125, + "learning_rate": 9.902439203176472e-06, + "loss": 1.08842096, + "memory(GiB)": 302.58, + "step": 39280, + "train_speed(iter/s)": 0.125113 + }, + { + "acc": 0.72462645, + "epoch": 0.21978421440425475, + "grad_norm": 7.59375, + "learning_rate": 9.902257341075669e-06, + "loss": 1.10341797, + "memory(GiB)": 302.58, + "step": 39300, + "train_speed(iter/s)": 0.125143 + }, + { + "acc": 0.73477383, + "epoch": 0.219896063877234, + "grad_norm": 6.03125, + "learning_rate": 9.90207531130212e-06, + "loss": 1.04886189, + "memory(GiB)": 302.58, + "step": 39320, + "train_speed(iter/s)": 0.125171 + }, + { + "acc": 0.73903365, + "epoch": 0.22000791335021327, + "grad_norm": 6.96875, + "learning_rate": 9.901893113862052e-06, + "loss": 1.00861197, + "memory(GiB)": 302.58, + "step": 39340, + "train_speed(iter/s)": 0.125199 + }, + { + "acc": 0.7287477, + "epoch": 0.22011976282319254, + "grad_norm": 5.6875, + "learning_rate": 9.901710748761695e-06, + "loss": 1.05890656, + "memory(GiB)": 302.58, + "step": 39360, + "train_speed(iter/s)": 0.125228 + }, + { + "acc": 0.72324028, + "epoch": 0.2202316122961718, + "grad_norm": 11.1875, + "learning_rate": 9.901528216007288e-06, + "loss": 1.09311361, + "memory(GiB)": 302.58, + "step": 39380, + "train_speed(iter/s)": 0.125258 + }, + { + "acc": 0.74059057, + "epoch": 0.22034346176915107, + "grad_norm": 6.46875, + "learning_rate": 9.901345515605072e-06, + "loss": 1.02397366, + "memory(GiB)": 302.58, + "step": 39400, + "train_speed(iter/s)": 0.125289 + }, + { + "acc": 0.74231706, + "epoch": 0.22045531124213033, + "grad_norm": 9.3125, + "learning_rate": 9.9011626475613e-06, + "loss": 0.98189926, + "memory(GiB)": 302.58, + "step": 39420, + "train_speed(iter/s)": 0.12532 + }, + { + "acc": 0.73231149, + "epoch": 0.2205671607151096, + "grad_norm": 10.75, + "learning_rate": 9.900979611882223e-06, + "loss": 1.0600709, + "memory(GiB)": 302.58, + "step": 39440, + "train_speed(iter/s)": 0.12535 + }, + { + "acc": 0.73679404, + "epoch": 0.22067901018808886, + "grad_norm": 6.90625, + "learning_rate": 9.900796408574104e-06, + "loss": 1.04921055, + "memory(GiB)": 302.58, + "step": 39460, + "train_speed(iter/s)": 0.125379 + }, + { + "acc": 0.73028102, + "epoch": 0.22079085966106812, + "grad_norm": 7.875, + "learning_rate": 9.900613037643207e-06, + "loss": 1.07043533, + "memory(GiB)": 302.58, + "step": 39480, + "train_speed(iter/s)": 0.125407 + }, + { + "acc": 0.74083314, + "epoch": 0.2209027091340474, + "grad_norm": 7.75, + "learning_rate": 9.900429499095804e-06, + "loss": 1.00392408, + "memory(GiB)": 302.58, + "step": 39500, + "train_speed(iter/s)": 0.125438 + }, + { + "acc": 0.72020741, + "epoch": 0.22101455860702665, + "grad_norm": 5.5625, + "learning_rate": 9.900245792938174e-06, + "loss": 1.12485285, + "memory(GiB)": 302.58, + "step": 39520, + "train_speed(iter/s)": 0.125469 + }, + { + "acc": 0.72850304, + "epoch": 0.22112640808000592, + "grad_norm": 7.3125, + "learning_rate": 9.9000619191766e-06, + "loss": 1.07952385, + "memory(GiB)": 302.58, + "step": 39540, + "train_speed(iter/s)": 0.125499 + }, + { + "acc": 0.72984829, + "epoch": 0.22123825755298518, + "grad_norm": 6.34375, + "learning_rate": 9.89987787781737e-06, + "loss": 1.06020947, + "memory(GiB)": 302.58, + "step": 39560, + "train_speed(iter/s)": 0.125529 + }, + { + "acc": 0.72560344, + "epoch": 0.22135010702596444, + "grad_norm": 6.78125, + "learning_rate": 9.89969366886678e-06, + "loss": 1.08789387, + "memory(GiB)": 302.58, + "step": 39580, + "train_speed(iter/s)": 0.125559 + }, + { + "acc": 0.72863126, + "epoch": 0.22146195649894374, + "grad_norm": 7.0, + "learning_rate": 9.89950929233113e-06, + "loss": 1.08062143, + "memory(GiB)": 302.58, + "step": 39600, + "train_speed(iter/s)": 0.125587 + }, + { + "acc": 0.72508874, + "epoch": 0.221573805971923, + "grad_norm": 8.3125, + "learning_rate": 9.899324748216726e-06, + "loss": 1.07598486, + "memory(GiB)": 302.58, + "step": 39620, + "train_speed(iter/s)": 0.125617 + }, + { + "acc": 0.70744362, + "epoch": 0.22168565544490226, + "grad_norm": 7.34375, + "learning_rate": 9.899140036529879e-06, + "loss": 1.17027569, + "memory(GiB)": 302.58, + "step": 39640, + "train_speed(iter/s)": 0.125647 + }, + { + "acc": 0.73782291, + "epoch": 0.22179750491788153, + "grad_norm": 7.75, + "learning_rate": 9.89895515727691e-06, + "loss": 1.00527573, + "memory(GiB)": 302.58, + "step": 39660, + "train_speed(iter/s)": 0.125678 + }, + { + "acc": 0.73215127, + "epoch": 0.2219093543908608, + "grad_norm": 6.15625, + "learning_rate": 9.89877011046414e-06, + "loss": 1.03589373, + "memory(GiB)": 302.58, + "step": 39680, + "train_speed(iter/s)": 0.125706 + }, + { + "acc": 0.72761564, + "epoch": 0.22202120386384006, + "grad_norm": 6.0625, + "learning_rate": 9.8985848960979e-06, + "loss": 1.10306883, + "memory(GiB)": 302.58, + "step": 39700, + "train_speed(iter/s)": 0.125736 + }, + { + "acc": 0.73536992, + "epoch": 0.22213305333681932, + "grad_norm": 5.84375, + "learning_rate": 9.89839951418452e-06, + "loss": 1.03500605, + "memory(GiB)": 302.58, + "step": 39720, + "train_speed(iter/s)": 0.125766 + }, + { + "acc": 0.73171673, + "epoch": 0.22224490280979858, + "grad_norm": 6.6875, + "learning_rate": 9.898213964730347e-06, + "loss": 1.0681551, + "memory(GiB)": 302.58, + "step": 39740, + "train_speed(iter/s)": 0.125796 + }, + { + "acc": 0.73167567, + "epoch": 0.22235675228277785, + "grad_norm": 6.96875, + "learning_rate": 9.898028247741725e-06, + "loss": 1.04991732, + "memory(GiB)": 302.58, + "step": 39760, + "train_speed(iter/s)": 0.125824 + }, + { + "acc": 0.73776979, + "epoch": 0.2224686017557571, + "grad_norm": 6.1875, + "learning_rate": 9.897842363225002e-06, + "loss": 1.02840424, + "memory(GiB)": 302.58, + "step": 39780, + "train_speed(iter/s)": 0.125853 + }, + { + "acc": 0.72257323, + "epoch": 0.22258045122873638, + "grad_norm": 7.34375, + "learning_rate": 9.897656311186543e-06, + "loss": 1.09796953, + "memory(GiB)": 302.58, + "step": 39800, + "train_speed(iter/s)": 0.125882 + }, + { + "acc": 0.73907328, + "epoch": 0.22269230070171564, + "grad_norm": 8.25, + "learning_rate": 9.897470091632708e-06, + "loss": 1.03188076, + "memory(GiB)": 302.58, + "step": 39820, + "train_speed(iter/s)": 0.125912 + }, + { + "acc": 0.72421441, + "epoch": 0.2228041501746949, + "grad_norm": 4.90625, + "learning_rate": 9.897283704569865e-06, + "loss": 1.0872591, + "memory(GiB)": 302.58, + "step": 39840, + "train_speed(iter/s)": 0.125942 + }, + { + "acc": 0.73851361, + "epoch": 0.22291599964767417, + "grad_norm": 8.125, + "learning_rate": 9.89709715000439e-06, + "loss": 1.02645903, + "memory(GiB)": 302.58, + "step": 39860, + "train_speed(iter/s)": 0.125973 + }, + { + "acc": 0.73205843, + "epoch": 0.22302784912065343, + "grad_norm": 5.875, + "learning_rate": 9.896910427942667e-06, + "loss": 1.05147047, + "memory(GiB)": 302.58, + "step": 39880, + "train_speed(iter/s)": 0.126001 + }, + { + "acc": 0.7209003, + "epoch": 0.2231396985936327, + "grad_norm": 7.0625, + "learning_rate": 9.896723538391078e-06, + "loss": 1.10023909, + "memory(GiB)": 302.58, + "step": 39900, + "train_speed(iter/s)": 0.12603 + }, + { + "acc": 0.71672583, + "epoch": 0.22325154806661196, + "grad_norm": 4.1875, + "learning_rate": 9.896536481356016e-06, + "loss": 1.12306576, + "memory(GiB)": 302.58, + "step": 39920, + "train_speed(iter/s)": 0.126057 + }, + { + "acc": 0.72578096, + "epoch": 0.22336339753959122, + "grad_norm": 4.84375, + "learning_rate": 9.89634925684388e-06, + "loss": 1.09451809, + "memory(GiB)": 302.58, + "step": 39940, + "train_speed(iter/s)": 0.126086 + }, + { + "acc": 0.72531238, + "epoch": 0.2234752470125705, + "grad_norm": 7.1875, + "learning_rate": 9.896161864861075e-06, + "loss": 1.0921463, + "memory(GiB)": 302.58, + "step": 39960, + "train_speed(iter/s)": 0.126114 + }, + { + "acc": 0.7102355, + "epoch": 0.22358709648554975, + "grad_norm": 6.59375, + "learning_rate": 9.89597430541401e-06, + "loss": 1.16696854, + "memory(GiB)": 302.58, + "step": 39980, + "train_speed(iter/s)": 0.126143 + }, + { + "acc": 0.73798985, + "epoch": 0.22369894595852902, + "grad_norm": 9.75, + "learning_rate": 9.895786578509094e-06, + "loss": 1.0291213, + "memory(GiB)": 302.58, + "step": 40000, + "train_speed(iter/s)": 0.126172 + }, + { + "epoch": 0.22369894595852902, + "eval_acc": 0.6958179761519895, + "eval_loss": 1.060030221939087, + "eval_runtime": 7525.2496, + "eval_samples_per_second": 10.004, + "eval_steps_per_second": 10.004, + "step": 40000 + }, + { + "acc": 0.72442007, + "epoch": 0.22381079543150828, + "grad_norm": 6.125, + "learning_rate": 9.895598684152757e-06, + "loss": 1.07937231, + "memory(GiB)": 302.58, + "step": 40020, + "train_speed(iter/s)": 0.123236 + }, + { + "acc": 0.73201103, + "epoch": 0.22392264490448754, + "grad_norm": 8.75, + "learning_rate": 9.89541062235142e-06, + "loss": 1.0439003, + "memory(GiB)": 302.58, + "step": 40040, + "train_speed(iter/s)": 0.123264 + }, + { + "acc": 0.74698806, + "epoch": 0.2240344943774668, + "grad_norm": 10.125, + "learning_rate": 9.89522239311152e-06, + "loss": 0.97642174, + "memory(GiB)": 302.58, + "step": 40060, + "train_speed(iter/s)": 0.123293 + }, + { + "acc": 0.74476018, + "epoch": 0.22414634385044607, + "grad_norm": 10.1875, + "learning_rate": 9.895033996439487e-06, + "loss": 1.01161442, + "memory(GiB)": 302.58, + "step": 40080, + "train_speed(iter/s)": 0.123322 + }, + { + "acc": 0.71029844, + "epoch": 0.22425819332342534, + "grad_norm": 5.59375, + "learning_rate": 9.894845432341771e-06, + "loss": 1.17154636, + "memory(GiB)": 302.58, + "step": 40100, + "train_speed(iter/s)": 0.12335 + }, + { + "acc": 0.72131543, + "epoch": 0.2243700427964046, + "grad_norm": 6.5, + "learning_rate": 9.894656700824822e-06, + "loss": 1.08680935, + "memory(GiB)": 302.58, + "step": 40120, + "train_speed(iter/s)": 0.12338 + }, + { + "acc": 0.73004093, + "epoch": 0.22448189226938386, + "grad_norm": 11.0, + "learning_rate": 9.894467801895091e-06, + "loss": 1.07541628, + "memory(GiB)": 302.58, + "step": 40140, + "train_speed(iter/s)": 0.123408 + }, + { + "acc": 0.73197665, + "epoch": 0.22459374174236313, + "grad_norm": 5.4375, + "learning_rate": 9.894278735559043e-06, + "loss": 1.04434528, + "memory(GiB)": 302.58, + "step": 40160, + "train_speed(iter/s)": 0.123436 + }, + { + "acc": 0.71260719, + "epoch": 0.2247055912153424, + "grad_norm": 11.4375, + "learning_rate": 9.894089501823142e-06, + "loss": 1.14616117, + "memory(GiB)": 302.58, + "step": 40180, + "train_speed(iter/s)": 0.123464 + }, + { + "acc": 0.72457161, + "epoch": 0.22481744068832166, + "grad_norm": 5.5625, + "learning_rate": 9.893900100693862e-06, + "loss": 1.0821599, + "memory(GiB)": 302.58, + "step": 40200, + "train_speed(iter/s)": 0.123493 + }, + { + "acc": 0.73260527, + "epoch": 0.22492929016130092, + "grad_norm": 6.84375, + "learning_rate": 9.893710532177679e-06, + "loss": 1.05632277, + "memory(GiB)": 302.58, + "step": 40220, + "train_speed(iter/s)": 0.12352 + }, + { + "acc": 0.72866774, + "epoch": 0.22504113963428019, + "grad_norm": 6.78125, + "learning_rate": 9.89352079628108e-06, + "loss": 1.0645052, + "memory(GiB)": 302.58, + "step": 40240, + "train_speed(iter/s)": 0.123548 + }, + { + "acc": 0.73666172, + "epoch": 0.22515298910725945, + "grad_norm": 6.1875, + "learning_rate": 9.893330893010552e-06, + "loss": 1.04433889, + "memory(GiB)": 302.58, + "step": 40260, + "train_speed(iter/s)": 0.123578 + }, + { + "acc": 0.74252629, + "epoch": 0.2252648385802387, + "grad_norm": 7.09375, + "learning_rate": 9.89314082237259e-06, + "loss": 0.992449, + "memory(GiB)": 302.58, + "step": 40280, + "train_speed(iter/s)": 0.123607 + }, + { + "acc": 0.72385535, + "epoch": 0.22537668805321798, + "grad_norm": 11.4375, + "learning_rate": 9.892950584373699e-06, + "loss": 1.06662579, + "memory(GiB)": 302.58, + "step": 40300, + "train_speed(iter/s)": 0.123637 + }, + { + "acc": 0.73186026, + "epoch": 0.22548853752619724, + "grad_norm": 7.40625, + "learning_rate": 9.89276017902038e-06, + "loss": 1.03944588, + "memory(GiB)": 302.58, + "step": 40320, + "train_speed(iter/s)": 0.123667 + }, + { + "acc": 0.73377433, + "epoch": 0.2256003869991765, + "grad_norm": 7.90625, + "learning_rate": 9.89256960631915e-06, + "loss": 1.070895, + "memory(GiB)": 302.58, + "step": 40340, + "train_speed(iter/s)": 0.123694 + }, + { + "acc": 0.74772677, + "epoch": 0.22571223647215577, + "grad_norm": 6.625, + "learning_rate": 9.892378866276525e-06, + "loss": 0.96278801, + "memory(GiB)": 302.58, + "step": 40360, + "train_speed(iter/s)": 0.123722 + }, + { + "acc": 0.72822938, + "epoch": 0.22582408594513503, + "grad_norm": 6.0625, + "learning_rate": 9.89218795889903e-06, + "loss": 1.06353703, + "memory(GiB)": 302.58, + "step": 40380, + "train_speed(iter/s)": 0.123751 + }, + { + "acc": 0.72514563, + "epoch": 0.2259359354181143, + "grad_norm": 9.0, + "learning_rate": 9.891996884193194e-06, + "loss": 1.08380404, + "memory(GiB)": 302.58, + "step": 40400, + "train_speed(iter/s)": 0.123782 + }, + { + "acc": 0.72888322, + "epoch": 0.22604778489109356, + "grad_norm": 7.0, + "learning_rate": 9.89180564216555e-06, + "loss": 1.07002316, + "memory(GiB)": 302.58, + "step": 40420, + "train_speed(iter/s)": 0.123811 + }, + { + "acc": 0.73263879, + "epoch": 0.22615963436407283, + "grad_norm": 6.1875, + "learning_rate": 9.891614232822645e-06, + "loss": 1.06410322, + "memory(GiB)": 302.58, + "step": 40440, + "train_speed(iter/s)": 0.12384 + }, + { + "acc": 0.74042521, + "epoch": 0.2262714838370521, + "grad_norm": 6.15625, + "learning_rate": 9.891422656171022e-06, + "loss": 1.01775293, + "memory(GiB)": 302.58, + "step": 40460, + "train_speed(iter/s)": 0.123869 + }, + { + "acc": 0.7280436, + "epoch": 0.22638333331003135, + "grad_norm": 6.34375, + "learning_rate": 9.891230912217232e-06, + "loss": 1.06390181, + "memory(GiB)": 302.58, + "step": 40480, + "train_speed(iter/s)": 0.123897 + }, + { + "acc": 0.71880994, + "epoch": 0.22649518278301062, + "grad_norm": 9.375, + "learning_rate": 9.891039000967836e-06, + "loss": 1.134478, + "memory(GiB)": 302.58, + "step": 40500, + "train_speed(iter/s)": 0.123925 + }, + { + "acc": 0.72919703, + "epoch": 0.22660703225598988, + "grad_norm": 4.53125, + "learning_rate": 9.890846922429397e-06, + "loss": 1.07849932, + "memory(GiB)": 302.58, + "step": 40520, + "train_speed(iter/s)": 0.123954 + }, + { + "acc": 0.72230844, + "epoch": 0.22671888172896915, + "grad_norm": 3.96875, + "learning_rate": 9.890654676608484e-06, + "loss": 1.1023715, + "memory(GiB)": 302.58, + "step": 40540, + "train_speed(iter/s)": 0.123982 + }, + { + "acc": 0.74714766, + "epoch": 0.2268307312019484, + "grad_norm": 7.46875, + "learning_rate": 9.890462263511676e-06, + "loss": 0.97782688, + "memory(GiB)": 302.58, + "step": 40560, + "train_speed(iter/s)": 0.124011 + }, + { + "acc": 0.73053713, + "epoch": 0.22694258067492767, + "grad_norm": 6.84375, + "learning_rate": 9.890269683145548e-06, + "loss": 1.0487606, + "memory(GiB)": 302.58, + "step": 40580, + "train_speed(iter/s)": 0.124039 + }, + { + "acc": 0.73115535, + "epoch": 0.22705443014790694, + "grad_norm": 8.6875, + "learning_rate": 9.890076935516692e-06, + "loss": 1.06711226, + "memory(GiB)": 302.58, + "step": 40600, + "train_speed(iter/s)": 0.124069 + }, + { + "acc": 0.72418909, + "epoch": 0.2271662796208862, + "grad_norm": 5.375, + "learning_rate": 9.8898840206317e-06, + "loss": 1.10957375, + "memory(GiB)": 302.58, + "step": 40620, + "train_speed(iter/s)": 0.124094 + }, + { + "acc": 0.72752113, + "epoch": 0.22727812909386547, + "grad_norm": 7.34375, + "learning_rate": 9.889690938497166e-06, + "loss": 1.06566544, + "memory(GiB)": 302.58, + "step": 40640, + "train_speed(iter/s)": 0.124122 + }, + { + "acc": 0.73357782, + "epoch": 0.22738997856684473, + "grad_norm": 7.9375, + "learning_rate": 9.889497689119698e-06, + "loss": 1.06089096, + "memory(GiB)": 302.58, + "step": 40660, + "train_speed(iter/s)": 0.124149 + }, + { + "acc": 0.7386848, + "epoch": 0.227501828039824, + "grad_norm": 8.8125, + "learning_rate": 9.889304272505906e-06, + "loss": 1.02713737, + "memory(GiB)": 302.58, + "step": 40680, + "train_speed(iter/s)": 0.124178 + }, + { + "acc": 0.74222131, + "epoch": 0.22761367751280326, + "grad_norm": 10.375, + "learning_rate": 9.889110688662405e-06, + "loss": 1.01691666, + "memory(GiB)": 302.58, + "step": 40700, + "train_speed(iter/s)": 0.124207 + }, + { + "acc": 0.7297987, + "epoch": 0.22772552698578252, + "grad_norm": 8.4375, + "learning_rate": 9.888916937595814e-06, + "loss": 1.06254473, + "memory(GiB)": 302.58, + "step": 40720, + "train_speed(iter/s)": 0.124233 + }, + { + "acc": 0.73697605, + "epoch": 0.2278373764587618, + "grad_norm": 9.5, + "learning_rate": 9.888723019312761e-06, + "loss": 1.03479862, + "memory(GiB)": 302.58, + "step": 40740, + "train_speed(iter/s)": 0.12426 + }, + { + "acc": 0.73576174, + "epoch": 0.22794922593174105, + "grad_norm": 6.71875, + "learning_rate": 9.88852893381988e-06, + "loss": 1.04538155, + "memory(GiB)": 302.58, + "step": 40760, + "train_speed(iter/s)": 0.124289 + }, + { + "acc": 0.73782129, + "epoch": 0.22806107540472031, + "grad_norm": 6.09375, + "learning_rate": 9.888334681123807e-06, + "loss": 1.00270882, + "memory(GiB)": 302.58, + "step": 40780, + "train_speed(iter/s)": 0.124317 + }, + { + "acc": 0.72842851, + "epoch": 0.22817292487769958, + "grad_norm": 8.375, + "learning_rate": 9.888140261231189e-06, + "loss": 1.07918968, + "memory(GiB)": 302.58, + "step": 40800, + "train_speed(iter/s)": 0.124347 + }, + { + "acc": 0.7412488, + "epoch": 0.22828477435067884, + "grad_norm": 6.8125, + "learning_rate": 9.887945674148675e-06, + "loss": 1.00081139, + "memory(GiB)": 302.58, + "step": 40820, + "train_speed(iter/s)": 0.124375 + }, + { + "acc": 0.73056822, + "epoch": 0.22839662382365813, + "grad_norm": 7.34375, + "learning_rate": 9.887750919882917e-06, + "loss": 1.07141714, + "memory(GiB)": 302.58, + "step": 40840, + "train_speed(iter/s)": 0.124399 + }, + { + "acc": 0.73156934, + "epoch": 0.2285084732966374, + "grad_norm": 4.625, + "learning_rate": 9.887555998440581e-06, + "loss": 1.04589939, + "memory(GiB)": 302.58, + "step": 40860, + "train_speed(iter/s)": 0.124427 + }, + { + "acc": 0.7387598, + "epoch": 0.22862032276961666, + "grad_norm": 8.0, + "learning_rate": 9.887360909828331e-06, + "loss": 1.02699871, + "memory(GiB)": 302.58, + "step": 40880, + "train_speed(iter/s)": 0.124455 + }, + { + "acc": 0.73433623, + "epoch": 0.22873217224259593, + "grad_norm": 5.25, + "learning_rate": 9.887165654052841e-06, + "loss": 1.05494576, + "memory(GiB)": 302.58, + "step": 40900, + "train_speed(iter/s)": 0.124484 + }, + { + "acc": 0.73957114, + "epoch": 0.2288440217155752, + "grad_norm": 7.0625, + "learning_rate": 9.88697023112079e-06, + "loss": 1.01600628, + "memory(GiB)": 302.58, + "step": 40920, + "train_speed(iter/s)": 0.124513 + }, + { + "acc": 0.72526898, + "epoch": 0.22895587118855446, + "grad_norm": 8.8125, + "learning_rate": 9.886774641038858e-06, + "loss": 1.0815486, + "memory(GiB)": 302.58, + "step": 40940, + "train_speed(iter/s)": 0.124541 + }, + { + "acc": 0.72803288, + "epoch": 0.22906772066153372, + "grad_norm": 7.28125, + "learning_rate": 9.88657888381374e-06, + "loss": 1.05231934, + "memory(GiB)": 302.58, + "step": 40960, + "train_speed(iter/s)": 0.124568 + }, + { + "acc": 0.72258105, + "epoch": 0.22917957013451298, + "grad_norm": 4.71875, + "learning_rate": 9.886382959452127e-06, + "loss": 1.08562078, + "memory(GiB)": 302.58, + "step": 40980, + "train_speed(iter/s)": 0.124596 + }, + { + "acc": 0.73825274, + "epoch": 0.22929141960749225, + "grad_norm": 6.6875, + "learning_rate": 9.886186867960724e-06, + "loss": 1.03776579, + "memory(GiB)": 302.58, + "step": 41000, + "train_speed(iter/s)": 0.124623 + }, + { + "acc": 0.72658672, + "epoch": 0.2294032690804715, + "grad_norm": 6.03125, + "learning_rate": 9.885990609346237e-06, + "loss": 1.10472193, + "memory(GiB)": 302.58, + "step": 41020, + "train_speed(iter/s)": 0.124651 + }, + { + "acc": 0.73501973, + "epoch": 0.22951511855345078, + "grad_norm": 6.9375, + "learning_rate": 9.885794183615377e-06, + "loss": 1.04790287, + "memory(GiB)": 302.58, + "step": 41040, + "train_speed(iter/s)": 0.124679 + }, + { + "acc": 0.7357451, + "epoch": 0.22962696802643004, + "grad_norm": 7.09375, + "learning_rate": 9.885597590774865e-06, + "loss": 1.05107603, + "memory(GiB)": 302.58, + "step": 41060, + "train_speed(iter/s)": 0.124706 + }, + { + "acc": 0.73585215, + "epoch": 0.2297388174994093, + "grad_norm": 8.5, + "learning_rate": 9.885400830831421e-06, + "loss": 1.04420872, + "memory(GiB)": 302.58, + "step": 41080, + "train_speed(iter/s)": 0.124732 + }, + { + "acc": 0.73037224, + "epoch": 0.22985066697238857, + "grad_norm": 5.625, + "learning_rate": 9.88520390379178e-06, + "loss": 1.0658556, + "memory(GiB)": 302.58, + "step": 41100, + "train_speed(iter/s)": 0.12476 + }, + { + "acc": 0.7387074, + "epoch": 0.22996251644536783, + "grad_norm": 5.28125, + "learning_rate": 9.885006809662674e-06, + "loss": 1.03429985, + "memory(GiB)": 302.58, + "step": 41120, + "train_speed(iter/s)": 0.124788 + }, + { + "acc": 0.73413177, + "epoch": 0.2300743659183471, + "grad_norm": 7.125, + "learning_rate": 9.884809548450844e-06, + "loss": 1.02955217, + "memory(GiB)": 302.58, + "step": 41140, + "train_speed(iter/s)": 0.124817 + }, + { + "acc": 0.72229605, + "epoch": 0.23018621539132636, + "grad_norm": 6.6875, + "learning_rate": 9.88461212016304e-06, + "loss": 1.09487391, + "memory(GiB)": 302.58, + "step": 41160, + "train_speed(iter/s)": 0.124846 + }, + { + "acc": 0.74502544, + "epoch": 0.23029806486430562, + "grad_norm": 4.21875, + "learning_rate": 9.884414524806011e-06, + "loss": 1.00993481, + "memory(GiB)": 302.58, + "step": 41180, + "train_speed(iter/s)": 0.124874 + }, + { + "acc": 0.73207407, + "epoch": 0.2304099143372849, + "grad_norm": 11.3125, + "learning_rate": 9.884216762386519e-06, + "loss": 1.06326246, + "memory(GiB)": 302.58, + "step": 41200, + "train_speed(iter/s)": 0.124903 + }, + { + "acc": 0.73186588, + "epoch": 0.23052176381026415, + "grad_norm": 6.5625, + "learning_rate": 9.884018832911326e-06, + "loss": 1.04860067, + "memory(GiB)": 302.58, + "step": 41220, + "train_speed(iter/s)": 0.12493 + }, + { + "acc": 0.7308486, + "epoch": 0.23063361328324342, + "grad_norm": 5.46875, + "learning_rate": 9.8838207363872e-06, + "loss": 1.0466095, + "memory(GiB)": 302.58, + "step": 41240, + "train_speed(iter/s)": 0.124959 + }, + { + "acc": 0.73108892, + "epoch": 0.23074546275622268, + "grad_norm": 7.6875, + "learning_rate": 9.883622472820921e-06, + "loss": 1.0740139, + "memory(GiB)": 302.58, + "step": 41260, + "train_speed(iter/s)": 0.124987 + }, + { + "acc": 0.73022728, + "epoch": 0.23085731222920194, + "grad_norm": 6.34375, + "learning_rate": 9.883424042219268e-06, + "loss": 1.05616531, + "memory(GiB)": 302.58, + "step": 41280, + "train_speed(iter/s)": 0.125016 + }, + { + "acc": 0.71876092, + "epoch": 0.2309691617021812, + "grad_norm": 12.75, + "learning_rate": 9.883225444589026e-06, + "loss": 1.11585741, + "memory(GiB)": 302.58, + "step": 41300, + "train_speed(iter/s)": 0.125044 + }, + { + "acc": 0.72772498, + "epoch": 0.23108101117516047, + "grad_norm": 6.90625, + "learning_rate": 9.88302667993699e-06, + "loss": 1.06729765, + "memory(GiB)": 302.58, + "step": 41320, + "train_speed(iter/s)": 0.125073 + }, + { + "acc": 0.74194202, + "epoch": 0.23119286064813974, + "grad_norm": 6.0625, + "learning_rate": 9.882827748269958e-06, + "loss": 1.01358404, + "memory(GiB)": 302.58, + "step": 41340, + "train_speed(iter/s)": 0.1251 + }, + { + "acc": 0.7439158, + "epoch": 0.231304710121119, + "grad_norm": 7.84375, + "learning_rate": 9.882628649594735e-06, + "loss": 0.9991642, + "memory(GiB)": 302.58, + "step": 41360, + "train_speed(iter/s)": 0.12513 + }, + { + "acc": 0.71543579, + "epoch": 0.23141655959409826, + "grad_norm": 8.3125, + "learning_rate": 9.882429383918129e-06, + "loss": 1.13776484, + "memory(GiB)": 302.58, + "step": 41380, + "train_speed(iter/s)": 0.125154 + }, + { + "acc": 0.73443394, + "epoch": 0.23152840906707753, + "grad_norm": 8.0625, + "learning_rate": 9.882229951246956e-06, + "loss": 1.0541254, + "memory(GiB)": 302.58, + "step": 41400, + "train_speed(iter/s)": 0.125183 + }, + { + "acc": 0.72868624, + "epoch": 0.2316402585400568, + "grad_norm": 7.46875, + "learning_rate": 9.882030351588038e-06, + "loss": 1.097927, + "memory(GiB)": 302.58, + "step": 41420, + "train_speed(iter/s)": 0.125211 + }, + { + "acc": 0.74763031, + "epoch": 0.23175210801303606, + "grad_norm": 7.09375, + "learning_rate": 9.881830584948202e-06, + "loss": 0.98221846, + "memory(GiB)": 302.58, + "step": 41440, + "train_speed(iter/s)": 0.12524 + }, + { + "acc": 0.72821469, + "epoch": 0.23186395748601532, + "grad_norm": 8.125, + "learning_rate": 9.88163065133428e-06, + "loss": 1.05428877, + "memory(GiB)": 302.58, + "step": 41460, + "train_speed(iter/s)": 0.125268 + }, + { + "acc": 0.7271173, + "epoch": 0.23197580695899458, + "grad_norm": 6.59375, + "learning_rate": 9.881430550753108e-06, + "loss": 1.07626715, + "memory(GiB)": 302.58, + "step": 41480, + "train_speed(iter/s)": 0.125297 + }, + { + "acc": 0.72931995, + "epoch": 0.23208765643197385, + "grad_norm": 5.78125, + "learning_rate": 9.881230283211536e-06, + "loss": 1.07794304, + "memory(GiB)": 302.58, + "step": 41500, + "train_speed(iter/s)": 0.125326 + }, + { + "acc": 0.72631407, + "epoch": 0.2321995059049531, + "grad_norm": 6.71875, + "learning_rate": 9.881029848716408e-06, + "loss": 1.08313599, + "memory(GiB)": 302.58, + "step": 41520, + "train_speed(iter/s)": 0.125354 + }, + { + "acc": 0.71918211, + "epoch": 0.23231135537793238, + "grad_norm": 8.375, + "learning_rate": 9.880829247274584e-06, + "loss": 1.11587143, + "memory(GiB)": 302.58, + "step": 41540, + "train_speed(iter/s)": 0.125383 + }, + { + "acc": 0.73463588, + "epoch": 0.23242320485091164, + "grad_norm": 8.625, + "learning_rate": 9.880628478892921e-06, + "loss": 1.02614422, + "memory(GiB)": 302.58, + "step": 41560, + "train_speed(iter/s)": 0.125411 + }, + { + "acc": 0.72747045, + "epoch": 0.2325350543238909, + "grad_norm": 7.21875, + "learning_rate": 9.880427543578288e-06, + "loss": 1.11937313, + "memory(GiB)": 302.58, + "step": 41580, + "train_speed(iter/s)": 0.125438 + }, + { + "acc": 0.72677913, + "epoch": 0.23264690379687017, + "grad_norm": 8.0, + "learning_rate": 9.880226441337559e-06, + "loss": 1.07639933, + "memory(GiB)": 302.58, + "step": 41600, + "train_speed(iter/s)": 0.125466 + }, + { + "acc": 0.72527213, + "epoch": 0.23275875326984943, + "grad_norm": 8.625, + "learning_rate": 9.88002517217761e-06, + "loss": 1.09933863, + "memory(GiB)": 302.58, + "step": 41620, + "train_speed(iter/s)": 0.125493 + }, + { + "acc": 0.73930912, + "epoch": 0.2328706027428287, + "grad_norm": 6.65625, + "learning_rate": 9.879823736105327e-06, + "loss": 1.03814259, + "memory(GiB)": 302.58, + "step": 41640, + "train_speed(iter/s)": 0.125521 + }, + { + "acc": 0.74995751, + "epoch": 0.23298245221580796, + "grad_norm": 6.09375, + "learning_rate": 9.879622133127597e-06, + "loss": 0.972616, + "memory(GiB)": 302.58, + "step": 41660, + "train_speed(iter/s)": 0.125546 + }, + { + "acc": 0.73338385, + "epoch": 0.23309430168878723, + "grad_norm": 7.15625, + "learning_rate": 9.87942036325132e-06, + "loss": 1.06977129, + "memory(GiB)": 302.58, + "step": 41680, + "train_speed(iter/s)": 0.125573 + }, + { + "acc": 0.74245801, + "epoch": 0.2332061511617665, + "grad_norm": 7.15625, + "learning_rate": 9.879218426483392e-06, + "loss": 1.01362267, + "memory(GiB)": 302.58, + "step": 41700, + "train_speed(iter/s)": 0.125601 + }, + { + "acc": 0.73826675, + "epoch": 0.23331800063474575, + "grad_norm": 10.1875, + "learning_rate": 9.879016322830723e-06, + "loss": 1.01759806, + "memory(GiB)": 302.58, + "step": 41720, + "train_speed(iter/s)": 0.125629 + }, + { + "acc": 0.71359005, + "epoch": 0.23342985010772502, + "grad_norm": 9.5, + "learning_rate": 9.878814052300225e-06, + "loss": 1.12952089, + "memory(GiB)": 302.58, + "step": 41740, + "train_speed(iter/s)": 0.125656 + }, + { + "acc": 0.72925677, + "epoch": 0.23354169958070428, + "grad_norm": 6.3125, + "learning_rate": 9.878611614898813e-06, + "loss": 1.0783144, + "memory(GiB)": 302.58, + "step": 41760, + "train_speed(iter/s)": 0.125683 + }, + { + "acc": 0.71989665, + "epoch": 0.23365354905368355, + "grad_norm": 7.6875, + "learning_rate": 9.87840901063342e-06, + "loss": 1.12877941, + "memory(GiB)": 302.58, + "step": 41780, + "train_speed(iter/s)": 0.125709 + }, + { + "acc": 0.73057413, + "epoch": 0.2337653985266628, + "grad_norm": 6.15625, + "learning_rate": 9.878206239510966e-06, + "loss": 1.07984877, + "memory(GiB)": 302.58, + "step": 41800, + "train_speed(iter/s)": 0.125737 + }, + { + "acc": 0.72462058, + "epoch": 0.23387724799964207, + "grad_norm": 6.53125, + "learning_rate": 9.87800330153839e-06, + "loss": 1.08923988, + "memory(GiB)": 302.58, + "step": 41820, + "train_speed(iter/s)": 0.125764 + }, + { + "acc": 0.71857071, + "epoch": 0.23398909747262134, + "grad_norm": 6.15625, + "learning_rate": 9.877800196722635e-06, + "loss": 1.12564354, + "memory(GiB)": 302.58, + "step": 41840, + "train_speed(iter/s)": 0.125792 + }, + { + "acc": 0.72578521, + "epoch": 0.2341009469456006, + "grad_norm": 5.75, + "learning_rate": 9.877596925070646e-06, + "loss": 1.09453859, + "memory(GiB)": 302.58, + "step": 41860, + "train_speed(iter/s)": 0.125819 + }, + { + "acc": 0.74579973, + "epoch": 0.23421279641857987, + "grad_norm": 9.3125, + "learning_rate": 9.877393486589373e-06, + "loss": 1.00930033, + "memory(GiB)": 302.58, + "step": 41880, + "train_speed(iter/s)": 0.125847 + }, + { + "acc": 0.72484093, + "epoch": 0.23432464589155913, + "grad_norm": 4.28125, + "learning_rate": 9.87718988128578e-06, + "loss": 1.080054, + "memory(GiB)": 302.58, + "step": 41900, + "train_speed(iter/s)": 0.125875 + }, + { + "acc": 0.72124596, + "epoch": 0.2344364953645384, + "grad_norm": 8.0625, + "learning_rate": 9.876986109166826e-06, + "loss": 1.09132347, + "memory(GiB)": 302.58, + "step": 41920, + "train_speed(iter/s)": 0.125902 + }, + { + "acc": 0.73320885, + "epoch": 0.23454834483751766, + "grad_norm": 6.125, + "learning_rate": 9.876782170239485e-06, + "loss": 1.05059252, + "memory(GiB)": 302.58, + "step": 41940, + "train_speed(iter/s)": 0.125929 + }, + { + "acc": 0.73496404, + "epoch": 0.23466019431049692, + "grad_norm": 5.53125, + "learning_rate": 9.876578064510728e-06, + "loss": 1.04846554, + "memory(GiB)": 302.58, + "step": 41960, + "train_speed(iter/s)": 0.125957 + }, + { + "acc": 0.71535497, + "epoch": 0.2347720437834762, + "grad_norm": 6.53125, + "learning_rate": 9.876373791987539e-06, + "loss": 1.10737181, + "memory(GiB)": 302.58, + "step": 41980, + "train_speed(iter/s)": 0.125986 + }, + { + "acc": 0.72670975, + "epoch": 0.23488389325645545, + "grad_norm": 8.125, + "learning_rate": 9.876169352676903e-06, + "loss": 1.09961472, + "memory(GiB)": 302.58, + "step": 42000, + "train_speed(iter/s)": 0.126014 + }, + { + "epoch": 0.23488389325645545, + "eval_acc": 0.696273379863584, + "eval_loss": 1.05818510055542, + "eval_runtime": 7510.4824, + "eval_samples_per_second": 10.024, + "eval_steps_per_second": 10.024, + "step": 42000 + }, + { + "acc": 0.74041972, + "epoch": 0.23499574272943471, + "grad_norm": 7.09375, + "learning_rate": 9.875964746585814e-06, + "loss": 1.02435198, + "memory(GiB)": 302.58, + "step": 42020, + "train_speed(iter/s)": 0.123225 + }, + { + "acc": 0.73672318, + "epoch": 0.23510759220241398, + "grad_norm": 5.46875, + "learning_rate": 9.875759973721269e-06, + "loss": 1.04470663, + "memory(GiB)": 302.58, + "step": 42040, + "train_speed(iter/s)": 0.123252 + }, + { + "acc": 0.73349137, + "epoch": 0.23521944167539324, + "grad_norm": 7.03125, + "learning_rate": 9.875555034090272e-06, + "loss": 1.05238247, + "memory(GiB)": 302.58, + "step": 42060, + "train_speed(iter/s)": 0.123279 + }, + { + "acc": 0.73289461, + "epoch": 0.2353312911483725, + "grad_norm": 5.25, + "learning_rate": 9.875349927699833e-06, + "loss": 1.03735533, + "memory(GiB)": 302.58, + "step": 42080, + "train_speed(iter/s)": 0.123306 + }, + { + "acc": 0.73954344, + "epoch": 0.2354431406213518, + "grad_norm": 6.71875, + "learning_rate": 9.875144654556967e-06, + "loss": 1.02346582, + "memory(GiB)": 302.58, + "step": 42100, + "train_speed(iter/s)": 0.123334 + }, + { + "acc": 0.7478673, + "epoch": 0.23555499009433106, + "grad_norm": 8.5, + "learning_rate": 9.874939214668696e-06, + "loss": 0.99137125, + "memory(GiB)": 302.58, + "step": 42120, + "train_speed(iter/s)": 0.123359 + }, + { + "acc": 0.72791467, + "epoch": 0.23566683956731033, + "grad_norm": 7.125, + "learning_rate": 9.874733608042045e-06, + "loss": 1.06321707, + "memory(GiB)": 302.58, + "step": 42140, + "train_speed(iter/s)": 0.123387 + }, + { + "acc": 0.72295089, + "epoch": 0.2357786890402896, + "grad_norm": 7.5625, + "learning_rate": 9.874527834684046e-06, + "loss": 1.09575758, + "memory(GiB)": 302.58, + "step": 42160, + "train_speed(iter/s)": 0.123413 + }, + { + "acc": 0.72466555, + "epoch": 0.23589053851326885, + "grad_norm": 5.1875, + "learning_rate": 9.87432189460174e-06, + "loss": 1.08243294, + "memory(GiB)": 302.58, + "step": 42180, + "train_speed(iter/s)": 0.123439 + }, + { + "acc": 0.7477725, + "epoch": 0.23600238798624812, + "grad_norm": 7.125, + "learning_rate": 9.87411578780217e-06, + "loss": 0.99857197, + "memory(GiB)": 302.58, + "step": 42200, + "train_speed(iter/s)": 0.123465 + }, + { + "acc": 0.72639513, + "epoch": 0.23611423745922738, + "grad_norm": 5.34375, + "learning_rate": 9.873909514292383e-06, + "loss": 1.07294016, + "memory(GiB)": 302.58, + "step": 42220, + "train_speed(iter/s)": 0.123493 + }, + { + "acc": 0.72433324, + "epoch": 0.23622608693220665, + "grad_norm": 8.125, + "learning_rate": 9.873703074079436e-06, + "loss": 1.10295715, + "memory(GiB)": 302.58, + "step": 42240, + "train_speed(iter/s)": 0.123519 + }, + { + "acc": 0.74066067, + "epoch": 0.2363379364051859, + "grad_norm": 8.5625, + "learning_rate": 9.87349646717039e-06, + "loss": 1.01938419, + "memory(GiB)": 302.58, + "step": 42260, + "train_speed(iter/s)": 0.123545 + }, + { + "acc": 0.71927285, + "epoch": 0.23644978587816518, + "grad_norm": 6.84375, + "learning_rate": 9.873289693572313e-06, + "loss": 1.12281218, + "memory(GiB)": 302.58, + "step": 42280, + "train_speed(iter/s)": 0.123573 + }, + { + "acc": 0.73873754, + "epoch": 0.23656163535114444, + "grad_norm": 6.09375, + "learning_rate": 9.873082753292273e-06, + "loss": 1.05914621, + "memory(GiB)": 302.58, + "step": 42300, + "train_speed(iter/s)": 0.123601 + }, + { + "acc": 0.73712053, + "epoch": 0.2366734848241237, + "grad_norm": 9.0625, + "learning_rate": 9.872875646337352e-06, + "loss": 1.05991287, + "memory(GiB)": 302.58, + "step": 42320, + "train_speed(iter/s)": 0.123629 + }, + { + "acc": 0.7207727, + "epoch": 0.23678533429710297, + "grad_norm": 6.46875, + "learning_rate": 9.872668372714633e-06, + "loss": 1.13229609, + "memory(GiB)": 302.58, + "step": 42340, + "train_speed(iter/s)": 0.123657 + }, + { + "acc": 0.72133274, + "epoch": 0.23689718377008223, + "grad_norm": 7.15625, + "learning_rate": 9.872460932431203e-06, + "loss": 1.106635, + "memory(GiB)": 302.58, + "step": 42360, + "train_speed(iter/s)": 0.123684 + }, + { + "acc": 0.73824024, + "epoch": 0.2370090332430615, + "grad_norm": 5.125, + "learning_rate": 9.872253325494162e-06, + "loss": 1.04176388, + "memory(GiB)": 302.58, + "step": 42380, + "train_speed(iter/s)": 0.12371 + }, + { + "acc": 0.71726251, + "epoch": 0.23712088271604076, + "grad_norm": 5.9375, + "learning_rate": 9.872045551910605e-06, + "loss": 1.11834202, + "memory(GiB)": 302.58, + "step": 42400, + "train_speed(iter/s)": 0.123737 + }, + { + "acc": 0.73549285, + "epoch": 0.23723273218902002, + "grad_norm": 6.21875, + "learning_rate": 9.871837611687642e-06, + "loss": 1.05409813, + "memory(GiB)": 302.58, + "step": 42420, + "train_speed(iter/s)": 0.123763 + }, + { + "acc": 0.74052992, + "epoch": 0.2373445816619993, + "grad_norm": 7.21875, + "learning_rate": 9.871629504832385e-06, + "loss": 1.02947426, + "memory(GiB)": 302.58, + "step": 42440, + "train_speed(iter/s)": 0.123791 + }, + { + "acc": 0.74938145, + "epoch": 0.23745643113497855, + "grad_norm": 7.09375, + "learning_rate": 9.871421231351951e-06, + "loss": 0.9739521, + "memory(GiB)": 302.58, + "step": 42460, + "train_speed(iter/s)": 0.123819 + }, + { + "acc": 0.72710605, + "epoch": 0.23756828060795782, + "grad_norm": 10.0, + "learning_rate": 9.871212791253464e-06, + "loss": 1.08228378, + "memory(GiB)": 302.58, + "step": 42480, + "train_speed(iter/s)": 0.123846 + }, + { + "acc": 0.72968392, + "epoch": 0.23768013008093708, + "grad_norm": 8.375, + "learning_rate": 9.871004184544054e-06, + "loss": 1.07725534, + "memory(GiB)": 302.58, + "step": 42500, + "train_speed(iter/s)": 0.123874 + }, + { + "acc": 0.71852179, + "epoch": 0.23779197955391634, + "grad_norm": 8.25, + "learning_rate": 9.870795411230855e-06, + "loss": 1.09344645, + "memory(GiB)": 302.58, + "step": 42520, + "train_speed(iter/s)": 0.123902 + }, + { + "acc": 0.73792439, + "epoch": 0.2379038290268956, + "grad_norm": 6.375, + "learning_rate": 9.870586471321009e-06, + "loss": 1.00951738, + "memory(GiB)": 302.58, + "step": 42540, + "train_speed(iter/s)": 0.123929 + }, + { + "acc": 0.73622036, + "epoch": 0.23801567849987487, + "grad_norm": 5.90625, + "learning_rate": 9.87037736482166e-06, + "loss": 1.0344346, + "memory(GiB)": 302.58, + "step": 42560, + "train_speed(iter/s)": 0.123955 + }, + { + "acc": 0.72214446, + "epoch": 0.23812752797285414, + "grad_norm": 9.5625, + "learning_rate": 9.870168091739962e-06, + "loss": 1.08873653, + "memory(GiB)": 302.58, + "step": 42580, + "train_speed(iter/s)": 0.123981 + }, + { + "acc": 0.73096466, + "epoch": 0.2382393774458334, + "grad_norm": 5.5625, + "learning_rate": 9.869958652083072e-06, + "loss": 1.07470102, + "memory(GiB)": 302.58, + "step": 42600, + "train_speed(iter/s)": 0.124009 + }, + { + "acc": 0.73659344, + "epoch": 0.23835122691881266, + "grad_norm": 7.375, + "learning_rate": 9.869749045858154e-06, + "loss": 1.04833565, + "memory(GiB)": 302.58, + "step": 42620, + "train_speed(iter/s)": 0.124036 + }, + { + "acc": 0.72942986, + "epoch": 0.23846307639179193, + "grad_norm": 6.96875, + "learning_rate": 9.869539273072378e-06, + "loss": 1.09079399, + "memory(GiB)": 302.58, + "step": 42640, + "train_speed(iter/s)": 0.124062 + }, + { + "acc": 0.72026429, + "epoch": 0.2385749258647712, + "grad_norm": 6.28125, + "learning_rate": 9.869329333732917e-06, + "loss": 1.11754322, + "memory(GiB)": 302.58, + "step": 42660, + "train_speed(iter/s)": 0.124089 + }, + { + "acc": 0.71632996, + "epoch": 0.23868677533775046, + "grad_norm": 7.625, + "learning_rate": 9.869119227846952e-06, + "loss": 1.11701994, + "memory(GiB)": 302.58, + "step": 42680, + "train_speed(iter/s)": 0.124116 + }, + { + "acc": 0.73498645, + "epoch": 0.23879862481072972, + "grad_norm": 6.875, + "learning_rate": 9.868908955421672e-06, + "loss": 1.02981844, + "memory(GiB)": 302.58, + "step": 42700, + "train_speed(iter/s)": 0.12414 + }, + { + "acc": 0.7315578, + "epoch": 0.23891047428370898, + "grad_norm": 6.65625, + "learning_rate": 9.868698516464266e-06, + "loss": 1.05687075, + "memory(GiB)": 302.58, + "step": 42720, + "train_speed(iter/s)": 0.124167 + }, + { + "acc": 0.74277034, + "epoch": 0.23902232375668825, + "grad_norm": 7.96875, + "learning_rate": 9.86848791098193e-06, + "loss": 1.00843401, + "memory(GiB)": 302.58, + "step": 42740, + "train_speed(iter/s)": 0.124195 + }, + { + "acc": 0.73389993, + "epoch": 0.2391341732296675, + "grad_norm": 5.84375, + "learning_rate": 9.868277138981873e-06, + "loss": 1.06752701, + "memory(GiB)": 302.58, + "step": 42760, + "train_speed(iter/s)": 0.124222 + }, + { + "acc": 0.73354044, + "epoch": 0.23924602270264678, + "grad_norm": 7.03125, + "learning_rate": 9.8680662004713e-06, + "loss": 1.05837975, + "memory(GiB)": 302.58, + "step": 42780, + "train_speed(iter/s)": 0.12425 + }, + { + "acc": 0.7293829, + "epoch": 0.23935787217562604, + "grad_norm": 6.59375, + "learning_rate": 9.867855095457426e-06, + "loss": 1.08173409, + "memory(GiB)": 302.58, + "step": 42800, + "train_speed(iter/s)": 0.124278 + }, + { + "acc": 0.72486124, + "epoch": 0.2394697216486053, + "grad_norm": 9.1875, + "learning_rate": 9.867643823947472e-06, + "loss": 1.08918324, + "memory(GiB)": 302.58, + "step": 42820, + "train_speed(iter/s)": 0.124306 + }, + { + "acc": 0.73446512, + "epoch": 0.23958157112158457, + "grad_norm": 7.46875, + "learning_rate": 9.867432385948666e-06, + "loss": 1.02116385, + "memory(GiB)": 302.58, + "step": 42840, + "train_speed(iter/s)": 0.124335 + }, + { + "acc": 0.7389894, + "epoch": 0.23969342059456383, + "grad_norm": 8.375, + "learning_rate": 9.867220781468239e-06, + "loss": 1.0203908, + "memory(GiB)": 302.58, + "step": 42860, + "train_speed(iter/s)": 0.124361 + }, + { + "acc": 0.74365206, + "epoch": 0.2398052700675431, + "grad_norm": 5.6875, + "learning_rate": 9.867009010513424e-06, + "loss": 1.00413361, + "memory(GiB)": 302.58, + "step": 42880, + "train_speed(iter/s)": 0.124388 + }, + { + "acc": 0.73401299, + "epoch": 0.23991711954052236, + "grad_norm": 7.625, + "learning_rate": 9.866797073091471e-06, + "loss": 1.05977983, + "memory(GiB)": 302.58, + "step": 42900, + "train_speed(iter/s)": 0.124415 + }, + { + "acc": 0.72685218, + "epoch": 0.24002896901350163, + "grad_norm": 5.65625, + "learning_rate": 9.866584969209624e-06, + "loss": 1.08386488, + "memory(GiB)": 302.58, + "step": 42920, + "train_speed(iter/s)": 0.124441 + }, + { + "acc": 0.73207974, + "epoch": 0.2401408184864809, + "grad_norm": 6.375, + "learning_rate": 9.866372698875143e-06, + "loss": 1.03902216, + "memory(GiB)": 302.58, + "step": 42940, + "train_speed(iter/s)": 0.124469 + }, + { + "acc": 0.73309488, + "epoch": 0.24025266795946015, + "grad_norm": 6.0625, + "learning_rate": 9.866160262095282e-06, + "loss": 1.07316732, + "memory(GiB)": 302.58, + "step": 42960, + "train_speed(iter/s)": 0.124496 + }, + { + "acc": 0.73644204, + "epoch": 0.24036451743243942, + "grad_norm": 5.65625, + "learning_rate": 9.865947658877309e-06, + "loss": 1.02965508, + "memory(GiB)": 302.58, + "step": 42980, + "train_speed(iter/s)": 0.124523 + }, + { + "acc": 0.72084489, + "epoch": 0.24047636690541868, + "grad_norm": 10.25, + "learning_rate": 9.865734889228497e-06, + "loss": 1.13254671, + "memory(GiB)": 302.58, + "step": 43000, + "train_speed(iter/s)": 0.124549 + }, + { + "acc": 0.7279603, + "epoch": 0.24058821637839795, + "grad_norm": 6.96875, + "learning_rate": 9.865521953156126e-06, + "loss": 1.0748724, + "memory(GiB)": 302.58, + "step": 43020, + "train_speed(iter/s)": 0.124577 + }, + { + "acc": 0.73063831, + "epoch": 0.2407000658513772, + "grad_norm": 6.3125, + "learning_rate": 9.865308850667473e-06, + "loss": 1.04853802, + "memory(GiB)": 302.58, + "step": 43040, + "train_speed(iter/s)": 0.124604 + }, + { + "acc": 0.73659692, + "epoch": 0.24081191532435647, + "grad_norm": 5.125, + "learning_rate": 9.86509558176983e-06, + "loss": 1.03313732, + "memory(GiB)": 302.58, + "step": 43060, + "train_speed(iter/s)": 0.124628 + }, + { + "acc": 0.74295254, + "epoch": 0.24092376479733574, + "grad_norm": 8.5625, + "learning_rate": 9.86488214647049e-06, + "loss": 1.02324877, + "memory(GiB)": 302.58, + "step": 43080, + "train_speed(iter/s)": 0.124656 + }, + { + "acc": 0.74462328, + "epoch": 0.241035614270315, + "grad_norm": 4.71875, + "learning_rate": 9.864668544776758e-06, + "loss": 1.00221767, + "memory(GiB)": 302.58, + "step": 43100, + "train_speed(iter/s)": 0.124682 + }, + { + "acc": 0.73495326, + "epoch": 0.24114746374329427, + "grad_norm": 7.625, + "learning_rate": 9.864454776695935e-06, + "loss": 1.04376516, + "memory(GiB)": 302.58, + "step": 43120, + "train_speed(iter/s)": 0.124708 + }, + { + "acc": 0.73226681, + "epoch": 0.24125931321627353, + "grad_norm": 6.9375, + "learning_rate": 9.864240842235333e-06, + "loss": 1.07241421, + "memory(GiB)": 302.58, + "step": 43140, + "train_speed(iter/s)": 0.124736 + }, + { + "acc": 0.72155371, + "epoch": 0.2413711626892528, + "grad_norm": 7.78125, + "learning_rate": 9.86402674140227e-06, + "loss": 1.09353561, + "memory(GiB)": 302.58, + "step": 43160, + "train_speed(iter/s)": 0.124762 + }, + { + "acc": 0.73136115, + "epoch": 0.24148301216223206, + "grad_norm": 6.9375, + "learning_rate": 9.86381247420407e-06, + "loss": 1.06826162, + "memory(GiB)": 302.58, + "step": 43180, + "train_speed(iter/s)": 0.124788 + }, + { + "acc": 0.72725668, + "epoch": 0.24159486163521132, + "grad_norm": 6.96875, + "learning_rate": 9.863598040648062e-06, + "loss": 1.08711519, + "memory(GiB)": 302.58, + "step": 43200, + "train_speed(iter/s)": 0.124813 + }, + { + "acc": 0.74333405, + "epoch": 0.2417067111081906, + "grad_norm": 4.09375, + "learning_rate": 9.863383440741576e-06, + "loss": 0.98320618, + "memory(GiB)": 302.58, + "step": 43220, + "train_speed(iter/s)": 0.12484 + }, + { + "acc": 0.72071805, + "epoch": 0.24181856058116985, + "grad_norm": 6.78125, + "learning_rate": 9.863168674491956e-06, + "loss": 1.09234676, + "memory(GiB)": 302.58, + "step": 43240, + "train_speed(iter/s)": 0.124867 + }, + { + "acc": 0.72717423, + "epoch": 0.24193041005414911, + "grad_norm": 8.0, + "learning_rate": 9.862953741906548e-06, + "loss": 1.09007492, + "memory(GiB)": 302.58, + "step": 43260, + "train_speed(iter/s)": 0.124893 + }, + { + "acc": 0.74016304, + "epoch": 0.24204225952712838, + "grad_norm": 7.4375, + "learning_rate": 9.8627386429927e-06, + "loss": 1.03533115, + "memory(GiB)": 302.58, + "step": 43280, + "train_speed(iter/s)": 0.124919 + }, + { + "acc": 0.73943214, + "epoch": 0.24215410900010764, + "grad_norm": 9.5, + "learning_rate": 9.862523377757774e-06, + "loss": 1.0176466, + "memory(GiB)": 302.58, + "step": 43300, + "train_speed(iter/s)": 0.124946 + }, + { + "acc": 0.71259136, + "epoch": 0.2422659584730869, + "grad_norm": 7.90625, + "learning_rate": 9.862307946209126e-06, + "loss": 1.14191141, + "memory(GiB)": 302.58, + "step": 43320, + "train_speed(iter/s)": 0.124971 + }, + { + "acc": 0.73830934, + "epoch": 0.24237780794606617, + "grad_norm": 7.15625, + "learning_rate": 9.86209234835413e-06, + "loss": 1.05953245, + "memory(GiB)": 302.58, + "step": 43340, + "train_speed(iter/s)": 0.124998 + }, + { + "acc": 0.74599791, + "epoch": 0.24248965741904546, + "grad_norm": 9.6875, + "learning_rate": 9.861876584200159e-06, + "loss": 0.99061317, + "memory(GiB)": 302.58, + "step": 43360, + "train_speed(iter/s)": 0.125026 + }, + { + "acc": 0.72655878, + "epoch": 0.24260150689202473, + "grad_norm": 9.5625, + "learning_rate": 9.861660653754592e-06, + "loss": 1.09063816, + "memory(GiB)": 302.58, + "step": 43380, + "train_speed(iter/s)": 0.125055 + }, + { + "acc": 0.72658458, + "epoch": 0.242713356365004, + "grad_norm": 6.125, + "learning_rate": 9.861444557024815e-06, + "loss": 1.05254221, + "memory(GiB)": 302.58, + "step": 43400, + "train_speed(iter/s)": 0.125083 + }, + { + "acc": 0.73790898, + "epoch": 0.24282520583798325, + "grad_norm": 5.5625, + "learning_rate": 9.861228294018217e-06, + "loss": 1.01765308, + "memory(GiB)": 302.58, + "step": 43420, + "train_speed(iter/s)": 0.125107 + }, + { + "acc": 0.74780211, + "epoch": 0.24293705531096252, + "grad_norm": 6.09375, + "learning_rate": 9.8610118647422e-06, + "loss": 0.98639364, + "memory(GiB)": 302.58, + "step": 43440, + "train_speed(iter/s)": 0.125134 + }, + { + "acc": 0.73964562, + "epoch": 0.24304890478394178, + "grad_norm": 7.75, + "learning_rate": 9.86079526920416e-06, + "loss": 1.01172428, + "memory(GiB)": 302.58, + "step": 43460, + "train_speed(iter/s)": 0.125162 + }, + { + "acc": 0.73494673, + "epoch": 0.24316075425692105, + "grad_norm": 7.09375, + "learning_rate": 9.86057850741151e-06, + "loss": 1.04030418, + "memory(GiB)": 302.58, + "step": 43480, + "train_speed(iter/s)": 0.125189 + }, + { + "acc": 0.72062163, + "epoch": 0.2432726037299003, + "grad_norm": 5.84375, + "learning_rate": 9.860361579371664e-06, + "loss": 1.11953182, + "memory(GiB)": 302.58, + "step": 43500, + "train_speed(iter/s)": 0.125217 + }, + { + "acc": 0.72768936, + "epoch": 0.24338445320287957, + "grad_norm": 6.4375, + "learning_rate": 9.86014448509204e-06, + "loss": 1.08960791, + "memory(GiB)": 302.58, + "step": 43520, + "train_speed(iter/s)": 0.125243 + }, + { + "acc": 0.73731256, + "epoch": 0.24349630267585884, + "grad_norm": 6.40625, + "learning_rate": 9.859927224580062e-06, + "loss": 1.02159204, + "memory(GiB)": 302.58, + "step": 43540, + "train_speed(iter/s)": 0.125269 + }, + { + "acc": 0.72272677, + "epoch": 0.2436081521488381, + "grad_norm": 5.78125, + "learning_rate": 9.859709797843162e-06, + "loss": 1.1086997, + "memory(GiB)": 302.58, + "step": 43560, + "train_speed(iter/s)": 0.125292 + }, + { + "acc": 0.73115983, + "epoch": 0.24372000162181737, + "grad_norm": 6.6875, + "learning_rate": 9.859492204888778e-06, + "loss": 1.05621529, + "memory(GiB)": 302.58, + "step": 43580, + "train_speed(iter/s)": 0.125319 + }, + { + "acc": 0.73441339, + "epoch": 0.24383185109479663, + "grad_norm": 8.25, + "learning_rate": 9.859274445724352e-06, + "loss": 1.05568571, + "memory(GiB)": 302.58, + "step": 43600, + "train_speed(iter/s)": 0.125345 + }, + { + "acc": 0.73335791, + "epoch": 0.2439437005677759, + "grad_norm": 8.0, + "learning_rate": 9.859056520357334e-06, + "loss": 1.05068474, + "memory(GiB)": 302.58, + "step": 43620, + "train_speed(iter/s)": 0.12537 + }, + { + "acc": 0.73300834, + "epoch": 0.24405555004075516, + "grad_norm": 6.84375, + "learning_rate": 9.858838428795171e-06, + "loss": 1.06280565, + "memory(GiB)": 302.58, + "step": 43640, + "train_speed(iter/s)": 0.125397 + }, + { + "acc": 0.71701388, + "epoch": 0.24416739951373442, + "grad_norm": 6.5625, + "learning_rate": 9.858620171045328e-06, + "loss": 1.1343977, + "memory(GiB)": 302.58, + "step": 43660, + "train_speed(iter/s)": 0.125422 + }, + { + "acc": 0.72156863, + "epoch": 0.2442792489867137, + "grad_norm": 7.1875, + "learning_rate": 9.858401747115268e-06, + "loss": 1.09964161, + "memory(GiB)": 302.58, + "step": 43680, + "train_speed(iter/s)": 0.125449 + }, + { + "acc": 0.75345616, + "epoch": 0.24439109845969295, + "grad_norm": 5.96875, + "learning_rate": 9.858183157012466e-06, + "loss": 0.96709766, + "memory(GiB)": 302.58, + "step": 43700, + "train_speed(iter/s)": 0.125477 + }, + { + "acc": 0.73016319, + "epoch": 0.24450294793267222, + "grad_norm": 7.40625, + "learning_rate": 9.857964400744394e-06, + "loss": 1.06440287, + "memory(GiB)": 302.58, + "step": 43720, + "train_speed(iter/s)": 0.125504 + }, + { + "acc": 0.73468876, + "epoch": 0.24461479740565148, + "grad_norm": 6.53125, + "learning_rate": 9.857745478318535e-06, + "loss": 1.0557662, + "memory(GiB)": 302.58, + "step": 43740, + "train_speed(iter/s)": 0.125531 + }, + { + "acc": 0.72804794, + "epoch": 0.24472664687863074, + "grad_norm": 6.71875, + "learning_rate": 9.857526389742376e-06, + "loss": 1.05573473, + "memory(GiB)": 302.58, + "step": 43760, + "train_speed(iter/s)": 0.125557 + }, + { + "acc": 0.73164263, + "epoch": 0.24483849635161, + "grad_norm": 6.0, + "learning_rate": 9.857307135023412e-06, + "loss": 1.06074648, + "memory(GiB)": 302.58, + "step": 43780, + "train_speed(iter/s)": 0.125583 + }, + { + "acc": 0.73720713, + "epoch": 0.24495034582458927, + "grad_norm": 4.5625, + "learning_rate": 9.857087714169142e-06, + "loss": 1.0213378, + "memory(GiB)": 302.58, + "step": 43800, + "train_speed(iter/s)": 0.12561 + }, + { + "acc": 0.73539162, + "epoch": 0.24506219529756854, + "grad_norm": 5.03125, + "learning_rate": 9.856868127187072e-06, + "loss": 1.03783646, + "memory(GiB)": 302.58, + "step": 43820, + "train_speed(iter/s)": 0.125636 + }, + { + "acc": 0.71603937, + "epoch": 0.2451740447705478, + "grad_norm": 5.75, + "learning_rate": 9.85664837408471e-06, + "loss": 1.12727528, + "memory(GiB)": 302.58, + "step": 43840, + "train_speed(iter/s)": 0.125661 + }, + { + "acc": 0.72427235, + "epoch": 0.24528589424352706, + "grad_norm": 5.9375, + "learning_rate": 9.856428454869574e-06, + "loss": 1.08535795, + "memory(GiB)": 302.58, + "step": 43860, + "train_speed(iter/s)": 0.125686 + }, + { + "acc": 0.7341507, + "epoch": 0.24539774371650633, + "grad_norm": 6.625, + "learning_rate": 9.856208369549185e-06, + "loss": 1.03626432, + "memory(GiB)": 302.58, + "step": 43880, + "train_speed(iter/s)": 0.12571 + }, + { + "acc": 0.73272333, + "epoch": 0.2455095931894856, + "grad_norm": 6.96875, + "learning_rate": 9.855988118131071e-06, + "loss": 1.06120081, + "memory(GiB)": 302.58, + "step": 43900, + "train_speed(iter/s)": 0.125737 + }, + { + "acc": 0.73459949, + "epoch": 0.24562144266246486, + "grad_norm": 6.5625, + "learning_rate": 9.855767700622766e-06, + "loss": 1.05521593, + "memory(GiB)": 302.58, + "step": 43920, + "train_speed(iter/s)": 0.125763 + }, + { + "acc": 0.72922764, + "epoch": 0.24573329213544412, + "grad_norm": 7.34375, + "learning_rate": 9.855547117031809e-06, + "loss": 1.06523266, + "memory(GiB)": 302.58, + "step": 43940, + "train_speed(iter/s)": 0.12579 + }, + { + "acc": 0.72795053, + "epoch": 0.24584514160842338, + "grad_norm": 5.28125, + "learning_rate": 9.855326367365743e-06, + "loss": 1.07232008, + "memory(GiB)": 302.58, + "step": 43960, + "train_speed(iter/s)": 0.125817 + }, + { + "acc": 0.73058481, + "epoch": 0.24595699108140265, + "grad_norm": 8.4375, + "learning_rate": 9.855105451632121e-06, + "loss": 1.07472296, + "memory(GiB)": 302.58, + "step": 43980, + "train_speed(iter/s)": 0.125843 + }, + { + "acc": 0.72510872, + "epoch": 0.2460688405543819, + "grad_norm": 8.625, + "learning_rate": 9.854884369838496e-06, + "loss": 1.08152494, + "memory(GiB)": 302.58, + "step": 44000, + "train_speed(iter/s)": 0.12587 + }, + { + "epoch": 0.2460688405543819, + "eval_acc": 0.6965869567210696, + "eval_loss": 1.0568163394927979, + "eval_runtime": 7539.6966, + "eval_samples_per_second": 9.985, + "eval_steps_per_second": 9.985, + "step": 44000 + }, + { + "acc": 0.72692919, + "epoch": 0.24618069002736118, + "grad_norm": 7.34375, + "learning_rate": 9.854663121992432e-06, + "loss": 1.07702341, + "memory(GiB)": 302.58, + "step": 44020, + "train_speed(iter/s)": 0.123203 + }, + { + "acc": 0.72152162, + "epoch": 0.24629253950034044, + "grad_norm": 5.9375, + "learning_rate": 9.854441708101494e-06, + "loss": 1.12136688, + "memory(GiB)": 302.58, + "step": 44040, + "train_speed(iter/s)": 0.123227 + }, + { + "acc": 0.73346972, + "epoch": 0.2464043889733197, + "grad_norm": 7.75, + "learning_rate": 9.854220128173257e-06, + "loss": 1.05044861, + "memory(GiB)": 302.58, + "step": 44060, + "train_speed(iter/s)": 0.123254 + }, + { + "acc": 0.73307719, + "epoch": 0.24651623844629897, + "grad_norm": 7.4375, + "learning_rate": 9.8539983822153e-06, + "loss": 1.07152138, + "memory(GiB)": 302.58, + "step": 44080, + "train_speed(iter/s)": 0.12328 + }, + { + "acc": 0.72765074, + "epoch": 0.24662808791927823, + "grad_norm": 4.75, + "learning_rate": 9.853776470235207e-06, + "loss": 1.05844193, + "memory(GiB)": 302.58, + "step": 44100, + "train_speed(iter/s)": 0.123307 + }, + { + "acc": 0.73343072, + "epoch": 0.2467399373922575, + "grad_norm": 9.5, + "learning_rate": 9.853554392240567e-06, + "loss": 1.06058359, + "memory(GiB)": 302.58, + "step": 44120, + "train_speed(iter/s)": 0.123333 + }, + { + "acc": 0.72664599, + "epoch": 0.24685178686523676, + "grad_norm": 8.125, + "learning_rate": 9.853332148238979e-06, + "loss": 1.08525858, + "memory(GiB)": 302.58, + "step": 44140, + "train_speed(iter/s)": 0.123358 + }, + { + "acc": 0.71799736, + "epoch": 0.24696363633821602, + "grad_norm": 7.78125, + "learning_rate": 9.853109738238038e-06, + "loss": 1.12657232, + "memory(GiB)": 302.58, + "step": 44160, + "train_speed(iter/s)": 0.123384 + }, + { + "acc": 0.7383935, + "epoch": 0.2470754858111953, + "grad_norm": 5.5, + "learning_rate": 9.852887162245357e-06, + "loss": 1.00437746, + "memory(GiB)": 302.58, + "step": 44180, + "train_speed(iter/s)": 0.123412 + }, + { + "acc": 0.73235092, + "epoch": 0.24718733528417455, + "grad_norm": 6.96875, + "learning_rate": 9.852664420268549e-06, + "loss": 1.06684656, + "memory(GiB)": 302.58, + "step": 44200, + "train_speed(iter/s)": 0.123438 + }, + { + "acc": 0.73237281, + "epoch": 0.24729918475715382, + "grad_norm": 7.34375, + "learning_rate": 9.852441512315228e-06, + "loss": 1.06584101, + "memory(GiB)": 302.58, + "step": 44220, + "train_speed(iter/s)": 0.123464 + }, + { + "acc": 0.71807041, + "epoch": 0.24741103423013308, + "grad_norm": 12.0, + "learning_rate": 9.852218438393021e-06, + "loss": 1.10549049, + "memory(GiB)": 302.58, + "step": 44240, + "train_speed(iter/s)": 0.12349 + }, + { + "acc": 0.72958703, + "epoch": 0.24752288370311235, + "grad_norm": 9.25, + "learning_rate": 9.851995198509559e-06, + "loss": 1.06030598, + "memory(GiB)": 302.58, + "step": 44260, + "train_speed(iter/s)": 0.123517 + }, + { + "acc": 0.7108912, + "epoch": 0.2476347331760916, + "grad_norm": 6.21875, + "learning_rate": 9.851771792672473e-06, + "loss": 1.15487232, + "memory(GiB)": 302.58, + "step": 44280, + "train_speed(iter/s)": 0.123544 + }, + { + "acc": 0.72086954, + "epoch": 0.24774658264907087, + "grad_norm": 6.53125, + "learning_rate": 9.851548220889408e-06, + "loss": 1.11162205, + "memory(GiB)": 302.58, + "step": 44300, + "train_speed(iter/s)": 0.12357 + }, + { + "acc": 0.72380137, + "epoch": 0.24785843212205014, + "grad_norm": 8.625, + "learning_rate": 9.851324483168012e-06, + "loss": 1.07580147, + "memory(GiB)": 302.58, + "step": 44320, + "train_speed(iter/s)": 0.123595 + }, + { + "acc": 0.73531461, + "epoch": 0.2479702815950294, + "grad_norm": 5.34375, + "learning_rate": 9.851100579515934e-06, + "loss": 1.03320217, + "memory(GiB)": 302.58, + "step": 44340, + "train_speed(iter/s)": 0.123622 + }, + { + "acc": 0.73572569, + "epoch": 0.24808213106800867, + "grad_norm": 7.0625, + "learning_rate": 9.850876509940834e-06, + "loss": 1.0485425, + "memory(GiB)": 302.58, + "step": 44360, + "train_speed(iter/s)": 0.123646 + }, + { + "acc": 0.71661434, + "epoch": 0.24819398054098793, + "grad_norm": 7.4375, + "learning_rate": 9.850652274450377e-06, + "loss": 1.13419676, + "memory(GiB)": 302.58, + "step": 44380, + "train_speed(iter/s)": 0.123672 + }, + { + "acc": 0.73706145, + "epoch": 0.2483058300139672, + "grad_norm": 5.3125, + "learning_rate": 9.85042787305223e-06, + "loss": 1.04370718, + "memory(GiB)": 302.58, + "step": 44400, + "train_speed(iter/s)": 0.123696 + }, + { + "acc": 0.72177653, + "epoch": 0.24841767948694646, + "grad_norm": 6.28125, + "learning_rate": 9.850203305754068e-06, + "loss": 1.09557858, + "memory(GiB)": 302.58, + "step": 44420, + "train_speed(iter/s)": 0.123721 + }, + { + "acc": 0.7174901, + "epoch": 0.24852952895992572, + "grad_norm": 5.25, + "learning_rate": 9.849978572563575e-06, + "loss": 1.12744007, + "memory(GiB)": 302.58, + "step": 44440, + "train_speed(iter/s)": 0.123748 + }, + { + "acc": 0.73094869, + "epoch": 0.24864137843290499, + "grad_norm": 7.59375, + "learning_rate": 9.849753673488436e-06, + "loss": 1.06521864, + "memory(GiB)": 302.58, + "step": 44460, + "train_speed(iter/s)": 0.123774 + }, + { + "acc": 0.72240038, + "epoch": 0.24875322790588425, + "grad_norm": 6.65625, + "learning_rate": 9.849528608536343e-06, + "loss": 1.10312347, + "memory(GiB)": 302.58, + "step": 44480, + "train_speed(iter/s)": 0.123797 + }, + { + "acc": 0.71885314, + "epoch": 0.24886507737886351, + "grad_norm": 6.9375, + "learning_rate": 9.849303377714994e-06, + "loss": 1.10564709, + "memory(GiB)": 302.58, + "step": 44500, + "train_speed(iter/s)": 0.123824 + }, + { + "acc": 0.73433418, + "epoch": 0.24897692685184278, + "grad_norm": 5.28125, + "learning_rate": 9.849077981032091e-06, + "loss": 1.02764788, + "memory(GiB)": 302.58, + "step": 44520, + "train_speed(iter/s)": 0.123849 + }, + { + "acc": 0.73086219, + "epoch": 0.24908877632482204, + "grad_norm": 6.75, + "learning_rate": 9.848852418495349e-06, + "loss": 1.07537632, + "memory(GiB)": 302.58, + "step": 44540, + "train_speed(iter/s)": 0.123875 + }, + { + "acc": 0.71820445, + "epoch": 0.2492006257978013, + "grad_norm": 4.21875, + "learning_rate": 9.848626690112476e-06, + "loss": 1.11143618, + "memory(GiB)": 302.58, + "step": 44560, + "train_speed(iter/s)": 0.123901 + }, + { + "acc": 0.72212496, + "epoch": 0.24931247527078057, + "grad_norm": 6.875, + "learning_rate": 9.848400795891197e-06, + "loss": 1.09606085, + "memory(GiB)": 302.58, + "step": 44580, + "train_speed(iter/s)": 0.123927 + }, + { + "acc": 0.72756438, + "epoch": 0.24942432474375983, + "grad_norm": 7.25, + "learning_rate": 9.848174735839235e-06, + "loss": 1.08609324, + "memory(GiB)": 302.58, + "step": 44600, + "train_speed(iter/s)": 0.123953 + }, + { + "acc": 0.72418299, + "epoch": 0.24953617421673913, + "grad_norm": 6.875, + "learning_rate": 9.847948509964327e-06, + "loss": 1.097472, + "memory(GiB)": 302.58, + "step": 44620, + "train_speed(iter/s)": 0.12398 + }, + { + "acc": 0.74128776, + "epoch": 0.2496480236897184, + "grad_norm": 6.8125, + "learning_rate": 9.847722118274205e-06, + "loss": 0.99922132, + "memory(GiB)": 302.58, + "step": 44640, + "train_speed(iter/s)": 0.124006 + }, + { + "acc": 0.72894597, + "epoch": 0.24975987316269765, + "grad_norm": 6.15625, + "learning_rate": 9.847495560776616e-06, + "loss": 1.06353397, + "memory(GiB)": 302.58, + "step": 44660, + "train_speed(iter/s)": 0.124032 + }, + { + "acc": 0.73540359, + "epoch": 0.24987172263567692, + "grad_norm": 9.0625, + "learning_rate": 9.847268837479307e-06, + "loss": 1.03197746, + "memory(GiB)": 302.58, + "step": 44680, + "train_speed(iter/s)": 0.124058 + }, + { + "acc": 0.73209753, + "epoch": 0.24998357210865618, + "grad_norm": 10.5, + "learning_rate": 9.847041948390034e-06, + "loss": 1.05107641, + "memory(GiB)": 302.58, + "step": 44700, + "train_speed(iter/s)": 0.124086 + }, + { + "acc": 0.73335319, + "epoch": 0.25009542158163545, + "grad_norm": 7.875, + "learning_rate": 9.846814893516557e-06, + "loss": 1.04270115, + "memory(GiB)": 302.58, + "step": 44720, + "train_speed(iter/s)": 0.12411 + }, + { + "acc": 0.73825936, + "epoch": 0.2502072710546147, + "grad_norm": 9.0625, + "learning_rate": 9.846587672866642e-06, + "loss": 1.02226067, + "memory(GiB)": 302.58, + "step": 44740, + "train_speed(iter/s)": 0.124135 + }, + { + "acc": 0.75310802, + "epoch": 0.250319120527594, + "grad_norm": 8.25, + "learning_rate": 9.846360286448059e-06, + "loss": 0.96744299, + "memory(GiB)": 302.58, + "step": 44760, + "train_speed(iter/s)": 0.124159 + }, + { + "acc": 0.73787141, + "epoch": 0.25043097000057324, + "grad_norm": 6.59375, + "learning_rate": 9.846132734268587e-06, + "loss": 1.03819513, + "memory(GiB)": 302.58, + "step": 44780, + "train_speed(iter/s)": 0.124185 + }, + { + "acc": 0.73538904, + "epoch": 0.2505428194735525, + "grad_norm": 6.59375, + "learning_rate": 9.84590501633601e-06, + "loss": 1.03986454, + "memory(GiB)": 302.58, + "step": 44800, + "train_speed(iter/s)": 0.124213 + }, + { + "acc": 0.72021594, + "epoch": 0.25065466894653177, + "grad_norm": 6.6875, + "learning_rate": 9.845677132658116e-06, + "loss": 1.10514345, + "memory(GiB)": 302.58, + "step": 44820, + "train_speed(iter/s)": 0.124236 + }, + { + "acc": 0.72550125, + "epoch": 0.25076651841951103, + "grad_norm": 6.875, + "learning_rate": 9.845449083242698e-06, + "loss": 1.08781462, + "memory(GiB)": 302.58, + "step": 44840, + "train_speed(iter/s)": 0.124263 + }, + { + "acc": 0.72149053, + "epoch": 0.2508783678924903, + "grad_norm": 8.4375, + "learning_rate": 9.845220868097555e-06, + "loss": 1.10029993, + "memory(GiB)": 302.58, + "step": 44860, + "train_speed(iter/s)": 0.124289 + }, + { + "acc": 0.71594515, + "epoch": 0.25099021736546956, + "grad_norm": 8.75, + "learning_rate": 9.844992487230498e-06, + "loss": 1.11898756, + "memory(GiB)": 302.58, + "step": 44880, + "train_speed(iter/s)": 0.124314 + }, + { + "acc": 0.73460784, + "epoch": 0.2511020668384488, + "grad_norm": 7.09375, + "learning_rate": 9.844763940649334e-06, + "loss": 1.05556278, + "memory(GiB)": 302.58, + "step": 44900, + "train_speed(iter/s)": 0.12434 + }, + { + "acc": 0.74639444, + "epoch": 0.2512139163114281, + "grad_norm": 7.15625, + "learning_rate": 9.84453522836188e-06, + "loss": 1.00273342, + "memory(GiB)": 302.58, + "step": 44920, + "train_speed(iter/s)": 0.124365 + }, + { + "acc": 0.72322097, + "epoch": 0.25132576578440735, + "grad_norm": 7.34375, + "learning_rate": 9.844306350375958e-06, + "loss": 1.09937897, + "memory(GiB)": 302.58, + "step": 44940, + "train_speed(iter/s)": 0.12439 + }, + { + "acc": 0.73498535, + "epoch": 0.2514376152573866, + "grad_norm": 4.28125, + "learning_rate": 9.8440773066994e-06, + "loss": 1.05324097, + "memory(GiB)": 302.58, + "step": 44960, + "train_speed(iter/s)": 0.124417 + }, + { + "acc": 0.72457557, + "epoch": 0.2515494647303659, + "grad_norm": 7.3125, + "learning_rate": 9.843848097340039e-06, + "loss": 1.09659128, + "memory(GiB)": 302.58, + "step": 44980, + "train_speed(iter/s)": 0.124443 + }, + { + "acc": 0.72744732, + "epoch": 0.25166131420334514, + "grad_norm": 4.9375, + "learning_rate": 9.843618722305712e-06, + "loss": 1.08722687, + "memory(GiB)": 302.58, + "step": 45000, + "train_speed(iter/s)": 0.124469 + }, + { + "acc": 0.73212337, + "epoch": 0.2517731636763244, + "grad_norm": 7.03125, + "learning_rate": 9.843389181604267e-06, + "loss": 1.05931578, + "memory(GiB)": 302.58, + "step": 45020, + "train_speed(iter/s)": 0.124492 + }, + { + "acc": 0.73901153, + "epoch": 0.25188501314930367, + "grad_norm": 7.28125, + "learning_rate": 9.843159475243553e-06, + "loss": 1.02073679, + "memory(GiB)": 302.58, + "step": 45040, + "train_speed(iter/s)": 0.124517 + }, + { + "acc": 0.7457624, + "epoch": 0.25199686262228294, + "grad_norm": 7.1875, + "learning_rate": 9.842929603231426e-06, + "loss": 0.98910351, + "memory(GiB)": 302.58, + "step": 45060, + "train_speed(iter/s)": 0.124541 + }, + { + "acc": 0.73426318, + "epoch": 0.2521087120952622, + "grad_norm": 5.78125, + "learning_rate": 9.842699565575753e-06, + "loss": 1.04391317, + "memory(GiB)": 302.58, + "step": 45080, + "train_speed(iter/s)": 0.124567 + }, + { + "acc": 0.72943497, + "epoch": 0.25222056156824146, + "grad_norm": 6.59375, + "learning_rate": 9.842469362284399e-06, + "loss": 1.08423986, + "memory(GiB)": 302.58, + "step": 45100, + "train_speed(iter/s)": 0.124593 + }, + { + "acc": 0.74195199, + "epoch": 0.2523324110412207, + "grad_norm": 8.5625, + "learning_rate": 9.842238993365237e-06, + "loss": 0.9987649, + "memory(GiB)": 302.58, + "step": 45120, + "train_speed(iter/s)": 0.124618 + }, + { + "acc": 0.74178286, + "epoch": 0.2524442605142, + "grad_norm": 7.84375, + "learning_rate": 9.842008458826147e-06, + "loss": 1.00955982, + "memory(GiB)": 302.58, + "step": 45140, + "train_speed(iter/s)": 0.124645 + }, + { + "acc": 0.74208918, + "epoch": 0.25255610998717926, + "grad_norm": 5.84375, + "learning_rate": 9.841777758675014e-06, + "loss": 1.00888453, + "memory(GiB)": 302.58, + "step": 45160, + "train_speed(iter/s)": 0.124671 + }, + { + "acc": 0.73953266, + "epoch": 0.2526679594601585, + "grad_norm": 6.15625, + "learning_rate": 9.841546892919729e-06, + "loss": 1.01236506, + "memory(GiB)": 302.58, + "step": 45180, + "train_speed(iter/s)": 0.124697 + }, + { + "acc": 0.72615223, + "epoch": 0.2527798089331378, + "grad_norm": 9.8125, + "learning_rate": 9.841315861568186e-06, + "loss": 1.08300753, + "memory(GiB)": 302.58, + "step": 45200, + "train_speed(iter/s)": 0.124722 + }, + { + "acc": 0.7307631, + "epoch": 0.25289165840611705, + "grad_norm": 8.3125, + "learning_rate": 9.841084664628293e-06, + "loss": 1.0636488, + "memory(GiB)": 302.58, + "step": 45220, + "train_speed(iter/s)": 0.124747 + }, + { + "acc": 0.71363525, + "epoch": 0.2530035078790963, + "grad_norm": 7.96875, + "learning_rate": 9.84085330210795e-06, + "loss": 1.14635382, + "memory(GiB)": 302.58, + "step": 45240, + "train_speed(iter/s)": 0.124774 + }, + { + "acc": 0.74808984, + "epoch": 0.2531153573520756, + "grad_norm": 5.8125, + "learning_rate": 9.840621774015078e-06, + "loss": 0.98669767, + "memory(GiB)": 302.58, + "step": 45260, + "train_speed(iter/s)": 0.124798 + }, + { + "acc": 0.73374586, + "epoch": 0.25322720682505484, + "grad_norm": 6.625, + "learning_rate": 9.84039008035759e-06, + "loss": 1.03355255, + "memory(GiB)": 302.58, + "step": 45280, + "train_speed(iter/s)": 0.124824 + }, + { + "acc": 0.73668737, + "epoch": 0.2533390562980341, + "grad_norm": 7.125, + "learning_rate": 9.840158221143412e-06, + "loss": 1.03359613, + "memory(GiB)": 302.58, + "step": 45300, + "train_speed(iter/s)": 0.124851 + }, + { + "acc": 0.75081024, + "epoch": 0.25345090577101337, + "grad_norm": 8.6875, + "learning_rate": 9.839926196380477e-06, + "loss": 0.99008493, + "memory(GiB)": 302.58, + "step": 45320, + "train_speed(iter/s)": 0.124877 + }, + { + "acc": 0.74431434, + "epoch": 0.25356275524399263, + "grad_norm": 7.4375, + "learning_rate": 9.839694006076718e-06, + "loss": 1.01082296, + "memory(GiB)": 302.58, + "step": 45340, + "train_speed(iter/s)": 0.124903 + }, + { + "acc": 0.7383225, + "epoch": 0.2536746047169719, + "grad_norm": 7.21875, + "learning_rate": 9.839461650240078e-06, + "loss": 1.01896658, + "memory(GiB)": 302.58, + "step": 45360, + "train_speed(iter/s)": 0.124929 + }, + { + "acc": 0.72243962, + "epoch": 0.25378645418995116, + "grad_norm": 6.46875, + "learning_rate": 9.839229128878504e-06, + "loss": 1.07729778, + "memory(GiB)": 302.58, + "step": 45380, + "train_speed(iter/s)": 0.124957 + }, + { + "acc": 0.72073607, + "epoch": 0.2538983036629304, + "grad_norm": 9.3125, + "learning_rate": 9.838996441999948e-06, + "loss": 1.0958971, + "memory(GiB)": 302.58, + "step": 45400, + "train_speed(iter/s)": 0.124983 + }, + { + "acc": 0.70173516, + "epoch": 0.2540101531359097, + "grad_norm": 3.796875, + "learning_rate": 9.838763589612372e-06, + "loss": 1.19840651, + "memory(GiB)": 302.58, + "step": 45420, + "train_speed(iter/s)": 0.125007 + }, + { + "acc": 0.74791813, + "epoch": 0.25412200260888895, + "grad_norm": 5.03125, + "learning_rate": 9.838530571723737e-06, + "loss": 0.98776979, + "memory(GiB)": 302.58, + "step": 45440, + "train_speed(iter/s)": 0.125033 + }, + { + "acc": 0.72630143, + "epoch": 0.2542338520818682, + "grad_norm": 7.28125, + "learning_rate": 9.838297388342013e-06, + "loss": 1.08257923, + "memory(GiB)": 302.58, + "step": 45460, + "train_speed(iter/s)": 0.125058 + }, + { + "acc": 0.73239193, + "epoch": 0.2543457015548475, + "grad_norm": 6.4375, + "learning_rate": 9.838064039475177e-06, + "loss": 1.04524622, + "memory(GiB)": 302.58, + "step": 45480, + "train_speed(iter/s)": 0.125084 + }, + { + "acc": 0.72580643, + "epoch": 0.25445755102782674, + "grad_norm": 6.15625, + "learning_rate": 9.83783052513121e-06, + "loss": 1.08207607, + "memory(GiB)": 302.58, + "step": 45500, + "train_speed(iter/s)": 0.125109 + }, + { + "acc": 0.74506588, + "epoch": 0.254569400500806, + "grad_norm": 5.875, + "learning_rate": 9.837596845318098e-06, + "loss": 1.00576277, + "memory(GiB)": 302.58, + "step": 45520, + "train_speed(iter/s)": 0.125134 + }, + { + "acc": 0.74109592, + "epoch": 0.2546812499737853, + "grad_norm": 8.0625, + "learning_rate": 9.837363000043834e-06, + "loss": 1.01373558, + "memory(GiB)": 302.58, + "step": 45540, + "train_speed(iter/s)": 0.125159 + }, + { + "acc": 0.73277082, + "epoch": 0.25479309944676454, + "grad_norm": 5.78125, + "learning_rate": 9.837128989316418e-06, + "loss": 1.0474081, + "memory(GiB)": 302.58, + "step": 45560, + "train_speed(iter/s)": 0.125187 + }, + { + "acc": 0.73533034, + "epoch": 0.2549049489197438, + "grad_norm": 7.15625, + "learning_rate": 9.836894813143851e-06, + "loss": 1.0461607, + "memory(GiB)": 302.58, + "step": 45580, + "train_speed(iter/s)": 0.125212 + }, + { + "acc": 0.72934604, + "epoch": 0.25501679839272307, + "grad_norm": 10.125, + "learning_rate": 9.836660471534145e-06, + "loss": 1.04561291, + "memory(GiB)": 302.58, + "step": 45600, + "train_speed(iter/s)": 0.125238 + }, + { + "acc": 0.7347559, + "epoch": 0.25512864786570233, + "grad_norm": 7.375, + "learning_rate": 9.836425964495315e-06, + "loss": 1.0720829, + "memory(GiB)": 302.58, + "step": 45620, + "train_speed(iter/s)": 0.125265 + }, + { + "acc": 0.72431059, + "epoch": 0.2552404973386816, + "grad_norm": 6.21875, + "learning_rate": 9.836191292035379e-06, + "loss": 1.09026423, + "memory(GiB)": 302.58, + "step": 45640, + "train_speed(iter/s)": 0.12529 + }, + { + "acc": 0.74169874, + "epoch": 0.25535234681166086, + "grad_norm": 4.78125, + "learning_rate": 9.835956454162368e-06, + "loss": 1.0140605, + "memory(GiB)": 302.58, + "step": 45660, + "train_speed(iter/s)": 0.125317 + }, + { + "acc": 0.72589021, + "epoch": 0.2554641962846401, + "grad_norm": 8.75, + "learning_rate": 9.83572145088431e-06, + "loss": 1.08202486, + "memory(GiB)": 302.58, + "step": 45680, + "train_speed(iter/s)": 0.125343 + }, + { + "acc": 0.73661885, + "epoch": 0.2555760457576194, + "grad_norm": 6.90625, + "learning_rate": 9.835486282209246e-06, + "loss": 1.02885342, + "memory(GiB)": 302.58, + "step": 45700, + "train_speed(iter/s)": 0.125368 + }, + { + "acc": 0.73975439, + "epoch": 0.25568789523059865, + "grad_norm": 6.875, + "learning_rate": 9.835250948145217e-06, + "loss": 1.01471653, + "memory(GiB)": 302.58, + "step": 45720, + "train_speed(iter/s)": 0.125394 + }, + { + "acc": 0.72868562, + "epoch": 0.2557997447035779, + "grad_norm": 5.84375, + "learning_rate": 9.835015448700273e-06, + "loss": 1.06996651, + "memory(GiB)": 302.58, + "step": 45740, + "train_speed(iter/s)": 0.125418 + }, + { + "acc": 0.73563056, + "epoch": 0.2559115941765572, + "grad_norm": 8.3125, + "learning_rate": 9.83477978388247e-06, + "loss": 1.04948425, + "memory(GiB)": 302.58, + "step": 45760, + "train_speed(iter/s)": 0.125442 + }, + { + "acc": 0.72901988, + "epoch": 0.25602344364953644, + "grad_norm": 8.4375, + "learning_rate": 9.834543953699869e-06, + "loss": 1.05946321, + "memory(GiB)": 302.58, + "step": 45780, + "train_speed(iter/s)": 0.125467 + }, + { + "acc": 0.72697482, + "epoch": 0.2561352931225157, + "grad_norm": 5.03125, + "learning_rate": 9.834307958160533e-06, + "loss": 1.04762402, + "memory(GiB)": 302.58, + "step": 45800, + "train_speed(iter/s)": 0.125492 + }, + { + "acc": 0.73571682, + "epoch": 0.25624714259549497, + "grad_norm": 9.0, + "learning_rate": 9.834071797272537e-06, + "loss": 1.04696178, + "memory(GiB)": 302.58, + "step": 45820, + "train_speed(iter/s)": 0.125518 + }, + { + "acc": 0.73964305, + "epoch": 0.25635899206847423, + "grad_norm": 9.8125, + "learning_rate": 9.833835471043958e-06, + "loss": 1.03995495, + "memory(GiB)": 302.58, + "step": 45840, + "train_speed(iter/s)": 0.125542 + }, + { + "acc": 0.73631558, + "epoch": 0.2564708415414535, + "grad_norm": 8.4375, + "learning_rate": 9.833598979482876e-06, + "loss": 1.04290094, + "memory(GiB)": 302.58, + "step": 45860, + "train_speed(iter/s)": 0.125568 + }, + { + "acc": 0.73042717, + "epoch": 0.25658269101443276, + "grad_norm": 6.125, + "learning_rate": 9.833362322597381e-06, + "loss": 1.06746082, + "memory(GiB)": 302.58, + "step": 45880, + "train_speed(iter/s)": 0.125595 + }, + { + "acc": 0.72642159, + "epoch": 0.256694540487412, + "grad_norm": 10.0625, + "learning_rate": 9.833125500395572e-06, + "loss": 1.08682737, + "memory(GiB)": 302.58, + "step": 45900, + "train_speed(iter/s)": 0.12562 + }, + { + "acc": 0.73094354, + "epoch": 0.2568063899603913, + "grad_norm": 5.8125, + "learning_rate": 9.832888512885543e-06, + "loss": 1.06402111, + "memory(GiB)": 302.58, + "step": 45920, + "train_speed(iter/s)": 0.125645 + }, + { + "acc": 0.72602768, + "epoch": 0.25691823943337055, + "grad_norm": 5.90625, + "learning_rate": 9.832651360075403e-06, + "loss": 1.09444513, + "memory(GiB)": 302.58, + "step": 45940, + "train_speed(iter/s)": 0.125672 + }, + { + "acc": 0.72804313, + "epoch": 0.2570300889063498, + "grad_norm": 6.125, + "learning_rate": 9.832414041973262e-06, + "loss": 1.08616962, + "memory(GiB)": 302.58, + "step": 45960, + "train_speed(iter/s)": 0.125696 + }, + { + "acc": 0.73239288, + "epoch": 0.2571419383793291, + "grad_norm": 4.71875, + "learning_rate": 9.832176558587239e-06, + "loss": 1.05174522, + "memory(GiB)": 302.58, + "step": 45980, + "train_speed(iter/s)": 0.125722 + }, + { + "acc": 0.73865328, + "epoch": 0.25725378785230835, + "grad_norm": 10.625, + "learning_rate": 9.831938909925454e-06, + "loss": 1.04406996, + "memory(GiB)": 302.58, + "step": 46000, + "train_speed(iter/s)": 0.125747 + }, + { + "epoch": 0.25725378785230835, + "eval_acc": 0.6969968594989336, + "eval_loss": 1.0551568269729614, + "eval_runtime": 7504.5262, + "eval_samples_per_second": 10.032, + "eval_steps_per_second": 10.032, + "step": 46000 + }, + { + "acc": 0.71935201, + "epoch": 0.2573656373252876, + "grad_norm": 9.5625, + "learning_rate": 9.831701095996035e-06, + "loss": 1.1423089, + "memory(GiB)": 302.58, + "step": 46020, + "train_speed(iter/s)": 0.123209 + }, + { + "acc": 0.73579893, + "epoch": 0.2574774867982669, + "grad_norm": 5.15625, + "learning_rate": 9.83146311680712e-06, + "loss": 1.03715935, + "memory(GiB)": 302.58, + "step": 46040, + "train_speed(iter/s)": 0.123235 + }, + { + "acc": 0.72598052, + "epoch": 0.25758933627124614, + "grad_norm": 8.375, + "learning_rate": 9.831224972366846e-06, + "loss": 1.06706886, + "memory(GiB)": 302.58, + "step": 46060, + "train_speed(iter/s)": 0.123262 + }, + { + "acc": 0.73780179, + "epoch": 0.2577011857442254, + "grad_norm": 8.75, + "learning_rate": 9.83098666268336e-06, + "loss": 1.05353403, + "memory(GiB)": 302.58, + "step": 46080, + "train_speed(iter/s)": 0.123288 + }, + { + "acc": 0.73938985, + "epoch": 0.25781303521720467, + "grad_norm": 6.96875, + "learning_rate": 9.830748187764809e-06, + "loss": 1.02253933, + "memory(GiB)": 302.58, + "step": 46100, + "train_speed(iter/s)": 0.123311 + }, + { + "acc": 0.7354425, + "epoch": 0.25792488469018393, + "grad_norm": 7.03125, + "learning_rate": 9.830509547619353e-06, + "loss": 1.04566822, + "memory(GiB)": 302.58, + "step": 46120, + "train_speed(iter/s)": 0.123337 + }, + { + "acc": 0.73176451, + "epoch": 0.2580367341631632, + "grad_norm": 5.25, + "learning_rate": 9.830270742255152e-06, + "loss": 1.05099773, + "memory(GiB)": 302.58, + "step": 46140, + "train_speed(iter/s)": 0.123362 + }, + { + "acc": 0.74625831, + "epoch": 0.25814858363614246, + "grad_norm": 5.84375, + "learning_rate": 9.830031771680377e-06, + "loss": 0.98134527, + "memory(GiB)": 302.58, + "step": 46160, + "train_speed(iter/s)": 0.123387 + }, + { + "acc": 0.72491379, + "epoch": 0.2582604331091217, + "grad_norm": 6.53125, + "learning_rate": 9.829792635903199e-06, + "loss": 1.09119024, + "memory(GiB)": 302.58, + "step": 46180, + "train_speed(iter/s)": 0.123411 + }, + { + "acc": 0.7329812, + "epoch": 0.258372282582101, + "grad_norm": 7.46875, + "learning_rate": 9.829553334931799e-06, + "loss": 1.05227289, + "memory(GiB)": 302.58, + "step": 46200, + "train_speed(iter/s)": 0.123436 + }, + { + "acc": 0.74119935, + "epoch": 0.25848413205508025, + "grad_norm": 12.4375, + "learning_rate": 9.82931386877436e-06, + "loss": 1.0243885, + "memory(GiB)": 302.58, + "step": 46220, + "train_speed(iter/s)": 0.123458 + }, + { + "acc": 0.73957872, + "epoch": 0.2585959815280595, + "grad_norm": 6.3125, + "learning_rate": 9.829074237439074e-06, + "loss": 1.03015881, + "memory(GiB)": 302.58, + "step": 46240, + "train_speed(iter/s)": 0.123484 + }, + { + "acc": 0.72735, + "epoch": 0.2587078310010388, + "grad_norm": 6.90625, + "learning_rate": 9.828834440934134e-06, + "loss": 1.07915516, + "memory(GiB)": 302.58, + "step": 46260, + "train_speed(iter/s)": 0.123509 + }, + { + "acc": 0.72826843, + "epoch": 0.25881968047401804, + "grad_norm": 6.15625, + "learning_rate": 9.828594479267746e-06, + "loss": 1.08097134, + "memory(GiB)": 302.58, + "step": 46280, + "train_speed(iter/s)": 0.123533 + }, + { + "acc": 0.72057528, + "epoch": 0.2589315299469973, + "grad_norm": 9.0, + "learning_rate": 9.828354352448117e-06, + "loss": 1.10979156, + "memory(GiB)": 302.58, + "step": 46300, + "train_speed(iter/s)": 0.123558 + }, + { + "acc": 0.73604259, + "epoch": 0.25904337941997657, + "grad_norm": 5.40625, + "learning_rate": 9.828114060483458e-06, + "loss": 1.0323987, + "memory(GiB)": 302.58, + "step": 46320, + "train_speed(iter/s)": 0.123584 + }, + { + "acc": 0.74468703, + "epoch": 0.25915522889295584, + "grad_norm": 7.96875, + "learning_rate": 9.827873603381988e-06, + "loss": 0.98643255, + "memory(GiB)": 302.58, + "step": 46340, + "train_speed(iter/s)": 0.123609 + }, + { + "acc": 0.71833434, + "epoch": 0.2592670783659351, + "grad_norm": 4.78125, + "learning_rate": 9.827632981151932e-06, + "loss": 1.09309587, + "memory(GiB)": 302.58, + "step": 46360, + "train_speed(iter/s)": 0.123635 + }, + { + "acc": 0.74010482, + "epoch": 0.25937892783891436, + "grad_norm": 8.8125, + "learning_rate": 9.827392193801521e-06, + "loss": 1.03898115, + "memory(GiB)": 302.58, + "step": 46380, + "train_speed(iter/s)": 0.12366 + }, + { + "acc": 0.73378921, + "epoch": 0.25949077731189363, + "grad_norm": 4.96875, + "learning_rate": 9.827151241338988e-06, + "loss": 1.04334564, + "memory(GiB)": 302.58, + "step": 46400, + "train_speed(iter/s)": 0.123685 + }, + { + "acc": 0.72786999, + "epoch": 0.2596026267848729, + "grad_norm": 6.03125, + "learning_rate": 9.826910123772579e-06, + "loss": 1.07173004, + "memory(GiB)": 302.58, + "step": 46420, + "train_speed(iter/s)": 0.123711 + }, + { + "acc": 0.74165053, + "epoch": 0.25971447625785216, + "grad_norm": 7.03125, + "learning_rate": 9.826668841110536e-06, + "loss": 1.00640812, + "memory(GiB)": 302.58, + "step": 46440, + "train_speed(iter/s)": 0.123736 + }, + { + "acc": 0.73734369, + "epoch": 0.2598263257308314, + "grad_norm": 4.28125, + "learning_rate": 9.826427393361113e-06, + "loss": 1.03117037, + "memory(GiB)": 302.58, + "step": 46460, + "train_speed(iter/s)": 0.123761 + }, + { + "acc": 0.73903403, + "epoch": 0.2599381752038107, + "grad_norm": 5.15625, + "learning_rate": 9.82618578053257e-06, + "loss": 1.02342825, + "memory(GiB)": 302.58, + "step": 46480, + "train_speed(iter/s)": 0.123786 + }, + { + "acc": 0.74015279, + "epoch": 0.26005002467679, + "grad_norm": 7.625, + "learning_rate": 9.82594400263317e-06, + "loss": 1.02726927, + "memory(GiB)": 302.58, + "step": 46500, + "train_speed(iter/s)": 0.12381 + }, + { + "acc": 0.73108964, + "epoch": 0.26016187414976927, + "grad_norm": 5.71875, + "learning_rate": 9.825702059671183e-06, + "loss": 1.05086908, + "memory(GiB)": 302.58, + "step": 46520, + "train_speed(iter/s)": 0.123835 + }, + { + "acc": 0.73143234, + "epoch": 0.26027372362274853, + "grad_norm": 6.71875, + "learning_rate": 9.825459951654881e-06, + "loss": 1.05156755, + "memory(GiB)": 302.58, + "step": 46540, + "train_speed(iter/s)": 0.123859 + }, + { + "acc": 0.73387771, + "epoch": 0.2603855730957278, + "grad_norm": 8.5625, + "learning_rate": 9.825217678592552e-06, + "loss": 1.05232534, + "memory(GiB)": 302.58, + "step": 46560, + "train_speed(iter/s)": 0.123884 + }, + { + "acc": 0.73025918, + "epoch": 0.26049742256870706, + "grad_norm": 5.90625, + "learning_rate": 9.824975240492474e-06, + "loss": 1.07111235, + "memory(GiB)": 302.58, + "step": 46580, + "train_speed(iter/s)": 0.123909 + }, + { + "acc": 0.73105431, + "epoch": 0.2606092720416863, + "grad_norm": 7.4375, + "learning_rate": 9.824732637362945e-06, + "loss": 1.0753891, + "memory(GiB)": 302.58, + "step": 46600, + "train_speed(iter/s)": 0.123933 + }, + { + "acc": 0.73123384, + "epoch": 0.2607211215146656, + "grad_norm": 6.75, + "learning_rate": 9.82448986921226e-06, + "loss": 1.06999989, + "memory(GiB)": 302.58, + "step": 46620, + "train_speed(iter/s)": 0.123958 + }, + { + "acc": 0.73258314, + "epoch": 0.26083297098764485, + "grad_norm": 8.6875, + "learning_rate": 9.824246936048725e-06, + "loss": 1.03461905, + "memory(GiB)": 302.58, + "step": 46640, + "train_speed(iter/s)": 0.123984 + }, + { + "acc": 0.74820762, + "epoch": 0.2609448204606241, + "grad_norm": 7.875, + "learning_rate": 9.824003837880647e-06, + "loss": 0.97886467, + "memory(GiB)": 302.58, + "step": 46660, + "train_speed(iter/s)": 0.124008 + }, + { + "acc": 0.73920283, + "epoch": 0.2610566699336034, + "grad_norm": 6.28125, + "learning_rate": 9.82376057471634e-06, + "loss": 1.02412529, + "memory(GiB)": 302.58, + "step": 46680, + "train_speed(iter/s)": 0.124033 + }, + { + "acc": 0.70439053, + "epoch": 0.26116851940658264, + "grad_norm": 6.78125, + "learning_rate": 9.823517146564127e-06, + "loss": 1.20083551, + "memory(GiB)": 302.58, + "step": 46700, + "train_speed(iter/s)": 0.124057 + }, + { + "acc": 0.73877273, + "epoch": 0.2612803688795619, + "grad_norm": 8.5625, + "learning_rate": 9.823273553432331e-06, + "loss": 1.0100318, + "memory(GiB)": 302.58, + "step": 46720, + "train_speed(iter/s)": 0.124082 + }, + { + "acc": 0.73913417, + "epoch": 0.2613922183525412, + "grad_norm": 8.1875, + "learning_rate": 9.823029795329287e-06, + "loss": 1.02496586, + "memory(GiB)": 302.58, + "step": 46740, + "train_speed(iter/s)": 0.124107 + }, + { + "acc": 0.74721599, + "epoch": 0.26150406782552044, + "grad_norm": 6.53125, + "learning_rate": 9.822785872263329e-06, + "loss": 0.99217501, + "memory(GiB)": 302.58, + "step": 46760, + "train_speed(iter/s)": 0.124133 + }, + { + "acc": 0.73463373, + "epoch": 0.2616159172984997, + "grad_norm": 6.0, + "learning_rate": 9.822541784242803e-06, + "loss": 1.05342789, + "memory(GiB)": 302.58, + "step": 46780, + "train_speed(iter/s)": 0.124158 + }, + { + "acc": 0.73949833, + "epoch": 0.26172776677147896, + "grad_norm": 6.75, + "learning_rate": 9.822297531276055e-06, + "loss": 1.01638098, + "memory(GiB)": 302.58, + "step": 46800, + "train_speed(iter/s)": 0.124182 + }, + { + "acc": 0.73229346, + "epoch": 0.26183961624445823, + "grad_norm": 5.59375, + "learning_rate": 9.822053113371442e-06, + "loss": 1.04485044, + "memory(GiB)": 302.58, + "step": 46820, + "train_speed(iter/s)": 0.124206 + }, + { + "acc": 0.7189775, + "epoch": 0.2619514657174375, + "grad_norm": 7.5625, + "learning_rate": 9.821808530537321e-06, + "loss": 1.1198245, + "memory(GiB)": 302.58, + "step": 46840, + "train_speed(iter/s)": 0.124232 + }, + { + "acc": 0.72713556, + "epoch": 0.26206331519041676, + "grad_norm": 6.8125, + "learning_rate": 9.821563782782059e-06, + "loss": 1.07327766, + "memory(GiB)": 302.58, + "step": 46860, + "train_speed(iter/s)": 0.124258 + }, + { + "acc": 0.74550595, + "epoch": 0.262175164663396, + "grad_norm": 10.375, + "learning_rate": 9.821318870114027e-06, + "loss": 1.01115818, + "memory(GiB)": 302.58, + "step": 46880, + "train_speed(iter/s)": 0.124282 + }, + { + "acc": 0.73964047, + "epoch": 0.2622870141363753, + "grad_norm": 5.625, + "learning_rate": 9.821073792541601e-06, + "loss": 0.99895735, + "memory(GiB)": 302.58, + "step": 46900, + "train_speed(iter/s)": 0.124307 + }, + { + "acc": 0.73404007, + "epoch": 0.26239886360935455, + "grad_norm": 6.40625, + "learning_rate": 9.820828550073165e-06, + "loss": 1.06231337, + "memory(GiB)": 302.58, + "step": 46920, + "train_speed(iter/s)": 0.124332 + }, + { + "acc": 0.73977661, + "epoch": 0.2625107130823338, + "grad_norm": 7.21875, + "learning_rate": 9.820583142717105e-06, + "loss": 1.03016663, + "memory(GiB)": 302.58, + "step": 46940, + "train_speed(iter/s)": 0.124357 + }, + { + "acc": 0.72811284, + "epoch": 0.2626225625553131, + "grad_norm": 7.15625, + "learning_rate": 9.820337570481817e-06, + "loss": 1.08666153, + "memory(GiB)": 302.58, + "step": 46960, + "train_speed(iter/s)": 0.124382 + }, + { + "acc": 0.73931088, + "epoch": 0.26273441202829234, + "grad_norm": 7.5, + "learning_rate": 9.8200918333757e-06, + "loss": 1.00732727, + "memory(GiB)": 302.58, + "step": 46980, + "train_speed(iter/s)": 0.124405 + }, + { + "acc": 0.73291926, + "epoch": 0.2628462615012716, + "grad_norm": 5.78125, + "learning_rate": 9.819845931407156e-06, + "loss": 1.03807144, + "memory(GiB)": 302.58, + "step": 47000, + "train_speed(iter/s)": 0.124429 + }, + { + "acc": 0.74008451, + "epoch": 0.26295811097425087, + "grad_norm": 7.125, + "learning_rate": 9.819599864584599e-06, + "loss": 1.03048477, + "memory(GiB)": 302.58, + "step": 47020, + "train_speed(iter/s)": 0.124453 + }, + { + "acc": 0.73630433, + "epoch": 0.26306996044723013, + "grad_norm": 7.1875, + "learning_rate": 9.819353632916445e-06, + "loss": 1.05307102, + "memory(GiB)": 302.58, + "step": 47040, + "train_speed(iter/s)": 0.124479 + }, + { + "acc": 0.7446125, + "epoch": 0.2631818099202094, + "grad_norm": 7.71875, + "learning_rate": 9.819107236411112e-06, + "loss": 1.00804462, + "memory(GiB)": 302.58, + "step": 47060, + "train_speed(iter/s)": 0.124503 + }, + { + "acc": 0.73926239, + "epoch": 0.26329365939318866, + "grad_norm": 14.8125, + "learning_rate": 9.818860675077033e-06, + "loss": 1.02853994, + "memory(GiB)": 302.58, + "step": 47080, + "train_speed(iter/s)": 0.124527 + }, + { + "acc": 0.74859009, + "epoch": 0.2634055088661679, + "grad_norm": 7.78125, + "learning_rate": 9.818613948922637e-06, + "loss": 0.96222315, + "memory(GiB)": 302.58, + "step": 47100, + "train_speed(iter/s)": 0.124553 + }, + { + "acc": 0.74187784, + "epoch": 0.2635173583391472, + "grad_norm": 4.75, + "learning_rate": 9.818367057956366e-06, + "loss": 1.00022039, + "memory(GiB)": 302.58, + "step": 47120, + "train_speed(iter/s)": 0.124577 + }, + { + "acc": 0.74273739, + "epoch": 0.26362920781212645, + "grad_norm": 6.875, + "learning_rate": 9.818120002186663e-06, + "loss": 0.99980164, + "memory(GiB)": 302.58, + "step": 47140, + "train_speed(iter/s)": 0.1246 + }, + { + "acc": 0.72859731, + "epoch": 0.2637410572851057, + "grad_norm": 9.125, + "learning_rate": 9.817872781621978e-06, + "loss": 1.09098253, + "memory(GiB)": 302.58, + "step": 47160, + "train_speed(iter/s)": 0.124624 + }, + { + "acc": 0.73007841, + "epoch": 0.263852906758085, + "grad_norm": 6.4375, + "learning_rate": 9.817625396270767e-06, + "loss": 1.05879812, + "memory(GiB)": 302.58, + "step": 47180, + "train_speed(iter/s)": 0.124648 + }, + { + "acc": 0.73747363, + "epoch": 0.26396475623106425, + "grad_norm": 7.96875, + "learning_rate": 9.81737784614149e-06, + "loss": 1.04116621, + "memory(GiB)": 302.58, + "step": 47200, + "train_speed(iter/s)": 0.124673 + }, + { + "acc": 0.74736204, + "epoch": 0.2640766057040435, + "grad_norm": 8.5625, + "learning_rate": 9.817130131242615e-06, + "loss": 1.00415668, + "memory(GiB)": 302.58, + "step": 47220, + "train_speed(iter/s)": 0.124697 + }, + { + "acc": 0.73501563, + "epoch": 0.2641884551770228, + "grad_norm": 8.125, + "learning_rate": 9.816882251582616e-06, + "loss": 1.06566153, + "memory(GiB)": 302.58, + "step": 47240, + "train_speed(iter/s)": 0.124722 + }, + { + "acc": 0.72766838, + "epoch": 0.26430030465000204, + "grad_norm": 7.15625, + "learning_rate": 9.816634207169971e-06, + "loss": 1.08632431, + "memory(GiB)": 302.58, + "step": 47260, + "train_speed(iter/s)": 0.124746 + }, + { + "acc": 0.73802328, + "epoch": 0.2644121541229813, + "grad_norm": 4.96875, + "learning_rate": 9.81638599801316e-06, + "loss": 1.02796249, + "memory(GiB)": 302.58, + "step": 47280, + "train_speed(iter/s)": 0.124771 + }, + { + "acc": 0.72814589, + "epoch": 0.26452400359596057, + "grad_norm": 8.1875, + "learning_rate": 9.816137624120678e-06, + "loss": 1.08007288, + "memory(GiB)": 302.58, + "step": 47300, + "train_speed(iter/s)": 0.124796 + }, + { + "acc": 0.73312263, + "epoch": 0.26463585306893983, + "grad_norm": 7.90625, + "learning_rate": 9.815889085501017e-06, + "loss": 1.05403376, + "memory(GiB)": 302.58, + "step": 47320, + "train_speed(iter/s)": 0.12482 + }, + { + "acc": 0.72457552, + "epoch": 0.2647477025419191, + "grad_norm": 7.8125, + "learning_rate": 9.81564038216268e-06, + "loss": 1.07850428, + "memory(GiB)": 302.58, + "step": 47340, + "train_speed(iter/s)": 0.124845 + }, + { + "acc": 0.7305511, + "epoch": 0.26485955201489836, + "grad_norm": 5.03125, + "learning_rate": 9.815391514114169e-06, + "loss": 1.05911808, + "memory(GiB)": 302.58, + "step": 47360, + "train_speed(iter/s)": 0.124868 + }, + { + "acc": 0.7368166, + "epoch": 0.2649714014878776, + "grad_norm": 5.78125, + "learning_rate": 9.815142481364e-06, + "loss": 1.02562742, + "memory(GiB)": 302.58, + "step": 47380, + "train_speed(iter/s)": 0.124891 + }, + { + "acc": 0.74957008, + "epoch": 0.2650832509608569, + "grad_norm": 10.1875, + "learning_rate": 9.81489328392069e-06, + "loss": 0.96720285, + "memory(GiB)": 302.58, + "step": 47400, + "train_speed(iter/s)": 0.124915 + }, + { + "acc": 0.74186869, + "epoch": 0.26519510043383615, + "grad_norm": 5.125, + "learning_rate": 9.814643921792763e-06, + "loss": 1.00966759, + "memory(GiB)": 302.58, + "step": 47420, + "train_speed(iter/s)": 0.124941 + }, + { + "acc": 0.73075566, + "epoch": 0.2653069499068154, + "grad_norm": 6.21875, + "learning_rate": 9.814394394988747e-06, + "loss": 1.06015053, + "memory(GiB)": 302.58, + "step": 47440, + "train_speed(iter/s)": 0.124965 + }, + { + "acc": 0.70556626, + "epoch": 0.2654187993797947, + "grad_norm": 6.9375, + "learning_rate": 9.814144703517174e-06, + "loss": 1.15877352, + "memory(GiB)": 302.58, + "step": 47460, + "train_speed(iter/s)": 0.12499 + }, + { + "acc": 0.75560946, + "epoch": 0.26553064885277394, + "grad_norm": 5.78125, + "learning_rate": 9.81389484738659e-06, + "loss": 0.96228781, + "memory(GiB)": 302.58, + "step": 47480, + "train_speed(iter/s)": 0.125014 + }, + { + "acc": 0.72318397, + "epoch": 0.2656424983257532, + "grad_norm": 9.5, + "learning_rate": 9.813644826605536e-06, + "loss": 1.09018021, + "memory(GiB)": 302.58, + "step": 47500, + "train_speed(iter/s)": 0.125039 + }, + { + "acc": 0.74905462, + "epoch": 0.26575434779873247, + "grad_norm": 5.375, + "learning_rate": 9.813394641182566e-06, + "loss": 0.98764067, + "memory(GiB)": 302.58, + "step": 47520, + "train_speed(iter/s)": 0.125064 + }, + { + "acc": 0.72459517, + "epoch": 0.26586619727171173, + "grad_norm": 7.21875, + "learning_rate": 9.813144291126237e-06, + "loss": 1.09494781, + "memory(GiB)": 302.58, + "step": 47540, + "train_speed(iter/s)": 0.125087 + }, + { + "acc": 0.74475508, + "epoch": 0.265978046744691, + "grad_norm": 5.96875, + "learning_rate": 9.812893776445109e-06, + "loss": 1.01216221, + "memory(GiB)": 302.58, + "step": 47560, + "train_speed(iter/s)": 0.125113 + }, + { + "acc": 0.73604999, + "epoch": 0.26608989621767026, + "grad_norm": 4.8125, + "learning_rate": 9.812643097147755e-06, + "loss": 1.01263914, + "memory(GiB)": 302.58, + "step": 47580, + "train_speed(iter/s)": 0.125138 + }, + { + "acc": 0.73379073, + "epoch": 0.2662017456906495, + "grad_norm": 6.15625, + "learning_rate": 9.812392253242747e-06, + "loss": 1.03960695, + "memory(GiB)": 302.58, + "step": 47600, + "train_speed(iter/s)": 0.125162 + }, + { + "acc": 0.73374281, + "epoch": 0.2663135951636288, + "grad_norm": 7.21875, + "learning_rate": 9.812141244738662e-06, + "loss": 1.04817705, + "memory(GiB)": 302.58, + "step": 47620, + "train_speed(iter/s)": 0.125184 + }, + { + "acc": 0.74148254, + "epoch": 0.26642544463660806, + "grad_norm": 8.5625, + "learning_rate": 9.811890071644087e-06, + "loss": 1.00678701, + "memory(GiB)": 302.58, + "step": 47640, + "train_speed(iter/s)": 0.125206 + }, + { + "acc": 0.74132323, + "epoch": 0.2665372941095873, + "grad_norm": 9.3125, + "learning_rate": 9.811638733967617e-06, + "loss": 1.01744528, + "memory(GiB)": 302.58, + "step": 47660, + "train_speed(iter/s)": 0.12523 + }, + { + "acc": 0.72918825, + "epoch": 0.2666491435825666, + "grad_norm": 7.78125, + "learning_rate": 9.811387231717842e-06, + "loss": 1.05020218, + "memory(GiB)": 302.58, + "step": 47680, + "train_speed(iter/s)": 0.125255 + }, + { + "acc": 0.73432479, + "epoch": 0.26676099305554585, + "grad_norm": 7.25, + "learning_rate": 9.811135564903368e-06, + "loss": 1.0458725, + "memory(GiB)": 302.58, + "step": 47700, + "train_speed(iter/s)": 0.125278 + }, + { + "acc": 0.73628736, + "epoch": 0.2668728425285251, + "grad_norm": 8.875, + "learning_rate": 9.810883733532802e-06, + "loss": 1.0363637, + "memory(GiB)": 302.58, + "step": 47720, + "train_speed(iter/s)": 0.125302 + }, + { + "acc": 0.7267313, + "epoch": 0.2669846920015044, + "grad_norm": 6.40625, + "learning_rate": 9.810631737614758e-06, + "loss": 1.0689827, + "memory(GiB)": 302.58, + "step": 47740, + "train_speed(iter/s)": 0.125326 + }, + { + "acc": 0.73946447, + "epoch": 0.26709654147448364, + "grad_norm": 6.78125, + "learning_rate": 9.810379577157853e-06, + "loss": 1.00972967, + "memory(GiB)": 302.58, + "step": 47760, + "train_speed(iter/s)": 0.125349 + }, + { + "acc": 0.73520188, + "epoch": 0.2672083909474629, + "grad_norm": 6.46875, + "learning_rate": 9.810127252170714e-06, + "loss": 1.04148569, + "memory(GiB)": 302.58, + "step": 47780, + "train_speed(iter/s)": 0.125373 + }, + { + "acc": 0.73572688, + "epoch": 0.26732024042044217, + "grad_norm": 6.5625, + "learning_rate": 9.80987476266197e-06, + "loss": 1.03154078, + "memory(GiB)": 302.58, + "step": 47800, + "train_speed(iter/s)": 0.125398 + }, + { + "acc": 0.7486414, + "epoch": 0.26743208989342143, + "grad_norm": 9.125, + "learning_rate": 9.80962210864026e-06, + "loss": 0.98398762, + "memory(GiB)": 302.58, + "step": 47820, + "train_speed(iter/s)": 0.125421 + }, + { + "acc": 0.74356713, + "epoch": 0.2675439393664007, + "grad_norm": 6.90625, + "learning_rate": 9.80936929011422e-06, + "loss": 1.00115042, + "memory(GiB)": 302.58, + "step": 47840, + "train_speed(iter/s)": 0.125442 + }, + { + "acc": 0.72141199, + "epoch": 0.26765578883937996, + "grad_norm": 6.15625, + "learning_rate": 9.809116307092501e-06, + "loss": 1.12445612, + "memory(GiB)": 302.58, + "step": 47860, + "train_speed(iter/s)": 0.125465 + }, + { + "acc": 0.73605151, + "epoch": 0.2677676383123592, + "grad_norm": 7.53125, + "learning_rate": 9.808863159583756e-06, + "loss": 1.03298569, + "memory(GiB)": 302.58, + "step": 47880, + "train_speed(iter/s)": 0.125491 + }, + { + "acc": 0.73223481, + "epoch": 0.2678794877853385, + "grad_norm": 6.3125, + "learning_rate": 9.808609847596642e-06, + "loss": 1.03499985, + "memory(GiB)": 302.58, + "step": 47900, + "train_speed(iter/s)": 0.125516 + }, + { + "acc": 0.74268856, + "epoch": 0.26799133725831775, + "grad_norm": 8.375, + "learning_rate": 9.808356371139822e-06, + "loss": 1.02721329, + "memory(GiB)": 302.58, + "step": 47920, + "train_speed(iter/s)": 0.12554 + }, + { + "acc": 0.73740077, + "epoch": 0.268103186731297, + "grad_norm": 6.375, + "learning_rate": 9.808102730221968e-06, + "loss": 1.04374294, + "memory(GiB)": 302.58, + "step": 47940, + "train_speed(iter/s)": 0.125563 + }, + { + "acc": 0.73164001, + "epoch": 0.2682150362042763, + "grad_norm": 5.53125, + "learning_rate": 9.807848924851756e-06, + "loss": 1.06013222, + "memory(GiB)": 302.58, + "step": 47960, + "train_speed(iter/s)": 0.125588 + }, + { + "acc": 0.73033066, + "epoch": 0.26832688567725554, + "grad_norm": 7.34375, + "learning_rate": 9.807594955037865e-06, + "loss": 1.05050774, + "memory(GiB)": 302.58, + "step": 47980, + "train_speed(iter/s)": 0.125612 + }, + { + "acc": 0.73773546, + "epoch": 0.2684387351502348, + "grad_norm": 8.25, + "learning_rate": 9.807340820788981e-06, + "loss": 1.01881952, + "memory(GiB)": 302.58, + "step": 48000, + "train_speed(iter/s)": 0.125637 + }, + { + "epoch": 0.2684387351502348, + "eval_acc": 0.697469714272869, + "eval_loss": 1.053774356842041, + "eval_runtime": 7503.66, + "eval_samples_per_second": 10.033, + "eval_steps_per_second": 10.033, + "step": 48000 + }, + { + "acc": 0.738237, + "epoch": 0.2685505846232141, + "grad_norm": 6.875, + "learning_rate": 9.807086522113799e-06, + "loss": 1.03843355, + "memory(GiB)": 302.58, + "step": 48020, + "train_speed(iter/s)": 0.123209 + }, + { + "acc": 0.73952599, + "epoch": 0.26866243409619334, + "grad_norm": 6.875, + "learning_rate": 9.806832059021012e-06, + "loss": 1.01305532, + "memory(GiB)": 302.58, + "step": 48040, + "train_speed(iter/s)": 0.123234 + }, + { + "acc": 0.74281516, + "epoch": 0.2687742835691726, + "grad_norm": 8.875, + "learning_rate": 9.806577431519328e-06, + "loss": 1.00887318, + "memory(GiB)": 302.58, + "step": 48060, + "train_speed(iter/s)": 0.123259 + }, + { + "acc": 0.75525565, + "epoch": 0.26888613304215186, + "grad_norm": 8.25, + "learning_rate": 9.806322639617457e-06, + "loss": 0.97097425, + "memory(GiB)": 302.58, + "step": 48080, + "train_speed(iter/s)": 0.123283 + }, + { + "acc": 0.72368541, + "epoch": 0.26899798251513113, + "grad_norm": 4.625, + "learning_rate": 9.806067683324107e-06, + "loss": 1.10632143, + "memory(GiB)": 302.58, + "step": 48100, + "train_speed(iter/s)": 0.123306 + }, + { + "acc": 0.71835465, + "epoch": 0.2691098319881104, + "grad_norm": 9.125, + "learning_rate": 9.805812562648005e-06, + "loss": 1.1134387, + "memory(GiB)": 302.58, + "step": 48120, + "train_speed(iter/s)": 0.123332 + }, + { + "acc": 0.74089994, + "epoch": 0.26922168146108966, + "grad_norm": 9.1875, + "learning_rate": 9.805557277597875e-06, + "loss": 1.02430286, + "memory(GiB)": 302.58, + "step": 48140, + "train_speed(iter/s)": 0.123356 + }, + { + "acc": 0.74073391, + "epoch": 0.2693335309340689, + "grad_norm": 5.84375, + "learning_rate": 9.805301828182448e-06, + "loss": 1.01250391, + "memory(GiB)": 302.58, + "step": 48160, + "train_speed(iter/s)": 0.12338 + }, + { + "acc": 0.73276925, + "epoch": 0.2694453804070482, + "grad_norm": 6.40625, + "learning_rate": 9.80504621441046e-06, + "loss": 1.0563633, + "memory(GiB)": 302.58, + "step": 48180, + "train_speed(iter/s)": 0.123404 + }, + { + "acc": 0.730405, + "epoch": 0.26955722988002745, + "grad_norm": 5.53125, + "learning_rate": 9.804790436290656e-06, + "loss": 1.0829381, + "memory(GiB)": 302.58, + "step": 48200, + "train_speed(iter/s)": 0.123428 + }, + { + "acc": 0.73792019, + "epoch": 0.2696690793530067, + "grad_norm": 6.5625, + "learning_rate": 9.804534493831783e-06, + "loss": 1.04452124, + "memory(GiB)": 302.58, + "step": 48220, + "train_speed(iter/s)": 0.123453 + }, + { + "acc": 0.72511015, + "epoch": 0.269780928825986, + "grad_norm": 6.53125, + "learning_rate": 9.804278387042596e-06, + "loss": 1.07515783, + "memory(GiB)": 302.58, + "step": 48240, + "train_speed(iter/s)": 0.123477 + }, + { + "acc": 0.71927147, + "epoch": 0.26989277829896524, + "grad_norm": 6.875, + "learning_rate": 9.804022115931854e-06, + "loss": 1.1184824, + "memory(GiB)": 302.58, + "step": 48260, + "train_speed(iter/s)": 0.123501 + }, + { + "acc": 0.75304937, + "epoch": 0.2700046277719445, + "grad_norm": 8.8125, + "learning_rate": 9.803765680508323e-06, + "loss": 0.97441959, + "memory(GiB)": 302.58, + "step": 48280, + "train_speed(iter/s)": 0.123522 + }, + { + "acc": 0.74544239, + "epoch": 0.27011647724492377, + "grad_norm": 7.21875, + "learning_rate": 9.803509080780773e-06, + "loss": 1.00856438, + "memory(GiB)": 302.58, + "step": 48300, + "train_speed(iter/s)": 0.123547 + }, + { + "acc": 0.73360744, + "epoch": 0.27022832671790303, + "grad_norm": 11.9375, + "learning_rate": 9.803252316757981e-06, + "loss": 1.04980536, + "memory(GiB)": 302.58, + "step": 48320, + "train_speed(iter/s)": 0.123571 + }, + { + "acc": 0.73298635, + "epoch": 0.2703401761908823, + "grad_norm": 10.0625, + "learning_rate": 9.802995388448729e-06, + "loss": 1.05300102, + "memory(GiB)": 302.58, + "step": 48340, + "train_speed(iter/s)": 0.123594 + }, + { + "acc": 0.7421977, + "epoch": 0.27045202566386156, + "grad_norm": 6.0625, + "learning_rate": 9.802738295861804e-06, + "loss": 1.00147419, + "memory(GiB)": 302.58, + "step": 48360, + "train_speed(iter/s)": 0.123619 + }, + { + "acc": 0.73337083, + "epoch": 0.2705638751368408, + "grad_norm": 7.3125, + "learning_rate": 9.802481039006002e-06, + "loss": 1.04017363, + "memory(GiB)": 302.58, + "step": 48380, + "train_speed(iter/s)": 0.123643 + }, + { + "acc": 0.74406185, + "epoch": 0.2706757246098201, + "grad_norm": 10.6875, + "learning_rate": 9.802223617890118e-06, + "loss": 0.99658546, + "memory(GiB)": 302.58, + "step": 48400, + "train_speed(iter/s)": 0.123665 + }, + { + "acc": 0.73681703, + "epoch": 0.27078757408279935, + "grad_norm": 4.625, + "learning_rate": 9.801966032522962e-06, + "loss": 1.03401022, + "memory(GiB)": 302.58, + "step": 48420, + "train_speed(iter/s)": 0.12369 + }, + { + "acc": 0.74419875, + "epoch": 0.2708994235557786, + "grad_norm": 7.53125, + "learning_rate": 9.801708282913338e-06, + "loss": 1.00525045, + "memory(GiB)": 302.58, + "step": 48440, + "train_speed(iter/s)": 0.123713 + }, + { + "acc": 0.74135365, + "epoch": 0.2710112730287579, + "grad_norm": 7.84375, + "learning_rate": 9.801450369070067e-06, + "loss": 1.0228714, + "memory(GiB)": 302.58, + "step": 48460, + "train_speed(iter/s)": 0.123738 + }, + { + "acc": 0.72999496, + "epoch": 0.27112312250173715, + "grad_norm": 8.1875, + "learning_rate": 9.801192291001967e-06, + "loss": 1.07723322, + "memory(GiB)": 302.58, + "step": 48480, + "train_speed(iter/s)": 0.123761 + }, + { + "acc": 0.73901467, + "epoch": 0.2712349719747164, + "grad_norm": 5.28125, + "learning_rate": 9.800934048717867e-06, + "loss": 1.01780834, + "memory(GiB)": 302.58, + "step": 48500, + "train_speed(iter/s)": 0.123785 + }, + { + "acc": 0.73762031, + "epoch": 0.2713468214476957, + "grad_norm": 7.375, + "learning_rate": 9.800675642226599e-06, + "loss": 1.06025915, + "memory(GiB)": 302.58, + "step": 48520, + "train_speed(iter/s)": 0.123809 + }, + { + "acc": 0.73305097, + "epoch": 0.27145867092067494, + "grad_norm": 8.125, + "learning_rate": 9.800417071537001e-06, + "loss": 1.0316577, + "memory(GiB)": 302.58, + "step": 48540, + "train_speed(iter/s)": 0.123833 + }, + { + "acc": 0.73247457, + "epoch": 0.2715705203936542, + "grad_norm": 5.96875, + "learning_rate": 9.800158336657919e-06, + "loss": 1.03923044, + "memory(GiB)": 302.58, + "step": 48560, + "train_speed(iter/s)": 0.123857 + }, + { + "acc": 0.73123007, + "epoch": 0.27168236986663347, + "grad_norm": 6.59375, + "learning_rate": 9.7998994375982e-06, + "loss": 1.04893799, + "memory(GiB)": 302.58, + "step": 48580, + "train_speed(iter/s)": 0.123882 + }, + { + "acc": 0.72553844, + "epoch": 0.27179421933961273, + "grad_norm": 6.40625, + "learning_rate": 9.7996403743667e-06, + "loss": 1.09951124, + "memory(GiB)": 302.58, + "step": 48600, + "train_speed(iter/s)": 0.123906 + }, + { + "acc": 0.71834345, + "epoch": 0.271906068812592, + "grad_norm": 7.375, + "learning_rate": 9.79938114697228e-06, + "loss": 1.13644476, + "memory(GiB)": 302.58, + "step": 48620, + "train_speed(iter/s)": 0.123929 + }, + { + "acc": 0.72893858, + "epoch": 0.27201791828557126, + "grad_norm": 4.84375, + "learning_rate": 9.799121755423806e-06, + "loss": 1.07949667, + "memory(GiB)": 302.58, + "step": 48640, + "train_speed(iter/s)": 0.123951 + }, + { + "acc": 0.74793344, + "epoch": 0.2721297677585505, + "grad_norm": 8.875, + "learning_rate": 9.798862199730152e-06, + "loss": 0.97059269, + "memory(GiB)": 302.58, + "step": 48660, + "train_speed(iter/s)": 0.123975 + }, + { + "acc": 0.73916864, + "epoch": 0.2722416172315298, + "grad_norm": 9.125, + "learning_rate": 9.798602479900193e-06, + "loss": 1.02709675, + "memory(GiB)": 302.58, + "step": 48680, + "train_speed(iter/s)": 0.124 + }, + { + "acc": 0.7393117, + "epoch": 0.27235346670450905, + "grad_norm": 7.4375, + "learning_rate": 9.798342595942812e-06, + "loss": 1.03117952, + "memory(GiB)": 302.58, + "step": 48700, + "train_speed(iter/s)": 0.124024 + }, + { + "acc": 0.72902431, + "epoch": 0.2724653161774883, + "grad_norm": 6.53125, + "learning_rate": 9.7980825478669e-06, + "loss": 1.06376009, + "memory(GiB)": 302.58, + "step": 48720, + "train_speed(iter/s)": 0.124047 + }, + { + "acc": 0.74563293, + "epoch": 0.2725771656504676, + "grad_norm": 5.90625, + "learning_rate": 9.797822335681352e-06, + "loss": 1.00130253, + "memory(GiB)": 302.58, + "step": 48740, + "train_speed(iter/s)": 0.124071 + }, + { + "acc": 0.7324944, + "epoch": 0.27268901512344684, + "grad_norm": 6.5, + "learning_rate": 9.797561959395065e-06, + "loss": 1.06025772, + "memory(GiB)": 302.58, + "step": 48760, + "train_speed(iter/s)": 0.124096 + }, + { + "acc": 0.73656769, + "epoch": 0.2728008645964261, + "grad_norm": 9.8125, + "learning_rate": 9.797301419016946e-06, + "loss": 1.03484287, + "memory(GiB)": 302.58, + "step": 48780, + "train_speed(iter/s)": 0.12412 + }, + { + "acc": 0.72915587, + "epoch": 0.27291271406940537, + "grad_norm": 6.3125, + "learning_rate": 9.797040714555907e-06, + "loss": 1.06643124, + "memory(GiB)": 302.58, + "step": 48800, + "train_speed(iter/s)": 0.124143 + }, + { + "acc": 0.73016672, + "epoch": 0.27302456354238464, + "grad_norm": 7.65625, + "learning_rate": 9.796779846020865e-06, + "loss": 1.05821199, + "memory(GiB)": 302.58, + "step": 48820, + "train_speed(iter/s)": 0.124167 + }, + { + "acc": 0.74484954, + "epoch": 0.2731364130153639, + "grad_norm": 8.875, + "learning_rate": 9.796518813420743e-06, + "loss": 1.02264566, + "memory(GiB)": 302.58, + "step": 48840, + "train_speed(iter/s)": 0.12419 + }, + { + "acc": 0.7319119, + "epoch": 0.27324826248834316, + "grad_norm": 6.3125, + "learning_rate": 9.796257616764468e-06, + "loss": 1.07584696, + "memory(GiB)": 302.58, + "step": 48860, + "train_speed(iter/s)": 0.124212 + }, + { + "acc": 0.72421651, + "epoch": 0.2733601119613224, + "grad_norm": 8.9375, + "learning_rate": 9.795996256060972e-06, + "loss": 1.09365091, + "memory(GiB)": 302.58, + "step": 48880, + "train_speed(iter/s)": 0.124235 + }, + { + "acc": 0.73448858, + "epoch": 0.2734719614343017, + "grad_norm": 9.25, + "learning_rate": 9.795734731319197e-06, + "loss": 1.04786386, + "memory(GiB)": 302.58, + "step": 48900, + "train_speed(iter/s)": 0.124259 + }, + { + "acc": 0.74611325, + "epoch": 0.27358381090728096, + "grad_norm": 7.46875, + "learning_rate": 9.795473042548089e-06, + "loss": 0.99278517, + "memory(GiB)": 302.58, + "step": 48920, + "train_speed(iter/s)": 0.124283 + }, + { + "acc": 0.73836489, + "epoch": 0.2736956603802602, + "grad_norm": 7.9375, + "learning_rate": 9.795211189756596e-06, + "loss": 1.02559462, + "memory(GiB)": 302.58, + "step": 48940, + "train_speed(iter/s)": 0.124306 + }, + { + "acc": 0.73004088, + "epoch": 0.2738075098532395, + "grad_norm": 6.65625, + "learning_rate": 9.794949172953674e-06, + "loss": 1.04282379, + "memory(GiB)": 302.58, + "step": 48960, + "train_speed(iter/s)": 0.124329 + }, + { + "acc": 0.74074049, + "epoch": 0.27391935932621875, + "grad_norm": 8.125, + "learning_rate": 9.794686992148288e-06, + "loss": 1.02149849, + "memory(GiB)": 302.58, + "step": 48980, + "train_speed(iter/s)": 0.124353 + }, + { + "acc": 0.72018151, + "epoch": 0.274031208799198, + "grad_norm": 6.03125, + "learning_rate": 9.7944246473494e-06, + "loss": 1.10835705, + "memory(GiB)": 302.58, + "step": 49000, + "train_speed(iter/s)": 0.124377 + }, + { + "acc": 0.7324245, + "epoch": 0.27414305827217733, + "grad_norm": 5.84375, + "learning_rate": 9.79416213856599e-06, + "loss": 1.0501936, + "memory(GiB)": 302.58, + "step": 49020, + "train_speed(iter/s)": 0.124401 + }, + { + "acc": 0.74855957, + "epoch": 0.2742549077451566, + "grad_norm": 5.40625, + "learning_rate": 9.793899465807031e-06, + "loss": 0.98560133, + "memory(GiB)": 302.58, + "step": 49040, + "train_speed(iter/s)": 0.124421 + }, + { + "acc": 0.75300474, + "epoch": 0.27436675721813586, + "grad_norm": 9.5625, + "learning_rate": 9.793636629081511e-06, + "loss": 0.95569468, + "memory(GiB)": 302.58, + "step": 49060, + "train_speed(iter/s)": 0.124444 + }, + { + "acc": 0.72689219, + "epoch": 0.2744786066911151, + "grad_norm": 8.3125, + "learning_rate": 9.793373628398415e-06, + "loss": 1.08082504, + "memory(GiB)": 302.58, + "step": 49080, + "train_speed(iter/s)": 0.124468 + }, + { + "acc": 0.73606548, + "epoch": 0.2745904561640944, + "grad_norm": 7.84375, + "learning_rate": 9.793110463766744e-06, + "loss": 1.05667801, + "memory(GiB)": 302.58, + "step": 49100, + "train_speed(iter/s)": 0.124491 + }, + { + "acc": 0.72451711, + "epoch": 0.27470230563707365, + "grad_norm": 10.0, + "learning_rate": 9.792847135195495e-06, + "loss": 1.08990946, + "memory(GiB)": 302.58, + "step": 49120, + "train_speed(iter/s)": 0.124515 + }, + { + "acc": 0.72334037, + "epoch": 0.2748141551100529, + "grad_norm": 6.78125, + "learning_rate": 9.792583642693677e-06, + "loss": 1.13630018, + "memory(GiB)": 302.58, + "step": 49140, + "train_speed(iter/s)": 0.124539 + }, + { + "acc": 0.7294723, + "epoch": 0.2749260045830322, + "grad_norm": 10.75, + "learning_rate": 9.792319986270303e-06, + "loss": 1.07545691, + "memory(GiB)": 302.58, + "step": 49160, + "train_speed(iter/s)": 0.124562 + }, + { + "acc": 0.73759718, + "epoch": 0.27503785405601144, + "grad_norm": 9.0625, + "learning_rate": 9.79205616593439e-06, + "loss": 1.01102457, + "memory(GiB)": 302.58, + "step": 49180, + "train_speed(iter/s)": 0.124586 + }, + { + "acc": 0.73821921, + "epoch": 0.2751497035289907, + "grad_norm": 10.9375, + "learning_rate": 9.791792181694959e-06, + "loss": 1.04819078, + "memory(GiB)": 302.58, + "step": 49200, + "train_speed(iter/s)": 0.12461 + }, + { + "acc": 0.73357282, + "epoch": 0.27526155300196997, + "grad_norm": 9.375, + "learning_rate": 9.79152803356104e-06, + "loss": 1.05688801, + "memory(GiB)": 302.58, + "step": 49220, + "train_speed(iter/s)": 0.124634 + }, + { + "acc": 0.71999731, + "epoch": 0.27537340247494924, + "grad_norm": 7.53125, + "learning_rate": 9.79126372154167e-06, + "loss": 1.10731983, + "memory(GiB)": 302.58, + "step": 49240, + "train_speed(iter/s)": 0.124657 + }, + { + "acc": 0.74656, + "epoch": 0.2754852519479285, + "grad_norm": 7.9375, + "learning_rate": 9.790999245645888e-06, + "loss": 0.99970684, + "memory(GiB)": 302.58, + "step": 49260, + "train_speed(iter/s)": 0.124682 + }, + { + "acc": 0.74546518, + "epoch": 0.27559710142090776, + "grad_norm": 6.0, + "learning_rate": 9.790734605882739e-06, + "loss": 0.99412565, + "memory(GiB)": 302.58, + "step": 49280, + "train_speed(iter/s)": 0.124705 + }, + { + "acc": 0.73637633, + "epoch": 0.27570895089388703, + "grad_norm": 9.8125, + "learning_rate": 9.790469802261277e-06, + "loss": 1.0387373, + "memory(GiB)": 302.58, + "step": 49300, + "train_speed(iter/s)": 0.124727 + }, + { + "acc": 0.74379945, + "epoch": 0.2758208003668663, + "grad_norm": 8.125, + "learning_rate": 9.790204834790557e-06, + "loss": 0.99007034, + "memory(GiB)": 302.58, + "step": 49320, + "train_speed(iter/s)": 0.124752 + }, + { + "acc": 0.7221487, + "epoch": 0.27593264983984556, + "grad_norm": 7.03125, + "learning_rate": 9.789939703479642e-06, + "loss": 1.10558739, + "memory(GiB)": 302.58, + "step": 49340, + "train_speed(iter/s)": 0.124776 + }, + { + "acc": 0.71477728, + "epoch": 0.2760444993128248, + "grad_norm": 7.90625, + "learning_rate": 9.789674408337602e-06, + "loss": 1.15124588, + "memory(GiB)": 302.58, + "step": 49360, + "train_speed(iter/s)": 0.1248 + }, + { + "acc": 0.74231515, + "epoch": 0.2761563487858041, + "grad_norm": 6.84375, + "learning_rate": 9.789408949373506e-06, + "loss": 1.02448206, + "memory(GiB)": 302.58, + "step": 49380, + "train_speed(iter/s)": 0.124824 + }, + { + "acc": 0.74743862, + "epoch": 0.27626819825878335, + "grad_norm": 7.34375, + "learning_rate": 9.789143326596441e-06, + "loss": 0.99943371, + "memory(GiB)": 302.58, + "step": 49400, + "train_speed(iter/s)": 0.124847 + }, + { + "acc": 0.74390678, + "epoch": 0.2763800477317626, + "grad_norm": 7.59375, + "learning_rate": 9.788877540015487e-06, + "loss": 1.00882988, + "memory(GiB)": 302.58, + "step": 49420, + "train_speed(iter/s)": 0.124869 + }, + { + "acc": 0.73608465, + "epoch": 0.2764918972047419, + "grad_norm": 7.46875, + "learning_rate": 9.788611589639736e-06, + "loss": 1.04986172, + "memory(GiB)": 302.58, + "step": 49440, + "train_speed(iter/s)": 0.124893 + }, + { + "acc": 0.71888146, + "epoch": 0.27660374667772114, + "grad_norm": 7.875, + "learning_rate": 9.788345475478285e-06, + "loss": 1.09986506, + "memory(GiB)": 302.58, + "step": 49460, + "train_speed(iter/s)": 0.124917 + }, + { + "acc": 0.73931026, + "epoch": 0.2767155961507004, + "grad_norm": 7.28125, + "learning_rate": 9.788079197540233e-06, + "loss": 1.01515646, + "memory(GiB)": 302.58, + "step": 49480, + "train_speed(iter/s)": 0.12494 + }, + { + "acc": 0.73258047, + "epoch": 0.27682744562367967, + "grad_norm": 7.53125, + "learning_rate": 9.787812755834691e-06, + "loss": 1.07735424, + "memory(GiB)": 302.58, + "step": 49500, + "train_speed(iter/s)": 0.124963 + }, + { + "acc": 0.74213281, + "epoch": 0.27693929509665893, + "grad_norm": 6.65625, + "learning_rate": 9.787546150370772e-06, + "loss": 1.01038313, + "memory(GiB)": 302.58, + "step": 49520, + "train_speed(iter/s)": 0.124988 + }, + { + "acc": 0.75506849, + "epoch": 0.2770511445696382, + "grad_norm": 7.5, + "learning_rate": 9.787279381157592e-06, + "loss": 0.95694132, + "memory(GiB)": 302.58, + "step": 49540, + "train_speed(iter/s)": 0.125011 + }, + { + "acc": 0.7224854, + "epoch": 0.27716299404261746, + "grad_norm": 8.0625, + "learning_rate": 9.787012448204281e-06, + "loss": 1.09949646, + "memory(GiB)": 302.58, + "step": 49560, + "train_speed(iter/s)": 0.125034 + }, + { + "acc": 0.74187427, + "epoch": 0.2772748435155967, + "grad_norm": 6.53125, + "learning_rate": 9.786745351519961e-06, + "loss": 1.01258774, + "memory(GiB)": 302.58, + "step": 49580, + "train_speed(iter/s)": 0.125057 + }, + { + "acc": 0.7279964, + "epoch": 0.277386692988576, + "grad_norm": 5.5, + "learning_rate": 9.786478091113774e-06, + "loss": 1.07747746, + "memory(GiB)": 302.58, + "step": 49600, + "train_speed(iter/s)": 0.12508 + }, + { + "acc": 0.73274016, + "epoch": 0.27749854246155525, + "grad_norm": 8.4375, + "learning_rate": 9.786210666994858e-06, + "loss": 1.04888592, + "memory(GiB)": 302.58, + "step": 49620, + "train_speed(iter/s)": 0.125104 + }, + { + "acc": 0.71870389, + "epoch": 0.2776103919345345, + "grad_norm": 6.46875, + "learning_rate": 9.78594307917236e-06, + "loss": 1.1186614, + "memory(GiB)": 302.58, + "step": 49640, + "train_speed(iter/s)": 0.125127 + }, + { + "acc": 0.7444026, + "epoch": 0.2777222414075138, + "grad_norm": 6.65625, + "learning_rate": 9.785675327655433e-06, + "loss": 1.00764942, + "memory(GiB)": 302.58, + "step": 49660, + "train_speed(iter/s)": 0.125148 + }, + { + "acc": 0.73574677, + "epoch": 0.27783409088049305, + "grad_norm": 6.96875, + "learning_rate": 9.785407412453234e-06, + "loss": 1.03507891, + "memory(GiB)": 302.58, + "step": 49680, + "train_speed(iter/s)": 0.125172 + }, + { + "acc": 0.7391593, + "epoch": 0.2779459403534723, + "grad_norm": 9.8125, + "learning_rate": 9.78513933357493e-06, + "loss": 1.02810049, + "memory(GiB)": 302.58, + "step": 49700, + "train_speed(iter/s)": 0.125195 + }, + { + "acc": 0.73669181, + "epoch": 0.2780577898264516, + "grad_norm": 6.84375, + "learning_rate": 9.784871091029685e-06, + "loss": 1.04163113, + "memory(GiB)": 302.58, + "step": 49720, + "train_speed(iter/s)": 0.125218 + }, + { + "acc": 0.73756166, + "epoch": 0.27816963929943084, + "grad_norm": 7.9375, + "learning_rate": 9.784602684826677e-06, + "loss": 1.03264008, + "memory(GiB)": 302.58, + "step": 49740, + "train_speed(iter/s)": 0.125239 + }, + { + "acc": 0.74114356, + "epoch": 0.2782814887724101, + "grad_norm": 5.5625, + "learning_rate": 9.784334114975085e-06, + "loss": 1.01704473, + "memory(GiB)": 302.58, + "step": 49760, + "train_speed(iter/s)": 0.125263 + }, + { + "acc": 0.72384801, + "epoch": 0.27839333824538937, + "grad_norm": 6.90625, + "learning_rate": 9.784065381484098e-06, + "loss": 1.08519535, + "memory(GiB)": 302.58, + "step": 49780, + "train_speed(iter/s)": 0.125287 + }, + { + "acc": 0.73024282, + "epoch": 0.27850518771836863, + "grad_norm": 7.84375, + "learning_rate": 9.783796484362903e-06, + "loss": 1.07165031, + "memory(GiB)": 302.58, + "step": 49800, + "train_speed(iter/s)": 0.125311 + }, + { + "acc": 0.71733222, + "epoch": 0.2786170371913479, + "grad_norm": 7.375, + "learning_rate": 9.783527423620699e-06, + "loss": 1.12332335, + "memory(GiB)": 302.58, + "step": 49820, + "train_speed(iter/s)": 0.125335 + }, + { + "acc": 0.73390718, + "epoch": 0.27872888666432716, + "grad_norm": 6.53125, + "learning_rate": 9.78325819926669e-06, + "loss": 1.05257082, + "memory(GiB)": 302.58, + "step": 49840, + "train_speed(iter/s)": 0.125358 + }, + { + "acc": 0.73387074, + "epoch": 0.2788407361373064, + "grad_norm": 8.0, + "learning_rate": 9.782988811310083e-06, + "loss": 1.05673113, + "memory(GiB)": 302.58, + "step": 49860, + "train_speed(iter/s)": 0.125382 + }, + { + "acc": 0.75255785, + "epoch": 0.2789525856102857, + "grad_norm": 7.5625, + "learning_rate": 9.782719259760091e-06, + "loss": 0.98257828, + "memory(GiB)": 302.58, + "step": 49880, + "train_speed(iter/s)": 0.125405 + }, + { + "acc": 0.71299324, + "epoch": 0.27906443508326495, + "grad_norm": 5.09375, + "learning_rate": 9.782449544625936e-06, + "loss": 1.13708735, + "memory(GiB)": 302.58, + "step": 49900, + "train_speed(iter/s)": 0.125429 + }, + { + "acc": 0.72772217, + "epoch": 0.2791762845562442, + "grad_norm": 10.5625, + "learning_rate": 9.782179665916842e-06, + "loss": 1.06668119, + "memory(GiB)": 302.58, + "step": 49920, + "train_speed(iter/s)": 0.125454 + }, + { + "acc": 0.73181458, + "epoch": 0.2792881340292235, + "grad_norm": 5.8125, + "learning_rate": 9.78190962364204e-06, + "loss": 1.06826801, + "memory(GiB)": 302.58, + "step": 49940, + "train_speed(iter/s)": 0.125478 + }, + { + "acc": 0.72727294, + "epoch": 0.27939998350220274, + "grad_norm": 6.125, + "learning_rate": 9.781639417810765e-06, + "loss": 1.06388693, + "memory(GiB)": 302.58, + "step": 49960, + "train_speed(iter/s)": 0.125502 + }, + { + "acc": 0.74559531, + "epoch": 0.279511832975182, + "grad_norm": 6.46875, + "learning_rate": 9.781369048432259e-06, + "loss": 0.99269466, + "memory(GiB)": 302.58, + "step": 49980, + "train_speed(iter/s)": 0.125525 + }, + { + "acc": 0.73126841, + "epoch": 0.27962368244816127, + "grad_norm": 7.59375, + "learning_rate": 9.781098515515771e-06, + "loss": 1.05635071, + "memory(GiB)": 302.58, + "step": 50000, + "train_speed(iter/s)": 0.125549 + }, + { + "epoch": 0.27962368244816127, + "eval_acc": 0.6975584977849475, + "eval_loss": 1.0528035163879395, + "eval_runtime": 7565.4906, + "eval_samples_per_second": 9.951, + "eval_steps_per_second": 9.951, + "step": 50000 + }, + { + "acc": 0.73377972, + "epoch": 0.27973553192114053, + "grad_norm": 6.1875, + "learning_rate": 9.780827819070553e-06, + "loss": 1.03087053, + "memory(GiB)": 302.58, + "step": 50020, + "train_speed(iter/s)": 0.123201 + }, + { + "acc": 0.74140882, + "epoch": 0.2798473813941198, + "grad_norm": 7.65625, + "learning_rate": 9.780556959105864e-06, + "loss": 1.02055244, + "memory(GiB)": 302.58, + "step": 50040, + "train_speed(iter/s)": 0.123222 + }, + { + "acc": 0.70443659, + "epoch": 0.27995923086709906, + "grad_norm": 8.0, + "learning_rate": 9.780285935630968e-06, + "loss": 1.20330315, + "memory(GiB)": 302.58, + "step": 50060, + "train_speed(iter/s)": 0.123246 + }, + { + "acc": 0.73475838, + "epoch": 0.2800710803400783, + "grad_norm": 6.375, + "learning_rate": 9.780014748655139e-06, + "loss": 1.04936132, + "memory(GiB)": 302.58, + "step": 50080, + "train_speed(iter/s)": 0.123269 + }, + { + "acc": 0.72787175, + "epoch": 0.2801829298130576, + "grad_norm": 6.53125, + "learning_rate": 9.779743398187644e-06, + "loss": 1.08691797, + "memory(GiB)": 302.58, + "step": 50100, + "train_speed(iter/s)": 0.12329 + }, + { + "acc": 0.73686361, + "epoch": 0.28029477928603685, + "grad_norm": 7.8125, + "learning_rate": 9.77947188423777e-06, + "loss": 1.04601402, + "memory(GiB)": 302.58, + "step": 50120, + "train_speed(iter/s)": 0.123314 + }, + { + "acc": 0.75770969, + "epoch": 0.2804066287590161, + "grad_norm": 6.8125, + "learning_rate": 9.779200206814801e-06, + "loss": 0.94921007, + "memory(GiB)": 302.58, + "step": 50140, + "train_speed(iter/s)": 0.123337 + }, + { + "acc": 0.73665891, + "epoch": 0.2805184782319954, + "grad_norm": 7.84375, + "learning_rate": 9.778928365928032e-06, + "loss": 1.03432093, + "memory(GiB)": 302.58, + "step": 50160, + "train_speed(iter/s)": 0.123358 + }, + { + "acc": 0.7393333, + "epoch": 0.28063032770497465, + "grad_norm": 8.375, + "learning_rate": 9.778656361586759e-06, + "loss": 1.01640005, + "memory(GiB)": 302.58, + "step": 50180, + "train_speed(iter/s)": 0.123381 + }, + { + "acc": 0.73350945, + "epoch": 0.2807421771779539, + "grad_norm": 4.9375, + "learning_rate": 9.778384193800285e-06, + "loss": 1.06210604, + "memory(GiB)": 302.58, + "step": 50200, + "train_speed(iter/s)": 0.123405 + }, + { + "acc": 0.70960507, + "epoch": 0.2808540266509332, + "grad_norm": 4.71875, + "learning_rate": 9.778111862577922e-06, + "loss": 1.16762419, + "memory(GiB)": 302.58, + "step": 50220, + "train_speed(iter/s)": 0.123429 + }, + { + "acc": 0.73756189, + "epoch": 0.28096587612391244, + "grad_norm": 7.03125, + "learning_rate": 9.77783936792898e-06, + "loss": 1.03571444, + "memory(GiB)": 302.58, + "step": 50240, + "train_speed(iter/s)": 0.123451 + }, + { + "acc": 0.73618989, + "epoch": 0.2810777255968917, + "grad_norm": 6.9375, + "learning_rate": 9.777566709862783e-06, + "loss": 1.00050669, + "memory(GiB)": 302.58, + "step": 50260, + "train_speed(iter/s)": 0.123473 + }, + { + "acc": 0.72079506, + "epoch": 0.28118957506987097, + "grad_norm": 7.3125, + "learning_rate": 9.777293888388655e-06, + "loss": 1.1118804, + "memory(GiB)": 302.58, + "step": 50280, + "train_speed(iter/s)": 0.123495 + }, + { + "acc": 0.74235525, + "epoch": 0.28130142454285023, + "grad_norm": 6.40625, + "learning_rate": 9.777020903515926e-06, + "loss": 1.00156918, + "memory(GiB)": 302.58, + "step": 50300, + "train_speed(iter/s)": 0.123518 + }, + { + "acc": 0.73166895, + "epoch": 0.2814132740158295, + "grad_norm": 8.8125, + "learning_rate": 9.776747755253935e-06, + "loss": 1.05323067, + "memory(GiB)": 302.58, + "step": 50320, + "train_speed(iter/s)": 0.12354 + }, + { + "acc": 0.73590798, + "epoch": 0.28152512348880876, + "grad_norm": 5.21875, + "learning_rate": 9.776474443612025e-06, + "loss": 1.03072138, + "memory(GiB)": 302.58, + "step": 50340, + "train_speed(iter/s)": 0.123563 + }, + { + "acc": 0.72964168, + "epoch": 0.281636972961788, + "grad_norm": 10.0625, + "learning_rate": 9.776200968599544e-06, + "loss": 1.06605129, + "memory(GiB)": 302.58, + "step": 50360, + "train_speed(iter/s)": 0.123587 + }, + { + "acc": 0.74030313, + "epoch": 0.2817488224347673, + "grad_norm": 5.9375, + "learning_rate": 9.775927330225843e-06, + "loss": 1.02162304, + "memory(GiB)": 302.58, + "step": 50380, + "train_speed(iter/s)": 0.123611 + }, + { + "acc": 0.73906631, + "epoch": 0.28186067190774655, + "grad_norm": 6.875, + "learning_rate": 9.775653528500285e-06, + "loss": 1.02173481, + "memory(GiB)": 302.58, + "step": 50400, + "train_speed(iter/s)": 0.123634 + }, + { + "acc": 0.72226453, + "epoch": 0.2819725213807258, + "grad_norm": 7.46875, + "learning_rate": 9.775379563432233e-06, + "loss": 1.1023325, + "memory(GiB)": 302.58, + "step": 50420, + "train_speed(iter/s)": 0.123657 + }, + { + "acc": 0.721416, + "epoch": 0.2820843708537051, + "grad_norm": 8.0, + "learning_rate": 9.775105435031057e-06, + "loss": 1.10508633, + "memory(GiB)": 302.58, + "step": 50440, + "train_speed(iter/s)": 0.123681 + }, + { + "acc": 0.73077354, + "epoch": 0.28219622032668434, + "grad_norm": 5.3125, + "learning_rate": 9.774831143306135e-06, + "loss": 1.06253176, + "memory(GiB)": 302.58, + "step": 50460, + "train_speed(iter/s)": 0.123702 + }, + { + "acc": 0.74143677, + "epoch": 0.2823080697996636, + "grad_norm": 5.84375, + "learning_rate": 9.774556688266846e-06, + "loss": 0.98904305, + "memory(GiB)": 302.58, + "step": 50480, + "train_speed(iter/s)": 0.123724 + }, + { + "acc": 0.72668824, + "epoch": 0.28241991927264287, + "grad_norm": 8.375, + "learning_rate": 9.774282069922578e-06, + "loss": 1.09343281, + "memory(GiB)": 302.58, + "step": 50500, + "train_speed(iter/s)": 0.123746 + }, + { + "acc": 0.72377973, + "epoch": 0.28253176874562214, + "grad_norm": 5.28125, + "learning_rate": 9.774007288282724e-06, + "loss": 1.10438166, + "memory(GiB)": 302.58, + "step": 50520, + "train_speed(iter/s)": 0.123767 + }, + { + "acc": 0.74826503, + "epoch": 0.2826436182186014, + "grad_norm": 7.0, + "learning_rate": 9.773732343356684e-06, + "loss": 0.9926363, + "memory(GiB)": 302.58, + "step": 50540, + "train_speed(iter/s)": 0.123789 + }, + { + "acc": 0.74455018, + "epoch": 0.28275546769158066, + "grad_norm": 6.375, + "learning_rate": 9.77345723515386e-06, + "loss": 1.00129881, + "memory(GiB)": 302.58, + "step": 50560, + "train_speed(iter/s)": 0.123812 + }, + { + "acc": 0.73026204, + "epoch": 0.28286731716455993, + "grad_norm": 6.4375, + "learning_rate": 9.773181963683663e-06, + "loss": 1.04057693, + "memory(GiB)": 302.58, + "step": 50580, + "train_speed(iter/s)": 0.123835 + }, + { + "acc": 0.73178215, + "epoch": 0.2829791666375392, + "grad_norm": 10.1875, + "learning_rate": 9.772906528955508e-06, + "loss": 1.05653744, + "memory(GiB)": 302.58, + "step": 50600, + "train_speed(iter/s)": 0.123857 + }, + { + "acc": 0.74097714, + "epoch": 0.28309101611051846, + "grad_norm": 9.5625, + "learning_rate": 9.772630930978816e-06, + "loss": 1.0338048, + "memory(GiB)": 302.58, + "step": 50620, + "train_speed(iter/s)": 0.12388 + }, + { + "acc": 0.73589091, + "epoch": 0.2832028655834977, + "grad_norm": 6.5625, + "learning_rate": 9.772355169763009e-06, + "loss": 1.07110233, + "memory(GiB)": 302.58, + "step": 50640, + "train_speed(iter/s)": 0.123902 + }, + { + "acc": 0.73005862, + "epoch": 0.283314715056477, + "grad_norm": 8.75, + "learning_rate": 9.772079245317525e-06, + "loss": 1.08105412, + "memory(GiB)": 302.58, + "step": 50660, + "train_speed(iter/s)": 0.123925 + }, + { + "acc": 0.73522544, + "epoch": 0.28342656452945625, + "grad_norm": 7.84375, + "learning_rate": 9.771803157651798e-06, + "loss": 1.03548346, + "memory(GiB)": 302.58, + "step": 50680, + "train_speed(iter/s)": 0.123949 + }, + { + "acc": 0.74230938, + "epoch": 0.2835384140024355, + "grad_norm": 6.9375, + "learning_rate": 9.771526906775273e-06, + "loss": 1.00610638, + "memory(GiB)": 302.58, + "step": 50700, + "train_speed(iter/s)": 0.123972 + }, + { + "acc": 0.73807278, + "epoch": 0.2836502634754148, + "grad_norm": 5.15625, + "learning_rate": 9.771250492697397e-06, + "loss": 1.02621813, + "memory(GiB)": 302.58, + "step": 50720, + "train_speed(iter/s)": 0.123996 + }, + { + "acc": 0.733953, + "epoch": 0.28376211294839404, + "grad_norm": 4.28125, + "learning_rate": 9.770973915427627e-06, + "loss": 1.05469761, + "memory(GiB)": 302.58, + "step": 50740, + "train_speed(iter/s)": 0.124018 + }, + { + "acc": 0.72520328, + "epoch": 0.2838739624213733, + "grad_norm": 8.5, + "learning_rate": 9.770697174975417e-06, + "loss": 1.09937563, + "memory(GiB)": 302.58, + "step": 50760, + "train_speed(iter/s)": 0.124042 + }, + { + "acc": 0.7131382, + "epoch": 0.28398581189435257, + "grad_norm": 6.0, + "learning_rate": 9.770420271350239e-06, + "loss": 1.15761681, + "memory(GiB)": 302.58, + "step": 50780, + "train_speed(iter/s)": 0.124065 + }, + { + "acc": 0.7296339, + "epoch": 0.28409766136733183, + "grad_norm": 6.8125, + "learning_rate": 9.77014320456156e-06, + "loss": 1.08677626, + "memory(GiB)": 302.58, + "step": 50800, + "train_speed(iter/s)": 0.124085 + }, + { + "acc": 0.7308392, + "epoch": 0.2842095108403111, + "grad_norm": 6.125, + "learning_rate": 9.769865974618857e-06, + "loss": 1.08758535, + "memory(GiB)": 302.58, + "step": 50820, + "train_speed(iter/s)": 0.124106 + }, + { + "acc": 0.72325454, + "epoch": 0.28432136031329036, + "grad_norm": 7.3125, + "learning_rate": 9.769588581531614e-06, + "loss": 1.08315506, + "memory(GiB)": 302.58, + "step": 50840, + "train_speed(iter/s)": 0.124129 + }, + { + "acc": 0.73186207, + "epoch": 0.2844332097862696, + "grad_norm": 6.875, + "learning_rate": 9.769311025309319e-06, + "loss": 1.05776768, + "memory(GiB)": 302.58, + "step": 50860, + "train_speed(iter/s)": 0.124152 + }, + { + "acc": 0.70952168, + "epoch": 0.2845450592592489, + "grad_norm": 4.03125, + "learning_rate": 9.769033305961461e-06, + "loss": 1.1840806, + "memory(GiB)": 302.58, + "step": 50880, + "train_speed(iter/s)": 0.124173 + }, + { + "acc": 0.71590695, + "epoch": 0.28465690873222815, + "grad_norm": 5.25, + "learning_rate": 9.768755423497543e-06, + "loss": 1.15186729, + "memory(GiB)": 302.58, + "step": 50900, + "train_speed(iter/s)": 0.124195 + }, + { + "acc": 0.73629494, + "epoch": 0.2847687582052074, + "grad_norm": 8.625, + "learning_rate": 9.768477377927068e-06, + "loss": 1.04550695, + "memory(GiB)": 302.58, + "step": 50920, + "train_speed(iter/s)": 0.124218 + }, + { + "acc": 0.74739046, + "epoch": 0.2848806076781867, + "grad_norm": 7.46875, + "learning_rate": 9.768199169259548e-06, + "loss": 0.99231997, + "memory(GiB)": 302.58, + "step": 50940, + "train_speed(iter/s)": 0.124243 + }, + { + "acc": 0.73088317, + "epoch": 0.28499245715116595, + "grad_norm": 10.0, + "learning_rate": 9.767920797504493e-06, + "loss": 1.04895029, + "memory(GiB)": 302.58, + "step": 50960, + "train_speed(iter/s)": 0.124265 + }, + { + "acc": 0.73395138, + "epoch": 0.2851043066241452, + "grad_norm": 7.65625, + "learning_rate": 9.76764226267143e-06, + "loss": 1.04835854, + "memory(GiB)": 302.58, + "step": 50980, + "train_speed(iter/s)": 0.124287 + }, + { + "acc": 0.72487082, + "epoch": 0.2852161560971245, + "grad_norm": 4.75, + "learning_rate": 9.767363564769885e-06, + "loss": 1.08204575, + "memory(GiB)": 302.58, + "step": 51000, + "train_speed(iter/s)": 0.12431 + }, + { + "acc": 0.72619762, + "epoch": 0.28532800557010374, + "grad_norm": 6.28125, + "learning_rate": 9.76708470380939e-06, + "loss": 1.08278179, + "memory(GiB)": 302.58, + "step": 51020, + "train_speed(iter/s)": 0.124331 + }, + { + "acc": 0.75065622, + "epoch": 0.285439855043083, + "grad_norm": 8.1875, + "learning_rate": 9.766805679799478e-06, + "loss": 0.97187271, + "memory(GiB)": 302.58, + "step": 51040, + "train_speed(iter/s)": 0.124353 + }, + { + "acc": 0.73196988, + "epoch": 0.28555170451606227, + "grad_norm": 6.5, + "learning_rate": 9.766526492749699e-06, + "loss": 1.07156506, + "memory(GiB)": 302.58, + "step": 51060, + "train_speed(iter/s)": 0.124376 + }, + { + "acc": 0.7228898, + "epoch": 0.28566355398904153, + "grad_norm": 6.5625, + "learning_rate": 9.7662471426696e-06, + "loss": 1.11914043, + "memory(GiB)": 302.58, + "step": 51080, + "train_speed(iter/s)": 0.124399 + }, + { + "acc": 0.72953973, + "epoch": 0.2857754034620208, + "grad_norm": 7.96875, + "learning_rate": 9.765967629568735e-06, + "loss": 1.06750536, + "memory(GiB)": 302.58, + "step": 51100, + "train_speed(iter/s)": 0.124422 + }, + { + "acc": 0.72923441, + "epoch": 0.28588725293500006, + "grad_norm": 4.65625, + "learning_rate": 9.765687953456664e-06, + "loss": 1.04844599, + "memory(GiB)": 302.58, + "step": 51120, + "train_speed(iter/s)": 0.124443 + }, + { + "acc": 0.74113955, + "epoch": 0.2859991024079793, + "grad_norm": 7.0625, + "learning_rate": 9.765408114342954e-06, + "loss": 1.00829468, + "memory(GiB)": 302.58, + "step": 51140, + "train_speed(iter/s)": 0.124466 + }, + { + "acc": 0.73935657, + "epoch": 0.2861109518809586, + "grad_norm": 8.75, + "learning_rate": 9.765128112237177e-06, + "loss": 1.01741638, + "memory(GiB)": 302.58, + "step": 51160, + "train_speed(iter/s)": 0.124488 + }, + { + "acc": 0.72659383, + "epoch": 0.28622280135393785, + "grad_norm": 9.5, + "learning_rate": 9.764847947148908e-06, + "loss": 1.0903265, + "memory(GiB)": 302.58, + "step": 51180, + "train_speed(iter/s)": 0.124511 + }, + { + "acc": 0.72901196, + "epoch": 0.2863346508269171, + "grad_norm": 6.65625, + "learning_rate": 9.764567619087729e-06, + "loss": 1.07526731, + "memory(GiB)": 302.58, + "step": 51200, + "train_speed(iter/s)": 0.124534 + }, + { + "acc": 0.72527156, + "epoch": 0.2864465002998964, + "grad_norm": 5.53125, + "learning_rate": 9.76428712806323e-06, + "loss": 1.093643, + "memory(GiB)": 302.58, + "step": 51220, + "train_speed(iter/s)": 0.124555 + }, + { + "acc": 0.71969714, + "epoch": 0.28655834977287564, + "grad_norm": 10.125, + "learning_rate": 9.764006474085004e-06, + "loss": 1.14846249, + "memory(GiB)": 302.58, + "step": 51240, + "train_speed(iter/s)": 0.124577 + }, + { + "acc": 0.73569436, + "epoch": 0.2866701992458549, + "grad_norm": 7.0, + "learning_rate": 9.763725657162651e-06, + "loss": 1.05530806, + "memory(GiB)": 302.58, + "step": 51260, + "train_speed(iter/s)": 0.1246 + }, + { + "acc": 0.74162526, + "epoch": 0.28678204871883417, + "grad_norm": 5.71875, + "learning_rate": 9.763444677305775e-06, + "loss": 1.00775766, + "memory(GiB)": 302.58, + "step": 51280, + "train_speed(iter/s)": 0.124624 + }, + { + "acc": 0.75160847, + "epoch": 0.28689389819181343, + "grad_norm": 8.1875, + "learning_rate": 9.763163534523987e-06, + "loss": 0.97496872, + "memory(GiB)": 302.58, + "step": 51300, + "train_speed(iter/s)": 0.124646 + }, + { + "acc": 0.72961926, + "epoch": 0.2870057476647927, + "grad_norm": 7.59375, + "learning_rate": 9.762882228826903e-06, + "loss": 1.05380878, + "memory(GiB)": 302.58, + "step": 51320, + "train_speed(iter/s)": 0.124668 + }, + { + "acc": 0.73037548, + "epoch": 0.28711759713777196, + "grad_norm": 5.84375, + "learning_rate": 9.76260076022414e-06, + "loss": 1.0605401, + "memory(GiB)": 302.58, + "step": 51340, + "train_speed(iter/s)": 0.124691 + }, + { + "acc": 0.72278609, + "epoch": 0.2872294466107512, + "grad_norm": 7.0625, + "learning_rate": 9.762319128725333e-06, + "loss": 1.09222546, + "memory(GiB)": 302.58, + "step": 51360, + "train_speed(iter/s)": 0.124715 + }, + { + "acc": 0.74151773, + "epoch": 0.2873412960837305, + "grad_norm": 7.28125, + "learning_rate": 9.762037334340112e-06, + "loss": 1.01677485, + "memory(GiB)": 302.58, + "step": 51380, + "train_speed(iter/s)": 0.124737 + }, + { + "acc": 0.71992278, + "epoch": 0.28745314555670975, + "grad_norm": 6.0625, + "learning_rate": 9.761755377078111e-06, + "loss": 1.09794884, + "memory(GiB)": 302.58, + "step": 51400, + "train_speed(iter/s)": 0.124761 + }, + { + "acc": 0.73270192, + "epoch": 0.287564995029689, + "grad_norm": 4.90625, + "learning_rate": 9.761473256948978e-06, + "loss": 1.04444761, + "memory(GiB)": 302.58, + "step": 51420, + "train_speed(iter/s)": 0.124783 + }, + { + "acc": 0.74342017, + "epoch": 0.2876768445026683, + "grad_norm": 8.5, + "learning_rate": 9.761190973962362e-06, + "loss": 1.00482588, + "memory(GiB)": 302.58, + "step": 51440, + "train_speed(iter/s)": 0.124807 + }, + { + "acc": 0.73782387, + "epoch": 0.28778869397564755, + "grad_norm": 6.125, + "learning_rate": 9.760908528127918e-06, + "loss": 1.0318758, + "memory(GiB)": 302.58, + "step": 51460, + "train_speed(iter/s)": 0.124828 + }, + { + "acc": 0.73627157, + "epoch": 0.2879005434486268, + "grad_norm": 6.1875, + "learning_rate": 9.760625919455304e-06, + "loss": 1.0214222, + "memory(GiB)": 302.58, + "step": 51480, + "train_speed(iter/s)": 0.12485 + }, + { + "acc": 0.73583236, + "epoch": 0.2880123929216061, + "grad_norm": 7.375, + "learning_rate": 9.76034314795419e-06, + "loss": 1.04236937, + "memory(GiB)": 302.58, + "step": 51500, + "train_speed(iter/s)": 0.124873 + }, + { + "acc": 0.732374, + "epoch": 0.2881242423945854, + "grad_norm": 5.40625, + "learning_rate": 9.760060213634244e-06, + "loss": 1.04767551, + "memory(GiB)": 302.58, + "step": 51520, + "train_speed(iter/s)": 0.124895 + }, + { + "acc": 0.72544999, + "epoch": 0.28823609186756466, + "grad_norm": 6.875, + "learning_rate": 9.759777116505145e-06, + "loss": 1.08769808, + "memory(GiB)": 302.58, + "step": 51540, + "train_speed(iter/s)": 0.124917 + }, + { + "acc": 0.72477531, + "epoch": 0.2883479413405439, + "grad_norm": 6.90625, + "learning_rate": 9.759493856576575e-06, + "loss": 1.08716049, + "memory(GiB)": 302.58, + "step": 51560, + "train_speed(iter/s)": 0.124939 + }, + { + "acc": 0.74447074, + "epoch": 0.2884597908135232, + "grad_norm": 7.0, + "learning_rate": 9.759210433858226e-06, + "loss": 1.0035387, + "memory(GiB)": 302.58, + "step": 51580, + "train_speed(iter/s)": 0.124963 + }, + { + "acc": 0.739221, + "epoch": 0.28857164028650245, + "grad_norm": 8.0625, + "learning_rate": 9.758926848359789e-06, + "loss": 1.04239082, + "memory(GiB)": 302.58, + "step": 51600, + "train_speed(iter/s)": 0.124985 + }, + { + "acc": 0.74131212, + "epoch": 0.2886834897594817, + "grad_norm": 6.09375, + "learning_rate": 9.75864310009096e-06, + "loss": 1.01107721, + "memory(GiB)": 302.58, + "step": 51620, + "train_speed(iter/s)": 0.125005 + }, + { + "acc": 0.73675981, + "epoch": 0.288795339232461, + "grad_norm": 7.875, + "learning_rate": 9.758359189061451e-06, + "loss": 1.03309851, + "memory(GiB)": 302.58, + "step": 51640, + "train_speed(iter/s)": 0.125029 + }, + { + "acc": 0.74131398, + "epoch": 0.28890718870544024, + "grad_norm": 7.4375, + "learning_rate": 9.75807511528097e-06, + "loss": 1.00596161, + "memory(GiB)": 302.58, + "step": 51660, + "train_speed(iter/s)": 0.125051 + }, + { + "acc": 0.72238035, + "epoch": 0.2890190381784195, + "grad_norm": 4.0625, + "learning_rate": 9.75779087875923e-06, + "loss": 1.09859791, + "memory(GiB)": 302.58, + "step": 51680, + "train_speed(iter/s)": 0.125072 + }, + { + "acc": 0.73926072, + "epoch": 0.28913088765139877, + "grad_norm": 6.78125, + "learning_rate": 9.757506479505955e-06, + "loss": 1.02269011, + "memory(GiB)": 302.58, + "step": 51700, + "train_speed(iter/s)": 0.125093 + }, + { + "acc": 0.74893174, + "epoch": 0.28924273712437804, + "grad_norm": 5.1875, + "learning_rate": 9.757221917530875e-06, + "loss": 0.9860796, + "memory(GiB)": 302.58, + "step": 51720, + "train_speed(iter/s)": 0.125115 + }, + { + "acc": 0.74944863, + "epoch": 0.2893545865973573, + "grad_norm": 8.75, + "learning_rate": 9.756937192843721e-06, + "loss": 0.98023357, + "memory(GiB)": 302.58, + "step": 51740, + "train_speed(iter/s)": 0.125137 + }, + { + "acc": 0.73954391, + "epoch": 0.28946643607033656, + "grad_norm": 8.9375, + "learning_rate": 9.756652305454228e-06, + "loss": 1.02882233, + "memory(GiB)": 302.58, + "step": 51760, + "train_speed(iter/s)": 0.125158 + }, + { + "acc": 0.74824572, + "epoch": 0.2895782855433158, + "grad_norm": 4.375, + "learning_rate": 9.756367255372146e-06, + "loss": 0.98535357, + "memory(GiB)": 302.58, + "step": 51780, + "train_speed(iter/s)": 0.12518 + }, + { + "acc": 0.72849746, + "epoch": 0.2896901350162951, + "grad_norm": 7.28125, + "learning_rate": 9.756082042607223e-06, + "loss": 1.09362764, + "memory(GiB)": 302.58, + "step": 51800, + "train_speed(iter/s)": 0.125203 + }, + { + "acc": 0.74275818, + "epoch": 0.28980198448927436, + "grad_norm": 6.40625, + "learning_rate": 9.755796667169209e-06, + "loss": 1.01102295, + "memory(GiB)": 302.58, + "step": 51820, + "train_speed(iter/s)": 0.125226 + }, + { + "acc": 0.71418214, + "epoch": 0.2899138339622536, + "grad_norm": 8.0625, + "learning_rate": 9.755511129067872e-06, + "loss": 1.137988, + "memory(GiB)": 302.58, + "step": 51840, + "train_speed(iter/s)": 0.125247 + }, + { + "acc": 0.74319339, + "epoch": 0.2900256834352329, + "grad_norm": 7.6875, + "learning_rate": 9.755225428312973e-06, + "loss": 1.02204657, + "memory(GiB)": 302.58, + "step": 51860, + "train_speed(iter/s)": 0.125268 + }, + { + "acc": 0.72422934, + "epoch": 0.29013753290821215, + "grad_norm": 7.71875, + "learning_rate": 9.754939564914288e-06, + "loss": 1.11184921, + "memory(GiB)": 302.58, + "step": 51880, + "train_speed(iter/s)": 0.12529 + }, + { + "acc": 0.75689478, + "epoch": 0.2902493823811914, + "grad_norm": 4.625, + "learning_rate": 9.754653538881592e-06, + "loss": 0.95367947, + "memory(GiB)": 302.58, + "step": 51900, + "train_speed(iter/s)": 0.12531 + }, + { + "acc": 0.72861862, + "epoch": 0.2903612318541707, + "grad_norm": 5.5625, + "learning_rate": 9.754367350224667e-06, + "loss": 1.10863743, + "memory(GiB)": 302.58, + "step": 51920, + "train_speed(iter/s)": 0.125331 + }, + { + "acc": 0.72922702, + "epoch": 0.29047308132714994, + "grad_norm": 9.625, + "learning_rate": 9.754080998953303e-06, + "loss": 1.04020596, + "memory(GiB)": 302.58, + "step": 51940, + "train_speed(iter/s)": 0.125354 + }, + { + "acc": 0.72578683, + "epoch": 0.2905849308001292, + "grad_norm": 7.65625, + "learning_rate": 9.753794485077295e-06, + "loss": 1.09784212, + "memory(GiB)": 302.58, + "step": 51960, + "train_speed(iter/s)": 0.125376 + }, + { + "acc": 0.74866815, + "epoch": 0.29069678027310847, + "grad_norm": 6.28125, + "learning_rate": 9.75350780860644e-06, + "loss": 0.98411427, + "memory(GiB)": 302.58, + "step": 51980, + "train_speed(iter/s)": 0.125398 + }, + { + "acc": 0.72931294, + "epoch": 0.29080862974608773, + "grad_norm": 5.96875, + "learning_rate": 9.753220969550547e-06, + "loss": 1.06531649, + "memory(GiB)": 302.58, + "step": 52000, + "train_speed(iter/s)": 0.125419 + }, + { + "epoch": 0.29080862974608773, + "eval_acc": 0.6980235636666516, + "eval_loss": 1.0512337684631348, + "eval_runtime": 7505.4779, + "eval_samples_per_second": 10.03, + "eval_steps_per_second": 10.03, + "step": 52000 + }, + { + "acc": 0.74354463, + "epoch": 0.290920479219067, + "grad_norm": 9.25, + "learning_rate": 9.752933967919424e-06, + "loss": 1.0147769, + "memory(GiB)": 302.58, + "step": 52020, + "train_speed(iter/s)": 0.123179 + }, + { + "acc": 0.7230895, + "epoch": 0.29103232869204626, + "grad_norm": 6.03125, + "learning_rate": 9.752646803722887e-06, + "loss": 1.08738108, + "memory(GiB)": 302.58, + "step": 52040, + "train_speed(iter/s)": 0.123203 + }, + { + "acc": 0.73534384, + "epoch": 0.2911441781650255, + "grad_norm": 7.46875, + "learning_rate": 9.75235947697076e-06, + "loss": 1.04381886, + "memory(GiB)": 302.58, + "step": 52060, + "train_speed(iter/s)": 0.123225 + }, + { + "acc": 0.74567604, + "epoch": 0.2912560276380048, + "grad_norm": 6.0, + "learning_rate": 9.752071987672871e-06, + "loss": 0.98797226, + "memory(GiB)": 302.58, + "step": 52080, + "train_speed(iter/s)": 0.123247 + }, + { + "acc": 0.72970138, + "epoch": 0.29136787711098405, + "grad_norm": 6.125, + "learning_rate": 9.751784335839051e-06, + "loss": 1.07610502, + "memory(GiB)": 302.58, + "step": 52100, + "train_speed(iter/s)": 0.123269 + }, + { + "acc": 0.71736455, + "epoch": 0.2914797265839633, + "grad_norm": 5.59375, + "learning_rate": 9.751496521479138e-06, + "loss": 1.13055639, + "memory(GiB)": 302.58, + "step": 52120, + "train_speed(iter/s)": 0.12329 + }, + { + "acc": 0.74428601, + "epoch": 0.2915915760569426, + "grad_norm": 4.78125, + "learning_rate": 9.751208544602979e-06, + "loss": 1.01226883, + "memory(GiB)": 302.58, + "step": 52140, + "train_speed(iter/s)": 0.123311 + }, + { + "acc": 0.73498869, + "epoch": 0.29170342552992184, + "grad_norm": 6.5625, + "learning_rate": 9.750920405220421e-06, + "loss": 1.02875795, + "memory(GiB)": 302.58, + "step": 52160, + "train_speed(iter/s)": 0.123334 + }, + { + "acc": 0.72983575, + "epoch": 0.2918152750029011, + "grad_norm": 6.1875, + "learning_rate": 9.75063210334132e-06, + "loss": 1.05379353, + "memory(GiB)": 302.58, + "step": 52180, + "train_speed(iter/s)": 0.123356 + }, + { + "acc": 0.73012152, + "epoch": 0.2919271244758804, + "grad_norm": 6.84375, + "learning_rate": 9.750343638975538e-06, + "loss": 1.07665882, + "memory(GiB)": 302.58, + "step": 52200, + "train_speed(iter/s)": 0.123376 + }, + { + "acc": 0.74640608, + "epoch": 0.29203897394885964, + "grad_norm": 6.25, + "learning_rate": 9.75005501213294e-06, + "loss": 0.98402815, + "memory(GiB)": 302.58, + "step": 52220, + "train_speed(iter/s)": 0.123399 + }, + { + "acc": 0.73140101, + "epoch": 0.2921508234218389, + "grad_norm": 4.375, + "learning_rate": 9.749766222823399e-06, + "loss": 1.05148964, + "memory(GiB)": 302.58, + "step": 52240, + "train_speed(iter/s)": 0.12342 + }, + { + "acc": 0.74693689, + "epoch": 0.29226267289481817, + "grad_norm": 7.125, + "learning_rate": 9.749477271056793e-06, + "loss": 0.98571205, + "memory(GiB)": 302.58, + "step": 52260, + "train_speed(iter/s)": 0.123443 + }, + { + "acc": 0.72344589, + "epoch": 0.29237452236779743, + "grad_norm": 5.8125, + "learning_rate": 9.749188156843002e-06, + "loss": 1.1125308, + "memory(GiB)": 302.58, + "step": 52280, + "train_speed(iter/s)": 0.123465 + }, + { + "acc": 0.7343214, + "epoch": 0.2924863718407767, + "grad_norm": 4.6875, + "learning_rate": 9.748898880191917e-06, + "loss": 1.05551291, + "memory(GiB)": 302.58, + "step": 52300, + "train_speed(iter/s)": 0.123487 + }, + { + "acc": 0.73841949, + "epoch": 0.29259822131375596, + "grad_norm": 5.1875, + "learning_rate": 9.748609441113435e-06, + "loss": 1.01803617, + "memory(GiB)": 302.58, + "step": 52320, + "train_speed(iter/s)": 0.123509 + }, + { + "acc": 0.7272676, + "epoch": 0.2927100707867352, + "grad_norm": 6.75, + "learning_rate": 9.74831983961745e-06, + "loss": 1.0677907, + "memory(GiB)": 302.58, + "step": 52340, + "train_speed(iter/s)": 0.123531 + }, + { + "acc": 0.73735642, + "epoch": 0.2928219202597145, + "grad_norm": 7.875, + "learning_rate": 9.74803007571387e-06, + "loss": 1.03521194, + "memory(GiB)": 302.58, + "step": 52360, + "train_speed(iter/s)": 0.123554 + }, + { + "acc": 0.71786842, + "epoch": 0.29293376973269375, + "grad_norm": 5.40625, + "learning_rate": 9.747740149412606e-06, + "loss": 1.10494242, + "memory(GiB)": 302.58, + "step": 52380, + "train_speed(iter/s)": 0.123575 + }, + { + "acc": 0.75121369, + "epoch": 0.293045619205673, + "grad_norm": 10.0, + "learning_rate": 9.747450060723576e-06, + "loss": 0.98013086, + "memory(GiB)": 302.58, + "step": 52400, + "train_speed(iter/s)": 0.123598 + }, + { + "acc": 0.74672904, + "epoch": 0.2931574686786523, + "grad_norm": 7.65625, + "learning_rate": 9.747159809656699e-06, + "loss": 0.9818121, + "memory(GiB)": 302.58, + "step": 52420, + "train_speed(iter/s)": 0.12362 + }, + { + "acc": 0.7295722, + "epoch": 0.29326931815163154, + "grad_norm": 7.375, + "learning_rate": 9.746869396221902e-06, + "loss": 1.06184149, + "memory(GiB)": 302.58, + "step": 52440, + "train_speed(iter/s)": 0.123642 + }, + { + "acc": 0.72736545, + "epoch": 0.2933811676246108, + "grad_norm": 6.25, + "learning_rate": 9.746578820429122e-06, + "loss": 1.07844944, + "memory(GiB)": 302.58, + "step": 52460, + "train_speed(iter/s)": 0.123664 + }, + { + "acc": 0.72368627, + "epoch": 0.29349301709759007, + "grad_norm": 6.53125, + "learning_rate": 9.746288082288294e-06, + "loss": 1.10670452, + "memory(GiB)": 302.58, + "step": 52480, + "train_speed(iter/s)": 0.123687 + }, + { + "acc": 0.73283296, + "epoch": 0.29360486657056933, + "grad_norm": 8.1875, + "learning_rate": 9.745997181809364e-06, + "loss": 1.03190298, + "memory(GiB)": 302.58, + "step": 52500, + "train_speed(iter/s)": 0.123709 + }, + { + "acc": 0.73595634, + "epoch": 0.2937167160435486, + "grad_norm": 7.3125, + "learning_rate": 9.745706119002281e-06, + "loss": 1.05690145, + "memory(GiB)": 302.58, + "step": 52520, + "train_speed(iter/s)": 0.123732 + }, + { + "acc": 0.73554735, + "epoch": 0.29382856551652786, + "grad_norm": 5.3125, + "learning_rate": 9.745414893877001e-06, + "loss": 1.02967997, + "memory(GiB)": 302.58, + "step": 52540, + "train_speed(iter/s)": 0.123753 + }, + { + "acc": 0.74028091, + "epoch": 0.2939404149895071, + "grad_norm": 7.9375, + "learning_rate": 9.745123506443485e-06, + "loss": 1.02897263, + "memory(GiB)": 302.58, + "step": 52560, + "train_speed(iter/s)": 0.123773 + }, + { + "acc": 0.73406711, + "epoch": 0.2940522644624864, + "grad_norm": 5.59375, + "learning_rate": 9.744831956711698e-06, + "loss": 1.04930792, + "memory(GiB)": 302.58, + "step": 52580, + "train_speed(iter/s)": 0.123794 + }, + { + "acc": 0.73863292, + "epoch": 0.29416411393546565, + "grad_norm": 6.9375, + "learning_rate": 9.744540244691612e-06, + "loss": 1.03456135, + "memory(GiB)": 302.58, + "step": 52600, + "train_speed(iter/s)": 0.123814 + }, + { + "acc": 0.71739187, + "epoch": 0.2942759634084449, + "grad_norm": 7.59375, + "learning_rate": 9.744248370393205e-06, + "loss": 1.12186089, + "memory(GiB)": 302.58, + "step": 52620, + "train_speed(iter/s)": 0.123836 + }, + { + "acc": 0.73418489, + "epoch": 0.2943878128814242, + "grad_norm": 7.90625, + "learning_rate": 9.74395633382646e-06, + "loss": 1.05752668, + "memory(GiB)": 302.58, + "step": 52640, + "train_speed(iter/s)": 0.123857 + }, + { + "acc": 0.72542667, + "epoch": 0.29449966235440345, + "grad_norm": 4.9375, + "learning_rate": 9.743664135001368e-06, + "loss": 1.10906029, + "memory(GiB)": 302.58, + "step": 52660, + "train_speed(iter/s)": 0.12388 + }, + { + "acc": 0.7322391, + "epoch": 0.2946115118273827, + "grad_norm": 7.125, + "learning_rate": 9.743371773927918e-06, + "loss": 1.05781612, + "memory(GiB)": 302.58, + "step": 52680, + "train_speed(iter/s)": 0.1239 + }, + { + "acc": 0.73264861, + "epoch": 0.294723361300362, + "grad_norm": 7.25, + "learning_rate": 9.743079250616115e-06, + "loss": 1.04739542, + "memory(GiB)": 302.58, + "step": 52700, + "train_speed(iter/s)": 0.123922 + }, + { + "acc": 0.72960076, + "epoch": 0.29483521077334124, + "grad_norm": 6.375, + "learning_rate": 9.74278656507596e-06, + "loss": 1.09173794, + "memory(GiB)": 302.58, + "step": 52720, + "train_speed(iter/s)": 0.123943 + }, + { + "acc": 0.72908263, + "epoch": 0.2949470602463205, + "grad_norm": 5.8125, + "learning_rate": 9.742493717317466e-06, + "loss": 1.07738781, + "memory(GiB)": 302.58, + "step": 52740, + "train_speed(iter/s)": 0.123964 + }, + { + "acc": 0.74011168, + "epoch": 0.29505890971929977, + "grad_norm": 5.75, + "learning_rate": 9.74220070735065e-06, + "loss": 1.00560923, + "memory(GiB)": 302.58, + "step": 52760, + "train_speed(iter/s)": 0.123985 + }, + { + "acc": 0.7483355, + "epoch": 0.29517075919227903, + "grad_norm": 8.5625, + "learning_rate": 9.74190753518553e-06, + "loss": 0.9861598, + "memory(GiB)": 302.58, + "step": 52780, + "train_speed(iter/s)": 0.124007 + }, + { + "acc": 0.71508975, + "epoch": 0.2952826086652583, + "grad_norm": 6.0, + "learning_rate": 9.741614200832137e-06, + "loss": 1.15168409, + "memory(GiB)": 302.58, + "step": 52800, + "train_speed(iter/s)": 0.12403 + }, + { + "acc": 0.72234163, + "epoch": 0.29539445813823756, + "grad_norm": 5.125, + "learning_rate": 9.741320704300504e-06, + "loss": 1.09652653, + "memory(GiB)": 302.58, + "step": 52820, + "train_speed(iter/s)": 0.12405 + }, + { + "acc": 0.74729552, + "epoch": 0.2955063076112168, + "grad_norm": 7.15625, + "learning_rate": 9.741027045600668e-06, + "loss": 0.97758522, + "memory(GiB)": 302.58, + "step": 52840, + "train_speed(iter/s)": 0.124071 + }, + { + "acc": 0.74757161, + "epoch": 0.2956181570841961, + "grad_norm": 7.4375, + "learning_rate": 9.740733224742673e-06, + "loss": 0.9944232, + "memory(GiB)": 302.58, + "step": 52860, + "train_speed(iter/s)": 0.124094 + }, + { + "acc": 0.737638, + "epoch": 0.29573000655717535, + "grad_norm": 9.375, + "learning_rate": 9.740439241736572e-06, + "loss": 1.04234343, + "memory(GiB)": 302.58, + "step": 52880, + "train_speed(iter/s)": 0.124116 + }, + { + "acc": 0.72480421, + "epoch": 0.2958418560301546, + "grad_norm": 7.6875, + "learning_rate": 9.740145096592415e-06, + "loss": 1.08291636, + "memory(GiB)": 302.58, + "step": 52900, + "train_speed(iter/s)": 0.124139 + }, + { + "acc": 0.73042736, + "epoch": 0.2959537055031339, + "grad_norm": 6.4375, + "learning_rate": 9.739850789320266e-06, + "loss": 1.06589842, + "memory(GiB)": 302.58, + "step": 52920, + "train_speed(iter/s)": 0.124159 + }, + { + "acc": 0.74098339, + "epoch": 0.29606555497611314, + "grad_norm": 8.75, + "learning_rate": 9.739556319930189e-06, + "loss": 1.0092, + "memory(GiB)": 302.58, + "step": 52940, + "train_speed(iter/s)": 0.124182 + }, + { + "acc": 0.7261416, + "epoch": 0.2961774044490924, + "grad_norm": 6.03125, + "learning_rate": 9.739261688432257e-06, + "loss": 1.07351017, + "memory(GiB)": 302.58, + "step": 52960, + "train_speed(iter/s)": 0.124205 + }, + { + "acc": 0.7389883, + "epoch": 0.29628925392207167, + "grad_norm": 6.78125, + "learning_rate": 9.73896689483655e-06, + "loss": 1.01550341, + "memory(GiB)": 302.58, + "step": 52980, + "train_speed(iter/s)": 0.124226 + }, + { + "acc": 0.74333844, + "epoch": 0.29640110339505094, + "grad_norm": 8.9375, + "learning_rate": 9.738671939153147e-06, + "loss": 1.03231649, + "memory(GiB)": 302.58, + "step": 53000, + "train_speed(iter/s)": 0.124248 + }, + { + "acc": 0.73414989, + "epoch": 0.2965129528680302, + "grad_norm": 6.125, + "learning_rate": 9.738376821392138e-06, + "loss": 1.05644827, + "memory(GiB)": 302.58, + "step": 53020, + "train_speed(iter/s)": 0.12427 + }, + { + "acc": 0.73671479, + "epoch": 0.29662480234100946, + "grad_norm": 5.625, + "learning_rate": 9.738081541563616e-06, + "loss": 1.04490376, + "memory(GiB)": 302.58, + "step": 53040, + "train_speed(iter/s)": 0.124293 + }, + { + "acc": 0.73528676, + "epoch": 0.2967366518139887, + "grad_norm": 6.5625, + "learning_rate": 9.737786099677682e-06, + "loss": 1.05137672, + "memory(GiB)": 302.58, + "step": 53060, + "train_speed(iter/s)": 0.124315 + }, + { + "acc": 0.71620312, + "epoch": 0.296848501286968, + "grad_norm": 7.84375, + "learning_rate": 9.737490495744439e-06, + "loss": 1.12366705, + "memory(GiB)": 302.58, + "step": 53080, + "train_speed(iter/s)": 0.124336 + }, + { + "acc": 0.73030877, + "epoch": 0.29696035075994726, + "grad_norm": 5.8125, + "learning_rate": 9.737194729774e-06, + "loss": 1.06236153, + "memory(GiB)": 302.58, + "step": 53100, + "train_speed(iter/s)": 0.124359 + }, + { + "acc": 0.74292789, + "epoch": 0.2970722002329265, + "grad_norm": 7.71875, + "learning_rate": 9.73689880177648e-06, + "loss": 1.04123144, + "memory(GiB)": 302.58, + "step": 53120, + "train_speed(iter/s)": 0.124381 + }, + { + "acc": 0.72187362, + "epoch": 0.2971840497059058, + "grad_norm": 7.46875, + "learning_rate": 9.736602711762e-06, + "loss": 1.08144512, + "memory(GiB)": 302.58, + "step": 53140, + "train_speed(iter/s)": 0.124404 + }, + { + "acc": 0.74093604, + "epoch": 0.29729589917888505, + "grad_norm": 7.40625, + "learning_rate": 9.73630645974069e-06, + "loss": 1.00977879, + "memory(GiB)": 302.58, + "step": 53160, + "train_speed(iter/s)": 0.124426 + }, + { + "acc": 0.7076189, + "epoch": 0.2974077486518643, + "grad_norm": 8.25, + "learning_rate": 9.736010045722678e-06, + "loss": 1.1714056, + "memory(GiB)": 302.58, + "step": 53180, + "train_speed(iter/s)": 0.124446 + }, + { + "acc": 0.73076115, + "epoch": 0.2975195981248436, + "grad_norm": 5.78125, + "learning_rate": 9.735713469718107e-06, + "loss": 1.04550915, + "memory(GiB)": 302.58, + "step": 53200, + "train_speed(iter/s)": 0.124467 + }, + { + "acc": 0.71594229, + "epoch": 0.29763144759782284, + "grad_norm": 5.875, + "learning_rate": 9.735416731737117e-06, + "loss": 1.13585882, + "memory(GiB)": 302.58, + "step": 53220, + "train_speed(iter/s)": 0.124488 + }, + { + "acc": 0.7210166, + "epoch": 0.2977432970708021, + "grad_norm": 9.6875, + "learning_rate": 9.73511983178986e-06, + "loss": 1.12096682, + "memory(GiB)": 302.58, + "step": 53240, + "train_speed(iter/s)": 0.12451 + }, + { + "acc": 0.7430541, + "epoch": 0.29785514654378137, + "grad_norm": 5.40625, + "learning_rate": 9.73482276988649e-06, + "loss": 1.00657911, + "memory(GiB)": 302.58, + "step": 53260, + "train_speed(iter/s)": 0.12453 + }, + { + "acc": 0.73000813, + "epoch": 0.29796699601676063, + "grad_norm": 4.65625, + "learning_rate": 9.734525546037169e-06, + "loss": 1.08488855, + "memory(GiB)": 302.58, + "step": 53280, + "train_speed(iter/s)": 0.124549 + }, + { + "acc": 0.72018728, + "epoch": 0.2980788454897399, + "grad_norm": 7.40625, + "learning_rate": 9.73422816025206e-06, + "loss": 1.12044916, + "memory(GiB)": 302.58, + "step": 53300, + "train_speed(iter/s)": 0.124569 + }, + { + "acc": 0.73854671, + "epoch": 0.29819069496271916, + "grad_norm": 5.78125, + "learning_rate": 9.733930612541334e-06, + "loss": 1.03165007, + "memory(GiB)": 302.58, + "step": 53320, + "train_speed(iter/s)": 0.12459 + }, + { + "acc": 0.71529031, + "epoch": 0.2983025444356984, + "grad_norm": 7.25, + "learning_rate": 9.733632902915174e-06, + "loss": 1.15025711, + "memory(GiB)": 302.58, + "step": 53340, + "train_speed(iter/s)": 0.124612 + }, + { + "acc": 0.74080696, + "epoch": 0.2984143939086777, + "grad_norm": 9.625, + "learning_rate": 9.733335031383758e-06, + "loss": 1.02988148, + "memory(GiB)": 302.58, + "step": 53360, + "train_speed(iter/s)": 0.124633 + }, + { + "acc": 0.73807206, + "epoch": 0.29852624338165695, + "grad_norm": 5.5, + "learning_rate": 9.733036997957273e-06, + "loss": 1.06735172, + "memory(GiB)": 302.58, + "step": 53380, + "train_speed(iter/s)": 0.124654 + }, + { + "acc": 0.73739595, + "epoch": 0.2986380928546362, + "grad_norm": 8.1875, + "learning_rate": 9.732738802645915e-06, + "loss": 1.02582207, + "memory(GiB)": 302.58, + "step": 53400, + "train_speed(iter/s)": 0.124676 + }, + { + "acc": 0.74097648, + "epoch": 0.2987499423276155, + "grad_norm": 6.625, + "learning_rate": 9.732440445459885e-06, + "loss": 1.00728979, + "memory(GiB)": 302.58, + "step": 53420, + "train_speed(iter/s)": 0.124699 + }, + { + "acc": 0.73988543, + "epoch": 0.29886179180059474, + "grad_norm": 9.4375, + "learning_rate": 9.732141926409382e-06, + "loss": 1.02105923, + "memory(GiB)": 302.58, + "step": 53440, + "train_speed(iter/s)": 0.124721 + }, + { + "acc": 0.74186654, + "epoch": 0.298973641273574, + "grad_norm": 8.4375, + "learning_rate": 9.731843245504622e-06, + "loss": 1.0385807, + "memory(GiB)": 302.58, + "step": 53460, + "train_speed(iter/s)": 0.124744 + }, + { + "acc": 0.74881573, + "epoch": 0.2990854907465533, + "grad_norm": 6.625, + "learning_rate": 9.73154440275582e-06, + "loss": 0.99631834, + "memory(GiB)": 302.58, + "step": 53480, + "train_speed(iter/s)": 0.124765 + }, + { + "acc": 0.73074927, + "epoch": 0.29919734021953254, + "grad_norm": 6.3125, + "learning_rate": 9.731245398173194e-06, + "loss": 1.05400639, + "memory(GiB)": 302.58, + "step": 53500, + "train_speed(iter/s)": 0.124787 + }, + { + "acc": 0.73386927, + "epoch": 0.2993091896925118, + "grad_norm": 6.4375, + "learning_rate": 9.730946231766976e-06, + "loss": 1.03089867, + "memory(GiB)": 302.58, + "step": 53520, + "train_speed(iter/s)": 0.124808 + }, + { + "acc": 0.73124032, + "epoch": 0.29942103916549107, + "grad_norm": 5.84375, + "learning_rate": 9.730646903547391e-06, + "loss": 1.04200535, + "memory(GiB)": 302.58, + "step": 53540, + "train_speed(iter/s)": 0.124829 + }, + { + "acc": 0.71622252, + "epoch": 0.29953288863847033, + "grad_norm": 8.6875, + "learning_rate": 9.730347413524686e-06, + "loss": 1.15357151, + "memory(GiB)": 302.58, + "step": 53560, + "train_speed(iter/s)": 0.124851 + }, + { + "acc": 0.7356986, + "epoch": 0.2996447381114496, + "grad_norm": 7.09375, + "learning_rate": 9.730047761709096e-06, + "loss": 1.03301544, + "memory(GiB)": 302.58, + "step": 53580, + "train_speed(iter/s)": 0.124873 + }, + { + "acc": 0.73333845, + "epoch": 0.29975658758442886, + "grad_norm": 8.3125, + "learning_rate": 9.729747948110878e-06, + "loss": 1.05944147, + "memory(GiB)": 302.58, + "step": 53600, + "train_speed(iter/s)": 0.124896 + }, + { + "acc": 0.71083479, + "epoch": 0.2998684370574081, + "grad_norm": 6.625, + "learning_rate": 9.729447972740277e-06, + "loss": 1.15914516, + "memory(GiB)": 302.58, + "step": 53620, + "train_speed(iter/s)": 0.124917 + }, + { + "acc": 0.72082229, + "epoch": 0.2999802865303874, + "grad_norm": 11.25, + "learning_rate": 9.729147835607564e-06, + "loss": 1.09803314, + "memory(GiB)": 302.58, + "step": 53640, + "train_speed(iter/s)": 0.124939 + }, + { + "acc": 0.74329381, + "epoch": 0.30009213600336665, + "grad_norm": 7.46875, + "learning_rate": 9.728847536722996e-06, + "loss": 1.01209183, + "memory(GiB)": 302.58, + "step": 53660, + "train_speed(iter/s)": 0.124959 + }, + { + "acc": 0.74057136, + "epoch": 0.3002039854763459, + "grad_norm": 8.1875, + "learning_rate": 9.72854707609685e-06, + "loss": 1.02453117, + "memory(GiB)": 302.58, + "step": 53680, + "train_speed(iter/s)": 0.124981 + }, + { + "acc": 0.73877163, + "epoch": 0.3003158349493252, + "grad_norm": 8.0625, + "learning_rate": 9.728246453739399e-06, + "loss": 1.0195652, + "memory(GiB)": 302.58, + "step": 53700, + "train_speed(iter/s)": 0.125003 + }, + { + "acc": 0.72506342, + "epoch": 0.30042768442230444, + "grad_norm": 6.0625, + "learning_rate": 9.727945669660925e-06, + "loss": 1.10940084, + "memory(GiB)": 302.58, + "step": 53720, + "train_speed(iter/s)": 0.125025 + }, + { + "acc": 0.7290339, + "epoch": 0.3005395338952837, + "grad_norm": 12.75, + "learning_rate": 9.72764472387172e-06, + "loss": 1.0620471, + "memory(GiB)": 302.58, + "step": 53740, + "train_speed(iter/s)": 0.125047 + }, + { + "acc": 0.75306101, + "epoch": 0.30065138336826297, + "grad_norm": 6.375, + "learning_rate": 9.727343616382073e-06, + "loss": 0.95578699, + "memory(GiB)": 302.58, + "step": 53760, + "train_speed(iter/s)": 0.125069 + }, + { + "acc": 0.74176826, + "epoch": 0.30076323284124223, + "grad_norm": 6.46875, + "learning_rate": 9.727042347202283e-06, + "loss": 1.01211529, + "memory(GiB)": 302.58, + "step": 53780, + "train_speed(iter/s)": 0.12509 + }, + { + "acc": 0.73490009, + "epoch": 0.3008750823142215, + "grad_norm": 7.625, + "learning_rate": 9.726740916342657e-06, + "loss": 1.04917631, + "memory(GiB)": 302.58, + "step": 53800, + "train_speed(iter/s)": 0.125113 + }, + { + "acc": 0.72091069, + "epoch": 0.30098693178720076, + "grad_norm": 6.15625, + "learning_rate": 9.726439323813503e-06, + "loss": 1.10645256, + "memory(GiB)": 302.58, + "step": 53820, + "train_speed(iter/s)": 0.125134 + }, + { + "acc": 0.74639745, + "epoch": 0.30109878126018, + "grad_norm": 9.5625, + "learning_rate": 9.726137569625138e-06, + "loss": 0.99204569, + "memory(GiB)": 302.58, + "step": 53840, + "train_speed(iter/s)": 0.125155 + }, + { + "acc": 0.73414588, + "epoch": 0.3012106307331593, + "grad_norm": 6.21875, + "learning_rate": 9.725835653787883e-06, + "loss": 1.03413906, + "memory(GiB)": 302.58, + "step": 53860, + "train_speed(iter/s)": 0.125177 + }, + { + "acc": 0.70699229, + "epoch": 0.30132248020613855, + "grad_norm": 5.8125, + "learning_rate": 9.725533576312061e-06, + "loss": 1.17887077, + "memory(GiB)": 302.58, + "step": 53880, + "train_speed(iter/s)": 0.125197 + }, + { + "acc": 0.73741021, + "epoch": 0.3014343296791178, + "grad_norm": 5.71875, + "learning_rate": 9.725231337208007e-06, + "loss": 1.03887701, + "memory(GiB)": 302.58, + "step": 53900, + "train_speed(iter/s)": 0.125218 + }, + { + "acc": 0.73605814, + "epoch": 0.3015461791520971, + "grad_norm": 6.15625, + "learning_rate": 9.724928936486058e-06, + "loss": 1.06178865, + "memory(GiB)": 302.58, + "step": 53920, + "train_speed(iter/s)": 0.125239 + }, + { + "acc": 0.74294057, + "epoch": 0.30165802862507635, + "grad_norm": 6.34375, + "learning_rate": 9.724626374156558e-06, + "loss": 1.01550713, + "memory(GiB)": 302.58, + "step": 53940, + "train_speed(iter/s)": 0.125261 + }, + { + "acc": 0.73900561, + "epoch": 0.3017698780980556, + "grad_norm": 6.125, + "learning_rate": 9.724323650229854e-06, + "loss": 1.02340508, + "memory(GiB)": 302.58, + "step": 53960, + "train_speed(iter/s)": 0.125279 + }, + { + "acc": 0.73614917, + "epoch": 0.3018817275710349, + "grad_norm": 3.9375, + "learning_rate": 9.7240207647163e-06, + "loss": 1.03154688, + "memory(GiB)": 302.58, + "step": 53980, + "train_speed(iter/s)": 0.1253 + }, + { + "acc": 0.73118286, + "epoch": 0.30199357704401414, + "grad_norm": 4.9375, + "learning_rate": 9.723717717626258e-06, + "loss": 1.04704742, + "memory(GiB)": 302.58, + "step": 54000, + "train_speed(iter/s)": 0.125321 + }, + { + "epoch": 0.30199357704401414, + "eval_acc": 0.6983934374540616, + "eval_loss": 1.0501792430877686, + "eval_runtime": 7497.4334, + "eval_samples_per_second": 10.041, + "eval_steps_per_second": 10.041, + "step": 54000 + }, + { + "acc": 0.74173641, + "epoch": 0.3021054265169934, + "grad_norm": 9.0625, + "learning_rate": 9.723414508970091e-06, + "loss": 1.02790327, + "memory(GiB)": 302.58, + "step": 54020, + "train_speed(iter/s)": 0.123167 + }, + { + "acc": 0.72863479, + "epoch": 0.3022172759899727, + "grad_norm": 8.125, + "learning_rate": 9.72311113875817e-06, + "loss": 1.05581942, + "memory(GiB)": 302.58, + "step": 54040, + "train_speed(iter/s)": 0.12319 + }, + { + "acc": 0.73134937, + "epoch": 0.302329125462952, + "grad_norm": 6.09375, + "learning_rate": 9.72280760700087e-06, + "loss": 1.05938463, + "memory(GiB)": 302.58, + "step": 54060, + "train_speed(iter/s)": 0.123211 + }, + { + "acc": 0.72203898, + "epoch": 0.30244097493593125, + "grad_norm": 7.21875, + "learning_rate": 9.722503913708576e-06, + "loss": 1.10930176, + "memory(GiB)": 302.58, + "step": 54080, + "train_speed(iter/s)": 0.123233 + }, + { + "acc": 0.73506942, + "epoch": 0.3025528244089105, + "grad_norm": 5.40625, + "learning_rate": 9.722200058891673e-06, + "loss": 1.05479202, + "memory(GiB)": 302.58, + "step": 54100, + "train_speed(iter/s)": 0.123254 + }, + { + "acc": 0.75705781, + "epoch": 0.3026646738818898, + "grad_norm": 6.90625, + "learning_rate": 9.721896042560554e-06, + "loss": 0.96015196, + "memory(GiB)": 302.58, + "step": 54120, + "train_speed(iter/s)": 0.123276 + }, + { + "acc": 0.75155897, + "epoch": 0.30277652335486904, + "grad_norm": 5.75, + "learning_rate": 9.72159186472562e-06, + "loss": 0.95802422, + "memory(GiB)": 302.58, + "step": 54140, + "train_speed(iter/s)": 0.123298 + }, + { + "acc": 0.72911868, + "epoch": 0.3028883728278483, + "grad_norm": 6.625, + "learning_rate": 9.721287525397269e-06, + "loss": 1.07538471, + "memory(GiB)": 302.58, + "step": 54160, + "train_speed(iter/s)": 0.123319 + }, + { + "acc": 0.73119516, + "epoch": 0.30300022230082757, + "grad_norm": 6.03125, + "learning_rate": 9.720983024585915e-06, + "loss": 1.04706955, + "memory(GiB)": 302.58, + "step": 54180, + "train_speed(iter/s)": 0.12334 + }, + { + "acc": 0.73578625, + "epoch": 0.30311207177380683, + "grad_norm": 6.0625, + "learning_rate": 9.720678362301972e-06, + "loss": 1.03020153, + "memory(GiB)": 302.58, + "step": 54200, + "train_speed(iter/s)": 0.123361 + }, + { + "acc": 0.72517314, + "epoch": 0.3032239212467861, + "grad_norm": 7.0625, + "learning_rate": 9.72037353855586e-06, + "loss": 1.08796501, + "memory(GiB)": 302.58, + "step": 54220, + "train_speed(iter/s)": 0.123382 + }, + { + "acc": 0.74284258, + "epoch": 0.30333577071976536, + "grad_norm": 7.25, + "learning_rate": 9.720068553358006e-06, + "loss": 1.0113204, + "memory(GiB)": 302.58, + "step": 54240, + "train_speed(iter/s)": 0.123402 + }, + { + "acc": 0.72859464, + "epoch": 0.3034476201927446, + "grad_norm": 5.5, + "learning_rate": 9.71976340671884e-06, + "loss": 1.07845716, + "memory(GiB)": 302.58, + "step": 54260, + "train_speed(iter/s)": 0.123425 + }, + { + "acc": 0.72210803, + "epoch": 0.3035594696657239, + "grad_norm": 5.6875, + "learning_rate": 9.719458098648799e-06, + "loss": 1.08702049, + "memory(GiB)": 302.58, + "step": 54280, + "train_speed(iter/s)": 0.123447 + }, + { + "acc": 0.73380075, + "epoch": 0.30367131913870316, + "grad_norm": 8.875, + "learning_rate": 9.719152629158327e-06, + "loss": 1.06077347, + "memory(GiB)": 302.58, + "step": 54300, + "train_speed(iter/s)": 0.123469 + }, + { + "acc": 0.733986, + "epoch": 0.3037831686116824, + "grad_norm": 5.59375, + "learning_rate": 9.71884699825787e-06, + "loss": 1.04832001, + "memory(GiB)": 302.58, + "step": 54320, + "train_speed(iter/s)": 0.12349 + }, + { + "acc": 0.73539681, + "epoch": 0.3038950180846617, + "grad_norm": 5.5625, + "learning_rate": 9.718541205957883e-06, + "loss": 1.03720627, + "memory(GiB)": 302.58, + "step": 54340, + "train_speed(iter/s)": 0.123512 + }, + { + "acc": 0.74946442, + "epoch": 0.30400686755764095, + "grad_norm": 5.53125, + "learning_rate": 9.718235252268824e-06, + "loss": 0.9853941, + "memory(GiB)": 302.58, + "step": 54360, + "train_speed(iter/s)": 0.123535 + }, + { + "acc": 0.71732492, + "epoch": 0.3041187170306202, + "grad_norm": 6.5, + "learning_rate": 9.717929137201158e-06, + "loss": 1.1189044, + "memory(GiB)": 302.58, + "step": 54380, + "train_speed(iter/s)": 0.123557 + }, + { + "acc": 0.72990294, + "epoch": 0.3042305665035995, + "grad_norm": 8.0625, + "learning_rate": 9.717622860765357e-06, + "loss": 1.08277988, + "memory(GiB)": 302.58, + "step": 54400, + "train_speed(iter/s)": 0.123578 + }, + { + "acc": 0.73208194, + "epoch": 0.30434241597657874, + "grad_norm": 6.96875, + "learning_rate": 9.717316422971894e-06, + "loss": 1.06314745, + "memory(GiB)": 302.58, + "step": 54420, + "train_speed(iter/s)": 0.123599 + }, + { + "acc": 0.72156572, + "epoch": 0.304454265449558, + "grad_norm": 4.75, + "learning_rate": 9.717009823831251e-06, + "loss": 1.11813946, + "memory(GiB)": 302.58, + "step": 54440, + "train_speed(iter/s)": 0.12362 + }, + { + "acc": 0.74460425, + "epoch": 0.30456611492253727, + "grad_norm": 5.375, + "learning_rate": 9.716703063353917e-06, + "loss": 0.99112244, + "memory(GiB)": 302.58, + "step": 54460, + "train_speed(iter/s)": 0.123641 + }, + { + "acc": 0.72449651, + "epoch": 0.30467796439551653, + "grad_norm": 6.75, + "learning_rate": 9.71639614155038e-06, + "loss": 1.10520563, + "memory(GiB)": 302.58, + "step": 54480, + "train_speed(iter/s)": 0.123662 + }, + { + "acc": 0.74207029, + "epoch": 0.3047898138684958, + "grad_norm": 8.75, + "learning_rate": 9.71608905843114e-06, + "loss": 1.01878901, + "memory(GiB)": 302.58, + "step": 54500, + "train_speed(iter/s)": 0.123683 + }, + { + "acc": 0.73148875, + "epoch": 0.30490166334147506, + "grad_norm": 5.375, + "learning_rate": 9.7157818140067e-06, + "loss": 1.09266119, + "memory(GiB)": 302.58, + "step": 54520, + "train_speed(iter/s)": 0.123705 + }, + { + "acc": 0.73418646, + "epoch": 0.3050135128144543, + "grad_norm": 9.125, + "learning_rate": 9.715474408287566e-06, + "loss": 1.04811211, + "memory(GiB)": 302.58, + "step": 54540, + "train_speed(iter/s)": 0.123726 + }, + { + "acc": 0.7310895, + "epoch": 0.3051253622874336, + "grad_norm": 6.75, + "learning_rate": 9.715166841284258e-06, + "loss": 1.04906282, + "memory(GiB)": 302.58, + "step": 54560, + "train_speed(iter/s)": 0.123748 + }, + { + "acc": 0.7239749, + "epoch": 0.30523721176041285, + "grad_norm": 8.375, + "learning_rate": 9.714859113007292e-06, + "loss": 1.11004686, + "memory(GiB)": 302.58, + "step": 54580, + "train_speed(iter/s)": 0.123768 + }, + { + "acc": 0.72783709, + "epoch": 0.3053490612333921, + "grad_norm": 7.28125, + "learning_rate": 9.714551223467194e-06, + "loss": 1.07258873, + "memory(GiB)": 302.58, + "step": 54600, + "train_speed(iter/s)": 0.123789 + }, + { + "acc": 0.73851461, + "epoch": 0.3054609107063714, + "grad_norm": 7.96875, + "learning_rate": 9.714243172674495e-06, + "loss": 1.03110533, + "memory(GiB)": 302.58, + "step": 54620, + "train_speed(iter/s)": 0.123812 + }, + { + "acc": 0.72817745, + "epoch": 0.30557276017935064, + "grad_norm": 6.65625, + "learning_rate": 9.71393496063973e-06, + "loss": 1.07416611, + "memory(GiB)": 302.58, + "step": 54640, + "train_speed(iter/s)": 0.123834 + }, + { + "acc": 0.75061769, + "epoch": 0.3056846096523299, + "grad_norm": 6.875, + "learning_rate": 9.713626587373443e-06, + "loss": 0.974016, + "memory(GiB)": 302.58, + "step": 54660, + "train_speed(iter/s)": 0.123855 + }, + { + "acc": 0.745368, + "epoch": 0.3057964591253092, + "grad_norm": 7.65625, + "learning_rate": 9.713318052886177e-06, + "loss": 1.00156574, + "memory(GiB)": 302.58, + "step": 54680, + "train_speed(iter/s)": 0.123874 + }, + { + "acc": 0.71588879, + "epoch": 0.30590830859828844, + "grad_norm": 6.625, + "learning_rate": 9.713009357188493e-06, + "loss": 1.12481279, + "memory(GiB)": 302.58, + "step": 54700, + "train_speed(iter/s)": 0.123894 + }, + { + "acc": 0.73748751, + "epoch": 0.3060201580712677, + "grad_norm": 7.75, + "learning_rate": 9.712700500290941e-06, + "loss": 1.03691483, + "memory(GiB)": 302.58, + "step": 54720, + "train_speed(iter/s)": 0.123916 + }, + { + "acc": 0.71367936, + "epoch": 0.30613200754424696, + "grad_norm": 7.5, + "learning_rate": 9.71239148220409e-06, + "loss": 1.16210718, + "memory(GiB)": 302.58, + "step": 54740, + "train_speed(iter/s)": 0.123936 + }, + { + "acc": 0.73397708, + "epoch": 0.30624385701722623, + "grad_norm": 4.65625, + "learning_rate": 9.712082302938507e-06, + "loss": 1.06414061, + "memory(GiB)": 302.58, + "step": 54760, + "train_speed(iter/s)": 0.123956 + }, + { + "acc": 0.73754129, + "epoch": 0.3063557064902055, + "grad_norm": 7.0, + "learning_rate": 9.711772962504769e-06, + "loss": 1.02512922, + "memory(GiB)": 302.58, + "step": 54780, + "train_speed(iter/s)": 0.123977 + }, + { + "acc": 0.72836633, + "epoch": 0.30646755596318476, + "grad_norm": 5.25, + "learning_rate": 9.711463460913453e-06, + "loss": 1.08456087, + "memory(GiB)": 302.58, + "step": 54800, + "train_speed(iter/s)": 0.123999 + }, + { + "acc": 0.72251105, + "epoch": 0.306579405436164, + "grad_norm": 6.40625, + "learning_rate": 9.711153798175148e-06, + "loss": 1.11972914, + "memory(GiB)": 302.58, + "step": 54820, + "train_speed(iter/s)": 0.124021 + }, + { + "acc": 0.74597397, + "epoch": 0.3066912549091433, + "grad_norm": 8.25, + "learning_rate": 9.710843974300443e-06, + "loss": 0.9891552, + "memory(GiB)": 302.58, + "step": 54840, + "train_speed(iter/s)": 0.124041 + }, + { + "acc": 0.72993331, + "epoch": 0.30680310438212255, + "grad_norm": 8.8125, + "learning_rate": 9.710533989299939e-06, + "loss": 1.05631895, + "memory(GiB)": 302.58, + "step": 54860, + "train_speed(iter/s)": 0.124063 + }, + { + "acc": 0.74699683, + "epoch": 0.3069149538551018, + "grad_norm": 5.96875, + "learning_rate": 9.710223843184235e-06, + "loss": 1.00309772, + "memory(GiB)": 302.58, + "step": 54880, + "train_speed(iter/s)": 0.124083 + }, + { + "acc": 0.74331141, + "epoch": 0.3070268033280811, + "grad_norm": 6.28125, + "learning_rate": 9.709913535963938e-06, + "loss": 1.01679316, + "memory(GiB)": 302.58, + "step": 54900, + "train_speed(iter/s)": 0.124104 + }, + { + "acc": 0.7414536, + "epoch": 0.30713865280106034, + "grad_norm": 6.5, + "learning_rate": 9.709603067649665e-06, + "loss": 1.0085454, + "memory(GiB)": 302.58, + "step": 54920, + "train_speed(iter/s)": 0.124126 + }, + { + "acc": 0.74506598, + "epoch": 0.3072505022740396, + "grad_norm": 7.71875, + "learning_rate": 9.709292438252033e-06, + "loss": 1.01393232, + "memory(GiB)": 302.58, + "step": 54940, + "train_speed(iter/s)": 0.124147 + }, + { + "acc": 0.74919634, + "epoch": 0.30736235174701887, + "grad_norm": 7.09375, + "learning_rate": 9.708981647781666e-06, + "loss": 0.948563, + "memory(GiB)": 302.58, + "step": 54960, + "train_speed(iter/s)": 0.124168 + }, + { + "acc": 0.7298224, + "epoch": 0.30747420121999813, + "grad_norm": 9.0625, + "learning_rate": 9.708670696249196e-06, + "loss": 1.06793966, + "memory(GiB)": 302.58, + "step": 54980, + "train_speed(iter/s)": 0.124188 + }, + { + "acc": 0.73460422, + "epoch": 0.3075860506929774, + "grad_norm": 5.125, + "learning_rate": 9.708359583665259e-06, + "loss": 1.0151022, + "memory(GiB)": 302.58, + "step": 55000, + "train_speed(iter/s)": 0.124209 + }, + { + "acc": 0.72629471, + "epoch": 0.30769790016595666, + "grad_norm": 8.125, + "learning_rate": 9.708048310040492e-06, + "loss": 1.08592501, + "memory(GiB)": 302.58, + "step": 55020, + "train_speed(iter/s)": 0.12423 + }, + { + "acc": 0.74149108, + "epoch": 0.3078097496389359, + "grad_norm": 6.875, + "learning_rate": 9.707736875385544e-06, + "loss": 1.03349562, + "memory(GiB)": 302.58, + "step": 55040, + "train_speed(iter/s)": 0.124251 + }, + { + "acc": 0.73100567, + "epoch": 0.3079215991119152, + "grad_norm": 6.71875, + "learning_rate": 9.707425279711067e-06, + "loss": 1.05910978, + "memory(GiB)": 302.58, + "step": 55060, + "train_speed(iter/s)": 0.124271 + }, + { + "acc": 0.73317266, + "epoch": 0.30803344858489445, + "grad_norm": 5.78125, + "learning_rate": 9.70711352302772e-06, + "loss": 1.04576521, + "memory(GiB)": 302.58, + "step": 55080, + "train_speed(iter/s)": 0.124291 + }, + { + "acc": 0.72981534, + "epoch": 0.3081452980578737, + "grad_norm": 7.0625, + "learning_rate": 9.706801605346163e-06, + "loss": 1.07756109, + "memory(GiB)": 302.58, + "step": 55100, + "train_speed(iter/s)": 0.124312 + }, + { + "acc": 0.73679872, + "epoch": 0.308257147530853, + "grad_norm": 8.5625, + "learning_rate": 9.706489526677068e-06, + "loss": 1.04472427, + "memory(GiB)": 302.58, + "step": 55120, + "train_speed(iter/s)": 0.124333 + }, + { + "acc": 0.73340006, + "epoch": 0.30836899700383225, + "grad_norm": 7.15625, + "learning_rate": 9.706177287031106e-06, + "loss": 1.0631937, + "memory(GiB)": 302.58, + "step": 55140, + "train_speed(iter/s)": 0.124354 + }, + { + "acc": 0.75361066, + "epoch": 0.3084808464768115, + "grad_norm": 5.65625, + "learning_rate": 9.705864886418959e-06, + "loss": 0.96656933, + "memory(GiB)": 302.58, + "step": 55160, + "train_speed(iter/s)": 0.124374 + }, + { + "acc": 0.73992896, + "epoch": 0.3085926959497908, + "grad_norm": 7.15625, + "learning_rate": 9.70555232485131e-06, + "loss": 1.04358473, + "memory(GiB)": 302.58, + "step": 55180, + "train_speed(iter/s)": 0.124394 + }, + { + "acc": 0.74002762, + "epoch": 0.30870454542277004, + "grad_norm": 7.0625, + "learning_rate": 9.70523960233885e-06, + "loss": 1.01130571, + "memory(GiB)": 302.58, + "step": 55200, + "train_speed(iter/s)": 0.124415 + }, + { + "acc": 0.72067695, + "epoch": 0.3088163948957493, + "grad_norm": 6.15625, + "learning_rate": 9.704926718892279e-06, + "loss": 1.12378387, + "memory(GiB)": 302.58, + "step": 55220, + "train_speed(iter/s)": 0.124434 + }, + { + "acc": 0.75272317, + "epoch": 0.30892824436872857, + "grad_norm": 6.25, + "learning_rate": 9.704613674522291e-06, + "loss": 0.95525064, + "memory(GiB)": 302.58, + "step": 55240, + "train_speed(iter/s)": 0.124455 + }, + { + "acc": 0.73803182, + "epoch": 0.30904009384170783, + "grad_norm": 7.34375, + "learning_rate": 9.7043004692396e-06, + "loss": 1.01932592, + "memory(GiB)": 302.58, + "step": 55260, + "train_speed(iter/s)": 0.124476 + }, + { + "acc": 0.73130946, + "epoch": 0.3091519433146871, + "grad_norm": 6.21875, + "learning_rate": 9.703987103054914e-06, + "loss": 1.06404266, + "memory(GiB)": 302.58, + "step": 55280, + "train_speed(iter/s)": 0.124498 + }, + { + "acc": 0.73920527, + "epoch": 0.30926379278766636, + "grad_norm": 9.25, + "learning_rate": 9.703673575978956e-06, + "loss": 1.03402424, + "memory(GiB)": 302.58, + "step": 55300, + "train_speed(iter/s)": 0.124518 + }, + { + "acc": 0.72444906, + "epoch": 0.3093756422606456, + "grad_norm": 5.875, + "learning_rate": 9.703359888022444e-06, + "loss": 1.09948072, + "memory(GiB)": 302.58, + "step": 55320, + "train_speed(iter/s)": 0.124539 + }, + { + "acc": 0.73106489, + "epoch": 0.3094874917336249, + "grad_norm": 6.84375, + "learning_rate": 9.703046039196112e-06, + "loss": 1.08537931, + "memory(GiB)": 302.58, + "step": 55340, + "train_speed(iter/s)": 0.124561 + }, + { + "acc": 0.74652767, + "epoch": 0.30959934120660415, + "grad_norm": 5.5, + "learning_rate": 9.702732029510691e-06, + "loss": 1.00472898, + "memory(GiB)": 302.58, + "step": 55360, + "train_speed(iter/s)": 0.12458 + }, + { + "acc": 0.7177371, + "epoch": 0.3097111906795834, + "grad_norm": 7.46875, + "learning_rate": 9.702417858976922e-06, + "loss": 1.12873945, + "memory(GiB)": 302.58, + "step": 55380, + "train_speed(iter/s)": 0.1246 + }, + { + "acc": 0.71816249, + "epoch": 0.3098230401525627, + "grad_norm": 8.3125, + "learning_rate": 9.702103527605554e-06, + "loss": 1.11288128, + "memory(GiB)": 302.58, + "step": 55400, + "train_speed(iter/s)": 0.124622 + }, + { + "acc": 0.72301683, + "epoch": 0.30993488962554194, + "grad_norm": 6.65625, + "learning_rate": 9.701789035407335e-06, + "loss": 1.09905767, + "memory(GiB)": 302.58, + "step": 55420, + "train_speed(iter/s)": 0.124642 + }, + { + "acc": 0.72621245, + "epoch": 0.3100467390985212, + "grad_norm": 8.25, + "learning_rate": 9.70147438239302e-06, + "loss": 1.06705379, + "memory(GiB)": 302.58, + "step": 55440, + "train_speed(iter/s)": 0.124664 + }, + { + "acc": 0.73869524, + "epoch": 0.31015858857150047, + "grad_norm": 8.125, + "learning_rate": 9.701159568573372e-06, + "loss": 1.02361689, + "memory(GiB)": 302.58, + "step": 55460, + "train_speed(iter/s)": 0.124686 + }, + { + "acc": 0.74401159, + "epoch": 0.31027043804447973, + "grad_norm": 6.90625, + "learning_rate": 9.700844593959163e-06, + "loss": 1.00000019, + "memory(GiB)": 302.58, + "step": 55480, + "train_speed(iter/s)": 0.124706 + }, + { + "acc": 0.72722583, + "epoch": 0.310382287517459, + "grad_norm": 8.0625, + "learning_rate": 9.700529458561161e-06, + "loss": 1.06268578, + "memory(GiB)": 302.58, + "step": 55500, + "train_speed(iter/s)": 0.124726 + }, + { + "acc": 0.73460646, + "epoch": 0.31049413699043826, + "grad_norm": 6.5625, + "learning_rate": 9.700214162390149e-06, + "loss": 1.03812628, + "memory(GiB)": 302.58, + "step": 55520, + "train_speed(iter/s)": 0.124747 + }, + { + "acc": 0.72575641, + "epoch": 0.3106059864634175, + "grad_norm": 8.1875, + "learning_rate": 9.699898705456906e-06, + "loss": 1.08850374, + "memory(GiB)": 302.58, + "step": 55540, + "train_speed(iter/s)": 0.124767 + }, + { + "acc": 0.72445917, + "epoch": 0.3107178359363968, + "grad_norm": 5.53125, + "learning_rate": 9.699583087772225e-06, + "loss": 1.10333385, + "memory(GiB)": 302.58, + "step": 55560, + "train_speed(iter/s)": 0.124787 + }, + { + "acc": 0.73265605, + "epoch": 0.31082968540937606, + "grad_norm": 10.625, + "learning_rate": 9.6992673093469e-06, + "loss": 1.06719637, + "memory(GiB)": 302.58, + "step": 55580, + "train_speed(iter/s)": 0.124808 + }, + { + "acc": 0.72430286, + "epoch": 0.3109415348823553, + "grad_norm": 5.84375, + "learning_rate": 9.698951370191734e-06, + "loss": 1.09389467, + "memory(GiB)": 302.58, + "step": 55600, + "train_speed(iter/s)": 0.124829 + }, + { + "acc": 0.7404737, + "epoch": 0.3110533843553346, + "grad_norm": 7.09375, + "learning_rate": 9.698635270317529e-06, + "loss": 0.99978151, + "memory(GiB)": 302.58, + "step": 55620, + "train_speed(iter/s)": 0.12485 + }, + { + "acc": 0.75503693, + "epoch": 0.31116523382831385, + "grad_norm": 7.34375, + "learning_rate": 9.6983190097351e-06, + "loss": 0.95303516, + "memory(GiB)": 302.58, + "step": 55640, + "train_speed(iter/s)": 0.124871 + }, + { + "acc": 0.72644958, + "epoch": 0.3112770833012931, + "grad_norm": 4.875, + "learning_rate": 9.698002588455264e-06, + "loss": 1.06873941, + "memory(GiB)": 302.58, + "step": 55660, + "train_speed(iter/s)": 0.124893 + }, + { + "acc": 0.74890122, + "epoch": 0.3113889327742724, + "grad_norm": 6.875, + "learning_rate": 9.69768600648884e-06, + "loss": 0.98662443, + "memory(GiB)": 302.58, + "step": 55680, + "train_speed(iter/s)": 0.124914 + }, + { + "acc": 0.74296212, + "epoch": 0.31150078224725164, + "grad_norm": 9.875, + "learning_rate": 9.69736926384666e-06, + "loss": 1.0127984, + "memory(GiB)": 302.58, + "step": 55700, + "train_speed(iter/s)": 0.124933 + }, + { + "acc": 0.74771714, + "epoch": 0.3116126317202309, + "grad_norm": 7.15625, + "learning_rate": 9.697052360539556e-06, + "loss": 0.97941647, + "memory(GiB)": 302.58, + "step": 55720, + "train_speed(iter/s)": 0.124954 + }, + { + "acc": 0.72658, + "epoch": 0.31172448119321017, + "grad_norm": 4.3125, + "learning_rate": 9.696735296578367e-06, + "loss": 1.08000269, + "memory(GiB)": 302.58, + "step": 55740, + "train_speed(iter/s)": 0.124974 + }, + { + "acc": 0.72359333, + "epoch": 0.31183633066618943, + "grad_norm": 7.59375, + "learning_rate": 9.696418071973938e-06, + "loss": 1.09482841, + "memory(GiB)": 302.58, + "step": 55760, + "train_speed(iter/s)": 0.124994 + }, + { + "acc": 0.72962737, + "epoch": 0.3119481801391687, + "grad_norm": 7.4375, + "learning_rate": 9.69610068673712e-06, + "loss": 1.07414074, + "memory(GiB)": 302.58, + "step": 55780, + "train_speed(iter/s)": 0.125014 + }, + { + "acc": 0.74024744, + "epoch": 0.31206002961214796, + "grad_norm": 6.6875, + "learning_rate": 9.695783140878765e-06, + "loss": 1.01130915, + "memory(GiB)": 302.58, + "step": 55800, + "train_speed(iter/s)": 0.125035 + }, + { + "acc": 0.73036156, + "epoch": 0.3121718790851272, + "grad_norm": 10.8125, + "learning_rate": 9.695465434409739e-06, + "loss": 1.06216803, + "memory(GiB)": 302.58, + "step": 55820, + "train_speed(iter/s)": 0.125056 + }, + { + "acc": 0.73690829, + "epoch": 0.3122837285581065, + "grad_norm": 6.5, + "learning_rate": 9.695147567340906e-06, + "loss": 1.04289227, + "memory(GiB)": 302.58, + "step": 55840, + "train_speed(iter/s)": 0.125076 + }, + { + "acc": 0.73135509, + "epoch": 0.31239557803108575, + "grad_norm": 7.53125, + "learning_rate": 9.694829539683138e-06, + "loss": 1.0560235, + "memory(GiB)": 302.58, + "step": 55860, + "train_speed(iter/s)": 0.125095 + }, + { + "acc": 0.74361076, + "epoch": 0.312507427504065, + "grad_norm": 7.5625, + "learning_rate": 9.694511351447312e-06, + "loss": 0.98964968, + "memory(GiB)": 302.58, + "step": 55880, + "train_speed(iter/s)": 0.125116 + }, + { + "acc": 0.73857446, + "epoch": 0.3126192769770443, + "grad_norm": 7.25, + "learning_rate": 9.694193002644314e-06, + "loss": 1.03025694, + "memory(GiB)": 302.58, + "step": 55900, + "train_speed(iter/s)": 0.125136 + }, + { + "acc": 0.73329654, + "epoch": 0.31273112645002354, + "grad_norm": 7.5, + "learning_rate": 9.693874493285028e-06, + "loss": 1.03913374, + "memory(GiB)": 302.58, + "step": 55920, + "train_speed(iter/s)": 0.125155 + }, + { + "acc": 0.71891875, + "epoch": 0.3128429759230028, + "grad_norm": 9.625, + "learning_rate": 9.693555823380352e-06, + "loss": 1.12868681, + "memory(GiB)": 302.58, + "step": 55940, + "train_speed(iter/s)": 0.125175 + }, + { + "acc": 0.74068303, + "epoch": 0.3129548253959821, + "grad_norm": 8.3125, + "learning_rate": 9.693236992941183e-06, + "loss": 1.0151638, + "memory(GiB)": 302.58, + "step": 55960, + "train_speed(iter/s)": 0.125196 + }, + { + "acc": 0.71676083, + "epoch": 0.31306667486896134, + "grad_norm": 6.0, + "learning_rate": 9.692918001978428e-06, + "loss": 1.12568045, + "memory(GiB)": 302.58, + "step": 55980, + "train_speed(iter/s)": 0.125216 + }, + { + "acc": 0.75311408, + "epoch": 0.3131785243419406, + "grad_norm": 5.96875, + "learning_rate": 9.692598850502996e-06, + "loss": 0.96193399, + "memory(GiB)": 302.58, + "step": 56000, + "train_speed(iter/s)": 0.125238 + }, + { + "epoch": 0.3131785243419406, + "eval_acc": 0.6984176421761218, + "eval_loss": 1.0490608215332031, + "eval_runtime": 7559.4162, + "eval_samples_per_second": 9.959, + "eval_steps_per_second": 9.959, + "step": 56000 + }, + { + "acc": 0.72961807, + "epoch": 0.31329037381491986, + "grad_norm": 6.34375, + "learning_rate": 9.692279538525802e-06, + "loss": 1.05879087, + "memory(GiB)": 302.58, + "step": 56020, + "train_speed(iter/s)": 0.123147 + }, + { + "acc": 0.73020568, + "epoch": 0.31340222328789913, + "grad_norm": 6.09375, + "learning_rate": 9.691960066057772e-06, + "loss": 1.03921957, + "memory(GiB)": 302.58, + "step": 56040, + "train_speed(iter/s)": 0.123168 + }, + { + "acc": 0.73016233, + "epoch": 0.3135140727608784, + "grad_norm": 8.125, + "learning_rate": 9.691640433109828e-06, + "loss": 1.07112827, + "memory(GiB)": 302.58, + "step": 56060, + "train_speed(iter/s)": 0.123189 + }, + { + "acc": 0.73742523, + "epoch": 0.31362592223385766, + "grad_norm": 7.0625, + "learning_rate": 9.691320639692905e-06, + "loss": 1.02706251, + "memory(GiB)": 302.58, + "step": 56080, + "train_speed(iter/s)": 0.123209 + }, + { + "acc": 0.73782163, + "epoch": 0.3137377717068369, + "grad_norm": 8.1875, + "learning_rate": 9.69100068581794e-06, + "loss": 1.02943125, + "memory(GiB)": 302.58, + "step": 56100, + "train_speed(iter/s)": 0.123228 + }, + { + "acc": 0.73921208, + "epoch": 0.3138496211798162, + "grad_norm": 7.09375, + "learning_rate": 9.690680571495876e-06, + "loss": 1.0322216, + "memory(GiB)": 302.58, + "step": 56120, + "train_speed(iter/s)": 0.12325 + }, + { + "acc": 0.7317173, + "epoch": 0.31396147065279545, + "grad_norm": 8.5, + "learning_rate": 9.690360296737664e-06, + "loss": 1.05595541, + "memory(GiB)": 302.58, + "step": 56140, + "train_speed(iter/s)": 0.12327 + }, + { + "acc": 0.73859529, + "epoch": 0.3140733201257747, + "grad_norm": 6.34375, + "learning_rate": 9.690039861554257e-06, + "loss": 1.0321557, + "memory(GiB)": 302.58, + "step": 56160, + "train_speed(iter/s)": 0.123291 + }, + { + "acc": 0.7138938, + "epoch": 0.314185169598754, + "grad_norm": 5.28125, + "learning_rate": 9.689719265956614e-06, + "loss": 1.13372927, + "memory(GiB)": 302.58, + "step": 56180, + "train_speed(iter/s)": 0.123311 + }, + { + "acc": 0.71661863, + "epoch": 0.31429701907173324, + "grad_norm": 7.78125, + "learning_rate": 9.689398509955703e-06, + "loss": 1.12247696, + "memory(GiB)": 302.58, + "step": 56200, + "train_speed(iter/s)": 0.123333 + }, + { + "acc": 0.74953074, + "epoch": 0.3144088685447125, + "grad_norm": 6.96875, + "learning_rate": 9.689077593562492e-06, + "loss": 0.97226267, + "memory(GiB)": 302.58, + "step": 56220, + "train_speed(iter/s)": 0.123354 + }, + { + "acc": 0.74398441, + "epoch": 0.31452071801769177, + "grad_norm": 5.28125, + "learning_rate": 9.68875651678796e-06, + "loss": 1.00230598, + "memory(GiB)": 302.58, + "step": 56240, + "train_speed(iter/s)": 0.123375 + }, + { + "acc": 0.7519887, + "epoch": 0.31463256749067103, + "grad_norm": 5.40625, + "learning_rate": 9.688435279643085e-06, + "loss": 0.96299314, + "memory(GiB)": 302.58, + "step": 56260, + "train_speed(iter/s)": 0.123395 + }, + { + "acc": 0.73774624, + "epoch": 0.3147444169636503, + "grad_norm": 6.1875, + "learning_rate": 9.688113882138858e-06, + "loss": 1.02326269, + "memory(GiB)": 302.58, + "step": 56280, + "train_speed(iter/s)": 0.123416 + }, + { + "acc": 0.7222559, + "epoch": 0.31485626643662956, + "grad_norm": 7.0, + "learning_rate": 9.687792324286273e-06, + "loss": 1.09536324, + "memory(GiB)": 302.58, + "step": 56300, + "train_speed(iter/s)": 0.123437 + }, + { + "acc": 0.74655056, + "epoch": 0.3149681159096088, + "grad_norm": 7.1875, + "learning_rate": 9.687470606096322e-06, + "loss": 0.99141178, + "memory(GiB)": 302.58, + "step": 56320, + "train_speed(iter/s)": 0.123457 + }, + { + "acc": 0.73179436, + "epoch": 0.3150799653825881, + "grad_norm": 5.28125, + "learning_rate": 9.687148727580016e-06, + "loss": 1.06835356, + "memory(GiB)": 302.58, + "step": 56340, + "train_speed(iter/s)": 0.123478 + }, + { + "acc": 0.74402637, + "epoch": 0.31519181485556735, + "grad_norm": 5.8125, + "learning_rate": 9.686826688748359e-06, + "loss": 1.01404028, + "memory(GiB)": 302.58, + "step": 56360, + "train_speed(iter/s)": 0.123498 + }, + { + "acc": 0.73744259, + "epoch": 0.3153036643285466, + "grad_norm": 5.5625, + "learning_rate": 9.68650448961237e-06, + "loss": 1.03471479, + "memory(GiB)": 302.58, + "step": 56380, + "train_speed(iter/s)": 0.123517 + }, + { + "acc": 0.72023168, + "epoch": 0.3154155138015259, + "grad_norm": 5.84375, + "learning_rate": 9.686182130183063e-06, + "loss": 1.13042831, + "memory(GiB)": 302.58, + "step": 56400, + "train_speed(iter/s)": 0.123537 + }, + { + "acc": 0.73931098, + "epoch": 0.31552736327450515, + "grad_norm": 6.125, + "learning_rate": 9.685859610471471e-06, + "loss": 1.01917315, + "memory(GiB)": 302.58, + "step": 56420, + "train_speed(iter/s)": 0.123558 + }, + { + "acc": 0.74249477, + "epoch": 0.3156392127474844, + "grad_norm": 5.75, + "learning_rate": 9.685536930488621e-06, + "loss": 1.00962095, + "memory(GiB)": 302.58, + "step": 56440, + "train_speed(iter/s)": 0.123579 + }, + { + "acc": 0.74211574, + "epoch": 0.3157510622204637, + "grad_norm": 6.96875, + "learning_rate": 9.68521409024555e-06, + "loss": 0.99388094, + "memory(GiB)": 302.58, + "step": 56460, + "train_speed(iter/s)": 0.1236 + }, + { + "acc": 0.73825908, + "epoch": 0.31586291169344294, + "grad_norm": 6.78125, + "learning_rate": 9.684891089753302e-06, + "loss": 1.02221699, + "memory(GiB)": 302.58, + "step": 56480, + "train_speed(iter/s)": 0.12362 + }, + { + "acc": 0.74557896, + "epoch": 0.3159747611664222, + "grad_norm": 8.1875, + "learning_rate": 9.684567929022921e-06, + "loss": 1.00730219, + "memory(GiB)": 302.58, + "step": 56500, + "train_speed(iter/s)": 0.123641 + }, + { + "acc": 0.73700328, + "epoch": 0.31608661063940147, + "grad_norm": 6.6875, + "learning_rate": 9.684244608065466e-06, + "loss": 1.03457909, + "memory(GiB)": 302.58, + "step": 56520, + "train_speed(iter/s)": 0.123662 + }, + { + "acc": 0.72940807, + "epoch": 0.31619846011238073, + "grad_norm": 5.9375, + "learning_rate": 9.683921126891988e-06, + "loss": 1.06809254, + "memory(GiB)": 302.58, + "step": 56540, + "train_speed(iter/s)": 0.123681 + }, + { + "acc": 0.73661513, + "epoch": 0.31631030958536005, + "grad_norm": 8.875, + "learning_rate": 9.683597485513558e-06, + "loss": 1.03140459, + "memory(GiB)": 302.58, + "step": 56560, + "train_speed(iter/s)": 0.123702 + }, + { + "acc": 0.71422439, + "epoch": 0.3164221590583393, + "grad_norm": 5.5625, + "learning_rate": 9.683273683941241e-06, + "loss": 1.14371691, + "memory(GiB)": 302.58, + "step": 56580, + "train_speed(iter/s)": 0.123723 + }, + { + "acc": 0.73910465, + "epoch": 0.3165340085313186, + "grad_norm": 6.1875, + "learning_rate": 9.682949722186114e-06, + "loss": 1.02613335, + "memory(GiB)": 302.58, + "step": 56600, + "train_speed(iter/s)": 0.123743 + }, + { + "acc": 0.75021129, + "epoch": 0.31664585800429784, + "grad_norm": 9.75, + "learning_rate": 9.682625600259259e-06, + "loss": 0.98495131, + "memory(GiB)": 302.58, + "step": 56620, + "train_speed(iter/s)": 0.123763 + }, + { + "acc": 0.73529482, + "epoch": 0.3167577074772771, + "grad_norm": 8.0625, + "learning_rate": 9.682301318171758e-06, + "loss": 1.05096149, + "memory(GiB)": 302.58, + "step": 56640, + "train_speed(iter/s)": 0.123783 + }, + { + "acc": 0.73455734, + "epoch": 0.31686955695025637, + "grad_norm": 4.875, + "learning_rate": 9.681976875934707e-06, + "loss": 1.054879, + "memory(GiB)": 302.58, + "step": 56660, + "train_speed(iter/s)": 0.123804 + }, + { + "acc": 0.73251638, + "epoch": 0.31698140642323563, + "grad_norm": 6.375, + "learning_rate": 9.681652273559198e-06, + "loss": 1.07678041, + "memory(GiB)": 302.58, + "step": 56680, + "train_speed(iter/s)": 0.123824 + }, + { + "acc": 0.72813196, + "epoch": 0.3170932558962149, + "grad_norm": 10.375, + "learning_rate": 9.681327511056338e-06, + "loss": 1.07157822, + "memory(GiB)": 302.58, + "step": 56700, + "train_speed(iter/s)": 0.123844 + }, + { + "acc": 0.72668233, + "epoch": 0.31720510536919416, + "grad_norm": 9.125, + "learning_rate": 9.681002588437234e-06, + "loss": 1.06189604, + "memory(GiB)": 302.58, + "step": 56720, + "train_speed(iter/s)": 0.123865 + }, + { + "acc": 0.73436561, + "epoch": 0.3173169548421734, + "grad_norm": 6.625, + "learning_rate": 9.680677505712997e-06, + "loss": 1.03790369, + "memory(GiB)": 302.58, + "step": 56740, + "train_speed(iter/s)": 0.123886 + }, + { + "acc": 0.7214262, + "epoch": 0.3174288043151527, + "grad_norm": 6.1875, + "learning_rate": 9.680352262894746e-06, + "loss": 1.10092392, + "memory(GiB)": 302.58, + "step": 56760, + "train_speed(iter/s)": 0.123907 + }, + { + "acc": 0.75077429, + "epoch": 0.31754065378813195, + "grad_norm": 9.4375, + "learning_rate": 9.680026859993608e-06, + "loss": 0.97345791, + "memory(GiB)": 302.58, + "step": 56780, + "train_speed(iter/s)": 0.123928 + }, + { + "acc": 0.73871627, + "epoch": 0.3176525032611112, + "grad_norm": 7.59375, + "learning_rate": 9.679701297020711e-06, + "loss": 1.0307663, + "memory(GiB)": 302.58, + "step": 56800, + "train_speed(iter/s)": 0.12395 + }, + { + "acc": 0.72921581, + "epoch": 0.3177643527340905, + "grad_norm": 7.84375, + "learning_rate": 9.679375573987192e-06, + "loss": 1.05968771, + "memory(GiB)": 302.58, + "step": 56820, + "train_speed(iter/s)": 0.123971 + }, + { + "acc": 0.73461075, + "epoch": 0.31787620220706975, + "grad_norm": 7.75, + "learning_rate": 9.67904969090419e-06, + "loss": 1.02302322, + "memory(GiB)": 302.58, + "step": 56840, + "train_speed(iter/s)": 0.12399 + }, + { + "acc": 0.7232676, + "epoch": 0.317988051680049, + "grad_norm": 6.125, + "learning_rate": 9.67872364778285e-06, + "loss": 1.09028025, + "memory(GiB)": 302.58, + "step": 56860, + "train_speed(iter/s)": 0.124011 + }, + { + "acc": 0.73313475, + "epoch": 0.3180999011530283, + "grad_norm": 7.59375, + "learning_rate": 9.678397444634327e-06, + "loss": 1.05695124, + "memory(GiB)": 302.58, + "step": 56880, + "train_speed(iter/s)": 0.124031 + }, + { + "acc": 0.7396656, + "epoch": 0.31821175062600754, + "grad_norm": 6.5, + "learning_rate": 9.678071081469777e-06, + "loss": 1.02916985, + "memory(GiB)": 302.58, + "step": 56900, + "train_speed(iter/s)": 0.124051 + }, + { + "acc": 0.73866529, + "epoch": 0.3183236000989868, + "grad_norm": 6.375, + "learning_rate": 9.677744558300362e-06, + "loss": 1.01488199, + "memory(GiB)": 302.58, + "step": 56920, + "train_speed(iter/s)": 0.124071 + }, + { + "acc": 0.727314, + "epoch": 0.31843544957196607, + "grad_norm": 7.8125, + "learning_rate": 9.67741787513725e-06, + "loss": 1.10091848, + "memory(GiB)": 302.58, + "step": 56940, + "train_speed(iter/s)": 0.124089 + }, + { + "acc": 0.72496343, + "epoch": 0.31854729904494533, + "grad_norm": 7.8125, + "learning_rate": 9.677091031991614e-06, + "loss": 1.07130947, + "memory(GiB)": 302.58, + "step": 56960, + "train_speed(iter/s)": 0.124109 + }, + { + "acc": 0.74516816, + "epoch": 0.3186591485179246, + "grad_norm": 6.125, + "learning_rate": 9.676764028874635e-06, + "loss": 1.01025333, + "memory(GiB)": 302.58, + "step": 56980, + "train_speed(iter/s)": 0.124128 + }, + { + "acc": 0.72261338, + "epoch": 0.31877099799090386, + "grad_norm": 6.4375, + "learning_rate": 9.676436865797496e-06, + "loss": 1.09799652, + "memory(GiB)": 302.58, + "step": 57000, + "train_speed(iter/s)": 0.124146 + }, + { + "acc": 0.73795009, + "epoch": 0.3188828474638831, + "grad_norm": 8.625, + "learning_rate": 9.67610954277139e-06, + "loss": 1.03703461, + "memory(GiB)": 302.58, + "step": 57020, + "train_speed(iter/s)": 0.124167 + }, + { + "acc": 0.7285511, + "epoch": 0.3189946969368624, + "grad_norm": 9.25, + "learning_rate": 9.675782059807508e-06, + "loss": 1.06484833, + "memory(GiB)": 302.58, + "step": 57040, + "train_speed(iter/s)": 0.124188 + }, + { + "acc": 0.74714088, + "epoch": 0.31910654640984165, + "grad_norm": 4.78125, + "learning_rate": 9.675454416917054e-06, + "loss": 0.981709, + "memory(GiB)": 302.58, + "step": 57060, + "train_speed(iter/s)": 0.124208 + }, + { + "acc": 0.73962965, + "epoch": 0.3192183958828209, + "grad_norm": 5.8125, + "learning_rate": 9.675126614111232e-06, + "loss": 1.03796864, + "memory(GiB)": 302.58, + "step": 57080, + "train_speed(iter/s)": 0.124228 + }, + { + "acc": 0.74486623, + "epoch": 0.3193302453558002, + "grad_norm": 6.6875, + "learning_rate": 9.674798651401256e-06, + "loss": 0.99557924, + "memory(GiB)": 302.58, + "step": 57100, + "train_speed(iter/s)": 0.124249 + }, + { + "acc": 0.74505067, + "epoch": 0.31944209482877944, + "grad_norm": 10.5, + "learning_rate": 9.674470528798344e-06, + "loss": 1.00302896, + "memory(GiB)": 302.58, + "step": 57120, + "train_speed(iter/s)": 0.124271 + }, + { + "acc": 0.73162827, + "epoch": 0.3195539443017587, + "grad_norm": 8.875, + "learning_rate": 9.674142246313716e-06, + "loss": 1.08280764, + "memory(GiB)": 302.58, + "step": 57140, + "train_speed(iter/s)": 0.124289 + }, + { + "acc": 0.72949347, + "epoch": 0.31966579377473797, + "grad_norm": 8.0, + "learning_rate": 9.673813803958602e-06, + "loss": 1.07961378, + "memory(GiB)": 302.58, + "step": 57160, + "train_speed(iter/s)": 0.124309 + }, + { + "acc": 0.74892426, + "epoch": 0.31977764324771724, + "grad_norm": 8.625, + "learning_rate": 9.673485201744234e-06, + "loss": 0.98831015, + "memory(GiB)": 302.58, + "step": 57180, + "train_speed(iter/s)": 0.124329 + }, + { + "acc": 0.73785386, + "epoch": 0.3198894927206965, + "grad_norm": 6.1875, + "learning_rate": 9.673156439681857e-06, + "loss": 1.02958155, + "memory(GiB)": 302.58, + "step": 57200, + "train_speed(iter/s)": 0.124349 + }, + { + "acc": 0.74301815, + "epoch": 0.32000134219367576, + "grad_norm": 9.0, + "learning_rate": 9.672827517782708e-06, + "loss": 0.99310007, + "memory(GiB)": 302.58, + "step": 57220, + "train_speed(iter/s)": 0.12437 + }, + { + "acc": 0.75232239, + "epoch": 0.32011319166665503, + "grad_norm": 7.375, + "learning_rate": 9.672498436058044e-06, + "loss": 0.99219656, + "memory(GiB)": 302.58, + "step": 57240, + "train_speed(iter/s)": 0.124389 + }, + { + "acc": 0.72714381, + "epoch": 0.3202250411396343, + "grad_norm": 8.0625, + "learning_rate": 9.672169194519114e-06, + "loss": 1.08580713, + "memory(GiB)": 302.58, + "step": 57260, + "train_speed(iter/s)": 0.124409 + }, + { + "acc": 0.73504853, + "epoch": 0.32033689061261356, + "grad_norm": 6.3125, + "learning_rate": 9.671839793177184e-06, + "loss": 1.05421228, + "memory(GiB)": 302.58, + "step": 57280, + "train_speed(iter/s)": 0.124429 + }, + { + "acc": 0.71320176, + "epoch": 0.3204487400855928, + "grad_norm": 8.5, + "learning_rate": 9.67151023204352e-06, + "loss": 1.13000813, + "memory(GiB)": 302.58, + "step": 57300, + "train_speed(iter/s)": 0.124449 + }, + { + "acc": 0.72899032, + "epoch": 0.3205605895585721, + "grad_norm": 8.25, + "learning_rate": 9.671180511129393e-06, + "loss": 1.05808849, + "memory(GiB)": 302.58, + "step": 57320, + "train_speed(iter/s)": 0.12447 + }, + { + "acc": 0.72429986, + "epoch": 0.32067243903155135, + "grad_norm": 8.0625, + "learning_rate": 9.67085063044608e-06, + "loss": 1.09017105, + "memory(GiB)": 302.58, + "step": 57340, + "train_speed(iter/s)": 0.124491 + }, + { + "acc": 0.75530257, + "epoch": 0.3207842885045306, + "grad_norm": 6.75, + "learning_rate": 9.670520590004863e-06, + "loss": 0.96548729, + "memory(GiB)": 302.58, + "step": 57360, + "train_speed(iter/s)": 0.124511 + }, + { + "acc": 0.72541637, + "epoch": 0.3208961379775099, + "grad_norm": 9.125, + "learning_rate": 9.670190389817033e-06, + "loss": 1.07959957, + "memory(GiB)": 302.58, + "step": 57380, + "train_speed(iter/s)": 0.124531 + }, + { + "acc": 0.72675872, + "epoch": 0.32100798745048914, + "grad_norm": 5.75, + "learning_rate": 9.669860029893884e-06, + "loss": 1.06094036, + "memory(GiB)": 302.58, + "step": 57400, + "train_speed(iter/s)": 0.124551 + }, + { + "acc": 0.74468331, + "epoch": 0.3211198369234684, + "grad_norm": 4.5625, + "learning_rate": 9.669529510246714e-06, + "loss": 1.01494627, + "memory(GiB)": 302.58, + "step": 57420, + "train_speed(iter/s)": 0.12457 + }, + { + "acc": 0.7361547, + "epoch": 0.32123168639644767, + "grad_norm": 4.75, + "learning_rate": 9.669198830886827e-06, + "loss": 1.04030724, + "memory(GiB)": 302.58, + "step": 57440, + "train_speed(iter/s)": 0.124591 + }, + { + "acc": 0.73607378, + "epoch": 0.32134353586942693, + "grad_norm": 8.375, + "learning_rate": 9.668867991825536e-06, + "loss": 1.03397388, + "memory(GiB)": 302.58, + "step": 57460, + "train_speed(iter/s)": 0.124611 + }, + { + "acc": 0.74471741, + "epoch": 0.3214553853424062, + "grad_norm": 5.5625, + "learning_rate": 9.668536993074154e-06, + "loss": 1.00477514, + "memory(GiB)": 302.58, + "step": 57480, + "train_speed(iter/s)": 0.12463 + }, + { + "acc": 0.72274785, + "epoch": 0.32156723481538546, + "grad_norm": 6.1875, + "learning_rate": 9.668205834644003e-06, + "loss": 1.09801559, + "memory(GiB)": 302.58, + "step": 57500, + "train_speed(iter/s)": 0.124651 + }, + { + "acc": 0.72997651, + "epoch": 0.3216790842883647, + "grad_norm": 5.6875, + "learning_rate": 9.66787451654641e-06, + "loss": 1.07581034, + "memory(GiB)": 302.58, + "step": 57520, + "train_speed(iter/s)": 0.12467 + }, + { + "acc": 0.72337275, + "epoch": 0.321790933761344, + "grad_norm": 7.53125, + "learning_rate": 9.667543038792707e-06, + "loss": 1.11416874, + "memory(GiB)": 302.58, + "step": 57540, + "train_speed(iter/s)": 0.124687 + }, + { + "acc": 0.71629052, + "epoch": 0.32190278323432325, + "grad_norm": 7.53125, + "learning_rate": 9.667211401394234e-06, + "loss": 1.12334232, + "memory(GiB)": 302.58, + "step": 57560, + "train_speed(iter/s)": 0.124708 + }, + { + "acc": 0.73648686, + "epoch": 0.3220146327073025, + "grad_norm": 6.53125, + "learning_rate": 9.66687960436233e-06, + "loss": 1.05422535, + "memory(GiB)": 302.58, + "step": 57580, + "train_speed(iter/s)": 0.124728 + }, + { + "acc": 0.72855062, + "epoch": 0.3221264821802818, + "grad_norm": 6.96875, + "learning_rate": 9.666547647708344e-06, + "loss": 1.07185431, + "memory(GiB)": 302.58, + "step": 57600, + "train_speed(iter/s)": 0.124748 + }, + { + "acc": 0.73627405, + "epoch": 0.32223833165326105, + "grad_norm": 6.0, + "learning_rate": 9.666215531443633e-06, + "loss": 1.01287003, + "memory(GiB)": 302.58, + "step": 57620, + "train_speed(iter/s)": 0.124768 + }, + { + "acc": 0.70867548, + "epoch": 0.3223501811262403, + "grad_norm": 6.46875, + "learning_rate": 9.665883255579554e-06, + "loss": 1.18524055, + "memory(GiB)": 302.58, + "step": 57640, + "train_speed(iter/s)": 0.124789 + }, + { + "acc": 0.7365921, + "epoch": 0.3224620305992196, + "grad_norm": 6.09375, + "learning_rate": 9.665550820127474e-06, + "loss": 1.02320642, + "memory(GiB)": 302.58, + "step": 57660, + "train_speed(iter/s)": 0.124809 + }, + { + "acc": 0.71763191, + "epoch": 0.32257388007219884, + "grad_norm": 4.875, + "learning_rate": 9.66521822509876e-06, + "loss": 1.14117765, + "memory(GiB)": 302.58, + "step": 57680, + "train_speed(iter/s)": 0.124829 + }, + { + "acc": 0.71825066, + "epoch": 0.3226857295451781, + "grad_norm": 6.46875, + "learning_rate": 9.66488547050479e-06, + "loss": 1.09821367, + "memory(GiB)": 302.58, + "step": 57700, + "train_speed(iter/s)": 0.12485 + }, + { + "acc": 0.72303138, + "epoch": 0.32279757901815737, + "grad_norm": 5.09375, + "learning_rate": 9.664552556356945e-06, + "loss": 1.09602699, + "memory(GiB)": 302.58, + "step": 57720, + "train_speed(iter/s)": 0.124869 + }, + { + "acc": 0.74274988, + "epoch": 0.32290942849113663, + "grad_norm": 5.78125, + "learning_rate": 9.664219482666612e-06, + "loss": 1.02330914, + "memory(GiB)": 302.58, + "step": 57740, + "train_speed(iter/s)": 0.124889 + }, + { + "acc": 0.72954345, + "epoch": 0.3230212779641159, + "grad_norm": 8.8125, + "learning_rate": 9.663886249445183e-06, + "loss": 1.06972265, + "memory(GiB)": 302.58, + "step": 57760, + "train_speed(iter/s)": 0.124911 + }, + { + "acc": 0.71721559, + "epoch": 0.32313312743709516, + "grad_norm": 4.4375, + "learning_rate": 9.663552856704055e-06, + "loss": 1.113764, + "memory(GiB)": 302.58, + "step": 57780, + "train_speed(iter/s)": 0.124931 + }, + { + "acc": 0.74552722, + "epoch": 0.3232449769100744, + "grad_norm": 10.875, + "learning_rate": 9.663219304454632e-06, + "loss": 1.00061922, + "memory(GiB)": 302.58, + "step": 57800, + "train_speed(iter/s)": 0.124951 + }, + { + "acc": 0.73145428, + "epoch": 0.3233568263830537, + "grad_norm": 6.59375, + "learning_rate": 9.662885592708323e-06, + "loss": 1.06292486, + "memory(GiB)": 302.58, + "step": 57820, + "train_speed(iter/s)": 0.124969 + }, + { + "acc": 0.7489562, + "epoch": 0.32346867585603295, + "grad_norm": 7.25, + "learning_rate": 9.66255172147654e-06, + "loss": 0.99170685, + "memory(GiB)": 302.58, + "step": 57840, + "train_speed(iter/s)": 0.12499 + }, + { + "acc": 0.72998552, + "epoch": 0.3235805253290122, + "grad_norm": 6.21875, + "learning_rate": 9.662217690770702e-06, + "loss": 1.07711439, + "memory(GiB)": 302.58, + "step": 57860, + "train_speed(iter/s)": 0.12501 + }, + { + "acc": 0.73922963, + "epoch": 0.3236923748019915, + "grad_norm": 10.75, + "learning_rate": 9.661883500602237e-06, + "loss": 1.02471905, + "memory(GiB)": 302.58, + "step": 57880, + "train_speed(iter/s)": 0.12503 + }, + { + "acc": 0.72454271, + "epoch": 0.32380422427497074, + "grad_norm": 14.3125, + "learning_rate": 9.661549150982574e-06, + "loss": 1.10644464, + "memory(GiB)": 302.58, + "step": 57900, + "train_speed(iter/s)": 0.12505 + }, + { + "acc": 0.74381766, + "epoch": 0.32391607374795, + "grad_norm": 8.875, + "learning_rate": 9.661214641923148e-06, + "loss": 1.01316271, + "memory(GiB)": 302.58, + "step": 57920, + "train_speed(iter/s)": 0.12507 + }, + { + "acc": 0.73505678, + "epoch": 0.32402792322092927, + "grad_norm": 6.59375, + "learning_rate": 9.660879973435401e-06, + "loss": 1.04324379, + "memory(GiB)": 302.58, + "step": 57940, + "train_speed(iter/s)": 0.12509 + }, + { + "acc": 0.74819975, + "epoch": 0.32413977269390853, + "grad_norm": 6.59375, + "learning_rate": 9.660545145530782e-06, + "loss": 1.00274725, + "memory(GiB)": 302.58, + "step": 57960, + "train_speed(iter/s)": 0.12511 + }, + { + "acc": 0.7297657, + "epoch": 0.3242516221668878, + "grad_norm": 8.5, + "learning_rate": 9.660210158220739e-06, + "loss": 1.0658143, + "memory(GiB)": 302.58, + "step": 57980, + "train_speed(iter/s)": 0.125131 + }, + { + "acc": 0.74926362, + "epoch": 0.32436347163986706, + "grad_norm": 7.84375, + "learning_rate": 9.659875011516731e-06, + "loss": 0.98524046, + "memory(GiB)": 302.58, + "step": 58000, + "train_speed(iter/s)": 0.125151 + }, + { + "epoch": 0.32436347163986706, + "eval_acc": 0.6988825108706809, + "eval_loss": 1.0477010011672974, + "eval_runtime": 7507.5254, + "eval_samples_per_second": 10.028, + "eval_steps_per_second": 10.028, + "step": 58000 + }, + { + "acc": 0.72567453, + "epoch": 0.3244753211128463, + "grad_norm": 5.4375, + "learning_rate": 9.659539705430222e-06, + "loss": 1.08391962, + "memory(GiB)": 302.58, + "step": 58020, + "train_speed(iter/s)": 0.123147 + }, + { + "acc": 0.73973336, + "epoch": 0.3245871705858256, + "grad_norm": 5.15625, + "learning_rate": 9.65920423997268e-06, + "loss": 1.02967224, + "memory(GiB)": 302.58, + "step": 58040, + "train_speed(iter/s)": 0.123167 + }, + { + "acc": 0.73610473, + "epoch": 0.32469902005880485, + "grad_norm": 7.0, + "learning_rate": 9.658868615155578e-06, + "loss": 1.05029812, + "memory(GiB)": 302.58, + "step": 58060, + "train_speed(iter/s)": 0.123188 + }, + { + "acc": 0.73882117, + "epoch": 0.3248108695317841, + "grad_norm": 6.40625, + "learning_rate": 9.658532830990399e-06, + "loss": 1.03497009, + "memory(GiB)": 302.58, + "step": 58080, + "train_speed(iter/s)": 0.123208 + }, + { + "acc": 0.73347821, + "epoch": 0.3249227190047634, + "grad_norm": 6.28125, + "learning_rate": 9.658196887488624e-06, + "loss": 1.0541563, + "memory(GiB)": 302.58, + "step": 58100, + "train_speed(iter/s)": 0.123227 + }, + { + "acc": 0.74239717, + "epoch": 0.32503456847774265, + "grad_norm": 11.625, + "learning_rate": 9.657860784661746e-06, + "loss": 1.00591841, + "memory(GiB)": 302.58, + "step": 58120, + "train_speed(iter/s)": 0.123247 + }, + { + "acc": 0.72588215, + "epoch": 0.3251464179507219, + "grad_norm": 5.875, + "learning_rate": 9.657524522521258e-06, + "loss": 1.09469738, + "memory(GiB)": 302.58, + "step": 58140, + "train_speed(iter/s)": 0.123268 + }, + { + "acc": 0.74455543, + "epoch": 0.3252582674237012, + "grad_norm": 7.71875, + "learning_rate": 9.657188101078663e-06, + "loss": 1.00332642, + "memory(GiB)": 302.58, + "step": 58160, + "train_speed(iter/s)": 0.123288 + }, + { + "acc": 0.73127193, + "epoch": 0.32537011689668044, + "grad_norm": 4.125, + "learning_rate": 9.656851520345467e-06, + "loss": 1.04818764, + "memory(GiB)": 302.58, + "step": 58180, + "train_speed(iter/s)": 0.123306 + }, + { + "acc": 0.74380708, + "epoch": 0.3254819663696597, + "grad_norm": 7.78125, + "learning_rate": 9.656514780333183e-06, + "loss": 0.99456453, + "memory(GiB)": 302.58, + "step": 58200, + "train_speed(iter/s)": 0.123325 + }, + { + "acc": 0.73694487, + "epoch": 0.32559381584263897, + "grad_norm": 5.28125, + "learning_rate": 9.656177881053327e-06, + "loss": 1.03550138, + "memory(GiB)": 302.58, + "step": 58220, + "train_speed(iter/s)": 0.123345 + }, + { + "acc": 0.73647161, + "epoch": 0.32570566531561823, + "grad_norm": 8.75, + "learning_rate": 9.655840822517424e-06, + "loss": 1.02624187, + "memory(GiB)": 302.58, + "step": 58240, + "train_speed(iter/s)": 0.123365 + }, + { + "acc": 0.7274281, + "epoch": 0.3258175147885975, + "grad_norm": 9.0, + "learning_rate": 9.655503604737002e-06, + "loss": 1.09011602, + "memory(GiB)": 302.58, + "step": 58260, + "train_speed(iter/s)": 0.123384 + }, + { + "acc": 0.72184048, + "epoch": 0.32592936426157676, + "grad_norm": 10.0, + "learning_rate": 9.655166227723594e-06, + "loss": 1.12349825, + "memory(GiB)": 302.58, + "step": 58280, + "train_speed(iter/s)": 0.123404 + }, + { + "acc": 0.72276621, + "epoch": 0.326041213734556, + "grad_norm": 8.9375, + "learning_rate": 9.65482869148874e-06, + "loss": 1.08307781, + "memory(GiB)": 302.58, + "step": 58300, + "train_speed(iter/s)": 0.123424 + }, + { + "acc": 0.72027998, + "epoch": 0.3261530632075353, + "grad_norm": 8.1875, + "learning_rate": 9.654490996043986e-06, + "loss": 1.10518446, + "memory(GiB)": 302.58, + "step": 58320, + "train_speed(iter/s)": 0.123443 + }, + { + "acc": 0.73292108, + "epoch": 0.32626491268051455, + "grad_norm": 5.71875, + "learning_rate": 9.65415314140088e-06, + "loss": 1.03940001, + "memory(GiB)": 302.58, + "step": 58340, + "train_speed(iter/s)": 0.123463 + }, + { + "acc": 0.72564211, + "epoch": 0.3263767621534938, + "grad_norm": 8.1875, + "learning_rate": 9.653815127570978e-06, + "loss": 1.08953686, + "memory(GiB)": 302.58, + "step": 58360, + "train_speed(iter/s)": 0.123484 + }, + { + "acc": 0.71949353, + "epoch": 0.3264886116264731, + "grad_norm": 7.28125, + "learning_rate": 9.653476954565841e-06, + "loss": 1.11893597, + "memory(GiB)": 302.58, + "step": 58380, + "train_speed(iter/s)": 0.123503 + }, + { + "acc": 0.71860328, + "epoch": 0.32660046109945234, + "grad_norm": 6.5625, + "learning_rate": 9.653138622397038e-06, + "loss": 1.1260972, + "memory(GiB)": 302.58, + "step": 58400, + "train_speed(iter/s)": 0.123523 + }, + { + "acc": 0.73303885, + "epoch": 0.3267123105724316, + "grad_norm": 6.5625, + "learning_rate": 9.652800131076138e-06, + "loss": 1.02843924, + "memory(GiB)": 302.58, + "step": 58420, + "train_speed(iter/s)": 0.123542 + }, + { + "acc": 0.73763909, + "epoch": 0.32682416004541087, + "grad_norm": 6.0625, + "learning_rate": 9.652461480614722e-06, + "loss": 1.04069748, + "memory(GiB)": 302.58, + "step": 58440, + "train_speed(iter/s)": 0.123562 + }, + { + "acc": 0.74170027, + "epoch": 0.32693600951839014, + "grad_norm": 7.3125, + "learning_rate": 9.65212267102437e-06, + "loss": 1.00809793, + "memory(GiB)": 302.58, + "step": 58460, + "train_speed(iter/s)": 0.123582 + }, + { + "acc": 0.71784177, + "epoch": 0.3270478589913694, + "grad_norm": 8.9375, + "learning_rate": 9.65178370231667e-06, + "loss": 1.13197412, + "memory(GiB)": 302.58, + "step": 58480, + "train_speed(iter/s)": 0.123603 + }, + { + "acc": 0.73527918, + "epoch": 0.32715970846434866, + "grad_norm": 6.875, + "learning_rate": 9.651444574503218e-06, + "loss": 1.05358944, + "memory(GiB)": 302.58, + "step": 58500, + "train_speed(iter/s)": 0.123623 + }, + { + "acc": 0.74710431, + "epoch": 0.32727155793732793, + "grad_norm": 6.5625, + "learning_rate": 9.651105287595612e-06, + "loss": 0.98406134, + "memory(GiB)": 302.58, + "step": 58520, + "train_speed(iter/s)": 0.123643 + }, + { + "acc": 0.72916765, + "epoch": 0.3273834074103072, + "grad_norm": 8.1875, + "learning_rate": 9.650765841605457e-06, + "loss": 1.04754038, + "memory(GiB)": 302.58, + "step": 58540, + "train_speed(iter/s)": 0.123663 + }, + { + "acc": 0.73037577, + "epoch": 0.32749525688328646, + "grad_norm": 7.5, + "learning_rate": 9.650426236544363e-06, + "loss": 1.04635391, + "memory(GiB)": 302.58, + "step": 58560, + "train_speed(iter/s)": 0.123683 + }, + { + "acc": 0.73421335, + "epoch": 0.3276071063562657, + "grad_norm": 8.0, + "learning_rate": 9.650086472423945e-06, + "loss": 1.04915457, + "memory(GiB)": 302.58, + "step": 58580, + "train_speed(iter/s)": 0.123703 + }, + { + "acc": 0.73606315, + "epoch": 0.327718955829245, + "grad_norm": 8.25, + "learning_rate": 9.649746549255825e-06, + "loss": 1.0475399, + "memory(GiB)": 302.58, + "step": 58600, + "train_speed(iter/s)": 0.123724 + }, + { + "acc": 0.7495924, + "epoch": 0.32783080530222425, + "grad_norm": 4.90625, + "learning_rate": 9.64940646705163e-06, + "loss": 0.97745934, + "memory(GiB)": 302.58, + "step": 58620, + "train_speed(iter/s)": 0.123744 + }, + { + "acc": 0.74311724, + "epoch": 0.3279426547752035, + "grad_norm": 7.0625, + "learning_rate": 9.649066225822991e-06, + "loss": 1.01333408, + "memory(GiB)": 302.58, + "step": 58640, + "train_speed(iter/s)": 0.123764 + }, + { + "acc": 0.72709746, + "epoch": 0.3280545042481828, + "grad_norm": 8.75, + "learning_rate": 9.648725825581544e-06, + "loss": 1.08237524, + "memory(GiB)": 302.58, + "step": 58660, + "train_speed(iter/s)": 0.123784 + }, + { + "acc": 0.74111505, + "epoch": 0.32816635372116204, + "grad_norm": 6.46875, + "learning_rate": 9.648385266338934e-06, + "loss": 1.01171618, + "memory(GiB)": 302.58, + "step": 58680, + "train_speed(iter/s)": 0.123804 + }, + { + "acc": 0.73067703, + "epoch": 0.3282782031941413, + "grad_norm": 7.0, + "learning_rate": 9.648044548106807e-06, + "loss": 1.06807928, + "memory(GiB)": 302.58, + "step": 58700, + "train_speed(iter/s)": 0.123823 + }, + { + "acc": 0.73730545, + "epoch": 0.32839005266712057, + "grad_norm": 6.0, + "learning_rate": 9.647703670896819e-06, + "loss": 1.05836086, + "memory(GiB)": 302.58, + "step": 58720, + "train_speed(iter/s)": 0.123843 + }, + { + "acc": 0.72542729, + "epoch": 0.32850190214009983, + "grad_norm": 6.71875, + "learning_rate": 9.647362634720627e-06, + "loss": 1.06552534, + "memory(GiB)": 302.58, + "step": 58740, + "train_speed(iter/s)": 0.123864 + }, + { + "acc": 0.73956003, + "epoch": 0.3286137516130791, + "grad_norm": 6.28125, + "learning_rate": 9.647021439589898e-06, + "loss": 1.01980772, + "memory(GiB)": 302.58, + "step": 58760, + "train_speed(iter/s)": 0.123883 + }, + { + "acc": 0.74032884, + "epoch": 0.32872560108605836, + "grad_norm": 5.90625, + "learning_rate": 9.6466800855163e-06, + "loss": 1.04197664, + "memory(GiB)": 302.58, + "step": 58780, + "train_speed(iter/s)": 0.123903 + }, + { + "acc": 0.73599601, + "epoch": 0.3288374505590376, + "grad_norm": 7.375, + "learning_rate": 9.646338572511509e-06, + "loss": 1.02605, + "memory(GiB)": 302.58, + "step": 58800, + "train_speed(iter/s)": 0.123924 + }, + { + "acc": 0.7229753, + "epoch": 0.3289493000320169, + "grad_norm": 4.34375, + "learning_rate": 9.645996900587205e-06, + "loss": 1.09473562, + "memory(GiB)": 302.58, + "step": 58820, + "train_speed(iter/s)": 0.123944 + }, + { + "acc": 0.741922, + "epoch": 0.32906114950499615, + "grad_norm": 6.40625, + "learning_rate": 9.645655069755076e-06, + "loss": 1.00784454, + "memory(GiB)": 302.58, + "step": 58840, + "train_speed(iter/s)": 0.123963 + }, + { + "acc": 0.74802814, + "epoch": 0.3291729989779754, + "grad_norm": 8.75, + "learning_rate": 9.64531308002681e-06, + "loss": 1.00600128, + "memory(GiB)": 302.58, + "step": 58860, + "train_speed(iter/s)": 0.123982 + }, + { + "acc": 0.74706693, + "epoch": 0.3292848484509547, + "grad_norm": 6.4375, + "learning_rate": 9.64497093141411e-06, + "loss": 0.98638239, + "memory(GiB)": 302.58, + "step": 58880, + "train_speed(iter/s)": 0.124 + }, + { + "acc": 0.7092351, + "epoch": 0.32939669792393395, + "grad_norm": 6.15625, + "learning_rate": 9.644628623928673e-06, + "loss": 1.16768179, + "memory(GiB)": 302.58, + "step": 58900, + "train_speed(iter/s)": 0.12402 + }, + { + "acc": 0.73630171, + "epoch": 0.3295085473969132, + "grad_norm": 6.78125, + "learning_rate": 9.64428615758221e-06, + "loss": 1.03724937, + "memory(GiB)": 302.58, + "step": 58920, + "train_speed(iter/s)": 0.124039 + }, + { + "acc": 0.73045254, + "epoch": 0.3296203968698925, + "grad_norm": 7.40625, + "learning_rate": 9.643943532386435e-06, + "loss": 1.08383522, + "memory(GiB)": 302.58, + "step": 58940, + "train_speed(iter/s)": 0.124058 + }, + { + "acc": 0.73336816, + "epoch": 0.32973224634287174, + "grad_norm": 8.0625, + "learning_rate": 9.643600748353064e-06, + "loss": 1.05492792, + "memory(GiB)": 302.58, + "step": 58960, + "train_speed(iter/s)": 0.124078 + }, + { + "acc": 0.72405615, + "epoch": 0.329844095815851, + "grad_norm": 7.46875, + "learning_rate": 9.643257805493823e-06, + "loss": 1.08082647, + "memory(GiB)": 302.58, + "step": 58980, + "train_speed(iter/s)": 0.124098 + }, + { + "acc": 0.73634129, + "epoch": 0.32995594528883027, + "grad_norm": 6.71875, + "learning_rate": 9.642914703820442e-06, + "loss": 1.03255911, + "memory(GiB)": 302.58, + "step": 59000, + "train_speed(iter/s)": 0.124119 + }, + { + "acc": 0.72636733, + "epoch": 0.33006779476180953, + "grad_norm": 6.4375, + "learning_rate": 9.642571443344657e-06, + "loss": 1.08910446, + "memory(GiB)": 302.58, + "step": 59020, + "train_speed(iter/s)": 0.124138 + }, + { + "acc": 0.72585478, + "epoch": 0.3301796442347888, + "grad_norm": 6.40625, + "learning_rate": 9.642228024078206e-06, + "loss": 1.10133266, + "memory(GiB)": 302.58, + "step": 59040, + "train_speed(iter/s)": 0.124157 + }, + { + "acc": 0.74598961, + "epoch": 0.3302914937077681, + "grad_norm": 5.5625, + "learning_rate": 9.641884446032838e-06, + "loss": 1.00437517, + "memory(GiB)": 302.58, + "step": 59060, + "train_speed(iter/s)": 0.124176 + }, + { + "acc": 0.73728628, + "epoch": 0.3304033431807474, + "grad_norm": 7.625, + "learning_rate": 9.641540709220302e-06, + "loss": 1.02140102, + "memory(GiB)": 302.58, + "step": 59080, + "train_speed(iter/s)": 0.124196 + }, + { + "acc": 0.75406132, + "epoch": 0.33051519265372664, + "grad_norm": 5.8125, + "learning_rate": 9.641196813652355e-06, + "loss": 0.972612, + "memory(GiB)": 302.58, + "step": 59100, + "train_speed(iter/s)": 0.124216 + }, + { + "acc": 0.72976913, + "epoch": 0.3306270421267059, + "grad_norm": 5.5625, + "learning_rate": 9.640852759340761e-06, + "loss": 1.0640872, + "memory(GiB)": 302.58, + "step": 59120, + "train_speed(iter/s)": 0.124236 + }, + { + "acc": 0.71411743, + "epoch": 0.33073889159968517, + "grad_norm": 7.1875, + "learning_rate": 9.640508546297286e-06, + "loss": 1.14863396, + "memory(GiB)": 302.58, + "step": 59140, + "train_speed(iter/s)": 0.124255 + }, + { + "acc": 0.73418317, + "epoch": 0.33085074107266443, + "grad_norm": 5.25, + "learning_rate": 9.640164174533706e-06, + "loss": 1.03155212, + "memory(GiB)": 302.58, + "step": 59160, + "train_speed(iter/s)": 0.124275 + }, + { + "acc": 0.7381515, + "epoch": 0.3309625905456437, + "grad_norm": 4.28125, + "learning_rate": 9.639819644061797e-06, + "loss": 1.02572298, + "memory(GiB)": 302.58, + "step": 59180, + "train_speed(iter/s)": 0.124295 + }, + { + "acc": 0.73201852, + "epoch": 0.33107444001862296, + "grad_norm": 7.09375, + "learning_rate": 9.639474954893344e-06, + "loss": 1.05324211, + "memory(GiB)": 302.58, + "step": 59200, + "train_speed(iter/s)": 0.124315 + }, + { + "acc": 0.73993125, + "epoch": 0.3311862894916022, + "grad_norm": 5.9375, + "learning_rate": 9.639130107040134e-06, + "loss": 1.04127283, + "memory(GiB)": 302.58, + "step": 59220, + "train_speed(iter/s)": 0.124333 + }, + { + "acc": 0.74347148, + "epoch": 0.3312981389645815, + "grad_norm": 9.3125, + "learning_rate": 9.638785100513965e-06, + "loss": 1.00496235, + "memory(GiB)": 302.58, + "step": 59240, + "train_speed(iter/s)": 0.124353 + }, + { + "acc": 0.75070677, + "epoch": 0.33140998843756075, + "grad_norm": 5.90625, + "learning_rate": 9.638439935326638e-06, + "loss": 0.97960148, + "memory(GiB)": 302.58, + "step": 59260, + "train_speed(iter/s)": 0.124373 + }, + { + "acc": 0.72017736, + "epoch": 0.33152183791054, + "grad_norm": 8.1875, + "learning_rate": 9.638094611489956e-06, + "loss": 1.12509184, + "memory(GiB)": 302.58, + "step": 59280, + "train_speed(iter/s)": 0.124391 + }, + { + "acc": 0.75010357, + "epoch": 0.3316336873835193, + "grad_norm": 4.3125, + "learning_rate": 9.637749129015732e-06, + "loss": 0.97206469, + "memory(GiB)": 302.58, + "step": 59300, + "train_speed(iter/s)": 0.12441 + }, + { + "acc": 0.73465891, + "epoch": 0.33174553685649855, + "grad_norm": 8.5625, + "learning_rate": 9.637403487915779e-06, + "loss": 1.04532089, + "memory(GiB)": 302.58, + "step": 59320, + "train_speed(iter/s)": 0.124431 + }, + { + "acc": 0.74260359, + "epoch": 0.3318573863294778, + "grad_norm": 7.0, + "learning_rate": 9.637057688201925e-06, + "loss": 1.01635914, + "memory(GiB)": 302.58, + "step": 59340, + "train_speed(iter/s)": 0.124451 + }, + { + "acc": 0.73659611, + "epoch": 0.3319692358024571, + "grad_norm": 5.46875, + "learning_rate": 9.636711729885994e-06, + "loss": 1.04347382, + "memory(GiB)": 302.58, + "step": 59360, + "train_speed(iter/s)": 0.124471 + }, + { + "acc": 0.73180523, + "epoch": 0.33208108527543634, + "grad_norm": 6.5625, + "learning_rate": 9.63636561297982e-06, + "loss": 1.07351265, + "memory(GiB)": 302.58, + "step": 59380, + "train_speed(iter/s)": 0.124488 + }, + { + "acc": 0.74758406, + "epoch": 0.3321929347484156, + "grad_norm": 7.625, + "learning_rate": 9.636019337495237e-06, + "loss": 1.00808716, + "memory(GiB)": 302.58, + "step": 59400, + "train_speed(iter/s)": 0.124508 + }, + { + "acc": 0.73692355, + "epoch": 0.33230478422139487, + "grad_norm": 5.5, + "learning_rate": 9.635672903444095e-06, + "loss": 1.03315554, + "memory(GiB)": 302.58, + "step": 59420, + "train_speed(iter/s)": 0.124527 + }, + { + "acc": 0.74238448, + "epoch": 0.33241663369437413, + "grad_norm": 6.34375, + "learning_rate": 9.63532631083824e-06, + "loss": 1.01219301, + "memory(GiB)": 302.58, + "step": 59440, + "train_speed(iter/s)": 0.124545 + }, + { + "acc": 0.72815423, + "epoch": 0.3325284831673534, + "grad_norm": 8.5, + "learning_rate": 9.634979559689526e-06, + "loss": 1.08434401, + "memory(GiB)": 302.58, + "step": 59460, + "train_speed(iter/s)": 0.124565 + }, + { + "acc": 0.72464437, + "epoch": 0.33264033264033266, + "grad_norm": 8.9375, + "learning_rate": 9.634632650009816e-06, + "loss": 1.07997761, + "memory(GiB)": 302.58, + "step": 59480, + "train_speed(iter/s)": 0.124582 + }, + { + "acc": 0.73076358, + "epoch": 0.3327521821133119, + "grad_norm": 5.3125, + "learning_rate": 9.634285581810971e-06, + "loss": 1.0955883, + "memory(GiB)": 302.58, + "step": 59500, + "train_speed(iter/s)": 0.124601 + }, + { + "acc": 0.73704247, + "epoch": 0.3328640315862912, + "grad_norm": 6.25, + "learning_rate": 9.633938355104867e-06, + "loss": 1.03839035, + "memory(GiB)": 302.58, + "step": 59520, + "train_speed(iter/s)": 0.124621 + }, + { + "acc": 0.72467771, + "epoch": 0.33297588105927045, + "grad_norm": 7.15625, + "learning_rate": 9.633590969903375e-06, + "loss": 1.08704739, + "memory(GiB)": 302.58, + "step": 59540, + "train_speed(iter/s)": 0.124641 + }, + { + "acc": 0.73408713, + "epoch": 0.3330877305322497, + "grad_norm": 10.5625, + "learning_rate": 9.633243426218379e-06, + "loss": 1.03155193, + "memory(GiB)": 302.58, + "step": 59560, + "train_speed(iter/s)": 0.12466 + }, + { + "acc": 0.73249583, + "epoch": 0.333199580005229, + "grad_norm": 5.9375, + "learning_rate": 9.632895724061766e-06, + "loss": 1.05849915, + "memory(GiB)": 302.58, + "step": 59580, + "train_speed(iter/s)": 0.12468 + }, + { + "acc": 0.71400318, + "epoch": 0.33331142947820824, + "grad_norm": 6.46875, + "learning_rate": 9.63254786344543e-06, + "loss": 1.13954744, + "memory(GiB)": 302.58, + "step": 59600, + "train_speed(iter/s)": 0.1247 + }, + { + "acc": 0.73274608, + "epoch": 0.3334232789511875, + "grad_norm": 11.3125, + "learning_rate": 9.632199844381266e-06, + "loss": 1.07085638, + "memory(GiB)": 302.58, + "step": 59620, + "train_speed(iter/s)": 0.12472 + }, + { + "acc": 0.72855878, + "epoch": 0.33353512842416677, + "grad_norm": 5.1875, + "learning_rate": 9.631851666881182e-06, + "loss": 1.07337818, + "memory(GiB)": 302.58, + "step": 59640, + "train_speed(iter/s)": 0.12474 + }, + { + "acc": 0.7340167, + "epoch": 0.33364697789714604, + "grad_norm": 10.125, + "learning_rate": 9.631503330957081e-06, + "loss": 1.07172775, + "memory(GiB)": 302.58, + "step": 59660, + "train_speed(iter/s)": 0.124758 + }, + { + "acc": 0.74362159, + "epoch": 0.3337588273701253, + "grad_norm": 6.9375, + "learning_rate": 9.631154836620882e-06, + "loss": 0.99135609, + "memory(GiB)": 302.58, + "step": 59680, + "train_speed(iter/s)": 0.124778 + }, + { + "acc": 0.73493166, + "epoch": 0.33387067684310456, + "grad_norm": 6.0, + "learning_rate": 9.630806183884503e-06, + "loss": 1.05512962, + "memory(GiB)": 302.58, + "step": 59700, + "train_speed(iter/s)": 0.124796 + }, + { + "acc": 0.75479412, + "epoch": 0.3339825263160838, + "grad_norm": 6.625, + "learning_rate": 9.630457372759867e-06, + "loss": 0.95007458, + "memory(GiB)": 302.58, + "step": 59720, + "train_speed(iter/s)": 0.124817 + }, + { + "acc": 0.73878193, + "epoch": 0.3340943757890631, + "grad_norm": 7.875, + "learning_rate": 9.630108403258906e-06, + "loss": 1.0307682, + "memory(GiB)": 302.58, + "step": 59740, + "train_speed(iter/s)": 0.124837 + }, + { + "acc": 0.73344407, + "epoch": 0.33420622526204236, + "grad_norm": 6.5, + "learning_rate": 9.629759275393556e-06, + "loss": 1.02988853, + "memory(GiB)": 302.58, + "step": 59760, + "train_speed(iter/s)": 0.124855 + }, + { + "acc": 0.72463961, + "epoch": 0.3343180747350216, + "grad_norm": 8.125, + "learning_rate": 9.629409989175758e-06, + "loss": 1.09306345, + "memory(GiB)": 302.58, + "step": 59780, + "train_speed(iter/s)": 0.124874 + }, + { + "acc": 0.73124943, + "epoch": 0.3344299242080009, + "grad_norm": 10.75, + "learning_rate": 9.62906054461746e-06, + "loss": 1.06177187, + "memory(GiB)": 302.58, + "step": 59800, + "train_speed(iter/s)": 0.124892 + }, + { + "acc": 0.74633088, + "epoch": 0.33454177368098015, + "grad_norm": 8.5625, + "learning_rate": 9.628710941730613e-06, + "loss": 0.99951344, + "memory(GiB)": 302.58, + "step": 59820, + "train_speed(iter/s)": 0.124911 + }, + { + "acc": 0.74238577, + "epoch": 0.3346536231539594, + "grad_norm": 8.4375, + "learning_rate": 9.628361180527174e-06, + "loss": 1.00218334, + "memory(GiB)": 302.58, + "step": 59840, + "train_speed(iter/s)": 0.124931 + }, + { + "acc": 0.72997928, + "epoch": 0.3347654726269387, + "grad_norm": 7.09375, + "learning_rate": 9.628011261019105e-06, + "loss": 1.079282, + "memory(GiB)": 302.58, + "step": 59860, + "train_speed(iter/s)": 0.124951 + }, + { + "acc": 0.72988071, + "epoch": 0.33487732209991794, + "grad_norm": 5.6875, + "learning_rate": 9.627661183218379e-06, + "loss": 1.06070299, + "memory(GiB)": 302.58, + "step": 59880, + "train_speed(iter/s)": 0.124971 + }, + { + "acc": 0.72298021, + "epoch": 0.3349891715728972, + "grad_norm": 5.625, + "learning_rate": 9.627310947136965e-06, + "loss": 1.0857933, + "memory(GiB)": 302.58, + "step": 59900, + "train_speed(iter/s)": 0.124991 + }, + { + "acc": 0.72642927, + "epoch": 0.33510102104587647, + "grad_norm": 7.375, + "learning_rate": 9.626960552786844e-06, + "loss": 1.07953024, + "memory(GiB)": 302.58, + "step": 59920, + "train_speed(iter/s)": 0.12501 + }, + { + "acc": 0.73191051, + "epoch": 0.33521287051885573, + "grad_norm": 7.1875, + "learning_rate": 9.626610000179999e-06, + "loss": 1.04576082, + "memory(GiB)": 302.58, + "step": 59940, + "train_speed(iter/s)": 0.125029 + }, + { + "acc": 0.73436866, + "epoch": 0.335324719991835, + "grad_norm": 6.875, + "learning_rate": 9.626259289328424e-06, + "loss": 1.03353329, + "memory(GiB)": 302.58, + "step": 59960, + "train_speed(iter/s)": 0.125047 + }, + { + "acc": 0.74336915, + "epoch": 0.33543656946481426, + "grad_norm": 6.9375, + "learning_rate": 9.62590842024411e-06, + "loss": 1.01296206, + "memory(GiB)": 302.58, + "step": 59980, + "train_speed(iter/s)": 0.125067 + }, + { + "acc": 0.73992815, + "epoch": 0.3355484189377935, + "grad_norm": 5.84375, + "learning_rate": 9.62555739293906e-06, + "loss": 1.02648411, + "memory(GiB)": 302.58, + "step": 60000, + "train_speed(iter/s)": 0.125086 + }, + { + "epoch": 0.3355484189377935, + "eval_acc": 0.6990102881407017, + "eval_loss": 1.0473049879074097, + "eval_runtime": 7512.7934, + "eval_samples_per_second": 10.021, + "eval_steps_per_second": 10.021, + "step": 60000 + }, + { + "acc": 0.73713536, + "epoch": 0.3356602684107728, + "grad_norm": 6.40625, + "learning_rate": 9.625206207425279e-06, + "loss": 1.03868809, + "memory(GiB)": 302.58, + "step": 60020, + "train_speed(iter/s)": 0.123145 + }, + { + "acc": 0.74947762, + "epoch": 0.33577211788375205, + "grad_norm": 7.6875, + "learning_rate": 9.62485486371478e-06, + "loss": 0.99305201, + "memory(GiB)": 302.58, + "step": 60040, + "train_speed(iter/s)": 0.123165 + }, + { + "acc": 0.75628314, + "epoch": 0.3358839673567313, + "grad_norm": 6.21875, + "learning_rate": 9.624503361819582e-06, + "loss": 0.96241226, + "memory(GiB)": 302.58, + "step": 60060, + "train_speed(iter/s)": 0.123183 + }, + { + "acc": 0.74059262, + "epoch": 0.3359958168297106, + "grad_norm": 10.4375, + "learning_rate": 9.624151701751702e-06, + "loss": 1.02637444, + "memory(GiB)": 302.58, + "step": 60080, + "train_speed(iter/s)": 0.123201 + }, + { + "acc": 0.74862962, + "epoch": 0.33610766630268984, + "grad_norm": 7.96875, + "learning_rate": 9.623799883523172e-06, + "loss": 0.98450956, + "memory(GiB)": 302.58, + "step": 60100, + "train_speed(iter/s)": 0.123222 + }, + { + "acc": 0.73115363, + "epoch": 0.3362195157756691, + "grad_norm": 8.25, + "learning_rate": 9.623447907146027e-06, + "loss": 1.05670767, + "memory(GiB)": 302.58, + "step": 60120, + "train_speed(iter/s)": 0.123241 + }, + { + "acc": 0.72508459, + "epoch": 0.3363313652486484, + "grad_norm": 8.0625, + "learning_rate": 9.6230957726323e-06, + "loss": 1.09083738, + "memory(GiB)": 302.58, + "step": 60140, + "train_speed(iter/s)": 0.12326 + }, + { + "acc": 0.72026567, + "epoch": 0.33644321472162764, + "grad_norm": 8.125, + "learning_rate": 9.622743479994038e-06, + "loss": 1.09442253, + "memory(GiB)": 302.58, + "step": 60160, + "train_speed(iter/s)": 0.12328 + }, + { + "acc": 0.71417103, + "epoch": 0.3365550641946069, + "grad_norm": 6.375, + "learning_rate": 9.622391029243292e-06, + "loss": 1.13979349, + "memory(GiB)": 302.58, + "step": 60180, + "train_speed(iter/s)": 0.1233 + }, + { + "acc": 0.73032932, + "epoch": 0.33666691366758617, + "grad_norm": 5.28125, + "learning_rate": 9.622038420392114e-06, + "loss": 1.08267984, + "memory(GiB)": 302.58, + "step": 60200, + "train_speed(iter/s)": 0.123319 + }, + { + "acc": 0.72613368, + "epoch": 0.33677876314056543, + "grad_norm": 8.875, + "learning_rate": 9.621685653452568e-06, + "loss": 1.10461435, + "memory(GiB)": 302.58, + "step": 60220, + "train_speed(iter/s)": 0.123338 + }, + { + "acc": 0.73906903, + "epoch": 0.3368906126135447, + "grad_norm": 9.375, + "learning_rate": 9.621332728436716e-06, + "loss": 1.03553543, + "memory(GiB)": 302.58, + "step": 60240, + "train_speed(iter/s)": 0.123358 + }, + { + "acc": 0.740346, + "epoch": 0.33700246208652396, + "grad_norm": 6.21875, + "learning_rate": 9.620979645356631e-06, + "loss": 1.01340275, + "memory(GiB)": 302.58, + "step": 60260, + "train_speed(iter/s)": 0.123378 + }, + { + "acc": 0.72208571, + "epoch": 0.3371143115595032, + "grad_norm": 9.5625, + "learning_rate": 9.620626404224392e-06, + "loss": 1.11574755, + "memory(GiB)": 302.58, + "step": 60280, + "train_speed(iter/s)": 0.123395 + }, + { + "acc": 0.73797073, + "epoch": 0.3372261610324825, + "grad_norm": 6.8125, + "learning_rate": 9.620273005052075e-06, + "loss": 1.01837025, + "memory(GiB)": 302.58, + "step": 60300, + "train_speed(iter/s)": 0.123415 + }, + { + "acc": 0.73224025, + "epoch": 0.33733801050546175, + "grad_norm": 10.0625, + "learning_rate": 9.619919447851774e-06, + "loss": 1.058953, + "memory(GiB)": 302.58, + "step": 60320, + "train_speed(iter/s)": 0.123433 + }, + { + "acc": 0.74876785, + "epoch": 0.337449859978441, + "grad_norm": 8.25, + "learning_rate": 9.619565732635575e-06, + "loss": 0.98331909, + "memory(GiB)": 302.58, + "step": 60340, + "train_speed(iter/s)": 0.123451 + }, + { + "acc": 0.73984785, + "epoch": 0.3375617094514203, + "grad_norm": 6.65625, + "learning_rate": 9.61921185941558e-06, + "loss": 1.00428505, + "memory(GiB)": 302.58, + "step": 60360, + "train_speed(iter/s)": 0.12347 + }, + { + "acc": 0.73834639, + "epoch": 0.33767355892439954, + "grad_norm": 11.5, + "learning_rate": 9.618857828203895e-06, + "loss": 1.03451796, + "memory(GiB)": 302.58, + "step": 60380, + "train_speed(iter/s)": 0.123489 + }, + { + "acc": 0.7394206, + "epoch": 0.3377854083973788, + "grad_norm": 4.28125, + "learning_rate": 9.618503639012624e-06, + "loss": 1.01785831, + "memory(GiB)": 302.58, + "step": 60400, + "train_speed(iter/s)": 0.123508 + }, + { + "acc": 0.74780922, + "epoch": 0.33789725787035807, + "grad_norm": 6.65625, + "learning_rate": 9.618149291853885e-06, + "loss": 0.99155951, + "memory(GiB)": 302.58, + "step": 60420, + "train_speed(iter/s)": 0.123528 + }, + { + "acc": 0.73361702, + "epoch": 0.33800910734333733, + "grad_norm": 6.21875, + "learning_rate": 9.617794786739795e-06, + "loss": 1.06270905, + "memory(GiB)": 302.58, + "step": 60440, + "train_speed(iter/s)": 0.123546 + }, + { + "acc": 0.72715068, + "epoch": 0.3381209568163166, + "grad_norm": 6.9375, + "learning_rate": 9.617440123682482e-06, + "loss": 1.07372179, + "memory(GiB)": 302.58, + "step": 60460, + "train_speed(iter/s)": 0.123566 + }, + { + "acc": 0.74880896, + "epoch": 0.33823280628929586, + "grad_norm": 8.5, + "learning_rate": 9.617085302694077e-06, + "loss": 1.03289537, + "memory(GiB)": 302.58, + "step": 60480, + "train_speed(iter/s)": 0.123584 + }, + { + "acc": 0.74770164, + "epoch": 0.3383446557622751, + "grad_norm": 6.71875, + "learning_rate": 9.616730323786712e-06, + "loss": 0.99087095, + "memory(GiB)": 302.58, + "step": 60500, + "train_speed(iter/s)": 0.123604 + }, + { + "acc": 0.7449378, + "epoch": 0.3384565052352544, + "grad_norm": 7.1875, + "learning_rate": 9.61637518697253e-06, + "loss": 0.98590822, + "memory(GiB)": 302.58, + "step": 60520, + "train_speed(iter/s)": 0.123621 + }, + { + "acc": 0.74446192, + "epoch": 0.33856835470823365, + "grad_norm": 8.375, + "learning_rate": 9.61601989226368e-06, + "loss": 1.00980406, + "memory(GiB)": 302.58, + "step": 60540, + "train_speed(iter/s)": 0.12364 + }, + { + "acc": 0.74995575, + "epoch": 0.3386802041812129, + "grad_norm": 5.96875, + "learning_rate": 9.61566443967231e-06, + "loss": 0.96751251, + "memory(GiB)": 302.58, + "step": 60560, + "train_speed(iter/s)": 0.123659 + }, + { + "acc": 0.74117737, + "epoch": 0.3387920536541922, + "grad_norm": 6.34375, + "learning_rate": 9.615308829210584e-06, + "loss": 1.0296174, + "memory(GiB)": 302.58, + "step": 60580, + "train_speed(iter/s)": 0.123679 + }, + { + "acc": 0.72245359, + "epoch": 0.33890390312717145, + "grad_norm": 8.4375, + "learning_rate": 9.61495306089066e-06, + "loss": 1.09487801, + "memory(GiB)": 302.58, + "step": 60600, + "train_speed(iter/s)": 0.123699 + }, + { + "acc": 0.7288671, + "epoch": 0.3390157526001507, + "grad_norm": 6.375, + "learning_rate": 9.614597134724706e-06, + "loss": 1.07204142, + "memory(GiB)": 302.58, + "step": 60620, + "train_speed(iter/s)": 0.123718 + }, + { + "acc": 0.73405523, + "epoch": 0.33912760207313, + "grad_norm": 5.59375, + "learning_rate": 9.614241050724899e-06, + "loss": 1.0520915, + "memory(GiB)": 302.58, + "step": 60640, + "train_speed(iter/s)": 0.123737 + }, + { + "acc": 0.72210732, + "epoch": 0.33923945154610924, + "grad_norm": 5.4375, + "learning_rate": 9.613884808903416e-06, + "loss": 1.11315422, + "memory(GiB)": 302.58, + "step": 60660, + "train_speed(iter/s)": 0.123756 + }, + { + "acc": 0.74761691, + "epoch": 0.3393513010190885, + "grad_norm": 7.03125, + "learning_rate": 9.613528409272443e-06, + "loss": 0.99130011, + "memory(GiB)": 302.58, + "step": 60680, + "train_speed(iter/s)": 0.123777 + }, + { + "acc": 0.73609867, + "epoch": 0.33946315049206777, + "grad_norm": 9.3125, + "learning_rate": 9.61317185184417e-06, + "loss": 1.05434732, + "memory(GiB)": 302.58, + "step": 60700, + "train_speed(iter/s)": 0.123796 + }, + { + "acc": 0.72749047, + "epoch": 0.33957499996504703, + "grad_norm": 6.1875, + "learning_rate": 9.61281513663079e-06, + "loss": 1.0937417, + "memory(GiB)": 302.58, + "step": 60720, + "train_speed(iter/s)": 0.123815 + }, + { + "acc": 0.71933389, + "epoch": 0.3396868494380263, + "grad_norm": 6.9375, + "learning_rate": 9.612458263644507e-06, + "loss": 1.11542301, + "memory(GiB)": 302.58, + "step": 60740, + "train_speed(iter/s)": 0.123834 + }, + { + "acc": 0.73313236, + "epoch": 0.33979869891100556, + "grad_norm": 7.75, + "learning_rate": 9.612101232897525e-06, + "loss": 1.05004883, + "memory(GiB)": 302.58, + "step": 60760, + "train_speed(iter/s)": 0.123854 + }, + { + "acc": 0.7366497, + "epoch": 0.3399105483839848, + "grad_norm": 6.90625, + "learning_rate": 9.611744044402057e-06, + "loss": 1.02530365, + "memory(GiB)": 302.58, + "step": 60780, + "train_speed(iter/s)": 0.123873 + }, + { + "acc": 0.73530207, + "epoch": 0.3400223978569641, + "grad_norm": 7.03125, + "learning_rate": 9.611386698170318e-06, + "loss": 1.03076763, + "memory(GiB)": 302.58, + "step": 60800, + "train_speed(iter/s)": 0.123893 + }, + { + "acc": 0.73584032, + "epoch": 0.34013424732994335, + "grad_norm": 5.125, + "learning_rate": 9.61102919421453e-06, + "loss": 1.03945017, + "memory(GiB)": 302.58, + "step": 60820, + "train_speed(iter/s)": 0.123911 + }, + { + "acc": 0.75493293, + "epoch": 0.3402460968029226, + "grad_norm": 6.40625, + "learning_rate": 9.610671532546924e-06, + "loss": 0.96679497, + "memory(GiB)": 302.58, + "step": 60840, + "train_speed(iter/s)": 0.123931 + }, + { + "acc": 0.74695573, + "epoch": 0.3403579462759019, + "grad_norm": 8.75, + "learning_rate": 9.610313713179733e-06, + "loss": 0.98939056, + "memory(GiB)": 302.58, + "step": 60860, + "train_speed(iter/s)": 0.12395 + }, + { + "acc": 0.74855499, + "epoch": 0.34046979574888114, + "grad_norm": 11.8125, + "learning_rate": 9.609955736125194e-06, + "loss": 0.98915567, + "memory(GiB)": 302.58, + "step": 60880, + "train_speed(iter/s)": 0.123969 + }, + { + "acc": 0.74141445, + "epoch": 0.3405816452218604, + "grad_norm": 5.0, + "learning_rate": 9.60959760139555e-06, + "loss": 1.02478218, + "memory(GiB)": 302.58, + "step": 60900, + "train_speed(iter/s)": 0.123987 + }, + { + "acc": 0.71630316, + "epoch": 0.34069349469483967, + "grad_norm": 8.3125, + "learning_rate": 9.609239309003051e-06, + "loss": 1.14330578, + "memory(GiB)": 302.58, + "step": 60920, + "train_speed(iter/s)": 0.124005 + }, + { + "acc": 0.73486953, + "epoch": 0.34080534416781894, + "grad_norm": 5.34375, + "learning_rate": 9.608880858959953e-06, + "loss": 0.99835405, + "memory(GiB)": 302.58, + "step": 60940, + "train_speed(iter/s)": 0.124024 + }, + { + "acc": 0.74589958, + "epoch": 0.3409171936407982, + "grad_norm": 5.21875, + "learning_rate": 9.608522251278514e-06, + "loss": 0.99196491, + "memory(GiB)": 302.58, + "step": 60960, + "train_speed(iter/s)": 0.124043 + }, + { + "acc": 0.74622774, + "epoch": 0.34102904311377746, + "grad_norm": 5.8125, + "learning_rate": 9.608163485971002e-06, + "loss": 1.01434031, + "memory(GiB)": 302.58, + "step": 60980, + "train_speed(iter/s)": 0.124063 + }, + { + "acc": 0.73078156, + "epoch": 0.3411408925867567, + "grad_norm": 7.40625, + "learning_rate": 9.607804563049686e-06, + "loss": 1.08284855, + "memory(GiB)": 302.58, + "step": 61000, + "train_speed(iter/s)": 0.124083 + }, + { + "acc": 0.71318445, + "epoch": 0.341252742059736, + "grad_norm": 5.8125, + "learning_rate": 9.607445482526844e-06, + "loss": 1.15753784, + "memory(GiB)": 302.58, + "step": 61020, + "train_speed(iter/s)": 0.124101 + }, + { + "acc": 0.73919883, + "epoch": 0.34136459153271526, + "grad_norm": 5.71875, + "learning_rate": 9.607086244414757e-06, + "loss": 1.03362551, + "memory(GiB)": 302.58, + "step": 61040, + "train_speed(iter/s)": 0.12412 + }, + { + "acc": 0.74288821, + "epoch": 0.3414764410056945, + "grad_norm": 7.28125, + "learning_rate": 9.606726848725709e-06, + "loss": 1.02078543, + "memory(GiB)": 302.58, + "step": 61060, + "train_speed(iter/s)": 0.12414 + }, + { + "acc": 0.7321733, + "epoch": 0.3415882904786738, + "grad_norm": 7.6875, + "learning_rate": 9.606367295471997e-06, + "loss": 1.06449785, + "memory(GiB)": 302.58, + "step": 61080, + "train_speed(iter/s)": 0.124159 + }, + { + "acc": 0.74950776, + "epoch": 0.34170013995165305, + "grad_norm": 5.71875, + "learning_rate": 9.606007584665917e-06, + "loss": 0.97428865, + "memory(GiB)": 302.58, + "step": 61100, + "train_speed(iter/s)": 0.124178 + }, + { + "acc": 0.76847744, + "epoch": 0.3418119894246323, + "grad_norm": 6.96875, + "learning_rate": 9.605647716319772e-06, + "loss": 0.88923979, + "memory(GiB)": 302.58, + "step": 61120, + "train_speed(iter/s)": 0.124196 + }, + { + "acc": 0.72943873, + "epoch": 0.3419238388976116, + "grad_norm": 7.03125, + "learning_rate": 9.605287690445873e-06, + "loss": 1.06573105, + "memory(GiB)": 302.58, + "step": 61140, + "train_speed(iter/s)": 0.124215 + }, + { + "acc": 0.75626302, + "epoch": 0.34203568837059084, + "grad_norm": 6.75, + "learning_rate": 9.60492750705653e-06, + "loss": 0.95196714, + "memory(GiB)": 302.58, + "step": 61160, + "train_speed(iter/s)": 0.124234 + }, + { + "acc": 0.72439594, + "epoch": 0.3421475378435701, + "grad_norm": 4.59375, + "learning_rate": 9.604567166164065e-06, + "loss": 1.10674477, + "memory(GiB)": 302.58, + "step": 61180, + "train_speed(iter/s)": 0.12425 + }, + { + "acc": 0.73511457, + "epoch": 0.34225938731654937, + "grad_norm": 6.0625, + "learning_rate": 9.604206667780803e-06, + "loss": 1.03523788, + "memory(GiB)": 302.58, + "step": 61200, + "train_speed(iter/s)": 0.12427 + }, + { + "acc": 0.72388825, + "epoch": 0.34237123678952863, + "grad_norm": 9.5, + "learning_rate": 9.603846011919072e-06, + "loss": 1.09883404, + "memory(GiB)": 302.58, + "step": 61220, + "train_speed(iter/s)": 0.124289 + }, + { + "acc": 0.72953019, + "epoch": 0.3424830862625079, + "grad_norm": 5.59375, + "learning_rate": 9.603485198591209e-06, + "loss": 1.09021635, + "memory(GiB)": 302.58, + "step": 61240, + "train_speed(iter/s)": 0.124309 + }, + { + "acc": 0.73058777, + "epoch": 0.34259493573548716, + "grad_norm": 5.5, + "learning_rate": 9.603124227809555e-06, + "loss": 1.0770628, + "memory(GiB)": 302.58, + "step": 61260, + "train_speed(iter/s)": 0.124327 + }, + { + "acc": 0.72336569, + "epoch": 0.3427067852084664, + "grad_norm": 6.21875, + "learning_rate": 9.60276309958646e-06, + "loss": 1.09511814, + "memory(GiB)": 302.58, + "step": 61280, + "train_speed(iter/s)": 0.124346 + }, + { + "acc": 0.74222283, + "epoch": 0.3428186346814457, + "grad_norm": 5.5, + "learning_rate": 9.602401813934267e-06, + "loss": 0.98802376, + "memory(GiB)": 302.58, + "step": 61300, + "train_speed(iter/s)": 0.124364 + }, + { + "acc": 0.74268794, + "epoch": 0.34293048415442495, + "grad_norm": 8.6875, + "learning_rate": 9.60204037086534e-06, + "loss": 1.02089958, + "memory(GiB)": 302.58, + "step": 61320, + "train_speed(iter/s)": 0.124383 + }, + { + "acc": 0.74744172, + "epoch": 0.3430423336274042, + "grad_norm": 6.5, + "learning_rate": 9.60167877039204e-06, + "loss": 0.99560261, + "memory(GiB)": 302.58, + "step": 61340, + "train_speed(iter/s)": 0.124402 + }, + { + "acc": 0.71999502, + "epoch": 0.3431541831003835, + "grad_norm": 4.5625, + "learning_rate": 9.601317012526734e-06, + "loss": 1.09761438, + "memory(GiB)": 302.58, + "step": 61360, + "train_speed(iter/s)": 0.124419 + }, + { + "acc": 0.74266591, + "epoch": 0.34326603257336274, + "grad_norm": 7.59375, + "learning_rate": 9.600955097281794e-06, + "loss": 0.99349051, + "memory(GiB)": 302.58, + "step": 61380, + "train_speed(iter/s)": 0.124438 + }, + { + "acc": 0.73317294, + "epoch": 0.343377882046342, + "grad_norm": 8.0, + "learning_rate": 9.600593024669604e-06, + "loss": 1.04269743, + "memory(GiB)": 302.58, + "step": 61400, + "train_speed(iter/s)": 0.124458 + }, + { + "acc": 0.75937891, + "epoch": 0.3434897315193213, + "grad_norm": 5.875, + "learning_rate": 9.600230794702542e-06, + "loss": 0.92128468, + "memory(GiB)": 302.58, + "step": 61420, + "train_speed(iter/s)": 0.124477 + }, + { + "acc": 0.72331414, + "epoch": 0.34360158099230054, + "grad_norm": 5.75, + "learning_rate": 9.599868407393e-06, + "loss": 1.11191883, + "memory(GiB)": 302.58, + "step": 61440, + "train_speed(iter/s)": 0.124496 + }, + { + "acc": 0.74049664, + "epoch": 0.3437134304652798, + "grad_norm": 7.25, + "learning_rate": 9.599505862753376e-06, + "loss": 1.03247452, + "memory(GiB)": 302.58, + "step": 61460, + "train_speed(iter/s)": 0.124516 + }, + { + "acc": 0.73581567, + "epoch": 0.34382527993825907, + "grad_norm": 6.46875, + "learning_rate": 9.599143160796063e-06, + "loss": 1.05260973, + "memory(GiB)": 302.58, + "step": 61480, + "train_speed(iter/s)": 0.124535 + }, + { + "acc": 0.72248898, + "epoch": 0.34393712941123833, + "grad_norm": 7.03125, + "learning_rate": 9.598780301533472e-06, + "loss": 1.11518812, + "memory(GiB)": 302.58, + "step": 61500, + "train_speed(iter/s)": 0.124553 + }, + { + "acc": 0.7191905, + "epoch": 0.3440489788842176, + "grad_norm": 8.875, + "learning_rate": 9.598417284978012e-06, + "loss": 1.0913764, + "memory(GiB)": 302.58, + "step": 61520, + "train_speed(iter/s)": 0.124572 + }, + { + "acc": 0.73471866, + "epoch": 0.34416082835719686, + "grad_norm": 8.375, + "learning_rate": 9.5980541111421e-06, + "loss": 1.06016207, + "memory(GiB)": 302.58, + "step": 61540, + "train_speed(iter/s)": 0.124591 + }, + { + "acc": 0.72983685, + "epoch": 0.3442726778301761, + "grad_norm": 6.28125, + "learning_rate": 9.597690780038156e-06, + "loss": 1.0804985, + "memory(GiB)": 302.58, + "step": 61560, + "train_speed(iter/s)": 0.124611 + }, + { + "acc": 0.74578619, + "epoch": 0.34438452730315544, + "grad_norm": 9.125, + "learning_rate": 9.59732729167861e-06, + "loss": 0.9660985, + "memory(GiB)": 302.58, + "step": 61580, + "train_speed(iter/s)": 0.12463 + }, + { + "acc": 0.73525667, + "epoch": 0.3444963767761347, + "grad_norm": 8.375, + "learning_rate": 9.596963646075892e-06, + "loss": 1.04359045, + "memory(GiB)": 302.58, + "step": 61600, + "train_speed(iter/s)": 0.124649 + }, + { + "acc": 0.72452993, + "epoch": 0.34460822624911397, + "grad_norm": 5.59375, + "learning_rate": 9.596599843242442e-06, + "loss": 1.11245003, + "memory(GiB)": 302.58, + "step": 61620, + "train_speed(iter/s)": 0.124666 + }, + { + "acc": 0.72060919, + "epoch": 0.34472007572209323, + "grad_norm": 5.5, + "learning_rate": 9.5962358831907e-06, + "loss": 1.1202734, + "memory(GiB)": 302.58, + "step": 61640, + "train_speed(iter/s)": 0.124685 + }, + { + "acc": 0.72182579, + "epoch": 0.3448319251950725, + "grad_norm": 7.875, + "learning_rate": 9.595871765933118e-06, + "loss": 1.10294895, + "memory(GiB)": 302.58, + "step": 61660, + "train_speed(iter/s)": 0.124704 + }, + { + "acc": 0.73188071, + "epoch": 0.34494377466805176, + "grad_norm": 5.90625, + "learning_rate": 9.595507491482147e-06, + "loss": 1.06498661, + "memory(GiB)": 302.58, + "step": 61680, + "train_speed(iter/s)": 0.124723 + }, + { + "acc": 0.73026304, + "epoch": 0.345055624141031, + "grad_norm": 5.9375, + "learning_rate": 9.595143059850249e-06, + "loss": 1.07336006, + "memory(GiB)": 302.58, + "step": 61700, + "train_speed(iter/s)": 0.124741 + }, + { + "acc": 0.74332633, + "epoch": 0.3451674736140103, + "grad_norm": 7.3125, + "learning_rate": 9.594778471049889e-06, + "loss": 1.00159292, + "memory(GiB)": 302.58, + "step": 61720, + "train_speed(iter/s)": 0.124759 + }, + { + "acc": 0.72548103, + "epoch": 0.34527932308698955, + "grad_norm": 5.5, + "learning_rate": 9.594413725093535e-06, + "loss": 1.08477669, + "memory(GiB)": 302.58, + "step": 61740, + "train_speed(iter/s)": 0.124778 + }, + { + "acc": 0.75081658, + "epoch": 0.3453911725599688, + "grad_norm": 8.0, + "learning_rate": 9.594048821993662e-06, + "loss": 0.95232401, + "memory(GiB)": 302.58, + "step": 61760, + "train_speed(iter/s)": 0.124795 + }, + { + "acc": 0.71442294, + "epoch": 0.3455030220329481, + "grad_norm": 4.65625, + "learning_rate": 9.593683761762751e-06, + "loss": 1.14651241, + "memory(GiB)": 302.58, + "step": 61780, + "train_speed(iter/s)": 0.124814 + }, + { + "acc": 0.73595147, + "epoch": 0.34561487150592735, + "grad_norm": 8.5625, + "learning_rate": 9.59331854441329e-06, + "loss": 1.03295994, + "memory(GiB)": 302.58, + "step": 61800, + "train_speed(iter/s)": 0.124833 + }, + { + "acc": 0.72616806, + "epoch": 0.3457267209789066, + "grad_norm": 9.5, + "learning_rate": 9.59295316995777e-06, + "loss": 1.09850483, + "memory(GiB)": 302.58, + "step": 61820, + "train_speed(iter/s)": 0.124852 + }, + { + "acc": 0.73017011, + "epoch": 0.3458385704518859, + "grad_norm": 7.8125, + "learning_rate": 9.592587638408688e-06, + "loss": 1.05172071, + "memory(GiB)": 302.58, + "step": 61840, + "train_speed(iter/s)": 0.124871 + }, + { + "acc": 0.73119903, + "epoch": 0.34595041992486514, + "grad_norm": 7.59375, + "learning_rate": 9.592221949778547e-06, + "loss": 1.0617651, + "memory(GiB)": 302.58, + "step": 61860, + "train_speed(iter/s)": 0.124889 + }, + { + "acc": 0.72363448, + "epoch": 0.3460622693978444, + "grad_norm": 7.96875, + "learning_rate": 9.591856104079851e-06, + "loss": 1.08218079, + "memory(GiB)": 302.58, + "step": 61880, + "train_speed(iter/s)": 0.124909 + }, + { + "acc": 0.7605176, + "epoch": 0.34617411887082367, + "grad_norm": 7.4375, + "learning_rate": 9.591490101325117e-06, + "loss": 0.9627738, + "memory(GiB)": 302.58, + "step": 61900, + "train_speed(iter/s)": 0.124926 + }, + { + "acc": 0.73600645, + "epoch": 0.34628596834380293, + "grad_norm": 8.75, + "learning_rate": 9.591123941526864e-06, + "loss": 1.0138175, + "memory(GiB)": 302.58, + "step": 61920, + "train_speed(iter/s)": 0.124945 + }, + { + "acc": 0.72790732, + "epoch": 0.3463978178167822, + "grad_norm": 6.46875, + "learning_rate": 9.590757624697612e-06, + "loss": 1.07666483, + "memory(GiB)": 302.58, + "step": 61940, + "train_speed(iter/s)": 0.124964 + }, + { + "acc": 0.7301753, + "epoch": 0.34650966728976146, + "grad_norm": 6.40625, + "learning_rate": 9.590391150849894e-06, + "loss": 1.04441719, + "memory(GiB)": 302.58, + "step": 61960, + "train_speed(iter/s)": 0.124981 + }, + { + "acc": 0.7112289, + "epoch": 0.3466215167627407, + "grad_norm": 4.40625, + "learning_rate": 9.590024519996242e-06, + "loss": 1.15725317, + "memory(GiB)": 302.58, + "step": 61980, + "train_speed(iter/s)": 0.125 + }, + { + "acc": 0.73421221, + "epoch": 0.34673336623572, + "grad_norm": 5.25, + "learning_rate": 9.589657732149197e-06, + "loss": 1.05191708, + "memory(GiB)": 302.58, + "step": 62000, + "train_speed(iter/s)": 0.125018 + }, + { + "epoch": 0.34673336623572, + "eval_acc": 0.6992562298074201, + "eval_loss": 1.0459544658660889, + "eval_runtime": 7499.9067, + "eval_samples_per_second": 10.038, + "eval_steps_per_second": 10.038, + "step": 62000 + }, + { + "acc": 0.74118428, + "epoch": 0.34684521570869925, + "grad_norm": 7.6875, + "learning_rate": 9.589290787321306e-06, + "loss": 0.99310484, + "memory(GiB)": 302.58, + "step": 62020, + "train_speed(iter/s)": 0.123144 + }, + { + "acc": 0.72416539, + "epoch": 0.3469570651816785, + "grad_norm": 4.65625, + "learning_rate": 9.588923685525115e-06, + "loss": 1.10122776, + "memory(GiB)": 302.58, + "step": 62040, + "train_speed(iter/s)": 0.123162 + }, + { + "acc": 0.73941312, + "epoch": 0.3470689146546578, + "grad_norm": 6.59375, + "learning_rate": 9.588556426773183e-06, + "loss": 1.04500418, + "memory(GiB)": 302.58, + "step": 62060, + "train_speed(iter/s)": 0.123181 + }, + { + "acc": 0.72264433, + "epoch": 0.34718076412763704, + "grad_norm": 5.90625, + "learning_rate": 9.588189011078074e-06, + "loss": 1.08668976, + "memory(GiB)": 302.58, + "step": 62080, + "train_speed(iter/s)": 0.1232 + }, + { + "acc": 0.74269581, + "epoch": 0.3472926136006163, + "grad_norm": 6.90625, + "learning_rate": 9.587821438452349e-06, + "loss": 1.01051331, + "memory(GiB)": 302.58, + "step": 62100, + "train_speed(iter/s)": 0.123219 + }, + { + "acc": 0.72859874, + "epoch": 0.34740446307359557, + "grad_norm": 7.78125, + "learning_rate": 9.587453708908585e-06, + "loss": 1.07147799, + "memory(GiB)": 302.58, + "step": 62120, + "train_speed(iter/s)": 0.123236 + }, + { + "acc": 0.73366413, + "epoch": 0.34751631254657483, + "grad_norm": 6.625, + "learning_rate": 9.587085822459358e-06, + "loss": 1.04194593, + "memory(GiB)": 302.58, + "step": 62140, + "train_speed(iter/s)": 0.123254 + }, + { + "acc": 0.73716021, + "epoch": 0.3476281620195541, + "grad_norm": 6.65625, + "learning_rate": 9.58671777911725e-06, + "loss": 1.04723787, + "memory(GiB)": 302.58, + "step": 62160, + "train_speed(iter/s)": 0.123272 + }, + { + "acc": 0.74194555, + "epoch": 0.34774001149253336, + "grad_norm": 5.9375, + "learning_rate": 9.58634957889485e-06, + "loss": 1.01475687, + "memory(GiB)": 302.58, + "step": 62180, + "train_speed(iter/s)": 0.123292 + }, + { + "acc": 0.72834239, + "epoch": 0.3478518609655126, + "grad_norm": 5.34375, + "learning_rate": 9.58598122180475e-06, + "loss": 1.06624804, + "memory(GiB)": 302.58, + "step": 62200, + "train_speed(iter/s)": 0.123311 + }, + { + "acc": 0.73587971, + "epoch": 0.3479637104384919, + "grad_norm": 7.625, + "learning_rate": 9.585612707859552e-06, + "loss": 1.0207222, + "memory(GiB)": 302.58, + "step": 62220, + "train_speed(iter/s)": 0.12333 + }, + { + "acc": 0.74218216, + "epoch": 0.34807555991147116, + "grad_norm": 5.4375, + "learning_rate": 9.585244037071859e-06, + "loss": 0.99725351, + "memory(GiB)": 302.58, + "step": 62240, + "train_speed(iter/s)": 0.123349 + }, + { + "acc": 0.73769732, + "epoch": 0.3481874093844504, + "grad_norm": 9.1875, + "learning_rate": 9.58487520945428e-06, + "loss": 1.0247282, + "memory(GiB)": 302.58, + "step": 62260, + "train_speed(iter/s)": 0.123368 + }, + { + "acc": 0.73456464, + "epoch": 0.3482992588574297, + "grad_norm": 6.84375, + "learning_rate": 9.58450622501943e-06, + "loss": 1.03931437, + "memory(GiB)": 302.58, + "step": 62280, + "train_speed(iter/s)": 0.123387 + }, + { + "acc": 0.74203358, + "epoch": 0.34841110833040895, + "grad_norm": 7.28125, + "learning_rate": 9.58413708377993e-06, + "loss": 0.99966974, + "memory(GiB)": 302.58, + "step": 62300, + "train_speed(iter/s)": 0.123406 + }, + { + "acc": 0.72710824, + "epoch": 0.3485229578033882, + "grad_norm": 7.875, + "learning_rate": 9.583767785748407e-06, + "loss": 1.09635925, + "memory(GiB)": 302.58, + "step": 62320, + "train_speed(iter/s)": 0.123423 + }, + { + "acc": 0.73744307, + "epoch": 0.3486348072763675, + "grad_norm": 6.125, + "learning_rate": 9.58339833093749e-06, + "loss": 1.0379961, + "memory(GiB)": 302.58, + "step": 62340, + "train_speed(iter/s)": 0.123442 + }, + { + "acc": 0.73925114, + "epoch": 0.34874665674934674, + "grad_norm": 9.4375, + "learning_rate": 9.583028719359816e-06, + "loss": 1.01746655, + "memory(GiB)": 302.58, + "step": 62360, + "train_speed(iter/s)": 0.123461 + }, + { + "acc": 0.72181025, + "epoch": 0.348858506222326, + "grad_norm": 6.5625, + "learning_rate": 9.58265895102803e-06, + "loss": 1.0794445, + "memory(GiB)": 302.58, + "step": 62380, + "train_speed(iter/s)": 0.12348 + }, + { + "acc": 0.7374979, + "epoch": 0.34897035569530527, + "grad_norm": 5.75, + "learning_rate": 9.582289025954775e-06, + "loss": 1.02575922, + "memory(GiB)": 302.58, + "step": 62400, + "train_speed(iter/s)": 0.123498 + }, + { + "acc": 0.74587383, + "epoch": 0.34908220516828453, + "grad_norm": 4.90625, + "learning_rate": 9.581918944152705e-06, + "loss": 1.01658096, + "memory(GiB)": 302.58, + "step": 62420, + "train_speed(iter/s)": 0.123517 + }, + { + "acc": 0.74303432, + "epoch": 0.3491940546412638, + "grad_norm": 11.125, + "learning_rate": 9.581548705634477e-06, + "loss": 1.01444893, + "memory(GiB)": 302.58, + "step": 62440, + "train_speed(iter/s)": 0.123536 + }, + { + "acc": 0.73164468, + "epoch": 0.34930590411424306, + "grad_norm": 4.40625, + "learning_rate": 9.58117831041276e-06, + "loss": 1.07589254, + "memory(GiB)": 302.58, + "step": 62460, + "train_speed(iter/s)": 0.123555 + }, + { + "acc": 0.73529954, + "epoch": 0.3494177535872223, + "grad_norm": 7.125, + "learning_rate": 9.580807758500214e-06, + "loss": 1.0543601, + "memory(GiB)": 302.58, + "step": 62480, + "train_speed(iter/s)": 0.123573 + }, + { + "acc": 0.73048081, + "epoch": 0.3495296030602016, + "grad_norm": 6.75, + "learning_rate": 9.58043704990952e-06, + "loss": 1.05834913, + "memory(GiB)": 302.58, + "step": 62500, + "train_speed(iter/s)": 0.123592 + }, + { + "acc": 0.74573936, + "epoch": 0.34964145253318085, + "grad_norm": 7.15625, + "learning_rate": 9.580066184653352e-06, + "loss": 0.98303204, + "memory(GiB)": 302.58, + "step": 62520, + "train_speed(iter/s)": 0.12361 + }, + { + "acc": 0.7503633, + "epoch": 0.3497533020061601, + "grad_norm": 7.0, + "learning_rate": 9.5796951627444e-06, + "loss": 0.97492304, + "memory(GiB)": 302.58, + "step": 62540, + "train_speed(iter/s)": 0.123628 + }, + { + "acc": 0.7302527, + "epoch": 0.3498651514791394, + "grad_norm": 7.40625, + "learning_rate": 9.57932398419535e-06, + "loss": 1.05963144, + "memory(GiB)": 302.58, + "step": 62560, + "train_speed(iter/s)": 0.123647 + }, + { + "acc": 0.75438676, + "epoch": 0.34997700095211864, + "grad_norm": 5.53125, + "learning_rate": 9.578952649018899e-06, + "loss": 0.94908619, + "memory(GiB)": 302.58, + "step": 62580, + "train_speed(iter/s)": 0.123666 + }, + { + "acc": 0.74378438, + "epoch": 0.3500888504250979, + "grad_norm": 7.8125, + "learning_rate": 9.578581157227748e-06, + "loss": 1.04629984, + "memory(GiB)": 302.58, + "step": 62600, + "train_speed(iter/s)": 0.123684 + }, + { + "acc": 0.72580495, + "epoch": 0.3502006998980772, + "grad_norm": 6.40625, + "learning_rate": 9.578209508834605e-06, + "loss": 1.09231186, + "memory(GiB)": 302.58, + "step": 62620, + "train_speed(iter/s)": 0.123701 + }, + { + "acc": 0.72848792, + "epoch": 0.35031254937105644, + "grad_norm": 7.65625, + "learning_rate": 9.577837703852178e-06, + "loss": 1.0727746, + "memory(GiB)": 302.58, + "step": 62640, + "train_speed(iter/s)": 0.123719 + }, + { + "acc": 0.74724951, + "epoch": 0.3504243988440357, + "grad_norm": 8.875, + "learning_rate": 9.577465742293187e-06, + "loss": 0.99052114, + "memory(GiB)": 302.58, + "step": 62660, + "train_speed(iter/s)": 0.123738 + }, + { + "acc": 0.73770185, + "epoch": 0.35053624831701496, + "grad_norm": 8.25, + "learning_rate": 9.57709362417035e-06, + "loss": 1.0246151, + "memory(GiB)": 302.58, + "step": 62680, + "train_speed(iter/s)": 0.123758 + }, + { + "acc": 0.73773241, + "epoch": 0.35064809778999423, + "grad_norm": 4.28125, + "learning_rate": 9.576721349496399e-06, + "loss": 1.02932768, + "memory(GiB)": 302.58, + "step": 62700, + "train_speed(iter/s)": 0.123776 + }, + { + "acc": 0.73874855, + "epoch": 0.3507599472629735, + "grad_norm": 6.65625, + "learning_rate": 9.576348918284066e-06, + "loss": 0.99639626, + "memory(GiB)": 302.58, + "step": 62720, + "train_speed(iter/s)": 0.123794 + }, + { + "acc": 0.7350122, + "epoch": 0.35087179673595276, + "grad_norm": 5.3125, + "learning_rate": 9.575976330546089e-06, + "loss": 1.04062796, + "memory(GiB)": 302.58, + "step": 62740, + "train_speed(iter/s)": 0.123813 + }, + { + "acc": 0.7418293, + "epoch": 0.350983646208932, + "grad_norm": 10.5625, + "learning_rate": 9.57560358629521e-06, + "loss": 1.00467501, + "memory(GiB)": 302.58, + "step": 62760, + "train_speed(iter/s)": 0.123832 + }, + { + "acc": 0.73970661, + "epoch": 0.3510954956819113, + "grad_norm": 8.6875, + "learning_rate": 9.575230685544181e-06, + "loss": 1.04189548, + "memory(GiB)": 302.58, + "step": 62780, + "train_speed(iter/s)": 0.123851 + }, + { + "acc": 0.73714566, + "epoch": 0.35120734515489055, + "grad_norm": 7.40625, + "learning_rate": 9.574857628305755e-06, + "loss": 1.05293865, + "memory(GiB)": 302.58, + "step": 62800, + "train_speed(iter/s)": 0.123868 + }, + { + "acc": 0.72841129, + "epoch": 0.3513191946278698, + "grad_norm": 6.84375, + "learning_rate": 9.574484414592691e-06, + "loss": 1.05571089, + "memory(GiB)": 302.58, + "step": 62820, + "train_speed(iter/s)": 0.123886 + }, + { + "acc": 0.73198724, + "epoch": 0.3514310441008491, + "grad_norm": 7.59375, + "learning_rate": 9.574111044417754e-06, + "loss": 1.05403137, + "memory(GiB)": 302.58, + "step": 62840, + "train_speed(iter/s)": 0.123905 + }, + { + "acc": 0.72722034, + "epoch": 0.35154289357382834, + "grad_norm": 8.1875, + "learning_rate": 9.573737517793716e-06, + "loss": 1.07003069, + "memory(GiB)": 302.58, + "step": 62860, + "train_speed(iter/s)": 0.123923 + }, + { + "acc": 0.72769952, + "epoch": 0.3516547430468076, + "grad_norm": 6.1875, + "learning_rate": 9.573363834733352e-06, + "loss": 1.08045626, + "memory(GiB)": 302.58, + "step": 62880, + "train_speed(iter/s)": 0.123941 + }, + { + "acc": 0.73532858, + "epoch": 0.35176659251978687, + "grad_norm": 8.3125, + "learning_rate": 9.572989995249443e-06, + "loss": 1.01857986, + "memory(GiB)": 302.58, + "step": 62900, + "train_speed(iter/s)": 0.123961 + }, + { + "acc": 0.73429441, + "epoch": 0.35187844199276613, + "grad_norm": 6.25, + "learning_rate": 9.572615999354775e-06, + "loss": 1.02995186, + "memory(GiB)": 302.58, + "step": 62920, + "train_speed(iter/s)": 0.123979 + }, + { + "acc": 0.73164086, + "epoch": 0.3519902914657454, + "grad_norm": 6.15625, + "learning_rate": 9.572241847062143e-06, + "loss": 1.07421465, + "memory(GiB)": 302.58, + "step": 62940, + "train_speed(iter/s)": 0.123997 + }, + { + "acc": 0.74530277, + "epoch": 0.35210214093872466, + "grad_norm": 6.0625, + "learning_rate": 9.571867538384338e-06, + "loss": 1.00288515, + "memory(GiB)": 302.58, + "step": 62960, + "train_speed(iter/s)": 0.124015 + }, + { + "acc": 0.73629923, + "epoch": 0.3522139904117039, + "grad_norm": 7.46875, + "learning_rate": 9.571493073334168e-06, + "loss": 1.03945322, + "memory(GiB)": 302.58, + "step": 62980, + "train_speed(iter/s)": 0.124033 + }, + { + "acc": 0.72719774, + "epoch": 0.3523258398846832, + "grad_norm": 5.96875, + "learning_rate": 9.571118451924439e-06, + "loss": 1.0967701, + "memory(GiB)": 302.58, + "step": 63000, + "train_speed(iter/s)": 0.124052 + }, + { + "acc": 0.71985831, + "epoch": 0.35243768935766245, + "grad_norm": 8.6875, + "learning_rate": 9.570743674167964e-06, + "loss": 1.10948019, + "memory(GiB)": 302.58, + "step": 63020, + "train_speed(iter/s)": 0.12407 + }, + { + "acc": 0.74069428, + "epoch": 0.3525495388306417, + "grad_norm": 6.90625, + "learning_rate": 9.570368740077563e-06, + "loss": 1.0156044, + "memory(GiB)": 302.58, + "step": 63040, + "train_speed(iter/s)": 0.124089 + }, + { + "acc": 0.72014575, + "epoch": 0.352661388303621, + "grad_norm": 6.25, + "learning_rate": 9.569993649666058e-06, + "loss": 1.11140947, + "memory(GiB)": 302.58, + "step": 63060, + "train_speed(iter/s)": 0.124105 + }, + { + "acc": 0.74416652, + "epoch": 0.35277323777660025, + "grad_norm": 6.875, + "learning_rate": 9.569618402946278e-06, + "loss": 1.00843592, + "memory(GiB)": 302.58, + "step": 63080, + "train_speed(iter/s)": 0.124123 + }, + { + "acc": 0.73265047, + "epoch": 0.3528850872495795, + "grad_norm": 5.53125, + "learning_rate": 9.56924299993106e-06, + "loss": 1.05291586, + "memory(GiB)": 302.58, + "step": 63100, + "train_speed(iter/s)": 0.124141 + }, + { + "acc": 0.73005266, + "epoch": 0.3529969367225588, + "grad_norm": 9.4375, + "learning_rate": 9.568867440633243e-06, + "loss": 1.0536622, + "memory(GiB)": 302.58, + "step": 63120, + "train_speed(iter/s)": 0.12416 + }, + { + "acc": 0.73643932, + "epoch": 0.35310878619553804, + "grad_norm": 5.125, + "learning_rate": 9.568491725065672e-06, + "loss": 1.0469595, + "memory(GiB)": 302.58, + "step": 63140, + "train_speed(iter/s)": 0.124179 + }, + { + "acc": 0.7344882, + "epoch": 0.3532206356685173, + "grad_norm": 6.28125, + "learning_rate": 9.568115853241197e-06, + "loss": 1.04001999, + "memory(GiB)": 302.58, + "step": 63160, + "train_speed(iter/s)": 0.124194 + }, + { + "acc": 0.72995129, + "epoch": 0.35333248514149657, + "grad_norm": 7.96875, + "learning_rate": 9.567739825172675e-06, + "loss": 1.06274853, + "memory(GiB)": 302.58, + "step": 63180, + "train_speed(iter/s)": 0.124212 + }, + { + "acc": 0.73054705, + "epoch": 0.35344433461447583, + "grad_norm": 8.1875, + "learning_rate": 9.56736364087297e-06, + "loss": 1.05593538, + "memory(GiB)": 302.58, + "step": 63200, + "train_speed(iter/s)": 0.12423 + }, + { + "acc": 0.7353868, + "epoch": 0.3535561840874551, + "grad_norm": 6.4375, + "learning_rate": 9.56698730035494e-06, + "loss": 1.04968548, + "memory(GiB)": 302.58, + "step": 63220, + "train_speed(iter/s)": 0.124249 + }, + { + "acc": 0.73245707, + "epoch": 0.35366803356043436, + "grad_norm": 9.3125, + "learning_rate": 9.566610803631467e-06, + "loss": 1.0652463, + "memory(GiB)": 302.58, + "step": 63240, + "train_speed(iter/s)": 0.124267 + }, + { + "acc": 0.73828363, + "epoch": 0.3537798830334136, + "grad_norm": 7.53125, + "learning_rate": 9.566234150715421e-06, + "loss": 1.00775118, + "memory(GiB)": 302.58, + "step": 63260, + "train_speed(iter/s)": 0.124283 + }, + { + "acc": 0.73030019, + "epoch": 0.3538917325063929, + "grad_norm": 7.65625, + "learning_rate": 9.56585734161969e-06, + "loss": 1.0674366, + "memory(GiB)": 302.58, + "step": 63280, + "train_speed(iter/s)": 0.124299 + }, + { + "acc": 0.72677388, + "epoch": 0.35400358197937215, + "grad_norm": 6.375, + "learning_rate": 9.565480376357161e-06, + "loss": 1.07630215, + "memory(GiB)": 302.58, + "step": 63300, + "train_speed(iter/s)": 0.124317 + }, + { + "acc": 0.72547793, + "epoch": 0.3541154314523514, + "grad_norm": 7.875, + "learning_rate": 9.565103254940724e-06, + "loss": 1.08922205, + "memory(GiB)": 302.58, + "step": 63320, + "train_speed(iter/s)": 0.124335 + }, + { + "acc": 0.74041848, + "epoch": 0.3542272809253307, + "grad_norm": 6.9375, + "learning_rate": 9.564725977383283e-06, + "loss": 1.00687628, + "memory(GiB)": 302.58, + "step": 63340, + "train_speed(iter/s)": 0.124353 + }, + { + "acc": 0.73721161, + "epoch": 0.35433913039830994, + "grad_norm": 9.875, + "learning_rate": 9.564348543697737e-06, + "loss": 1.02665529, + "memory(GiB)": 302.58, + "step": 63360, + "train_speed(iter/s)": 0.124371 + }, + { + "acc": 0.73955021, + "epoch": 0.3544509798712892, + "grad_norm": 4.375, + "learning_rate": 9.563970953896999e-06, + "loss": 1.01685057, + "memory(GiB)": 302.58, + "step": 63380, + "train_speed(iter/s)": 0.124389 + }, + { + "acc": 0.73466425, + "epoch": 0.35456282934426847, + "grad_norm": 6.3125, + "learning_rate": 9.56359320799398e-06, + "loss": 1.04389143, + "memory(GiB)": 302.58, + "step": 63400, + "train_speed(iter/s)": 0.124408 + }, + { + "acc": 0.74731388, + "epoch": 0.35467467881724773, + "grad_norm": 7.71875, + "learning_rate": 9.563215306001606e-06, + "loss": 1.00489063, + "memory(GiB)": 302.58, + "step": 63420, + "train_speed(iter/s)": 0.124427 + }, + { + "acc": 0.7287066, + "epoch": 0.354786528290227, + "grad_norm": 5.5, + "learning_rate": 9.562837247932795e-06, + "loss": 1.07672358, + "memory(GiB)": 302.58, + "step": 63440, + "train_speed(iter/s)": 0.124445 + }, + { + "acc": 0.72741652, + "epoch": 0.35489837776320626, + "grad_norm": 5.4375, + "learning_rate": 9.562459033800484e-06, + "loss": 1.0746191, + "memory(GiB)": 302.58, + "step": 63460, + "train_speed(iter/s)": 0.124464 + }, + { + "acc": 0.73908558, + "epoch": 0.3550102272361855, + "grad_norm": 10.875, + "learning_rate": 9.562080663617608e-06, + "loss": 1.0289135, + "memory(GiB)": 302.58, + "step": 63480, + "train_speed(iter/s)": 0.124481 + }, + { + "acc": 0.7304935, + "epoch": 0.3551220767091648, + "grad_norm": 6.5, + "learning_rate": 9.561702137397107e-06, + "loss": 1.05940714, + "memory(GiB)": 302.58, + "step": 63500, + "train_speed(iter/s)": 0.1245 + }, + { + "acc": 0.71776447, + "epoch": 0.35523392618214406, + "grad_norm": 5.90625, + "learning_rate": 9.561323455151925e-06, + "loss": 1.13142061, + "memory(GiB)": 302.58, + "step": 63520, + "train_speed(iter/s)": 0.124518 + }, + { + "acc": 0.74302149, + "epoch": 0.3553457756551233, + "grad_norm": 7.40625, + "learning_rate": 9.560944616895021e-06, + "loss": 0.98576794, + "memory(GiB)": 302.58, + "step": 63540, + "train_speed(iter/s)": 0.124536 + }, + { + "acc": 0.75047636, + "epoch": 0.3554576251281026, + "grad_norm": 5.78125, + "learning_rate": 9.560565622639346e-06, + "loss": 0.97933626, + "memory(GiB)": 302.58, + "step": 63560, + "train_speed(iter/s)": 0.124554 + }, + { + "acc": 0.73944049, + "epoch": 0.35556947460108185, + "grad_norm": 7.59375, + "learning_rate": 9.560186472397868e-06, + "loss": 1.04311848, + "memory(GiB)": 302.58, + "step": 63580, + "train_speed(iter/s)": 0.124573 + }, + { + "acc": 0.73910813, + "epoch": 0.3556813240740611, + "grad_norm": 5.75, + "learning_rate": 9.55980716618355e-06, + "loss": 1.01739826, + "memory(GiB)": 302.58, + "step": 63600, + "train_speed(iter/s)": 0.124592 + }, + { + "acc": 0.72476788, + "epoch": 0.3557931735470404, + "grad_norm": 7.5, + "learning_rate": 9.55942770400937e-06, + "loss": 1.09720001, + "memory(GiB)": 302.58, + "step": 63620, + "train_speed(iter/s)": 0.124609 + }, + { + "acc": 0.75258622, + "epoch": 0.35590502302001964, + "grad_norm": 7.6875, + "learning_rate": 9.559048085888305e-06, + "loss": 0.9678091, + "memory(GiB)": 302.58, + "step": 63640, + "train_speed(iter/s)": 0.124627 + }, + { + "acc": 0.7231904, + "epoch": 0.3560168724929989, + "grad_norm": 5.8125, + "learning_rate": 9.55866831183334e-06, + "loss": 1.10259809, + "memory(GiB)": 302.58, + "step": 63660, + "train_speed(iter/s)": 0.124646 + }, + { + "acc": 0.73795266, + "epoch": 0.35612872196597817, + "grad_norm": 6.5625, + "learning_rate": 9.55828838185746e-06, + "loss": 1.02124672, + "memory(GiB)": 302.58, + "step": 63680, + "train_speed(iter/s)": 0.124664 + }, + { + "acc": 0.74478245, + "epoch": 0.35624057143895743, + "grad_norm": 6.5, + "learning_rate": 9.557908295973665e-06, + "loss": 0.98387108, + "memory(GiB)": 302.58, + "step": 63700, + "train_speed(iter/s)": 0.124682 + }, + { + "acc": 0.74330945, + "epoch": 0.3563524209119367, + "grad_norm": 6.53125, + "learning_rate": 9.557528054194956e-06, + "loss": 0.99681282, + "memory(GiB)": 302.58, + "step": 63720, + "train_speed(iter/s)": 0.1247 + }, + { + "acc": 0.73590355, + "epoch": 0.35646427038491596, + "grad_norm": 8.9375, + "learning_rate": 9.557147656534333e-06, + "loss": 1.04290161, + "memory(GiB)": 302.58, + "step": 63740, + "train_speed(iter/s)": 0.124717 + }, + { + "acc": 0.73444138, + "epoch": 0.3565761198578952, + "grad_norm": 6.3125, + "learning_rate": 9.556767103004811e-06, + "loss": 1.06117887, + "memory(GiB)": 302.58, + "step": 63760, + "train_speed(iter/s)": 0.124735 + }, + { + "acc": 0.72627082, + "epoch": 0.3566879693308745, + "grad_norm": 8.5, + "learning_rate": 9.556386393619406e-06, + "loss": 1.06918726, + "memory(GiB)": 302.58, + "step": 63780, + "train_speed(iter/s)": 0.124752 + }, + { + "acc": 0.73747921, + "epoch": 0.35679981880385375, + "grad_norm": 6.125, + "learning_rate": 9.556005528391136e-06, + "loss": 1.03243532, + "memory(GiB)": 302.58, + "step": 63800, + "train_speed(iter/s)": 0.124768 + }, + { + "acc": 0.72329426, + "epoch": 0.356911668276833, + "grad_norm": 8.25, + "learning_rate": 9.555624507333033e-06, + "loss": 1.13132238, + "memory(GiB)": 302.58, + "step": 63820, + "train_speed(iter/s)": 0.124786 + }, + { + "acc": 0.7481452, + "epoch": 0.3570235177498123, + "grad_norm": 10.5, + "learning_rate": 9.555243330458126e-06, + "loss": 0.98922338, + "memory(GiB)": 302.58, + "step": 63840, + "train_speed(iter/s)": 0.124803 + }, + { + "acc": 0.73568344, + "epoch": 0.35713536722279154, + "grad_norm": 8.875, + "learning_rate": 9.55486199777945e-06, + "loss": 1.02347574, + "memory(GiB)": 302.58, + "step": 63860, + "train_speed(iter/s)": 0.124822 + }, + { + "acc": 0.73485584, + "epoch": 0.3572472166957708, + "grad_norm": 7.3125, + "learning_rate": 9.554480509310054e-06, + "loss": 1.03318386, + "memory(GiB)": 302.58, + "step": 63880, + "train_speed(iter/s)": 0.124841 + }, + { + "acc": 0.738692, + "epoch": 0.3573590661687501, + "grad_norm": 7.59375, + "learning_rate": 9.55409886506298e-06, + "loss": 1.02845325, + "memory(GiB)": 302.58, + "step": 63900, + "train_speed(iter/s)": 0.124859 + }, + { + "acc": 0.72844844, + "epoch": 0.35747091564172934, + "grad_norm": 6.34375, + "learning_rate": 9.553717065051286e-06, + "loss": 1.07199888, + "memory(GiB)": 302.58, + "step": 63920, + "train_speed(iter/s)": 0.124876 + }, + { + "acc": 0.73627086, + "epoch": 0.3575827651147086, + "grad_norm": 6.4375, + "learning_rate": 9.55333510928803e-06, + "loss": 1.02616472, + "memory(GiB)": 302.58, + "step": 63940, + "train_speed(iter/s)": 0.124895 + }, + { + "acc": 0.7476439, + "epoch": 0.35769461458768786, + "grad_norm": 7.34375, + "learning_rate": 9.55295299778627e-06, + "loss": 0.99389706, + "memory(GiB)": 302.58, + "step": 63960, + "train_speed(iter/s)": 0.124912 + }, + { + "acc": 0.73759284, + "epoch": 0.35780646406066713, + "grad_norm": 4.9375, + "learning_rate": 9.552570730559085e-06, + "loss": 1.03003569, + "memory(GiB)": 302.58, + "step": 63980, + "train_speed(iter/s)": 0.12493 + }, + { + "acc": 0.74484711, + "epoch": 0.3579183135336464, + "grad_norm": 6.21875, + "learning_rate": 9.552188307619544e-06, + "loss": 0.99164352, + "memory(GiB)": 302.58, + "step": 64000, + "train_speed(iter/s)": 0.124948 + }, + { + "epoch": 0.3579183135336464, + "eval_acc": 0.699547672407869, + "eval_loss": 1.0451440811157227, + "eval_runtime": 7530.8488, + "eval_samples_per_second": 9.997, + "eval_steps_per_second": 9.997, + "step": 64000 + }, + { + "acc": 0.73639436, + "epoch": 0.35803016300662566, + "grad_norm": 7.4375, + "learning_rate": 9.551805728980728e-06, + "loss": 1.04221096, + "memory(GiB)": 302.58, + "step": 64020, + "train_speed(iter/s)": 0.123126 + }, + { + "acc": 0.72564173, + "epoch": 0.3581420124796049, + "grad_norm": 6.6875, + "learning_rate": 9.551422994655723e-06, + "loss": 1.07749062, + "memory(GiB)": 302.58, + "step": 64040, + "train_speed(iter/s)": 0.123144 + }, + { + "acc": 0.72525921, + "epoch": 0.3582538619525842, + "grad_norm": 6.34375, + "learning_rate": 9.551040104657619e-06, + "loss": 1.08905048, + "memory(GiB)": 302.58, + "step": 64060, + "train_speed(iter/s)": 0.123162 + }, + { + "acc": 0.73674607, + "epoch": 0.35836571142556345, + "grad_norm": 10.625, + "learning_rate": 9.550657058999513e-06, + "loss": 1.0309556, + "memory(GiB)": 302.58, + "step": 64080, + "train_speed(iter/s)": 0.12318 + }, + { + "acc": 0.73525257, + "epoch": 0.35847756089854277, + "grad_norm": 6.875, + "learning_rate": 9.550273857694505e-06, + "loss": 1.01692085, + "memory(GiB)": 302.58, + "step": 64100, + "train_speed(iter/s)": 0.123198 + }, + { + "acc": 0.7548912, + "epoch": 0.35858941037152203, + "grad_norm": 5.9375, + "learning_rate": 9.549890500755705e-06, + "loss": 0.97670889, + "memory(GiB)": 302.58, + "step": 64120, + "train_speed(iter/s)": 0.123216 + }, + { + "acc": 0.74733124, + "epoch": 0.3587012598445013, + "grad_norm": 7.15625, + "learning_rate": 9.54950698819622e-06, + "loss": 0.99315996, + "memory(GiB)": 302.58, + "step": 64140, + "train_speed(iter/s)": 0.123234 + }, + { + "acc": 0.74444299, + "epoch": 0.35881310931748056, + "grad_norm": 8.75, + "learning_rate": 9.54912332002917e-06, + "loss": 0.99341555, + "memory(GiB)": 302.58, + "step": 64160, + "train_speed(iter/s)": 0.123253 + }, + { + "acc": 0.7277976, + "epoch": 0.3589249587904598, + "grad_norm": 5.875, + "learning_rate": 9.548739496267678e-06, + "loss": 1.06347761, + "memory(GiB)": 302.58, + "step": 64180, + "train_speed(iter/s)": 0.123272 + }, + { + "acc": 0.73605766, + "epoch": 0.3590368082634391, + "grad_norm": 5.8125, + "learning_rate": 9.548355516924872e-06, + "loss": 1.03984184, + "memory(GiB)": 302.58, + "step": 64200, + "train_speed(iter/s)": 0.12329 + }, + { + "acc": 0.72708173, + "epoch": 0.35914865773641835, + "grad_norm": 5.15625, + "learning_rate": 9.547971382013885e-06, + "loss": 1.09374857, + "memory(GiB)": 302.58, + "step": 64220, + "train_speed(iter/s)": 0.123309 + }, + { + "acc": 0.72661462, + "epoch": 0.3592605072093976, + "grad_norm": 9.0625, + "learning_rate": 9.547587091547856e-06, + "loss": 1.07839937, + "memory(GiB)": 302.58, + "step": 64240, + "train_speed(iter/s)": 0.123327 + }, + { + "acc": 0.73713436, + "epoch": 0.3593723566823769, + "grad_norm": 8.4375, + "learning_rate": 9.547202645539928e-06, + "loss": 1.02564402, + "memory(GiB)": 302.58, + "step": 64260, + "train_speed(iter/s)": 0.123346 + }, + { + "acc": 0.74228826, + "epoch": 0.35948420615535615, + "grad_norm": 7.53125, + "learning_rate": 9.546818044003249e-06, + "loss": 1.02478561, + "memory(GiB)": 302.58, + "step": 64280, + "train_speed(iter/s)": 0.123364 + }, + { + "acc": 0.72991018, + "epoch": 0.3595960556283354, + "grad_norm": 8.625, + "learning_rate": 9.546433286950977e-06, + "loss": 1.05671606, + "memory(GiB)": 302.58, + "step": 64300, + "train_speed(iter/s)": 0.123383 + }, + { + "acc": 0.73918257, + "epoch": 0.3597079051013147, + "grad_norm": 7.25, + "learning_rate": 9.54604837439627e-06, + "loss": 1.01263523, + "memory(GiB)": 302.58, + "step": 64320, + "train_speed(iter/s)": 0.1234 + }, + { + "acc": 0.74189816, + "epoch": 0.35981975457429394, + "grad_norm": 6.34375, + "learning_rate": 9.545663306352292e-06, + "loss": 1.00492048, + "memory(GiB)": 302.58, + "step": 64340, + "train_speed(iter/s)": 0.123418 + }, + { + "acc": 0.71869493, + "epoch": 0.3599316040472732, + "grad_norm": 7.625, + "learning_rate": 9.545278082832217e-06, + "loss": 1.13615665, + "memory(GiB)": 302.58, + "step": 64360, + "train_speed(iter/s)": 0.123436 + }, + { + "acc": 0.72795386, + "epoch": 0.36004345352025247, + "grad_norm": 6.03125, + "learning_rate": 9.544892703849218e-06, + "loss": 1.07824326, + "memory(GiB)": 302.58, + "step": 64380, + "train_speed(iter/s)": 0.123454 + }, + { + "acc": 0.73534374, + "epoch": 0.36015530299323173, + "grad_norm": 6.15625, + "learning_rate": 9.544507169416478e-06, + "loss": 1.01860065, + "memory(GiB)": 302.58, + "step": 64400, + "train_speed(iter/s)": 0.123472 + }, + { + "acc": 0.73334451, + "epoch": 0.360267152466211, + "grad_norm": 7.125, + "learning_rate": 9.544121479547181e-06, + "loss": 1.04355526, + "memory(GiB)": 302.58, + "step": 64420, + "train_speed(iter/s)": 0.123491 + }, + { + "acc": 0.72355723, + "epoch": 0.36037900193919026, + "grad_norm": 5.46875, + "learning_rate": 9.543735634254522e-06, + "loss": 1.08718567, + "memory(GiB)": 302.58, + "step": 64440, + "train_speed(iter/s)": 0.123509 + }, + { + "acc": 0.74662452, + "epoch": 0.3604908514121695, + "grad_norm": 8.125, + "learning_rate": 9.543349633551696e-06, + "loss": 0.99813643, + "memory(GiB)": 302.58, + "step": 64460, + "train_speed(iter/s)": 0.123526 + }, + { + "acc": 0.7439549, + "epoch": 0.3606027008851488, + "grad_norm": 6.53125, + "learning_rate": 9.542963477451905e-06, + "loss": 1.01231556, + "memory(GiB)": 302.58, + "step": 64480, + "train_speed(iter/s)": 0.123544 + }, + { + "acc": 0.73046126, + "epoch": 0.36071455035812805, + "grad_norm": 7.5625, + "learning_rate": 9.542577165968359e-06, + "loss": 1.08869314, + "memory(GiB)": 302.58, + "step": 64500, + "train_speed(iter/s)": 0.123562 + }, + { + "acc": 0.74135861, + "epoch": 0.3608263998311073, + "grad_norm": 8.375, + "learning_rate": 9.542190699114268e-06, + "loss": 1.00654459, + "memory(GiB)": 302.58, + "step": 64520, + "train_speed(iter/s)": 0.12358 + }, + { + "acc": 0.72611399, + "epoch": 0.3609382493040866, + "grad_norm": 5.78125, + "learning_rate": 9.541804076902855e-06, + "loss": 1.09026451, + "memory(GiB)": 302.58, + "step": 64540, + "train_speed(iter/s)": 0.123598 + }, + { + "acc": 0.72949381, + "epoch": 0.36105009877706584, + "grad_norm": 6.84375, + "learning_rate": 9.54141729934734e-06, + "loss": 1.08802929, + "memory(GiB)": 302.58, + "step": 64560, + "train_speed(iter/s)": 0.123615 + }, + { + "acc": 0.73801818, + "epoch": 0.3611619482500451, + "grad_norm": 6.4375, + "learning_rate": 9.541030366460952e-06, + "loss": 1.03531122, + "memory(GiB)": 302.58, + "step": 64580, + "train_speed(iter/s)": 0.123634 + }, + { + "acc": 0.72608366, + "epoch": 0.36127379772302437, + "grad_norm": 5.53125, + "learning_rate": 9.540643278256926e-06, + "loss": 1.07256021, + "memory(GiB)": 302.58, + "step": 64600, + "train_speed(iter/s)": 0.123651 + }, + { + "acc": 0.74273424, + "epoch": 0.36138564719600363, + "grad_norm": 6.125, + "learning_rate": 9.540256034748502e-06, + "loss": 1.01633797, + "memory(GiB)": 302.58, + "step": 64620, + "train_speed(iter/s)": 0.123669 + }, + { + "acc": 0.73687997, + "epoch": 0.3614974966689829, + "grad_norm": 7.40625, + "learning_rate": 9.539868635948926e-06, + "loss": 1.03523674, + "memory(GiB)": 302.58, + "step": 64640, + "train_speed(iter/s)": 0.123688 + }, + { + "acc": 0.75594764, + "epoch": 0.36160934614196216, + "grad_norm": 5.03125, + "learning_rate": 9.539481081871447e-06, + "loss": 0.96124525, + "memory(GiB)": 302.58, + "step": 64660, + "train_speed(iter/s)": 0.123705 + }, + { + "acc": 0.72575593, + "epoch": 0.3617211956149414, + "grad_norm": 9.375, + "learning_rate": 9.53909337252932e-06, + "loss": 1.07823677, + "memory(GiB)": 302.58, + "step": 64680, + "train_speed(iter/s)": 0.123723 + }, + { + "acc": 0.73778033, + "epoch": 0.3618330450879207, + "grad_norm": 7.71875, + "learning_rate": 9.538705507935807e-06, + "loss": 1.03407679, + "memory(GiB)": 302.58, + "step": 64700, + "train_speed(iter/s)": 0.123741 + }, + { + "acc": 0.73839617, + "epoch": 0.36194489456089995, + "grad_norm": 7.21875, + "learning_rate": 9.538317488104172e-06, + "loss": 1.03120699, + "memory(GiB)": 302.58, + "step": 64720, + "train_speed(iter/s)": 0.12376 + }, + { + "acc": 0.74575014, + "epoch": 0.3620567440338792, + "grad_norm": 7.1875, + "learning_rate": 9.53792931304769e-06, + "loss": 0.99880123, + "memory(GiB)": 302.58, + "step": 64740, + "train_speed(iter/s)": 0.123778 + }, + { + "acc": 0.73560147, + "epoch": 0.3621685935068585, + "grad_norm": 5.6875, + "learning_rate": 9.537540982779636e-06, + "loss": 1.07008848, + "memory(GiB)": 302.58, + "step": 64760, + "train_speed(iter/s)": 0.123797 + }, + { + "acc": 0.73239937, + "epoch": 0.36228044297983775, + "grad_norm": 6.65625, + "learning_rate": 9.537152497313292e-06, + "loss": 1.03745604, + "memory(GiB)": 302.58, + "step": 64780, + "train_speed(iter/s)": 0.123815 + }, + { + "acc": 0.72086902, + "epoch": 0.362392292452817, + "grad_norm": 8.25, + "learning_rate": 9.536763856661945e-06, + "loss": 1.12313194, + "memory(GiB)": 302.58, + "step": 64800, + "train_speed(iter/s)": 0.123833 + }, + { + "acc": 0.71376882, + "epoch": 0.3625041419257963, + "grad_norm": 5.1875, + "learning_rate": 9.53637506083889e-06, + "loss": 1.14744148, + "memory(GiB)": 302.58, + "step": 64820, + "train_speed(iter/s)": 0.12385 + }, + { + "acc": 0.72895021, + "epoch": 0.36261599139877554, + "grad_norm": 10.6875, + "learning_rate": 9.535986109857422e-06, + "loss": 1.08274555, + "memory(GiB)": 302.58, + "step": 64840, + "train_speed(iter/s)": 0.123867 + }, + { + "acc": 0.73128657, + "epoch": 0.3627278408717548, + "grad_norm": 8.75, + "learning_rate": 9.535597003730847e-06, + "loss": 1.06207495, + "memory(GiB)": 302.58, + "step": 64860, + "train_speed(iter/s)": 0.123885 + }, + { + "acc": 0.74043059, + "epoch": 0.36283969034473407, + "grad_norm": 5.0625, + "learning_rate": 9.535207742472472e-06, + "loss": 1.00982542, + "memory(GiB)": 302.58, + "step": 64880, + "train_speed(iter/s)": 0.123903 + }, + { + "acc": 0.73002095, + "epoch": 0.36295153981771333, + "grad_norm": 6.40625, + "learning_rate": 9.534818326095611e-06, + "loss": 1.07558079, + "memory(GiB)": 302.58, + "step": 64900, + "train_speed(iter/s)": 0.123921 + }, + { + "acc": 0.73735194, + "epoch": 0.3630633892906926, + "grad_norm": 7.125, + "learning_rate": 9.534428754613585e-06, + "loss": 1.05921955, + "memory(GiB)": 302.58, + "step": 64920, + "train_speed(iter/s)": 0.123938 + }, + { + "acc": 0.72524714, + "epoch": 0.36317523876367186, + "grad_norm": 8.5, + "learning_rate": 9.534039028039717e-06, + "loss": 1.09598713, + "memory(GiB)": 302.58, + "step": 64940, + "train_speed(iter/s)": 0.123955 + }, + { + "acc": 0.72934008, + "epoch": 0.3632870882366511, + "grad_norm": 7.6875, + "learning_rate": 9.533649146387336e-06, + "loss": 1.06091108, + "memory(GiB)": 302.58, + "step": 64960, + "train_speed(iter/s)": 0.123972 + }, + { + "acc": 0.72271643, + "epoch": 0.3633989377096304, + "grad_norm": 7.125, + "learning_rate": 9.533259109669779e-06, + "loss": 1.12653856, + "memory(GiB)": 302.58, + "step": 64980, + "train_speed(iter/s)": 0.123989 + }, + { + "acc": 0.73194137, + "epoch": 0.36351078718260965, + "grad_norm": 6.96875, + "learning_rate": 9.532868917900386e-06, + "loss": 1.05843983, + "memory(GiB)": 302.58, + "step": 65000, + "train_speed(iter/s)": 0.124008 + }, + { + "acc": 0.73465872, + "epoch": 0.3636226366555889, + "grad_norm": 7.625, + "learning_rate": 9.532478571092502e-06, + "loss": 1.05239124, + "memory(GiB)": 302.58, + "step": 65020, + "train_speed(iter/s)": 0.124026 + }, + { + "acc": 0.73751354, + "epoch": 0.3637344861285682, + "grad_norm": 6.15625, + "learning_rate": 9.53208806925948e-06, + "loss": 1.03682404, + "memory(GiB)": 302.58, + "step": 65040, + "train_speed(iter/s)": 0.124045 + }, + { + "acc": 0.74792833, + "epoch": 0.36384633560154744, + "grad_norm": 5.5, + "learning_rate": 9.531697412414674e-06, + "loss": 0.99806099, + "memory(GiB)": 302.58, + "step": 65060, + "train_speed(iter/s)": 0.124062 + }, + { + "acc": 0.73959608, + "epoch": 0.3639581850745267, + "grad_norm": 8.25, + "learning_rate": 9.531306600571448e-06, + "loss": 1.05338078, + "memory(GiB)": 302.58, + "step": 65080, + "train_speed(iter/s)": 0.12408 + }, + { + "acc": 0.72899566, + "epoch": 0.36407003454750597, + "grad_norm": 4.6875, + "learning_rate": 9.530915633743168e-06, + "loss": 1.08258219, + "memory(GiB)": 302.58, + "step": 65100, + "train_speed(iter/s)": 0.124098 + }, + { + "acc": 0.72527261, + "epoch": 0.36418188402048524, + "grad_norm": 5.375, + "learning_rate": 9.530524511943205e-06, + "loss": 1.06394892, + "memory(GiB)": 302.58, + "step": 65120, + "train_speed(iter/s)": 0.124117 + }, + { + "acc": 0.74924908, + "epoch": 0.3642937334934645, + "grad_norm": 9.5625, + "learning_rate": 9.530133235184938e-06, + "loss": 0.97744665, + "memory(GiB)": 302.58, + "step": 65140, + "train_speed(iter/s)": 0.124135 + }, + { + "acc": 0.73309555, + "epoch": 0.36440558296644376, + "grad_norm": 6.4375, + "learning_rate": 9.529741803481749e-06, + "loss": 1.04779921, + "memory(GiB)": 302.58, + "step": 65160, + "train_speed(iter/s)": 0.124153 + }, + { + "acc": 0.71194973, + "epoch": 0.36451743243942303, + "grad_norm": 5.625, + "learning_rate": 9.529350216847028e-06, + "loss": 1.13143959, + "memory(GiB)": 302.58, + "step": 65180, + "train_speed(iter/s)": 0.124171 + }, + { + "acc": 0.73577456, + "epoch": 0.3646292819124023, + "grad_norm": 9.6875, + "learning_rate": 9.528958475294168e-06, + "loss": 1.0451705, + "memory(GiB)": 302.58, + "step": 65200, + "train_speed(iter/s)": 0.124189 + }, + { + "acc": 0.72977424, + "epoch": 0.36474113138538156, + "grad_norm": 5.84375, + "learning_rate": 9.528566578836566e-06, + "loss": 1.07391281, + "memory(GiB)": 302.58, + "step": 65220, + "train_speed(iter/s)": 0.124208 + }, + { + "acc": 0.74043217, + "epoch": 0.3648529808583608, + "grad_norm": 10.0625, + "learning_rate": 9.528174527487629e-06, + "loss": 1.0103816, + "memory(GiB)": 302.58, + "step": 65240, + "train_speed(iter/s)": 0.124227 + }, + { + "acc": 0.7377234, + "epoch": 0.3649648303313401, + "grad_norm": 8.6875, + "learning_rate": 9.527782321260762e-06, + "loss": 1.0337594, + "memory(GiB)": 302.58, + "step": 65260, + "train_speed(iter/s)": 0.124244 + }, + { + "acc": 0.73940349, + "epoch": 0.36507667980431935, + "grad_norm": 7.65625, + "learning_rate": 9.527389960169384e-06, + "loss": 1.02381582, + "memory(GiB)": 302.58, + "step": 65280, + "train_speed(iter/s)": 0.124262 + }, + { + "acc": 0.72263756, + "epoch": 0.3651885292772986, + "grad_norm": 6.21875, + "learning_rate": 9.526997444226913e-06, + "loss": 1.07597065, + "memory(GiB)": 302.58, + "step": 65300, + "train_speed(iter/s)": 0.12428 + }, + { + "acc": 0.73699212, + "epoch": 0.3653003787502779, + "grad_norm": 7.28125, + "learning_rate": 9.526604773446774e-06, + "loss": 1.04290028, + "memory(GiB)": 302.58, + "step": 65320, + "train_speed(iter/s)": 0.124298 + }, + { + "acc": 0.75170894, + "epoch": 0.36541222822325714, + "grad_norm": 6.09375, + "learning_rate": 9.526211947842401e-06, + "loss": 0.9784647, + "memory(GiB)": 302.58, + "step": 65340, + "train_speed(iter/s)": 0.124315 + }, + { + "acc": 0.72979245, + "epoch": 0.3655240776962364, + "grad_norm": 4.28125, + "learning_rate": 9.525818967427226e-06, + "loss": 1.04756718, + "memory(GiB)": 302.58, + "step": 65360, + "train_speed(iter/s)": 0.124333 + }, + { + "acc": 0.75743079, + "epoch": 0.36563592716921567, + "grad_norm": 9.4375, + "learning_rate": 9.525425832214689e-06, + "loss": 0.95058832, + "memory(GiB)": 302.58, + "step": 65380, + "train_speed(iter/s)": 0.124351 + }, + { + "acc": 0.74994955, + "epoch": 0.36574777664219493, + "grad_norm": 5.40625, + "learning_rate": 9.52503254221824e-06, + "loss": 0.98516006, + "memory(GiB)": 302.58, + "step": 65400, + "train_speed(iter/s)": 0.124368 + }, + { + "acc": 0.73761497, + "epoch": 0.3658596261151742, + "grad_norm": 6.5, + "learning_rate": 9.52463909745133e-06, + "loss": 1.03207178, + "memory(GiB)": 302.58, + "step": 65420, + "train_speed(iter/s)": 0.124386 + }, + { + "acc": 0.72246461, + "epoch": 0.36597147558815346, + "grad_norm": 4.90625, + "learning_rate": 9.524245497927416e-06, + "loss": 1.07985649, + "memory(GiB)": 302.58, + "step": 65440, + "train_speed(iter/s)": 0.124403 + }, + { + "acc": 0.72047954, + "epoch": 0.3660833250611327, + "grad_norm": 7.4375, + "learning_rate": 9.52385174365996e-06, + "loss": 1.09548006, + "memory(GiB)": 302.58, + "step": 65460, + "train_speed(iter/s)": 0.124421 + }, + { + "acc": 0.73614478, + "epoch": 0.366195174534112, + "grad_norm": 5.46875, + "learning_rate": 9.523457834662428e-06, + "loss": 1.05670233, + "memory(GiB)": 302.58, + "step": 65480, + "train_speed(iter/s)": 0.124438 + }, + { + "acc": 0.71814499, + "epoch": 0.36630702400709125, + "grad_norm": 6.75, + "learning_rate": 9.523063770948298e-06, + "loss": 1.11024275, + "memory(GiB)": 302.58, + "step": 65500, + "train_speed(iter/s)": 0.124456 + }, + { + "acc": 0.72911682, + "epoch": 0.3664188734800705, + "grad_norm": 5.3125, + "learning_rate": 9.522669552531041e-06, + "loss": 1.04669113, + "memory(GiB)": 302.58, + "step": 65520, + "train_speed(iter/s)": 0.124474 + }, + { + "acc": 0.7462183, + "epoch": 0.3665307229530498, + "grad_norm": 6.21875, + "learning_rate": 9.522275179424147e-06, + "loss": 1.00172539, + "memory(GiB)": 302.58, + "step": 65540, + "train_speed(iter/s)": 0.124491 + }, + { + "acc": 0.72837186, + "epoch": 0.36664257242602905, + "grad_norm": 10.4375, + "learning_rate": 9.5218806516411e-06, + "loss": 1.0936142, + "memory(GiB)": 302.58, + "step": 65560, + "train_speed(iter/s)": 0.12451 + }, + { + "acc": 0.74038367, + "epoch": 0.3667544218990083, + "grad_norm": 6.5625, + "learning_rate": 9.521485969195399e-06, + "loss": 1.01426382, + "memory(GiB)": 302.58, + "step": 65580, + "train_speed(iter/s)": 0.124527 + }, + { + "acc": 0.73869085, + "epoch": 0.3668662713719876, + "grad_norm": 5.84375, + "learning_rate": 9.521091132100538e-06, + "loss": 1.04486313, + "memory(GiB)": 302.58, + "step": 65600, + "train_speed(iter/s)": 0.124545 + }, + { + "acc": 0.75242887, + "epoch": 0.36697812084496684, + "grad_norm": 5.90625, + "learning_rate": 9.520696140370025e-06, + "loss": 0.96424866, + "memory(GiB)": 302.58, + "step": 65620, + "train_speed(iter/s)": 0.124562 + }, + { + "acc": 0.73179221, + "epoch": 0.3670899703179461, + "grad_norm": 8.8125, + "learning_rate": 9.52030099401737e-06, + "loss": 1.05151768, + "memory(GiB)": 302.58, + "step": 65640, + "train_speed(iter/s)": 0.12458 + }, + { + "acc": 0.72527876, + "epoch": 0.36720181979092537, + "grad_norm": 6.4375, + "learning_rate": 9.519905693056087e-06, + "loss": 1.10386238, + "memory(GiB)": 302.58, + "step": 65660, + "train_speed(iter/s)": 0.124598 + }, + { + "acc": 0.72976971, + "epoch": 0.36731366926390463, + "grad_norm": 6.375, + "learning_rate": 9.519510237499697e-06, + "loss": 1.08302422, + "memory(GiB)": 302.58, + "step": 65680, + "train_speed(iter/s)": 0.124614 + }, + { + "acc": 0.7165822, + "epoch": 0.3674255187368839, + "grad_norm": 5.3125, + "learning_rate": 9.519114627361725e-06, + "loss": 1.12809772, + "memory(GiB)": 302.58, + "step": 65700, + "train_speed(iter/s)": 0.124633 + }, + { + "acc": 0.73685226, + "epoch": 0.36753736820986316, + "grad_norm": 7.8125, + "learning_rate": 9.518718862655704e-06, + "loss": 1.02970181, + "memory(GiB)": 302.58, + "step": 65720, + "train_speed(iter/s)": 0.124651 + }, + { + "acc": 0.73521438, + "epoch": 0.3676492176828424, + "grad_norm": 6.09375, + "learning_rate": 9.51832294339517e-06, + "loss": 1.03086615, + "memory(GiB)": 302.58, + "step": 65740, + "train_speed(iter/s)": 0.124669 + }, + { + "acc": 0.72696438, + "epoch": 0.3677610671558217, + "grad_norm": 6.25, + "learning_rate": 9.517926869593663e-06, + "loss": 1.0720767, + "memory(GiB)": 302.58, + "step": 65760, + "train_speed(iter/s)": 0.124686 + }, + { + "acc": 0.75370255, + "epoch": 0.36787291662880095, + "grad_norm": 7.28125, + "learning_rate": 9.517530641264731e-06, + "loss": 0.96160879, + "memory(GiB)": 302.58, + "step": 65780, + "train_speed(iter/s)": 0.124705 + }, + { + "acc": 0.73977637, + "epoch": 0.3679847661017802, + "grad_norm": 6.71875, + "learning_rate": 9.517134258421928e-06, + "loss": 1.02040577, + "memory(GiB)": 302.58, + "step": 65800, + "train_speed(iter/s)": 0.124724 + }, + { + "acc": 0.73679523, + "epoch": 0.3680966155747595, + "grad_norm": 7.46875, + "learning_rate": 9.516737721078809e-06, + "loss": 1.04249907, + "memory(GiB)": 302.58, + "step": 65820, + "train_speed(iter/s)": 0.124742 + }, + { + "acc": 0.73452601, + "epoch": 0.36820846504773874, + "grad_norm": 4.59375, + "learning_rate": 9.516341029248937e-06, + "loss": 1.03213215, + "memory(GiB)": 302.58, + "step": 65840, + "train_speed(iter/s)": 0.12476 + }, + { + "acc": 0.73051867, + "epoch": 0.368320314520718, + "grad_norm": 8.0625, + "learning_rate": 9.515944182945883e-06, + "loss": 1.07484303, + "memory(GiB)": 302.58, + "step": 65860, + "train_speed(iter/s)": 0.124778 + }, + { + "acc": 0.75624242, + "epoch": 0.36843216399369727, + "grad_norm": 4.8125, + "learning_rate": 9.515547182183215e-06, + "loss": 0.96053457, + "memory(GiB)": 302.58, + "step": 65880, + "train_speed(iter/s)": 0.124795 + }, + { + "acc": 0.75037117, + "epoch": 0.36854401346667653, + "grad_norm": 6.75, + "learning_rate": 9.515150026974518e-06, + "loss": 0.95759573, + "memory(GiB)": 302.58, + "step": 65900, + "train_speed(iter/s)": 0.124813 + }, + { + "acc": 0.73320127, + "epoch": 0.3686558629396558, + "grad_norm": 6.6875, + "learning_rate": 9.514752717333371e-06, + "loss": 1.0379631, + "memory(GiB)": 302.58, + "step": 65920, + "train_speed(iter/s)": 0.124831 + }, + { + "acc": 0.71967726, + "epoch": 0.36876771241263506, + "grad_norm": 7.15625, + "learning_rate": 9.514355253273366e-06, + "loss": 1.1102746, + "memory(GiB)": 302.58, + "step": 65940, + "train_speed(iter/s)": 0.124849 + }, + { + "acc": 0.74273653, + "epoch": 0.3688795618856143, + "grad_norm": 9.0625, + "learning_rate": 9.513957634808095e-06, + "loss": 1.0134573, + "memory(GiB)": 302.58, + "step": 65960, + "train_speed(iter/s)": 0.124867 + }, + { + "acc": 0.75710573, + "epoch": 0.3689914113585936, + "grad_norm": 5.75, + "learning_rate": 9.513559861951162e-06, + "loss": 0.94520426, + "memory(GiB)": 302.58, + "step": 65980, + "train_speed(iter/s)": 0.124884 + }, + { + "acc": 0.73476529, + "epoch": 0.36910326083157285, + "grad_norm": 10.8125, + "learning_rate": 9.513161934716169e-06, + "loss": 1.04610672, + "memory(GiB)": 302.58, + "step": 66000, + "train_speed(iter/s)": 0.124902 + }, + { + "epoch": 0.36910326083157285, + "eval_acc": 0.6995568909069021, + "eval_loss": 1.044451117515564, + "eval_runtime": 7527.3369, + "eval_samples_per_second": 10.001, + "eval_steps_per_second": 10.001, + "step": 66000 + }, + { + "acc": 0.74451556, + "epoch": 0.3692151103045521, + "grad_norm": 8.5, + "learning_rate": 9.512763853116726e-06, + "loss": 1.0141901, + "memory(GiB)": 302.58, + "step": 66020, + "train_speed(iter/s)": 0.123135 + }, + { + "acc": 0.73871403, + "epoch": 0.3693269597775314, + "grad_norm": 6.875, + "learning_rate": 9.51236561716645e-06, + "loss": 1.03413057, + "memory(GiB)": 302.58, + "step": 66040, + "train_speed(iter/s)": 0.123152 + }, + { + "acc": 0.72791786, + "epoch": 0.36943880925051065, + "grad_norm": 8.25, + "learning_rate": 9.511967226878962e-06, + "loss": 1.06230068, + "memory(GiB)": 302.58, + "step": 66060, + "train_speed(iter/s)": 0.12317 + }, + { + "acc": 0.74255052, + "epoch": 0.3695506587234899, + "grad_norm": 8.4375, + "learning_rate": 9.511568682267888e-06, + "loss": 1.00320711, + "memory(GiB)": 302.58, + "step": 66080, + "train_speed(iter/s)": 0.123187 + }, + { + "acc": 0.72137575, + "epoch": 0.3696625081964692, + "grad_norm": 4.5625, + "learning_rate": 9.511169983346859e-06, + "loss": 1.10356579, + "memory(GiB)": 302.58, + "step": 66100, + "train_speed(iter/s)": 0.123204 + }, + { + "acc": 0.72811246, + "epoch": 0.36977435766944844, + "grad_norm": 6.21875, + "learning_rate": 9.510771130129512e-06, + "loss": 1.03377972, + "memory(GiB)": 302.58, + "step": 66120, + "train_speed(iter/s)": 0.123221 + }, + { + "acc": 0.73984919, + "epoch": 0.3698862071424277, + "grad_norm": 5.03125, + "learning_rate": 9.51037212262949e-06, + "loss": 1.00469589, + "memory(GiB)": 302.58, + "step": 66140, + "train_speed(iter/s)": 0.123238 + }, + { + "acc": 0.72245049, + "epoch": 0.36999805661540697, + "grad_norm": 10.0625, + "learning_rate": 9.509972960860437e-06, + "loss": 1.08710203, + "memory(GiB)": 302.58, + "step": 66160, + "train_speed(iter/s)": 0.123256 + }, + { + "acc": 0.72169175, + "epoch": 0.37010990608838623, + "grad_norm": 6.46875, + "learning_rate": 9.50957364483601e-06, + "loss": 1.10794477, + "memory(GiB)": 302.58, + "step": 66180, + "train_speed(iter/s)": 0.123274 + }, + { + "acc": 0.72323041, + "epoch": 0.3702217555613655, + "grad_norm": 7.25, + "learning_rate": 9.509174174569864e-06, + "loss": 1.10753689, + "memory(GiB)": 302.58, + "step": 66200, + "train_speed(iter/s)": 0.123291 + }, + { + "acc": 0.74091539, + "epoch": 0.37033360503434476, + "grad_norm": 8.6875, + "learning_rate": 9.508774550075663e-06, + "loss": 1.02881479, + "memory(GiB)": 302.58, + "step": 66220, + "train_speed(iter/s)": 0.123307 + }, + { + "acc": 0.73801708, + "epoch": 0.370445454507324, + "grad_norm": 6.5, + "learning_rate": 9.508374771367077e-06, + "loss": 1.03257084, + "memory(GiB)": 302.58, + "step": 66240, + "train_speed(iter/s)": 0.123324 + }, + { + "acc": 0.73515277, + "epoch": 0.3705573039803033, + "grad_norm": 5.40625, + "learning_rate": 9.507974838457777e-06, + "loss": 1.03825693, + "memory(GiB)": 302.58, + "step": 66260, + "train_speed(iter/s)": 0.123342 + }, + { + "acc": 0.72366991, + "epoch": 0.37066915345328255, + "grad_norm": 6.21875, + "learning_rate": 9.50757475136144e-06, + "loss": 1.08648586, + "memory(GiB)": 302.58, + "step": 66280, + "train_speed(iter/s)": 0.123359 + }, + { + "acc": 0.7247736, + "epoch": 0.3707810029262618, + "grad_norm": 6.3125, + "learning_rate": 9.507174510091759e-06, + "loss": 1.09199495, + "memory(GiB)": 302.58, + "step": 66300, + "train_speed(iter/s)": 0.123376 + }, + { + "acc": 0.7253191, + "epoch": 0.3708928523992411, + "grad_norm": 7.46875, + "learning_rate": 9.506774114662415e-06, + "loss": 1.08268757, + "memory(GiB)": 302.58, + "step": 66320, + "train_speed(iter/s)": 0.123393 + }, + { + "acc": 0.72798676, + "epoch": 0.37100470187222034, + "grad_norm": 7.0625, + "learning_rate": 9.506373565087106e-06, + "loss": 1.06394987, + "memory(GiB)": 302.58, + "step": 66340, + "train_speed(iter/s)": 0.123411 + }, + { + "acc": 0.74540491, + "epoch": 0.3711165513451996, + "grad_norm": 8.4375, + "learning_rate": 9.50597286137953e-06, + "loss": 1.00869532, + "memory(GiB)": 302.58, + "step": 66360, + "train_speed(iter/s)": 0.123428 + }, + { + "acc": 0.72244668, + "epoch": 0.37122840081817887, + "grad_norm": 5.25, + "learning_rate": 9.505572003553397e-06, + "loss": 1.09558392, + "memory(GiB)": 302.58, + "step": 66380, + "train_speed(iter/s)": 0.123445 + }, + { + "acc": 0.72929692, + "epoch": 0.37134025029115814, + "grad_norm": 5.78125, + "learning_rate": 9.505170991622415e-06, + "loss": 1.08145342, + "memory(GiB)": 302.58, + "step": 66400, + "train_speed(iter/s)": 0.123462 + }, + { + "acc": 0.73926501, + "epoch": 0.3714520997641374, + "grad_norm": 9.25, + "learning_rate": 9.504769825600297e-06, + "loss": 1.00514479, + "memory(GiB)": 302.58, + "step": 66420, + "train_speed(iter/s)": 0.12348 + }, + { + "acc": 0.72482948, + "epoch": 0.37156394923711666, + "grad_norm": 6.90625, + "learning_rate": 9.504368505500768e-06, + "loss": 1.08287668, + "memory(GiB)": 302.58, + "step": 66440, + "train_speed(iter/s)": 0.123497 + }, + { + "acc": 0.7392354, + "epoch": 0.37167579871009593, + "grad_norm": 7.84375, + "learning_rate": 9.503967031337553e-06, + "loss": 1.02519131, + "memory(GiB)": 302.58, + "step": 66460, + "train_speed(iter/s)": 0.123515 + }, + { + "acc": 0.73031592, + "epoch": 0.3717876481830752, + "grad_norm": 5.78125, + "learning_rate": 9.503565403124383e-06, + "loss": 1.06191759, + "memory(GiB)": 302.58, + "step": 66480, + "train_speed(iter/s)": 0.123533 + }, + { + "acc": 0.7380887, + "epoch": 0.37189949765605446, + "grad_norm": 4.21875, + "learning_rate": 9.503163620874997e-06, + "loss": 1.0331625, + "memory(GiB)": 302.58, + "step": 66500, + "train_speed(iter/s)": 0.123551 + }, + { + "acc": 0.73733888, + "epoch": 0.3720113471290337, + "grad_norm": 6.15625, + "learning_rate": 9.502761684603136e-06, + "loss": 1.03982067, + "memory(GiB)": 302.58, + "step": 66520, + "train_speed(iter/s)": 0.123568 + }, + { + "acc": 0.73602228, + "epoch": 0.372123196602013, + "grad_norm": 5.8125, + "learning_rate": 9.502359594322548e-06, + "loss": 1.04135647, + "memory(GiB)": 302.58, + "step": 66540, + "train_speed(iter/s)": 0.123584 + }, + { + "acc": 0.73449574, + "epoch": 0.37223504607499225, + "grad_norm": 5.03125, + "learning_rate": 9.501957350046984e-06, + "loss": 1.04777136, + "memory(GiB)": 302.58, + "step": 66560, + "train_speed(iter/s)": 0.123602 + }, + { + "acc": 0.73458495, + "epoch": 0.3723468955479715, + "grad_norm": 8.5625, + "learning_rate": 9.501554951790204e-06, + "loss": 1.02901859, + "memory(GiB)": 302.58, + "step": 66580, + "train_speed(iter/s)": 0.123619 + }, + { + "acc": 0.73453097, + "epoch": 0.37245874502095083, + "grad_norm": 7.6875, + "learning_rate": 9.50115239956597e-06, + "loss": 1.05357056, + "memory(GiB)": 302.58, + "step": 66600, + "train_speed(iter/s)": 0.123636 + }, + { + "acc": 0.73896885, + "epoch": 0.3725705944939301, + "grad_norm": 4.15625, + "learning_rate": 9.500749693388051e-06, + "loss": 1.03235826, + "memory(GiB)": 302.58, + "step": 66620, + "train_speed(iter/s)": 0.123652 + }, + { + "acc": 0.73217459, + "epoch": 0.37268244396690936, + "grad_norm": 5.375, + "learning_rate": 9.500346833270222e-06, + "loss": 1.06169624, + "memory(GiB)": 302.58, + "step": 66640, + "train_speed(iter/s)": 0.12367 + }, + { + "acc": 0.74043593, + "epoch": 0.3727942934398886, + "grad_norm": 6.5, + "learning_rate": 9.499943819226261e-06, + "loss": 0.99764347, + "memory(GiB)": 302.58, + "step": 66660, + "train_speed(iter/s)": 0.123687 + }, + { + "acc": 0.74623461, + "epoch": 0.3729061429128679, + "grad_norm": 6.5, + "learning_rate": 9.499540651269952e-06, + "loss": 1.00771875, + "memory(GiB)": 302.58, + "step": 66680, + "train_speed(iter/s)": 0.123704 + }, + { + "acc": 0.73645501, + "epoch": 0.37301799238584715, + "grad_norm": 4.46875, + "learning_rate": 9.499137329415084e-06, + "loss": 1.02000713, + "memory(GiB)": 302.58, + "step": 66700, + "train_speed(iter/s)": 0.12372 + }, + { + "acc": 0.7388165, + "epoch": 0.3731298418588264, + "grad_norm": 6.28125, + "learning_rate": 9.498733853675456e-06, + "loss": 1.01973734, + "memory(GiB)": 302.58, + "step": 66720, + "train_speed(iter/s)": 0.123737 + }, + { + "acc": 0.72532625, + "epoch": 0.3732416913318057, + "grad_norm": 9.5, + "learning_rate": 9.49833022406486e-06, + "loss": 1.07735233, + "memory(GiB)": 302.58, + "step": 66740, + "train_speed(iter/s)": 0.123755 + }, + { + "acc": 0.74186778, + "epoch": 0.37335354080478494, + "grad_norm": 9.0625, + "learning_rate": 9.497926440597112e-06, + "loss": 1.0341692, + "memory(GiB)": 302.58, + "step": 66760, + "train_speed(iter/s)": 0.123772 + }, + { + "acc": 0.7313416, + "epoch": 0.3734653902777642, + "grad_norm": 6.65625, + "learning_rate": 9.497522503286014e-06, + "loss": 1.04690857, + "memory(GiB)": 302.58, + "step": 66780, + "train_speed(iter/s)": 0.123789 + }, + { + "acc": 0.74114671, + "epoch": 0.3735772397507435, + "grad_norm": 7.34375, + "learning_rate": 9.497118412145385e-06, + "loss": 1.00954342, + "memory(GiB)": 302.58, + "step": 66800, + "train_speed(iter/s)": 0.123807 + }, + { + "acc": 0.73240447, + "epoch": 0.37368908922372274, + "grad_norm": 7.25, + "learning_rate": 9.496714167189049e-06, + "loss": 1.05090504, + "memory(GiB)": 302.58, + "step": 66820, + "train_speed(iter/s)": 0.123825 + }, + { + "acc": 0.74672337, + "epoch": 0.373800938696702, + "grad_norm": 7.375, + "learning_rate": 9.496309768430826e-06, + "loss": 0.99347439, + "memory(GiB)": 302.58, + "step": 66840, + "train_speed(iter/s)": 0.123842 + }, + { + "acc": 0.74988918, + "epoch": 0.37391278816968126, + "grad_norm": 9.25, + "learning_rate": 9.495905215884555e-06, + "loss": 0.96945047, + "memory(GiB)": 302.58, + "step": 66860, + "train_speed(iter/s)": 0.12386 + }, + { + "acc": 0.73896122, + "epoch": 0.37402463764266053, + "grad_norm": 7.03125, + "learning_rate": 9.495500509564068e-06, + "loss": 1.01225395, + "memory(GiB)": 302.58, + "step": 66880, + "train_speed(iter/s)": 0.123876 + }, + { + "acc": 0.75493965, + "epoch": 0.3741364871156398, + "grad_norm": 5.84375, + "learning_rate": 9.49509564948321e-06, + "loss": 0.96116552, + "memory(GiB)": 302.58, + "step": 66900, + "train_speed(iter/s)": 0.123895 + }, + { + "acc": 0.73866611, + "epoch": 0.37424833658861906, + "grad_norm": 7.03125, + "learning_rate": 9.494690635655826e-06, + "loss": 1.01080217, + "memory(GiB)": 302.58, + "step": 66920, + "train_speed(iter/s)": 0.123912 + }, + { + "acc": 0.73344631, + "epoch": 0.3743601860615983, + "grad_norm": 5.59375, + "learning_rate": 9.494285468095769e-06, + "loss": 1.05077581, + "memory(GiB)": 302.58, + "step": 66940, + "train_speed(iter/s)": 0.123928 + }, + { + "acc": 0.73083591, + "epoch": 0.3744720355345776, + "grad_norm": 9.4375, + "learning_rate": 9.4938801468169e-06, + "loss": 1.05936737, + "memory(GiB)": 302.58, + "step": 66960, + "train_speed(iter/s)": 0.123946 + }, + { + "acc": 0.74013948, + "epoch": 0.37458388500755685, + "grad_norm": 7.78125, + "learning_rate": 9.493474671833078e-06, + "loss": 1.00240583, + "memory(GiB)": 302.58, + "step": 66980, + "train_speed(iter/s)": 0.123965 + }, + { + "acc": 0.72382603, + "epoch": 0.3746957344805361, + "grad_norm": 8.1875, + "learning_rate": 9.493069043158176e-06, + "loss": 1.07647686, + "memory(GiB)": 302.58, + "step": 67000, + "train_speed(iter/s)": 0.123983 + }, + { + "acc": 0.72279582, + "epoch": 0.3748075839535154, + "grad_norm": 5.4375, + "learning_rate": 9.492663260806064e-06, + "loss": 1.09321032, + "memory(GiB)": 302.58, + "step": 67020, + "train_speed(iter/s)": 0.123999 + }, + { + "acc": 0.72117701, + "epoch": 0.37491943342649464, + "grad_norm": 6.65625, + "learning_rate": 9.492257324790624e-06, + "loss": 1.12309742, + "memory(GiB)": 302.58, + "step": 67040, + "train_speed(iter/s)": 0.124016 + }, + { + "acc": 0.73921652, + "epoch": 0.3750312828994739, + "grad_norm": 6.15625, + "learning_rate": 9.491851235125737e-06, + "loss": 1.02649746, + "memory(GiB)": 302.58, + "step": 67060, + "train_speed(iter/s)": 0.124033 + }, + { + "acc": 0.73433042, + "epoch": 0.37514313237245317, + "grad_norm": 7.25, + "learning_rate": 9.491444991825295e-06, + "loss": 1.05403633, + "memory(GiB)": 302.58, + "step": 67080, + "train_speed(iter/s)": 0.124051 + }, + { + "acc": 0.73816295, + "epoch": 0.37525498184543243, + "grad_norm": 6.75, + "learning_rate": 9.491038594903194e-06, + "loss": 1.04005804, + "memory(GiB)": 302.58, + "step": 67100, + "train_speed(iter/s)": 0.124069 + }, + { + "acc": 0.72883692, + "epoch": 0.3753668313184117, + "grad_norm": 6.0, + "learning_rate": 9.490632044373333e-06, + "loss": 1.08026505, + "memory(GiB)": 302.58, + "step": 67120, + "train_speed(iter/s)": 0.124086 + }, + { + "acc": 0.74374299, + "epoch": 0.37547868079139096, + "grad_norm": 8.125, + "learning_rate": 9.490225340249614e-06, + "loss": 0.99260607, + "memory(GiB)": 302.58, + "step": 67140, + "train_speed(iter/s)": 0.124105 + }, + { + "acc": 0.72755437, + "epoch": 0.3755905302643702, + "grad_norm": 5.875, + "learning_rate": 9.489818482545952e-06, + "loss": 1.06242008, + "memory(GiB)": 302.58, + "step": 67160, + "train_speed(iter/s)": 0.124122 + }, + { + "acc": 0.71951008, + "epoch": 0.3757023797373495, + "grad_norm": 8.125, + "learning_rate": 9.48941147127626e-06, + "loss": 1.12883444, + "memory(GiB)": 302.58, + "step": 67180, + "train_speed(iter/s)": 0.12414 + }, + { + "acc": 0.73740754, + "epoch": 0.37581422921032875, + "grad_norm": 6.125, + "learning_rate": 9.489004306454461e-06, + "loss": 1.0370573, + "memory(GiB)": 302.58, + "step": 67200, + "train_speed(iter/s)": 0.124157 + }, + { + "acc": 0.7099791, + "epoch": 0.375926078683308, + "grad_norm": 6.0625, + "learning_rate": 9.488596988094479e-06, + "loss": 1.14098911, + "memory(GiB)": 302.58, + "step": 67220, + "train_speed(iter/s)": 0.124173 + }, + { + "acc": 0.75723619, + "epoch": 0.3760379281562873, + "grad_norm": 8.625, + "learning_rate": 9.488189516210249e-06, + "loss": 0.96074657, + "memory(GiB)": 302.58, + "step": 67240, + "train_speed(iter/s)": 0.124191 + }, + { + "acc": 0.73075252, + "epoch": 0.37614977762926655, + "grad_norm": 8.4375, + "learning_rate": 9.487781890815705e-06, + "loss": 1.08478365, + "memory(GiB)": 302.58, + "step": 67260, + "train_speed(iter/s)": 0.124209 + }, + { + "acc": 0.73859711, + "epoch": 0.3762616271022458, + "grad_norm": 9.25, + "learning_rate": 9.48737411192479e-06, + "loss": 1.03494596, + "memory(GiB)": 302.58, + "step": 67280, + "train_speed(iter/s)": 0.124224 + }, + { + "acc": 0.74644723, + "epoch": 0.3763734765752251, + "grad_norm": 5.25, + "learning_rate": 9.486966179551452e-06, + "loss": 0.97672434, + "memory(GiB)": 302.58, + "step": 67300, + "train_speed(iter/s)": 0.124242 + }, + { + "acc": 0.73698387, + "epoch": 0.37648532604820434, + "grad_norm": 6.8125, + "learning_rate": 9.486558093709642e-06, + "loss": 1.03977566, + "memory(GiB)": 302.58, + "step": 67320, + "train_speed(iter/s)": 0.124258 + }, + { + "acc": 0.73465381, + "epoch": 0.3765971755211836, + "grad_norm": 8.5625, + "learning_rate": 9.486149854413318e-06, + "loss": 1.03220434, + "memory(GiB)": 302.58, + "step": 67340, + "train_speed(iter/s)": 0.124275 + }, + { + "acc": 0.74489832, + "epoch": 0.37670902499416287, + "grad_norm": 5.71875, + "learning_rate": 9.485741461676445e-06, + "loss": 1.0103611, + "memory(GiB)": 302.58, + "step": 67360, + "train_speed(iter/s)": 0.124292 + }, + { + "acc": 0.7401556, + "epoch": 0.37682087446714213, + "grad_norm": 9.3125, + "learning_rate": 9.485332915512989e-06, + "loss": 1.00735588, + "memory(GiB)": 302.58, + "step": 67380, + "train_speed(iter/s)": 0.124309 + }, + { + "acc": 0.74012647, + "epoch": 0.3769327239401214, + "grad_norm": 6.71875, + "learning_rate": 9.484924215936927e-06, + "loss": 1.02430859, + "memory(GiB)": 302.58, + "step": 67400, + "train_speed(iter/s)": 0.124327 + }, + { + "acc": 0.74825974, + "epoch": 0.37704457341310066, + "grad_norm": 5.40625, + "learning_rate": 9.484515362962232e-06, + "loss": 0.9739543, + "memory(GiB)": 302.58, + "step": 67420, + "train_speed(iter/s)": 0.124344 + }, + { + "acc": 0.74380517, + "epoch": 0.3771564228860799, + "grad_norm": 7.21875, + "learning_rate": 9.484106356602893e-06, + "loss": 1.00666885, + "memory(GiB)": 302.58, + "step": 67440, + "train_speed(iter/s)": 0.124361 + }, + { + "acc": 0.73072743, + "epoch": 0.3772682723590592, + "grad_norm": 10.875, + "learning_rate": 9.483697196872899e-06, + "loss": 1.06099577, + "memory(GiB)": 302.58, + "step": 67460, + "train_speed(iter/s)": 0.124379 + }, + { + "acc": 0.73385658, + "epoch": 0.37738012183203845, + "grad_norm": 6.03125, + "learning_rate": 9.483287883786243e-06, + "loss": 1.03843222, + "memory(GiB)": 302.58, + "step": 67480, + "train_speed(iter/s)": 0.124396 + }, + { + "acc": 0.73810182, + "epoch": 0.3774919713050177, + "grad_norm": 5.53125, + "learning_rate": 9.482878417356925e-06, + "loss": 1.03402748, + "memory(GiB)": 302.58, + "step": 67500, + "train_speed(iter/s)": 0.124413 + }, + { + "acc": 0.73560605, + "epoch": 0.377603820777997, + "grad_norm": 8.1875, + "learning_rate": 9.48246879759895e-06, + "loss": 1.02774572, + "memory(GiB)": 302.58, + "step": 67520, + "train_speed(iter/s)": 0.124431 + }, + { + "acc": 0.73216906, + "epoch": 0.37771567025097624, + "grad_norm": 6.875, + "learning_rate": 9.482059024526328e-06, + "loss": 1.05444784, + "memory(GiB)": 302.58, + "step": 67540, + "train_speed(iter/s)": 0.124448 + }, + { + "acc": 0.72882452, + "epoch": 0.3778275197239555, + "grad_norm": 8.0, + "learning_rate": 9.481649098153077e-06, + "loss": 1.07830133, + "memory(GiB)": 302.58, + "step": 67560, + "train_speed(iter/s)": 0.124465 + }, + { + "acc": 0.74036603, + "epoch": 0.37793936919693477, + "grad_norm": 6.28125, + "learning_rate": 9.481239018493216e-06, + "loss": 1.02139778, + "memory(GiB)": 302.58, + "step": 67580, + "train_speed(iter/s)": 0.124482 + }, + { + "acc": 0.73305559, + "epoch": 0.37805121866991404, + "grad_norm": 7.84375, + "learning_rate": 9.48082878556077e-06, + "loss": 1.05560217, + "memory(GiB)": 302.58, + "step": 67600, + "train_speed(iter/s)": 0.124499 + }, + { + "acc": 0.71588507, + "epoch": 0.3781630681428933, + "grad_norm": 9.3125, + "learning_rate": 9.48041839936977e-06, + "loss": 1.11099405, + "memory(GiB)": 302.58, + "step": 67620, + "train_speed(iter/s)": 0.124516 + }, + { + "acc": 0.74286871, + "epoch": 0.37827491761587256, + "grad_norm": 6.8125, + "learning_rate": 9.480007859934255e-06, + "loss": 1.01172485, + "memory(GiB)": 302.58, + "step": 67640, + "train_speed(iter/s)": 0.124534 + }, + { + "acc": 0.71748233, + "epoch": 0.3783867670888518, + "grad_norm": 7.4375, + "learning_rate": 9.479597167268265e-06, + "loss": 1.12763662, + "memory(GiB)": 302.58, + "step": 67660, + "train_speed(iter/s)": 0.124551 + }, + { + "acc": 0.73317909, + "epoch": 0.3784986165618311, + "grad_norm": 5.90625, + "learning_rate": 9.479186321385848e-06, + "loss": 1.03277016, + "memory(GiB)": 302.58, + "step": 67680, + "train_speed(iter/s)": 0.124568 + }, + { + "acc": 0.74217596, + "epoch": 0.37861046603481036, + "grad_norm": 7.4375, + "learning_rate": 9.478775322301056e-06, + "loss": 0.99165115, + "memory(GiB)": 302.58, + "step": 67700, + "train_speed(iter/s)": 0.124585 + }, + { + "acc": 0.72198477, + "epoch": 0.3787223155077896, + "grad_norm": 4.71875, + "learning_rate": 9.478364170027944e-06, + "loss": 1.10215797, + "memory(GiB)": 302.58, + "step": 67720, + "train_speed(iter/s)": 0.124602 + }, + { + "acc": 0.73970194, + "epoch": 0.3788341649807689, + "grad_norm": 9.6875, + "learning_rate": 9.47795286458058e-06, + "loss": 1.03898687, + "memory(GiB)": 302.58, + "step": 67740, + "train_speed(iter/s)": 0.124619 + }, + { + "acc": 0.72940197, + "epoch": 0.37894601445374815, + "grad_norm": 7.1875, + "learning_rate": 9.477541405973028e-06, + "loss": 1.06950102, + "memory(GiB)": 302.58, + "step": 67760, + "train_speed(iter/s)": 0.124636 + }, + { + "acc": 0.74071164, + "epoch": 0.3790578639267274, + "grad_norm": 7.5, + "learning_rate": 9.477129794219361e-06, + "loss": 1.00911493, + "memory(GiB)": 302.58, + "step": 67780, + "train_speed(iter/s)": 0.124653 + }, + { + "acc": 0.74255438, + "epoch": 0.3791697133997067, + "grad_norm": 8.3125, + "learning_rate": 9.476718029333658e-06, + "loss": 1.00076494, + "memory(GiB)": 302.58, + "step": 67800, + "train_speed(iter/s)": 0.124669 + }, + { + "acc": 0.74290524, + "epoch": 0.37928156287268594, + "grad_norm": 9.75, + "learning_rate": 9.476306111330005e-06, + "loss": 1.01486912, + "memory(GiB)": 302.58, + "step": 67820, + "train_speed(iter/s)": 0.124686 + }, + { + "acc": 0.72799315, + "epoch": 0.3793934123456652, + "grad_norm": 6.6875, + "learning_rate": 9.475894040222488e-06, + "loss": 1.08687286, + "memory(GiB)": 302.58, + "step": 67840, + "train_speed(iter/s)": 0.124702 + }, + { + "acc": 0.74291301, + "epoch": 0.37950526181864447, + "grad_norm": 10.75, + "learning_rate": 9.475481816025201e-06, + "loss": 0.98723288, + "memory(GiB)": 302.58, + "step": 67860, + "train_speed(iter/s)": 0.12472 + }, + { + "acc": 0.72828112, + "epoch": 0.37961711129162373, + "grad_norm": 6.28125, + "learning_rate": 9.475069438752247e-06, + "loss": 1.09085379, + "memory(GiB)": 302.58, + "step": 67880, + "train_speed(iter/s)": 0.124737 + }, + { + "acc": 0.74116387, + "epoch": 0.379728960764603, + "grad_norm": 7.0625, + "learning_rate": 9.474656908417725e-06, + "loss": 1.01756229, + "memory(GiB)": 302.58, + "step": 67900, + "train_speed(iter/s)": 0.124754 + }, + { + "acc": 0.73254623, + "epoch": 0.37984081023758226, + "grad_norm": 5.65625, + "learning_rate": 9.474244225035751e-06, + "loss": 1.05292206, + "memory(GiB)": 302.58, + "step": 67920, + "train_speed(iter/s)": 0.124773 + }, + { + "acc": 0.74042211, + "epoch": 0.3799526597105615, + "grad_norm": 6.5, + "learning_rate": 9.473831388620436e-06, + "loss": 1.01059055, + "memory(GiB)": 302.58, + "step": 67940, + "train_speed(iter/s)": 0.12479 + }, + { + "acc": 0.74130788, + "epoch": 0.3800645091835408, + "grad_norm": 5.21875, + "learning_rate": 9.4734183991859e-06, + "loss": 1.01894312, + "memory(GiB)": 302.58, + "step": 67960, + "train_speed(iter/s)": 0.124808 + }, + { + "acc": 0.73208318, + "epoch": 0.38017635865652005, + "grad_norm": 6.71875, + "learning_rate": 9.473005256746272e-06, + "loss": 1.05792322, + "memory(GiB)": 302.58, + "step": 67980, + "train_speed(iter/s)": 0.124824 + }, + { + "acc": 0.73649478, + "epoch": 0.3802882081294993, + "grad_norm": 5.125, + "learning_rate": 9.47259196131568e-06, + "loss": 1.05353317, + "memory(GiB)": 302.58, + "step": 68000, + "train_speed(iter/s)": 0.124841 + }, + { + "epoch": 0.3802882081294993, + "eval_acc": 0.6998334458778964, + "eval_loss": 1.0435218811035156, + "eval_runtime": 7512.6655, + "eval_samples_per_second": 10.021, + "eval_steps_per_second": 10.021, + "step": 68000 + }, + { + "acc": 0.75799189, + "epoch": 0.3804000576024786, + "grad_norm": 9.0, + "learning_rate": 9.47217851290826e-06, + "loss": 0.94515257, + "memory(GiB)": 302.58, + "step": 68020, + "train_speed(iter/s)": 0.12313 + }, + { + "acc": 0.72952247, + "epoch": 0.38051190707545784, + "grad_norm": 9.375, + "learning_rate": 9.471764911538155e-06, + "loss": 1.06856422, + "memory(GiB)": 302.58, + "step": 68040, + "train_speed(iter/s)": 0.123147 + }, + { + "acc": 0.73580227, + "epoch": 0.3806237565484371, + "grad_norm": 10.5625, + "learning_rate": 9.47135115721951e-06, + "loss": 1.04284506, + "memory(GiB)": 302.58, + "step": 68060, + "train_speed(iter/s)": 0.123165 + }, + { + "acc": 0.73113823, + "epoch": 0.3807356060214164, + "grad_norm": 4.03125, + "learning_rate": 9.470937249966476e-06, + "loss": 1.06937332, + "memory(GiB)": 302.58, + "step": 68080, + "train_speed(iter/s)": 0.123182 + }, + { + "acc": 0.72576051, + "epoch": 0.38084745549439564, + "grad_norm": 7.34375, + "learning_rate": 9.470523189793212e-06, + "loss": 1.09928713, + "memory(GiB)": 302.58, + "step": 68100, + "train_speed(iter/s)": 0.123199 + }, + { + "acc": 0.74462409, + "epoch": 0.3809593049673749, + "grad_norm": 6.34375, + "learning_rate": 9.470108976713879e-06, + "loss": 1.00969915, + "memory(GiB)": 302.58, + "step": 68120, + "train_speed(iter/s)": 0.123216 + }, + { + "acc": 0.73595057, + "epoch": 0.38107115444035417, + "grad_norm": 5.71875, + "learning_rate": 9.469694610742646e-06, + "loss": 1.03212719, + "memory(GiB)": 302.58, + "step": 68140, + "train_speed(iter/s)": 0.123232 + }, + { + "acc": 0.74170847, + "epoch": 0.38118300391333343, + "grad_norm": 6.75, + "learning_rate": 9.469280091893684e-06, + "loss": 1.02872009, + "memory(GiB)": 302.58, + "step": 68160, + "train_speed(iter/s)": 0.123249 + }, + { + "acc": 0.73357663, + "epoch": 0.3812948533863127, + "grad_norm": 9.0625, + "learning_rate": 9.46886542018117e-06, + "loss": 1.05063171, + "memory(GiB)": 302.58, + "step": 68180, + "train_speed(iter/s)": 0.123266 + }, + { + "acc": 0.75201869, + "epoch": 0.38140670285929196, + "grad_norm": 6.59375, + "learning_rate": 9.468450595619288e-06, + "loss": 0.95747833, + "memory(GiB)": 302.58, + "step": 68200, + "train_speed(iter/s)": 0.123283 + }, + { + "acc": 0.75396109, + "epoch": 0.3815185523322712, + "grad_norm": 7.5, + "learning_rate": 9.468035618222228e-06, + "loss": 0.96446199, + "memory(GiB)": 302.58, + "step": 68220, + "train_speed(iter/s)": 0.1233 + }, + { + "acc": 0.7434227, + "epoch": 0.3816304018052505, + "grad_norm": 6.46875, + "learning_rate": 9.46762048800418e-06, + "loss": 1.00366869, + "memory(GiB)": 302.58, + "step": 68240, + "train_speed(iter/s)": 0.123318 + }, + { + "acc": 0.73242798, + "epoch": 0.38174225127822975, + "grad_norm": 7.96875, + "learning_rate": 9.467205204979348e-06, + "loss": 1.05865088, + "memory(GiB)": 302.58, + "step": 68260, + "train_speed(iter/s)": 0.123335 + }, + { + "acc": 0.75042028, + "epoch": 0.381854100751209, + "grad_norm": 5.34375, + "learning_rate": 9.466789769161931e-06, + "loss": 0.99157076, + "memory(GiB)": 302.58, + "step": 68280, + "train_speed(iter/s)": 0.123352 + }, + { + "acc": 0.74516416, + "epoch": 0.3819659502241883, + "grad_norm": 8.6875, + "learning_rate": 9.46637418056614e-06, + "loss": 1.013412, + "memory(GiB)": 302.58, + "step": 68300, + "train_speed(iter/s)": 0.123368 + }, + { + "acc": 0.72722039, + "epoch": 0.38207779969716754, + "grad_norm": 8.5, + "learning_rate": 9.46595843920619e-06, + "loss": 1.07144022, + "memory(GiB)": 302.58, + "step": 68320, + "train_speed(iter/s)": 0.123386 + }, + { + "acc": 0.72664409, + "epoch": 0.3821896491701468, + "grad_norm": 7.71875, + "learning_rate": 9.4655425450963e-06, + "loss": 1.08337278, + "memory(GiB)": 302.58, + "step": 68340, + "train_speed(iter/s)": 0.1234 + }, + { + "acc": 0.74744682, + "epoch": 0.38230149864312607, + "grad_norm": 6.46875, + "learning_rate": 9.465126498250695e-06, + "loss": 1.00894804, + "memory(GiB)": 302.58, + "step": 68360, + "train_speed(iter/s)": 0.123417 + }, + { + "acc": 0.74739985, + "epoch": 0.38241334811610533, + "grad_norm": 6.0625, + "learning_rate": 9.464710298683606e-06, + "loss": 0.99528246, + "memory(GiB)": 302.58, + "step": 68380, + "train_speed(iter/s)": 0.123434 + }, + { + "acc": 0.73474755, + "epoch": 0.3825251975890846, + "grad_norm": 5.5, + "learning_rate": 9.464293946409266e-06, + "loss": 1.03715773, + "memory(GiB)": 302.58, + "step": 68400, + "train_speed(iter/s)": 0.123451 + }, + { + "acc": 0.72534389, + "epoch": 0.38263704706206386, + "grad_norm": 6.03125, + "learning_rate": 9.463877441441918e-06, + "loss": 1.08853674, + "memory(GiB)": 302.58, + "step": 68420, + "train_speed(iter/s)": 0.123468 + }, + { + "acc": 0.73933959, + "epoch": 0.3827488965350431, + "grad_norm": 9.3125, + "learning_rate": 9.463460783795808e-06, + "loss": 1.02109652, + "memory(GiB)": 302.58, + "step": 68440, + "train_speed(iter/s)": 0.123485 + }, + { + "acc": 0.73865209, + "epoch": 0.3828607460080224, + "grad_norm": 7.1875, + "learning_rate": 9.463043973485185e-06, + "loss": 1.03291283, + "memory(GiB)": 302.58, + "step": 68460, + "train_speed(iter/s)": 0.123502 + }, + { + "acc": 0.73845382, + "epoch": 0.38297259548100165, + "grad_norm": 7.9375, + "learning_rate": 9.462627010524305e-06, + "loss": 1.02400436, + "memory(GiB)": 302.58, + "step": 68480, + "train_speed(iter/s)": 0.123518 + }, + { + "acc": 0.72996564, + "epoch": 0.3830844449539809, + "grad_norm": 6.625, + "learning_rate": 9.462209894927433e-06, + "loss": 1.07538233, + "memory(GiB)": 302.58, + "step": 68500, + "train_speed(iter/s)": 0.123533 + }, + { + "acc": 0.74503856, + "epoch": 0.3831962944269602, + "grad_norm": 6.5625, + "learning_rate": 9.461792626708832e-06, + "loss": 1.00668144, + "memory(GiB)": 302.58, + "step": 68520, + "train_speed(iter/s)": 0.123551 + }, + { + "acc": 0.72844768, + "epoch": 0.38330814389993945, + "grad_norm": 6.4375, + "learning_rate": 9.461375205882775e-06, + "loss": 1.08120394, + "memory(GiB)": 302.58, + "step": 68540, + "train_speed(iter/s)": 0.123567 + }, + { + "acc": 0.73105493, + "epoch": 0.3834199933729187, + "grad_norm": 5.65625, + "learning_rate": 9.460957632463539e-06, + "loss": 1.07107449, + "memory(GiB)": 302.58, + "step": 68560, + "train_speed(iter/s)": 0.123585 + }, + { + "acc": 0.72629504, + "epoch": 0.383531842845898, + "grad_norm": 6.375, + "learning_rate": 9.460539906465407e-06, + "loss": 1.09426889, + "memory(GiB)": 302.58, + "step": 68580, + "train_speed(iter/s)": 0.123602 + }, + { + "acc": 0.72670197, + "epoch": 0.38364369231887724, + "grad_norm": 7.59375, + "learning_rate": 9.460122027902668e-06, + "loss": 1.08758421, + "memory(GiB)": 302.58, + "step": 68600, + "train_speed(iter/s)": 0.123618 + }, + { + "acc": 0.73164253, + "epoch": 0.3837555417918565, + "grad_norm": 5.625, + "learning_rate": 9.45970399678961e-06, + "loss": 1.05925188, + "memory(GiB)": 302.58, + "step": 68620, + "train_speed(iter/s)": 0.123633 + }, + { + "acc": 0.74445949, + "epoch": 0.38386739126483577, + "grad_norm": 8.0625, + "learning_rate": 9.459285813140535e-06, + "loss": 0.9981144, + "memory(GiB)": 302.58, + "step": 68640, + "train_speed(iter/s)": 0.123649 + }, + { + "acc": 0.72011514, + "epoch": 0.38397924073781503, + "grad_norm": 7.625, + "learning_rate": 9.458867476969746e-06, + "loss": 1.09753304, + "memory(GiB)": 302.58, + "step": 68660, + "train_speed(iter/s)": 0.123666 + }, + { + "acc": 0.72368088, + "epoch": 0.3840910902107943, + "grad_norm": 5.71875, + "learning_rate": 9.458448988291551e-06, + "loss": 1.10640326, + "memory(GiB)": 302.58, + "step": 68680, + "train_speed(iter/s)": 0.123682 + }, + { + "acc": 0.73534346, + "epoch": 0.38420293968377356, + "grad_norm": 7.5, + "learning_rate": 9.458030347120262e-06, + "loss": 1.05756865, + "memory(GiB)": 302.58, + "step": 68700, + "train_speed(iter/s)": 0.123697 + }, + { + "acc": 0.7347858, + "epoch": 0.3843147891567528, + "grad_norm": 7.03125, + "learning_rate": 9.457611553470199e-06, + "loss": 1.04463453, + "memory(GiB)": 302.58, + "step": 68720, + "train_speed(iter/s)": 0.123713 + }, + { + "acc": 0.74817367, + "epoch": 0.3844266386297321, + "grad_norm": 6.59375, + "learning_rate": 9.457192607355688e-06, + "loss": 0.98623276, + "memory(GiB)": 302.58, + "step": 68740, + "train_speed(iter/s)": 0.12373 + }, + { + "acc": 0.74763141, + "epoch": 0.38453848810271135, + "grad_norm": 6.9375, + "learning_rate": 9.456773508791053e-06, + "loss": 0.98815794, + "memory(GiB)": 302.58, + "step": 68760, + "train_speed(iter/s)": 0.123747 + }, + { + "acc": 0.73053627, + "epoch": 0.3846503375756906, + "grad_norm": 8.0625, + "learning_rate": 9.456354257790636e-06, + "loss": 1.04954548, + "memory(GiB)": 302.58, + "step": 68780, + "train_speed(iter/s)": 0.123763 + }, + { + "acc": 0.75156231, + "epoch": 0.3847621870486699, + "grad_norm": 7.0, + "learning_rate": 9.45593485436877e-06, + "loss": 0.95833797, + "memory(GiB)": 302.58, + "step": 68800, + "train_speed(iter/s)": 0.12378 + }, + { + "acc": 0.7337883, + "epoch": 0.38487403652164914, + "grad_norm": 7.90625, + "learning_rate": 9.455515298539804e-06, + "loss": 1.05391035, + "memory(GiB)": 302.58, + "step": 68820, + "train_speed(iter/s)": 0.123797 + }, + { + "acc": 0.71557188, + "epoch": 0.3849858859946284, + "grad_norm": 9.625, + "learning_rate": 9.455095590318084e-06, + "loss": 1.12291393, + "memory(GiB)": 302.58, + "step": 68840, + "train_speed(iter/s)": 0.123813 + }, + { + "acc": 0.73413091, + "epoch": 0.38509773546760767, + "grad_norm": 5.9375, + "learning_rate": 9.454675729717972e-06, + "loss": 1.05934248, + "memory(GiB)": 302.58, + "step": 68860, + "train_speed(iter/s)": 0.123831 + }, + { + "acc": 0.72625108, + "epoch": 0.38520958494058694, + "grad_norm": 8.3125, + "learning_rate": 9.454255716753822e-06, + "loss": 1.0805151, + "memory(GiB)": 302.58, + "step": 68880, + "train_speed(iter/s)": 0.123849 + }, + { + "acc": 0.7154315, + "epoch": 0.3853214344135662, + "grad_norm": 12.25, + "learning_rate": 9.453835551440002e-06, + "loss": 1.14752464, + "memory(GiB)": 302.58, + "step": 68900, + "train_speed(iter/s)": 0.123866 + }, + { + "acc": 0.71939154, + "epoch": 0.38543328388654546, + "grad_norm": 7.125, + "learning_rate": 9.453415233790886e-06, + "loss": 1.12035866, + "memory(GiB)": 302.58, + "step": 68920, + "train_speed(iter/s)": 0.123884 + }, + { + "acc": 0.72671494, + "epoch": 0.3855451333595247, + "grad_norm": 8.6875, + "learning_rate": 9.452994763820847e-06, + "loss": 1.07992144, + "memory(GiB)": 302.58, + "step": 68940, + "train_speed(iter/s)": 0.1239 + }, + { + "acc": 0.75016584, + "epoch": 0.385656982832504, + "grad_norm": 5.75, + "learning_rate": 9.452574141544266e-06, + "loss": 0.96366758, + "memory(GiB)": 302.58, + "step": 68960, + "train_speed(iter/s)": 0.123918 + }, + { + "acc": 0.74245663, + "epoch": 0.38576883230548326, + "grad_norm": 7.53125, + "learning_rate": 9.45215336697553e-06, + "loss": 1.00547791, + "memory(GiB)": 302.58, + "step": 68980, + "train_speed(iter/s)": 0.123935 + }, + { + "acc": 0.74284325, + "epoch": 0.3858806817784625, + "grad_norm": 6.875, + "learning_rate": 9.451732440129033e-06, + "loss": 0.99697313, + "memory(GiB)": 302.58, + "step": 69000, + "train_speed(iter/s)": 0.123952 + }, + { + "acc": 0.73096681, + "epoch": 0.3859925312514418, + "grad_norm": 6.15625, + "learning_rate": 9.45131136101917e-06, + "loss": 1.07480621, + "memory(GiB)": 302.58, + "step": 69020, + "train_speed(iter/s)": 0.123969 + }, + { + "acc": 0.75239787, + "epoch": 0.38610438072442105, + "grad_norm": 9.125, + "learning_rate": 9.450890129660343e-06, + "loss": 0.95403061, + "memory(GiB)": 302.58, + "step": 69040, + "train_speed(iter/s)": 0.123987 + }, + { + "acc": 0.7396914, + "epoch": 0.3862162301974003, + "grad_norm": 9.125, + "learning_rate": 9.450468746066962e-06, + "loss": 1.01837397, + "memory(GiB)": 302.58, + "step": 69060, + "train_speed(iter/s)": 0.124002 + }, + { + "acc": 0.72587223, + "epoch": 0.3863280796703796, + "grad_norm": 6.4375, + "learning_rate": 9.450047210253437e-06, + "loss": 1.09099836, + "memory(GiB)": 302.58, + "step": 69080, + "train_speed(iter/s)": 0.124017 + }, + { + "acc": 0.72872472, + "epoch": 0.38643992914335884, + "grad_norm": 8.0, + "learning_rate": 9.449625522234184e-06, + "loss": 1.05651636, + "memory(GiB)": 302.58, + "step": 69100, + "train_speed(iter/s)": 0.124034 + }, + { + "acc": 0.74237328, + "epoch": 0.38655177861633816, + "grad_norm": 8.625, + "learning_rate": 9.449203682023632e-06, + "loss": 1.00593271, + "memory(GiB)": 302.58, + "step": 69120, + "train_speed(iter/s)": 0.12405 + }, + { + "acc": 0.74933662, + "epoch": 0.3866636280893174, + "grad_norm": 8.0625, + "learning_rate": 9.448781689636206e-06, + "loss": 0.97232714, + "memory(GiB)": 302.58, + "step": 69140, + "train_speed(iter/s)": 0.124067 + }, + { + "acc": 0.73678732, + "epoch": 0.3867754775622967, + "grad_norm": 6.59375, + "learning_rate": 9.448359545086339e-06, + "loss": 1.05058146, + "memory(GiB)": 302.58, + "step": 69160, + "train_speed(iter/s)": 0.124082 + }, + { + "acc": 0.74122286, + "epoch": 0.38688732703527595, + "grad_norm": 10.5625, + "learning_rate": 9.447937248388467e-06, + "loss": 1.01509638, + "memory(GiB)": 302.58, + "step": 69180, + "train_speed(iter/s)": 0.124099 + }, + { + "acc": 0.72278752, + "epoch": 0.3869991765082552, + "grad_norm": 7.3125, + "learning_rate": 9.44751479955704e-06, + "loss": 1.10862799, + "memory(GiB)": 302.58, + "step": 69200, + "train_speed(iter/s)": 0.124115 + }, + { + "acc": 0.72733054, + "epoch": 0.3871110259812345, + "grad_norm": 4.78125, + "learning_rate": 9.447092198606505e-06, + "loss": 1.06936579, + "memory(GiB)": 302.58, + "step": 69220, + "train_speed(iter/s)": 0.124132 + }, + { + "acc": 0.7337359, + "epoch": 0.38722287545421374, + "grad_norm": 6.96875, + "learning_rate": 9.446669445551315e-06, + "loss": 1.05234127, + "memory(GiB)": 302.58, + "step": 69240, + "train_speed(iter/s)": 0.124148 + }, + { + "acc": 0.75153804, + "epoch": 0.387334724927193, + "grad_norm": 5.3125, + "learning_rate": 9.44624654040593e-06, + "loss": 0.96133308, + "memory(GiB)": 302.58, + "step": 69260, + "train_speed(iter/s)": 0.124166 + }, + { + "acc": 0.73274293, + "epoch": 0.38744657440017227, + "grad_norm": 8.125, + "learning_rate": 9.445823483184813e-06, + "loss": 1.05368738, + "memory(GiB)": 302.58, + "step": 69280, + "train_speed(iter/s)": 0.124183 + }, + { + "acc": 0.72556634, + "epoch": 0.38755842387315154, + "grad_norm": 8.125, + "learning_rate": 9.445400273902436e-06, + "loss": 1.09150763, + "memory(GiB)": 302.58, + "step": 69300, + "train_speed(iter/s)": 0.124199 + }, + { + "acc": 0.74517808, + "epoch": 0.3876702733461308, + "grad_norm": 6.0625, + "learning_rate": 9.444976912573276e-06, + "loss": 1.02221527, + "memory(GiB)": 302.58, + "step": 69320, + "train_speed(iter/s)": 0.124216 + }, + { + "acc": 0.73868117, + "epoch": 0.38778212281911006, + "grad_norm": 5.4375, + "learning_rate": 9.44455339921181e-06, + "loss": 1.02385921, + "memory(GiB)": 302.58, + "step": 69340, + "train_speed(iter/s)": 0.124232 + }, + { + "acc": 0.74597468, + "epoch": 0.38789397229208933, + "grad_norm": 8.8125, + "learning_rate": 9.444129733832523e-06, + "loss": 0.9850152, + "memory(GiB)": 302.58, + "step": 69360, + "train_speed(iter/s)": 0.124249 + }, + { + "acc": 0.74154215, + "epoch": 0.3880058217650686, + "grad_norm": 7.34375, + "learning_rate": 9.443705916449907e-06, + "loss": 1.03476744, + "memory(GiB)": 302.58, + "step": 69380, + "train_speed(iter/s)": 0.124265 + }, + { + "acc": 0.74553499, + "epoch": 0.38811767123804786, + "grad_norm": 5.875, + "learning_rate": 9.44328194707846e-06, + "loss": 1.00332804, + "memory(GiB)": 302.58, + "step": 69400, + "train_speed(iter/s)": 0.124281 + }, + { + "acc": 0.75055571, + "epoch": 0.3882295207110271, + "grad_norm": 7.9375, + "learning_rate": 9.442857825732679e-06, + "loss": 0.97148695, + "memory(GiB)": 302.58, + "step": 69420, + "train_speed(iter/s)": 0.124298 + }, + { + "acc": 0.73664408, + "epoch": 0.3883413701840064, + "grad_norm": 6.59375, + "learning_rate": 9.442433552427073e-06, + "loss": 1.03900757, + "memory(GiB)": 302.58, + "step": 69440, + "train_speed(iter/s)": 0.124315 + }, + { + "acc": 0.72852893, + "epoch": 0.38845321965698565, + "grad_norm": 9.5625, + "learning_rate": 9.442009127176154e-06, + "loss": 1.10357475, + "memory(GiB)": 302.58, + "step": 69460, + "train_speed(iter/s)": 0.124331 + }, + { + "acc": 0.74089036, + "epoch": 0.3885650691299649, + "grad_norm": 9.1875, + "learning_rate": 9.441584549994436e-06, + "loss": 1.00863152, + "memory(GiB)": 302.58, + "step": 69480, + "train_speed(iter/s)": 0.124348 + }, + { + "acc": 0.72261982, + "epoch": 0.3886769186029442, + "grad_norm": 6.84375, + "learning_rate": 9.441159820896444e-06, + "loss": 1.11439829, + "memory(GiB)": 302.58, + "step": 69500, + "train_speed(iter/s)": 0.124365 + }, + { + "acc": 0.73696704, + "epoch": 0.38878876807592344, + "grad_norm": 8.5, + "learning_rate": 9.440734939896704e-06, + "loss": 1.0331749, + "memory(GiB)": 302.58, + "step": 69520, + "train_speed(iter/s)": 0.124382 + }, + { + "acc": 0.73797421, + "epoch": 0.3889006175489027, + "grad_norm": 6.8125, + "learning_rate": 9.440309907009746e-06, + "loss": 1.01918077, + "memory(GiB)": 302.58, + "step": 69540, + "train_speed(iter/s)": 0.124398 + }, + { + "acc": 0.73251214, + "epoch": 0.38901246702188197, + "grad_norm": 9.1875, + "learning_rate": 9.43988472225011e-06, + "loss": 1.07464705, + "memory(GiB)": 302.58, + "step": 69560, + "train_speed(iter/s)": 0.124414 + }, + { + "acc": 0.74455333, + "epoch": 0.38912431649486123, + "grad_norm": 4.5, + "learning_rate": 9.439459385632338e-06, + "loss": 1.02212439, + "memory(GiB)": 302.58, + "step": 69580, + "train_speed(iter/s)": 0.12443 + }, + { + "acc": 0.7374969, + "epoch": 0.3892361659678405, + "grad_norm": 11.375, + "learning_rate": 9.43903389717098e-06, + "loss": 1.03632784, + "memory(GiB)": 302.58, + "step": 69600, + "train_speed(iter/s)": 0.124447 + }, + { + "acc": 0.73695712, + "epoch": 0.38934801544081976, + "grad_norm": 7.34375, + "learning_rate": 9.438608256880583e-06, + "loss": 1.04236603, + "memory(GiB)": 302.58, + "step": 69620, + "train_speed(iter/s)": 0.124464 + }, + { + "acc": 0.74867649, + "epoch": 0.389459864913799, + "grad_norm": 7.34375, + "learning_rate": 9.438182464775712e-06, + "loss": 0.96881294, + "memory(GiB)": 302.58, + "step": 69640, + "train_speed(iter/s)": 0.12448 + }, + { + "acc": 0.73781757, + "epoch": 0.3895717143867783, + "grad_norm": 7.0625, + "learning_rate": 9.437756520870928e-06, + "loss": 1.02116146, + "memory(GiB)": 302.58, + "step": 69660, + "train_speed(iter/s)": 0.124497 + }, + { + "acc": 0.74258213, + "epoch": 0.38968356385975755, + "grad_norm": 9.0625, + "learning_rate": 9.437330425180797e-06, + "loss": 1.01705742, + "memory(GiB)": 302.58, + "step": 69680, + "train_speed(iter/s)": 0.124512 + }, + { + "acc": 0.7374548, + "epoch": 0.3897954133327368, + "grad_norm": 7.375, + "learning_rate": 9.436904177719896e-06, + "loss": 1.0135148, + "memory(GiB)": 302.58, + "step": 69700, + "train_speed(iter/s)": 0.124529 + }, + { + "acc": 0.73018341, + "epoch": 0.3899072628057161, + "grad_norm": 7.46875, + "learning_rate": 9.436477778502804e-06, + "loss": 1.06026821, + "memory(GiB)": 302.58, + "step": 69720, + "train_speed(iter/s)": 0.124546 + }, + { + "acc": 0.72022886, + "epoch": 0.39001911227869535, + "grad_norm": 5.71875, + "learning_rate": 9.436051227544102e-06, + "loss": 1.11296835, + "memory(GiB)": 302.58, + "step": 69740, + "train_speed(iter/s)": 0.124563 + }, + { + "acc": 0.7353086, + "epoch": 0.3901309617516746, + "grad_norm": 7.375, + "learning_rate": 9.435624524858384e-06, + "loss": 1.01797123, + "memory(GiB)": 302.58, + "step": 69760, + "train_speed(iter/s)": 0.124579 + }, + { + "acc": 0.72680292, + "epoch": 0.3902428112246539, + "grad_norm": 5.59375, + "learning_rate": 9.435197670460243e-06, + "loss": 1.06936893, + "memory(GiB)": 302.58, + "step": 69780, + "train_speed(iter/s)": 0.124596 + }, + { + "acc": 0.72450604, + "epoch": 0.39035466069763314, + "grad_norm": 10.125, + "learning_rate": 9.434770664364276e-06, + "loss": 1.06823301, + "memory(GiB)": 302.58, + "step": 69800, + "train_speed(iter/s)": 0.124613 + }, + { + "acc": 0.73675303, + "epoch": 0.3904665101706124, + "grad_norm": 8.0625, + "learning_rate": 9.434343506585091e-06, + "loss": 1.03813972, + "memory(GiB)": 302.58, + "step": 69820, + "train_speed(iter/s)": 0.124628 + }, + { + "acc": 0.71432896, + "epoch": 0.39057835964359167, + "grad_norm": 6.21875, + "learning_rate": 9.433916197137296e-06, + "loss": 1.13183031, + "memory(GiB)": 302.58, + "step": 69840, + "train_speed(iter/s)": 0.124645 + }, + { + "acc": 0.72184596, + "epoch": 0.39069020911657093, + "grad_norm": 8.375, + "learning_rate": 9.433488736035508e-06, + "loss": 1.097229, + "memory(GiB)": 302.58, + "step": 69860, + "train_speed(iter/s)": 0.124661 + }, + { + "acc": 0.72934918, + "epoch": 0.3908020585895502, + "grad_norm": 7.0625, + "learning_rate": 9.433061123294347e-06, + "loss": 1.05877953, + "memory(GiB)": 302.58, + "step": 69880, + "train_speed(iter/s)": 0.124678 + }, + { + "acc": 0.73199387, + "epoch": 0.39091390806252946, + "grad_norm": 5.28125, + "learning_rate": 9.43263335892844e-06, + "loss": 1.06107063, + "memory(GiB)": 302.58, + "step": 69900, + "train_speed(iter/s)": 0.124694 + }, + { + "acc": 0.74337707, + "epoch": 0.3910257575355087, + "grad_norm": 7.78125, + "learning_rate": 9.432205442952415e-06, + "loss": 1.01836948, + "memory(GiB)": 302.58, + "step": 69920, + "train_speed(iter/s)": 0.12471 + }, + { + "acc": 0.73848934, + "epoch": 0.391137607008488, + "grad_norm": 6.9375, + "learning_rate": 9.43177737538091e-06, + "loss": 1.04163685, + "memory(GiB)": 302.58, + "step": 69940, + "train_speed(iter/s)": 0.124727 + }, + { + "acc": 0.73629532, + "epoch": 0.39124945648146725, + "grad_norm": 7.5, + "learning_rate": 9.431349156228566e-06, + "loss": 1.04823265, + "memory(GiB)": 302.58, + "step": 69960, + "train_speed(iter/s)": 0.124745 + }, + { + "acc": 0.74671884, + "epoch": 0.3913613059544465, + "grad_norm": 6.46875, + "learning_rate": 9.43092078551003e-06, + "loss": 1.00590057, + "memory(GiB)": 302.58, + "step": 69980, + "train_speed(iter/s)": 0.124761 + }, + { + "acc": 0.73328595, + "epoch": 0.3914731554274258, + "grad_norm": 6.875, + "learning_rate": 9.430492263239953e-06, + "loss": 1.04616241, + "memory(GiB)": 302.58, + "step": 70000, + "train_speed(iter/s)": 0.124778 + }, + { + "epoch": 0.3914731554274258, + "eval_acc": 0.7001085712420887, + "eval_loss": 1.0424914360046387, + "eval_runtime": 7498.7554, + "eval_samples_per_second": 10.039, + "eval_steps_per_second": 10.039, + "step": 70000 + }, + { + "acc": 0.73352332, + "epoch": 0.39158500490040504, + "grad_norm": 6.84375, + "learning_rate": 9.430063589432991e-06, + "loss": 1.02578154, + "memory(GiB)": 302.58, + "step": 70020, + "train_speed(iter/s)": 0.123119 + }, + { + "acc": 0.73201475, + "epoch": 0.3916968543733843, + "grad_norm": 6.53125, + "learning_rate": 9.429634764103807e-06, + "loss": 1.06335821, + "memory(GiB)": 302.58, + "step": 70040, + "train_speed(iter/s)": 0.123134 + }, + { + "acc": 0.73191533, + "epoch": 0.39180870384636357, + "grad_norm": 5.09375, + "learning_rate": 9.429205787267071e-06, + "loss": 1.02070599, + "memory(GiB)": 302.58, + "step": 70060, + "train_speed(iter/s)": 0.123151 + }, + { + "acc": 0.73441634, + "epoch": 0.39192055331934283, + "grad_norm": 7.75, + "learning_rate": 9.428776658937448e-06, + "loss": 1.04691982, + "memory(GiB)": 302.58, + "step": 70080, + "train_speed(iter/s)": 0.123168 + }, + { + "acc": 0.73567638, + "epoch": 0.3920324027923221, + "grad_norm": 7.0, + "learning_rate": 9.428347379129622e-06, + "loss": 1.05506525, + "memory(GiB)": 302.58, + "step": 70100, + "train_speed(iter/s)": 0.123184 + }, + { + "acc": 0.74044948, + "epoch": 0.39214425226530136, + "grad_norm": 5.1875, + "learning_rate": 9.427917947858276e-06, + "loss": 1.0432313, + "memory(GiB)": 302.58, + "step": 70120, + "train_speed(iter/s)": 0.1232 + }, + { + "acc": 0.71999278, + "epoch": 0.3922561017382806, + "grad_norm": 7.46875, + "learning_rate": 9.427488365138092e-06, + "loss": 1.11375914, + "memory(GiB)": 302.58, + "step": 70140, + "train_speed(iter/s)": 0.123217 + }, + { + "acc": 0.72853007, + "epoch": 0.3923679512112599, + "grad_norm": 6.15625, + "learning_rate": 9.427058630983767e-06, + "loss": 1.07067442, + "memory(GiB)": 302.58, + "step": 70160, + "train_speed(iter/s)": 0.123233 + }, + { + "acc": 0.72522173, + "epoch": 0.39247980068423916, + "grad_norm": 6.53125, + "learning_rate": 9.426628745410002e-06, + "loss": 1.10833883, + "memory(GiB)": 302.58, + "step": 70180, + "train_speed(iter/s)": 0.123249 + }, + { + "acc": 0.72739406, + "epoch": 0.3925916501572184, + "grad_norm": 4.65625, + "learning_rate": 9.426198708431495e-06, + "loss": 1.08159466, + "memory(GiB)": 302.58, + "step": 70200, + "train_speed(iter/s)": 0.123265 + }, + { + "acc": 0.73857899, + "epoch": 0.3927034996301977, + "grad_norm": 5.03125, + "learning_rate": 9.425768520062957e-06, + "loss": 1.04185543, + "memory(GiB)": 302.58, + "step": 70220, + "train_speed(iter/s)": 0.123281 + }, + { + "acc": 0.74031363, + "epoch": 0.39281534910317695, + "grad_norm": 7.1875, + "learning_rate": 9.425338180319103e-06, + "loss": 1.01773272, + "memory(GiB)": 302.58, + "step": 70240, + "train_speed(iter/s)": 0.123297 + }, + { + "acc": 0.74092307, + "epoch": 0.3929271985761562, + "grad_norm": 5.53125, + "learning_rate": 9.42490768921465e-06, + "loss": 1.03142567, + "memory(GiB)": 302.58, + "step": 70260, + "train_speed(iter/s)": 0.123314 + }, + { + "acc": 0.73180346, + "epoch": 0.3930390480491355, + "grad_norm": 7.46875, + "learning_rate": 9.424477046764325e-06, + "loss": 1.06997728, + "memory(GiB)": 302.58, + "step": 70280, + "train_speed(iter/s)": 0.12333 + }, + { + "acc": 0.76086731, + "epoch": 0.39315089752211474, + "grad_norm": 4.96875, + "learning_rate": 9.424046252982852e-06, + "loss": 0.93892193, + "memory(GiB)": 302.58, + "step": 70300, + "train_speed(iter/s)": 0.123347 + }, + { + "acc": 0.74683685, + "epoch": 0.393262746995094, + "grad_norm": 7.15625, + "learning_rate": 9.423615307884972e-06, + "loss": 0.9947588, + "memory(GiB)": 302.58, + "step": 70320, + "train_speed(iter/s)": 0.123363 + }, + { + "acc": 0.74617152, + "epoch": 0.39337459646807327, + "grad_norm": 8.5625, + "learning_rate": 9.423184211485419e-06, + "loss": 1.00563059, + "memory(GiB)": 302.58, + "step": 70340, + "train_speed(iter/s)": 0.12338 + }, + { + "acc": 0.72657928, + "epoch": 0.39348644594105253, + "grad_norm": 7.4375, + "learning_rate": 9.422752963798942e-06, + "loss": 1.07314014, + "memory(GiB)": 302.58, + "step": 70360, + "train_speed(iter/s)": 0.123397 + }, + { + "acc": 0.7412003, + "epoch": 0.3935982954140318, + "grad_norm": 6.59375, + "learning_rate": 9.422321564840289e-06, + "loss": 1.0250885, + "memory(GiB)": 302.58, + "step": 70380, + "train_speed(iter/s)": 0.123413 + }, + { + "acc": 0.72740474, + "epoch": 0.39371014488701106, + "grad_norm": 5.53125, + "learning_rate": 9.421890014624217e-06, + "loss": 1.10142937, + "memory(GiB)": 302.58, + "step": 70400, + "train_speed(iter/s)": 0.123429 + }, + { + "acc": 0.73627605, + "epoch": 0.3938219943599903, + "grad_norm": 6.1875, + "learning_rate": 9.421458313165483e-06, + "loss": 1.02652473, + "memory(GiB)": 302.58, + "step": 70420, + "train_speed(iter/s)": 0.123447 + }, + { + "acc": 0.74237032, + "epoch": 0.3939338438329696, + "grad_norm": 8.375, + "learning_rate": 9.421026460478856e-06, + "loss": 0.98614426, + "memory(GiB)": 302.58, + "step": 70440, + "train_speed(iter/s)": 0.123463 + }, + { + "acc": 0.71642318, + "epoch": 0.39404569330594885, + "grad_norm": 9.125, + "learning_rate": 9.420594456579105e-06, + "loss": 1.12573709, + "memory(GiB)": 302.58, + "step": 70460, + "train_speed(iter/s)": 0.123479 + }, + { + "acc": 0.74444513, + "epoch": 0.3941575427789281, + "grad_norm": 8.5, + "learning_rate": 9.420162301481006e-06, + "loss": 1.01825151, + "memory(GiB)": 302.58, + "step": 70480, + "train_speed(iter/s)": 0.123495 + }, + { + "acc": 0.74251256, + "epoch": 0.3942693922519074, + "grad_norm": 9.0, + "learning_rate": 9.41972999519934e-06, + "loss": 0.99243526, + "memory(GiB)": 302.58, + "step": 70500, + "train_speed(iter/s)": 0.123512 + }, + { + "acc": 0.73869376, + "epoch": 0.39438124172488664, + "grad_norm": 7.46875, + "learning_rate": 9.419297537748894e-06, + "loss": 1.06057386, + "memory(GiB)": 302.58, + "step": 70520, + "train_speed(iter/s)": 0.123529 + }, + { + "acc": 0.74168305, + "epoch": 0.3944930911978659, + "grad_norm": 7.125, + "learning_rate": 9.418864929144459e-06, + "loss": 1.0130827, + "memory(GiB)": 302.58, + "step": 70540, + "train_speed(iter/s)": 0.123545 + }, + { + "acc": 0.73164215, + "epoch": 0.3946049406708452, + "grad_norm": 5.625, + "learning_rate": 9.418432169400832e-06, + "loss": 1.07370949, + "memory(GiB)": 302.58, + "step": 70560, + "train_speed(iter/s)": 0.12356 + }, + { + "acc": 0.72491288, + "epoch": 0.39471679014382444, + "grad_norm": 6.34375, + "learning_rate": 9.417999258532813e-06, + "loss": 1.1097724, + "memory(GiB)": 302.58, + "step": 70580, + "train_speed(iter/s)": 0.123575 + }, + { + "acc": 0.72524681, + "epoch": 0.3948286396168037, + "grad_norm": 6.34375, + "learning_rate": 9.417566196555211e-06, + "loss": 1.09473639, + "memory(GiB)": 302.58, + "step": 70600, + "train_speed(iter/s)": 0.123592 + }, + { + "acc": 0.73214636, + "epoch": 0.39494048908978296, + "grad_norm": 6.65625, + "learning_rate": 9.41713298348284e-06, + "loss": 1.06492586, + "memory(GiB)": 302.58, + "step": 70620, + "train_speed(iter/s)": 0.123608 + }, + { + "acc": 0.73537874, + "epoch": 0.39505233856276223, + "grad_norm": 5.28125, + "learning_rate": 9.416699619330511e-06, + "loss": 1.02173347, + "memory(GiB)": 302.58, + "step": 70640, + "train_speed(iter/s)": 0.123625 + }, + { + "acc": 0.73587532, + "epoch": 0.3951641880357415, + "grad_norm": 6.5, + "learning_rate": 9.416266104113052e-06, + "loss": 1.03703308, + "memory(GiB)": 302.58, + "step": 70660, + "train_speed(iter/s)": 0.123641 + }, + { + "acc": 0.7357059, + "epoch": 0.39527603750872076, + "grad_norm": 6.90625, + "learning_rate": 9.415832437845288e-06, + "loss": 1.04476461, + "memory(GiB)": 302.58, + "step": 70680, + "train_speed(iter/s)": 0.123657 + }, + { + "acc": 0.73807268, + "epoch": 0.3953878869817, + "grad_norm": 7.90625, + "learning_rate": 9.41539862054205e-06, + "loss": 1.0431077, + "memory(GiB)": 302.58, + "step": 70700, + "train_speed(iter/s)": 0.123673 + }, + { + "acc": 0.75139251, + "epoch": 0.3954997364546793, + "grad_norm": 9.1875, + "learning_rate": 9.414964652218182e-06, + "loss": 0.99308281, + "memory(GiB)": 302.58, + "step": 70720, + "train_speed(iter/s)": 0.123689 + }, + { + "acc": 0.72159925, + "epoch": 0.39561158592765855, + "grad_norm": 6.625, + "learning_rate": 9.414530532888522e-06, + "loss": 1.07506647, + "memory(GiB)": 302.58, + "step": 70740, + "train_speed(iter/s)": 0.123706 + }, + { + "acc": 0.72444158, + "epoch": 0.3957234354006378, + "grad_norm": 8.1875, + "learning_rate": 9.414096262567919e-06, + "loss": 1.10203371, + "memory(GiB)": 302.58, + "step": 70760, + "train_speed(iter/s)": 0.123721 + }, + { + "acc": 0.7305583, + "epoch": 0.3958352848736171, + "grad_norm": 6.40625, + "learning_rate": 9.413661841271229e-06, + "loss": 1.06937866, + "memory(GiB)": 302.58, + "step": 70780, + "train_speed(iter/s)": 0.123738 + }, + { + "acc": 0.73067155, + "epoch": 0.39594713434659634, + "grad_norm": 5.71875, + "learning_rate": 9.413227269013306e-06, + "loss": 1.07963734, + "memory(GiB)": 302.58, + "step": 70800, + "train_speed(iter/s)": 0.123754 + }, + { + "acc": 0.74239192, + "epoch": 0.3960589838195756, + "grad_norm": 4.5, + "learning_rate": 9.41279254580902e-06, + "loss": 1.00391703, + "memory(GiB)": 302.58, + "step": 70820, + "train_speed(iter/s)": 0.123771 + }, + { + "acc": 0.71068983, + "epoch": 0.39617083329255487, + "grad_norm": 7.96875, + "learning_rate": 9.412357671673233e-06, + "loss": 1.17899199, + "memory(GiB)": 302.58, + "step": 70840, + "train_speed(iter/s)": 0.123788 + }, + { + "acc": 0.73481159, + "epoch": 0.39628268276553413, + "grad_norm": 6.09375, + "learning_rate": 9.411922646620824e-06, + "loss": 1.01518087, + "memory(GiB)": 302.58, + "step": 70860, + "train_speed(iter/s)": 0.123804 + }, + { + "acc": 0.74657307, + "epoch": 0.3963945322385134, + "grad_norm": 9.6875, + "learning_rate": 9.41148747066667e-06, + "loss": 1.00017452, + "memory(GiB)": 302.58, + "step": 70880, + "train_speed(iter/s)": 0.12382 + }, + { + "acc": 0.74023294, + "epoch": 0.39650638171149266, + "grad_norm": 6.03125, + "learning_rate": 9.411052143825657e-06, + "loss": 1.03875322, + "memory(GiB)": 302.58, + "step": 70900, + "train_speed(iter/s)": 0.123837 + }, + { + "acc": 0.72205625, + "epoch": 0.3966182311844719, + "grad_norm": 8.125, + "learning_rate": 9.410616666112673e-06, + "loss": 1.09686117, + "memory(GiB)": 302.58, + "step": 70920, + "train_speed(iter/s)": 0.123855 + }, + { + "acc": 0.73288031, + "epoch": 0.3967300806574512, + "grad_norm": 8.1875, + "learning_rate": 9.410181037542613e-06, + "loss": 1.03767014, + "memory(GiB)": 302.58, + "step": 70940, + "train_speed(iter/s)": 0.123871 + }, + { + "acc": 0.72583327, + "epoch": 0.39684193013043045, + "grad_norm": 7.6875, + "learning_rate": 9.409745258130377e-06, + "loss": 1.08747454, + "memory(GiB)": 302.58, + "step": 70960, + "train_speed(iter/s)": 0.123886 + }, + { + "acc": 0.73453941, + "epoch": 0.3969537796034097, + "grad_norm": 8.375, + "learning_rate": 9.409309327890871e-06, + "loss": 1.06893206, + "memory(GiB)": 302.58, + "step": 70980, + "train_speed(iter/s)": 0.123903 + }, + { + "acc": 0.73097367, + "epoch": 0.397065629076389, + "grad_norm": 8.0625, + "learning_rate": 9.408873246839004e-06, + "loss": 1.07501173, + "memory(GiB)": 302.58, + "step": 71000, + "train_speed(iter/s)": 0.123919 + }, + { + "acc": 0.73281093, + "epoch": 0.39717747854936825, + "grad_norm": 5.84375, + "learning_rate": 9.408437014989692e-06, + "loss": 1.04807215, + "memory(GiB)": 302.58, + "step": 71020, + "train_speed(iter/s)": 0.123935 + }, + { + "acc": 0.71763296, + "epoch": 0.3972893280223475, + "grad_norm": 8.875, + "learning_rate": 9.408000632357855e-06, + "loss": 1.1436902, + "memory(GiB)": 302.58, + "step": 71040, + "train_speed(iter/s)": 0.123952 + }, + { + "acc": 0.74703317, + "epoch": 0.3974011774953268, + "grad_norm": 7.25, + "learning_rate": 9.40756409895842e-06, + "loss": 0.99132853, + "memory(GiB)": 302.58, + "step": 71060, + "train_speed(iter/s)": 0.123967 + }, + { + "acc": 0.73906746, + "epoch": 0.39751302696830604, + "grad_norm": 5.65625, + "learning_rate": 9.407127414806316e-06, + "loss": 1.02589483, + "memory(GiB)": 302.58, + "step": 71080, + "train_speed(iter/s)": 0.123983 + }, + { + "acc": 0.73327122, + "epoch": 0.3976248764412853, + "grad_norm": 5.6875, + "learning_rate": 9.40669057991648e-06, + "loss": 1.05482492, + "memory(GiB)": 302.58, + "step": 71100, + "train_speed(iter/s)": 0.123999 + }, + { + "acc": 0.75580883, + "epoch": 0.39773672591426457, + "grad_norm": 9.125, + "learning_rate": 9.406253594303852e-06, + "loss": 0.96070652, + "memory(GiB)": 302.58, + "step": 71120, + "train_speed(iter/s)": 0.124016 + }, + { + "acc": 0.7482162, + "epoch": 0.39784857538724383, + "grad_norm": 6.9375, + "learning_rate": 9.40581645798338e-06, + "loss": 0.98809242, + "memory(GiB)": 302.58, + "step": 71140, + "train_speed(iter/s)": 0.124032 + }, + { + "acc": 0.73323069, + "epoch": 0.3979604248602231, + "grad_norm": 5.40625, + "learning_rate": 9.405379170970015e-06, + "loss": 1.054, + "memory(GiB)": 302.58, + "step": 71160, + "train_speed(iter/s)": 0.124049 + }, + { + "acc": 0.72476811, + "epoch": 0.39807227433320236, + "grad_norm": 7.3125, + "learning_rate": 9.404941733278714e-06, + "loss": 1.09581356, + "memory(GiB)": 302.58, + "step": 71180, + "train_speed(iter/s)": 0.124066 + }, + { + "acc": 0.728895, + "epoch": 0.3981841238061816, + "grad_norm": 5.875, + "learning_rate": 9.404504144924437e-06, + "loss": 1.07296553, + "memory(GiB)": 302.58, + "step": 71200, + "train_speed(iter/s)": 0.124082 + }, + { + "acc": 0.73955164, + "epoch": 0.3982959732791609, + "grad_norm": 4.65625, + "learning_rate": 9.404066405922152e-06, + "loss": 1.00776653, + "memory(GiB)": 302.58, + "step": 71220, + "train_speed(iter/s)": 0.124098 + }, + { + "acc": 0.75267406, + "epoch": 0.39840782275214015, + "grad_norm": 8.1875, + "learning_rate": 9.403628516286831e-06, + "loss": 0.98053617, + "memory(GiB)": 302.58, + "step": 71240, + "train_speed(iter/s)": 0.124112 + }, + { + "acc": 0.72546916, + "epoch": 0.3985196722251194, + "grad_norm": 7.625, + "learning_rate": 9.40319047603345e-06, + "loss": 1.08249512, + "memory(GiB)": 302.58, + "step": 71260, + "train_speed(iter/s)": 0.124128 + }, + { + "acc": 0.72385755, + "epoch": 0.3986315216980987, + "grad_norm": 10.5, + "learning_rate": 9.402752285176996e-06, + "loss": 1.0910017, + "memory(GiB)": 302.58, + "step": 71280, + "train_speed(iter/s)": 0.124145 + }, + { + "acc": 0.7464232, + "epoch": 0.39874337117107794, + "grad_norm": 5.625, + "learning_rate": 9.40231394373245e-06, + "loss": 0.99536037, + "memory(GiB)": 302.58, + "step": 71300, + "train_speed(iter/s)": 0.124162 + }, + { + "acc": 0.72935119, + "epoch": 0.3988552206440572, + "grad_norm": 7.46875, + "learning_rate": 9.401875451714807e-06, + "loss": 1.07534361, + "memory(GiB)": 302.58, + "step": 71320, + "train_speed(iter/s)": 0.124178 + }, + { + "acc": 0.74424877, + "epoch": 0.39896707011703647, + "grad_norm": 8.5625, + "learning_rate": 9.401436809139068e-06, + "loss": 1.00626984, + "memory(GiB)": 302.58, + "step": 71340, + "train_speed(iter/s)": 0.124195 + }, + { + "acc": 0.71591396, + "epoch": 0.39907891959001573, + "grad_norm": 7.625, + "learning_rate": 9.400998016020234e-06, + "loss": 1.1227829, + "memory(GiB)": 302.58, + "step": 71360, + "train_speed(iter/s)": 0.124211 + }, + { + "acc": 0.75547638, + "epoch": 0.399190769062995, + "grad_norm": 9.25, + "learning_rate": 9.400559072373311e-06, + "loss": 0.97025967, + "memory(GiB)": 302.58, + "step": 71380, + "train_speed(iter/s)": 0.124228 + }, + { + "acc": 0.73174295, + "epoch": 0.39930261853597426, + "grad_norm": 6.625, + "learning_rate": 9.400119978213313e-06, + "loss": 1.05331011, + "memory(GiB)": 302.58, + "step": 71400, + "train_speed(iter/s)": 0.124245 + }, + { + "acc": 0.70275941, + "epoch": 0.3994144680089535, + "grad_norm": 8.5625, + "learning_rate": 9.399680733555262e-06, + "loss": 1.19115772, + "memory(GiB)": 302.58, + "step": 71420, + "train_speed(iter/s)": 0.12426 + }, + { + "acc": 0.73657885, + "epoch": 0.3995263174819328, + "grad_norm": 8.375, + "learning_rate": 9.399241338414177e-06, + "loss": 1.05314693, + "memory(GiB)": 302.58, + "step": 71440, + "train_speed(iter/s)": 0.124277 + }, + { + "acc": 0.73431573, + "epoch": 0.39963816695491206, + "grad_norm": 4.9375, + "learning_rate": 9.39880179280509e-06, + "loss": 1.03123322, + "memory(GiB)": 302.58, + "step": 71460, + "train_speed(iter/s)": 0.124291 + }, + { + "acc": 0.73725023, + "epoch": 0.3997500164278913, + "grad_norm": 8.3125, + "learning_rate": 9.398362096743032e-06, + "loss": 1.04295235, + "memory(GiB)": 302.58, + "step": 71480, + "train_speed(iter/s)": 0.124309 + }, + { + "acc": 0.71919804, + "epoch": 0.3998618659008706, + "grad_norm": 6.78125, + "learning_rate": 9.397922250243044e-06, + "loss": 1.12465038, + "memory(GiB)": 302.58, + "step": 71500, + "train_speed(iter/s)": 0.124324 + }, + { + "acc": 0.75027218, + "epoch": 0.39997371537384985, + "grad_norm": 8.0625, + "learning_rate": 9.39748225332017e-06, + "loss": 0.97320938, + "memory(GiB)": 302.58, + "step": 71520, + "train_speed(iter/s)": 0.12434 + }, + { + "acc": 0.72800431, + "epoch": 0.4000855648468291, + "grad_norm": 4.46875, + "learning_rate": 9.39704210598946e-06, + "loss": 1.06810827, + "memory(GiB)": 302.58, + "step": 71540, + "train_speed(iter/s)": 0.124357 + }, + { + "acc": 0.73829479, + "epoch": 0.4001974143198084, + "grad_norm": 5.34375, + "learning_rate": 9.396601808265966e-06, + "loss": 1.02111988, + "memory(GiB)": 302.58, + "step": 71560, + "train_speed(iter/s)": 0.124373 + }, + { + "acc": 0.73592544, + "epoch": 0.40030926379278764, + "grad_norm": 6.40625, + "learning_rate": 9.396161360164748e-06, + "loss": 1.03648968, + "memory(GiB)": 302.58, + "step": 71580, + "train_speed(iter/s)": 0.124388 + }, + { + "acc": 0.74077001, + "epoch": 0.4004211132657669, + "grad_norm": 5.15625, + "learning_rate": 9.395720761700873e-06, + "loss": 1.02926369, + "memory(GiB)": 302.58, + "step": 71600, + "train_speed(iter/s)": 0.124404 + }, + { + "acc": 0.73312268, + "epoch": 0.40053296273874617, + "grad_norm": 6.65625, + "learning_rate": 9.395280012889409e-06, + "loss": 1.04731779, + "memory(GiB)": 302.58, + "step": 71620, + "train_speed(iter/s)": 0.124421 + }, + { + "acc": 0.74907427, + "epoch": 0.4006448122117255, + "grad_norm": 7.59375, + "learning_rate": 9.39483911374543e-06, + "loss": 0.97900391, + "memory(GiB)": 302.58, + "step": 71640, + "train_speed(iter/s)": 0.124436 + }, + { + "acc": 0.73505898, + "epoch": 0.40075666168470475, + "grad_norm": 5.84375, + "learning_rate": 9.394398064284021e-06, + "loss": 1.02881517, + "memory(GiB)": 302.58, + "step": 71660, + "train_speed(iter/s)": 0.124453 + }, + { + "acc": 0.73536696, + "epoch": 0.400868511157684, + "grad_norm": 5.3125, + "learning_rate": 9.393956864520262e-06, + "loss": 1.03947449, + "memory(GiB)": 302.58, + "step": 71680, + "train_speed(iter/s)": 0.124469 + }, + { + "acc": 0.72949944, + "epoch": 0.4009803606306633, + "grad_norm": 4.5, + "learning_rate": 9.393515514469245e-06, + "loss": 1.09254818, + "memory(GiB)": 302.58, + "step": 71700, + "train_speed(iter/s)": 0.124486 + }, + { + "acc": 0.73730459, + "epoch": 0.40109221010364254, + "grad_norm": 8.0, + "learning_rate": 9.393074014146066e-06, + "loss": 1.02891846, + "memory(GiB)": 302.58, + "step": 71720, + "train_speed(iter/s)": 0.124502 + }, + { + "acc": 0.72261014, + "epoch": 0.4012040595766218, + "grad_norm": 7.59375, + "learning_rate": 9.392632363565828e-06, + "loss": 1.11164427, + "memory(GiB)": 302.58, + "step": 71740, + "train_speed(iter/s)": 0.124518 + }, + { + "acc": 0.73084345, + "epoch": 0.40131590904960107, + "grad_norm": 9.0, + "learning_rate": 9.392190562743632e-06, + "loss": 1.0549118, + "memory(GiB)": 302.58, + "step": 71760, + "train_speed(iter/s)": 0.124534 + }, + { + "acc": 0.74086142, + "epoch": 0.40142775852258034, + "grad_norm": 7.09375, + "learning_rate": 9.39174861169459e-06, + "loss": 1.01688652, + "memory(GiB)": 302.58, + "step": 71780, + "train_speed(iter/s)": 0.124549 + }, + { + "acc": 0.73856354, + "epoch": 0.4015396079955596, + "grad_norm": 6.65625, + "learning_rate": 9.391306510433823e-06, + "loss": 1.03657894, + "memory(GiB)": 302.58, + "step": 71800, + "train_speed(iter/s)": 0.124565 + }, + { + "acc": 0.74436288, + "epoch": 0.40165145746853886, + "grad_norm": 6.40625, + "learning_rate": 9.39086425897645e-06, + "loss": 1.02493477, + "memory(GiB)": 302.58, + "step": 71820, + "train_speed(iter/s)": 0.12458 + }, + { + "acc": 0.72535973, + "epoch": 0.40176330694151813, + "grad_norm": 16.625, + "learning_rate": 9.390421857337592e-06, + "loss": 1.08392439, + "memory(GiB)": 302.58, + "step": 71840, + "train_speed(iter/s)": 0.124596 + }, + { + "acc": 0.71451569, + "epoch": 0.4018751564144974, + "grad_norm": 9.1875, + "learning_rate": 9.389979305532386e-06, + "loss": 1.14293947, + "memory(GiB)": 302.58, + "step": 71860, + "train_speed(iter/s)": 0.124612 + }, + { + "acc": 0.73523407, + "epoch": 0.40198700588747666, + "grad_norm": 5.4375, + "learning_rate": 9.389536603575968e-06, + "loss": 1.04086647, + "memory(GiB)": 302.58, + "step": 71880, + "train_speed(iter/s)": 0.124627 + }, + { + "acc": 0.729006, + "epoch": 0.4020988553604559, + "grad_norm": 7.65625, + "learning_rate": 9.38909375148348e-06, + "loss": 1.06929264, + "memory(GiB)": 302.58, + "step": 71900, + "train_speed(iter/s)": 0.124643 + }, + { + "acc": 0.73945704, + "epoch": 0.4022107048334352, + "grad_norm": 5.9375, + "learning_rate": 9.388650749270068e-06, + "loss": 1.02369118, + "memory(GiB)": 302.58, + "step": 71920, + "train_speed(iter/s)": 0.124659 + }, + { + "acc": 0.73996501, + "epoch": 0.40232255430641445, + "grad_norm": 6.4375, + "learning_rate": 9.388207596950884e-06, + "loss": 1.02363663, + "memory(GiB)": 302.58, + "step": 71940, + "train_speed(iter/s)": 0.124675 + }, + { + "acc": 0.73506532, + "epoch": 0.4024344037793937, + "grad_norm": 4.6875, + "learning_rate": 9.387764294541086e-06, + "loss": 1.051091, + "memory(GiB)": 302.58, + "step": 71960, + "train_speed(iter/s)": 0.124691 + }, + { + "acc": 0.73131294, + "epoch": 0.402546253252373, + "grad_norm": 13.0625, + "learning_rate": 9.387320842055836e-06, + "loss": 1.06391087, + "memory(GiB)": 302.58, + "step": 71980, + "train_speed(iter/s)": 0.124706 + }, + { + "acc": 0.74649467, + "epoch": 0.40265810272535224, + "grad_norm": 6.09375, + "learning_rate": 9.386877239510301e-06, + "loss": 1.00284443, + "memory(GiB)": 302.58, + "step": 72000, + "train_speed(iter/s)": 0.124722 + }, + { + "epoch": 0.40265810272535224, + "eval_acc": 0.7002639547124228, + "eval_loss": 1.0415623188018799, + "eval_runtime": 7507.9449, + "eval_samples_per_second": 10.027, + "eval_steps_per_second": 10.027, + "step": 72000 + }, + { + "acc": 0.72817025, + "epoch": 0.4027699521983315, + "grad_norm": 5.6875, + "learning_rate": 9.386433486919655e-06, + "loss": 1.06572123, + "memory(GiB)": 302.58, + "step": 72020, + "train_speed(iter/s)": 0.123111 + }, + { + "acc": 0.73068171, + "epoch": 0.40288180167131077, + "grad_norm": 7.0625, + "learning_rate": 9.385989584299073e-06, + "loss": 1.05443459, + "memory(GiB)": 302.58, + "step": 72040, + "train_speed(iter/s)": 0.123128 + }, + { + "acc": 0.74497762, + "epoch": 0.40299365114429003, + "grad_norm": 5.125, + "learning_rate": 9.38554553166374e-06, + "loss": 0.99211149, + "memory(GiB)": 302.58, + "step": 72060, + "train_speed(iter/s)": 0.123144 + }, + { + "acc": 0.73349924, + "epoch": 0.4031055006172693, + "grad_norm": 4.5625, + "learning_rate": 9.385101329028846e-06, + "loss": 1.05448427, + "memory(GiB)": 302.58, + "step": 72080, + "train_speed(iter/s)": 0.12316 + }, + { + "acc": 0.72817202, + "epoch": 0.40321735009024856, + "grad_norm": 6.8125, + "learning_rate": 9.38465697640958e-06, + "loss": 1.08958187, + "memory(GiB)": 302.58, + "step": 72100, + "train_speed(iter/s)": 0.123177 + }, + { + "acc": 0.70817375, + "epoch": 0.4033291995632278, + "grad_norm": 7.5625, + "learning_rate": 9.384212473821139e-06, + "loss": 1.14633341, + "memory(GiB)": 302.58, + "step": 72120, + "train_speed(iter/s)": 0.123193 + }, + { + "acc": 0.73689399, + "epoch": 0.4034410490362071, + "grad_norm": 7.21875, + "learning_rate": 9.383767821278734e-06, + "loss": 1.03881216, + "memory(GiB)": 302.58, + "step": 72140, + "train_speed(iter/s)": 0.123209 + }, + { + "acc": 0.73985376, + "epoch": 0.40355289850918635, + "grad_norm": 6.71875, + "learning_rate": 9.383323018797565e-06, + "loss": 1.0161027, + "memory(GiB)": 302.58, + "step": 72160, + "train_speed(iter/s)": 0.123225 + }, + { + "acc": 0.73786798, + "epoch": 0.4036647479821656, + "grad_norm": 5.15625, + "learning_rate": 9.382878066392852e-06, + "loss": 1.02298107, + "memory(GiB)": 302.58, + "step": 72180, + "train_speed(iter/s)": 0.123241 + }, + { + "acc": 0.73831983, + "epoch": 0.4037765974551449, + "grad_norm": 11.5, + "learning_rate": 9.382432964079813e-06, + "loss": 1.04744225, + "memory(GiB)": 302.58, + "step": 72200, + "train_speed(iter/s)": 0.123258 + }, + { + "acc": 0.73475623, + "epoch": 0.40388844692812415, + "grad_norm": 5.9375, + "learning_rate": 9.381987711873667e-06, + "loss": 1.02930822, + "memory(GiB)": 302.58, + "step": 72220, + "train_speed(iter/s)": 0.123274 + }, + { + "acc": 0.74172044, + "epoch": 0.4040002964011034, + "grad_norm": 6.96875, + "learning_rate": 9.381542309789649e-06, + "loss": 1.02973289, + "memory(GiB)": 302.58, + "step": 72240, + "train_speed(iter/s)": 0.12329 + }, + { + "acc": 0.73254962, + "epoch": 0.4041121458740827, + "grad_norm": 9.5625, + "learning_rate": 9.381096757842988e-06, + "loss": 1.05455303, + "memory(GiB)": 302.58, + "step": 72260, + "train_speed(iter/s)": 0.123306 + }, + { + "acc": 0.73720341, + "epoch": 0.40422399534706194, + "grad_norm": 6.9375, + "learning_rate": 9.380651056048928e-06, + "loss": 1.03994789, + "memory(GiB)": 302.58, + "step": 72280, + "train_speed(iter/s)": 0.123323 + }, + { + "acc": 0.73573289, + "epoch": 0.4043358448200412, + "grad_norm": 7.53125, + "learning_rate": 9.38020520442271e-06, + "loss": 1.03532219, + "memory(GiB)": 302.58, + "step": 72300, + "train_speed(iter/s)": 0.123337 + }, + { + "acc": 0.74097562, + "epoch": 0.40444769429302047, + "grad_norm": 4.78125, + "learning_rate": 9.379759202979586e-06, + "loss": 1.02801952, + "memory(GiB)": 302.58, + "step": 72320, + "train_speed(iter/s)": 0.123353 + }, + { + "acc": 0.7318018, + "epoch": 0.40455954376599973, + "grad_norm": 7.375, + "learning_rate": 9.379313051734808e-06, + "loss": 1.06318512, + "memory(GiB)": 302.58, + "step": 72340, + "train_speed(iter/s)": 0.12337 + }, + { + "acc": 0.71745148, + "epoch": 0.404671393238979, + "grad_norm": 7.5625, + "learning_rate": 9.378866750703639e-06, + "loss": 1.11868248, + "memory(GiB)": 302.58, + "step": 72360, + "train_speed(iter/s)": 0.123387 + }, + { + "acc": 0.73416586, + "epoch": 0.40478324271195826, + "grad_norm": 5.28125, + "learning_rate": 9.37842029990134e-06, + "loss": 1.04955235, + "memory(GiB)": 302.58, + "step": 72380, + "train_speed(iter/s)": 0.123402 + }, + { + "acc": 0.74510331, + "epoch": 0.4048950921849375, + "grad_norm": 8.125, + "learning_rate": 9.377973699343183e-06, + "loss": 0.99632711, + "memory(GiB)": 302.58, + "step": 72400, + "train_speed(iter/s)": 0.123419 + }, + { + "acc": 0.73763485, + "epoch": 0.4050069416579168, + "grad_norm": 6.9375, + "learning_rate": 9.377526949044446e-06, + "loss": 1.03049603, + "memory(GiB)": 302.58, + "step": 72420, + "train_speed(iter/s)": 0.123435 + }, + { + "acc": 0.73894777, + "epoch": 0.40511879113089605, + "grad_norm": 7.125, + "learning_rate": 9.377080049020405e-06, + "loss": 1.03394823, + "memory(GiB)": 302.58, + "step": 72440, + "train_speed(iter/s)": 0.123451 + }, + { + "acc": 0.75737996, + "epoch": 0.4052306406038753, + "grad_norm": 10.8125, + "learning_rate": 9.376632999286346e-06, + "loss": 0.92704268, + "memory(GiB)": 302.58, + "step": 72460, + "train_speed(iter/s)": 0.123466 + }, + { + "acc": 0.72924466, + "epoch": 0.4053424900768546, + "grad_norm": 8.375, + "learning_rate": 9.376185799857562e-06, + "loss": 1.09219828, + "memory(GiB)": 302.58, + "step": 72480, + "train_speed(iter/s)": 0.123481 + }, + { + "acc": 0.73734813, + "epoch": 0.40545433954983384, + "grad_norm": 7.09375, + "learning_rate": 9.375738450749347e-06, + "loss": 1.02228203, + "memory(GiB)": 302.58, + "step": 72500, + "train_speed(iter/s)": 0.123497 + }, + { + "acc": 0.72636309, + "epoch": 0.4055661890228131, + "grad_norm": 4.09375, + "learning_rate": 9.375290951977001e-06, + "loss": 1.07649832, + "memory(GiB)": 302.58, + "step": 72520, + "train_speed(iter/s)": 0.123512 + }, + { + "acc": 0.73653393, + "epoch": 0.40567803849579237, + "grad_norm": 7.78125, + "learning_rate": 9.374843303555832e-06, + "loss": 1.04228754, + "memory(GiB)": 302.58, + "step": 72540, + "train_speed(iter/s)": 0.123528 + }, + { + "acc": 0.733148, + "epoch": 0.40578988796877163, + "grad_norm": 8.625, + "learning_rate": 9.374395505501148e-06, + "loss": 1.05179634, + "memory(GiB)": 302.58, + "step": 72560, + "train_speed(iter/s)": 0.123544 + }, + { + "acc": 0.73387437, + "epoch": 0.4059017374417509, + "grad_norm": 4.96875, + "learning_rate": 9.37394755782827e-06, + "loss": 1.02198277, + "memory(GiB)": 302.58, + "step": 72580, + "train_speed(iter/s)": 0.123561 + }, + { + "acc": 0.7384243, + "epoch": 0.40601358691473016, + "grad_norm": 6.1875, + "learning_rate": 9.373499460552511e-06, + "loss": 1.05252237, + "memory(GiB)": 302.58, + "step": 72600, + "train_speed(iter/s)": 0.123576 + }, + { + "acc": 0.75049696, + "epoch": 0.4061254363877094, + "grad_norm": 10.0625, + "learning_rate": 9.373051213689205e-06, + "loss": 0.97746143, + "memory(GiB)": 302.58, + "step": 72620, + "train_speed(iter/s)": 0.123593 + }, + { + "acc": 0.729037, + "epoch": 0.4062372858606887, + "grad_norm": 9.25, + "learning_rate": 9.372602817253682e-06, + "loss": 1.08034029, + "memory(GiB)": 302.58, + "step": 72640, + "train_speed(iter/s)": 0.123609 + }, + { + "acc": 0.73809781, + "epoch": 0.40634913533366795, + "grad_norm": 9.0625, + "learning_rate": 9.372154271261277e-06, + "loss": 1.04161987, + "memory(GiB)": 302.58, + "step": 72660, + "train_speed(iter/s)": 0.123624 + }, + { + "acc": 0.74702744, + "epoch": 0.4064609848066472, + "grad_norm": 5.375, + "learning_rate": 9.371705575727333e-06, + "loss": 0.99577818, + "memory(GiB)": 302.58, + "step": 72680, + "train_speed(iter/s)": 0.123641 + }, + { + "acc": 0.73556795, + "epoch": 0.4065728342796265, + "grad_norm": 5.125, + "learning_rate": 9.371256730667196e-06, + "loss": 1.05366678, + "memory(GiB)": 302.58, + "step": 72700, + "train_speed(iter/s)": 0.123657 + }, + { + "acc": 0.72242513, + "epoch": 0.40668468375260575, + "grad_norm": 4.46875, + "learning_rate": 9.370807736096216e-06, + "loss": 1.09115105, + "memory(GiB)": 302.58, + "step": 72720, + "train_speed(iter/s)": 0.123671 + }, + { + "acc": 0.72986937, + "epoch": 0.406796533225585, + "grad_norm": 8.25, + "learning_rate": 9.370358592029753e-06, + "loss": 1.08182697, + "memory(GiB)": 302.58, + "step": 72740, + "train_speed(iter/s)": 0.123687 + }, + { + "acc": 0.73807135, + "epoch": 0.4069083826985643, + "grad_norm": 6.3125, + "learning_rate": 9.369909298483168e-06, + "loss": 1.02427692, + "memory(GiB)": 302.58, + "step": 72760, + "train_speed(iter/s)": 0.123703 + }, + { + "acc": 0.71386514, + "epoch": 0.40702023217154354, + "grad_norm": 5.0625, + "learning_rate": 9.369459855471828e-06, + "loss": 1.13819265, + "memory(GiB)": 302.58, + "step": 72780, + "train_speed(iter/s)": 0.123719 + }, + { + "acc": 0.73799009, + "epoch": 0.4071320816445228, + "grad_norm": 8.1875, + "learning_rate": 9.369010263011106e-06, + "loss": 1.03082409, + "memory(GiB)": 302.58, + "step": 72800, + "train_speed(iter/s)": 0.123736 + }, + { + "acc": 0.71998568, + "epoch": 0.40724393111750207, + "grad_norm": 7.625, + "learning_rate": 9.36856052111638e-06, + "loss": 1.10234966, + "memory(GiB)": 302.58, + "step": 72820, + "train_speed(iter/s)": 0.123751 + }, + { + "acc": 0.73004413, + "epoch": 0.40735578059048133, + "grad_norm": 8.0625, + "learning_rate": 9.368110629803032e-06, + "loss": 1.06088438, + "memory(GiB)": 302.58, + "step": 72840, + "train_speed(iter/s)": 0.123767 + }, + { + "acc": 0.75241203, + "epoch": 0.4074676300634606, + "grad_norm": 5.0, + "learning_rate": 9.367660589086448e-06, + "loss": 0.94560041, + "memory(GiB)": 302.58, + "step": 72860, + "train_speed(iter/s)": 0.123781 + }, + { + "acc": 0.72379627, + "epoch": 0.40757947953643986, + "grad_norm": 6.46875, + "learning_rate": 9.367210398982023e-06, + "loss": 1.09178953, + "memory(GiB)": 302.58, + "step": 72880, + "train_speed(iter/s)": 0.123797 + }, + { + "acc": 0.75161762, + "epoch": 0.4076913290094191, + "grad_norm": 9.8125, + "learning_rate": 9.366760059505156e-06, + "loss": 0.97091341, + "memory(GiB)": 302.58, + "step": 72900, + "train_speed(iter/s)": 0.123814 + }, + { + "acc": 0.76424923, + "epoch": 0.4078031784823984, + "grad_norm": 4.53125, + "learning_rate": 9.366309570671247e-06, + "loss": 0.90244598, + "memory(GiB)": 302.58, + "step": 72920, + "train_speed(iter/s)": 0.12383 + }, + { + "acc": 0.75150433, + "epoch": 0.40791502795537765, + "grad_norm": 6.15625, + "learning_rate": 9.365858932495703e-06, + "loss": 0.97382202, + "memory(GiB)": 302.58, + "step": 72940, + "train_speed(iter/s)": 0.123847 + }, + { + "acc": 0.72938175, + "epoch": 0.4080268774283569, + "grad_norm": 6.09375, + "learning_rate": 9.365408144993943e-06, + "loss": 1.05217676, + "memory(GiB)": 302.58, + "step": 72960, + "train_speed(iter/s)": 0.123863 + }, + { + "acc": 0.73822556, + "epoch": 0.4081387269013362, + "grad_norm": 6.03125, + "learning_rate": 9.364957208181383e-06, + "loss": 1.0145751, + "memory(GiB)": 302.58, + "step": 72980, + "train_speed(iter/s)": 0.12388 + }, + { + "acc": 0.73686042, + "epoch": 0.40825057637431544, + "grad_norm": 5.5625, + "learning_rate": 9.364506122073443e-06, + "loss": 1.034762, + "memory(GiB)": 302.58, + "step": 73000, + "train_speed(iter/s)": 0.123896 + }, + { + "acc": 0.74651985, + "epoch": 0.4083624258472947, + "grad_norm": 8.3125, + "learning_rate": 9.364054886685554e-06, + "loss": 0.98548965, + "memory(GiB)": 302.58, + "step": 73020, + "train_speed(iter/s)": 0.123912 + }, + { + "acc": 0.73892884, + "epoch": 0.40847427532027397, + "grad_norm": 5.84375, + "learning_rate": 9.363603502033151e-06, + "loss": 1.02774019, + "memory(GiB)": 302.58, + "step": 73040, + "train_speed(iter/s)": 0.123929 + }, + { + "acc": 0.74698114, + "epoch": 0.40858612479325324, + "grad_norm": 7.0, + "learning_rate": 9.363151968131672e-06, + "loss": 0.97627869, + "memory(GiB)": 302.58, + "step": 73060, + "train_speed(iter/s)": 0.123944 + }, + { + "acc": 0.74289083, + "epoch": 0.4086979742662325, + "grad_norm": 5.8125, + "learning_rate": 9.36270028499656e-06, + "loss": 1.01181965, + "memory(GiB)": 302.58, + "step": 73080, + "train_speed(iter/s)": 0.12396 + }, + { + "acc": 0.72403722, + "epoch": 0.40880982373921176, + "grad_norm": 7.40625, + "learning_rate": 9.362248452643265e-06, + "loss": 1.10987434, + "memory(GiB)": 302.58, + "step": 73100, + "train_speed(iter/s)": 0.123976 + }, + { + "acc": 0.72003298, + "epoch": 0.40892167321219103, + "grad_norm": 5.125, + "learning_rate": 9.36179647108724e-06, + "loss": 1.12625284, + "memory(GiB)": 302.58, + "step": 73120, + "train_speed(iter/s)": 0.123992 + }, + { + "acc": 0.7469183, + "epoch": 0.4090335226851703, + "grad_norm": 9.4375, + "learning_rate": 9.361344340343944e-06, + "loss": 0.9759038, + "memory(GiB)": 302.58, + "step": 73140, + "train_speed(iter/s)": 0.124008 + }, + { + "acc": 0.72368155, + "epoch": 0.40914537215814956, + "grad_norm": 7.78125, + "learning_rate": 9.360892060428843e-06, + "loss": 1.10962019, + "memory(GiB)": 302.58, + "step": 73160, + "train_speed(iter/s)": 0.124025 + }, + { + "acc": 0.72566872, + "epoch": 0.4092572216311288, + "grad_norm": 5.1875, + "learning_rate": 9.360439631357406e-06, + "loss": 1.10946846, + "memory(GiB)": 302.58, + "step": 73180, + "train_speed(iter/s)": 0.12404 + }, + { + "acc": 0.74129205, + "epoch": 0.4093690711041081, + "grad_norm": 13.4375, + "learning_rate": 9.359987053145107e-06, + "loss": 1.00846701, + "memory(GiB)": 302.58, + "step": 73200, + "train_speed(iter/s)": 0.124055 + }, + { + "acc": 0.73555808, + "epoch": 0.40948092057708735, + "grad_norm": 9.625, + "learning_rate": 9.359534325807424e-06, + "loss": 1.04797258, + "memory(GiB)": 302.58, + "step": 73220, + "train_speed(iter/s)": 0.124071 + }, + { + "acc": 0.73381104, + "epoch": 0.4095927700500666, + "grad_norm": 8.5625, + "learning_rate": 9.359081449359845e-06, + "loss": 1.04527979, + "memory(GiB)": 302.58, + "step": 73240, + "train_speed(iter/s)": 0.124086 + }, + { + "acc": 0.72847509, + "epoch": 0.4097046195230459, + "grad_norm": 8.9375, + "learning_rate": 9.358628423817857e-06, + "loss": 1.07878656, + "memory(GiB)": 302.58, + "step": 73260, + "train_speed(iter/s)": 0.124102 + }, + { + "acc": 0.75406156, + "epoch": 0.40981646899602514, + "grad_norm": 5.9375, + "learning_rate": 9.358175249196957e-06, + "loss": 0.94266338, + "memory(GiB)": 302.58, + "step": 73280, + "train_speed(iter/s)": 0.124118 + }, + { + "acc": 0.74247298, + "epoch": 0.4099283184690044, + "grad_norm": 7.21875, + "learning_rate": 9.357721925512645e-06, + "loss": 1.0031702, + "memory(GiB)": 302.58, + "step": 73300, + "train_speed(iter/s)": 0.124132 + }, + { + "acc": 0.74521041, + "epoch": 0.41004016794198367, + "grad_norm": 7.78125, + "learning_rate": 9.357268452780424e-06, + "loss": 1.01053886, + "memory(GiB)": 302.58, + "step": 73320, + "train_speed(iter/s)": 0.124149 + }, + { + "acc": 0.7402936, + "epoch": 0.41015201741496293, + "grad_norm": 6.59375, + "learning_rate": 9.356814831015803e-06, + "loss": 1.02071228, + "memory(GiB)": 302.58, + "step": 73340, + "train_speed(iter/s)": 0.124164 + }, + { + "acc": 0.73715215, + "epoch": 0.4102638668879422, + "grad_norm": 6.875, + "learning_rate": 9.356361060234304e-06, + "loss": 1.05599699, + "memory(GiB)": 302.58, + "step": 73360, + "train_speed(iter/s)": 0.12418 + }, + { + "acc": 0.73607039, + "epoch": 0.41037571636092146, + "grad_norm": 6.375, + "learning_rate": 9.355907140451441e-06, + "loss": 1.05756922, + "memory(GiB)": 302.58, + "step": 73380, + "train_speed(iter/s)": 0.124197 + }, + { + "acc": 0.7459805, + "epoch": 0.4104875658339007, + "grad_norm": 6.09375, + "learning_rate": 9.35545307168274e-06, + "loss": 1.02032309, + "memory(GiB)": 302.58, + "step": 73400, + "train_speed(iter/s)": 0.124213 + }, + { + "acc": 0.73346291, + "epoch": 0.41059941530688, + "grad_norm": 7.53125, + "learning_rate": 9.354998853943737e-06, + "loss": 1.05345793, + "memory(GiB)": 302.58, + "step": 73420, + "train_speed(iter/s)": 0.12423 + }, + { + "acc": 0.71585269, + "epoch": 0.41071126477985925, + "grad_norm": 4.53125, + "learning_rate": 9.354544487249962e-06, + "loss": 1.12640972, + "memory(GiB)": 302.58, + "step": 73440, + "train_speed(iter/s)": 0.124246 + }, + { + "acc": 0.75066113, + "epoch": 0.4108231142528385, + "grad_norm": 6.09375, + "learning_rate": 9.354089971616956e-06, + "loss": 0.9708602, + "memory(GiB)": 302.58, + "step": 73460, + "train_speed(iter/s)": 0.12426 + }, + { + "acc": 0.72454367, + "epoch": 0.4109349637258178, + "grad_norm": 4.34375, + "learning_rate": 9.353635307060269e-06, + "loss": 1.0763154, + "memory(GiB)": 302.58, + "step": 73480, + "train_speed(iter/s)": 0.124277 + }, + { + "acc": 0.74431691, + "epoch": 0.41104681319879705, + "grad_norm": 6.78125, + "learning_rate": 9.353180493595449e-06, + "loss": 1.00432768, + "memory(GiB)": 302.58, + "step": 73500, + "train_speed(iter/s)": 0.124293 + }, + { + "acc": 0.739926, + "epoch": 0.4111586626717763, + "grad_norm": 7.59375, + "learning_rate": 9.35272553123805e-06, + "loss": 1.02100687, + "memory(GiB)": 302.58, + "step": 73520, + "train_speed(iter/s)": 0.124309 + }, + { + "acc": 0.72937026, + "epoch": 0.4112705121447556, + "grad_norm": 5.875, + "learning_rate": 9.35227042000364e-06, + "loss": 1.05984335, + "memory(GiB)": 302.58, + "step": 73540, + "train_speed(iter/s)": 0.124325 + }, + { + "acc": 0.72010479, + "epoch": 0.41138236161773484, + "grad_norm": 5.9375, + "learning_rate": 9.351815159907778e-06, + "loss": 1.09655428, + "memory(GiB)": 302.58, + "step": 73560, + "train_speed(iter/s)": 0.124341 + }, + { + "acc": 0.71497402, + "epoch": 0.4114942110907141, + "grad_norm": 10.0, + "learning_rate": 9.351359750966038e-06, + "loss": 1.11751766, + "memory(GiB)": 302.58, + "step": 73580, + "train_speed(iter/s)": 0.124357 + }, + { + "acc": 0.73021541, + "epoch": 0.41160606056369337, + "grad_norm": 7.65625, + "learning_rate": 9.350904193193998e-06, + "loss": 1.06063738, + "memory(GiB)": 302.58, + "step": 73600, + "train_speed(iter/s)": 0.124371 + }, + { + "acc": 0.72742686, + "epoch": 0.41171791003667263, + "grad_norm": 7.21875, + "learning_rate": 9.350448486607237e-06, + "loss": 1.07665157, + "memory(GiB)": 302.58, + "step": 73620, + "train_speed(iter/s)": 0.124387 + }, + { + "acc": 0.73239217, + "epoch": 0.4118297595096519, + "grad_norm": 5.75, + "learning_rate": 9.349992631221344e-06, + "loss": 1.0334837, + "memory(GiB)": 302.58, + "step": 73640, + "train_speed(iter/s)": 0.124403 + }, + { + "acc": 0.74004087, + "epoch": 0.41194160898263116, + "grad_norm": 8.0, + "learning_rate": 9.34953662705191e-06, + "loss": 1.00301809, + "memory(GiB)": 302.58, + "step": 73660, + "train_speed(iter/s)": 0.12442 + }, + { + "acc": 0.72279449, + "epoch": 0.4120534584556104, + "grad_norm": 5.4375, + "learning_rate": 9.34908047411453e-06, + "loss": 1.09649248, + "memory(GiB)": 302.58, + "step": 73680, + "train_speed(iter/s)": 0.124436 + }, + { + "acc": 0.73719387, + "epoch": 0.4121653079285897, + "grad_norm": 7.375, + "learning_rate": 9.34862417242481e-06, + "loss": 1.0359889, + "memory(GiB)": 302.58, + "step": 73700, + "train_speed(iter/s)": 0.124452 + }, + { + "acc": 0.7362803, + "epoch": 0.41227715740156895, + "grad_norm": 6.375, + "learning_rate": 9.348167721998351e-06, + "loss": 1.01454535, + "memory(GiB)": 302.58, + "step": 73720, + "train_speed(iter/s)": 0.124468 + }, + { + "acc": 0.7389452, + "epoch": 0.4123890068745482, + "grad_norm": 5.78125, + "learning_rate": 9.34771112285077e-06, + "loss": 1.01200294, + "memory(GiB)": 302.58, + "step": 73740, + "train_speed(iter/s)": 0.124484 + }, + { + "acc": 0.73904071, + "epoch": 0.4125008563475275, + "grad_norm": 6.59375, + "learning_rate": 9.347254374997681e-06, + "loss": 1.03276434, + "memory(GiB)": 302.58, + "step": 73760, + "train_speed(iter/s)": 0.1245 + }, + { + "acc": 0.75066886, + "epoch": 0.41261270582050674, + "grad_norm": 8.1875, + "learning_rate": 9.34679747845471e-06, + "loss": 0.98461142, + "memory(GiB)": 302.58, + "step": 73780, + "train_speed(iter/s)": 0.124517 + }, + { + "acc": 0.74646192, + "epoch": 0.412724555293486, + "grad_norm": 7.3125, + "learning_rate": 9.34634043323748e-06, + "loss": 0.99396009, + "memory(GiB)": 302.58, + "step": 73800, + "train_speed(iter/s)": 0.124533 + }, + { + "acc": 0.74424081, + "epoch": 0.41283640476646527, + "grad_norm": 9.0625, + "learning_rate": 9.345883239361624e-06, + "loss": 0.98984394, + "memory(GiB)": 302.58, + "step": 73820, + "train_speed(iter/s)": 0.12455 + }, + { + "acc": 0.7304245, + "epoch": 0.41294825423944453, + "grad_norm": 7.34375, + "learning_rate": 9.345425896842783e-06, + "loss": 1.07767706, + "memory(GiB)": 302.58, + "step": 73840, + "train_speed(iter/s)": 0.124565 + }, + { + "acc": 0.73334036, + "epoch": 0.4130601037124238, + "grad_norm": 5.84375, + "learning_rate": 9.344968405696596e-06, + "loss": 1.05743141, + "memory(GiB)": 302.58, + "step": 73860, + "train_speed(iter/s)": 0.124582 + }, + { + "acc": 0.73053694, + "epoch": 0.41317195318540306, + "grad_norm": 7.0, + "learning_rate": 9.344510765938715e-06, + "loss": 1.08280096, + "memory(GiB)": 302.58, + "step": 73880, + "train_speed(iter/s)": 0.124597 + }, + { + "acc": 0.72986221, + "epoch": 0.4132838026583823, + "grad_norm": 11.875, + "learning_rate": 9.344052977584786e-06, + "loss": 1.06230116, + "memory(GiB)": 302.58, + "step": 73900, + "train_speed(iter/s)": 0.124613 + }, + { + "acc": 0.75008974, + "epoch": 0.4133956521313616, + "grad_norm": 5.3125, + "learning_rate": 9.343595040650473e-06, + "loss": 0.99050455, + "memory(GiB)": 302.58, + "step": 73920, + "train_speed(iter/s)": 0.124627 + }, + { + "acc": 0.74025345, + "epoch": 0.41350750160434085, + "grad_norm": 7.09375, + "learning_rate": 9.343136955151435e-06, + "loss": 1.01287222, + "memory(GiB)": 302.58, + "step": 73940, + "train_speed(iter/s)": 0.124644 + }, + { + "acc": 0.72265563, + "epoch": 0.4136193510773201, + "grad_norm": 6.90625, + "learning_rate": 9.342678721103342e-06, + "loss": 1.07709808, + "memory(GiB)": 302.58, + "step": 73960, + "train_speed(iter/s)": 0.12466 + }, + { + "acc": 0.72682848, + "epoch": 0.4137312005502994, + "grad_norm": 6.25, + "learning_rate": 9.342220338521865e-06, + "loss": 1.07926064, + "memory(GiB)": 302.58, + "step": 73980, + "train_speed(iter/s)": 0.124675 + }, + { + "acc": 0.74547839, + "epoch": 0.41384305002327865, + "grad_norm": 7.21875, + "learning_rate": 9.341761807422687e-06, + "loss": 0.99813604, + "memory(GiB)": 302.58, + "step": 74000, + "train_speed(iter/s)": 0.124691 + }, + { + "epoch": 0.41384305002327865, + "eval_acc": 0.7004765717516203, + "eval_loss": 1.040907621383667, + "eval_runtime": 7566.7435, + "eval_samples_per_second": 9.949, + "eval_steps_per_second": 9.949, + "step": 74000 + }, + { + "acc": 0.70968184, + "epoch": 0.4139548994962579, + "grad_norm": 7.03125, + "learning_rate": 9.341303127821486e-06, + "loss": 1.16365404, + "memory(GiB)": 302.58, + "step": 74020, + "train_speed(iter/s)": 0.12311 + }, + { + "acc": 0.72470727, + "epoch": 0.4140667489692372, + "grad_norm": 7.34375, + "learning_rate": 9.340844299733951e-06, + "loss": 1.09867773, + "memory(GiB)": 302.58, + "step": 74040, + "train_speed(iter/s)": 0.123125 + }, + { + "acc": 0.7392549, + "epoch": 0.41417859844221644, + "grad_norm": 5.09375, + "learning_rate": 9.340385323175779e-06, + "loss": 1.03223925, + "memory(GiB)": 302.58, + "step": 74060, + "train_speed(iter/s)": 0.123142 + }, + { + "acc": 0.75406203, + "epoch": 0.4142904479151957, + "grad_norm": 6.84375, + "learning_rate": 9.339926198162663e-06, + "loss": 0.94388914, + "memory(GiB)": 302.58, + "step": 74080, + "train_speed(iter/s)": 0.123158 + }, + { + "acc": 0.75196476, + "epoch": 0.41440229738817497, + "grad_norm": 6.21875, + "learning_rate": 9.339466924710312e-06, + "loss": 0.96347017, + "memory(GiB)": 302.58, + "step": 74100, + "train_speed(iter/s)": 0.123173 + }, + { + "acc": 0.73916111, + "epoch": 0.41451414686115423, + "grad_norm": 5.53125, + "learning_rate": 9.33900750283443e-06, + "loss": 1.05118132, + "memory(GiB)": 302.58, + "step": 74120, + "train_speed(iter/s)": 0.123188 + }, + { + "acc": 0.74509921, + "epoch": 0.41462599633413355, + "grad_norm": 5.59375, + "learning_rate": 9.338547932550734e-06, + "loss": 1.01098213, + "memory(GiB)": 302.58, + "step": 74140, + "train_speed(iter/s)": 0.123204 + }, + { + "acc": 0.74386711, + "epoch": 0.4147378458071128, + "grad_norm": 6.78125, + "learning_rate": 9.33808821387494e-06, + "loss": 0.99165516, + "memory(GiB)": 302.58, + "step": 74160, + "train_speed(iter/s)": 0.123219 + }, + { + "acc": 0.73941655, + "epoch": 0.4148496952800921, + "grad_norm": 10.0, + "learning_rate": 9.337628346822775e-06, + "loss": 1.03245659, + "memory(GiB)": 302.58, + "step": 74180, + "train_speed(iter/s)": 0.123234 + }, + { + "acc": 0.73153353, + "epoch": 0.41496154475307134, + "grad_norm": 5.78125, + "learning_rate": 9.337168331409967e-06, + "loss": 1.04583607, + "memory(GiB)": 302.58, + "step": 74200, + "train_speed(iter/s)": 0.123249 + }, + { + "acc": 0.74984879, + "epoch": 0.4150733942260506, + "grad_norm": 5.1875, + "learning_rate": 9.336708167652249e-06, + "loss": 0.98904228, + "memory(GiB)": 302.58, + "step": 74220, + "train_speed(iter/s)": 0.123265 + }, + { + "acc": 0.73059192, + "epoch": 0.41518524369902987, + "grad_norm": 4.96875, + "learning_rate": 9.33624785556536e-06, + "loss": 1.06636496, + "memory(GiB)": 302.58, + "step": 74240, + "train_speed(iter/s)": 0.123279 + }, + { + "acc": 0.74535275, + "epoch": 0.41529709317200914, + "grad_norm": 4.53125, + "learning_rate": 9.335787395165043e-06, + "loss": 0.99837179, + "memory(GiB)": 302.58, + "step": 74260, + "train_speed(iter/s)": 0.123294 + }, + { + "acc": 0.71656752, + "epoch": 0.4154089426449884, + "grad_norm": 7.1875, + "learning_rate": 9.335326786467049e-06, + "loss": 1.11517391, + "memory(GiB)": 302.58, + "step": 74280, + "train_speed(iter/s)": 0.12331 + }, + { + "acc": 0.73840995, + "epoch": 0.41552079211796766, + "grad_norm": 7.5, + "learning_rate": 9.334866029487134e-06, + "loss": 1.04092741, + "memory(GiB)": 302.58, + "step": 74300, + "train_speed(iter/s)": 0.123324 + }, + { + "acc": 0.7344377, + "epoch": 0.4156326415909469, + "grad_norm": 4.59375, + "learning_rate": 9.334405124241053e-06, + "loss": 1.05072403, + "memory(GiB)": 302.58, + "step": 74320, + "train_speed(iter/s)": 0.123339 + }, + { + "acc": 0.73667297, + "epoch": 0.4157444910639262, + "grad_norm": 6.65625, + "learning_rate": 9.333944070744574e-06, + "loss": 1.04210501, + "memory(GiB)": 302.58, + "step": 74340, + "train_speed(iter/s)": 0.123355 + }, + { + "acc": 0.7417717, + "epoch": 0.41585634053690546, + "grad_norm": 6.03125, + "learning_rate": 9.333482869013466e-06, + "loss": 1.02826529, + "memory(GiB)": 302.58, + "step": 74360, + "train_speed(iter/s)": 0.123371 + }, + { + "acc": 0.73197169, + "epoch": 0.4159681900098847, + "grad_norm": 8.375, + "learning_rate": 9.3330215190635e-06, + "loss": 1.06965008, + "memory(GiB)": 302.58, + "step": 74380, + "train_speed(iter/s)": 0.123387 + }, + { + "acc": 0.72472773, + "epoch": 0.416080039482864, + "grad_norm": 8.6875, + "learning_rate": 9.332560020910462e-06, + "loss": 1.07965879, + "memory(GiB)": 302.58, + "step": 74400, + "train_speed(iter/s)": 0.123402 + }, + { + "acc": 0.7238421, + "epoch": 0.41619188895584325, + "grad_norm": 6.65625, + "learning_rate": 9.33209837457013e-06, + "loss": 1.0977211, + "memory(GiB)": 302.58, + "step": 74420, + "train_speed(iter/s)": 0.123419 + }, + { + "acc": 0.72280307, + "epoch": 0.4163037384288225, + "grad_norm": 8.0, + "learning_rate": 9.331636580058299e-06, + "loss": 1.11320562, + "memory(GiB)": 302.58, + "step": 74440, + "train_speed(iter/s)": 0.123434 + }, + { + "acc": 0.75332313, + "epoch": 0.4164155879018018, + "grad_norm": 6.875, + "learning_rate": 9.331174637390762e-06, + "loss": 0.94238138, + "memory(GiB)": 302.58, + "step": 74460, + "train_speed(iter/s)": 0.123451 + }, + { + "acc": 0.74547982, + "epoch": 0.41652743737478104, + "grad_norm": 5.03125, + "learning_rate": 9.330712546583317e-06, + "loss": 0.99242496, + "memory(GiB)": 302.58, + "step": 74480, + "train_speed(iter/s)": 0.123465 + }, + { + "acc": 0.72655468, + "epoch": 0.4166392868477603, + "grad_norm": 7.6875, + "learning_rate": 9.330250307651771e-06, + "loss": 1.10667505, + "memory(GiB)": 302.58, + "step": 74500, + "train_speed(iter/s)": 0.12348 + }, + { + "acc": 0.73957953, + "epoch": 0.41675113632073957, + "grad_norm": 6.5, + "learning_rate": 9.329787920611934e-06, + "loss": 1.01282673, + "memory(GiB)": 302.58, + "step": 74520, + "train_speed(iter/s)": 0.123496 + }, + { + "acc": 0.73490334, + "epoch": 0.41686298579371883, + "grad_norm": 9.1875, + "learning_rate": 9.32932538547962e-06, + "loss": 1.04133978, + "memory(GiB)": 302.58, + "step": 74540, + "train_speed(iter/s)": 0.123512 + }, + { + "acc": 0.73391147, + "epoch": 0.4169748352666981, + "grad_norm": 8.4375, + "learning_rate": 9.32886270227065e-06, + "loss": 1.04084644, + "memory(GiB)": 302.58, + "step": 74560, + "train_speed(iter/s)": 0.123527 + }, + { + "acc": 0.7558177, + "epoch": 0.41708668473967736, + "grad_norm": 8.8125, + "learning_rate": 9.32839987100085e-06, + "loss": 0.95933838, + "memory(GiB)": 302.58, + "step": 74580, + "train_speed(iter/s)": 0.123543 + }, + { + "acc": 0.72971892, + "epoch": 0.4171985342126566, + "grad_norm": 8.375, + "learning_rate": 9.327936891686049e-06, + "loss": 1.08368149, + "memory(GiB)": 302.58, + "step": 74600, + "train_speed(iter/s)": 0.123558 + }, + { + "acc": 0.73375964, + "epoch": 0.4173103836856359, + "grad_norm": 6.1875, + "learning_rate": 9.327473764342082e-06, + "loss": 1.03231926, + "memory(GiB)": 302.58, + "step": 74620, + "train_speed(iter/s)": 0.123573 + }, + { + "acc": 0.73785901, + "epoch": 0.41742223315861515, + "grad_norm": 7.375, + "learning_rate": 9.327010488984792e-06, + "loss": 1.0193819, + "memory(GiB)": 302.58, + "step": 74640, + "train_speed(iter/s)": 0.123588 + }, + { + "acc": 0.73590083, + "epoch": 0.4175340826315944, + "grad_norm": 8.375, + "learning_rate": 9.32654706563002e-06, + "loss": 1.05286713, + "memory(GiB)": 302.58, + "step": 74660, + "train_speed(iter/s)": 0.123604 + }, + { + "acc": 0.7385057, + "epoch": 0.4176459321045737, + "grad_norm": 6.78125, + "learning_rate": 9.32608349429362e-06, + "loss": 1.0058609, + "memory(GiB)": 302.58, + "step": 74680, + "train_speed(iter/s)": 0.123619 + }, + { + "acc": 0.73858528, + "epoch": 0.41775778157755294, + "grad_norm": 7.125, + "learning_rate": 9.32561977499145e-06, + "loss": 1.03553429, + "memory(GiB)": 302.58, + "step": 74700, + "train_speed(iter/s)": 0.123635 + }, + { + "acc": 0.73711147, + "epoch": 0.4178696310505322, + "grad_norm": 9.3125, + "learning_rate": 9.325155907739365e-06, + "loss": 1.03344145, + "memory(GiB)": 302.58, + "step": 74720, + "train_speed(iter/s)": 0.123652 + }, + { + "acc": 0.71369171, + "epoch": 0.4179814805235115, + "grad_norm": 6.78125, + "learning_rate": 9.324691892553232e-06, + "loss": 1.14744263, + "memory(GiB)": 302.58, + "step": 74740, + "train_speed(iter/s)": 0.123667 + }, + { + "acc": 0.73364716, + "epoch": 0.41809332999649074, + "grad_norm": 5.3125, + "learning_rate": 9.324227729448925e-06, + "loss": 1.04666252, + "memory(GiB)": 302.58, + "step": 74760, + "train_speed(iter/s)": 0.123683 + }, + { + "acc": 0.72729149, + "epoch": 0.41820517946947, + "grad_norm": 7.34375, + "learning_rate": 9.323763418442317e-06, + "loss": 1.08720551, + "memory(GiB)": 302.58, + "step": 74780, + "train_speed(iter/s)": 0.123699 + }, + { + "acc": 0.74837074, + "epoch": 0.41831702894244926, + "grad_norm": 7.375, + "learning_rate": 9.323298959549291e-06, + "loss": 0.98862572, + "memory(GiB)": 302.58, + "step": 74800, + "train_speed(iter/s)": 0.123716 + }, + { + "acc": 0.73928185, + "epoch": 0.41842887841542853, + "grad_norm": 7.21875, + "learning_rate": 9.32283435278573e-06, + "loss": 1.03613405, + "memory(GiB)": 302.58, + "step": 74820, + "train_speed(iter/s)": 0.123732 + }, + { + "acc": 0.74924254, + "epoch": 0.4185407278884078, + "grad_norm": 17.0, + "learning_rate": 9.32236959816753e-06, + "loss": 0.97701473, + "memory(GiB)": 302.58, + "step": 74840, + "train_speed(iter/s)": 0.123748 + }, + { + "acc": 0.72911038, + "epoch": 0.41865257736138706, + "grad_norm": 6.125, + "learning_rate": 9.321904695710581e-06, + "loss": 1.05234966, + "memory(GiB)": 302.58, + "step": 74860, + "train_speed(iter/s)": 0.123764 + }, + { + "acc": 0.72518311, + "epoch": 0.4187644268343663, + "grad_norm": 4.65625, + "learning_rate": 9.321439645430788e-06, + "loss": 1.09280291, + "memory(GiB)": 302.58, + "step": 74880, + "train_speed(iter/s)": 0.123779 + }, + { + "acc": 0.72569671, + "epoch": 0.4188762763073456, + "grad_norm": 9.3125, + "learning_rate": 9.320974447344056e-06, + "loss": 1.06986561, + "memory(GiB)": 302.58, + "step": 74900, + "train_speed(iter/s)": 0.123795 + }, + { + "acc": 0.73099246, + "epoch": 0.41898812578032485, + "grad_norm": 8.1875, + "learning_rate": 9.320509101466295e-06, + "loss": 1.06701927, + "memory(GiB)": 302.58, + "step": 74920, + "train_speed(iter/s)": 0.123811 + }, + { + "acc": 0.74192343, + "epoch": 0.4190999752533041, + "grad_norm": 5.78125, + "learning_rate": 9.320043607813424e-06, + "loss": 1.00918159, + "memory(GiB)": 302.58, + "step": 74940, + "train_speed(iter/s)": 0.123826 + }, + { + "acc": 0.74336963, + "epoch": 0.4192118247262834, + "grad_norm": 5.71875, + "learning_rate": 9.319577966401364e-06, + "loss": 1.02539291, + "memory(GiB)": 302.58, + "step": 74960, + "train_speed(iter/s)": 0.123842 + }, + { + "acc": 0.75484571, + "epoch": 0.41932367419926264, + "grad_norm": 4.75, + "learning_rate": 9.319112177246038e-06, + "loss": 0.95561609, + "memory(GiB)": 302.58, + "step": 74980, + "train_speed(iter/s)": 0.123858 + }, + { + "acc": 0.72433891, + "epoch": 0.4194355236722419, + "grad_norm": 5.59375, + "learning_rate": 9.318646240363383e-06, + "loss": 1.08413754, + "memory(GiB)": 302.58, + "step": 75000, + "train_speed(iter/s)": 0.123874 + }, + { + "acc": 0.7325017, + "epoch": 0.41954737314522117, + "grad_norm": 7.25, + "learning_rate": 9.318180155769332e-06, + "loss": 1.04707804, + "memory(GiB)": 302.58, + "step": 75020, + "train_speed(iter/s)": 0.123889 + }, + { + "acc": 0.74514899, + "epoch": 0.41965922261820043, + "grad_norm": 5.65625, + "learning_rate": 9.317713923479827e-06, + "loss": 0.99805231, + "memory(GiB)": 302.58, + "step": 75040, + "train_speed(iter/s)": 0.123905 + }, + { + "acc": 0.74876165, + "epoch": 0.4197710720911797, + "grad_norm": 5.875, + "learning_rate": 9.317247543510815e-06, + "loss": 0.96893997, + "memory(GiB)": 302.58, + "step": 75060, + "train_speed(iter/s)": 0.123921 + }, + { + "acc": 0.72612562, + "epoch": 0.41988292156415896, + "grad_norm": 4.90625, + "learning_rate": 9.316781015878244e-06, + "loss": 1.08469934, + "memory(GiB)": 302.58, + "step": 75080, + "train_speed(iter/s)": 0.123936 + }, + { + "acc": 0.74334536, + "epoch": 0.4199947710371382, + "grad_norm": 8.875, + "learning_rate": 9.316314340598078e-06, + "loss": 0.99965029, + "memory(GiB)": 302.58, + "step": 75100, + "train_speed(iter/s)": 0.123952 + }, + { + "acc": 0.73096986, + "epoch": 0.4201066205101175, + "grad_norm": 8.9375, + "learning_rate": 9.315847517686275e-06, + "loss": 1.05920811, + "memory(GiB)": 302.58, + "step": 75120, + "train_speed(iter/s)": 0.123967 + }, + { + "acc": 0.75600176, + "epoch": 0.42021846998309675, + "grad_norm": 7.21875, + "learning_rate": 9.315380547158799e-06, + "loss": 0.95938416, + "memory(GiB)": 302.58, + "step": 75140, + "train_speed(iter/s)": 0.123982 + }, + { + "acc": 0.72697048, + "epoch": 0.420330319456076, + "grad_norm": 6.21875, + "learning_rate": 9.314913429031626e-06, + "loss": 1.07172842, + "memory(GiB)": 302.58, + "step": 75160, + "train_speed(iter/s)": 0.123997 + }, + { + "acc": 0.72578058, + "epoch": 0.4204421689290553, + "grad_norm": 4.1875, + "learning_rate": 9.31444616332073e-06, + "loss": 1.08098345, + "memory(GiB)": 302.58, + "step": 75180, + "train_speed(iter/s)": 0.124012 + }, + { + "acc": 0.749156, + "epoch": 0.42055401840203455, + "grad_norm": 7.5, + "learning_rate": 9.313978750042097e-06, + "loss": 0.97340031, + "memory(GiB)": 302.58, + "step": 75200, + "train_speed(iter/s)": 0.124028 + }, + { + "acc": 0.73695445, + "epoch": 0.4206658678750138, + "grad_norm": 7.84375, + "learning_rate": 9.31351118921171e-06, + "loss": 1.03491402, + "memory(GiB)": 302.58, + "step": 75220, + "train_speed(iter/s)": 0.124044 + }, + { + "acc": 0.73135033, + "epoch": 0.4207777173479931, + "grad_norm": 6.625, + "learning_rate": 9.313043480845562e-06, + "loss": 1.04391947, + "memory(GiB)": 302.58, + "step": 75240, + "train_speed(iter/s)": 0.12406 + }, + { + "acc": 0.74117322, + "epoch": 0.42088956682097234, + "grad_norm": 9.5625, + "learning_rate": 9.312575624959653e-06, + "loss": 1.00985527, + "memory(GiB)": 302.58, + "step": 75260, + "train_speed(iter/s)": 0.124076 + }, + { + "acc": 0.76049294, + "epoch": 0.4210014162939516, + "grad_norm": 7.28125, + "learning_rate": 9.312107621569981e-06, + "loss": 0.93767414, + "memory(GiB)": 302.58, + "step": 75280, + "train_speed(iter/s)": 0.124091 + }, + { + "acc": 0.72957397, + "epoch": 0.42111326576693087, + "grad_norm": 5.3125, + "learning_rate": 9.311639470692555e-06, + "loss": 1.05332327, + "memory(GiB)": 302.58, + "step": 75300, + "train_speed(iter/s)": 0.124106 + }, + { + "acc": 0.73554807, + "epoch": 0.42122511523991013, + "grad_norm": 6.4375, + "learning_rate": 9.311171172343387e-06, + "loss": 1.02512274, + "memory(GiB)": 302.58, + "step": 75320, + "train_speed(iter/s)": 0.124123 + }, + { + "acc": 0.71970448, + "epoch": 0.4213369647128894, + "grad_norm": 7.1875, + "learning_rate": 9.310702726538497e-06, + "loss": 1.12501202, + "memory(GiB)": 302.58, + "step": 75340, + "train_speed(iter/s)": 0.124137 + }, + { + "acc": 0.73580875, + "epoch": 0.42144881418586866, + "grad_norm": 8.1875, + "learning_rate": 9.310234133293903e-06, + "loss": 1.04235945, + "memory(GiB)": 302.58, + "step": 75360, + "train_speed(iter/s)": 0.124152 + }, + { + "acc": 0.73269653, + "epoch": 0.4215606636588479, + "grad_norm": 9.5, + "learning_rate": 9.309765392625634e-06, + "loss": 1.0394949, + "memory(GiB)": 302.58, + "step": 75380, + "train_speed(iter/s)": 0.124166 + }, + { + "acc": 0.72432623, + "epoch": 0.4216725131318272, + "grad_norm": 6.90625, + "learning_rate": 9.309296504549724e-06, + "loss": 1.08727112, + "memory(GiB)": 302.58, + "step": 75400, + "train_speed(iter/s)": 0.124182 + }, + { + "acc": 0.7410069, + "epoch": 0.42178436260480645, + "grad_norm": 6.25, + "learning_rate": 9.308827469082206e-06, + "loss": 1.0204669, + "memory(GiB)": 302.58, + "step": 75420, + "train_speed(iter/s)": 0.124198 + }, + { + "acc": 0.73932085, + "epoch": 0.4218962120777857, + "grad_norm": 7.0, + "learning_rate": 9.308358286239129e-06, + "loss": 1.03917599, + "memory(GiB)": 302.58, + "step": 75440, + "train_speed(iter/s)": 0.124215 + }, + { + "acc": 0.72600546, + "epoch": 0.422008061550765, + "grad_norm": 9.0, + "learning_rate": 9.307888956036535e-06, + "loss": 1.08127346, + "memory(GiB)": 302.58, + "step": 75460, + "train_speed(iter/s)": 0.124231 + }, + { + "acc": 0.71936707, + "epoch": 0.42211991102374424, + "grad_norm": 6.4375, + "learning_rate": 9.307419478490481e-06, + "loss": 1.1093935, + "memory(GiB)": 302.58, + "step": 75480, + "train_speed(iter/s)": 0.124247 + }, + { + "acc": 0.73593268, + "epoch": 0.4222317604967235, + "grad_norm": 6.75, + "learning_rate": 9.30694985361702e-06, + "loss": 1.02136955, + "memory(GiB)": 302.58, + "step": 75500, + "train_speed(iter/s)": 0.124263 + }, + { + "acc": 0.74356146, + "epoch": 0.42234360996970277, + "grad_norm": 6.84375, + "learning_rate": 9.30648008143222e-06, + "loss": 1.00962391, + "memory(GiB)": 302.58, + "step": 75520, + "train_speed(iter/s)": 0.124278 + }, + { + "acc": 0.73878756, + "epoch": 0.42245545944268204, + "grad_norm": 8.25, + "learning_rate": 9.306010161952142e-06, + "loss": 1.02280712, + "memory(GiB)": 302.58, + "step": 75540, + "train_speed(iter/s)": 0.124293 + }, + { + "acc": 0.75295324, + "epoch": 0.4225673089156613, + "grad_norm": 5.65625, + "learning_rate": 9.305540095192865e-06, + "loss": 0.96261711, + "memory(GiB)": 302.58, + "step": 75560, + "train_speed(iter/s)": 0.124308 + }, + { + "acc": 0.72632785, + "epoch": 0.42267915838864056, + "grad_norm": 8.3125, + "learning_rate": 9.305069881170462e-06, + "loss": 1.08402815, + "memory(GiB)": 302.58, + "step": 75580, + "train_speed(iter/s)": 0.124324 + }, + { + "acc": 0.74313388, + "epoch": 0.4227910078616198, + "grad_norm": 5.34375, + "learning_rate": 9.304599519901021e-06, + "loss": 1.00253086, + "memory(GiB)": 302.58, + "step": 75600, + "train_speed(iter/s)": 0.12434 + }, + { + "acc": 0.75103087, + "epoch": 0.4229028573345991, + "grad_norm": 4.9375, + "learning_rate": 9.304129011400625e-06, + "loss": 0.96564188, + "memory(GiB)": 302.58, + "step": 75620, + "train_speed(iter/s)": 0.124356 + }, + { + "acc": 0.74871364, + "epoch": 0.42301470680757836, + "grad_norm": 6.9375, + "learning_rate": 9.30365835568537e-06, + "loss": 1.00075788, + "memory(GiB)": 302.58, + "step": 75640, + "train_speed(iter/s)": 0.12437 + }, + { + "acc": 0.73237896, + "epoch": 0.4231265562805576, + "grad_norm": 7.9375, + "learning_rate": 9.303187552771352e-06, + "loss": 1.06254082, + "memory(GiB)": 302.58, + "step": 75660, + "train_speed(iter/s)": 0.124384 + }, + { + "acc": 0.73543978, + "epoch": 0.4232384057535369, + "grad_norm": 7.0, + "learning_rate": 9.302716602674676e-06, + "loss": 1.03406496, + "memory(GiB)": 302.58, + "step": 75680, + "train_speed(iter/s)": 0.1244 + }, + { + "acc": 0.72056403, + "epoch": 0.42335025522651615, + "grad_norm": 7.53125, + "learning_rate": 9.302245505411446e-06, + "loss": 1.11595106, + "memory(GiB)": 302.58, + "step": 75700, + "train_speed(iter/s)": 0.124417 + }, + { + "acc": 0.75276942, + "epoch": 0.4234621046994954, + "grad_norm": 8.625, + "learning_rate": 9.301774260997778e-06, + "loss": 0.95484304, + "memory(GiB)": 302.58, + "step": 75720, + "train_speed(iter/s)": 0.124433 + }, + { + "acc": 0.71960249, + "epoch": 0.4235739541724747, + "grad_norm": 8.5625, + "learning_rate": 9.301302869449792e-06, + "loss": 1.12758541, + "memory(GiB)": 302.58, + "step": 75740, + "train_speed(iter/s)": 0.124448 + }, + { + "acc": 0.74627328, + "epoch": 0.42368580364545394, + "grad_norm": 5.59375, + "learning_rate": 9.300831330783607e-06, + "loss": 0.98605089, + "memory(GiB)": 302.58, + "step": 75760, + "train_speed(iter/s)": 0.124462 + }, + { + "acc": 0.72422519, + "epoch": 0.4237976531184332, + "grad_norm": 7.78125, + "learning_rate": 9.300359645015353e-06, + "loss": 1.09228964, + "memory(GiB)": 302.58, + "step": 75780, + "train_speed(iter/s)": 0.124477 + }, + { + "acc": 0.74185171, + "epoch": 0.42390950259141247, + "grad_norm": 4.46875, + "learning_rate": 9.299887812161163e-06, + "loss": 1.01590109, + "memory(GiB)": 302.58, + "step": 75800, + "train_speed(iter/s)": 0.124492 + }, + { + "acc": 0.72925596, + "epoch": 0.42402135206439173, + "grad_norm": 6.09375, + "learning_rate": 9.299415832237178e-06, + "loss": 1.07655144, + "memory(GiB)": 302.58, + "step": 75820, + "train_speed(iter/s)": 0.124507 + }, + { + "acc": 0.73726077, + "epoch": 0.424133201537371, + "grad_norm": 4.1875, + "learning_rate": 9.298943705259535e-06, + "loss": 1.05307531, + "memory(GiB)": 302.58, + "step": 75840, + "train_speed(iter/s)": 0.124523 + }, + { + "acc": 0.72912021, + "epoch": 0.42424505101035026, + "grad_norm": 6.875, + "learning_rate": 9.298471431244387e-06, + "loss": 1.07205753, + "memory(GiB)": 302.58, + "step": 75860, + "train_speed(iter/s)": 0.124539 + }, + { + "acc": 0.73077106, + "epoch": 0.4243569004833295, + "grad_norm": 5.375, + "learning_rate": 9.297999010207886e-06, + "loss": 1.06604815, + "memory(GiB)": 302.58, + "step": 75880, + "train_speed(iter/s)": 0.124553 + }, + { + "acc": 0.74098735, + "epoch": 0.4244687499563088, + "grad_norm": 8.0625, + "learning_rate": 9.29752644216619e-06, + "loss": 1.01666536, + "memory(GiB)": 302.58, + "step": 75900, + "train_speed(iter/s)": 0.124569 + }, + { + "acc": 0.73125644, + "epoch": 0.42458059942928805, + "grad_norm": 6.09375, + "learning_rate": 9.297053727135463e-06, + "loss": 1.07436581, + "memory(GiB)": 302.58, + "step": 75920, + "train_speed(iter/s)": 0.124585 + }, + { + "acc": 0.75112147, + "epoch": 0.4246924489022673, + "grad_norm": 7.96875, + "learning_rate": 9.296580865131872e-06, + "loss": 0.96033516, + "memory(GiB)": 302.58, + "step": 75940, + "train_speed(iter/s)": 0.124601 + }, + { + "acc": 0.730901, + "epoch": 0.4248042983752466, + "grad_norm": 4.9375, + "learning_rate": 9.296107856171592e-06, + "loss": 1.04905348, + "memory(GiB)": 302.58, + "step": 75960, + "train_speed(iter/s)": 0.124617 + }, + { + "acc": 0.74422512, + "epoch": 0.42491614784822584, + "grad_norm": 10.625, + "learning_rate": 9.295634700270802e-06, + "loss": 1.00360727, + "memory(GiB)": 302.58, + "step": 75980, + "train_speed(iter/s)": 0.124631 + }, + { + "acc": 0.72912388, + "epoch": 0.4250279973212051, + "grad_norm": 7.0, + "learning_rate": 9.295161397445682e-06, + "loss": 1.07498083, + "memory(GiB)": 302.58, + "step": 76000, + "train_speed(iter/s)": 0.124647 + }, + { + "epoch": 0.4250279973212051, + "eval_acc": 0.700548446466007, + "eval_loss": 1.0398677587509155, + "eval_runtime": 7544.1459, + "eval_samples_per_second": 9.979, + "eval_steps_per_second": 9.979, + "step": 76000 + }, + { + "acc": 0.7543272, + "epoch": 0.4251398467941844, + "grad_norm": 6.71875, + "learning_rate": 9.294687947712424e-06, + "loss": 0.95794878, + "memory(GiB)": 302.58, + "step": 76020, + "train_speed(iter/s)": 0.123113 + }, + { + "acc": 0.75069265, + "epoch": 0.42525169626716364, + "grad_norm": 7.96875, + "learning_rate": 9.29421435108722e-06, + "loss": 0.96746778, + "memory(GiB)": 302.58, + "step": 76040, + "train_speed(iter/s)": 0.123128 + }, + { + "acc": 0.7415772, + "epoch": 0.4253635457401429, + "grad_norm": 7.0625, + "learning_rate": 9.293740607586267e-06, + "loss": 1.01391878, + "memory(GiB)": 302.58, + "step": 76060, + "train_speed(iter/s)": 0.123143 + }, + { + "acc": 0.73151431, + "epoch": 0.42547539521312217, + "grad_norm": 7.28125, + "learning_rate": 9.293266717225773e-06, + "loss": 1.05408964, + "memory(GiB)": 302.58, + "step": 76080, + "train_speed(iter/s)": 0.123159 + }, + { + "acc": 0.7444809, + "epoch": 0.42558724468610143, + "grad_norm": 7.875, + "learning_rate": 9.292792680021941e-06, + "loss": 1.00629196, + "memory(GiB)": 302.58, + "step": 76100, + "train_speed(iter/s)": 0.123174 + }, + { + "acc": 0.72984447, + "epoch": 0.4256990941590807, + "grad_norm": 6.84375, + "learning_rate": 9.292318495990988e-06, + "loss": 1.09427118, + "memory(GiB)": 302.58, + "step": 76120, + "train_speed(iter/s)": 0.12319 + }, + { + "acc": 0.74599199, + "epoch": 0.42581094363205996, + "grad_norm": 6.40625, + "learning_rate": 9.291844165149133e-06, + "loss": 1.01979914, + "memory(GiB)": 302.58, + "step": 76140, + "train_speed(iter/s)": 0.123206 + }, + { + "acc": 0.73571, + "epoch": 0.4259227931050392, + "grad_norm": 4.625, + "learning_rate": 9.291369687512597e-06, + "loss": 1.04304695, + "memory(GiB)": 302.58, + "step": 76160, + "train_speed(iter/s)": 0.123221 + }, + { + "acc": 0.73603578, + "epoch": 0.4260346425780185, + "grad_norm": 7.28125, + "learning_rate": 9.290895063097611e-06, + "loss": 1.0501049, + "memory(GiB)": 302.58, + "step": 76180, + "train_speed(iter/s)": 0.123237 + }, + { + "acc": 0.74562778, + "epoch": 0.42614649205099775, + "grad_norm": 7.5625, + "learning_rate": 9.290420291920408e-06, + "loss": 1.02095022, + "memory(GiB)": 302.58, + "step": 76200, + "train_speed(iter/s)": 0.123251 + }, + { + "acc": 0.72918491, + "epoch": 0.426258341523977, + "grad_norm": 9.625, + "learning_rate": 9.289945373997227e-06, + "loss": 1.05695009, + "memory(GiB)": 302.58, + "step": 76220, + "train_speed(iter/s)": 0.123266 + }, + { + "acc": 0.74296088, + "epoch": 0.4263701909969563, + "grad_norm": 9.8125, + "learning_rate": 9.289470309344309e-06, + "loss": 1.01624203, + "memory(GiB)": 302.58, + "step": 76240, + "train_speed(iter/s)": 0.123281 + }, + { + "acc": 0.74014225, + "epoch": 0.42648204046993554, + "grad_norm": 8.625, + "learning_rate": 9.288995097977907e-06, + "loss": 1.01763315, + "memory(GiB)": 302.58, + "step": 76260, + "train_speed(iter/s)": 0.123297 + }, + { + "acc": 0.7445435, + "epoch": 0.4265938899429148, + "grad_norm": 6.84375, + "learning_rate": 9.288519739914272e-06, + "loss": 0.99766874, + "memory(GiB)": 302.58, + "step": 76280, + "train_speed(iter/s)": 0.123313 + }, + { + "acc": 0.74153953, + "epoch": 0.42670573941589407, + "grad_norm": 7.3125, + "learning_rate": 9.288044235169662e-06, + "loss": 1.00372095, + "memory(GiB)": 302.58, + "step": 76300, + "train_speed(iter/s)": 0.123328 + }, + { + "acc": 0.73086548, + "epoch": 0.42681758888887333, + "grad_norm": 6.9375, + "learning_rate": 9.287568583760343e-06, + "loss": 1.05602007, + "memory(GiB)": 302.58, + "step": 76320, + "train_speed(iter/s)": 0.123341 + }, + { + "acc": 0.73809667, + "epoch": 0.4269294383618526, + "grad_norm": 5.6875, + "learning_rate": 9.287092785702584e-06, + "loss": 1.00771065, + "memory(GiB)": 302.58, + "step": 76340, + "train_speed(iter/s)": 0.123357 + }, + { + "acc": 0.74181399, + "epoch": 0.42704128783483186, + "grad_norm": 9.125, + "learning_rate": 9.286616841012656e-06, + "loss": 1.02942247, + "memory(GiB)": 302.58, + "step": 76360, + "train_speed(iter/s)": 0.123372 + }, + { + "acc": 0.7293869, + "epoch": 0.4271531373078111, + "grad_norm": 6.9375, + "learning_rate": 9.28614074970684e-06, + "loss": 1.08812761, + "memory(GiB)": 302.58, + "step": 76380, + "train_speed(iter/s)": 0.123388 + }, + { + "acc": 0.74998112, + "epoch": 0.4272649867807904, + "grad_norm": 6.3125, + "learning_rate": 9.28566451180142e-06, + "loss": 0.97936287, + "memory(GiB)": 302.58, + "step": 76400, + "train_speed(iter/s)": 0.123403 + }, + { + "acc": 0.7360754, + "epoch": 0.42737683625376965, + "grad_norm": 6.125, + "learning_rate": 9.285188127312685e-06, + "loss": 1.05575237, + "memory(GiB)": 302.58, + "step": 76420, + "train_speed(iter/s)": 0.123418 + }, + { + "acc": 0.73554454, + "epoch": 0.4274886857267489, + "grad_norm": 5.0, + "learning_rate": 9.284711596256927e-06, + "loss": 1.06308641, + "memory(GiB)": 302.58, + "step": 76440, + "train_speed(iter/s)": 0.123433 + }, + { + "acc": 0.73892899, + "epoch": 0.4276005351997282, + "grad_norm": 8.6875, + "learning_rate": 9.284234918650445e-06, + "loss": 1.01824217, + "memory(GiB)": 302.58, + "step": 76460, + "train_speed(iter/s)": 0.123448 + }, + { + "acc": 0.7330749, + "epoch": 0.42771238467270745, + "grad_norm": 8.75, + "learning_rate": 9.283758094509545e-06, + "loss": 1.03954897, + "memory(GiB)": 302.58, + "step": 76480, + "train_speed(iter/s)": 0.123463 + }, + { + "acc": 0.7386085, + "epoch": 0.4278242341456867, + "grad_norm": 9.875, + "learning_rate": 9.283281123850536e-06, + "loss": 1.02034216, + "memory(GiB)": 302.58, + "step": 76500, + "train_speed(iter/s)": 0.123479 + }, + { + "acc": 0.74737525, + "epoch": 0.427936083618666, + "grad_norm": 6.25, + "learning_rate": 9.28280400668973e-06, + "loss": 0.98119097, + "memory(GiB)": 302.58, + "step": 76520, + "train_speed(iter/s)": 0.123496 + }, + { + "acc": 0.7519145, + "epoch": 0.42804793309164524, + "grad_norm": 8.375, + "learning_rate": 9.282326743043444e-06, + "loss": 0.96248722, + "memory(GiB)": 302.58, + "step": 76540, + "train_speed(iter/s)": 0.123511 + }, + { + "acc": 0.73269844, + "epoch": 0.4281597825646245, + "grad_norm": 4.25, + "learning_rate": 9.281849332928006e-06, + "loss": 1.07499866, + "memory(GiB)": 302.58, + "step": 76560, + "train_speed(iter/s)": 0.123527 + }, + { + "acc": 0.72140779, + "epoch": 0.42827163203760377, + "grad_norm": 5.8125, + "learning_rate": 9.281371776359745e-06, + "loss": 1.09978228, + "memory(GiB)": 302.58, + "step": 76580, + "train_speed(iter/s)": 0.123543 + }, + { + "acc": 0.74247451, + "epoch": 0.42838348151058303, + "grad_norm": 7.8125, + "learning_rate": 9.280894073354991e-06, + "loss": 1.02802353, + "memory(GiB)": 302.58, + "step": 76600, + "train_speed(iter/s)": 0.123559 + }, + { + "acc": 0.73098235, + "epoch": 0.4284953309835623, + "grad_norm": 4.5, + "learning_rate": 9.280416223930087e-06, + "loss": 1.09097872, + "memory(GiB)": 302.58, + "step": 76620, + "train_speed(iter/s)": 0.123574 + }, + { + "acc": 0.75013747, + "epoch": 0.42860718045654156, + "grad_norm": 5.15625, + "learning_rate": 9.279938228101375e-06, + "loss": 0.96310816, + "memory(GiB)": 302.58, + "step": 76640, + "train_speed(iter/s)": 0.12359 + }, + { + "acc": 0.73461103, + "epoch": 0.4287190299295209, + "grad_norm": 6.4375, + "learning_rate": 9.279460085885204e-06, + "loss": 1.03477135, + "memory(GiB)": 302.58, + "step": 76660, + "train_speed(iter/s)": 0.123605 + }, + { + "acc": 0.74988546, + "epoch": 0.42883087940250014, + "grad_norm": 8.4375, + "learning_rate": 9.278981797297927e-06, + "loss": 0.98279257, + "memory(GiB)": 302.58, + "step": 76680, + "train_speed(iter/s)": 0.123619 + }, + { + "acc": 0.73874002, + "epoch": 0.4289427288754794, + "grad_norm": 7.625, + "learning_rate": 9.278503362355907e-06, + "loss": 1.01981106, + "memory(GiB)": 302.58, + "step": 76700, + "train_speed(iter/s)": 0.123634 + }, + { + "acc": 0.72901201, + "epoch": 0.42905457834845867, + "grad_norm": 6.46875, + "learning_rate": 9.278024781075504e-06, + "loss": 1.05783558, + "memory(GiB)": 302.58, + "step": 76720, + "train_speed(iter/s)": 0.123649 + }, + { + "acc": 0.75394754, + "epoch": 0.42916642782143793, + "grad_norm": 4.75, + "learning_rate": 9.277546053473088e-06, + "loss": 0.97224588, + "memory(GiB)": 302.58, + "step": 76740, + "train_speed(iter/s)": 0.123664 + }, + { + "acc": 0.72153344, + "epoch": 0.4292782772944172, + "grad_norm": 5.15625, + "learning_rate": 9.277067179565033e-06, + "loss": 1.10915661, + "memory(GiB)": 302.58, + "step": 76760, + "train_speed(iter/s)": 0.123678 + }, + { + "acc": 0.73075709, + "epoch": 0.42939012676739646, + "grad_norm": 8.75, + "learning_rate": 9.276588159367718e-06, + "loss": 1.07129164, + "memory(GiB)": 302.58, + "step": 76780, + "train_speed(iter/s)": 0.123693 + }, + { + "acc": 0.74280152, + "epoch": 0.4295019762403757, + "grad_norm": 6.375, + "learning_rate": 9.27610899289753e-06, + "loss": 1.00981731, + "memory(GiB)": 302.58, + "step": 76800, + "train_speed(iter/s)": 0.123708 + }, + { + "acc": 0.73110895, + "epoch": 0.429613825713355, + "grad_norm": 8.6875, + "learning_rate": 9.275629680170852e-06, + "loss": 1.08046722, + "memory(GiB)": 302.58, + "step": 76820, + "train_speed(iter/s)": 0.123723 + }, + { + "acc": 0.73631182, + "epoch": 0.42972567518633425, + "grad_norm": 6.125, + "learning_rate": 9.275150221204083e-06, + "loss": 1.03761911, + "memory(GiB)": 302.58, + "step": 76840, + "train_speed(iter/s)": 0.123738 + }, + { + "acc": 0.7394249, + "epoch": 0.4298375246593135, + "grad_norm": 5.78125, + "learning_rate": 9.274670616013619e-06, + "loss": 1.0246397, + "memory(GiB)": 302.58, + "step": 76860, + "train_speed(iter/s)": 0.123753 + }, + { + "acc": 0.7477428, + "epoch": 0.4299493741322928, + "grad_norm": 7.40625, + "learning_rate": 9.274190864615867e-06, + "loss": 1.00981483, + "memory(GiB)": 302.58, + "step": 76880, + "train_speed(iter/s)": 0.123768 + }, + { + "acc": 0.74745522, + "epoch": 0.43006122360527205, + "grad_norm": 7.65625, + "learning_rate": 9.273710967027232e-06, + "loss": 0.99171, + "memory(GiB)": 302.58, + "step": 76900, + "train_speed(iter/s)": 0.123783 + }, + { + "acc": 0.73529358, + "epoch": 0.4301730730782513, + "grad_norm": 7.3125, + "learning_rate": 9.273230923264132e-06, + "loss": 1.00854473, + "memory(GiB)": 302.58, + "step": 76920, + "train_speed(iter/s)": 0.123799 + }, + { + "acc": 0.74061933, + "epoch": 0.4302849225512306, + "grad_norm": 4.375, + "learning_rate": 9.272750733342984e-06, + "loss": 1.04120302, + "memory(GiB)": 302.58, + "step": 76940, + "train_speed(iter/s)": 0.123814 + }, + { + "acc": 0.73377328, + "epoch": 0.43039677202420984, + "grad_norm": 6.03125, + "learning_rate": 9.272270397280213e-06, + "loss": 1.04638233, + "memory(GiB)": 302.58, + "step": 76960, + "train_speed(iter/s)": 0.123829 + }, + { + "acc": 0.7302238, + "epoch": 0.4305086214971891, + "grad_norm": 7.71875, + "learning_rate": 9.271789915092246e-06, + "loss": 1.0706296, + "memory(GiB)": 302.58, + "step": 76980, + "train_speed(iter/s)": 0.123844 + }, + { + "acc": 0.73173547, + "epoch": 0.43062047097016837, + "grad_norm": 6.8125, + "learning_rate": 9.27130928679552e-06, + "loss": 1.06090708, + "memory(GiB)": 302.58, + "step": 77000, + "train_speed(iter/s)": 0.123859 + }, + { + "acc": 0.73828139, + "epoch": 0.43073232044314763, + "grad_norm": 9.0, + "learning_rate": 9.270828512406472e-06, + "loss": 1.04504499, + "memory(GiB)": 302.58, + "step": 77020, + "train_speed(iter/s)": 0.123874 + }, + { + "acc": 0.73301826, + "epoch": 0.4308441699161269, + "grad_norm": 8.3125, + "learning_rate": 9.270347591941547e-06, + "loss": 1.06434193, + "memory(GiB)": 302.58, + "step": 77040, + "train_speed(iter/s)": 0.123889 + }, + { + "acc": 0.73647394, + "epoch": 0.43095601938910616, + "grad_norm": 6.34375, + "learning_rate": 9.269866525417191e-06, + "loss": 1.02975197, + "memory(GiB)": 302.58, + "step": 77060, + "train_speed(iter/s)": 0.123904 + }, + { + "acc": 0.73008633, + "epoch": 0.4310678688620854, + "grad_norm": 7.375, + "learning_rate": 9.269385312849863e-06, + "loss": 1.08357487, + "memory(GiB)": 302.58, + "step": 77080, + "train_speed(iter/s)": 0.123919 + }, + { + "acc": 0.75595846, + "epoch": 0.4311797183350647, + "grad_norm": 6.375, + "learning_rate": 9.268903954256019e-06, + "loss": 0.93261003, + "memory(GiB)": 302.58, + "step": 77100, + "train_speed(iter/s)": 0.123934 + }, + { + "acc": 0.73125572, + "epoch": 0.43129156780804395, + "grad_norm": 13.4375, + "learning_rate": 9.268422449652122e-06, + "loss": 1.07532978, + "memory(GiB)": 302.58, + "step": 77120, + "train_speed(iter/s)": 0.123951 + }, + { + "acc": 0.72680464, + "epoch": 0.4314034172810232, + "grad_norm": 6.3125, + "learning_rate": 9.267940799054644e-06, + "loss": 1.06538258, + "memory(GiB)": 302.58, + "step": 77140, + "train_speed(iter/s)": 0.123967 + }, + { + "acc": 0.73533111, + "epoch": 0.4315152667540025, + "grad_norm": 6.96875, + "learning_rate": 9.267459002480057e-06, + "loss": 1.04259748, + "memory(GiB)": 302.58, + "step": 77160, + "train_speed(iter/s)": 0.123982 + }, + { + "acc": 0.72135515, + "epoch": 0.43162711622698174, + "grad_norm": 7.0625, + "learning_rate": 9.26697705994484e-06, + "loss": 1.09774656, + "memory(GiB)": 302.58, + "step": 77180, + "train_speed(iter/s)": 0.123996 + }, + { + "acc": 0.74552741, + "epoch": 0.431738965699961, + "grad_norm": 6.09375, + "learning_rate": 9.266494971465477e-06, + "loss": 0.99593163, + "memory(GiB)": 302.58, + "step": 77200, + "train_speed(iter/s)": 0.124012 + }, + { + "acc": 0.72960587, + "epoch": 0.43185081517294027, + "grad_norm": 6.59375, + "learning_rate": 9.266012737058459e-06, + "loss": 1.07317371, + "memory(GiB)": 302.58, + "step": 77220, + "train_speed(iter/s)": 0.124027 + }, + { + "acc": 0.72837462, + "epoch": 0.43196266464591954, + "grad_norm": 9.625, + "learning_rate": 9.265530356740276e-06, + "loss": 1.09327688, + "memory(GiB)": 302.58, + "step": 77240, + "train_speed(iter/s)": 0.124041 + }, + { + "acc": 0.75350494, + "epoch": 0.4320745141188988, + "grad_norm": 8.25, + "learning_rate": 9.26504783052743e-06, + "loss": 0.96502876, + "memory(GiB)": 302.58, + "step": 77260, + "train_speed(iter/s)": 0.124055 + }, + { + "acc": 0.73923464, + "epoch": 0.43218636359187806, + "grad_norm": 5.875, + "learning_rate": 9.264565158436425e-06, + "loss": 1.00561905, + "memory(GiB)": 302.58, + "step": 77280, + "train_speed(iter/s)": 0.12407 + }, + { + "acc": 0.74189744, + "epoch": 0.43229821306485733, + "grad_norm": 6.78125, + "learning_rate": 9.26408234048377e-06, + "loss": 0.99154549, + "memory(GiB)": 302.58, + "step": 77300, + "train_speed(iter/s)": 0.124084 + }, + { + "acc": 0.74800873, + "epoch": 0.4324100625378366, + "grad_norm": 7.78125, + "learning_rate": 9.263599376685975e-06, + "loss": 0.9721941, + "memory(GiB)": 302.58, + "step": 77320, + "train_speed(iter/s)": 0.124099 + }, + { + "acc": 0.72722325, + "epoch": 0.43252191201081586, + "grad_norm": 9.25, + "learning_rate": 9.263116267059563e-06, + "loss": 1.05810661, + "memory(GiB)": 302.58, + "step": 77340, + "train_speed(iter/s)": 0.124115 + }, + { + "acc": 0.73340478, + "epoch": 0.4326337614837951, + "grad_norm": 6.40625, + "learning_rate": 9.262633011621057e-06, + "loss": 1.04317999, + "memory(GiB)": 302.58, + "step": 77360, + "train_speed(iter/s)": 0.124131 + }, + { + "acc": 0.75605927, + "epoch": 0.4327456109567744, + "grad_norm": 5.40625, + "learning_rate": 9.262149610386983e-06, + "loss": 0.94668322, + "memory(GiB)": 302.58, + "step": 77380, + "train_speed(iter/s)": 0.124146 + }, + { + "acc": 0.735536, + "epoch": 0.43285746042975365, + "grad_norm": 7.34375, + "learning_rate": 9.26166606337388e-06, + "loss": 1.00910292, + "memory(GiB)": 302.58, + "step": 77400, + "train_speed(iter/s)": 0.124161 + }, + { + "acc": 0.73826838, + "epoch": 0.4329693099027329, + "grad_norm": 8.875, + "learning_rate": 9.261182370598285e-06, + "loss": 1.01818142, + "memory(GiB)": 302.58, + "step": 77420, + "train_speed(iter/s)": 0.124176 + }, + { + "acc": 0.74048772, + "epoch": 0.4330811593757122, + "grad_norm": 9.0, + "learning_rate": 9.26069853207674e-06, + "loss": 1.03467684, + "memory(GiB)": 302.58, + "step": 77440, + "train_speed(iter/s)": 0.12419 + }, + { + "acc": 0.75463037, + "epoch": 0.43319300884869144, + "grad_norm": 8.625, + "learning_rate": 9.260214547825794e-06, + "loss": 0.92789335, + "memory(GiB)": 302.58, + "step": 77460, + "train_speed(iter/s)": 0.124205 + }, + { + "acc": 0.73460851, + "epoch": 0.4333048583216707, + "grad_norm": 7.4375, + "learning_rate": 9.259730417862003e-06, + "loss": 1.05141993, + "memory(GiB)": 302.58, + "step": 77480, + "train_speed(iter/s)": 0.12422 + }, + { + "acc": 0.73971677, + "epoch": 0.43341670779464997, + "grad_norm": 5.78125, + "learning_rate": 9.259246142201924e-06, + "loss": 1.01807528, + "memory(GiB)": 302.58, + "step": 77500, + "train_speed(iter/s)": 0.124235 + }, + { + "acc": 0.74342566, + "epoch": 0.43352855726762923, + "grad_norm": 6.125, + "learning_rate": 9.258761720862121e-06, + "loss": 1.00860739, + "memory(GiB)": 302.58, + "step": 77520, + "train_speed(iter/s)": 0.124251 + }, + { + "acc": 0.73952813, + "epoch": 0.4336404067406085, + "grad_norm": 8.75, + "learning_rate": 9.258277153859164e-06, + "loss": 1.01194906, + "memory(GiB)": 302.58, + "step": 77540, + "train_speed(iter/s)": 0.124267 + }, + { + "acc": 0.73718162, + "epoch": 0.43375225621358776, + "grad_norm": 5.0, + "learning_rate": 9.257792441209626e-06, + "loss": 1.02454166, + "memory(GiB)": 302.58, + "step": 77560, + "train_speed(iter/s)": 0.124282 + }, + { + "acc": 0.72243004, + "epoch": 0.433864105686567, + "grad_norm": 5.40625, + "learning_rate": 9.257307582930085e-06, + "loss": 1.09585762, + "memory(GiB)": 302.58, + "step": 77580, + "train_speed(iter/s)": 0.124296 + }, + { + "acc": 0.71529822, + "epoch": 0.4339759551595463, + "grad_norm": 5.78125, + "learning_rate": 9.256822579037123e-06, + "loss": 1.1585309, + "memory(GiB)": 302.58, + "step": 77600, + "train_speed(iter/s)": 0.12431 + }, + { + "acc": 0.72870064, + "epoch": 0.43408780463252555, + "grad_norm": 8.4375, + "learning_rate": 9.256337429547332e-06, + "loss": 1.05718727, + "memory(GiB)": 302.58, + "step": 77620, + "train_speed(iter/s)": 0.124325 + }, + { + "acc": 0.74088397, + "epoch": 0.4341996541055048, + "grad_norm": 5.90625, + "learning_rate": 9.255852134477305e-06, + "loss": 1.02620449, + "memory(GiB)": 302.58, + "step": 77640, + "train_speed(iter/s)": 0.12434 + }, + { + "acc": 0.74005589, + "epoch": 0.4343115035784841, + "grad_norm": 10.1875, + "learning_rate": 9.255366693843639e-06, + "loss": 1.03319731, + "memory(GiB)": 302.58, + "step": 77660, + "train_speed(iter/s)": 0.124354 + }, + { + "acc": 0.74462113, + "epoch": 0.43442335305146335, + "grad_norm": 4.4375, + "learning_rate": 9.254881107662939e-06, + "loss": 1.00497551, + "memory(GiB)": 302.58, + "step": 77680, + "train_speed(iter/s)": 0.12437 + }, + { + "acc": 0.72171812, + "epoch": 0.4345352025244426, + "grad_norm": 5.125, + "learning_rate": 9.254395375951815e-06, + "loss": 1.10027704, + "memory(GiB)": 302.58, + "step": 77700, + "train_speed(iter/s)": 0.124385 + }, + { + "acc": 0.7483448, + "epoch": 0.4346470519974219, + "grad_norm": 9.125, + "learning_rate": 9.253909498726878e-06, + "loss": 0.97746859, + "memory(GiB)": 302.58, + "step": 77720, + "train_speed(iter/s)": 0.1244 + }, + { + "acc": 0.72995653, + "epoch": 0.43475890147040114, + "grad_norm": 7.1875, + "learning_rate": 9.253423476004745e-06, + "loss": 1.06326675, + "memory(GiB)": 302.58, + "step": 77740, + "train_speed(iter/s)": 0.124415 + }, + { + "acc": 0.73537459, + "epoch": 0.4348707509433804, + "grad_norm": 8.5625, + "learning_rate": 9.252937307802046e-06, + "loss": 1.03884802, + "memory(GiB)": 302.58, + "step": 77760, + "train_speed(iter/s)": 0.124431 + }, + { + "acc": 0.73945398, + "epoch": 0.43498260041635967, + "grad_norm": 6.78125, + "learning_rate": 9.252450994135402e-06, + "loss": 1.03417816, + "memory(GiB)": 302.58, + "step": 77780, + "train_speed(iter/s)": 0.124446 + }, + { + "acc": 0.74930625, + "epoch": 0.43509444988933893, + "grad_norm": 7.8125, + "learning_rate": 9.251964535021451e-06, + "loss": 0.97411699, + "memory(GiB)": 302.58, + "step": 77800, + "train_speed(iter/s)": 0.12446 + }, + { + "acc": 0.73155479, + "epoch": 0.4352062993623182, + "grad_norm": 8.4375, + "learning_rate": 9.251477930476831e-06, + "loss": 1.08815279, + "memory(GiB)": 302.58, + "step": 77820, + "train_speed(iter/s)": 0.124475 + }, + { + "acc": 0.75105605, + "epoch": 0.43531814883529746, + "grad_norm": 9.125, + "learning_rate": 9.250991180518183e-06, + "loss": 0.99348946, + "memory(GiB)": 302.58, + "step": 77840, + "train_speed(iter/s)": 0.12449 + }, + { + "acc": 0.72884626, + "epoch": 0.4354299983082767, + "grad_norm": 9.25, + "learning_rate": 9.250504285162158e-06, + "loss": 1.05935593, + "memory(GiB)": 302.58, + "step": 77860, + "train_speed(iter/s)": 0.124506 + }, + { + "acc": 0.75209255, + "epoch": 0.435541847781256, + "grad_norm": 6.34375, + "learning_rate": 9.250017244425409e-06, + "loss": 0.98805923, + "memory(GiB)": 302.58, + "step": 77880, + "train_speed(iter/s)": 0.124521 + }, + { + "acc": 0.75421019, + "epoch": 0.43565369725423525, + "grad_norm": 5.46875, + "learning_rate": 9.249530058324594e-06, + "loss": 0.94563808, + "memory(GiB)": 302.58, + "step": 77900, + "train_speed(iter/s)": 0.124537 + }, + { + "acc": 0.74749565, + "epoch": 0.4357655467272145, + "grad_norm": 8.3125, + "learning_rate": 9.249042726876374e-06, + "loss": 0.99696531, + "memory(GiB)": 302.58, + "step": 77920, + "train_speed(iter/s)": 0.124553 + }, + { + "acc": 0.74117041, + "epoch": 0.4358773962001938, + "grad_norm": 7.5, + "learning_rate": 9.248555250097421e-06, + "loss": 0.99986668, + "memory(GiB)": 302.58, + "step": 77940, + "train_speed(iter/s)": 0.124568 + }, + { + "acc": 0.72876101, + "epoch": 0.43598924567317304, + "grad_norm": 7.34375, + "learning_rate": 9.248067628004407e-06, + "loss": 1.06386223, + "memory(GiB)": 302.58, + "step": 77960, + "train_speed(iter/s)": 0.124583 + }, + { + "acc": 0.75796256, + "epoch": 0.4361010951461523, + "grad_norm": 8.4375, + "learning_rate": 9.24757986061401e-06, + "loss": 0.943787, + "memory(GiB)": 302.58, + "step": 77980, + "train_speed(iter/s)": 0.124599 + }, + { + "acc": 0.73411603, + "epoch": 0.43621294461913157, + "grad_norm": 9.625, + "learning_rate": 9.247091947942912e-06, + "loss": 1.05543938, + "memory(GiB)": 302.58, + "step": 78000, + "train_speed(iter/s)": 0.124614 + }, + { + "epoch": 0.43621294461913157, + "eval_acc": 0.7007520421933165, + "eval_loss": 1.0393517017364502, + "eval_runtime": 7501.043, + "eval_samples_per_second": 10.036, + "eval_steps_per_second": 10.036, + "step": 78000 + }, + { + "acc": 0.73570323, + "epoch": 0.43632479409211083, + "grad_norm": 9.0625, + "learning_rate": 9.246603890007803e-06, + "loss": 1.03513479, + "memory(GiB)": 302.58, + "step": 78020, + "train_speed(iter/s)": 0.123129 + }, + { + "acc": 0.74029717, + "epoch": 0.4364366435650901, + "grad_norm": 6.28125, + "learning_rate": 9.246115686825375e-06, + "loss": 1.03009605, + "memory(GiB)": 302.58, + "step": 78040, + "train_speed(iter/s)": 0.123144 + }, + { + "acc": 0.75676451, + "epoch": 0.43654849303806936, + "grad_norm": 8.6875, + "learning_rate": 9.245627338412326e-06, + "loss": 0.95250559, + "memory(GiB)": 302.58, + "step": 78060, + "train_speed(iter/s)": 0.123158 + }, + { + "acc": 0.72512488, + "epoch": 0.4366603425110486, + "grad_norm": 5.625, + "learning_rate": 9.24513884478536e-06, + "loss": 1.11664057, + "memory(GiB)": 302.58, + "step": 78080, + "train_speed(iter/s)": 0.123173 + }, + { + "acc": 0.71436777, + "epoch": 0.4367721919840279, + "grad_norm": 9.9375, + "learning_rate": 9.244650205961185e-06, + "loss": 1.13986273, + "memory(GiB)": 302.58, + "step": 78100, + "train_speed(iter/s)": 0.123188 + }, + { + "acc": 0.73799949, + "epoch": 0.43688404145700716, + "grad_norm": 6.34375, + "learning_rate": 9.244161421956512e-06, + "loss": 1.0234046, + "memory(GiB)": 302.58, + "step": 78120, + "train_speed(iter/s)": 0.123204 + }, + { + "acc": 0.71344862, + "epoch": 0.4369958909299864, + "grad_norm": 6.125, + "learning_rate": 9.243672492788062e-06, + "loss": 1.14426079, + "memory(GiB)": 302.58, + "step": 78140, + "train_speed(iter/s)": 0.123217 + }, + { + "acc": 0.73629823, + "epoch": 0.4371077404029657, + "grad_norm": 7.125, + "learning_rate": 9.243183418472557e-06, + "loss": 1.04683075, + "memory(GiB)": 302.58, + "step": 78160, + "train_speed(iter/s)": 0.123232 + }, + { + "acc": 0.73645115, + "epoch": 0.43721958987594495, + "grad_norm": 6.75, + "learning_rate": 9.242694199026722e-06, + "loss": 1.03696442, + "memory(GiB)": 302.58, + "step": 78180, + "train_speed(iter/s)": 0.123247 + }, + { + "acc": 0.73254805, + "epoch": 0.4373314393489242, + "grad_norm": 8.3125, + "learning_rate": 9.242204834467295e-06, + "loss": 1.06297474, + "memory(GiB)": 302.58, + "step": 78200, + "train_speed(iter/s)": 0.123262 + }, + { + "acc": 0.74407701, + "epoch": 0.4374432888219035, + "grad_norm": 6.3125, + "learning_rate": 9.241715324811009e-06, + "loss": 1.00772905, + "memory(GiB)": 302.58, + "step": 78220, + "train_speed(iter/s)": 0.123277 + }, + { + "acc": 0.73191833, + "epoch": 0.43755513829488274, + "grad_norm": 8.1875, + "learning_rate": 9.241225670074609e-06, + "loss": 1.05776348, + "memory(GiB)": 302.58, + "step": 78240, + "train_speed(iter/s)": 0.123292 + }, + { + "acc": 0.73187947, + "epoch": 0.437666987767862, + "grad_norm": 6.5, + "learning_rate": 9.240735870274842e-06, + "loss": 1.05283051, + "memory(GiB)": 302.58, + "step": 78260, + "train_speed(iter/s)": 0.123306 + }, + { + "acc": 0.7276906, + "epoch": 0.43777883724084127, + "grad_norm": 7.65625, + "learning_rate": 9.240245925428462e-06, + "loss": 1.08812475, + "memory(GiB)": 302.58, + "step": 78280, + "train_speed(iter/s)": 0.123321 + }, + { + "acc": 0.74842405, + "epoch": 0.43789068671382053, + "grad_norm": 7.375, + "learning_rate": 9.239755835552227e-06, + "loss": 0.9696311, + "memory(GiB)": 302.58, + "step": 78300, + "train_speed(iter/s)": 0.123336 + }, + { + "acc": 0.7504817, + "epoch": 0.4380025361867998, + "grad_norm": 7.3125, + "learning_rate": 9.239265600662897e-06, + "loss": 0.95657272, + "memory(GiB)": 302.58, + "step": 78320, + "train_speed(iter/s)": 0.123352 + }, + { + "acc": 0.71094971, + "epoch": 0.43811438565977906, + "grad_norm": 7.375, + "learning_rate": 9.23877522077724e-06, + "loss": 1.15035305, + "memory(GiB)": 302.58, + "step": 78340, + "train_speed(iter/s)": 0.123367 + }, + { + "acc": 0.73243699, + "epoch": 0.4382262351327583, + "grad_norm": 6.3125, + "learning_rate": 9.238284695912033e-06, + "loss": 1.06913643, + "memory(GiB)": 302.58, + "step": 78360, + "train_speed(iter/s)": 0.123382 + }, + { + "acc": 0.73458409, + "epoch": 0.4383380846057376, + "grad_norm": 8.6875, + "learning_rate": 9.237794026084048e-06, + "loss": 1.0368865, + "memory(GiB)": 302.58, + "step": 78380, + "train_speed(iter/s)": 0.123398 + }, + { + "acc": 0.7455503, + "epoch": 0.43844993407871685, + "grad_norm": 7.28125, + "learning_rate": 9.237303211310069e-06, + "loss": 0.98618784, + "memory(GiB)": 302.58, + "step": 78400, + "train_speed(iter/s)": 0.123413 + }, + { + "acc": 0.75456862, + "epoch": 0.4385617835516961, + "grad_norm": 8.8125, + "learning_rate": 9.236812251606885e-06, + "loss": 0.95522852, + "memory(GiB)": 302.58, + "step": 78420, + "train_speed(iter/s)": 0.123427 + }, + { + "acc": 0.71861167, + "epoch": 0.4386736330246754, + "grad_norm": 8.75, + "learning_rate": 9.236321146991288e-06, + "loss": 1.11668482, + "memory(GiB)": 302.58, + "step": 78440, + "train_speed(iter/s)": 0.123442 + }, + { + "acc": 0.72905374, + "epoch": 0.43878548249765464, + "grad_norm": 5.65625, + "learning_rate": 9.235829897480073e-06, + "loss": 1.06658926, + "memory(GiB)": 302.58, + "step": 78460, + "train_speed(iter/s)": 0.123457 + }, + { + "acc": 0.7137104, + "epoch": 0.4388973319706339, + "grad_norm": 6.125, + "learning_rate": 9.235338503090045e-06, + "loss": 1.13148746, + "memory(GiB)": 302.58, + "step": 78480, + "train_speed(iter/s)": 0.123472 + }, + { + "acc": 0.73092093, + "epoch": 0.4390091814436132, + "grad_norm": 6.84375, + "learning_rate": 9.23484696383801e-06, + "loss": 1.06263638, + "memory(GiB)": 302.58, + "step": 78500, + "train_speed(iter/s)": 0.123488 + }, + { + "acc": 0.71898942, + "epoch": 0.43912103091659244, + "grad_norm": 5.4375, + "learning_rate": 9.234355279740784e-06, + "loss": 1.15067949, + "memory(GiB)": 302.58, + "step": 78520, + "train_speed(iter/s)": 0.123503 + }, + { + "acc": 0.73376184, + "epoch": 0.4392328803895717, + "grad_norm": 7.8125, + "learning_rate": 9.233863450815175e-06, + "loss": 1.04165173, + "memory(GiB)": 302.58, + "step": 78540, + "train_speed(iter/s)": 0.123518 + }, + { + "acc": 0.73237791, + "epoch": 0.43934472986255096, + "grad_norm": 6.5, + "learning_rate": 9.233371477078015e-06, + "loss": 1.03607578, + "memory(GiB)": 302.58, + "step": 78560, + "train_speed(iter/s)": 0.123532 + }, + { + "acc": 0.74569116, + "epoch": 0.43945657933553023, + "grad_norm": 7.09375, + "learning_rate": 9.232879358546125e-06, + "loss": 0.98895607, + "memory(GiB)": 302.58, + "step": 78580, + "train_speed(iter/s)": 0.123548 + }, + { + "acc": 0.75084057, + "epoch": 0.4395684288085095, + "grad_norm": 6.5625, + "learning_rate": 9.232387095236338e-06, + "loss": 0.9644824, + "memory(GiB)": 302.58, + "step": 78600, + "train_speed(iter/s)": 0.123562 + }, + { + "acc": 0.71795979, + "epoch": 0.43968027828148876, + "grad_norm": 6.46875, + "learning_rate": 9.231894687165493e-06, + "loss": 1.1161684, + "memory(GiB)": 302.58, + "step": 78620, + "train_speed(iter/s)": 0.123576 + }, + { + "acc": 0.73459315, + "epoch": 0.439792127754468, + "grad_norm": 5.03125, + "learning_rate": 9.231402134350432e-06, + "loss": 1.06130533, + "memory(GiB)": 302.58, + "step": 78640, + "train_speed(iter/s)": 0.123591 + }, + { + "acc": 0.74194593, + "epoch": 0.4399039772274473, + "grad_norm": 5.75, + "learning_rate": 9.230909436807999e-06, + "loss": 1.02253065, + "memory(GiB)": 302.58, + "step": 78660, + "train_speed(iter/s)": 0.123606 + }, + { + "acc": 0.7433043, + "epoch": 0.44001582670042655, + "grad_norm": 5.0625, + "learning_rate": 9.230416594555048e-06, + "loss": 1.02497129, + "memory(GiB)": 302.58, + "step": 78680, + "train_speed(iter/s)": 0.12362 + }, + { + "acc": 0.73043485, + "epoch": 0.4401276761734058, + "grad_norm": 5.28125, + "learning_rate": 9.229923607608433e-06, + "loss": 1.06946659, + "memory(GiB)": 302.58, + "step": 78700, + "train_speed(iter/s)": 0.123635 + }, + { + "acc": 0.73973517, + "epoch": 0.4402395256463851, + "grad_norm": 10.75, + "learning_rate": 9.22943047598502e-06, + "loss": 1.01815987, + "memory(GiB)": 302.58, + "step": 78720, + "train_speed(iter/s)": 0.12365 + }, + { + "acc": 0.73952317, + "epoch": 0.44035137511936434, + "grad_norm": 6.1875, + "learning_rate": 9.228937199701673e-06, + "loss": 1.01808691, + "memory(GiB)": 302.58, + "step": 78740, + "train_speed(iter/s)": 0.123664 + }, + { + "acc": 0.73085399, + "epoch": 0.4404632245923436, + "grad_norm": 8.125, + "learning_rate": 9.228443778775265e-06, + "loss": 1.05547237, + "memory(GiB)": 302.58, + "step": 78760, + "train_speed(iter/s)": 0.12368 + }, + { + "acc": 0.72423344, + "epoch": 0.44057507406532287, + "grad_norm": 7.375, + "learning_rate": 9.227950213222672e-06, + "loss": 1.09337521, + "memory(GiB)": 302.58, + "step": 78780, + "train_speed(iter/s)": 0.123694 + }, + { + "acc": 0.74547424, + "epoch": 0.44068692353830213, + "grad_norm": 5.875, + "learning_rate": 9.227456503060774e-06, + "loss": 1.00402517, + "memory(GiB)": 302.58, + "step": 78800, + "train_speed(iter/s)": 0.123709 + }, + { + "acc": 0.73159537, + "epoch": 0.4407987730112814, + "grad_norm": 6.78125, + "learning_rate": 9.226962648306459e-06, + "loss": 1.07253456, + "memory(GiB)": 302.58, + "step": 78820, + "train_speed(iter/s)": 0.123724 + }, + { + "acc": 0.72940907, + "epoch": 0.44091062248426066, + "grad_norm": 6.28125, + "learning_rate": 9.226468648976618e-06, + "loss": 1.06246386, + "memory(GiB)": 302.58, + "step": 78840, + "train_speed(iter/s)": 0.123739 + }, + { + "acc": 0.72378521, + "epoch": 0.4410224719572399, + "grad_norm": 4.9375, + "learning_rate": 9.225974505088149e-06, + "loss": 1.08363533, + "memory(GiB)": 302.58, + "step": 78860, + "train_speed(iter/s)": 0.123755 + }, + { + "acc": 0.74168468, + "epoch": 0.4411343214302192, + "grad_norm": 9.875, + "learning_rate": 9.22548021665795e-06, + "loss": 0.98607321, + "memory(GiB)": 302.58, + "step": 78880, + "train_speed(iter/s)": 0.12377 + }, + { + "acc": 0.72871675, + "epoch": 0.44124617090319845, + "grad_norm": 6.1875, + "learning_rate": 9.22498578370293e-06, + "loss": 1.08367014, + "memory(GiB)": 302.58, + "step": 78900, + "train_speed(iter/s)": 0.123785 + }, + { + "acc": 0.73441772, + "epoch": 0.4413580203761777, + "grad_norm": 6.09375, + "learning_rate": 9.224491206239997e-06, + "loss": 1.05275269, + "memory(GiB)": 302.58, + "step": 78920, + "train_speed(iter/s)": 0.1238 + }, + { + "acc": 0.73137779, + "epoch": 0.441469869849157, + "grad_norm": 5.78125, + "learning_rate": 9.22399648428607e-06, + "loss": 1.08094969, + "memory(GiB)": 302.58, + "step": 78940, + "train_speed(iter/s)": 0.123814 + }, + { + "acc": 0.73623066, + "epoch": 0.44158171932213625, + "grad_norm": 8.875, + "learning_rate": 9.22350161785807e-06, + "loss": 1.02215242, + "memory(GiB)": 302.58, + "step": 78960, + "train_speed(iter/s)": 0.12383 + }, + { + "acc": 0.73451481, + "epoch": 0.4416935687951155, + "grad_norm": 6.375, + "learning_rate": 9.223006606972923e-06, + "loss": 1.05157824, + "memory(GiB)": 302.58, + "step": 78980, + "train_speed(iter/s)": 0.123844 + }, + { + "acc": 0.75724096, + "epoch": 0.4418054182680948, + "grad_norm": 4.9375, + "learning_rate": 9.22251145164756e-06, + "loss": 0.9519453, + "memory(GiB)": 302.58, + "step": 79000, + "train_speed(iter/s)": 0.123858 + }, + { + "acc": 0.7350502, + "epoch": 0.44191726774107404, + "grad_norm": 11.0, + "learning_rate": 9.222016151898914e-06, + "loss": 1.0673337, + "memory(GiB)": 302.58, + "step": 79020, + "train_speed(iter/s)": 0.123873 + }, + { + "acc": 0.7462913, + "epoch": 0.4420291172140533, + "grad_norm": 5.3125, + "learning_rate": 9.221520707743927e-06, + "loss": 0.99135923, + "memory(GiB)": 302.58, + "step": 79040, + "train_speed(iter/s)": 0.123888 + }, + { + "acc": 0.73025599, + "epoch": 0.44214096668703257, + "grad_norm": 8.1875, + "learning_rate": 9.221025119199549e-06, + "loss": 1.06194305, + "memory(GiB)": 302.58, + "step": 79060, + "train_speed(iter/s)": 0.123903 + }, + { + "acc": 0.73426185, + "epoch": 0.44225281616001183, + "grad_norm": 4.53125, + "learning_rate": 9.220529386282727e-06, + "loss": 1.04603167, + "memory(GiB)": 302.58, + "step": 79080, + "train_speed(iter/s)": 0.123918 + }, + { + "acc": 0.75603609, + "epoch": 0.4423646656329911, + "grad_norm": 9.3125, + "learning_rate": 9.220033509010414e-06, + "loss": 0.93432274, + "memory(GiB)": 302.58, + "step": 79100, + "train_speed(iter/s)": 0.123932 + }, + { + "acc": 0.74720182, + "epoch": 0.44247651510597036, + "grad_norm": 6.65625, + "learning_rate": 9.219537487399576e-06, + "loss": 0.97660971, + "memory(GiB)": 302.58, + "step": 79120, + "train_speed(iter/s)": 0.123947 + }, + { + "acc": 0.75371709, + "epoch": 0.4425883645789496, + "grad_norm": 5.84375, + "learning_rate": 9.219041321467176e-06, + "loss": 0.97539253, + "memory(GiB)": 302.58, + "step": 79140, + "train_speed(iter/s)": 0.123962 + }, + { + "acc": 0.73942184, + "epoch": 0.4427002140519289, + "grad_norm": 8.625, + "learning_rate": 9.218545011230183e-06, + "loss": 1.03125944, + "memory(GiB)": 302.58, + "step": 79160, + "train_speed(iter/s)": 0.123976 + }, + { + "acc": 0.7415257, + "epoch": 0.4428120635249082, + "grad_norm": 7.75, + "learning_rate": 9.218048556705576e-06, + "loss": 1.00579748, + "memory(GiB)": 302.58, + "step": 79180, + "train_speed(iter/s)": 0.12399 + }, + { + "acc": 0.73308291, + "epoch": 0.44292391299788747, + "grad_norm": 7.3125, + "learning_rate": 9.217551957910331e-06, + "loss": 1.05671644, + "memory(GiB)": 302.58, + "step": 79200, + "train_speed(iter/s)": 0.124004 + }, + { + "acc": 0.73446364, + "epoch": 0.44303576247086673, + "grad_norm": 7.25, + "learning_rate": 9.217055214861437e-06, + "loss": 1.03746281, + "memory(GiB)": 302.58, + "step": 79220, + "train_speed(iter/s)": 0.124019 + }, + { + "acc": 0.74164438, + "epoch": 0.443147611943846, + "grad_norm": 6.625, + "learning_rate": 9.216558327575883e-06, + "loss": 1.03847523, + "memory(GiB)": 302.58, + "step": 79240, + "train_speed(iter/s)": 0.124034 + }, + { + "acc": 0.75350361, + "epoch": 0.44325946141682526, + "grad_norm": 6.8125, + "learning_rate": 9.216061296070665e-06, + "loss": 0.94760685, + "memory(GiB)": 302.58, + "step": 79260, + "train_speed(iter/s)": 0.124049 + }, + { + "acc": 0.73681102, + "epoch": 0.4433713108898045, + "grad_norm": 7.96875, + "learning_rate": 9.21556412036278e-06, + "loss": 1.01610775, + "memory(GiB)": 302.58, + "step": 79280, + "train_speed(iter/s)": 0.124064 + }, + { + "acc": 0.74203806, + "epoch": 0.4434831603627838, + "grad_norm": 6.40625, + "learning_rate": 9.215066800469235e-06, + "loss": 1.01447935, + "memory(GiB)": 302.58, + "step": 79300, + "train_speed(iter/s)": 0.124079 + }, + { + "acc": 0.73249722, + "epoch": 0.44359500983576305, + "grad_norm": 6.46875, + "learning_rate": 9.21456933640704e-06, + "loss": 1.0408186, + "memory(GiB)": 302.58, + "step": 79320, + "train_speed(iter/s)": 0.124094 + }, + { + "acc": 0.74568586, + "epoch": 0.4437068593087423, + "grad_norm": 9.5, + "learning_rate": 9.21407172819321e-06, + "loss": 0.98089571, + "memory(GiB)": 302.58, + "step": 79340, + "train_speed(iter/s)": 0.124108 + }, + { + "acc": 0.74626899, + "epoch": 0.4438187087817216, + "grad_norm": 8.375, + "learning_rate": 9.213573975844765e-06, + "loss": 1.01620121, + "memory(GiB)": 302.58, + "step": 79360, + "train_speed(iter/s)": 0.124123 + }, + { + "acc": 0.72962599, + "epoch": 0.44393055825470085, + "grad_norm": 5.375, + "learning_rate": 9.213076079378728e-06, + "loss": 1.08086271, + "memory(GiB)": 302.58, + "step": 79380, + "train_speed(iter/s)": 0.124138 + }, + { + "acc": 0.7409091, + "epoch": 0.4440424077276801, + "grad_norm": 8.0625, + "learning_rate": 9.21257803881213e-06, + "loss": 0.9907815, + "memory(GiB)": 302.58, + "step": 79400, + "train_speed(iter/s)": 0.124153 + }, + { + "acc": 0.73299375, + "epoch": 0.4441542572006594, + "grad_norm": 7.375, + "learning_rate": 9.212079854162005e-06, + "loss": 1.05081654, + "memory(GiB)": 302.58, + "step": 79420, + "train_speed(iter/s)": 0.124168 + }, + { + "acc": 0.73064651, + "epoch": 0.44426610667363864, + "grad_norm": 5.375, + "learning_rate": 9.211581525445394e-06, + "loss": 1.0534976, + "memory(GiB)": 302.58, + "step": 79440, + "train_speed(iter/s)": 0.124183 + }, + { + "acc": 0.73278017, + "epoch": 0.4443779561466179, + "grad_norm": 7.53125, + "learning_rate": 9.21108305267934e-06, + "loss": 1.06088333, + "memory(GiB)": 302.58, + "step": 79460, + "train_speed(iter/s)": 0.124199 + }, + { + "acc": 0.74657297, + "epoch": 0.44448980561959717, + "grad_norm": 6.4375, + "learning_rate": 9.210584435880892e-06, + "loss": 1.01787472, + "memory(GiB)": 302.58, + "step": 79480, + "train_speed(iter/s)": 0.124213 + }, + { + "acc": 0.73782587, + "epoch": 0.44460165509257643, + "grad_norm": 5.40625, + "learning_rate": 9.210085675067104e-06, + "loss": 1.02831345, + "memory(GiB)": 302.58, + "step": 79500, + "train_speed(iter/s)": 0.124228 + }, + { + "acc": 0.73733773, + "epoch": 0.4447135045655557, + "grad_norm": 6.6875, + "learning_rate": 9.209586770255039e-06, + "loss": 1.03823709, + "memory(GiB)": 302.58, + "step": 79520, + "train_speed(iter/s)": 0.124243 + }, + { + "acc": 0.73265972, + "epoch": 0.44482535403853496, + "grad_norm": 8.5, + "learning_rate": 9.209087721461755e-06, + "loss": 1.05134935, + "memory(GiB)": 302.58, + "step": 79540, + "train_speed(iter/s)": 0.124258 + }, + { + "acc": 0.72014766, + "epoch": 0.4449372035115142, + "grad_norm": 6.8125, + "learning_rate": 9.208588528704327e-06, + "loss": 1.11221266, + "memory(GiB)": 302.58, + "step": 79560, + "train_speed(iter/s)": 0.124273 + }, + { + "acc": 0.72469134, + "epoch": 0.4450490529844935, + "grad_norm": 8.25, + "learning_rate": 9.208089191999825e-06, + "loss": 1.08230972, + "memory(GiB)": 302.58, + "step": 79580, + "train_speed(iter/s)": 0.124288 + }, + { + "acc": 0.74719634, + "epoch": 0.44516090245747275, + "grad_norm": 8.0625, + "learning_rate": 9.20758971136533e-06, + "loss": 0.98363609, + "memory(GiB)": 302.58, + "step": 79600, + "train_speed(iter/s)": 0.124301 + }, + { + "acc": 0.74250951, + "epoch": 0.445272751930452, + "grad_norm": 4.65625, + "learning_rate": 9.207090086817922e-06, + "loss": 0.97765388, + "memory(GiB)": 302.58, + "step": 79620, + "train_speed(iter/s)": 0.124317 + }, + { + "acc": 0.72631025, + "epoch": 0.4453846014034313, + "grad_norm": 5.25, + "learning_rate": 9.206590318374696e-06, + "loss": 1.09288883, + "memory(GiB)": 302.58, + "step": 79640, + "train_speed(iter/s)": 0.12433 + }, + { + "acc": 0.73116012, + "epoch": 0.44549645087641054, + "grad_norm": 5.28125, + "learning_rate": 9.20609040605274e-06, + "loss": 1.06151943, + "memory(GiB)": 302.58, + "step": 79660, + "train_speed(iter/s)": 0.124345 + }, + { + "acc": 0.73027115, + "epoch": 0.4456083003493898, + "grad_norm": 4.9375, + "learning_rate": 9.205590349869157e-06, + "loss": 1.03759403, + "memory(GiB)": 302.58, + "step": 79680, + "train_speed(iter/s)": 0.124359 + }, + { + "acc": 0.73358111, + "epoch": 0.44572014982236907, + "grad_norm": 9.75, + "learning_rate": 9.205090149841047e-06, + "loss": 1.04315186, + "memory(GiB)": 302.58, + "step": 79700, + "train_speed(iter/s)": 0.124374 + }, + { + "acc": 0.72670875, + "epoch": 0.44583199929534834, + "grad_norm": 8.5, + "learning_rate": 9.204589805985522e-06, + "loss": 1.08727074, + "memory(GiB)": 302.58, + "step": 79720, + "train_speed(iter/s)": 0.124388 + }, + { + "acc": 0.73754096, + "epoch": 0.4459438487683276, + "grad_norm": 5.28125, + "learning_rate": 9.204089318319692e-06, + "loss": 1.01263371, + "memory(GiB)": 302.58, + "step": 79740, + "train_speed(iter/s)": 0.124402 + }, + { + "acc": 0.73025446, + "epoch": 0.44605569824130686, + "grad_norm": 6.0, + "learning_rate": 9.203588686860675e-06, + "loss": 1.06853905, + "memory(GiB)": 302.58, + "step": 79760, + "train_speed(iter/s)": 0.124416 + }, + { + "acc": 0.74632721, + "epoch": 0.44616754771428613, + "grad_norm": 5.25, + "learning_rate": 9.203087911625597e-06, + "loss": 1.01322842, + "memory(GiB)": 302.58, + "step": 79780, + "train_speed(iter/s)": 0.12443 + }, + { + "acc": 0.74062662, + "epoch": 0.4462793971872654, + "grad_norm": 5.46875, + "learning_rate": 9.202586992631586e-06, + "loss": 1.0129837, + "memory(GiB)": 302.58, + "step": 79800, + "train_speed(iter/s)": 0.124445 + }, + { + "acc": 0.7421453, + "epoch": 0.44639124666024466, + "grad_norm": 6.71875, + "learning_rate": 9.202085929895772e-06, + "loss": 1.01939173, + "memory(GiB)": 302.58, + "step": 79820, + "train_speed(iter/s)": 0.12446 + }, + { + "acc": 0.74247856, + "epoch": 0.4465030961332239, + "grad_norm": 7.78125, + "learning_rate": 9.201584723435295e-06, + "loss": 1.01033726, + "memory(GiB)": 302.58, + "step": 79840, + "train_speed(iter/s)": 0.124475 + }, + { + "acc": 0.7391892, + "epoch": 0.4466149456062032, + "grad_norm": 8.375, + "learning_rate": 9.201083373267299e-06, + "loss": 1.0106822, + "memory(GiB)": 302.58, + "step": 79860, + "train_speed(iter/s)": 0.124491 + }, + { + "acc": 0.7259469, + "epoch": 0.44672679507918245, + "grad_norm": 6.40625, + "learning_rate": 9.20058187940893e-06, + "loss": 1.09030809, + "memory(GiB)": 302.58, + "step": 79880, + "train_speed(iter/s)": 0.124505 + }, + { + "acc": 0.73673453, + "epoch": 0.4468386445521617, + "grad_norm": 4.15625, + "learning_rate": 9.200080241877338e-06, + "loss": 1.05449057, + "memory(GiB)": 302.58, + "step": 79900, + "train_speed(iter/s)": 0.12452 + }, + { + "acc": 0.7351985, + "epoch": 0.446950494025141, + "grad_norm": 7.25, + "learning_rate": 9.199578460689686e-06, + "loss": 1.04046021, + "memory(GiB)": 302.58, + "step": 79920, + "train_speed(iter/s)": 0.124535 + }, + { + "acc": 0.72700143, + "epoch": 0.44706234349812024, + "grad_norm": 6.84375, + "learning_rate": 9.199076535863135e-06, + "loss": 1.08591404, + "memory(GiB)": 302.58, + "step": 79940, + "train_speed(iter/s)": 0.124548 + }, + { + "acc": 0.72233906, + "epoch": 0.4471741929710995, + "grad_norm": 7.03125, + "learning_rate": 9.198574467414851e-06, + "loss": 1.08348589, + "memory(GiB)": 302.58, + "step": 79960, + "train_speed(iter/s)": 0.124564 + }, + { + "acc": 0.73411627, + "epoch": 0.44728604244407877, + "grad_norm": 7.90625, + "learning_rate": 9.198072255362007e-06, + "loss": 1.0500102, + "memory(GiB)": 302.58, + "step": 79980, + "train_speed(iter/s)": 0.124577 + }, + { + "acc": 0.74959316, + "epoch": 0.44739789191705803, + "grad_norm": 6.75, + "learning_rate": 9.19756989972178e-06, + "loss": 0.97896767, + "memory(GiB)": 302.58, + "step": 80000, + "train_speed(iter/s)": 0.124591 + }, + { + "epoch": 0.44739789191705803, + "eval_acc": 0.7009585464310162, + "eval_loss": 1.0383275747299194, + "eval_runtime": 7496.151, + "eval_samples_per_second": 10.043, + "eval_steps_per_second": 10.043, + "step": 80000 + }, + { + "acc": 0.7309298, + "epoch": 0.4475097413900373, + "grad_norm": 7.40625, + "learning_rate": 9.19706740051135e-06, + "loss": 1.09287338, + "memory(GiB)": 302.58, + "step": 80020, + "train_speed(iter/s)": 0.123142 + }, + { + "acc": 0.72473578, + "epoch": 0.44762159086301656, + "grad_norm": 7.21875, + "learning_rate": 9.196564757747908e-06, + "loss": 1.07097626, + "memory(GiB)": 302.58, + "step": 80040, + "train_speed(iter/s)": 0.123157 + }, + { + "acc": 0.73612409, + "epoch": 0.4477334403359958, + "grad_norm": 8.4375, + "learning_rate": 9.196061971448646e-06, + "loss": 1.04027767, + "memory(GiB)": 302.58, + "step": 80060, + "train_speed(iter/s)": 0.123171 + }, + { + "acc": 0.73617902, + "epoch": 0.4478452898089751, + "grad_norm": 6.6875, + "learning_rate": 9.195559041630758e-06, + "loss": 1.04684572, + "memory(GiB)": 302.58, + "step": 80080, + "train_speed(iter/s)": 0.123187 + }, + { + "acc": 0.75127654, + "epoch": 0.44795713928195435, + "grad_norm": 7.15625, + "learning_rate": 9.195055968311448e-06, + "loss": 0.99241848, + "memory(GiB)": 302.58, + "step": 80100, + "train_speed(iter/s)": 0.123201 + }, + { + "acc": 0.73526425, + "epoch": 0.4480689887549336, + "grad_norm": 6.25, + "learning_rate": 9.19455275150792e-06, + "loss": 1.04346523, + "memory(GiB)": 302.58, + "step": 80120, + "train_speed(iter/s)": 0.123215 + }, + { + "acc": 0.73224602, + "epoch": 0.4481808382279129, + "grad_norm": 8.75, + "learning_rate": 9.194049391237387e-06, + "loss": 1.06245728, + "memory(GiB)": 302.58, + "step": 80140, + "train_speed(iter/s)": 0.12323 + }, + { + "acc": 0.72675161, + "epoch": 0.44829268770089215, + "grad_norm": 5.25, + "learning_rate": 9.193545887517067e-06, + "loss": 1.0895196, + "memory(GiB)": 302.58, + "step": 80160, + "train_speed(iter/s)": 0.123245 + }, + { + "acc": 0.72302079, + "epoch": 0.4484045371738714, + "grad_norm": 6.84375, + "learning_rate": 9.19304224036418e-06, + "loss": 1.10296183, + "memory(GiB)": 302.58, + "step": 80180, + "train_speed(iter/s)": 0.12326 + }, + { + "acc": 0.72810707, + "epoch": 0.4485163866468507, + "grad_norm": 7.65625, + "learning_rate": 9.192538449795953e-06, + "loss": 1.09578781, + "memory(GiB)": 302.58, + "step": 80200, + "train_speed(iter/s)": 0.123274 + }, + { + "acc": 0.7330864, + "epoch": 0.44862823611982994, + "grad_norm": 4.34375, + "learning_rate": 9.192034515829616e-06, + "loss": 1.03200579, + "memory(GiB)": 302.58, + "step": 80220, + "train_speed(iter/s)": 0.123289 + }, + { + "acc": 0.74753647, + "epoch": 0.4487400855928092, + "grad_norm": 6.1875, + "learning_rate": 9.191530438482408e-06, + "loss": 0.9798666, + "memory(GiB)": 302.58, + "step": 80240, + "train_speed(iter/s)": 0.123304 + }, + { + "acc": 0.7426146, + "epoch": 0.44885193506578847, + "grad_norm": 9.5, + "learning_rate": 9.191026217771566e-06, + "loss": 1.01968374, + "memory(GiB)": 302.58, + "step": 80260, + "train_speed(iter/s)": 0.123318 + }, + { + "acc": 0.73746834, + "epoch": 0.44896378453876773, + "grad_norm": 8.0625, + "learning_rate": 9.190521853714338e-06, + "loss": 1.03484077, + "memory(GiB)": 302.58, + "step": 80280, + "train_speed(iter/s)": 0.123332 + }, + { + "acc": 0.71817775, + "epoch": 0.449075634011747, + "grad_norm": 5.21875, + "learning_rate": 9.190017346327976e-06, + "loss": 1.09458733, + "memory(GiB)": 302.58, + "step": 80300, + "train_speed(iter/s)": 0.123347 + }, + { + "acc": 0.74762731, + "epoch": 0.44918748348472626, + "grad_norm": 7.0, + "learning_rate": 9.189512695629735e-06, + "loss": 0.97574348, + "memory(GiB)": 302.58, + "step": 80320, + "train_speed(iter/s)": 0.123362 + }, + { + "acc": 0.73697114, + "epoch": 0.4492993329577055, + "grad_norm": 6.8125, + "learning_rate": 9.189007901636872e-06, + "loss": 1.05334911, + "memory(GiB)": 302.58, + "step": 80340, + "train_speed(iter/s)": 0.123376 + }, + { + "acc": 0.74962358, + "epoch": 0.4494111824306848, + "grad_norm": 5.09375, + "learning_rate": 9.188502964366659e-06, + "loss": 0.9854723, + "memory(GiB)": 302.58, + "step": 80360, + "train_speed(iter/s)": 0.12339 + }, + { + "acc": 0.73311768, + "epoch": 0.44952303190366405, + "grad_norm": 6.8125, + "learning_rate": 9.187997883836362e-06, + "loss": 1.0539978, + "memory(GiB)": 302.58, + "step": 80380, + "train_speed(iter/s)": 0.123405 + }, + { + "acc": 0.73594723, + "epoch": 0.4496348813766433, + "grad_norm": 9.1875, + "learning_rate": 9.187492660063258e-06, + "loss": 1.07256889, + "memory(GiB)": 302.58, + "step": 80400, + "train_speed(iter/s)": 0.123418 + }, + { + "acc": 0.72985353, + "epoch": 0.4497467308496226, + "grad_norm": 4.0625, + "learning_rate": 9.186987293064626e-06, + "loss": 1.0782917, + "memory(GiB)": 302.58, + "step": 80420, + "train_speed(iter/s)": 0.123432 + }, + { + "acc": 0.75019836, + "epoch": 0.44985858032260184, + "grad_norm": 7.78125, + "learning_rate": 9.186481782857752e-06, + "loss": 0.98860226, + "memory(GiB)": 302.58, + "step": 80440, + "train_speed(iter/s)": 0.123447 + }, + { + "acc": 0.72705846, + "epoch": 0.4499704297955811, + "grad_norm": 13.125, + "learning_rate": 9.185976129459927e-06, + "loss": 1.09152756, + "memory(GiB)": 302.58, + "step": 80460, + "train_speed(iter/s)": 0.123461 + }, + { + "acc": 0.72910218, + "epoch": 0.45008227926856037, + "grad_norm": 5.34375, + "learning_rate": 9.185470332888444e-06, + "loss": 1.06207628, + "memory(GiB)": 302.58, + "step": 80480, + "train_speed(iter/s)": 0.123476 + }, + { + "acc": 0.73817692, + "epoch": 0.45019412874153963, + "grad_norm": 6.03125, + "learning_rate": 9.184964393160604e-06, + "loss": 1.04799175, + "memory(GiB)": 302.58, + "step": 80500, + "train_speed(iter/s)": 0.12349 + }, + { + "acc": 0.73313594, + "epoch": 0.4503059782145189, + "grad_norm": 6.53125, + "learning_rate": 9.184458310293713e-06, + "loss": 1.04677877, + "memory(GiB)": 302.58, + "step": 80520, + "train_speed(iter/s)": 0.123504 + }, + { + "acc": 0.72469554, + "epoch": 0.45041782768749816, + "grad_norm": 5.71875, + "learning_rate": 9.183952084305078e-06, + "loss": 1.07876282, + "memory(GiB)": 302.58, + "step": 80540, + "train_speed(iter/s)": 0.123518 + }, + { + "acc": 0.72711868, + "epoch": 0.4505296771604774, + "grad_norm": 7.75, + "learning_rate": 9.183445715212016e-06, + "loss": 1.08208914, + "memory(GiB)": 302.58, + "step": 80560, + "train_speed(iter/s)": 0.123533 + }, + { + "acc": 0.74179711, + "epoch": 0.4506415266334567, + "grad_norm": 5.46875, + "learning_rate": 9.182939203031844e-06, + "loss": 1.04017143, + "memory(GiB)": 302.58, + "step": 80580, + "train_speed(iter/s)": 0.123548 + }, + { + "acc": 0.73396487, + "epoch": 0.45075337610643595, + "grad_norm": 5.84375, + "learning_rate": 9.182432547781888e-06, + "loss": 1.02747097, + "memory(GiB)": 302.58, + "step": 80600, + "train_speed(iter/s)": 0.123562 + }, + { + "acc": 0.72354612, + "epoch": 0.4508652255794152, + "grad_norm": 7.03125, + "learning_rate": 9.181925749479476e-06, + "loss": 1.09230404, + "memory(GiB)": 302.58, + "step": 80620, + "train_speed(iter/s)": 0.123577 + }, + { + "acc": 0.74561362, + "epoch": 0.4509770750523945, + "grad_norm": 6.40625, + "learning_rate": 9.181418808141944e-06, + "loss": 1.01134224, + "memory(GiB)": 302.58, + "step": 80640, + "train_speed(iter/s)": 0.123591 + }, + { + "acc": 0.73480773, + "epoch": 0.45108892452537375, + "grad_norm": 6.75, + "learning_rate": 9.18091172378663e-06, + "loss": 1.0436718, + "memory(GiB)": 302.58, + "step": 80660, + "train_speed(iter/s)": 0.123606 + }, + { + "acc": 0.74398236, + "epoch": 0.451200773998353, + "grad_norm": 7.40625, + "learning_rate": 9.180404496430876e-06, + "loss": 0.98828707, + "memory(GiB)": 302.58, + "step": 80680, + "train_speed(iter/s)": 0.123621 + }, + { + "acc": 0.73871269, + "epoch": 0.4513126234713323, + "grad_norm": 4.09375, + "learning_rate": 9.179897126092033e-06, + "loss": 1.0398345, + "memory(GiB)": 302.58, + "step": 80700, + "train_speed(iter/s)": 0.123635 + }, + { + "acc": 0.72952356, + "epoch": 0.45142447294431154, + "grad_norm": 6.125, + "learning_rate": 9.179389612787455e-06, + "loss": 1.06244793, + "memory(GiB)": 302.58, + "step": 80720, + "train_speed(iter/s)": 0.123649 + }, + { + "acc": 0.72034664, + "epoch": 0.4515363224172908, + "grad_norm": 6.25, + "learning_rate": 9.1788819565345e-06, + "loss": 1.10293293, + "memory(GiB)": 302.58, + "step": 80740, + "train_speed(iter/s)": 0.123663 + }, + { + "acc": 0.72524529, + "epoch": 0.45164817189027007, + "grad_norm": 6.0, + "learning_rate": 9.17837415735053e-06, + "loss": 1.07735443, + "memory(GiB)": 302.58, + "step": 80760, + "train_speed(iter/s)": 0.123677 + }, + { + "acc": 0.73653579, + "epoch": 0.45176002136324933, + "grad_norm": 4.8125, + "learning_rate": 9.177866215252914e-06, + "loss": 0.98920364, + "memory(GiB)": 302.58, + "step": 80780, + "train_speed(iter/s)": 0.123692 + }, + { + "acc": 0.7400681, + "epoch": 0.4518718708362286, + "grad_norm": 6.875, + "learning_rate": 9.177358130259027e-06, + "loss": 1.01731548, + "memory(GiB)": 302.58, + "step": 80800, + "train_speed(iter/s)": 0.123706 + }, + { + "acc": 0.75017276, + "epoch": 0.45198372030920786, + "grad_norm": 5.78125, + "learning_rate": 9.176849902386246e-06, + "loss": 0.987572, + "memory(GiB)": 302.58, + "step": 80820, + "train_speed(iter/s)": 0.12372 + }, + { + "acc": 0.73885942, + "epoch": 0.4520955697821871, + "grad_norm": 6.65625, + "learning_rate": 9.176341531651955e-06, + "loss": 1.00636091, + "memory(GiB)": 302.58, + "step": 80840, + "train_speed(iter/s)": 0.123735 + }, + { + "acc": 0.74858975, + "epoch": 0.4522074192551664, + "grad_norm": 7.09375, + "learning_rate": 9.175833018073538e-06, + "loss": 1.00537758, + "memory(GiB)": 302.58, + "step": 80860, + "train_speed(iter/s)": 0.123749 + }, + { + "acc": 0.73615751, + "epoch": 0.45231926872814565, + "grad_norm": 4.71875, + "learning_rate": 9.175324361668394e-06, + "loss": 1.02492714, + "memory(GiB)": 302.58, + "step": 80880, + "train_speed(iter/s)": 0.123763 + }, + { + "acc": 0.73804526, + "epoch": 0.4524311182011249, + "grad_norm": 8.1875, + "learning_rate": 9.174815562453914e-06, + "loss": 1.0271739, + "memory(GiB)": 302.58, + "step": 80900, + "train_speed(iter/s)": 0.123778 + }, + { + "acc": 0.72028847, + "epoch": 0.4525429676741042, + "grad_norm": 4.875, + "learning_rate": 9.174306620447506e-06, + "loss": 1.11115961, + "memory(GiB)": 302.58, + "step": 80920, + "train_speed(iter/s)": 0.123793 + }, + { + "acc": 0.73115029, + "epoch": 0.45265481714708344, + "grad_norm": 7.09375, + "learning_rate": 9.173797535666574e-06, + "loss": 1.08568077, + "memory(GiB)": 302.58, + "step": 80940, + "train_speed(iter/s)": 0.123807 + }, + { + "acc": 0.7148025, + "epoch": 0.4527666666200627, + "grad_norm": 9.8125, + "learning_rate": 9.173288308128532e-06, + "loss": 1.12469873, + "memory(GiB)": 302.58, + "step": 80960, + "train_speed(iter/s)": 0.123822 + }, + { + "acc": 0.72800403, + "epoch": 0.45287851609304197, + "grad_norm": 8.25, + "learning_rate": 9.172778937850798e-06, + "loss": 1.08429279, + "memory(GiB)": 302.58, + "step": 80980, + "train_speed(iter/s)": 0.123835 + }, + { + "acc": 0.73261156, + "epoch": 0.45299036556602124, + "grad_norm": 5.6875, + "learning_rate": 9.17226942485079e-06, + "loss": 1.04888201, + "memory(GiB)": 302.58, + "step": 81000, + "train_speed(iter/s)": 0.12385 + }, + { + "acc": 0.74124427, + "epoch": 0.4531022150390005, + "grad_norm": 7.03125, + "learning_rate": 9.171759769145939e-06, + "loss": 1.00185766, + "memory(GiB)": 302.58, + "step": 81020, + "train_speed(iter/s)": 0.123864 + }, + { + "acc": 0.73365989, + "epoch": 0.45321406451197976, + "grad_norm": 6.65625, + "learning_rate": 9.171249970753675e-06, + "loss": 1.04312582, + "memory(GiB)": 302.58, + "step": 81040, + "train_speed(iter/s)": 0.123879 + }, + { + "acc": 0.7503437, + "epoch": 0.45332591398495903, + "grad_norm": 6.3125, + "learning_rate": 9.170740029691437e-06, + "loss": 0.97269869, + "memory(GiB)": 302.58, + "step": 81060, + "train_speed(iter/s)": 0.123892 + }, + { + "acc": 0.74141068, + "epoch": 0.4534377634579383, + "grad_norm": 4.8125, + "learning_rate": 9.170229945976665e-06, + "loss": 1.01746254, + "memory(GiB)": 302.58, + "step": 81080, + "train_speed(iter/s)": 0.123906 + }, + { + "acc": 0.74063339, + "epoch": 0.45354961293091756, + "grad_norm": 7.90625, + "learning_rate": 9.169719719626803e-06, + "loss": 0.9843811, + "memory(GiB)": 302.58, + "step": 81100, + "train_speed(iter/s)": 0.123921 + }, + { + "acc": 0.73881645, + "epoch": 0.4536614624038968, + "grad_norm": 6.25, + "learning_rate": 9.169209350659307e-06, + "loss": 1.02946482, + "memory(GiB)": 302.58, + "step": 81120, + "train_speed(iter/s)": 0.123934 + }, + { + "acc": 0.70486169, + "epoch": 0.4537733118768761, + "grad_norm": 7.125, + "learning_rate": 9.16869883909163e-06, + "loss": 1.1592967, + "memory(GiB)": 302.58, + "step": 81140, + "train_speed(iter/s)": 0.123949 + }, + { + "acc": 0.75595169, + "epoch": 0.45388516134985535, + "grad_norm": 7.0625, + "learning_rate": 9.168188184941233e-06, + "loss": 0.93706102, + "memory(GiB)": 302.58, + "step": 81160, + "train_speed(iter/s)": 0.123964 + }, + { + "acc": 0.73704185, + "epoch": 0.4539970108228346, + "grad_norm": 6.9375, + "learning_rate": 9.167677388225587e-06, + "loss": 1.03979597, + "memory(GiB)": 302.58, + "step": 81180, + "train_speed(iter/s)": 0.123978 + }, + { + "acc": 0.73824019, + "epoch": 0.4541088602958139, + "grad_norm": 7.1875, + "learning_rate": 9.167166448962154e-06, + "loss": 1.03140049, + "memory(GiB)": 302.58, + "step": 81200, + "train_speed(iter/s)": 0.123992 + }, + { + "acc": 0.74725623, + "epoch": 0.45422070976879314, + "grad_norm": 7.8125, + "learning_rate": 9.166655367168418e-06, + "loss": 0.97674332, + "memory(GiB)": 302.58, + "step": 81220, + "train_speed(iter/s)": 0.124006 + }, + { + "acc": 0.74216323, + "epoch": 0.4543325592417724, + "grad_norm": 5.3125, + "learning_rate": 9.166144142861856e-06, + "loss": 1.00393209, + "memory(GiB)": 302.58, + "step": 81240, + "train_speed(iter/s)": 0.124019 + }, + { + "acc": 0.72739973, + "epoch": 0.45444440871475167, + "grad_norm": 10.375, + "learning_rate": 9.165632776059953e-06, + "loss": 1.09435968, + "memory(GiB)": 302.58, + "step": 81260, + "train_speed(iter/s)": 0.124032 + }, + { + "acc": 0.75771127, + "epoch": 0.45455625818773093, + "grad_norm": 7.15625, + "learning_rate": 9.1651212667802e-06, + "loss": 0.98423796, + "memory(GiB)": 302.58, + "step": 81280, + "train_speed(iter/s)": 0.124046 + }, + { + "acc": 0.73128457, + "epoch": 0.4546681076607102, + "grad_norm": 6.96875, + "learning_rate": 9.164609615040093e-06, + "loss": 1.04618397, + "memory(GiB)": 302.58, + "step": 81300, + "train_speed(iter/s)": 0.12406 + }, + { + "acc": 0.71771331, + "epoch": 0.45477995713368946, + "grad_norm": 6.9375, + "learning_rate": 9.164097820857131e-06, + "loss": 1.09493618, + "memory(GiB)": 302.58, + "step": 81320, + "train_speed(iter/s)": 0.124075 + }, + { + "acc": 0.72928362, + "epoch": 0.4548918066066687, + "grad_norm": 6.84375, + "learning_rate": 9.163585884248821e-06, + "loss": 1.05990067, + "memory(GiB)": 302.58, + "step": 81340, + "train_speed(iter/s)": 0.124088 + }, + { + "acc": 0.75427766, + "epoch": 0.455003656079648, + "grad_norm": 7.59375, + "learning_rate": 9.163073805232669e-06, + "loss": 0.98023767, + "memory(GiB)": 302.58, + "step": 81360, + "train_speed(iter/s)": 0.124103 + }, + { + "acc": 0.75018358, + "epoch": 0.45511550555262725, + "grad_norm": 7.28125, + "learning_rate": 9.162561583826194e-06, + "loss": 0.99205408, + "memory(GiB)": 302.58, + "step": 81380, + "train_speed(iter/s)": 0.124117 + }, + { + "acc": 0.71467085, + "epoch": 0.4552273550256065, + "grad_norm": 5.75, + "learning_rate": 9.162049220046914e-06, + "loss": 1.10771017, + "memory(GiB)": 302.58, + "step": 81400, + "train_speed(iter/s)": 0.124132 + }, + { + "acc": 0.73454399, + "epoch": 0.4553392044985858, + "grad_norm": 5.75, + "learning_rate": 9.161536713912352e-06, + "loss": 1.04739571, + "memory(GiB)": 302.58, + "step": 81420, + "train_speed(iter/s)": 0.124145 + }, + { + "acc": 0.74062228, + "epoch": 0.45545105397156505, + "grad_norm": 8.3125, + "learning_rate": 9.161024065440039e-06, + "loss": 1.01654739, + "memory(GiB)": 302.58, + "step": 81440, + "train_speed(iter/s)": 0.124159 + }, + { + "acc": 0.73877358, + "epoch": 0.4555629034445443, + "grad_norm": 9.4375, + "learning_rate": 9.160511274647509e-06, + "loss": 1.02397203, + "memory(GiB)": 302.58, + "step": 81460, + "train_speed(iter/s)": 0.124173 + }, + { + "acc": 0.72640285, + "epoch": 0.4556747529175236, + "grad_norm": 5.96875, + "learning_rate": 9.1599983415523e-06, + "loss": 1.08786583, + "memory(GiB)": 302.58, + "step": 81480, + "train_speed(iter/s)": 0.124187 + }, + { + "acc": 0.75690084, + "epoch": 0.45578660239050284, + "grad_norm": 8.5625, + "learning_rate": 9.159485266171957e-06, + "loss": 0.96397343, + "memory(GiB)": 302.58, + "step": 81500, + "train_speed(iter/s)": 0.124201 + }, + { + "acc": 0.73125191, + "epoch": 0.4558984518634821, + "grad_norm": 7.625, + "learning_rate": 9.158972048524029e-06, + "loss": 1.04260349, + "memory(GiB)": 302.58, + "step": 81520, + "train_speed(iter/s)": 0.124215 + }, + { + "acc": 0.72179132, + "epoch": 0.45601030133646137, + "grad_norm": 8.0625, + "learning_rate": 9.158458688626069e-06, + "loss": 1.09031668, + "memory(GiB)": 302.58, + "step": 81540, + "train_speed(iter/s)": 0.12423 + }, + { + "acc": 0.73367004, + "epoch": 0.45612215080944063, + "grad_norm": 7.40625, + "learning_rate": 9.157945186495636e-06, + "loss": 1.0526804, + "memory(GiB)": 302.58, + "step": 81560, + "train_speed(iter/s)": 0.124244 + }, + { + "acc": 0.74458218, + "epoch": 0.4562340002824199, + "grad_norm": 8.9375, + "learning_rate": 9.157431542150292e-06, + "loss": 1.0192914, + "memory(GiB)": 302.58, + "step": 81580, + "train_speed(iter/s)": 0.124259 + }, + { + "acc": 0.73718896, + "epoch": 0.45634584975539916, + "grad_norm": 7.84375, + "learning_rate": 9.156917755607608e-06, + "loss": 1.02806616, + "memory(GiB)": 302.58, + "step": 81600, + "train_speed(iter/s)": 0.124273 + }, + { + "acc": 0.73820748, + "epoch": 0.4564576992283784, + "grad_norm": 6.28125, + "learning_rate": 9.156403826885155e-06, + "loss": 1.00383596, + "memory(GiB)": 302.58, + "step": 81620, + "train_speed(iter/s)": 0.124287 + }, + { + "acc": 0.74129105, + "epoch": 0.4565695487013577, + "grad_norm": 7.84375, + "learning_rate": 9.15588975600051e-06, + "loss": 1.00122862, + "memory(GiB)": 302.58, + "step": 81640, + "train_speed(iter/s)": 0.124301 + }, + { + "acc": 0.73608422, + "epoch": 0.45668139817433695, + "grad_norm": 7.3125, + "learning_rate": 9.155375542971259e-06, + "loss": 1.03813391, + "memory(GiB)": 302.58, + "step": 81660, + "train_speed(iter/s)": 0.124315 + }, + { + "acc": 0.73803806, + "epoch": 0.45679324764731627, + "grad_norm": 7.15625, + "learning_rate": 9.154861187814988e-06, + "loss": 1.04240894, + "memory(GiB)": 302.58, + "step": 81680, + "train_speed(iter/s)": 0.124329 + }, + { + "acc": 0.72736664, + "epoch": 0.45690509712029553, + "grad_norm": 6.03125, + "learning_rate": 9.154346690549288e-06, + "loss": 1.08789434, + "memory(GiB)": 302.58, + "step": 81700, + "train_speed(iter/s)": 0.124343 + }, + { + "acc": 0.72791266, + "epoch": 0.4570169465932748, + "grad_norm": 8.1875, + "learning_rate": 9.15383205119176e-06, + "loss": 1.04989948, + "memory(GiB)": 302.58, + "step": 81720, + "train_speed(iter/s)": 0.124358 + }, + { + "acc": 0.71590242, + "epoch": 0.45712879606625406, + "grad_norm": 8.3125, + "learning_rate": 9.153317269760003e-06, + "loss": 1.12827387, + "memory(GiB)": 302.58, + "step": 81740, + "train_speed(iter/s)": 0.124371 + }, + { + "acc": 0.72920918, + "epoch": 0.4572406455392333, + "grad_norm": 9.625, + "learning_rate": 9.152802346271628e-06, + "loss": 1.06421471, + "memory(GiB)": 302.58, + "step": 81760, + "train_speed(iter/s)": 0.124385 + }, + { + "acc": 0.7387403, + "epoch": 0.4573524950122126, + "grad_norm": 12.0625, + "learning_rate": 9.15228728074424e-06, + "loss": 1.04163752, + "memory(GiB)": 302.58, + "step": 81780, + "train_speed(iter/s)": 0.124399 + }, + { + "acc": 0.74715152, + "epoch": 0.45746434448519185, + "grad_norm": 5.1875, + "learning_rate": 9.151772073195466e-06, + "loss": 0.9761301, + "memory(GiB)": 302.58, + "step": 81800, + "train_speed(iter/s)": 0.124413 + }, + { + "acc": 0.73177166, + "epoch": 0.4575761939581711, + "grad_norm": 8.0625, + "learning_rate": 9.151256723642918e-06, + "loss": 1.07100716, + "memory(GiB)": 302.58, + "step": 81820, + "train_speed(iter/s)": 0.124427 + }, + { + "acc": 0.74446754, + "epoch": 0.4576880434311504, + "grad_norm": 8.625, + "learning_rate": 9.150741232104228e-06, + "loss": 1.01326752, + "memory(GiB)": 302.58, + "step": 81840, + "train_speed(iter/s)": 0.124441 + }, + { + "acc": 0.7451973, + "epoch": 0.45779989290412965, + "grad_norm": 7.65625, + "learning_rate": 9.150225598597026e-06, + "loss": 0.99773855, + "memory(GiB)": 302.58, + "step": 81860, + "train_speed(iter/s)": 0.124456 + }, + { + "acc": 0.75371957, + "epoch": 0.4579117423771089, + "grad_norm": 6.59375, + "learning_rate": 9.14970982313895e-06, + "loss": 0.94540348, + "memory(GiB)": 302.58, + "step": 81880, + "train_speed(iter/s)": 0.12447 + }, + { + "acc": 0.72307472, + "epoch": 0.4580235918500882, + "grad_norm": 6.25, + "learning_rate": 9.14919390574764e-06, + "loss": 1.08957539, + "memory(GiB)": 302.58, + "step": 81900, + "train_speed(iter/s)": 0.124484 + }, + { + "acc": 0.7315752, + "epoch": 0.45813544132306744, + "grad_norm": 4.65625, + "learning_rate": 9.14867784644074e-06, + "loss": 1.05141592, + "memory(GiB)": 302.58, + "step": 81920, + "train_speed(iter/s)": 0.124499 + }, + { + "acc": 0.72156739, + "epoch": 0.4582472907960467, + "grad_norm": 5.25, + "learning_rate": 9.148161645235903e-06, + "loss": 1.10325794, + "memory(GiB)": 302.58, + "step": 81940, + "train_speed(iter/s)": 0.124513 + }, + { + "acc": 0.73773327, + "epoch": 0.45835914026902597, + "grad_norm": 6.03125, + "learning_rate": 9.147645302150784e-06, + "loss": 1.057481, + "memory(GiB)": 302.58, + "step": 81960, + "train_speed(iter/s)": 0.124527 + }, + { + "acc": 0.73963251, + "epoch": 0.45847098974200523, + "grad_norm": 8.3125, + "learning_rate": 9.147128817203043e-06, + "loss": 1.02255974, + "memory(GiB)": 302.58, + "step": 81980, + "train_speed(iter/s)": 0.124541 + }, + { + "acc": 0.74029112, + "epoch": 0.4585828392149845, + "grad_norm": 5.59375, + "learning_rate": 9.14661219041035e-06, + "loss": 1.02684116, + "memory(GiB)": 302.58, + "step": 82000, + "train_speed(iter/s)": 0.124555 + }, + { + "epoch": 0.4585828392149845, + "eval_acc": 0.7010285478675246, + "eval_loss": 1.0379078388214111, + "eval_runtime": 7509.7306, + "eval_samples_per_second": 10.025, + "eval_steps_per_second": 10.025, + "step": 82000 + }, + { + "acc": 0.73640575, + "epoch": 0.45869468868796376, + "grad_norm": 5.15625, + "learning_rate": 9.146095421790368e-06, + "loss": 1.04043837, + "memory(GiB)": 302.58, + "step": 82020, + "train_speed(iter/s)": 0.123142 + }, + { + "acc": 0.74312572, + "epoch": 0.458806538160943, + "grad_norm": 7.65625, + "learning_rate": 9.145578511360776e-06, + "loss": 1.016856, + "memory(GiB)": 302.58, + "step": 82040, + "train_speed(iter/s)": 0.123155 + }, + { + "acc": 0.72747273, + "epoch": 0.4589183876339223, + "grad_norm": 9.25, + "learning_rate": 9.145061459139254e-06, + "loss": 1.06829615, + "memory(GiB)": 302.58, + "step": 82060, + "train_speed(iter/s)": 0.123168 + }, + { + "acc": 0.73939986, + "epoch": 0.45903023710690155, + "grad_norm": 5.53125, + "learning_rate": 9.144544265143487e-06, + "loss": 1.03470612, + "memory(GiB)": 302.58, + "step": 82080, + "train_speed(iter/s)": 0.123182 + }, + { + "acc": 0.72414613, + "epoch": 0.4591420865798808, + "grad_norm": 6.71875, + "learning_rate": 9.144026929391164e-06, + "loss": 1.10488834, + "memory(GiB)": 302.58, + "step": 82100, + "train_speed(iter/s)": 0.123196 + }, + { + "acc": 0.75520625, + "epoch": 0.4592539360528601, + "grad_norm": 6.46875, + "learning_rate": 9.14350945189998e-06, + "loss": 0.93716717, + "memory(GiB)": 302.58, + "step": 82120, + "train_speed(iter/s)": 0.123211 + }, + { + "acc": 0.72815614, + "epoch": 0.45936578552583934, + "grad_norm": 6.84375, + "learning_rate": 9.142991832687634e-06, + "loss": 1.082372, + "memory(GiB)": 302.58, + "step": 82140, + "train_speed(iter/s)": 0.123225 + }, + { + "acc": 0.732164, + "epoch": 0.4594776349988186, + "grad_norm": 7.375, + "learning_rate": 9.142474071771829e-06, + "loss": 1.07353878, + "memory(GiB)": 302.58, + "step": 82160, + "train_speed(iter/s)": 0.123239 + }, + { + "acc": 0.74935837, + "epoch": 0.45958948447179787, + "grad_norm": 6.5625, + "learning_rate": 9.141956169170277e-06, + "loss": 0.97206068, + "memory(GiB)": 302.58, + "step": 82180, + "train_speed(iter/s)": 0.123254 + }, + { + "acc": 0.73913183, + "epoch": 0.45970133394477714, + "grad_norm": 6.15625, + "learning_rate": 9.141438124900687e-06, + "loss": 1.02815704, + "memory(GiB)": 302.58, + "step": 82200, + "train_speed(iter/s)": 0.123268 + }, + { + "acc": 0.74600682, + "epoch": 0.4598131834177564, + "grad_norm": 5.9375, + "learning_rate": 9.140919938980784e-06, + "loss": 1.00731335, + "memory(GiB)": 302.58, + "step": 82220, + "train_speed(iter/s)": 0.123283 + }, + { + "acc": 0.7308465, + "epoch": 0.45992503289073566, + "grad_norm": 7.3125, + "learning_rate": 9.140401611428288e-06, + "loss": 1.07281313, + "memory(GiB)": 302.58, + "step": 82240, + "train_speed(iter/s)": 0.123298 + }, + { + "acc": 0.73964968, + "epoch": 0.4600368823637149, + "grad_norm": 7.75, + "learning_rate": 9.139883142260927e-06, + "loss": 1.03773756, + "memory(GiB)": 302.58, + "step": 82260, + "train_speed(iter/s)": 0.123311 + }, + { + "acc": 0.74957395, + "epoch": 0.4601487318366942, + "grad_norm": 5.375, + "learning_rate": 9.139364531496438e-06, + "loss": 0.97976685, + "memory(GiB)": 302.58, + "step": 82280, + "train_speed(iter/s)": 0.123326 + }, + { + "acc": 0.72311792, + "epoch": 0.46026058130967346, + "grad_norm": 5.53125, + "learning_rate": 9.138845779152555e-06, + "loss": 1.10500746, + "memory(GiB)": 302.58, + "step": 82300, + "train_speed(iter/s)": 0.123339 + }, + { + "acc": 0.72928767, + "epoch": 0.4603724307826527, + "grad_norm": 9.0625, + "learning_rate": 9.138326885247021e-06, + "loss": 1.05151787, + "memory(GiB)": 302.58, + "step": 82320, + "train_speed(iter/s)": 0.123352 + }, + { + "acc": 0.7302865, + "epoch": 0.460484280255632, + "grad_norm": 4.46875, + "learning_rate": 9.137807849797587e-06, + "loss": 1.07285423, + "memory(GiB)": 302.58, + "step": 82340, + "train_speed(iter/s)": 0.123367 + }, + { + "acc": 0.73566685, + "epoch": 0.46059612972861125, + "grad_norm": 5.875, + "learning_rate": 9.137288672822002e-06, + "loss": 1.05988865, + "memory(GiB)": 302.58, + "step": 82360, + "train_speed(iter/s)": 0.123381 + }, + { + "acc": 0.72972503, + "epoch": 0.4607079792015905, + "grad_norm": 7.3125, + "learning_rate": 9.136769354338026e-06, + "loss": 1.08134823, + "memory(GiB)": 302.58, + "step": 82380, + "train_speed(iter/s)": 0.123396 + }, + { + "acc": 0.74196734, + "epoch": 0.4608198286745698, + "grad_norm": 6.9375, + "learning_rate": 9.136249894363422e-06, + "loss": 1.00386066, + "memory(GiB)": 302.58, + "step": 82400, + "train_speed(iter/s)": 0.123412 + }, + { + "acc": 0.73258314, + "epoch": 0.46093167814754904, + "grad_norm": 8.375, + "learning_rate": 9.135730292915955e-06, + "loss": 1.05951414, + "memory(GiB)": 302.58, + "step": 82420, + "train_speed(iter/s)": 0.123426 + }, + { + "acc": 0.72044549, + "epoch": 0.4610435276205283, + "grad_norm": 8.8125, + "learning_rate": 9.135210550013398e-06, + "loss": 1.10333004, + "memory(GiB)": 302.58, + "step": 82440, + "train_speed(iter/s)": 0.12344 + }, + { + "acc": 0.73321133, + "epoch": 0.46115537709350757, + "grad_norm": 7.15625, + "learning_rate": 9.134690665673529e-06, + "loss": 1.05544548, + "memory(GiB)": 302.58, + "step": 82460, + "train_speed(iter/s)": 0.123454 + }, + { + "acc": 0.73066177, + "epoch": 0.46126722656648683, + "grad_norm": 7.21875, + "learning_rate": 9.134170639914128e-06, + "loss": 1.08829594, + "memory(GiB)": 302.58, + "step": 82480, + "train_speed(iter/s)": 0.123469 + }, + { + "acc": 0.7546309, + "epoch": 0.4613790760394661, + "grad_norm": 9.9375, + "learning_rate": 9.133650472752981e-06, + "loss": 0.94285603, + "memory(GiB)": 302.58, + "step": 82500, + "train_speed(iter/s)": 0.123482 + }, + { + "acc": 0.74249415, + "epoch": 0.46149092551244536, + "grad_norm": 9.9375, + "learning_rate": 9.13313016420788e-06, + "loss": 1.01706152, + "memory(GiB)": 302.58, + "step": 82520, + "train_speed(iter/s)": 0.123496 + }, + { + "acc": 0.7335113, + "epoch": 0.4616027749854246, + "grad_norm": 5.40625, + "learning_rate": 9.132609714296622e-06, + "loss": 1.06625881, + "memory(GiB)": 302.58, + "step": 82540, + "train_speed(iter/s)": 0.123509 + }, + { + "acc": 0.74534955, + "epoch": 0.4617146244584039, + "grad_norm": 9.5, + "learning_rate": 9.13208912303701e-06, + "loss": 0.98944387, + "memory(GiB)": 302.58, + "step": 82560, + "train_speed(iter/s)": 0.123525 + }, + { + "acc": 0.72106929, + "epoch": 0.46182647393138315, + "grad_norm": 9.6875, + "learning_rate": 9.131568390446845e-06, + "loss": 1.09923468, + "memory(GiB)": 302.58, + "step": 82580, + "train_speed(iter/s)": 0.123539 + }, + { + "acc": 0.71991792, + "epoch": 0.4619383234043624, + "grad_norm": 5.78125, + "learning_rate": 9.13104751654394e-06, + "loss": 1.12027035, + "memory(GiB)": 302.58, + "step": 82600, + "train_speed(iter/s)": 0.123553 + }, + { + "acc": 0.74491835, + "epoch": 0.4620501728773417, + "grad_norm": 6.15625, + "learning_rate": 9.130526501346112e-06, + "loss": 0.99796677, + "memory(GiB)": 302.58, + "step": 82620, + "train_speed(iter/s)": 0.123567 + }, + { + "acc": 0.74615073, + "epoch": 0.46216202235032094, + "grad_norm": 9.625, + "learning_rate": 9.13000534487118e-06, + "loss": 0.98390112, + "memory(GiB)": 302.58, + "step": 82640, + "train_speed(iter/s)": 0.123582 + }, + { + "acc": 0.74332461, + "epoch": 0.4622738718233002, + "grad_norm": 8.625, + "learning_rate": 9.12948404713697e-06, + "loss": 0.98591137, + "memory(GiB)": 302.58, + "step": 82660, + "train_speed(iter/s)": 0.123597 + }, + { + "acc": 0.73975697, + "epoch": 0.4623857212962795, + "grad_norm": 6.71875, + "learning_rate": 9.128962608161309e-06, + "loss": 1.03104744, + "memory(GiB)": 302.58, + "step": 82680, + "train_speed(iter/s)": 0.123611 + }, + { + "acc": 0.74734135, + "epoch": 0.46249757076925874, + "grad_norm": 6.59375, + "learning_rate": 9.128441027962036e-06, + "loss": 1.0045332, + "memory(GiB)": 302.58, + "step": 82700, + "train_speed(iter/s)": 0.123625 + }, + { + "acc": 0.75420113, + "epoch": 0.462609420242238, + "grad_norm": 4.9375, + "learning_rate": 9.127919306556988e-06, + "loss": 0.95272045, + "memory(GiB)": 302.58, + "step": 82720, + "train_speed(iter/s)": 0.12364 + }, + { + "acc": 0.74643016, + "epoch": 0.46272126971521726, + "grad_norm": 7.21875, + "learning_rate": 9.127397443964012e-06, + "loss": 0.99183512, + "memory(GiB)": 302.58, + "step": 82740, + "train_speed(iter/s)": 0.123653 + }, + { + "acc": 0.74728723, + "epoch": 0.46283311918819653, + "grad_norm": 6.84375, + "learning_rate": 9.126875440200955e-06, + "loss": 1.02235756, + "memory(GiB)": 302.58, + "step": 82760, + "train_speed(iter/s)": 0.123667 + }, + { + "acc": 0.74011188, + "epoch": 0.4629449686611758, + "grad_norm": 7.875, + "learning_rate": 9.12635329528567e-06, + "loss": 1.01979046, + "memory(GiB)": 302.58, + "step": 82780, + "train_speed(iter/s)": 0.123682 + }, + { + "acc": 0.7282939, + "epoch": 0.46305681813415506, + "grad_norm": 8.4375, + "learning_rate": 9.12583100923602e-06, + "loss": 1.05569773, + "memory(GiB)": 302.58, + "step": 82800, + "train_speed(iter/s)": 0.123697 + }, + { + "acc": 0.74495368, + "epoch": 0.4631686676071343, + "grad_norm": 6.65625, + "learning_rate": 9.125308582069868e-06, + "loss": 1.02831631, + "memory(GiB)": 302.58, + "step": 82820, + "train_speed(iter/s)": 0.123711 + }, + { + "acc": 0.74051499, + "epoch": 0.4632805170801136, + "grad_norm": 6.25, + "learning_rate": 9.124786013805078e-06, + "loss": 1.01588936, + "memory(GiB)": 302.58, + "step": 82840, + "train_speed(iter/s)": 0.123726 + }, + { + "acc": 0.73338928, + "epoch": 0.46339236655309285, + "grad_norm": 6.25, + "learning_rate": 9.124263304459529e-06, + "loss": 1.04515343, + "memory(GiB)": 302.58, + "step": 82860, + "train_speed(iter/s)": 0.123739 + }, + { + "acc": 0.74473252, + "epoch": 0.4635042160260721, + "grad_norm": 5.65625, + "learning_rate": 9.123740454051098e-06, + "loss": 0.99296474, + "memory(GiB)": 302.58, + "step": 82880, + "train_speed(iter/s)": 0.123753 + }, + { + "acc": 0.72774572, + "epoch": 0.4636160654990514, + "grad_norm": 9.0, + "learning_rate": 9.123217462597667e-06, + "loss": 1.1066452, + "memory(GiB)": 302.58, + "step": 82900, + "train_speed(iter/s)": 0.123767 + }, + { + "acc": 0.72350335, + "epoch": 0.46372791497203064, + "grad_norm": 8.125, + "learning_rate": 9.122694330117124e-06, + "loss": 1.1318922, + "memory(GiB)": 302.58, + "step": 82920, + "train_speed(iter/s)": 0.12378 + }, + { + "acc": 0.75123553, + "epoch": 0.4638397644450099, + "grad_norm": 6.78125, + "learning_rate": 9.122171056627362e-06, + "loss": 0.99175329, + "memory(GiB)": 302.58, + "step": 82940, + "train_speed(iter/s)": 0.123795 + }, + { + "acc": 0.74064856, + "epoch": 0.46395161391798917, + "grad_norm": 6.0, + "learning_rate": 9.12164764214628e-06, + "loss": 1.01138716, + "memory(GiB)": 302.58, + "step": 82960, + "train_speed(iter/s)": 0.123809 + }, + { + "acc": 0.7390049, + "epoch": 0.46406346339096843, + "grad_norm": 9.25, + "learning_rate": 9.121124086691778e-06, + "loss": 1.03170881, + "memory(GiB)": 302.58, + "step": 82980, + "train_speed(iter/s)": 0.123823 + }, + { + "acc": 0.7489357, + "epoch": 0.4641753128639477, + "grad_norm": 6.78125, + "learning_rate": 9.120600390281764e-06, + "loss": 0.98994236, + "memory(GiB)": 302.58, + "step": 83000, + "train_speed(iter/s)": 0.123836 + }, + { + "acc": 0.73870687, + "epoch": 0.46428716233692696, + "grad_norm": 9.5, + "learning_rate": 9.120076552934152e-06, + "loss": 1.00993567, + "memory(GiB)": 302.58, + "step": 83020, + "train_speed(iter/s)": 0.123851 + }, + { + "acc": 0.74211712, + "epoch": 0.4643990118099062, + "grad_norm": 5.78125, + "learning_rate": 9.119552574666857e-06, + "loss": 1.02600384, + "memory(GiB)": 302.58, + "step": 83040, + "train_speed(iter/s)": 0.123864 + }, + { + "acc": 0.7391573, + "epoch": 0.4645108612828855, + "grad_norm": 7.625, + "learning_rate": 9.1190284554978e-06, + "loss": 1.00764675, + "memory(GiB)": 302.58, + "step": 83060, + "train_speed(iter/s)": 0.123878 + }, + { + "acc": 0.73725829, + "epoch": 0.46462271075586475, + "grad_norm": 7.03125, + "learning_rate": 9.118504195444911e-06, + "loss": 1.03285494, + "memory(GiB)": 302.58, + "step": 83080, + "train_speed(iter/s)": 0.123892 + }, + { + "acc": 0.75153484, + "epoch": 0.464734560228844, + "grad_norm": 7.40625, + "learning_rate": 9.117979794526118e-06, + "loss": 0.95567102, + "memory(GiB)": 302.58, + "step": 83100, + "train_speed(iter/s)": 0.123906 + }, + { + "acc": 0.72584009, + "epoch": 0.4648464097018233, + "grad_norm": 6.25, + "learning_rate": 9.117455252759357e-06, + "loss": 1.06950016, + "memory(GiB)": 302.58, + "step": 83120, + "train_speed(iter/s)": 0.123921 + }, + { + "acc": 0.72767248, + "epoch": 0.46495825917480255, + "grad_norm": 10.75, + "learning_rate": 9.116930570162572e-06, + "loss": 1.06604996, + "memory(GiB)": 302.58, + "step": 83140, + "train_speed(iter/s)": 0.123935 + }, + { + "acc": 0.73924322, + "epoch": 0.4650701086477818, + "grad_norm": 8.4375, + "learning_rate": 9.116405746753708e-06, + "loss": 1.03238697, + "memory(GiB)": 302.58, + "step": 83160, + "train_speed(iter/s)": 0.123949 + }, + { + "acc": 0.74272971, + "epoch": 0.4651819581207611, + "grad_norm": 6.75, + "learning_rate": 9.115880782550713e-06, + "loss": 1.01058846, + "memory(GiB)": 302.58, + "step": 83180, + "train_speed(iter/s)": 0.123962 + }, + { + "acc": 0.72966061, + "epoch": 0.46529380759374034, + "grad_norm": 6.34375, + "learning_rate": 9.115355677571544e-06, + "loss": 1.05052271, + "memory(GiB)": 302.58, + "step": 83200, + "train_speed(iter/s)": 0.123976 + }, + { + "acc": 0.7353476, + "epoch": 0.4654056570667196, + "grad_norm": 5.40625, + "learning_rate": 9.114830431834163e-06, + "loss": 1.03191528, + "memory(GiB)": 302.58, + "step": 83220, + "train_speed(iter/s)": 0.12399 + }, + { + "acc": 0.71976886, + "epoch": 0.46551750653969887, + "grad_norm": 6.0625, + "learning_rate": 9.114305045356531e-06, + "loss": 1.12510433, + "memory(GiB)": 302.58, + "step": 83240, + "train_speed(iter/s)": 0.124004 + }, + { + "acc": 0.73358631, + "epoch": 0.46562935601267813, + "grad_norm": 6.5, + "learning_rate": 9.113779518156622e-06, + "loss": 1.06562738, + "memory(GiB)": 302.58, + "step": 83260, + "train_speed(iter/s)": 0.124018 + }, + { + "acc": 0.72243781, + "epoch": 0.4657412054856574, + "grad_norm": 8.375, + "learning_rate": 9.113253850252409e-06, + "loss": 1.0879303, + "memory(GiB)": 302.58, + "step": 83280, + "train_speed(iter/s)": 0.124032 + }, + { + "acc": 0.72521234, + "epoch": 0.46585305495863666, + "grad_norm": 8.25, + "learning_rate": 9.11272804166187e-06, + "loss": 1.09639015, + "memory(GiB)": 302.58, + "step": 83300, + "train_speed(iter/s)": 0.124045 + }, + { + "acc": 0.73260684, + "epoch": 0.4659649044316159, + "grad_norm": 6.96875, + "learning_rate": 9.112202092402993e-06, + "loss": 1.07113142, + "memory(GiB)": 302.58, + "step": 83320, + "train_speed(iter/s)": 0.124059 + }, + { + "acc": 0.74841561, + "epoch": 0.4660767539045952, + "grad_norm": 7.375, + "learning_rate": 9.111676002493764e-06, + "loss": 0.98155117, + "memory(GiB)": 302.58, + "step": 83340, + "train_speed(iter/s)": 0.124074 + }, + { + "acc": 0.72221332, + "epoch": 0.46618860337757445, + "grad_norm": 6.84375, + "learning_rate": 9.111149771952176e-06, + "loss": 1.10447845, + "memory(GiB)": 302.58, + "step": 83360, + "train_speed(iter/s)": 0.124087 + }, + { + "acc": 0.73243213, + "epoch": 0.4663004528505537, + "grad_norm": 5.625, + "learning_rate": 9.110623400796233e-06, + "loss": 1.04192495, + "memory(GiB)": 302.58, + "step": 83380, + "train_speed(iter/s)": 0.124101 + }, + { + "acc": 0.74117947, + "epoch": 0.466412302323533, + "grad_norm": 9.0, + "learning_rate": 9.110096889043933e-06, + "loss": 0.99530239, + "memory(GiB)": 302.58, + "step": 83400, + "train_speed(iter/s)": 0.124116 + }, + { + "acc": 0.73324137, + "epoch": 0.46652415179651224, + "grad_norm": 5.34375, + "learning_rate": 9.109570236713287e-06, + "loss": 1.06519175, + "memory(GiB)": 302.58, + "step": 83420, + "train_speed(iter/s)": 0.124129 + }, + { + "acc": 0.73848705, + "epoch": 0.4666360012694915, + "grad_norm": 7.65625, + "learning_rate": 9.10904344382231e-06, + "loss": 1.02729769, + "memory(GiB)": 302.58, + "step": 83440, + "train_speed(iter/s)": 0.124143 + }, + { + "acc": 0.73146234, + "epoch": 0.46674785074247077, + "grad_norm": 8.9375, + "learning_rate": 9.108516510389016e-06, + "loss": 1.062362, + "memory(GiB)": 302.58, + "step": 83460, + "train_speed(iter/s)": 0.124157 + }, + { + "acc": 0.74452176, + "epoch": 0.46685970021545004, + "grad_norm": 6.9375, + "learning_rate": 9.10798943643143e-06, + "loss": 0.99923334, + "memory(GiB)": 302.58, + "step": 83480, + "train_speed(iter/s)": 0.124172 + }, + { + "acc": 0.7441112, + "epoch": 0.4669715496884293, + "grad_norm": 6.6875, + "learning_rate": 9.107462221967577e-06, + "loss": 0.99796019, + "memory(GiB)": 302.58, + "step": 83500, + "train_speed(iter/s)": 0.124187 + }, + { + "acc": 0.73967671, + "epoch": 0.46708339916140856, + "grad_norm": 8.1875, + "learning_rate": 9.106934867015495e-06, + "loss": 1.01883011, + "memory(GiB)": 302.58, + "step": 83520, + "train_speed(iter/s)": 0.124202 + }, + { + "acc": 0.74295206, + "epoch": 0.4671952486343878, + "grad_norm": 6.125, + "learning_rate": 9.106407371593217e-06, + "loss": 1.02842665, + "memory(GiB)": 302.58, + "step": 83540, + "train_speed(iter/s)": 0.124216 + }, + { + "acc": 0.73017883, + "epoch": 0.4673070981073671, + "grad_norm": 8.1875, + "learning_rate": 9.105879735718786e-06, + "loss": 1.05639505, + "memory(GiB)": 302.58, + "step": 83560, + "train_speed(iter/s)": 0.124231 + }, + { + "acc": 0.73939481, + "epoch": 0.46741894758034636, + "grad_norm": 10.25, + "learning_rate": 9.105351959410247e-06, + "loss": 1.028654, + "memory(GiB)": 302.58, + "step": 83580, + "train_speed(iter/s)": 0.124245 + }, + { + "acc": 0.73042431, + "epoch": 0.4675307970533256, + "grad_norm": 8.4375, + "learning_rate": 9.104824042685653e-06, + "loss": 1.07336531, + "memory(GiB)": 302.58, + "step": 83600, + "train_speed(iter/s)": 0.124258 + }, + { + "acc": 0.74792452, + "epoch": 0.4676426465263049, + "grad_norm": 8.3125, + "learning_rate": 9.104295985563063e-06, + "loss": 0.98067122, + "memory(GiB)": 302.58, + "step": 83620, + "train_speed(iter/s)": 0.124272 + }, + { + "acc": 0.74334564, + "epoch": 0.46775449599928415, + "grad_norm": 6.25, + "learning_rate": 9.103767788060534e-06, + "loss": 1.00420561, + "memory(GiB)": 302.58, + "step": 83640, + "train_speed(iter/s)": 0.124285 + }, + { + "acc": 0.74684958, + "epoch": 0.4678663454722634, + "grad_norm": 9.5, + "learning_rate": 9.103239450196136e-06, + "loss": 1.00595112, + "memory(GiB)": 302.58, + "step": 83660, + "train_speed(iter/s)": 0.124299 + }, + { + "acc": 0.73883395, + "epoch": 0.4679781949452427, + "grad_norm": 7.96875, + "learning_rate": 9.102710971987936e-06, + "loss": 1.0030879, + "memory(GiB)": 302.58, + "step": 83680, + "train_speed(iter/s)": 0.124313 + }, + { + "acc": 0.73522511, + "epoch": 0.46809004441822194, + "grad_norm": 6.65625, + "learning_rate": 9.102182353454011e-06, + "loss": 1.037607, + "memory(GiB)": 302.58, + "step": 83700, + "train_speed(iter/s)": 0.124326 + }, + { + "acc": 0.73936262, + "epoch": 0.4682018938912012, + "grad_norm": 5.625, + "learning_rate": 9.101653594612443e-06, + "loss": 1.02006063, + "memory(GiB)": 302.58, + "step": 83720, + "train_speed(iter/s)": 0.124341 + }, + { + "acc": 0.73492117, + "epoch": 0.46831374336418047, + "grad_norm": 6.0625, + "learning_rate": 9.101124695481317e-06, + "loss": 1.06152525, + "memory(GiB)": 302.58, + "step": 83740, + "train_speed(iter/s)": 0.124356 + }, + { + "acc": 0.72869625, + "epoch": 0.46842559283715973, + "grad_norm": 10.3125, + "learning_rate": 9.10059565607872e-06, + "loss": 1.06631327, + "memory(GiB)": 302.58, + "step": 83760, + "train_speed(iter/s)": 0.124368 + }, + { + "acc": 0.75409236, + "epoch": 0.468537442310139, + "grad_norm": 6.90625, + "learning_rate": 9.10006647642275e-06, + "loss": 0.95720596, + "memory(GiB)": 302.58, + "step": 83780, + "train_speed(iter/s)": 0.124382 + }, + { + "acc": 0.73690095, + "epoch": 0.46864929178311826, + "grad_norm": 6.9375, + "learning_rate": 9.099537156531505e-06, + "loss": 1.03408556, + "memory(GiB)": 302.58, + "step": 83800, + "train_speed(iter/s)": 0.124395 + }, + { + "acc": 0.74420033, + "epoch": 0.4687611412560975, + "grad_norm": 6.6875, + "learning_rate": 9.099007696423091e-06, + "loss": 1.00174904, + "memory(GiB)": 302.58, + "step": 83820, + "train_speed(iter/s)": 0.12441 + }, + { + "acc": 0.72450747, + "epoch": 0.4688729907290768, + "grad_norm": 7.1875, + "learning_rate": 9.098478096115614e-06, + "loss": 1.10103254, + "memory(GiB)": 302.58, + "step": 83840, + "train_speed(iter/s)": 0.124424 + }, + { + "acc": 0.73686061, + "epoch": 0.46898484020205605, + "grad_norm": 10.375, + "learning_rate": 9.097948355627192e-06, + "loss": 1.04893551, + "memory(GiB)": 302.58, + "step": 83860, + "train_speed(iter/s)": 0.124437 + }, + { + "acc": 0.72939487, + "epoch": 0.4690966896750353, + "grad_norm": 10.875, + "learning_rate": 9.097418474975942e-06, + "loss": 1.05080595, + "memory(GiB)": 302.58, + "step": 83880, + "train_speed(iter/s)": 0.124451 + }, + { + "acc": 0.7511384, + "epoch": 0.4692085391480146, + "grad_norm": 8.1875, + "learning_rate": 9.096888454179988e-06, + "loss": 0.98270178, + "memory(GiB)": 302.58, + "step": 83900, + "train_speed(iter/s)": 0.124465 + }, + { + "acc": 0.73815641, + "epoch": 0.46932038862099384, + "grad_norm": 6.53125, + "learning_rate": 9.096358293257457e-06, + "loss": 1.02638626, + "memory(GiB)": 302.58, + "step": 83920, + "train_speed(iter/s)": 0.12448 + }, + { + "acc": 0.73231573, + "epoch": 0.4694322380939731, + "grad_norm": 6.5, + "learning_rate": 9.095827992226482e-06, + "loss": 1.04747915, + "memory(GiB)": 302.58, + "step": 83940, + "train_speed(iter/s)": 0.124493 + }, + { + "acc": 0.72155871, + "epoch": 0.4695440875669524, + "grad_norm": 6.4375, + "learning_rate": 9.095297551105205e-06, + "loss": 1.11913099, + "memory(GiB)": 302.58, + "step": 83960, + "train_speed(iter/s)": 0.124507 + }, + { + "acc": 0.74170933, + "epoch": 0.46965593703993164, + "grad_norm": 6.75, + "learning_rate": 9.094766969911763e-06, + "loss": 1.02499008, + "memory(GiB)": 302.58, + "step": 83980, + "train_speed(iter/s)": 0.12452 + }, + { + "acc": 0.73642874, + "epoch": 0.4697677865129109, + "grad_norm": 7.59375, + "learning_rate": 9.09423624866431e-06, + "loss": 1.03279667, + "memory(GiB)": 302.58, + "step": 84000, + "train_speed(iter/s)": 0.124535 + }, + { + "epoch": 0.4697677865129109, + "eval_acc": 0.7013350259877869, + "eval_loss": 1.0368666648864746, + "eval_runtime": 7501.5641, + "eval_samples_per_second": 10.036, + "eval_steps_per_second": 10.036, + "step": 84000 + }, + { + "acc": 0.73830204, + "epoch": 0.46987963598589017, + "grad_norm": 5.28125, + "learning_rate": 9.093705387380992e-06, + "loss": 1.0338954, + "memory(GiB)": 302.58, + "step": 84020, + "train_speed(iter/s)": 0.123155 + }, + { + "acc": 0.75046377, + "epoch": 0.46999148545886943, + "grad_norm": 6.9375, + "learning_rate": 9.09317438607997e-06, + "loss": 0.96850157, + "memory(GiB)": 302.58, + "step": 84040, + "train_speed(iter/s)": 0.123169 + }, + { + "acc": 0.73272729, + "epoch": 0.4701033349318487, + "grad_norm": 7.0625, + "learning_rate": 9.092643244779404e-06, + "loss": 1.05479708, + "memory(GiB)": 302.58, + "step": 84060, + "train_speed(iter/s)": 0.123183 + }, + { + "acc": 0.74077821, + "epoch": 0.47021518440482796, + "grad_norm": 6.53125, + "learning_rate": 9.092111963497462e-06, + "loss": 1.03386145, + "memory(GiB)": 302.58, + "step": 84080, + "train_speed(iter/s)": 0.123197 + }, + { + "acc": 0.7206068, + "epoch": 0.4703270338778072, + "grad_norm": 8.1875, + "learning_rate": 9.091580542252317e-06, + "loss": 1.10140276, + "memory(GiB)": 302.58, + "step": 84100, + "train_speed(iter/s)": 0.123212 + }, + { + "acc": 0.73715396, + "epoch": 0.4704388833507865, + "grad_norm": 5.78125, + "learning_rate": 9.091048981062141e-06, + "loss": 1.01333408, + "memory(GiB)": 302.58, + "step": 84120, + "train_speed(iter/s)": 0.123226 + }, + { + "acc": 0.73689485, + "epoch": 0.47055073282376575, + "grad_norm": 8.5625, + "learning_rate": 9.09051727994512e-06, + "loss": 1.04556093, + "memory(GiB)": 302.58, + "step": 84140, + "train_speed(iter/s)": 0.12324 + }, + { + "acc": 0.72519956, + "epoch": 0.470662582296745, + "grad_norm": 6.9375, + "learning_rate": 9.089985438919434e-06, + "loss": 1.08636208, + "memory(GiB)": 302.58, + "step": 84160, + "train_speed(iter/s)": 0.123252 + }, + { + "acc": 0.75567393, + "epoch": 0.4707744317697243, + "grad_norm": 8.625, + "learning_rate": 9.08945345800328e-06, + "loss": 0.9432395, + "memory(GiB)": 302.58, + "step": 84180, + "train_speed(iter/s)": 0.123266 + }, + { + "acc": 0.73481822, + "epoch": 0.4708862812427036, + "grad_norm": 9.125, + "learning_rate": 9.088921337214847e-06, + "loss": 1.0339941, + "memory(GiB)": 302.58, + "step": 84200, + "train_speed(iter/s)": 0.123279 + }, + { + "acc": 0.7513051, + "epoch": 0.47099813071568286, + "grad_norm": 6.9375, + "learning_rate": 9.088389076572342e-06, + "loss": 0.96891575, + "memory(GiB)": 302.58, + "step": 84220, + "train_speed(iter/s)": 0.123293 + }, + { + "acc": 0.72928429, + "epoch": 0.4711099801886621, + "grad_norm": 10.9375, + "learning_rate": 9.087856676093965e-06, + "loss": 1.07639017, + "memory(GiB)": 302.58, + "step": 84240, + "train_speed(iter/s)": 0.123305 + }, + { + "acc": 0.72604828, + "epoch": 0.4712218296616414, + "grad_norm": 9.4375, + "learning_rate": 9.087324135797927e-06, + "loss": 1.07162094, + "memory(GiB)": 302.58, + "step": 84260, + "train_speed(iter/s)": 0.123319 + }, + { + "acc": 0.74396377, + "epoch": 0.47133367913462065, + "grad_norm": 4.65625, + "learning_rate": 9.086791455702444e-06, + "loss": 0.99268312, + "memory(GiB)": 302.58, + "step": 84280, + "train_speed(iter/s)": 0.123334 + }, + { + "acc": 0.7418931, + "epoch": 0.4714455286075999, + "grad_norm": 9.8125, + "learning_rate": 9.086258635825734e-06, + "loss": 1.00432768, + "memory(GiB)": 302.58, + "step": 84300, + "train_speed(iter/s)": 0.123348 + }, + { + "acc": 0.73386917, + "epoch": 0.4715573780805792, + "grad_norm": 7.5625, + "learning_rate": 9.08572567618602e-06, + "loss": 1.06717653, + "memory(GiB)": 302.58, + "step": 84320, + "train_speed(iter/s)": 0.123363 + }, + { + "acc": 0.72361727, + "epoch": 0.47166922755355845, + "grad_norm": 6.90625, + "learning_rate": 9.085192576801536e-06, + "loss": 1.08803196, + "memory(GiB)": 302.58, + "step": 84340, + "train_speed(iter/s)": 0.123375 + }, + { + "acc": 0.73973122, + "epoch": 0.4717810770265377, + "grad_norm": 7.9375, + "learning_rate": 9.08465933769051e-06, + "loss": 1.02538481, + "memory(GiB)": 302.58, + "step": 84360, + "train_speed(iter/s)": 0.123389 + }, + { + "acc": 0.74195828, + "epoch": 0.471892926499517, + "grad_norm": 5.625, + "learning_rate": 9.084125958871183e-06, + "loss": 1.024158, + "memory(GiB)": 302.58, + "step": 84380, + "train_speed(iter/s)": 0.123402 + }, + { + "acc": 0.75922475, + "epoch": 0.47200477597249624, + "grad_norm": 8.3125, + "learning_rate": 9.083592440361798e-06, + "loss": 0.95583591, + "memory(GiB)": 302.58, + "step": 84400, + "train_speed(iter/s)": 0.123417 + }, + { + "acc": 0.73599529, + "epoch": 0.4721166254454755, + "grad_norm": 6.03125, + "learning_rate": 9.083058782180601e-06, + "loss": 1.05369215, + "memory(GiB)": 302.58, + "step": 84420, + "train_speed(iter/s)": 0.12343 + }, + { + "acc": 0.74350109, + "epoch": 0.47222847491845477, + "grad_norm": 5.1875, + "learning_rate": 9.082524984345848e-06, + "loss": 1.03262577, + "memory(GiB)": 302.58, + "step": 84440, + "train_speed(iter/s)": 0.123443 + }, + { + "acc": 0.72843776, + "epoch": 0.47234032439143403, + "grad_norm": 7.03125, + "learning_rate": 9.081991046875795e-06, + "loss": 1.08320503, + "memory(GiB)": 302.58, + "step": 84460, + "train_speed(iter/s)": 0.123456 + }, + { + "acc": 0.72851253, + "epoch": 0.4724521738644133, + "grad_norm": 6.90625, + "learning_rate": 9.081456969788705e-06, + "loss": 1.0498065, + "memory(GiB)": 302.58, + "step": 84480, + "train_speed(iter/s)": 0.12347 + }, + { + "acc": 0.71722951, + "epoch": 0.47256402333739256, + "grad_norm": 7.46875, + "learning_rate": 9.080922753102845e-06, + "loss": 1.11614361, + "memory(GiB)": 302.58, + "step": 84500, + "train_speed(iter/s)": 0.123484 + }, + { + "acc": 0.73681741, + "epoch": 0.4726758728103718, + "grad_norm": 4.0625, + "learning_rate": 9.080388396836487e-06, + "loss": 1.05264769, + "memory(GiB)": 302.58, + "step": 84520, + "train_speed(iter/s)": 0.123498 + }, + { + "acc": 0.74300089, + "epoch": 0.4727877222833511, + "grad_norm": 7.9375, + "learning_rate": 9.079853901007905e-06, + "loss": 1.00064831, + "memory(GiB)": 302.58, + "step": 84540, + "train_speed(iter/s)": 0.123513 + }, + { + "acc": 0.75336409, + "epoch": 0.47289957175633035, + "grad_norm": 10.875, + "learning_rate": 9.079319265635382e-06, + "loss": 0.95781708, + "memory(GiB)": 302.58, + "step": 84560, + "train_speed(iter/s)": 0.123527 + }, + { + "acc": 0.72160892, + "epoch": 0.4730114212293096, + "grad_norm": 6.65625, + "learning_rate": 9.078784490737208e-06, + "loss": 1.09657812, + "memory(GiB)": 302.58, + "step": 84580, + "train_speed(iter/s)": 0.123541 + }, + { + "acc": 0.72424769, + "epoch": 0.4731232707022889, + "grad_norm": 7.96875, + "learning_rate": 9.078249576331671e-06, + "loss": 1.08861313, + "memory(GiB)": 302.58, + "step": 84600, + "train_speed(iter/s)": 0.123555 + }, + { + "acc": 0.73076391, + "epoch": 0.47323512017526814, + "grad_norm": 6.6875, + "learning_rate": 9.077714522437065e-06, + "loss": 1.06278362, + "memory(GiB)": 302.58, + "step": 84620, + "train_speed(iter/s)": 0.123568 + }, + { + "acc": 0.7376873, + "epoch": 0.4733469696482474, + "grad_norm": 8.3125, + "learning_rate": 9.077179329071693e-06, + "loss": 1.02047958, + "memory(GiB)": 302.58, + "step": 84640, + "train_speed(iter/s)": 0.123581 + }, + { + "acc": 0.72080698, + "epoch": 0.47345881912122667, + "grad_norm": 6.875, + "learning_rate": 9.076643996253857e-06, + "loss": 1.11571369, + "memory(GiB)": 302.58, + "step": 84660, + "train_speed(iter/s)": 0.123594 + }, + { + "acc": 0.72974343, + "epoch": 0.47357066859420593, + "grad_norm": 7.53125, + "learning_rate": 9.076108524001874e-06, + "loss": 1.06726933, + "memory(GiB)": 302.58, + "step": 84680, + "train_speed(iter/s)": 0.123608 + }, + { + "acc": 0.73906527, + "epoch": 0.4736825180671852, + "grad_norm": 8.0, + "learning_rate": 9.075572912334051e-06, + "loss": 1.01552382, + "memory(GiB)": 302.58, + "step": 84700, + "train_speed(iter/s)": 0.123622 + }, + { + "acc": 0.76062956, + "epoch": 0.47379436754016446, + "grad_norm": 7.53125, + "learning_rate": 9.075037161268714e-06, + "loss": 0.92813597, + "memory(GiB)": 302.58, + "step": 84720, + "train_speed(iter/s)": 0.123637 + }, + { + "acc": 0.7126277, + "epoch": 0.4739062170131437, + "grad_norm": 4.9375, + "learning_rate": 9.074501270824182e-06, + "loss": 1.14580374, + "memory(GiB)": 302.58, + "step": 84740, + "train_speed(iter/s)": 0.12365 + }, + { + "acc": 0.75114822, + "epoch": 0.474018066486123, + "grad_norm": 5.46875, + "learning_rate": 9.07396524101879e-06, + "loss": 0.96815386, + "memory(GiB)": 302.58, + "step": 84760, + "train_speed(iter/s)": 0.123663 + }, + { + "acc": 0.74195056, + "epoch": 0.47412991595910225, + "grad_norm": 7.875, + "learning_rate": 9.073429071870864e-06, + "loss": 1.00779791, + "memory(GiB)": 302.58, + "step": 84780, + "train_speed(iter/s)": 0.123677 + }, + { + "acc": 0.73545451, + "epoch": 0.4742417654320815, + "grad_norm": 9.0625, + "learning_rate": 9.07289276339875e-06, + "loss": 1.03283758, + "memory(GiB)": 302.58, + "step": 84800, + "train_speed(iter/s)": 0.123691 + }, + { + "acc": 0.73942566, + "epoch": 0.4743536149050608, + "grad_norm": 4.75, + "learning_rate": 9.072356315620788e-06, + "loss": 1.03322983, + "memory(GiB)": 302.58, + "step": 84820, + "train_speed(iter/s)": 0.123705 + }, + { + "acc": 0.73960681, + "epoch": 0.47446546437804005, + "grad_norm": 7.4375, + "learning_rate": 9.071819728555327e-06, + "loss": 1.02232399, + "memory(GiB)": 302.58, + "step": 84840, + "train_speed(iter/s)": 0.123719 + }, + { + "acc": 0.7347815, + "epoch": 0.4745773138510193, + "grad_norm": 7.09375, + "learning_rate": 9.07128300222072e-06, + "loss": 1.05916843, + "memory(GiB)": 302.58, + "step": 84860, + "train_speed(iter/s)": 0.123733 + }, + { + "acc": 0.74794264, + "epoch": 0.4746891633239986, + "grad_norm": 5.9375, + "learning_rate": 9.070746136635324e-06, + "loss": 0.97593317, + "memory(GiB)": 302.58, + "step": 84880, + "train_speed(iter/s)": 0.123746 + }, + { + "acc": 0.73426151, + "epoch": 0.47480101279697784, + "grad_norm": 7.59375, + "learning_rate": 9.070209131817503e-06, + "loss": 1.05493002, + "memory(GiB)": 302.58, + "step": 84900, + "train_speed(iter/s)": 0.12376 + }, + { + "acc": 0.7323451, + "epoch": 0.4749128622699571, + "grad_norm": 4.75, + "learning_rate": 9.069671987785623e-06, + "loss": 1.04645338, + "memory(GiB)": 302.58, + "step": 84920, + "train_speed(iter/s)": 0.123773 + }, + { + "acc": 0.73390822, + "epoch": 0.47502471174293637, + "grad_norm": 8.875, + "learning_rate": 9.069134704558057e-06, + "loss": 1.059445, + "memory(GiB)": 302.58, + "step": 84940, + "train_speed(iter/s)": 0.123787 + }, + { + "acc": 0.75270762, + "epoch": 0.47513656121591563, + "grad_norm": 5.21875, + "learning_rate": 9.068597282153182e-06, + "loss": 0.94876661, + "memory(GiB)": 302.58, + "step": 84960, + "train_speed(iter/s)": 0.123801 + }, + { + "acc": 0.72975202, + "epoch": 0.4752484106888949, + "grad_norm": 6.84375, + "learning_rate": 9.068059720589377e-06, + "loss": 1.05823927, + "memory(GiB)": 302.58, + "step": 84980, + "train_speed(iter/s)": 0.123815 + }, + { + "acc": 0.73397965, + "epoch": 0.47536026016187416, + "grad_norm": 15.5, + "learning_rate": 9.06752201988503e-06, + "loss": 1.05330706, + "memory(GiB)": 302.58, + "step": 85000, + "train_speed(iter/s)": 0.123829 + }, + { + "acc": 0.7227138, + "epoch": 0.4754721096348534, + "grad_norm": 7.25, + "learning_rate": 9.066984180058533e-06, + "loss": 1.08445311, + "memory(GiB)": 302.58, + "step": 85020, + "train_speed(iter/s)": 0.123842 + }, + { + "acc": 0.73005114, + "epoch": 0.4755839591078327, + "grad_norm": 8.9375, + "learning_rate": 9.066446201128281e-06, + "loss": 1.04778585, + "memory(GiB)": 302.58, + "step": 85040, + "train_speed(iter/s)": 0.123857 + }, + { + "acc": 0.73918061, + "epoch": 0.47569580858081195, + "grad_norm": 9.3125, + "learning_rate": 9.065908083112675e-06, + "loss": 1.02311935, + "memory(GiB)": 302.58, + "step": 85060, + "train_speed(iter/s)": 0.123871 + }, + { + "acc": 0.72729607, + "epoch": 0.4758076580537912, + "grad_norm": 7.0625, + "learning_rate": 9.06536982603012e-06, + "loss": 1.09169455, + "memory(GiB)": 302.58, + "step": 85080, + "train_speed(iter/s)": 0.123884 + }, + { + "acc": 0.72992473, + "epoch": 0.4759195075267705, + "grad_norm": 6.09375, + "learning_rate": 9.064831429899024e-06, + "loss": 1.06710844, + "memory(GiB)": 302.58, + "step": 85100, + "train_speed(iter/s)": 0.123898 + }, + { + "acc": 0.72502055, + "epoch": 0.47603135699974974, + "grad_norm": 6.59375, + "learning_rate": 9.064292894737804e-06, + "loss": 1.09495573, + "memory(GiB)": 302.58, + "step": 85120, + "train_speed(iter/s)": 0.123911 + }, + { + "acc": 0.73940773, + "epoch": 0.476143206472729, + "grad_norm": 5.96875, + "learning_rate": 9.063754220564883e-06, + "loss": 0.99657955, + "memory(GiB)": 302.58, + "step": 85140, + "train_speed(iter/s)": 0.123925 + }, + { + "acc": 0.75447612, + "epoch": 0.47625505594570827, + "grad_norm": 8.875, + "learning_rate": 9.063215407398678e-06, + "loss": 0.95815697, + "memory(GiB)": 302.58, + "step": 85160, + "train_speed(iter/s)": 0.123939 + }, + { + "acc": 0.75287151, + "epoch": 0.47636690541868754, + "grad_norm": 8.0625, + "learning_rate": 9.062676455257624e-06, + "loss": 0.97024975, + "memory(GiB)": 302.58, + "step": 85180, + "train_speed(iter/s)": 0.123951 + }, + { + "acc": 0.75341549, + "epoch": 0.4764787548916668, + "grad_norm": 4.09375, + "learning_rate": 9.062137364160152e-06, + "loss": 0.96924219, + "memory(GiB)": 302.58, + "step": 85200, + "train_speed(iter/s)": 0.123965 + }, + { + "acc": 0.74301066, + "epoch": 0.47659060436464606, + "grad_norm": 8.0, + "learning_rate": 9.061598134124702e-06, + "loss": 1.02272491, + "memory(GiB)": 302.58, + "step": 85220, + "train_speed(iter/s)": 0.123979 + }, + { + "acc": 0.73397913, + "epoch": 0.47670245383762533, + "grad_norm": 5.53125, + "learning_rate": 9.061058765169717e-06, + "loss": 1.05704842, + "memory(GiB)": 302.58, + "step": 85240, + "train_speed(iter/s)": 0.123992 + }, + { + "acc": 0.72667904, + "epoch": 0.4768143033106046, + "grad_norm": 6.25, + "learning_rate": 9.060519257313645e-06, + "loss": 1.08494539, + "memory(GiB)": 302.58, + "step": 85260, + "train_speed(iter/s)": 0.124005 + }, + { + "acc": 0.73733268, + "epoch": 0.47692615278358386, + "grad_norm": 8.0625, + "learning_rate": 9.059979610574937e-06, + "loss": 1.02421522, + "memory(GiB)": 302.58, + "step": 85280, + "train_speed(iter/s)": 0.124019 + }, + { + "acc": 0.71853852, + "epoch": 0.4770380022565631, + "grad_norm": 6.5625, + "learning_rate": 9.059439824972054e-06, + "loss": 1.11291952, + "memory(GiB)": 302.58, + "step": 85300, + "train_speed(iter/s)": 0.124033 + }, + { + "acc": 0.73608751, + "epoch": 0.4771498517295424, + "grad_norm": 7.875, + "learning_rate": 9.058899900523457e-06, + "loss": 1.04104071, + "memory(GiB)": 302.58, + "step": 85320, + "train_speed(iter/s)": 0.124046 + }, + { + "acc": 0.74554706, + "epoch": 0.47726170120252165, + "grad_norm": 6.9375, + "learning_rate": 9.058359837247612e-06, + "loss": 0.99858303, + "memory(GiB)": 302.58, + "step": 85340, + "train_speed(iter/s)": 0.12406 + }, + { + "acc": 0.73054962, + "epoch": 0.4773735506755009, + "grad_norm": 8.5, + "learning_rate": 9.057819635162994e-06, + "loss": 1.06286411, + "memory(GiB)": 302.58, + "step": 85360, + "train_speed(iter/s)": 0.124074 + }, + { + "acc": 0.7300384, + "epoch": 0.4774854001484802, + "grad_norm": 8.75, + "learning_rate": 9.057279294288075e-06, + "loss": 1.05617085, + "memory(GiB)": 302.58, + "step": 85380, + "train_speed(iter/s)": 0.124086 + }, + { + "acc": 0.73474922, + "epoch": 0.47759724962145944, + "grad_norm": 8.8125, + "learning_rate": 9.05673881464134e-06, + "loss": 1.02182083, + "memory(GiB)": 302.58, + "step": 85400, + "train_speed(iter/s)": 0.1241 + }, + { + "acc": 0.74330578, + "epoch": 0.4777090990944387, + "grad_norm": 6.1875, + "learning_rate": 9.056198196241275e-06, + "loss": 1.02762804, + "memory(GiB)": 302.58, + "step": 85420, + "train_speed(iter/s)": 0.124114 + }, + { + "acc": 0.73770962, + "epoch": 0.47782094856741797, + "grad_norm": 6.28125, + "learning_rate": 9.055657439106369e-06, + "loss": 1.03123732, + "memory(GiB)": 302.58, + "step": 85440, + "train_speed(iter/s)": 0.124128 + }, + { + "acc": 0.71670604, + "epoch": 0.47793279804039723, + "grad_norm": 8.4375, + "learning_rate": 9.055116543255117e-06, + "loss": 1.13967552, + "memory(GiB)": 302.58, + "step": 85460, + "train_speed(iter/s)": 0.124142 + }, + { + "acc": 0.73223667, + "epoch": 0.4780446475133765, + "grad_norm": 9.75, + "learning_rate": 9.054575508706022e-06, + "loss": 1.05300627, + "memory(GiB)": 302.58, + "step": 85480, + "train_speed(iter/s)": 0.124155 + }, + { + "acc": 0.75506887, + "epoch": 0.47815649698635576, + "grad_norm": 4.28125, + "learning_rate": 9.054034335477587e-06, + "loss": 0.94743729, + "memory(GiB)": 302.58, + "step": 85500, + "train_speed(iter/s)": 0.124169 + }, + { + "acc": 0.72006497, + "epoch": 0.478268346459335, + "grad_norm": 4.78125, + "learning_rate": 9.053493023588324e-06, + "loss": 1.12265234, + "memory(GiB)": 302.58, + "step": 85520, + "train_speed(iter/s)": 0.124183 + }, + { + "acc": 0.74234819, + "epoch": 0.4783801959323143, + "grad_norm": 5.90625, + "learning_rate": 9.052951573056746e-06, + "loss": 1.01539745, + "memory(GiB)": 302.58, + "step": 85540, + "train_speed(iter/s)": 0.124196 + }, + { + "acc": 0.72648811, + "epoch": 0.47849204540529355, + "grad_norm": 5.5625, + "learning_rate": 9.052409983901372e-06, + "loss": 1.08144751, + "memory(GiB)": 302.58, + "step": 85560, + "train_speed(iter/s)": 0.124209 + }, + { + "acc": 0.72783241, + "epoch": 0.4786038948782728, + "grad_norm": 7.4375, + "learning_rate": 9.051868256140728e-06, + "loss": 1.06842318, + "memory(GiB)": 302.58, + "step": 85580, + "train_speed(iter/s)": 0.124224 + }, + { + "acc": 0.72876029, + "epoch": 0.4787157443512521, + "grad_norm": 6.125, + "learning_rate": 9.05132638979334e-06, + "loss": 1.07933989, + "memory(GiB)": 302.58, + "step": 85600, + "train_speed(iter/s)": 0.124237 + }, + { + "acc": 0.74447427, + "epoch": 0.47882759382423135, + "grad_norm": 7.28125, + "learning_rate": 9.050784384877744e-06, + "loss": 1.01104765, + "memory(GiB)": 302.58, + "step": 85620, + "train_speed(iter/s)": 0.124251 + }, + { + "acc": 0.73842626, + "epoch": 0.4789394432972106, + "grad_norm": 7.625, + "learning_rate": 9.050242241412478e-06, + "loss": 1.02255421, + "memory(GiB)": 302.58, + "step": 85640, + "train_speed(iter/s)": 0.124264 + }, + { + "acc": 0.73785477, + "epoch": 0.4790512927701899, + "grad_norm": 6.90625, + "learning_rate": 9.049699959416082e-06, + "loss": 1.01732073, + "memory(GiB)": 302.58, + "step": 85660, + "train_speed(iter/s)": 0.124278 + }, + { + "acc": 0.7495759, + "epoch": 0.47916314224316914, + "grad_norm": 7.0, + "learning_rate": 9.04915753890711e-06, + "loss": 0.98113489, + "memory(GiB)": 302.58, + "step": 85680, + "train_speed(iter/s)": 0.124292 + }, + { + "acc": 0.7264358, + "epoch": 0.4792749917161484, + "grad_norm": 7.53125, + "learning_rate": 9.048614979904107e-06, + "loss": 1.07731829, + "memory(GiB)": 302.58, + "step": 85700, + "train_speed(iter/s)": 0.124307 + }, + { + "acc": 0.72630582, + "epoch": 0.47938684118912767, + "grad_norm": 6.90625, + "learning_rate": 9.048072282425636e-06, + "loss": 1.08000231, + "memory(GiB)": 302.58, + "step": 85720, + "train_speed(iter/s)": 0.124319 + }, + { + "acc": 0.74009414, + "epoch": 0.47949869066210693, + "grad_norm": 6.71875, + "learning_rate": 9.047529446490256e-06, + "loss": 1.02068539, + "memory(GiB)": 302.58, + "step": 85740, + "train_speed(iter/s)": 0.124334 + }, + { + "acc": 0.74191484, + "epoch": 0.4796105401350862, + "grad_norm": 5.8125, + "learning_rate": 9.046986472116534e-06, + "loss": 0.99485989, + "memory(GiB)": 302.58, + "step": 85760, + "train_speed(iter/s)": 0.124348 + }, + { + "acc": 0.72699146, + "epoch": 0.47972238960806546, + "grad_norm": 7.4375, + "learning_rate": 9.046443359323043e-06, + "loss": 1.07864084, + "memory(GiB)": 302.58, + "step": 85780, + "train_speed(iter/s)": 0.124362 + }, + { + "acc": 0.72314858, + "epoch": 0.4798342390810447, + "grad_norm": 6.9375, + "learning_rate": 9.045900108128358e-06, + "loss": 1.09684572, + "memory(GiB)": 302.58, + "step": 85800, + "train_speed(iter/s)": 0.124376 + }, + { + "acc": 0.72476282, + "epoch": 0.479946088554024, + "grad_norm": 8.625, + "learning_rate": 9.045356718551059e-06, + "loss": 1.08907127, + "memory(GiB)": 302.58, + "step": 85820, + "train_speed(iter/s)": 0.12439 + }, + { + "acc": 0.7465951, + "epoch": 0.48005793802700325, + "grad_norm": 7.21875, + "learning_rate": 9.044813190609734e-06, + "loss": 1.0192152, + "memory(GiB)": 302.58, + "step": 85840, + "train_speed(iter/s)": 0.124403 + }, + { + "acc": 0.73421907, + "epoch": 0.4801697874999825, + "grad_norm": 6.875, + "learning_rate": 9.04426952432297e-06, + "loss": 1.04179783, + "memory(GiB)": 302.58, + "step": 85860, + "train_speed(iter/s)": 0.124416 + }, + { + "acc": 0.74025793, + "epoch": 0.4802816369729618, + "grad_norm": 6.9375, + "learning_rate": 9.043725719709368e-06, + "loss": 1.02144976, + "memory(GiB)": 302.58, + "step": 85880, + "train_speed(iter/s)": 0.12443 + }, + { + "acc": 0.74751973, + "epoch": 0.48039348644594104, + "grad_norm": 10.75, + "learning_rate": 9.043181776787522e-06, + "loss": 0.99909983, + "memory(GiB)": 302.58, + "step": 85900, + "train_speed(iter/s)": 0.124444 + }, + { + "acc": 0.75640078, + "epoch": 0.4805053359189203, + "grad_norm": 8.125, + "learning_rate": 9.042637695576038e-06, + "loss": 0.93440332, + "memory(GiB)": 302.58, + "step": 85920, + "train_speed(iter/s)": 0.124458 + }, + { + "acc": 0.73038645, + "epoch": 0.48061718539189957, + "grad_norm": 7.3125, + "learning_rate": 9.042093476093527e-06, + "loss": 1.05961657, + "memory(GiB)": 302.58, + "step": 85940, + "train_speed(iter/s)": 0.124471 + }, + { + "acc": 0.72817359, + "epoch": 0.48072903486487883, + "grad_norm": 6.40625, + "learning_rate": 9.041549118358601e-06, + "loss": 1.07654676, + "memory(GiB)": 302.58, + "step": 85960, + "train_speed(iter/s)": 0.124485 + }, + { + "acc": 0.74068089, + "epoch": 0.4808408843378581, + "grad_norm": 6.75, + "learning_rate": 9.04100462238988e-06, + "loss": 1.02979593, + "memory(GiB)": 302.58, + "step": 85980, + "train_speed(iter/s)": 0.124499 + }, + { + "acc": 0.72793441, + "epoch": 0.48095273381083736, + "grad_norm": 6.9375, + "learning_rate": 9.04045998820599e-06, + "loss": 1.05911417, + "memory(GiB)": 302.58, + "step": 86000, + "train_speed(iter/s)": 0.124513 + }, + { + "epoch": 0.48095273381083736, + "eval_acc": 0.7013426669896592, + "eval_loss": 1.036531686782837, + "eval_runtime": 7506.327, + "eval_samples_per_second": 10.029, + "eval_steps_per_second": 10.029, + "step": 86000 + }, + { + "acc": 0.74097271, + "epoch": 0.4810645832838166, + "grad_norm": 6.8125, + "learning_rate": 9.039915215825552e-06, + "loss": 1.02698345, + "memory(GiB)": 302.58, + "step": 86020, + "train_speed(iter/s)": 0.123165 + }, + { + "acc": 0.73111029, + "epoch": 0.4811764327567959, + "grad_norm": 8.125, + "learning_rate": 9.039370305267206e-06, + "loss": 1.06512785, + "memory(GiB)": 302.58, + "step": 86040, + "train_speed(iter/s)": 0.123178 + }, + { + "acc": 0.73160777, + "epoch": 0.48128828222977516, + "grad_norm": 6.40625, + "learning_rate": 9.038825256549585e-06, + "loss": 1.06347713, + "memory(GiB)": 302.58, + "step": 86060, + "train_speed(iter/s)": 0.123193 + }, + { + "acc": 0.73580046, + "epoch": 0.4814001317027544, + "grad_norm": 5.09375, + "learning_rate": 9.038280069691336e-06, + "loss": 1.03197289, + "memory(GiB)": 302.58, + "step": 86080, + "train_speed(iter/s)": 0.123207 + }, + { + "acc": 0.73937654, + "epoch": 0.4815119811757337, + "grad_norm": 9.6875, + "learning_rate": 9.037734744711104e-06, + "loss": 1.02380962, + "memory(GiB)": 302.58, + "step": 86100, + "train_speed(iter/s)": 0.12322 + }, + { + "acc": 0.72383614, + "epoch": 0.48162383064871295, + "grad_norm": 7.1875, + "learning_rate": 9.037189281627539e-06, + "loss": 1.10844402, + "memory(GiB)": 302.58, + "step": 86120, + "train_speed(iter/s)": 0.123234 + }, + { + "acc": 0.72639027, + "epoch": 0.4817356801216922, + "grad_norm": 8.5, + "learning_rate": 9.036643680459299e-06, + "loss": 1.08703337, + "memory(GiB)": 302.58, + "step": 86140, + "train_speed(iter/s)": 0.123247 + }, + { + "acc": 0.73647704, + "epoch": 0.4818475295946715, + "grad_norm": 7.375, + "learning_rate": 9.036097941225044e-06, + "loss": 1.05454922, + "memory(GiB)": 302.58, + "step": 86160, + "train_speed(iter/s)": 0.12326 + }, + { + "acc": 0.73608689, + "epoch": 0.48195937906765074, + "grad_norm": 5.71875, + "learning_rate": 9.035552063943443e-06, + "loss": 1.02664804, + "memory(GiB)": 302.58, + "step": 86180, + "train_speed(iter/s)": 0.123274 + }, + { + "acc": 0.740729, + "epoch": 0.48207122854063, + "grad_norm": 6.875, + "learning_rate": 9.035006048633163e-06, + "loss": 1.00079727, + "memory(GiB)": 302.58, + "step": 86200, + "train_speed(iter/s)": 0.123287 + }, + { + "acc": 0.72721887, + "epoch": 0.48218307801360927, + "grad_norm": 6.8125, + "learning_rate": 9.034459895312884e-06, + "loss": 1.07321529, + "memory(GiB)": 302.58, + "step": 86220, + "train_speed(iter/s)": 0.123301 + }, + { + "acc": 0.74144945, + "epoch": 0.48229492748658853, + "grad_norm": 5.8125, + "learning_rate": 9.033913604001284e-06, + "loss": 1.02408934, + "memory(GiB)": 302.58, + "step": 86240, + "train_speed(iter/s)": 0.123314 + }, + { + "acc": 0.72428412, + "epoch": 0.4824067769595678, + "grad_norm": 9.75, + "learning_rate": 9.033367174717046e-06, + "loss": 1.08519192, + "memory(GiB)": 302.58, + "step": 86260, + "train_speed(iter/s)": 0.123328 + }, + { + "acc": 0.73902755, + "epoch": 0.48251862643254706, + "grad_norm": 7.21875, + "learning_rate": 9.03282060747886e-06, + "loss": 1.0168004, + "memory(GiB)": 302.58, + "step": 86280, + "train_speed(iter/s)": 0.123342 + }, + { + "acc": 0.7452527, + "epoch": 0.4826304759055263, + "grad_norm": 5.4375, + "learning_rate": 9.032273902305424e-06, + "loss": 0.98906803, + "memory(GiB)": 302.58, + "step": 86300, + "train_speed(iter/s)": 0.123357 + }, + { + "acc": 0.74216733, + "epoch": 0.4827423253785056, + "grad_norm": 9.0625, + "learning_rate": 9.031727059215434e-06, + "loss": 1.02127724, + "memory(GiB)": 302.58, + "step": 86320, + "train_speed(iter/s)": 0.123371 + }, + { + "acc": 0.72864838, + "epoch": 0.48285417485148485, + "grad_norm": 7.6875, + "learning_rate": 9.031180078227593e-06, + "loss": 1.07252464, + "memory(GiB)": 302.58, + "step": 86340, + "train_speed(iter/s)": 0.123383 + }, + { + "acc": 0.74587379, + "epoch": 0.4829660243244641, + "grad_norm": 10.4375, + "learning_rate": 9.030632959360613e-06, + "loss": 0.98596697, + "memory(GiB)": 302.58, + "step": 86360, + "train_speed(iter/s)": 0.123397 + }, + { + "acc": 0.74156327, + "epoch": 0.4830778737974434, + "grad_norm": 5.28125, + "learning_rate": 9.030085702633202e-06, + "loss": 1.0171608, + "memory(GiB)": 302.58, + "step": 86380, + "train_speed(iter/s)": 0.123411 + }, + { + "acc": 0.732376, + "epoch": 0.48318972327042264, + "grad_norm": 9.0625, + "learning_rate": 9.029538308064083e-06, + "loss": 1.05608616, + "memory(GiB)": 302.58, + "step": 86400, + "train_speed(iter/s)": 0.123425 + }, + { + "acc": 0.72679796, + "epoch": 0.4833015727434019, + "grad_norm": 4.5, + "learning_rate": 9.028990775671977e-06, + "loss": 1.06178408, + "memory(GiB)": 302.58, + "step": 86420, + "train_speed(iter/s)": 0.123439 + }, + { + "acc": 0.72407093, + "epoch": 0.4834134222163812, + "grad_norm": 10.25, + "learning_rate": 9.028443105475608e-06, + "loss": 1.11552963, + "memory(GiB)": 302.58, + "step": 86440, + "train_speed(iter/s)": 0.123453 + }, + { + "acc": 0.74265547, + "epoch": 0.48352527168936044, + "grad_norm": 6.21875, + "learning_rate": 9.027895297493714e-06, + "loss": 1.02303448, + "memory(GiB)": 302.58, + "step": 86460, + "train_speed(iter/s)": 0.123467 + }, + { + "acc": 0.72647257, + "epoch": 0.4836371211623397, + "grad_norm": 5.625, + "learning_rate": 9.027347351745028e-06, + "loss": 1.07677889, + "memory(GiB)": 302.58, + "step": 86480, + "train_speed(iter/s)": 0.12348 + }, + { + "acc": 0.72578201, + "epoch": 0.48374897063531896, + "grad_norm": 10.1875, + "learning_rate": 9.026799268248292e-06, + "loss": 1.06854858, + "memory(GiB)": 302.58, + "step": 86500, + "train_speed(iter/s)": 0.123494 + }, + { + "acc": 0.74395976, + "epoch": 0.48386082010829823, + "grad_norm": 6.9375, + "learning_rate": 9.026251047022253e-06, + "loss": 0.9944335, + "memory(GiB)": 302.58, + "step": 86520, + "train_speed(iter/s)": 0.123507 + }, + { + "acc": 0.73718615, + "epoch": 0.4839726695812775, + "grad_norm": 7.28125, + "learning_rate": 9.025702688085659e-06, + "loss": 1.03252811, + "memory(GiB)": 302.58, + "step": 86540, + "train_speed(iter/s)": 0.123521 + }, + { + "acc": 0.73813362, + "epoch": 0.48408451905425676, + "grad_norm": 8.9375, + "learning_rate": 9.02515419145727e-06, + "loss": 1.02424717, + "memory(GiB)": 302.58, + "step": 86560, + "train_speed(iter/s)": 0.123535 + }, + { + "acc": 0.74081593, + "epoch": 0.484196368527236, + "grad_norm": 8.1875, + "learning_rate": 9.024605557155844e-06, + "loss": 1.02158279, + "memory(GiB)": 302.58, + "step": 86580, + "train_speed(iter/s)": 0.123548 + }, + { + "acc": 0.73876185, + "epoch": 0.4843082180002153, + "grad_norm": 6.84375, + "learning_rate": 9.024056785200145e-06, + "loss": 1.01910362, + "memory(GiB)": 302.58, + "step": 86600, + "train_speed(iter/s)": 0.123561 + }, + { + "acc": 0.73538308, + "epoch": 0.48442006747319455, + "grad_norm": 5.53125, + "learning_rate": 9.023507875608945e-06, + "loss": 1.03665152, + "memory(GiB)": 302.58, + "step": 86620, + "train_speed(iter/s)": 0.123574 + }, + { + "acc": 0.74067855, + "epoch": 0.4845319169461738, + "grad_norm": 6.21875, + "learning_rate": 9.022958828401018e-06, + "loss": 1.0282012, + "memory(GiB)": 302.58, + "step": 86640, + "train_speed(iter/s)": 0.123588 + }, + { + "acc": 0.7336256, + "epoch": 0.4846437664191531, + "grad_norm": 5.40625, + "learning_rate": 9.022409643595142e-06, + "loss": 1.05774298, + "memory(GiB)": 302.58, + "step": 86660, + "train_speed(iter/s)": 0.123602 + }, + { + "acc": 0.72988, + "epoch": 0.48475561589213234, + "grad_norm": 6.78125, + "learning_rate": 9.021860321210101e-06, + "loss": 1.05668087, + "memory(GiB)": 302.58, + "step": 86680, + "train_speed(iter/s)": 0.123616 + }, + { + "acc": 0.73833976, + "epoch": 0.4848674653651116, + "grad_norm": 5.71875, + "learning_rate": 9.021310861264685e-06, + "loss": 1.04991856, + "memory(GiB)": 302.58, + "step": 86700, + "train_speed(iter/s)": 0.12363 + }, + { + "acc": 0.72900405, + "epoch": 0.4849793148380909, + "grad_norm": 7.625, + "learning_rate": 9.020761263777685e-06, + "loss": 1.07516422, + "memory(GiB)": 302.58, + "step": 86720, + "train_speed(iter/s)": 0.123642 + }, + { + "acc": 0.73323708, + "epoch": 0.4850911643110702, + "grad_norm": 7.40625, + "learning_rate": 9.020211528767902e-06, + "loss": 1.04215765, + "memory(GiB)": 302.58, + "step": 86740, + "train_speed(iter/s)": 0.123655 + }, + { + "acc": 0.73404512, + "epoch": 0.48520301378404945, + "grad_norm": 7.53125, + "learning_rate": 9.019661656254135e-06, + "loss": 1.05920076, + "memory(GiB)": 302.58, + "step": 86760, + "train_speed(iter/s)": 0.123669 + }, + { + "acc": 0.73640003, + "epoch": 0.4853148632570287, + "grad_norm": 8.875, + "learning_rate": 9.019111646255195e-06, + "loss": 1.02323732, + "memory(GiB)": 302.58, + "step": 86780, + "train_speed(iter/s)": 0.123683 + }, + { + "acc": 0.71959414, + "epoch": 0.485426712730008, + "grad_norm": 5.125, + "learning_rate": 9.01856149878989e-06, + "loss": 1.12508812, + "memory(GiB)": 302.58, + "step": 86800, + "train_speed(iter/s)": 0.123696 + }, + { + "acc": 0.74264574, + "epoch": 0.48553856220298724, + "grad_norm": 5.5, + "learning_rate": 9.018011213877043e-06, + "loss": 1.0195487, + "memory(GiB)": 302.58, + "step": 86820, + "train_speed(iter/s)": 0.12371 + }, + { + "acc": 0.72712808, + "epoch": 0.4856504116759665, + "grad_norm": 8.25, + "learning_rate": 9.017460791535469e-06, + "loss": 1.07855473, + "memory(GiB)": 302.58, + "step": 86840, + "train_speed(iter/s)": 0.123724 + }, + { + "acc": 0.72223415, + "epoch": 0.4857622611489458, + "grad_norm": 5.4375, + "learning_rate": 9.016910231783997e-06, + "loss": 1.1118784, + "memory(GiB)": 302.58, + "step": 86860, + "train_speed(iter/s)": 0.123737 + }, + { + "acc": 0.7234169, + "epoch": 0.48587411062192504, + "grad_norm": 7.03125, + "learning_rate": 9.016359534641459e-06, + "loss": 1.08067513, + "memory(GiB)": 302.58, + "step": 86880, + "train_speed(iter/s)": 0.123751 + }, + { + "acc": 0.73055911, + "epoch": 0.4859859600949043, + "grad_norm": 8.125, + "learning_rate": 9.01580870012669e-06, + "loss": 1.06302338, + "memory(GiB)": 302.58, + "step": 86900, + "train_speed(iter/s)": 0.123764 + }, + { + "acc": 0.72357135, + "epoch": 0.48609780956788357, + "grad_norm": 6.375, + "learning_rate": 9.015257728258529e-06, + "loss": 1.08817577, + "memory(GiB)": 302.58, + "step": 86920, + "train_speed(iter/s)": 0.123777 + }, + { + "acc": 0.72154837, + "epoch": 0.48620965904086283, + "grad_norm": 6.9375, + "learning_rate": 9.01470661905582e-06, + "loss": 1.09674473, + "memory(GiB)": 302.58, + "step": 86940, + "train_speed(iter/s)": 0.12379 + }, + { + "acc": 0.73969116, + "epoch": 0.4863215085138421, + "grad_norm": 7.75, + "learning_rate": 9.014155372537416e-06, + "loss": 1.01407413, + "memory(GiB)": 302.58, + "step": 86960, + "train_speed(iter/s)": 0.123803 + }, + { + "acc": 0.73746271, + "epoch": 0.48643335798682136, + "grad_norm": 6.9375, + "learning_rate": 9.013603988722168e-06, + "loss": 1.01092472, + "memory(GiB)": 302.58, + "step": 86980, + "train_speed(iter/s)": 0.123817 + }, + { + "acc": 0.73125, + "epoch": 0.4865452074598006, + "grad_norm": 6.5, + "learning_rate": 9.01305246762894e-06, + "loss": 1.06602669, + "memory(GiB)": 302.58, + "step": 87000, + "train_speed(iter/s)": 0.12383 + }, + { + "acc": 0.73274546, + "epoch": 0.4866570569327799, + "grad_norm": 9.5, + "learning_rate": 9.01250080927659e-06, + "loss": 1.03494482, + "memory(GiB)": 302.58, + "step": 87020, + "train_speed(iter/s)": 0.123843 + }, + { + "acc": 0.73186431, + "epoch": 0.48676890640575915, + "grad_norm": 8.625, + "learning_rate": 9.01194901368399e-06, + "loss": 1.0589777, + "memory(GiB)": 302.58, + "step": 87040, + "train_speed(iter/s)": 0.123856 + }, + { + "acc": 0.72635355, + "epoch": 0.4868807558787384, + "grad_norm": 6.78125, + "learning_rate": 9.011397080870011e-06, + "loss": 1.1023756, + "memory(GiB)": 302.58, + "step": 87060, + "train_speed(iter/s)": 0.123869 + }, + { + "acc": 0.73508506, + "epoch": 0.4869926053517177, + "grad_norm": 6.5625, + "learning_rate": 9.010845010853533e-06, + "loss": 1.05371275, + "memory(GiB)": 302.58, + "step": 87080, + "train_speed(iter/s)": 0.123882 + }, + { + "acc": 0.71826105, + "epoch": 0.48710445482469694, + "grad_norm": 5.34375, + "learning_rate": 9.010292803653438e-06, + "loss": 1.12982998, + "memory(GiB)": 302.58, + "step": 87100, + "train_speed(iter/s)": 0.123895 + }, + { + "acc": 0.73893442, + "epoch": 0.4872163042976762, + "grad_norm": 5.53125, + "learning_rate": 9.009740459288612e-06, + "loss": 1.03760777, + "memory(GiB)": 302.58, + "step": 87120, + "train_speed(iter/s)": 0.123906 + }, + { + "acc": 0.72337565, + "epoch": 0.48732815377065547, + "grad_norm": 7.03125, + "learning_rate": 9.009187977777947e-06, + "loss": 1.09810047, + "memory(GiB)": 302.58, + "step": 87140, + "train_speed(iter/s)": 0.123919 + }, + { + "acc": 0.7399416, + "epoch": 0.48744000324363473, + "grad_norm": 7.03125, + "learning_rate": 9.008635359140343e-06, + "loss": 1.04414072, + "memory(GiB)": 302.58, + "step": 87160, + "train_speed(iter/s)": 0.123932 + }, + { + "acc": 0.73971481, + "epoch": 0.487551852716614, + "grad_norm": 8.4375, + "learning_rate": 9.008082603394696e-06, + "loss": 1.03705511, + "memory(GiB)": 302.58, + "step": 87180, + "train_speed(iter/s)": 0.123945 + }, + { + "acc": 0.73350267, + "epoch": 0.48766370218959326, + "grad_norm": 5.0625, + "learning_rate": 9.007529710559916e-06, + "loss": 1.04546881, + "memory(GiB)": 302.58, + "step": 87200, + "train_speed(iter/s)": 0.123958 + }, + { + "acc": 0.73864937, + "epoch": 0.4877755516625725, + "grad_norm": 8.5, + "learning_rate": 9.006976680654914e-06, + "loss": 1.0192358, + "memory(GiB)": 302.58, + "step": 87220, + "train_speed(iter/s)": 0.123971 + }, + { + "acc": 0.74164948, + "epoch": 0.4878874011355518, + "grad_norm": 7.65625, + "learning_rate": 9.0064235136986e-06, + "loss": 0.999928, + "memory(GiB)": 302.58, + "step": 87240, + "train_speed(iter/s)": 0.123985 + }, + { + "acc": 0.74518352, + "epoch": 0.48799925060853105, + "grad_norm": 6.3125, + "learning_rate": 9.005870209709899e-06, + "loss": 0.99231701, + "memory(GiB)": 302.58, + "step": 87260, + "train_speed(iter/s)": 0.123999 + }, + { + "acc": 0.71964741, + "epoch": 0.4881111000815103, + "grad_norm": 8.125, + "learning_rate": 9.005316768707737e-06, + "loss": 1.0939414, + "memory(GiB)": 302.58, + "step": 87280, + "train_speed(iter/s)": 0.124013 + }, + { + "acc": 0.72108254, + "epoch": 0.4882229495544896, + "grad_norm": 6.625, + "learning_rate": 9.00476319071104e-06, + "loss": 1.09214821, + "memory(GiB)": 302.58, + "step": 87300, + "train_speed(iter/s)": 0.124027 + }, + { + "acc": 0.74055743, + "epoch": 0.48833479902746885, + "grad_norm": 8.9375, + "learning_rate": 9.00420947573874e-06, + "loss": 1.01351271, + "memory(GiB)": 302.58, + "step": 87320, + "train_speed(iter/s)": 0.124041 + }, + { + "acc": 0.74321961, + "epoch": 0.4884466485004481, + "grad_norm": 5.84375, + "learning_rate": 9.00365562380978e-06, + "loss": 1.01678629, + "memory(GiB)": 302.58, + "step": 87340, + "train_speed(iter/s)": 0.124055 + }, + { + "acc": 0.74679513, + "epoch": 0.4885584979734274, + "grad_norm": 8.3125, + "learning_rate": 9.003101634943105e-06, + "loss": 1.00257139, + "memory(GiB)": 302.58, + "step": 87360, + "train_speed(iter/s)": 0.124068 + }, + { + "acc": 0.72388177, + "epoch": 0.48867034744640664, + "grad_norm": 7.9375, + "learning_rate": 9.00254750915766e-06, + "loss": 1.10053282, + "memory(GiB)": 302.58, + "step": 87380, + "train_speed(iter/s)": 0.124082 + }, + { + "acc": 0.72842951, + "epoch": 0.4887821969193859, + "grad_norm": 6.65625, + "learning_rate": 9.001993246472399e-06, + "loss": 1.07904119, + "memory(GiB)": 302.58, + "step": 87400, + "train_speed(iter/s)": 0.124095 + }, + { + "acc": 0.74125428, + "epoch": 0.48889404639236517, + "grad_norm": 6.84375, + "learning_rate": 9.001438846906276e-06, + "loss": 0.99628487, + "memory(GiB)": 302.58, + "step": 87420, + "train_speed(iter/s)": 0.124108 + }, + { + "acc": 0.74303484, + "epoch": 0.48900589586534443, + "grad_norm": 5.65625, + "learning_rate": 9.000884310478258e-06, + "loss": 1.03111115, + "memory(GiB)": 302.58, + "step": 87440, + "train_speed(iter/s)": 0.124121 + }, + { + "acc": 0.73326221, + "epoch": 0.4891177453383237, + "grad_norm": 8.8125, + "learning_rate": 9.00032963720731e-06, + "loss": 1.04942303, + "memory(GiB)": 302.58, + "step": 87460, + "train_speed(iter/s)": 0.124135 + }, + { + "acc": 0.72884026, + "epoch": 0.48922959481130296, + "grad_norm": 5.40625, + "learning_rate": 8.999774827112405e-06, + "loss": 1.05816717, + "memory(GiB)": 302.58, + "step": 87480, + "train_speed(iter/s)": 0.124148 + }, + { + "acc": 0.73504767, + "epoch": 0.4893414442842822, + "grad_norm": 10.5625, + "learning_rate": 8.999219880212516e-06, + "loss": 1.05088444, + "memory(GiB)": 302.58, + "step": 87500, + "train_speed(iter/s)": 0.124161 + }, + { + "acc": 0.74429493, + "epoch": 0.4894532937572615, + "grad_norm": 6.84375, + "learning_rate": 8.998664796526626e-06, + "loss": 1.00140209, + "memory(GiB)": 302.58, + "step": 87520, + "train_speed(iter/s)": 0.124176 + }, + { + "acc": 0.74210591, + "epoch": 0.48956514323024075, + "grad_norm": 4.6875, + "learning_rate": 8.998109576073722e-06, + "loss": 1.01785164, + "memory(GiB)": 302.58, + "step": 87540, + "train_speed(iter/s)": 0.12419 + }, + { + "acc": 0.73412766, + "epoch": 0.48967699270322, + "grad_norm": 11.625, + "learning_rate": 8.997554218872791e-06, + "loss": 1.06147814, + "memory(GiB)": 302.58, + "step": 87560, + "train_speed(iter/s)": 0.124204 + }, + { + "acc": 0.74314709, + "epoch": 0.4897888421761993, + "grad_norm": 10.75, + "learning_rate": 8.996998724942831e-06, + "loss": 1.0033432, + "memory(GiB)": 302.58, + "step": 87580, + "train_speed(iter/s)": 0.124218 + }, + { + "acc": 0.74101496, + "epoch": 0.48990069164917854, + "grad_norm": 9.5, + "learning_rate": 8.996443094302841e-06, + "loss": 1.00535288, + "memory(GiB)": 302.58, + "step": 87600, + "train_speed(iter/s)": 0.124232 + }, + { + "acc": 0.73289251, + "epoch": 0.4900125411221578, + "grad_norm": 7.59375, + "learning_rate": 8.995887326971823e-06, + "loss": 1.07175303, + "memory(GiB)": 302.58, + "step": 87620, + "train_speed(iter/s)": 0.124246 + }, + { + "acc": 0.74153981, + "epoch": 0.49012439059513707, + "grad_norm": 6.84375, + "learning_rate": 8.995331422968789e-06, + "loss": 1.01669645, + "memory(GiB)": 302.58, + "step": 87640, + "train_speed(iter/s)": 0.124259 + }, + { + "acc": 0.72342982, + "epoch": 0.49023624006811634, + "grad_norm": 8.1875, + "learning_rate": 8.994775382312753e-06, + "loss": 1.09139795, + "memory(GiB)": 302.58, + "step": 87660, + "train_speed(iter/s)": 0.124273 + }, + { + "acc": 0.72966332, + "epoch": 0.4903480895410956, + "grad_norm": 6.8125, + "learning_rate": 8.994219205022729e-06, + "loss": 1.06037521, + "memory(GiB)": 302.58, + "step": 87680, + "train_speed(iter/s)": 0.124287 + }, + { + "acc": 0.73795042, + "epoch": 0.49045993901407486, + "grad_norm": 5.1875, + "learning_rate": 8.993662891117745e-06, + "loss": 1.03601208, + "memory(GiB)": 302.58, + "step": 87700, + "train_speed(iter/s)": 0.1243 + }, + { + "acc": 0.7384923, + "epoch": 0.49057178848705413, + "grad_norm": 8.875, + "learning_rate": 8.993106440616825e-06, + "loss": 1.02744188, + "memory(GiB)": 302.58, + "step": 87720, + "train_speed(iter/s)": 0.124314 + }, + { + "acc": 0.73087149, + "epoch": 0.4906836379600334, + "grad_norm": 6.46875, + "learning_rate": 8.992549853539004e-06, + "loss": 1.07371683, + "memory(GiB)": 302.58, + "step": 87740, + "train_speed(iter/s)": 0.124326 + }, + { + "acc": 0.7337965, + "epoch": 0.49079548743301266, + "grad_norm": 9.6875, + "learning_rate": 8.991993129903318e-06, + "loss": 1.04990358, + "memory(GiB)": 302.58, + "step": 87760, + "train_speed(iter/s)": 0.124339 + }, + { + "acc": 0.744871, + "epoch": 0.4909073369059919, + "grad_norm": 8.3125, + "learning_rate": 8.991436269728808e-06, + "loss": 0.98600693, + "memory(GiB)": 302.58, + "step": 87780, + "train_speed(iter/s)": 0.124353 + }, + { + "acc": 0.74415278, + "epoch": 0.4910191863789712, + "grad_norm": 6.84375, + "learning_rate": 8.990879273034521e-06, + "loss": 1.00702419, + "memory(GiB)": 302.58, + "step": 87800, + "train_speed(iter/s)": 0.124366 + }, + { + "acc": 0.73826542, + "epoch": 0.49113103585195045, + "grad_norm": 9.1875, + "learning_rate": 8.99032213983951e-06, + "loss": 0.99016914, + "memory(GiB)": 302.58, + "step": 87820, + "train_speed(iter/s)": 0.124379 + }, + { + "acc": 0.72886958, + "epoch": 0.4912428853249297, + "grad_norm": 7.21875, + "learning_rate": 8.989764870162827e-06, + "loss": 1.0629509, + "memory(GiB)": 302.58, + "step": 87840, + "train_speed(iter/s)": 0.124393 + }, + { + "acc": 0.73366289, + "epoch": 0.491354734797909, + "grad_norm": 9.4375, + "learning_rate": 8.989207464023533e-06, + "loss": 1.05173483, + "memory(GiB)": 302.58, + "step": 87860, + "train_speed(iter/s)": 0.124407 + }, + { + "acc": 0.75599399, + "epoch": 0.49146658427088824, + "grad_norm": 7.09375, + "learning_rate": 8.988649921440697e-06, + "loss": 0.96821184, + "memory(GiB)": 302.58, + "step": 87880, + "train_speed(iter/s)": 0.124419 + }, + { + "acc": 0.73633313, + "epoch": 0.4915784337438675, + "grad_norm": 5.78125, + "learning_rate": 8.988092242433384e-06, + "loss": 1.05820436, + "memory(GiB)": 302.58, + "step": 87900, + "train_speed(iter/s)": 0.124433 + }, + { + "acc": 0.74688659, + "epoch": 0.49169028321684677, + "grad_norm": 8.4375, + "learning_rate": 8.987534427020672e-06, + "loss": 0.978337, + "memory(GiB)": 302.58, + "step": 87920, + "train_speed(iter/s)": 0.124447 + }, + { + "acc": 0.73323288, + "epoch": 0.49180213268982603, + "grad_norm": 7.40625, + "learning_rate": 8.986976475221636e-06, + "loss": 1.03837509, + "memory(GiB)": 302.58, + "step": 87940, + "train_speed(iter/s)": 0.124461 + }, + { + "acc": 0.74572453, + "epoch": 0.4919139821628053, + "grad_norm": 6.96875, + "learning_rate": 8.986418387055366e-06, + "loss": 0.97853041, + "memory(GiB)": 302.58, + "step": 87960, + "train_speed(iter/s)": 0.124475 + }, + { + "acc": 0.73669991, + "epoch": 0.49202583163578456, + "grad_norm": 5.78125, + "learning_rate": 8.985860162540944e-06, + "loss": 1.04026604, + "memory(GiB)": 302.58, + "step": 87980, + "train_speed(iter/s)": 0.124488 + }, + { + "acc": 0.73688602, + "epoch": 0.4921376811087638, + "grad_norm": 6.96875, + "learning_rate": 8.985301801697465e-06, + "loss": 1.0413372, + "memory(GiB)": 302.58, + "step": 88000, + "train_speed(iter/s)": 0.124502 + }, + { + "epoch": 0.4921376811087638, + "eval_acc": 0.7015223044788398, + "eval_loss": 1.0354808568954468, + "eval_runtime": 7501.8453, + "eval_samples_per_second": 10.035, + "eval_steps_per_second": 10.035, + "step": 88000 + }, + { + "acc": 0.72996588, + "epoch": 0.4922495305817431, + "grad_norm": 6.375, + "learning_rate": 8.98474330454403e-06, + "loss": 1.06001816, + "memory(GiB)": 302.58, + "step": 88020, + "train_speed(iter/s)": 0.123185 + }, + { + "acc": 0.74518862, + "epoch": 0.49236138005472235, + "grad_norm": 11.625, + "learning_rate": 8.984184671099736e-06, + "loss": 0.99119034, + "memory(GiB)": 302.58, + "step": 88040, + "train_speed(iter/s)": 0.123198 + }, + { + "acc": 0.74150643, + "epoch": 0.4924732295277016, + "grad_norm": 6.90625, + "learning_rate": 8.983625901383695e-06, + "loss": 1.04111872, + "memory(GiB)": 302.58, + "step": 88060, + "train_speed(iter/s)": 0.123212 + }, + { + "acc": 0.73617177, + "epoch": 0.4925850790006809, + "grad_norm": 6.1875, + "learning_rate": 8.983066995415016e-06, + "loss": 1.05134659, + "memory(GiB)": 302.58, + "step": 88080, + "train_speed(iter/s)": 0.123225 + }, + { + "acc": 0.73016934, + "epoch": 0.49269692847366015, + "grad_norm": 4.34375, + "learning_rate": 8.982507953212813e-06, + "loss": 1.06371698, + "memory(GiB)": 302.58, + "step": 88100, + "train_speed(iter/s)": 0.123237 + }, + { + "acc": 0.71525455, + "epoch": 0.4928087779466394, + "grad_norm": 6.0625, + "learning_rate": 8.981948774796213e-06, + "loss": 1.12974167, + "memory(GiB)": 302.58, + "step": 88120, + "train_speed(iter/s)": 0.12325 + }, + { + "acc": 0.73877602, + "epoch": 0.4929206274196187, + "grad_norm": 10.0, + "learning_rate": 8.981389460184338e-06, + "loss": 1.03364258, + "memory(GiB)": 302.58, + "step": 88140, + "train_speed(iter/s)": 0.123263 + }, + { + "acc": 0.74612961, + "epoch": 0.49303247689259794, + "grad_norm": 6.0, + "learning_rate": 8.980830009396317e-06, + "loss": 0.99500523, + "memory(GiB)": 302.58, + "step": 88160, + "train_speed(iter/s)": 0.123277 + }, + { + "acc": 0.75161662, + "epoch": 0.4931443263655772, + "grad_norm": 8.1875, + "learning_rate": 8.98027042245129e-06, + "loss": 0.98880415, + "memory(GiB)": 302.58, + "step": 88180, + "train_speed(iter/s)": 0.12329 + }, + { + "acc": 0.7524251, + "epoch": 0.49325617583855647, + "grad_norm": 7.90625, + "learning_rate": 8.97971069936839e-06, + "loss": 0.97565231, + "memory(GiB)": 302.58, + "step": 88200, + "train_speed(iter/s)": 0.123304 + }, + { + "acc": 0.74574332, + "epoch": 0.49336802531153573, + "grad_norm": 12.5625, + "learning_rate": 8.979150840166766e-06, + "loss": 1.0181448, + "memory(GiB)": 302.58, + "step": 88220, + "train_speed(iter/s)": 0.123316 + }, + { + "acc": 0.73873005, + "epoch": 0.493479874784515, + "grad_norm": 6.125, + "learning_rate": 8.978590844865566e-06, + "loss": 1.04129648, + "memory(GiB)": 302.58, + "step": 88240, + "train_speed(iter/s)": 0.123329 + }, + { + "acc": 0.72709141, + "epoch": 0.49359172425749426, + "grad_norm": 7.0625, + "learning_rate": 8.978030713483944e-06, + "loss": 1.10513973, + "memory(GiB)": 302.58, + "step": 88260, + "train_speed(iter/s)": 0.123342 + }, + { + "acc": 0.71493006, + "epoch": 0.4937035737304735, + "grad_norm": 5.96875, + "learning_rate": 8.977470446041056e-06, + "loss": 1.14666758, + "memory(GiB)": 302.58, + "step": 88280, + "train_speed(iter/s)": 0.123355 + }, + { + "acc": 0.74466491, + "epoch": 0.4938154232034528, + "grad_norm": 8.125, + "learning_rate": 8.976910042556068e-06, + "loss": 1.01545715, + "memory(GiB)": 302.58, + "step": 88300, + "train_speed(iter/s)": 0.123368 + }, + { + "acc": 0.75197773, + "epoch": 0.49392727267643205, + "grad_norm": 4.71875, + "learning_rate": 8.976349503048146e-06, + "loss": 0.97081337, + "memory(GiB)": 302.58, + "step": 88320, + "train_speed(iter/s)": 0.123382 + }, + { + "acc": 0.74672813, + "epoch": 0.4940391221494113, + "grad_norm": 5.59375, + "learning_rate": 8.975788827536461e-06, + "loss": 0.97496481, + "memory(GiB)": 302.58, + "step": 88340, + "train_speed(iter/s)": 0.123393 + }, + { + "acc": 0.73300328, + "epoch": 0.4941509716223906, + "grad_norm": 7.6875, + "learning_rate": 8.975228016040192e-06, + "loss": 1.05631113, + "memory(GiB)": 302.58, + "step": 88360, + "train_speed(iter/s)": 0.123407 + }, + { + "acc": 0.74084601, + "epoch": 0.49426282109536984, + "grad_norm": 8.625, + "learning_rate": 8.97466706857852e-06, + "loss": 1.0367919, + "memory(GiB)": 302.58, + "step": 88380, + "train_speed(iter/s)": 0.12342 + }, + { + "acc": 0.71582003, + "epoch": 0.4943746705683491, + "grad_norm": 5.84375, + "learning_rate": 8.97410598517063e-06, + "loss": 1.16031857, + "memory(GiB)": 302.58, + "step": 88400, + "train_speed(iter/s)": 0.123433 + }, + { + "acc": 0.73776751, + "epoch": 0.49448652004132837, + "grad_norm": 7.5625, + "learning_rate": 8.973544765835715e-06, + "loss": 1.08001842, + "memory(GiB)": 302.58, + "step": 88420, + "train_speed(iter/s)": 0.123446 + }, + { + "acc": 0.74315906, + "epoch": 0.49459836951430763, + "grad_norm": 6.0, + "learning_rate": 8.97298341059297e-06, + "loss": 0.99692469, + "memory(GiB)": 302.58, + "step": 88440, + "train_speed(iter/s)": 0.12346 + }, + { + "acc": 0.74421434, + "epoch": 0.4947102189872869, + "grad_norm": 6.75, + "learning_rate": 8.972421919461593e-06, + "loss": 1.00311003, + "memory(GiB)": 302.58, + "step": 88460, + "train_speed(iter/s)": 0.123473 + }, + { + "acc": 0.72804871, + "epoch": 0.49482206846026616, + "grad_norm": 8.0625, + "learning_rate": 8.97186029246079e-06, + "loss": 1.08111315, + "memory(GiB)": 302.58, + "step": 88480, + "train_speed(iter/s)": 0.123486 + }, + { + "acc": 0.73401394, + "epoch": 0.4949339179332454, + "grad_norm": 6.90625, + "learning_rate": 8.97129852960977e-06, + "loss": 1.07716846, + "memory(GiB)": 302.58, + "step": 88500, + "train_speed(iter/s)": 0.123499 + }, + { + "acc": 0.72928367, + "epoch": 0.4950457674062247, + "grad_norm": 7.03125, + "learning_rate": 8.97073663092775e-06, + "loss": 1.07231731, + "memory(GiB)": 302.58, + "step": 88520, + "train_speed(iter/s)": 0.123513 + }, + { + "acc": 0.75472064, + "epoch": 0.49515761687920395, + "grad_norm": 7.9375, + "learning_rate": 8.970174596433945e-06, + "loss": 0.94323864, + "memory(GiB)": 302.58, + "step": 88540, + "train_speed(iter/s)": 0.123527 + }, + { + "acc": 0.7348803, + "epoch": 0.4952694663521832, + "grad_norm": 8.0, + "learning_rate": 8.969612426147577e-06, + "loss": 1.06713057, + "memory(GiB)": 302.58, + "step": 88560, + "train_speed(iter/s)": 0.123539 + }, + { + "acc": 0.7221736, + "epoch": 0.4953813158251625, + "grad_norm": 6.40625, + "learning_rate": 8.969050120087881e-06, + "loss": 1.10224218, + "memory(GiB)": 302.58, + "step": 88580, + "train_speed(iter/s)": 0.123552 + }, + { + "acc": 0.73253026, + "epoch": 0.49549316529814175, + "grad_norm": 7.34375, + "learning_rate": 8.968487678274084e-06, + "loss": 1.06142578, + "memory(GiB)": 302.58, + "step": 88600, + "train_speed(iter/s)": 0.123565 + }, + { + "acc": 0.73087573, + "epoch": 0.495605014771121, + "grad_norm": 6.875, + "learning_rate": 8.967925100725424e-06, + "loss": 1.06686726, + "memory(GiB)": 302.58, + "step": 88620, + "train_speed(iter/s)": 0.123578 + }, + { + "acc": 0.73049207, + "epoch": 0.4957168642441003, + "grad_norm": 5.625, + "learning_rate": 8.967362387461143e-06, + "loss": 1.06838245, + "memory(GiB)": 302.58, + "step": 88640, + "train_speed(iter/s)": 0.123591 + }, + { + "acc": 0.72990713, + "epoch": 0.49582871371707954, + "grad_norm": 6.15625, + "learning_rate": 8.96679953850049e-06, + "loss": 1.07527447, + "memory(GiB)": 302.58, + "step": 88660, + "train_speed(iter/s)": 0.123605 + }, + { + "acc": 0.72223349, + "epoch": 0.4959405631900588, + "grad_norm": 8.25, + "learning_rate": 8.966236553862714e-06, + "loss": 1.09047375, + "memory(GiB)": 302.58, + "step": 88680, + "train_speed(iter/s)": 0.123619 + }, + { + "acc": 0.73274736, + "epoch": 0.49605241266303807, + "grad_norm": 9.1875, + "learning_rate": 8.96567343356707e-06, + "loss": 1.05303974, + "memory(GiB)": 302.58, + "step": 88700, + "train_speed(iter/s)": 0.123632 + }, + { + "acc": 0.73093271, + "epoch": 0.49616426213601733, + "grad_norm": 7.125, + "learning_rate": 8.965110177632822e-06, + "loss": 1.06787977, + "memory(GiB)": 302.58, + "step": 88720, + "train_speed(iter/s)": 0.123645 + }, + { + "acc": 0.73404832, + "epoch": 0.4962761116089966, + "grad_norm": 8.125, + "learning_rate": 8.964546786079231e-06, + "loss": 1.0675189, + "memory(GiB)": 302.58, + "step": 88740, + "train_speed(iter/s)": 0.123658 + }, + { + "acc": 0.73612413, + "epoch": 0.49638796108197586, + "grad_norm": 5.34375, + "learning_rate": 8.963983258925567e-06, + "loss": 1.02536602, + "memory(GiB)": 302.58, + "step": 88760, + "train_speed(iter/s)": 0.12367 + }, + { + "acc": 0.74790635, + "epoch": 0.4964998105549551, + "grad_norm": 7.53125, + "learning_rate": 8.96341959619111e-06, + "loss": 1.02479239, + "memory(GiB)": 302.58, + "step": 88780, + "train_speed(iter/s)": 0.123683 + }, + { + "acc": 0.74096026, + "epoch": 0.4966116600279344, + "grad_norm": 5.0625, + "learning_rate": 8.962855797895133e-06, + "loss": 0.99911785, + "memory(GiB)": 302.58, + "step": 88800, + "train_speed(iter/s)": 0.123697 + }, + { + "acc": 0.7217802, + "epoch": 0.49672350950091365, + "grad_norm": 5.5, + "learning_rate": 8.962291864056923e-06, + "loss": 1.0900341, + "memory(GiB)": 302.58, + "step": 88820, + "train_speed(iter/s)": 0.12371 + }, + { + "acc": 0.74022636, + "epoch": 0.4968353589738929, + "grad_norm": 4.3125, + "learning_rate": 8.961727794695767e-06, + "loss": 1.02995462, + "memory(GiB)": 302.58, + "step": 88840, + "train_speed(iter/s)": 0.123723 + }, + { + "acc": 0.7290473, + "epoch": 0.4969472084468722, + "grad_norm": 8.75, + "learning_rate": 8.961163589830956e-06, + "loss": 1.07596121, + "memory(GiB)": 302.58, + "step": 88860, + "train_speed(iter/s)": 0.123736 + }, + { + "acc": 0.73001537, + "epoch": 0.49705905791985144, + "grad_norm": 6.6875, + "learning_rate": 8.960599249481792e-06, + "loss": 1.06007128, + "memory(GiB)": 302.58, + "step": 88880, + "train_speed(iter/s)": 0.123749 + }, + { + "acc": 0.7390821, + "epoch": 0.4971709073928307, + "grad_norm": 7.375, + "learning_rate": 8.960034773667573e-06, + "loss": 1.00719614, + "memory(GiB)": 302.58, + "step": 88900, + "train_speed(iter/s)": 0.123763 + }, + { + "acc": 0.74692602, + "epoch": 0.49728275686580997, + "grad_norm": 9.4375, + "learning_rate": 8.959470162407611e-06, + "loss": 0.97828407, + "memory(GiB)": 302.58, + "step": 88920, + "train_speed(iter/s)": 0.123776 + }, + { + "acc": 0.72512517, + "epoch": 0.49739460633878924, + "grad_norm": 8.0, + "learning_rate": 8.958905415721211e-06, + "loss": 1.10112362, + "memory(GiB)": 302.58, + "step": 88940, + "train_speed(iter/s)": 0.123789 + }, + { + "acc": 0.73113418, + "epoch": 0.4975064558117685, + "grad_norm": 8.3125, + "learning_rate": 8.958340533627694e-06, + "loss": 1.06759806, + "memory(GiB)": 302.58, + "step": 88960, + "train_speed(iter/s)": 0.123802 + }, + { + "acc": 0.72582421, + "epoch": 0.49761830528474776, + "grad_norm": 5.09375, + "learning_rate": 8.957775516146378e-06, + "loss": 1.08223991, + "memory(GiB)": 302.58, + "step": 88980, + "train_speed(iter/s)": 0.123815 + }, + { + "acc": 0.73677707, + "epoch": 0.49773015475772703, + "grad_norm": 6.125, + "learning_rate": 8.957210363296592e-06, + "loss": 1.02434711, + "memory(GiB)": 302.58, + "step": 89000, + "train_speed(iter/s)": 0.123827 + }, + { + "acc": 0.7410275, + "epoch": 0.4978420042307063, + "grad_norm": 5.6875, + "learning_rate": 8.956645075097661e-06, + "loss": 1.0046422, + "memory(GiB)": 302.58, + "step": 89020, + "train_speed(iter/s)": 0.123841 + }, + { + "acc": 0.72819486, + "epoch": 0.49795385370368556, + "grad_norm": 6.4375, + "learning_rate": 8.956079651568924e-06, + "loss": 1.07259188, + "memory(GiB)": 302.58, + "step": 89040, + "train_speed(iter/s)": 0.123854 + }, + { + "acc": 0.74263997, + "epoch": 0.4980657031766648, + "grad_norm": 4.71875, + "learning_rate": 8.955514092729717e-06, + "loss": 1.000315, + "memory(GiB)": 302.58, + "step": 89060, + "train_speed(iter/s)": 0.123867 + }, + { + "acc": 0.75569487, + "epoch": 0.4981775526496441, + "grad_norm": 5.09375, + "learning_rate": 8.954948398599388e-06, + "loss": 0.95936546, + "memory(GiB)": 302.58, + "step": 89080, + "train_speed(iter/s)": 0.12388 + }, + { + "acc": 0.73625126, + "epoch": 0.49828940212262335, + "grad_norm": 4.59375, + "learning_rate": 8.954382569197282e-06, + "loss": 1.03709927, + "memory(GiB)": 302.58, + "step": 89100, + "train_speed(iter/s)": 0.123893 + }, + { + "acc": 0.72835312, + "epoch": 0.4984012515956026, + "grad_norm": 7.0625, + "learning_rate": 8.953816604542752e-06, + "loss": 1.05866442, + "memory(GiB)": 302.58, + "step": 89120, + "train_speed(iter/s)": 0.123906 + }, + { + "acc": 0.71447191, + "epoch": 0.4985131010685819, + "grad_norm": 9.0625, + "learning_rate": 8.953250504655155e-06, + "loss": 1.13423319, + "memory(GiB)": 302.58, + "step": 89140, + "train_speed(iter/s)": 0.123919 + }, + { + "acc": 0.73206205, + "epoch": 0.49862495054156114, + "grad_norm": 8.0625, + "learning_rate": 8.952684269553859e-06, + "loss": 1.046486, + "memory(GiB)": 302.58, + "step": 89160, + "train_speed(iter/s)": 0.123932 + }, + { + "acc": 0.73299346, + "epoch": 0.4987368000145404, + "grad_norm": 7.78125, + "learning_rate": 8.952117899258226e-06, + "loss": 1.05534925, + "memory(GiB)": 302.58, + "step": 89180, + "train_speed(iter/s)": 0.123946 + }, + { + "acc": 0.72852888, + "epoch": 0.49884864948751967, + "grad_norm": 6.75, + "learning_rate": 8.951551393787628e-06, + "loss": 1.07273121, + "memory(GiB)": 302.58, + "step": 89200, + "train_speed(iter/s)": 0.123959 + }, + { + "acc": 0.73481236, + "epoch": 0.498960498960499, + "grad_norm": 7.53125, + "learning_rate": 8.950984753161445e-06, + "loss": 1.0558815, + "memory(GiB)": 302.58, + "step": 89220, + "train_speed(iter/s)": 0.123972 + }, + { + "acc": 0.7335094, + "epoch": 0.49907234843347825, + "grad_norm": 8.25, + "learning_rate": 8.950417977399053e-06, + "loss": 1.05246496, + "memory(GiB)": 302.58, + "step": 89240, + "train_speed(iter/s)": 0.123985 + }, + { + "acc": 0.71772385, + "epoch": 0.4991841979064575, + "grad_norm": 5.15625, + "learning_rate": 8.94985106651984e-06, + "loss": 1.12923269, + "memory(GiB)": 302.58, + "step": 89260, + "train_speed(iter/s)": 0.123998 + }, + { + "acc": 0.72642894, + "epoch": 0.4992960473794368, + "grad_norm": 6.3125, + "learning_rate": 8.949284020543196e-06, + "loss": 1.08893757, + "memory(GiB)": 302.58, + "step": 89280, + "train_speed(iter/s)": 0.124012 + }, + { + "acc": 0.73598065, + "epoch": 0.49940789685241604, + "grad_norm": 8.1875, + "learning_rate": 8.948716839488516e-06, + "loss": 1.0404871, + "memory(GiB)": 302.58, + "step": 89300, + "train_speed(iter/s)": 0.124025 + }, + { + "acc": 0.74650574, + "epoch": 0.4995197463253953, + "grad_norm": 7.4375, + "learning_rate": 8.9481495233752e-06, + "loss": 0.98537169, + "memory(GiB)": 302.58, + "step": 89320, + "train_speed(iter/s)": 0.124038 + }, + { + "acc": 0.74902239, + "epoch": 0.4996315957983746, + "grad_norm": 7.5625, + "learning_rate": 8.947582072222648e-06, + "loss": 0.96982012, + "memory(GiB)": 302.58, + "step": 89340, + "train_speed(iter/s)": 0.124052 + }, + { + "acc": 0.73383684, + "epoch": 0.49974344527135384, + "grad_norm": 5.28125, + "learning_rate": 8.947014486050274e-06, + "loss": 1.06828957, + "memory(GiB)": 302.58, + "step": 89360, + "train_speed(iter/s)": 0.124065 + }, + { + "acc": 0.74000945, + "epoch": 0.4998552947443331, + "grad_norm": 7.28125, + "learning_rate": 8.946446764877489e-06, + "loss": 1.01952991, + "memory(GiB)": 302.58, + "step": 89380, + "train_speed(iter/s)": 0.124078 + }, + { + "acc": 0.7492969, + "epoch": 0.49996714421731236, + "grad_norm": 5.9375, + "learning_rate": 8.94587890872371e-06, + "loss": 1.01053982, + "memory(GiB)": 302.58, + "step": 89400, + "train_speed(iter/s)": 0.124092 + }, + { + "acc": 0.7272274, + "epoch": 0.5000789936902916, + "grad_norm": 8.125, + "learning_rate": 8.94531091760836e-06, + "loss": 1.08783655, + "memory(GiB)": 302.58, + "step": 89420, + "train_speed(iter/s)": 0.124105 + }, + { + "acc": 0.73886738, + "epoch": 0.5001908431632709, + "grad_norm": 8.25, + "learning_rate": 8.944742791550867e-06, + "loss": 1.03823071, + "memory(GiB)": 302.58, + "step": 89440, + "train_speed(iter/s)": 0.124118 + }, + { + "acc": 0.73646002, + "epoch": 0.5003026926362502, + "grad_norm": 7.5625, + "learning_rate": 8.944174530570663e-06, + "loss": 1.03851709, + "memory(GiB)": 302.58, + "step": 89460, + "train_speed(iter/s)": 0.124131 + }, + { + "acc": 0.74392853, + "epoch": 0.5004145421092294, + "grad_norm": 7.0, + "learning_rate": 8.943606134687183e-06, + "loss": 0.99918785, + "memory(GiB)": 302.58, + "step": 89480, + "train_speed(iter/s)": 0.124145 + }, + { + "acc": 0.7189826, + "epoch": 0.5005263915822087, + "grad_norm": 5.84375, + "learning_rate": 8.943037603919867e-06, + "loss": 1.12592249, + "memory(GiB)": 302.58, + "step": 89500, + "train_speed(iter/s)": 0.124158 + }, + { + "acc": 0.74380469, + "epoch": 0.500638241055188, + "grad_norm": 8.3125, + "learning_rate": 8.942468938288164e-06, + "loss": 1.0125412, + "memory(GiB)": 302.58, + "step": 89520, + "train_speed(iter/s)": 0.124171 + }, + { + "acc": 0.75779858, + "epoch": 0.5007500905281672, + "grad_norm": 5.09375, + "learning_rate": 8.94190013781152e-06, + "loss": 0.95953293, + "memory(GiB)": 302.58, + "step": 89540, + "train_speed(iter/s)": 0.124184 + }, + { + "acc": 0.74617786, + "epoch": 0.5008619400011465, + "grad_norm": 6.0625, + "learning_rate": 8.941331202509393e-06, + "loss": 0.98632526, + "memory(GiB)": 302.58, + "step": 89560, + "train_speed(iter/s)": 0.124197 + }, + { + "acc": 0.73931313, + "epoch": 0.5009737894741257, + "grad_norm": 6.59375, + "learning_rate": 8.94076213240124e-06, + "loss": 1.04018669, + "memory(GiB)": 302.58, + "step": 89580, + "train_speed(iter/s)": 0.12421 + }, + { + "acc": 0.75416293, + "epoch": 0.501085638947105, + "grad_norm": 7.59375, + "learning_rate": 8.940192927506528e-06, + "loss": 0.9745225, + "memory(GiB)": 302.58, + "step": 89600, + "train_speed(iter/s)": 0.124224 + }, + { + "acc": 0.7450386, + "epoch": 0.5011974884200843, + "grad_norm": 9.4375, + "learning_rate": 8.939623587844724e-06, + "loss": 0.99065428, + "memory(GiB)": 302.58, + "step": 89620, + "train_speed(iter/s)": 0.124236 + }, + { + "acc": 0.74879694, + "epoch": 0.5013093378930635, + "grad_norm": 6.6875, + "learning_rate": 8.939054113435299e-06, + "loss": 0.98900747, + "memory(GiB)": 302.58, + "step": 89640, + "train_speed(iter/s)": 0.124248 + }, + { + "acc": 0.75427618, + "epoch": 0.5014211873660428, + "grad_norm": 6.34375, + "learning_rate": 8.938484504297734e-06, + "loss": 0.94973793, + "memory(GiB)": 302.58, + "step": 89660, + "train_speed(iter/s)": 0.124262 + }, + { + "acc": 0.73621202, + "epoch": 0.5015330368390221, + "grad_norm": 8.0, + "learning_rate": 8.93791476045151e-06, + "loss": 1.04981136, + "memory(GiB)": 302.58, + "step": 89680, + "train_speed(iter/s)": 0.124274 + }, + { + "acc": 0.73562751, + "epoch": 0.5016448863120013, + "grad_norm": 8.0625, + "learning_rate": 8.937344881916116e-06, + "loss": 1.0468873, + "memory(GiB)": 302.58, + "step": 89700, + "train_speed(iter/s)": 0.124287 + }, + { + "acc": 0.73721046, + "epoch": 0.5017567357849806, + "grad_norm": 5.71875, + "learning_rate": 8.93677486871104e-06, + "loss": 1.05172415, + "memory(GiB)": 302.58, + "step": 89720, + "train_speed(iter/s)": 0.124299 + }, + { + "acc": 0.74485688, + "epoch": 0.5018685852579599, + "grad_norm": 5.71875, + "learning_rate": 8.93620472085578e-06, + "loss": 1.02863407, + "memory(GiB)": 302.58, + "step": 89740, + "train_speed(iter/s)": 0.124312 + }, + { + "acc": 0.76021261, + "epoch": 0.5019804347309391, + "grad_norm": 8.0625, + "learning_rate": 8.93563443836984e-06, + "loss": 0.92869549, + "memory(GiB)": 302.58, + "step": 89760, + "train_speed(iter/s)": 0.124323 + }, + { + "acc": 0.74094839, + "epoch": 0.5020922842039184, + "grad_norm": 5.375, + "learning_rate": 8.935064021272721e-06, + "loss": 1.00417776, + "memory(GiB)": 302.58, + "step": 89780, + "train_speed(iter/s)": 0.124337 + }, + { + "acc": 0.73910041, + "epoch": 0.5022041336768976, + "grad_norm": 4.40625, + "learning_rate": 8.934493469583934e-06, + "loss": 1.03514233, + "memory(GiB)": 302.58, + "step": 89800, + "train_speed(iter/s)": 0.12435 + }, + { + "acc": 0.73133011, + "epoch": 0.5023159831498769, + "grad_norm": 5.125, + "learning_rate": 8.933922783322998e-06, + "loss": 1.06940441, + "memory(GiB)": 302.58, + "step": 89820, + "train_speed(iter/s)": 0.124363 + }, + { + "acc": 0.73277783, + "epoch": 0.5024278326228562, + "grad_norm": 6.53125, + "learning_rate": 8.933351962509424e-06, + "loss": 1.03970413, + "memory(GiB)": 302.58, + "step": 89840, + "train_speed(iter/s)": 0.124375 + }, + { + "acc": 0.72855954, + "epoch": 0.5025396820958354, + "grad_norm": 5.46875, + "learning_rate": 8.932781007162743e-06, + "loss": 1.08718615, + "memory(GiB)": 302.58, + "step": 89860, + "train_speed(iter/s)": 0.124388 + }, + { + "acc": 0.74527011, + "epoch": 0.5026515315688147, + "grad_norm": 6.5, + "learning_rate": 8.932209917302479e-06, + "loss": 1.01205339, + "memory(GiB)": 302.58, + "step": 89880, + "train_speed(iter/s)": 0.124401 + }, + { + "acc": 0.7341855, + "epoch": 0.502763381041794, + "grad_norm": 7.8125, + "learning_rate": 8.931638692948169e-06, + "loss": 1.05413074, + "memory(GiB)": 302.58, + "step": 89900, + "train_speed(iter/s)": 0.124414 + }, + { + "acc": 0.73945103, + "epoch": 0.5028752305147732, + "grad_norm": 8.4375, + "learning_rate": 8.931067334119349e-06, + "loss": 1.01425447, + "memory(GiB)": 302.58, + "step": 89920, + "train_speed(iter/s)": 0.124427 + }, + { + "acc": 0.74047937, + "epoch": 0.5029870799877525, + "grad_norm": 10.875, + "learning_rate": 8.93049584083556e-06, + "loss": 1.02893238, + "memory(GiB)": 302.58, + "step": 89940, + "train_speed(iter/s)": 0.12444 + }, + { + "acc": 0.72626872, + "epoch": 0.5030989294607318, + "grad_norm": 7.6875, + "learning_rate": 8.92992421311635e-06, + "loss": 1.08159952, + "memory(GiB)": 302.58, + "step": 89960, + "train_speed(iter/s)": 0.124452 + }, + { + "acc": 0.73435268, + "epoch": 0.503210778933711, + "grad_norm": 8.5, + "learning_rate": 8.929352450981273e-06, + "loss": 1.05498524, + "memory(GiB)": 302.58, + "step": 89980, + "train_speed(iter/s)": 0.124466 + }, + { + "acc": 0.74621086, + "epoch": 0.5033226284066903, + "grad_norm": 8.25, + "learning_rate": 8.928780554449878e-06, + "loss": 1.00010014, + "memory(GiB)": 302.58, + "step": 90000, + "train_speed(iter/s)": 0.124479 + }, + { + "epoch": 0.5033226284066903, + "eval_acc": 0.7016683215597819, + "eval_loss": 1.034952998161316, + "eval_runtime": 7548.8136, + "eval_samples_per_second": 9.973, + "eval_steps_per_second": 9.973, + "step": 90000 + }, + { + "acc": 0.74300032, + "epoch": 0.5034344778796696, + "grad_norm": 9.75, + "learning_rate": 8.928208523541734e-06, + "loss": 0.99882698, + "memory(GiB)": 302.58, + "step": 90020, + "train_speed(iter/s)": 0.123184 + }, + { + "acc": 0.72890868, + "epoch": 0.5035463273526488, + "grad_norm": 5.84375, + "learning_rate": 8.927636358276401e-06, + "loss": 1.07668266, + "memory(GiB)": 302.58, + "step": 90040, + "train_speed(iter/s)": 0.123198 + }, + { + "acc": 0.75105395, + "epoch": 0.5036581768256281, + "grad_norm": 7.8125, + "learning_rate": 8.927064058673452e-06, + "loss": 0.97991924, + "memory(GiB)": 302.58, + "step": 90060, + "train_speed(iter/s)": 0.12321 + }, + { + "acc": 0.7453898, + "epoch": 0.5037700262986073, + "grad_norm": 8.875, + "learning_rate": 8.926491624752458e-06, + "loss": 0.99905825, + "memory(GiB)": 302.58, + "step": 90080, + "train_speed(iter/s)": 0.123224 + }, + { + "acc": 0.7501636, + "epoch": 0.5038818757715866, + "grad_norm": 8.6875, + "learning_rate": 8.925919056532999e-06, + "loss": 1.00365906, + "memory(GiB)": 302.58, + "step": 90100, + "train_speed(iter/s)": 0.123237 + }, + { + "acc": 0.72911305, + "epoch": 0.5039937252445659, + "grad_norm": 7.9375, + "learning_rate": 8.92534635403466e-06, + "loss": 1.0682126, + "memory(GiB)": 302.58, + "step": 90120, + "train_speed(iter/s)": 0.12325 + }, + { + "acc": 0.74220581, + "epoch": 0.5041055747175451, + "grad_norm": 7.59375, + "learning_rate": 8.92477351727703e-06, + "loss": 1.00275793, + "memory(GiB)": 302.58, + "step": 90140, + "train_speed(iter/s)": 0.123263 + }, + { + "acc": 0.7295383, + "epoch": 0.5042174241905244, + "grad_norm": 6.0625, + "learning_rate": 8.9242005462797e-06, + "loss": 1.07009268, + "memory(GiB)": 302.58, + "step": 90160, + "train_speed(iter/s)": 0.123276 + }, + { + "acc": 0.73338256, + "epoch": 0.5043292736635037, + "grad_norm": 8.25, + "learning_rate": 8.923627441062267e-06, + "loss": 1.04363012, + "memory(GiB)": 302.58, + "step": 90180, + "train_speed(iter/s)": 0.12329 + }, + { + "acc": 0.73541064, + "epoch": 0.5044411231364829, + "grad_norm": 6.78125, + "learning_rate": 8.923054201644337e-06, + "loss": 1.01867752, + "memory(GiB)": 302.58, + "step": 90200, + "train_speed(iter/s)": 0.123303 + }, + { + "acc": 0.73957877, + "epoch": 0.5045529726094622, + "grad_norm": 7.9375, + "learning_rate": 8.922480828045511e-06, + "loss": 1.05890989, + "memory(GiB)": 302.58, + "step": 90220, + "train_speed(iter/s)": 0.123316 + }, + { + "acc": 0.73318238, + "epoch": 0.5046648220824415, + "grad_norm": 6.9375, + "learning_rate": 8.921907320285403e-06, + "loss": 1.03167, + "memory(GiB)": 302.58, + "step": 90240, + "train_speed(iter/s)": 0.12333 + }, + { + "acc": 0.7299799, + "epoch": 0.5047766715554207, + "grad_norm": 6.15625, + "learning_rate": 8.92133367838363e-06, + "loss": 1.05013485, + "memory(GiB)": 302.58, + "step": 90260, + "train_speed(iter/s)": 0.123343 + }, + { + "acc": 0.73148379, + "epoch": 0.5048885210284, + "grad_norm": 8.0, + "learning_rate": 8.92075990235981e-06, + "loss": 1.06236229, + "memory(GiB)": 302.58, + "step": 90280, + "train_speed(iter/s)": 0.123356 + }, + { + "acc": 0.7368547, + "epoch": 0.5050003705013792, + "grad_norm": 6.1875, + "learning_rate": 8.92018599223357e-06, + "loss": 1.0328289, + "memory(GiB)": 302.58, + "step": 90300, + "train_speed(iter/s)": 0.123369 + }, + { + "acc": 0.74234791, + "epoch": 0.5051122199743585, + "grad_norm": 7.59375, + "learning_rate": 8.919611948024537e-06, + "loss": 1.01733198, + "memory(GiB)": 302.58, + "step": 90320, + "train_speed(iter/s)": 0.123382 + }, + { + "acc": 0.73862677, + "epoch": 0.5052240694473378, + "grad_norm": 7.1875, + "learning_rate": 8.919037769752348e-06, + "loss": 1.01304474, + "memory(GiB)": 302.58, + "step": 90340, + "train_speed(iter/s)": 0.123396 + }, + { + "acc": 0.73011141, + "epoch": 0.505335918920317, + "grad_norm": 4.375, + "learning_rate": 8.91846345743664e-06, + "loss": 1.05906286, + "memory(GiB)": 302.58, + "step": 90360, + "train_speed(iter/s)": 0.123408 + }, + { + "acc": 0.72920942, + "epoch": 0.5054477683932963, + "grad_norm": 6.59375, + "learning_rate": 8.917889011097055e-06, + "loss": 1.06967745, + "memory(GiB)": 302.58, + "step": 90380, + "train_speed(iter/s)": 0.123422 + }, + { + "acc": 0.72267494, + "epoch": 0.5055596178662756, + "grad_norm": 9.3125, + "learning_rate": 8.917314430753244e-06, + "loss": 1.10613861, + "memory(GiB)": 302.58, + "step": 90400, + "train_speed(iter/s)": 0.123435 + }, + { + "acc": 0.74053626, + "epoch": 0.5056714673392548, + "grad_norm": 8.125, + "learning_rate": 8.916739716424857e-06, + "loss": 1.03462782, + "memory(GiB)": 302.58, + "step": 90420, + "train_speed(iter/s)": 0.123448 + }, + { + "acc": 0.73601861, + "epoch": 0.5057833168122341, + "grad_norm": 8.25, + "learning_rate": 8.916164868131553e-06, + "loss": 1.04104023, + "memory(GiB)": 302.58, + "step": 90440, + "train_speed(iter/s)": 0.123461 + }, + { + "acc": 0.7341887, + "epoch": 0.5058951662852134, + "grad_norm": 7.15625, + "learning_rate": 8.915589885892992e-06, + "loss": 1.03899002, + "memory(GiB)": 302.58, + "step": 90460, + "train_speed(iter/s)": 0.123474 + }, + { + "acc": 0.72568002, + "epoch": 0.5060070157581926, + "grad_norm": 6.46875, + "learning_rate": 8.915014769728843e-06, + "loss": 1.07615471, + "memory(GiB)": 302.58, + "step": 90480, + "train_speed(iter/s)": 0.123487 + }, + { + "acc": 0.75400295, + "epoch": 0.5061188652311719, + "grad_norm": 4.65625, + "learning_rate": 8.914439519658771e-06, + "loss": 0.96415567, + "memory(GiB)": 302.58, + "step": 90500, + "train_speed(iter/s)": 0.123499 + }, + { + "acc": 0.72635703, + "epoch": 0.5062307147041512, + "grad_norm": 5.96875, + "learning_rate": 8.913864135702458e-06, + "loss": 1.07119646, + "memory(GiB)": 302.58, + "step": 90520, + "train_speed(iter/s)": 0.123511 + }, + { + "acc": 0.75436902, + "epoch": 0.5063425641771304, + "grad_norm": 5.34375, + "learning_rate": 8.913288617879581e-06, + "loss": 0.95392818, + "memory(GiB)": 302.58, + "step": 90540, + "train_speed(iter/s)": 0.123523 + }, + { + "acc": 0.73815918, + "epoch": 0.5064544136501097, + "grad_norm": 6.625, + "learning_rate": 8.912712966209824e-06, + "loss": 1.02675056, + "memory(GiB)": 302.58, + "step": 90560, + "train_speed(iter/s)": 0.123536 + }, + { + "acc": 0.74492316, + "epoch": 0.506566263123089, + "grad_norm": 7.34375, + "learning_rate": 8.912137180712875e-06, + "loss": 1.01390152, + "memory(GiB)": 302.58, + "step": 90580, + "train_speed(iter/s)": 0.123549 + }, + { + "acc": 0.73434267, + "epoch": 0.5066781125960682, + "grad_norm": 9.625, + "learning_rate": 8.911561261408431e-06, + "loss": 1.06647539, + "memory(GiB)": 302.58, + "step": 90600, + "train_speed(iter/s)": 0.123561 + }, + { + "acc": 0.74894652, + "epoch": 0.5067899620690475, + "grad_norm": 7.25, + "learning_rate": 8.91098520831619e-06, + "loss": 0.98510389, + "memory(GiB)": 302.58, + "step": 90620, + "train_speed(iter/s)": 0.123575 + }, + { + "acc": 0.74178748, + "epoch": 0.5069018115420267, + "grad_norm": 9.625, + "learning_rate": 8.910409021455851e-06, + "loss": 0.99882278, + "memory(GiB)": 302.58, + "step": 90640, + "train_speed(iter/s)": 0.123588 + }, + { + "acc": 0.743894, + "epoch": 0.507013661015006, + "grad_norm": 6.4375, + "learning_rate": 8.909832700847126e-06, + "loss": 0.98886499, + "memory(GiB)": 302.58, + "step": 90660, + "train_speed(iter/s)": 0.123601 + }, + { + "acc": 0.73976669, + "epoch": 0.5071255104879853, + "grad_norm": 6.90625, + "learning_rate": 8.909256246509723e-06, + "loss": 1.02349377, + "memory(GiB)": 302.58, + "step": 90680, + "train_speed(iter/s)": 0.123615 + }, + { + "acc": 0.74026861, + "epoch": 0.5072373599609645, + "grad_norm": 5.75, + "learning_rate": 8.908679658463361e-06, + "loss": 1.02614202, + "memory(GiB)": 302.58, + "step": 90700, + "train_speed(iter/s)": 0.123628 + }, + { + "acc": 0.71785808, + "epoch": 0.5073492094339438, + "grad_norm": 9.5625, + "learning_rate": 8.90810293672776e-06, + "loss": 1.1420577, + "memory(GiB)": 302.58, + "step": 90720, + "train_speed(iter/s)": 0.12364 + }, + { + "acc": 0.73771367, + "epoch": 0.5074610589069231, + "grad_norm": 7.375, + "learning_rate": 8.907526081322647e-06, + "loss": 1.04520864, + "memory(GiB)": 302.58, + "step": 90740, + "train_speed(iter/s)": 0.123653 + }, + { + "acc": 0.73278718, + "epoch": 0.5075729083799023, + "grad_norm": 8.8125, + "learning_rate": 8.906949092267751e-06, + "loss": 1.04779139, + "memory(GiB)": 302.58, + "step": 90760, + "train_speed(iter/s)": 0.123666 + }, + { + "acc": 0.73937764, + "epoch": 0.5076847578528816, + "grad_norm": 7.59375, + "learning_rate": 8.90637196958281e-06, + "loss": 1.02017326, + "memory(GiB)": 302.58, + "step": 90780, + "train_speed(iter/s)": 0.123679 + }, + { + "acc": 0.74978757, + "epoch": 0.5077966073258608, + "grad_norm": 9.625, + "learning_rate": 8.905794713287558e-06, + "loss": 0.97598114, + "memory(GiB)": 302.58, + "step": 90800, + "train_speed(iter/s)": 0.123692 + }, + { + "acc": 0.73036227, + "epoch": 0.5079084567988401, + "grad_norm": 6.84375, + "learning_rate": 8.905217323401742e-06, + "loss": 1.08156166, + "memory(GiB)": 302.58, + "step": 90820, + "train_speed(iter/s)": 0.123705 + }, + { + "acc": 0.7318686, + "epoch": 0.5080203062718194, + "grad_norm": 6.59375, + "learning_rate": 8.904639799945111e-06, + "loss": 1.05681047, + "memory(GiB)": 302.58, + "step": 90840, + "train_speed(iter/s)": 0.123717 + }, + { + "acc": 0.73299141, + "epoch": 0.5081321557447986, + "grad_norm": 12.375, + "learning_rate": 8.904062142937419e-06, + "loss": 1.06990051, + "memory(GiB)": 302.58, + "step": 90860, + "train_speed(iter/s)": 0.123731 + }, + { + "acc": 0.75147138, + "epoch": 0.5082440052177779, + "grad_norm": 6.0, + "learning_rate": 8.903484352398422e-06, + "loss": 0.99086342, + "memory(GiB)": 302.58, + "step": 90880, + "train_speed(iter/s)": 0.123743 + }, + { + "acc": 0.75176473, + "epoch": 0.5083558546907572, + "grad_norm": 7.0625, + "learning_rate": 8.90290642834788e-06, + "loss": 0.98216562, + "memory(GiB)": 302.58, + "step": 90900, + "train_speed(iter/s)": 0.123756 + }, + { + "acc": 0.73560843, + "epoch": 0.5084677041637364, + "grad_norm": 4.9375, + "learning_rate": 8.902328370805564e-06, + "loss": 1.04109249, + "memory(GiB)": 302.58, + "step": 90920, + "train_speed(iter/s)": 0.123769 + }, + { + "acc": 0.74473257, + "epoch": 0.5085795536367157, + "grad_norm": 5.09375, + "learning_rate": 8.901750179791244e-06, + "loss": 1.01040363, + "memory(GiB)": 302.58, + "step": 90940, + "train_speed(iter/s)": 0.123782 + }, + { + "acc": 0.73647866, + "epoch": 0.508691403109695, + "grad_norm": 11.625, + "learning_rate": 8.901171855324696e-06, + "loss": 1.04695635, + "memory(GiB)": 302.58, + "step": 90960, + "train_speed(iter/s)": 0.123795 + }, + { + "acc": 0.73278794, + "epoch": 0.5088032525826742, + "grad_norm": 8.8125, + "learning_rate": 8.900593397425699e-06, + "loss": 1.04931669, + "memory(GiB)": 302.58, + "step": 90980, + "train_speed(iter/s)": 0.123809 + }, + { + "acc": 0.71018806, + "epoch": 0.5089151020556535, + "grad_norm": 5.59375, + "learning_rate": 8.900014806114042e-06, + "loss": 1.17193651, + "memory(GiB)": 302.58, + "step": 91000, + "train_speed(iter/s)": 0.123821 + }, + { + "acc": 0.73896356, + "epoch": 0.5090269515286328, + "grad_norm": 6.5625, + "learning_rate": 8.899436081409509e-06, + "loss": 1.00651579, + "memory(GiB)": 302.58, + "step": 91020, + "train_speed(iter/s)": 0.123834 + }, + { + "acc": 0.74320407, + "epoch": 0.509138801001612, + "grad_norm": 11.8125, + "learning_rate": 8.898857223331898e-06, + "loss": 1.01717615, + "memory(GiB)": 302.58, + "step": 91040, + "train_speed(iter/s)": 0.123847 + }, + { + "acc": 0.73327842, + "epoch": 0.5092506504745913, + "grad_norm": 6.4375, + "learning_rate": 8.898278231901008e-06, + "loss": 1.07111378, + "memory(GiB)": 302.58, + "step": 91060, + "train_speed(iter/s)": 0.123859 + }, + { + "acc": 0.73396301, + "epoch": 0.5093624999475705, + "grad_norm": 5.8125, + "learning_rate": 8.89769910713664e-06, + "loss": 1.04434299, + "memory(GiB)": 302.58, + "step": 91080, + "train_speed(iter/s)": 0.123872 + }, + { + "acc": 0.738977, + "epoch": 0.5094743494205498, + "grad_norm": 7.28125, + "learning_rate": 8.897119849058605e-06, + "loss": 1.03571301, + "memory(GiB)": 302.58, + "step": 91100, + "train_speed(iter/s)": 0.123884 + }, + { + "acc": 0.73796711, + "epoch": 0.5095861988935291, + "grad_norm": 4.75, + "learning_rate": 8.896540457686712e-06, + "loss": 1.02424479, + "memory(GiB)": 302.58, + "step": 91120, + "train_speed(iter/s)": 0.123897 + }, + { + "acc": 0.75527802, + "epoch": 0.5096980483665083, + "grad_norm": 6.53125, + "learning_rate": 8.895960933040781e-06, + "loss": 0.94762526, + "memory(GiB)": 302.58, + "step": 91140, + "train_speed(iter/s)": 0.12391 + }, + { + "acc": 0.74672089, + "epoch": 0.5098098978394876, + "grad_norm": 7.0, + "learning_rate": 8.89538127514063e-06, + "loss": 0.99672985, + "memory(GiB)": 302.58, + "step": 91160, + "train_speed(iter/s)": 0.123922 + }, + { + "acc": 0.73741775, + "epoch": 0.5099217473124669, + "grad_norm": 6.90625, + "learning_rate": 8.89480148400609e-06, + "loss": 1.02085514, + "memory(GiB)": 302.58, + "step": 91180, + "train_speed(iter/s)": 0.123935 + }, + { + "acc": 0.71919231, + "epoch": 0.5100335967854461, + "grad_norm": 6.0, + "learning_rate": 8.89422155965699e-06, + "loss": 1.11243076, + "memory(GiB)": 302.58, + "step": 91200, + "train_speed(iter/s)": 0.123948 + }, + { + "acc": 0.74459248, + "epoch": 0.5101454462584254, + "grad_norm": 7.125, + "learning_rate": 8.893641502113162e-06, + "loss": 0.99994383, + "memory(GiB)": 302.58, + "step": 91220, + "train_speed(iter/s)": 0.123961 + }, + { + "acc": 0.74541373, + "epoch": 0.5102572957314047, + "grad_norm": 7.25, + "learning_rate": 8.89306131139445e-06, + "loss": 1.01592073, + "memory(GiB)": 302.58, + "step": 91240, + "train_speed(iter/s)": 0.123974 + }, + { + "acc": 0.7399868, + "epoch": 0.5103691452043839, + "grad_norm": 5.46875, + "learning_rate": 8.892480987520695e-06, + "loss": 1.0142869, + "memory(GiB)": 302.58, + "step": 91260, + "train_speed(iter/s)": 0.123986 + }, + { + "acc": 0.7312407, + "epoch": 0.5104809946773632, + "grad_norm": 8.5, + "learning_rate": 8.89190053051175e-06, + "loss": 1.0670085, + "memory(GiB)": 302.58, + "step": 91280, + "train_speed(iter/s)": 0.124 + }, + { + "acc": 0.74783835, + "epoch": 0.5105928441503425, + "grad_norm": 7.53125, + "learning_rate": 8.891319940387463e-06, + "loss": 0.98722506, + "memory(GiB)": 302.58, + "step": 91300, + "train_speed(iter/s)": 0.124014 + }, + { + "acc": 0.73531656, + "epoch": 0.5107046936233217, + "grad_norm": 7.90625, + "learning_rate": 8.890739217167697e-06, + "loss": 1.04263449, + "memory(GiB)": 302.58, + "step": 91320, + "train_speed(iter/s)": 0.124026 + }, + { + "acc": 0.71478243, + "epoch": 0.510816543096301, + "grad_norm": 6.90625, + "learning_rate": 8.890158360872312e-06, + "loss": 1.12616816, + "memory(GiB)": 302.58, + "step": 91340, + "train_speed(iter/s)": 0.124039 + }, + { + "acc": 0.74662642, + "epoch": 0.5109283925692802, + "grad_norm": 6.21875, + "learning_rate": 8.889577371521174e-06, + "loss": 0.99528103, + "memory(GiB)": 302.58, + "step": 91360, + "train_speed(iter/s)": 0.124051 + }, + { + "acc": 0.7263186, + "epoch": 0.5110402420422595, + "grad_norm": 7.96875, + "learning_rate": 8.888996249134158e-06, + "loss": 1.08072758, + "memory(GiB)": 302.58, + "step": 91380, + "train_speed(iter/s)": 0.124064 + }, + { + "acc": 0.73672862, + "epoch": 0.5111520915152388, + "grad_norm": 7.5, + "learning_rate": 8.888414993731138e-06, + "loss": 1.03709564, + "memory(GiB)": 302.58, + "step": 91400, + "train_speed(iter/s)": 0.124076 + }, + { + "acc": 0.73272095, + "epoch": 0.511263940988218, + "grad_norm": 7.3125, + "learning_rate": 8.887833605331998e-06, + "loss": 1.03696432, + "memory(GiB)": 302.58, + "step": 91420, + "train_speed(iter/s)": 0.124088 + }, + { + "acc": 0.74403195, + "epoch": 0.5113757904611973, + "grad_norm": 6.875, + "learning_rate": 8.887252083956617e-06, + "loss": 0.99347048, + "memory(GiB)": 302.58, + "step": 91440, + "train_speed(iter/s)": 0.124101 + }, + { + "acc": 0.74098105, + "epoch": 0.5114876399341766, + "grad_norm": 6.5625, + "learning_rate": 8.886670429624891e-06, + "loss": 1.00890388, + "memory(GiB)": 302.58, + "step": 91460, + "train_speed(iter/s)": 0.124114 + }, + { + "acc": 0.72221742, + "epoch": 0.5115994894071558, + "grad_norm": 5.46875, + "learning_rate": 8.88608864235671e-06, + "loss": 1.10919867, + "memory(GiB)": 302.58, + "step": 91480, + "train_speed(iter/s)": 0.124127 + }, + { + "acc": 0.74574451, + "epoch": 0.5117113388801351, + "grad_norm": 5.53125, + "learning_rate": 8.885506722171978e-06, + "loss": 1.00761366, + "memory(GiB)": 302.58, + "step": 91500, + "train_speed(iter/s)": 0.124139 + }, + { + "acc": 0.74474058, + "epoch": 0.5118231883531144, + "grad_norm": 7.90625, + "learning_rate": 8.884924669090593e-06, + "loss": 1.01865721, + "memory(GiB)": 302.58, + "step": 91520, + "train_speed(iter/s)": 0.124152 + }, + { + "acc": 0.74044762, + "epoch": 0.5119350378260936, + "grad_norm": 7.6875, + "learning_rate": 8.884342483132466e-06, + "loss": 1.03086052, + "memory(GiB)": 302.58, + "step": 91540, + "train_speed(iter/s)": 0.124164 + }, + { + "acc": 0.75359569, + "epoch": 0.5120468872990729, + "grad_norm": 7.625, + "learning_rate": 8.883760164317509e-06, + "loss": 0.94971142, + "memory(GiB)": 302.58, + "step": 91560, + "train_speed(iter/s)": 0.124176 + }, + { + "acc": 0.73920121, + "epoch": 0.5121587367720521, + "grad_norm": 7.75, + "learning_rate": 8.88317771266564e-06, + "loss": 0.99968643, + "memory(GiB)": 302.58, + "step": 91580, + "train_speed(iter/s)": 0.124189 + }, + { + "acc": 0.73705297, + "epoch": 0.5122705862450314, + "grad_norm": 5.40625, + "learning_rate": 8.88259512819678e-06, + "loss": 1.05313511, + "memory(GiB)": 302.58, + "step": 91600, + "train_speed(iter/s)": 0.124202 + }, + { + "acc": 0.73471646, + "epoch": 0.5123824357180107, + "grad_norm": 7.34375, + "learning_rate": 8.882012410930853e-06, + "loss": 1.04103756, + "memory(GiB)": 302.58, + "step": 91620, + "train_speed(iter/s)": 0.124215 + }, + { + "acc": 0.71719165, + "epoch": 0.5124942851909899, + "grad_norm": 5.9375, + "learning_rate": 8.881429560887795e-06, + "loss": 1.12795553, + "memory(GiB)": 302.58, + "step": 91640, + "train_speed(iter/s)": 0.124228 + }, + { + "acc": 0.74486308, + "epoch": 0.5126061346639692, + "grad_norm": 8.0625, + "learning_rate": 8.880846578087536e-06, + "loss": 1.00234461, + "memory(GiB)": 302.58, + "step": 91660, + "train_speed(iter/s)": 0.124242 + }, + { + "acc": 0.73334136, + "epoch": 0.5127179841369485, + "grad_norm": 6.8125, + "learning_rate": 8.880263462550021e-06, + "loss": 1.05736074, + "memory(GiB)": 302.58, + "step": 91680, + "train_speed(iter/s)": 0.124256 + }, + { + "acc": 0.72423353, + "epoch": 0.5128298336099277, + "grad_norm": 5.875, + "learning_rate": 8.87968021429519e-06, + "loss": 1.06439505, + "memory(GiB)": 302.58, + "step": 91700, + "train_speed(iter/s)": 0.124269 + }, + { + "acc": 0.72736526, + "epoch": 0.512941683082907, + "grad_norm": 9.3125, + "learning_rate": 8.879096833342993e-06, + "loss": 1.07656059, + "memory(GiB)": 302.58, + "step": 91720, + "train_speed(iter/s)": 0.124283 + }, + { + "acc": 0.74385242, + "epoch": 0.5130535325558863, + "grad_norm": 6.78125, + "learning_rate": 8.878513319713384e-06, + "loss": 1.00493259, + "memory(GiB)": 302.58, + "step": 91740, + "train_speed(iter/s)": 0.124296 + }, + { + "acc": 0.74158115, + "epoch": 0.5131653820288655, + "grad_norm": 5.9375, + "learning_rate": 8.877929673426322e-06, + "loss": 1.02973528, + "memory(GiB)": 302.58, + "step": 91760, + "train_speed(iter/s)": 0.124309 + }, + { + "acc": 0.73982701, + "epoch": 0.5132772315018448, + "grad_norm": 8.1875, + "learning_rate": 8.877345894501768e-06, + "loss": 1.01281052, + "memory(GiB)": 302.58, + "step": 91780, + "train_speed(iter/s)": 0.124322 + }, + { + "acc": 0.74490161, + "epoch": 0.513389080974824, + "grad_norm": 6.125, + "learning_rate": 8.87676198295969e-06, + "loss": 1.0011054, + "memory(GiB)": 302.58, + "step": 91800, + "train_speed(iter/s)": 0.124335 + }, + { + "acc": 0.74963136, + "epoch": 0.5135009304478033, + "grad_norm": 4.78125, + "learning_rate": 8.87617793882006e-06, + "loss": 0.99991226, + "memory(GiB)": 302.58, + "step": 91820, + "train_speed(iter/s)": 0.124347 + }, + { + "acc": 0.74103241, + "epoch": 0.5136127799207826, + "grad_norm": 7.84375, + "learning_rate": 8.875593762102852e-06, + "loss": 1.01094742, + "memory(GiB)": 302.58, + "step": 91840, + "train_speed(iter/s)": 0.124359 + }, + { + "acc": 0.74592824, + "epoch": 0.5137246293937618, + "grad_norm": 4.09375, + "learning_rate": 8.875009452828049e-06, + "loss": 0.9900526, + "memory(GiB)": 302.58, + "step": 91860, + "train_speed(iter/s)": 0.124371 + }, + { + "acc": 0.73716316, + "epoch": 0.5138364788667411, + "grad_norm": 5.84375, + "learning_rate": 8.874425011015636e-06, + "loss": 1.02252216, + "memory(GiB)": 302.58, + "step": 91880, + "train_speed(iter/s)": 0.124384 + }, + { + "acc": 0.75885406, + "epoch": 0.5139483283397204, + "grad_norm": 5.9375, + "learning_rate": 8.8738404366856e-06, + "loss": 0.95077677, + "memory(GiB)": 302.58, + "step": 91900, + "train_speed(iter/s)": 0.124397 + }, + { + "acc": 0.73204699, + "epoch": 0.5140601778126996, + "grad_norm": 8.6875, + "learning_rate": 8.873255729857939e-06, + "loss": 1.06294823, + "memory(GiB)": 302.58, + "step": 91920, + "train_speed(iter/s)": 0.124409 + }, + { + "acc": 0.75108647, + "epoch": 0.5141720272856789, + "grad_norm": 8.625, + "learning_rate": 8.87267089055265e-06, + "loss": 0.97222195, + "memory(GiB)": 302.58, + "step": 91940, + "train_speed(iter/s)": 0.124421 + }, + { + "acc": 0.73372674, + "epoch": 0.5142838767586582, + "grad_norm": 9.1875, + "learning_rate": 8.872085918789736e-06, + "loss": 1.04328089, + "memory(GiB)": 302.58, + "step": 91960, + "train_speed(iter/s)": 0.124433 + }, + { + "acc": 0.74962821, + "epoch": 0.5143957262316374, + "grad_norm": 6.03125, + "learning_rate": 8.871500814589208e-06, + "loss": 0.98016586, + "memory(GiB)": 302.58, + "step": 91980, + "train_speed(iter/s)": 0.124446 + }, + { + "acc": 0.7254087, + "epoch": 0.5145075757046167, + "grad_norm": 8.6875, + "learning_rate": 8.870915577971073e-06, + "loss": 1.08545675, + "memory(GiB)": 302.58, + "step": 92000, + "train_speed(iter/s)": 0.124458 + }, + { + "epoch": 0.5145075757046167, + "eval_acc": 0.701890945846593, + "eval_loss": 1.033949613571167, + "eval_runtime": 7507.9759, + "eval_samples_per_second": 10.027, + "eval_steps_per_second": 10.027, + "step": 92000 + }, + { + "acc": 0.7464026, + "epoch": 0.514619425177596, + "grad_norm": 8.75, + "learning_rate": 8.870330208955353e-06, + "loss": 0.98629255, + "memory(GiB)": 302.58, + "step": 92020, + "train_speed(iter/s)": 0.123198 + }, + { + "acc": 0.74518814, + "epoch": 0.5147312746505752, + "grad_norm": 8.6875, + "learning_rate": 8.869744707562067e-06, + "loss": 0.98671589, + "memory(GiB)": 302.58, + "step": 92040, + "train_speed(iter/s)": 0.12321 + }, + { + "acc": 0.72378602, + "epoch": 0.5148431241235545, + "grad_norm": 6.28125, + "learning_rate": 8.869159073811241e-06, + "loss": 1.06009092, + "memory(GiB)": 302.58, + "step": 92060, + "train_speed(iter/s)": 0.123223 + }, + { + "acc": 0.72358727, + "epoch": 0.5149549735965337, + "grad_norm": 8.0, + "learning_rate": 8.868573307722905e-06, + "loss": 1.12213945, + "memory(GiB)": 302.58, + "step": 92080, + "train_speed(iter/s)": 0.123236 + }, + { + "acc": 0.73544188, + "epoch": 0.515066823069513, + "grad_norm": 9.0, + "learning_rate": 8.867987409317096e-06, + "loss": 1.03154917, + "memory(GiB)": 302.58, + "step": 92100, + "train_speed(iter/s)": 0.123248 + }, + { + "acc": 0.73954515, + "epoch": 0.5151786725424923, + "grad_norm": 9.0625, + "learning_rate": 8.867401378613853e-06, + "loss": 1.00994081, + "memory(GiB)": 302.58, + "step": 92120, + "train_speed(iter/s)": 0.123261 + }, + { + "acc": 0.75366354, + "epoch": 0.5152905220154715, + "grad_norm": 6.53125, + "learning_rate": 8.866815215633218e-06, + "loss": 0.94337149, + "memory(GiB)": 302.58, + "step": 92140, + "train_speed(iter/s)": 0.123274 + }, + { + "acc": 0.74597054, + "epoch": 0.5154023714884508, + "grad_norm": 8.3125, + "learning_rate": 8.866228920395241e-06, + "loss": 0.9948575, + "memory(GiB)": 302.58, + "step": 92160, + "train_speed(iter/s)": 0.123287 + }, + { + "acc": 0.71853566, + "epoch": 0.5155142209614301, + "grad_norm": 6.375, + "learning_rate": 8.865642492919978e-06, + "loss": 1.12637844, + "memory(GiB)": 302.58, + "step": 92180, + "train_speed(iter/s)": 0.1233 + }, + { + "acc": 0.7491282, + "epoch": 0.5156260704344093, + "grad_norm": 7.8125, + "learning_rate": 8.865055933227482e-06, + "loss": 0.96887732, + "memory(GiB)": 302.58, + "step": 92200, + "train_speed(iter/s)": 0.123313 + }, + { + "acc": 0.73800683, + "epoch": 0.5157379199073886, + "grad_norm": 5.4375, + "learning_rate": 8.864469241337818e-06, + "loss": 1.03838558, + "memory(GiB)": 302.58, + "step": 92220, + "train_speed(iter/s)": 0.123325 + }, + { + "acc": 0.74136071, + "epoch": 0.5158497693803679, + "grad_norm": 5.6875, + "learning_rate": 8.863882417271053e-06, + "loss": 1.01882362, + "memory(GiB)": 302.58, + "step": 92240, + "train_speed(iter/s)": 0.123338 + }, + { + "acc": 0.74805417, + "epoch": 0.5159616188533471, + "grad_norm": 8.1875, + "learning_rate": 8.863295461047257e-06, + "loss": 1.00482311, + "memory(GiB)": 302.58, + "step": 92260, + "train_speed(iter/s)": 0.123351 + }, + { + "acc": 0.74436355, + "epoch": 0.5160734683263264, + "grad_norm": 8.8125, + "learning_rate": 8.862708372686505e-06, + "loss": 1.01805191, + "memory(GiB)": 302.58, + "step": 92280, + "train_speed(iter/s)": 0.123363 + }, + { + "acc": 0.74243917, + "epoch": 0.5161853177993057, + "grad_norm": 7.90625, + "learning_rate": 8.862121152208878e-06, + "loss": 0.99167509, + "memory(GiB)": 302.58, + "step": 92300, + "train_speed(iter/s)": 0.123376 + }, + { + "acc": 0.74231257, + "epoch": 0.5162971672722849, + "grad_norm": 7.4375, + "learning_rate": 8.861533799634462e-06, + "loss": 1.01895714, + "memory(GiB)": 302.58, + "step": 92320, + "train_speed(iter/s)": 0.123389 + }, + { + "acc": 0.71324706, + "epoch": 0.5164090167452642, + "grad_norm": 9.125, + "learning_rate": 8.860946314983346e-06, + "loss": 1.14196243, + "memory(GiB)": 302.58, + "step": 92340, + "train_speed(iter/s)": 0.123401 + }, + { + "acc": 0.73761878, + "epoch": 0.5165208662182434, + "grad_norm": 6.0625, + "learning_rate": 8.860358698275623e-06, + "loss": 1.02645407, + "memory(GiB)": 302.58, + "step": 92360, + "train_speed(iter/s)": 0.123414 + }, + { + "acc": 0.73687272, + "epoch": 0.5166327156912227, + "grad_norm": 5.40625, + "learning_rate": 8.859770949531392e-06, + "loss": 1.02378836, + "memory(GiB)": 302.58, + "step": 92380, + "train_speed(iter/s)": 0.123427 + }, + { + "acc": 0.73412428, + "epoch": 0.516744565164202, + "grad_norm": 10.6875, + "learning_rate": 8.859183068770754e-06, + "loss": 1.07176762, + "memory(GiB)": 302.58, + "step": 92400, + "train_speed(iter/s)": 0.12344 + }, + { + "acc": 0.73278165, + "epoch": 0.5168564146371812, + "grad_norm": 8.25, + "learning_rate": 8.85859505601382e-06, + "loss": 1.05415201, + "memory(GiB)": 302.58, + "step": 92420, + "train_speed(iter/s)": 0.123454 + }, + { + "acc": 0.76042991, + "epoch": 0.5169682641101605, + "grad_norm": 8.5, + "learning_rate": 8.858006911280696e-06, + "loss": 0.92730722, + "memory(GiB)": 302.58, + "step": 92440, + "train_speed(iter/s)": 0.123467 + }, + { + "acc": 0.74607091, + "epoch": 0.5170801135831398, + "grad_norm": 8.0625, + "learning_rate": 8.857418634591506e-06, + "loss": 0.99935703, + "memory(GiB)": 302.58, + "step": 92460, + "train_speed(iter/s)": 0.123479 + }, + { + "acc": 0.73390503, + "epoch": 0.517191963056119, + "grad_norm": 9.0625, + "learning_rate": 8.856830225966366e-06, + "loss": 1.05764399, + "memory(GiB)": 302.58, + "step": 92480, + "train_speed(iter/s)": 0.123492 + }, + { + "acc": 0.7316432, + "epoch": 0.5173038125290983, + "grad_norm": 5.78125, + "learning_rate": 8.856241685425402e-06, + "loss": 1.05955162, + "memory(GiB)": 302.58, + "step": 92500, + "train_speed(iter/s)": 0.123504 + }, + { + "acc": 0.72603664, + "epoch": 0.5174156620020776, + "grad_norm": 9.125, + "learning_rate": 8.855653012988746e-06, + "loss": 1.09939709, + "memory(GiB)": 302.58, + "step": 92520, + "train_speed(iter/s)": 0.123516 + }, + { + "acc": 0.72624784, + "epoch": 0.5175275114750568, + "grad_norm": 3.875, + "learning_rate": 8.85506420867653e-06, + "loss": 1.09349232, + "memory(GiB)": 302.58, + "step": 92540, + "train_speed(iter/s)": 0.123529 + }, + { + "acc": 0.7424758, + "epoch": 0.5176393609480361, + "grad_norm": 8.0, + "learning_rate": 8.854475272508895e-06, + "loss": 0.9902379, + "memory(GiB)": 302.58, + "step": 92560, + "train_speed(iter/s)": 0.123542 + }, + { + "acc": 0.74382606, + "epoch": 0.5177512104210154, + "grad_norm": 8.25, + "learning_rate": 8.853886204505981e-06, + "loss": 1.01603327, + "memory(GiB)": 302.58, + "step": 92580, + "train_speed(iter/s)": 0.123555 + }, + { + "acc": 0.74821792, + "epoch": 0.5178630598939946, + "grad_norm": 8.25, + "learning_rate": 8.853297004687942e-06, + "loss": 0.99256411, + "memory(GiB)": 302.58, + "step": 92600, + "train_speed(iter/s)": 0.123567 + }, + { + "acc": 0.7297029, + "epoch": 0.5179749093669739, + "grad_norm": 6.84375, + "learning_rate": 8.852707673074924e-06, + "loss": 1.08103895, + "memory(GiB)": 302.58, + "step": 92620, + "train_speed(iter/s)": 0.123579 + }, + { + "acc": 0.74115, + "epoch": 0.5180867588399531, + "grad_norm": 7.03125, + "learning_rate": 8.852118209687088e-06, + "loss": 1.02402945, + "memory(GiB)": 302.58, + "step": 92640, + "train_speed(iter/s)": 0.123592 + }, + { + "acc": 0.74382505, + "epoch": 0.5181986083129324, + "grad_norm": 7.21875, + "learning_rate": 8.851528614544594e-06, + "loss": 1.00060167, + "memory(GiB)": 302.58, + "step": 92660, + "train_speed(iter/s)": 0.123605 + }, + { + "acc": 0.74595618, + "epoch": 0.5183104577859117, + "grad_norm": 7.0625, + "learning_rate": 8.85093888766761e-06, + "loss": 0.98852396, + "memory(GiB)": 302.58, + "step": 92680, + "train_speed(iter/s)": 0.123618 + }, + { + "acc": 0.73867316, + "epoch": 0.5184223072588909, + "grad_norm": 6.75, + "learning_rate": 8.850349029076305e-06, + "loss": 1.03016243, + "memory(GiB)": 302.58, + "step": 92700, + "train_speed(iter/s)": 0.123631 + }, + { + "acc": 0.73517728, + "epoch": 0.5185341567318702, + "grad_norm": 7.78125, + "learning_rate": 8.849759038790852e-06, + "loss": 1.04450455, + "memory(GiB)": 302.58, + "step": 92720, + "train_speed(iter/s)": 0.123644 + }, + { + "acc": 0.74803491, + "epoch": 0.5186460062048495, + "grad_norm": 9.625, + "learning_rate": 8.849168916831433e-06, + "loss": 0.98735991, + "memory(GiB)": 302.58, + "step": 92740, + "train_speed(iter/s)": 0.123656 + }, + { + "acc": 0.7455174, + "epoch": 0.5187578556778287, + "grad_norm": 7.53125, + "learning_rate": 8.848578663218233e-06, + "loss": 1.01047144, + "memory(GiB)": 302.58, + "step": 92760, + "train_speed(iter/s)": 0.123668 + }, + { + "acc": 0.72547174, + "epoch": 0.518869705150808, + "grad_norm": 8.1875, + "learning_rate": 8.847988277971439e-06, + "loss": 1.08939161, + "memory(GiB)": 302.58, + "step": 92780, + "train_speed(iter/s)": 0.123681 + }, + { + "acc": 0.74231291, + "epoch": 0.5189815546237873, + "grad_norm": 6.96875, + "learning_rate": 8.847397761111243e-06, + "loss": 0.99262505, + "memory(GiB)": 302.58, + "step": 92800, + "train_speed(iter/s)": 0.123693 + }, + { + "acc": 0.73566332, + "epoch": 0.5190934040967665, + "grad_norm": 3.734375, + "learning_rate": 8.846807112657844e-06, + "loss": 1.04620228, + "memory(GiB)": 302.58, + "step": 92820, + "train_speed(iter/s)": 0.123706 + }, + { + "acc": 0.73764682, + "epoch": 0.5192052535697458, + "grad_norm": 8.4375, + "learning_rate": 8.846216332631445e-06, + "loss": 1.02479763, + "memory(GiB)": 302.58, + "step": 92840, + "train_speed(iter/s)": 0.123719 + }, + { + "acc": 0.7468996, + "epoch": 0.519317103042725, + "grad_norm": 8.25, + "learning_rate": 8.84562542105225e-06, + "loss": 0.99187317, + "memory(GiB)": 302.58, + "step": 92860, + "train_speed(iter/s)": 0.123731 + }, + { + "acc": 0.72442956, + "epoch": 0.5194289525157043, + "grad_norm": 7.0, + "learning_rate": 8.845034377940474e-06, + "loss": 1.10216303, + "memory(GiB)": 302.58, + "step": 92880, + "train_speed(iter/s)": 0.123744 + }, + { + "acc": 0.7454195, + "epoch": 0.5195408019886836, + "grad_norm": 7.78125, + "learning_rate": 8.84444320331633e-06, + "loss": 0.98956842, + "memory(GiB)": 302.58, + "step": 92900, + "train_speed(iter/s)": 0.123757 + }, + { + "acc": 0.75418029, + "epoch": 0.5196526514616628, + "grad_norm": 9.0, + "learning_rate": 8.843851897200035e-06, + "loss": 0.96006384, + "memory(GiB)": 302.58, + "step": 92920, + "train_speed(iter/s)": 0.123769 + }, + { + "acc": 0.73970823, + "epoch": 0.5197645009346421, + "grad_norm": 5.78125, + "learning_rate": 8.843260459611818e-06, + "loss": 1.01896963, + "memory(GiB)": 302.58, + "step": 92940, + "train_speed(iter/s)": 0.123782 + }, + { + "acc": 0.74542332, + "epoch": 0.5198763504076214, + "grad_norm": 8.8125, + "learning_rate": 8.842668890571906e-06, + "loss": 0.99567604, + "memory(GiB)": 302.58, + "step": 92960, + "train_speed(iter/s)": 0.123795 + }, + { + "acc": 0.74349704, + "epoch": 0.5199881998806006, + "grad_norm": 9.1875, + "learning_rate": 8.842077190100536e-06, + "loss": 0.99211884, + "memory(GiB)": 302.58, + "step": 92980, + "train_speed(iter/s)": 0.123808 + }, + { + "acc": 0.73701367, + "epoch": 0.52010004935358, + "grad_norm": 5.53125, + "learning_rate": 8.841485358217939e-06, + "loss": 1.03667183, + "memory(GiB)": 302.58, + "step": 93000, + "train_speed(iter/s)": 0.123819 + }, + { + "acc": 0.73562832, + "epoch": 0.5202118988265593, + "grad_norm": 6.78125, + "learning_rate": 8.840893394944363e-06, + "loss": 1.01521654, + "memory(GiB)": 302.58, + "step": 93020, + "train_speed(iter/s)": 0.123832 + }, + { + "acc": 0.7700161, + "epoch": 0.5203237482995385, + "grad_norm": 5.03125, + "learning_rate": 8.840301300300056e-06, + "loss": 0.90130415, + "memory(GiB)": 302.58, + "step": 93040, + "train_speed(iter/s)": 0.123845 + }, + { + "acc": 0.72708855, + "epoch": 0.5204355977725178, + "grad_norm": 6.9375, + "learning_rate": 8.839709074305265e-06, + "loss": 1.07325821, + "memory(GiB)": 302.58, + "step": 93060, + "train_speed(iter/s)": 0.123857 + }, + { + "acc": 0.73745747, + "epoch": 0.5205474472454971, + "grad_norm": 5.65625, + "learning_rate": 8.839116716980249e-06, + "loss": 1.0389431, + "memory(GiB)": 302.58, + "step": 93080, + "train_speed(iter/s)": 0.123868 + }, + { + "acc": 0.7410615, + "epoch": 0.5206592967184763, + "grad_norm": 8.125, + "learning_rate": 8.838524228345269e-06, + "loss": 1.01286993, + "memory(GiB)": 302.58, + "step": 93100, + "train_speed(iter/s)": 0.123881 + }, + { + "acc": 0.74160104, + "epoch": 0.5207711461914556, + "grad_norm": 10.3125, + "learning_rate": 8.837931608420587e-06, + "loss": 1.02060127, + "memory(GiB)": 302.58, + "step": 93120, + "train_speed(iter/s)": 0.123892 + }, + { + "acc": 0.74480982, + "epoch": 0.5208829956644349, + "grad_norm": 5.65625, + "learning_rate": 8.837338857226475e-06, + "loss": 0.99792032, + "memory(GiB)": 302.58, + "step": 93140, + "train_speed(iter/s)": 0.123905 + }, + { + "acc": 0.74648848, + "epoch": 0.5209948451374141, + "grad_norm": 8.0625, + "learning_rate": 8.836745974783205e-06, + "loss": 0.97894611, + "memory(GiB)": 302.58, + "step": 93160, + "train_speed(iter/s)": 0.123919 + }, + { + "acc": 0.74288197, + "epoch": 0.5211066946103934, + "grad_norm": 10.0625, + "learning_rate": 8.836152961111059e-06, + "loss": 1.02509747, + "memory(GiB)": 302.58, + "step": 93180, + "train_speed(iter/s)": 0.123932 + }, + { + "acc": 0.74218369, + "epoch": 0.5212185440833726, + "grad_norm": 8.875, + "learning_rate": 8.835559816230316e-06, + "loss": 1.01870565, + "memory(GiB)": 302.58, + "step": 93200, + "train_speed(iter/s)": 0.123944 + }, + { + "acc": 0.74290714, + "epoch": 0.5213303935563519, + "grad_norm": 9.25, + "learning_rate": 8.834966540161265e-06, + "loss": 1.02778234, + "memory(GiB)": 302.58, + "step": 93220, + "train_speed(iter/s)": 0.123958 + }, + { + "acc": 0.72792673, + "epoch": 0.5214422430293312, + "grad_norm": 10.25, + "learning_rate": 8.834373132924199e-06, + "loss": 1.08668385, + "memory(GiB)": 302.58, + "step": 93240, + "train_speed(iter/s)": 0.123971 + }, + { + "acc": 0.72944837, + "epoch": 0.5215540925023104, + "grad_norm": 7.6875, + "learning_rate": 8.833779594539413e-06, + "loss": 1.06291485, + "memory(GiB)": 302.58, + "step": 93260, + "train_speed(iter/s)": 0.123984 + }, + { + "acc": 0.71416259, + "epoch": 0.5216659419752897, + "grad_norm": 4.6875, + "learning_rate": 8.833185925027208e-06, + "loss": 1.14765358, + "memory(GiB)": 302.58, + "step": 93280, + "train_speed(iter/s)": 0.123997 + }, + { + "acc": 0.74790468, + "epoch": 0.521777791448269, + "grad_norm": 4.78125, + "learning_rate": 8.832592124407888e-06, + "loss": 0.9681241, + "memory(GiB)": 302.58, + "step": 93300, + "train_speed(iter/s)": 0.124009 + }, + { + "acc": 0.73776679, + "epoch": 0.5218896409212482, + "grad_norm": 8.3125, + "learning_rate": 8.831998192701766e-06, + "loss": 1.02414675, + "memory(GiB)": 302.58, + "step": 93320, + "train_speed(iter/s)": 0.124022 + }, + { + "acc": 0.73089156, + "epoch": 0.5220014903942275, + "grad_norm": 5.875, + "learning_rate": 8.831404129929156e-06, + "loss": 1.05961294, + "memory(GiB)": 302.58, + "step": 93340, + "train_speed(iter/s)": 0.124035 + }, + { + "acc": 0.74246125, + "epoch": 0.5221133398672068, + "grad_norm": 8.5625, + "learning_rate": 8.830809936110374e-06, + "loss": 1.02717876, + "memory(GiB)": 302.58, + "step": 93360, + "train_speed(iter/s)": 0.124047 + }, + { + "acc": 0.73346243, + "epoch": 0.522225189340186, + "grad_norm": 7.875, + "learning_rate": 8.830215611265746e-06, + "loss": 1.05725889, + "memory(GiB)": 302.58, + "step": 93380, + "train_speed(iter/s)": 0.124059 + }, + { + "acc": 0.73988643, + "epoch": 0.5223370388131653, + "grad_norm": 7.25, + "learning_rate": 8.8296211554156e-06, + "loss": 1.02141771, + "memory(GiB)": 302.58, + "step": 93400, + "train_speed(iter/s)": 0.124072 + }, + { + "acc": 0.74346485, + "epoch": 0.5224488882861446, + "grad_norm": 5.625, + "learning_rate": 8.829026568580265e-06, + "loss": 1.00872746, + "memory(GiB)": 302.58, + "step": 93420, + "train_speed(iter/s)": 0.124084 + }, + { + "acc": 0.72411966, + "epoch": 0.5225607377591238, + "grad_norm": 7.90625, + "learning_rate": 8.82843185078008e-06, + "loss": 1.09482784, + "memory(GiB)": 302.58, + "step": 93440, + "train_speed(iter/s)": 0.124097 + }, + { + "acc": 0.73085785, + "epoch": 0.5226725872321031, + "grad_norm": 6.28125, + "learning_rate": 8.827837002035386e-06, + "loss": 1.05807085, + "memory(GiB)": 302.58, + "step": 93460, + "train_speed(iter/s)": 0.124108 + }, + { + "acc": 0.73258328, + "epoch": 0.5227844367050823, + "grad_norm": 5.53125, + "learning_rate": 8.82724202236653e-06, + "loss": 1.04229393, + "memory(GiB)": 302.58, + "step": 93480, + "train_speed(iter/s)": 0.124121 + }, + { + "acc": 0.73933711, + "epoch": 0.5228962861780616, + "grad_norm": 7.71875, + "learning_rate": 8.826646911793861e-06, + "loss": 1.04901743, + "memory(GiB)": 302.58, + "step": 93500, + "train_speed(iter/s)": 0.124134 + }, + { + "acc": 0.7151123, + "epoch": 0.5230081356510409, + "grad_norm": 5.6875, + "learning_rate": 8.826051670337734e-06, + "loss": 1.12135458, + "memory(GiB)": 302.58, + "step": 93520, + "train_speed(iter/s)": 0.124147 + }, + { + "acc": 0.73774595, + "epoch": 0.5231199851240201, + "grad_norm": 6.90625, + "learning_rate": 8.825456298018507e-06, + "loss": 1.04632587, + "memory(GiB)": 302.58, + "step": 93540, + "train_speed(iter/s)": 0.124159 + }, + { + "acc": 0.7237627, + "epoch": 0.5232318345969994, + "grad_norm": 6.75, + "learning_rate": 8.824860794856544e-06, + "loss": 1.09981995, + "memory(GiB)": 302.58, + "step": 93560, + "train_speed(iter/s)": 0.124173 + }, + { + "acc": 0.74928818, + "epoch": 0.5233436840699787, + "grad_norm": 5.96875, + "learning_rate": 8.824265160872213e-06, + "loss": 1.00779476, + "memory(GiB)": 302.58, + "step": 93580, + "train_speed(iter/s)": 0.124186 + }, + { + "acc": 0.74151692, + "epoch": 0.5234555335429579, + "grad_norm": 5.78125, + "learning_rate": 8.823669396085888e-06, + "loss": 0.99651175, + "memory(GiB)": 302.58, + "step": 93600, + "train_speed(iter/s)": 0.124198 + }, + { + "acc": 0.75579057, + "epoch": 0.5235673830159372, + "grad_norm": 9.4375, + "learning_rate": 8.823073500517945e-06, + "loss": 0.96607943, + "memory(GiB)": 302.58, + "step": 93620, + "train_speed(iter/s)": 0.124211 + }, + { + "acc": 0.74056039, + "epoch": 0.5236792324889165, + "grad_norm": 5.75, + "learning_rate": 8.822477474188766e-06, + "loss": 1.00696211, + "memory(GiB)": 302.58, + "step": 93640, + "train_speed(iter/s)": 0.124224 + }, + { + "acc": 0.74486761, + "epoch": 0.5237910819618957, + "grad_norm": 5.40625, + "learning_rate": 8.821881317118735e-06, + "loss": 0.98889771, + "memory(GiB)": 302.58, + "step": 93660, + "train_speed(iter/s)": 0.124236 + }, + { + "acc": 0.72889071, + "epoch": 0.523902931434875, + "grad_norm": 8.5, + "learning_rate": 8.821285029328245e-06, + "loss": 1.06070681, + "memory(GiB)": 302.58, + "step": 93680, + "train_speed(iter/s)": 0.124249 + }, + { + "acc": 0.73994584, + "epoch": 0.5240147809078542, + "grad_norm": 6.71875, + "learning_rate": 8.82068861083769e-06, + "loss": 1.02435837, + "memory(GiB)": 302.58, + "step": 93700, + "train_speed(iter/s)": 0.12426 + }, + { + "acc": 0.72543278, + "epoch": 0.5241266303808335, + "grad_norm": 5.5625, + "learning_rate": 8.820092061667472e-06, + "loss": 1.08958368, + "memory(GiB)": 302.58, + "step": 93720, + "train_speed(iter/s)": 0.124273 + }, + { + "acc": 0.75036855, + "epoch": 0.5242384798538128, + "grad_norm": 6.90625, + "learning_rate": 8.81949538183799e-06, + "loss": 0.98291864, + "memory(GiB)": 302.58, + "step": 93740, + "train_speed(iter/s)": 0.124286 + }, + { + "acc": 0.73574033, + "epoch": 0.524350329326792, + "grad_norm": 5.4375, + "learning_rate": 8.818898571369653e-06, + "loss": 1.05353174, + "memory(GiB)": 302.58, + "step": 93760, + "train_speed(iter/s)": 0.124297 + }, + { + "acc": 0.73091178, + "epoch": 0.5244621787997713, + "grad_norm": 7.90625, + "learning_rate": 8.818301630282879e-06, + "loss": 1.07397747, + "memory(GiB)": 302.58, + "step": 93780, + "train_speed(iter/s)": 0.12431 + }, + { + "acc": 0.75247569, + "epoch": 0.5245740282727506, + "grad_norm": 8.9375, + "learning_rate": 8.817704558598079e-06, + "loss": 0.96309061, + "memory(GiB)": 302.58, + "step": 93800, + "train_speed(iter/s)": 0.124323 + }, + { + "acc": 0.74643707, + "epoch": 0.5246858777457298, + "grad_norm": 5.84375, + "learning_rate": 8.81710735633568e-06, + "loss": 0.97270451, + "memory(GiB)": 302.58, + "step": 93820, + "train_speed(iter/s)": 0.124335 + }, + { + "acc": 0.73746338, + "epoch": 0.5247977272187091, + "grad_norm": 8.6875, + "learning_rate": 8.816510023516103e-06, + "loss": 1.03878536, + "memory(GiB)": 302.58, + "step": 93840, + "train_speed(iter/s)": 0.124347 + }, + { + "acc": 0.72743626, + "epoch": 0.5249095766916884, + "grad_norm": 7.375, + "learning_rate": 8.815912560159782e-06, + "loss": 1.09583149, + "memory(GiB)": 302.58, + "step": 93860, + "train_speed(iter/s)": 0.12436 + }, + { + "acc": 0.73414664, + "epoch": 0.5250214261646676, + "grad_norm": 6.03125, + "learning_rate": 8.815314966287151e-06, + "loss": 1.04016762, + "memory(GiB)": 302.58, + "step": 93880, + "train_speed(iter/s)": 0.124373 + }, + { + "acc": 0.72615747, + "epoch": 0.5251332756376469, + "grad_norm": 5.53125, + "learning_rate": 8.814717241918653e-06, + "loss": 1.09528294, + "memory(GiB)": 302.58, + "step": 93900, + "train_speed(iter/s)": 0.124385 + }, + { + "acc": 0.74804888, + "epoch": 0.5252451251106262, + "grad_norm": 7.09375, + "learning_rate": 8.814119387074727e-06, + "loss": 0.96933393, + "memory(GiB)": 302.58, + "step": 93920, + "train_speed(iter/s)": 0.124397 + }, + { + "acc": 0.73144197, + "epoch": 0.5253569745836054, + "grad_norm": 6.75, + "learning_rate": 8.813521401775823e-06, + "loss": 1.06822062, + "memory(GiB)": 302.58, + "step": 93940, + "train_speed(iter/s)": 0.12441 + }, + { + "acc": 0.7448832, + "epoch": 0.5254688240565847, + "grad_norm": 8.1875, + "learning_rate": 8.812923286042396e-06, + "loss": 1.00893106, + "memory(GiB)": 302.58, + "step": 93960, + "train_speed(iter/s)": 0.124422 + }, + { + "acc": 0.73985085, + "epoch": 0.525580673529564, + "grad_norm": 9.125, + "learning_rate": 8.812325039894902e-06, + "loss": 1.02751369, + "memory(GiB)": 302.58, + "step": 93980, + "train_speed(iter/s)": 0.124435 + }, + { + "acc": 0.74044728, + "epoch": 0.5256925230025432, + "grad_norm": 7.4375, + "learning_rate": 8.811726663353804e-06, + "loss": 1.0289299, + "memory(GiB)": 302.58, + "step": 94000, + "train_speed(iter/s)": 0.124447 + }, + { + "epoch": 0.5256925230025432, + "eval_acc": 0.7019861872376734, + "eval_loss": 1.0333856344223022, + "eval_runtime": 7488.0218, + "eval_samples_per_second": 10.054, + "eval_steps_per_second": 10.054, + "step": 94000 + }, + { + "acc": 0.72365394, + "epoch": 0.5258043724755225, + "grad_norm": 6.1875, + "learning_rate": 8.811128156439566e-06, + "loss": 1.10799751, + "memory(GiB)": 302.58, + "step": 94020, + "train_speed(iter/s)": 0.123216 + }, + { + "acc": 0.73720813, + "epoch": 0.5259162219485017, + "grad_norm": 7.9375, + "learning_rate": 8.81052951917266e-06, + "loss": 1.03987694, + "memory(GiB)": 302.58, + "step": 94040, + "train_speed(iter/s)": 0.123228 + }, + { + "acc": 0.73303261, + "epoch": 0.526028071421481, + "grad_norm": 12.9375, + "learning_rate": 8.809930751573563e-06, + "loss": 1.0553936, + "memory(GiB)": 302.58, + "step": 94060, + "train_speed(iter/s)": 0.12324 + }, + { + "acc": 0.74851503, + "epoch": 0.5261399208944603, + "grad_norm": 4.59375, + "learning_rate": 8.809331853662752e-06, + "loss": 0.98760881, + "memory(GiB)": 302.58, + "step": 94080, + "train_speed(iter/s)": 0.123252 + }, + { + "acc": 0.73665223, + "epoch": 0.5262517703674395, + "grad_norm": 7.9375, + "learning_rate": 8.808732825460712e-06, + "loss": 1.0311738, + "memory(GiB)": 302.58, + "step": 94100, + "train_speed(iter/s)": 0.123265 + }, + { + "acc": 0.73426852, + "epoch": 0.5263636198404188, + "grad_norm": 6.15625, + "learning_rate": 8.808133666987936e-06, + "loss": 1.0507534, + "memory(GiB)": 302.58, + "step": 94120, + "train_speed(iter/s)": 0.123278 + }, + { + "acc": 0.73871131, + "epoch": 0.5264754693133981, + "grad_norm": 8.5, + "learning_rate": 8.80753437826491e-06, + "loss": 1.0270196, + "memory(GiB)": 302.58, + "step": 94140, + "train_speed(iter/s)": 0.123291 + }, + { + "acc": 0.72966118, + "epoch": 0.5265873187863773, + "grad_norm": 5.90625, + "learning_rate": 8.806934959312135e-06, + "loss": 1.09083557, + "memory(GiB)": 302.58, + "step": 94160, + "train_speed(iter/s)": 0.123304 + }, + { + "acc": 0.7414825, + "epoch": 0.5266991682593566, + "grad_norm": 7.90625, + "learning_rate": 8.806335410150112e-06, + "loss": 1.01900482, + "memory(GiB)": 302.58, + "step": 94180, + "train_speed(iter/s)": 0.123316 + }, + { + "acc": 0.73763185, + "epoch": 0.5268110177323359, + "grad_norm": 8.625, + "learning_rate": 8.805735730799353e-06, + "loss": 1.04879265, + "memory(GiB)": 302.58, + "step": 94200, + "train_speed(iter/s)": 0.123329 + }, + { + "acc": 0.73919377, + "epoch": 0.5269228672053151, + "grad_norm": 5.78125, + "learning_rate": 8.80513592128036e-06, + "loss": 1.01001463, + "memory(GiB)": 302.58, + "step": 94220, + "train_speed(iter/s)": 0.123341 + }, + { + "acc": 0.73216543, + "epoch": 0.5270347166782944, + "grad_norm": 6.46875, + "learning_rate": 8.804535981613654e-06, + "loss": 1.05213327, + "memory(GiB)": 302.58, + "step": 94240, + "train_speed(iter/s)": 0.123353 + }, + { + "acc": 0.73422103, + "epoch": 0.5271465661512736, + "grad_norm": 7.78125, + "learning_rate": 8.803935911819754e-06, + "loss": 1.03297977, + "memory(GiB)": 302.58, + "step": 94260, + "train_speed(iter/s)": 0.123366 + }, + { + "acc": 0.7579442, + "epoch": 0.5272584156242529, + "grad_norm": 5.53125, + "learning_rate": 8.803335711919183e-06, + "loss": 0.93709831, + "memory(GiB)": 302.58, + "step": 94280, + "train_speed(iter/s)": 0.123378 + }, + { + "acc": 0.73464618, + "epoch": 0.5273702650972322, + "grad_norm": 6.34375, + "learning_rate": 8.80273538193247e-06, + "loss": 1.04933577, + "memory(GiB)": 302.58, + "step": 94300, + "train_speed(iter/s)": 0.123391 + }, + { + "acc": 0.752739, + "epoch": 0.5274821145702114, + "grad_norm": 6.625, + "learning_rate": 8.802134921880151e-06, + "loss": 0.94858637, + "memory(GiB)": 302.58, + "step": 94320, + "train_speed(iter/s)": 0.123403 + }, + { + "acc": 0.74327593, + "epoch": 0.5275939640431907, + "grad_norm": 5.5625, + "learning_rate": 8.801534331782761e-06, + "loss": 1.01118565, + "memory(GiB)": 302.58, + "step": 94340, + "train_speed(iter/s)": 0.123416 + }, + { + "acc": 0.7274343, + "epoch": 0.52770581351617, + "grad_norm": 6.46875, + "learning_rate": 8.80093361166084e-06, + "loss": 1.05845051, + "memory(GiB)": 302.58, + "step": 94360, + "train_speed(iter/s)": 0.123428 + }, + { + "acc": 0.74198909, + "epoch": 0.5278176629891492, + "grad_norm": 8.625, + "learning_rate": 8.80033276153494e-06, + "loss": 1.01103535, + "memory(GiB)": 302.58, + "step": 94380, + "train_speed(iter/s)": 0.12344 + }, + { + "acc": 0.73983164, + "epoch": 0.5279295124621285, + "grad_norm": 8.5, + "learning_rate": 8.799731781425608e-06, + "loss": 1.03620338, + "memory(GiB)": 302.58, + "step": 94400, + "train_speed(iter/s)": 0.123452 + }, + { + "acc": 0.72593398, + "epoch": 0.5280413619351078, + "grad_norm": 6.5, + "learning_rate": 8.7991306713534e-06, + "loss": 1.09507427, + "memory(GiB)": 302.58, + "step": 94420, + "train_speed(iter/s)": 0.123464 + }, + { + "acc": 0.72565818, + "epoch": 0.528153211408087, + "grad_norm": 6.25, + "learning_rate": 8.798529431338877e-06, + "loss": 1.11065702, + "memory(GiB)": 302.58, + "step": 94440, + "train_speed(iter/s)": 0.123477 + }, + { + "acc": 0.73454127, + "epoch": 0.5282650608810663, + "grad_norm": 7.78125, + "learning_rate": 8.797928061402602e-06, + "loss": 1.03612623, + "memory(GiB)": 302.58, + "step": 94460, + "train_speed(iter/s)": 0.123488 + }, + { + "acc": 0.75284534, + "epoch": 0.5283769103540455, + "grad_norm": 6.71875, + "learning_rate": 8.797326561565145e-06, + "loss": 0.97326002, + "memory(GiB)": 302.58, + "step": 94480, + "train_speed(iter/s)": 0.123501 + }, + { + "acc": 0.73863726, + "epoch": 0.5284887598270248, + "grad_norm": 9.4375, + "learning_rate": 8.796724931847079e-06, + "loss": 1.02893496, + "memory(GiB)": 302.58, + "step": 94500, + "train_speed(iter/s)": 0.123512 + }, + { + "acc": 0.73626409, + "epoch": 0.5286006093000041, + "grad_norm": 5.6875, + "learning_rate": 8.79612317226898e-06, + "loss": 1.02592287, + "memory(GiB)": 302.58, + "step": 94520, + "train_speed(iter/s)": 0.123525 + }, + { + "acc": 0.74023418, + "epoch": 0.5287124587729833, + "grad_norm": 7.96875, + "learning_rate": 8.795521282851431e-06, + "loss": 1.03227444, + "memory(GiB)": 302.58, + "step": 94540, + "train_speed(iter/s)": 0.123538 + }, + { + "acc": 0.72456264, + "epoch": 0.5288243082459626, + "grad_norm": 5.6875, + "learning_rate": 8.79491926361502e-06, + "loss": 1.11358814, + "memory(GiB)": 302.58, + "step": 94560, + "train_speed(iter/s)": 0.123549 + }, + { + "acc": 0.7423902, + "epoch": 0.5289361577189419, + "grad_norm": 6.15625, + "learning_rate": 8.794317114580337e-06, + "loss": 1.01132965, + "memory(GiB)": 302.58, + "step": 94580, + "train_speed(iter/s)": 0.123562 + }, + { + "acc": 0.72921114, + "epoch": 0.5290480071919211, + "grad_norm": 9.6875, + "learning_rate": 8.793714835767977e-06, + "loss": 1.07055101, + "memory(GiB)": 302.58, + "step": 94600, + "train_speed(iter/s)": 0.123574 + }, + { + "acc": 0.73598208, + "epoch": 0.5291598566649004, + "grad_norm": 4.875, + "learning_rate": 8.79311242719854e-06, + "loss": 1.03310881, + "memory(GiB)": 302.58, + "step": 94620, + "train_speed(iter/s)": 0.123586 + }, + { + "acc": 0.74509406, + "epoch": 0.5292717061378797, + "grad_norm": 11.5, + "learning_rate": 8.79250988889263e-06, + "loss": 0.9994462, + "memory(GiB)": 302.58, + "step": 94640, + "train_speed(iter/s)": 0.123598 + }, + { + "acc": 0.74156847, + "epoch": 0.5293835556108589, + "grad_norm": 6.21875, + "learning_rate": 8.791907220870855e-06, + "loss": 1.02526417, + "memory(GiB)": 302.58, + "step": 94660, + "train_speed(iter/s)": 0.123611 + }, + { + "acc": 0.72983313, + "epoch": 0.5294954050838382, + "grad_norm": 8.6875, + "learning_rate": 8.791304423153831e-06, + "loss": 1.07742872, + "memory(GiB)": 302.58, + "step": 94680, + "train_speed(iter/s)": 0.123623 + }, + { + "acc": 0.74510579, + "epoch": 0.5296072545568175, + "grad_norm": 5.0, + "learning_rate": 8.790701495762173e-06, + "loss": 0.98856497, + "memory(GiB)": 302.58, + "step": 94700, + "train_speed(iter/s)": 0.123635 + }, + { + "acc": 0.73568888, + "epoch": 0.5297191040297967, + "grad_norm": 5.40625, + "learning_rate": 8.790098438716504e-06, + "loss": 1.04190617, + "memory(GiB)": 302.58, + "step": 94720, + "train_speed(iter/s)": 0.123648 + }, + { + "acc": 0.7555367, + "epoch": 0.529830953502776, + "grad_norm": 8.0, + "learning_rate": 8.789495252037451e-06, + "loss": 0.94596596, + "memory(GiB)": 302.58, + "step": 94740, + "train_speed(iter/s)": 0.12366 + }, + { + "acc": 0.73924732, + "epoch": 0.5299428029757552, + "grad_norm": 9.6875, + "learning_rate": 8.788891935745646e-06, + "loss": 1.00603151, + "memory(GiB)": 302.58, + "step": 94760, + "train_speed(iter/s)": 0.123672 + }, + { + "acc": 0.7369935, + "epoch": 0.5300546524487345, + "grad_norm": 4.375, + "learning_rate": 8.788288489861719e-06, + "loss": 1.04825506, + "memory(GiB)": 302.58, + "step": 94780, + "train_speed(iter/s)": 0.123684 + }, + { + "acc": 0.74775114, + "epoch": 0.5301665019217138, + "grad_norm": 9.0, + "learning_rate": 8.787684914406314e-06, + "loss": 0.99315128, + "memory(GiB)": 302.58, + "step": 94800, + "train_speed(iter/s)": 0.123697 + }, + { + "acc": 0.75078187, + "epoch": 0.530278351394693, + "grad_norm": 10.9375, + "learning_rate": 8.787081209400074e-06, + "loss": 0.95934763, + "memory(GiB)": 302.58, + "step": 94820, + "train_speed(iter/s)": 0.123709 + }, + { + "acc": 0.73055501, + "epoch": 0.5303902008676723, + "grad_norm": 6.0, + "learning_rate": 8.786477374863652e-06, + "loss": 1.08162537, + "memory(GiB)": 302.58, + "step": 94840, + "train_speed(iter/s)": 0.123722 + }, + { + "acc": 0.73225746, + "epoch": 0.5305020503406516, + "grad_norm": 5.84375, + "learning_rate": 8.785873410817692e-06, + "loss": 1.0522296, + "memory(GiB)": 302.58, + "step": 94860, + "train_speed(iter/s)": 0.123734 + }, + { + "acc": 0.73817863, + "epoch": 0.5306138998136308, + "grad_norm": 7.21875, + "learning_rate": 8.785269317282859e-06, + "loss": 1.00895929, + "memory(GiB)": 302.58, + "step": 94880, + "train_speed(iter/s)": 0.123747 + }, + { + "acc": 0.74646082, + "epoch": 0.5307257492866101, + "grad_norm": 7.34375, + "learning_rate": 8.784665094279814e-06, + "loss": 0.99465799, + "memory(GiB)": 302.58, + "step": 94900, + "train_speed(iter/s)": 0.123759 + }, + { + "acc": 0.72970252, + "epoch": 0.5308375987595894, + "grad_norm": 7.28125, + "learning_rate": 8.784060741829219e-06, + "loss": 1.07229319, + "memory(GiB)": 302.58, + "step": 94920, + "train_speed(iter/s)": 0.123771 + }, + { + "acc": 0.74950967, + "epoch": 0.5309494482325686, + "grad_norm": 7.5, + "learning_rate": 8.78345625995175e-06, + "loss": 0.98189678, + "memory(GiB)": 302.58, + "step": 94940, + "train_speed(iter/s)": 0.123783 + }, + { + "acc": 0.73148937, + "epoch": 0.5310612977055479, + "grad_norm": 8.1875, + "learning_rate": 8.78285164866808e-06, + "loss": 1.03423796, + "memory(GiB)": 302.58, + "step": 94960, + "train_speed(iter/s)": 0.123795 + }, + { + "acc": 0.73646216, + "epoch": 0.5311731471785271, + "grad_norm": 6.15625, + "learning_rate": 8.782246907998887e-06, + "loss": 1.01871214, + "memory(GiB)": 302.58, + "step": 94980, + "train_speed(iter/s)": 0.123808 + }, + { + "acc": 0.72374287, + "epoch": 0.5312849966515064, + "grad_norm": 8.0625, + "learning_rate": 8.781642037964858e-06, + "loss": 1.09817066, + "memory(GiB)": 302.58, + "step": 95000, + "train_speed(iter/s)": 0.123821 + }, + { + "acc": 0.73004808, + "epoch": 0.5313968461244857, + "grad_norm": 5.125, + "learning_rate": 8.781037038586678e-06, + "loss": 1.06901655, + "memory(GiB)": 302.58, + "step": 95020, + "train_speed(iter/s)": 0.123833 + }, + { + "acc": 0.72890248, + "epoch": 0.5315086955974649, + "grad_norm": 8.125, + "learning_rate": 8.780431909885043e-06, + "loss": 1.10862732, + "memory(GiB)": 302.58, + "step": 95040, + "train_speed(iter/s)": 0.123845 + }, + { + "acc": 0.7214663, + "epoch": 0.5316205450704442, + "grad_norm": 8.4375, + "learning_rate": 8.77982665188065e-06, + "loss": 1.09383593, + "memory(GiB)": 302.58, + "step": 95060, + "train_speed(iter/s)": 0.123858 + }, + { + "acc": 0.75290136, + "epoch": 0.5317323945434235, + "grad_norm": 7.90625, + "learning_rate": 8.7792212645942e-06, + "loss": 0.96813307, + "memory(GiB)": 302.58, + "step": 95080, + "train_speed(iter/s)": 0.12387 + }, + { + "acc": 0.74978809, + "epoch": 0.5318442440164027, + "grad_norm": 6.96875, + "learning_rate": 8.778615748046399e-06, + "loss": 0.98375883, + "memory(GiB)": 302.58, + "step": 95100, + "train_speed(iter/s)": 0.123881 + }, + { + "acc": 0.7472538, + "epoch": 0.531956093489382, + "grad_norm": 6.9375, + "learning_rate": 8.778010102257956e-06, + "loss": 0.98742571, + "memory(GiB)": 302.58, + "step": 95120, + "train_speed(iter/s)": 0.123893 + }, + { + "acc": 0.71680946, + "epoch": 0.5320679429623613, + "grad_norm": 5.125, + "learning_rate": 8.77740432724959e-06, + "loss": 1.10647726, + "memory(GiB)": 302.58, + "step": 95140, + "train_speed(iter/s)": 0.123904 + }, + { + "acc": 0.7441411, + "epoch": 0.5321797924353405, + "grad_norm": 8.1875, + "learning_rate": 8.776798423042017e-06, + "loss": 1.00031643, + "memory(GiB)": 302.58, + "step": 95160, + "train_speed(iter/s)": 0.123917 + }, + { + "acc": 0.75303783, + "epoch": 0.5322916419083198, + "grad_norm": 5.3125, + "learning_rate": 8.776192389655963e-06, + "loss": 0.96712761, + "memory(GiB)": 302.58, + "step": 95180, + "train_speed(iter/s)": 0.123928 + }, + { + "acc": 0.74994254, + "epoch": 0.532403491381299, + "grad_norm": 9.75, + "learning_rate": 8.775586227112154e-06, + "loss": 0.98496933, + "memory(GiB)": 302.58, + "step": 95200, + "train_speed(iter/s)": 0.123939 + }, + { + "acc": 0.7431776, + "epoch": 0.5325153408542783, + "grad_norm": 5.53125, + "learning_rate": 8.774979935431324e-06, + "loss": 0.9976594, + "memory(GiB)": 302.58, + "step": 95220, + "train_speed(iter/s)": 0.123951 + }, + { + "acc": 0.73919683, + "epoch": 0.5326271903272576, + "grad_norm": 6.53125, + "learning_rate": 8.774373514634211e-06, + "loss": 1.0309412, + "memory(GiB)": 302.58, + "step": 95240, + "train_speed(iter/s)": 0.123964 + }, + { + "acc": 0.73121228, + "epoch": 0.5327390398002368, + "grad_norm": 6.0625, + "learning_rate": 8.773766964741556e-06, + "loss": 1.07022581, + "memory(GiB)": 302.58, + "step": 95260, + "train_speed(iter/s)": 0.123975 + }, + { + "acc": 0.74215293, + "epoch": 0.5328508892732161, + "grad_norm": 8.5625, + "learning_rate": 8.773160285774103e-06, + "loss": 1.02819366, + "memory(GiB)": 302.58, + "step": 95280, + "train_speed(iter/s)": 0.123987 + }, + { + "acc": 0.73286839, + "epoch": 0.5329627387461954, + "grad_norm": 7.28125, + "learning_rate": 8.772553477752606e-06, + "loss": 1.04100018, + "memory(GiB)": 302.58, + "step": 95300, + "train_speed(iter/s)": 0.123999 + }, + { + "acc": 0.73676839, + "epoch": 0.5330745882191746, + "grad_norm": 8.1875, + "learning_rate": 8.771946540697816e-06, + "loss": 1.03465109, + "memory(GiB)": 302.58, + "step": 95320, + "train_speed(iter/s)": 0.124011 + }, + { + "acc": 0.74361548, + "epoch": 0.5331864376921539, + "grad_norm": 7.46875, + "learning_rate": 8.771339474630494e-06, + "loss": 0.99350948, + "memory(GiB)": 302.58, + "step": 95340, + "train_speed(iter/s)": 0.124023 + }, + { + "acc": 0.74509959, + "epoch": 0.5332982871651332, + "grad_norm": 9.5625, + "learning_rate": 8.770732279571403e-06, + "loss": 1.01409273, + "memory(GiB)": 302.58, + "step": 95360, + "train_speed(iter/s)": 0.124036 + }, + { + "acc": 0.7209156, + "epoch": 0.5334101366381124, + "grad_norm": 15.0625, + "learning_rate": 8.770124955541312e-06, + "loss": 1.09283657, + "memory(GiB)": 302.58, + "step": 95380, + "train_speed(iter/s)": 0.124048 + }, + { + "acc": 0.74947295, + "epoch": 0.5335219861110917, + "grad_norm": 6.90625, + "learning_rate": 8.769517502560993e-06, + "loss": 0.99367399, + "memory(GiB)": 302.58, + "step": 95400, + "train_speed(iter/s)": 0.124061 + }, + { + "acc": 0.72931247, + "epoch": 0.533633835584071, + "grad_norm": 7.8125, + "learning_rate": 8.768909920651221e-06, + "loss": 1.05039282, + "memory(GiB)": 302.58, + "step": 95420, + "train_speed(iter/s)": 0.124074 + }, + { + "acc": 0.74041405, + "epoch": 0.5337456850570502, + "grad_norm": 5.75, + "learning_rate": 8.768302209832781e-06, + "loss": 0.99974365, + "memory(GiB)": 302.58, + "step": 95440, + "train_speed(iter/s)": 0.124086 + }, + { + "acc": 0.73658023, + "epoch": 0.5338575345300295, + "grad_norm": 9.125, + "learning_rate": 8.767694370126457e-06, + "loss": 1.0173296, + "memory(GiB)": 302.58, + "step": 95460, + "train_speed(iter/s)": 0.124097 + }, + { + "acc": 0.73987017, + "epoch": 0.5339693840030088, + "grad_norm": 6.46875, + "learning_rate": 8.767086401553036e-06, + "loss": 1.02544956, + "memory(GiB)": 302.58, + "step": 95480, + "train_speed(iter/s)": 0.12411 + }, + { + "acc": 0.74261756, + "epoch": 0.534081233475988, + "grad_norm": 9.3125, + "learning_rate": 8.766478304133315e-06, + "loss": 1.00536032, + "memory(GiB)": 302.58, + "step": 95500, + "train_speed(iter/s)": 0.124121 + }, + { + "acc": 0.74280734, + "epoch": 0.5341930829489673, + "grad_norm": 10.125, + "learning_rate": 8.765870077888095e-06, + "loss": 1.01641846, + "memory(GiB)": 302.58, + "step": 95520, + "train_speed(iter/s)": 0.124133 + }, + { + "acc": 0.74505091, + "epoch": 0.5343049324219465, + "grad_norm": 8.4375, + "learning_rate": 8.765261722838173e-06, + "loss": 0.99678135, + "memory(GiB)": 302.58, + "step": 95540, + "train_speed(iter/s)": 0.124145 + }, + { + "acc": 0.72587061, + "epoch": 0.5344167818949258, + "grad_norm": 7.90625, + "learning_rate": 8.764653239004365e-06, + "loss": 1.08929491, + "memory(GiB)": 302.58, + "step": 95560, + "train_speed(iter/s)": 0.124157 + }, + { + "acc": 0.74032793, + "epoch": 0.5345286313679051, + "grad_norm": 6.125, + "learning_rate": 8.764044626407478e-06, + "loss": 1.03310547, + "memory(GiB)": 302.58, + "step": 95580, + "train_speed(iter/s)": 0.124169 + }, + { + "acc": 0.72129378, + "epoch": 0.5346404808408843, + "grad_norm": 7.34375, + "learning_rate": 8.763435885068329e-06, + "loss": 1.10795889, + "memory(GiB)": 302.58, + "step": 95600, + "train_speed(iter/s)": 0.124181 + }, + { + "acc": 0.73613801, + "epoch": 0.5347523303138636, + "grad_norm": 5.4375, + "learning_rate": 8.762827015007739e-06, + "loss": 1.0532877, + "memory(GiB)": 302.58, + "step": 95620, + "train_speed(iter/s)": 0.124194 + }, + { + "acc": 0.73658023, + "epoch": 0.5348641797868429, + "grad_norm": 7.53125, + "learning_rate": 8.762218016246534e-06, + "loss": 1.02887449, + "memory(GiB)": 302.58, + "step": 95640, + "train_speed(iter/s)": 0.124207 + }, + { + "acc": 0.73959293, + "epoch": 0.5349760292598221, + "grad_norm": 6.78125, + "learning_rate": 8.761608888805542e-06, + "loss": 1.01521406, + "memory(GiB)": 302.58, + "step": 95660, + "train_speed(iter/s)": 0.124219 + }, + { + "acc": 0.75559478, + "epoch": 0.5350878787328014, + "grad_norm": 7.1875, + "learning_rate": 8.760999632705599e-06, + "loss": 0.94371328, + "memory(GiB)": 302.58, + "step": 95680, + "train_speed(iter/s)": 0.124232 + }, + { + "acc": 0.73092527, + "epoch": 0.5351997282057807, + "grad_norm": 4.75, + "learning_rate": 8.760390247967543e-06, + "loss": 1.03826189, + "memory(GiB)": 302.58, + "step": 95700, + "train_speed(iter/s)": 0.124244 + }, + { + "acc": 0.73734388, + "epoch": 0.5353115776787599, + "grad_norm": 8.9375, + "learning_rate": 8.759780734612216e-06, + "loss": 1.04163532, + "memory(GiB)": 302.58, + "step": 95720, + "train_speed(iter/s)": 0.124257 + }, + { + "acc": 0.74709849, + "epoch": 0.5354234271517392, + "grad_norm": 5.34375, + "learning_rate": 8.759171092660466e-06, + "loss": 1.00301275, + "memory(GiB)": 302.58, + "step": 95740, + "train_speed(iter/s)": 0.12427 + }, + { + "acc": 0.75275092, + "epoch": 0.5355352766247184, + "grad_norm": 7.1875, + "learning_rate": 8.758561322133145e-06, + "loss": 0.95290165, + "memory(GiB)": 302.58, + "step": 95760, + "train_speed(iter/s)": 0.124283 + }, + { + "acc": 0.737113, + "epoch": 0.5356471260976977, + "grad_norm": 6.84375, + "learning_rate": 8.75795142305111e-06, + "loss": 1.01906528, + "memory(GiB)": 302.58, + "step": 95780, + "train_speed(iter/s)": 0.124295 + }, + { + "acc": 0.7463954, + "epoch": 0.535758975570677, + "grad_norm": 9.4375, + "learning_rate": 8.757341395435218e-06, + "loss": 1.00872602, + "memory(GiB)": 302.58, + "step": 95800, + "train_speed(iter/s)": 0.124308 + }, + { + "acc": 0.73987465, + "epoch": 0.5358708250436562, + "grad_norm": 6.5625, + "learning_rate": 8.756731239306335e-06, + "loss": 1.01804132, + "memory(GiB)": 302.58, + "step": 95820, + "train_speed(iter/s)": 0.124321 + }, + { + "acc": 0.72820139, + "epoch": 0.5359826745166355, + "grad_norm": 4.59375, + "learning_rate": 8.756120954685335e-06, + "loss": 1.08540297, + "memory(GiB)": 302.58, + "step": 95840, + "train_speed(iter/s)": 0.124332 + }, + { + "acc": 0.72454777, + "epoch": 0.5360945239896148, + "grad_norm": 8.0625, + "learning_rate": 8.755510541593085e-06, + "loss": 1.10476866, + "memory(GiB)": 302.58, + "step": 95860, + "train_speed(iter/s)": 0.124345 + }, + { + "acc": 0.73196716, + "epoch": 0.536206373462594, + "grad_norm": 7.875, + "learning_rate": 8.754900000050468e-06, + "loss": 1.05405664, + "memory(GiB)": 302.58, + "step": 95880, + "train_speed(iter/s)": 0.124356 + }, + { + "acc": 0.7369772, + "epoch": 0.5363182229355733, + "grad_norm": 8.8125, + "learning_rate": 8.754289330078363e-06, + "loss": 1.03243923, + "memory(GiB)": 302.58, + "step": 95900, + "train_speed(iter/s)": 0.124369 + }, + { + "acc": 0.72878184, + "epoch": 0.5364300724085526, + "grad_norm": 8.3125, + "learning_rate": 8.753678531697659e-06, + "loss": 1.06395102, + "memory(GiB)": 302.58, + "step": 95920, + "train_speed(iter/s)": 0.124381 + }, + { + "acc": 0.74574103, + "epoch": 0.5365419218815318, + "grad_norm": 4.53125, + "learning_rate": 8.753067604929246e-06, + "loss": 0.98074179, + "memory(GiB)": 302.58, + "step": 95940, + "train_speed(iter/s)": 0.124394 + }, + { + "acc": 0.73055296, + "epoch": 0.5366537713545111, + "grad_norm": 5.3125, + "learning_rate": 8.752456549794022e-06, + "loss": 1.05979271, + "memory(GiB)": 302.58, + "step": 95960, + "train_speed(iter/s)": 0.124407 + }, + { + "acc": 0.74365973, + "epoch": 0.5367656208274904, + "grad_norm": 8.25, + "learning_rate": 8.751845366312884e-06, + "loss": 0.99526606, + "memory(GiB)": 302.58, + "step": 95980, + "train_speed(iter/s)": 0.124419 + }, + { + "acc": 0.7434176, + "epoch": 0.5368774703004696, + "grad_norm": 8.625, + "learning_rate": 8.75123405450674e-06, + "loss": 1.01010742, + "memory(GiB)": 302.58, + "step": 96000, + "train_speed(iter/s)": 0.124431 + }, + { + "epoch": 0.5368774703004696, + "eval_acc": 0.7020360755853821, + "eval_loss": 1.0329664945602417, + "eval_runtime": 7501.4963, + "eval_samples_per_second": 10.036, + "eval_steps_per_second": 10.036, + "step": 96000 + }, + { + "acc": 0.73752236, + "epoch": 0.5369893197734489, + "grad_norm": 6.40625, + "learning_rate": 8.750622614396494e-06, + "loss": 1.03608942, + "memory(GiB)": 302.58, + "step": 96020, + "train_speed(iter/s)": 0.123224 + }, + { + "acc": 0.74698892, + "epoch": 0.5371011692464281, + "grad_norm": 6.8125, + "learning_rate": 8.750011046003062e-06, + "loss": 0.98343201, + "memory(GiB)": 302.58, + "step": 96040, + "train_speed(iter/s)": 0.123237 + }, + { + "acc": 0.73427181, + "epoch": 0.5372130187194074, + "grad_norm": 6.78125, + "learning_rate": 8.74939934934736e-06, + "loss": 1.05857162, + "memory(GiB)": 302.58, + "step": 96060, + "train_speed(iter/s)": 0.123248 + }, + { + "acc": 0.7253087, + "epoch": 0.5373248681923867, + "grad_norm": 5.875, + "learning_rate": 8.748787524450313e-06, + "loss": 1.07976065, + "memory(GiB)": 302.58, + "step": 96080, + "train_speed(iter/s)": 0.123261 + }, + { + "acc": 0.74238248, + "epoch": 0.5374367176653659, + "grad_norm": 6.8125, + "learning_rate": 8.748175571332846e-06, + "loss": 1.01009398, + "memory(GiB)": 302.58, + "step": 96100, + "train_speed(iter/s)": 0.123273 + }, + { + "acc": 0.73983126, + "epoch": 0.5375485671383452, + "grad_norm": 6.625, + "learning_rate": 8.74756349001589e-06, + "loss": 1.01948671, + "memory(GiB)": 302.58, + "step": 96120, + "train_speed(iter/s)": 0.123285 + }, + { + "acc": 0.73699641, + "epoch": 0.5376604166113245, + "grad_norm": 7.375, + "learning_rate": 8.746951280520377e-06, + "loss": 1.03705072, + "memory(GiB)": 302.58, + "step": 96140, + "train_speed(iter/s)": 0.123297 + }, + { + "acc": 0.73530722, + "epoch": 0.5377722660843037, + "grad_norm": 6.40625, + "learning_rate": 8.746338942867248e-06, + "loss": 1.05397425, + "memory(GiB)": 302.58, + "step": 96160, + "train_speed(iter/s)": 0.123309 + }, + { + "acc": 0.73786755, + "epoch": 0.537884115557283, + "grad_norm": 6.3125, + "learning_rate": 8.745726477077451e-06, + "loss": 1.03388577, + "memory(GiB)": 302.58, + "step": 96180, + "train_speed(iter/s)": 0.123321 + }, + { + "acc": 0.74313989, + "epoch": 0.5379959650302623, + "grad_norm": 9.375, + "learning_rate": 8.745113883171929e-06, + "loss": 1.00687199, + "memory(GiB)": 302.58, + "step": 96200, + "train_speed(iter/s)": 0.123334 + }, + { + "acc": 0.74386802, + "epoch": 0.5381078145032415, + "grad_norm": 7.21875, + "learning_rate": 8.744501161171639e-06, + "loss": 0.99268341, + "memory(GiB)": 302.58, + "step": 96220, + "train_speed(iter/s)": 0.123346 + }, + { + "acc": 0.73689394, + "epoch": 0.5382196639762208, + "grad_norm": 5.5, + "learning_rate": 8.743888311097534e-06, + "loss": 1.03463116, + "memory(GiB)": 302.58, + "step": 96240, + "train_speed(iter/s)": 0.123359 + }, + { + "acc": 0.75129743, + "epoch": 0.5383315134492, + "grad_norm": 7.125, + "learning_rate": 8.743275332970577e-06, + "loss": 0.98690481, + "memory(GiB)": 302.58, + "step": 96260, + "train_speed(iter/s)": 0.123371 + }, + { + "acc": 0.73072138, + "epoch": 0.5384433629221793, + "grad_norm": 7.0, + "learning_rate": 8.742662226811736e-06, + "loss": 1.07187929, + "memory(GiB)": 302.58, + "step": 96280, + "train_speed(iter/s)": 0.123383 + }, + { + "acc": 0.74384475, + "epoch": 0.5385552123951586, + "grad_norm": 4.5625, + "learning_rate": 8.742048992641977e-06, + "loss": 1.00976744, + "memory(GiB)": 302.58, + "step": 96300, + "train_speed(iter/s)": 0.123396 + }, + { + "acc": 0.73789105, + "epoch": 0.5386670618681378, + "grad_norm": 6.875, + "learning_rate": 8.741435630482277e-06, + "loss": 1.03348389, + "memory(GiB)": 302.58, + "step": 96320, + "train_speed(iter/s)": 0.123408 + }, + { + "acc": 0.72319112, + "epoch": 0.5387789113411171, + "grad_norm": 8.1875, + "learning_rate": 8.740822140353615e-06, + "loss": 1.09975119, + "memory(GiB)": 302.58, + "step": 96340, + "train_speed(iter/s)": 0.12342 + }, + { + "acc": 0.74490213, + "epoch": 0.5388907608140964, + "grad_norm": 5.78125, + "learning_rate": 8.740208522276976e-06, + "loss": 1.00234232, + "memory(GiB)": 302.58, + "step": 96360, + "train_speed(iter/s)": 0.123433 + }, + { + "acc": 0.7366147, + "epoch": 0.5390026102870756, + "grad_norm": 8.4375, + "learning_rate": 8.739594776273343e-06, + "loss": 1.03304186, + "memory(GiB)": 302.58, + "step": 96380, + "train_speed(iter/s)": 0.123445 + }, + { + "acc": 0.73838534, + "epoch": 0.5391144597600549, + "grad_norm": 9.0625, + "learning_rate": 8.738980902363712e-06, + "loss": 1.01855001, + "memory(GiB)": 302.58, + "step": 96400, + "train_speed(iter/s)": 0.123456 + }, + { + "acc": 0.73630729, + "epoch": 0.5392263092330342, + "grad_norm": 5.5, + "learning_rate": 8.738366900569078e-06, + "loss": 1.03904152, + "memory(GiB)": 302.58, + "step": 96420, + "train_speed(iter/s)": 0.123469 + }, + { + "acc": 0.7240025, + "epoch": 0.5393381587060134, + "grad_norm": 6.5625, + "learning_rate": 8.737752770910442e-06, + "loss": 1.1158824, + "memory(GiB)": 302.58, + "step": 96440, + "train_speed(iter/s)": 0.123482 + }, + { + "acc": 0.7478941, + "epoch": 0.5394500081789927, + "grad_norm": 6.625, + "learning_rate": 8.737138513408808e-06, + "loss": 0.97977486, + "memory(GiB)": 302.58, + "step": 96460, + "train_speed(iter/s)": 0.123493 + }, + { + "acc": 0.72307253, + "epoch": 0.539561857651972, + "grad_norm": 7.09375, + "learning_rate": 8.736524128085187e-06, + "loss": 1.10007706, + "memory(GiB)": 302.58, + "step": 96480, + "train_speed(iter/s)": 0.123505 + }, + { + "acc": 0.7547471, + "epoch": 0.5396737071249512, + "grad_norm": 6.59375, + "learning_rate": 8.735909614960594e-06, + "loss": 0.95269384, + "memory(GiB)": 302.58, + "step": 96500, + "train_speed(iter/s)": 0.123517 + }, + { + "acc": 0.73879447, + "epoch": 0.5397855565979305, + "grad_norm": 5.84375, + "learning_rate": 8.735294974056045e-06, + "loss": 1.05862665, + "memory(GiB)": 302.58, + "step": 96520, + "train_speed(iter/s)": 0.123529 + }, + { + "acc": 0.76374459, + "epoch": 0.5398974060709097, + "grad_norm": 6.875, + "learning_rate": 8.734680205392564e-06, + "loss": 0.91718874, + "memory(GiB)": 302.58, + "step": 96540, + "train_speed(iter/s)": 0.123541 + }, + { + "acc": 0.74390106, + "epoch": 0.540009255543889, + "grad_norm": 8.8125, + "learning_rate": 8.734065308991178e-06, + "loss": 1.02466803, + "memory(GiB)": 302.58, + "step": 96560, + "train_speed(iter/s)": 0.123552 + }, + { + "acc": 0.72874699, + "epoch": 0.5401211050168683, + "grad_norm": 6.78125, + "learning_rate": 8.733450284872917e-06, + "loss": 1.0763772, + "memory(GiB)": 302.58, + "step": 96580, + "train_speed(iter/s)": 0.123564 + }, + { + "acc": 0.74092917, + "epoch": 0.5402329544898475, + "grad_norm": 5.6875, + "learning_rate": 8.732835133058816e-06, + "loss": 1.04979124, + "memory(GiB)": 302.58, + "step": 96600, + "train_speed(iter/s)": 0.123576 + }, + { + "acc": 0.73447552, + "epoch": 0.5403448039628268, + "grad_norm": 8.3125, + "learning_rate": 8.732219853569922e-06, + "loss": 1.03042173, + "memory(GiB)": 302.58, + "step": 96620, + "train_speed(iter/s)": 0.123589 + }, + { + "acc": 0.73290796, + "epoch": 0.5404566534358061, + "grad_norm": 8.5, + "learning_rate": 8.73160444642727e-06, + "loss": 1.06748571, + "memory(GiB)": 302.58, + "step": 96640, + "train_speed(iter/s)": 0.123601 + }, + { + "acc": 0.72798481, + "epoch": 0.5405685029087853, + "grad_norm": 7.375, + "learning_rate": 8.730988911651916e-06, + "loss": 1.06567001, + "memory(GiB)": 302.58, + "step": 96660, + "train_speed(iter/s)": 0.123613 + }, + { + "acc": 0.74058194, + "epoch": 0.5406803523817646, + "grad_norm": 6.375, + "learning_rate": 8.73037324926491e-06, + "loss": 1.02713881, + "memory(GiB)": 302.58, + "step": 96680, + "train_speed(iter/s)": 0.123625 + }, + { + "acc": 0.72902369, + "epoch": 0.5407922018547439, + "grad_norm": 5.78125, + "learning_rate": 8.729757459287308e-06, + "loss": 1.06874247, + "memory(GiB)": 302.58, + "step": 96700, + "train_speed(iter/s)": 0.123637 + }, + { + "acc": 0.74732485, + "epoch": 0.5409040513277231, + "grad_norm": 10.4375, + "learning_rate": 8.729141541740175e-06, + "loss": 1.00134315, + "memory(GiB)": 302.58, + "step": 96720, + "train_speed(iter/s)": 0.123649 + }, + { + "acc": 0.73030987, + "epoch": 0.5410159008007024, + "grad_norm": 7.40625, + "learning_rate": 8.728525496644578e-06, + "loss": 1.07089777, + "memory(GiB)": 302.58, + "step": 96740, + "train_speed(iter/s)": 0.123659 + }, + { + "acc": 0.72287326, + "epoch": 0.5411277502736817, + "grad_norm": 10.8125, + "learning_rate": 8.727909324021586e-06, + "loss": 1.12343168, + "memory(GiB)": 302.58, + "step": 96760, + "train_speed(iter/s)": 0.123671 + }, + { + "acc": 0.7477118, + "epoch": 0.5412395997466609, + "grad_norm": 6.875, + "learning_rate": 8.727293023892273e-06, + "loss": 1.00043097, + "memory(GiB)": 302.58, + "step": 96780, + "train_speed(iter/s)": 0.123683 + }, + { + "acc": 0.75211139, + "epoch": 0.5413514492196402, + "grad_norm": 6.375, + "learning_rate": 8.726676596277722e-06, + "loss": 0.97007666, + "memory(GiB)": 302.58, + "step": 96800, + "train_speed(iter/s)": 0.123696 + }, + { + "acc": 0.73396826, + "epoch": 0.5414632986926194, + "grad_norm": 6.5625, + "learning_rate": 8.726060041199012e-06, + "loss": 1.03769732, + "memory(GiB)": 302.58, + "step": 96820, + "train_speed(iter/s)": 0.123708 + }, + { + "acc": 0.73813472, + "epoch": 0.5415751481655987, + "grad_norm": 9.0625, + "learning_rate": 8.725443358677235e-06, + "loss": 1.02647963, + "memory(GiB)": 302.58, + "step": 96840, + "train_speed(iter/s)": 0.12372 + }, + { + "acc": 0.73655505, + "epoch": 0.541686997638578, + "grad_norm": 5.5, + "learning_rate": 8.72482654873348e-06, + "loss": 1.01330385, + "memory(GiB)": 302.58, + "step": 96860, + "train_speed(iter/s)": 0.123731 + }, + { + "acc": 0.74102225, + "epoch": 0.5417988471115572, + "grad_norm": 6.8125, + "learning_rate": 8.724209611388847e-06, + "loss": 1.00507917, + "memory(GiB)": 302.58, + "step": 96880, + "train_speed(iter/s)": 0.123743 + }, + { + "acc": 0.72990355, + "epoch": 0.5419106965845365, + "grad_norm": 7.875, + "learning_rate": 8.723592546664438e-06, + "loss": 1.0838048, + "memory(GiB)": 302.58, + "step": 96900, + "train_speed(iter/s)": 0.123754 + }, + { + "acc": 0.72421122, + "epoch": 0.5420225460575158, + "grad_norm": 7.53125, + "learning_rate": 8.722975354581355e-06, + "loss": 1.08740358, + "memory(GiB)": 302.58, + "step": 96920, + "train_speed(iter/s)": 0.123765 + }, + { + "acc": 0.74059906, + "epoch": 0.542134395530495, + "grad_norm": 4.65625, + "learning_rate": 8.72235803516071e-06, + "loss": 1.02351103, + "memory(GiB)": 302.58, + "step": 96940, + "train_speed(iter/s)": 0.123776 + }, + { + "acc": 0.73306069, + "epoch": 0.5422462450034743, + "grad_norm": 7.90625, + "learning_rate": 8.721740588423617e-06, + "loss": 1.04758863, + "memory(GiB)": 302.58, + "step": 96960, + "train_speed(iter/s)": 0.123789 + }, + { + "acc": 0.73807607, + "epoch": 0.5423580944764536, + "grad_norm": 8.75, + "learning_rate": 8.721123014391195e-06, + "loss": 1.02645874, + "memory(GiB)": 302.58, + "step": 96980, + "train_speed(iter/s)": 0.1238 + }, + { + "acc": 0.72266512, + "epoch": 0.5424699439494328, + "grad_norm": 9.75, + "learning_rate": 8.720505313084566e-06, + "loss": 1.09324284, + "memory(GiB)": 302.58, + "step": 97000, + "train_speed(iter/s)": 0.123813 + }, + { + "acc": 0.73069019, + "epoch": 0.5425817934224121, + "grad_norm": 6.0, + "learning_rate": 8.719887484524856e-06, + "loss": 1.07362356, + "memory(GiB)": 302.58, + "step": 97020, + "train_speed(iter/s)": 0.123825 + }, + { + "acc": 0.73342795, + "epoch": 0.5426936428953913, + "grad_norm": 9.125, + "learning_rate": 8.7192695287332e-06, + "loss": 1.04628325, + "memory(GiB)": 302.58, + "step": 97040, + "train_speed(iter/s)": 0.123836 + }, + { + "acc": 0.74445519, + "epoch": 0.5428054923683706, + "grad_norm": 7.21875, + "learning_rate": 8.718651445730735e-06, + "loss": 1.01144714, + "memory(GiB)": 302.58, + "step": 97060, + "train_speed(iter/s)": 0.123848 + }, + { + "acc": 0.74316707, + "epoch": 0.5429173418413499, + "grad_norm": 6.78125, + "learning_rate": 8.718033235538596e-06, + "loss": 1.00695591, + "memory(GiB)": 302.58, + "step": 97080, + "train_speed(iter/s)": 0.123861 + }, + { + "acc": 0.75327244, + "epoch": 0.5430291913143291, + "grad_norm": 9.4375, + "learning_rate": 8.717414898177932e-06, + "loss": 0.95890579, + "memory(GiB)": 302.58, + "step": 97100, + "train_speed(iter/s)": 0.123873 + }, + { + "acc": 0.73860741, + "epoch": 0.5431410407873084, + "grad_norm": 5.0625, + "learning_rate": 8.716796433669891e-06, + "loss": 1.02880974, + "memory(GiB)": 302.58, + "step": 97120, + "train_speed(iter/s)": 0.123885 + }, + { + "acc": 0.72543859, + "epoch": 0.5432528902602877, + "grad_norm": 6.59375, + "learning_rate": 8.716177842035626e-06, + "loss": 1.07931662, + "memory(GiB)": 302.58, + "step": 97140, + "train_speed(iter/s)": 0.123898 + }, + { + "acc": 0.72765274, + "epoch": 0.5433647397332669, + "grad_norm": 8.375, + "learning_rate": 8.715559123296296e-06, + "loss": 1.10307665, + "memory(GiB)": 302.58, + "step": 97160, + "train_speed(iter/s)": 0.12391 + }, + { + "acc": 0.74413342, + "epoch": 0.5434765892062462, + "grad_norm": 9.6875, + "learning_rate": 8.71494027747306e-06, + "loss": 0.99612932, + "memory(GiB)": 302.58, + "step": 97180, + "train_speed(iter/s)": 0.123922 + }, + { + "acc": 0.75132632, + "epoch": 0.5435884386792255, + "grad_norm": 9.0625, + "learning_rate": 8.714321304587088e-06, + "loss": 0.9675437, + "memory(GiB)": 302.58, + "step": 97200, + "train_speed(iter/s)": 0.123935 + }, + { + "acc": 0.74749784, + "epoch": 0.5437002881522047, + "grad_norm": 6.75, + "learning_rate": 8.713702204659552e-06, + "loss": 0.97770758, + "memory(GiB)": 302.58, + "step": 97220, + "train_speed(iter/s)": 0.123947 + }, + { + "acc": 0.73237858, + "epoch": 0.543812137625184, + "grad_norm": 8.3125, + "learning_rate": 8.71308297771162e-06, + "loss": 1.06864433, + "memory(GiB)": 302.58, + "step": 97240, + "train_speed(iter/s)": 0.123959 + }, + { + "acc": 0.74888325, + "epoch": 0.5439239870981633, + "grad_norm": 8.1875, + "learning_rate": 8.71246362376448e-06, + "loss": 0.97865038, + "memory(GiB)": 302.58, + "step": 97260, + "train_speed(iter/s)": 0.123971 + }, + { + "acc": 0.73211718, + "epoch": 0.5440358365711425, + "grad_norm": 7.6875, + "learning_rate": 8.71184414283931e-06, + "loss": 1.02818089, + "memory(GiB)": 302.58, + "step": 97280, + "train_speed(iter/s)": 0.123982 + }, + { + "acc": 0.73145037, + "epoch": 0.5441476860441218, + "grad_norm": 7.96875, + "learning_rate": 8.711224534957301e-06, + "loss": 1.06219149, + "memory(GiB)": 302.58, + "step": 97300, + "train_speed(iter/s)": 0.123995 + }, + { + "acc": 0.74013443, + "epoch": 0.544259535517101, + "grad_norm": 6.9375, + "learning_rate": 8.710604800139647e-06, + "loss": 1.02428522, + "memory(GiB)": 302.58, + "step": 97320, + "train_speed(iter/s)": 0.124006 + }, + { + "acc": 0.74306855, + "epoch": 0.5443713849900803, + "grad_norm": 8.75, + "learning_rate": 8.70998493840754e-06, + "loss": 1.02045269, + "memory(GiB)": 302.58, + "step": 97340, + "train_speed(iter/s)": 0.124017 + }, + { + "acc": 0.73292184, + "epoch": 0.5444832344630596, + "grad_norm": 7.0625, + "learning_rate": 8.709364949782186e-06, + "loss": 1.04882078, + "memory(GiB)": 302.58, + "step": 97360, + "train_speed(iter/s)": 0.12403 + }, + { + "acc": 0.72395763, + "epoch": 0.5445950839360388, + "grad_norm": 6.53125, + "learning_rate": 8.708744834284789e-06, + "loss": 1.09385386, + "memory(GiB)": 302.58, + "step": 97380, + "train_speed(iter/s)": 0.124042 + }, + { + "acc": 0.74427013, + "epoch": 0.5447069334090181, + "grad_norm": 8.0625, + "learning_rate": 8.708124591936555e-06, + "loss": 1.00976763, + "memory(GiB)": 302.58, + "step": 97400, + "train_speed(iter/s)": 0.124054 + }, + { + "acc": 0.74032679, + "epoch": 0.5448187828819974, + "grad_norm": 6.71875, + "learning_rate": 8.707504222758705e-06, + "loss": 1.01562357, + "memory(GiB)": 302.58, + "step": 97420, + "train_speed(iter/s)": 0.124066 + }, + { + "acc": 0.73474441, + "epoch": 0.5449306323549766, + "grad_norm": 6.59375, + "learning_rate": 8.706883726772454e-06, + "loss": 1.05366697, + "memory(GiB)": 302.58, + "step": 97440, + "train_speed(iter/s)": 0.124077 + }, + { + "acc": 0.74255576, + "epoch": 0.5450424818279559, + "grad_norm": 9.1875, + "learning_rate": 8.706263103999024e-06, + "loss": 1.01971893, + "memory(GiB)": 302.58, + "step": 97460, + "train_speed(iter/s)": 0.124089 + }, + { + "acc": 0.74017177, + "epoch": 0.5451543313009352, + "grad_norm": 8.25, + "learning_rate": 8.705642354459644e-06, + "loss": 1.02381716, + "memory(GiB)": 302.58, + "step": 97480, + "train_speed(iter/s)": 0.124101 + }, + { + "acc": 0.73288841, + "epoch": 0.5452661807739144, + "grad_norm": 7.3125, + "learning_rate": 8.705021478175547e-06, + "loss": 1.03173361, + "memory(GiB)": 302.58, + "step": 97500, + "train_speed(iter/s)": 0.124113 + }, + { + "acc": 0.72681937, + "epoch": 0.5453780302468937, + "grad_norm": 8.375, + "learning_rate": 8.704400475167966e-06, + "loss": 1.06438065, + "memory(GiB)": 302.58, + "step": 97520, + "train_speed(iter/s)": 0.124125 + }, + { + "acc": 0.7488955, + "epoch": 0.545489879719873, + "grad_norm": 6.1875, + "learning_rate": 8.703779345458143e-06, + "loss": 0.98301344, + "memory(GiB)": 302.58, + "step": 97540, + "train_speed(iter/s)": 0.124137 + }, + { + "acc": 0.73431821, + "epoch": 0.5456017291928522, + "grad_norm": 9.375, + "learning_rate": 8.70315808906732e-06, + "loss": 1.05900965, + "memory(GiB)": 302.58, + "step": 97560, + "train_speed(iter/s)": 0.124149 + }, + { + "acc": 0.74801397, + "epoch": 0.5457135786658315, + "grad_norm": 8.875, + "learning_rate": 8.70253670601675e-06, + "loss": 1.00013094, + "memory(GiB)": 302.58, + "step": 97580, + "train_speed(iter/s)": 0.124161 + }, + { + "acc": 0.73300819, + "epoch": 0.5458254281388107, + "grad_norm": 4.71875, + "learning_rate": 8.701915196327683e-06, + "loss": 1.05092916, + "memory(GiB)": 302.58, + "step": 97600, + "train_speed(iter/s)": 0.124173 + }, + { + "acc": 0.74113851, + "epoch": 0.54593727761179, + "grad_norm": 5.625, + "learning_rate": 8.70129356002138e-06, + "loss": 1.02239857, + "memory(GiB)": 302.58, + "step": 97620, + "train_speed(iter/s)": 0.124185 + }, + { + "acc": 0.74372525, + "epoch": 0.5460491270847693, + "grad_norm": 4.78125, + "learning_rate": 8.700671797119098e-06, + "loss": 1.00842648, + "memory(GiB)": 302.58, + "step": 97640, + "train_speed(iter/s)": 0.124197 + }, + { + "acc": 0.72731819, + "epoch": 0.5461609765577485, + "grad_norm": 7.09375, + "learning_rate": 8.700049907642107e-06, + "loss": 1.088276, + "memory(GiB)": 302.58, + "step": 97660, + "train_speed(iter/s)": 0.124209 + }, + { + "acc": 0.72056575, + "epoch": 0.5462728260307278, + "grad_norm": 6.78125, + "learning_rate": 8.699427891611675e-06, + "loss": 1.11067009, + "memory(GiB)": 302.58, + "step": 97680, + "train_speed(iter/s)": 0.124222 + }, + { + "acc": 0.73103709, + "epoch": 0.5463846755037071, + "grad_norm": 7.5, + "learning_rate": 8.698805749049081e-06, + "loss": 1.06425514, + "memory(GiB)": 302.58, + "step": 97700, + "train_speed(iter/s)": 0.124233 + }, + { + "acc": 0.7404779, + "epoch": 0.5464965249766863, + "grad_norm": 5.5625, + "learning_rate": 8.698183479975601e-06, + "loss": 1.02429848, + "memory(GiB)": 302.58, + "step": 97720, + "train_speed(iter/s)": 0.124245 + }, + { + "acc": 0.73341646, + "epoch": 0.5466083744496656, + "grad_norm": 6.8125, + "learning_rate": 8.69756108441252e-06, + "loss": 1.05421438, + "memory(GiB)": 302.58, + "step": 97740, + "train_speed(iter/s)": 0.124256 + }, + { + "acc": 0.73820977, + "epoch": 0.5467202239226449, + "grad_norm": 6.09375, + "learning_rate": 8.696938562381125e-06, + "loss": 1.03778992, + "memory(GiB)": 302.58, + "step": 97760, + "train_speed(iter/s)": 0.124268 + }, + { + "acc": 0.73870387, + "epoch": 0.5468320733956241, + "grad_norm": 8.8125, + "learning_rate": 8.696315913902708e-06, + "loss": 1.01413937, + "memory(GiB)": 302.58, + "step": 97780, + "train_speed(iter/s)": 0.124279 + }, + { + "acc": 0.7484889, + "epoch": 0.5469439228686034, + "grad_norm": 7.375, + "learning_rate": 8.695693138998567e-06, + "loss": 0.97544346, + "memory(GiB)": 302.58, + "step": 97800, + "train_speed(iter/s)": 0.124291 + }, + { + "acc": 0.74282246, + "epoch": 0.5470557723415826, + "grad_norm": 8.1875, + "learning_rate": 8.69507023769e-06, + "loss": 0.99959526, + "memory(GiB)": 302.58, + "step": 97820, + "train_speed(iter/s)": 0.124304 + }, + { + "acc": 0.73728991, + "epoch": 0.5471676218145619, + "grad_norm": 7.4375, + "learning_rate": 8.694447209998315e-06, + "loss": 1.04152279, + "memory(GiB)": 302.58, + "step": 97840, + "train_speed(iter/s)": 0.124315 + }, + { + "acc": 0.74234128, + "epoch": 0.5472794712875412, + "grad_norm": 9.5625, + "learning_rate": 8.693824055944822e-06, + "loss": 0.99502068, + "memory(GiB)": 302.58, + "step": 97860, + "train_speed(iter/s)": 0.124326 + }, + { + "acc": 0.73175778, + "epoch": 0.5473913207605204, + "grad_norm": 6.15625, + "learning_rate": 8.693200775550832e-06, + "loss": 1.04586391, + "memory(GiB)": 302.58, + "step": 97880, + "train_speed(iter/s)": 0.124338 + }, + { + "acc": 0.74296913, + "epoch": 0.5475031702334997, + "grad_norm": 7.46875, + "learning_rate": 8.692577368837665e-06, + "loss": 1.01030598, + "memory(GiB)": 302.58, + "step": 97900, + "train_speed(iter/s)": 0.124351 + }, + { + "acc": 0.73520207, + "epoch": 0.547615019706479, + "grad_norm": 11.9375, + "learning_rate": 8.691953835826643e-06, + "loss": 1.05544109, + "memory(GiB)": 302.58, + "step": 97920, + "train_speed(iter/s)": 0.124362 + }, + { + "acc": 0.73576174, + "epoch": 0.5477268691794582, + "grad_norm": 7.125, + "learning_rate": 8.691330176539093e-06, + "loss": 1.01994276, + "memory(GiB)": 302.58, + "step": 97940, + "train_speed(iter/s)": 0.124374 + }, + { + "acc": 0.74497075, + "epoch": 0.5478387186524375, + "grad_norm": 5.40625, + "learning_rate": 8.690706390996346e-06, + "loss": 1.01108236, + "memory(GiB)": 302.58, + "step": 97960, + "train_speed(iter/s)": 0.124386 + }, + { + "acc": 0.7270256, + "epoch": 0.5479505681254168, + "grad_norm": 7.15625, + "learning_rate": 8.69008247921974e-06, + "loss": 1.10505857, + "memory(GiB)": 302.58, + "step": 97980, + "train_speed(iter/s)": 0.124399 + }, + { + "acc": 0.72976751, + "epoch": 0.548062417598396, + "grad_norm": 10.4375, + "learning_rate": 8.68945844123061e-06, + "loss": 1.06130896, + "memory(GiB)": 302.58, + "step": 98000, + "train_speed(iter/s)": 0.12441 + }, + { + "epoch": 0.548062417598396, + "eval_acc": 0.7022210371274802, + "eval_loss": 1.0320801734924316, + "eval_runtime": 7507.0212, + "eval_samples_per_second": 10.028, + "eval_steps_per_second": 10.028, + "step": 98000 + }, + { + "acc": 0.73311901, + "epoch": 0.5481742670713754, + "grad_norm": 5.21875, + "learning_rate": 8.688834277050301e-06, + "loss": 1.04497118, + "memory(GiB)": 302.58, + "step": 98020, + "train_speed(iter/s)": 0.123228 + }, + { + "acc": 0.74031568, + "epoch": 0.5482861165443547, + "grad_norm": 7.15625, + "learning_rate": 8.688209986700165e-06, + "loss": 1.0169282, + "memory(GiB)": 302.58, + "step": 98040, + "train_speed(iter/s)": 0.12324 + }, + { + "acc": 0.71029525, + "epoch": 0.5483979660173339, + "grad_norm": 6.625, + "learning_rate": 8.68758557020155e-06, + "loss": 1.15169382, + "memory(GiB)": 302.58, + "step": 98060, + "train_speed(iter/s)": 0.123252 + }, + { + "acc": 0.76510491, + "epoch": 0.5485098154903132, + "grad_norm": 8.0625, + "learning_rate": 8.686961027575818e-06, + "loss": 0.89933939, + "memory(GiB)": 302.58, + "step": 98080, + "train_speed(iter/s)": 0.123264 + }, + { + "acc": 0.73877826, + "epoch": 0.5486216649632925, + "grad_norm": 5.25, + "learning_rate": 8.686336358844328e-06, + "loss": 1.04412365, + "memory(GiB)": 302.58, + "step": 98100, + "train_speed(iter/s)": 0.123276 + }, + { + "acc": 0.73925719, + "epoch": 0.5487335144362717, + "grad_norm": 7.0, + "learning_rate": 8.685711564028442e-06, + "loss": 1.02975407, + "memory(GiB)": 302.58, + "step": 98120, + "train_speed(iter/s)": 0.123288 + }, + { + "acc": 0.74471345, + "epoch": 0.548845363909251, + "grad_norm": 6.71875, + "learning_rate": 8.685086643149536e-06, + "loss": 1.0017561, + "memory(GiB)": 302.58, + "step": 98140, + "train_speed(iter/s)": 0.123299 + }, + { + "acc": 0.72661681, + "epoch": 0.5489572133822302, + "grad_norm": 8.0, + "learning_rate": 8.684461596228981e-06, + "loss": 1.0863656, + "memory(GiB)": 302.58, + "step": 98160, + "train_speed(iter/s)": 0.123311 + }, + { + "acc": 0.73365726, + "epoch": 0.5490690628552095, + "grad_norm": 7.59375, + "learning_rate": 8.683836423288158e-06, + "loss": 1.04609947, + "memory(GiB)": 302.58, + "step": 98180, + "train_speed(iter/s)": 0.123323 + }, + { + "acc": 0.73584609, + "epoch": 0.5491809123281888, + "grad_norm": 7.15625, + "learning_rate": 8.683211124348447e-06, + "loss": 1.03313799, + "memory(GiB)": 302.58, + "step": 98200, + "train_speed(iter/s)": 0.123334 + }, + { + "acc": 0.74674697, + "epoch": 0.549292761801168, + "grad_norm": 5.4375, + "learning_rate": 8.682585699431237e-06, + "loss": 0.99196892, + "memory(GiB)": 302.58, + "step": 98220, + "train_speed(iter/s)": 0.123346 + }, + { + "acc": 0.73640327, + "epoch": 0.5494046112741473, + "grad_norm": 7.09375, + "learning_rate": 8.681960148557917e-06, + "loss": 1.03731213, + "memory(GiB)": 302.58, + "step": 98240, + "train_speed(iter/s)": 0.123357 + }, + { + "acc": 0.73893781, + "epoch": 0.5495164607471266, + "grad_norm": 6.40625, + "learning_rate": 8.681334471749886e-06, + "loss": 1.04745426, + "memory(GiB)": 302.58, + "step": 98260, + "train_speed(iter/s)": 0.123369 + }, + { + "acc": 0.74818888, + "epoch": 0.5496283102201058, + "grad_norm": 8.125, + "learning_rate": 8.680708669028543e-06, + "loss": 0.98287668, + "memory(GiB)": 302.58, + "step": 98280, + "train_speed(iter/s)": 0.123381 + }, + { + "acc": 0.72240109, + "epoch": 0.5497401596930851, + "grad_norm": 9.3125, + "learning_rate": 8.680082740415292e-06, + "loss": 1.09245243, + "memory(GiB)": 302.58, + "step": 98300, + "train_speed(iter/s)": 0.123392 + }, + { + "acc": 0.74237614, + "epoch": 0.5498520091660644, + "grad_norm": 7.4375, + "learning_rate": 8.679456685931543e-06, + "loss": 1.01415319, + "memory(GiB)": 302.58, + "step": 98320, + "train_speed(iter/s)": 0.123404 + }, + { + "acc": 0.72596474, + "epoch": 0.5499638586390436, + "grad_norm": 5.9375, + "learning_rate": 8.678830505598707e-06, + "loss": 1.08667593, + "memory(GiB)": 302.58, + "step": 98340, + "train_speed(iter/s)": 0.123416 + }, + { + "acc": 0.73492975, + "epoch": 0.5500757081120229, + "grad_norm": 9.0, + "learning_rate": 8.678204199438202e-06, + "loss": 1.04697781, + "memory(GiB)": 302.58, + "step": 98360, + "train_speed(iter/s)": 0.123427 + }, + { + "acc": 0.72928262, + "epoch": 0.5501875575850022, + "grad_norm": 5.875, + "learning_rate": 8.677577767471453e-06, + "loss": 1.06850204, + "memory(GiB)": 302.58, + "step": 98380, + "train_speed(iter/s)": 0.123439 + }, + { + "acc": 0.73506036, + "epoch": 0.5502994070579814, + "grad_norm": 13.4375, + "learning_rate": 8.67695120971988e-06, + "loss": 1.06140957, + "memory(GiB)": 302.58, + "step": 98400, + "train_speed(iter/s)": 0.123451 + }, + { + "acc": 0.74298244, + "epoch": 0.5504112565309607, + "grad_norm": 5.59375, + "learning_rate": 8.676324526204918e-06, + "loss": 0.98740177, + "memory(GiB)": 302.58, + "step": 98420, + "train_speed(iter/s)": 0.123463 + }, + { + "acc": 0.76218801, + "epoch": 0.5505231060039399, + "grad_norm": 6.34375, + "learning_rate": 8.675697716948e-06, + "loss": 0.91440115, + "memory(GiB)": 302.58, + "step": 98440, + "train_speed(iter/s)": 0.123474 + }, + { + "acc": 0.71926665, + "epoch": 0.5506349554769192, + "grad_norm": 6.53125, + "learning_rate": 8.675070781970563e-06, + "loss": 1.1218956, + "memory(GiB)": 302.58, + "step": 98460, + "train_speed(iter/s)": 0.123485 + }, + { + "acc": 0.72906613, + "epoch": 0.5507468049498985, + "grad_norm": 6.0, + "learning_rate": 8.674443721294053e-06, + "loss": 1.0743885, + "memory(GiB)": 302.58, + "step": 98480, + "train_speed(iter/s)": 0.123498 + }, + { + "acc": 0.73402481, + "epoch": 0.5508586544228777, + "grad_norm": 5.96875, + "learning_rate": 8.673816534939916e-06, + "loss": 1.05953712, + "memory(GiB)": 302.58, + "step": 98500, + "train_speed(iter/s)": 0.12351 + }, + { + "acc": 0.73633513, + "epoch": 0.550970503895857, + "grad_norm": 8.4375, + "learning_rate": 8.673189222929604e-06, + "loss": 1.01406441, + "memory(GiB)": 302.58, + "step": 98520, + "train_speed(iter/s)": 0.123521 + }, + { + "acc": 0.73707166, + "epoch": 0.5510823533688363, + "grad_norm": 6.71875, + "learning_rate": 8.672561785284575e-06, + "loss": 1.04484482, + "memory(GiB)": 302.58, + "step": 98540, + "train_speed(iter/s)": 0.123532 + }, + { + "acc": 0.75815778, + "epoch": 0.5511942028418155, + "grad_norm": 7.15625, + "learning_rate": 8.671934222026286e-06, + "loss": 0.94317188, + "memory(GiB)": 302.58, + "step": 98560, + "train_speed(iter/s)": 0.123544 + }, + { + "acc": 0.73250642, + "epoch": 0.5513060523147948, + "grad_norm": 7.40625, + "learning_rate": 8.671306533176203e-06, + "loss": 1.05820103, + "memory(GiB)": 302.58, + "step": 98580, + "train_speed(iter/s)": 0.123557 + }, + { + "acc": 0.73767982, + "epoch": 0.5514179017877741, + "grad_norm": 6.5625, + "learning_rate": 8.670678718755796e-06, + "loss": 1.03776598, + "memory(GiB)": 302.58, + "step": 98600, + "train_speed(iter/s)": 0.123569 + }, + { + "acc": 0.72455997, + "epoch": 0.5515297512607533, + "grad_norm": 7.28125, + "learning_rate": 8.670050778786535e-06, + "loss": 1.07025394, + "memory(GiB)": 302.58, + "step": 98620, + "train_speed(iter/s)": 0.123581 + }, + { + "acc": 0.72597852, + "epoch": 0.5516416007337326, + "grad_norm": 7.125, + "learning_rate": 8.669422713289903e-06, + "loss": 1.07862968, + "memory(GiB)": 302.58, + "step": 98640, + "train_speed(iter/s)": 0.123593 + }, + { + "acc": 0.73006015, + "epoch": 0.5517534502067118, + "grad_norm": 5.75, + "learning_rate": 8.668794522287379e-06, + "loss": 1.07786436, + "memory(GiB)": 302.58, + "step": 98660, + "train_speed(iter/s)": 0.123604 + }, + { + "acc": 0.73188205, + "epoch": 0.5518652996796911, + "grad_norm": 7.3125, + "learning_rate": 8.668166205800447e-06, + "loss": 1.05646, + "memory(GiB)": 302.58, + "step": 98680, + "train_speed(iter/s)": 0.123617 + }, + { + "acc": 0.74019017, + "epoch": 0.5519771491526704, + "grad_norm": 7.40625, + "learning_rate": 8.6675377638506e-06, + "loss": 1.01632452, + "memory(GiB)": 302.58, + "step": 98700, + "train_speed(iter/s)": 0.123629 + }, + { + "acc": 0.723277, + "epoch": 0.5520889986256496, + "grad_norm": 8.0, + "learning_rate": 8.666909196459332e-06, + "loss": 1.1147438, + "memory(GiB)": 302.58, + "step": 98720, + "train_speed(iter/s)": 0.12364 + }, + { + "acc": 0.7404757, + "epoch": 0.5522008480986289, + "grad_norm": 9.125, + "learning_rate": 8.66628050364814e-06, + "loss": 1.02901154, + "memory(GiB)": 302.58, + "step": 98740, + "train_speed(iter/s)": 0.123652 + }, + { + "acc": 0.72846718, + "epoch": 0.5523126975716082, + "grad_norm": 6.5625, + "learning_rate": 8.665651685438533e-06, + "loss": 1.0749383, + "memory(GiB)": 302.58, + "step": 98760, + "train_speed(iter/s)": 0.123665 + }, + { + "acc": 0.73923993, + "epoch": 0.5524245470445874, + "grad_norm": 7.21875, + "learning_rate": 8.665022741852014e-06, + "loss": 1.03547916, + "memory(GiB)": 302.58, + "step": 98780, + "train_speed(iter/s)": 0.123677 + }, + { + "acc": 0.73461695, + "epoch": 0.5525363965175667, + "grad_norm": 8.9375, + "learning_rate": 8.664393672910094e-06, + "loss": 1.04382057, + "memory(GiB)": 302.58, + "step": 98800, + "train_speed(iter/s)": 0.123688 + }, + { + "acc": 0.72456365, + "epoch": 0.552648245990546, + "grad_norm": 7.3125, + "learning_rate": 8.663764478634293e-06, + "loss": 1.08138018, + "memory(GiB)": 302.58, + "step": 98820, + "train_speed(iter/s)": 0.1237 + }, + { + "acc": 0.73599205, + "epoch": 0.5527600954635252, + "grad_norm": 6.875, + "learning_rate": 8.663135159046127e-06, + "loss": 1.02320375, + "memory(GiB)": 302.58, + "step": 98840, + "train_speed(iter/s)": 0.123713 + }, + { + "acc": 0.74545603, + "epoch": 0.5528719449365045, + "grad_norm": 6.21875, + "learning_rate": 8.662505714167126e-06, + "loss": 0.99398012, + "memory(GiB)": 302.58, + "step": 98860, + "train_speed(iter/s)": 0.123724 + }, + { + "acc": 0.71016169, + "epoch": 0.5529837944094838, + "grad_norm": 9.5625, + "learning_rate": 8.661876144018813e-06, + "loss": 1.13497229, + "memory(GiB)": 302.58, + "step": 98880, + "train_speed(iter/s)": 0.123736 + }, + { + "acc": 0.74027195, + "epoch": 0.553095643882463, + "grad_norm": 9.0, + "learning_rate": 8.661246448622727e-06, + "loss": 1.04264174, + "memory(GiB)": 302.58, + "step": 98900, + "train_speed(iter/s)": 0.123748 + }, + { + "acc": 0.73968916, + "epoch": 0.5532074933554423, + "grad_norm": 7.21875, + "learning_rate": 8.660616628000402e-06, + "loss": 1.03395386, + "memory(GiB)": 302.58, + "step": 98920, + "train_speed(iter/s)": 0.123759 + }, + { + "acc": 0.72839022, + "epoch": 0.5533193428284215, + "grad_norm": 7.53125, + "learning_rate": 8.65998668217338e-06, + "loss": 1.07059956, + "memory(GiB)": 302.58, + "step": 98940, + "train_speed(iter/s)": 0.123771 + }, + { + "acc": 0.74693208, + "epoch": 0.5534311923014008, + "grad_norm": 7.21875, + "learning_rate": 8.659356611163208e-06, + "loss": 1.00228348, + "memory(GiB)": 302.58, + "step": 98960, + "train_speed(iter/s)": 0.123783 + }, + { + "acc": 0.73385282, + "epoch": 0.5535430417743801, + "grad_norm": 7.84375, + "learning_rate": 8.658726414991436e-06, + "loss": 1.03905592, + "memory(GiB)": 302.58, + "step": 98980, + "train_speed(iter/s)": 0.123795 + }, + { + "acc": 0.72944283, + "epoch": 0.5536548912473593, + "grad_norm": 6.1875, + "learning_rate": 8.658096093679621e-06, + "loss": 1.06734514, + "memory(GiB)": 302.58, + "step": 99000, + "train_speed(iter/s)": 0.123807 + }, + { + "acc": 0.73075743, + "epoch": 0.5537667407203386, + "grad_norm": 8.125, + "learning_rate": 8.65746564724932e-06, + "loss": 1.08217831, + "memory(GiB)": 302.58, + "step": 99020, + "train_speed(iter/s)": 0.123819 + }, + { + "acc": 0.75677061, + "epoch": 0.5538785901933179, + "grad_norm": 5.34375, + "learning_rate": 8.656835075722095e-06, + "loss": 0.95912066, + "memory(GiB)": 302.58, + "step": 99040, + "train_speed(iter/s)": 0.123831 + }, + { + "acc": 0.74032788, + "epoch": 0.5539904396662971, + "grad_norm": 5.21875, + "learning_rate": 8.656204379119516e-06, + "loss": 1.00104656, + "memory(GiB)": 302.58, + "step": 99060, + "train_speed(iter/s)": 0.123843 + }, + { + "acc": 0.73005433, + "epoch": 0.5541022891392764, + "grad_norm": 5.90625, + "learning_rate": 8.655573557463152e-06, + "loss": 1.0656703, + "memory(GiB)": 302.58, + "step": 99080, + "train_speed(iter/s)": 0.123854 + }, + { + "acc": 0.73090549, + "epoch": 0.5542141386122557, + "grad_norm": 5.4375, + "learning_rate": 8.654942610774581e-06, + "loss": 1.06750135, + "memory(GiB)": 302.58, + "step": 99100, + "train_speed(iter/s)": 0.123866 + }, + { + "acc": 0.73810368, + "epoch": 0.5543259880852349, + "grad_norm": 7.4375, + "learning_rate": 8.654311539075386e-06, + "loss": 1.04358664, + "memory(GiB)": 302.58, + "step": 99120, + "train_speed(iter/s)": 0.123878 + }, + { + "acc": 0.72636337, + "epoch": 0.5544378375582142, + "grad_norm": 5.9375, + "learning_rate": 8.653680342387146e-06, + "loss": 1.07916479, + "memory(GiB)": 302.58, + "step": 99140, + "train_speed(iter/s)": 0.12389 + }, + { + "acc": 0.7328042, + "epoch": 0.5545496870311935, + "grad_norm": 7.59375, + "learning_rate": 8.653049020731454e-06, + "loss": 1.0532053, + "memory(GiB)": 302.58, + "step": 99160, + "train_speed(iter/s)": 0.123902 + }, + { + "acc": 0.72892008, + "epoch": 0.5546615365041727, + "grad_norm": 5.0, + "learning_rate": 8.652417574129901e-06, + "loss": 1.07513552, + "memory(GiB)": 302.58, + "step": 99180, + "train_speed(iter/s)": 0.123914 + }, + { + "acc": 0.74068427, + "epoch": 0.554773385977152, + "grad_norm": 9.9375, + "learning_rate": 8.651786002604088e-06, + "loss": 1.02180405, + "memory(GiB)": 302.58, + "step": 99200, + "train_speed(iter/s)": 0.123926 + }, + { + "acc": 0.76212893, + "epoch": 0.5548852354501312, + "grad_norm": 7.25, + "learning_rate": 8.65115430617561e-06, + "loss": 0.93887739, + "memory(GiB)": 302.58, + "step": 99220, + "train_speed(iter/s)": 0.123938 + }, + { + "acc": 0.72742505, + "epoch": 0.5549970849231105, + "grad_norm": 8.3125, + "learning_rate": 8.650522484866082e-06, + "loss": 1.07426186, + "memory(GiB)": 302.58, + "step": 99240, + "train_speed(iter/s)": 0.12395 + }, + { + "acc": 0.72261057, + "epoch": 0.5551089343960898, + "grad_norm": 8.625, + "learning_rate": 8.649890538697107e-06, + "loss": 1.09846344, + "memory(GiB)": 302.58, + "step": 99260, + "train_speed(iter/s)": 0.123961 + }, + { + "acc": 0.73177228, + "epoch": 0.555220783869069, + "grad_norm": 8.625, + "learning_rate": 8.649258467690301e-06, + "loss": 1.05478182, + "memory(GiB)": 302.58, + "step": 99280, + "train_speed(iter/s)": 0.123973 + }, + { + "acc": 0.74771051, + "epoch": 0.5553326333420483, + "grad_norm": 5.4375, + "learning_rate": 8.648626271867286e-06, + "loss": 0.97436733, + "memory(GiB)": 302.58, + "step": 99300, + "train_speed(iter/s)": 0.123985 + }, + { + "acc": 0.73369484, + "epoch": 0.5554444828150276, + "grad_norm": 6.71875, + "learning_rate": 8.647993951249682e-06, + "loss": 1.03841238, + "memory(GiB)": 302.58, + "step": 99320, + "train_speed(iter/s)": 0.123997 + }, + { + "acc": 0.73853722, + "epoch": 0.5555563322880068, + "grad_norm": 6.75, + "learning_rate": 8.647361505859118e-06, + "loss": 1.01799088, + "memory(GiB)": 302.58, + "step": 99340, + "train_speed(iter/s)": 0.124009 + }, + { + "acc": 0.75272303, + "epoch": 0.5556681817609861, + "grad_norm": 11.5, + "learning_rate": 8.646728935717224e-06, + "loss": 0.97489328, + "memory(GiB)": 302.58, + "step": 99360, + "train_speed(iter/s)": 0.12402 + }, + { + "acc": 0.74739671, + "epoch": 0.5557800312339654, + "grad_norm": 7.8125, + "learning_rate": 8.646096240845639e-06, + "loss": 0.99392757, + "memory(GiB)": 302.58, + "step": 99380, + "train_speed(iter/s)": 0.124032 + }, + { + "acc": 0.74829783, + "epoch": 0.5558918807069446, + "grad_norm": 6.15625, + "learning_rate": 8.645463421265998e-06, + "loss": 1.00020933, + "memory(GiB)": 302.58, + "step": 99400, + "train_speed(iter/s)": 0.124044 + }, + { + "acc": 0.72711849, + "epoch": 0.5560037301799239, + "grad_norm": 5.03125, + "learning_rate": 8.644830476999951e-06, + "loss": 1.07180624, + "memory(GiB)": 302.58, + "step": 99420, + "train_speed(iter/s)": 0.124055 + }, + { + "acc": 0.72881231, + "epoch": 0.5561155796529031, + "grad_norm": 7.0, + "learning_rate": 8.644197408069143e-06, + "loss": 1.0847332, + "memory(GiB)": 302.58, + "step": 99440, + "train_speed(iter/s)": 0.124066 + }, + { + "acc": 0.72415237, + "epoch": 0.5562274291258824, + "grad_norm": 5.71875, + "learning_rate": 8.643564214495229e-06, + "loss": 1.0813076, + "memory(GiB)": 302.58, + "step": 99460, + "train_speed(iter/s)": 0.124078 + }, + { + "acc": 0.73491039, + "epoch": 0.5563392785988617, + "grad_norm": 6.09375, + "learning_rate": 8.642930896299865e-06, + "loss": 1.04509621, + "memory(GiB)": 302.58, + "step": 99480, + "train_speed(iter/s)": 0.12409 + }, + { + "acc": 0.73866162, + "epoch": 0.5564511280718409, + "grad_norm": 7.90625, + "learning_rate": 8.642297453504713e-06, + "loss": 1.03184586, + "memory(GiB)": 302.58, + "step": 99500, + "train_speed(iter/s)": 0.124102 + }, + { + "acc": 0.74366665, + "epoch": 0.5565629775448202, + "grad_norm": 8.5, + "learning_rate": 8.641663886131439e-06, + "loss": 1.00563097, + "memory(GiB)": 302.58, + "step": 99520, + "train_speed(iter/s)": 0.124114 + }, + { + "acc": 0.73596382, + "epoch": 0.5566748270177995, + "grad_norm": 9.3125, + "learning_rate": 8.641030194201712e-06, + "loss": 1.03273525, + "memory(GiB)": 302.58, + "step": 99540, + "train_speed(iter/s)": 0.124125 + }, + { + "acc": 0.73378239, + "epoch": 0.5567866764907787, + "grad_norm": 7.6875, + "learning_rate": 8.640396377737208e-06, + "loss": 1.04917049, + "memory(GiB)": 302.58, + "step": 99560, + "train_speed(iter/s)": 0.124136 + }, + { + "acc": 0.73006477, + "epoch": 0.556898525963758, + "grad_norm": 8.5, + "learning_rate": 8.639762436759603e-06, + "loss": 1.0649929, + "memory(GiB)": 302.58, + "step": 99580, + "train_speed(iter/s)": 0.124148 + }, + { + "acc": 0.73947186, + "epoch": 0.5570103754367373, + "grad_norm": 7.4375, + "learning_rate": 8.639128371290582e-06, + "loss": 1.01421604, + "memory(GiB)": 302.58, + "step": 99600, + "train_speed(iter/s)": 0.12416 + }, + { + "acc": 0.72980518, + "epoch": 0.5571222249097165, + "grad_norm": 6.28125, + "learning_rate": 8.638494181351832e-06, + "loss": 1.05881414, + "memory(GiB)": 302.58, + "step": 99620, + "train_speed(iter/s)": 0.124172 + }, + { + "acc": 0.74376421, + "epoch": 0.5572340743826958, + "grad_norm": 7.21875, + "learning_rate": 8.637859866965044e-06, + "loss": 0.98511267, + "memory(GiB)": 302.58, + "step": 99640, + "train_speed(iter/s)": 0.124183 + }, + { + "acc": 0.7333612, + "epoch": 0.557345923855675, + "grad_norm": 5.875, + "learning_rate": 8.637225428151912e-06, + "loss": 1.06240625, + "memory(GiB)": 302.58, + "step": 99660, + "train_speed(iter/s)": 0.124195 + }, + { + "acc": 0.74975452, + "epoch": 0.5574577733286543, + "grad_norm": 6.46875, + "learning_rate": 8.636590864934137e-06, + "loss": 0.98546371, + "memory(GiB)": 302.58, + "step": 99680, + "train_speed(iter/s)": 0.124205 + }, + { + "acc": 0.73752589, + "epoch": 0.5575696228016336, + "grad_norm": 5.90625, + "learning_rate": 8.635956177333423e-06, + "loss": 1.03004761, + "memory(GiB)": 302.58, + "step": 99700, + "train_speed(iter/s)": 0.124217 + }, + { + "acc": 0.74259505, + "epoch": 0.5576814722746128, + "grad_norm": 6.1875, + "learning_rate": 8.635321365371478e-06, + "loss": 0.98806849, + "memory(GiB)": 302.58, + "step": 99720, + "train_speed(iter/s)": 0.124229 + }, + { + "acc": 0.74051638, + "epoch": 0.5577933217475921, + "grad_norm": 4.90625, + "learning_rate": 8.634686429070017e-06, + "loss": 1.01521273, + "memory(GiB)": 302.58, + "step": 99740, + "train_speed(iter/s)": 0.124241 + }, + { + "acc": 0.7431818, + "epoch": 0.5579051712205714, + "grad_norm": 9.1875, + "learning_rate": 8.634051368450753e-06, + "loss": 1.02118006, + "memory(GiB)": 302.58, + "step": 99760, + "train_speed(iter/s)": 0.124252 + }, + { + "acc": 0.73637166, + "epoch": 0.5580170206935506, + "grad_norm": 6.0625, + "learning_rate": 8.63341618353541e-06, + "loss": 1.04688187, + "memory(GiB)": 302.58, + "step": 99780, + "train_speed(iter/s)": 0.124263 + }, + { + "acc": 0.73032269, + "epoch": 0.5581288701665299, + "grad_norm": 5.3125, + "learning_rate": 8.632780874345713e-06, + "loss": 1.06158533, + "memory(GiB)": 302.58, + "step": 99800, + "train_speed(iter/s)": 0.124275 + }, + { + "acc": 0.71864605, + "epoch": 0.5582407196395092, + "grad_norm": 5.21875, + "learning_rate": 8.63214544090339e-06, + "loss": 1.11126633, + "memory(GiB)": 302.58, + "step": 99820, + "train_speed(iter/s)": 0.124286 + }, + { + "acc": 0.74080582, + "epoch": 0.5583525691124884, + "grad_norm": 7.25, + "learning_rate": 8.631509883230175e-06, + "loss": 1.03154898, + "memory(GiB)": 302.58, + "step": 99840, + "train_speed(iter/s)": 0.124298 + }, + { + "acc": 0.73205309, + "epoch": 0.5584644185854677, + "grad_norm": 7.09375, + "learning_rate": 8.630874201347807e-06, + "loss": 1.04810715, + "memory(GiB)": 302.58, + "step": 99860, + "train_speed(iter/s)": 0.12431 + }, + { + "acc": 0.74447937, + "epoch": 0.558576268058447, + "grad_norm": 6.0, + "learning_rate": 8.63023839527803e-06, + "loss": 1.00260515, + "memory(GiB)": 302.58, + "step": 99880, + "train_speed(iter/s)": 0.124321 + }, + { + "acc": 0.74604931, + "epoch": 0.5586881175314262, + "grad_norm": 6.28125, + "learning_rate": 8.629602465042586e-06, + "loss": 0.99158392, + "memory(GiB)": 302.58, + "step": 99900, + "train_speed(iter/s)": 0.124333 + }, + { + "acc": 0.73639398, + "epoch": 0.5587999670044055, + "grad_norm": 4.71875, + "learning_rate": 8.628966410663228e-06, + "loss": 1.03675766, + "memory(GiB)": 302.58, + "step": 99920, + "train_speed(iter/s)": 0.124345 + }, + { + "acc": 0.73828883, + "epoch": 0.5589118164773847, + "grad_norm": 7.65625, + "learning_rate": 8.628330232161714e-06, + "loss": 1.04896412, + "memory(GiB)": 302.58, + "step": 99940, + "train_speed(iter/s)": 0.124357 + }, + { + "acc": 0.75670261, + "epoch": 0.559023665950364, + "grad_norm": 8.8125, + "learning_rate": 8.6276939295598e-06, + "loss": 0.94471083, + "memory(GiB)": 302.58, + "step": 99960, + "train_speed(iter/s)": 0.124368 + }, + { + "acc": 0.72435584, + "epoch": 0.5591355154233433, + "grad_norm": 7.4375, + "learning_rate": 8.627057502879251e-06, + "loss": 1.11904297, + "memory(GiB)": 302.58, + "step": 99980, + "train_speed(iter/s)": 0.12438 + }, + { + "acc": 0.74932542, + "epoch": 0.5592473648963225, + "grad_norm": 5.5625, + "learning_rate": 8.626420952141834e-06, + "loss": 0.97158184, + "memory(GiB)": 302.58, + "step": 100000, + "train_speed(iter/s)": 0.124392 + }, + { + "epoch": 0.5592473648963225, + "eval_acc": 0.7024089564767547, + "eval_loss": 1.0314146280288696, + "eval_runtime": 7496.4325, + "eval_samples_per_second": 10.043, + "eval_steps_per_second": 10.043, + "step": 100000 + }, + { + "acc": 0.7283946, + "epoch": 0.5593592143693018, + "grad_norm": 8.1875, + "learning_rate": 8.625784277369322e-06, + "loss": 1.05832653, + "memory(GiB)": 302.58, + "step": 100020, + "train_speed(iter/s)": 0.123234 + }, + { + "acc": 0.73237457, + "epoch": 0.5594710638422811, + "grad_norm": 6.0625, + "learning_rate": 8.625147478583492e-06, + "loss": 1.06374712, + "memory(GiB)": 302.58, + "step": 100040, + "train_speed(iter/s)": 0.123246 + }, + { + "acc": 0.7316328, + "epoch": 0.5595829133152603, + "grad_norm": 9.5625, + "learning_rate": 8.624510555806121e-06, + "loss": 1.05368004, + "memory(GiB)": 302.58, + "step": 100060, + "train_speed(iter/s)": 0.123257 + }, + { + "acc": 0.74072981, + "epoch": 0.5596947627882396, + "grad_norm": 7.53125, + "learning_rate": 8.623873509058995e-06, + "loss": 1.01867056, + "memory(GiB)": 302.58, + "step": 100080, + "train_speed(iter/s)": 0.123269 + }, + { + "acc": 0.73412638, + "epoch": 0.5598066122612189, + "grad_norm": 9.25, + "learning_rate": 8.623236338363905e-06, + "loss": 1.04848289, + "memory(GiB)": 302.58, + "step": 100100, + "train_speed(iter/s)": 0.12328 + }, + { + "acc": 0.73786521, + "epoch": 0.5599184617341981, + "grad_norm": 5.21875, + "learning_rate": 8.622599043742643e-06, + "loss": 1.02312746, + "memory(GiB)": 302.58, + "step": 100120, + "train_speed(iter/s)": 0.123292 + }, + { + "acc": 0.73457966, + "epoch": 0.5600303112071774, + "grad_norm": 6.96875, + "learning_rate": 8.621961625217009e-06, + "loss": 1.06646891, + "memory(GiB)": 302.58, + "step": 100140, + "train_speed(iter/s)": 0.123303 + }, + { + "acc": 0.73335581, + "epoch": 0.5601421606801567, + "grad_norm": 9.5, + "learning_rate": 8.6213240828088e-06, + "loss": 1.03539696, + "memory(GiB)": 302.58, + "step": 100160, + "train_speed(iter/s)": 0.123315 + }, + { + "acc": 0.71673369, + "epoch": 0.5602540101531359, + "grad_norm": 8.125, + "learning_rate": 8.620686416539824e-06, + "loss": 1.11494865, + "memory(GiB)": 302.58, + "step": 100180, + "train_speed(iter/s)": 0.123326 + }, + { + "acc": 0.73710027, + "epoch": 0.5603658596261152, + "grad_norm": 6.25, + "learning_rate": 8.620048626431893e-06, + "loss": 1.04214964, + "memory(GiB)": 302.58, + "step": 100200, + "train_speed(iter/s)": 0.123337 + }, + { + "acc": 0.73323255, + "epoch": 0.5604777090990944, + "grad_norm": 7.65625, + "learning_rate": 8.61941071250682e-06, + "loss": 1.0597374, + "memory(GiB)": 302.58, + "step": 100220, + "train_speed(iter/s)": 0.123349 + }, + { + "acc": 0.73121686, + "epoch": 0.5605895585720737, + "grad_norm": 5.15625, + "learning_rate": 8.618772674786424e-06, + "loss": 1.07892475, + "memory(GiB)": 302.58, + "step": 100240, + "train_speed(iter/s)": 0.123361 + }, + { + "acc": 0.73107619, + "epoch": 0.560701408045053, + "grad_norm": 6.3125, + "learning_rate": 8.61813451329253e-06, + "loss": 1.05134001, + "memory(GiB)": 302.58, + "step": 100260, + "train_speed(iter/s)": 0.123372 + }, + { + "acc": 0.73404651, + "epoch": 0.5608132575180322, + "grad_norm": 7.25, + "learning_rate": 8.617496228046957e-06, + "loss": 1.02539034, + "memory(GiB)": 302.58, + "step": 100280, + "train_speed(iter/s)": 0.123384 + }, + { + "acc": 0.73756695, + "epoch": 0.5609251069910115, + "grad_norm": 7.09375, + "learning_rate": 8.616857819071547e-06, + "loss": 1.04403896, + "memory(GiB)": 302.58, + "step": 100300, + "train_speed(iter/s)": 0.123396 + }, + { + "acc": 0.74158201, + "epoch": 0.5610369564639908, + "grad_norm": 8.3125, + "learning_rate": 8.61621928638813e-06, + "loss": 1.01722775, + "memory(GiB)": 302.58, + "step": 100320, + "train_speed(iter/s)": 0.123408 + }, + { + "acc": 0.74989147, + "epoch": 0.56114880593697, + "grad_norm": 6.40625, + "learning_rate": 8.615580630018545e-06, + "loss": 0.97180586, + "memory(GiB)": 302.58, + "step": 100340, + "train_speed(iter/s)": 0.12342 + }, + { + "acc": 0.74174585, + "epoch": 0.5612606554099493, + "grad_norm": 6.96875, + "learning_rate": 8.614941849984638e-06, + "loss": 1.02218113, + "memory(GiB)": 302.58, + "step": 100360, + "train_speed(iter/s)": 0.123431 + }, + { + "acc": 0.73632345, + "epoch": 0.5613725048829286, + "grad_norm": 7.15625, + "learning_rate": 8.61430294630826e-06, + "loss": 1.04434166, + "memory(GiB)": 302.58, + "step": 100380, + "train_speed(iter/s)": 0.123443 + }, + { + "acc": 0.7294713, + "epoch": 0.5614843543559078, + "grad_norm": 5.71875, + "learning_rate": 8.613663919011258e-06, + "loss": 1.0589922, + "memory(GiB)": 302.58, + "step": 100400, + "train_speed(iter/s)": 0.123454 + }, + { + "acc": 0.74345775, + "epoch": 0.5615962038288871, + "grad_norm": 8.4375, + "learning_rate": 8.613024768115492e-06, + "loss": 1.00066967, + "memory(GiB)": 302.58, + "step": 100420, + "train_speed(iter/s)": 0.123466 + }, + { + "acc": 0.73641148, + "epoch": 0.5617080533018664, + "grad_norm": 5.78125, + "learning_rate": 8.612385493642822e-06, + "loss": 1.01494188, + "memory(GiB)": 302.58, + "step": 100440, + "train_speed(iter/s)": 0.123478 + }, + { + "acc": 0.74078732, + "epoch": 0.5618199027748456, + "grad_norm": 8.25, + "learning_rate": 8.611746095615114e-06, + "loss": 1.01972666, + "memory(GiB)": 302.58, + "step": 100460, + "train_speed(iter/s)": 0.123489 + }, + { + "acc": 0.74534521, + "epoch": 0.5619317522478249, + "grad_norm": 7.0, + "learning_rate": 8.611106574054236e-06, + "loss": 1.0046237, + "memory(GiB)": 302.58, + "step": 100480, + "train_speed(iter/s)": 0.123501 + }, + { + "acc": 0.7362525, + "epoch": 0.5620436017208041, + "grad_norm": 7.71875, + "learning_rate": 8.610466928982064e-06, + "loss": 1.03891392, + "memory(GiB)": 302.58, + "step": 100500, + "train_speed(iter/s)": 0.123513 + }, + { + "acc": 0.72929134, + "epoch": 0.5621554511937834, + "grad_norm": 6.84375, + "learning_rate": 8.609827160420473e-06, + "loss": 1.05278015, + "memory(GiB)": 302.58, + "step": 100520, + "train_speed(iter/s)": 0.123525 + }, + { + "acc": 0.73957882, + "epoch": 0.5622673006667627, + "grad_norm": 8.1875, + "learning_rate": 8.609187268391348e-06, + "loss": 1.05091124, + "memory(GiB)": 302.58, + "step": 100540, + "train_speed(iter/s)": 0.123536 + }, + { + "acc": 0.74876337, + "epoch": 0.5623791501397419, + "grad_norm": 6.15625, + "learning_rate": 8.608547252916575e-06, + "loss": 0.9624361, + "memory(GiB)": 302.58, + "step": 100560, + "train_speed(iter/s)": 0.123548 + }, + { + "acc": 0.73807178, + "epoch": 0.5624909996127212, + "grad_norm": 11.5, + "learning_rate": 8.607907114018041e-06, + "loss": 1.02808342, + "memory(GiB)": 302.58, + "step": 100580, + "train_speed(iter/s)": 0.12356 + }, + { + "acc": 0.74305892, + "epoch": 0.5626028490857005, + "grad_norm": 7.78125, + "learning_rate": 8.607266851717644e-06, + "loss": 1.00472956, + "memory(GiB)": 302.58, + "step": 100600, + "train_speed(iter/s)": 0.123572 + }, + { + "acc": 0.73761368, + "epoch": 0.5627146985586797, + "grad_norm": 6.125, + "learning_rate": 8.606626466037285e-06, + "loss": 1.03107042, + "memory(GiB)": 302.58, + "step": 100620, + "train_speed(iter/s)": 0.123583 + }, + { + "acc": 0.72801862, + "epoch": 0.562826548031659, + "grad_norm": 6.125, + "learning_rate": 8.605985956998862e-06, + "loss": 1.06546926, + "memory(GiB)": 302.58, + "step": 100640, + "train_speed(iter/s)": 0.123595 + }, + { + "acc": 0.75220885, + "epoch": 0.5629383975046383, + "grad_norm": 4.71875, + "learning_rate": 8.605345324624285e-06, + "loss": 0.95069571, + "memory(GiB)": 302.58, + "step": 100660, + "train_speed(iter/s)": 0.123607 + }, + { + "acc": 0.75146503, + "epoch": 0.5630502469776175, + "grad_norm": 7.09375, + "learning_rate": 8.604704568935466e-06, + "loss": 0.97894506, + "memory(GiB)": 302.58, + "step": 100680, + "train_speed(iter/s)": 0.123618 + }, + { + "acc": 0.72943339, + "epoch": 0.5631620964505968, + "grad_norm": 6.0625, + "learning_rate": 8.604063689954321e-06, + "loss": 1.05276279, + "memory(GiB)": 302.58, + "step": 100700, + "train_speed(iter/s)": 0.123629 + }, + { + "acc": 0.74784489, + "epoch": 0.563273945923576, + "grad_norm": 7.0625, + "learning_rate": 8.60342268770277e-06, + "loss": 0.99524689, + "memory(GiB)": 302.58, + "step": 100720, + "train_speed(iter/s)": 0.12364 + }, + { + "acc": 0.73965101, + "epoch": 0.5633857953965553, + "grad_norm": 5.25, + "learning_rate": 8.602781562202736e-06, + "loss": 1.02218113, + "memory(GiB)": 302.58, + "step": 100740, + "train_speed(iter/s)": 0.123651 + }, + { + "acc": 0.72081485, + "epoch": 0.5634976448695346, + "grad_norm": 6.0, + "learning_rate": 8.602140313476148e-06, + "loss": 1.10737066, + "memory(GiB)": 302.58, + "step": 100760, + "train_speed(iter/s)": 0.123662 + }, + { + "acc": 0.73184061, + "epoch": 0.5636094943425138, + "grad_norm": 6.75, + "learning_rate": 8.60149894154494e-06, + "loss": 1.03409348, + "memory(GiB)": 302.58, + "step": 100780, + "train_speed(iter/s)": 0.123673 + }, + { + "acc": 0.73672767, + "epoch": 0.5637213438154931, + "grad_norm": 6.6875, + "learning_rate": 8.600857446431048e-06, + "loss": 1.03442335, + "memory(GiB)": 302.58, + "step": 100800, + "train_speed(iter/s)": 0.123685 + }, + { + "acc": 0.74960613, + "epoch": 0.5638331932884724, + "grad_norm": 6.125, + "learning_rate": 8.600215828156414e-06, + "loss": 0.97066355, + "memory(GiB)": 302.58, + "step": 100820, + "train_speed(iter/s)": 0.123696 + }, + { + "acc": 0.72071438, + "epoch": 0.5639450427614516, + "grad_norm": 5.59375, + "learning_rate": 8.599574086742981e-06, + "loss": 1.1026228, + "memory(GiB)": 302.58, + "step": 100840, + "train_speed(iter/s)": 0.123708 + }, + { + "acc": 0.74287829, + "epoch": 0.5640568922344309, + "grad_norm": 5.46875, + "learning_rate": 8.598932222212701e-06, + "loss": 1.01558371, + "memory(GiB)": 302.58, + "step": 100860, + "train_speed(iter/s)": 0.12372 + }, + { + "acc": 0.73480339, + "epoch": 0.5641687417074102, + "grad_norm": 6.4375, + "learning_rate": 8.598290234587528e-06, + "loss": 1.04811401, + "memory(GiB)": 302.58, + "step": 100880, + "train_speed(iter/s)": 0.123731 + }, + { + "acc": 0.73328094, + "epoch": 0.5642805911803894, + "grad_norm": 7.96875, + "learning_rate": 8.597648123889417e-06, + "loss": 1.0418973, + "memory(GiB)": 302.58, + "step": 100900, + "train_speed(iter/s)": 0.123743 + }, + { + "acc": 0.73889551, + "epoch": 0.5643924406533687, + "grad_norm": 5.8125, + "learning_rate": 8.597005890140336e-06, + "loss": 1.01898108, + "memory(GiB)": 302.58, + "step": 100920, + "train_speed(iter/s)": 0.123755 + }, + { + "acc": 0.73990922, + "epoch": 0.564504290126348, + "grad_norm": 9.375, + "learning_rate": 8.596363533362244e-06, + "loss": 1.02892179, + "memory(GiB)": 302.58, + "step": 100940, + "train_speed(iter/s)": 0.123765 + }, + { + "acc": 0.73635321, + "epoch": 0.5646161395993272, + "grad_norm": 10.125, + "learning_rate": 8.595721053577118e-06, + "loss": 1.00617323, + "memory(GiB)": 302.58, + "step": 100960, + "train_speed(iter/s)": 0.123777 + }, + { + "acc": 0.72269464, + "epoch": 0.5647279890723065, + "grad_norm": 7.15625, + "learning_rate": 8.595078450806928e-06, + "loss": 1.09325933, + "memory(GiB)": 302.58, + "step": 100980, + "train_speed(iter/s)": 0.123789 + }, + { + "acc": 0.74378896, + "epoch": 0.5648398385452857, + "grad_norm": 7.15625, + "learning_rate": 8.594435725073658e-06, + "loss": 1.01124115, + "memory(GiB)": 302.58, + "step": 101000, + "train_speed(iter/s)": 0.123801 + }, + { + "acc": 0.73831134, + "epoch": 0.564951688018265, + "grad_norm": 7.0, + "learning_rate": 8.593792876399287e-06, + "loss": 1.03840504, + "memory(GiB)": 302.58, + "step": 101020, + "train_speed(iter/s)": 0.123813 + }, + { + "acc": 0.72581244, + "epoch": 0.5650635374912443, + "grad_norm": 6.6875, + "learning_rate": 8.593149904805803e-06, + "loss": 1.07268963, + "memory(GiB)": 302.58, + "step": 101040, + "train_speed(iter/s)": 0.123825 + }, + { + "acc": 0.73691392, + "epoch": 0.5651753869642235, + "grad_norm": 7.5625, + "learning_rate": 8.5925068103152e-06, + "loss": 1.04762802, + "memory(GiB)": 302.58, + "step": 101060, + "train_speed(iter/s)": 0.123837 + }, + { + "acc": 0.7260694, + "epoch": 0.5652872364372028, + "grad_norm": 5.46875, + "learning_rate": 8.591863592949472e-06, + "loss": 1.08032103, + "memory(GiB)": 302.58, + "step": 101080, + "train_speed(iter/s)": 0.123848 + }, + { + "acc": 0.73259091, + "epoch": 0.5653990859101821, + "grad_norm": 8.8125, + "learning_rate": 8.591220252730621e-06, + "loss": 1.04078922, + "memory(GiB)": 302.58, + "step": 101100, + "train_speed(iter/s)": 0.123859 + }, + { + "acc": 0.75246081, + "epoch": 0.5655109353831613, + "grad_norm": 7.375, + "learning_rate": 8.59057678968065e-06, + "loss": 0.96174421, + "memory(GiB)": 302.58, + "step": 101120, + "train_speed(iter/s)": 0.123871 + }, + { + "acc": 0.72744312, + "epoch": 0.5656227848561406, + "grad_norm": 7.59375, + "learning_rate": 8.589933203821566e-06, + "loss": 1.09237432, + "memory(GiB)": 302.58, + "step": 101140, + "train_speed(iter/s)": 0.123883 + }, + { + "acc": 0.74566488, + "epoch": 0.5657346343291199, + "grad_norm": 9.8125, + "learning_rate": 8.589289495175382e-06, + "loss": 0.99479799, + "memory(GiB)": 302.58, + "step": 101160, + "train_speed(iter/s)": 0.123894 + }, + { + "acc": 0.73828015, + "epoch": 0.5658464838020991, + "grad_norm": 9.875, + "learning_rate": 8.588645663764117e-06, + "loss": 1.02968788, + "memory(GiB)": 302.58, + "step": 101180, + "train_speed(iter/s)": 0.123906 + }, + { + "acc": 0.75879302, + "epoch": 0.5659583332750784, + "grad_norm": 7.0, + "learning_rate": 8.588001709609793e-06, + "loss": 0.92822952, + "memory(GiB)": 302.58, + "step": 101200, + "train_speed(iter/s)": 0.123917 + }, + { + "acc": 0.73079276, + "epoch": 0.5660701827480576, + "grad_norm": 8.125, + "learning_rate": 8.58735763273443e-06, + "loss": 1.05351496, + "memory(GiB)": 302.58, + "step": 101220, + "train_speed(iter/s)": 0.123929 + }, + { + "acc": 0.73705306, + "epoch": 0.5661820322210369, + "grad_norm": 7.15625, + "learning_rate": 8.586713433160061e-06, + "loss": 1.02918663, + "memory(GiB)": 302.58, + "step": 101240, + "train_speed(iter/s)": 0.12394 + }, + { + "acc": 0.72580175, + "epoch": 0.5662938816940162, + "grad_norm": 10.125, + "learning_rate": 8.586069110908723e-06, + "loss": 1.08987274, + "memory(GiB)": 302.58, + "step": 101260, + "train_speed(iter/s)": 0.123952 + }, + { + "acc": 0.73793292, + "epoch": 0.5664057311669954, + "grad_norm": 6.4375, + "learning_rate": 8.585424666002448e-06, + "loss": 1.04044752, + "memory(GiB)": 302.58, + "step": 101280, + "train_speed(iter/s)": 0.123963 + }, + { + "acc": 0.73547306, + "epoch": 0.5665175806399747, + "grad_norm": 6.21875, + "learning_rate": 8.58478009846328e-06, + "loss": 1.05249043, + "memory(GiB)": 302.58, + "step": 101300, + "train_speed(iter/s)": 0.123975 + }, + { + "acc": 0.73188286, + "epoch": 0.566629430112954, + "grad_norm": 4.1875, + "learning_rate": 8.584135408313266e-06, + "loss": 1.05121813, + "memory(GiB)": 302.58, + "step": 101320, + "train_speed(iter/s)": 0.123986 + }, + { + "acc": 0.7409164, + "epoch": 0.5667412795859332, + "grad_norm": 5.15625, + "learning_rate": 8.583490595574455e-06, + "loss": 1.01523972, + "memory(GiB)": 302.58, + "step": 101340, + "train_speed(iter/s)": 0.123996 + }, + { + "acc": 0.72014084, + "epoch": 0.5668531290589125, + "grad_norm": 5.84375, + "learning_rate": 8.582845660268904e-06, + "loss": 1.10643711, + "memory(GiB)": 302.58, + "step": 101360, + "train_speed(iter/s)": 0.124008 + }, + { + "acc": 0.75205894, + "epoch": 0.5669649785318918, + "grad_norm": 6.375, + "learning_rate": 8.58220060241867e-06, + "loss": 0.964359, + "memory(GiB)": 302.58, + "step": 101380, + "train_speed(iter/s)": 0.124021 + }, + { + "acc": 0.74775429, + "epoch": 0.567076828004871, + "grad_norm": 10.75, + "learning_rate": 8.581555422045817e-06, + "loss": 1.00402584, + "memory(GiB)": 302.58, + "step": 101400, + "train_speed(iter/s)": 0.124033 + }, + { + "acc": 0.73181143, + "epoch": 0.5671886774778503, + "grad_norm": 7.90625, + "learning_rate": 8.580910119172411e-06, + "loss": 1.04600773, + "memory(GiB)": 302.58, + "step": 101420, + "train_speed(iter/s)": 0.124045 + }, + { + "acc": 0.74774418, + "epoch": 0.5673005269508296, + "grad_norm": 8.5, + "learning_rate": 8.580264693820526e-06, + "loss": 1.00227985, + "memory(GiB)": 302.58, + "step": 101440, + "train_speed(iter/s)": 0.124056 + }, + { + "acc": 0.74205785, + "epoch": 0.5674123764238088, + "grad_norm": 6.875, + "learning_rate": 8.579619146012234e-06, + "loss": 1.0034626, + "memory(GiB)": 302.58, + "step": 101460, + "train_speed(iter/s)": 0.124068 + }, + { + "acc": 0.73948326, + "epoch": 0.5675242258967881, + "grad_norm": 6.40625, + "learning_rate": 8.578973475769617e-06, + "loss": 1.03395824, + "memory(GiB)": 302.58, + "step": 101480, + "train_speed(iter/s)": 0.124079 + }, + { + "acc": 0.74877825, + "epoch": 0.5676360753697673, + "grad_norm": 10.375, + "learning_rate": 8.57832768311476e-06, + "loss": 0.98996449, + "memory(GiB)": 302.58, + "step": 101500, + "train_speed(iter/s)": 0.12409 + }, + { + "acc": 0.74335237, + "epoch": 0.5677479248427466, + "grad_norm": 6.71875, + "learning_rate": 8.577681768069748e-06, + "loss": 0.99072771, + "memory(GiB)": 302.58, + "step": 101520, + "train_speed(iter/s)": 0.124102 + }, + { + "acc": 0.75185366, + "epoch": 0.5678597743157259, + "grad_norm": 8.1875, + "learning_rate": 8.577035730656675e-06, + "loss": 0.98346252, + "memory(GiB)": 302.58, + "step": 101540, + "train_speed(iter/s)": 0.124113 + }, + { + "acc": 0.75501232, + "epoch": 0.5679716237887051, + "grad_norm": 5.84375, + "learning_rate": 8.576389570897638e-06, + "loss": 0.94525785, + "memory(GiB)": 302.58, + "step": 101560, + "train_speed(iter/s)": 0.124125 + }, + { + "acc": 0.73731203, + "epoch": 0.5680834732616844, + "grad_norm": 6.53125, + "learning_rate": 8.575743288814736e-06, + "loss": 1.02888718, + "memory(GiB)": 302.58, + "step": 101580, + "train_speed(iter/s)": 0.124137 + }, + { + "acc": 0.7349864, + "epoch": 0.5681953227346637, + "grad_norm": 7.96875, + "learning_rate": 8.575096884430076e-06, + "loss": 1.05584192, + "memory(GiB)": 302.58, + "step": 101600, + "train_speed(iter/s)": 0.124149 + }, + { + "acc": 0.75919199, + "epoch": 0.5683071722076429, + "grad_norm": 7.84375, + "learning_rate": 8.574450357765767e-06, + "loss": 0.9427433, + "memory(GiB)": 302.58, + "step": 101620, + "train_speed(iter/s)": 0.12416 + }, + { + "acc": 0.74006486, + "epoch": 0.5684190216806222, + "grad_norm": 5.09375, + "learning_rate": 8.57380370884392e-06, + "loss": 1.02187519, + "memory(GiB)": 302.58, + "step": 101640, + "train_speed(iter/s)": 0.124172 + }, + { + "acc": 0.71893387, + "epoch": 0.5685308711536015, + "grad_norm": 6.75, + "learning_rate": 8.573156937686656e-06, + "loss": 1.11681948, + "memory(GiB)": 302.58, + "step": 101660, + "train_speed(iter/s)": 0.124184 + }, + { + "acc": 0.74510636, + "epoch": 0.5686427206265807, + "grad_norm": 7.03125, + "learning_rate": 8.572510044316093e-06, + "loss": 0.99342279, + "memory(GiB)": 302.58, + "step": 101680, + "train_speed(iter/s)": 0.124196 + }, + { + "acc": 0.74904256, + "epoch": 0.56875457009956, + "grad_norm": 7.0625, + "learning_rate": 8.571863028754358e-06, + "loss": 0.99580603, + "memory(GiB)": 302.58, + "step": 101700, + "train_speed(iter/s)": 0.124207 + }, + { + "acc": 0.74040813, + "epoch": 0.5688664195725393, + "grad_norm": 7.15625, + "learning_rate": 8.571215891023583e-06, + "loss": 0.99218702, + "memory(GiB)": 302.58, + "step": 101720, + "train_speed(iter/s)": 0.124219 + }, + { + "acc": 0.7364285, + "epoch": 0.5689782690455185, + "grad_norm": 5.09375, + "learning_rate": 8.570568631145899e-06, + "loss": 1.03284159, + "memory(GiB)": 302.58, + "step": 101740, + "train_speed(iter/s)": 0.124231 + }, + { + "acc": 0.73913608, + "epoch": 0.5690901185184978, + "grad_norm": 6.0625, + "learning_rate": 8.569921249143447e-06, + "loss": 1.03128538, + "memory(GiB)": 302.58, + "step": 101760, + "train_speed(iter/s)": 0.124243 + }, + { + "acc": 0.74688025, + "epoch": 0.569201967991477, + "grad_norm": 9.5625, + "learning_rate": 8.569273745038368e-06, + "loss": 0.98698282, + "memory(GiB)": 302.58, + "step": 101780, + "train_speed(iter/s)": 0.124255 + }, + { + "acc": 0.7429059, + "epoch": 0.5693138174644563, + "grad_norm": 6.15625, + "learning_rate": 8.56862611885281e-06, + "loss": 0.98937998, + "memory(GiB)": 302.58, + "step": 101800, + "train_speed(iter/s)": 0.124266 + }, + { + "acc": 0.73395777, + "epoch": 0.5694256669374356, + "grad_norm": 7.875, + "learning_rate": 8.56797837060892e-06, + "loss": 1.03991156, + "memory(GiB)": 302.58, + "step": 101820, + "train_speed(iter/s)": 0.124278 + }, + { + "acc": 0.75473733, + "epoch": 0.5695375164104148, + "grad_norm": 6.4375, + "learning_rate": 8.567330500328859e-06, + "loss": 0.96085491, + "memory(GiB)": 302.58, + "step": 101840, + "train_speed(iter/s)": 0.124288 + }, + { + "acc": 0.7394608, + "epoch": 0.5696493658833941, + "grad_norm": 8.8125, + "learning_rate": 8.566682508034781e-06, + "loss": 1.0274209, + "memory(GiB)": 302.58, + "step": 101860, + "train_speed(iter/s)": 0.124299 + }, + { + "acc": 0.72846427, + "epoch": 0.5697612153563734, + "grad_norm": 8.0625, + "learning_rate": 8.566034393748853e-06, + "loss": 1.08056202, + "memory(GiB)": 302.58, + "step": 101880, + "train_speed(iter/s)": 0.124311 + }, + { + "acc": 0.75655627, + "epoch": 0.5698730648293526, + "grad_norm": 8.75, + "learning_rate": 8.56538615749324e-06, + "loss": 0.93456688, + "memory(GiB)": 302.58, + "step": 101900, + "train_speed(iter/s)": 0.124323 + }, + { + "acc": 0.7244616, + "epoch": 0.5699849143023319, + "grad_norm": 6.84375, + "learning_rate": 8.564737799290115e-06, + "loss": 1.10676041, + "memory(GiB)": 302.58, + "step": 101920, + "train_speed(iter/s)": 0.124335 + }, + { + "acc": 0.73534427, + "epoch": 0.5700967637753112, + "grad_norm": 6.125, + "learning_rate": 8.564089319161655e-06, + "loss": 1.03996592, + "memory(GiB)": 302.58, + "step": 101940, + "train_speed(iter/s)": 0.124346 + }, + { + "acc": 0.7314302, + "epoch": 0.5702086132482904, + "grad_norm": 5.84375, + "learning_rate": 8.563440717130038e-06, + "loss": 1.0336545, + "memory(GiB)": 302.58, + "step": 101960, + "train_speed(iter/s)": 0.124358 + }, + { + "acc": 0.74291296, + "epoch": 0.5703204627212697, + "grad_norm": 5.8125, + "learning_rate": 8.56279199321745e-06, + "loss": 1.00277891, + "memory(GiB)": 302.58, + "step": 101980, + "train_speed(iter/s)": 0.124371 + }, + { + "acc": 0.74819202, + "epoch": 0.570432312194249, + "grad_norm": 6.59375, + "learning_rate": 8.562143147446078e-06, + "loss": 0.98131371, + "memory(GiB)": 302.58, + "step": 102000, + "train_speed(iter/s)": 0.124383 + }, + { + "epoch": 0.570432312194249, + "eval_acc": 0.7025547270737654, + "eval_loss": 1.0307267904281616, + "eval_runtime": 7510.4672, + "eval_samples_per_second": 10.024, + "eval_steps_per_second": 10.024, + "step": 102000 + }, + { + "acc": 0.75096898, + "epoch": 0.5705441616672282, + "grad_norm": 7.125, + "learning_rate": 8.561494179838115e-06, + "loss": 0.97438011, + "memory(GiB)": 302.58, + "step": 102020, + "train_speed(iter/s)": 0.123247 + }, + { + "acc": 0.74170985, + "epoch": 0.5706560111402075, + "grad_norm": 7.59375, + "learning_rate": 8.560845090415757e-06, + "loss": 1.00267553, + "memory(GiB)": 302.58, + "step": 102040, + "train_speed(iter/s)": 0.123259 + }, + { + "acc": 0.74635363, + "epoch": 0.5707678606131867, + "grad_norm": 10.0, + "learning_rate": 8.560195879201208e-06, + "loss": 0.97627125, + "memory(GiB)": 302.58, + "step": 102060, + "train_speed(iter/s)": 0.123271 + }, + { + "acc": 0.7413362, + "epoch": 0.570879710086166, + "grad_norm": 6.0, + "learning_rate": 8.559546546216668e-06, + "loss": 1.0257781, + "memory(GiB)": 302.58, + "step": 102080, + "train_speed(iter/s)": 0.123282 + }, + { + "acc": 0.73258452, + "epoch": 0.5709915595591453, + "grad_norm": 8.125, + "learning_rate": 8.558897091484351e-06, + "loss": 1.06407757, + "memory(GiB)": 302.58, + "step": 102100, + "train_speed(iter/s)": 0.123294 + }, + { + "acc": 0.72886229, + "epoch": 0.5711034090321245, + "grad_norm": 6.03125, + "learning_rate": 8.558247515026468e-06, + "loss": 1.08272438, + "memory(GiB)": 302.58, + "step": 102120, + "train_speed(iter/s)": 0.123305 + }, + { + "acc": 0.74271154, + "epoch": 0.5712152585051038, + "grad_norm": 8.5, + "learning_rate": 8.557597816865238e-06, + "loss": 0.99872065, + "memory(GiB)": 302.58, + "step": 102140, + "train_speed(iter/s)": 0.123316 + }, + { + "acc": 0.73640118, + "epoch": 0.5713271079780831, + "grad_norm": 7.65625, + "learning_rate": 8.55694799702288e-06, + "loss": 1.04676781, + "memory(GiB)": 302.58, + "step": 102160, + "train_speed(iter/s)": 0.123327 + }, + { + "acc": 0.73154373, + "epoch": 0.5714389574510623, + "grad_norm": 5.25, + "learning_rate": 8.556298055521623e-06, + "loss": 1.05608168, + "memory(GiB)": 302.58, + "step": 102180, + "train_speed(iter/s)": 0.123338 + }, + { + "acc": 0.73508763, + "epoch": 0.5715508069240416, + "grad_norm": 6.84375, + "learning_rate": 8.555647992383696e-06, + "loss": 1.04041204, + "memory(GiB)": 302.58, + "step": 102200, + "train_speed(iter/s)": 0.12335 + }, + { + "acc": 0.72096682, + "epoch": 0.5716626563970209, + "grad_norm": 8.25, + "learning_rate": 8.554997807631333e-06, + "loss": 1.10841846, + "memory(GiB)": 302.58, + "step": 102220, + "train_speed(iter/s)": 0.123361 + }, + { + "acc": 0.73932714, + "epoch": 0.5717745058700001, + "grad_norm": 7.34375, + "learning_rate": 8.55434750128677e-06, + "loss": 1.02778854, + "memory(GiB)": 302.58, + "step": 102240, + "train_speed(iter/s)": 0.123372 + }, + { + "acc": 0.73439126, + "epoch": 0.5718863553429794, + "grad_norm": 9.6875, + "learning_rate": 8.553697073372255e-06, + "loss": 1.04819403, + "memory(GiB)": 302.58, + "step": 102260, + "train_speed(iter/s)": 0.123383 + }, + { + "acc": 0.75564761, + "epoch": 0.5719982048159586, + "grad_norm": 7.0625, + "learning_rate": 8.55304652391003e-06, + "loss": 0.96356373, + "memory(GiB)": 302.58, + "step": 102280, + "train_speed(iter/s)": 0.123395 + }, + { + "acc": 0.72897439, + "epoch": 0.5721100542889379, + "grad_norm": 8.1875, + "learning_rate": 8.552395852922348e-06, + "loss": 1.05926924, + "memory(GiB)": 302.58, + "step": 102300, + "train_speed(iter/s)": 0.123406 + }, + { + "acc": 0.74743643, + "epoch": 0.5722219037619172, + "grad_norm": 7.625, + "learning_rate": 8.551745060431464e-06, + "loss": 0.98718262, + "memory(GiB)": 302.58, + "step": 102320, + "train_speed(iter/s)": 0.123418 + }, + { + "acc": 0.73142853, + "epoch": 0.5723337532348964, + "grad_norm": 6.03125, + "learning_rate": 8.551094146459636e-06, + "loss": 1.04472256, + "memory(GiB)": 302.58, + "step": 102340, + "train_speed(iter/s)": 0.123429 + }, + { + "acc": 0.75031748, + "epoch": 0.5724456027078757, + "grad_norm": 8.5, + "learning_rate": 8.550443111029127e-06, + "loss": 0.98066854, + "memory(GiB)": 302.58, + "step": 102360, + "train_speed(iter/s)": 0.123439 + }, + { + "acc": 0.74200954, + "epoch": 0.572557452180855, + "grad_norm": 9.375, + "learning_rate": 8.549791954162208e-06, + "loss": 1.00836115, + "memory(GiB)": 302.58, + "step": 102380, + "train_speed(iter/s)": 0.12345 + }, + { + "acc": 0.74179931, + "epoch": 0.5726693016538342, + "grad_norm": 4.5, + "learning_rate": 8.549140675881145e-06, + "loss": 1.01162748, + "memory(GiB)": 302.58, + "step": 102400, + "train_speed(iter/s)": 0.12346 + }, + { + "acc": 0.75214987, + "epoch": 0.5727811511268135, + "grad_norm": 8.1875, + "learning_rate": 8.548489276208218e-06, + "loss": 0.96638184, + "memory(GiB)": 302.58, + "step": 102420, + "train_speed(iter/s)": 0.123472 + }, + { + "acc": 0.74294443, + "epoch": 0.5728930005997928, + "grad_norm": 6.875, + "learning_rate": 8.547837755165706e-06, + "loss": 1.00945396, + "memory(GiB)": 302.58, + "step": 102440, + "train_speed(iter/s)": 0.123484 + }, + { + "acc": 0.74624534, + "epoch": 0.573004850072772, + "grad_norm": 8.3125, + "learning_rate": 8.547186112775893e-06, + "loss": 1.00001869, + "memory(GiB)": 302.58, + "step": 102460, + "train_speed(iter/s)": 0.123496 + }, + { + "acc": 0.71436124, + "epoch": 0.5731166995457513, + "grad_norm": 7.25, + "learning_rate": 8.546534349061068e-06, + "loss": 1.15615501, + "memory(GiB)": 302.58, + "step": 102480, + "train_speed(iter/s)": 0.123508 + }, + { + "acc": 0.73893061, + "epoch": 0.5732285490187305, + "grad_norm": 8.6875, + "learning_rate": 8.545882464043522e-06, + "loss": 1.03665934, + "memory(GiB)": 302.58, + "step": 102500, + "train_speed(iter/s)": 0.123519 + }, + { + "acc": 0.75839529, + "epoch": 0.5733403984917098, + "grad_norm": 6.625, + "learning_rate": 8.545230457745552e-06, + "loss": 0.94104929, + "memory(GiB)": 302.58, + "step": 102520, + "train_speed(iter/s)": 0.123531 + }, + { + "acc": 0.73011303, + "epoch": 0.5734522479646891, + "grad_norm": 7.625, + "learning_rate": 8.54457833018946e-06, + "loss": 1.04971285, + "memory(GiB)": 302.58, + "step": 102540, + "train_speed(iter/s)": 0.123543 + }, + { + "acc": 0.74190187, + "epoch": 0.5735640974376683, + "grad_norm": 5.59375, + "learning_rate": 8.543926081397546e-06, + "loss": 1.00785255, + "memory(GiB)": 302.58, + "step": 102560, + "train_speed(iter/s)": 0.123554 + }, + { + "acc": 0.7331048, + "epoch": 0.5736759469106476, + "grad_norm": 8.3125, + "learning_rate": 8.543273711392126e-06, + "loss": 1.06840105, + "memory(GiB)": 302.58, + "step": 102580, + "train_speed(iter/s)": 0.123567 + }, + { + "acc": 0.72874079, + "epoch": 0.5737877963836269, + "grad_norm": 7.78125, + "learning_rate": 8.542621220195508e-06, + "loss": 1.07088737, + "memory(GiB)": 302.58, + "step": 102600, + "train_speed(iter/s)": 0.123578 + }, + { + "acc": 0.73652711, + "epoch": 0.5738996458566061, + "grad_norm": 9.0625, + "learning_rate": 8.54196860783001e-06, + "loss": 1.03865232, + "memory(GiB)": 302.58, + "step": 102620, + "train_speed(iter/s)": 0.123589 + }, + { + "acc": 0.74228849, + "epoch": 0.5740114953295854, + "grad_norm": 8.4375, + "learning_rate": 8.541315874317957e-06, + "loss": 0.99336405, + "memory(GiB)": 302.58, + "step": 102640, + "train_speed(iter/s)": 0.123601 + }, + { + "acc": 0.74040556, + "epoch": 0.5741233448025647, + "grad_norm": 6.59375, + "learning_rate": 8.54066301968167e-06, + "loss": 1.01234951, + "memory(GiB)": 302.58, + "step": 102660, + "train_speed(iter/s)": 0.123612 + }, + { + "acc": 0.76167259, + "epoch": 0.5742351942755439, + "grad_norm": 7.15625, + "learning_rate": 8.54001004394348e-06, + "loss": 0.96189318, + "memory(GiB)": 302.58, + "step": 102680, + "train_speed(iter/s)": 0.123624 + }, + { + "acc": 0.71948814, + "epoch": 0.5743470437485232, + "grad_norm": 8.9375, + "learning_rate": 8.539356947125724e-06, + "loss": 1.10132418, + "memory(GiB)": 302.58, + "step": 102700, + "train_speed(iter/s)": 0.123636 + }, + { + "acc": 0.72975354, + "epoch": 0.5744588932215025, + "grad_norm": 5.84375, + "learning_rate": 8.538703729250734e-06, + "loss": 1.07686453, + "memory(GiB)": 302.58, + "step": 102720, + "train_speed(iter/s)": 0.123646 + }, + { + "acc": 0.7309309, + "epoch": 0.5745707426944817, + "grad_norm": 7.8125, + "learning_rate": 8.538050390340858e-06, + "loss": 1.07848225, + "memory(GiB)": 302.58, + "step": 102740, + "train_speed(iter/s)": 0.123657 + }, + { + "acc": 0.7417851, + "epoch": 0.574682592167461, + "grad_norm": 9.1875, + "learning_rate": 8.53739693041844e-06, + "loss": 1.02799244, + "memory(GiB)": 302.58, + "step": 102760, + "train_speed(iter/s)": 0.123669 + }, + { + "acc": 0.71570053, + "epoch": 0.5747944416404402, + "grad_norm": 5.5, + "learning_rate": 8.53674334950583e-06, + "loss": 1.12472181, + "memory(GiB)": 302.58, + "step": 102780, + "train_speed(iter/s)": 0.12368 + }, + { + "acc": 0.7399292, + "epoch": 0.5749062911134195, + "grad_norm": 5.3125, + "learning_rate": 8.53608964762538e-06, + "loss": 1.00643606, + "memory(GiB)": 302.58, + "step": 102800, + "train_speed(iter/s)": 0.123691 + }, + { + "acc": 0.74794903, + "epoch": 0.5750181405863988, + "grad_norm": 7.96875, + "learning_rate": 8.535435824799454e-06, + "loss": 0.99498005, + "memory(GiB)": 302.58, + "step": 102820, + "train_speed(iter/s)": 0.123703 + }, + { + "acc": 0.7386457, + "epoch": 0.575129990059378, + "grad_norm": 8.375, + "learning_rate": 8.534781881050412e-06, + "loss": 1.04481144, + "memory(GiB)": 302.58, + "step": 102840, + "train_speed(iter/s)": 0.123714 + }, + { + "acc": 0.74622898, + "epoch": 0.5752418395323573, + "grad_norm": 7.9375, + "learning_rate": 8.534127816400621e-06, + "loss": 0.97653513, + "memory(GiB)": 302.58, + "step": 102860, + "train_speed(iter/s)": 0.123726 + }, + { + "acc": 0.74410872, + "epoch": 0.5753536890053366, + "grad_norm": 6.65625, + "learning_rate": 8.533473630872453e-06, + "loss": 0.99192629, + "memory(GiB)": 302.58, + "step": 102880, + "train_speed(iter/s)": 0.123737 + }, + { + "acc": 0.71464734, + "epoch": 0.5754655384783158, + "grad_norm": 6.6875, + "learning_rate": 8.53281932448828e-06, + "loss": 1.15156527, + "memory(GiB)": 302.58, + "step": 102900, + "train_speed(iter/s)": 0.123748 + }, + { + "acc": 0.74043412, + "epoch": 0.5755773879512951, + "grad_norm": 7.78125, + "learning_rate": 8.532164897270486e-06, + "loss": 1.02263803, + "memory(GiB)": 302.58, + "step": 102920, + "train_speed(iter/s)": 0.123759 + }, + { + "acc": 0.74184146, + "epoch": 0.5756892374242744, + "grad_norm": 6.21875, + "learning_rate": 8.531510349241452e-06, + "loss": 1.02570953, + "memory(GiB)": 302.58, + "step": 102940, + "train_speed(iter/s)": 0.123771 + }, + { + "acc": 0.72545948, + "epoch": 0.5758010868972536, + "grad_norm": 6.84375, + "learning_rate": 8.530855680423566e-06, + "loss": 1.07023592, + "memory(GiB)": 302.58, + "step": 102960, + "train_speed(iter/s)": 0.123783 + }, + { + "acc": 0.73458543, + "epoch": 0.5759129363702329, + "grad_norm": 6.8125, + "learning_rate": 8.53020089083922e-06, + "loss": 1.06063137, + "memory(GiB)": 302.58, + "step": 102980, + "train_speed(iter/s)": 0.123794 + }, + { + "acc": 0.7399271, + "epoch": 0.5760247858432122, + "grad_norm": 7.15625, + "learning_rate": 8.529545980510808e-06, + "loss": 1.00694494, + "memory(GiB)": 302.58, + "step": 103000, + "train_speed(iter/s)": 0.123805 + }, + { + "acc": 0.73861451, + "epoch": 0.5761366353161914, + "grad_norm": 5.90625, + "learning_rate": 8.528890949460733e-06, + "loss": 1.01914272, + "memory(GiB)": 302.58, + "step": 103020, + "train_speed(iter/s)": 0.123817 + }, + { + "acc": 0.74006557, + "epoch": 0.5762484847891708, + "grad_norm": 4.5625, + "learning_rate": 8.528235797711398e-06, + "loss": 1.06541739, + "memory(GiB)": 302.58, + "step": 103040, + "train_speed(iter/s)": 0.123829 + }, + { + "acc": 0.74468837, + "epoch": 0.57636033426215, + "grad_norm": 6.5625, + "learning_rate": 8.52758052528521e-06, + "loss": 1.02010927, + "memory(GiB)": 302.58, + "step": 103060, + "train_speed(iter/s)": 0.123841 + }, + { + "acc": 0.73620486, + "epoch": 0.5764721837351293, + "grad_norm": 8.0, + "learning_rate": 8.526925132204582e-06, + "loss": 1.04188795, + "memory(GiB)": 302.58, + "step": 103080, + "train_speed(iter/s)": 0.123852 + }, + { + "acc": 0.73333983, + "epoch": 0.5765840332081086, + "grad_norm": 7.59375, + "learning_rate": 8.526269618491932e-06, + "loss": 1.06367197, + "memory(GiB)": 302.58, + "step": 103100, + "train_speed(iter/s)": 0.123863 + }, + { + "acc": 0.742976, + "epoch": 0.5766958826810878, + "grad_norm": 7.09375, + "learning_rate": 8.525613984169678e-06, + "loss": 0.99155102, + "memory(GiB)": 302.58, + "step": 103120, + "train_speed(iter/s)": 0.123875 + }, + { + "acc": 0.73614707, + "epoch": 0.5768077321540671, + "grad_norm": 8.0, + "learning_rate": 8.524958229260249e-06, + "loss": 1.03454084, + "memory(GiB)": 302.58, + "step": 103140, + "train_speed(iter/s)": 0.123886 + }, + { + "acc": 0.74207649, + "epoch": 0.5769195816270464, + "grad_norm": 8.3125, + "learning_rate": 8.52430235378607e-06, + "loss": 1.01722326, + "memory(GiB)": 302.58, + "step": 103160, + "train_speed(iter/s)": 0.123897 + }, + { + "acc": 0.73785529, + "epoch": 0.5770314311000256, + "grad_norm": 6.71875, + "learning_rate": 8.523646357769575e-06, + "loss": 1.02940149, + "memory(GiB)": 302.58, + "step": 103180, + "train_speed(iter/s)": 0.123907 + }, + { + "acc": 0.7472795, + "epoch": 0.5771432805730049, + "grad_norm": 7.3125, + "learning_rate": 8.5229902412332e-06, + "loss": 0.99759769, + "memory(GiB)": 302.58, + "step": 103200, + "train_speed(iter/s)": 0.123918 + }, + { + "acc": 0.73245091, + "epoch": 0.5772551300459842, + "grad_norm": 7.8125, + "learning_rate": 8.522334004199389e-06, + "loss": 1.05575609, + "memory(GiB)": 302.58, + "step": 103220, + "train_speed(iter/s)": 0.12393 + }, + { + "acc": 0.73992596, + "epoch": 0.5773669795189634, + "grad_norm": 6.09375, + "learning_rate": 8.521677646690586e-06, + "loss": 1.01464138, + "memory(GiB)": 302.58, + "step": 103240, + "train_speed(iter/s)": 0.123941 + }, + { + "acc": 0.74434228, + "epoch": 0.5774788289919427, + "grad_norm": 5.5625, + "learning_rate": 8.52102116872924e-06, + "loss": 0.99833899, + "memory(GiB)": 302.58, + "step": 103260, + "train_speed(iter/s)": 0.123953 + }, + { + "acc": 0.74300342, + "epoch": 0.577590678464922, + "grad_norm": 6.40625, + "learning_rate": 8.520364570337808e-06, + "loss": 1.00858755, + "memory(GiB)": 302.58, + "step": 103280, + "train_speed(iter/s)": 0.123964 + }, + { + "acc": 0.73496447, + "epoch": 0.5777025279379012, + "grad_norm": 5.59375, + "learning_rate": 8.51970785153874e-06, + "loss": 1.03856335, + "memory(GiB)": 302.58, + "step": 103300, + "train_speed(iter/s)": 0.123976 + }, + { + "acc": 0.73960872, + "epoch": 0.5778143774108805, + "grad_norm": 9.125, + "learning_rate": 8.519051012354506e-06, + "loss": 1.00302668, + "memory(GiB)": 302.58, + "step": 103320, + "train_speed(iter/s)": 0.123987 + }, + { + "acc": 0.73736439, + "epoch": 0.5779262268838598, + "grad_norm": 6.4375, + "learning_rate": 8.518394052807568e-06, + "loss": 1.01851892, + "memory(GiB)": 302.58, + "step": 103340, + "train_speed(iter/s)": 0.123998 + }, + { + "acc": 0.73969965, + "epoch": 0.578038076356839, + "grad_norm": 6.59375, + "learning_rate": 8.517736972920397e-06, + "loss": 1.02787657, + "memory(GiB)": 302.58, + "step": 103360, + "train_speed(iter/s)": 0.124009 + }, + { + "acc": 0.73315973, + "epoch": 0.5781499258298183, + "grad_norm": 10.3125, + "learning_rate": 8.517079772715468e-06, + "loss": 1.05384426, + "memory(GiB)": 302.58, + "step": 103380, + "train_speed(iter/s)": 0.124021 + }, + { + "acc": 0.74178243, + "epoch": 0.5782617753027975, + "grad_norm": 6.9375, + "learning_rate": 8.516422452215257e-06, + "loss": 1.01783838, + "memory(GiB)": 302.58, + "step": 103400, + "train_speed(iter/s)": 0.124032 + }, + { + "acc": 0.750948, + "epoch": 0.5783736247757768, + "grad_norm": 7.28125, + "learning_rate": 8.515765011442248e-06, + "loss": 0.97142811, + "memory(GiB)": 302.58, + "step": 103420, + "train_speed(iter/s)": 0.124043 + }, + { + "acc": 0.73789263, + "epoch": 0.5784854742487561, + "grad_norm": 8.0, + "learning_rate": 8.51510745041893e-06, + "loss": 1.00847054, + "memory(GiB)": 302.58, + "step": 103440, + "train_speed(iter/s)": 0.124054 + }, + { + "acc": 0.73828702, + "epoch": 0.5785973237217353, + "grad_norm": 7.90625, + "learning_rate": 8.514449769167787e-06, + "loss": 1.02545471, + "memory(GiB)": 302.58, + "step": 103460, + "train_speed(iter/s)": 0.124065 + }, + { + "acc": 0.72181568, + "epoch": 0.5787091731947146, + "grad_norm": 8.9375, + "learning_rate": 8.51379196771132e-06, + "loss": 1.10265446, + "memory(GiB)": 302.58, + "step": 103480, + "train_speed(iter/s)": 0.124077 + }, + { + "acc": 0.72829556, + "epoch": 0.5788210226676939, + "grad_norm": 7.5, + "learning_rate": 8.513134046072026e-06, + "loss": 1.06540613, + "memory(GiB)": 302.58, + "step": 103500, + "train_speed(iter/s)": 0.124088 + }, + { + "acc": 0.7532618, + "epoch": 0.5789328721406731, + "grad_norm": 7.09375, + "learning_rate": 8.512476004272407e-06, + "loss": 0.96650734, + "memory(GiB)": 302.58, + "step": 103520, + "train_speed(iter/s)": 0.124099 + }, + { + "acc": 0.73751125, + "epoch": 0.5790447216136524, + "grad_norm": 9.5625, + "learning_rate": 8.51181784233497e-06, + "loss": 1.02771358, + "memory(GiB)": 302.58, + "step": 103540, + "train_speed(iter/s)": 0.12411 + }, + { + "acc": 0.73327713, + "epoch": 0.5791565710866317, + "grad_norm": 7.6875, + "learning_rate": 8.511159560282229e-06, + "loss": 1.04529104, + "memory(GiB)": 302.58, + "step": 103560, + "train_speed(iter/s)": 0.124122 + }, + { + "acc": 0.73978496, + "epoch": 0.5792684205596109, + "grad_norm": 8.8125, + "learning_rate": 8.510501158136698e-06, + "loss": 1.0255991, + "memory(GiB)": 302.58, + "step": 103580, + "train_speed(iter/s)": 0.124132 + }, + { + "acc": 0.73263364, + "epoch": 0.5793802700325902, + "grad_norm": 9.8125, + "learning_rate": 8.509842635920894e-06, + "loss": 1.05286398, + "memory(GiB)": 302.58, + "step": 103600, + "train_speed(iter/s)": 0.124144 + }, + { + "acc": 0.74180675, + "epoch": 0.5794921195055694, + "grad_norm": 6.96875, + "learning_rate": 8.509183993657344e-06, + "loss": 1.01893587, + "memory(GiB)": 302.58, + "step": 103620, + "train_speed(iter/s)": 0.124155 + }, + { + "acc": 0.74408851, + "epoch": 0.5796039689785487, + "grad_norm": 8.75, + "learning_rate": 8.508525231368572e-06, + "loss": 1.01891613, + "memory(GiB)": 302.58, + "step": 103640, + "train_speed(iter/s)": 0.124166 + }, + { + "acc": 0.74019432, + "epoch": 0.579715818451528, + "grad_norm": 5.25, + "learning_rate": 8.507866349077115e-06, + "loss": 1.02985821, + "memory(GiB)": 302.58, + "step": 103660, + "train_speed(iter/s)": 0.124177 + }, + { + "acc": 0.73757143, + "epoch": 0.5798276679245072, + "grad_norm": 9.6875, + "learning_rate": 8.507207346805504e-06, + "loss": 1.02692757, + "memory(GiB)": 302.58, + "step": 103680, + "train_speed(iter/s)": 0.124188 + }, + { + "acc": 0.73111153, + "epoch": 0.5799395173974865, + "grad_norm": 8.0625, + "learning_rate": 8.506548224576278e-06, + "loss": 1.05072775, + "memory(GiB)": 302.58, + "step": 103700, + "train_speed(iter/s)": 0.124199 + }, + { + "acc": 0.74539671, + "epoch": 0.5800513668704658, + "grad_norm": 4.53125, + "learning_rate": 8.505888982411987e-06, + "loss": 0.99994678, + "memory(GiB)": 302.58, + "step": 103720, + "train_speed(iter/s)": 0.124211 + }, + { + "acc": 0.73667188, + "epoch": 0.580163216343445, + "grad_norm": 7.3125, + "learning_rate": 8.505229620335176e-06, + "loss": 1.02988644, + "memory(GiB)": 302.58, + "step": 103740, + "train_speed(iter/s)": 0.124222 + }, + { + "acc": 0.73524013, + "epoch": 0.5802750658164243, + "grad_norm": 6.0625, + "learning_rate": 8.504570138368396e-06, + "loss": 1.03762894, + "memory(GiB)": 302.58, + "step": 103760, + "train_speed(iter/s)": 0.124234 + }, + { + "acc": 0.76233654, + "epoch": 0.5803869152894036, + "grad_norm": 6.40625, + "learning_rate": 8.503910536534205e-06, + "loss": 0.93681879, + "memory(GiB)": 302.58, + "step": 103780, + "train_speed(iter/s)": 0.124245 + }, + { + "acc": 0.72688107, + "epoch": 0.5804987647623828, + "grad_norm": 5.8125, + "learning_rate": 8.503250814855164e-06, + "loss": 1.07237215, + "memory(GiB)": 302.58, + "step": 103800, + "train_speed(iter/s)": 0.124256 + }, + { + "acc": 0.74736896, + "epoch": 0.5806106142353621, + "grad_norm": 7.40625, + "learning_rate": 8.502590973353835e-06, + "loss": 1.00156841, + "memory(GiB)": 302.58, + "step": 103820, + "train_speed(iter/s)": 0.124266 + }, + { + "acc": 0.74930668, + "epoch": 0.5807224637083414, + "grad_norm": 6.96875, + "learning_rate": 8.501931012052788e-06, + "loss": 0.97393541, + "memory(GiB)": 302.58, + "step": 103840, + "train_speed(iter/s)": 0.124277 + }, + { + "acc": 0.72616253, + "epoch": 0.5808343131813206, + "grad_norm": 5.4375, + "learning_rate": 8.501270930974597e-06, + "loss": 1.07951603, + "memory(GiB)": 302.58, + "step": 103860, + "train_speed(iter/s)": 0.124289 + }, + { + "acc": 0.73142071, + "epoch": 0.5809461626542999, + "grad_norm": 9.4375, + "learning_rate": 8.500610730141838e-06, + "loss": 1.04115391, + "memory(GiB)": 302.58, + "step": 103880, + "train_speed(iter/s)": 0.1243 + }, + { + "acc": 0.72818403, + "epoch": 0.5810580121272791, + "grad_norm": 5.53125, + "learning_rate": 8.499950409577091e-06, + "loss": 1.07701511, + "memory(GiB)": 302.58, + "step": 103900, + "train_speed(iter/s)": 0.124312 + }, + { + "acc": 0.75528607, + "epoch": 0.5811698616002584, + "grad_norm": 9.0625, + "learning_rate": 8.499289969302944e-06, + "loss": 0.95609035, + "memory(GiB)": 302.58, + "step": 103920, + "train_speed(iter/s)": 0.124323 + }, + { + "acc": 0.73788199, + "epoch": 0.5812817110732377, + "grad_norm": 7.0625, + "learning_rate": 8.498629409341983e-06, + "loss": 1.03480005, + "memory(GiB)": 302.58, + "step": 103940, + "train_speed(iter/s)": 0.124334 + }, + { + "acc": 0.74583664, + "epoch": 0.5813935605462169, + "grad_norm": 8.9375, + "learning_rate": 8.497968729716802e-06, + "loss": 0.98833151, + "memory(GiB)": 302.58, + "step": 103960, + "train_speed(iter/s)": 0.124344 + }, + { + "acc": 0.74971972, + "epoch": 0.5815054100191962, + "grad_norm": 5.28125, + "learning_rate": 8.497307930449999e-06, + "loss": 0.96550007, + "memory(GiB)": 302.58, + "step": 103980, + "train_speed(iter/s)": 0.124355 + }, + { + "acc": 0.74016767, + "epoch": 0.5816172594921755, + "grad_norm": 5.8125, + "learning_rate": 8.496647011564176e-06, + "loss": 1.0350316, + "memory(GiB)": 302.58, + "step": 104000, + "train_speed(iter/s)": 0.124366 + }, + { + "epoch": 0.5816172594921755, + "eval_acc": 0.7026817648919922, + "eval_loss": 1.0302927494049072, + "eval_runtime": 7508.3064, + "eval_samples_per_second": 10.027, + "eval_steps_per_second": 10.027, + "step": 104000 + }, + { + "acc": 0.74474983, + "epoch": 0.5817291089651547, + "grad_norm": 8.0, + "learning_rate": 8.495985973081936e-06, + "loss": 0.9857996, + "memory(GiB)": 302.58, + "step": 104020, + "train_speed(iter/s)": 0.123252 + }, + { + "acc": 0.72272549, + "epoch": 0.581840958438134, + "grad_norm": 7.28125, + "learning_rate": 8.49532481502589e-06, + "loss": 1.09590521, + "memory(GiB)": 302.58, + "step": 104040, + "train_speed(iter/s)": 0.123263 + }, + { + "acc": 0.73872118, + "epoch": 0.5819528079111133, + "grad_norm": 8.875, + "learning_rate": 8.494663537418653e-06, + "loss": 1.00278454, + "memory(GiB)": 302.58, + "step": 104060, + "train_speed(iter/s)": 0.123273 + }, + { + "acc": 0.71867099, + "epoch": 0.5820646573840925, + "grad_norm": 6.375, + "learning_rate": 8.494002140282841e-06, + "loss": 1.12284718, + "memory(GiB)": 302.58, + "step": 104080, + "train_speed(iter/s)": 0.123284 + }, + { + "acc": 0.72819972, + "epoch": 0.5821765068570718, + "grad_norm": 6.59375, + "learning_rate": 8.493340623641079e-06, + "loss": 1.09854469, + "memory(GiB)": 302.58, + "step": 104100, + "train_speed(iter/s)": 0.123296 + }, + { + "acc": 0.73716888, + "epoch": 0.582288356330051, + "grad_norm": 8.375, + "learning_rate": 8.492678987515988e-06, + "loss": 1.03348484, + "memory(GiB)": 302.58, + "step": 104120, + "train_speed(iter/s)": 0.123307 + }, + { + "acc": 0.72885127, + "epoch": 0.5824002058030303, + "grad_norm": 6.125, + "learning_rate": 8.492017231930202e-06, + "loss": 1.0680315, + "memory(GiB)": 302.58, + "step": 104140, + "train_speed(iter/s)": 0.123318 + }, + { + "acc": 0.74290266, + "epoch": 0.5825120552760096, + "grad_norm": 8.75, + "learning_rate": 8.491355356906353e-06, + "loss": 1.01321049, + "memory(GiB)": 302.58, + "step": 104160, + "train_speed(iter/s)": 0.123329 + }, + { + "acc": 0.73328619, + "epoch": 0.5826239047489888, + "grad_norm": 7.75, + "learning_rate": 8.49069336246708e-06, + "loss": 1.06471205, + "memory(GiB)": 302.58, + "step": 104180, + "train_speed(iter/s)": 0.12334 + }, + { + "acc": 0.76037135, + "epoch": 0.5827357542219681, + "grad_norm": 8.5625, + "learning_rate": 8.490031248635027e-06, + "loss": 0.93427372, + "memory(GiB)": 302.58, + "step": 104200, + "train_speed(iter/s)": 0.123352 + }, + { + "acc": 0.73513532, + "epoch": 0.5828476036949474, + "grad_norm": 7.28125, + "learning_rate": 8.489369015432836e-06, + "loss": 1.05470762, + "memory(GiB)": 302.58, + "step": 104220, + "train_speed(iter/s)": 0.123363 + }, + { + "acc": 0.72164917, + "epoch": 0.5829594531679266, + "grad_norm": 8.625, + "learning_rate": 8.488706662883162e-06, + "loss": 1.0954586, + "memory(GiB)": 302.58, + "step": 104240, + "train_speed(iter/s)": 0.123373 + }, + { + "acc": 0.73398209, + "epoch": 0.5830713026409059, + "grad_norm": 6.1875, + "learning_rate": 8.488044191008658e-06, + "loss": 1.06304855, + "memory(GiB)": 302.58, + "step": 104260, + "train_speed(iter/s)": 0.123384 + }, + { + "acc": 0.7327374, + "epoch": 0.5831831521138852, + "grad_norm": 8.5625, + "learning_rate": 8.487381599831981e-06, + "loss": 1.05338211, + "memory(GiB)": 302.58, + "step": 104280, + "train_speed(iter/s)": 0.123394 + }, + { + "acc": 0.74118733, + "epoch": 0.5832950015868644, + "grad_norm": 7.09375, + "learning_rate": 8.486718889375796e-06, + "loss": 1.01640444, + "memory(GiB)": 302.58, + "step": 104300, + "train_speed(iter/s)": 0.123405 + }, + { + "acc": 0.73696952, + "epoch": 0.5834068510598437, + "grad_norm": 6.0625, + "learning_rate": 8.486056059662768e-06, + "loss": 1.02634315, + "memory(GiB)": 302.58, + "step": 104320, + "train_speed(iter/s)": 0.123416 + }, + { + "acc": 0.72527084, + "epoch": 0.583518700532823, + "grad_norm": 9.8125, + "learning_rate": 8.485393110715569e-06, + "loss": 1.08932257, + "memory(GiB)": 302.58, + "step": 104340, + "train_speed(iter/s)": 0.123426 + }, + { + "acc": 0.73834925, + "epoch": 0.5836305500058022, + "grad_norm": 6.03125, + "learning_rate": 8.484730042556874e-06, + "loss": 1.04998503, + "memory(GiB)": 302.58, + "step": 104360, + "train_speed(iter/s)": 0.123438 + }, + { + "acc": 0.72906413, + "epoch": 0.5837423994787815, + "grad_norm": 8.25, + "learning_rate": 8.484066855209362e-06, + "loss": 1.07530212, + "memory(GiB)": 302.58, + "step": 104380, + "train_speed(iter/s)": 0.123449 + }, + { + "acc": 0.73892455, + "epoch": 0.5838542489517607, + "grad_norm": 8.625, + "learning_rate": 8.483403548695716e-06, + "loss": 1.01496334, + "memory(GiB)": 302.58, + "step": 104400, + "train_speed(iter/s)": 0.12346 + }, + { + "acc": 0.74597707, + "epoch": 0.58396609842474, + "grad_norm": 6.15625, + "learning_rate": 8.482740123038622e-06, + "loss": 0.98419895, + "memory(GiB)": 302.58, + "step": 104420, + "train_speed(iter/s)": 0.123471 + }, + { + "acc": 0.73608508, + "epoch": 0.5840779478977193, + "grad_norm": 7.4375, + "learning_rate": 8.482076578260774e-06, + "loss": 1.03118229, + "memory(GiB)": 302.58, + "step": 104440, + "train_speed(iter/s)": 0.123482 + }, + { + "acc": 0.73366914, + "epoch": 0.5841897973706985, + "grad_norm": 9.125, + "learning_rate": 8.481412914384865e-06, + "loss": 1.05684242, + "memory(GiB)": 302.58, + "step": 104460, + "train_speed(iter/s)": 0.123493 + }, + { + "acc": 0.74172053, + "epoch": 0.5843016468436778, + "grad_norm": 6.375, + "learning_rate": 8.480749131433595e-06, + "loss": 1.00530119, + "memory(GiB)": 302.58, + "step": 104480, + "train_speed(iter/s)": 0.123505 + }, + { + "acc": 0.72762694, + "epoch": 0.5844134963166571, + "grad_norm": 8.5, + "learning_rate": 8.480085229429666e-06, + "loss": 1.06008034, + "memory(GiB)": 302.58, + "step": 104500, + "train_speed(iter/s)": 0.123516 + }, + { + "acc": 0.73767657, + "epoch": 0.5845253457896363, + "grad_norm": 7.65625, + "learning_rate": 8.479421208395788e-06, + "loss": 1.03155556, + "memory(GiB)": 302.58, + "step": 104520, + "train_speed(iter/s)": 0.123528 + }, + { + "acc": 0.74723988, + "epoch": 0.5846371952626156, + "grad_norm": 6.3125, + "learning_rate": 8.478757068354673e-06, + "loss": 0.98619509, + "memory(GiB)": 302.58, + "step": 104540, + "train_speed(iter/s)": 0.123539 + }, + { + "acc": 0.73361807, + "epoch": 0.5847490447355949, + "grad_norm": 6.15625, + "learning_rate": 8.478092809329035e-06, + "loss": 1.05449438, + "memory(GiB)": 302.58, + "step": 104560, + "train_speed(iter/s)": 0.12355 + }, + { + "acc": 0.72922974, + "epoch": 0.5848608942085741, + "grad_norm": 5.90625, + "learning_rate": 8.477428431341594e-06, + "loss": 1.05636787, + "memory(GiB)": 302.58, + "step": 104580, + "train_speed(iter/s)": 0.123561 + }, + { + "acc": 0.74683022, + "epoch": 0.5849727436815534, + "grad_norm": 6.75, + "learning_rate": 8.476763934415074e-06, + "loss": 0.99456844, + "memory(GiB)": 302.58, + "step": 104600, + "train_speed(iter/s)": 0.123572 + }, + { + "acc": 0.727878, + "epoch": 0.5850845931545327, + "grad_norm": 5.28125, + "learning_rate": 8.476099318572202e-06, + "loss": 1.09241018, + "memory(GiB)": 302.58, + "step": 104620, + "train_speed(iter/s)": 0.123584 + }, + { + "acc": 0.73862543, + "epoch": 0.5851964426275119, + "grad_norm": 8.8125, + "learning_rate": 8.475434583835712e-06, + "loss": 1.03648691, + "memory(GiB)": 302.58, + "step": 104640, + "train_speed(iter/s)": 0.123595 + }, + { + "acc": 0.74284706, + "epoch": 0.5853082921004912, + "grad_norm": 8.4375, + "learning_rate": 8.474769730228338e-06, + "loss": 1.02872715, + "memory(GiB)": 302.58, + "step": 104660, + "train_speed(iter/s)": 0.123606 + }, + { + "acc": 0.7370317, + "epoch": 0.5854201415734704, + "grad_norm": 6.5, + "learning_rate": 8.474104757772822e-06, + "loss": 1.04920473, + "memory(GiB)": 302.58, + "step": 104680, + "train_speed(iter/s)": 0.123617 + }, + { + "acc": 0.71729574, + "epoch": 0.5855319910464497, + "grad_norm": 8.6875, + "learning_rate": 8.473439666491907e-06, + "loss": 1.11222305, + "memory(GiB)": 302.58, + "step": 104700, + "train_speed(iter/s)": 0.123628 + }, + { + "acc": 0.74033065, + "epoch": 0.585643840519429, + "grad_norm": 7.9375, + "learning_rate": 8.472774456408342e-06, + "loss": 1.02619019, + "memory(GiB)": 302.58, + "step": 104720, + "train_speed(iter/s)": 0.12364 + }, + { + "acc": 0.72941446, + "epoch": 0.5857556899924082, + "grad_norm": 8.0625, + "learning_rate": 8.472109127544877e-06, + "loss": 1.1137455, + "memory(GiB)": 302.58, + "step": 104740, + "train_speed(iter/s)": 0.123651 + }, + { + "acc": 0.7365911, + "epoch": 0.5858675394653875, + "grad_norm": 7.4375, + "learning_rate": 8.471443679924271e-06, + "loss": 1.0415369, + "memory(GiB)": 302.58, + "step": 104760, + "train_speed(iter/s)": 0.123662 + }, + { + "acc": 0.73548908, + "epoch": 0.5859793889383668, + "grad_norm": 6.78125, + "learning_rate": 8.470778113569285e-06, + "loss": 1.06192293, + "memory(GiB)": 302.58, + "step": 104780, + "train_speed(iter/s)": 0.123673 + }, + { + "acc": 0.74536433, + "epoch": 0.586091238411346, + "grad_norm": 7.25, + "learning_rate": 8.470112428502679e-06, + "loss": 0.9888607, + "memory(GiB)": 302.58, + "step": 104800, + "train_speed(iter/s)": 0.123683 + }, + { + "acc": 0.7387116, + "epoch": 0.5862030878843253, + "grad_norm": 5.03125, + "learning_rate": 8.469446624747226e-06, + "loss": 1.0287446, + "memory(GiB)": 302.58, + "step": 104820, + "train_speed(iter/s)": 0.123694 + }, + { + "acc": 0.73078966, + "epoch": 0.5863149373573046, + "grad_norm": 6.28125, + "learning_rate": 8.468780702325698e-06, + "loss": 1.07006073, + "memory(GiB)": 302.58, + "step": 104840, + "train_speed(iter/s)": 0.123706 + }, + { + "acc": 0.73858118, + "epoch": 0.5864267868302838, + "grad_norm": 8.4375, + "learning_rate": 8.46811466126087e-06, + "loss": 1.0323988, + "memory(GiB)": 302.58, + "step": 104860, + "train_speed(iter/s)": 0.123718 + }, + { + "acc": 0.74167376, + "epoch": 0.5865386363032631, + "grad_norm": 9.125, + "learning_rate": 8.467448501575524e-06, + "loss": 1.02471838, + "memory(GiB)": 302.58, + "step": 104880, + "train_speed(iter/s)": 0.123729 + }, + { + "acc": 0.7592844, + "epoch": 0.5866504857762423, + "grad_norm": 7.8125, + "learning_rate": 8.466782223292443e-06, + "loss": 0.94006529, + "memory(GiB)": 302.58, + "step": 104900, + "train_speed(iter/s)": 0.123741 + }, + { + "acc": 0.73280902, + "epoch": 0.5867623352492216, + "grad_norm": 8.9375, + "learning_rate": 8.466115826434418e-06, + "loss": 1.05930166, + "memory(GiB)": 302.58, + "step": 104920, + "train_speed(iter/s)": 0.123752 + }, + { + "acc": 0.72924376, + "epoch": 0.5868741847222009, + "grad_norm": 8.375, + "learning_rate": 8.465449311024241e-06, + "loss": 1.07770557, + "memory(GiB)": 302.58, + "step": 104940, + "train_speed(iter/s)": 0.123764 + }, + { + "acc": 0.73155394, + "epoch": 0.5869860341951801, + "grad_norm": 6.46875, + "learning_rate": 8.464782677084709e-06, + "loss": 1.06764555, + "memory(GiB)": 302.58, + "step": 104960, + "train_speed(iter/s)": 0.123774 + }, + { + "acc": 0.73995242, + "epoch": 0.5870978836681594, + "grad_norm": 6.34375, + "learning_rate": 8.464115924638621e-06, + "loss": 1.01598082, + "memory(GiB)": 302.58, + "step": 104980, + "train_speed(iter/s)": 0.123785 + }, + { + "acc": 0.74568338, + "epoch": 0.5872097331411387, + "grad_norm": 5.9375, + "learning_rate": 8.463449053708787e-06, + "loss": 0.97019482, + "memory(GiB)": 302.58, + "step": 105000, + "train_speed(iter/s)": 0.123797 + }, + { + "acc": 0.73385658, + "epoch": 0.5873215826141179, + "grad_norm": 5.1875, + "learning_rate": 8.46278206431801e-06, + "loss": 1.0524106, + "memory(GiB)": 302.58, + "step": 105020, + "train_speed(iter/s)": 0.123808 + }, + { + "acc": 0.71497898, + "epoch": 0.5874334320870972, + "grad_norm": 7.65625, + "learning_rate": 8.46211495648911e-06, + "loss": 1.12009296, + "memory(GiB)": 302.58, + "step": 105040, + "train_speed(iter/s)": 0.12382 + }, + { + "acc": 0.71946435, + "epoch": 0.5875452815600765, + "grad_norm": 10.1875, + "learning_rate": 8.461447730244897e-06, + "loss": 1.0900279, + "memory(GiB)": 302.58, + "step": 105060, + "train_speed(iter/s)": 0.12383 + }, + { + "acc": 0.74603901, + "epoch": 0.5876571310330557, + "grad_norm": 6.34375, + "learning_rate": 8.460780385608196e-06, + "loss": 1.00011148, + "memory(GiB)": 302.58, + "step": 105080, + "train_speed(iter/s)": 0.123841 + }, + { + "acc": 0.73834448, + "epoch": 0.587768980506035, + "grad_norm": 5.46875, + "learning_rate": 8.460112922601834e-06, + "loss": 1.03564415, + "memory(GiB)": 302.58, + "step": 105100, + "train_speed(iter/s)": 0.123853 + }, + { + "acc": 0.7390193, + "epoch": 0.5878808299790143, + "grad_norm": 8.5625, + "learning_rate": 8.459445341248636e-06, + "loss": 1.05358419, + "memory(GiB)": 302.58, + "step": 105120, + "train_speed(iter/s)": 0.123863 + }, + { + "acc": 0.74238482, + "epoch": 0.5879926794519935, + "grad_norm": 9.4375, + "learning_rate": 8.45877764157144e-06, + "loss": 1.01983995, + "memory(GiB)": 302.58, + "step": 105140, + "train_speed(iter/s)": 0.123874 + }, + { + "acc": 0.74090533, + "epoch": 0.5881045289249728, + "grad_norm": 6.84375, + "learning_rate": 8.45810982359308e-06, + "loss": 1.01724796, + "memory(GiB)": 302.58, + "step": 105160, + "train_speed(iter/s)": 0.123885 + }, + { + "acc": 0.74231148, + "epoch": 0.588216378397952, + "grad_norm": 6.375, + "learning_rate": 8.457441887336398e-06, + "loss": 1.01676226, + "memory(GiB)": 302.58, + "step": 105180, + "train_speed(iter/s)": 0.123896 + }, + { + "acc": 0.73895903, + "epoch": 0.5883282278709313, + "grad_norm": 7.875, + "learning_rate": 8.456773832824242e-06, + "loss": 1.04132519, + "memory(GiB)": 302.58, + "step": 105200, + "train_speed(iter/s)": 0.123907 + }, + { + "acc": 0.76664472, + "epoch": 0.5884400773439106, + "grad_norm": 7.375, + "learning_rate": 8.45610566007946e-06, + "loss": 0.9030673, + "memory(GiB)": 302.58, + "step": 105220, + "train_speed(iter/s)": 0.123916 + }, + { + "acc": 0.72351975, + "epoch": 0.5885519268168898, + "grad_norm": 6.1875, + "learning_rate": 8.455437369124903e-06, + "loss": 1.10269165, + "memory(GiB)": 302.58, + "step": 105240, + "train_speed(iter/s)": 0.123928 + }, + { + "acc": 0.74919243, + "epoch": 0.5886637762898691, + "grad_norm": 9.375, + "learning_rate": 8.454768959983432e-06, + "loss": 0.97633839, + "memory(GiB)": 302.58, + "step": 105260, + "train_speed(iter/s)": 0.123939 + }, + { + "acc": 0.72054505, + "epoch": 0.5887756257628484, + "grad_norm": 6.9375, + "learning_rate": 8.454100432677907e-06, + "loss": 1.10240068, + "memory(GiB)": 302.58, + "step": 105280, + "train_speed(iter/s)": 0.12395 + }, + { + "acc": 0.74034123, + "epoch": 0.5888874752358276, + "grad_norm": 7.4375, + "learning_rate": 8.453431787231197e-06, + "loss": 1.03917027, + "memory(GiB)": 302.58, + "step": 105300, + "train_speed(iter/s)": 0.12396 + }, + { + "acc": 0.73549438, + "epoch": 0.5889993247088069, + "grad_norm": 8.0625, + "learning_rate": 8.452763023666167e-06, + "loss": 1.0520647, + "memory(GiB)": 302.58, + "step": 105320, + "train_speed(iter/s)": 0.123971 + }, + { + "acc": 0.71744881, + "epoch": 0.5891111741817862, + "grad_norm": 8.0625, + "learning_rate": 8.452094142005694e-06, + "loss": 1.1352416, + "memory(GiB)": 302.58, + "step": 105340, + "train_speed(iter/s)": 0.123981 + }, + { + "acc": 0.7164763, + "epoch": 0.5892230236547654, + "grad_norm": 7.90625, + "learning_rate": 8.451425142272655e-06, + "loss": 1.13487282, + "memory(GiB)": 302.58, + "step": 105360, + "train_speed(iter/s)": 0.123992 + }, + { + "acc": 0.73336868, + "epoch": 0.5893348731277447, + "grad_norm": 9.75, + "learning_rate": 8.450756024489932e-06, + "loss": 1.07092104, + "memory(GiB)": 302.58, + "step": 105380, + "train_speed(iter/s)": 0.124003 + }, + { + "acc": 0.73389716, + "epoch": 0.589446722600724, + "grad_norm": 5.9375, + "learning_rate": 8.450086788680411e-06, + "loss": 1.05499134, + "memory(GiB)": 302.58, + "step": 105400, + "train_speed(iter/s)": 0.124013 + }, + { + "acc": 0.7380178, + "epoch": 0.5895585720737032, + "grad_norm": 7.6875, + "learning_rate": 8.449417434866982e-06, + "loss": 1.04923382, + "memory(GiB)": 302.58, + "step": 105420, + "train_speed(iter/s)": 0.124025 + }, + { + "acc": 0.74081149, + "epoch": 0.5896704215466825, + "grad_norm": 7.0, + "learning_rate": 8.44874796307254e-06, + "loss": 1.02538586, + "memory(GiB)": 302.58, + "step": 105440, + "train_speed(iter/s)": 0.124036 + }, + { + "acc": 0.72489061, + "epoch": 0.5897822710196617, + "grad_norm": 10.25, + "learning_rate": 8.448078373319981e-06, + "loss": 1.09697895, + "memory(GiB)": 302.58, + "step": 105460, + "train_speed(iter/s)": 0.124048 + }, + { + "acc": 0.73583612, + "epoch": 0.589894120492641, + "grad_norm": 8.0625, + "learning_rate": 8.447408665632207e-06, + "loss": 1.04207544, + "memory(GiB)": 302.58, + "step": 105480, + "train_speed(iter/s)": 0.124059 + }, + { + "acc": 0.74027705, + "epoch": 0.5900059699656203, + "grad_norm": 10.25, + "learning_rate": 8.446738840032127e-06, + "loss": 1.02609529, + "memory(GiB)": 302.58, + "step": 105500, + "train_speed(iter/s)": 0.12407 + }, + { + "acc": 0.75252509, + "epoch": 0.5901178194385995, + "grad_norm": 8.0, + "learning_rate": 8.44606889654265e-06, + "loss": 0.94893847, + "memory(GiB)": 302.58, + "step": 105520, + "train_speed(iter/s)": 0.124081 + }, + { + "acc": 0.74328871, + "epoch": 0.5902296689115788, + "grad_norm": 5.1875, + "learning_rate": 8.445398835186687e-06, + "loss": 1.01128254, + "memory(GiB)": 302.58, + "step": 105540, + "train_speed(iter/s)": 0.124093 + }, + { + "acc": 0.75177736, + "epoch": 0.5903415183845581, + "grad_norm": 6.625, + "learning_rate": 8.444728655987159e-06, + "loss": 0.9790658, + "memory(GiB)": 302.58, + "step": 105560, + "train_speed(iter/s)": 0.124104 + }, + { + "acc": 0.73051658, + "epoch": 0.5904533678575373, + "grad_norm": 10.0625, + "learning_rate": 8.444058358966989e-06, + "loss": 1.05166378, + "memory(GiB)": 302.58, + "step": 105580, + "train_speed(iter/s)": 0.124115 + }, + { + "acc": 0.74786434, + "epoch": 0.5905652173305166, + "grad_norm": 4.90625, + "learning_rate": 8.443387944149102e-06, + "loss": 0.98567791, + "memory(GiB)": 302.58, + "step": 105600, + "train_speed(iter/s)": 0.124126 + }, + { + "acc": 0.72423725, + "epoch": 0.5906770668034959, + "grad_norm": 7.59375, + "learning_rate": 8.442717411556429e-06, + "loss": 1.08956032, + "memory(GiB)": 302.58, + "step": 105620, + "train_speed(iter/s)": 0.124136 + }, + { + "acc": 0.74018235, + "epoch": 0.5907889162764751, + "grad_norm": 6.59375, + "learning_rate": 8.442046761211903e-06, + "loss": 1.02189188, + "memory(GiB)": 302.58, + "step": 105640, + "train_speed(iter/s)": 0.124148 + }, + { + "acc": 0.72689886, + "epoch": 0.5909007657494544, + "grad_norm": 8.375, + "learning_rate": 8.441375993138465e-06, + "loss": 1.07537756, + "memory(GiB)": 302.58, + "step": 105660, + "train_speed(iter/s)": 0.124159 + }, + { + "acc": 0.74079776, + "epoch": 0.5910126152224336, + "grad_norm": 5.5, + "learning_rate": 8.440705107359055e-06, + "loss": 1.00104179, + "memory(GiB)": 302.58, + "step": 105680, + "train_speed(iter/s)": 0.12417 + }, + { + "acc": 0.71770039, + "epoch": 0.5911244646954129, + "grad_norm": 6.15625, + "learning_rate": 8.44003410389662e-06, + "loss": 1.10649986, + "memory(GiB)": 302.58, + "step": 105700, + "train_speed(iter/s)": 0.124181 + }, + { + "acc": 0.74558625, + "epoch": 0.5912363141683922, + "grad_norm": 6.0, + "learning_rate": 8.439362982774109e-06, + "loss": 0.99813271, + "memory(GiB)": 302.58, + "step": 105720, + "train_speed(iter/s)": 0.124191 + }, + { + "acc": 0.75307927, + "epoch": 0.5913481636413714, + "grad_norm": 8.875, + "learning_rate": 8.43869174401448e-06, + "loss": 1.0002903, + "memory(GiB)": 302.58, + "step": 105740, + "train_speed(iter/s)": 0.124201 + }, + { + "acc": 0.73261991, + "epoch": 0.5914600131143507, + "grad_norm": 7.71875, + "learning_rate": 8.438020387640689e-06, + "loss": 1.03762531, + "memory(GiB)": 302.58, + "step": 105760, + "train_speed(iter/s)": 0.124212 + }, + { + "acc": 0.73380446, + "epoch": 0.59157186258733, + "grad_norm": 9.1875, + "learning_rate": 8.4373489136757e-06, + "loss": 1.03633471, + "memory(GiB)": 302.58, + "step": 105780, + "train_speed(iter/s)": 0.124223 + }, + { + "acc": 0.74100966, + "epoch": 0.5916837120603092, + "grad_norm": 9.0, + "learning_rate": 8.436677322142477e-06, + "loss": 1.0248476, + "memory(GiB)": 302.58, + "step": 105800, + "train_speed(iter/s)": 0.124234 + }, + { + "acc": 0.73510962, + "epoch": 0.5917955615332885, + "grad_norm": 7.875, + "learning_rate": 8.436005613063993e-06, + "loss": 1.05464582, + "memory(GiB)": 302.58, + "step": 105820, + "train_speed(iter/s)": 0.124245 + }, + { + "acc": 0.74041166, + "epoch": 0.5919074110062678, + "grad_norm": 7.625, + "learning_rate": 8.435333786463223e-06, + "loss": 1.03785543, + "memory(GiB)": 302.58, + "step": 105840, + "train_speed(iter/s)": 0.124256 + }, + { + "acc": 0.72320266, + "epoch": 0.592019260479247, + "grad_norm": 8.5, + "learning_rate": 8.434661842363142e-06, + "loss": 1.10242481, + "memory(GiB)": 302.58, + "step": 105860, + "train_speed(iter/s)": 0.124267 + }, + { + "acc": 0.72754831, + "epoch": 0.5921311099522263, + "grad_norm": 9.5625, + "learning_rate": 8.433989780786737e-06, + "loss": 1.06833782, + "memory(GiB)": 302.58, + "step": 105880, + "train_speed(iter/s)": 0.124278 + }, + { + "acc": 0.73624039, + "epoch": 0.5922429594252056, + "grad_norm": 6.75, + "learning_rate": 8.43331760175699e-06, + "loss": 1.0594842, + "memory(GiB)": 302.58, + "step": 105900, + "train_speed(iter/s)": 0.124289 + }, + { + "acc": 0.72602735, + "epoch": 0.5923548088981848, + "grad_norm": 7.625, + "learning_rate": 8.432645305296898e-06, + "loss": 1.0867321, + "memory(GiB)": 302.58, + "step": 105920, + "train_speed(iter/s)": 0.1243 + }, + { + "acc": 0.75495534, + "epoch": 0.5924666583711641, + "grad_norm": 8.3125, + "learning_rate": 8.431972891429449e-06, + "loss": 0.94644032, + "memory(GiB)": 302.58, + "step": 105940, + "train_speed(iter/s)": 0.124311 + }, + { + "acc": 0.73989639, + "epoch": 0.5925785078441433, + "grad_norm": 8.375, + "learning_rate": 8.431300360177645e-06, + "loss": 1.00529709, + "memory(GiB)": 302.58, + "step": 105960, + "train_speed(iter/s)": 0.124322 + }, + { + "acc": 0.727246, + "epoch": 0.5926903573171226, + "grad_norm": 5.5625, + "learning_rate": 8.43062771156449e-06, + "loss": 1.04391508, + "memory(GiB)": 302.58, + "step": 105980, + "train_speed(iter/s)": 0.124334 + }, + { + "acc": 0.71953564, + "epoch": 0.5928022067901019, + "grad_norm": 6.21875, + "learning_rate": 8.429954945612988e-06, + "loss": 1.10803852, + "memory(GiB)": 302.58, + "step": 106000, + "train_speed(iter/s)": 0.124345 + }, + { + "epoch": 0.5928022067901019, + "eval_acc": 0.7028234931525285, + "eval_loss": 1.029431939125061, + "eval_runtime": 7495.7513, + "eval_samples_per_second": 10.043, + "eval_steps_per_second": 10.043, + "step": 106000 + }, + { + "acc": 0.74732642, + "epoch": 0.5929140562630811, + "grad_norm": 8.5, + "learning_rate": 8.42928206234615e-06, + "loss": 0.98596754, + "memory(GiB)": 302.58, + "step": 106020, + "train_speed(iter/s)": 0.123253 + }, + { + "acc": 0.75232425, + "epoch": 0.5930259057360604, + "grad_norm": 6.5, + "learning_rate": 8.428609061786992e-06, + "loss": 0.94909334, + "memory(GiB)": 302.58, + "step": 106040, + "train_speed(iter/s)": 0.123265 + }, + { + "acc": 0.7474483, + "epoch": 0.5931377552090397, + "grad_norm": 6.71875, + "learning_rate": 8.427935943958531e-06, + "loss": 0.99859896, + "memory(GiB)": 302.58, + "step": 106060, + "train_speed(iter/s)": 0.123276 + }, + { + "acc": 0.74274626, + "epoch": 0.5932496046820189, + "grad_norm": 5.96875, + "learning_rate": 8.427262708883794e-06, + "loss": 1.00747261, + "memory(GiB)": 302.58, + "step": 106080, + "train_speed(iter/s)": 0.123287 + }, + { + "acc": 0.74924088, + "epoch": 0.5933614541549982, + "grad_norm": 7.46875, + "learning_rate": 8.426589356585804e-06, + "loss": 0.9797245, + "memory(GiB)": 302.58, + "step": 106100, + "train_speed(iter/s)": 0.123297 + }, + { + "acc": 0.73705215, + "epoch": 0.5934733036279775, + "grad_norm": 6.78125, + "learning_rate": 8.425915887087591e-06, + "loss": 1.0286603, + "memory(GiB)": 302.58, + "step": 106120, + "train_speed(iter/s)": 0.123308 + }, + { + "acc": 0.73040042, + "epoch": 0.5935851531009567, + "grad_norm": 7.125, + "learning_rate": 8.425242300412193e-06, + "loss": 1.06595154, + "memory(GiB)": 302.58, + "step": 106140, + "train_speed(iter/s)": 0.12332 + }, + { + "acc": 0.73789253, + "epoch": 0.593697002573936, + "grad_norm": 7.96875, + "learning_rate": 8.424568596582645e-06, + "loss": 1.02066965, + "memory(GiB)": 302.58, + "step": 106160, + "train_speed(iter/s)": 0.123331 + }, + { + "acc": 0.73433785, + "epoch": 0.5938088520469152, + "grad_norm": 8.75, + "learning_rate": 8.423894775621994e-06, + "loss": 1.04465227, + "memory(GiB)": 302.58, + "step": 106180, + "train_speed(iter/s)": 0.123343 + }, + { + "acc": 0.74939351, + "epoch": 0.5939207015198945, + "grad_norm": 7.125, + "learning_rate": 8.423220837553284e-06, + "loss": 0.98732471, + "memory(GiB)": 302.58, + "step": 106200, + "train_speed(iter/s)": 0.123354 + }, + { + "acc": 0.72721601, + "epoch": 0.5940325509928738, + "grad_norm": 6.59375, + "learning_rate": 8.422546782399568e-06, + "loss": 1.06344852, + "memory(GiB)": 302.58, + "step": 106220, + "train_speed(iter/s)": 0.123365 + }, + { + "acc": 0.72808061, + "epoch": 0.594144400465853, + "grad_norm": 7.75, + "learning_rate": 8.421872610183897e-06, + "loss": 1.07845049, + "memory(GiB)": 302.58, + "step": 106240, + "train_speed(iter/s)": 0.123377 + }, + { + "acc": 0.73585143, + "epoch": 0.5942562499388323, + "grad_norm": 8.3125, + "learning_rate": 8.421198320929333e-06, + "loss": 1.0405323, + "memory(GiB)": 302.58, + "step": 106260, + "train_speed(iter/s)": 0.123387 + }, + { + "acc": 0.72184758, + "epoch": 0.5943680994118116, + "grad_norm": 4.21875, + "learning_rate": 8.42052391465894e-06, + "loss": 1.10272732, + "memory(GiB)": 302.58, + "step": 106280, + "train_speed(iter/s)": 0.123398 + }, + { + "acc": 0.7448935, + "epoch": 0.5944799488847908, + "grad_norm": 4.90625, + "learning_rate": 8.41984939139578e-06, + "loss": 0.97878323, + "memory(GiB)": 302.58, + "step": 106300, + "train_speed(iter/s)": 0.12341 + }, + { + "acc": 0.73576112, + "epoch": 0.5945917983577701, + "grad_norm": 7.0625, + "learning_rate": 8.419174751162929e-06, + "loss": 1.02626867, + "memory(GiB)": 302.58, + "step": 106320, + "train_speed(iter/s)": 0.123421 + }, + { + "acc": 0.731847, + "epoch": 0.5947036478307494, + "grad_norm": 6.46875, + "learning_rate": 8.418499993983458e-06, + "loss": 1.06916761, + "memory(GiB)": 302.58, + "step": 106340, + "train_speed(iter/s)": 0.123432 + }, + { + "acc": 0.73724933, + "epoch": 0.5948154973037286, + "grad_norm": 5.34375, + "learning_rate": 8.417825119880449e-06, + "loss": 1.02646551, + "memory(GiB)": 302.58, + "step": 106360, + "train_speed(iter/s)": 0.123443 + }, + { + "acc": 0.74596992, + "epoch": 0.5949273467767079, + "grad_norm": 5.875, + "learning_rate": 8.417150128876982e-06, + "loss": 1.00057306, + "memory(GiB)": 302.58, + "step": 106380, + "train_speed(iter/s)": 0.123455 + }, + { + "acc": 0.75041227, + "epoch": 0.5950391962496872, + "grad_norm": 5.5, + "learning_rate": 8.416475020996146e-06, + "loss": 0.98088217, + "memory(GiB)": 302.58, + "step": 106400, + "train_speed(iter/s)": 0.123466 + }, + { + "acc": 0.73885417, + "epoch": 0.5951510457226664, + "grad_norm": 6.71875, + "learning_rate": 8.41579979626103e-06, + "loss": 1.02225027, + "memory(GiB)": 302.58, + "step": 106420, + "train_speed(iter/s)": 0.123476 + }, + { + "acc": 0.73307886, + "epoch": 0.5952628951956457, + "grad_norm": 7.875, + "learning_rate": 8.415124454694732e-06, + "loss": 1.0544219, + "memory(GiB)": 302.58, + "step": 106440, + "train_speed(iter/s)": 0.123487 + }, + { + "acc": 0.73581605, + "epoch": 0.5953747446686249, + "grad_norm": 9.0625, + "learning_rate": 8.414448996320346e-06, + "loss": 1.03306742, + "memory(GiB)": 302.58, + "step": 106460, + "train_speed(iter/s)": 0.123497 + }, + { + "acc": 0.7272963, + "epoch": 0.5954865941416042, + "grad_norm": 4.90625, + "learning_rate": 8.413773421160978e-06, + "loss": 1.08316317, + "memory(GiB)": 302.58, + "step": 106480, + "train_speed(iter/s)": 0.123508 + }, + { + "acc": 0.75261998, + "epoch": 0.5955984436145835, + "grad_norm": 5.1875, + "learning_rate": 8.413097729239735e-06, + "loss": 0.97133846, + "memory(GiB)": 302.58, + "step": 106500, + "train_speed(iter/s)": 0.12352 + }, + { + "acc": 0.72214675, + "epoch": 0.5957102930875627, + "grad_norm": 5.6875, + "learning_rate": 8.412421920579727e-06, + "loss": 1.12762499, + "memory(GiB)": 302.58, + "step": 106520, + "train_speed(iter/s)": 0.12353 + }, + { + "acc": 0.73884373, + "epoch": 0.595822142560542, + "grad_norm": 7.71875, + "learning_rate": 8.411745995204068e-06, + "loss": 1.02612171, + "memory(GiB)": 302.58, + "step": 106540, + "train_speed(iter/s)": 0.123541 + }, + { + "acc": 0.7334033, + "epoch": 0.5959339920335213, + "grad_norm": 6.09375, + "learning_rate": 8.411069953135879e-06, + "loss": 1.04495344, + "memory(GiB)": 302.58, + "step": 106560, + "train_speed(iter/s)": 0.123551 + }, + { + "acc": 0.74933043, + "epoch": 0.5960458415065005, + "grad_norm": 10.0625, + "learning_rate": 8.41039379439828e-06, + "loss": 0.95631876, + "memory(GiB)": 302.58, + "step": 106580, + "train_speed(iter/s)": 0.123563 + }, + { + "acc": 0.74932518, + "epoch": 0.5961576909794798, + "grad_norm": 6.8125, + "learning_rate": 8.4097175190144e-06, + "loss": 0.95838661, + "memory(GiB)": 302.58, + "step": 106600, + "train_speed(iter/s)": 0.123573 + }, + { + "acc": 0.73490496, + "epoch": 0.5962695404524591, + "grad_norm": 6.40625, + "learning_rate": 8.409041127007368e-06, + "loss": 1.05350933, + "memory(GiB)": 302.58, + "step": 106620, + "train_speed(iter/s)": 0.123584 + }, + { + "acc": 0.72932935, + "epoch": 0.5963813899254383, + "grad_norm": 5.46875, + "learning_rate": 8.40836461840032e-06, + "loss": 1.0632966, + "memory(GiB)": 302.58, + "step": 106640, + "train_speed(iter/s)": 0.123594 + }, + { + "acc": 0.74456153, + "epoch": 0.5964932393984176, + "grad_norm": 9.375, + "learning_rate": 8.407687993216396e-06, + "loss": 1.00986862, + "memory(GiB)": 302.58, + "step": 106660, + "train_speed(iter/s)": 0.123605 + }, + { + "acc": 0.73536983, + "epoch": 0.5966050888713968, + "grad_norm": 10.0625, + "learning_rate": 8.407011251478735e-06, + "loss": 1.05726204, + "memory(GiB)": 302.58, + "step": 106680, + "train_speed(iter/s)": 0.123617 + }, + { + "acc": 0.73450136, + "epoch": 0.5967169383443761, + "grad_norm": 7.40625, + "learning_rate": 8.406334393210486e-06, + "loss": 1.06156998, + "memory(GiB)": 302.58, + "step": 106700, + "train_speed(iter/s)": 0.123628 + }, + { + "acc": 0.75025725, + "epoch": 0.5968287878173554, + "grad_norm": 5.9375, + "learning_rate": 8.4056574184348e-06, + "loss": 0.96437874, + "memory(GiB)": 302.58, + "step": 106720, + "train_speed(iter/s)": 0.123639 + }, + { + "acc": 0.7130589, + "epoch": 0.5969406372903346, + "grad_norm": 6.125, + "learning_rate": 8.404980327174831e-06, + "loss": 1.16130714, + "memory(GiB)": 302.58, + "step": 106740, + "train_speed(iter/s)": 0.12365 + }, + { + "acc": 0.73387723, + "epoch": 0.5970524867633139, + "grad_norm": 7.25, + "learning_rate": 8.40430311945374e-06, + "loss": 1.04246931, + "memory(GiB)": 302.58, + "step": 106760, + "train_speed(iter/s)": 0.123661 + }, + { + "acc": 0.72318025, + "epoch": 0.5971643362362932, + "grad_norm": 7.125, + "learning_rate": 8.403625795294686e-06, + "loss": 1.0974473, + "memory(GiB)": 302.58, + "step": 106780, + "train_speed(iter/s)": 0.123672 + }, + { + "acc": 0.73275805, + "epoch": 0.5972761857092724, + "grad_norm": 6.96875, + "learning_rate": 8.402948354720837e-06, + "loss": 1.04085951, + "memory(GiB)": 302.58, + "step": 106800, + "train_speed(iter/s)": 0.123683 + }, + { + "acc": 0.74977994, + "epoch": 0.5973880351822517, + "grad_norm": 7.8125, + "learning_rate": 8.402270797755363e-06, + "loss": 0.97379475, + "memory(GiB)": 302.58, + "step": 106820, + "train_speed(iter/s)": 0.123694 + }, + { + "acc": 0.74323106, + "epoch": 0.597499884655231, + "grad_norm": 7.8125, + "learning_rate": 8.40159312442144e-06, + "loss": 1.0219327, + "memory(GiB)": 302.58, + "step": 106840, + "train_speed(iter/s)": 0.123706 + }, + { + "acc": 0.74166188, + "epoch": 0.5976117341282102, + "grad_norm": 7.4375, + "learning_rate": 8.400915334742248e-06, + "loss": 1.02591314, + "memory(GiB)": 302.58, + "step": 106860, + "train_speed(iter/s)": 0.123717 + }, + { + "acc": 0.73099136, + "epoch": 0.5977235836011895, + "grad_norm": 9.125, + "learning_rate": 8.400237428740965e-06, + "loss": 1.05899534, + "memory(GiB)": 302.58, + "step": 106880, + "train_speed(iter/s)": 0.123728 + }, + { + "acc": 0.74801102, + "epoch": 0.5978354330741688, + "grad_norm": 6.84375, + "learning_rate": 8.39955940644078e-06, + "loss": 0.98535728, + "memory(GiB)": 302.58, + "step": 106900, + "train_speed(iter/s)": 0.123738 + }, + { + "acc": 0.73757992, + "epoch": 0.597947282547148, + "grad_norm": 6.90625, + "learning_rate": 8.398881267864886e-06, + "loss": 1.04782877, + "memory(GiB)": 302.58, + "step": 106920, + "train_speed(iter/s)": 0.123749 + }, + { + "acc": 0.74707108, + "epoch": 0.5980591320201273, + "grad_norm": 7.875, + "learning_rate": 8.398203013036473e-06, + "loss": 0.99580259, + "memory(GiB)": 302.58, + "step": 106940, + "train_speed(iter/s)": 0.12376 + }, + { + "acc": 0.72640333, + "epoch": 0.5981709814931065, + "grad_norm": 8.9375, + "learning_rate": 8.397524641978742e-06, + "loss": 1.09944992, + "memory(GiB)": 302.58, + "step": 106960, + "train_speed(iter/s)": 0.123771 + }, + { + "acc": 0.73760548, + "epoch": 0.5982828309660858, + "grad_norm": 6.125, + "learning_rate": 8.396846154714894e-06, + "loss": 1.02442513, + "memory(GiB)": 302.58, + "step": 106980, + "train_speed(iter/s)": 0.123781 + }, + { + "acc": 0.74062042, + "epoch": 0.5983946804390651, + "grad_norm": 5.59375, + "learning_rate": 8.396167551268138e-06, + "loss": 1.01965342, + "memory(GiB)": 302.58, + "step": 107000, + "train_speed(iter/s)": 0.123792 + }, + { + "acc": 0.75071445, + "epoch": 0.5985065299120443, + "grad_norm": 4.8125, + "learning_rate": 8.395488831661683e-06, + "loss": 0.9616293, + "memory(GiB)": 302.58, + "step": 107020, + "train_speed(iter/s)": 0.123804 + }, + { + "acc": 0.73912663, + "epoch": 0.5986183793850236, + "grad_norm": 8.125, + "learning_rate": 8.394809995918743e-06, + "loss": 1.02462797, + "memory(GiB)": 302.58, + "step": 107040, + "train_speed(iter/s)": 0.123815 + }, + { + "acc": 0.7231555, + "epoch": 0.5987302288580029, + "grad_norm": 6.375, + "learning_rate": 8.394131044062535e-06, + "loss": 1.10133152, + "memory(GiB)": 302.58, + "step": 107060, + "train_speed(iter/s)": 0.123825 + }, + { + "acc": 0.74120221, + "epoch": 0.5988420783309821, + "grad_norm": 6.53125, + "learning_rate": 8.393451976116285e-06, + "loss": 1.02815323, + "memory(GiB)": 302.58, + "step": 107080, + "train_speed(iter/s)": 0.123835 + }, + { + "acc": 0.73683953, + "epoch": 0.5989539278039614, + "grad_norm": 6.53125, + "learning_rate": 8.392772792103215e-06, + "loss": 1.03169231, + "memory(GiB)": 302.58, + "step": 107100, + "train_speed(iter/s)": 0.123846 + }, + { + "acc": 0.73854785, + "epoch": 0.5990657772769407, + "grad_norm": 9.9375, + "learning_rate": 8.392093492046558e-06, + "loss": 1.03402586, + "memory(GiB)": 302.58, + "step": 107120, + "train_speed(iter/s)": 0.123857 + }, + { + "acc": 0.73773437, + "epoch": 0.5991776267499199, + "grad_norm": 6.46875, + "learning_rate": 8.391414075969549e-06, + "loss": 1.05543165, + "memory(GiB)": 302.58, + "step": 107140, + "train_speed(iter/s)": 0.123868 + }, + { + "acc": 0.74114828, + "epoch": 0.5992894762228992, + "grad_norm": 8.9375, + "learning_rate": 8.390734543895423e-06, + "loss": 1.01362858, + "memory(GiB)": 302.58, + "step": 107160, + "train_speed(iter/s)": 0.123878 + }, + { + "acc": 0.74006271, + "epoch": 0.5994013256958785, + "grad_norm": 7.0, + "learning_rate": 8.390054895847424e-06, + "loss": 1.01444597, + "memory(GiB)": 302.58, + "step": 107180, + "train_speed(iter/s)": 0.123889 + }, + { + "acc": 0.74097567, + "epoch": 0.5995131751688577, + "grad_norm": 8.4375, + "learning_rate": 8.389375131848798e-06, + "loss": 1.031598, + "memory(GiB)": 302.58, + "step": 107200, + "train_speed(iter/s)": 0.1239 + }, + { + "acc": 0.74802346, + "epoch": 0.599625024641837, + "grad_norm": 9.75, + "learning_rate": 8.388695251922795e-06, + "loss": 0.98719196, + "memory(GiB)": 302.58, + "step": 107220, + "train_speed(iter/s)": 0.123911 + }, + { + "acc": 0.72186699, + "epoch": 0.5997368741148162, + "grad_norm": 7.4375, + "learning_rate": 8.388015256092669e-06, + "loss": 1.11527624, + "memory(GiB)": 302.58, + "step": 107240, + "train_speed(iter/s)": 0.123922 + }, + { + "acc": 0.7359241, + "epoch": 0.5998487235877955, + "grad_norm": 8.1875, + "learning_rate": 8.387335144381678e-06, + "loss": 1.02607784, + "memory(GiB)": 302.58, + "step": 107260, + "train_speed(iter/s)": 0.123933 + }, + { + "acc": 0.72828174, + "epoch": 0.5999605730607748, + "grad_norm": 6.5625, + "learning_rate": 8.386654916813086e-06, + "loss": 1.07499189, + "memory(GiB)": 302.58, + "step": 107280, + "train_speed(iter/s)": 0.123943 + }, + { + "acc": 0.74159989, + "epoch": 0.600072422533754, + "grad_norm": 7.875, + "learning_rate": 8.385974573410156e-06, + "loss": 1.02263412, + "memory(GiB)": 302.58, + "step": 107300, + "train_speed(iter/s)": 0.123954 + }, + { + "acc": 0.72772617, + "epoch": 0.6001842720067333, + "grad_norm": 8.6875, + "learning_rate": 8.385294114196158e-06, + "loss": 1.07992697, + "memory(GiB)": 302.58, + "step": 107320, + "train_speed(iter/s)": 0.123966 + }, + { + "acc": 0.73360286, + "epoch": 0.6002961214797126, + "grad_norm": 6.25, + "learning_rate": 8.384613539194369e-06, + "loss": 1.04443789, + "memory(GiB)": 302.58, + "step": 107340, + "train_speed(iter/s)": 0.123976 + }, + { + "acc": 0.74504466, + "epoch": 0.6004079709526918, + "grad_norm": 4.6875, + "learning_rate": 8.383932848428062e-06, + "loss": 0.99165039, + "memory(GiB)": 302.58, + "step": 107360, + "train_speed(iter/s)": 0.123988 + }, + { + "acc": 0.72616835, + "epoch": 0.6005198204256711, + "grad_norm": 8.5625, + "learning_rate": 8.383252041920523e-06, + "loss": 1.07665758, + "memory(GiB)": 302.58, + "step": 107380, + "train_speed(iter/s)": 0.123999 + }, + { + "acc": 0.74823027, + "epoch": 0.6006316698986504, + "grad_norm": 8.6875, + "learning_rate": 8.382571119695036e-06, + "loss": 0.98251724, + "memory(GiB)": 302.58, + "step": 107400, + "train_speed(iter/s)": 0.12401 + }, + { + "acc": 0.74347038, + "epoch": 0.6007435193716296, + "grad_norm": 7.875, + "learning_rate": 8.381890081774891e-06, + "loss": 1.02245741, + "memory(GiB)": 302.58, + "step": 107420, + "train_speed(iter/s)": 0.124021 + }, + { + "acc": 0.74057274, + "epoch": 0.6008553688446089, + "grad_norm": 7.5, + "learning_rate": 8.381208928183381e-06, + "loss": 1.00347672, + "memory(GiB)": 302.58, + "step": 107440, + "train_speed(iter/s)": 0.124031 + }, + { + "acc": 0.74752994, + "epoch": 0.6009672183175881, + "grad_norm": 6.59375, + "learning_rate": 8.380527658943804e-06, + "loss": 0.98869457, + "memory(GiB)": 302.58, + "step": 107460, + "train_speed(iter/s)": 0.124042 + }, + { + "acc": 0.74203315, + "epoch": 0.6010790677905674, + "grad_norm": 8.625, + "learning_rate": 8.379846274079462e-06, + "loss": 1.00617561, + "memory(GiB)": 302.58, + "step": 107480, + "train_speed(iter/s)": 0.124053 + }, + { + "acc": 0.74556551, + "epoch": 0.6011909172635467, + "grad_norm": 6.84375, + "learning_rate": 8.379164773613659e-06, + "loss": 0.97040634, + "memory(GiB)": 302.58, + "step": 107500, + "train_speed(iter/s)": 0.124064 + }, + { + "acc": 0.74425359, + "epoch": 0.6013027667365259, + "grad_norm": 5.40625, + "learning_rate": 8.378483157569709e-06, + "loss": 0.99219599, + "memory(GiB)": 302.58, + "step": 107520, + "train_speed(iter/s)": 0.124075 + }, + { + "acc": 0.74781704, + "epoch": 0.6014146162095052, + "grad_norm": 8.9375, + "learning_rate": 8.37780142597092e-06, + "loss": 0.99789047, + "memory(GiB)": 302.58, + "step": 107540, + "train_speed(iter/s)": 0.124086 + }, + { + "acc": 0.74707255, + "epoch": 0.6015264656824845, + "grad_norm": 6.96875, + "learning_rate": 8.377119578840612e-06, + "loss": 0.98895445, + "memory(GiB)": 302.58, + "step": 107560, + "train_speed(iter/s)": 0.124097 + }, + { + "acc": 0.75009551, + "epoch": 0.6016383151554637, + "grad_norm": 7.25, + "learning_rate": 8.376437616202105e-06, + "loss": 0.99173203, + "memory(GiB)": 302.58, + "step": 107580, + "train_speed(iter/s)": 0.124108 + }, + { + "acc": 0.73035445, + "epoch": 0.601750164628443, + "grad_norm": 7.6875, + "learning_rate": 8.375755538078726e-06, + "loss": 1.07852688, + "memory(GiB)": 302.58, + "step": 107600, + "train_speed(iter/s)": 0.124118 + }, + { + "acc": 0.74192801, + "epoch": 0.6018620141014223, + "grad_norm": 6.21875, + "learning_rate": 8.3750733444938e-06, + "loss": 1.00110931, + "memory(GiB)": 302.58, + "step": 107620, + "train_speed(iter/s)": 0.124128 + }, + { + "acc": 0.74282465, + "epoch": 0.6019738635744015, + "grad_norm": 6.09375, + "learning_rate": 8.374391035470668e-06, + "loss": 1.01030293, + "memory(GiB)": 302.58, + "step": 107640, + "train_speed(iter/s)": 0.124139 + }, + { + "acc": 0.74395099, + "epoch": 0.6020857130473808, + "grad_norm": 5.5, + "learning_rate": 8.37370861103266e-06, + "loss": 0.98776455, + "memory(GiB)": 302.58, + "step": 107660, + "train_speed(iter/s)": 0.12415 + }, + { + "acc": 0.72818823, + "epoch": 0.60219756252036, + "grad_norm": 8.375, + "learning_rate": 8.373026071203121e-06, + "loss": 1.08531799, + "memory(GiB)": 302.58, + "step": 107680, + "train_speed(iter/s)": 0.124161 + }, + { + "acc": 0.746139, + "epoch": 0.6023094119933393, + "grad_norm": 8.125, + "learning_rate": 8.372343416005396e-06, + "loss": 0.99448833, + "memory(GiB)": 302.58, + "step": 107700, + "train_speed(iter/s)": 0.124172 + }, + { + "acc": 0.73206062, + "epoch": 0.6024212614663186, + "grad_norm": 8.375, + "learning_rate": 8.371660645462832e-06, + "loss": 1.07555637, + "memory(GiB)": 302.58, + "step": 107720, + "train_speed(iter/s)": 0.124183 + }, + { + "acc": 0.74962296, + "epoch": 0.6025331109392978, + "grad_norm": 7.53125, + "learning_rate": 8.370977759598781e-06, + "loss": 0.99654799, + "memory(GiB)": 302.58, + "step": 107740, + "train_speed(iter/s)": 0.124193 + }, + { + "acc": 0.7252389, + "epoch": 0.6026449604122771, + "grad_norm": 7.78125, + "learning_rate": 8.370294758436603e-06, + "loss": 1.08148861, + "memory(GiB)": 302.58, + "step": 107760, + "train_speed(iter/s)": 0.124203 + }, + { + "acc": 0.72560625, + "epoch": 0.6027568098852564, + "grad_norm": 6.875, + "learning_rate": 8.369611641999656e-06, + "loss": 1.08706341, + "memory(GiB)": 302.58, + "step": 107780, + "train_speed(iter/s)": 0.124213 + }, + { + "acc": 0.73397527, + "epoch": 0.6028686593582356, + "grad_norm": 6.84375, + "learning_rate": 8.368928410311308e-06, + "loss": 1.05000935, + "memory(GiB)": 302.58, + "step": 107800, + "train_speed(iter/s)": 0.124224 + }, + { + "acc": 0.74407811, + "epoch": 0.6029805088312149, + "grad_norm": 6.875, + "learning_rate": 8.368245063394924e-06, + "loss": 1.00676041, + "memory(GiB)": 302.58, + "step": 107820, + "train_speed(iter/s)": 0.124236 + }, + { + "acc": 0.73888836, + "epoch": 0.6030923583041942, + "grad_norm": 5.9375, + "learning_rate": 8.367561601273879e-06, + "loss": 1.03227348, + "memory(GiB)": 302.58, + "step": 107840, + "train_speed(iter/s)": 0.124246 + }, + { + "acc": 0.73659086, + "epoch": 0.6032042077771734, + "grad_norm": 10.375, + "learning_rate": 8.36687802397155e-06, + "loss": 1.02715073, + "memory(GiB)": 302.58, + "step": 107860, + "train_speed(iter/s)": 0.124257 + }, + { + "acc": 0.73593063, + "epoch": 0.6033160572501527, + "grad_norm": 6.40625, + "learning_rate": 8.366194331511316e-06, + "loss": 1.04693708, + "memory(GiB)": 302.58, + "step": 107880, + "train_speed(iter/s)": 0.124267 + }, + { + "acc": 0.7294414, + "epoch": 0.603427906723132, + "grad_norm": 7.875, + "learning_rate": 8.365510523916562e-06, + "loss": 1.07980433, + "memory(GiB)": 302.58, + "step": 107900, + "train_speed(iter/s)": 0.124278 + }, + { + "acc": 0.73254352, + "epoch": 0.6035397561961112, + "grad_norm": 5.21875, + "learning_rate": 8.364826601210677e-06, + "loss": 1.05397387, + "memory(GiB)": 302.58, + "step": 107920, + "train_speed(iter/s)": 0.124288 + }, + { + "acc": 0.75700316, + "epoch": 0.6036516056690905, + "grad_norm": 9.0, + "learning_rate": 8.36414256341705e-06, + "loss": 0.94705467, + "memory(GiB)": 302.58, + "step": 107940, + "train_speed(iter/s)": 0.124299 + }, + { + "acc": 0.75016341, + "epoch": 0.6037634551420697, + "grad_norm": 7.375, + "learning_rate": 8.363458410559083e-06, + "loss": 1.00480986, + "memory(GiB)": 302.58, + "step": 107960, + "train_speed(iter/s)": 0.12431 + }, + { + "acc": 0.73216009, + "epoch": 0.603875304615049, + "grad_norm": 8.8125, + "learning_rate": 8.36277414266017e-06, + "loss": 1.07653656, + "memory(GiB)": 302.58, + "step": 107980, + "train_speed(iter/s)": 0.124321 + }, + { + "acc": 0.74115334, + "epoch": 0.6039871540880283, + "grad_norm": 5.96875, + "learning_rate": 8.362089759743721e-06, + "loss": 1.01330051, + "memory(GiB)": 302.58, + "step": 108000, + "train_speed(iter/s)": 0.124331 + }, + { + "epoch": 0.6039871540880283, + "eval_acc": 0.7030939846188111, + "eval_loss": 1.0286993980407715, + "eval_runtime": 7502.9562, + "eval_samples_per_second": 10.034, + "eval_steps_per_second": 10.034, + "step": 108000 + }, + { + "acc": 0.73650961, + "epoch": 0.6040990035610075, + "grad_norm": 5.78125, + "learning_rate": 8.36140526183314e-06, + "loss": 1.04517632, + "memory(GiB)": 302.58, + "step": 108020, + "train_speed(iter/s)": 0.123258 + }, + { + "acc": 0.72599273, + "epoch": 0.6042108530339868, + "grad_norm": 7.5625, + "learning_rate": 8.36072064895184e-06, + "loss": 1.07070417, + "memory(GiB)": 302.58, + "step": 108040, + "train_speed(iter/s)": 0.12327 + }, + { + "acc": 0.73307467, + "epoch": 0.6043227025069661, + "grad_norm": 6.78125, + "learning_rate": 8.36003592112324e-06, + "loss": 1.05306816, + "memory(GiB)": 302.58, + "step": 108060, + "train_speed(iter/s)": 0.123281 + }, + { + "acc": 0.74894609, + "epoch": 0.6044345519799454, + "grad_norm": 6.6875, + "learning_rate": 8.359351078370754e-06, + "loss": 0.98432617, + "memory(GiB)": 302.58, + "step": 108080, + "train_speed(iter/s)": 0.123292 + }, + { + "acc": 0.74054723, + "epoch": 0.6045464014529247, + "grad_norm": 7.46875, + "learning_rate": 8.35866612071781e-06, + "loss": 1.00972185, + "memory(GiB)": 302.58, + "step": 108100, + "train_speed(iter/s)": 0.123302 + }, + { + "acc": 0.75563526, + "epoch": 0.604658250925904, + "grad_norm": 8.1875, + "learning_rate": 8.357981048187835e-06, + "loss": 0.97918978, + "memory(GiB)": 302.58, + "step": 108120, + "train_speed(iter/s)": 0.123313 + }, + { + "acc": 0.73764081, + "epoch": 0.6047701003988832, + "grad_norm": 10.0625, + "learning_rate": 8.35729586080426e-06, + "loss": 1.03180962, + "memory(GiB)": 302.58, + "step": 108140, + "train_speed(iter/s)": 0.123324 + }, + { + "acc": 0.73716402, + "epoch": 0.6048819498718625, + "grad_norm": 8.4375, + "learning_rate": 8.356610558590522e-06, + "loss": 1.04176302, + "memory(GiB)": 302.58, + "step": 108160, + "train_speed(iter/s)": 0.123335 + }, + { + "acc": 0.73192806, + "epoch": 0.6049937993448418, + "grad_norm": 6.34375, + "learning_rate": 8.355925141570055e-06, + "loss": 1.04970026, + "memory(GiB)": 302.58, + "step": 108180, + "train_speed(iter/s)": 0.123346 + }, + { + "acc": 0.74017959, + "epoch": 0.605105648817821, + "grad_norm": 6.75, + "learning_rate": 8.35523960976631e-06, + "loss": 1.02608957, + "memory(GiB)": 302.58, + "step": 108200, + "train_speed(iter/s)": 0.123356 + }, + { + "acc": 0.72765927, + "epoch": 0.6052174982908003, + "grad_norm": 7.3125, + "learning_rate": 8.354553963202731e-06, + "loss": 1.08203812, + "memory(GiB)": 302.58, + "step": 108220, + "train_speed(iter/s)": 0.123367 + }, + { + "acc": 0.73806391, + "epoch": 0.6053293477637796, + "grad_norm": 7.25, + "learning_rate": 8.35386820190277e-06, + "loss": 1.05705585, + "memory(GiB)": 302.58, + "step": 108240, + "train_speed(iter/s)": 0.123377 + }, + { + "acc": 0.72369699, + "epoch": 0.6054411972367588, + "grad_norm": 5.65625, + "learning_rate": 8.353182325889881e-06, + "loss": 1.08345995, + "memory(GiB)": 302.58, + "step": 108260, + "train_speed(iter/s)": 0.123388 + }, + { + "acc": 0.72700734, + "epoch": 0.6055530467097381, + "grad_norm": 7.03125, + "learning_rate": 8.352496335187525e-06, + "loss": 1.06977835, + "memory(GiB)": 302.58, + "step": 108280, + "train_speed(iter/s)": 0.123398 + }, + { + "acc": 0.75250168, + "epoch": 0.6056648961827173, + "grad_norm": 6.1875, + "learning_rate": 8.351810229819162e-06, + "loss": 0.97708187, + "memory(GiB)": 302.58, + "step": 108300, + "train_speed(iter/s)": 0.123408 + }, + { + "acc": 0.73681555, + "epoch": 0.6057767456556966, + "grad_norm": 6.28125, + "learning_rate": 8.351124009808263e-06, + "loss": 1.02848263, + "memory(GiB)": 302.58, + "step": 108320, + "train_speed(iter/s)": 0.12342 + }, + { + "acc": 0.74847732, + "epoch": 0.6058885951286759, + "grad_norm": 5.3125, + "learning_rate": 8.350437675178294e-06, + "loss": 0.97623377, + "memory(GiB)": 302.58, + "step": 108340, + "train_speed(iter/s)": 0.123429 + }, + { + "acc": 0.74164686, + "epoch": 0.6060004446016551, + "grad_norm": 4.9375, + "learning_rate": 8.349751225952734e-06, + "loss": 1.0141964, + "memory(GiB)": 302.58, + "step": 108360, + "train_speed(iter/s)": 0.12344 + }, + { + "acc": 0.74076815, + "epoch": 0.6061122940746344, + "grad_norm": 5.375, + "learning_rate": 8.349064662155061e-06, + "loss": 1.04067087, + "memory(GiB)": 302.58, + "step": 108380, + "train_speed(iter/s)": 0.123451 + }, + { + "acc": 0.73260417, + "epoch": 0.6062241435476137, + "grad_norm": 7.78125, + "learning_rate": 8.348377983808758e-06, + "loss": 1.06594801, + "memory(GiB)": 302.58, + "step": 108400, + "train_speed(iter/s)": 0.12346 + }, + { + "acc": 0.71999583, + "epoch": 0.6063359930205929, + "grad_norm": 4.71875, + "learning_rate": 8.347691190937306e-06, + "loss": 1.11573563, + "memory(GiB)": 302.58, + "step": 108420, + "train_speed(iter/s)": 0.123471 + }, + { + "acc": 0.73504629, + "epoch": 0.6064478424935722, + "grad_norm": 9.5625, + "learning_rate": 8.347004283564204e-06, + "loss": 1.04370575, + "memory(GiB)": 302.58, + "step": 108440, + "train_speed(iter/s)": 0.123481 + }, + { + "acc": 0.73596978, + "epoch": 0.6065596919665515, + "grad_norm": 4.9375, + "learning_rate": 8.346317261712942e-06, + "loss": 1.04285841, + "memory(GiB)": 302.58, + "step": 108460, + "train_speed(iter/s)": 0.123491 + }, + { + "acc": 0.74438953, + "epoch": 0.6066715414395307, + "grad_norm": 6.6875, + "learning_rate": 8.345630125407016e-06, + "loss": 0.97239294, + "memory(GiB)": 302.58, + "step": 108480, + "train_speed(iter/s)": 0.123502 + }, + { + "acc": 0.7320622, + "epoch": 0.60678339091251, + "grad_norm": 5.0, + "learning_rate": 8.344942874669935e-06, + "loss": 1.06959753, + "memory(GiB)": 302.58, + "step": 108500, + "train_speed(iter/s)": 0.123513 + }, + { + "acc": 0.73404255, + "epoch": 0.6068952403854893, + "grad_norm": 8.125, + "learning_rate": 8.344255509525198e-06, + "loss": 1.03957043, + "memory(GiB)": 302.58, + "step": 108520, + "train_speed(iter/s)": 0.123523 + }, + { + "acc": 0.74824667, + "epoch": 0.6070070898584685, + "grad_norm": 8.75, + "learning_rate": 8.34356802999632e-06, + "loss": 0.98315182, + "memory(GiB)": 302.58, + "step": 108540, + "train_speed(iter/s)": 0.123534 + }, + { + "acc": 0.73727398, + "epoch": 0.6071189393314478, + "grad_norm": 7.84375, + "learning_rate": 8.342880436106813e-06, + "loss": 1.02137623, + "memory(GiB)": 302.58, + "step": 108560, + "train_speed(iter/s)": 0.123544 + }, + { + "acc": 0.74474258, + "epoch": 0.607230788804427, + "grad_norm": 8.3125, + "learning_rate": 8.342192727880193e-06, + "loss": 1.00796604, + "memory(GiB)": 302.58, + "step": 108580, + "train_speed(iter/s)": 0.123555 + }, + { + "acc": 0.75605049, + "epoch": 0.6073426382774063, + "grad_norm": 7.71875, + "learning_rate": 8.341504905339987e-06, + "loss": 0.95887976, + "memory(GiB)": 302.58, + "step": 108600, + "train_speed(iter/s)": 0.123565 + }, + { + "acc": 0.73160539, + "epoch": 0.6074544877503856, + "grad_norm": 5.5, + "learning_rate": 8.340816968509717e-06, + "loss": 1.0347928, + "memory(GiB)": 302.58, + "step": 108620, + "train_speed(iter/s)": 0.123576 + }, + { + "acc": 0.72954502, + "epoch": 0.6075663372233648, + "grad_norm": 7.75, + "learning_rate": 8.340128917412911e-06, + "loss": 1.05097103, + "memory(GiB)": 302.58, + "step": 108640, + "train_speed(iter/s)": 0.123588 + }, + { + "acc": 0.74667091, + "epoch": 0.6076781866963441, + "grad_norm": 9.25, + "learning_rate": 8.339440752073106e-06, + "loss": 0.97203875, + "memory(GiB)": 302.58, + "step": 108660, + "train_speed(iter/s)": 0.123598 + }, + { + "acc": 0.72575235, + "epoch": 0.6077900361693234, + "grad_norm": 7.3125, + "learning_rate": 8.33875247251384e-06, + "loss": 1.06719379, + "memory(GiB)": 302.58, + "step": 108680, + "train_speed(iter/s)": 0.123609 + }, + { + "acc": 0.74666376, + "epoch": 0.6079018856423026, + "grad_norm": 9.375, + "learning_rate": 8.33806407875865e-06, + "loss": 1.00088997, + "memory(GiB)": 302.58, + "step": 108700, + "train_speed(iter/s)": 0.123619 + }, + { + "acc": 0.74198685, + "epoch": 0.6080137351152819, + "grad_norm": 7.9375, + "learning_rate": 8.337375570831086e-06, + "loss": 1.00569916, + "memory(GiB)": 302.58, + "step": 108720, + "train_speed(iter/s)": 0.12363 + }, + { + "acc": 0.72454934, + "epoch": 0.6081255845882612, + "grad_norm": 9.125, + "learning_rate": 8.336686948754693e-06, + "loss": 1.10303125, + "memory(GiB)": 302.58, + "step": 108740, + "train_speed(iter/s)": 0.123641 + }, + { + "acc": 0.73813767, + "epoch": 0.6082374340612404, + "grad_norm": 6.96875, + "learning_rate": 8.335998212553027e-06, + "loss": 1.02152081, + "memory(GiB)": 302.58, + "step": 108760, + "train_speed(iter/s)": 0.123652 + }, + { + "acc": 0.74080725, + "epoch": 0.6083492835342197, + "grad_norm": 4.96875, + "learning_rate": 8.335309362249645e-06, + "loss": 1.02781363, + "memory(GiB)": 302.58, + "step": 108780, + "train_speed(iter/s)": 0.123663 + }, + { + "acc": 0.72894239, + "epoch": 0.608461133007199, + "grad_norm": 5.125, + "learning_rate": 8.334620397868105e-06, + "loss": 1.08612776, + "memory(GiB)": 302.58, + "step": 108800, + "train_speed(iter/s)": 0.123674 + }, + { + "acc": 0.75965466, + "epoch": 0.6085729824801782, + "grad_norm": 9.75, + "learning_rate": 8.333931319431974e-06, + "loss": 0.95055475, + "memory(GiB)": 302.58, + "step": 108820, + "train_speed(iter/s)": 0.123684 + }, + { + "acc": 0.74394383, + "epoch": 0.6086848319531575, + "grad_norm": 8.375, + "learning_rate": 8.33324212696482e-06, + "loss": 1.00303249, + "memory(GiB)": 302.58, + "step": 108840, + "train_speed(iter/s)": 0.123695 + }, + { + "acc": 0.74146228, + "epoch": 0.6087966814261367, + "grad_norm": 6.21875, + "learning_rate": 8.332552820490215e-06, + "loss": 1.01534252, + "memory(GiB)": 302.58, + "step": 108860, + "train_speed(iter/s)": 0.123704 + }, + { + "acc": 0.74747243, + "epoch": 0.608908530899116, + "grad_norm": 7.65625, + "learning_rate": 8.331863400031737e-06, + "loss": 0.97939873, + "memory(GiB)": 302.58, + "step": 108880, + "train_speed(iter/s)": 0.123715 + }, + { + "acc": 0.72712827, + "epoch": 0.6090203803720953, + "grad_norm": 8.375, + "learning_rate": 8.331173865612968e-06, + "loss": 1.06293449, + "memory(GiB)": 302.58, + "step": 108900, + "train_speed(iter/s)": 0.123726 + }, + { + "acc": 0.72616901, + "epoch": 0.6091322298450745, + "grad_norm": 7.40625, + "learning_rate": 8.330484217257487e-06, + "loss": 1.07203913, + "memory(GiB)": 302.58, + "step": 108920, + "train_speed(iter/s)": 0.123737 + }, + { + "acc": 0.7516304, + "epoch": 0.6092440793180538, + "grad_norm": 6.8125, + "learning_rate": 8.329794454988887e-06, + "loss": 0.99458418, + "memory(GiB)": 302.58, + "step": 108940, + "train_speed(iter/s)": 0.123748 + }, + { + "acc": 0.73573871, + "epoch": 0.6093559287910331, + "grad_norm": 6.96875, + "learning_rate": 8.329104578830756e-06, + "loss": 1.02657318, + "memory(GiB)": 302.58, + "step": 108960, + "train_speed(iter/s)": 0.123758 + }, + { + "acc": 0.73432617, + "epoch": 0.6094677782640123, + "grad_norm": 8.375, + "learning_rate": 8.328414588806694e-06, + "loss": 1.06012611, + "memory(GiB)": 302.58, + "step": 108980, + "train_speed(iter/s)": 0.123769 + }, + { + "acc": 0.73843923, + "epoch": 0.6095796277369916, + "grad_norm": 5.28125, + "learning_rate": 8.327724484940298e-06, + "loss": 1.02932024, + "memory(GiB)": 302.58, + "step": 109000, + "train_speed(iter/s)": 0.12378 + }, + { + "acc": 0.7379499, + "epoch": 0.6096914772099709, + "grad_norm": 6.59375, + "learning_rate": 8.327034267255175e-06, + "loss": 1.04102564, + "memory(GiB)": 302.58, + "step": 109020, + "train_speed(iter/s)": 0.12379 + }, + { + "acc": 0.73841839, + "epoch": 0.6098033266829501, + "grad_norm": 5.90625, + "learning_rate": 8.326343935774927e-06, + "loss": 1.06420403, + "memory(GiB)": 302.58, + "step": 109040, + "train_speed(iter/s)": 0.123802 + }, + { + "acc": 0.74354835, + "epoch": 0.6099151761559294, + "grad_norm": 7.8125, + "learning_rate": 8.32565349052317e-06, + "loss": 1.00731783, + "memory(GiB)": 302.58, + "step": 109060, + "train_speed(iter/s)": 0.123812 + }, + { + "acc": 0.73722668, + "epoch": 0.6100270256289086, + "grad_norm": 6.65625, + "learning_rate": 8.32496293152352e-06, + "loss": 1.01991959, + "memory(GiB)": 302.58, + "step": 109080, + "train_speed(iter/s)": 0.123823 + }, + { + "acc": 0.74181585, + "epoch": 0.6101388751018879, + "grad_norm": 6.5, + "learning_rate": 8.324272258799593e-06, + "loss": 1.02514954, + "memory(GiB)": 302.58, + "step": 109100, + "train_speed(iter/s)": 0.123834 + }, + { + "acc": 0.72638321, + "epoch": 0.6102507245748672, + "grad_norm": 8.625, + "learning_rate": 8.323581472375014e-06, + "loss": 1.04132528, + "memory(GiB)": 302.58, + "step": 109120, + "train_speed(iter/s)": 0.123844 + }, + { + "acc": 0.75177827, + "epoch": 0.6103625740478464, + "grad_norm": 5.9375, + "learning_rate": 8.322890572273408e-06, + "loss": 0.95083218, + "memory(GiB)": 302.58, + "step": 109140, + "train_speed(iter/s)": 0.123854 + }, + { + "acc": 0.75867305, + "epoch": 0.6104744235208257, + "grad_norm": 6.78125, + "learning_rate": 8.32219955851841e-06, + "loss": 0.94336834, + "memory(GiB)": 302.58, + "step": 109160, + "train_speed(iter/s)": 0.123865 + }, + { + "acc": 0.75214643, + "epoch": 0.610586272993805, + "grad_norm": 8.0, + "learning_rate": 8.321508431133653e-06, + "loss": 0.97239876, + "memory(GiB)": 302.58, + "step": 109180, + "train_speed(iter/s)": 0.123876 + }, + { + "acc": 0.75055785, + "epoch": 0.6106981224667842, + "grad_norm": 6.84375, + "learning_rate": 8.320817190142774e-06, + "loss": 0.97993841, + "memory(GiB)": 302.58, + "step": 109200, + "train_speed(iter/s)": 0.123887 + }, + { + "acc": 0.73092523, + "epoch": 0.6108099719397635, + "grad_norm": 7.84375, + "learning_rate": 8.320125835569417e-06, + "loss": 1.03056126, + "memory(GiB)": 302.58, + "step": 109220, + "train_speed(iter/s)": 0.123897 + }, + { + "acc": 0.7443079, + "epoch": 0.6109218214127428, + "grad_norm": 9.625, + "learning_rate": 8.319434367437231e-06, + "loss": 0.99377356, + "memory(GiB)": 302.58, + "step": 109240, + "train_speed(iter/s)": 0.123908 + }, + { + "acc": 0.72574811, + "epoch": 0.611033670885722, + "grad_norm": 7.21875, + "learning_rate": 8.318742785769862e-06, + "loss": 1.0840436, + "memory(GiB)": 302.58, + "step": 109260, + "train_speed(iter/s)": 0.123918 + }, + { + "acc": 0.74195433, + "epoch": 0.6111455203587013, + "grad_norm": 7.375, + "learning_rate": 8.318051090590967e-06, + "loss": 1.01712189, + "memory(GiB)": 302.58, + "step": 109280, + "train_speed(iter/s)": 0.123929 + }, + { + "acc": 0.74136605, + "epoch": 0.6112573698316806, + "grad_norm": 8.0, + "learning_rate": 8.317359281924202e-06, + "loss": 1.01016111, + "memory(GiB)": 302.58, + "step": 109300, + "train_speed(iter/s)": 0.123938 + }, + { + "acc": 0.75090013, + "epoch": 0.6113692193046598, + "grad_norm": 7.84375, + "learning_rate": 8.31666735979323e-06, + "loss": 0.9818881, + "memory(GiB)": 302.58, + "step": 109320, + "train_speed(iter/s)": 0.123949 + }, + { + "acc": 0.73166685, + "epoch": 0.6114810687776391, + "grad_norm": 6.1875, + "learning_rate": 8.315975324221721e-06, + "loss": 1.04401941, + "memory(GiB)": 302.58, + "step": 109340, + "train_speed(iter/s)": 0.123959 + }, + { + "acc": 0.74263248, + "epoch": 0.6115929182506183, + "grad_norm": 8.4375, + "learning_rate": 8.31528317523334e-06, + "loss": 1.01834641, + "memory(GiB)": 302.58, + "step": 109360, + "train_speed(iter/s)": 0.12397 + }, + { + "acc": 0.7377718, + "epoch": 0.6117047677235976, + "grad_norm": 7.03125, + "learning_rate": 8.31459091285176e-06, + "loss": 1.03782969, + "memory(GiB)": 302.58, + "step": 109380, + "train_speed(iter/s)": 0.123981 + }, + { + "acc": 0.72963738, + "epoch": 0.6118166171965769, + "grad_norm": 9.3125, + "learning_rate": 8.313898537100662e-06, + "loss": 1.07807407, + "memory(GiB)": 302.58, + "step": 109400, + "train_speed(iter/s)": 0.123992 + }, + { + "acc": 0.73126655, + "epoch": 0.6119284666695561, + "grad_norm": 4.375, + "learning_rate": 8.313206048003725e-06, + "loss": 1.08027296, + "memory(GiB)": 302.58, + "step": 109420, + "train_speed(iter/s)": 0.124002 + }, + { + "acc": 0.73707671, + "epoch": 0.6120403161425354, + "grad_norm": 4.875, + "learning_rate": 8.312513445584636e-06, + "loss": 1.02367716, + "memory(GiB)": 302.58, + "step": 109440, + "train_speed(iter/s)": 0.124013 + }, + { + "acc": 0.75557795, + "epoch": 0.6121521656155147, + "grad_norm": 7.46875, + "learning_rate": 8.311820729867085e-06, + "loss": 0.95201645, + "memory(GiB)": 302.58, + "step": 109460, + "train_speed(iter/s)": 0.124024 + }, + { + "acc": 0.74725285, + "epoch": 0.6122640150884939, + "grad_norm": 6.4375, + "learning_rate": 8.31112790087476e-06, + "loss": 0.99117842, + "memory(GiB)": 302.58, + "step": 109480, + "train_speed(iter/s)": 0.124035 + }, + { + "acc": 0.73754992, + "epoch": 0.6123758645614732, + "grad_norm": 10.0625, + "learning_rate": 8.310434958631363e-06, + "loss": 1.01820793, + "memory(GiB)": 302.58, + "step": 109500, + "train_speed(iter/s)": 0.124045 + }, + { + "acc": 0.73299708, + "epoch": 0.6124877140344525, + "grad_norm": 6.71875, + "learning_rate": 8.309741903160593e-06, + "loss": 1.08487711, + "memory(GiB)": 302.58, + "step": 109520, + "train_speed(iter/s)": 0.124055 + }, + { + "acc": 0.74326081, + "epoch": 0.6125995635074317, + "grad_norm": 5.875, + "learning_rate": 8.309048734486156e-06, + "loss": 0.99025955, + "memory(GiB)": 302.58, + "step": 109540, + "train_speed(iter/s)": 0.124066 + }, + { + "acc": 0.73456602, + "epoch": 0.612711412980411, + "grad_norm": 8.75, + "learning_rate": 8.30835545263176e-06, + "loss": 1.04513712, + "memory(GiB)": 302.58, + "step": 109560, + "train_speed(iter/s)": 0.124077 + }, + { + "acc": 0.72220435, + "epoch": 0.6128232624533902, + "grad_norm": 7.15625, + "learning_rate": 8.307662057621115e-06, + "loss": 1.10888662, + "memory(GiB)": 302.58, + "step": 109580, + "train_speed(iter/s)": 0.124087 + }, + { + "acc": 0.73812485, + "epoch": 0.6129351119263695, + "grad_norm": 6.21875, + "learning_rate": 8.306968549477941e-06, + "loss": 1.03358727, + "memory(GiB)": 302.58, + "step": 109600, + "train_speed(iter/s)": 0.124098 + }, + { + "acc": 0.75252995, + "epoch": 0.6130469613993488, + "grad_norm": 7.90625, + "learning_rate": 8.306274928225957e-06, + "loss": 0.9604352, + "memory(GiB)": 302.58, + "step": 109620, + "train_speed(iter/s)": 0.124109 + }, + { + "acc": 0.74398575, + "epoch": 0.613158810872328, + "grad_norm": 5.40625, + "learning_rate": 8.305581193888886e-06, + "loss": 1.00970402, + "memory(GiB)": 302.58, + "step": 109640, + "train_speed(iter/s)": 0.124119 + }, + { + "acc": 0.73488698, + "epoch": 0.6132706603453073, + "grad_norm": 10.3125, + "learning_rate": 8.304887346490455e-06, + "loss": 1.04997129, + "memory(GiB)": 302.58, + "step": 109660, + "train_speed(iter/s)": 0.124129 + }, + { + "acc": 0.74220085, + "epoch": 0.6133825098182866, + "grad_norm": 6.34375, + "learning_rate": 8.304193386054399e-06, + "loss": 1.0210557, + "memory(GiB)": 302.58, + "step": 109680, + "train_speed(iter/s)": 0.12414 + }, + { + "acc": 0.73754454, + "epoch": 0.6134943592912658, + "grad_norm": 7.375, + "learning_rate": 8.30349931260445e-06, + "loss": 1.00865507, + "memory(GiB)": 302.58, + "step": 109700, + "train_speed(iter/s)": 0.12415 + }, + { + "acc": 0.73052154, + "epoch": 0.6136062087642451, + "grad_norm": 7.03125, + "learning_rate": 8.302805126164351e-06, + "loss": 1.08059902, + "memory(GiB)": 302.58, + "step": 109720, + "train_speed(iter/s)": 0.124161 + }, + { + "acc": 0.74290495, + "epoch": 0.6137180582372244, + "grad_norm": 8.5, + "learning_rate": 8.302110826757844e-06, + "loss": 1.00223694, + "memory(GiB)": 302.58, + "step": 109740, + "train_speed(iter/s)": 0.124172 + }, + { + "acc": 0.74569316, + "epoch": 0.6138299077102036, + "grad_norm": 7.6875, + "learning_rate": 8.301416414408674e-06, + "loss": 0.99307985, + "memory(GiB)": 302.58, + "step": 109760, + "train_speed(iter/s)": 0.124182 + }, + { + "acc": 0.7391881, + "epoch": 0.6139417571831829, + "grad_norm": 10.5, + "learning_rate": 8.300721889140596e-06, + "loss": 1.03392944, + "memory(GiB)": 302.58, + "step": 109780, + "train_speed(iter/s)": 0.124192 + }, + { + "acc": 0.72538891, + "epoch": 0.6140536066561622, + "grad_norm": 7.65625, + "learning_rate": 8.300027250977362e-06, + "loss": 1.09171782, + "memory(GiB)": 302.58, + "step": 109800, + "train_speed(iter/s)": 0.124202 + }, + { + "acc": 0.73525825, + "epoch": 0.6141654561291414, + "grad_norm": 6.90625, + "learning_rate": 8.299332499942733e-06, + "loss": 1.0378335, + "memory(GiB)": 302.58, + "step": 109820, + "train_speed(iter/s)": 0.124213 + }, + { + "acc": 0.74474692, + "epoch": 0.6142773056021207, + "grad_norm": 9.8125, + "learning_rate": 8.29863763606047e-06, + "loss": 1.0060627, + "memory(GiB)": 302.58, + "step": 109840, + "train_speed(iter/s)": 0.124223 + }, + { + "acc": 0.73038073, + "epoch": 0.6143891550751, + "grad_norm": 6.34375, + "learning_rate": 8.297942659354341e-06, + "loss": 1.07825346, + "memory(GiB)": 302.58, + "step": 109860, + "train_speed(iter/s)": 0.124234 + }, + { + "acc": 0.73389111, + "epoch": 0.6145010045480792, + "grad_norm": 6.4375, + "learning_rate": 8.297247569848115e-06, + "loss": 1.05429506, + "memory(GiB)": 302.58, + "step": 109880, + "train_speed(iter/s)": 0.124244 + }, + { + "acc": 0.73319335, + "epoch": 0.6146128540210585, + "grad_norm": 6.3125, + "learning_rate": 8.296552367565566e-06, + "loss": 1.04986162, + "memory(GiB)": 302.58, + "step": 109900, + "train_speed(iter/s)": 0.124256 + }, + { + "acc": 0.73418264, + "epoch": 0.6147247034940377, + "grad_norm": 7.21875, + "learning_rate": 8.295857052530473e-06, + "loss": 1.03287878, + "memory(GiB)": 302.58, + "step": 109920, + "train_speed(iter/s)": 0.124266 + }, + { + "acc": 0.74367194, + "epoch": 0.614836552967017, + "grad_norm": 5.59375, + "learning_rate": 8.29516162476662e-06, + "loss": 0.99718828, + "memory(GiB)": 302.58, + "step": 109940, + "train_speed(iter/s)": 0.124277 + }, + { + "acc": 0.75283694, + "epoch": 0.6149484024399963, + "grad_norm": 7.09375, + "learning_rate": 8.294466084297788e-06, + "loss": 0.97255535, + "memory(GiB)": 302.58, + "step": 109960, + "train_speed(iter/s)": 0.124287 + }, + { + "acc": 0.74745007, + "epoch": 0.6150602519129755, + "grad_norm": 8.9375, + "learning_rate": 8.293770431147771e-06, + "loss": 1.0011364, + "memory(GiB)": 302.58, + "step": 109980, + "train_speed(iter/s)": 0.124299 + }, + { + "acc": 0.73763199, + "epoch": 0.6151721013859548, + "grad_norm": 6.9375, + "learning_rate": 8.293074665340361e-06, + "loss": 1.02432013, + "memory(GiB)": 302.58, + "step": 110000, + "train_speed(iter/s)": 0.124309 + }, + { + "epoch": 0.6151721013859548, + "eval_acc": 0.7031138019268931, + "eval_loss": 1.0284388065338135, + "eval_runtime": 7500.6003, + "eval_samples_per_second": 10.037, + "eval_steps_per_second": 10.037, + "step": 110000 + }, + { + "acc": 0.72643881, + "epoch": 0.6152839508589341, + "grad_norm": 9.5, + "learning_rate": 8.292378786899355e-06, + "loss": 1.06298952, + "memory(GiB)": 302.58, + "step": 110020, + "train_speed(iter/s)": 0.123257 + }, + { + "acc": 0.73017092, + "epoch": 0.6153958003319133, + "grad_norm": 7.03125, + "learning_rate": 8.291682795848555e-06, + "loss": 1.06652784, + "memory(GiB)": 302.58, + "step": 110040, + "train_speed(iter/s)": 0.123268 + }, + { + "acc": 0.7347826, + "epoch": 0.6155076498048926, + "grad_norm": 7.34375, + "learning_rate": 8.290986692211764e-06, + "loss": 1.0539155, + "memory(GiB)": 302.58, + "step": 110060, + "train_speed(iter/s)": 0.123278 + }, + { + "acc": 0.74524212, + "epoch": 0.6156194992778719, + "grad_norm": 7.125, + "learning_rate": 8.290290476012794e-06, + "loss": 1.00135012, + "memory(GiB)": 302.58, + "step": 110080, + "train_speed(iter/s)": 0.123289 + }, + { + "acc": 0.72180634, + "epoch": 0.6157313487508511, + "grad_norm": 7.6875, + "learning_rate": 8.289594147275458e-06, + "loss": 1.13178902, + "memory(GiB)": 302.58, + "step": 110100, + "train_speed(iter/s)": 0.123298 + }, + { + "acc": 0.73482957, + "epoch": 0.6158431982238304, + "grad_norm": 6.75, + "learning_rate": 8.288897706023569e-06, + "loss": 1.04019041, + "memory(GiB)": 302.58, + "step": 110120, + "train_speed(iter/s)": 0.123308 + }, + { + "acc": 0.73635421, + "epoch": 0.6159550476968096, + "grad_norm": 6.4375, + "learning_rate": 8.28820115228095e-06, + "loss": 1.04371166, + "memory(GiB)": 302.58, + "step": 110140, + "train_speed(iter/s)": 0.123319 + }, + { + "acc": 0.73670235, + "epoch": 0.6160668971697889, + "grad_norm": 9.625, + "learning_rate": 8.287504486071425e-06, + "loss": 1.05531559, + "memory(GiB)": 302.58, + "step": 110160, + "train_speed(iter/s)": 0.123329 + }, + { + "acc": 0.73336287, + "epoch": 0.6161787466427682, + "grad_norm": 7.09375, + "learning_rate": 8.286807707418823e-06, + "loss": 1.04547071, + "memory(GiB)": 302.58, + "step": 110180, + "train_speed(iter/s)": 0.12334 + }, + { + "acc": 0.7285665, + "epoch": 0.6162905961157474, + "grad_norm": 9.5, + "learning_rate": 8.286110816346976e-06, + "loss": 1.08828773, + "memory(GiB)": 302.58, + "step": 110200, + "train_speed(iter/s)": 0.123351 + }, + { + "acc": 0.7377223, + "epoch": 0.6164024455887267, + "grad_norm": 7.71875, + "learning_rate": 8.285413812879717e-06, + "loss": 1.02921476, + "memory(GiB)": 302.58, + "step": 110220, + "train_speed(iter/s)": 0.123361 + }, + { + "acc": 0.74391303, + "epoch": 0.616514295061706, + "grad_norm": 5.40625, + "learning_rate": 8.28471669704089e-06, + "loss": 0.98431883, + "memory(GiB)": 302.58, + "step": 110240, + "train_speed(iter/s)": 0.123372 + }, + { + "acc": 0.7615201, + "epoch": 0.6166261445346852, + "grad_norm": 6.78125, + "learning_rate": 8.284019468854333e-06, + "loss": 0.93889294, + "memory(GiB)": 302.58, + "step": 110260, + "train_speed(iter/s)": 0.123382 + }, + { + "acc": 0.75400381, + "epoch": 0.6167379940076645, + "grad_norm": 6.0, + "learning_rate": 8.283322128343899e-06, + "loss": 0.97375793, + "memory(GiB)": 302.58, + "step": 110280, + "train_speed(iter/s)": 0.123392 + }, + { + "acc": 0.73258119, + "epoch": 0.6168498434806438, + "grad_norm": 6.8125, + "learning_rate": 8.282624675533438e-06, + "loss": 1.0627861, + "memory(GiB)": 302.58, + "step": 110300, + "train_speed(iter/s)": 0.123403 + }, + { + "acc": 0.75157523, + "epoch": 0.616961692953623, + "grad_norm": 3.84375, + "learning_rate": 8.281927110446803e-06, + "loss": 0.98736334, + "memory(GiB)": 302.58, + "step": 110320, + "train_speed(iter/s)": 0.123413 + }, + { + "acc": 0.75381632, + "epoch": 0.6170735424266023, + "grad_norm": 5.28125, + "learning_rate": 8.281229433107854e-06, + "loss": 0.95998487, + "memory(GiB)": 302.58, + "step": 110340, + "train_speed(iter/s)": 0.123424 + }, + { + "acc": 0.73163791, + "epoch": 0.6171853918995815, + "grad_norm": 7.65625, + "learning_rate": 8.280531643540456e-06, + "loss": 1.05549212, + "memory(GiB)": 302.58, + "step": 110360, + "train_speed(iter/s)": 0.123434 + }, + { + "acc": 0.74290328, + "epoch": 0.6172972413725608, + "grad_norm": 6.8125, + "learning_rate": 8.27983374176847e-06, + "loss": 1.00309076, + "memory(GiB)": 302.58, + "step": 110380, + "train_speed(iter/s)": 0.123445 + }, + { + "acc": 0.73306761, + "epoch": 0.6174090908455401, + "grad_norm": 7.21875, + "learning_rate": 8.279135727815773e-06, + "loss": 1.04412966, + "memory(GiB)": 302.58, + "step": 110400, + "train_speed(iter/s)": 0.123455 + }, + { + "acc": 0.74314938, + "epoch": 0.6175209403185193, + "grad_norm": 7.46875, + "learning_rate": 8.278437601706232e-06, + "loss": 1.00995007, + "memory(GiB)": 302.58, + "step": 110420, + "train_speed(iter/s)": 0.123465 + }, + { + "acc": 0.74571419, + "epoch": 0.6176327897914986, + "grad_norm": 5.78125, + "learning_rate": 8.277739363463734e-06, + "loss": 0.98530102, + "memory(GiB)": 302.58, + "step": 110440, + "train_speed(iter/s)": 0.123476 + }, + { + "acc": 0.74496374, + "epoch": 0.6177446392644779, + "grad_norm": 6.0625, + "learning_rate": 8.277041013112154e-06, + "loss": 0.99455786, + "memory(GiB)": 302.58, + "step": 110460, + "train_speed(iter/s)": 0.123487 + }, + { + "acc": 0.73335872, + "epoch": 0.6178564887374571, + "grad_norm": 6.6875, + "learning_rate": 8.276342550675381e-06, + "loss": 1.05176897, + "memory(GiB)": 302.58, + "step": 110480, + "train_speed(iter/s)": 0.123498 + }, + { + "acc": 0.73578959, + "epoch": 0.6179683382104364, + "grad_norm": 8.1875, + "learning_rate": 8.275643976177303e-06, + "loss": 1.01354704, + "memory(GiB)": 302.58, + "step": 110500, + "train_speed(iter/s)": 0.123508 + }, + { + "acc": 0.72205844, + "epoch": 0.6180801876834157, + "grad_norm": 6.1875, + "learning_rate": 8.274945289641816e-06, + "loss": 1.12218561, + "memory(GiB)": 302.58, + "step": 110520, + "train_speed(iter/s)": 0.123519 + }, + { + "acc": 0.72319746, + "epoch": 0.6181920371563949, + "grad_norm": 7.65625, + "learning_rate": 8.274246491092815e-06, + "loss": 1.08395977, + "memory(GiB)": 302.58, + "step": 110540, + "train_speed(iter/s)": 0.123529 + }, + { + "acc": 0.74204741, + "epoch": 0.6183038866293742, + "grad_norm": 7.5625, + "learning_rate": 8.2735475805542e-06, + "loss": 1.01360712, + "memory(GiB)": 302.58, + "step": 110560, + "train_speed(iter/s)": 0.12354 + }, + { + "acc": 0.74293237, + "epoch": 0.6184157361023535, + "grad_norm": 6.96875, + "learning_rate": 8.27284855804988e-06, + "loss": 1.01511011, + "memory(GiB)": 302.58, + "step": 110580, + "train_speed(iter/s)": 0.123549 + }, + { + "acc": 0.72260895, + "epoch": 0.6185275855753327, + "grad_norm": 6.5625, + "learning_rate": 8.27214942360376e-06, + "loss": 1.11409407, + "memory(GiB)": 302.58, + "step": 110600, + "train_speed(iter/s)": 0.12356 + }, + { + "acc": 0.74399223, + "epoch": 0.618639435048312, + "grad_norm": 5.78125, + "learning_rate": 8.271450177239752e-06, + "loss": 1.01976595, + "memory(GiB)": 302.58, + "step": 110620, + "train_speed(iter/s)": 0.12357 + }, + { + "acc": 0.74747448, + "epoch": 0.6187512845212912, + "grad_norm": 6.625, + "learning_rate": 8.270750818981776e-06, + "loss": 1.00897284, + "memory(GiB)": 302.58, + "step": 110640, + "train_speed(iter/s)": 0.12358 + }, + { + "acc": 0.73390379, + "epoch": 0.6188631339942705, + "grad_norm": 8.3125, + "learning_rate": 8.270051348853752e-06, + "loss": 1.03698826, + "memory(GiB)": 302.58, + "step": 110660, + "train_speed(iter/s)": 0.123591 + }, + { + "acc": 0.7214685, + "epoch": 0.6189749834672498, + "grad_norm": 6.09375, + "learning_rate": 8.269351766879602e-06, + "loss": 1.09430609, + "memory(GiB)": 302.58, + "step": 110680, + "train_speed(iter/s)": 0.123602 + }, + { + "acc": 0.74220405, + "epoch": 0.619086832940229, + "grad_norm": 8.75, + "learning_rate": 8.268652073083254e-06, + "loss": 1.02097092, + "memory(GiB)": 302.58, + "step": 110700, + "train_speed(iter/s)": 0.123612 + }, + { + "acc": 0.73294578, + "epoch": 0.6191986824132083, + "grad_norm": 5.96875, + "learning_rate": 8.26795226748864e-06, + "loss": 1.05496588, + "memory(GiB)": 302.58, + "step": 110720, + "train_speed(iter/s)": 0.123623 + }, + { + "acc": 0.74806528, + "epoch": 0.6193105318861876, + "grad_norm": 8.125, + "learning_rate": 8.267252350119698e-06, + "loss": 0.97138815, + "memory(GiB)": 302.58, + "step": 110740, + "train_speed(iter/s)": 0.123634 + }, + { + "acc": 0.74863219, + "epoch": 0.6194223813591668, + "grad_norm": 6.34375, + "learning_rate": 8.266552321000366e-06, + "loss": 0.98719358, + "memory(GiB)": 302.58, + "step": 110760, + "train_speed(iter/s)": 0.123644 + }, + { + "acc": 0.73241577, + "epoch": 0.6195342308321461, + "grad_norm": 6.5625, + "learning_rate": 8.265852180154584e-06, + "loss": 1.0417099, + "memory(GiB)": 302.58, + "step": 110780, + "train_speed(iter/s)": 0.123654 + }, + { + "acc": 0.74047918, + "epoch": 0.6196460803051254, + "grad_norm": 6.0, + "learning_rate": 8.265151927606304e-06, + "loss": 1.02151394, + "memory(GiB)": 302.58, + "step": 110800, + "train_speed(iter/s)": 0.123664 + }, + { + "acc": 0.74535499, + "epoch": 0.6197579297781046, + "grad_norm": 8.0, + "learning_rate": 8.264451563379475e-06, + "loss": 1.00454826, + "memory(GiB)": 302.58, + "step": 110820, + "train_speed(iter/s)": 0.123675 + }, + { + "acc": 0.74456844, + "epoch": 0.6198697792510839, + "grad_norm": 5.09375, + "learning_rate": 8.263751087498049e-06, + "loss": 0.99726791, + "memory(GiB)": 302.58, + "step": 110840, + "train_speed(iter/s)": 0.123685 + }, + { + "acc": 0.75332208, + "epoch": 0.6199816287240631, + "grad_norm": 6.25, + "learning_rate": 8.263050499985987e-06, + "loss": 0.95491562, + "memory(GiB)": 302.58, + "step": 110860, + "train_speed(iter/s)": 0.123696 + }, + { + "acc": 0.72007761, + "epoch": 0.6200934781970424, + "grad_norm": 6.40625, + "learning_rate": 8.26234980086725e-06, + "loss": 1.11813879, + "memory(GiB)": 302.58, + "step": 110880, + "train_speed(iter/s)": 0.123705 + }, + { + "acc": 0.74571619, + "epoch": 0.6202053276700217, + "grad_norm": 6.6875, + "learning_rate": 8.261648990165808e-06, + "loss": 0.99886332, + "memory(GiB)": 302.58, + "step": 110900, + "train_speed(iter/s)": 0.123716 + }, + { + "acc": 0.734692, + "epoch": 0.6203171771430009, + "grad_norm": 9.625, + "learning_rate": 8.260948067905627e-06, + "loss": 1.06422348, + "memory(GiB)": 302.58, + "step": 110920, + "train_speed(iter/s)": 0.123726 + }, + { + "acc": 0.75459456, + "epoch": 0.6204290266159802, + "grad_norm": 5.875, + "learning_rate": 8.260247034110682e-06, + "loss": 0.96624031, + "memory(GiB)": 302.58, + "step": 110940, + "train_speed(iter/s)": 0.123736 + }, + { + "acc": 0.76104021, + "epoch": 0.6205408760889595, + "grad_norm": 5.71875, + "learning_rate": 8.259545888804949e-06, + "loss": 0.92251673, + "memory(GiB)": 302.58, + "step": 110960, + "train_speed(iter/s)": 0.123747 + }, + { + "acc": 0.73963051, + "epoch": 0.6206527255619387, + "grad_norm": 5.40625, + "learning_rate": 8.258844632012412e-06, + "loss": 1.01529312, + "memory(GiB)": 302.58, + "step": 110980, + "train_speed(iter/s)": 0.123757 + }, + { + "acc": 0.75354505, + "epoch": 0.620764575034918, + "grad_norm": 7.34375, + "learning_rate": 8.258143263757057e-06, + "loss": 0.96391296, + "memory(GiB)": 302.58, + "step": 111000, + "train_speed(iter/s)": 0.123767 + }, + { + "acc": 0.7403873, + "epoch": 0.6208764245078973, + "grad_norm": 10.0625, + "learning_rate": 8.25744178406287e-06, + "loss": 1.03415155, + "memory(GiB)": 302.58, + "step": 111020, + "train_speed(iter/s)": 0.123778 + }, + { + "acc": 0.72488885, + "epoch": 0.6209882739808765, + "grad_norm": 5.28125, + "learning_rate": 8.256740192953842e-06, + "loss": 1.06373301, + "memory(GiB)": 302.58, + "step": 111040, + "train_speed(iter/s)": 0.123787 + }, + { + "acc": 0.72396812, + "epoch": 0.6211001234538558, + "grad_norm": 7.0, + "learning_rate": 8.256038490453977e-06, + "loss": 1.08659811, + "memory(GiB)": 302.58, + "step": 111060, + "train_speed(iter/s)": 0.123797 + }, + { + "acc": 0.7225791, + "epoch": 0.621211972926835, + "grad_norm": 6.90625, + "learning_rate": 8.255336676587267e-06, + "loss": 1.09636583, + "memory(GiB)": 302.58, + "step": 111080, + "train_speed(iter/s)": 0.123808 + }, + { + "acc": 0.72051659, + "epoch": 0.6213238223998143, + "grad_norm": 7.125, + "learning_rate": 8.254634751377723e-06, + "loss": 1.11294632, + "memory(GiB)": 302.58, + "step": 111100, + "train_speed(iter/s)": 0.123819 + }, + { + "acc": 0.73513255, + "epoch": 0.6214356718727936, + "grad_norm": 7.3125, + "learning_rate": 8.25393271484935e-06, + "loss": 1.04141855, + "memory(GiB)": 302.58, + "step": 111120, + "train_speed(iter/s)": 0.123829 + }, + { + "acc": 0.74939003, + "epoch": 0.6215475213457728, + "grad_norm": 5.625, + "learning_rate": 8.253230567026159e-06, + "loss": 0.97508392, + "memory(GiB)": 302.58, + "step": 111140, + "train_speed(iter/s)": 0.123839 + }, + { + "acc": 0.75093112, + "epoch": 0.6216593708187521, + "grad_norm": 6.40625, + "learning_rate": 8.252528307932168e-06, + "loss": 0.98046093, + "memory(GiB)": 302.58, + "step": 111160, + "train_speed(iter/s)": 0.12385 + }, + { + "acc": 0.73205976, + "epoch": 0.6217712202917314, + "grad_norm": 6.375, + "learning_rate": 8.251825937591394e-06, + "loss": 1.05883255, + "memory(GiB)": 302.58, + "step": 111180, + "train_speed(iter/s)": 0.123861 + }, + { + "acc": 0.74356956, + "epoch": 0.6218830697647106, + "grad_norm": 8.5625, + "learning_rate": 8.251123456027862e-06, + "loss": 0.99147806, + "memory(GiB)": 302.58, + "step": 111200, + "train_speed(iter/s)": 0.123871 + }, + { + "acc": 0.73845868, + "epoch": 0.6219949192376899, + "grad_norm": 5.3125, + "learning_rate": 8.2504208632656e-06, + "loss": 1.04909525, + "memory(GiB)": 302.58, + "step": 111220, + "train_speed(iter/s)": 0.123882 + }, + { + "acc": 0.72952228, + "epoch": 0.6221067687106692, + "grad_norm": 8.4375, + "learning_rate": 8.249718159328636e-06, + "loss": 1.06013184, + "memory(GiB)": 302.58, + "step": 111240, + "train_speed(iter/s)": 0.123893 + }, + { + "acc": 0.7539125, + "epoch": 0.6222186181836484, + "grad_norm": 8.8125, + "learning_rate": 8.24901534424101e-06, + "loss": 0.98417463, + "memory(GiB)": 302.58, + "step": 111260, + "train_speed(iter/s)": 0.123903 + }, + { + "acc": 0.73324094, + "epoch": 0.6223304676566277, + "grad_norm": 8.8125, + "learning_rate": 8.248312418026753e-06, + "loss": 1.0580266, + "memory(GiB)": 302.58, + "step": 111280, + "train_speed(iter/s)": 0.123914 + }, + { + "acc": 0.73710947, + "epoch": 0.622442317129607, + "grad_norm": 7.6875, + "learning_rate": 8.247609380709914e-06, + "loss": 1.04625845, + "memory(GiB)": 302.58, + "step": 111300, + "train_speed(iter/s)": 0.123924 + }, + { + "acc": 0.75498748, + "epoch": 0.6225541666025862, + "grad_norm": 6.15625, + "learning_rate": 8.246906232314532e-06, + "loss": 0.96155815, + "memory(GiB)": 302.58, + "step": 111320, + "train_speed(iter/s)": 0.123935 + }, + { + "acc": 0.73821697, + "epoch": 0.6226660160755655, + "grad_norm": 6.15625, + "learning_rate": 8.246202972864665e-06, + "loss": 1.01914263, + "memory(GiB)": 302.58, + "step": 111340, + "train_speed(iter/s)": 0.123946 + }, + { + "acc": 0.71770043, + "epoch": 0.6227778655485448, + "grad_norm": 6.875, + "learning_rate": 8.245499602384362e-06, + "loss": 1.11948061, + "memory(GiB)": 302.58, + "step": 111360, + "train_speed(iter/s)": 0.123957 + }, + { + "acc": 0.74877839, + "epoch": 0.622889715021524, + "grad_norm": 10.625, + "learning_rate": 8.24479612089768e-06, + "loss": 0.98696194, + "memory(GiB)": 302.58, + "step": 111380, + "train_speed(iter/s)": 0.123967 + }, + { + "acc": 0.74587317, + "epoch": 0.6230015644945033, + "grad_norm": 6.78125, + "learning_rate": 8.244092528428685e-06, + "loss": 0.99113131, + "memory(GiB)": 302.58, + "step": 111400, + "train_speed(iter/s)": 0.123978 + }, + { + "acc": 0.73064785, + "epoch": 0.6231134139674825, + "grad_norm": 6.71875, + "learning_rate": 8.243388825001434e-06, + "loss": 1.0834096, + "memory(GiB)": 302.58, + "step": 111420, + "train_speed(iter/s)": 0.123988 + }, + { + "acc": 0.73878112, + "epoch": 0.6232252634404618, + "grad_norm": 7.4375, + "learning_rate": 8.242685010640005e-06, + "loss": 1.03072891, + "memory(GiB)": 302.58, + "step": 111440, + "train_speed(iter/s)": 0.123998 + }, + { + "acc": 0.74077568, + "epoch": 0.6233371129134411, + "grad_norm": 9.3125, + "learning_rate": 8.241981085368464e-06, + "loss": 1.00741901, + "memory(GiB)": 302.58, + "step": 111460, + "train_speed(iter/s)": 0.124009 + }, + { + "acc": 0.73414097, + "epoch": 0.6234489623864203, + "grad_norm": 8.125, + "learning_rate": 8.241277049210892e-06, + "loss": 1.04372616, + "memory(GiB)": 302.58, + "step": 111480, + "train_speed(iter/s)": 0.12402 + }, + { + "acc": 0.74847136, + "epoch": 0.6235608118593996, + "grad_norm": 9.75, + "learning_rate": 8.240572902191364e-06, + "loss": 1.005091, + "memory(GiB)": 302.58, + "step": 111500, + "train_speed(iter/s)": 0.124031 + }, + { + "acc": 0.72717404, + "epoch": 0.6236726613323789, + "grad_norm": 8.5625, + "learning_rate": 8.23986864433397e-06, + "loss": 1.0748703, + "memory(GiB)": 302.58, + "step": 111520, + "train_speed(iter/s)": 0.124041 + }, + { + "acc": 0.73275337, + "epoch": 0.6237845108053581, + "grad_norm": 7.625, + "learning_rate": 8.239164275662794e-06, + "loss": 1.03104496, + "memory(GiB)": 302.58, + "step": 111540, + "train_speed(iter/s)": 0.124051 + }, + { + "acc": 0.73249879, + "epoch": 0.6238963602783374, + "grad_norm": 9.375, + "learning_rate": 8.238459796201928e-06, + "loss": 1.06142921, + "memory(GiB)": 302.58, + "step": 111560, + "train_speed(iter/s)": 0.124061 + }, + { + "acc": 0.74914346, + "epoch": 0.6240082097513167, + "grad_norm": 8.0625, + "learning_rate": 8.237755205975468e-06, + "loss": 0.99823027, + "memory(GiB)": 302.58, + "step": 111580, + "train_speed(iter/s)": 0.124072 + }, + { + "acc": 0.73442526, + "epoch": 0.6241200592242959, + "grad_norm": 7.1875, + "learning_rate": 8.237050505007514e-06, + "loss": 1.0321804, + "memory(GiB)": 302.58, + "step": 111600, + "train_speed(iter/s)": 0.124082 + }, + { + "acc": 0.71694031, + "epoch": 0.6242319086972752, + "grad_norm": 6.5, + "learning_rate": 8.23634569332217e-06, + "loss": 1.1306963, + "memory(GiB)": 302.58, + "step": 111620, + "train_speed(iter/s)": 0.124093 + }, + { + "acc": 0.7460391, + "epoch": 0.6243437581702544, + "grad_norm": 4.5, + "learning_rate": 8.235640770943538e-06, + "loss": 0.99559336, + "memory(GiB)": 302.58, + "step": 111640, + "train_speed(iter/s)": 0.124103 + }, + { + "acc": 0.72455654, + "epoch": 0.6244556076432337, + "grad_norm": 8.25, + "learning_rate": 8.234935737895732e-06, + "loss": 1.11872873, + "memory(GiB)": 302.58, + "step": 111660, + "train_speed(iter/s)": 0.124114 + }, + { + "acc": 0.73836613, + "epoch": 0.624567457116213, + "grad_norm": 5.4375, + "learning_rate": 8.234230594202867e-06, + "loss": 1.04089012, + "memory(GiB)": 302.58, + "step": 111680, + "train_speed(iter/s)": 0.124125 + }, + { + "acc": 0.7449719, + "epoch": 0.6246793065891922, + "grad_norm": 5.125, + "learning_rate": 8.233525339889062e-06, + "loss": 1.00600338, + "memory(GiB)": 302.58, + "step": 111700, + "train_speed(iter/s)": 0.124135 + }, + { + "acc": 0.75825701, + "epoch": 0.6247911560621715, + "grad_norm": 9.25, + "learning_rate": 8.232819974978435e-06, + "loss": 0.9357955, + "memory(GiB)": 302.58, + "step": 111720, + "train_speed(iter/s)": 0.124146 + }, + { + "acc": 0.71430459, + "epoch": 0.6249030055351508, + "grad_norm": 5.21875, + "learning_rate": 8.232114499495114e-06, + "loss": 1.14136858, + "memory(GiB)": 302.58, + "step": 111740, + "train_speed(iter/s)": 0.124156 + }, + { + "acc": 0.75503106, + "epoch": 0.62501485500813, + "grad_norm": 5.53125, + "learning_rate": 8.23140891346323e-06, + "loss": 0.94020452, + "memory(GiB)": 302.58, + "step": 111760, + "train_speed(iter/s)": 0.124166 + }, + { + "acc": 0.73890171, + "epoch": 0.6251267044811093, + "grad_norm": 9.6875, + "learning_rate": 8.230703216906912e-06, + "loss": 1.03057947, + "memory(GiB)": 302.58, + "step": 111780, + "train_speed(iter/s)": 0.124176 + }, + { + "acc": 0.72842593, + "epoch": 0.6252385539540886, + "grad_norm": 6.96875, + "learning_rate": 8.229997409850301e-06, + "loss": 1.08623381, + "memory(GiB)": 302.58, + "step": 111800, + "train_speed(iter/s)": 0.124187 + }, + { + "acc": 0.75020766, + "epoch": 0.6253504034270678, + "grad_norm": 9.9375, + "learning_rate": 8.229291492317538e-06, + "loss": 0.97281179, + "memory(GiB)": 302.58, + "step": 111820, + "train_speed(iter/s)": 0.124198 + }, + { + "acc": 0.72916846, + "epoch": 0.6254622529000471, + "grad_norm": 7.9375, + "learning_rate": 8.228585464332764e-06, + "loss": 1.07829132, + "memory(GiB)": 302.58, + "step": 111840, + "train_speed(iter/s)": 0.124209 + }, + { + "acc": 0.75093379, + "epoch": 0.6255741023730264, + "grad_norm": 5.5, + "learning_rate": 8.22787932592013e-06, + "loss": 0.97617502, + "memory(GiB)": 302.58, + "step": 111860, + "train_speed(iter/s)": 0.124219 + }, + { + "acc": 0.74850993, + "epoch": 0.6256859518460056, + "grad_norm": 7.1875, + "learning_rate": 8.227173077103789e-06, + "loss": 0.984231, + "memory(GiB)": 302.58, + "step": 111880, + "train_speed(iter/s)": 0.124229 + }, + { + "acc": 0.75030694, + "epoch": 0.6257978013189849, + "grad_norm": 5.84375, + "learning_rate": 8.226466717907894e-06, + "loss": 0.96388941, + "memory(GiB)": 302.58, + "step": 111900, + "train_speed(iter/s)": 0.12424 + }, + { + "acc": 0.71873984, + "epoch": 0.6259096507919641, + "grad_norm": 5.6875, + "learning_rate": 8.225760248356607e-06, + "loss": 1.09854622, + "memory(GiB)": 302.58, + "step": 111920, + "train_speed(iter/s)": 0.124251 + }, + { + "acc": 0.74523716, + "epoch": 0.6260215002649434, + "grad_norm": 7.03125, + "learning_rate": 8.22505366847409e-06, + "loss": 1.00477676, + "memory(GiB)": 302.58, + "step": 111940, + "train_speed(iter/s)": 0.124261 + }, + { + "acc": 0.75134277, + "epoch": 0.6261333497379227, + "grad_norm": 10.0625, + "learning_rate": 8.224346978284513e-06, + "loss": 0.98162766, + "memory(GiB)": 302.58, + "step": 111960, + "train_speed(iter/s)": 0.124272 + }, + { + "acc": 0.72920508, + "epoch": 0.6262451992109019, + "grad_norm": 9.25, + "learning_rate": 8.223640177812044e-06, + "loss": 1.07865438, + "memory(GiB)": 302.58, + "step": 111980, + "train_speed(iter/s)": 0.124282 + }, + { + "acc": 0.73608832, + "epoch": 0.6263570486838812, + "grad_norm": 7.0, + "learning_rate": 8.22293326708086e-06, + "loss": 1.02370949, + "memory(GiB)": 302.58, + "step": 112000, + "train_speed(iter/s)": 0.124293 + }, + { + "epoch": 0.6263570486838812, + "eval_acc": 0.7032073179304539, + "eval_loss": 1.0276103019714355, + "eval_runtime": 7510.0381, + "eval_samples_per_second": 10.024, + "eval_steps_per_second": 10.024, + "step": 112000 + }, + { + "acc": 0.74465184, + "epoch": 0.6264688981568605, + "grad_norm": 6.90625, + "learning_rate": 8.222226246115137e-06, + "loss": 0.99798861, + "memory(GiB)": 302.58, + "step": 112020, + "train_speed(iter/s)": 0.123258 + }, + { + "acc": 0.74261861, + "epoch": 0.6265807476298397, + "grad_norm": 8.5, + "learning_rate": 8.22151911493906e-06, + "loss": 1.02809963, + "memory(GiB)": 302.58, + "step": 112040, + "train_speed(iter/s)": 0.123269 + }, + { + "acc": 0.72866359, + "epoch": 0.626692597102819, + "grad_norm": 9.3125, + "learning_rate": 8.220811873576815e-06, + "loss": 1.06917744, + "memory(GiB)": 302.58, + "step": 112060, + "train_speed(iter/s)": 0.12328 + }, + { + "acc": 0.73980041, + "epoch": 0.6268044465757983, + "grad_norm": 6.4375, + "learning_rate": 8.22010452205259e-06, + "loss": 1.02333269, + "memory(GiB)": 302.58, + "step": 112080, + "train_speed(iter/s)": 0.12329 + }, + { + "acc": 0.71824164, + "epoch": 0.6269162960487775, + "grad_norm": 8.625, + "learning_rate": 8.219397060390579e-06, + "loss": 1.09727516, + "memory(GiB)": 302.58, + "step": 112100, + "train_speed(iter/s)": 0.1233 + }, + { + "acc": 0.74779034, + "epoch": 0.6270281455217568, + "grad_norm": 7.4375, + "learning_rate": 8.218689488614982e-06, + "loss": 0.98164721, + "memory(GiB)": 302.58, + "step": 112120, + "train_speed(iter/s)": 0.123311 + }, + { + "acc": 0.73960395, + "epoch": 0.627139994994736, + "grad_norm": 6.40625, + "learning_rate": 8.217981806749998e-06, + "loss": 1.02138414, + "memory(GiB)": 302.58, + "step": 112140, + "train_speed(iter/s)": 0.123321 + }, + { + "acc": 0.73847833, + "epoch": 0.6272518444677153, + "grad_norm": 5.34375, + "learning_rate": 8.21727401481983e-06, + "loss": 1.02340755, + "memory(GiB)": 302.58, + "step": 112160, + "train_speed(iter/s)": 0.123332 + }, + { + "acc": 0.73950109, + "epoch": 0.6273636939406946, + "grad_norm": 8.875, + "learning_rate": 8.21656611284869e-06, + "loss": 1.02706795, + "memory(GiB)": 302.58, + "step": 112180, + "train_speed(iter/s)": 0.123342 + }, + { + "acc": 0.75882192, + "epoch": 0.6274755434136738, + "grad_norm": 6.3125, + "learning_rate": 8.21585810086079e-06, + "loss": 0.93987951, + "memory(GiB)": 302.58, + "step": 112200, + "train_speed(iter/s)": 0.123353 + }, + { + "acc": 0.75133014, + "epoch": 0.6275873928866531, + "grad_norm": 7.96875, + "learning_rate": 8.215149978880345e-06, + "loss": 0.96612473, + "memory(GiB)": 302.58, + "step": 112220, + "train_speed(iter/s)": 0.123364 + }, + { + "acc": 0.73295512, + "epoch": 0.6276992423596324, + "grad_norm": 6.9375, + "learning_rate": 8.214441746931579e-06, + "loss": 1.06176453, + "memory(GiB)": 302.58, + "step": 112240, + "train_speed(iter/s)": 0.123374 + }, + { + "acc": 0.75387444, + "epoch": 0.6278110918326116, + "grad_norm": 6.03125, + "learning_rate": 8.213733405038708e-06, + "loss": 0.95757246, + "memory(GiB)": 302.58, + "step": 112260, + "train_speed(iter/s)": 0.123383 + }, + { + "acc": 0.73239889, + "epoch": 0.6279229413055909, + "grad_norm": 8.375, + "learning_rate": 8.213024953225966e-06, + "loss": 1.06603689, + "memory(GiB)": 302.58, + "step": 112280, + "train_speed(iter/s)": 0.123394 + }, + { + "acc": 0.74035592, + "epoch": 0.6280347907785702, + "grad_norm": 6.90625, + "learning_rate": 8.212316391517583e-06, + "loss": 1.03810768, + "memory(GiB)": 302.58, + "step": 112300, + "train_speed(iter/s)": 0.123404 + }, + { + "acc": 0.74303293, + "epoch": 0.6281466402515494, + "grad_norm": 8.9375, + "learning_rate": 8.211607719937793e-06, + "loss": 1.03706875, + "memory(GiB)": 302.58, + "step": 112320, + "train_speed(iter/s)": 0.123414 + }, + { + "acc": 0.7378911, + "epoch": 0.6282584897245287, + "grad_norm": 7.03125, + "learning_rate": 8.210898938510834e-06, + "loss": 1.03496218, + "memory(GiB)": 302.58, + "step": 112340, + "train_speed(iter/s)": 0.123425 + }, + { + "acc": 0.75551829, + "epoch": 0.628370339197508, + "grad_norm": 8.375, + "learning_rate": 8.210190047260951e-06, + "loss": 0.95247307, + "memory(GiB)": 302.58, + "step": 112360, + "train_speed(iter/s)": 0.123435 + }, + { + "acc": 0.73908176, + "epoch": 0.6284821886704872, + "grad_norm": 7.15625, + "learning_rate": 8.209481046212391e-06, + "loss": 1.01416178, + "memory(GiB)": 302.58, + "step": 112380, + "train_speed(iter/s)": 0.123446 + }, + { + "acc": 0.7438839, + "epoch": 0.6285940381434665, + "grad_norm": 5.875, + "learning_rate": 8.2087719353894e-06, + "loss": 0.99380674, + "memory(GiB)": 302.58, + "step": 112400, + "train_speed(iter/s)": 0.123456 + }, + { + "acc": 0.74266586, + "epoch": 0.6287058876164457, + "grad_norm": 7.09375, + "learning_rate": 8.208062714816233e-06, + "loss": 1.02421074, + "memory(GiB)": 302.58, + "step": 112420, + "train_speed(iter/s)": 0.123466 + }, + { + "acc": 0.74096861, + "epoch": 0.628817737089425, + "grad_norm": 8.8125, + "learning_rate": 8.20735338451715e-06, + "loss": 1.01370182, + "memory(GiB)": 302.58, + "step": 112440, + "train_speed(iter/s)": 0.123477 + }, + { + "acc": 0.74755101, + "epoch": 0.6289295865624043, + "grad_norm": 6.8125, + "learning_rate": 8.206643944516412e-06, + "loss": 0.99574356, + "memory(GiB)": 302.58, + "step": 112460, + "train_speed(iter/s)": 0.123488 + }, + { + "acc": 0.72832837, + "epoch": 0.6290414360353835, + "grad_norm": 8.0625, + "learning_rate": 8.205934394838281e-06, + "loss": 1.07583637, + "memory(GiB)": 302.58, + "step": 112480, + "train_speed(iter/s)": 0.123499 + }, + { + "acc": 0.73089938, + "epoch": 0.6291532855083628, + "grad_norm": 6.59375, + "learning_rate": 8.20522473550703e-06, + "loss": 1.06666698, + "memory(GiB)": 302.58, + "step": 112500, + "train_speed(iter/s)": 0.123508 + }, + { + "acc": 0.74904256, + "epoch": 0.6292651349813421, + "grad_norm": 6.59375, + "learning_rate": 8.204514966546929e-06, + "loss": 0.98712358, + "memory(GiB)": 302.58, + "step": 112520, + "train_speed(iter/s)": 0.123518 + }, + { + "acc": 0.73202291, + "epoch": 0.6293769844543213, + "grad_norm": 5.6875, + "learning_rate": 8.203805087982255e-06, + "loss": 1.04651642, + "memory(GiB)": 302.58, + "step": 112540, + "train_speed(iter/s)": 0.123527 + }, + { + "acc": 0.74105453, + "epoch": 0.6294888339273006, + "grad_norm": 5.15625, + "learning_rate": 8.203095099837287e-06, + "loss": 1.00550146, + "memory(GiB)": 302.58, + "step": 112560, + "train_speed(iter/s)": 0.123538 + }, + { + "acc": 0.74386644, + "epoch": 0.6296006834002799, + "grad_norm": 7.90625, + "learning_rate": 8.20238500213631e-06, + "loss": 1.00390711, + "memory(GiB)": 302.58, + "step": 112580, + "train_speed(iter/s)": 0.123548 + }, + { + "acc": 0.7360342, + "epoch": 0.6297125328732591, + "grad_norm": 6.875, + "learning_rate": 8.201674794903611e-06, + "loss": 1.0450633, + "memory(GiB)": 302.58, + "step": 112600, + "train_speed(iter/s)": 0.123559 + }, + { + "acc": 0.73928666, + "epoch": 0.6298243823462384, + "grad_norm": 8.0625, + "learning_rate": 8.200964478163483e-06, + "loss": 1.02098465, + "memory(GiB)": 302.58, + "step": 112620, + "train_speed(iter/s)": 0.123569 + }, + { + "acc": 0.75856309, + "epoch": 0.6299362318192177, + "grad_norm": 8.75, + "learning_rate": 8.200254051940219e-06, + "loss": 0.94371538, + "memory(GiB)": 302.58, + "step": 112640, + "train_speed(iter/s)": 0.123579 + }, + { + "acc": 0.74248624, + "epoch": 0.6300480812921969, + "grad_norm": 6.75, + "learning_rate": 8.199543516258119e-06, + "loss": 1.01679869, + "memory(GiB)": 302.58, + "step": 112660, + "train_speed(iter/s)": 0.123589 + }, + { + "acc": 0.72646914, + "epoch": 0.6301599307651762, + "grad_norm": 6.59375, + "learning_rate": 8.198832871141486e-06, + "loss": 1.07587328, + "memory(GiB)": 302.58, + "step": 112680, + "train_speed(iter/s)": 0.123599 + }, + { + "acc": 0.75087352, + "epoch": 0.6302717802381554, + "grad_norm": 6.21875, + "learning_rate": 8.198122116614626e-06, + "loss": 0.96922436, + "memory(GiB)": 302.58, + "step": 112700, + "train_speed(iter/s)": 0.12361 + }, + { + "acc": 0.73130822, + "epoch": 0.6303836297111347, + "grad_norm": 9.25, + "learning_rate": 8.197411252701847e-06, + "loss": 1.08104153, + "memory(GiB)": 302.58, + "step": 112720, + "train_speed(iter/s)": 0.12362 + }, + { + "acc": 0.74325552, + "epoch": 0.630495479184114, + "grad_norm": 12.6875, + "learning_rate": 8.196700279427466e-06, + "loss": 1.01357851, + "memory(GiB)": 302.58, + "step": 112740, + "train_speed(iter/s)": 0.12363 + }, + { + "acc": 0.7260561, + "epoch": 0.6306073286570932, + "grad_norm": 6.28125, + "learning_rate": 8.195989196815797e-06, + "loss": 1.08269939, + "memory(GiB)": 302.58, + "step": 112760, + "train_speed(iter/s)": 0.123641 + }, + { + "acc": 0.74541378, + "epoch": 0.6307191781300725, + "grad_norm": 6.375, + "learning_rate": 8.195278004891165e-06, + "loss": 0.98703442, + "memory(GiB)": 302.58, + "step": 112780, + "train_speed(iter/s)": 0.123652 + }, + { + "acc": 0.72583456, + "epoch": 0.6308310276030518, + "grad_norm": 5.59375, + "learning_rate": 8.194566703677893e-06, + "loss": 1.08506413, + "memory(GiB)": 302.58, + "step": 112800, + "train_speed(iter/s)": 0.123662 + }, + { + "acc": 0.74049034, + "epoch": 0.630942877076031, + "grad_norm": 7.28125, + "learning_rate": 8.193855293200309e-06, + "loss": 1.00999613, + "memory(GiB)": 302.58, + "step": 112820, + "train_speed(iter/s)": 0.123673 + }, + { + "acc": 0.75542855, + "epoch": 0.6310547265490103, + "grad_norm": 9.6875, + "learning_rate": 8.193143773482748e-06, + "loss": 0.9551651, + "memory(GiB)": 302.58, + "step": 112840, + "train_speed(iter/s)": 0.123683 + }, + { + "acc": 0.7441401, + "epoch": 0.6311665760219896, + "grad_norm": 10.4375, + "learning_rate": 8.192432144549543e-06, + "loss": 0.99146099, + "memory(GiB)": 302.58, + "step": 112860, + "train_speed(iter/s)": 0.123693 + }, + { + "acc": 0.74874811, + "epoch": 0.6312784254949688, + "grad_norm": 6.0625, + "learning_rate": 8.191720406425037e-06, + "loss": 0.96706591, + "memory(GiB)": 302.58, + "step": 112880, + "train_speed(iter/s)": 0.123703 + }, + { + "acc": 0.74215546, + "epoch": 0.6313902749679481, + "grad_norm": 8.8125, + "learning_rate": 8.191008559133572e-06, + "loss": 1.02289162, + "memory(GiB)": 302.58, + "step": 112900, + "train_speed(iter/s)": 0.123713 + }, + { + "acc": 0.74164162, + "epoch": 0.6315021244409273, + "grad_norm": 7.96875, + "learning_rate": 8.190296602699496e-06, + "loss": 1.03268251, + "memory(GiB)": 302.58, + "step": 112920, + "train_speed(iter/s)": 0.123723 + }, + { + "acc": 0.72953057, + "epoch": 0.6316139739139066, + "grad_norm": 9.1875, + "learning_rate": 8.18958453714716e-06, + "loss": 1.04970942, + "memory(GiB)": 302.58, + "step": 112940, + "train_speed(iter/s)": 0.123734 + }, + { + "acc": 0.74583611, + "epoch": 0.6317258233868859, + "grad_norm": 8.75, + "learning_rate": 8.18887236250092e-06, + "loss": 1.00575542, + "memory(GiB)": 302.58, + "step": 112960, + "train_speed(iter/s)": 0.123744 + }, + { + "acc": 0.73401446, + "epoch": 0.6318376728598651, + "grad_norm": 11.0, + "learning_rate": 8.188160078785131e-06, + "loss": 1.04281111, + "memory(GiB)": 302.58, + "step": 112980, + "train_speed(iter/s)": 0.123755 + }, + { + "acc": 0.75435634, + "epoch": 0.6319495223328444, + "grad_norm": 5.0, + "learning_rate": 8.187447686024158e-06, + "loss": 0.94126205, + "memory(GiB)": 302.58, + "step": 113000, + "train_speed(iter/s)": 0.123766 + }, + { + "acc": 0.73813257, + "epoch": 0.6320613718058237, + "grad_norm": 7.21875, + "learning_rate": 8.18673518424237e-06, + "loss": 1.02385244, + "memory(GiB)": 302.58, + "step": 113020, + "train_speed(iter/s)": 0.123776 + }, + { + "acc": 0.73814988, + "epoch": 0.6321732212788029, + "grad_norm": 5.53125, + "learning_rate": 8.186022573464132e-06, + "loss": 1.02944975, + "memory(GiB)": 302.58, + "step": 113040, + "train_speed(iter/s)": 0.123786 + }, + { + "acc": 0.74059777, + "epoch": 0.6322850707517822, + "grad_norm": 7.0625, + "learning_rate": 8.18530985371382e-06, + "loss": 1.01917372, + "memory(GiB)": 302.58, + "step": 113060, + "train_speed(iter/s)": 0.123796 + }, + { + "acc": 0.7283783, + "epoch": 0.6323969202247615, + "grad_norm": 7.9375, + "learning_rate": 8.18459702501581e-06, + "loss": 1.08588648, + "memory(GiB)": 302.58, + "step": 113080, + "train_speed(iter/s)": 0.123806 + }, + { + "acc": 0.74160247, + "epoch": 0.6325087696977408, + "grad_norm": 6.25, + "learning_rate": 8.183884087394485e-06, + "loss": 1.02666502, + "memory(GiB)": 302.58, + "step": 113100, + "train_speed(iter/s)": 0.123816 + }, + { + "acc": 0.72344356, + "epoch": 0.6326206191707201, + "grad_norm": 7.90625, + "learning_rate": 8.183171040874228e-06, + "loss": 1.08373213, + "memory(GiB)": 302.58, + "step": 113120, + "train_speed(iter/s)": 0.123827 + }, + { + "acc": 0.73850422, + "epoch": 0.6327324686436994, + "grad_norm": 6.625, + "learning_rate": 8.182457885479426e-06, + "loss": 1.02493343, + "memory(GiB)": 302.58, + "step": 113140, + "train_speed(iter/s)": 0.123837 + }, + { + "acc": 0.74309182, + "epoch": 0.6328443181166786, + "grad_norm": 5.8125, + "learning_rate": 8.181744621234476e-06, + "loss": 1.02641668, + "memory(GiB)": 302.58, + "step": 113160, + "train_speed(iter/s)": 0.123847 + }, + { + "acc": 0.72878361, + "epoch": 0.6329561675896579, + "grad_norm": 4.0, + "learning_rate": 8.181031248163767e-06, + "loss": 1.06835632, + "memory(GiB)": 302.58, + "step": 113180, + "train_speed(iter/s)": 0.123857 + }, + { + "acc": 0.7314539, + "epoch": 0.6330680170626372, + "grad_norm": 8.0625, + "learning_rate": 8.180317766291705e-06, + "loss": 1.05649014, + "memory(GiB)": 302.58, + "step": 113200, + "train_speed(iter/s)": 0.123867 + }, + { + "acc": 0.74693713, + "epoch": 0.6331798665356164, + "grad_norm": 8.3125, + "learning_rate": 8.17960417564269e-06, + "loss": 0.98818312, + "memory(GiB)": 302.58, + "step": 113220, + "train_speed(iter/s)": 0.123877 + }, + { + "acc": 0.73479114, + "epoch": 0.6332917160085957, + "grad_norm": 5.09375, + "learning_rate": 8.17889047624113e-06, + "loss": 1.06268272, + "memory(GiB)": 302.58, + "step": 113240, + "train_speed(iter/s)": 0.123887 + }, + { + "acc": 0.72375379, + "epoch": 0.633403565481575, + "grad_norm": 7.5625, + "learning_rate": 8.178176668111437e-06, + "loss": 1.08629913, + "memory(GiB)": 302.58, + "step": 113260, + "train_speed(iter/s)": 0.123897 + }, + { + "acc": 0.72492876, + "epoch": 0.6335154149545542, + "grad_norm": 5.9375, + "learning_rate": 8.177462751278024e-06, + "loss": 1.09038897, + "memory(GiB)": 302.58, + "step": 113280, + "train_speed(iter/s)": 0.123907 + }, + { + "acc": 0.72722316, + "epoch": 0.6336272644275335, + "grad_norm": 6.96875, + "learning_rate": 8.176748725765309e-06, + "loss": 1.06810989, + "memory(GiB)": 302.58, + "step": 113300, + "train_speed(iter/s)": 0.123917 + }, + { + "acc": 0.71292033, + "epoch": 0.6337391139005127, + "grad_norm": 8.625, + "learning_rate": 8.176034591597715e-06, + "loss": 1.1208147, + "memory(GiB)": 302.58, + "step": 113320, + "train_speed(iter/s)": 0.123927 + }, + { + "acc": 0.75643191, + "epoch": 0.633850963373492, + "grad_norm": 6.0625, + "learning_rate": 8.175320348799664e-06, + "loss": 0.92189922, + "memory(GiB)": 302.58, + "step": 113340, + "train_speed(iter/s)": 0.123937 + }, + { + "acc": 0.72354097, + "epoch": 0.6339628128464713, + "grad_norm": 4.96875, + "learning_rate": 8.174605997395592e-06, + "loss": 1.07184038, + "memory(GiB)": 302.58, + "step": 113360, + "train_speed(iter/s)": 0.123948 + }, + { + "acc": 0.72430534, + "epoch": 0.6340746623194505, + "grad_norm": 7.0625, + "learning_rate": 8.173891537409927e-06, + "loss": 1.10012083, + "memory(GiB)": 302.58, + "step": 113380, + "train_speed(iter/s)": 0.123958 + }, + { + "acc": 0.73592787, + "epoch": 0.6341865117924298, + "grad_norm": 5.84375, + "learning_rate": 8.173176968867106e-06, + "loss": 1.03169022, + "memory(GiB)": 302.58, + "step": 113400, + "train_speed(iter/s)": 0.123969 + }, + { + "acc": 0.7352694, + "epoch": 0.6342983612654091, + "grad_norm": 6.125, + "learning_rate": 8.172462291791574e-06, + "loss": 1.06010332, + "memory(GiB)": 302.58, + "step": 113420, + "train_speed(iter/s)": 0.123979 + }, + { + "acc": 0.72609172, + "epoch": 0.6344102107383883, + "grad_norm": 5.3125, + "learning_rate": 8.17174750620777e-06, + "loss": 1.06089592, + "memory(GiB)": 302.58, + "step": 113440, + "train_speed(iter/s)": 0.12399 + }, + { + "acc": 0.73785157, + "epoch": 0.6345220602113676, + "grad_norm": 6.40625, + "learning_rate": 8.171032612140144e-06, + "loss": 1.04958506, + "memory(GiB)": 302.58, + "step": 113460, + "train_speed(iter/s)": 0.124001 + }, + { + "acc": 0.74255195, + "epoch": 0.6346339096843469, + "grad_norm": 4.59375, + "learning_rate": 8.170317609613148e-06, + "loss": 0.99985399, + "memory(GiB)": 302.58, + "step": 113480, + "train_speed(iter/s)": 0.124011 + }, + { + "acc": 0.72780232, + "epoch": 0.6347457591573261, + "grad_norm": 7.28125, + "learning_rate": 8.169602498651236e-06, + "loss": 1.09150105, + "memory(GiB)": 302.58, + "step": 113500, + "train_speed(iter/s)": 0.124021 + }, + { + "acc": 0.72413073, + "epoch": 0.6348576086303054, + "grad_norm": 6.5, + "learning_rate": 8.16888727927887e-06, + "loss": 1.07735929, + "memory(GiB)": 302.58, + "step": 113520, + "train_speed(iter/s)": 0.124032 + }, + { + "acc": 0.73064694, + "epoch": 0.6349694581032846, + "grad_norm": 6.34375, + "learning_rate": 8.168171951520509e-06, + "loss": 1.04854546, + "memory(GiB)": 302.58, + "step": 113540, + "train_speed(iter/s)": 0.124042 + }, + { + "acc": 0.73461628, + "epoch": 0.6350813075762639, + "grad_norm": 8.375, + "learning_rate": 8.167456515400623e-06, + "loss": 1.04530172, + "memory(GiB)": 302.58, + "step": 113560, + "train_speed(iter/s)": 0.124052 + }, + { + "acc": 0.72831674, + "epoch": 0.6351931570492432, + "grad_norm": 10.375, + "learning_rate": 8.166740970943678e-06, + "loss": 1.09221573, + "memory(GiB)": 302.58, + "step": 113580, + "train_speed(iter/s)": 0.124063 + }, + { + "acc": 0.72296948, + "epoch": 0.6353050065222224, + "grad_norm": 7.8125, + "learning_rate": 8.166025318174152e-06, + "loss": 1.10858517, + "memory(GiB)": 302.58, + "step": 113600, + "train_speed(iter/s)": 0.124074 + }, + { + "acc": 0.74663696, + "epoch": 0.6354168559952017, + "grad_norm": 9.3125, + "learning_rate": 8.165309557116521e-06, + "loss": 1.00812235, + "memory(GiB)": 302.58, + "step": 113620, + "train_speed(iter/s)": 0.124084 + }, + { + "acc": 0.7453578, + "epoch": 0.635528705468181, + "grad_norm": 9.0, + "learning_rate": 8.164593687795265e-06, + "loss": 0.99506292, + "memory(GiB)": 302.58, + "step": 113640, + "train_speed(iter/s)": 0.124095 + }, + { + "acc": 0.74762783, + "epoch": 0.6356405549411602, + "grad_norm": 6.6875, + "learning_rate": 8.163877710234872e-06, + "loss": 0.98146315, + "memory(GiB)": 302.58, + "step": 113660, + "train_speed(iter/s)": 0.124105 + }, + { + "acc": 0.73178105, + "epoch": 0.6357524044141395, + "grad_norm": 7.0, + "learning_rate": 8.163161624459828e-06, + "loss": 1.04869394, + "memory(GiB)": 302.58, + "step": 113680, + "train_speed(iter/s)": 0.124116 + }, + { + "acc": 0.74286537, + "epoch": 0.6358642538871188, + "grad_norm": 5.84375, + "learning_rate": 8.162445430494626e-06, + "loss": 1.00048141, + "memory(GiB)": 302.58, + "step": 113700, + "train_speed(iter/s)": 0.124127 + }, + { + "acc": 0.74285688, + "epoch": 0.635976103360098, + "grad_norm": 8.1875, + "learning_rate": 8.161729128363762e-06, + "loss": 1.03284664, + "memory(GiB)": 302.58, + "step": 113720, + "train_speed(iter/s)": 0.124137 + }, + { + "acc": 0.7483376, + "epoch": 0.6360879528330773, + "grad_norm": 5.84375, + "learning_rate": 8.161012718091737e-06, + "loss": 0.99625931, + "memory(GiB)": 302.58, + "step": 113740, + "train_speed(iter/s)": 0.124147 + }, + { + "acc": 0.72726111, + "epoch": 0.6361998023060565, + "grad_norm": 5.5, + "learning_rate": 8.160296199703053e-06, + "loss": 1.0881217, + "memory(GiB)": 302.58, + "step": 113760, + "train_speed(iter/s)": 0.124157 + }, + { + "acc": 0.73053684, + "epoch": 0.6363116517790358, + "grad_norm": 9.25, + "learning_rate": 8.15957957322222e-06, + "loss": 1.07983704, + "memory(GiB)": 302.58, + "step": 113780, + "train_speed(iter/s)": 0.124167 + }, + { + "acc": 0.74321041, + "epoch": 0.6364235012520151, + "grad_norm": 7.5625, + "learning_rate": 8.158862838673745e-06, + "loss": 1.02220669, + "memory(GiB)": 302.58, + "step": 113800, + "train_speed(iter/s)": 0.124177 + }, + { + "acc": 0.75147524, + "epoch": 0.6365353507249943, + "grad_norm": 4.96875, + "learning_rate": 8.158145996082146e-06, + "loss": 0.96321049, + "memory(GiB)": 302.58, + "step": 113820, + "train_speed(iter/s)": 0.124188 + }, + { + "acc": 0.73791676, + "epoch": 0.6366472001979736, + "grad_norm": 7.625, + "learning_rate": 8.157429045471937e-06, + "loss": 1.03037367, + "memory(GiB)": 302.58, + "step": 113840, + "train_speed(iter/s)": 0.124198 + }, + { + "acc": 0.74647999, + "epoch": 0.6367590496709529, + "grad_norm": 8.4375, + "learning_rate": 8.156711986867644e-06, + "loss": 0.99911299, + "memory(GiB)": 302.58, + "step": 113860, + "train_speed(iter/s)": 0.124209 + }, + { + "acc": 0.73337045, + "epoch": 0.6368708991439321, + "grad_norm": 6.6875, + "learning_rate": 8.15599482029379e-06, + "loss": 1.0443759, + "memory(GiB)": 302.58, + "step": 113880, + "train_speed(iter/s)": 0.124219 + }, + { + "acc": 0.73246217, + "epoch": 0.6369827486169114, + "grad_norm": 7.15625, + "learning_rate": 8.155277545774909e-06, + "loss": 1.07577829, + "memory(GiB)": 302.58, + "step": 113900, + "train_speed(iter/s)": 0.12423 + }, + { + "acc": 0.74184003, + "epoch": 0.6370945980898907, + "grad_norm": 6.09375, + "learning_rate": 8.15456016333553e-06, + "loss": 1.03195772, + "memory(GiB)": 302.58, + "step": 113920, + "train_speed(iter/s)": 0.12424 + }, + { + "acc": 0.73093729, + "epoch": 0.6372064475628699, + "grad_norm": 8.5, + "learning_rate": 8.15384267300019e-06, + "loss": 1.06109018, + "memory(GiB)": 302.58, + "step": 113940, + "train_speed(iter/s)": 0.124251 + }, + { + "acc": 0.74734235, + "epoch": 0.6373182970358492, + "grad_norm": 6.15625, + "learning_rate": 8.15312507479343e-06, + "loss": 0.98620567, + "memory(GiB)": 302.58, + "step": 113960, + "train_speed(iter/s)": 0.124262 + }, + { + "acc": 0.75241861, + "epoch": 0.6374301465088285, + "grad_norm": 5.46875, + "learning_rate": 8.152407368739792e-06, + "loss": 0.9571002, + "memory(GiB)": 302.58, + "step": 113980, + "train_speed(iter/s)": 0.124271 + }, + { + "acc": 0.72288418, + "epoch": 0.6375419959818077, + "grad_norm": 9.4375, + "learning_rate": 8.151689554863828e-06, + "loss": 1.09804831, + "memory(GiB)": 302.58, + "step": 114000, + "train_speed(iter/s)": 0.124282 + }, + { + "epoch": 0.6375419959818077, + "eval_acc": 0.7034396043873745, + "eval_loss": 1.0270447731018066, + "eval_runtime": 7539.6214, + "eval_samples_per_second": 9.985, + "eval_steps_per_second": 9.985, + "step": 114000 + }, + { + "acc": 0.72701259, + "epoch": 0.637653845454787, + "grad_norm": 6.0, + "learning_rate": 8.150971633190087e-06, + "loss": 1.10025129, + "memory(GiB)": 302.58, + "step": 114020, + "train_speed(iter/s)": 0.123261 + }, + { + "acc": 0.74635305, + "epoch": 0.6377656949277662, + "grad_norm": 9.875, + "learning_rate": 8.150253603743127e-06, + "loss": 1.00258417, + "memory(GiB)": 302.58, + "step": 114040, + "train_speed(iter/s)": 0.123272 + }, + { + "acc": 0.73804922, + "epoch": 0.6378775444007455, + "grad_norm": 7.625, + "learning_rate": 8.149535466547502e-06, + "loss": 1.03493452, + "memory(GiB)": 302.58, + "step": 114060, + "train_speed(iter/s)": 0.123282 + }, + { + "acc": 0.72794342, + "epoch": 0.6379893938737248, + "grad_norm": 4.84375, + "learning_rate": 8.148817221627779e-06, + "loss": 1.09789314, + "memory(GiB)": 302.58, + "step": 114080, + "train_speed(iter/s)": 0.123291 + }, + { + "acc": 0.7375958, + "epoch": 0.638101243346704, + "grad_norm": 11.1875, + "learning_rate": 8.148098869008521e-06, + "loss": 1.03741474, + "memory(GiB)": 302.58, + "step": 114100, + "train_speed(iter/s)": 0.123302 + }, + { + "acc": 0.7430603, + "epoch": 0.6382130928196833, + "grad_norm": 5.78125, + "learning_rate": 8.1473804087143e-06, + "loss": 0.98850927, + "memory(GiB)": 302.58, + "step": 114120, + "train_speed(iter/s)": 0.123313 + }, + { + "acc": 0.73885155, + "epoch": 0.6383249422926626, + "grad_norm": 6.25, + "learning_rate": 8.146661840769687e-06, + "loss": 1.00364056, + "memory(GiB)": 302.58, + "step": 114140, + "train_speed(iter/s)": 0.123324 + }, + { + "acc": 0.73961482, + "epoch": 0.6384367917656418, + "grad_norm": 5.1875, + "learning_rate": 8.145943165199264e-06, + "loss": 1.04088736, + "memory(GiB)": 302.58, + "step": 114160, + "train_speed(iter/s)": 0.123334 + }, + { + "acc": 0.74595041, + "epoch": 0.6385486412386211, + "grad_norm": 7.46875, + "learning_rate": 8.145224382027607e-06, + "loss": 1.01310167, + "memory(GiB)": 302.58, + "step": 114180, + "train_speed(iter/s)": 0.123344 + }, + { + "acc": 0.72391219, + "epoch": 0.6386604907116004, + "grad_norm": 6.40625, + "learning_rate": 8.144505491279304e-06, + "loss": 1.11833162, + "memory(GiB)": 302.58, + "step": 114200, + "train_speed(iter/s)": 0.123354 + }, + { + "acc": 0.75247288, + "epoch": 0.6387723401845796, + "grad_norm": 5.3125, + "learning_rate": 8.143786492978942e-06, + "loss": 0.96435585, + "memory(GiB)": 302.58, + "step": 114220, + "train_speed(iter/s)": 0.123364 + }, + { + "acc": 0.74392586, + "epoch": 0.6388841896575589, + "grad_norm": 7.5, + "learning_rate": 8.143067387151113e-06, + "loss": 1.01569729, + "memory(GiB)": 302.58, + "step": 114240, + "train_speed(iter/s)": 0.123375 + }, + { + "acc": 0.73891597, + "epoch": 0.6389960391305382, + "grad_norm": 7.34375, + "learning_rate": 8.142348173820415e-06, + "loss": 1.01727543, + "memory(GiB)": 302.58, + "step": 114260, + "train_speed(iter/s)": 0.123385 + }, + { + "acc": 0.73822999, + "epoch": 0.6391078886035174, + "grad_norm": 7.09375, + "learning_rate": 8.141628853011443e-06, + "loss": 1.03324547, + "memory(GiB)": 302.58, + "step": 114280, + "train_speed(iter/s)": 0.123396 + }, + { + "acc": 0.73861985, + "epoch": 0.6392197380764967, + "grad_norm": 7.15625, + "learning_rate": 8.140909424748803e-06, + "loss": 1.02807665, + "memory(GiB)": 302.58, + "step": 114300, + "train_speed(iter/s)": 0.123406 + }, + { + "acc": 0.75073991, + "epoch": 0.6393315875494759, + "grad_norm": 6.78125, + "learning_rate": 8.140189889057102e-06, + "loss": 0.98811245, + "memory(GiB)": 302.58, + "step": 114320, + "train_speed(iter/s)": 0.123416 + }, + { + "acc": 0.73565631, + "epoch": 0.6394434370224552, + "grad_norm": 8.25, + "learning_rate": 8.139470245960948e-06, + "loss": 1.02302456, + "memory(GiB)": 302.58, + "step": 114340, + "train_speed(iter/s)": 0.123427 + }, + { + "acc": 0.71662445, + "epoch": 0.6395552864954345, + "grad_norm": 7.59375, + "learning_rate": 8.138750495484957e-06, + "loss": 1.12711935, + "memory(GiB)": 302.58, + "step": 114360, + "train_speed(iter/s)": 0.123437 + }, + { + "acc": 0.74766269, + "epoch": 0.6396671359684137, + "grad_norm": 7.875, + "learning_rate": 8.138030637653746e-06, + "loss": 0.9876915, + "memory(GiB)": 302.58, + "step": 114380, + "train_speed(iter/s)": 0.123447 + }, + { + "acc": 0.7497076, + "epoch": 0.639778985441393, + "grad_norm": 8.3125, + "learning_rate": 8.137310672491939e-06, + "loss": 0.95293512, + "memory(GiB)": 302.58, + "step": 114400, + "train_speed(iter/s)": 0.123458 + }, + { + "acc": 0.72986317, + "epoch": 0.6398908349143723, + "grad_norm": 9.3125, + "learning_rate": 8.136590600024157e-06, + "loss": 1.0621335, + "memory(GiB)": 302.58, + "step": 114420, + "train_speed(iter/s)": 0.123469 + }, + { + "acc": 0.74903631, + "epoch": 0.6400026843873515, + "grad_norm": 5.875, + "learning_rate": 8.135870420275032e-06, + "loss": 0.98803949, + "memory(GiB)": 302.58, + "step": 114440, + "train_speed(iter/s)": 0.123479 + }, + { + "acc": 0.70774307, + "epoch": 0.6401145338603308, + "grad_norm": 7.25, + "learning_rate": 8.135150133269192e-06, + "loss": 1.16549654, + "memory(GiB)": 302.58, + "step": 114460, + "train_speed(iter/s)": 0.123489 + }, + { + "acc": 0.73648667, + "epoch": 0.6402263833333101, + "grad_norm": 9.4375, + "learning_rate": 8.134429739031278e-06, + "loss": 1.03032856, + "memory(GiB)": 302.58, + "step": 114480, + "train_speed(iter/s)": 0.123499 + }, + { + "acc": 0.75025163, + "epoch": 0.6403382328062893, + "grad_norm": 8.3125, + "learning_rate": 8.133709237585928e-06, + "loss": 0.9656292, + "memory(GiB)": 302.58, + "step": 114500, + "train_speed(iter/s)": 0.123509 + }, + { + "acc": 0.73145452, + "epoch": 0.6404500822792686, + "grad_norm": 7.15625, + "learning_rate": 8.132988628957785e-06, + "loss": 1.0654952, + "memory(GiB)": 302.58, + "step": 114520, + "train_speed(iter/s)": 0.123519 + }, + { + "acc": 0.73815746, + "epoch": 0.6405619317522478, + "grad_norm": 5.25, + "learning_rate": 8.132267913171497e-06, + "loss": 1.03625422, + "memory(GiB)": 302.58, + "step": 114540, + "train_speed(iter/s)": 0.12353 + }, + { + "acc": 0.75548058, + "epoch": 0.6406737812252271, + "grad_norm": 7.3125, + "learning_rate": 8.131547090251715e-06, + "loss": 0.97123384, + "memory(GiB)": 302.58, + "step": 114560, + "train_speed(iter/s)": 0.12354 + }, + { + "acc": 0.7310185, + "epoch": 0.6407856306982064, + "grad_norm": 7.3125, + "learning_rate": 8.130826160223092e-06, + "loss": 1.06908903, + "memory(GiB)": 302.58, + "step": 114580, + "train_speed(iter/s)": 0.12355 + }, + { + "acc": 0.74044037, + "epoch": 0.6408974801711856, + "grad_norm": 7.65625, + "learning_rate": 8.130105123110285e-06, + "loss": 1.03094635, + "memory(GiB)": 302.58, + "step": 114600, + "train_speed(iter/s)": 0.12356 + }, + { + "acc": 0.74094305, + "epoch": 0.6410093296441649, + "grad_norm": 7.8125, + "learning_rate": 8.12938397893796e-06, + "loss": 1.02439003, + "memory(GiB)": 302.58, + "step": 114620, + "train_speed(iter/s)": 0.12357 + }, + { + "acc": 0.75774608, + "epoch": 0.6411211791171442, + "grad_norm": 7.75, + "learning_rate": 8.128662727730779e-06, + "loss": 0.94670029, + "memory(GiB)": 302.58, + "step": 114640, + "train_speed(iter/s)": 0.123581 + }, + { + "acc": 0.72163591, + "epoch": 0.6412330285901234, + "grad_norm": 6.59375, + "learning_rate": 8.127941369513413e-06, + "loss": 1.10747843, + "memory(GiB)": 302.58, + "step": 114660, + "train_speed(iter/s)": 0.123591 + }, + { + "acc": 0.740939, + "epoch": 0.6413448780631027, + "grad_norm": 8.75, + "learning_rate": 8.12721990431053e-06, + "loss": 1.02025394, + "memory(GiB)": 302.58, + "step": 114680, + "train_speed(iter/s)": 0.123601 + }, + { + "acc": 0.74656353, + "epoch": 0.641456727536082, + "grad_norm": 7.0625, + "learning_rate": 8.126498332146815e-06, + "loss": 0.95875978, + "memory(GiB)": 302.58, + "step": 114700, + "train_speed(iter/s)": 0.123612 + }, + { + "acc": 0.75302749, + "epoch": 0.6415685770090612, + "grad_norm": 10.9375, + "learning_rate": 8.12577665304694e-06, + "loss": 0.97485285, + "memory(GiB)": 302.58, + "step": 114720, + "train_speed(iter/s)": 0.123622 + }, + { + "acc": 0.73323431, + "epoch": 0.6416804264820405, + "grad_norm": 5.3125, + "learning_rate": 8.125054867035593e-06, + "loss": 1.05350981, + "memory(GiB)": 302.58, + "step": 114740, + "train_speed(iter/s)": 0.123631 + }, + { + "acc": 0.7364933, + "epoch": 0.6417922759550198, + "grad_norm": 5.5625, + "learning_rate": 8.12433297413746e-06, + "loss": 1.04275408, + "memory(GiB)": 302.58, + "step": 114760, + "train_speed(iter/s)": 0.123641 + }, + { + "acc": 0.74685469, + "epoch": 0.641904125427999, + "grad_norm": 7.03125, + "learning_rate": 8.123610974377231e-06, + "loss": 0.99649868, + "memory(GiB)": 302.58, + "step": 114780, + "train_speed(iter/s)": 0.123652 + }, + { + "acc": 0.74530354, + "epoch": 0.6420159749009783, + "grad_norm": 6.5, + "learning_rate": 8.122888867779605e-06, + "loss": 1.00614538, + "memory(GiB)": 302.58, + "step": 114800, + "train_speed(iter/s)": 0.123662 + }, + { + "acc": 0.74619355, + "epoch": 0.6421278243739575, + "grad_norm": 5.21875, + "learning_rate": 8.122166654369276e-06, + "loss": 0.99874268, + "memory(GiB)": 302.58, + "step": 114820, + "train_speed(iter/s)": 0.123673 + }, + { + "acc": 0.7480082, + "epoch": 0.6422396738469368, + "grad_norm": 7.78125, + "learning_rate": 8.121444334170946e-06, + "loss": 0.97507372, + "memory(GiB)": 302.58, + "step": 114840, + "train_speed(iter/s)": 0.123683 + }, + { + "acc": 0.72322459, + "epoch": 0.6423515233199161, + "grad_norm": 7.8125, + "learning_rate": 8.120721907209323e-06, + "loss": 1.0764514, + "memory(GiB)": 302.58, + "step": 114860, + "train_speed(iter/s)": 0.123693 + }, + { + "acc": 0.71989655, + "epoch": 0.6424633727928953, + "grad_norm": 8.3125, + "learning_rate": 8.119999373509115e-06, + "loss": 1.11333551, + "memory(GiB)": 302.58, + "step": 114880, + "train_speed(iter/s)": 0.123702 + }, + { + "acc": 0.74426756, + "epoch": 0.6425752222658746, + "grad_norm": 5.3125, + "learning_rate": 8.119276733095037e-06, + "loss": 1.01656771, + "memory(GiB)": 302.58, + "step": 114900, + "train_speed(iter/s)": 0.123712 + }, + { + "acc": 0.74136257, + "epoch": 0.6426870717388539, + "grad_norm": 6.125, + "learning_rate": 8.118553985991801e-06, + "loss": 1.01267405, + "memory(GiB)": 302.58, + "step": 114920, + "train_speed(iter/s)": 0.123723 + }, + { + "acc": 0.73746028, + "epoch": 0.6427989212118331, + "grad_norm": 7.5625, + "learning_rate": 8.11783113222413e-06, + "loss": 1.00635777, + "memory(GiB)": 302.58, + "step": 114940, + "train_speed(iter/s)": 0.123733 + }, + { + "acc": 0.73260765, + "epoch": 0.6429107706848124, + "grad_norm": 9.3125, + "learning_rate": 8.11710817181675e-06, + "loss": 1.03597956, + "memory(GiB)": 302.58, + "step": 114960, + "train_speed(iter/s)": 0.123743 + }, + { + "acc": 0.74538302, + "epoch": 0.6430226201577917, + "grad_norm": 8.0, + "learning_rate": 8.116385104794384e-06, + "loss": 0.99211941, + "memory(GiB)": 302.58, + "step": 114980, + "train_speed(iter/s)": 0.123754 + }, + { + "acc": 0.72523646, + "epoch": 0.6431344696307709, + "grad_norm": 8.25, + "learning_rate": 8.115661931181768e-06, + "loss": 1.08691225, + "memory(GiB)": 302.58, + "step": 115000, + "train_speed(iter/s)": 0.123764 + }, + { + "acc": 0.74001689, + "epoch": 0.6432463191037502, + "grad_norm": 8.375, + "learning_rate": 8.114938651003634e-06, + "loss": 1.02014818, + "memory(GiB)": 302.58, + "step": 115020, + "train_speed(iter/s)": 0.123775 + }, + { + "acc": 0.73924627, + "epoch": 0.6433581685767295, + "grad_norm": 6.53125, + "learning_rate": 8.11421526428472e-06, + "loss": 1.02401447, + "memory(GiB)": 302.58, + "step": 115040, + "train_speed(iter/s)": 0.123785 + }, + { + "acc": 0.73556776, + "epoch": 0.6434700180497087, + "grad_norm": 7.15625, + "learning_rate": 8.11349177104977e-06, + "loss": 1.04815664, + "memory(GiB)": 302.58, + "step": 115060, + "train_speed(iter/s)": 0.123795 + }, + { + "acc": 0.73949323, + "epoch": 0.643581867522688, + "grad_norm": 7.5, + "learning_rate": 8.112768171323529e-06, + "loss": 1.03271942, + "memory(GiB)": 302.58, + "step": 115080, + "train_speed(iter/s)": 0.123804 + }, + { + "acc": 0.73035669, + "epoch": 0.6436937169956672, + "grad_norm": 8.0625, + "learning_rate": 8.112044465130743e-06, + "loss": 1.05375624, + "memory(GiB)": 302.58, + "step": 115100, + "train_speed(iter/s)": 0.123814 + }, + { + "acc": 0.74392595, + "epoch": 0.6438055664686465, + "grad_norm": 9.9375, + "learning_rate": 8.111320652496172e-06, + "loss": 1.019697, + "memory(GiB)": 302.58, + "step": 115120, + "train_speed(iter/s)": 0.123824 + }, + { + "acc": 0.74735136, + "epoch": 0.6439174159416258, + "grad_norm": 5.25, + "learning_rate": 8.110596733444568e-06, + "loss": 1.01191654, + "memory(GiB)": 302.58, + "step": 115140, + "train_speed(iter/s)": 0.123833 + }, + { + "acc": 0.74759164, + "epoch": 0.644029265414605, + "grad_norm": 8.625, + "learning_rate": 8.109872708000692e-06, + "loss": 0.99498873, + "memory(GiB)": 302.58, + "step": 115160, + "train_speed(iter/s)": 0.123844 + }, + { + "acc": 0.73022809, + "epoch": 0.6441411148875843, + "grad_norm": 6.15625, + "learning_rate": 8.109148576189307e-06, + "loss": 1.08478689, + "memory(GiB)": 302.58, + "step": 115180, + "train_speed(iter/s)": 0.123854 + }, + { + "acc": 0.74106126, + "epoch": 0.6442529643605636, + "grad_norm": 6.21875, + "learning_rate": 8.108424338035183e-06, + "loss": 0.98267403, + "memory(GiB)": 302.58, + "step": 115200, + "train_speed(iter/s)": 0.123864 + }, + { + "acc": 0.73903375, + "epoch": 0.6443648138335428, + "grad_norm": 8.9375, + "learning_rate": 8.107699993563092e-06, + "loss": 1.03674774, + "memory(GiB)": 302.58, + "step": 115220, + "train_speed(iter/s)": 0.123874 + }, + { + "acc": 0.72208629, + "epoch": 0.6444766633065221, + "grad_norm": 6.125, + "learning_rate": 8.106975542797803e-06, + "loss": 1.09007673, + "memory(GiB)": 302.58, + "step": 115240, + "train_speed(iter/s)": 0.123885 + }, + { + "acc": 0.73042045, + "epoch": 0.6445885127795014, + "grad_norm": 6.28125, + "learning_rate": 8.106250985764102e-06, + "loss": 1.06552544, + "memory(GiB)": 302.58, + "step": 115260, + "train_speed(iter/s)": 0.123895 + }, + { + "acc": 0.75043335, + "epoch": 0.6447003622524806, + "grad_norm": 6.90625, + "learning_rate": 8.105526322486765e-06, + "loss": 0.98633261, + "memory(GiB)": 302.58, + "step": 115280, + "train_speed(iter/s)": 0.123905 + }, + { + "acc": 0.73303833, + "epoch": 0.6448122117254599, + "grad_norm": 5.90625, + "learning_rate": 8.104801552990583e-06, + "loss": 1.06500616, + "memory(GiB)": 302.58, + "step": 115300, + "train_speed(iter/s)": 0.123915 + }, + { + "acc": 0.74147224, + "epoch": 0.6449240611984391, + "grad_norm": 7.5625, + "learning_rate": 8.10407667730034e-06, + "loss": 0.97482119, + "memory(GiB)": 302.58, + "step": 115320, + "train_speed(iter/s)": 0.123925 + }, + { + "acc": 0.73631864, + "epoch": 0.6450359106714184, + "grad_norm": 5.90625, + "learning_rate": 8.103351695440835e-06, + "loss": 1.04057798, + "memory(GiB)": 302.58, + "step": 115340, + "train_speed(iter/s)": 0.123935 + }, + { + "acc": 0.74979563, + "epoch": 0.6451477601443977, + "grad_norm": 6.40625, + "learning_rate": 8.10262660743686e-06, + "loss": 0.971984, + "memory(GiB)": 302.58, + "step": 115360, + "train_speed(iter/s)": 0.123945 + }, + { + "acc": 0.73169551, + "epoch": 0.6452596096173769, + "grad_norm": 6.25, + "learning_rate": 8.101901413313217e-06, + "loss": 1.05651321, + "memory(GiB)": 302.58, + "step": 115380, + "train_speed(iter/s)": 0.123956 + }, + { + "acc": 0.73641462, + "epoch": 0.6453714590903562, + "grad_norm": 7.6875, + "learning_rate": 8.10117611309471e-06, + "loss": 1.03747864, + "memory(GiB)": 302.58, + "step": 115400, + "train_speed(iter/s)": 0.123965 + }, + { + "acc": 0.73100243, + "epoch": 0.6454833085633355, + "grad_norm": 7.4375, + "learning_rate": 8.100450706806146e-06, + "loss": 1.07159748, + "memory(GiB)": 302.58, + "step": 115420, + "train_speed(iter/s)": 0.123975 + }, + { + "acc": 0.73752947, + "epoch": 0.6455951580363147, + "grad_norm": 5.8125, + "learning_rate": 8.099725194472337e-06, + "loss": 1.03987207, + "memory(GiB)": 302.58, + "step": 115440, + "train_speed(iter/s)": 0.123984 + }, + { + "acc": 0.74520588, + "epoch": 0.645707007509294, + "grad_norm": 9.1875, + "learning_rate": 8.098999576118096e-06, + "loss": 1.00167875, + "memory(GiB)": 302.58, + "step": 115460, + "train_speed(iter/s)": 0.123994 + }, + { + "acc": 0.72617741, + "epoch": 0.6458188569822733, + "grad_norm": 6.5, + "learning_rate": 8.098273851768244e-06, + "loss": 1.09225187, + "memory(GiB)": 302.58, + "step": 115480, + "train_speed(iter/s)": 0.124005 + }, + { + "acc": 0.72259793, + "epoch": 0.6459307064552525, + "grad_norm": 8.8125, + "learning_rate": 8.0975480214476e-06, + "loss": 1.1231514, + "memory(GiB)": 302.58, + "step": 115500, + "train_speed(iter/s)": 0.124015 + }, + { + "acc": 0.72289681, + "epoch": 0.6460425559282318, + "grad_norm": 7.5, + "learning_rate": 8.096822085180995e-06, + "loss": 1.10037498, + "memory(GiB)": 302.58, + "step": 115520, + "train_speed(iter/s)": 0.124026 + }, + { + "acc": 0.74169555, + "epoch": 0.646154405401211, + "grad_norm": 9.6875, + "learning_rate": 8.096096042993254e-06, + "loss": 0.99797354, + "memory(GiB)": 302.58, + "step": 115540, + "train_speed(iter/s)": 0.124036 + }, + { + "acc": 0.74643407, + "epoch": 0.6462662548741903, + "grad_norm": 6.75, + "learning_rate": 8.095369894909208e-06, + "loss": 0.9935791, + "memory(GiB)": 302.58, + "step": 115560, + "train_speed(iter/s)": 0.124046 + }, + { + "acc": 0.74184737, + "epoch": 0.6463781043471696, + "grad_norm": 8.3125, + "learning_rate": 8.0946436409537e-06, + "loss": 1.02058353, + "memory(GiB)": 302.58, + "step": 115580, + "train_speed(iter/s)": 0.124056 + }, + { + "acc": 0.72987461, + "epoch": 0.6464899538201488, + "grad_norm": 9.1875, + "learning_rate": 8.093917281151566e-06, + "loss": 1.0744112, + "memory(GiB)": 302.58, + "step": 115600, + "train_speed(iter/s)": 0.124067 + }, + { + "acc": 0.73841376, + "epoch": 0.6466018032931281, + "grad_norm": 7.65625, + "learning_rate": 8.093190815527648e-06, + "loss": 1.02583466, + "memory(GiB)": 302.58, + "step": 115620, + "train_speed(iter/s)": 0.124077 + }, + { + "acc": 0.75776691, + "epoch": 0.6467136527661074, + "grad_norm": 6.8125, + "learning_rate": 8.092464244106798e-06, + "loss": 0.93970156, + "memory(GiB)": 302.58, + "step": 115640, + "train_speed(iter/s)": 0.124088 + }, + { + "acc": 0.73708525, + "epoch": 0.6468255022390866, + "grad_norm": 7.875, + "learning_rate": 8.091737566913865e-06, + "loss": 1.04533339, + "memory(GiB)": 302.58, + "step": 115660, + "train_speed(iter/s)": 0.124097 + }, + { + "acc": 0.73841743, + "epoch": 0.6469373517120659, + "grad_norm": 6.53125, + "learning_rate": 8.091010783973702e-06, + "loss": 1.04155703, + "memory(GiB)": 302.58, + "step": 115680, + "train_speed(iter/s)": 0.124108 + }, + { + "acc": 0.75114527, + "epoch": 0.6470492011850452, + "grad_norm": 11.5, + "learning_rate": 8.09028389531117e-06, + "loss": 0.97422886, + "memory(GiB)": 302.58, + "step": 115700, + "train_speed(iter/s)": 0.124119 + }, + { + "acc": 0.75538945, + "epoch": 0.6471610506580244, + "grad_norm": 6.09375, + "learning_rate": 8.08955690095113e-06, + "loss": 0.93481808, + "memory(GiB)": 302.58, + "step": 115720, + "train_speed(iter/s)": 0.124129 + }, + { + "acc": 0.72247672, + "epoch": 0.6472729001310037, + "grad_norm": 5.625, + "learning_rate": 8.088829800918446e-06, + "loss": 1.12542992, + "memory(GiB)": 302.58, + "step": 115740, + "train_speed(iter/s)": 0.12414 + }, + { + "acc": 0.75836625, + "epoch": 0.647384749603983, + "grad_norm": 9.0, + "learning_rate": 8.08810259523799e-06, + "loss": 0.92839136, + "memory(GiB)": 302.58, + "step": 115760, + "train_speed(iter/s)": 0.12415 + }, + { + "acc": 0.71983905, + "epoch": 0.6474965990769622, + "grad_norm": 7.71875, + "learning_rate": 8.087375283934632e-06, + "loss": 1.11462765, + "memory(GiB)": 302.58, + "step": 115780, + "train_speed(iter/s)": 0.12416 + }, + { + "acc": 0.72854185, + "epoch": 0.6476084485499415, + "grad_norm": 5.84375, + "learning_rate": 8.08664786703325e-06, + "loss": 1.08854227, + "memory(GiB)": 302.58, + "step": 115800, + "train_speed(iter/s)": 0.12417 + }, + { + "acc": 0.73571239, + "epoch": 0.6477202980229207, + "grad_norm": 6.15625, + "learning_rate": 8.085920344558723e-06, + "loss": 1.02941837, + "memory(GiB)": 302.58, + "step": 115820, + "train_speed(iter/s)": 0.12418 + }, + { + "acc": 0.74062662, + "epoch": 0.6478321474959, + "grad_norm": 6.78125, + "learning_rate": 8.085192716535936e-06, + "loss": 1.01768026, + "memory(GiB)": 302.58, + "step": 115840, + "train_speed(iter/s)": 0.12419 + }, + { + "acc": 0.76245518, + "epoch": 0.6479439969688793, + "grad_norm": 7.9375, + "learning_rate": 8.084464982989775e-06, + "loss": 0.93124495, + "memory(GiB)": 302.58, + "step": 115860, + "train_speed(iter/s)": 0.1242 + }, + { + "acc": 0.73082833, + "epoch": 0.6480558464418585, + "grad_norm": 9.1875, + "learning_rate": 8.08373714394513e-06, + "loss": 1.05756664, + "memory(GiB)": 302.58, + "step": 115880, + "train_speed(iter/s)": 0.12421 + }, + { + "acc": 0.75121746, + "epoch": 0.6481676959148378, + "grad_norm": 7.5625, + "learning_rate": 8.083009199426897e-06, + "loss": 0.96657486, + "memory(GiB)": 302.58, + "step": 115900, + "train_speed(iter/s)": 0.124219 + }, + { + "acc": 0.7464828, + "epoch": 0.6482795453878171, + "grad_norm": 8.0, + "learning_rate": 8.082281149459973e-06, + "loss": 0.982061, + "memory(GiB)": 302.58, + "step": 115920, + "train_speed(iter/s)": 0.124229 + }, + { + "acc": 0.71386437, + "epoch": 0.6483913948607963, + "grad_norm": 4.78125, + "learning_rate": 8.08155299406926e-06, + "loss": 1.12790451, + "memory(GiB)": 302.58, + "step": 115940, + "train_speed(iter/s)": 0.12424 + }, + { + "acc": 0.73745589, + "epoch": 0.6485032443337756, + "grad_norm": 10.4375, + "learning_rate": 8.080824733279664e-06, + "loss": 1.01936359, + "memory(GiB)": 302.58, + "step": 115960, + "train_speed(iter/s)": 0.12425 + }, + { + "acc": 0.74038372, + "epoch": 0.6486150938067549, + "grad_norm": 8.25, + "learning_rate": 8.080096367116093e-06, + "loss": 1.01759415, + "memory(GiB)": 302.58, + "step": 115980, + "train_speed(iter/s)": 0.12426 + }, + { + "acc": 0.74685631, + "epoch": 0.6487269432797341, + "grad_norm": 7.375, + "learning_rate": 8.079367895603459e-06, + "loss": 0.98840065, + "memory(GiB)": 302.58, + "step": 116000, + "train_speed(iter/s)": 0.12427 + }, + { + "epoch": 0.6487269432797341, + "eval_acc": 0.7035436206064116, + "eval_loss": 1.0265074968338013, + "eval_runtime": 7501.0013, + "eval_samples_per_second": 10.036, + "eval_steps_per_second": 10.036, + "step": 116000 + }, + { + "acc": 0.73663368, + "epoch": 0.6488387927527134, + "grad_norm": 9.75, + "learning_rate": 8.07863931876668e-06, + "loss": 1.01658754, + "memory(GiB)": 302.58, + "step": 116020, + "train_speed(iter/s)": 0.123272 + }, + { + "acc": 0.74677606, + "epoch": 0.6489506422256927, + "grad_norm": 6.15625, + "learning_rate": 8.077910636630673e-06, + "loss": 1.00048361, + "memory(GiB)": 302.58, + "step": 116040, + "train_speed(iter/s)": 0.123282 + }, + { + "acc": 0.73773317, + "epoch": 0.6490624916986719, + "grad_norm": 4.90625, + "learning_rate": 8.077181849220362e-06, + "loss": 1.02238464, + "memory(GiB)": 302.58, + "step": 116060, + "train_speed(iter/s)": 0.123292 + }, + { + "acc": 0.74705639, + "epoch": 0.6491743411716512, + "grad_norm": 6.84375, + "learning_rate": 8.076452956560675e-06, + "loss": 0.98471575, + "memory(GiB)": 302.58, + "step": 116080, + "train_speed(iter/s)": 0.123302 + }, + { + "acc": 0.73730664, + "epoch": 0.6492861906446304, + "grad_norm": 5.65625, + "learning_rate": 8.075723958676542e-06, + "loss": 1.04279394, + "memory(GiB)": 302.58, + "step": 116100, + "train_speed(iter/s)": 0.123313 + }, + { + "acc": 0.75010004, + "epoch": 0.6493980401176097, + "grad_norm": 9.3125, + "learning_rate": 8.074994855592896e-06, + "loss": 0.96938562, + "memory(GiB)": 302.58, + "step": 116120, + "train_speed(iter/s)": 0.123323 + }, + { + "acc": 0.75571094, + "epoch": 0.649509889590589, + "grad_norm": 9.0625, + "learning_rate": 8.074265647334677e-06, + "loss": 0.96767349, + "memory(GiB)": 302.58, + "step": 116140, + "train_speed(iter/s)": 0.123332 + }, + { + "acc": 0.7361784, + "epoch": 0.6496217390635682, + "grad_norm": 8.625, + "learning_rate": 8.073536333926822e-06, + "loss": 1.0218708, + "memory(GiB)": 302.58, + "step": 116160, + "train_speed(iter/s)": 0.123342 + }, + { + "acc": 0.73173876, + "epoch": 0.6497335885365475, + "grad_norm": 7.28125, + "learning_rate": 8.07280691539428e-06, + "loss": 1.06292048, + "memory(GiB)": 302.58, + "step": 116180, + "train_speed(iter/s)": 0.123352 + }, + { + "acc": 0.7366847, + "epoch": 0.6498454380095268, + "grad_norm": 8.5, + "learning_rate": 8.072077391761997e-06, + "loss": 1.05641356, + "memory(GiB)": 302.58, + "step": 116200, + "train_speed(iter/s)": 0.123361 + }, + { + "acc": 0.72129178, + "epoch": 0.649957287482506, + "grad_norm": 4.75, + "learning_rate": 8.071347763054927e-06, + "loss": 1.09369411, + "memory(GiB)": 302.58, + "step": 116220, + "train_speed(iter/s)": 0.123371 + }, + { + "acc": 0.74330459, + "epoch": 0.6500691369554853, + "grad_norm": 6.21875, + "learning_rate": 8.070618029298024e-06, + "loss": 1.00961885, + "memory(GiB)": 302.58, + "step": 116240, + "train_speed(iter/s)": 0.123382 + }, + { + "acc": 0.73855147, + "epoch": 0.6501809864284646, + "grad_norm": 10.6875, + "learning_rate": 8.069888190516249e-06, + "loss": 1.02873917, + "memory(GiB)": 302.58, + "step": 116260, + "train_speed(iter/s)": 0.123392 + }, + { + "acc": 0.72561378, + "epoch": 0.6502928359014438, + "grad_norm": 7.34375, + "learning_rate": 8.06915824673456e-06, + "loss": 1.07997408, + "memory(GiB)": 302.58, + "step": 116280, + "train_speed(iter/s)": 0.123402 + }, + { + "acc": 0.73181686, + "epoch": 0.6504046853744231, + "grad_norm": 8.75, + "learning_rate": 8.068428197977931e-06, + "loss": 1.04876003, + "memory(GiB)": 302.58, + "step": 116300, + "train_speed(iter/s)": 0.123412 + }, + { + "acc": 0.73699827, + "epoch": 0.6505165348474024, + "grad_norm": 8.5625, + "learning_rate": 8.067698044271326e-06, + "loss": 1.03119745, + "memory(GiB)": 302.58, + "step": 116320, + "train_speed(iter/s)": 0.123422 + }, + { + "acc": 0.74125853, + "epoch": 0.6506283843203816, + "grad_norm": 5.3125, + "learning_rate": 8.06696778563972e-06, + "loss": 1.02057667, + "memory(GiB)": 302.58, + "step": 116340, + "train_speed(iter/s)": 0.123432 + }, + { + "acc": 0.73385587, + "epoch": 0.6507402337933609, + "grad_norm": 6.875, + "learning_rate": 8.066237422108092e-06, + "loss": 1.06481047, + "memory(GiB)": 302.58, + "step": 116360, + "train_speed(iter/s)": 0.123443 + }, + { + "acc": 0.73206077, + "epoch": 0.6508520832663401, + "grad_norm": 6.875, + "learning_rate": 8.06550695370142e-06, + "loss": 1.05487709, + "memory(GiB)": 302.58, + "step": 116380, + "train_speed(iter/s)": 0.123454 + }, + { + "acc": 0.75762038, + "epoch": 0.6509639327393194, + "grad_norm": 8.375, + "learning_rate": 8.06477638044469e-06, + "loss": 0.94712257, + "memory(GiB)": 302.58, + "step": 116400, + "train_speed(iter/s)": 0.123465 + }, + { + "acc": 0.73932509, + "epoch": 0.6510757822122987, + "grad_norm": 8.5625, + "learning_rate": 8.06404570236289e-06, + "loss": 1.00442667, + "memory(GiB)": 302.58, + "step": 116420, + "train_speed(iter/s)": 0.123475 + }, + { + "acc": 0.74594841, + "epoch": 0.6511876316852779, + "grad_norm": 8.25, + "learning_rate": 8.063314919481011e-06, + "loss": 1.00375719, + "memory(GiB)": 302.58, + "step": 116440, + "train_speed(iter/s)": 0.123484 + }, + { + "acc": 0.7535522, + "epoch": 0.6512994811582572, + "grad_norm": 6.96875, + "learning_rate": 8.062584031824049e-06, + "loss": 0.96646175, + "memory(GiB)": 302.58, + "step": 116460, + "train_speed(iter/s)": 0.123494 + }, + { + "acc": 0.72914672, + "epoch": 0.6514113306312365, + "grad_norm": 6.875, + "learning_rate": 8.061853039417001e-06, + "loss": 1.06761551, + "memory(GiB)": 302.58, + "step": 116480, + "train_speed(iter/s)": 0.123503 + }, + { + "acc": 0.75920858, + "epoch": 0.6515231801042157, + "grad_norm": 6.5625, + "learning_rate": 8.06112194228487e-06, + "loss": 0.97001781, + "memory(GiB)": 302.58, + "step": 116500, + "train_speed(iter/s)": 0.123513 + }, + { + "acc": 0.72172828, + "epoch": 0.651635029577195, + "grad_norm": 6.8125, + "learning_rate": 8.060390740452664e-06, + "loss": 1.1085186, + "memory(GiB)": 302.58, + "step": 116520, + "train_speed(iter/s)": 0.123522 + }, + { + "acc": 0.72610278, + "epoch": 0.6517468790501743, + "grad_norm": 4.9375, + "learning_rate": 8.059659433945389e-06, + "loss": 1.09863329, + "memory(GiB)": 302.58, + "step": 116540, + "train_speed(iter/s)": 0.123532 + }, + { + "acc": 0.75148978, + "epoch": 0.6518587285231535, + "grad_norm": 8.0, + "learning_rate": 8.05892802278806e-06, + "loss": 0.97084694, + "memory(GiB)": 302.58, + "step": 116560, + "train_speed(iter/s)": 0.123541 + }, + { + "acc": 0.73072877, + "epoch": 0.6519705779961328, + "grad_norm": 6.53125, + "learning_rate": 8.058196507005692e-06, + "loss": 1.07557306, + "memory(GiB)": 302.58, + "step": 116580, + "train_speed(iter/s)": 0.123551 + }, + { + "acc": 0.74859433, + "epoch": 0.652082427469112, + "grad_norm": 7.28125, + "learning_rate": 8.057464886623306e-06, + "loss": 0.98813477, + "memory(GiB)": 302.58, + "step": 116600, + "train_speed(iter/s)": 0.123562 + }, + { + "acc": 0.73860278, + "epoch": 0.6521942769420913, + "grad_norm": 7.0, + "learning_rate": 8.056733161665927e-06, + "loss": 1.03683996, + "memory(GiB)": 302.58, + "step": 116620, + "train_speed(iter/s)": 0.123572 + }, + { + "acc": 0.74016666, + "epoch": 0.6523061264150706, + "grad_norm": 5.78125, + "learning_rate": 8.056001332158581e-06, + "loss": 1.04404631, + "memory(GiB)": 302.58, + "step": 116640, + "train_speed(iter/s)": 0.123582 + }, + { + "acc": 0.74504471, + "epoch": 0.6524179758880498, + "grad_norm": 9.9375, + "learning_rate": 8.0552693981263e-06, + "loss": 1.00421991, + "memory(GiB)": 302.58, + "step": 116660, + "train_speed(iter/s)": 0.123593 + }, + { + "acc": 0.75431476, + "epoch": 0.6525298253610291, + "grad_norm": 9.6875, + "learning_rate": 8.054537359594114e-06, + "loss": 0.96137428, + "memory(GiB)": 302.58, + "step": 116680, + "train_speed(iter/s)": 0.123603 + }, + { + "acc": 0.730971, + "epoch": 0.6526416748340084, + "grad_norm": 5.4375, + "learning_rate": 8.053805216587068e-06, + "loss": 1.05954437, + "memory(GiB)": 302.58, + "step": 116700, + "train_speed(iter/s)": 0.123612 + }, + { + "acc": 0.73700809, + "epoch": 0.6527535243069876, + "grad_norm": 9.75, + "learning_rate": 8.0530729691302e-06, + "loss": 1.02108974, + "memory(GiB)": 302.58, + "step": 116720, + "train_speed(iter/s)": 0.123622 + }, + { + "acc": 0.74895535, + "epoch": 0.6528653737799669, + "grad_norm": 8.5625, + "learning_rate": 8.052340617248555e-06, + "loss": 0.99355392, + "memory(GiB)": 302.58, + "step": 116740, + "train_speed(iter/s)": 0.123633 + }, + { + "acc": 0.73521075, + "epoch": 0.6529772232529462, + "grad_norm": 5.5625, + "learning_rate": 8.051608160967181e-06, + "loss": 1.06876345, + "memory(GiB)": 302.58, + "step": 116760, + "train_speed(iter/s)": 0.123642 + }, + { + "acc": 0.73917871, + "epoch": 0.6530890727259254, + "grad_norm": 7.9375, + "learning_rate": 8.050875600311133e-06, + "loss": 1.03721142, + "memory(GiB)": 302.58, + "step": 116780, + "train_speed(iter/s)": 0.123652 + }, + { + "acc": 0.73083305, + "epoch": 0.6532009221989047, + "grad_norm": 4.0625, + "learning_rate": 8.050142935305465e-06, + "loss": 1.05286551, + "memory(GiB)": 302.58, + "step": 116800, + "train_speed(iter/s)": 0.123662 + }, + { + "acc": 0.72513571, + "epoch": 0.653312771671884, + "grad_norm": 4.5625, + "learning_rate": 8.049410165975237e-06, + "loss": 1.11467314, + "memory(GiB)": 302.58, + "step": 116820, + "train_speed(iter/s)": 0.123672 + }, + { + "acc": 0.74984889, + "epoch": 0.6534246211448632, + "grad_norm": 8.375, + "learning_rate": 8.048677292345512e-06, + "loss": 0.96453161, + "memory(GiB)": 302.58, + "step": 116840, + "train_speed(iter/s)": 0.123682 + }, + { + "acc": 0.71730657, + "epoch": 0.6535364706178425, + "grad_norm": 8.1875, + "learning_rate": 8.047944314441356e-06, + "loss": 1.12284079, + "memory(GiB)": 302.58, + "step": 116860, + "train_speed(iter/s)": 0.12369 + }, + { + "acc": 0.72780733, + "epoch": 0.6536483200908217, + "grad_norm": 8.0625, + "learning_rate": 8.047211232287839e-06, + "loss": 1.06469612, + "memory(GiB)": 302.58, + "step": 116880, + "train_speed(iter/s)": 0.1237 + }, + { + "acc": 0.73960028, + "epoch": 0.653760169563801, + "grad_norm": 6.03125, + "learning_rate": 8.046478045910038e-06, + "loss": 1.02282019, + "memory(GiB)": 302.58, + "step": 116900, + "train_speed(iter/s)": 0.12371 + }, + { + "acc": 0.73436055, + "epoch": 0.6538720190367803, + "grad_norm": 10.5625, + "learning_rate": 8.045744755333025e-06, + "loss": 1.04693041, + "memory(GiB)": 302.58, + "step": 116920, + "train_speed(iter/s)": 0.12372 + }, + { + "acc": 0.76304355, + "epoch": 0.6539838685097595, + "grad_norm": 9.0625, + "learning_rate": 8.045011360581885e-06, + "loss": 0.93447485, + "memory(GiB)": 302.58, + "step": 116940, + "train_speed(iter/s)": 0.12373 + }, + { + "acc": 0.74097166, + "epoch": 0.6540957179827388, + "grad_norm": 9.3125, + "learning_rate": 8.0442778616817e-06, + "loss": 1.02627897, + "memory(GiB)": 302.58, + "step": 116960, + "train_speed(iter/s)": 0.123741 + }, + { + "acc": 0.74507284, + "epoch": 0.6542075674557181, + "grad_norm": 5.4375, + "learning_rate": 8.043544258657559e-06, + "loss": 1.00452843, + "memory(GiB)": 302.58, + "step": 116980, + "train_speed(iter/s)": 0.123751 + }, + { + "acc": 0.7463748, + "epoch": 0.6543194169286973, + "grad_norm": 5.5, + "learning_rate": 8.042810551534554e-06, + "loss": 1.01884565, + "memory(GiB)": 302.58, + "step": 117000, + "train_speed(iter/s)": 0.123761 + }, + { + "acc": 0.75851641, + "epoch": 0.6544312664016766, + "grad_norm": 9.0, + "learning_rate": 8.04207674033778e-06, + "loss": 0.94665937, + "memory(GiB)": 302.58, + "step": 117020, + "train_speed(iter/s)": 0.123771 + }, + { + "acc": 0.73348904, + "epoch": 0.6545431158746559, + "grad_norm": 6.3125, + "learning_rate": 8.041342825092336e-06, + "loss": 1.0294075, + "memory(GiB)": 302.58, + "step": 117040, + "train_speed(iter/s)": 0.123781 + }, + { + "acc": 0.74645963, + "epoch": 0.6546549653476351, + "grad_norm": 9.5, + "learning_rate": 8.040608805823323e-06, + "loss": 1.00785398, + "memory(GiB)": 302.58, + "step": 117060, + "train_speed(iter/s)": 0.123791 + }, + { + "acc": 0.73343124, + "epoch": 0.6547668148206144, + "grad_norm": 8.125, + "learning_rate": 8.039874682555845e-06, + "loss": 1.04470806, + "memory(GiB)": 302.58, + "step": 117080, + "train_speed(iter/s)": 0.123802 + }, + { + "acc": 0.72675629, + "epoch": 0.6548786642935936, + "grad_norm": 9.1875, + "learning_rate": 8.039140455315016e-06, + "loss": 1.09637985, + "memory(GiB)": 302.58, + "step": 117100, + "train_speed(iter/s)": 0.123811 + }, + { + "acc": 0.73788152, + "epoch": 0.6549905137665729, + "grad_norm": 6.21875, + "learning_rate": 8.038406124125945e-06, + "loss": 1.05030022, + "memory(GiB)": 302.58, + "step": 117120, + "train_speed(iter/s)": 0.123822 + }, + { + "acc": 0.74190736, + "epoch": 0.6551023632395522, + "grad_norm": 7.71875, + "learning_rate": 8.037671689013754e-06, + "loss": 1.02675676, + "memory(GiB)": 302.58, + "step": 117140, + "train_speed(iter/s)": 0.123832 + }, + { + "acc": 0.74696674, + "epoch": 0.6552142127125314, + "grad_norm": 6.125, + "learning_rate": 8.036937150003554e-06, + "loss": 0.98845568, + "memory(GiB)": 302.58, + "step": 117160, + "train_speed(iter/s)": 0.123842 + }, + { + "acc": 0.73017554, + "epoch": 0.6553260621855107, + "grad_norm": 7.1875, + "learning_rate": 8.036202507120477e-06, + "loss": 1.07254858, + "memory(GiB)": 302.58, + "step": 117180, + "train_speed(iter/s)": 0.123851 + }, + { + "acc": 0.74082251, + "epoch": 0.65543791165849, + "grad_norm": 8.0625, + "learning_rate": 8.035467760389647e-06, + "loss": 1.02205524, + "memory(GiB)": 302.58, + "step": 117200, + "train_speed(iter/s)": 0.123861 + }, + { + "acc": 0.73492899, + "epoch": 0.6555497611314692, + "grad_norm": 7.34375, + "learning_rate": 8.034732909836192e-06, + "loss": 1.04889612, + "memory(GiB)": 302.58, + "step": 117220, + "train_speed(iter/s)": 0.123871 + }, + { + "acc": 0.74756169, + "epoch": 0.6556616106044485, + "grad_norm": 7.1875, + "learning_rate": 8.033997955485248e-06, + "loss": 0.98850336, + "memory(GiB)": 302.58, + "step": 117240, + "train_speed(iter/s)": 0.12388 + }, + { + "acc": 0.74008136, + "epoch": 0.6557734600774278, + "grad_norm": 5.0, + "learning_rate": 8.033262897361955e-06, + "loss": 1.01991711, + "memory(GiB)": 302.58, + "step": 117260, + "train_speed(iter/s)": 0.123891 + }, + { + "acc": 0.7343008, + "epoch": 0.655885309550407, + "grad_norm": 6.28125, + "learning_rate": 8.032527735491453e-06, + "loss": 1.02264881, + "memory(GiB)": 302.58, + "step": 117280, + "train_speed(iter/s)": 0.1239 + }, + { + "acc": 0.74449148, + "epoch": 0.6559971590233863, + "grad_norm": 6.1875, + "learning_rate": 8.031792469898886e-06, + "loss": 1.00849857, + "memory(GiB)": 302.58, + "step": 117300, + "train_speed(iter/s)": 0.12391 + }, + { + "acc": 0.74442677, + "epoch": 0.6561090084963656, + "grad_norm": 5.78125, + "learning_rate": 8.031057100609404e-06, + "loss": 1.01870441, + "memory(GiB)": 302.58, + "step": 117320, + "train_speed(iter/s)": 0.12392 + }, + { + "acc": 0.73813376, + "epoch": 0.6562208579693448, + "grad_norm": 6.28125, + "learning_rate": 8.030321627648157e-06, + "loss": 1.03316841, + "memory(GiB)": 302.58, + "step": 117340, + "train_speed(iter/s)": 0.123929 + }, + { + "acc": 0.74489384, + "epoch": 0.6563327074423241, + "grad_norm": 4.5625, + "learning_rate": 8.029586051040301e-06, + "loss": 1.0072752, + "memory(GiB)": 302.58, + "step": 117360, + "train_speed(iter/s)": 0.123939 + }, + { + "acc": 0.73498449, + "epoch": 0.6564445569153033, + "grad_norm": 6.8125, + "learning_rate": 8.028850370810997e-06, + "loss": 1.06759291, + "memory(GiB)": 302.58, + "step": 117380, + "train_speed(iter/s)": 0.123948 + }, + { + "acc": 0.72227311, + "epoch": 0.6565564063882826, + "grad_norm": 5.375, + "learning_rate": 8.028114586985405e-06, + "loss": 1.11360064, + "memory(GiB)": 302.58, + "step": 117400, + "train_speed(iter/s)": 0.123958 + }, + { + "acc": 0.75409317, + "epoch": 0.6566682558612619, + "grad_norm": 7.28125, + "learning_rate": 8.02737869958869e-06, + "loss": 0.95329676, + "memory(GiB)": 302.58, + "step": 117420, + "train_speed(iter/s)": 0.123968 + }, + { + "acc": 0.73655686, + "epoch": 0.6567801053342411, + "grad_norm": 6.84375, + "learning_rate": 8.02664270864603e-06, + "loss": 1.03404264, + "memory(GiB)": 302.58, + "step": 117440, + "train_speed(iter/s)": 0.123978 + }, + { + "acc": 0.75331173, + "epoch": 0.6568919548072204, + "grad_norm": 6.125, + "learning_rate": 8.025906614182587e-06, + "loss": 0.98056536, + "memory(GiB)": 302.58, + "step": 117460, + "train_speed(iter/s)": 0.123988 + }, + { + "acc": 0.72943616, + "epoch": 0.6570038042801997, + "grad_norm": 6.375, + "learning_rate": 8.025170416223544e-06, + "loss": 1.08943748, + "memory(GiB)": 302.58, + "step": 117480, + "train_speed(iter/s)": 0.123997 + }, + { + "acc": 0.72431798, + "epoch": 0.6571156537531789, + "grad_norm": 4.78125, + "learning_rate": 8.024434114794081e-06, + "loss": 1.08480215, + "memory(GiB)": 302.58, + "step": 117500, + "train_speed(iter/s)": 0.124007 + }, + { + "acc": 0.73869109, + "epoch": 0.6572275032261582, + "grad_norm": 8.0, + "learning_rate": 8.02369770991938e-06, + "loss": 1.04545345, + "memory(GiB)": 302.58, + "step": 117520, + "train_speed(iter/s)": 0.124017 + }, + { + "acc": 0.74516888, + "epoch": 0.6573393526991375, + "grad_norm": 9.5625, + "learning_rate": 8.02296120162463e-06, + "loss": 0.99618902, + "memory(GiB)": 302.58, + "step": 117540, + "train_speed(iter/s)": 0.124027 + }, + { + "acc": 0.72541966, + "epoch": 0.6574512021721167, + "grad_norm": 7.75, + "learning_rate": 8.022224589935022e-06, + "loss": 1.08452635, + "memory(GiB)": 302.58, + "step": 117560, + "train_speed(iter/s)": 0.124037 + }, + { + "acc": 0.73387995, + "epoch": 0.657563051645096, + "grad_norm": 5.09375, + "learning_rate": 8.021487874875749e-06, + "loss": 1.04768639, + "memory(GiB)": 302.58, + "step": 117580, + "train_speed(iter/s)": 0.124047 + }, + { + "acc": 0.74811883, + "epoch": 0.6576749011180753, + "grad_norm": 5.96875, + "learning_rate": 8.020751056472009e-06, + "loss": 0.9950223, + "memory(GiB)": 302.58, + "step": 117600, + "train_speed(iter/s)": 0.124056 + }, + { + "acc": 0.72560115, + "epoch": 0.6577867505910545, + "grad_norm": 4.9375, + "learning_rate": 8.020014134749006e-06, + "loss": 1.10558271, + "memory(GiB)": 302.58, + "step": 117620, + "train_speed(iter/s)": 0.124066 + }, + { + "acc": 0.73516393, + "epoch": 0.6578986000640338, + "grad_norm": 8.1875, + "learning_rate": 8.019277109731943e-06, + "loss": 1.02966928, + "memory(GiB)": 302.58, + "step": 117640, + "train_speed(iter/s)": 0.124076 + }, + { + "acc": 0.7366982, + "epoch": 0.658010449537013, + "grad_norm": 4.5625, + "learning_rate": 8.018539981446027e-06, + "loss": 1.01441259, + "memory(GiB)": 302.58, + "step": 117660, + "train_speed(iter/s)": 0.124086 + }, + { + "acc": 0.7419106, + "epoch": 0.6581222990099923, + "grad_norm": 7.6875, + "learning_rate": 8.017802749916475e-06, + "loss": 0.99978514, + "memory(GiB)": 302.58, + "step": 117680, + "train_speed(iter/s)": 0.124096 + }, + { + "acc": 0.73094749, + "epoch": 0.6582341484829716, + "grad_norm": 7.40625, + "learning_rate": 8.017065415168499e-06, + "loss": 1.0497941, + "memory(GiB)": 302.58, + "step": 117700, + "train_speed(iter/s)": 0.124106 + }, + { + "acc": 0.75103884, + "epoch": 0.6583459979559508, + "grad_norm": 6.5625, + "learning_rate": 8.016327977227319e-06, + "loss": 0.96013002, + "memory(GiB)": 302.58, + "step": 117720, + "train_speed(iter/s)": 0.124116 + }, + { + "acc": 0.73289561, + "epoch": 0.6584578474289301, + "grad_norm": 5.46875, + "learning_rate": 8.015590436118156e-06, + "loss": 1.04958801, + "memory(GiB)": 302.58, + "step": 117740, + "train_speed(iter/s)": 0.124126 + }, + { + "acc": 0.75215917, + "epoch": 0.6585696969019094, + "grad_norm": 9.0625, + "learning_rate": 8.014852791866239e-06, + "loss": 0.95894375, + "memory(GiB)": 302.58, + "step": 117760, + "train_speed(iter/s)": 0.124136 + }, + { + "acc": 0.74395032, + "epoch": 0.6586815463748886, + "grad_norm": 7.3125, + "learning_rate": 8.014115044496797e-06, + "loss": 0.99196835, + "memory(GiB)": 302.58, + "step": 117780, + "train_speed(iter/s)": 0.124146 + }, + { + "acc": 0.73338437, + "epoch": 0.6587933958478679, + "grad_norm": 6.96875, + "learning_rate": 8.013377194035062e-06, + "loss": 1.0506197, + "memory(GiB)": 302.58, + "step": 117800, + "train_speed(iter/s)": 0.124156 + }, + { + "acc": 0.74280195, + "epoch": 0.6589052453208472, + "grad_norm": 6.3125, + "learning_rate": 8.012639240506273e-06, + "loss": 1.00689087, + "memory(GiB)": 302.58, + "step": 117820, + "train_speed(iter/s)": 0.124166 + }, + { + "acc": 0.73836327, + "epoch": 0.6590170947938264, + "grad_norm": 9.6875, + "learning_rate": 8.011901183935669e-06, + "loss": 1.03130455, + "memory(GiB)": 302.58, + "step": 117840, + "train_speed(iter/s)": 0.124175 + }, + { + "acc": 0.73570142, + "epoch": 0.6591289442668057, + "grad_norm": 6.46875, + "learning_rate": 8.011163024348493e-06, + "loss": 1.04791555, + "memory(GiB)": 302.58, + "step": 117860, + "train_speed(iter/s)": 0.124185 + }, + { + "acc": 0.74548783, + "epoch": 0.659240793739785, + "grad_norm": 7.6875, + "learning_rate": 8.010424761769995e-06, + "loss": 1.00643682, + "memory(GiB)": 302.58, + "step": 117880, + "train_speed(iter/s)": 0.124194 + }, + { + "acc": 0.73103542, + "epoch": 0.6593526432127642, + "grad_norm": 7.21875, + "learning_rate": 8.009686396225423e-06, + "loss": 1.06502905, + "memory(GiB)": 302.58, + "step": 117900, + "train_speed(iter/s)": 0.124204 + }, + { + "acc": 0.7383914, + "epoch": 0.6594644926857435, + "grad_norm": 8.8125, + "learning_rate": 8.008947927740034e-06, + "loss": 1.04791298, + "memory(GiB)": 302.58, + "step": 117920, + "train_speed(iter/s)": 0.124213 + }, + { + "acc": 0.72857304, + "epoch": 0.6595763421587227, + "grad_norm": 9.25, + "learning_rate": 8.008209356339084e-06, + "loss": 1.07492094, + "memory(GiB)": 302.58, + "step": 117940, + "train_speed(iter/s)": 0.124223 + }, + { + "acc": 0.7509831, + "epoch": 0.659688191631702, + "grad_norm": 5.84375, + "learning_rate": 8.007470682047835e-06, + "loss": 0.95438175, + "memory(GiB)": 302.58, + "step": 117960, + "train_speed(iter/s)": 0.124234 + }, + { + "acc": 0.73482666, + "epoch": 0.6598000411046813, + "grad_norm": 8.0, + "learning_rate": 8.006731904891553e-06, + "loss": 1.02828503, + "memory(GiB)": 302.58, + "step": 117980, + "train_speed(iter/s)": 0.124244 + }, + { + "acc": 0.7319098, + "epoch": 0.6599118905776605, + "grad_norm": 5.5, + "learning_rate": 8.005993024895505e-06, + "loss": 1.05810032, + "memory(GiB)": 302.58, + "step": 118000, + "train_speed(iter/s)": 0.124254 + }, + { + "epoch": 0.6599118905776605, + "eval_acc": 0.7037221735662942, + "eval_loss": 1.025738000869751, + "eval_runtime": 7491.2923, + "eval_samples_per_second": 10.049, + "eval_steps_per_second": 10.049, + "step": 118000 + }, + { + "acc": 0.73407745, + "epoch": 0.6600237400506398, + "grad_norm": 5.1875, + "learning_rate": 8.005254042084964e-06, + "loss": 1.04162931, + "memory(GiB)": 302.58, + "step": 118020, + "train_speed(iter/s)": 0.123274 + }, + { + "acc": 0.72958689, + "epoch": 0.6601355895236191, + "grad_norm": 10.375, + "learning_rate": 8.004514956485206e-06, + "loss": 1.07845421, + "memory(GiB)": 302.58, + "step": 118040, + "train_speed(iter/s)": 0.123283 + }, + { + "acc": 0.73936729, + "epoch": 0.6602474389965983, + "grad_norm": 10.5, + "learning_rate": 8.003775768121509e-06, + "loss": 1.03505793, + "memory(GiB)": 302.58, + "step": 118060, + "train_speed(iter/s)": 0.123293 + }, + { + "acc": 0.735673, + "epoch": 0.6603592884695776, + "grad_norm": 6.25, + "learning_rate": 8.003036477019155e-06, + "loss": 1.01085653, + "memory(GiB)": 302.58, + "step": 118080, + "train_speed(iter/s)": 0.123303 + }, + { + "acc": 0.73421321, + "epoch": 0.6604711379425569, + "grad_norm": 7.75, + "learning_rate": 8.002297083203431e-06, + "loss": 1.04166937, + "memory(GiB)": 302.58, + "step": 118100, + "train_speed(iter/s)": 0.123313 + }, + { + "acc": 0.74450083, + "epoch": 0.6605829874155362, + "grad_norm": 7.75, + "learning_rate": 8.001557586699628e-06, + "loss": 0.99990587, + "memory(GiB)": 302.58, + "step": 118120, + "train_speed(iter/s)": 0.123322 + }, + { + "acc": 0.70878916, + "epoch": 0.6606948368885155, + "grad_norm": 6.375, + "learning_rate": 8.000817987533038e-06, + "loss": 1.14902105, + "memory(GiB)": 302.58, + "step": 118140, + "train_speed(iter/s)": 0.123332 + }, + { + "acc": 0.74056702, + "epoch": 0.6608066863614948, + "grad_norm": 8.8125, + "learning_rate": 8.000078285728958e-06, + "loss": 1.02079782, + "memory(GiB)": 302.58, + "step": 118160, + "train_speed(iter/s)": 0.123342 + }, + { + "acc": 0.72752838, + "epoch": 0.660918535834474, + "grad_norm": 8.375, + "learning_rate": 7.999338481312686e-06, + "loss": 1.09897718, + "memory(GiB)": 302.58, + "step": 118180, + "train_speed(iter/s)": 0.123352 + }, + { + "acc": 0.74118214, + "epoch": 0.6610303853074533, + "grad_norm": 8.75, + "learning_rate": 7.998598574309529e-06, + "loss": 1.00645876, + "memory(GiB)": 302.58, + "step": 118200, + "train_speed(iter/s)": 0.123362 + }, + { + "acc": 0.74371867, + "epoch": 0.6611422347804325, + "grad_norm": 8.375, + "learning_rate": 7.997858564744791e-06, + "loss": 1.01300564, + "memory(GiB)": 302.58, + "step": 118220, + "train_speed(iter/s)": 0.123372 + }, + { + "acc": 0.74655886, + "epoch": 0.6612540842534118, + "grad_norm": 6.65625, + "learning_rate": 7.997118452643785e-06, + "loss": 1.00422382, + "memory(GiB)": 302.58, + "step": 118240, + "train_speed(iter/s)": 0.123382 + }, + { + "acc": 0.73689079, + "epoch": 0.6613659337263911, + "grad_norm": 11.5, + "learning_rate": 7.996378238031823e-06, + "loss": 1.04646139, + "memory(GiB)": 302.58, + "step": 118260, + "train_speed(iter/s)": 0.123392 + }, + { + "acc": 0.73617349, + "epoch": 0.6614777831993703, + "grad_norm": 5.625, + "learning_rate": 7.995637920934225e-06, + "loss": 1.04452438, + "memory(GiB)": 302.58, + "step": 118280, + "train_speed(iter/s)": 0.123402 + }, + { + "acc": 0.72883821, + "epoch": 0.6615896326723496, + "grad_norm": 9.125, + "learning_rate": 7.99489750137631e-06, + "loss": 1.07204142, + "memory(GiB)": 302.58, + "step": 118300, + "train_speed(iter/s)": 0.123411 + }, + { + "acc": 0.75527201, + "epoch": 0.6617014821453289, + "grad_norm": 4.5, + "learning_rate": 7.994156979383406e-06, + "loss": 0.95429211, + "memory(GiB)": 302.58, + "step": 118320, + "train_speed(iter/s)": 0.123421 + }, + { + "acc": 0.75792899, + "epoch": 0.6618133316183081, + "grad_norm": 7.78125, + "learning_rate": 7.993416354980838e-06, + "loss": 0.96414471, + "memory(GiB)": 302.58, + "step": 118340, + "train_speed(iter/s)": 0.12343 + }, + { + "acc": 0.7400537, + "epoch": 0.6619251810912874, + "grad_norm": 8.9375, + "learning_rate": 7.992675628193939e-06, + "loss": 1.03469505, + "memory(GiB)": 302.58, + "step": 118360, + "train_speed(iter/s)": 0.123441 + }, + { + "acc": 0.73269496, + "epoch": 0.6620370305642667, + "grad_norm": 7.09375, + "learning_rate": 7.991934799048045e-06, + "loss": 1.02734327, + "memory(GiB)": 302.58, + "step": 118380, + "train_speed(iter/s)": 0.12345 + }, + { + "acc": 0.7345139, + "epoch": 0.6621488800372459, + "grad_norm": 7.1875, + "learning_rate": 7.991193867568492e-06, + "loss": 1.0331233, + "memory(GiB)": 302.58, + "step": 118400, + "train_speed(iter/s)": 0.12346 + }, + { + "acc": 0.73674378, + "epoch": 0.6622607295102252, + "grad_norm": 6.9375, + "learning_rate": 7.990452833780625e-06, + "loss": 1.03438053, + "memory(GiB)": 302.58, + "step": 118420, + "train_speed(iter/s)": 0.12347 + }, + { + "acc": 0.72716446, + "epoch": 0.6623725789832045, + "grad_norm": 5.78125, + "learning_rate": 7.989711697709788e-06, + "loss": 1.0655961, + "memory(GiB)": 302.58, + "step": 118440, + "train_speed(iter/s)": 0.12348 + }, + { + "acc": 0.74446893, + "epoch": 0.6624844284561837, + "grad_norm": 5.34375, + "learning_rate": 7.98897045938133e-06, + "loss": 0.98556442, + "memory(GiB)": 302.58, + "step": 118460, + "train_speed(iter/s)": 0.123491 + }, + { + "acc": 0.74920149, + "epoch": 0.662596277929163, + "grad_norm": 6.75, + "learning_rate": 7.988229118820605e-06, + "loss": 0.9608902, + "memory(GiB)": 302.58, + "step": 118480, + "train_speed(iter/s)": 0.123501 + }, + { + "acc": 0.72809577, + "epoch": 0.6627081274021422, + "grad_norm": 6.6875, + "learning_rate": 7.987487676052969e-06, + "loss": 1.08239822, + "memory(GiB)": 302.58, + "step": 118500, + "train_speed(iter/s)": 0.123511 + }, + { + "acc": 0.75938487, + "epoch": 0.6628199768751215, + "grad_norm": 8.5625, + "learning_rate": 7.986746131103782e-06, + "loss": 0.95947485, + "memory(GiB)": 302.58, + "step": 118520, + "train_speed(iter/s)": 0.123521 + }, + { + "acc": 0.72924728, + "epoch": 0.6629318263481008, + "grad_norm": 10.5625, + "learning_rate": 7.986004483998406e-06, + "loss": 1.07245121, + "memory(GiB)": 302.58, + "step": 118540, + "train_speed(iter/s)": 0.123531 + }, + { + "acc": 0.73808398, + "epoch": 0.66304367582108, + "grad_norm": 5.96875, + "learning_rate": 7.985262734762209e-06, + "loss": 1.03654699, + "memory(GiB)": 302.58, + "step": 118560, + "train_speed(iter/s)": 0.123541 + }, + { + "acc": 0.74029531, + "epoch": 0.6631555252940593, + "grad_norm": 5.5, + "learning_rate": 7.98452088342056e-06, + "loss": 1.02609978, + "memory(GiB)": 302.58, + "step": 118580, + "train_speed(iter/s)": 0.123551 + }, + { + "acc": 0.74680223, + "epoch": 0.6632673747670386, + "grad_norm": 6.71875, + "learning_rate": 7.983778929998833e-06, + "loss": 1.0084465, + "memory(GiB)": 302.58, + "step": 118600, + "train_speed(iter/s)": 0.12356 + }, + { + "acc": 0.73097143, + "epoch": 0.6633792242400178, + "grad_norm": 9.125, + "learning_rate": 7.983036874522405e-06, + "loss": 1.05014505, + "memory(GiB)": 302.58, + "step": 118620, + "train_speed(iter/s)": 0.12357 + }, + { + "acc": 0.7331573, + "epoch": 0.6634910737129971, + "grad_norm": 6.5625, + "learning_rate": 7.982294717016658e-06, + "loss": 1.07675467, + "memory(GiB)": 302.58, + "step": 118640, + "train_speed(iter/s)": 0.12358 + }, + { + "acc": 0.75052404, + "epoch": 0.6636029231859764, + "grad_norm": 9.625, + "learning_rate": 7.981552457506974e-06, + "loss": 0.98507252, + "memory(GiB)": 302.58, + "step": 118660, + "train_speed(iter/s)": 0.123589 + }, + { + "acc": 0.72532811, + "epoch": 0.6637147726589556, + "grad_norm": 7.21875, + "learning_rate": 7.980810096018742e-06, + "loss": 1.10613575, + "memory(GiB)": 302.58, + "step": 118680, + "train_speed(iter/s)": 0.123599 + }, + { + "acc": 0.75838566, + "epoch": 0.6638266221319349, + "grad_norm": 6.03125, + "learning_rate": 7.980067632577352e-06, + "loss": 0.94178123, + "memory(GiB)": 302.58, + "step": 118700, + "train_speed(iter/s)": 0.123609 + }, + { + "acc": 0.74275331, + "epoch": 0.6639384716049141, + "grad_norm": 6.125, + "learning_rate": 7.979325067208202e-06, + "loss": 1.01914787, + "memory(GiB)": 302.58, + "step": 118720, + "train_speed(iter/s)": 0.123619 + }, + { + "acc": 0.72689137, + "epoch": 0.6640503210778934, + "grad_norm": 5.0, + "learning_rate": 7.978582399936685e-06, + "loss": 1.0651123, + "memory(GiB)": 302.58, + "step": 118740, + "train_speed(iter/s)": 0.123629 + }, + { + "acc": 0.74221616, + "epoch": 0.6641621705508727, + "grad_norm": 6.78125, + "learning_rate": 7.977839630788207e-06, + "loss": 1.01521435, + "memory(GiB)": 302.58, + "step": 118760, + "train_speed(iter/s)": 0.123639 + }, + { + "acc": 0.72538347, + "epoch": 0.6642740200238519, + "grad_norm": 5.8125, + "learning_rate": 7.97709675978817e-06, + "loss": 1.07916851, + "memory(GiB)": 302.58, + "step": 118780, + "train_speed(iter/s)": 0.123649 + }, + { + "acc": 0.7366015, + "epoch": 0.6643858694968312, + "grad_norm": 5.34375, + "learning_rate": 7.976353786961984e-06, + "loss": 1.02785921, + "memory(GiB)": 302.58, + "step": 118800, + "train_speed(iter/s)": 0.123659 + }, + { + "acc": 0.73881841, + "epoch": 0.6644977189698105, + "grad_norm": 6.5625, + "learning_rate": 7.97561071233506e-06, + "loss": 1.01719894, + "memory(GiB)": 302.58, + "step": 118820, + "train_speed(iter/s)": 0.123669 + }, + { + "acc": 0.73413839, + "epoch": 0.6646095684427897, + "grad_norm": 6.625, + "learning_rate": 7.974867535932815e-06, + "loss": 1.05255594, + "memory(GiB)": 302.58, + "step": 118840, + "train_speed(iter/s)": 0.123679 + }, + { + "acc": 0.73477526, + "epoch": 0.664721417915769, + "grad_norm": 6.78125, + "learning_rate": 7.974124257780667e-06, + "loss": 1.06050205, + "memory(GiB)": 302.58, + "step": 118860, + "train_speed(iter/s)": 0.123689 + }, + { + "acc": 0.74540763, + "epoch": 0.6648332673887483, + "grad_norm": 7.65625, + "learning_rate": 7.973380877904038e-06, + "loss": 0.9912879, + "memory(GiB)": 302.58, + "step": 118880, + "train_speed(iter/s)": 0.123699 + }, + { + "acc": 0.73737655, + "epoch": 0.6649451168617275, + "grad_norm": 9.1875, + "learning_rate": 7.972637396328356e-06, + "loss": 1.0437191, + "memory(GiB)": 302.58, + "step": 118900, + "train_speed(iter/s)": 0.123708 + }, + { + "acc": 0.73953266, + "epoch": 0.6650569663347068, + "grad_norm": 7.03125, + "learning_rate": 7.971893813079049e-06, + "loss": 1.00180378, + "memory(GiB)": 302.58, + "step": 118920, + "train_speed(iter/s)": 0.123718 + }, + { + "acc": 0.73715873, + "epoch": 0.665168815807686, + "grad_norm": 8.125, + "learning_rate": 7.971150128181547e-06, + "loss": 1.03743811, + "memory(GiB)": 302.58, + "step": 118940, + "train_speed(iter/s)": 0.123728 + }, + { + "acc": 0.75017219, + "epoch": 0.6652806652806653, + "grad_norm": 5.6875, + "learning_rate": 7.970406341661292e-06, + "loss": 0.98908901, + "memory(GiB)": 302.58, + "step": 118960, + "train_speed(iter/s)": 0.123738 + }, + { + "acc": 0.73608527, + "epoch": 0.6653925147536446, + "grad_norm": 6.25, + "learning_rate": 7.96966245354372e-06, + "loss": 1.03787298, + "memory(GiB)": 302.58, + "step": 118980, + "train_speed(iter/s)": 0.123748 + }, + { + "acc": 0.73978138, + "epoch": 0.6655043642266238, + "grad_norm": 7.125, + "learning_rate": 7.968918463854278e-06, + "loss": 1.03556137, + "memory(GiB)": 302.58, + "step": 119000, + "train_speed(iter/s)": 0.123758 + }, + { + "acc": 0.72988534, + "epoch": 0.6656162136996031, + "grad_norm": 5.8125, + "learning_rate": 7.96817437261841e-06, + "loss": 1.05753269, + "memory(GiB)": 302.58, + "step": 119020, + "train_speed(iter/s)": 0.123768 + }, + { + "acc": 0.74596152, + "epoch": 0.6657280631725824, + "grad_norm": 12.5, + "learning_rate": 7.967430179861564e-06, + "loss": 1.00440083, + "memory(GiB)": 302.58, + "step": 119040, + "train_speed(iter/s)": 0.123777 + }, + { + "acc": 0.73498392, + "epoch": 0.6658399126455616, + "grad_norm": 5.0625, + "learning_rate": 7.966685885609198e-06, + "loss": 1.031143, + "memory(GiB)": 302.58, + "step": 119060, + "train_speed(iter/s)": 0.123785 + }, + { + "acc": 0.73499494, + "epoch": 0.6659517621185409, + "grad_norm": 7.84375, + "learning_rate": 7.965941489886767e-06, + "loss": 1.02475719, + "memory(GiB)": 302.58, + "step": 119080, + "train_speed(iter/s)": 0.123795 + }, + { + "acc": 0.74371319, + "epoch": 0.6660636115915202, + "grad_norm": 7.6875, + "learning_rate": 7.965196992719733e-06, + "loss": 0.99333992, + "memory(GiB)": 302.58, + "step": 119100, + "train_speed(iter/s)": 0.123805 + }, + { + "acc": 0.74528513, + "epoch": 0.6661754610644994, + "grad_norm": 9.75, + "learning_rate": 7.96445239413356e-06, + "loss": 0.99716902, + "memory(GiB)": 302.58, + "step": 119120, + "train_speed(iter/s)": 0.123815 + }, + { + "acc": 0.7410625, + "epoch": 0.6662873105374787, + "grad_norm": 6.46875, + "learning_rate": 7.963707694153713e-06, + "loss": 1.03202248, + "memory(GiB)": 302.58, + "step": 119140, + "train_speed(iter/s)": 0.123825 + }, + { + "acc": 0.73612099, + "epoch": 0.666399160010458, + "grad_norm": 5.5, + "learning_rate": 7.962962892805667e-06, + "loss": 1.03387804, + "memory(GiB)": 302.58, + "step": 119160, + "train_speed(iter/s)": 0.123835 + }, + { + "acc": 0.74700322, + "epoch": 0.6665110094834372, + "grad_norm": 7.4375, + "learning_rate": 7.962217990114894e-06, + "loss": 1.00237846, + "memory(GiB)": 302.58, + "step": 119180, + "train_speed(iter/s)": 0.123845 + }, + { + "acc": 0.72620873, + "epoch": 0.6666228589564165, + "grad_norm": 9.1875, + "learning_rate": 7.961472986106874e-06, + "loss": 1.07339096, + "memory(GiB)": 302.58, + "step": 119200, + "train_speed(iter/s)": 0.123855 + }, + { + "acc": 0.72593241, + "epoch": 0.6667347084293958, + "grad_norm": 6.09375, + "learning_rate": 7.960727880807085e-06, + "loss": 1.09046688, + "memory(GiB)": 302.58, + "step": 119220, + "train_speed(iter/s)": 0.123865 + }, + { + "acc": 0.74711542, + "epoch": 0.666846557902375, + "grad_norm": 4.8125, + "learning_rate": 7.959982674241015e-06, + "loss": 1.01132784, + "memory(GiB)": 302.58, + "step": 119240, + "train_speed(iter/s)": 0.123875 + }, + { + "acc": 0.74150457, + "epoch": 0.6669584073753543, + "grad_norm": 9.5625, + "learning_rate": 7.959237366434154e-06, + "loss": 1.01441851, + "memory(GiB)": 302.58, + "step": 119260, + "train_speed(iter/s)": 0.123885 + }, + { + "acc": 0.75352569, + "epoch": 0.6670702568483335, + "grad_norm": 7.90625, + "learning_rate": 7.95849195741199e-06, + "loss": 0.95003319, + "memory(GiB)": 302.58, + "step": 119280, + "train_speed(iter/s)": 0.123896 + }, + { + "acc": 0.73994632, + "epoch": 0.6671821063213128, + "grad_norm": 8.625, + "learning_rate": 7.957746447200019e-06, + "loss": 0.99805765, + "memory(GiB)": 302.58, + "step": 119300, + "train_speed(iter/s)": 0.123906 + }, + { + "acc": 0.72868347, + "epoch": 0.6672939557942921, + "grad_norm": 7.15625, + "learning_rate": 7.957000835823742e-06, + "loss": 1.0786294, + "memory(GiB)": 302.58, + "step": 119320, + "train_speed(iter/s)": 0.123917 + }, + { + "acc": 0.72884927, + "epoch": 0.6674058052672713, + "grad_norm": 7.625, + "learning_rate": 7.95625512330866e-06, + "loss": 1.06814699, + "memory(GiB)": 302.58, + "step": 119340, + "train_speed(iter/s)": 0.123926 + }, + { + "acc": 0.73548002, + "epoch": 0.6675176547402506, + "grad_norm": 9.875, + "learning_rate": 7.955509309680279e-06, + "loss": 1.02797365, + "memory(GiB)": 302.58, + "step": 119360, + "train_speed(iter/s)": 0.123936 + }, + { + "acc": 0.75391622, + "epoch": 0.6676295042132299, + "grad_norm": 7.40625, + "learning_rate": 7.954763394964107e-06, + "loss": 0.96592598, + "memory(GiB)": 302.58, + "step": 119380, + "train_speed(iter/s)": 0.123945 + }, + { + "acc": 0.74484468, + "epoch": 0.6677413536862091, + "grad_norm": 6.9375, + "learning_rate": 7.954017379185656e-06, + "loss": 0.98761282, + "memory(GiB)": 302.58, + "step": 119400, + "train_speed(iter/s)": 0.123955 + }, + { + "acc": 0.73468981, + "epoch": 0.6678532031591884, + "grad_norm": 8.25, + "learning_rate": 7.953271262370446e-06, + "loss": 1.04503489, + "memory(GiB)": 302.58, + "step": 119420, + "train_speed(iter/s)": 0.123965 + }, + { + "acc": 0.72098503, + "epoch": 0.6679650526321677, + "grad_norm": 8.5, + "learning_rate": 7.952525044543995e-06, + "loss": 1.09815493, + "memory(GiB)": 302.58, + "step": 119440, + "train_speed(iter/s)": 0.123975 + }, + { + "acc": 0.72737532, + "epoch": 0.6680769021051469, + "grad_norm": 7.65625, + "learning_rate": 7.951778725731824e-06, + "loss": 1.07216883, + "memory(GiB)": 302.58, + "step": 119460, + "train_speed(iter/s)": 0.123985 + }, + { + "acc": 0.74648829, + "epoch": 0.6681887515781262, + "grad_norm": 9.125, + "learning_rate": 7.951032305959461e-06, + "loss": 0.97420378, + "memory(GiB)": 302.58, + "step": 119480, + "train_speed(iter/s)": 0.123995 + }, + { + "acc": 0.73775773, + "epoch": 0.6683006010511054, + "grad_norm": 6.59375, + "learning_rate": 7.950285785252434e-06, + "loss": 1.03435316, + "memory(GiB)": 302.58, + "step": 119500, + "train_speed(iter/s)": 0.124004 + }, + { + "acc": 0.74311008, + "epoch": 0.6684124505240847, + "grad_norm": 6.3125, + "learning_rate": 7.94953916363628e-06, + "loss": 0.99810047, + "memory(GiB)": 302.58, + "step": 119520, + "train_speed(iter/s)": 0.124014 + }, + { + "acc": 0.74264045, + "epoch": 0.668524299997064, + "grad_norm": 7.25, + "learning_rate": 7.948792441136533e-06, + "loss": 1.02269592, + "memory(GiB)": 302.58, + "step": 119540, + "train_speed(iter/s)": 0.124024 + }, + { + "acc": 0.73136811, + "epoch": 0.6686361494700432, + "grad_norm": 6.34375, + "learning_rate": 7.948045617778734e-06, + "loss": 1.07640476, + "memory(GiB)": 302.58, + "step": 119560, + "train_speed(iter/s)": 0.124034 + }, + { + "acc": 0.74314761, + "epoch": 0.6687479989430225, + "grad_norm": 5.8125, + "learning_rate": 7.947298693588428e-06, + "loss": 1.03451166, + "memory(GiB)": 302.58, + "step": 119580, + "train_speed(iter/s)": 0.124044 + }, + { + "acc": 0.75206594, + "epoch": 0.6688598484160018, + "grad_norm": 5.59375, + "learning_rate": 7.946551668591161e-06, + "loss": 0.98634453, + "memory(GiB)": 302.58, + "step": 119600, + "train_speed(iter/s)": 0.124054 + }, + { + "acc": 0.73531513, + "epoch": 0.668971697888981, + "grad_norm": 7.875, + "learning_rate": 7.945804542812482e-06, + "loss": 1.04635143, + "memory(GiB)": 302.58, + "step": 119620, + "train_speed(iter/s)": 0.124063 + }, + { + "acc": 0.72131839, + "epoch": 0.6690835473619603, + "grad_norm": 6.84375, + "learning_rate": 7.945057316277949e-06, + "loss": 1.10554285, + "memory(GiB)": 302.58, + "step": 119640, + "train_speed(iter/s)": 0.124073 + }, + { + "acc": 0.72447085, + "epoch": 0.6691953968349396, + "grad_norm": 5.46875, + "learning_rate": 7.944309989013115e-06, + "loss": 1.06904526, + "memory(GiB)": 302.58, + "step": 119660, + "train_speed(iter/s)": 0.124083 + }, + { + "acc": 0.71206174, + "epoch": 0.6693072463079188, + "grad_norm": 7.5625, + "learning_rate": 7.943562561043546e-06, + "loss": 1.1291687, + "memory(GiB)": 302.58, + "step": 119680, + "train_speed(iter/s)": 0.124092 + }, + { + "acc": 0.73962994, + "epoch": 0.6694190957808981, + "grad_norm": 6.8125, + "learning_rate": 7.942815032394801e-06, + "loss": 1.01436567, + "memory(GiB)": 302.58, + "step": 119700, + "train_speed(iter/s)": 0.124102 + }, + { + "acc": 0.75584879, + "epoch": 0.6695309452538774, + "grad_norm": 9.125, + "learning_rate": 7.942067403092452e-06, + "loss": 0.97718544, + "memory(GiB)": 302.58, + "step": 119720, + "train_speed(iter/s)": 0.124112 + }, + { + "acc": 0.74861717, + "epoch": 0.6696427947268566, + "grad_norm": 6.03125, + "learning_rate": 7.941319673162067e-06, + "loss": 0.97799273, + "memory(GiB)": 302.58, + "step": 119740, + "train_speed(iter/s)": 0.124122 + }, + { + "acc": 0.7535347, + "epoch": 0.6697546441998359, + "grad_norm": 7.96875, + "learning_rate": 7.940571842629225e-06, + "loss": 0.95016079, + "memory(GiB)": 302.58, + "step": 119760, + "train_speed(iter/s)": 0.124131 + }, + { + "acc": 0.75540791, + "epoch": 0.6698664936728151, + "grad_norm": 5.6875, + "learning_rate": 7.9398239115195e-06, + "loss": 0.94525299, + "memory(GiB)": 302.58, + "step": 119780, + "train_speed(iter/s)": 0.12414 + }, + { + "acc": 0.74070845, + "epoch": 0.6699783431457944, + "grad_norm": 6.40625, + "learning_rate": 7.939075879858476e-06, + "loss": 1.03203115, + "memory(GiB)": 302.58, + "step": 119800, + "train_speed(iter/s)": 0.12415 + }, + { + "acc": 0.74613547, + "epoch": 0.6700901926187737, + "grad_norm": 7.28125, + "learning_rate": 7.938327747671738e-06, + "loss": 0.99734697, + "memory(GiB)": 302.58, + "step": 119820, + "train_speed(iter/s)": 0.124159 + }, + { + "acc": 0.73327522, + "epoch": 0.6702020420917529, + "grad_norm": 8.4375, + "learning_rate": 7.937579514984872e-06, + "loss": 1.05992823, + "memory(GiB)": 302.58, + "step": 119840, + "train_speed(iter/s)": 0.124169 + }, + { + "acc": 0.75435457, + "epoch": 0.6703138915647322, + "grad_norm": 7.78125, + "learning_rate": 7.936831181823472e-06, + "loss": 0.99097662, + "memory(GiB)": 302.58, + "step": 119860, + "train_speed(iter/s)": 0.124178 + }, + { + "acc": 0.74596944, + "epoch": 0.6704257410377115, + "grad_norm": 6.09375, + "learning_rate": 7.936082748213134e-06, + "loss": 1.01303864, + "memory(GiB)": 302.58, + "step": 119880, + "train_speed(iter/s)": 0.124188 + }, + { + "acc": 0.7447165, + "epoch": 0.6705375905106907, + "grad_norm": 5.5, + "learning_rate": 7.935334214179457e-06, + "loss": 0.99355574, + "memory(GiB)": 302.58, + "step": 119900, + "train_speed(iter/s)": 0.124197 + }, + { + "acc": 0.74644523, + "epoch": 0.67064943998367, + "grad_norm": 6.03125, + "learning_rate": 7.934585579748039e-06, + "loss": 0.99765368, + "memory(GiB)": 302.58, + "step": 119920, + "train_speed(iter/s)": 0.124207 + }, + { + "acc": 0.73309746, + "epoch": 0.6707612894566493, + "grad_norm": 5.5, + "learning_rate": 7.933836844944491e-06, + "loss": 1.06328144, + "memory(GiB)": 302.58, + "step": 119940, + "train_speed(iter/s)": 0.124216 + }, + { + "acc": 0.73949509, + "epoch": 0.6708731389296285, + "grad_norm": 9.5625, + "learning_rate": 7.93308800979442e-06, + "loss": 1.02546406, + "memory(GiB)": 302.58, + "step": 119960, + "train_speed(iter/s)": 0.124226 + }, + { + "acc": 0.73732834, + "epoch": 0.6709849884026078, + "grad_norm": 8.0, + "learning_rate": 7.93233907432344e-06, + "loss": 1.05157328, + "memory(GiB)": 302.58, + "step": 119980, + "train_speed(iter/s)": 0.124235 + }, + { + "acc": 0.74540267, + "epoch": 0.671096837875587, + "grad_norm": 5.0625, + "learning_rate": 7.931590038557161e-06, + "loss": 0.99749651, + "memory(GiB)": 302.58, + "step": 120000, + "train_speed(iter/s)": 0.124245 + }, + { + "epoch": 0.671096837875587, + "eval_acc": 0.7037784211994322, + "eval_loss": 1.02523672580719, + "eval_runtime": 7509.9497, + "eval_samples_per_second": 10.024, + "eval_steps_per_second": 10.024, + "step": 120000 + }, + { + "acc": 0.74359169, + "epoch": 0.6712086873485663, + "grad_norm": 9.8125, + "learning_rate": 7.930840902521211e-06, + "loss": 1.00899124, + "memory(GiB)": 302.58, + "step": 120020, + "train_speed(iter/s)": 0.123279 + }, + { + "acc": 0.73106675, + "epoch": 0.6713205368215456, + "grad_norm": 6.8125, + "learning_rate": 7.93009166624121e-06, + "loss": 1.05540085, + "memory(GiB)": 302.58, + "step": 120040, + "train_speed(iter/s)": 0.123288 + }, + { + "acc": 0.73347611, + "epoch": 0.6714323862945248, + "grad_norm": 6.375, + "learning_rate": 7.929342329742778e-06, + "loss": 1.06286011, + "memory(GiB)": 302.58, + "step": 120060, + "train_speed(iter/s)": 0.123298 + }, + { + "acc": 0.74115038, + "epoch": 0.6715442357675041, + "grad_norm": 8.3125, + "learning_rate": 7.928592893051551e-06, + "loss": 1.01991882, + "memory(GiB)": 302.58, + "step": 120080, + "train_speed(iter/s)": 0.123308 + }, + { + "acc": 0.74201508, + "epoch": 0.6716560852404834, + "grad_norm": 6.96875, + "learning_rate": 7.927843356193163e-06, + "loss": 1.0183672, + "memory(GiB)": 302.58, + "step": 120100, + "train_speed(iter/s)": 0.123317 + }, + { + "acc": 0.74065733, + "epoch": 0.6717679347134626, + "grad_norm": 7.78125, + "learning_rate": 7.927093719193246e-06, + "loss": 1.00934362, + "memory(GiB)": 302.58, + "step": 120120, + "train_speed(iter/s)": 0.123326 + }, + { + "acc": 0.73804564, + "epoch": 0.6718797841864419, + "grad_norm": 7.75, + "learning_rate": 7.926343982077443e-06, + "loss": 1.04066801, + "memory(GiB)": 302.58, + "step": 120140, + "train_speed(iter/s)": 0.123335 + }, + { + "acc": 0.73678203, + "epoch": 0.6719916336594212, + "grad_norm": 6.5625, + "learning_rate": 7.925594144871395e-06, + "loss": 1.04696388, + "memory(GiB)": 302.58, + "step": 120160, + "train_speed(iter/s)": 0.123345 + }, + { + "acc": 0.74444609, + "epoch": 0.6721034831324004, + "grad_norm": 7.1875, + "learning_rate": 7.924844207600753e-06, + "loss": 0.99511747, + "memory(GiB)": 302.58, + "step": 120180, + "train_speed(iter/s)": 0.123354 + }, + { + "acc": 0.75123487, + "epoch": 0.6722153326053797, + "grad_norm": 6.40625, + "learning_rate": 7.924094170291166e-06, + "loss": 0.9638936, + "memory(GiB)": 302.58, + "step": 120200, + "train_speed(iter/s)": 0.123364 + }, + { + "acc": 0.74248986, + "epoch": 0.672327182078359, + "grad_norm": 6.25, + "learning_rate": 7.923344032968284e-06, + "loss": 1.02158413, + "memory(GiB)": 302.58, + "step": 120220, + "train_speed(iter/s)": 0.123374 + }, + { + "acc": 0.73196163, + "epoch": 0.6724390315513382, + "grad_norm": 4.96875, + "learning_rate": 7.922593795657765e-06, + "loss": 1.04084406, + "memory(GiB)": 302.58, + "step": 120240, + "train_speed(iter/s)": 0.123384 + }, + { + "acc": 0.74037709, + "epoch": 0.6725508810243175, + "grad_norm": 7.53125, + "learning_rate": 7.921843458385273e-06, + "loss": 0.99801512, + "memory(GiB)": 302.58, + "step": 120260, + "train_speed(iter/s)": 0.123394 + }, + { + "acc": 0.75563054, + "epoch": 0.6726627304972967, + "grad_norm": 5.5, + "learning_rate": 7.92109302117647e-06, + "loss": 0.95553551, + "memory(GiB)": 302.58, + "step": 120280, + "train_speed(iter/s)": 0.123403 + }, + { + "acc": 0.72589512, + "epoch": 0.672774579970276, + "grad_norm": 6.46875, + "learning_rate": 7.920342484057021e-06, + "loss": 1.09050159, + "memory(GiB)": 302.58, + "step": 120300, + "train_speed(iter/s)": 0.123414 + }, + { + "acc": 0.75102935, + "epoch": 0.6728864294432553, + "grad_norm": 7.65625, + "learning_rate": 7.919591847052602e-06, + "loss": 0.9746501, + "memory(GiB)": 302.58, + "step": 120320, + "train_speed(iter/s)": 0.123424 + }, + { + "acc": 0.75354886, + "epoch": 0.6729982789162345, + "grad_norm": 9.5625, + "learning_rate": 7.918841110188881e-06, + "loss": 0.98185644, + "memory(GiB)": 302.58, + "step": 120340, + "train_speed(iter/s)": 0.123433 + }, + { + "acc": 0.73068542, + "epoch": 0.6731101283892138, + "grad_norm": 5.34375, + "learning_rate": 7.91809027349154e-06, + "loss": 1.07108259, + "memory(GiB)": 302.58, + "step": 120360, + "train_speed(iter/s)": 0.123443 + }, + { + "acc": 0.75527897, + "epoch": 0.6732219778621931, + "grad_norm": 6.46875, + "learning_rate": 7.917339336986259e-06, + "loss": 0.95526524, + "memory(GiB)": 302.58, + "step": 120380, + "train_speed(iter/s)": 0.123452 + }, + { + "acc": 0.75059657, + "epoch": 0.6733338273351723, + "grad_norm": 10.75, + "learning_rate": 7.916588300698723e-06, + "loss": 0.96580343, + "memory(GiB)": 302.58, + "step": 120400, + "train_speed(iter/s)": 0.123462 + }, + { + "acc": 0.7308815, + "epoch": 0.6734456768081516, + "grad_norm": 5.90625, + "learning_rate": 7.915837164654617e-06, + "loss": 1.0615366, + "memory(GiB)": 302.58, + "step": 120420, + "train_speed(iter/s)": 0.123472 + }, + { + "acc": 0.7337563, + "epoch": 0.6735575262811309, + "grad_norm": 7.8125, + "learning_rate": 7.915085928879634e-06, + "loss": 1.05966845, + "memory(GiB)": 302.58, + "step": 120440, + "train_speed(iter/s)": 0.123482 + }, + { + "acc": 0.74399114, + "epoch": 0.6736693757541101, + "grad_norm": 6.3125, + "learning_rate": 7.91433459339947e-06, + "loss": 1.01674414, + "memory(GiB)": 302.58, + "step": 120460, + "train_speed(iter/s)": 0.123492 + }, + { + "acc": 0.75772395, + "epoch": 0.6737812252270894, + "grad_norm": 12.4375, + "learning_rate": 7.913583158239822e-06, + "loss": 0.94955187, + "memory(GiB)": 302.58, + "step": 120480, + "train_speed(iter/s)": 0.123502 + }, + { + "acc": 0.72499051, + "epoch": 0.6738930747000687, + "grad_norm": 5.0, + "learning_rate": 7.91283162342639e-06, + "loss": 1.09098873, + "memory(GiB)": 302.58, + "step": 120500, + "train_speed(iter/s)": 0.123511 + }, + { + "acc": 0.73332524, + "epoch": 0.6740049241730479, + "grad_norm": 8.75, + "learning_rate": 7.91207998898488e-06, + "loss": 1.0546608, + "memory(GiB)": 302.58, + "step": 120520, + "train_speed(iter/s)": 0.12352 + }, + { + "acc": 0.73782859, + "epoch": 0.6741167736460272, + "grad_norm": 6.5625, + "learning_rate": 7.911328254941001e-06, + "loss": 1.0359479, + "memory(GiB)": 302.58, + "step": 120540, + "train_speed(iter/s)": 0.12353 + }, + { + "acc": 0.74946399, + "epoch": 0.6742286231190064, + "grad_norm": 6.03125, + "learning_rate": 7.910576421320464e-06, + "loss": 0.99082413, + "memory(GiB)": 302.58, + "step": 120560, + "train_speed(iter/s)": 0.12354 + }, + { + "acc": 0.73496861, + "epoch": 0.6743404725919857, + "grad_norm": 5.125, + "learning_rate": 7.909824488148985e-06, + "loss": 1.03975496, + "memory(GiB)": 302.58, + "step": 120580, + "train_speed(iter/s)": 0.123549 + }, + { + "acc": 0.73974619, + "epoch": 0.674452322064965, + "grad_norm": 5.65625, + "learning_rate": 7.90907245545228e-06, + "loss": 1.0431448, + "memory(GiB)": 302.58, + "step": 120600, + "train_speed(iter/s)": 0.123559 + }, + { + "acc": 0.75414171, + "epoch": 0.6745641715379442, + "grad_norm": 7.71875, + "learning_rate": 7.908320323256075e-06, + "loss": 0.9844429, + "memory(GiB)": 302.58, + "step": 120620, + "train_speed(iter/s)": 0.123567 + }, + { + "acc": 0.74125776, + "epoch": 0.6746760210109235, + "grad_norm": 7.59375, + "learning_rate": 7.90756809158609e-06, + "loss": 1.03059549, + "memory(GiB)": 302.58, + "step": 120640, + "train_speed(iter/s)": 0.123576 + }, + { + "acc": 0.72764668, + "epoch": 0.6747878704839028, + "grad_norm": 6.375, + "learning_rate": 7.906815760468058e-06, + "loss": 1.08298664, + "memory(GiB)": 302.58, + "step": 120660, + "train_speed(iter/s)": 0.123586 + }, + { + "acc": 0.71863132, + "epoch": 0.674899719956882, + "grad_norm": 7.71875, + "learning_rate": 7.906063329927711e-06, + "loss": 1.11812449, + "memory(GiB)": 302.58, + "step": 120680, + "train_speed(iter/s)": 0.123595 + }, + { + "acc": 0.74533334, + "epoch": 0.6750115694298613, + "grad_norm": 7.78125, + "learning_rate": 7.905310799990781e-06, + "loss": 1.02022076, + "memory(GiB)": 302.58, + "step": 120700, + "train_speed(iter/s)": 0.123604 + }, + { + "acc": 0.72735748, + "epoch": 0.6751234189028406, + "grad_norm": 7.8125, + "learning_rate": 7.904558170683013e-06, + "loss": 1.06704082, + "memory(GiB)": 302.58, + "step": 120720, + "train_speed(iter/s)": 0.123614 + }, + { + "acc": 0.73890443, + "epoch": 0.6752352683758198, + "grad_norm": 8.125, + "learning_rate": 7.903805442030141e-06, + "loss": 1.05634689, + "memory(GiB)": 302.58, + "step": 120740, + "train_speed(iter/s)": 0.123624 + }, + { + "acc": 0.73108983, + "epoch": 0.6753471178487991, + "grad_norm": 8.5, + "learning_rate": 7.903052614057917e-06, + "loss": 1.08027935, + "memory(GiB)": 302.58, + "step": 120760, + "train_speed(iter/s)": 0.123633 + }, + { + "acc": 0.75438461, + "epoch": 0.6754589673217783, + "grad_norm": 7.4375, + "learning_rate": 7.902299686792087e-06, + "loss": 0.95142078, + "memory(GiB)": 302.58, + "step": 120780, + "train_speed(iter/s)": 0.123643 + }, + { + "acc": 0.75654011, + "epoch": 0.6755708167947576, + "grad_norm": 8.875, + "learning_rate": 7.901546660258406e-06, + "loss": 0.95262642, + "memory(GiB)": 302.58, + "step": 120800, + "train_speed(iter/s)": 0.123653 + }, + { + "acc": 0.7427484, + "epoch": 0.6756826662677369, + "grad_norm": 6.21875, + "learning_rate": 7.90079353448263e-06, + "loss": 1.02248297, + "memory(GiB)": 302.58, + "step": 120820, + "train_speed(iter/s)": 0.123663 + }, + { + "acc": 0.73527422, + "epoch": 0.6757945157407161, + "grad_norm": 8.875, + "learning_rate": 7.900040309490516e-06, + "loss": 1.05455303, + "memory(GiB)": 302.58, + "step": 120840, + "train_speed(iter/s)": 0.123672 + }, + { + "acc": 0.74821362, + "epoch": 0.6759063652136954, + "grad_norm": 7.59375, + "learning_rate": 7.899286985307826e-06, + "loss": 0.96984396, + "memory(GiB)": 302.58, + "step": 120860, + "train_speed(iter/s)": 0.123682 + }, + { + "acc": 0.75606074, + "epoch": 0.6760182146866747, + "grad_norm": 8.8125, + "learning_rate": 7.898533561960329e-06, + "loss": 0.94144802, + "memory(GiB)": 302.58, + "step": 120880, + "train_speed(iter/s)": 0.123691 + }, + { + "acc": 0.74782028, + "epoch": 0.6761300641596539, + "grad_norm": 8.1875, + "learning_rate": 7.897780039473793e-06, + "loss": 0.99131174, + "memory(GiB)": 302.58, + "step": 120900, + "train_speed(iter/s)": 0.123701 + }, + { + "acc": 0.75084324, + "epoch": 0.6762419136326332, + "grad_norm": 9.1875, + "learning_rate": 7.897026417873993e-06, + "loss": 0.98912716, + "memory(GiB)": 302.58, + "step": 120920, + "train_speed(iter/s)": 0.123711 + }, + { + "acc": 0.73935766, + "epoch": 0.6763537631056125, + "grad_norm": 6.75, + "learning_rate": 7.8962726971867e-06, + "loss": 1.02363739, + "memory(GiB)": 302.58, + "step": 120940, + "train_speed(iter/s)": 0.12372 + }, + { + "acc": 0.73520589, + "epoch": 0.6764656125785917, + "grad_norm": 5.40625, + "learning_rate": 7.895518877437698e-06, + "loss": 1.03594685, + "memory(GiB)": 302.58, + "step": 120960, + "train_speed(iter/s)": 0.12373 + }, + { + "acc": 0.74721708, + "epoch": 0.676577462051571, + "grad_norm": 5.90625, + "learning_rate": 7.89476495865277e-06, + "loss": 0.9958559, + "memory(GiB)": 302.58, + "step": 120980, + "train_speed(iter/s)": 0.123739 + }, + { + "acc": 0.75240707, + "epoch": 0.6766893115245503, + "grad_norm": 6.375, + "learning_rate": 7.894010940857701e-06, + "loss": 0.97836466, + "memory(GiB)": 302.58, + "step": 121000, + "train_speed(iter/s)": 0.123749 + }, + { + "acc": 0.73420506, + "epoch": 0.6768011609975295, + "grad_norm": 7.8125, + "learning_rate": 7.893256824078279e-06, + "loss": 1.04098711, + "memory(GiB)": 302.58, + "step": 121020, + "train_speed(iter/s)": 0.123759 + }, + { + "acc": 0.73282442, + "epoch": 0.6769130104705088, + "grad_norm": 8.9375, + "learning_rate": 7.892502608340303e-06, + "loss": 1.04329462, + "memory(GiB)": 302.58, + "step": 121040, + "train_speed(iter/s)": 0.123768 + }, + { + "acc": 0.73522139, + "epoch": 0.677024859943488, + "grad_norm": 6.75, + "learning_rate": 7.891748293669563e-06, + "loss": 1.03454256, + "memory(GiB)": 302.58, + "step": 121060, + "train_speed(iter/s)": 0.123778 + }, + { + "acc": 0.74976339, + "epoch": 0.6771367094164673, + "grad_norm": 6.3125, + "learning_rate": 7.890993880091864e-06, + "loss": 0.98618898, + "memory(GiB)": 302.58, + "step": 121080, + "train_speed(iter/s)": 0.123788 + }, + { + "acc": 0.71685162, + "epoch": 0.6772485588894466, + "grad_norm": 5.25, + "learning_rate": 7.890239367633005e-06, + "loss": 1.13704109, + "memory(GiB)": 302.58, + "step": 121100, + "train_speed(iter/s)": 0.123797 + }, + { + "acc": 0.72915783, + "epoch": 0.6773604083624258, + "grad_norm": 6.625, + "learning_rate": 7.889484756318798e-06, + "loss": 1.07308655, + "memory(GiB)": 302.58, + "step": 121120, + "train_speed(iter/s)": 0.123807 + }, + { + "acc": 0.73427534, + "epoch": 0.6774722578354051, + "grad_norm": 6.125, + "learning_rate": 7.888730046175046e-06, + "loss": 1.04688158, + "memory(GiB)": 302.58, + "step": 121140, + "train_speed(iter/s)": 0.123817 + }, + { + "acc": 0.73162584, + "epoch": 0.6775841073083844, + "grad_norm": 4.71875, + "learning_rate": 7.88797523722757e-06, + "loss": 1.04912224, + "memory(GiB)": 302.58, + "step": 121160, + "train_speed(iter/s)": 0.123827 + }, + { + "acc": 0.74786282, + "epoch": 0.6776959567813636, + "grad_norm": 8.125, + "learning_rate": 7.88722032950218e-06, + "loss": 0.99072514, + "memory(GiB)": 302.58, + "step": 121180, + "train_speed(iter/s)": 0.123837 + }, + { + "acc": 0.74356484, + "epoch": 0.6778078062543429, + "grad_norm": 4.65625, + "learning_rate": 7.886465323024704e-06, + "loss": 0.99907913, + "memory(GiB)": 302.58, + "step": 121200, + "train_speed(iter/s)": 0.123847 + }, + { + "acc": 0.74516373, + "epoch": 0.6779196557273222, + "grad_norm": 6.34375, + "learning_rate": 7.885710217820957e-06, + "loss": 0.99760551, + "memory(GiB)": 302.58, + "step": 121220, + "train_speed(iter/s)": 0.123857 + }, + { + "acc": 0.73632603, + "epoch": 0.6780315052003014, + "grad_norm": 6.0625, + "learning_rate": 7.884955013916773e-06, + "loss": 1.06072321, + "memory(GiB)": 302.58, + "step": 121240, + "train_speed(iter/s)": 0.123866 + }, + { + "acc": 0.74181881, + "epoch": 0.6781433546732807, + "grad_norm": 7.1875, + "learning_rate": 7.884199711337978e-06, + "loss": 0.99931593, + "memory(GiB)": 302.58, + "step": 121260, + "train_speed(iter/s)": 0.123876 + }, + { + "acc": 0.74258366, + "epoch": 0.67825520414626, + "grad_norm": 6.09375, + "learning_rate": 7.883444310110409e-06, + "loss": 1.00723505, + "memory(GiB)": 302.58, + "step": 121280, + "train_speed(iter/s)": 0.123885 + }, + { + "acc": 0.71476054, + "epoch": 0.6783670536192392, + "grad_norm": 5.3125, + "learning_rate": 7.8826888102599e-06, + "loss": 1.14073963, + "memory(GiB)": 302.58, + "step": 121300, + "train_speed(iter/s)": 0.123895 + }, + { + "acc": 0.74932899, + "epoch": 0.6784789030922185, + "grad_norm": 8.125, + "learning_rate": 7.881933211812295e-06, + "loss": 0.98400173, + "memory(GiB)": 302.58, + "step": 121320, + "train_speed(iter/s)": 0.123905 + }, + { + "acc": 0.74253325, + "epoch": 0.6785907525651977, + "grad_norm": 6.5, + "learning_rate": 7.881177514793432e-06, + "loss": 1.00561304, + "memory(GiB)": 302.58, + "step": 121340, + "train_speed(iter/s)": 0.123915 + }, + { + "acc": 0.74759545, + "epoch": 0.678702602038177, + "grad_norm": 7.8125, + "learning_rate": 7.880421719229164e-06, + "loss": 1.01613989, + "memory(GiB)": 302.58, + "step": 121360, + "train_speed(iter/s)": 0.123924 + }, + { + "acc": 0.75550609, + "epoch": 0.6788144515111563, + "grad_norm": 9.6875, + "learning_rate": 7.879665825145339e-06, + "loss": 0.97414618, + "memory(GiB)": 302.58, + "step": 121380, + "train_speed(iter/s)": 0.123933 + }, + { + "acc": 0.73301735, + "epoch": 0.6789263009841355, + "grad_norm": 6.03125, + "learning_rate": 7.878909832567813e-06, + "loss": 1.03773623, + "memory(GiB)": 302.58, + "step": 121400, + "train_speed(iter/s)": 0.123942 + }, + { + "acc": 0.75369411, + "epoch": 0.6790381504571148, + "grad_norm": 7.90625, + "learning_rate": 7.87815374152244e-06, + "loss": 0.94906292, + "memory(GiB)": 302.58, + "step": 121420, + "train_speed(iter/s)": 0.123952 + }, + { + "acc": 0.75305343, + "epoch": 0.6791499999300941, + "grad_norm": 6.3125, + "learning_rate": 7.877397552035086e-06, + "loss": 0.9775363, + "memory(GiB)": 302.58, + "step": 121440, + "train_speed(iter/s)": 0.123961 + }, + { + "acc": 0.75104079, + "epoch": 0.6792618494030733, + "grad_norm": 9.0625, + "learning_rate": 7.87664126413161e-06, + "loss": 0.98515491, + "memory(GiB)": 302.58, + "step": 121460, + "train_speed(iter/s)": 0.123971 + }, + { + "acc": 0.73789725, + "epoch": 0.6793736988760526, + "grad_norm": 7.25, + "learning_rate": 7.87588487783788e-06, + "loss": 1.01675968, + "memory(GiB)": 302.58, + "step": 121480, + "train_speed(iter/s)": 0.12398 + }, + { + "acc": 0.72861414, + "epoch": 0.6794855483490319, + "grad_norm": 7.375, + "learning_rate": 7.87512839317977e-06, + "loss": 1.05881901, + "memory(GiB)": 302.58, + "step": 121500, + "train_speed(iter/s)": 0.123989 + }, + { + "acc": 0.72220182, + "epoch": 0.6795973978220111, + "grad_norm": 6.1875, + "learning_rate": 7.874371810183151e-06, + "loss": 1.12482786, + "memory(GiB)": 302.58, + "step": 121520, + "train_speed(iter/s)": 0.123999 + }, + { + "acc": 0.74477892, + "epoch": 0.6797092472949904, + "grad_norm": 7.34375, + "learning_rate": 7.873615128873901e-06, + "loss": 1.00754757, + "memory(GiB)": 302.58, + "step": 121540, + "train_speed(iter/s)": 0.124009 + }, + { + "acc": 0.74386759, + "epoch": 0.6798210967679696, + "grad_norm": 8.8125, + "learning_rate": 7.872858349277902e-06, + "loss": 1.00346909, + "memory(GiB)": 302.58, + "step": 121560, + "train_speed(iter/s)": 0.124018 + }, + { + "acc": 0.74123106, + "epoch": 0.6799329462409489, + "grad_norm": 6.875, + "learning_rate": 7.872101471421038e-06, + "loss": 1.0169836, + "memory(GiB)": 302.58, + "step": 121580, + "train_speed(iter/s)": 0.124028 + }, + { + "acc": 0.74614196, + "epoch": 0.6800447957139282, + "grad_norm": 7.78125, + "learning_rate": 7.871344495329198e-06, + "loss": 0.98842821, + "memory(GiB)": 302.58, + "step": 121600, + "train_speed(iter/s)": 0.124037 + }, + { + "acc": 0.74981661, + "epoch": 0.6801566451869074, + "grad_norm": 10.0, + "learning_rate": 7.87058742102827e-06, + "loss": 0.99939098, + "memory(GiB)": 302.58, + "step": 121620, + "train_speed(iter/s)": 0.124046 + }, + { + "acc": 0.75920033, + "epoch": 0.6802684946598867, + "grad_norm": 7.25, + "learning_rate": 7.86983024854415e-06, + "loss": 0.9468379, + "memory(GiB)": 302.58, + "step": 121640, + "train_speed(iter/s)": 0.124055 + }, + { + "acc": 0.75108609, + "epoch": 0.680380344132866, + "grad_norm": 6.28125, + "learning_rate": 7.869072977902735e-06, + "loss": 0.96920872, + "memory(GiB)": 302.58, + "step": 121660, + "train_speed(iter/s)": 0.124066 + }, + { + "acc": 0.76540661, + "epoch": 0.6804921936058452, + "grad_norm": 9.875, + "learning_rate": 7.868315609129927e-06, + "loss": 0.90303869, + "memory(GiB)": 302.58, + "step": 121680, + "train_speed(iter/s)": 0.124075 + }, + { + "acc": 0.75087285, + "epoch": 0.6806040430788245, + "grad_norm": 7.09375, + "learning_rate": 7.86755814225163e-06, + "loss": 0.96029968, + "memory(GiB)": 302.58, + "step": 121700, + "train_speed(iter/s)": 0.124085 + }, + { + "acc": 0.75083241, + "epoch": 0.6807158925518038, + "grad_norm": 7.21875, + "learning_rate": 7.866800577293752e-06, + "loss": 0.96538792, + "memory(GiB)": 302.58, + "step": 121720, + "train_speed(iter/s)": 0.124094 + }, + { + "acc": 0.73910756, + "epoch": 0.680827742024783, + "grad_norm": 6.3125, + "learning_rate": 7.866042914282204e-06, + "loss": 1.02493954, + "memory(GiB)": 302.58, + "step": 121740, + "train_speed(iter/s)": 0.124104 + }, + { + "acc": 0.75213623, + "epoch": 0.6809395914977623, + "grad_norm": 10.8125, + "learning_rate": 7.865285153242902e-06, + "loss": 0.96978073, + "memory(GiB)": 302.58, + "step": 121760, + "train_speed(iter/s)": 0.124114 + }, + { + "acc": 0.73323855, + "epoch": 0.6810514409707416, + "grad_norm": 6.875, + "learning_rate": 7.864527294201757e-06, + "loss": 1.03880939, + "memory(GiB)": 302.58, + "step": 121780, + "train_speed(iter/s)": 0.124123 + }, + { + "acc": 0.7418303, + "epoch": 0.6811632904437208, + "grad_norm": 7.34375, + "learning_rate": 7.8637693371847e-06, + "loss": 1.02057791, + "memory(GiB)": 302.58, + "step": 121800, + "train_speed(iter/s)": 0.124132 + }, + { + "acc": 0.73837638, + "epoch": 0.6812751399167001, + "grad_norm": 7.25, + "learning_rate": 7.86301128221765e-06, + "loss": 1.05111952, + "memory(GiB)": 302.58, + "step": 121820, + "train_speed(iter/s)": 0.124141 + }, + { + "acc": 0.73047166, + "epoch": 0.6813869893896793, + "grad_norm": 5.25, + "learning_rate": 7.862253129326536e-06, + "loss": 1.09984674, + "memory(GiB)": 302.58, + "step": 121840, + "train_speed(iter/s)": 0.124151 + }, + { + "acc": 0.73213692, + "epoch": 0.6814988388626586, + "grad_norm": 5.4375, + "learning_rate": 7.861494878537287e-06, + "loss": 1.0634675, + "memory(GiB)": 302.58, + "step": 121860, + "train_speed(iter/s)": 0.12416 + }, + { + "acc": 0.73925934, + "epoch": 0.6816106883356379, + "grad_norm": 10.375, + "learning_rate": 7.86073652987584e-06, + "loss": 1.03348198, + "memory(GiB)": 302.58, + "step": 121880, + "train_speed(iter/s)": 0.12417 + }, + { + "acc": 0.74773436, + "epoch": 0.6817225378086171, + "grad_norm": 9.875, + "learning_rate": 7.859978083368134e-06, + "loss": 0.99028282, + "memory(GiB)": 302.58, + "step": 121900, + "train_speed(iter/s)": 0.124179 + }, + { + "acc": 0.73073249, + "epoch": 0.6818343872815964, + "grad_norm": 7.875, + "learning_rate": 7.859219539040109e-06, + "loss": 1.06421223, + "memory(GiB)": 302.58, + "step": 121920, + "train_speed(iter/s)": 0.124189 + }, + { + "acc": 0.73370838, + "epoch": 0.6819462367545757, + "grad_norm": 6.71875, + "learning_rate": 7.858460896917708e-06, + "loss": 1.0436739, + "memory(GiB)": 302.58, + "step": 121940, + "train_speed(iter/s)": 0.124199 + }, + { + "acc": 0.73138361, + "epoch": 0.6820580862275549, + "grad_norm": 7.84375, + "learning_rate": 7.857702157026883e-06, + "loss": 1.05990896, + "memory(GiB)": 302.58, + "step": 121960, + "train_speed(iter/s)": 0.124208 + }, + { + "acc": 0.74015145, + "epoch": 0.6821699357005342, + "grad_norm": 7.78125, + "learning_rate": 7.85694331939358e-06, + "loss": 1.01206417, + "memory(GiB)": 302.58, + "step": 121980, + "train_speed(iter/s)": 0.124218 + }, + { + "acc": 0.73249087, + "epoch": 0.6822817851735135, + "grad_norm": 5.5, + "learning_rate": 7.856184384043756e-06, + "loss": 1.052139, + "memory(GiB)": 302.58, + "step": 122000, + "train_speed(iter/s)": 0.124228 + }, + { + "epoch": 0.6822817851735135, + "eval_acc": 0.7039395230969739, + "eval_loss": 1.024634599685669, + "eval_runtime": 7493.0885, + "eval_samples_per_second": 10.047, + "eval_steps_per_second": 10.047, + "step": 122000 + }, + { + "acc": 0.74231067, + "epoch": 0.6823936346464927, + "grad_norm": 5.78125, + "learning_rate": 7.85542535100337e-06, + "loss": 1.02680941, + "memory(GiB)": 302.58, + "step": 122020, + "train_speed(iter/s)": 0.12328 + }, + { + "acc": 0.72559094, + "epoch": 0.682505484119472, + "grad_norm": 5.21875, + "learning_rate": 7.854666220298384e-06, + "loss": 1.06812878, + "memory(GiB)": 302.58, + "step": 122040, + "train_speed(iter/s)": 0.12329 + }, + { + "acc": 0.75576568, + "epoch": 0.6826173335924512, + "grad_norm": 8.0, + "learning_rate": 7.853906991954761e-06, + "loss": 0.95746393, + "memory(GiB)": 302.58, + "step": 122060, + "train_speed(iter/s)": 0.123299 + }, + { + "acc": 0.7439342, + "epoch": 0.6827291830654305, + "grad_norm": 8.5625, + "learning_rate": 7.853147665998469e-06, + "loss": 1.00681324, + "memory(GiB)": 302.58, + "step": 122080, + "train_speed(iter/s)": 0.123308 + }, + { + "acc": 0.75015616, + "epoch": 0.6828410325384098, + "grad_norm": 8.0625, + "learning_rate": 7.852388242455478e-06, + "loss": 0.993923, + "memory(GiB)": 302.58, + "step": 122100, + "train_speed(iter/s)": 0.123317 + }, + { + "acc": 0.72543263, + "epoch": 0.682952882011389, + "grad_norm": 4.9375, + "learning_rate": 7.851628721351767e-06, + "loss": 1.07928076, + "memory(GiB)": 302.58, + "step": 122120, + "train_speed(iter/s)": 0.123327 + }, + { + "acc": 0.73019567, + "epoch": 0.6830647314843683, + "grad_norm": 7.375, + "learning_rate": 7.85086910271331e-06, + "loss": 1.06339502, + "memory(GiB)": 302.58, + "step": 122140, + "train_speed(iter/s)": 0.123337 + }, + { + "acc": 0.76137099, + "epoch": 0.6831765809573476, + "grad_norm": 7.8125, + "learning_rate": 7.850109386566092e-06, + "loss": 0.92728014, + "memory(GiB)": 302.58, + "step": 122160, + "train_speed(iter/s)": 0.123347 + }, + { + "acc": 0.74352465, + "epoch": 0.6832884304303268, + "grad_norm": 7.53125, + "learning_rate": 7.849349572936093e-06, + "loss": 0.9961853, + "memory(GiB)": 302.58, + "step": 122180, + "train_speed(iter/s)": 0.123356 + }, + { + "acc": 0.76238179, + "epoch": 0.6834002799033061, + "grad_norm": 7.5, + "learning_rate": 7.848589661849303e-06, + "loss": 0.93537617, + "memory(GiB)": 302.58, + "step": 122200, + "train_speed(iter/s)": 0.123365 + }, + { + "acc": 0.74755473, + "epoch": 0.6835121293762854, + "grad_norm": 6.125, + "learning_rate": 7.847829653331717e-06, + "loss": 0.96640797, + "memory(GiB)": 302.58, + "step": 122220, + "train_speed(iter/s)": 0.123376 + }, + { + "acc": 0.74519525, + "epoch": 0.6836239788492646, + "grad_norm": 5.53125, + "learning_rate": 7.847069547409325e-06, + "loss": 0.99305067, + "memory(GiB)": 302.58, + "step": 122240, + "train_speed(iter/s)": 0.123386 + }, + { + "acc": 0.75114627, + "epoch": 0.6837358283222439, + "grad_norm": 7.4375, + "learning_rate": 7.846309344108127e-06, + "loss": 0.97990084, + "memory(GiB)": 302.58, + "step": 122260, + "train_speed(iter/s)": 0.123395 + }, + { + "acc": 0.74370599, + "epoch": 0.6838476777952232, + "grad_norm": 5.84375, + "learning_rate": 7.845549043454124e-06, + "loss": 1.00090094, + "memory(GiB)": 302.58, + "step": 122280, + "train_speed(iter/s)": 0.123405 + }, + { + "acc": 0.74472632, + "epoch": 0.6839595272682024, + "grad_norm": 7.9375, + "learning_rate": 7.84478864547332e-06, + "loss": 0.99695911, + "memory(GiB)": 302.58, + "step": 122300, + "train_speed(iter/s)": 0.123414 + }, + { + "acc": 0.74437943, + "epoch": 0.6840713767411817, + "grad_norm": 4.78125, + "learning_rate": 7.844028150191725e-06, + "loss": 1.00648012, + "memory(GiB)": 302.58, + "step": 122320, + "train_speed(iter/s)": 0.123423 + }, + { + "acc": 0.74819245, + "epoch": 0.6841832262141609, + "grad_norm": 6.90625, + "learning_rate": 7.843267557635348e-06, + "loss": 0.99355545, + "memory(GiB)": 302.58, + "step": 122340, + "train_speed(iter/s)": 0.123433 + }, + { + "acc": 0.74424663, + "epoch": 0.6842950756871402, + "grad_norm": 7.09375, + "learning_rate": 7.842506867830204e-06, + "loss": 0.99576731, + "memory(GiB)": 302.58, + "step": 122360, + "train_speed(iter/s)": 0.123442 + }, + { + "acc": 0.75859036, + "epoch": 0.6844069251601195, + "grad_norm": 8.0625, + "learning_rate": 7.841746080802312e-06, + "loss": 0.95224895, + "memory(GiB)": 302.58, + "step": 122380, + "train_speed(iter/s)": 0.123452 + }, + { + "acc": 0.74012556, + "epoch": 0.6845187746330987, + "grad_norm": 5.75, + "learning_rate": 7.840985196577694e-06, + "loss": 1.01558142, + "memory(GiB)": 302.58, + "step": 122400, + "train_speed(iter/s)": 0.123462 + }, + { + "acc": 0.75996246, + "epoch": 0.684630624106078, + "grad_norm": 5.59375, + "learning_rate": 7.840224215182375e-06, + "loss": 0.91661272, + "memory(GiB)": 302.58, + "step": 122420, + "train_speed(iter/s)": 0.123471 + }, + { + "acc": 0.74330788, + "epoch": 0.6847424735790573, + "grad_norm": 6.53125, + "learning_rate": 7.83946313664238e-06, + "loss": 1.01451864, + "memory(GiB)": 302.58, + "step": 122440, + "train_speed(iter/s)": 0.123481 + }, + { + "acc": 0.71921482, + "epoch": 0.6848543230520365, + "grad_norm": 8.8125, + "learning_rate": 7.838701960983744e-06, + "loss": 1.12053976, + "memory(GiB)": 302.58, + "step": 122460, + "train_speed(iter/s)": 0.12349 + }, + { + "acc": 0.75022655, + "epoch": 0.6849661725250158, + "grad_norm": 8.0, + "learning_rate": 7.837940688232497e-06, + "loss": 0.98618965, + "memory(GiB)": 302.58, + "step": 122480, + "train_speed(iter/s)": 0.1235 + }, + { + "acc": 0.73789878, + "epoch": 0.6850780219979951, + "grad_norm": 8.0, + "learning_rate": 7.83717931841468e-06, + "loss": 1.0091671, + "memory(GiB)": 302.58, + "step": 122500, + "train_speed(iter/s)": 0.12351 + }, + { + "acc": 0.72992411, + "epoch": 0.6851898714709743, + "grad_norm": 9.75, + "learning_rate": 7.836417851556335e-06, + "loss": 1.05586748, + "memory(GiB)": 302.58, + "step": 122520, + "train_speed(iter/s)": 0.123519 + }, + { + "acc": 0.74175205, + "epoch": 0.6853017209439536, + "grad_norm": 10.1875, + "learning_rate": 7.835656287683507e-06, + "loss": 1.00037374, + "memory(GiB)": 302.58, + "step": 122540, + "train_speed(iter/s)": 0.123529 + }, + { + "acc": 0.73688898, + "epoch": 0.6854135704169328, + "grad_norm": 6.59375, + "learning_rate": 7.834894626822239e-06, + "loss": 1.05710773, + "memory(GiB)": 302.58, + "step": 122560, + "train_speed(iter/s)": 0.123539 + }, + { + "acc": 0.7539042, + "epoch": 0.6855254198899121, + "grad_norm": 6.28125, + "learning_rate": 7.834132868998588e-06, + "loss": 0.96946878, + "memory(GiB)": 302.58, + "step": 122580, + "train_speed(iter/s)": 0.123548 + }, + { + "acc": 0.74507456, + "epoch": 0.6856372693628914, + "grad_norm": 6.6875, + "learning_rate": 7.833371014238604e-06, + "loss": 1.0098074, + "memory(GiB)": 302.58, + "step": 122600, + "train_speed(iter/s)": 0.123558 + }, + { + "acc": 0.73612962, + "epoch": 0.6857491188358706, + "grad_norm": 7.125, + "learning_rate": 7.832609062568348e-06, + "loss": 1.04652452, + "memory(GiB)": 302.58, + "step": 122620, + "train_speed(iter/s)": 0.123568 + }, + { + "acc": 0.72877555, + "epoch": 0.6858609683088499, + "grad_norm": 8.125, + "learning_rate": 7.83184701401388e-06, + "loss": 1.07022696, + "memory(GiB)": 302.58, + "step": 122640, + "train_speed(iter/s)": 0.123578 + }, + { + "acc": 0.75285702, + "epoch": 0.6859728177818292, + "grad_norm": 7.8125, + "learning_rate": 7.831084868601264e-06, + "loss": 0.96382065, + "memory(GiB)": 302.58, + "step": 122660, + "train_speed(iter/s)": 0.123588 + }, + { + "acc": 0.74512949, + "epoch": 0.6860846672548084, + "grad_norm": 10.8125, + "learning_rate": 7.830322626356569e-06, + "loss": 1.01459856, + "memory(GiB)": 302.58, + "step": 122680, + "train_speed(iter/s)": 0.123597 + }, + { + "acc": 0.73885198, + "epoch": 0.6861965167277877, + "grad_norm": 4.875, + "learning_rate": 7.829560287305865e-06, + "loss": 1.00090914, + "memory(GiB)": 302.58, + "step": 122700, + "train_speed(iter/s)": 0.123606 + }, + { + "acc": 0.74469504, + "epoch": 0.686308366200767, + "grad_norm": 7.8125, + "learning_rate": 7.828797851475227e-06, + "loss": 0.98805437, + "memory(GiB)": 302.58, + "step": 122720, + "train_speed(iter/s)": 0.123616 + }, + { + "acc": 0.758004, + "epoch": 0.6864202156737462, + "grad_norm": 8.4375, + "learning_rate": 7.828035318890735e-06, + "loss": 0.93728561, + "memory(GiB)": 302.58, + "step": 122740, + "train_speed(iter/s)": 0.123625 + }, + { + "acc": 0.74527106, + "epoch": 0.6865320651467255, + "grad_norm": 7.4375, + "learning_rate": 7.827272689578464e-06, + "loss": 0.99468374, + "memory(GiB)": 302.58, + "step": 122760, + "train_speed(iter/s)": 0.123635 + }, + { + "acc": 0.74094167, + "epoch": 0.6866439146197048, + "grad_norm": 5.8125, + "learning_rate": 7.826509963564504e-06, + "loss": 1.01459608, + "memory(GiB)": 302.58, + "step": 122780, + "train_speed(iter/s)": 0.123644 + }, + { + "acc": 0.73853874, + "epoch": 0.686755764092684, + "grad_norm": 6.125, + "learning_rate": 7.825747140874939e-06, + "loss": 1.01531458, + "memory(GiB)": 302.58, + "step": 122800, + "train_speed(iter/s)": 0.123654 + }, + { + "acc": 0.75569997, + "epoch": 0.6868676135656633, + "grad_norm": 6.40625, + "learning_rate": 7.824984221535863e-06, + "loss": 0.95146236, + "memory(GiB)": 302.58, + "step": 122820, + "train_speed(iter/s)": 0.123663 + }, + { + "acc": 0.72501483, + "epoch": 0.6869794630386425, + "grad_norm": 10.5, + "learning_rate": 7.82422120557337e-06, + "loss": 1.07819719, + "memory(GiB)": 302.58, + "step": 122840, + "train_speed(iter/s)": 0.123673 + }, + { + "acc": 0.74959068, + "epoch": 0.6870913125116218, + "grad_norm": 7.84375, + "learning_rate": 7.823458093013554e-06, + "loss": 0.98872051, + "memory(GiB)": 302.58, + "step": 122860, + "train_speed(iter/s)": 0.123682 + }, + { + "acc": 0.74602227, + "epoch": 0.6872031619846011, + "grad_norm": 5.59375, + "learning_rate": 7.82269488388252e-06, + "loss": 1.00646811, + "memory(GiB)": 302.58, + "step": 122880, + "train_speed(iter/s)": 0.123691 + }, + { + "acc": 0.73876209, + "epoch": 0.6873150114575803, + "grad_norm": 6.09375, + "learning_rate": 7.821931578206369e-06, + "loss": 1.02157631, + "memory(GiB)": 302.58, + "step": 122900, + "train_speed(iter/s)": 0.123701 + }, + { + "acc": 0.72926822, + "epoch": 0.6874268609305596, + "grad_norm": 7.96875, + "learning_rate": 7.821168176011212e-06, + "loss": 1.08841105, + "memory(GiB)": 302.58, + "step": 122920, + "train_speed(iter/s)": 0.123711 + }, + { + "acc": 0.74165964, + "epoch": 0.6875387104035389, + "grad_norm": 10.125, + "learning_rate": 7.820404677323155e-06, + "loss": 1.01588545, + "memory(GiB)": 302.58, + "step": 122940, + "train_speed(iter/s)": 0.123721 + }, + { + "acc": 0.75122051, + "epoch": 0.6876505598765181, + "grad_norm": 6.65625, + "learning_rate": 7.819641082168316e-06, + "loss": 0.97799292, + "memory(GiB)": 302.58, + "step": 122960, + "train_speed(iter/s)": 0.12373 + }, + { + "acc": 0.7478302, + "epoch": 0.6877624093494974, + "grad_norm": 7.4375, + "learning_rate": 7.818877390572812e-06, + "loss": 0.96847878, + "memory(GiB)": 302.58, + "step": 122980, + "train_speed(iter/s)": 0.12374 + }, + { + "acc": 0.73088121, + "epoch": 0.6878742588224767, + "grad_norm": 6.59375, + "learning_rate": 7.818113602562762e-06, + "loss": 1.06930256, + "memory(GiB)": 302.58, + "step": 123000, + "train_speed(iter/s)": 0.123749 + }, + { + "acc": 0.74599757, + "epoch": 0.6879861082954559, + "grad_norm": 8.5, + "learning_rate": 7.81734971816429e-06, + "loss": 0.9897788, + "memory(GiB)": 302.58, + "step": 123020, + "train_speed(iter/s)": 0.123759 + }, + { + "acc": 0.73926802, + "epoch": 0.6880979577684352, + "grad_norm": 5.8125, + "learning_rate": 7.816585737403526e-06, + "loss": 1.02051535, + "memory(GiB)": 302.58, + "step": 123040, + "train_speed(iter/s)": 0.123769 + }, + { + "acc": 0.75709581, + "epoch": 0.6882098072414145, + "grad_norm": 6.28125, + "learning_rate": 7.815821660306597e-06, + "loss": 0.95856237, + "memory(GiB)": 302.58, + "step": 123060, + "train_speed(iter/s)": 0.123778 + }, + { + "acc": 0.75383658, + "epoch": 0.6883216567143937, + "grad_norm": 9.0, + "learning_rate": 7.81505748689964e-06, + "loss": 0.95391216, + "memory(GiB)": 302.58, + "step": 123080, + "train_speed(iter/s)": 0.123787 + }, + { + "acc": 0.73923988, + "epoch": 0.688433506187373, + "grad_norm": 7.71875, + "learning_rate": 7.814293217208789e-06, + "loss": 1.05094128, + "memory(GiB)": 302.58, + "step": 123100, + "train_speed(iter/s)": 0.123797 + }, + { + "acc": 0.75022011, + "epoch": 0.6885453556603522, + "grad_norm": 8.6875, + "learning_rate": 7.813528851260188e-06, + "loss": 0.98118677, + "memory(GiB)": 302.58, + "step": 123120, + "train_speed(iter/s)": 0.123806 + }, + { + "acc": 0.75096974, + "epoch": 0.6886572051333315, + "grad_norm": 7.0625, + "learning_rate": 7.812764389079977e-06, + "loss": 0.98053417, + "memory(GiB)": 302.58, + "step": 123140, + "train_speed(iter/s)": 0.123816 + }, + { + "acc": 0.72398925, + "epoch": 0.6887690546063109, + "grad_norm": 6.78125, + "learning_rate": 7.811999830694306e-06, + "loss": 1.09718971, + "memory(GiB)": 302.58, + "step": 123160, + "train_speed(iter/s)": 0.123825 + }, + { + "acc": 0.72920609, + "epoch": 0.6888809040792901, + "grad_norm": 14.125, + "learning_rate": 7.811235176129324e-06, + "loss": 1.07685862, + "memory(GiB)": 302.58, + "step": 123180, + "train_speed(iter/s)": 0.123835 + }, + { + "acc": 0.74103112, + "epoch": 0.6889927535522694, + "grad_norm": 7.125, + "learning_rate": 7.810470425411183e-06, + "loss": 1.02504225, + "memory(GiB)": 302.58, + "step": 123200, + "train_speed(iter/s)": 0.123843 + }, + { + "acc": 0.72339759, + "epoch": 0.6891046030252487, + "grad_norm": 4.59375, + "learning_rate": 7.809705578566044e-06, + "loss": 1.09397945, + "memory(GiB)": 302.58, + "step": 123220, + "train_speed(iter/s)": 0.123853 + }, + { + "acc": 0.74951401, + "epoch": 0.6892164524982279, + "grad_norm": 9.0625, + "learning_rate": 7.808940635620063e-06, + "loss": 0.97352571, + "memory(GiB)": 302.58, + "step": 123240, + "train_speed(iter/s)": 0.123863 + }, + { + "acc": 0.71787252, + "epoch": 0.6893283019712072, + "grad_norm": 7.34375, + "learning_rate": 7.808175596599407e-06, + "loss": 1.11036339, + "memory(GiB)": 302.58, + "step": 123260, + "train_speed(iter/s)": 0.123872 + }, + { + "acc": 0.73727942, + "epoch": 0.6894401514441865, + "grad_norm": 4.65625, + "learning_rate": 7.80741046153024e-06, + "loss": 1.04844666, + "memory(GiB)": 302.58, + "step": 123280, + "train_speed(iter/s)": 0.123881 + }, + { + "acc": 0.76466098, + "epoch": 0.6895520009171657, + "grad_norm": 8.625, + "learning_rate": 7.806645230438734e-06, + "loss": 0.93737698, + "memory(GiB)": 302.58, + "step": 123300, + "train_speed(iter/s)": 0.12389 + }, + { + "acc": 0.72957754, + "epoch": 0.689663850390145, + "grad_norm": 6.40625, + "learning_rate": 7.805879903351062e-06, + "loss": 1.07038651, + "memory(GiB)": 302.58, + "step": 123320, + "train_speed(iter/s)": 0.123899 + }, + { + "acc": 0.72659435, + "epoch": 0.6897756998631243, + "grad_norm": 6.875, + "learning_rate": 7.805114480293398e-06, + "loss": 1.08202248, + "memory(GiB)": 302.58, + "step": 123340, + "train_speed(iter/s)": 0.123909 + }, + { + "acc": 0.75381532, + "epoch": 0.6898875493361035, + "grad_norm": 8.5625, + "learning_rate": 7.804348961291926e-06, + "loss": 0.96376266, + "memory(GiB)": 302.58, + "step": 123360, + "train_speed(iter/s)": 0.123919 + }, + { + "acc": 0.73951797, + "epoch": 0.6899993988090828, + "grad_norm": 8.8125, + "learning_rate": 7.803583346372825e-06, + "loss": 1.03816042, + "memory(GiB)": 302.58, + "step": 123380, + "train_speed(iter/s)": 0.123928 + }, + { + "acc": 0.76392522, + "epoch": 0.690111248282062, + "grad_norm": 5.40625, + "learning_rate": 7.802817635562286e-06, + "loss": 0.91584206, + "memory(GiB)": 302.58, + "step": 123400, + "train_speed(iter/s)": 0.123936 + }, + { + "acc": 0.74759655, + "epoch": 0.6902230977550413, + "grad_norm": 6.3125, + "learning_rate": 7.802051828886495e-06, + "loss": 0.98341646, + "memory(GiB)": 302.58, + "step": 123420, + "train_speed(iter/s)": 0.123945 + }, + { + "acc": 0.74488444, + "epoch": 0.6903349472280206, + "grad_norm": 7.0, + "learning_rate": 7.801285926371649e-06, + "loss": 0.99937267, + "memory(GiB)": 302.58, + "step": 123440, + "train_speed(iter/s)": 0.123955 + }, + { + "acc": 0.73234382, + "epoch": 0.6904467967009998, + "grad_norm": 7.09375, + "learning_rate": 7.800519928043939e-06, + "loss": 1.05664406, + "memory(GiB)": 302.58, + "step": 123460, + "train_speed(iter/s)": 0.123965 + }, + { + "acc": 0.73086028, + "epoch": 0.6905586461739791, + "grad_norm": 5.5625, + "learning_rate": 7.799753833929567e-06, + "loss": 1.05754223, + "memory(GiB)": 302.58, + "step": 123480, + "train_speed(iter/s)": 0.123974 + }, + { + "acc": 0.7473969, + "epoch": 0.6906704956469584, + "grad_norm": 6.21875, + "learning_rate": 7.798987644054736e-06, + "loss": 0.9947216, + "memory(GiB)": 302.58, + "step": 123500, + "train_speed(iter/s)": 0.123983 + }, + { + "acc": 0.73560529, + "epoch": 0.6907823451199376, + "grad_norm": 6.09375, + "learning_rate": 7.798221358445653e-06, + "loss": 1.04283047, + "memory(GiB)": 302.58, + "step": 123520, + "train_speed(iter/s)": 0.123992 + }, + { + "acc": 0.74884953, + "epoch": 0.6908941945929169, + "grad_norm": 6.21875, + "learning_rate": 7.797454977128526e-06, + "loss": 0.98554325, + "memory(GiB)": 302.58, + "step": 123540, + "train_speed(iter/s)": 0.124002 + }, + { + "acc": 0.75035114, + "epoch": 0.6910060440658962, + "grad_norm": 6.09375, + "learning_rate": 7.79668850012957e-06, + "loss": 0.94481649, + "memory(GiB)": 302.58, + "step": 123560, + "train_speed(iter/s)": 0.124012 + }, + { + "acc": 0.74013734, + "epoch": 0.6911178935388754, + "grad_norm": 7.75, + "learning_rate": 7.795921927474999e-06, + "loss": 1.03958416, + "memory(GiB)": 302.58, + "step": 123580, + "train_speed(iter/s)": 0.124022 + }, + { + "acc": 0.73521957, + "epoch": 0.6912297430118547, + "grad_norm": 8.5625, + "learning_rate": 7.795155259191031e-06, + "loss": 1.0364316, + "memory(GiB)": 302.58, + "step": 123600, + "train_speed(iter/s)": 0.124031 + }, + { + "acc": 0.75382276, + "epoch": 0.691341592484834, + "grad_norm": 6.40625, + "learning_rate": 7.794388495303891e-06, + "loss": 0.98521318, + "memory(GiB)": 302.58, + "step": 123620, + "train_speed(iter/s)": 0.124041 + }, + { + "acc": 0.73064966, + "epoch": 0.6914534419578132, + "grad_norm": 5.84375, + "learning_rate": 7.793621635839804e-06, + "loss": 1.06686668, + "memory(GiB)": 302.58, + "step": 123640, + "train_speed(iter/s)": 0.12405 + }, + { + "acc": 0.74199972, + "epoch": 0.6915652914307925, + "grad_norm": 7.03125, + "learning_rate": 7.792854680824997e-06, + "loss": 1.01094379, + "memory(GiB)": 302.58, + "step": 123660, + "train_speed(iter/s)": 0.12406 + }, + { + "acc": 0.73813014, + "epoch": 0.6916771409037717, + "grad_norm": 8.5625, + "learning_rate": 7.792087630285706e-06, + "loss": 1.03812246, + "memory(GiB)": 302.58, + "step": 123680, + "train_speed(iter/s)": 0.12407 + }, + { + "acc": 0.75909147, + "epoch": 0.691788990376751, + "grad_norm": 9.3125, + "learning_rate": 7.791320484248164e-06, + "loss": 0.92692461, + "memory(GiB)": 302.58, + "step": 123700, + "train_speed(iter/s)": 0.124079 + }, + { + "acc": 0.74618087, + "epoch": 0.6919008398497303, + "grad_norm": 7.25, + "learning_rate": 7.790553242738608e-06, + "loss": 1.002386, + "memory(GiB)": 302.58, + "step": 123720, + "train_speed(iter/s)": 0.124089 + }, + { + "acc": 0.72736444, + "epoch": 0.6920126893227095, + "grad_norm": 7.1875, + "learning_rate": 7.789785905783285e-06, + "loss": 1.07557163, + "memory(GiB)": 302.58, + "step": 123740, + "train_speed(iter/s)": 0.124098 + }, + { + "acc": 0.72855182, + "epoch": 0.6921245387956888, + "grad_norm": 7.375, + "learning_rate": 7.78901847340844e-06, + "loss": 1.0670105, + "memory(GiB)": 302.58, + "step": 123760, + "train_speed(iter/s)": 0.124108 + }, + { + "acc": 0.7290307, + "epoch": 0.6922363882686681, + "grad_norm": 6.21875, + "learning_rate": 7.788250945640314e-06, + "loss": 1.0746542, + "memory(GiB)": 302.58, + "step": 123780, + "train_speed(iter/s)": 0.124117 + }, + { + "acc": 0.73185453, + "epoch": 0.6923482377416473, + "grad_norm": 6.8125, + "learning_rate": 7.787483322505167e-06, + "loss": 1.07849331, + "memory(GiB)": 302.58, + "step": 123800, + "train_speed(iter/s)": 0.124127 + }, + { + "acc": 0.7355947, + "epoch": 0.6924600872146266, + "grad_norm": 6.40625, + "learning_rate": 7.786715604029253e-06, + "loss": 1.04019241, + "memory(GiB)": 302.58, + "step": 123820, + "train_speed(iter/s)": 0.124136 + }, + { + "acc": 0.72885294, + "epoch": 0.6925719366876059, + "grad_norm": 7.0625, + "learning_rate": 7.785947790238829e-06, + "loss": 1.07709494, + "memory(GiB)": 302.58, + "step": 123840, + "train_speed(iter/s)": 0.124145 + }, + { + "acc": 0.74415379, + "epoch": 0.6926837861605851, + "grad_norm": 7.6875, + "learning_rate": 7.785179881160154e-06, + "loss": 0.9870574, + "memory(GiB)": 302.58, + "step": 123860, + "train_speed(iter/s)": 0.124154 + }, + { + "acc": 0.74227109, + "epoch": 0.6927956356335644, + "grad_norm": 5.625, + "learning_rate": 7.784411876819496e-06, + "loss": 0.99837542, + "memory(GiB)": 302.58, + "step": 123880, + "train_speed(iter/s)": 0.124164 + }, + { + "acc": 0.72926655, + "epoch": 0.6929074851065437, + "grad_norm": 6.0, + "learning_rate": 7.783643777243123e-06, + "loss": 1.05606575, + "memory(GiB)": 302.58, + "step": 123900, + "train_speed(iter/s)": 0.124174 + }, + { + "acc": 0.7599936, + "epoch": 0.6930193345795229, + "grad_norm": 7.25, + "learning_rate": 7.782875582457307e-06, + "loss": 0.94342127, + "memory(GiB)": 302.58, + "step": 123920, + "train_speed(iter/s)": 0.124184 + }, + { + "acc": 0.72222147, + "epoch": 0.6931311840525022, + "grad_norm": 8.9375, + "learning_rate": 7.78210729248832e-06, + "loss": 1.10255699, + "memory(GiB)": 302.58, + "step": 123940, + "train_speed(iter/s)": 0.124194 + }, + { + "acc": 0.73672528, + "epoch": 0.6932430335254814, + "grad_norm": 4.9375, + "learning_rate": 7.781338907362444e-06, + "loss": 1.0307785, + "memory(GiB)": 302.58, + "step": 123960, + "train_speed(iter/s)": 0.124203 + }, + { + "acc": 0.73112192, + "epoch": 0.6933548829984607, + "grad_norm": 7.90625, + "learning_rate": 7.780570427105956e-06, + "loss": 1.06821375, + "memory(GiB)": 302.58, + "step": 123980, + "train_speed(iter/s)": 0.124213 + }, + { + "acc": 0.73299627, + "epoch": 0.69346673247144, + "grad_norm": 7.71875, + "learning_rate": 7.779801851745145e-06, + "loss": 1.07172995, + "memory(GiB)": 302.58, + "step": 124000, + "train_speed(iter/s)": 0.124223 + }, + { + "epoch": 0.69346673247144, + "eval_acc": 0.7040800682346396, + "eval_loss": 1.02397882938385, + "eval_runtime": 7499.0157, + "eval_samples_per_second": 10.039, + "eval_steps_per_second": 10.039, + "step": 124000 + }, + { + "acc": 0.76216578, + "epoch": 0.6935785819444192, + "grad_norm": 8.125, + "learning_rate": 7.779033181306294e-06, + "loss": 0.91315289, + "memory(GiB)": 302.58, + "step": 124020, + "train_speed(iter/s)": 0.123289 + }, + { + "acc": 0.74384127, + "epoch": 0.6936904314173985, + "grad_norm": 5.5625, + "learning_rate": 7.778264415815697e-06, + "loss": 1.00527, + "memory(GiB)": 302.58, + "step": 124040, + "train_speed(iter/s)": 0.123299 + }, + { + "acc": 0.71170931, + "epoch": 0.6938022808903778, + "grad_norm": 5.15625, + "learning_rate": 7.777495555299648e-06, + "loss": 1.13288555, + "memory(GiB)": 302.58, + "step": 124060, + "train_speed(iter/s)": 0.123308 + }, + { + "acc": 0.75752273, + "epoch": 0.693914130363357, + "grad_norm": 6.8125, + "learning_rate": 7.776726599784442e-06, + "loss": 0.95785265, + "memory(GiB)": 302.58, + "step": 124080, + "train_speed(iter/s)": 0.123317 + }, + { + "acc": 0.74022412, + "epoch": 0.6940259798363363, + "grad_norm": 6.625, + "learning_rate": 7.775957549296383e-06, + "loss": 1.02189493, + "memory(GiB)": 302.58, + "step": 124100, + "train_speed(iter/s)": 0.123327 + }, + { + "acc": 0.74697385, + "epoch": 0.6941378293093156, + "grad_norm": 8.8125, + "learning_rate": 7.775188403861772e-06, + "loss": 0.9988018, + "memory(GiB)": 302.58, + "step": 124120, + "train_speed(iter/s)": 0.123337 + }, + { + "acc": 0.75138311, + "epoch": 0.6942496787822948, + "grad_norm": 7.5, + "learning_rate": 7.774419163506919e-06, + "loss": 0.97640162, + "memory(GiB)": 302.58, + "step": 124140, + "train_speed(iter/s)": 0.123346 + }, + { + "acc": 0.72804923, + "epoch": 0.6943615282552741, + "grad_norm": 8.3125, + "learning_rate": 7.773649828258134e-06, + "loss": 1.07381163, + "memory(GiB)": 302.58, + "step": 124160, + "train_speed(iter/s)": 0.123356 + }, + { + "acc": 0.75005436, + "epoch": 0.6944733777282533, + "grad_norm": 9.5625, + "learning_rate": 7.772880398141728e-06, + "loss": 0.98396139, + "memory(GiB)": 302.58, + "step": 124180, + "train_speed(iter/s)": 0.123365 + }, + { + "acc": 0.74613786, + "epoch": 0.6945852272012326, + "grad_norm": 8.125, + "learning_rate": 7.77211087318402e-06, + "loss": 1.01089621, + "memory(GiB)": 302.58, + "step": 124200, + "train_speed(iter/s)": 0.123374 + }, + { + "acc": 0.71754332, + "epoch": 0.6946970766742119, + "grad_norm": 7.5, + "learning_rate": 7.771341253411331e-06, + "loss": 1.12765369, + "memory(GiB)": 302.58, + "step": 124220, + "train_speed(iter/s)": 0.123384 + }, + { + "acc": 0.74172301, + "epoch": 0.6948089261471911, + "grad_norm": 5.8125, + "learning_rate": 7.770571538849983e-06, + "loss": 0.99316235, + "memory(GiB)": 302.58, + "step": 124240, + "train_speed(iter/s)": 0.123393 + }, + { + "acc": 0.74178996, + "epoch": 0.6949207756201704, + "grad_norm": 5.25, + "learning_rate": 7.769801729526303e-06, + "loss": 0.99715548, + "memory(GiB)": 302.58, + "step": 124260, + "train_speed(iter/s)": 0.123403 + }, + { + "acc": 0.74934349, + "epoch": 0.6950326250931497, + "grad_norm": 10.125, + "learning_rate": 7.76903182546662e-06, + "loss": 0.96500349, + "memory(GiB)": 302.58, + "step": 124280, + "train_speed(iter/s)": 0.123412 + }, + { + "acc": 0.71922588, + "epoch": 0.6951444745661289, + "grad_norm": 6.3125, + "learning_rate": 7.76826182669727e-06, + "loss": 1.11808958, + "memory(GiB)": 302.58, + "step": 124300, + "train_speed(iter/s)": 0.123422 + }, + { + "acc": 0.73367047, + "epoch": 0.6952563240391082, + "grad_norm": 6.90625, + "learning_rate": 7.767491733244588e-06, + "loss": 1.0615942, + "memory(GiB)": 302.58, + "step": 124320, + "train_speed(iter/s)": 0.123431 + }, + { + "acc": 0.73759799, + "epoch": 0.6953681735120875, + "grad_norm": 8.3125, + "learning_rate": 7.766721545134911e-06, + "loss": 1.05713663, + "memory(GiB)": 302.58, + "step": 124340, + "train_speed(iter/s)": 0.123439 + }, + { + "acc": 0.72382903, + "epoch": 0.6954800229850667, + "grad_norm": 5.0, + "learning_rate": 7.765951262394586e-06, + "loss": 1.09613008, + "memory(GiB)": 302.58, + "step": 124360, + "train_speed(iter/s)": 0.123449 + }, + { + "acc": 0.72773561, + "epoch": 0.695591872458046, + "grad_norm": 6.65625, + "learning_rate": 7.765180885049956e-06, + "loss": 1.06979952, + "memory(GiB)": 302.58, + "step": 124380, + "train_speed(iter/s)": 0.123457 + }, + { + "acc": 0.76052914, + "epoch": 0.6957037219310253, + "grad_norm": 5.46875, + "learning_rate": 7.764410413127373e-06, + "loss": 0.9340764, + "memory(GiB)": 302.58, + "step": 124400, + "train_speed(iter/s)": 0.123467 + }, + { + "acc": 0.73381948, + "epoch": 0.6958155714040045, + "grad_norm": 8.125, + "learning_rate": 7.763639846653186e-06, + "loss": 1.06098671, + "memory(GiB)": 302.58, + "step": 124420, + "train_speed(iter/s)": 0.123477 + }, + { + "acc": 0.74665041, + "epoch": 0.6959274208769838, + "grad_norm": 7.53125, + "learning_rate": 7.762869185653754e-06, + "loss": 1.00091238, + "memory(GiB)": 302.58, + "step": 124440, + "train_speed(iter/s)": 0.123487 + }, + { + "acc": 0.75157609, + "epoch": 0.696039270349963, + "grad_norm": 7.6875, + "learning_rate": 7.762098430155436e-06, + "loss": 0.9817131, + "memory(GiB)": 302.58, + "step": 124460, + "train_speed(iter/s)": 0.123496 + }, + { + "acc": 0.73065524, + "epoch": 0.6961511198229423, + "grad_norm": 7.03125, + "learning_rate": 7.761327580184594e-06, + "loss": 1.07431259, + "memory(GiB)": 302.58, + "step": 124480, + "train_speed(iter/s)": 0.123506 + }, + { + "acc": 0.73960485, + "epoch": 0.6962629692959216, + "grad_norm": 5.71875, + "learning_rate": 7.760556635767592e-06, + "loss": 1.01739216, + "memory(GiB)": 302.58, + "step": 124500, + "train_speed(iter/s)": 0.123515 + }, + { + "acc": 0.74414396, + "epoch": 0.6963748187689008, + "grad_norm": 5.78125, + "learning_rate": 7.759785596930797e-06, + "loss": 0.98997231, + "memory(GiB)": 302.58, + "step": 124520, + "train_speed(iter/s)": 0.123525 + }, + { + "acc": 0.71882634, + "epoch": 0.6964866682418801, + "grad_norm": 4.84375, + "learning_rate": 7.759014463700585e-06, + "loss": 1.12794418, + "memory(GiB)": 302.58, + "step": 124540, + "train_speed(iter/s)": 0.123534 + }, + { + "acc": 0.74187837, + "epoch": 0.6965985177148594, + "grad_norm": 6.90625, + "learning_rate": 7.758243236103331e-06, + "loss": 0.99912205, + "memory(GiB)": 302.58, + "step": 124560, + "train_speed(iter/s)": 0.123544 + }, + { + "acc": 0.74131875, + "epoch": 0.6967103671878386, + "grad_norm": 5.65625, + "learning_rate": 7.757471914165411e-06, + "loss": 1.00169001, + "memory(GiB)": 302.58, + "step": 124580, + "train_speed(iter/s)": 0.123553 + }, + { + "acc": 0.73168616, + "epoch": 0.6968222166608179, + "grad_norm": 8.125, + "learning_rate": 7.756700497913208e-06, + "loss": 1.10156212, + "memory(GiB)": 302.58, + "step": 124600, + "train_speed(iter/s)": 0.123563 + }, + { + "acc": 0.71073585, + "epoch": 0.6969340661337972, + "grad_norm": 7.375, + "learning_rate": 7.755928987373108e-06, + "loss": 1.15366316, + "memory(GiB)": 302.58, + "step": 124620, + "train_speed(iter/s)": 0.123572 + }, + { + "acc": 0.73039589, + "epoch": 0.6970459156067764, + "grad_norm": 7.625, + "learning_rate": 7.755157382571496e-06, + "loss": 1.09199877, + "memory(GiB)": 302.58, + "step": 124640, + "train_speed(iter/s)": 0.123581 + }, + { + "acc": 0.72604022, + "epoch": 0.6971577650797557, + "grad_norm": 6.65625, + "learning_rate": 7.754385683534767e-06, + "loss": 1.08431864, + "memory(GiB)": 302.58, + "step": 124660, + "train_speed(iter/s)": 0.12359 + }, + { + "acc": 0.75173736, + "epoch": 0.697269614552735, + "grad_norm": 6.65625, + "learning_rate": 7.75361389028931e-06, + "loss": 0.97874889, + "memory(GiB)": 302.58, + "step": 124680, + "train_speed(iter/s)": 0.123599 + }, + { + "acc": 0.72540698, + "epoch": 0.6973814640257142, + "grad_norm": 4.96875, + "learning_rate": 7.752842002861531e-06, + "loss": 1.09186182, + "memory(GiB)": 302.58, + "step": 124700, + "train_speed(iter/s)": 0.123608 + }, + { + "acc": 0.73631778, + "epoch": 0.6974933134986935, + "grad_norm": 7.84375, + "learning_rate": 7.752070021277825e-06, + "loss": 1.05309706, + "memory(GiB)": 302.58, + "step": 124720, + "train_speed(iter/s)": 0.123617 + }, + { + "acc": 0.74542689, + "epoch": 0.6976051629716727, + "grad_norm": 8.625, + "learning_rate": 7.751297945564597e-06, + "loss": 0.9916193, + "memory(GiB)": 302.58, + "step": 124740, + "train_speed(iter/s)": 0.123626 + }, + { + "acc": 0.74565668, + "epoch": 0.697717012444652, + "grad_norm": 5.65625, + "learning_rate": 7.750525775748255e-06, + "loss": 0.98956652, + "memory(GiB)": 302.58, + "step": 124760, + "train_speed(iter/s)": 0.123636 + }, + { + "acc": 0.74065342, + "epoch": 0.6978288619176313, + "grad_norm": 6.28125, + "learning_rate": 7.749753511855211e-06, + "loss": 1.04525299, + "memory(GiB)": 302.58, + "step": 124780, + "train_speed(iter/s)": 0.123644 + }, + { + "acc": 0.746384, + "epoch": 0.6979407113906105, + "grad_norm": 6.5, + "learning_rate": 7.748981153911876e-06, + "loss": 0.97744217, + "memory(GiB)": 302.58, + "step": 124800, + "train_speed(iter/s)": 0.123654 + }, + { + "acc": 0.74472308, + "epoch": 0.6980525608635898, + "grad_norm": 8.3125, + "learning_rate": 7.74820870194467e-06, + "loss": 1.00024824, + "memory(GiB)": 302.58, + "step": 124820, + "train_speed(iter/s)": 0.123663 + }, + { + "acc": 0.74310236, + "epoch": 0.6981644103365691, + "grad_norm": 8.0, + "learning_rate": 7.747436155980013e-06, + "loss": 1.010783, + "memory(GiB)": 302.58, + "step": 124840, + "train_speed(iter/s)": 0.123673 + }, + { + "acc": 0.74668474, + "epoch": 0.6982762598095483, + "grad_norm": 6.90625, + "learning_rate": 7.746663516044325e-06, + "loss": 0.99511433, + "memory(GiB)": 302.58, + "step": 124860, + "train_speed(iter/s)": 0.123683 + }, + { + "acc": 0.73656282, + "epoch": 0.6983881092825276, + "grad_norm": 7.3125, + "learning_rate": 7.745890782164036e-06, + "loss": 1.04639502, + "memory(GiB)": 302.58, + "step": 124880, + "train_speed(iter/s)": 0.123693 + }, + { + "acc": 0.73155766, + "epoch": 0.6984999587555069, + "grad_norm": 8.4375, + "learning_rate": 7.745117954365577e-06, + "loss": 1.05401278, + "memory(GiB)": 302.58, + "step": 124900, + "train_speed(iter/s)": 0.123702 + }, + { + "acc": 0.74526854, + "epoch": 0.6986118082284861, + "grad_norm": 8.375, + "learning_rate": 7.744345032675378e-06, + "loss": 1.00856218, + "memory(GiB)": 302.58, + "step": 124920, + "train_speed(iter/s)": 0.123711 + }, + { + "acc": 0.73514915, + "epoch": 0.6987236577014654, + "grad_norm": 6.0, + "learning_rate": 7.743572017119877e-06, + "loss": 1.05255404, + "memory(GiB)": 302.58, + "step": 124940, + "train_speed(iter/s)": 0.123721 + }, + { + "acc": 0.75258007, + "epoch": 0.6988355071744446, + "grad_norm": 6.90625, + "learning_rate": 7.742798907725514e-06, + "loss": 0.9731988, + "memory(GiB)": 302.58, + "step": 124960, + "train_speed(iter/s)": 0.123731 + }, + { + "acc": 0.74394927, + "epoch": 0.6989473566474239, + "grad_norm": 4.53125, + "learning_rate": 7.74202570451873e-06, + "loss": 1.00438242, + "memory(GiB)": 302.58, + "step": 124980, + "train_speed(iter/s)": 0.12374 + }, + { + "acc": 0.73597283, + "epoch": 0.6990592061204032, + "grad_norm": 9.6875, + "learning_rate": 7.741252407525974e-06, + "loss": 1.05237875, + "memory(GiB)": 302.58, + "step": 125000, + "train_speed(iter/s)": 0.12375 + }, + { + "acc": 0.75508618, + "epoch": 0.6991710555933824, + "grad_norm": 6.0, + "learning_rate": 7.740479016773693e-06, + "loss": 0.98280725, + "memory(GiB)": 302.58, + "step": 125020, + "train_speed(iter/s)": 0.123759 + }, + { + "acc": 0.73474841, + "epoch": 0.6992829050663617, + "grad_norm": 5.0625, + "learning_rate": 7.739705532288337e-06, + "loss": 1.03474646, + "memory(GiB)": 302.58, + "step": 125040, + "train_speed(iter/s)": 0.123769 + }, + { + "acc": 0.74626727, + "epoch": 0.699394754539341, + "grad_norm": 8.4375, + "learning_rate": 7.738931954096366e-06, + "loss": 0.99414692, + "memory(GiB)": 302.58, + "step": 125060, + "train_speed(iter/s)": 0.123778 + }, + { + "acc": 0.71985054, + "epoch": 0.6995066040123202, + "grad_norm": 7.4375, + "learning_rate": 7.73815828222424e-06, + "loss": 1.11870127, + "memory(GiB)": 302.58, + "step": 125080, + "train_speed(iter/s)": 0.123787 + }, + { + "acc": 0.7491221, + "epoch": 0.6996184534852995, + "grad_norm": 5.59375, + "learning_rate": 7.737384516698415e-06, + "loss": 0.99592762, + "memory(GiB)": 302.58, + "step": 125100, + "train_speed(iter/s)": 0.123796 + }, + { + "acc": 0.74900084, + "epoch": 0.6997303029582788, + "grad_norm": 10.9375, + "learning_rate": 7.73661065754536e-06, + "loss": 0.99077473, + "memory(GiB)": 302.58, + "step": 125120, + "train_speed(iter/s)": 0.123806 + }, + { + "acc": 0.76415968, + "epoch": 0.699842152431258, + "grad_norm": 8.5625, + "learning_rate": 7.735836704791545e-06, + "loss": 0.92837458, + "memory(GiB)": 302.58, + "step": 125140, + "train_speed(iter/s)": 0.123815 + }, + { + "acc": 0.73561449, + "epoch": 0.6999540019042373, + "grad_norm": 7.78125, + "learning_rate": 7.735062658463438e-06, + "loss": 1.04608774, + "memory(GiB)": 302.58, + "step": 125160, + "train_speed(iter/s)": 0.123825 + }, + { + "acc": 0.731738, + "epoch": 0.7000658513772166, + "grad_norm": 7.03125, + "learning_rate": 7.734288518587515e-06, + "loss": 1.07590532, + "memory(GiB)": 302.58, + "step": 125180, + "train_speed(iter/s)": 0.123833 + }, + { + "acc": 0.73769474, + "epoch": 0.7001777008501958, + "grad_norm": 10.125, + "learning_rate": 7.733514285190254e-06, + "loss": 1.04305182, + "memory(GiB)": 302.58, + "step": 125200, + "train_speed(iter/s)": 0.123842 + }, + { + "acc": 0.74611893, + "epoch": 0.7002895503231751, + "grad_norm": 9.1875, + "learning_rate": 7.732739958298138e-06, + "loss": 0.97323132, + "memory(GiB)": 302.58, + "step": 125220, + "train_speed(iter/s)": 0.123852 + }, + { + "acc": 0.73936882, + "epoch": 0.7004013997961543, + "grad_norm": 5.25, + "learning_rate": 7.731965537937649e-06, + "loss": 1.04471407, + "memory(GiB)": 302.58, + "step": 125240, + "train_speed(iter/s)": 0.123861 + }, + { + "acc": 0.7461442, + "epoch": 0.7005132492691336, + "grad_norm": 6.28125, + "learning_rate": 7.731191024135276e-06, + "loss": 0.98994493, + "memory(GiB)": 302.58, + "step": 125260, + "train_speed(iter/s)": 0.123871 + }, + { + "acc": 0.71155891, + "epoch": 0.7006250987421129, + "grad_norm": 4.5625, + "learning_rate": 7.730416416917511e-06, + "loss": 1.14205961, + "memory(GiB)": 302.58, + "step": 125280, + "train_speed(iter/s)": 0.12388 + }, + { + "acc": 0.74415035, + "epoch": 0.7007369482150921, + "grad_norm": 7.4375, + "learning_rate": 7.729641716310846e-06, + "loss": 1.00344172, + "memory(GiB)": 302.58, + "step": 125300, + "train_speed(iter/s)": 0.12389 + }, + { + "acc": 0.73559017, + "epoch": 0.7008487976880714, + "grad_norm": 7.65625, + "learning_rate": 7.728866922341778e-06, + "loss": 1.04988899, + "memory(GiB)": 302.58, + "step": 125320, + "train_speed(iter/s)": 0.123899 + }, + { + "acc": 0.73821015, + "epoch": 0.7009606471610507, + "grad_norm": 8.3125, + "learning_rate": 7.72809203503681e-06, + "loss": 1.02159796, + "memory(GiB)": 302.58, + "step": 125340, + "train_speed(iter/s)": 0.123908 + }, + { + "acc": 0.7518199, + "epoch": 0.7010724966340299, + "grad_norm": 8.5, + "learning_rate": 7.727317054422442e-06, + "loss": 0.96825962, + "memory(GiB)": 302.58, + "step": 125360, + "train_speed(iter/s)": 0.123918 + }, + { + "acc": 0.73159599, + "epoch": 0.7011843461070092, + "grad_norm": 6.40625, + "learning_rate": 7.726541980525184e-06, + "loss": 1.04932728, + "memory(GiB)": 302.58, + "step": 125380, + "train_speed(iter/s)": 0.123927 + }, + { + "acc": 0.75077133, + "epoch": 0.7012961955799885, + "grad_norm": 5.59375, + "learning_rate": 7.725766813371545e-06, + "loss": 0.97950392, + "memory(GiB)": 302.58, + "step": 125400, + "train_speed(iter/s)": 0.123937 + }, + { + "acc": 0.74780588, + "epoch": 0.7014080450529677, + "grad_norm": 6.65625, + "learning_rate": 7.724991552988036e-06, + "loss": 1.01137743, + "memory(GiB)": 302.58, + "step": 125420, + "train_speed(iter/s)": 0.123946 + }, + { + "acc": 0.74528575, + "epoch": 0.701519894525947, + "grad_norm": 6.5, + "learning_rate": 7.724216199401175e-06, + "loss": 0.97968988, + "memory(GiB)": 302.58, + "step": 125440, + "train_speed(iter/s)": 0.123955 + }, + { + "acc": 0.7344018, + "epoch": 0.7016317439989262, + "grad_norm": 8.75, + "learning_rate": 7.723440752637484e-06, + "loss": 1.06484385, + "memory(GiB)": 302.58, + "step": 125460, + "train_speed(iter/s)": 0.123964 + }, + { + "acc": 0.75397229, + "epoch": 0.7017435934719055, + "grad_norm": 7.5625, + "learning_rate": 7.72266521272348e-06, + "loss": 0.96413059, + "memory(GiB)": 302.58, + "step": 125480, + "train_speed(iter/s)": 0.123973 + }, + { + "acc": 0.74481087, + "epoch": 0.7018554429448848, + "grad_norm": 3.921875, + "learning_rate": 7.721889579685698e-06, + "loss": 0.99127474, + "memory(GiB)": 302.58, + "step": 125500, + "train_speed(iter/s)": 0.123982 + }, + { + "acc": 0.73137813, + "epoch": 0.701967292417864, + "grad_norm": 12.3125, + "learning_rate": 7.721113853550658e-06, + "loss": 1.067241, + "memory(GiB)": 302.58, + "step": 125520, + "train_speed(iter/s)": 0.123991 + }, + { + "acc": 0.74807372, + "epoch": 0.7020791418908433, + "grad_norm": 5.5625, + "learning_rate": 7.720338034344894e-06, + "loss": 0.97977657, + "memory(GiB)": 302.58, + "step": 125540, + "train_speed(iter/s)": 0.124 + }, + { + "acc": 0.74303122, + "epoch": 0.7021909913638226, + "grad_norm": 5.78125, + "learning_rate": 7.719562122094945e-06, + "loss": 1.03011522, + "memory(GiB)": 302.58, + "step": 125560, + "train_speed(iter/s)": 0.124009 + }, + { + "acc": 0.74113078, + "epoch": 0.7023028408368018, + "grad_norm": 7.3125, + "learning_rate": 7.718786116827348e-06, + "loss": 1.02181005, + "memory(GiB)": 302.58, + "step": 125580, + "train_speed(iter/s)": 0.124018 + }, + { + "acc": 0.73856659, + "epoch": 0.7024146903097811, + "grad_norm": 5.0625, + "learning_rate": 7.718010018568646e-06, + "loss": 1.02372217, + "memory(GiB)": 302.58, + "step": 125600, + "train_speed(iter/s)": 0.124027 + }, + { + "acc": 0.74110847, + "epoch": 0.7025265397827604, + "grad_norm": 8.6875, + "learning_rate": 7.717233827345382e-06, + "loss": 1.01990814, + "memory(GiB)": 302.58, + "step": 125620, + "train_speed(iter/s)": 0.124036 + }, + { + "acc": 0.73894286, + "epoch": 0.7026383892557396, + "grad_norm": 8.875, + "learning_rate": 7.716457543184103e-06, + "loss": 1.01883707, + "memory(GiB)": 302.58, + "step": 125640, + "train_speed(iter/s)": 0.124045 + }, + { + "acc": 0.73180904, + "epoch": 0.7027502387287189, + "grad_norm": 8.1875, + "learning_rate": 7.715681166111365e-06, + "loss": 1.05742903, + "memory(GiB)": 302.58, + "step": 125660, + "train_speed(iter/s)": 0.124054 + }, + { + "acc": 0.74456353, + "epoch": 0.7028620882016982, + "grad_norm": 12.375, + "learning_rate": 7.714904696153718e-06, + "loss": 1.00238171, + "memory(GiB)": 302.58, + "step": 125680, + "train_speed(iter/s)": 0.124064 + }, + { + "acc": 0.7247231, + "epoch": 0.7029739376746774, + "grad_norm": 9.1875, + "learning_rate": 7.714128133337724e-06, + "loss": 1.07068281, + "memory(GiB)": 302.58, + "step": 125700, + "train_speed(iter/s)": 0.124073 + }, + { + "acc": 0.72221785, + "epoch": 0.7030857871476567, + "grad_norm": 7.84375, + "learning_rate": 7.71335147768994e-06, + "loss": 1.1111021, + "memory(GiB)": 302.58, + "step": 125720, + "train_speed(iter/s)": 0.124083 + }, + { + "acc": 0.73888173, + "epoch": 0.703197636620636, + "grad_norm": 6.6875, + "learning_rate": 7.712574729236933e-06, + "loss": 1.05848331, + "memory(GiB)": 302.58, + "step": 125740, + "train_speed(iter/s)": 0.124091 + }, + { + "acc": 0.73624096, + "epoch": 0.7033094860936152, + "grad_norm": 7.03125, + "learning_rate": 7.711797888005266e-06, + "loss": 1.03410892, + "memory(GiB)": 302.58, + "step": 125760, + "train_speed(iter/s)": 0.124101 + }, + { + "acc": 0.7505106, + "epoch": 0.7034213355665945, + "grad_norm": 7.3125, + "learning_rate": 7.711020954021514e-06, + "loss": 0.98645182, + "memory(GiB)": 302.58, + "step": 125780, + "train_speed(iter/s)": 0.124111 + }, + { + "acc": 0.7392695, + "epoch": 0.7035331850395737, + "grad_norm": 8.1875, + "learning_rate": 7.710243927312249e-06, + "loss": 1.03664637, + "memory(GiB)": 302.58, + "step": 125800, + "train_speed(iter/s)": 0.124119 + }, + { + "acc": 0.74178591, + "epoch": 0.703645034512553, + "grad_norm": 6.8125, + "learning_rate": 7.709466807904047e-06, + "loss": 1.04773664, + "memory(GiB)": 302.58, + "step": 125820, + "train_speed(iter/s)": 0.124129 + }, + { + "acc": 0.73562422, + "epoch": 0.7037568839855323, + "grad_norm": 8.75, + "learning_rate": 7.708689595823491e-06, + "loss": 1.03589191, + "memory(GiB)": 302.58, + "step": 125840, + "train_speed(iter/s)": 0.124138 + }, + { + "acc": 0.72973924, + "epoch": 0.7038687334585115, + "grad_norm": 7.15625, + "learning_rate": 7.707912291097159e-06, + "loss": 1.08083553, + "memory(GiB)": 302.58, + "step": 125860, + "train_speed(iter/s)": 0.124148 + }, + { + "acc": 0.73069115, + "epoch": 0.7039805829314908, + "grad_norm": 4.1875, + "learning_rate": 7.707134893751641e-06, + "loss": 1.0638114, + "memory(GiB)": 302.58, + "step": 125880, + "train_speed(iter/s)": 0.124157 + }, + { + "acc": 0.75583591, + "epoch": 0.7040924324044701, + "grad_norm": 4.03125, + "learning_rate": 7.706357403813527e-06, + "loss": 0.95473595, + "memory(GiB)": 302.58, + "step": 125900, + "train_speed(iter/s)": 0.124167 + }, + { + "acc": 0.75207801, + "epoch": 0.7042042818774493, + "grad_norm": 10.125, + "learning_rate": 7.705579821309408e-06, + "loss": 0.96392307, + "memory(GiB)": 302.58, + "step": 125920, + "train_speed(iter/s)": 0.124177 + }, + { + "acc": 0.73552284, + "epoch": 0.7043161313504286, + "grad_norm": 5.46875, + "learning_rate": 7.704802146265878e-06, + "loss": 1.0197607, + "memory(GiB)": 302.58, + "step": 125940, + "train_speed(iter/s)": 0.124187 + }, + { + "acc": 0.7596756, + "epoch": 0.7044279808234079, + "grad_norm": 7.65625, + "learning_rate": 7.704024378709539e-06, + "loss": 0.93641949, + "memory(GiB)": 302.58, + "step": 125960, + "train_speed(iter/s)": 0.124197 + }, + { + "acc": 0.73195562, + "epoch": 0.7045398302963871, + "grad_norm": 4.71875, + "learning_rate": 7.703246518666992e-06, + "loss": 1.06606636, + "memory(GiB)": 302.58, + "step": 125980, + "train_speed(iter/s)": 0.124207 + }, + { + "acc": 0.74176965, + "epoch": 0.7046516797693664, + "grad_norm": 7.34375, + "learning_rate": 7.702468566164842e-06, + "loss": 1.00517874, + "memory(GiB)": 302.58, + "step": 126000, + "train_speed(iter/s)": 0.124216 + }, + { + "epoch": 0.7046516797693664, + "eval_acc": 0.7041670277656261, + "eval_loss": 1.0233464241027832, + "eval_runtime": 7504.1232, + "eval_samples_per_second": 10.032, + "eval_steps_per_second": 10.032, + "step": 126000 + }, + { + "acc": 0.72441978, + "epoch": 0.7047635292423456, + "grad_norm": 6.46875, + "learning_rate": 7.701690521229699e-06, + "loss": 1.09230137, + "memory(GiB)": 302.58, + "step": 126020, + "train_speed(iter/s)": 0.123298 + }, + { + "acc": 0.72574892, + "epoch": 0.7048753787153249, + "grad_norm": 7.0, + "learning_rate": 7.700912383888172e-06, + "loss": 1.08727016, + "memory(GiB)": 302.58, + "step": 126040, + "train_speed(iter/s)": 0.123307 + }, + { + "acc": 0.73507791, + "epoch": 0.7049872281883042, + "grad_norm": 7.375, + "learning_rate": 7.700134154166877e-06, + "loss": 1.02655983, + "memory(GiB)": 302.58, + "step": 126060, + "train_speed(iter/s)": 0.123316 + }, + { + "acc": 0.74523449, + "epoch": 0.7050990776612834, + "grad_norm": 6.34375, + "learning_rate": 7.699355832092433e-06, + "loss": 1.00343094, + "memory(GiB)": 302.58, + "step": 126080, + "train_speed(iter/s)": 0.123325 + }, + { + "acc": 0.74889503, + "epoch": 0.7052109271342627, + "grad_norm": 5.28125, + "learning_rate": 7.69857741769146e-06, + "loss": 1.01053619, + "memory(GiB)": 302.58, + "step": 126100, + "train_speed(iter/s)": 0.123334 + }, + { + "acc": 0.71959, + "epoch": 0.705322776607242, + "grad_norm": 6.84375, + "learning_rate": 7.697798910990581e-06, + "loss": 1.11072359, + "memory(GiB)": 302.58, + "step": 126120, + "train_speed(iter/s)": 0.123344 + }, + { + "acc": 0.72099276, + "epoch": 0.7054346260802212, + "grad_norm": 7.03125, + "learning_rate": 7.697020312016425e-06, + "loss": 1.10998106, + "memory(GiB)": 302.58, + "step": 126140, + "train_speed(iter/s)": 0.123353 + }, + { + "acc": 0.74698691, + "epoch": 0.7055464755532005, + "grad_norm": 11.3125, + "learning_rate": 7.696241620795624e-06, + "loss": 0.99544716, + "memory(GiB)": 302.58, + "step": 126160, + "train_speed(iter/s)": 0.123362 + }, + { + "acc": 0.74613652, + "epoch": 0.7056583250261798, + "grad_norm": 8.875, + "learning_rate": 7.695462837354809e-06, + "loss": 0.99901381, + "memory(GiB)": 302.58, + "step": 126180, + "train_speed(iter/s)": 0.12337 + }, + { + "acc": 0.74219842, + "epoch": 0.705770174499159, + "grad_norm": 6.5, + "learning_rate": 7.694683961720618e-06, + "loss": 1.01400642, + "memory(GiB)": 302.58, + "step": 126200, + "train_speed(iter/s)": 0.12338 + }, + { + "acc": 0.73265228, + "epoch": 0.7058820239721383, + "grad_norm": 5.46875, + "learning_rate": 7.69390499391969e-06, + "loss": 1.06777439, + "memory(GiB)": 302.58, + "step": 126220, + "train_speed(iter/s)": 0.123389 + }, + { + "acc": 0.74304609, + "epoch": 0.7059938734451175, + "grad_norm": 5.34375, + "learning_rate": 7.693125933978671e-06, + "loss": 1.01195946, + "memory(GiB)": 302.58, + "step": 126240, + "train_speed(iter/s)": 0.123399 + }, + { + "acc": 0.72744164, + "epoch": 0.7061057229180968, + "grad_norm": 6.1875, + "learning_rate": 7.692346781924203e-06, + "loss": 1.06499519, + "memory(GiB)": 302.58, + "step": 126260, + "train_speed(iter/s)": 0.123408 + }, + { + "acc": 0.73779469, + "epoch": 0.7062175723910761, + "grad_norm": 9.1875, + "learning_rate": 7.69156753778294e-06, + "loss": 1.03158827, + "memory(GiB)": 302.58, + "step": 126280, + "train_speed(iter/s)": 0.123418 + }, + { + "acc": 0.72863922, + "epoch": 0.7063294218640553, + "grad_norm": 6.21875, + "learning_rate": 7.690788201581532e-06, + "loss": 1.08369112, + "memory(GiB)": 302.58, + "step": 126300, + "train_speed(iter/s)": 0.123427 + }, + { + "acc": 0.73942246, + "epoch": 0.7064412713370346, + "grad_norm": 9.375, + "learning_rate": 7.690008773346635e-06, + "loss": 1.01414261, + "memory(GiB)": 302.58, + "step": 126320, + "train_speed(iter/s)": 0.123437 + }, + { + "acc": 0.73286743, + "epoch": 0.7065531208100139, + "grad_norm": 7.09375, + "learning_rate": 7.689229253104909e-06, + "loss": 1.06026917, + "memory(GiB)": 302.58, + "step": 126340, + "train_speed(iter/s)": 0.123445 + }, + { + "acc": 0.73125544, + "epoch": 0.7066649702829931, + "grad_norm": 8.6875, + "learning_rate": 7.688449640883014e-06, + "loss": 1.05198212, + "memory(GiB)": 302.58, + "step": 126360, + "train_speed(iter/s)": 0.123454 + }, + { + "acc": 0.73112888, + "epoch": 0.7067768197559724, + "grad_norm": 6.21875, + "learning_rate": 7.687669936707615e-06, + "loss": 1.05224257, + "memory(GiB)": 302.58, + "step": 126380, + "train_speed(iter/s)": 0.123463 + }, + { + "acc": 0.74911017, + "epoch": 0.7068886692289517, + "grad_norm": 9.75, + "learning_rate": 7.686890140605382e-06, + "loss": 0.97489328, + "memory(GiB)": 302.58, + "step": 126400, + "train_speed(iter/s)": 0.123473 + }, + { + "acc": 0.72237692, + "epoch": 0.7070005187019309, + "grad_norm": 6.15625, + "learning_rate": 7.68611025260299e-06, + "loss": 1.10369778, + "memory(GiB)": 302.58, + "step": 126420, + "train_speed(iter/s)": 0.123482 + }, + { + "acc": 0.7413362, + "epoch": 0.7071123681749102, + "grad_norm": 8.5625, + "learning_rate": 7.685330272727105e-06, + "loss": 1.00101213, + "memory(GiB)": 302.58, + "step": 126440, + "train_speed(iter/s)": 0.123491 + }, + { + "acc": 0.73223333, + "epoch": 0.7072242176478895, + "grad_norm": 5.5, + "learning_rate": 7.684550201004414e-06, + "loss": 1.03712149, + "memory(GiB)": 302.58, + "step": 126460, + "train_speed(iter/s)": 0.1235 + }, + { + "acc": 0.73894658, + "epoch": 0.7073360671208687, + "grad_norm": 9.5, + "learning_rate": 7.683770037461593e-06, + "loss": 1.01499128, + "memory(GiB)": 302.58, + "step": 126480, + "train_speed(iter/s)": 0.12351 + }, + { + "acc": 0.73858838, + "epoch": 0.707447916593848, + "grad_norm": 6.34375, + "learning_rate": 7.682989782125325e-06, + "loss": 1.03952332, + "memory(GiB)": 302.58, + "step": 126500, + "train_speed(iter/s)": 0.123518 + }, + { + "acc": 0.74992733, + "epoch": 0.7075597660668272, + "grad_norm": 7.78125, + "learning_rate": 7.6822094350223e-06, + "loss": 0.97440701, + "memory(GiB)": 302.58, + "step": 126520, + "train_speed(iter/s)": 0.123528 + }, + { + "acc": 0.73351297, + "epoch": 0.7076716155398065, + "grad_norm": 4.96875, + "learning_rate": 7.681428996179207e-06, + "loss": 1.03169041, + "memory(GiB)": 302.58, + "step": 126540, + "train_speed(iter/s)": 0.123536 + }, + { + "acc": 0.73870311, + "epoch": 0.7077834650127858, + "grad_norm": 6.96875, + "learning_rate": 7.68064846562274e-06, + "loss": 1.04016294, + "memory(GiB)": 302.58, + "step": 126560, + "train_speed(iter/s)": 0.123546 + }, + { + "acc": 0.74834042, + "epoch": 0.707895314485765, + "grad_norm": 7.125, + "learning_rate": 7.679867843379593e-06, + "loss": 1.01868706, + "memory(GiB)": 302.58, + "step": 126580, + "train_speed(iter/s)": 0.123554 + }, + { + "acc": 0.73478699, + "epoch": 0.7080071639587443, + "grad_norm": 8.5625, + "learning_rate": 7.679087129476472e-06, + "loss": 1.04479418, + "memory(GiB)": 302.58, + "step": 126600, + "train_speed(iter/s)": 0.123563 + }, + { + "acc": 0.73817549, + "epoch": 0.7081190134317236, + "grad_norm": 10.3125, + "learning_rate": 7.678306323940075e-06, + "loss": 1.024786, + "memory(GiB)": 302.58, + "step": 126620, + "train_speed(iter/s)": 0.123572 + }, + { + "acc": 0.73877883, + "epoch": 0.7082308629047028, + "grad_norm": 7.34375, + "learning_rate": 7.677525426797107e-06, + "loss": 1.00578079, + "memory(GiB)": 302.58, + "step": 126640, + "train_speed(iter/s)": 0.123582 + }, + { + "acc": 0.73767376, + "epoch": 0.7083427123776821, + "grad_norm": 6.875, + "learning_rate": 7.676744438074282e-06, + "loss": 1.0321434, + "memory(GiB)": 302.58, + "step": 126660, + "train_speed(iter/s)": 0.123591 + }, + { + "acc": 0.74561772, + "epoch": 0.7084545618506614, + "grad_norm": 7.53125, + "learning_rate": 7.675963357798307e-06, + "loss": 1.01464586, + "memory(GiB)": 302.58, + "step": 126680, + "train_speed(iter/s)": 0.1236 + }, + { + "acc": 0.73113499, + "epoch": 0.7085664113236406, + "grad_norm": 8.8125, + "learning_rate": 7.6751821859959e-06, + "loss": 1.0535512, + "memory(GiB)": 302.58, + "step": 126700, + "train_speed(iter/s)": 0.12361 + }, + { + "acc": 0.73691416, + "epoch": 0.7086782607966199, + "grad_norm": 7.71875, + "learning_rate": 7.674400922693781e-06, + "loss": 1.0261735, + "memory(GiB)": 302.58, + "step": 126720, + "train_speed(iter/s)": 0.123619 + }, + { + "acc": 0.73479419, + "epoch": 0.7087901102695991, + "grad_norm": 7.78125, + "learning_rate": 7.67361956791867e-06, + "loss": 1.02589521, + "memory(GiB)": 302.58, + "step": 126740, + "train_speed(iter/s)": 0.123628 + }, + { + "acc": 0.73794932, + "epoch": 0.7089019597425784, + "grad_norm": 4.84375, + "learning_rate": 7.672838121697292e-06, + "loss": 1.03884916, + "memory(GiB)": 302.58, + "step": 126760, + "train_speed(iter/s)": 0.123638 + }, + { + "acc": 0.74010644, + "epoch": 0.7090138092155577, + "grad_norm": 7.09375, + "learning_rate": 7.672056584056373e-06, + "loss": 1.00616283, + "memory(GiB)": 302.58, + "step": 126780, + "train_speed(iter/s)": 0.123647 + }, + { + "acc": 0.73624573, + "epoch": 0.7091256586885369, + "grad_norm": 6.21875, + "learning_rate": 7.671274955022648e-06, + "loss": 1.02752352, + "memory(GiB)": 302.58, + "step": 126800, + "train_speed(iter/s)": 0.123656 + }, + { + "acc": 0.72335386, + "epoch": 0.7092375081615162, + "grad_norm": 8.1875, + "learning_rate": 7.670493234622849e-06, + "loss": 1.08662748, + "memory(GiB)": 302.58, + "step": 126820, + "train_speed(iter/s)": 0.123665 + }, + { + "acc": 0.74103417, + "epoch": 0.7093493576344955, + "grad_norm": 8.3125, + "learning_rate": 7.669711422883712e-06, + "loss": 1.02083244, + "memory(GiB)": 302.58, + "step": 126840, + "train_speed(iter/s)": 0.123675 + }, + { + "acc": 0.72526746, + "epoch": 0.7094612071074747, + "grad_norm": 8.875, + "learning_rate": 7.66892951983198e-06, + "loss": 1.08831491, + "memory(GiB)": 302.58, + "step": 126860, + "train_speed(iter/s)": 0.123684 + }, + { + "acc": 0.73562045, + "epoch": 0.709573056580454, + "grad_norm": 6.8125, + "learning_rate": 7.668147525494396e-06, + "loss": 1.04814377, + "memory(GiB)": 302.58, + "step": 126880, + "train_speed(iter/s)": 0.123693 + }, + { + "acc": 0.73856516, + "epoch": 0.7096849060534333, + "grad_norm": 7.0, + "learning_rate": 7.667365439897704e-06, + "loss": 1.01806335, + "memory(GiB)": 302.58, + "step": 126900, + "train_speed(iter/s)": 0.123703 + }, + { + "acc": 0.73015466, + "epoch": 0.7097967555264125, + "grad_norm": 4.65625, + "learning_rate": 7.66658326306866e-06, + "loss": 1.07113914, + "memory(GiB)": 302.58, + "step": 126920, + "train_speed(iter/s)": 0.123712 + }, + { + "acc": 0.73601565, + "epoch": 0.7099086049993918, + "grad_norm": 7.75, + "learning_rate": 7.66580099503401e-06, + "loss": 1.03333006, + "memory(GiB)": 302.58, + "step": 126940, + "train_speed(iter/s)": 0.123721 + }, + { + "acc": 0.73815293, + "epoch": 0.710020454472371, + "grad_norm": 7.90625, + "learning_rate": 7.665018635820514e-06, + "loss": 1.0368329, + "memory(GiB)": 302.58, + "step": 126960, + "train_speed(iter/s)": 0.12373 + }, + { + "acc": 0.73591123, + "epoch": 0.7101323039453503, + "grad_norm": 7.84375, + "learning_rate": 7.664236185454927e-06, + "loss": 1.03950844, + "memory(GiB)": 302.58, + "step": 126980, + "train_speed(iter/s)": 0.12374 + }, + { + "acc": 0.75821576, + "epoch": 0.7102441534183296, + "grad_norm": 6.59375, + "learning_rate": 7.663453643964017e-06, + "loss": 0.94402838, + "memory(GiB)": 302.58, + "step": 127000, + "train_speed(iter/s)": 0.123749 + }, + { + "acc": 0.72724524, + "epoch": 0.7103560028913088, + "grad_norm": 7.0, + "learning_rate": 7.662671011374546e-06, + "loss": 1.10333652, + "memory(GiB)": 302.58, + "step": 127020, + "train_speed(iter/s)": 0.123758 + }, + { + "acc": 0.73865142, + "epoch": 0.7104678523642881, + "grad_norm": 16.5, + "learning_rate": 7.661888287713285e-06, + "loss": 1.04231234, + "memory(GiB)": 302.58, + "step": 127040, + "train_speed(iter/s)": 0.123767 + }, + { + "acc": 0.75144629, + "epoch": 0.7105797018372674, + "grad_norm": 6.40625, + "learning_rate": 7.661105473007002e-06, + "loss": 0.97417622, + "memory(GiB)": 302.58, + "step": 127060, + "train_speed(iter/s)": 0.123776 + }, + { + "acc": 0.75026956, + "epoch": 0.7106915513102466, + "grad_norm": 8.5, + "learning_rate": 7.660322567282475e-06, + "loss": 0.9698535, + "memory(GiB)": 302.58, + "step": 127080, + "train_speed(iter/s)": 0.123786 + }, + { + "acc": 0.72564602, + "epoch": 0.7108034007832259, + "grad_norm": 6.1875, + "learning_rate": 7.65953957056648e-06, + "loss": 1.09679785, + "memory(GiB)": 302.58, + "step": 127100, + "train_speed(iter/s)": 0.123795 + }, + { + "acc": 0.74487424, + "epoch": 0.7109152502562052, + "grad_norm": 7.375, + "learning_rate": 7.658756482885799e-06, + "loss": 0.98830919, + "memory(GiB)": 302.58, + "step": 127120, + "train_speed(iter/s)": 0.123805 + }, + { + "acc": 0.73072424, + "epoch": 0.7110270997291844, + "grad_norm": 4.5625, + "learning_rate": 7.657973304267214e-06, + "loss": 1.07887115, + "memory(GiB)": 302.58, + "step": 127140, + "train_speed(iter/s)": 0.123815 + }, + { + "acc": 0.73462915, + "epoch": 0.7111389492021637, + "grad_norm": 6.9375, + "learning_rate": 7.657190034737517e-06, + "loss": 1.0596529, + "memory(GiB)": 302.58, + "step": 127160, + "train_speed(iter/s)": 0.123825 + }, + { + "acc": 0.75173368, + "epoch": 0.711250798675143, + "grad_norm": 7.9375, + "learning_rate": 7.656406674323492e-06, + "loss": 0.98606815, + "memory(GiB)": 302.58, + "step": 127180, + "train_speed(iter/s)": 0.123834 + }, + { + "acc": 0.73680258, + "epoch": 0.7113626481481222, + "grad_norm": 7.59375, + "learning_rate": 7.655623223051937e-06, + "loss": 1.05161114, + "memory(GiB)": 302.58, + "step": 127200, + "train_speed(iter/s)": 0.123842 + }, + { + "acc": 0.73910527, + "epoch": 0.7114744976211015, + "grad_norm": 7.125, + "learning_rate": 7.654839680949646e-06, + "loss": 1.0251338, + "memory(GiB)": 302.58, + "step": 127220, + "train_speed(iter/s)": 0.123851 + }, + { + "acc": 0.73908715, + "epoch": 0.7115863470940808, + "grad_norm": 7.03125, + "learning_rate": 7.65405604804342e-06, + "loss": 1.04979486, + "memory(GiB)": 302.58, + "step": 127240, + "train_speed(iter/s)": 0.123861 + }, + { + "acc": 0.72924404, + "epoch": 0.71169819656706, + "grad_norm": 9.3125, + "learning_rate": 7.653272324360063e-06, + "loss": 1.06042757, + "memory(GiB)": 302.58, + "step": 127260, + "train_speed(iter/s)": 0.123872 + }, + { + "acc": 0.74318976, + "epoch": 0.7118100460400393, + "grad_norm": 5.21875, + "learning_rate": 7.652488509926378e-06, + "loss": 0.98980446, + "memory(GiB)": 302.58, + "step": 127280, + "train_speed(iter/s)": 0.123881 + }, + { + "acc": 0.73515115, + "epoch": 0.7119218955130185, + "grad_norm": 7.65625, + "learning_rate": 7.651704604769174e-06, + "loss": 1.05176182, + "memory(GiB)": 302.58, + "step": 127300, + "train_speed(iter/s)": 0.123891 + }, + { + "acc": 0.73453555, + "epoch": 0.7120337449859978, + "grad_norm": 8.25, + "learning_rate": 7.650920608915265e-06, + "loss": 1.05954361, + "memory(GiB)": 302.58, + "step": 127320, + "train_speed(iter/s)": 0.123899 + }, + { + "acc": 0.74152622, + "epoch": 0.7121455944589771, + "grad_norm": 7.46875, + "learning_rate": 7.650136522391466e-06, + "loss": 1.02392521, + "memory(GiB)": 302.58, + "step": 127340, + "train_speed(iter/s)": 0.123908 + }, + { + "acc": 0.74540396, + "epoch": 0.7122574439319563, + "grad_norm": 9.1875, + "learning_rate": 7.649352345224594e-06, + "loss": 1.00114698, + "memory(GiB)": 302.58, + "step": 127360, + "train_speed(iter/s)": 0.123918 + }, + { + "acc": 0.73170924, + "epoch": 0.7123692934049356, + "grad_norm": 7.0, + "learning_rate": 7.648568077441472e-06, + "loss": 1.06564608, + "memory(GiB)": 302.58, + "step": 127380, + "train_speed(iter/s)": 0.123927 + }, + { + "acc": 0.75109906, + "epoch": 0.7124811428779149, + "grad_norm": 6.9375, + "learning_rate": 7.647783719068922e-06, + "loss": 0.98315277, + "memory(GiB)": 302.58, + "step": 127400, + "train_speed(iter/s)": 0.123937 + }, + { + "acc": 0.74798021, + "epoch": 0.7125929923508941, + "grad_norm": 7.9375, + "learning_rate": 7.646999270133773e-06, + "loss": 0.98888655, + "memory(GiB)": 302.58, + "step": 127420, + "train_speed(iter/s)": 0.123946 + }, + { + "acc": 0.74721365, + "epoch": 0.7127048418238734, + "grad_norm": 8.375, + "learning_rate": 7.646214730662855e-06, + "loss": 0.99213152, + "memory(GiB)": 302.58, + "step": 127440, + "train_speed(iter/s)": 0.123956 + }, + { + "acc": 0.7316515, + "epoch": 0.7128166912968527, + "grad_norm": 7.8125, + "learning_rate": 7.645430100683003e-06, + "loss": 1.06080894, + "memory(GiB)": 302.58, + "step": 127460, + "train_speed(iter/s)": 0.123965 + }, + { + "acc": 0.73368397, + "epoch": 0.7129285407698319, + "grad_norm": 6.625, + "learning_rate": 7.644645380221055e-06, + "loss": 1.06695652, + "memory(GiB)": 302.58, + "step": 127480, + "train_speed(iter/s)": 0.123974 + }, + { + "acc": 0.72907577, + "epoch": 0.7130403902428112, + "grad_norm": 7.75, + "learning_rate": 7.643860569303846e-06, + "loss": 1.08327923, + "memory(GiB)": 302.58, + "step": 127500, + "train_speed(iter/s)": 0.123983 + }, + { + "acc": 0.74665694, + "epoch": 0.7131522397157904, + "grad_norm": 8.0, + "learning_rate": 7.643075667958223e-06, + "loss": 0.99659147, + "memory(GiB)": 302.58, + "step": 127520, + "train_speed(iter/s)": 0.123992 + }, + { + "acc": 0.7503767, + "epoch": 0.7132640891887697, + "grad_norm": 7.25, + "learning_rate": 7.642290676211032e-06, + "loss": 0.97523441, + "memory(GiB)": 302.58, + "step": 127540, + "train_speed(iter/s)": 0.124001 + }, + { + "acc": 0.73919353, + "epoch": 0.713375938661749, + "grad_norm": 6.71875, + "learning_rate": 7.64150559408912e-06, + "loss": 1.04767218, + "memory(GiB)": 302.58, + "step": 127560, + "train_speed(iter/s)": 0.124009 + }, + { + "acc": 0.73959355, + "epoch": 0.7134877881347282, + "grad_norm": 7.0, + "learning_rate": 7.640720421619343e-06, + "loss": 1.01412888, + "memory(GiB)": 302.58, + "step": 127580, + "train_speed(iter/s)": 0.124019 + }, + { + "acc": 0.7498095, + "epoch": 0.7135996376077075, + "grad_norm": 4.875, + "learning_rate": 7.639935158828553e-06, + "loss": 0.98785076, + "memory(GiB)": 302.58, + "step": 127600, + "train_speed(iter/s)": 0.124027 + }, + { + "acc": 0.74691291, + "epoch": 0.7137114870806868, + "grad_norm": 7.34375, + "learning_rate": 7.639149805743608e-06, + "loss": 0.9953042, + "memory(GiB)": 302.58, + "step": 127620, + "train_speed(iter/s)": 0.124036 + }, + { + "acc": 0.76207099, + "epoch": 0.713823336553666, + "grad_norm": 7.1875, + "learning_rate": 7.638364362391372e-06, + "loss": 0.92961321, + "memory(GiB)": 302.58, + "step": 127640, + "train_speed(iter/s)": 0.124045 + }, + { + "acc": 0.75870805, + "epoch": 0.7139351860266453, + "grad_norm": 6.59375, + "learning_rate": 7.637578828798709e-06, + "loss": 0.94876385, + "memory(GiB)": 302.58, + "step": 127660, + "train_speed(iter/s)": 0.124054 + }, + { + "acc": 0.75392289, + "epoch": 0.7140470354996246, + "grad_norm": 6.375, + "learning_rate": 7.636793204992487e-06, + "loss": 0.95676641, + "memory(GiB)": 302.58, + "step": 127680, + "train_speed(iter/s)": 0.124062 + }, + { + "acc": 0.74047208, + "epoch": 0.7141588849726038, + "grad_norm": 7.03125, + "learning_rate": 7.636007490999577e-06, + "loss": 1.03622971, + "memory(GiB)": 302.58, + "step": 127700, + "train_speed(iter/s)": 0.124072 + }, + { + "acc": 0.73857341, + "epoch": 0.7142707344455831, + "grad_norm": 6.03125, + "learning_rate": 7.63522168684685e-06, + "loss": 1.05337944, + "memory(GiB)": 302.58, + "step": 127720, + "train_speed(iter/s)": 0.124081 + }, + { + "acc": 0.75143089, + "epoch": 0.7143825839185624, + "grad_norm": 6.03125, + "learning_rate": 7.634435792561188e-06, + "loss": 0.97248135, + "memory(GiB)": 302.58, + "step": 127740, + "train_speed(iter/s)": 0.12409 + }, + { + "acc": 0.74804897, + "epoch": 0.7144944333915416, + "grad_norm": 8.625, + "learning_rate": 7.633649808169466e-06, + "loss": 0.99126883, + "memory(GiB)": 302.58, + "step": 127760, + "train_speed(iter/s)": 0.1241 + }, + { + "acc": 0.74711132, + "epoch": 0.7146062828645209, + "grad_norm": 7.1875, + "learning_rate": 7.63286373369857e-06, + "loss": 0.9976758, + "memory(GiB)": 302.58, + "step": 127780, + "train_speed(iter/s)": 0.124109 + }, + { + "acc": 0.73592749, + "epoch": 0.7147181323375001, + "grad_norm": 6.96875, + "learning_rate": 7.632077569175386e-06, + "loss": 1.0479147, + "memory(GiB)": 302.58, + "step": 127800, + "train_speed(iter/s)": 0.124118 + }, + { + "acc": 0.7534801, + "epoch": 0.7148299818104794, + "grad_norm": 6.03125, + "learning_rate": 7.631291314626805e-06, + "loss": 0.95130205, + "memory(GiB)": 302.58, + "step": 127820, + "train_speed(iter/s)": 0.124128 + }, + { + "acc": 0.73590002, + "epoch": 0.7149418312834587, + "grad_norm": 8.25, + "learning_rate": 7.630504970079714e-06, + "loss": 1.05392294, + "memory(GiB)": 302.58, + "step": 127840, + "train_speed(iter/s)": 0.124137 + }, + { + "acc": 0.73161526, + "epoch": 0.7150536807564379, + "grad_norm": 5.3125, + "learning_rate": 7.629718535561014e-06, + "loss": 1.08369513, + "memory(GiB)": 302.58, + "step": 127860, + "train_speed(iter/s)": 0.124146 + }, + { + "acc": 0.73887777, + "epoch": 0.7151655302294172, + "grad_norm": 10.0, + "learning_rate": 7.6289320110976e-06, + "loss": 1.00826387, + "memory(GiB)": 302.58, + "step": 127880, + "train_speed(iter/s)": 0.124156 + }, + { + "acc": 0.73684225, + "epoch": 0.7152773797023965, + "grad_norm": 8.3125, + "learning_rate": 7.628145396716378e-06, + "loss": 1.04911394, + "memory(GiB)": 302.58, + "step": 127900, + "train_speed(iter/s)": 0.124165 + }, + { + "acc": 0.7286684, + "epoch": 0.7153892291753757, + "grad_norm": 10.0, + "learning_rate": 7.6273586924442464e-06, + "loss": 1.07252197, + "memory(GiB)": 302.58, + "step": 127920, + "train_speed(iter/s)": 0.124174 + }, + { + "acc": 0.73327842, + "epoch": 0.715501078648355, + "grad_norm": 6.78125, + "learning_rate": 7.6265718983081195e-06, + "loss": 1.05411654, + "memory(GiB)": 302.58, + "step": 127940, + "train_speed(iter/s)": 0.124183 + }, + { + "acc": 0.73404484, + "epoch": 0.7156129281213343, + "grad_norm": 6.875, + "learning_rate": 7.625785014334903e-06, + "loss": 1.06957502, + "memory(GiB)": 302.58, + "step": 127960, + "train_speed(iter/s)": 0.124192 + }, + { + "acc": 0.7442842, + "epoch": 0.7157247775943135, + "grad_norm": 6.1875, + "learning_rate": 7.624998040551514e-06, + "loss": 1.00981445, + "memory(GiB)": 302.58, + "step": 127980, + "train_speed(iter/s)": 0.124201 + }, + { + "acc": 0.74859133, + "epoch": 0.7158366270672928, + "grad_norm": 6.84375, + "learning_rate": 7.624210976984867e-06, + "loss": 0.98242168, + "memory(GiB)": 302.58, + "step": 128000, + "train_speed(iter/s)": 0.12421 + }, + { + "epoch": 0.7158366270672928, + "eval_acc": 0.7042768117086572, + "eval_loss": 1.0228739976882935, + "eval_runtime": 7499.8349, + "eval_samples_per_second": 10.038, + "eval_steps_per_second": 10.038, + "step": 128000 + }, + { + "acc": 0.75321546, + "epoch": 0.715948476540272, + "grad_norm": 5.875, + "learning_rate": 7.623423823661884e-06, + "loss": 0.95330296, + "memory(GiB)": 302.58, + "step": 128020, + "train_speed(iter/s)": 0.123307 + }, + { + "acc": 0.76345868, + "epoch": 0.7160603260132513, + "grad_norm": 4.875, + "learning_rate": 7.622636580609488e-06, + "loss": 0.91266088, + "memory(GiB)": 302.58, + "step": 128040, + "train_speed(iter/s)": 0.123317 + }, + { + "acc": 0.73117452, + "epoch": 0.7161721754862306, + "grad_norm": 7.90625, + "learning_rate": 7.621849247854605e-06, + "loss": 1.10080662, + "memory(GiB)": 302.58, + "step": 128060, + "train_speed(iter/s)": 0.123326 + }, + { + "acc": 0.74291477, + "epoch": 0.7162840249592098, + "grad_norm": 6.03125, + "learning_rate": 7.621061825424163e-06, + "loss": 0.99794502, + "memory(GiB)": 302.58, + "step": 128080, + "train_speed(iter/s)": 0.123335 + }, + { + "acc": 0.73606839, + "epoch": 0.7163958744321891, + "grad_norm": 5.6875, + "learning_rate": 7.620274313345095e-06, + "loss": 1.0694272, + "memory(GiB)": 302.58, + "step": 128100, + "train_speed(iter/s)": 0.123343 + }, + { + "acc": 0.72779431, + "epoch": 0.7165077239051684, + "grad_norm": 7.28125, + "learning_rate": 7.619486711644337e-06, + "loss": 1.07149868, + "memory(GiB)": 302.58, + "step": 128120, + "train_speed(iter/s)": 0.123352 + }, + { + "acc": 0.74275069, + "epoch": 0.7166195733781476, + "grad_norm": 6.9375, + "learning_rate": 7.61869902034883e-06, + "loss": 0.99842253, + "memory(GiB)": 302.58, + "step": 128140, + "train_speed(iter/s)": 0.123361 + }, + { + "acc": 0.74242396, + "epoch": 0.7167314228511269, + "grad_norm": 4.875, + "learning_rate": 7.61791123948551e-06, + "loss": 1.00656042, + "memory(GiB)": 302.58, + "step": 128160, + "train_speed(iter/s)": 0.12337 + }, + { + "acc": 0.76301937, + "epoch": 0.7168432723241063, + "grad_norm": 9.0625, + "learning_rate": 7.617123369081325e-06, + "loss": 0.92061481, + "memory(GiB)": 302.58, + "step": 128180, + "train_speed(iter/s)": 0.123379 + }, + { + "acc": 0.73443189, + "epoch": 0.7169551217970855, + "grad_norm": 5.6875, + "learning_rate": 7.616335409163223e-06, + "loss": 1.05911865, + "memory(GiB)": 302.58, + "step": 128200, + "train_speed(iter/s)": 0.123389 + }, + { + "acc": 0.73984814, + "epoch": 0.7170669712700648, + "grad_norm": 6.21875, + "learning_rate": 7.615547359758153e-06, + "loss": 1.04524298, + "memory(GiB)": 302.58, + "step": 128220, + "train_speed(iter/s)": 0.123398 + }, + { + "acc": 0.72720337, + "epoch": 0.7171788207430441, + "grad_norm": 8.0, + "learning_rate": 7.614759220893071e-06, + "loss": 1.07905302, + "memory(GiB)": 302.58, + "step": 128240, + "train_speed(iter/s)": 0.123407 + }, + { + "acc": 0.73806305, + "epoch": 0.7172906702160233, + "grad_norm": 9.75, + "learning_rate": 7.61397099259493e-06, + "loss": 1.02082853, + "memory(GiB)": 302.58, + "step": 128260, + "train_speed(iter/s)": 0.123416 + }, + { + "acc": 0.74256892, + "epoch": 0.7174025196890026, + "grad_norm": 9.625, + "learning_rate": 7.613182674890693e-06, + "loss": 1.00699615, + "memory(GiB)": 302.58, + "step": 128280, + "train_speed(iter/s)": 0.123425 + }, + { + "acc": 0.75659223, + "epoch": 0.7175143691619819, + "grad_norm": 7.34375, + "learning_rate": 7.612394267807324e-06, + "loss": 0.95093775, + "memory(GiB)": 302.58, + "step": 128300, + "train_speed(iter/s)": 0.123434 + }, + { + "acc": 0.75146589, + "epoch": 0.7176262186349611, + "grad_norm": 5.6875, + "learning_rate": 7.6116057713717864e-06, + "loss": 0.95552626, + "memory(GiB)": 302.58, + "step": 128320, + "train_speed(iter/s)": 0.123443 + }, + { + "acc": 0.73344469, + "epoch": 0.7177380681079404, + "grad_norm": 7.5, + "learning_rate": 7.6108171856110504e-06, + "loss": 1.07381105, + "memory(GiB)": 302.58, + "step": 128340, + "train_speed(iter/s)": 0.123451 + }, + { + "acc": 0.75286303, + "epoch": 0.7178499175809196, + "grad_norm": 7.28125, + "learning_rate": 7.610028510552088e-06, + "loss": 0.97231283, + "memory(GiB)": 302.58, + "step": 128360, + "train_speed(iter/s)": 0.123461 + }, + { + "acc": 0.74168692, + "epoch": 0.7179617670538989, + "grad_norm": 7.46875, + "learning_rate": 7.609239746221875e-06, + "loss": 1.00427303, + "memory(GiB)": 302.58, + "step": 128380, + "train_speed(iter/s)": 0.12347 + }, + { + "acc": 0.75234318, + "epoch": 0.7180736165268782, + "grad_norm": 10.1875, + "learning_rate": 7.6084508926473875e-06, + "loss": 0.96702242, + "memory(GiB)": 302.58, + "step": 128400, + "train_speed(iter/s)": 0.123479 + }, + { + "acc": 0.74985409, + "epoch": 0.7181854659998574, + "grad_norm": 6.0, + "learning_rate": 7.607661949855608e-06, + "loss": 0.98423576, + "memory(GiB)": 302.58, + "step": 128420, + "train_speed(iter/s)": 0.123489 + }, + { + "acc": 0.73620567, + "epoch": 0.7182973154728367, + "grad_norm": 6.65625, + "learning_rate": 7.606872917873522e-06, + "loss": 1.03050146, + "memory(GiB)": 302.58, + "step": 128440, + "train_speed(iter/s)": 0.123497 + }, + { + "acc": 0.73626437, + "epoch": 0.718409164945816, + "grad_norm": 8.4375, + "learning_rate": 7.606083796728116e-06, + "loss": 1.04271212, + "memory(GiB)": 302.58, + "step": 128460, + "train_speed(iter/s)": 0.123506 + }, + { + "acc": 0.73808541, + "epoch": 0.7185210144187952, + "grad_norm": 5.28125, + "learning_rate": 7.605294586446382e-06, + "loss": 1.02294989, + "memory(GiB)": 302.58, + "step": 128480, + "train_speed(iter/s)": 0.123516 + }, + { + "acc": 0.75396705, + "epoch": 0.7186328638917745, + "grad_norm": 7.34375, + "learning_rate": 7.60450528705531e-06, + "loss": 0.95884275, + "memory(GiB)": 302.58, + "step": 128500, + "train_speed(iter/s)": 0.123525 + }, + { + "acc": 0.75557418, + "epoch": 0.7187447133647538, + "grad_norm": 6.34375, + "learning_rate": 7.6037158985819e-06, + "loss": 0.9701251, + "memory(GiB)": 302.58, + "step": 128520, + "train_speed(iter/s)": 0.123535 + }, + { + "acc": 0.75511441, + "epoch": 0.718856562837733, + "grad_norm": 6.5, + "learning_rate": 7.602926421053149e-06, + "loss": 0.96320696, + "memory(GiB)": 302.58, + "step": 128540, + "train_speed(iter/s)": 0.123544 + }, + { + "acc": 0.74033689, + "epoch": 0.7189684123107123, + "grad_norm": 9.75, + "learning_rate": 7.60213685449606e-06, + "loss": 1.01726379, + "memory(GiB)": 302.58, + "step": 128560, + "train_speed(iter/s)": 0.123553 + }, + { + "acc": 0.74108953, + "epoch": 0.7190802617836916, + "grad_norm": 6.8125, + "learning_rate": 7.601347198937641e-06, + "loss": 1.03671417, + "memory(GiB)": 302.58, + "step": 128580, + "train_speed(iter/s)": 0.123562 + }, + { + "acc": 0.73829689, + "epoch": 0.7191921112566708, + "grad_norm": 10.8125, + "learning_rate": 7.6005574544048984e-06, + "loss": 1.0381218, + "memory(GiB)": 302.58, + "step": 128600, + "train_speed(iter/s)": 0.123571 + }, + { + "acc": 0.73655481, + "epoch": 0.7193039607296501, + "grad_norm": 4.6875, + "learning_rate": 7.599767620924845e-06, + "loss": 1.0323617, + "memory(GiB)": 302.58, + "step": 128620, + "train_speed(iter/s)": 0.12358 + }, + { + "acc": 0.7343349, + "epoch": 0.7194158102026293, + "grad_norm": 7.34375, + "learning_rate": 7.598977698524497e-06, + "loss": 1.06155109, + "memory(GiB)": 302.58, + "step": 128640, + "train_speed(iter/s)": 0.123588 + }, + { + "acc": 0.7466115, + "epoch": 0.7195276596756086, + "grad_norm": 8.375, + "learning_rate": 7.598187687230868e-06, + "loss": 0.99002867, + "memory(GiB)": 302.58, + "step": 128660, + "train_speed(iter/s)": 0.123598 + }, + { + "acc": 0.74235506, + "epoch": 0.7196395091485879, + "grad_norm": 9.3125, + "learning_rate": 7.597397587070982e-06, + "loss": 1.02740307, + "memory(GiB)": 302.58, + "step": 128680, + "train_speed(iter/s)": 0.123607 + }, + { + "acc": 0.7293314, + "epoch": 0.7197513586215671, + "grad_norm": 6.53125, + "learning_rate": 7.5966073980718625e-06, + "loss": 1.06278849, + "memory(GiB)": 302.58, + "step": 128700, + "train_speed(iter/s)": 0.123616 + }, + { + "acc": 0.74767079, + "epoch": 0.7198632080945464, + "grad_norm": 6.15625, + "learning_rate": 7.595817120260537e-06, + "loss": 0.99070959, + "memory(GiB)": 302.58, + "step": 128720, + "train_speed(iter/s)": 0.123625 + }, + { + "acc": 0.70652213, + "epoch": 0.7199750575675257, + "grad_norm": 8.1875, + "learning_rate": 7.595026753664034e-06, + "loss": 1.17585382, + "memory(GiB)": 302.58, + "step": 128740, + "train_speed(iter/s)": 0.123633 + }, + { + "acc": 0.74827623, + "epoch": 0.7200869070405049, + "grad_norm": 8.9375, + "learning_rate": 7.5942362983093875e-06, + "loss": 0.98987007, + "memory(GiB)": 302.58, + "step": 128760, + "train_speed(iter/s)": 0.123643 + }, + { + "acc": 0.7472682, + "epoch": 0.7201987565134842, + "grad_norm": 7.34375, + "learning_rate": 7.593445754223634e-06, + "loss": 0.97726641, + "memory(GiB)": 302.58, + "step": 128780, + "train_speed(iter/s)": 0.123652 + }, + { + "acc": 0.74086061, + "epoch": 0.7203106059864635, + "grad_norm": 8.5, + "learning_rate": 7.5926551214338125e-06, + "loss": 1.01349726, + "memory(GiB)": 302.58, + "step": 128800, + "train_speed(iter/s)": 0.123661 + }, + { + "acc": 0.71877117, + "epoch": 0.7204224554594427, + "grad_norm": 6.78125, + "learning_rate": 7.591864399966965e-06, + "loss": 1.10998936, + "memory(GiB)": 302.58, + "step": 128820, + "train_speed(iter/s)": 0.12367 + }, + { + "acc": 0.74531975, + "epoch": 0.720534304932422, + "grad_norm": 7.5625, + "learning_rate": 7.5910735898501355e-06, + "loss": 1.00062914, + "memory(GiB)": 302.58, + "step": 128840, + "train_speed(iter/s)": 0.123679 + }, + { + "acc": 0.74054332, + "epoch": 0.7206461544054013, + "grad_norm": 8.5625, + "learning_rate": 7.5902826911103735e-06, + "loss": 1.03478479, + "memory(GiB)": 302.58, + "step": 128860, + "train_speed(iter/s)": 0.123688 + }, + { + "acc": 0.73726435, + "epoch": 0.7207580038783805, + "grad_norm": 8.3125, + "learning_rate": 7.589491703774729e-06, + "loss": 1.03644285, + "memory(GiB)": 302.58, + "step": 128880, + "train_speed(iter/s)": 0.123697 + }, + { + "acc": 0.75016265, + "epoch": 0.7208698533513598, + "grad_norm": 5.375, + "learning_rate": 7.588700627870257e-06, + "loss": 0.96970701, + "memory(GiB)": 302.58, + "step": 128900, + "train_speed(iter/s)": 0.123707 + }, + { + "acc": 0.74308825, + "epoch": 0.720981702824339, + "grad_norm": 10.25, + "learning_rate": 7.587909463424017e-06, + "loss": 1.04086924, + "memory(GiB)": 302.58, + "step": 128920, + "train_speed(iter/s)": 0.123716 + }, + { + "acc": 0.72965751, + "epoch": 0.7210935522973183, + "grad_norm": 7.59375, + "learning_rate": 7.587118210463065e-06, + "loss": 1.07429323, + "memory(GiB)": 302.58, + "step": 128940, + "train_speed(iter/s)": 0.123724 + }, + { + "acc": 0.74261198, + "epoch": 0.7212054017702976, + "grad_norm": 7.4375, + "learning_rate": 7.586326869014469e-06, + "loss": 0.98918867, + "memory(GiB)": 302.58, + "step": 128960, + "train_speed(iter/s)": 0.123734 + }, + { + "acc": 0.71950588, + "epoch": 0.7213172512432768, + "grad_norm": 8.9375, + "learning_rate": 7.585535439105291e-06, + "loss": 1.10351973, + "memory(GiB)": 302.58, + "step": 128980, + "train_speed(iter/s)": 0.123744 + }, + { + "acc": 0.73843455, + "epoch": 0.7214291007162561, + "grad_norm": 5.1875, + "learning_rate": 7.584743920762603e-06, + "loss": 1.01784039, + "memory(GiB)": 302.58, + "step": 129000, + "train_speed(iter/s)": 0.123752 + }, + { + "acc": 0.71963215, + "epoch": 0.7215409501892354, + "grad_norm": 11.75, + "learning_rate": 7.583952314013477e-06, + "loss": 1.12599783, + "memory(GiB)": 302.58, + "step": 129020, + "train_speed(iter/s)": 0.12376 + }, + { + "acc": 0.74767118, + "epoch": 0.7216527996622146, + "grad_norm": 5.9375, + "learning_rate": 7.583160618884989e-06, + "loss": 0.9897892, + "memory(GiB)": 302.58, + "step": 129040, + "train_speed(iter/s)": 0.12377 + }, + { + "acc": 0.75322533, + "epoch": 0.7217646491351939, + "grad_norm": 10.5, + "learning_rate": 7.582368835404217e-06, + "loss": 0.96097488, + "memory(GiB)": 302.58, + "step": 129060, + "train_speed(iter/s)": 0.123778 + }, + { + "acc": 0.74697332, + "epoch": 0.7218764986081732, + "grad_norm": 4.84375, + "learning_rate": 7.581576963598243e-06, + "loss": 0.98271732, + "memory(GiB)": 302.58, + "step": 129080, + "train_speed(iter/s)": 0.123787 + }, + { + "acc": 0.73955989, + "epoch": 0.7219883480811524, + "grad_norm": 6.96875, + "learning_rate": 7.580785003494149e-06, + "loss": 1.0402338, + "memory(GiB)": 302.58, + "step": 129100, + "train_speed(iter/s)": 0.123795 + }, + { + "acc": 0.75477567, + "epoch": 0.7221001975541317, + "grad_norm": 5.15625, + "learning_rate": 7.579992955119025e-06, + "loss": 0.94889269, + "memory(GiB)": 302.58, + "step": 129120, + "train_speed(iter/s)": 0.123805 + }, + { + "acc": 0.73725939, + "epoch": 0.722212047027111, + "grad_norm": 8.9375, + "learning_rate": 7.579200818499961e-06, + "loss": 1.03218956, + "memory(GiB)": 302.58, + "step": 129140, + "train_speed(iter/s)": 0.123814 + }, + { + "acc": 0.74078493, + "epoch": 0.7223238965000902, + "grad_norm": 8.0625, + "learning_rate": 7.57840859366405e-06, + "loss": 1.0068038, + "memory(GiB)": 302.58, + "step": 129160, + "train_speed(iter/s)": 0.123824 + }, + { + "acc": 0.74111948, + "epoch": 0.7224357459730695, + "grad_norm": 7.4375, + "learning_rate": 7.577616280638392e-06, + "loss": 1.01458988, + "memory(GiB)": 302.58, + "step": 129180, + "train_speed(iter/s)": 0.123832 + }, + { + "acc": 0.72969699, + "epoch": 0.7225475954460487, + "grad_norm": 5.46875, + "learning_rate": 7.576823879450081e-06, + "loss": 1.05624628, + "memory(GiB)": 302.58, + "step": 129200, + "train_speed(iter/s)": 0.123842 + }, + { + "acc": 0.75444231, + "epoch": 0.722659444919028, + "grad_norm": 6.1875, + "learning_rate": 7.576031390126224e-06, + "loss": 0.94688311, + "memory(GiB)": 302.58, + "step": 129220, + "train_speed(iter/s)": 0.123851 + }, + { + "acc": 0.73888412, + "epoch": 0.7227712943920073, + "grad_norm": 6.5625, + "learning_rate": 7.575238812693924e-06, + "loss": 1.03276043, + "memory(GiB)": 302.58, + "step": 129240, + "train_speed(iter/s)": 0.123861 + }, + { + "acc": 0.74105053, + "epoch": 0.7228831438649865, + "grad_norm": 9.25, + "learning_rate": 7.5744461471802915e-06, + "loss": 1.0193037, + "memory(GiB)": 302.58, + "step": 129260, + "train_speed(iter/s)": 0.12387 + }, + { + "acc": 0.74218287, + "epoch": 0.7229949933379658, + "grad_norm": 6.5, + "learning_rate": 7.573653393612436e-06, + "loss": 1.02114744, + "memory(GiB)": 302.58, + "step": 129280, + "train_speed(iter/s)": 0.123879 + }, + { + "acc": 0.73020349, + "epoch": 0.7231068428109451, + "grad_norm": 7.09375, + "learning_rate": 7.5728605520174734e-06, + "loss": 1.04681873, + "memory(GiB)": 302.58, + "step": 129300, + "train_speed(iter/s)": 0.123888 + }, + { + "acc": 0.74303436, + "epoch": 0.7232186922839243, + "grad_norm": 7.03125, + "learning_rate": 7.572067622422523e-06, + "loss": 1.00628376, + "memory(GiB)": 302.58, + "step": 129320, + "train_speed(iter/s)": 0.123897 + }, + { + "acc": 0.73974848, + "epoch": 0.7233305417569036, + "grad_norm": 7.78125, + "learning_rate": 7.571274604854705e-06, + "loss": 1.03671093, + "memory(GiB)": 302.58, + "step": 129340, + "train_speed(iter/s)": 0.123907 + }, + { + "acc": 0.74659252, + "epoch": 0.7234423912298829, + "grad_norm": 7.8125, + "learning_rate": 7.5704814993411405e-06, + "loss": 0.97661734, + "memory(GiB)": 302.58, + "step": 129360, + "train_speed(iter/s)": 0.123915 + }, + { + "acc": 0.75002427, + "epoch": 0.7235542407028621, + "grad_norm": 8.0625, + "learning_rate": 7.5696883059089595e-06, + "loss": 0.95728054, + "memory(GiB)": 302.58, + "step": 129380, + "train_speed(iter/s)": 0.123924 + }, + { + "acc": 0.74308286, + "epoch": 0.7236660901758414, + "grad_norm": 8.5625, + "learning_rate": 7.568895024585289e-06, + "loss": 1.00879774, + "memory(GiB)": 302.58, + "step": 129400, + "train_speed(iter/s)": 0.123934 + }, + { + "acc": 0.74653544, + "epoch": 0.7237779396488206, + "grad_norm": 5.6875, + "learning_rate": 7.568101655397262e-06, + "loss": 1.00348368, + "memory(GiB)": 302.58, + "step": 129420, + "train_speed(iter/s)": 0.123943 + }, + { + "acc": 0.73585019, + "epoch": 0.7238897891217999, + "grad_norm": 8.5625, + "learning_rate": 7.567308198372017e-06, + "loss": 1.04792423, + "memory(GiB)": 302.58, + "step": 129440, + "train_speed(iter/s)": 0.123952 + }, + { + "acc": 0.73298593, + "epoch": 0.7240016385947792, + "grad_norm": 7.5625, + "learning_rate": 7.566514653536689e-06, + "loss": 1.02812147, + "memory(GiB)": 302.58, + "step": 129460, + "train_speed(iter/s)": 0.123961 + }, + { + "acc": 0.72983899, + "epoch": 0.7241134880677584, + "grad_norm": 6.78125, + "learning_rate": 7.565721020918424e-06, + "loss": 1.05017233, + "memory(GiB)": 302.58, + "step": 129480, + "train_speed(iter/s)": 0.12397 + }, + { + "acc": 0.73659983, + "epoch": 0.7242253375407377, + "grad_norm": 8.1875, + "learning_rate": 7.564927300544363e-06, + "loss": 1.03556709, + "memory(GiB)": 302.58, + "step": 129500, + "train_speed(iter/s)": 0.12398 + }, + { + "acc": 0.74881744, + "epoch": 0.724337187013717, + "grad_norm": 6.40625, + "learning_rate": 7.564133492441656e-06, + "loss": 0.98681107, + "memory(GiB)": 302.58, + "step": 129520, + "train_speed(iter/s)": 0.123989 + }, + { + "acc": 0.74592199, + "epoch": 0.7244490364866962, + "grad_norm": 9.875, + "learning_rate": 7.563339596637451e-06, + "loss": 0.98905687, + "memory(GiB)": 302.58, + "step": 129540, + "train_speed(iter/s)": 0.123998 + }, + { + "acc": 0.74586296, + "epoch": 0.7245608859596755, + "grad_norm": 6.6875, + "learning_rate": 7.562545613158906e-06, + "loss": 1.01291599, + "memory(GiB)": 302.58, + "step": 129560, + "train_speed(iter/s)": 0.124007 + }, + { + "acc": 0.75676608, + "epoch": 0.7246727354326548, + "grad_norm": 6.65625, + "learning_rate": 7.561751542033173e-06, + "loss": 0.95605364, + "memory(GiB)": 302.58, + "step": 129580, + "train_speed(iter/s)": 0.124015 + }, + { + "acc": 0.74327197, + "epoch": 0.724784584905634, + "grad_norm": 6.0, + "learning_rate": 7.560957383287416e-06, + "loss": 0.99671488, + "memory(GiB)": 302.58, + "step": 129600, + "train_speed(iter/s)": 0.124025 + }, + { + "acc": 0.73142014, + "epoch": 0.7248964343786133, + "grad_norm": 7.125, + "learning_rate": 7.560163136948795e-06, + "loss": 1.05617075, + "memory(GiB)": 302.58, + "step": 129620, + "train_speed(iter/s)": 0.124033 + }, + { + "acc": 0.73366814, + "epoch": 0.7250082838515925, + "grad_norm": 8.625, + "learning_rate": 7.559368803044478e-06, + "loss": 1.02750626, + "memory(GiB)": 302.58, + "step": 129640, + "train_speed(iter/s)": 0.124043 + }, + { + "acc": 0.75901771, + "epoch": 0.7251201333245718, + "grad_norm": 7.5625, + "learning_rate": 7.558574381601631e-06, + "loss": 0.9579731, + "memory(GiB)": 302.58, + "step": 129660, + "train_speed(iter/s)": 0.124052 + }, + { + "acc": 0.74596786, + "epoch": 0.7252319827975511, + "grad_norm": 7.53125, + "learning_rate": 7.5577798726474285e-06, + "loss": 0.9976716, + "memory(GiB)": 302.58, + "step": 129680, + "train_speed(iter/s)": 0.124062 + }, + { + "acc": 0.7342135, + "epoch": 0.7253438322705303, + "grad_norm": 9.4375, + "learning_rate": 7.556985276209044e-06, + "loss": 1.05387783, + "memory(GiB)": 302.58, + "step": 129700, + "train_speed(iter/s)": 0.12407 + }, + { + "acc": 0.73349862, + "epoch": 0.7254556817435096, + "grad_norm": 8.625, + "learning_rate": 7.556190592313652e-06, + "loss": 1.03621531, + "memory(GiB)": 302.58, + "step": 129720, + "train_speed(iter/s)": 0.124079 + }, + { + "acc": 0.75872235, + "epoch": 0.7255675312164889, + "grad_norm": 9.3125, + "learning_rate": 7.5553958209884385e-06, + "loss": 0.95696068, + "memory(GiB)": 302.58, + "step": 129740, + "train_speed(iter/s)": 0.124088 + }, + { + "acc": 0.74053464, + "epoch": 0.7256793806894681, + "grad_norm": 9.4375, + "learning_rate": 7.554600962260585e-06, + "loss": 1.01877718, + "memory(GiB)": 302.58, + "step": 129760, + "train_speed(iter/s)": 0.124097 + }, + { + "acc": 0.74113374, + "epoch": 0.7257912301624474, + "grad_norm": 10.0, + "learning_rate": 7.553806016157279e-06, + "loss": 1.01606245, + "memory(GiB)": 302.58, + "step": 129780, + "train_speed(iter/s)": 0.124106 + }, + { + "acc": 0.75868206, + "epoch": 0.7259030796354267, + "grad_norm": 6.09375, + "learning_rate": 7.553010982705709e-06, + "loss": 0.92982244, + "memory(GiB)": 302.58, + "step": 129800, + "train_speed(iter/s)": 0.124116 + }, + { + "acc": 0.72990298, + "epoch": 0.7260149291084059, + "grad_norm": 6.8125, + "learning_rate": 7.552215861933066e-06, + "loss": 1.07989273, + "memory(GiB)": 302.58, + "step": 129820, + "train_speed(iter/s)": 0.124124 + }, + { + "acc": 0.71782427, + "epoch": 0.7261267785813852, + "grad_norm": 5.0, + "learning_rate": 7.55142065386655e-06, + "loss": 1.13018236, + "memory(GiB)": 302.58, + "step": 129840, + "train_speed(iter/s)": 0.124133 + }, + { + "acc": 0.74069514, + "epoch": 0.7262386280543645, + "grad_norm": 5.9375, + "learning_rate": 7.5506253585333555e-06, + "loss": 1.03065395, + "memory(GiB)": 302.58, + "step": 129860, + "train_speed(iter/s)": 0.124142 + }, + { + "acc": 0.74897351, + "epoch": 0.7263504775273437, + "grad_norm": 8.125, + "learning_rate": 7.549829975960687e-06, + "loss": 0.97692976, + "memory(GiB)": 302.58, + "step": 129880, + "train_speed(iter/s)": 0.124151 + }, + { + "acc": 0.73444624, + "epoch": 0.726462327000323, + "grad_norm": 5.21875, + "learning_rate": 7.549034506175747e-06, + "loss": 1.06471624, + "memory(GiB)": 302.58, + "step": 129900, + "train_speed(iter/s)": 0.124161 + }, + { + "acc": 0.73893294, + "epoch": 0.7265741764733022, + "grad_norm": 8.8125, + "learning_rate": 7.5482389492057466e-06, + "loss": 1.01677771, + "memory(GiB)": 302.58, + "step": 129920, + "train_speed(iter/s)": 0.12417 + }, + { + "acc": 0.730516, + "epoch": 0.7266860259462815, + "grad_norm": 6.5, + "learning_rate": 7.547443305077892e-06, + "loss": 1.09116859, + "memory(GiB)": 302.58, + "step": 129940, + "train_speed(iter/s)": 0.12418 + }, + { + "acc": 0.72992754, + "epoch": 0.7267978754192608, + "grad_norm": 8.5625, + "learning_rate": 7.546647573819399e-06, + "loss": 1.05097504, + "memory(GiB)": 302.58, + "step": 129960, + "train_speed(iter/s)": 0.124188 + }, + { + "acc": 0.73076143, + "epoch": 0.72690972489224, + "grad_norm": 7.75, + "learning_rate": 7.545851755457483e-06, + "loss": 1.06914511, + "memory(GiB)": 302.58, + "step": 129980, + "train_speed(iter/s)": 0.124197 + }, + { + "acc": 0.73986073, + "epoch": 0.7270215743652193, + "grad_norm": 10.125, + "learning_rate": 7.5450558500193646e-06, + "loss": 0.99925776, + "memory(GiB)": 302.58, + "step": 130000, + "train_speed(iter/s)": 0.124207 + }, + { + "epoch": 0.7270215743652193, + "eval_acc": 0.7043932014210489, + "eval_loss": 1.0222474336624146, + "eval_runtime": 7496.0147, + "eval_samples_per_second": 10.043, + "eval_steps_per_second": 10.043, + "step": 130000 + }, + { + "acc": 0.74779301, + "epoch": 0.7271334238381986, + "grad_norm": 7.03125, + "learning_rate": 7.5442598575322665e-06, + "loss": 0.9966033, + "memory(GiB)": 302.58, + "step": 130020, + "train_speed(iter/s)": 0.123317 + }, + { + "acc": 0.74225883, + "epoch": 0.7272452733111778, + "grad_norm": 9.0625, + "learning_rate": 7.543463778023414e-06, + "loss": 1.04062796, + "memory(GiB)": 302.58, + "step": 130040, + "train_speed(iter/s)": 0.123326 + }, + { + "acc": 0.73719578, + "epoch": 0.7273571227841571, + "grad_norm": 11.1875, + "learning_rate": 7.5426676115200335e-06, + "loss": 1.0425436, + "memory(GiB)": 302.58, + "step": 130060, + "train_speed(iter/s)": 0.123333 + }, + { + "acc": 0.73405361, + "epoch": 0.7274689722571364, + "grad_norm": 4.125, + "learning_rate": 7.541871358049359e-06, + "loss": 1.04009161, + "memory(GiB)": 302.58, + "step": 130080, + "train_speed(iter/s)": 0.123342 + }, + { + "acc": 0.73044853, + "epoch": 0.7275808217301156, + "grad_norm": 6.6875, + "learning_rate": 7.541075017638626e-06, + "loss": 1.06606245, + "memory(GiB)": 302.58, + "step": 130100, + "train_speed(iter/s)": 0.123351 + }, + { + "acc": 0.74027839, + "epoch": 0.7276926712030949, + "grad_norm": 6.125, + "learning_rate": 7.540278590315067e-06, + "loss": 1.00636711, + "memory(GiB)": 302.58, + "step": 130120, + "train_speed(iter/s)": 0.123361 + }, + { + "acc": 0.7406467, + "epoch": 0.7278045206760742, + "grad_norm": 7.0625, + "learning_rate": 7.539482076105926e-06, + "loss": 1.04327545, + "memory(GiB)": 302.58, + "step": 130140, + "train_speed(iter/s)": 0.12337 + }, + { + "acc": 0.74557576, + "epoch": 0.7279163701490534, + "grad_norm": 6.375, + "learning_rate": 7.538685475038445e-06, + "loss": 1.00041094, + "memory(GiB)": 302.58, + "step": 130160, + "train_speed(iter/s)": 0.123379 + }, + { + "acc": 0.73940296, + "epoch": 0.7280282196220327, + "grad_norm": 6.78125, + "learning_rate": 7.537888787139872e-06, + "loss": 1.01802607, + "memory(GiB)": 302.58, + "step": 130180, + "train_speed(iter/s)": 0.123388 + }, + { + "acc": 0.72502789, + "epoch": 0.7281400690950119, + "grad_norm": 5.75, + "learning_rate": 7.537092012437455e-06, + "loss": 1.1097105, + "memory(GiB)": 302.58, + "step": 130200, + "train_speed(iter/s)": 0.123397 + }, + { + "acc": 0.73204188, + "epoch": 0.7282519185679912, + "grad_norm": 13.0625, + "learning_rate": 7.536295150958446e-06, + "loss": 1.06374016, + "memory(GiB)": 302.58, + "step": 130220, + "train_speed(iter/s)": 0.123406 + }, + { + "acc": 0.71993117, + "epoch": 0.7283637680409705, + "grad_norm": 7.03125, + "learning_rate": 7.5354982027301e-06, + "loss": 1.09882402, + "memory(GiB)": 302.58, + "step": 130240, + "train_speed(iter/s)": 0.123415 + }, + { + "acc": 0.74840641, + "epoch": 0.7284756175139497, + "grad_norm": 5.09375, + "learning_rate": 7.534701167779677e-06, + "loss": 1.00150909, + "memory(GiB)": 302.58, + "step": 130260, + "train_speed(iter/s)": 0.123425 + }, + { + "acc": 0.74543791, + "epoch": 0.728587466986929, + "grad_norm": 7.75, + "learning_rate": 7.533904046134436e-06, + "loss": 1.01848078, + "memory(GiB)": 302.58, + "step": 130280, + "train_speed(iter/s)": 0.123434 + }, + { + "acc": 0.73545866, + "epoch": 0.7286993164599083, + "grad_norm": 9.0625, + "learning_rate": 7.533106837821641e-06, + "loss": 1.03214674, + "memory(GiB)": 302.58, + "step": 130300, + "train_speed(iter/s)": 0.123443 + }, + { + "acc": 0.72745309, + "epoch": 0.7288111659328875, + "grad_norm": 6.21875, + "learning_rate": 7.532309542868561e-06, + "loss": 1.06650953, + "memory(GiB)": 302.58, + "step": 130320, + "train_speed(iter/s)": 0.123453 + }, + { + "acc": 0.73785553, + "epoch": 0.7289230154058668, + "grad_norm": 6.09375, + "learning_rate": 7.531512161302463e-06, + "loss": 1.02812777, + "memory(GiB)": 302.58, + "step": 130340, + "train_speed(iter/s)": 0.123461 + }, + { + "acc": 0.73470297, + "epoch": 0.7290348648788461, + "grad_norm": 6.84375, + "learning_rate": 7.530714693150625e-06, + "loss": 1.06066904, + "memory(GiB)": 302.58, + "step": 130360, + "train_speed(iter/s)": 0.12347 + }, + { + "acc": 0.75422955, + "epoch": 0.7291467143518253, + "grad_norm": 7.1875, + "learning_rate": 7.529917138440317e-06, + "loss": 0.95945559, + "memory(GiB)": 302.58, + "step": 130380, + "train_speed(iter/s)": 0.123479 + }, + { + "acc": 0.72689438, + "epoch": 0.7292585638248046, + "grad_norm": 7.46875, + "learning_rate": 7.529119497198823e-06, + "loss": 1.09047909, + "memory(GiB)": 302.58, + "step": 130400, + "train_speed(iter/s)": 0.123487 + }, + { + "acc": 0.74186902, + "epoch": 0.7293704132977838, + "grad_norm": 7.46875, + "learning_rate": 7.528321769453419e-06, + "loss": 1.02048569, + "memory(GiB)": 302.58, + "step": 130420, + "train_speed(iter/s)": 0.123494 + }, + { + "acc": 0.71726937, + "epoch": 0.7294822627707631, + "grad_norm": 7.25, + "learning_rate": 7.527523955231398e-06, + "loss": 1.09819937, + "memory(GiB)": 302.58, + "step": 130440, + "train_speed(iter/s)": 0.123503 + }, + { + "acc": 0.7537117, + "epoch": 0.7295941122437424, + "grad_norm": 6.28125, + "learning_rate": 7.5267260545600405e-06, + "loss": 0.95532598, + "memory(GiB)": 302.58, + "step": 130460, + "train_speed(iter/s)": 0.123512 + }, + { + "acc": 0.75248804, + "epoch": 0.7297059617167216, + "grad_norm": 7.4375, + "learning_rate": 7.52592806746664e-06, + "loss": 0.96060286, + "memory(GiB)": 302.58, + "step": 130480, + "train_speed(iter/s)": 0.123521 + }, + { + "acc": 0.73476858, + "epoch": 0.7298178111897009, + "grad_norm": 6.5, + "learning_rate": 7.525129993978491e-06, + "loss": 1.04298286, + "memory(GiB)": 302.58, + "step": 130500, + "train_speed(iter/s)": 0.12353 + }, + { + "acc": 0.74056153, + "epoch": 0.7299296606626802, + "grad_norm": 4.90625, + "learning_rate": 7.5243318341228886e-06, + "loss": 1.03679972, + "memory(GiB)": 302.58, + "step": 130520, + "train_speed(iter/s)": 0.123539 + }, + { + "acc": 0.72734346, + "epoch": 0.7300415101356594, + "grad_norm": 5.71875, + "learning_rate": 7.5235335879271335e-06, + "loss": 1.07154684, + "memory(GiB)": 302.58, + "step": 130540, + "train_speed(iter/s)": 0.123548 + }, + { + "acc": 0.71729422, + "epoch": 0.7301533596086387, + "grad_norm": 7.34375, + "learning_rate": 7.522735255418526e-06, + "loss": 1.11555605, + "memory(GiB)": 302.58, + "step": 130560, + "train_speed(iter/s)": 0.123557 + }, + { + "acc": 0.75440798, + "epoch": 0.730265209081618, + "grad_norm": 8.3125, + "learning_rate": 7.521936836624375e-06, + "loss": 0.95900621, + "memory(GiB)": 302.58, + "step": 130580, + "train_speed(iter/s)": 0.123566 + }, + { + "acc": 0.75450945, + "epoch": 0.7303770585545972, + "grad_norm": 5.84375, + "learning_rate": 7.521138331571986e-06, + "loss": 0.96610241, + "memory(GiB)": 302.58, + "step": 130600, + "train_speed(iter/s)": 0.123574 + }, + { + "acc": 0.73405776, + "epoch": 0.7304889080275765, + "grad_norm": 7.6875, + "learning_rate": 7.520339740288674e-06, + "loss": 1.04527569, + "memory(GiB)": 302.58, + "step": 130620, + "train_speed(iter/s)": 0.123583 + }, + { + "acc": 0.73434372, + "epoch": 0.7306007575005558, + "grad_norm": 10.125, + "learning_rate": 7.51954106280175e-06, + "loss": 1.03767805, + "memory(GiB)": 302.58, + "step": 130640, + "train_speed(iter/s)": 0.123592 + }, + { + "acc": 0.73596363, + "epoch": 0.730712606973535, + "grad_norm": 7.9375, + "learning_rate": 7.518742299138533e-06, + "loss": 1.03577003, + "memory(GiB)": 302.58, + "step": 130660, + "train_speed(iter/s)": 0.123601 + }, + { + "acc": 0.74987435, + "epoch": 0.7308244564465143, + "grad_norm": 8.75, + "learning_rate": 7.5179434493263415e-06, + "loss": 0.99602451, + "memory(GiB)": 302.58, + "step": 130680, + "train_speed(iter/s)": 0.12361 + }, + { + "acc": 0.75323672, + "epoch": 0.7309363059194935, + "grad_norm": 7.90625, + "learning_rate": 7.5171445133925e-06, + "loss": 0.97041531, + "memory(GiB)": 302.58, + "step": 130700, + "train_speed(iter/s)": 0.123618 + }, + { + "acc": 0.75208478, + "epoch": 0.7310481553924728, + "grad_norm": 7.5, + "learning_rate": 7.5163454913643354e-06, + "loss": 0.97461033, + "memory(GiB)": 302.58, + "step": 130720, + "train_speed(iter/s)": 0.123627 + }, + { + "acc": 0.7171639, + "epoch": 0.7311600048654521, + "grad_norm": 9.625, + "learning_rate": 7.515546383269176e-06, + "loss": 1.10495434, + "memory(GiB)": 302.58, + "step": 130740, + "train_speed(iter/s)": 0.123636 + }, + { + "acc": 0.7262115, + "epoch": 0.7312718543384313, + "grad_norm": 8.5625, + "learning_rate": 7.514747189134352e-06, + "loss": 1.07924175, + "memory(GiB)": 302.58, + "step": 130760, + "train_speed(iter/s)": 0.123645 + }, + { + "acc": 0.741394, + "epoch": 0.7313837038114106, + "grad_norm": 6.0, + "learning_rate": 7.5139479089872e-06, + "loss": 1.02458086, + "memory(GiB)": 302.58, + "step": 130780, + "train_speed(iter/s)": 0.123654 + }, + { + "acc": 0.72346072, + "epoch": 0.7314955532843899, + "grad_norm": 5.71875, + "learning_rate": 7.51314854285506e-06, + "loss": 1.11121349, + "memory(GiB)": 302.58, + "step": 130800, + "train_speed(iter/s)": 0.123662 + }, + { + "acc": 0.75747685, + "epoch": 0.7316074027573691, + "grad_norm": 8.875, + "learning_rate": 7.51234909076527e-06, + "loss": 0.96290493, + "memory(GiB)": 302.58, + "step": 130820, + "train_speed(iter/s)": 0.123671 + }, + { + "acc": 0.7345232, + "epoch": 0.7317192522303484, + "grad_norm": 6.78125, + "learning_rate": 7.511549552745176e-06, + "loss": 1.05751476, + "memory(GiB)": 302.58, + "step": 130840, + "train_speed(iter/s)": 0.12368 + }, + { + "acc": 0.74862852, + "epoch": 0.7318311017033277, + "grad_norm": 8.3125, + "learning_rate": 7.510749928822121e-06, + "loss": 0.9796052, + "memory(GiB)": 302.58, + "step": 130860, + "train_speed(iter/s)": 0.123688 + }, + { + "acc": 0.75254493, + "epoch": 0.7319429511763069, + "grad_norm": 6.625, + "learning_rate": 7.509950219023458e-06, + "loss": 0.96659851, + "memory(GiB)": 302.58, + "step": 130880, + "train_speed(iter/s)": 0.123697 + }, + { + "acc": 0.74418416, + "epoch": 0.7320548006492862, + "grad_norm": 5.5, + "learning_rate": 7.509150423376539e-06, + "loss": 1.00978317, + "memory(GiB)": 302.58, + "step": 130900, + "train_speed(iter/s)": 0.123706 + }, + { + "acc": 0.75336871, + "epoch": 0.7321666501222654, + "grad_norm": 5.34375, + "learning_rate": 7.508350541908719e-06, + "loss": 0.97767534, + "memory(GiB)": 302.58, + "step": 130920, + "train_speed(iter/s)": 0.123715 + }, + { + "acc": 0.73321772, + "epoch": 0.7322784995952447, + "grad_norm": 6.84375, + "learning_rate": 7.507550574647356e-06, + "loss": 1.06043205, + "memory(GiB)": 302.58, + "step": 130940, + "train_speed(iter/s)": 0.123725 + }, + { + "acc": 0.75941057, + "epoch": 0.732390349068224, + "grad_norm": 8.4375, + "learning_rate": 7.506750521619813e-06, + "loss": 0.93317814, + "memory(GiB)": 302.58, + "step": 130960, + "train_speed(iter/s)": 0.123734 + }, + { + "acc": 0.75488505, + "epoch": 0.7325021985412032, + "grad_norm": 6.75, + "learning_rate": 7.505950382853454e-06, + "loss": 0.94847183, + "memory(GiB)": 302.58, + "step": 130980, + "train_speed(iter/s)": 0.123744 + }, + { + "acc": 0.73408465, + "epoch": 0.7326140480141825, + "grad_norm": 5.84375, + "learning_rate": 7.505150158375645e-06, + "loss": 1.049998, + "memory(GiB)": 302.58, + "step": 131000, + "train_speed(iter/s)": 0.123753 + }, + { + "acc": 0.74947748, + "epoch": 0.7327258974871618, + "grad_norm": 5.5625, + "learning_rate": 7.504349848213757e-06, + "loss": 0.99545784, + "memory(GiB)": 302.58, + "step": 131020, + "train_speed(iter/s)": 0.123761 + }, + { + "acc": 0.75162458, + "epoch": 0.732837746960141, + "grad_norm": 10.4375, + "learning_rate": 7.5035494523951625e-06, + "loss": 0.97208319, + "memory(GiB)": 302.58, + "step": 131040, + "train_speed(iter/s)": 0.123771 + }, + { + "acc": 0.73584023, + "epoch": 0.7329495964331203, + "grad_norm": 7.75, + "learning_rate": 7.502748970947239e-06, + "loss": 1.05918894, + "memory(GiB)": 302.58, + "step": 131060, + "train_speed(iter/s)": 0.12378 + }, + { + "acc": 0.7459938, + "epoch": 0.7330614459060996, + "grad_norm": 5.1875, + "learning_rate": 7.501948403897366e-06, + "loss": 0.99876127, + "memory(GiB)": 302.58, + "step": 131080, + "train_speed(iter/s)": 0.123789 + }, + { + "acc": 0.75039167, + "epoch": 0.7331732953790788, + "grad_norm": 5.3125, + "learning_rate": 7.501147751272922e-06, + "loss": 0.94943371, + "memory(GiB)": 302.58, + "step": 131100, + "train_speed(iter/s)": 0.123798 + }, + { + "acc": 0.73267751, + "epoch": 0.7332851448520581, + "grad_norm": 6.875, + "learning_rate": 7.500347013101294e-06, + "loss": 1.04668093, + "memory(GiB)": 302.58, + "step": 131120, + "train_speed(iter/s)": 0.123806 + }, + { + "acc": 0.7648046, + "epoch": 0.7333969943250374, + "grad_norm": 9.75, + "learning_rate": 7.49954618940987e-06, + "loss": 0.93017111, + "memory(GiB)": 302.58, + "step": 131140, + "train_speed(iter/s)": 0.123815 + }, + { + "acc": 0.74058666, + "epoch": 0.7335088437980166, + "grad_norm": 4.59375, + "learning_rate": 7.498745280226041e-06, + "loss": 1.01581507, + "memory(GiB)": 302.58, + "step": 131160, + "train_speed(iter/s)": 0.123824 + }, + { + "acc": 0.74157839, + "epoch": 0.7336206932709959, + "grad_norm": 10.5625, + "learning_rate": 7.4979442855772e-06, + "loss": 1.01850119, + "memory(GiB)": 302.58, + "step": 131180, + "train_speed(iter/s)": 0.123833 + }, + { + "acc": 0.73937287, + "epoch": 0.7337325427439751, + "grad_norm": 8.625, + "learning_rate": 7.4971432054907435e-06, + "loss": 1.05594063, + "memory(GiB)": 302.58, + "step": 131200, + "train_speed(iter/s)": 0.123842 + }, + { + "acc": 0.72303729, + "epoch": 0.7338443922169544, + "grad_norm": 5.5625, + "learning_rate": 7.496342039994071e-06, + "loss": 1.08096867, + "memory(GiB)": 302.58, + "step": 131220, + "train_speed(iter/s)": 0.123851 + }, + { + "acc": 0.73641624, + "epoch": 0.7339562416899337, + "grad_norm": 5.71875, + "learning_rate": 7.495540789114586e-06, + "loss": 1.03653173, + "memory(GiB)": 302.58, + "step": 131240, + "train_speed(iter/s)": 0.12386 + }, + { + "acc": 0.74089651, + "epoch": 0.7340680911629129, + "grad_norm": 7.8125, + "learning_rate": 7.494739452879693e-06, + "loss": 1.01644821, + "memory(GiB)": 302.58, + "step": 131260, + "train_speed(iter/s)": 0.123869 + }, + { + "acc": 0.74256096, + "epoch": 0.7341799406358922, + "grad_norm": 7.625, + "learning_rate": 7.493938031316798e-06, + "loss": 1.00967131, + "memory(GiB)": 302.58, + "step": 131280, + "train_speed(iter/s)": 0.123878 + }, + { + "acc": 0.71929846, + "epoch": 0.7342917901088715, + "grad_norm": 5.78125, + "learning_rate": 7.493136524453315e-06, + "loss": 1.1111867, + "memory(GiB)": 302.58, + "step": 131300, + "train_speed(iter/s)": 0.123887 + }, + { + "acc": 0.73875699, + "epoch": 0.7344036395818507, + "grad_norm": 6.5, + "learning_rate": 7.4923349323166585e-06, + "loss": 1.0385498, + "memory(GiB)": 302.58, + "step": 131320, + "train_speed(iter/s)": 0.123897 + }, + { + "acc": 0.72241769, + "epoch": 0.73451548905483, + "grad_norm": 7.65625, + "learning_rate": 7.491533254934244e-06, + "loss": 1.09341431, + "memory(GiB)": 302.58, + "step": 131340, + "train_speed(iter/s)": 0.123904 + }, + { + "acc": 0.73116722, + "epoch": 0.7346273385278093, + "grad_norm": 5.25, + "learning_rate": 7.490731492333492e-06, + "loss": 1.06772242, + "memory(GiB)": 302.58, + "step": 131360, + "train_speed(iter/s)": 0.123913 + }, + { + "acc": 0.74535027, + "epoch": 0.7347391880007885, + "grad_norm": 8.125, + "learning_rate": 7.4899296445418246e-06, + "loss": 0.99665108, + "memory(GiB)": 302.58, + "step": 131380, + "train_speed(iter/s)": 0.123923 + }, + { + "acc": 0.74801302, + "epoch": 0.7348510374737678, + "grad_norm": 6.84375, + "learning_rate": 7.489127711586668e-06, + "loss": 0.98590918, + "memory(GiB)": 302.58, + "step": 131400, + "train_speed(iter/s)": 0.123931 + }, + { + "acc": 0.7376358, + "epoch": 0.734962886946747, + "grad_norm": 6.6875, + "learning_rate": 7.48832569349545e-06, + "loss": 1.0180583, + "memory(GiB)": 302.58, + "step": 131420, + "train_speed(iter/s)": 0.12394 + }, + { + "acc": 0.75465393, + "epoch": 0.7350747364197263, + "grad_norm": 6.125, + "learning_rate": 7.487523590295602e-06, + "loss": 0.95093622, + "memory(GiB)": 302.58, + "step": 131440, + "train_speed(iter/s)": 0.123949 + }, + { + "acc": 0.74353948, + "epoch": 0.7351865858927056, + "grad_norm": 6.15625, + "learning_rate": 7.486721402014563e-06, + "loss": 1.00467043, + "memory(GiB)": 302.58, + "step": 131460, + "train_speed(iter/s)": 0.123958 + }, + { + "acc": 0.73776364, + "epoch": 0.7352984353656848, + "grad_norm": 7.0625, + "learning_rate": 7.485919128679765e-06, + "loss": 1.04151926, + "memory(GiB)": 302.58, + "step": 131480, + "train_speed(iter/s)": 0.123967 + }, + { + "acc": 0.74240713, + "epoch": 0.7354102848386641, + "grad_norm": 7.8125, + "learning_rate": 7.48511677031865e-06, + "loss": 1.00021572, + "memory(GiB)": 302.58, + "step": 131500, + "train_speed(iter/s)": 0.123975 + }, + { + "acc": 0.73193159, + "epoch": 0.7355221343116434, + "grad_norm": 7.9375, + "learning_rate": 7.484314326958662e-06, + "loss": 1.04824486, + "memory(GiB)": 302.58, + "step": 131520, + "train_speed(iter/s)": 0.123984 + }, + { + "acc": 0.71937647, + "epoch": 0.7356339837846226, + "grad_norm": 6.53125, + "learning_rate": 7.483511798627247e-06, + "loss": 1.12031212, + "memory(GiB)": 302.58, + "step": 131540, + "train_speed(iter/s)": 0.123993 + }, + { + "acc": 0.73445997, + "epoch": 0.7357458332576019, + "grad_norm": 5.65625, + "learning_rate": 7.482709185351852e-06, + "loss": 1.0462389, + "memory(GiB)": 302.58, + "step": 131560, + "train_speed(iter/s)": 0.124002 + }, + { + "acc": 0.73797116, + "epoch": 0.7358576827305812, + "grad_norm": 10.4375, + "learning_rate": 7.48190648715993e-06, + "loss": 1.03303251, + "memory(GiB)": 302.58, + "step": 131580, + "train_speed(iter/s)": 0.124011 + }, + { + "acc": 0.74285002, + "epoch": 0.7359695322035604, + "grad_norm": 8.9375, + "learning_rate": 7.481103704078938e-06, + "loss": 1.00499105, + "memory(GiB)": 302.58, + "step": 131600, + "train_speed(iter/s)": 0.12402 + }, + { + "acc": 0.73137202, + "epoch": 0.7360813816765397, + "grad_norm": 4.5625, + "learning_rate": 7.480300836136332e-06, + "loss": 1.04554157, + "memory(GiB)": 302.58, + "step": 131620, + "train_speed(iter/s)": 0.124029 + }, + { + "acc": 0.73864059, + "epoch": 0.736193231149519, + "grad_norm": 7.53125, + "learning_rate": 7.479497883359571e-06, + "loss": 1.02413816, + "memory(GiB)": 302.58, + "step": 131640, + "train_speed(iter/s)": 0.124037 + }, + { + "acc": 0.75901351, + "epoch": 0.7363050806224982, + "grad_norm": 6.40625, + "learning_rate": 7.478694845776121e-06, + "loss": 0.95046377, + "memory(GiB)": 302.58, + "step": 131660, + "train_speed(iter/s)": 0.124046 + }, + { + "acc": 0.74723186, + "epoch": 0.7364169300954775, + "grad_norm": 8.625, + "learning_rate": 7.477891723413448e-06, + "loss": 0.98784113, + "memory(GiB)": 302.58, + "step": 131680, + "train_speed(iter/s)": 0.124055 + }, + { + "acc": 0.72415853, + "epoch": 0.7365287795684567, + "grad_norm": 6.28125, + "learning_rate": 7.477088516299021e-06, + "loss": 1.11677351, + "memory(GiB)": 302.58, + "step": 131700, + "train_speed(iter/s)": 0.124063 + }, + { + "acc": 0.74264832, + "epoch": 0.736640629041436, + "grad_norm": 8.3125, + "learning_rate": 7.47628522446031e-06, + "loss": 0.9977849, + "memory(GiB)": 302.58, + "step": 131720, + "train_speed(iter/s)": 0.124072 + }, + { + "acc": 0.72521982, + "epoch": 0.7367524785144153, + "grad_norm": 7.25, + "learning_rate": 7.475481847924794e-06, + "loss": 1.09724436, + "memory(GiB)": 302.58, + "step": 131740, + "train_speed(iter/s)": 0.12408 + }, + { + "acc": 0.73578582, + "epoch": 0.7368643279873945, + "grad_norm": 5.53125, + "learning_rate": 7.474678386719949e-06, + "loss": 1.04966507, + "memory(GiB)": 302.58, + "step": 131760, + "train_speed(iter/s)": 0.124089 + }, + { + "acc": 0.72583265, + "epoch": 0.7369761774603738, + "grad_norm": 13.0, + "learning_rate": 7.473874840873256e-06, + "loss": 1.08450527, + "memory(GiB)": 302.58, + "step": 131780, + "train_speed(iter/s)": 0.124098 + }, + { + "acc": 0.74486456, + "epoch": 0.7370880269333531, + "grad_norm": 7.125, + "learning_rate": 7.473071210412199e-06, + "loss": 1.02128639, + "memory(GiB)": 302.58, + "step": 131800, + "train_speed(iter/s)": 0.124107 + }, + { + "acc": 0.73880148, + "epoch": 0.7371998764063323, + "grad_norm": 9.0, + "learning_rate": 7.472267495364266e-06, + "loss": 1.02086916, + "memory(GiB)": 302.58, + "step": 131820, + "train_speed(iter/s)": 0.124116 + }, + { + "acc": 0.74735842, + "epoch": 0.7373117258793116, + "grad_norm": 5.6875, + "learning_rate": 7.471463695756944e-06, + "loss": 0.9935585, + "memory(GiB)": 302.58, + "step": 131840, + "train_speed(iter/s)": 0.124124 + }, + { + "acc": 0.74342527, + "epoch": 0.7374235753522909, + "grad_norm": 8.5625, + "learning_rate": 7.470659811617726e-06, + "loss": 0.99577732, + "memory(GiB)": 302.58, + "step": 131860, + "train_speed(iter/s)": 0.124133 + }, + { + "acc": 0.74184365, + "epoch": 0.7375354248252701, + "grad_norm": 8.1875, + "learning_rate": 7.46985584297411e-06, + "loss": 1.01859884, + "memory(GiB)": 302.58, + "step": 131880, + "train_speed(iter/s)": 0.124142 + }, + { + "acc": 0.73357844, + "epoch": 0.7376472742982494, + "grad_norm": 7.53125, + "learning_rate": 7.469051789853592e-06, + "loss": 1.05442276, + "memory(GiB)": 302.58, + "step": 131900, + "train_speed(iter/s)": 0.124151 + }, + { + "acc": 0.73268743, + "epoch": 0.7377591237712287, + "grad_norm": 7.6875, + "learning_rate": 7.4682476522836734e-06, + "loss": 1.05753059, + "memory(GiB)": 302.58, + "step": 131920, + "train_speed(iter/s)": 0.124159 + }, + { + "acc": 0.74121876, + "epoch": 0.7378709732442079, + "grad_norm": 7.9375, + "learning_rate": 7.467443430291857e-06, + "loss": 1.05139246, + "memory(GiB)": 302.58, + "step": 131940, + "train_speed(iter/s)": 0.124168 + }, + { + "acc": 0.75145049, + "epoch": 0.7379828227171872, + "grad_norm": 7.15625, + "learning_rate": 7.466639123905653e-06, + "loss": 0.97705488, + "memory(GiB)": 302.58, + "step": 131960, + "train_speed(iter/s)": 0.124177 + }, + { + "acc": 0.73304534, + "epoch": 0.7380946721901664, + "grad_norm": 7.375, + "learning_rate": 7.465834733152568e-06, + "loss": 1.06584978, + "memory(GiB)": 302.58, + "step": 131980, + "train_speed(iter/s)": 0.124186 + }, + { + "acc": 0.74983158, + "epoch": 0.7382065216631457, + "grad_norm": 5.34375, + "learning_rate": 7.465030258060117e-06, + "loss": 0.97365866, + "memory(GiB)": 302.58, + "step": 132000, + "train_speed(iter/s)": 0.124195 + }, + { + "epoch": 0.7382065216631457, + "eval_acc": 0.7045344367137224, + "eval_loss": 1.0215179920196533, + "eval_runtime": 7508.3136, + "eval_samples_per_second": 10.027, + "eval_steps_per_second": 10.027, + "step": 132000 + }, + { + "acc": 0.74465957, + "epoch": 0.738318371136125, + "grad_norm": 4.78125, + "learning_rate": 7.4642256986558156e-06, + "loss": 1.00306654, + "memory(GiB)": 302.58, + "step": 132020, + "train_speed(iter/s)": 0.123318 + }, + { + "acc": 0.73502026, + "epoch": 0.7384302206091042, + "grad_norm": 6.59375, + "learning_rate": 7.46342105496718e-06, + "loss": 1.01915684, + "memory(GiB)": 302.58, + "step": 132040, + "train_speed(iter/s)": 0.123327 + }, + { + "acc": 0.73039517, + "epoch": 0.7385420700820835, + "grad_norm": 6.15625, + "learning_rate": 7.462616327021735e-06, + "loss": 1.06434116, + "memory(GiB)": 302.58, + "step": 132060, + "train_speed(iter/s)": 0.123335 + }, + { + "acc": 0.73224525, + "epoch": 0.7386539195550628, + "grad_norm": 5.4375, + "learning_rate": 7.461811514847002e-06, + "loss": 1.05811834, + "memory(GiB)": 302.58, + "step": 132080, + "train_speed(iter/s)": 0.123344 + }, + { + "acc": 0.73312836, + "epoch": 0.738765769028042, + "grad_norm": 4.96875, + "learning_rate": 7.4610066184705085e-06, + "loss": 1.05473385, + "memory(GiB)": 302.58, + "step": 132100, + "train_speed(iter/s)": 0.123353 + }, + { + "acc": 0.70786643, + "epoch": 0.7388776185010213, + "grad_norm": 7.1875, + "learning_rate": 7.460201637919786e-06, + "loss": 1.16057825, + "memory(GiB)": 302.58, + "step": 132120, + "train_speed(iter/s)": 0.123362 + }, + { + "acc": 0.73883638, + "epoch": 0.7389894679740006, + "grad_norm": 7.78125, + "learning_rate": 7.459396573222365e-06, + "loss": 1.02435579, + "memory(GiB)": 302.58, + "step": 132140, + "train_speed(iter/s)": 0.123371 + }, + { + "acc": 0.75094719, + "epoch": 0.7391013174469798, + "grad_norm": 8.125, + "learning_rate": 7.458591424405783e-06, + "loss": 0.94797735, + "memory(GiB)": 302.58, + "step": 132160, + "train_speed(iter/s)": 0.123381 + }, + { + "acc": 0.75966673, + "epoch": 0.7392131669199591, + "grad_norm": 9.5625, + "learning_rate": 7.457786191497578e-06, + "loss": 0.92884789, + "memory(GiB)": 302.58, + "step": 132180, + "train_speed(iter/s)": 0.12339 + }, + { + "acc": 0.73851991, + "epoch": 0.7393250163929384, + "grad_norm": 7.40625, + "learning_rate": 7.456980874525292e-06, + "loss": 1.04449654, + "memory(GiB)": 302.58, + "step": 132200, + "train_speed(iter/s)": 0.123398 + }, + { + "acc": 0.73446608, + "epoch": 0.7394368658659176, + "grad_norm": 6.34375, + "learning_rate": 7.456175473516472e-06, + "loss": 1.04695005, + "memory(GiB)": 302.58, + "step": 132220, + "train_speed(iter/s)": 0.123407 + }, + { + "acc": 0.73579893, + "epoch": 0.7395487153388969, + "grad_norm": 5.5625, + "learning_rate": 7.45536998849866e-06, + "loss": 1.04611082, + "memory(GiB)": 302.58, + "step": 132240, + "train_speed(iter/s)": 0.123416 + }, + { + "acc": 0.74753032, + "epoch": 0.7396605648118761, + "grad_norm": 7.1875, + "learning_rate": 7.4545644194994085e-06, + "loss": 1.00493288, + "memory(GiB)": 302.58, + "step": 132260, + "train_speed(iter/s)": 0.123425 + }, + { + "acc": 0.75770211, + "epoch": 0.7397724142848554, + "grad_norm": 10.4375, + "learning_rate": 7.4537587665462704e-06, + "loss": 0.96607952, + "memory(GiB)": 302.58, + "step": 132280, + "train_speed(iter/s)": 0.123433 + }, + { + "acc": 0.73092642, + "epoch": 0.7398842637578347, + "grad_norm": 10.0, + "learning_rate": 7.4529530296668015e-06, + "loss": 1.07051764, + "memory(GiB)": 302.58, + "step": 132300, + "train_speed(iter/s)": 0.123442 + }, + { + "acc": 0.73273191, + "epoch": 0.7399961132308139, + "grad_norm": 7.9375, + "learning_rate": 7.4521472088885625e-06, + "loss": 1.07559423, + "memory(GiB)": 302.58, + "step": 132320, + "train_speed(iter/s)": 0.123451 + }, + { + "acc": 0.731147, + "epoch": 0.7401079627037932, + "grad_norm": 7.84375, + "learning_rate": 7.451341304239112e-06, + "loss": 1.04978075, + "memory(GiB)": 302.58, + "step": 132340, + "train_speed(iter/s)": 0.123459 + }, + { + "acc": 0.75220819, + "epoch": 0.7402198121767725, + "grad_norm": 7.375, + "learning_rate": 7.450535315746017e-06, + "loss": 0.97968674, + "memory(GiB)": 302.58, + "step": 132360, + "train_speed(iter/s)": 0.123468 + }, + { + "acc": 0.74471049, + "epoch": 0.7403316616497517, + "grad_norm": 6.625, + "learning_rate": 7.449729243436844e-06, + "loss": 0.99395151, + "memory(GiB)": 302.58, + "step": 132380, + "train_speed(iter/s)": 0.123478 + }, + { + "acc": 0.74790912, + "epoch": 0.740443511122731, + "grad_norm": 8.375, + "learning_rate": 7.448923087339162e-06, + "loss": 0.97486906, + "memory(GiB)": 302.58, + "step": 132400, + "train_speed(iter/s)": 0.123486 + }, + { + "acc": 0.74481106, + "epoch": 0.7405553605957103, + "grad_norm": 6.875, + "learning_rate": 7.448116847480544e-06, + "loss": 0.98128223, + "memory(GiB)": 302.58, + "step": 132420, + "train_speed(iter/s)": 0.123495 + }, + { + "acc": 0.74663548, + "epoch": 0.7406672100686895, + "grad_norm": 10.5, + "learning_rate": 7.447310523888569e-06, + "loss": 0.99424171, + "memory(GiB)": 302.58, + "step": 132440, + "train_speed(iter/s)": 0.123503 + }, + { + "acc": 0.75224366, + "epoch": 0.7407790595416688, + "grad_norm": 6.78125, + "learning_rate": 7.446504116590814e-06, + "loss": 0.97058172, + "memory(GiB)": 302.58, + "step": 132460, + "train_speed(iter/s)": 0.123512 + }, + { + "acc": 0.74618073, + "epoch": 0.740890909014648, + "grad_norm": 6.59375, + "learning_rate": 7.445697625614859e-06, + "loss": 0.99088459, + "memory(GiB)": 302.58, + "step": 132480, + "train_speed(iter/s)": 0.123521 + }, + { + "acc": 0.72887368, + "epoch": 0.7410027584876273, + "grad_norm": 8.4375, + "learning_rate": 7.444891050988292e-06, + "loss": 1.04875059, + "memory(GiB)": 302.58, + "step": 132500, + "train_speed(iter/s)": 0.12353 + }, + { + "acc": 0.74863639, + "epoch": 0.7411146079606066, + "grad_norm": 7.625, + "learning_rate": 7.444084392738697e-06, + "loss": 0.97707043, + "memory(GiB)": 302.58, + "step": 132520, + "train_speed(iter/s)": 0.123539 + }, + { + "acc": 0.74573588, + "epoch": 0.7412264574335858, + "grad_norm": 7.6875, + "learning_rate": 7.443277650893667e-06, + "loss": 1.00248756, + "memory(GiB)": 302.58, + "step": 132540, + "train_speed(iter/s)": 0.123547 + }, + { + "acc": 0.72898388, + "epoch": 0.7413383069065651, + "grad_norm": 6.34375, + "learning_rate": 7.442470825480792e-06, + "loss": 1.0725914, + "memory(GiB)": 302.58, + "step": 132560, + "train_speed(iter/s)": 0.123556 + }, + { + "acc": 0.73202615, + "epoch": 0.7414501563795444, + "grad_norm": 5.90625, + "learning_rate": 7.441663916527671e-06, + "loss": 1.0767499, + "memory(GiB)": 302.58, + "step": 132580, + "train_speed(iter/s)": 0.123565 + }, + { + "acc": 0.73164763, + "epoch": 0.7415620058525236, + "grad_norm": 7.5625, + "learning_rate": 7.440856924061902e-06, + "loss": 1.04305944, + "memory(GiB)": 302.58, + "step": 132600, + "train_speed(iter/s)": 0.123574 + }, + { + "acc": 0.75785918, + "epoch": 0.7416738553255029, + "grad_norm": 6.84375, + "learning_rate": 7.440049848111087e-06, + "loss": 0.91940536, + "memory(GiB)": 302.58, + "step": 132620, + "train_speed(iter/s)": 0.123583 + }, + { + "acc": 0.75105968, + "epoch": 0.7417857047984822, + "grad_norm": 5.78125, + "learning_rate": 7.439242688702829e-06, + "loss": 0.97341652, + "memory(GiB)": 302.58, + "step": 132640, + "train_speed(iter/s)": 0.123591 + }, + { + "acc": 0.74134274, + "epoch": 0.7418975542714614, + "grad_norm": 7.1875, + "learning_rate": 7.4384354458647355e-06, + "loss": 1.01173439, + "memory(GiB)": 302.58, + "step": 132660, + "train_speed(iter/s)": 0.123599 + }, + { + "acc": 0.74046326, + "epoch": 0.7420094037444407, + "grad_norm": 6.6875, + "learning_rate": 7.437628119624421e-06, + "loss": 1.02743454, + "memory(GiB)": 302.58, + "step": 132680, + "train_speed(iter/s)": 0.123608 + }, + { + "acc": 0.7326344, + "epoch": 0.74212125321742, + "grad_norm": 6.4375, + "learning_rate": 7.436820710009492e-06, + "loss": 1.06016588, + "memory(GiB)": 302.58, + "step": 132700, + "train_speed(iter/s)": 0.123617 + }, + { + "acc": 0.73776164, + "epoch": 0.7422331026903992, + "grad_norm": 5.28125, + "learning_rate": 7.43601321704757e-06, + "loss": 1.0462327, + "memory(GiB)": 302.58, + "step": 132720, + "train_speed(iter/s)": 0.123626 + }, + { + "acc": 0.74355187, + "epoch": 0.7423449521633785, + "grad_norm": 5.53125, + "learning_rate": 7.435205640766269e-06, + "loss": 1.01851215, + "memory(GiB)": 302.58, + "step": 132740, + "train_speed(iter/s)": 0.123635 + }, + { + "acc": 0.73646808, + "epoch": 0.7424568016363577, + "grad_norm": 7.71875, + "learning_rate": 7.4343979811932155e-06, + "loss": 1.0384243, + "memory(GiB)": 302.58, + "step": 132760, + "train_speed(iter/s)": 0.123644 + }, + { + "acc": 0.75105262, + "epoch": 0.742568651109337, + "grad_norm": 6.75, + "learning_rate": 7.433590238356031e-06, + "loss": 0.97352743, + "memory(GiB)": 302.58, + "step": 132780, + "train_speed(iter/s)": 0.123653 + }, + { + "acc": 0.73619943, + "epoch": 0.7426805005823163, + "grad_norm": 6.34375, + "learning_rate": 7.4327824122823436e-06, + "loss": 1.03096943, + "memory(GiB)": 302.58, + "step": 132800, + "train_speed(iter/s)": 0.123662 + }, + { + "acc": 0.74899564, + "epoch": 0.7427923500552955, + "grad_norm": 8.5, + "learning_rate": 7.431974502999784e-06, + "loss": 0.98902445, + "memory(GiB)": 302.58, + "step": 132820, + "train_speed(iter/s)": 0.123671 + }, + { + "acc": 0.74537206, + "epoch": 0.7429041995282748, + "grad_norm": 5.3125, + "learning_rate": 7.431166510535985e-06, + "loss": 1.01225405, + "memory(GiB)": 302.58, + "step": 132840, + "train_speed(iter/s)": 0.123679 + }, + { + "acc": 0.74841366, + "epoch": 0.7430160490012541, + "grad_norm": 9.3125, + "learning_rate": 7.430358434918581e-06, + "loss": 0.98115692, + "memory(GiB)": 302.58, + "step": 132860, + "train_speed(iter/s)": 0.123688 + }, + { + "acc": 0.74545727, + "epoch": 0.7431278984742333, + "grad_norm": 8.4375, + "learning_rate": 7.429550276175213e-06, + "loss": 1.00277405, + "memory(GiB)": 302.58, + "step": 132880, + "train_speed(iter/s)": 0.123697 + }, + { + "acc": 0.73216171, + "epoch": 0.7432397479472126, + "grad_norm": 8.1875, + "learning_rate": 7.428742034333521e-06, + "loss": 1.04715548, + "memory(GiB)": 302.58, + "step": 132900, + "train_speed(iter/s)": 0.123707 + }, + { + "acc": 0.72828131, + "epoch": 0.7433515974201919, + "grad_norm": 6.21875, + "learning_rate": 7.427933709421151e-06, + "loss": 1.05808964, + "memory(GiB)": 302.58, + "step": 132920, + "train_speed(iter/s)": 0.123715 + }, + { + "acc": 0.73806801, + "epoch": 0.7434634468931711, + "grad_norm": 9.875, + "learning_rate": 7.427125301465749e-06, + "loss": 1.03598509, + "memory(GiB)": 302.58, + "step": 132940, + "train_speed(iter/s)": 0.123724 + }, + { + "acc": 0.72644696, + "epoch": 0.7435752963661504, + "grad_norm": 6.90625, + "learning_rate": 7.4263168104949655e-06, + "loss": 1.08195314, + "memory(GiB)": 302.58, + "step": 132960, + "train_speed(iter/s)": 0.123732 + }, + { + "acc": 0.74960675, + "epoch": 0.7436871458391296, + "grad_norm": 6.84375, + "learning_rate": 7.425508236536454e-06, + "loss": 1.00598698, + "memory(GiB)": 302.58, + "step": 132980, + "train_speed(iter/s)": 0.123741 + }, + { + "acc": 0.74684825, + "epoch": 0.7437989953121089, + "grad_norm": 6.125, + "learning_rate": 7.424699579617868e-06, + "loss": 0.96543341, + "memory(GiB)": 302.58, + "step": 133000, + "train_speed(iter/s)": 0.12375 + }, + { + "acc": 0.72255459, + "epoch": 0.7439108447850882, + "grad_norm": 4.53125, + "learning_rate": 7.4238908397668694e-06, + "loss": 1.13902569, + "memory(GiB)": 302.58, + "step": 133020, + "train_speed(iter/s)": 0.123759 + }, + { + "acc": 0.72760453, + "epoch": 0.7440226942580674, + "grad_norm": 8.5625, + "learning_rate": 7.423082017011119e-06, + "loss": 1.077878, + "memory(GiB)": 302.58, + "step": 133040, + "train_speed(iter/s)": 0.123768 + }, + { + "acc": 0.73914018, + "epoch": 0.7441345437310467, + "grad_norm": 6.1875, + "learning_rate": 7.422273111378278e-06, + "loss": 1.04145575, + "memory(GiB)": 302.58, + "step": 133060, + "train_speed(iter/s)": 0.123777 + }, + { + "acc": 0.74251804, + "epoch": 0.744246393204026, + "grad_norm": 8.125, + "learning_rate": 7.421464122896017e-06, + "loss": 1.00602551, + "memory(GiB)": 302.58, + "step": 133080, + "train_speed(iter/s)": 0.123786 + }, + { + "acc": 0.73444815, + "epoch": 0.7443582426770052, + "grad_norm": 6.21875, + "learning_rate": 7.420655051592005e-06, + "loss": 1.05008993, + "memory(GiB)": 302.58, + "step": 133100, + "train_speed(iter/s)": 0.123795 + }, + { + "acc": 0.73355255, + "epoch": 0.7444700921499845, + "grad_norm": 7.5625, + "learning_rate": 7.419845897493914e-06, + "loss": 1.05324936, + "memory(GiB)": 302.58, + "step": 133120, + "train_speed(iter/s)": 0.123804 + }, + { + "acc": 0.74068537, + "epoch": 0.7445819416229638, + "grad_norm": 5.34375, + "learning_rate": 7.419036660629419e-06, + "loss": 1.0237072, + "memory(GiB)": 302.58, + "step": 133140, + "train_speed(iter/s)": 0.123811 + }, + { + "acc": 0.74485826, + "epoch": 0.744693791095943, + "grad_norm": 6.40625, + "learning_rate": 7.418227341026201e-06, + "loss": 1.0067318, + "memory(GiB)": 302.58, + "step": 133160, + "train_speed(iter/s)": 0.123819 + }, + { + "acc": 0.74989743, + "epoch": 0.7448056405689223, + "grad_norm": 8.4375, + "learning_rate": 7.4174179387119405e-06, + "loss": 0.97377968, + "memory(GiB)": 302.58, + "step": 133180, + "train_speed(iter/s)": 0.123828 + }, + { + "acc": 0.73247838, + "epoch": 0.7449174900419017, + "grad_norm": 8.0625, + "learning_rate": 7.41660845371432e-06, + "loss": 1.04042854, + "memory(GiB)": 302.58, + "step": 133200, + "train_speed(iter/s)": 0.123835 + }, + { + "acc": 0.75701103, + "epoch": 0.7450293395148809, + "grad_norm": 7.1875, + "learning_rate": 7.4157988860610285e-06, + "loss": 0.94013739, + "memory(GiB)": 302.58, + "step": 133220, + "train_speed(iter/s)": 0.123844 + }, + { + "acc": 0.75056887, + "epoch": 0.7451411889878602, + "grad_norm": 8.0625, + "learning_rate": 7.414989235779753e-06, + "loss": 0.98751268, + "memory(GiB)": 302.58, + "step": 133240, + "train_speed(iter/s)": 0.123852 + }, + { + "acc": 0.74823194, + "epoch": 0.7452530384608395, + "grad_norm": 7.15625, + "learning_rate": 7.41417950289819e-06, + "loss": 0.98862028, + "memory(GiB)": 302.58, + "step": 133260, + "train_speed(iter/s)": 0.12386 + }, + { + "acc": 0.7380856, + "epoch": 0.7453648879338187, + "grad_norm": 6.28125, + "learning_rate": 7.413369687444032e-06, + "loss": 1.02251978, + "memory(GiB)": 302.58, + "step": 133280, + "train_speed(iter/s)": 0.123869 + }, + { + "acc": 0.7380003, + "epoch": 0.745476737406798, + "grad_norm": 6.90625, + "learning_rate": 7.4125597894449774e-06, + "loss": 1.01778355, + "memory(GiB)": 302.58, + "step": 133300, + "train_speed(iter/s)": 0.123878 + }, + { + "acc": 0.73971963, + "epoch": 0.7455885868797772, + "grad_norm": 6.75, + "learning_rate": 7.4117498089287275e-06, + "loss": 1.01314993, + "memory(GiB)": 302.58, + "step": 133320, + "train_speed(iter/s)": 0.123887 + }, + { + "acc": 0.71840262, + "epoch": 0.7457004363527565, + "grad_norm": 4.9375, + "learning_rate": 7.410939745922989e-06, + "loss": 1.15184002, + "memory(GiB)": 302.58, + "step": 133340, + "train_speed(iter/s)": 0.123895 + }, + { + "acc": 0.74335217, + "epoch": 0.7458122858257358, + "grad_norm": 5.96875, + "learning_rate": 7.410129600455464e-06, + "loss": 1.00864706, + "memory(GiB)": 302.58, + "step": 133360, + "train_speed(iter/s)": 0.123904 + }, + { + "acc": 0.75790291, + "epoch": 0.745924135298715, + "grad_norm": 9.625, + "learning_rate": 7.409319372553867e-06, + "loss": 0.95047512, + "memory(GiB)": 302.58, + "step": 133380, + "train_speed(iter/s)": 0.123913 + }, + { + "acc": 0.73155627, + "epoch": 0.7460359847716943, + "grad_norm": 6.28125, + "learning_rate": 7.408509062245906e-06, + "loss": 1.05245352, + "memory(GiB)": 302.58, + "step": 133400, + "train_speed(iter/s)": 0.123922 + }, + { + "acc": 0.72343001, + "epoch": 0.7461478342446736, + "grad_norm": 6.375, + "learning_rate": 7.407698669559298e-06, + "loss": 1.07890663, + "memory(GiB)": 302.58, + "step": 133420, + "train_speed(iter/s)": 0.123931 + }, + { + "acc": 0.74319677, + "epoch": 0.7462596837176528, + "grad_norm": 4.96875, + "learning_rate": 7.406888194521761e-06, + "loss": 1.01094465, + "memory(GiB)": 302.58, + "step": 133440, + "train_speed(iter/s)": 0.123939 + }, + { + "acc": 0.76045933, + "epoch": 0.7463715331906321, + "grad_norm": 8.4375, + "learning_rate": 7.406077637161015e-06, + "loss": 0.95081348, + "memory(GiB)": 302.58, + "step": 133460, + "train_speed(iter/s)": 0.123948 + }, + { + "acc": 0.75655332, + "epoch": 0.7464833826636114, + "grad_norm": 6.84375, + "learning_rate": 7.4052669975047856e-06, + "loss": 0.93689175, + "memory(GiB)": 302.58, + "step": 133480, + "train_speed(iter/s)": 0.123957 + }, + { + "acc": 0.73729, + "epoch": 0.7465952321365906, + "grad_norm": 6.15625, + "learning_rate": 7.404456275580798e-06, + "loss": 1.04019756, + "memory(GiB)": 302.58, + "step": 133500, + "train_speed(iter/s)": 0.123965 + }, + { + "acc": 0.74030995, + "epoch": 0.7467070816095699, + "grad_norm": 5.5, + "learning_rate": 7.403645471416781e-06, + "loss": 1.04007483, + "memory(GiB)": 302.58, + "step": 133520, + "train_speed(iter/s)": 0.123974 + }, + { + "acc": 0.75102253, + "epoch": 0.7468189310825492, + "grad_norm": 4.6875, + "learning_rate": 7.402834585040467e-06, + "loss": 0.9835083, + "memory(GiB)": 302.58, + "step": 133540, + "train_speed(iter/s)": 0.123982 + }, + { + "acc": 0.71813879, + "epoch": 0.7469307805555284, + "grad_norm": 6.28125, + "learning_rate": 7.4020236164795915e-06, + "loss": 1.09438696, + "memory(GiB)": 302.58, + "step": 133560, + "train_speed(iter/s)": 0.123991 + }, + { + "acc": 0.71828218, + "epoch": 0.7470426300285077, + "grad_norm": 7.9375, + "learning_rate": 7.40121256576189e-06, + "loss": 1.10560436, + "memory(GiB)": 302.58, + "step": 133580, + "train_speed(iter/s)": 0.124 + }, + { + "acc": 0.72473035, + "epoch": 0.747154479501487, + "grad_norm": 7.9375, + "learning_rate": 7.400401432915106e-06, + "loss": 1.08143129, + "memory(GiB)": 302.58, + "step": 133600, + "train_speed(iter/s)": 0.124008 + }, + { + "acc": 0.74833779, + "epoch": 0.7472663289744662, + "grad_norm": 7.375, + "learning_rate": 7.399590217966982e-06, + "loss": 0.98995152, + "memory(GiB)": 302.58, + "step": 133620, + "train_speed(iter/s)": 0.124017 + }, + { + "acc": 0.74557867, + "epoch": 0.7473781784474455, + "grad_norm": 5.78125, + "learning_rate": 7.398778920945263e-06, + "loss": 0.9973052, + "memory(GiB)": 302.58, + "step": 133640, + "train_speed(iter/s)": 0.124025 + }, + { + "acc": 0.73306341, + "epoch": 0.7474900279204247, + "grad_norm": 8.1875, + "learning_rate": 7.397967541877699e-06, + "loss": 1.03256416, + "memory(GiB)": 302.58, + "step": 133660, + "train_speed(iter/s)": 0.124034 + }, + { + "acc": 0.73592834, + "epoch": 0.747601877393404, + "grad_norm": 8.25, + "learning_rate": 7.397156080792041e-06, + "loss": 1.04864111, + "memory(GiB)": 302.58, + "step": 133680, + "train_speed(iter/s)": 0.124043 + }, + { + "acc": 0.75127149, + "epoch": 0.7477137268663833, + "grad_norm": 7.46875, + "learning_rate": 7.396344537716045e-06, + "loss": 0.98588381, + "memory(GiB)": 302.58, + "step": 133700, + "train_speed(iter/s)": 0.124052 + }, + { + "acc": 0.73665471, + "epoch": 0.7478255763393625, + "grad_norm": 6.5625, + "learning_rate": 7.395532912677464e-06, + "loss": 1.0585146, + "memory(GiB)": 302.58, + "step": 133720, + "train_speed(iter/s)": 0.12406 + }, + { + "acc": 0.7355639, + "epoch": 0.7479374258123418, + "grad_norm": 5.4375, + "learning_rate": 7.394721205704064e-06, + "loss": 1.06383581, + "memory(GiB)": 302.58, + "step": 133740, + "train_speed(iter/s)": 0.124069 + }, + { + "acc": 0.73099999, + "epoch": 0.7480492752853211, + "grad_norm": 7.5, + "learning_rate": 7.393909416823605e-06, + "loss": 1.06536007, + "memory(GiB)": 302.58, + "step": 133760, + "train_speed(iter/s)": 0.124077 + }, + { + "acc": 0.74186001, + "epoch": 0.7481611247583003, + "grad_norm": 8.375, + "learning_rate": 7.393097546063853e-06, + "loss": 0.98558311, + "memory(GiB)": 302.58, + "step": 133780, + "train_speed(iter/s)": 0.124086 + }, + { + "acc": 0.74392548, + "epoch": 0.7482729742312796, + "grad_norm": 10.25, + "learning_rate": 7.392285593452577e-06, + "loss": 1.01247892, + "memory(GiB)": 302.58, + "step": 133800, + "train_speed(iter/s)": 0.124095 + }, + { + "acc": 0.74439602, + "epoch": 0.7483848237042589, + "grad_norm": 7.875, + "learning_rate": 7.391473559017547e-06, + "loss": 0.99801321, + "memory(GiB)": 302.58, + "step": 133820, + "train_speed(iter/s)": 0.124104 + }, + { + "acc": 0.73365197, + "epoch": 0.7484966731772381, + "grad_norm": 5.9375, + "learning_rate": 7.390661442786539e-06, + "loss": 1.03217468, + "memory(GiB)": 302.58, + "step": 133840, + "train_speed(iter/s)": 0.124112 + }, + { + "acc": 0.73265224, + "epoch": 0.7486085226502174, + "grad_norm": 7.34375, + "learning_rate": 7.3898492447873284e-06, + "loss": 1.06198025, + "memory(GiB)": 302.58, + "step": 133860, + "train_speed(iter/s)": 0.124121 + }, + { + "acc": 0.75203204, + "epoch": 0.7487203721231966, + "grad_norm": 4.4375, + "learning_rate": 7.389036965047696e-06, + "loss": 0.99772186, + "memory(GiB)": 302.58, + "step": 133880, + "train_speed(iter/s)": 0.124129 + }, + { + "acc": 0.73836098, + "epoch": 0.7488322215961759, + "grad_norm": 5.9375, + "learning_rate": 7.388224603595423e-06, + "loss": 1.03473644, + "memory(GiB)": 302.58, + "step": 133900, + "train_speed(iter/s)": 0.124138 + }, + { + "acc": 0.74377146, + "epoch": 0.7489440710691552, + "grad_norm": 6.59375, + "learning_rate": 7.387412160458296e-06, + "loss": 1.01520672, + "memory(GiB)": 302.58, + "step": 133920, + "train_speed(iter/s)": 0.124147 + }, + { + "acc": 0.74366636, + "epoch": 0.7490559205421344, + "grad_norm": 7.8125, + "learning_rate": 7.386599635664104e-06, + "loss": 0.99191322, + "memory(GiB)": 302.58, + "step": 133940, + "train_speed(iter/s)": 0.124156 + }, + { + "acc": 0.74710784, + "epoch": 0.7491677700151137, + "grad_norm": 11.125, + "learning_rate": 7.385787029240636e-06, + "loss": 0.99003401, + "memory(GiB)": 302.58, + "step": 133960, + "train_speed(iter/s)": 0.124165 + }, + { + "acc": 0.74240274, + "epoch": 0.749279619488093, + "grad_norm": 6.40625, + "learning_rate": 7.384974341215687e-06, + "loss": 0.98265285, + "memory(GiB)": 302.58, + "step": 133980, + "train_speed(iter/s)": 0.124174 + }, + { + "acc": 0.73790574, + "epoch": 0.7493914689610722, + "grad_norm": 8.0625, + "learning_rate": 7.3841615716170525e-06, + "loss": 1.04362068, + "memory(GiB)": 302.58, + "step": 134000, + "train_speed(iter/s)": 0.124183 + }, + { + "epoch": 0.7493914689610722, + "eval_acc": 0.7046143468042717, + "eval_loss": 1.0212483406066895, + "eval_runtime": 7505.6055, + "eval_samples_per_second": 10.03, + "eval_steps_per_second": 10.03, + "step": 134000 + }, + { + "acc": 0.73198647, + "epoch": 0.7495033184340515, + "grad_norm": 7.9375, + "learning_rate": 7.3833487204725315e-06, + "loss": 1.04570389, + "memory(GiB)": 302.58, + "step": 134020, + "train_speed(iter/s)": 0.123319 + }, + { + "acc": 0.75852194, + "epoch": 0.7496151679070308, + "grad_norm": 6.5625, + "learning_rate": 7.382535787809928e-06, + "loss": 0.94891825, + "memory(GiB)": 302.58, + "step": 134040, + "train_speed(iter/s)": 0.123328 + }, + { + "acc": 0.76856513, + "epoch": 0.74972701738001, + "grad_norm": 9.3125, + "learning_rate": 7.3817227736570455e-06, + "loss": 0.88662119, + "memory(GiB)": 302.58, + "step": 134060, + "train_speed(iter/s)": 0.123337 + }, + { + "acc": 0.73563519, + "epoch": 0.7498388668529893, + "grad_norm": 8.125, + "learning_rate": 7.380909678041692e-06, + "loss": 1.03494797, + "memory(GiB)": 302.58, + "step": 134080, + "train_speed(iter/s)": 0.123346 + }, + { + "acc": 0.74056177, + "epoch": 0.7499507163259685, + "grad_norm": 7.03125, + "learning_rate": 7.380096500991678e-06, + "loss": 1.02738333, + "memory(GiB)": 302.58, + "step": 134100, + "train_speed(iter/s)": 0.123354 + }, + { + "acc": 0.75977964, + "epoch": 0.7500625657989478, + "grad_norm": 6.53125, + "learning_rate": 7.379283242534816e-06, + "loss": 0.94675856, + "memory(GiB)": 302.58, + "step": 134120, + "train_speed(iter/s)": 0.123363 + }, + { + "acc": 0.74199381, + "epoch": 0.7501744152719271, + "grad_norm": 7.625, + "learning_rate": 7.3784699026989225e-06, + "loss": 1.00507545, + "memory(GiB)": 302.58, + "step": 134140, + "train_speed(iter/s)": 0.123372 + }, + { + "acc": 0.73252163, + "epoch": 0.7502862647449063, + "grad_norm": 9.4375, + "learning_rate": 7.377656481511817e-06, + "loss": 1.05189791, + "memory(GiB)": 302.58, + "step": 134160, + "train_speed(iter/s)": 0.123381 + }, + { + "acc": 0.7530344, + "epoch": 0.7503981142178856, + "grad_norm": 5.9375, + "learning_rate": 7.376842979001319e-06, + "loss": 0.96570425, + "memory(GiB)": 302.58, + "step": 134180, + "train_speed(iter/s)": 0.12339 + }, + { + "acc": 0.7287293, + "epoch": 0.7505099636908649, + "grad_norm": 4.71875, + "learning_rate": 7.3760293951952565e-06, + "loss": 1.06508579, + "memory(GiB)": 302.58, + "step": 134200, + "train_speed(iter/s)": 0.123399 + }, + { + "acc": 0.72286339, + "epoch": 0.7506218131638441, + "grad_norm": 9.375, + "learning_rate": 7.375215730121455e-06, + "loss": 1.1046258, + "memory(GiB)": 302.58, + "step": 134220, + "train_speed(iter/s)": 0.123407 + }, + { + "acc": 0.72785068, + "epoch": 0.7507336626368234, + "grad_norm": 9.0625, + "learning_rate": 7.374401983807742e-06, + "loss": 1.07337914, + "memory(GiB)": 302.58, + "step": 134240, + "train_speed(iter/s)": 0.123417 + }, + { + "acc": 0.7368515, + "epoch": 0.7508455121098027, + "grad_norm": 5.875, + "learning_rate": 7.3735881562819525e-06, + "loss": 1.03782101, + "memory(GiB)": 302.58, + "step": 134260, + "train_speed(iter/s)": 0.123426 + }, + { + "acc": 0.73833451, + "epoch": 0.7509573615827819, + "grad_norm": 6.96875, + "learning_rate": 7.372774247571922e-06, + "loss": 1.03137064, + "memory(GiB)": 302.58, + "step": 134280, + "train_speed(iter/s)": 0.123434 + }, + { + "acc": 0.74662089, + "epoch": 0.7510692110557612, + "grad_norm": 5.75, + "learning_rate": 7.371960257705487e-06, + "loss": 0.98932724, + "memory(GiB)": 302.58, + "step": 134300, + "train_speed(iter/s)": 0.123443 + }, + { + "acc": 0.74581609, + "epoch": 0.7511810605287405, + "grad_norm": 7.75, + "learning_rate": 7.371146186710491e-06, + "loss": 0.98855801, + "memory(GiB)": 302.58, + "step": 134320, + "train_speed(iter/s)": 0.123452 + }, + { + "acc": 0.73557534, + "epoch": 0.7512929100017197, + "grad_norm": 9.8125, + "learning_rate": 7.370332034614778e-06, + "loss": 1.04809456, + "memory(GiB)": 302.58, + "step": 134340, + "train_speed(iter/s)": 0.123461 + }, + { + "acc": 0.74537168, + "epoch": 0.751404759474699, + "grad_norm": 4.875, + "learning_rate": 7.369517801446192e-06, + "loss": 1.00306969, + "memory(GiB)": 302.58, + "step": 134360, + "train_speed(iter/s)": 0.12347 + }, + { + "acc": 0.74427352, + "epoch": 0.7515166089476782, + "grad_norm": 8.0625, + "learning_rate": 7.368703487232583e-06, + "loss": 0.99948425, + "memory(GiB)": 302.58, + "step": 134380, + "train_speed(iter/s)": 0.123478 + }, + { + "acc": 0.73684359, + "epoch": 0.7516284584206575, + "grad_norm": 7.96875, + "learning_rate": 7.367889092001805e-06, + "loss": 1.058708, + "memory(GiB)": 302.58, + "step": 134400, + "train_speed(iter/s)": 0.123486 + }, + { + "acc": 0.73209805, + "epoch": 0.7517403078936368, + "grad_norm": 8.4375, + "learning_rate": 7.367074615781711e-06, + "loss": 1.0771966, + "memory(GiB)": 302.58, + "step": 134420, + "train_speed(iter/s)": 0.123494 + }, + { + "acc": 0.74556465, + "epoch": 0.751852157366616, + "grad_norm": 9.6875, + "learning_rate": 7.36626005860016e-06, + "loss": 0.98468618, + "memory(GiB)": 302.58, + "step": 134440, + "train_speed(iter/s)": 0.123503 + }, + { + "acc": 0.72944961, + "epoch": 0.7519640068395953, + "grad_norm": 11.3125, + "learning_rate": 7.365445420485011e-06, + "loss": 1.0692091, + "memory(GiB)": 302.58, + "step": 134460, + "train_speed(iter/s)": 0.123511 + }, + { + "acc": 0.74853415, + "epoch": 0.7520758563125746, + "grad_norm": 6.90625, + "learning_rate": 7.364630701464128e-06, + "loss": 0.98747225, + "memory(GiB)": 302.58, + "step": 134480, + "train_speed(iter/s)": 0.12352 + }, + { + "acc": 0.73613791, + "epoch": 0.7521877057855538, + "grad_norm": 6.6875, + "learning_rate": 7.363815901565377e-06, + "loss": 1.04258804, + "memory(GiB)": 302.58, + "step": 134500, + "train_speed(iter/s)": 0.123528 + }, + { + "acc": 0.73541117, + "epoch": 0.7522995552585331, + "grad_norm": 5.46875, + "learning_rate": 7.363001020816628e-06, + "loss": 1.04275055, + "memory(GiB)": 302.58, + "step": 134520, + "train_speed(iter/s)": 0.123537 + }, + { + "acc": 0.73808641, + "epoch": 0.7524114047315124, + "grad_norm": 9.0, + "learning_rate": 7.36218605924575e-06, + "loss": 1.02619495, + "memory(GiB)": 302.58, + "step": 134540, + "train_speed(iter/s)": 0.123546 + }, + { + "acc": 0.74106989, + "epoch": 0.7525232542044916, + "grad_norm": 6.46875, + "learning_rate": 7.361371016880619e-06, + "loss": 1.02519522, + "memory(GiB)": 302.58, + "step": 134560, + "train_speed(iter/s)": 0.123554 + }, + { + "acc": 0.7486073, + "epoch": 0.7526351036774709, + "grad_norm": 6.46875, + "learning_rate": 7.36055589374911e-06, + "loss": 0.98527918, + "memory(GiB)": 302.58, + "step": 134580, + "train_speed(iter/s)": 0.123563 + }, + { + "acc": 0.73255162, + "epoch": 0.7527469531504501, + "grad_norm": 6.90625, + "learning_rate": 7.3597406898791056e-06, + "loss": 1.07008047, + "memory(GiB)": 302.58, + "step": 134600, + "train_speed(iter/s)": 0.123572 + }, + { + "acc": 0.7399529, + "epoch": 0.7528588026234294, + "grad_norm": 8.1875, + "learning_rate": 7.358925405298487e-06, + "loss": 1.03409376, + "memory(GiB)": 302.58, + "step": 134620, + "train_speed(iter/s)": 0.123581 + }, + { + "acc": 0.73577676, + "epoch": 0.7529706520964087, + "grad_norm": 5.59375, + "learning_rate": 7.35811004003514e-06, + "loss": 1.05946722, + "memory(GiB)": 302.58, + "step": 134640, + "train_speed(iter/s)": 0.123588 + }, + { + "acc": 0.72473941, + "epoch": 0.7530825015693879, + "grad_norm": 4.53125, + "learning_rate": 7.357294594116952e-06, + "loss": 1.10758133, + "memory(GiB)": 302.58, + "step": 134660, + "train_speed(iter/s)": 0.123597 + }, + { + "acc": 0.73880277, + "epoch": 0.7531943510423672, + "grad_norm": 4.9375, + "learning_rate": 7.3564790675718135e-06, + "loss": 1.03545399, + "memory(GiB)": 302.58, + "step": 134680, + "train_speed(iter/s)": 0.123606 + }, + { + "acc": 0.74356651, + "epoch": 0.7533062005153465, + "grad_norm": 6.71875, + "learning_rate": 7.35566346042762e-06, + "loss": 0.99054365, + "memory(GiB)": 302.58, + "step": 134700, + "train_speed(iter/s)": 0.123615 + }, + { + "acc": 0.75467672, + "epoch": 0.7534180499883257, + "grad_norm": 9.3125, + "learning_rate": 7.354847772712266e-06, + "loss": 0.96496868, + "memory(GiB)": 302.58, + "step": 134720, + "train_speed(iter/s)": 0.123622 + }, + { + "acc": 0.74523029, + "epoch": 0.753529899461305, + "grad_norm": 6.9375, + "learning_rate": 7.3540320044536495e-06, + "loss": 1.01372604, + "memory(GiB)": 302.58, + "step": 134740, + "train_speed(iter/s)": 0.123631 + }, + { + "acc": 0.74538031, + "epoch": 0.7536417489342843, + "grad_norm": 10.4375, + "learning_rate": 7.353216155679676e-06, + "loss": 0.99580622, + "memory(GiB)": 302.58, + "step": 134760, + "train_speed(iter/s)": 0.12364 + }, + { + "acc": 0.75404882, + "epoch": 0.7537535984072635, + "grad_norm": 6.5, + "learning_rate": 7.352400226418247e-06, + "loss": 0.9611371, + "memory(GiB)": 302.58, + "step": 134780, + "train_speed(iter/s)": 0.123649 + }, + { + "acc": 0.75559783, + "epoch": 0.7538654478802428, + "grad_norm": 8.125, + "learning_rate": 7.351584216697272e-06, + "loss": 0.94416876, + "memory(GiB)": 302.58, + "step": 134800, + "train_speed(iter/s)": 0.123657 + }, + { + "acc": 0.7439167, + "epoch": 0.753977297353222, + "grad_norm": 6.15625, + "learning_rate": 7.35076812654466e-06, + "loss": 1.03571558, + "memory(GiB)": 302.58, + "step": 134820, + "train_speed(iter/s)": 0.123665 + }, + { + "acc": 0.74049158, + "epoch": 0.7540891468262013, + "grad_norm": 7.84375, + "learning_rate": 7.349951955988323e-06, + "loss": 0.98861198, + "memory(GiB)": 302.58, + "step": 134840, + "train_speed(iter/s)": 0.123674 + }, + { + "acc": 0.73152566, + "epoch": 0.7542009962991806, + "grad_norm": 10.6875, + "learning_rate": 7.3491357050561785e-06, + "loss": 1.05086861, + "memory(GiB)": 302.58, + "step": 134860, + "train_speed(iter/s)": 0.123683 + }, + { + "acc": 0.71616087, + "epoch": 0.7543128457721598, + "grad_norm": 5.40625, + "learning_rate": 7.3483193737761425e-06, + "loss": 1.15454464, + "memory(GiB)": 302.58, + "step": 134880, + "train_speed(iter/s)": 0.123692 + }, + { + "acc": 0.7456831, + "epoch": 0.7544246952451391, + "grad_norm": 6.875, + "learning_rate": 7.3475029621761396e-06, + "loss": 1.01187115, + "memory(GiB)": 302.58, + "step": 134900, + "train_speed(iter/s)": 0.1237 + }, + { + "acc": 0.75104542, + "epoch": 0.7545365447181184, + "grad_norm": 6.59375, + "learning_rate": 7.346686470284091e-06, + "loss": 0.98199902, + "memory(GiB)": 302.58, + "step": 134920, + "train_speed(iter/s)": 0.123709 + }, + { + "acc": 0.74644227, + "epoch": 0.7546483941910976, + "grad_norm": 6.03125, + "learning_rate": 7.345869898127924e-06, + "loss": 1.01151295, + "memory(GiB)": 302.58, + "step": 134940, + "train_speed(iter/s)": 0.123718 + }, + { + "acc": 0.72634091, + "epoch": 0.7547602436640769, + "grad_norm": 6.25, + "learning_rate": 7.345053245735568e-06, + "loss": 1.09408989, + "memory(GiB)": 302.58, + "step": 134960, + "train_speed(iter/s)": 0.123726 + }, + { + "acc": 0.71507201, + "epoch": 0.7548720931370562, + "grad_norm": 6.78125, + "learning_rate": 7.344236513134955e-06, + "loss": 1.13186121, + "memory(GiB)": 302.58, + "step": 134980, + "train_speed(iter/s)": 0.123735 + }, + { + "acc": 0.74306302, + "epoch": 0.7549839426100354, + "grad_norm": 5.125, + "learning_rate": 7.343419700354019e-06, + "loss": 0.99211483, + "memory(GiB)": 302.58, + "step": 135000, + "train_speed(iter/s)": 0.123743 + }, + { + "acc": 0.72801843, + "epoch": 0.7550957920830147, + "grad_norm": 8.1875, + "learning_rate": 7.342602807420699e-06, + "loss": 1.0712739, + "memory(GiB)": 302.58, + "step": 135020, + "train_speed(iter/s)": 0.123752 + }, + { + "acc": 0.73676877, + "epoch": 0.755207641555994, + "grad_norm": 6.75, + "learning_rate": 7.341785834362936e-06, + "loss": 1.02896366, + "memory(GiB)": 302.58, + "step": 135040, + "train_speed(iter/s)": 0.12376 + }, + { + "acc": 0.73293333, + "epoch": 0.7553194910289732, + "grad_norm": 9.5625, + "learning_rate": 7.340968781208671e-06, + "loss": 1.06625385, + "memory(GiB)": 302.58, + "step": 135060, + "train_speed(iter/s)": 0.123769 + }, + { + "acc": 0.73760037, + "epoch": 0.7554313405019525, + "grad_norm": 5.625, + "learning_rate": 7.340151647985849e-06, + "loss": 1.03843803, + "memory(GiB)": 302.58, + "step": 135080, + "train_speed(iter/s)": 0.123778 + }, + { + "acc": 0.7358079, + "epoch": 0.7555431899749318, + "grad_norm": 8.8125, + "learning_rate": 7.339334434722423e-06, + "loss": 1.02856264, + "memory(GiB)": 302.58, + "step": 135100, + "train_speed(iter/s)": 0.123788 + }, + { + "acc": 0.74318662, + "epoch": 0.755655039447911, + "grad_norm": 5.28125, + "learning_rate": 7.338517141446339e-06, + "loss": 1.02092915, + "memory(GiB)": 302.58, + "step": 135120, + "train_speed(iter/s)": 0.123796 + }, + { + "acc": 0.74769735, + "epoch": 0.7557668889208903, + "grad_norm": 4.96875, + "learning_rate": 7.337699768185555e-06, + "loss": 0.981285, + "memory(GiB)": 302.58, + "step": 135140, + "train_speed(iter/s)": 0.123805 + }, + { + "acc": 0.72599444, + "epoch": 0.7558787383938695, + "grad_norm": 5.03125, + "learning_rate": 7.336882314968026e-06, + "loss": 1.07264214, + "memory(GiB)": 302.58, + "step": 135160, + "train_speed(iter/s)": 0.123814 + }, + { + "acc": 0.73999653, + "epoch": 0.7559905878668488, + "grad_norm": 7.78125, + "learning_rate": 7.336064781821711e-06, + "loss": 1.02276106, + "memory(GiB)": 302.58, + "step": 135180, + "train_speed(iter/s)": 0.123824 + }, + { + "acc": 0.76336379, + "epoch": 0.7561024373398281, + "grad_norm": 4.90625, + "learning_rate": 7.335247168774574e-06, + "loss": 0.92255344, + "memory(GiB)": 302.58, + "step": 135200, + "train_speed(iter/s)": 0.123833 + }, + { + "acc": 0.73312244, + "epoch": 0.7562142868128073, + "grad_norm": 6.1875, + "learning_rate": 7.334429475854579e-06, + "loss": 1.0540206, + "memory(GiB)": 302.58, + "step": 135220, + "train_speed(iter/s)": 0.123842 + }, + { + "acc": 0.741576, + "epoch": 0.7563261362857866, + "grad_norm": 6.65625, + "learning_rate": 7.333611703089693e-06, + "loss": 1.02552567, + "memory(GiB)": 302.58, + "step": 135240, + "train_speed(iter/s)": 0.123852 + }, + { + "acc": 0.74943128, + "epoch": 0.7564379857587659, + "grad_norm": 9.375, + "learning_rate": 7.332793850507888e-06, + "loss": 0.98968935, + "memory(GiB)": 302.58, + "step": 135260, + "train_speed(iter/s)": 0.123861 + }, + { + "acc": 0.75260983, + "epoch": 0.7565498352317451, + "grad_norm": 5.96875, + "learning_rate": 7.331975918137137e-06, + "loss": 0.98344469, + "memory(GiB)": 302.58, + "step": 135280, + "train_speed(iter/s)": 0.12387 + }, + { + "acc": 0.7403266, + "epoch": 0.7566616847047244, + "grad_norm": 6.78125, + "learning_rate": 7.3311579060054115e-06, + "loss": 1.03818874, + "memory(GiB)": 302.58, + "step": 135300, + "train_speed(iter/s)": 0.123879 + }, + { + "acc": 0.72646337, + "epoch": 0.7567735341777037, + "grad_norm": 3.75, + "learning_rate": 7.330339814140697e-06, + "loss": 1.06221743, + "memory(GiB)": 302.58, + "step": 135320, + "train_speed(iter/s)": 0.123888 + }, + { + "acc": 0.74371562, + "epoch": 0.7568853836506829, + "grad_norm": 8.6875, + "learning_rate": 7.32952164257097e-06, + "loss": 1.0073575, + "memory(GiB)": 302.58, + "step": 135340, + "train_speed(iter/s)": 0.123896 + }, + { + "acc": 0.72983971, + "epoch": 0.7569972331236622, + "grad_norm": 6.5625, + "learning_rate": 7.328703391324217e-06, + "loss": 1.05395098, + "memory(GiB)": 302.58, + "step": 135360, + "train_speed(iter/s)": 0.123905 + }, + { + "acc": 0.75489035, + "epoch": 0.7571090825966414, + "grad_norm": 8.875, + "learning_rate": 7.3278850604284235e-06, + "loss": 0.96683798, + "memory(GiB)": 302.58, + "step": 135380, + "train_speed(iter/s)": 0.123913 + }, + { + "acc": 0.75585217, + "epoch": 0.7572209320696207, + "grad_norm": 5.625, + "learning_rate": 7.32706664991158e-06, + "loss": 0.9554224, + "memory(GiB)": 302.58, + "step": 135400, + "train_speed(iter/s)": 0.123922 + }, + { + "acc": 0.73393431, + "epoch": 0.7573327815426, + "grad_norm": 6.65625, + "learning_rate": 7.326248159801678e-06, + "loss": 1.07784204, + "memory(GiB)": 302.58, + "step": 135420, + "train_speed(iter/s)": 0.12393 + }, + { + "acc": 0.72648687, + "epoch": 0.7574446310155792, + "grad_norm": 7.0625, + "learning_rate": 7.325429590126711e-06, + "loss": 1.06668634, + "memory(GiB)": 302.58, + "step": 135440, + "train_speed(iter/s)": 0.123939 + }, + { + "acc": 0.73412628, + "epoch": 0.7575564804885585, + "grad_norm": 6.71875, + "learning_rate": 7.32461094091468e-06, + "loss": 1.05346508, + "memory(GiB)": 302.58, + "step": 135460, + "train_speed(iter/s)": 0.123947 + }, + { + "acc": 0.76488051, + "epoch": 0.7576683299615378, + "grad_norm": 5.71875, + "learning_rate": 7.323792212193584e-06, + "loss": 0.90681458, + "memory(GiB)": 302.58, + "step": 135480, + "train_speed(iter/s)": 0.123956 + }, + { + "acc": 0.73583522, + "epoch": 0.757780179434517, + "grad_norm": 7.1875, + "learning_rate": 7.322973403991424e-06, + "loss": 1.0525032, + "memory(GiB)": 302.58, + "step": 135500, + "train_speed(iter/s)": 0.123965 + }, + { + "acc": 0.74452124, + "epoch": 0.7578920289074963, + "grad_norm": 5.125, + "learning_rate": 7.322154516336208e-06, + "loss": 0.99566336, + "memory(GiB)": 302.58, + "step": 135520, + "train_speed(iter/s)": 0.123974 + }, + { + "acc": 0.74061542, + "epoch": 0.7580038783804756, + "grad_norm": 7.75, + "learning_rate": 7.321335549255946e-06, + "loss": 1.01145153, + "memory(GiB)": 302.58, + "step": 135540, + "train_speed(iter/s)": 0.123983 + }, + { + "acc": 0.73610353, + "epoch": 0.7581157278534548, + "grad_norm": 8.3125, + "learning_rate": 7.3205165027786465e-06, + "loss": 1.02414799, + "memory(GiB)": 302.58, + "step": 135560, + "train_speed(iter/s)": 0.123991 + }, + { + "acc": 0.73102984, + "epoch": 0.7582275773264341, + "grad_norm": 8.5625, + "learning_rate": 7.3196973769323234e-06, + "loss": 1.07225027, + "memory(GiB)": 302.58, + "step": 135580, + "train_speed(iter/s)": 0.123999 + }, + { + "acc": 0.75457101, + "epoch": 0.7583394267994134, + "grad_norm": 6.8125, + "learning_rate": 7.318878171744995e-06, + "loss": 0.96227169, + "memory(GiB)": 302.58, + "step": 135600, + "train_speed(iter/s)": 0.124007 + }, + { + "acc": 0.75257177, + "epoch": 0.7584512762723926, + "grad_norm": 5.15625, + "learning_rate": 7.3180588872446815e-06, + "loss": 0.94537029, + "memory(GiB)": 302.58, + "step": 135620, + "train_speed(iter/s)": 0.124017 + }, + { + "acc": 0.73963289, + "epoch": 0.7585631257453719, + "grad_norm": 7.90625, + "learning_rate": 7.317239523459403e-06, + "loss": 1.0427763, + "memory(GiB)": 302.58, + "step": 135640, + "train_speed(iter/s)": 0.124025 + }, + { + "acc": 0.73292489, + "epoch": 0.7586749752183511, + "grad_norm": 6.8125, + "learning_rate": 7.316420080417184e-06, + "loss": 1.06249466, + "memory(GiB)": 302.58, + "step": 135660, + "train_speed(iter/s)": 0.124034 + }, + { + "acc": 0.7207572, + "epoch": 0.7587868246913304, + "grad_norm": 7.09375, + "learning_rate": 7.315600558146054e-06, + "loss": 1.10951853, + "memory(GiB)": 302.58, + "step": 135680, + "train_speed(iter/s)": 0.124043 + }, + { + "acc": 0.74002385, + "epoch": 0.7588986741643097, + "grad_norm": 6.0625, + "learning_rate": 7.314780956674042e-06, + "loss": 1.01244469, + "memory(GiB)": 302.58, + "step": 135700, + "train_speed(iter/s)": 0.124051 + }, + { + "acc": 0.72377038, + "epoch": 0.7590105236372889, + "grad_norm": 6.78125, + "learning_rate": 7.31396127602918e-06, + "loss": 1.09495287, + "memory(GiB)": 302.58, + "step": 135720, + "train_speed(iter/s)": 0.12406 + }, + { + "acc": 0.74216752, + "epoch": 0.7591223731102682, + "grad_norm": 9.3125, + "learning_rate": 7.313141516239507e-06, + "loss": 1.02666826, + "memory(GiB)": 302.58, + "step": 135740, + "train_speed(iter/s)": 0.124069 + }, + { + "acc": 0.73121328, + "epoch": 0.7592342225832475, + "grad_norm": 9.3125, + "learning_rate": 7.3123216773330585e-06, + "loss": 1.0518199, + "memory(GiB)": 302.58, + "step": 135760, + "train_speed(iter/s)": 0.124077 + }, + { + "acc": 0.74513335, + "epoch": 0.7593460720562267, + "grad_norm": 6.25, + "learning_rate": 7.3115017593378776e-06, + "loss": 0.99043789, + "memory(GiB)": 302.58, + "step": 135780, + "train_speed(iter/s)": 0.124086 + }, + { + "acc": 0.74326038, + "epoch": 0.759457921529206, + "grad_norm": 5.875, + "learning_rate": 7.310681762282005e-06, + "loss": 1.00869036, + "memory(GiB)": 302.58, + "step": 135800, + "train_speed(iter/s)": 0.124095 + }, + { + "acc": 0.73350334, + "epoch": 0.7595697710021853, + "grad_norm": 6.15625, + "learning_rate": 7.3098616861934905e-06, + "loss": 1.05802803, + "memory(GiB)": 302.58, + "step": 135820, + "train_speed(iter/s)": 0.124103 + }, + { + "acc": 0.74254012, + "epoch": 0.7596816204751645, + "grad_norm": 8.8125, + "learning_rate": 7.309041531100382e-06, + "loss": 1.01221943, + "memory(GiB)": 302.58, + "step": 135840, + "train_speed(iter/s)": 0.124112 + }, + { + "acc": 0.75843029, + "epoch": 0.7597934699481438, + "grad_norm": 11.25, + "learning_rate": 7.308221297030731e-06, + "loss": 0.9454278, + "memory(GiB)": 302.58, + "step": 135860, + "train_speed(iter/s)": 0.12412 + }, + { + "acc": 0.75674639, + "epoch": 0.759905319421123, + "grad_norm": 9.0625, + "learning_rate": 7.307400984012592e-06, + "loss": 0.95720596, + "memory(GiB)": 302.58, + "step": 135880, + "train_speed(iter/s)": 0.124128 + }, + { + "acc": 0.72415667, + "epoch": 0.7600171688941023, + "grad_norm": 7.125, + "learning_rate": 7.306580592074023e-06, + "loss": 1.09119511, + "memory(GiB)": 302.58, + "step": 135900, + "train_speed(iter/s)": 0.124136 + }, + { + "acc": 0.73864036, + "epoch": 0.7601290183670816, + "grad_norm": 5.40625, + "learning_rate": 7.305760121243084e-06, + "loss": 1.02069407, + "memory(GiB)": 302.58, + "step": 135920, + "train_speed(iter/s)": 0.124144 + }, + { + "acc": 0.72937036, + "epoch": 0.7602408678400608, + "grad_norm": 7.84375, + "learning_rate": 7.304939571547838e-06, + "loss": 1.0606926, + "memory(GiB)": 302.58, + "step": 135940, + "train_speed(iter/s)": 0.124153 + }, + { + "acc": 0.74219575, + "epoch": 0.7603527173130401, + "grad_norm": 7.8125, + "learning_rate": 7.304118943016349e-06, + "loss": 1.03237934, + "memory(GiB)": 302.58, + "step": 135960, + "train_speed(iter/s)": 0.124162 + }, + { + "acc": 0.7446991, + "epoch": 0.7604645667860194, + "grad_norm": 4.625, + "learning_rate": 7.303298235676687e-06, + "loss": 0.97412891, + "memory(GiB)": 302.58, + "step": 135980, + "train_speed(iter/s)": 0.12417 + }, + { + "acc": 0.74583058, + "epoch": 0.7605764162589986, + "grad_norm": 10.625, + "learning_rate": 7.302477449556921e-06, + "loss": 0.97999249, + "memory(GiB)": 302.58, + "step": 136000, + "train_speed(iter/s)": 0.124179 + }, + { + "epoch": 0.7605764162589986, + "eval_acc": 0.7047804276772272, + "eval_loss": 1.0205943584442139, + "eval_runtime": 7503.655, + "eval_samples_per_second": 10.033, + "eval_steps_per_second": 10.033, + "step": 136000 + }, + { + "acc": 0.73590918, + "epoch": 0.7606882657319779, + "grad_norm": 6.75, + "learning_rate": 7.301656584685125e-06, + "loss": 1.04197273, + "memory(GiB)": 302.58, + "step": 136020, + "train_speed(iter/s)": 0.123327 + }, + { + "acc": 0.74593554, + "epoch": 0.7608001152049572, + "grad_norm": 8.0625, + "learning_rate": 7.300835641089377e-06, + "loss": 1.00343857, + "memory(GiB)": 302.58, + "step": 136040, + "train_speed(iter/s)": 0.123336 + }, + { + "acc": 0.74088745, + "epoch": 0.7609119646779364, + "grad_norm": 4.0, + "learning_rate": 7.3000146187977524e-06, + "loss": 1.02083464, + "memory(GiB)": 302.58, + "step": 136060, + "train_speed(iter/s)": 0.123345 + }, + { + "acc": 0.73653169, + "epoch": 0.7610238141509157, + "grad_norm": 6.5, + "learning_rate": 7.299193517838337e-06, + "loss": 1.02524071, + "memory(GiB)": 302.58, + "step": 136080, + "train_speed(iter/s)": 0.123353 + }, + { + "acc": 0.74213419, + "epoch": 0.761135663623895, + "grad_norm": 6.84375, + "learning_rate": 7.29837233823921e-06, + "loss": 1.0080965, + "memory(GiB)": 302.58, + "step": 136100, + "train_speed(iter/s)": 0.123362 + }, + { + "acc": 0.74370613, + "epoch": 0.7612475130968742, + "grad_norm": 6.65625, + "learning_rate": 7.2975510800284635e-06, + "loss": 1.01646338, + "memory(GiB)": 302.58, + "step": 136120, + "train_speed(iter/s)": 0.123371 + }, + { + "acc": 0.74582596, + "epoch": 0.7613593625698535, + "grad_norm": 8.9375, + "learning_rate": 7.296729743234183e-06, + "loss": 1.00631084, + "memory(GiB)": 302.58, + "step": 136140, + "train_speed(iter/s)": 0.123379 + }, + { + "acc": 0.76444507, + "epoch": 0.7614712120428327, + "grad_norm": 6.34375, + "learning_rate": 7.295908327884463e-06, + "loss": 0.9078825, + "memory(GiB)": 302.58, + "step": 136160, + "train_speed(iter/s)": 0.123387 + }, + { + "acc": 0.7426168, + "epoch": 0.761583061515812, + "grad_norm": 8.9375, + "learning_rate": 7.295086834007397e-06, + "loss": 1.01039171, + "memory(GiB)": 302.58, + "step": 136180, + "train_speed(iter/s)": 0.123395 + }, + { + "acc": 0.74763031, + "epoch": 0.7616949109887913, + "grad_norm": 8.8125, + "learning_rate": 7.294265261631086e-06, + "loss": 1.00104723, + "memory(GiB)": 302.58, + "step": 136200, + "train_speed(iter/s)": 0.123403 + }, + { + "acc": 0.74553361, + "epoch": 0.7618067604617705, + "grad_norm": 10.125, + "learning_rate": 7.293443610783627e-06, + "loss": 0.96633787, + "memory(GiB)": 302.58, + "step": 136220, + "train_speed(iter/s)": 0.123412 + }, + { + "acc": 0.73408403, + "epoch": 0.7619186099347498, + "grad_norm": 8.125, + "learning_rate": 7.292621881493125e-06, + "loss": 1.04974537, + "memory(GiB)": 302.58, + "step": 136240, + "train_speed(iter/s)": 0.123421 + }, + { + "acc": 0.75195322, + "epoch": 0.7620304594077291, + "grad_norm": 7.125, + "learning_rate": 7.291800073787684e-06, + "loss": 0.9626235, + "memory(GiB)": 302.58, + "step": 136260, + "train_speed(iter/s)": 0.12343 + }, + { + "acc": 0.74052563, + "epoch": 0.7621423088807083, + "grad_norm": 5.75, + "learning_rate": 7.290978187695415e-06, + "loss": 1.00978441, + "memory(GiB)": 302.58, + "step": 136280, + "train_speed(iter/s)": 0.123438 + }, + { + "acc": 0.72596831, + "epoch": 0.7622541583536876, + "grad_norm": 7.03125, + "learning_rate": 7.290156223244424e-06, + "loss": 1.11341257, + "memory(GiB)": 302.58, + "step": 136300, + "train_speed(iter/s)": 0.123447 + }, + { + "acc": 0.73632288, + "epoch": 0.7623660078266669, + "grad_norm": 9.4375, + "learning_rate": 7.28933418046283e-06, + "loss": 1.04922924, + "memory(GiB)": 302.58, + "step": 136320, + "train_speed(iter/s)": 0.123456 + }, + { + "acc": 0.74307513, + "epoch": 0.7624778572996461, + "grad_norm": 6.1875, + "learning_rate": 7.288512059378749e-06, + "loss": 1.00698643, + "memory(GiB)": 302.58, + "step": 136340, + "train_speed(iter/s)": 0.123464 + }, + { + "acc": 0.73177118, + "epoch": 0.7625897067726254, + "grad_norm": 7.90625, + "learning_rate": 7.287689860020298e-06, + "loss": 1.06623135, + "memory(GiB)": 302.58, + "step": 136360, + "train_speed(iter/s)": 0.123472 + }, + { + "acc": 0.74357328, + "epoch": 0.7627015562456047, + "grad_norm": 7.25, + "learning_rate": 7.2868675824156e-06, + "loss": 1.02246122, + "memory(GiB)": 302.58, + "step": 136380, + "train_speed(iter/s)": 0.123481 + }, + { + "acc": 0.73359895, + "epoch": 0.7628134057185839, + "grad_norm": 7.21875, + "learning_rate": 7.28604522659278e-06, + "loss": 1.04315624, + "memory(GiB)": 302.58, + "step": 136400, + "train_speed(iter/s)": 0.12349 + }, + { + "acc": 0.73584175, + "epoch": 0.7629252551915632, + "grad_norm": 5.0625, + "learning_rate": 7.285222792579962e-06, + "loss": 1.03440742, + "memory(GiB)": 302.58, + "step": 136420, + "train_speed(iter/s)": 0.123499 + }, + { + "acc": 0.73011289, + "epoch": 0.7630371046645424, + "grad_norm": 7.9375, + "learning_rate": 7.284400280405279e-06, + "loss": 1.05415478, + "memory(GiB)": 302.58, + "step": 136440, + "train_speed(iter/s)": 0.123508 + }, + { + "acc": 0.72504172, + "epoch": 0.7631489541375217, + "grad_norm": 7.3125, + "learning_rate": 7.283577690096863e-06, + "loss": 1.10212612, + "memory(GiB)": 302.58, + "step": 136460, + "train_speed(iter/s)": 0.123517 + }, + { + "acc": 0.75633631, + "epoch": 0.763260803610501, + "grad_norm": 7.5625, + "learning_rate": 7.282755021682849e-06, + "loss": 0.95884962, + "memory(GiB)": 302.58, + "step": 136480, + "train_speed(iter/s)": 0.123526 + }, + { + "acc": 0.75549784, + "epoch": 0.7633726530834802, + "grad_norm": 8.6875, + "learning_rate": 7.281932275191376e-06, + "loss": 0.94328127, + "memory(GiB)": 302.58, + "step": 136500, + "train_speed(iter/s)": 0.123535 + }, + { + "acc": 0.73212185, + "epoch": 0.7634845025564595, + "grad_norm": 7.9375, + "learning_rate": 7.281109450650582e-06, + "loss": 1.05046501, + "memory(GiB)": 302.58, + "step": 136520, + "train_speed(iter/s)": 0.123544 + }, + { + "acc": 0.74190168, + "epoch": 0.7635963520294388, + "grad_norm": 6.625, + "learning_rate": 7.2802865480886135e-06, + "loss": 1.01870823, + "memory(GiB)": 302.58, + "step": 136540, + "train_speed(iter/s)": 0.123553 + }, + { + "acc": 0.76204009, + "epoch": 0.763708201502418, + "grad_norm": 6.0625, + "learning_rate": 7.279463567533612e-06, + "loss": 0.92451639, + "memory(GiB)": 302.58, + "step": 136560, + "train_speed(iter/s)": 0.123562 + }, + { + "acc": 0.74593148, + "epoch": 0.7638200509753973, + "grad_norm": 10.0, + "learning_rate": 7.27864050901373e-06, + "loss": 0.99969501, + "memory(GiB)": 302.58, + "step": 136580, + "train_speed(iter/s)": 0.123571 + }, + { + "acc": 0.73384891, + "epoch": 0.7639319004483766, + "grad_norm": 8.9375, + "learning_rate": 7.277817372557116e-06, + "loss": 1.07333527, + "memory(GiB)": 302.58, + "step": 136600, + "train_speed(iter/s)": 0.123579 + }, + { + "acc": 0.75329151, + "epoch": 0.7640437499213558, + "grad_norm": 5.46875, + "learning_rate": 7.276994158191926e-06, + "loss": 0.95926056, + "memory(GiB)": 302.58, + "step": 136620, + "train_speed(iter/s)": 0.123586 + }, + { + "acc": 0.74286084, + "epoch": 0.7641555993943351, + "grad_norm": 6.9375, + "learning_rate": 7.276170865946316e-06, + "loss": 1.0145546, + "memory(GiB)": 302.58, + "step": 136640, + "train_speed(iter/s)": 0.123595 + }, + { + "acc": 0.73879786, + "epoch": 0.7642674488673143, + "grad_norm": 5.75, + "learning_rate": 7.275347495848444e-06, + "loss": 1.00848103, + "memory(GiB)": 302.58, + "step": 136660, + "train_speed(iter/s)": 0.123604 + }, + { + "acc": 0.73142562, + "epoch": 0.7643792983402936, + "grad_norm": 5.8125, + "learning_rate": 7.274524047926474e-06, + "loss": 1.04497223, + "memory(GiB)": 302.58, + "step": 136680, + "train_speed(iter/s)": 0.123612 + }, + { + "acc": 0.7403348, + "epoch": 0.7644911478132729, + "grad_norm": 7.84375, + "learning_rate": 7.273700522208569e-06, + "loss": 1.00598221, + "memory(GiB)": 302.58, + "step": 136700, + "train_speed(iter/s)": 0.123621 + }, + { + "acc": 0.73190355, + "epoch": 0.7646029972862521, + "grad_norm": 6.0625, + "learning_rate": 7.272876918722894e-06, + "loss": 1.06071167, + "memory(GiB)": 302.58, + "step": 136720, + "train_speed(iter/s)": 0.123629 + }, + { + "acc": 0.72778335, + "epoch": 0.7647148467592314, + "grad_norm": 6.1875, + "learning_rate": 7.272053237497624e-06, + "loss": 1.08893337, + "memory(GiB)": 302.58, + "step": 136740, + "train_speed(iter/s)": 0.123637 + }, + { + "acc": 0.72884598, + "epoch": 0.7648266962322107, + "grad_norm": 4.375, + "learning_rate": 7.271229478560928e-06, + "loss": 1.08023939, + "memory(GiB)": 302.58, + "step": 136760, + "train_speed(iter/s)": 0.123645 + }, + { + "acc": 0.73847132, + "epoch": 0.7649385457051899, + "grad_norm": 7.96875, + "learning_rate": 7.270405641940981e-06, + "loss": 1.02945013, + "memory(GiB)": 302.58, + "step": 136780, + "train_speed(iter/s)": 0.123654 + }, + { + "acc": 0.72877908, + "epoch": 0.7650503951781692, + "grad_norm": 6.15625, + "learning_rate": 7.269581727665963e-06, + "loss": 1.07462587, + "memory(GiB)": 302.58, + "step": 136800, + "train_speed(iter/s)": 0.123662 + }, + { + "acc": 0.72602305, + "epoch": 0.7651622446511485, + "grad_norm": 7.875, + "learning_rate": 7.268757735764054e-06, + "loss": 1.08645315, + "memory(GiB)": 302.58, + "step": 136820, + "train_speed(iter/s)": 0.123671 + }, + { + "acc": 0.73284502, + "epoch": 0.7652740941241277, + "grad_norm": 6.625, + "learning_rate": 7.267933666263435e-06, + "loss": 1.04619541, + "memory(GiB)": 302.58, + "step": 136840, + "train_speed(iter/s)": 0.12368 + }, + { + "acc": 0.74214516, + "epoch": 0.765385943597107, + "grad_norm": 9.0, + "learning_rate": 7.267109519192292e-06, + "loss": 0.99196997, + "memory(GiB)": 302.58, + "step": 136860, + "train_speed(iter/s)": 0.123689 + }, + { + "acc": 0.73791308, + "epoch": 0.7654977930700863, + "grad_norm": 6.59375, + "learning_rate": 7.266285294578816e-06, + "loss": 1.03476734, + "memory(GiB)": 302.58, + "step": 136880, + "train_speed(iter/s)": 0.123697 + }, + { + "acc": 0.74593701, + "epoch": 0.7656096425430655, + "grad_norm": 7.8125, + "learning_rate": 7.265460992451198e-06, + "loss": 1.00929861, + "memory(GiB)": 302.58, + "step": 136900, + "train_speed(iter/s)": 0.123706 + }, + { + "acc": 0.73816504, + "epoch": 0.7657214920160448, + "grad_norm": 6.46875, + "learning_rate": 7.264636612837629e-06, + "loss": 1.04061928, + "memory(GiB)": 302.58, + "step": 136920, + "train_speed(iter/s)": 0.123714 + }, + { + "acc": 0.74384346, + "epoch": 0.765833341489024, + "grad_norm": 6.4375, + "learning_rate": 7.263812155766308e-06, + "loss": 1.00789547, + "memory(GiB)": 302.58, + "step": 136940, + "train_speed(iter/s)": 0.123723 + }, + { + "acc": 0.74261069, + "epoch": 0.7659451909620033, + "grad_norm": 6.125, + "learning_rate": 7.262987621265432e-06, + "loss": 1.01157808, + "memory(GiB)": 302.58, + "step": 136960, + "train_speed(iter/s)": 0.123732 + }, + { + "acc": 0.75299616, + "epoch": 0.7660570404349826, + "grad_norm": 6.125, + "learning_rate": 7.262163009363203e-06, + "loss": 0.98285561, + "memory(GiB)": 302.58, + "step": 136980, + "train_speed(iter/s)": 0.12374 + }, + { + "acc": 0.73871384, + "epoch": 0.7661688899079618, + "grad_norm": 9.8125, + "learning_rate": 7.261338320087827e-06, + "loss": 1.02549524, + "memory(GiB)": 302.58, + "step": 137000, + "train_speed(iter/s)": 0.123747 + }, + { + "acc": 0.74539909, + "epoch": 0.7662807393809411, + "grad_norm": 6.0625, + "learning_rate": 7.2605135534675085e-06, + "loss": 1.00281639, + "memory(GiB)": 302.58, + "step": 137020, + "train_speed(iter/s)": 0.123756 + }, + { + "acc": 0.73495979, + "epoch": 0.7663925888539204, + "grad_norm": 10.625, + "learning_rate": 7.25968870953046e-06, + "loss": 1.05053635, + "memory(GiB)": 302.58, + "step": 137040, + "train_speed(iter/s)": 0.123764 + }, + { + "acc": 0.74511237, + "epoch": 0.7665044383268996, + "grad_norm": 7.75, + "learning_rate": 7.258863788304892e-06, + "loss": 0.99970312, + "memory(GiB)": 302.58, + "step": 137060, + "train_speed(iter/s)": 0.123772 + }, + { + "acc": 0.73082919, + "epoch": 0.7666162877998789, + "grad_norm": 9.1875, + "learning_rate": 7.25803878981902e-06, + "loss": 1.04925289, + "memory(GiB)": 302.58, + "step": 137080, + "train_speed(iter/s)": 0.123781 + }, + { + "acc": 0.74656086, + "epoch": 0.7667281372728582, + "grad_norm": 6.3125, + "learning_rate": 7.257213714101061e-06, + "loss": 0.9886651, + "memory(GiB)": 302.58, + "step": 137100, + "train_speed(iter/s)": 0.123789 + }, + { + "acc": 0.74082255, + "epoch": 0.7668399867458374, + "grad_norm": 8.0625, + "learning_rate": 7.256388561179235e-06, + "loss": 1.02000675, + "memory(GiB)": 302.58, + "step": 137120, + "train_speed(iter/s)": 0.123797 + }, + { + "acc": 0.73464441, + "epoch": 0.7669518362188167, + "grad_norm": 7.1875, + "learning_rate": 7.255563331081767e-06, + "loss": 1.05854692, + "memory(GiB)": 302.58, + "step": 137140, + "train_speed(iter/s)": 0.123806 + }, + { + "acc": 0.75005708, + "epoch": 0.767063685691796, + "grad_norm": 6.6875, + "learning_rate": 7.254738023836879e-06, + "loss": 0.97241211, + "memory(GiB)": 302.58, + "step": 137160, + "train_speed(iter/s)": 0.123814 + }, + { + "acc": 0.73538222, + "epoch": 0.7671755351647752, + "grad_norm": 6.0625, + "learning_rate": 7.253912639472802e-06, + "loss": 1.03127613, + "memory(GiB)": 302.58, + "step": 137180, + "train_speed(iter/s)": 0.123823 + }, + { + "acc": 0.73430009, + "epoch": 0.7672873846377545, + "grad_norm": 10.3125, + "learning_rate": 7.253087178017766e-06, + "loss": 1.04614525, + "memory(GiB)": 302.58, + "step": 137200, + "train_speed(iter/s)": 0.12383 + }, + { + "acc": 0.73000507, + "epoch": 0.7673992341107337, + "grad_norm": 8.25, + "learning_rate": 7.252261639500005e-06, + "loss": 1.06003761, + "memory(GiB)": 302.58, + "step": 137220, + "train_speed(iter/s)": 0.123839 + }, + { + "acc": 0.73777261, + "epoch": 0.767511083583713, + "grad_norm": 8.5625, + "learning_rate": 7.2514360239477524e-06, + "loss": 1.01975822, + "memory(GiB)": 302.58, + "step": 137240, + "train_speed(iter/s)": 0.123847 + }, + { + "acc": 0.74984326, + "epoch": 0.7676229330566923, + "grad_norm": 7.25, + "learning_rate": 7.25061033138925e-06, + "loss": 0.97924404, + "memory(GiB)": 302.58, + "step": 137260, + "train_speed(iter/s)": 0.123856 + }, + { + "acc": 0.74185276, + "epoch": 0.7677347825296715, + "grad_norm": 6.6875, + "learning_rate": 7.2497845618527375e-06, + "loss": 0.98966732, + "memory(GiB)": 302.58, + "step": 137280, + "train_speed(iter/s)": 0.123865 + }, + { + "acc": 0.74486542, + "epoch": 0.7678466320026508, + "grad_norm": 6.46875, + "learning_rate": 7.248958715366458e-06, + "loss": 1.00962353, + "memory(GiB)": 302.58, + "step": 137300, + "train_speed(iter/s)": 0.123872 + }, + { + "acc": 0.76312957, + "epoch": 0.7679584814756301, + "grad_norm": 6.0625, + "learning_rate": 7.248132791958659e-06, + "loss": 0.92697458, + "memory(GiB)": 302.58, + "step": 137320, + "train_speed(iter/s)": 0.123881 + }, + { + "acc": 0.74130397, + "epoch": 0.7680703309486093, + "grad_norm": 4.90625, + "learning_rate": 7.247306791657591e-06, + "loss": 1.01424046, + "memory(GiB)": 302.58, + "step": 137340, + "train_speed(iter/s)": 0.123889 + }, + { + "acc": 0.7309135, + "epoch": 0.7681821804215886, + "grad_norm": 6.3125, + "learning_rate": 7.246480714491505e-06, + "loss": 1.0652751, + "memory(GiB)": 302.58, + "step": 137360, + "train_speed(iter/s)": 0.123898 + }, + { + "acc": 0.7417666, + "epoch": 0.7682940298945679, + "grad_norm": 9.25, + "learning_rate": 7.245654560488655e-06, + "loss": 1.01403894, + "memory(GiB)": 302.58, + "step": 137380, + "train_speed(iter/s)": 0.123907 + }, + { + "acc": 0.74995551, + "epoch": 0.7684058793675471, + "grad_norm": 4.59375, + "learning_rate": 7.244828329677298e-06, + "loss": 0.99170141, + "memory(GiB)": 302.58, + "step": 137400, + "train_speed(iter/s)": 0.123916 + }, + { + "acc": 0.74267173, + "epoch": 0.7685177288405264, + "grad_norm": 11.5, + "learning_rate": 7.244002022085695e-06, + "loss": 0.99959145, + "memory(GiB)": 302.58, + "step": 137420, + "train_speed(iter/s)": 0.123925 + }, + { + "acc": 0.75261521, + "epoch": 0.7686295783135056, + "grad_norm": 5.3125, + "learning_rate": 7.243175637742105e-06, + "loss": 0.97316236, + "memory(GiB)": 302.58, + "step": 137440, + "train_speed(iter/s)": 0.123933 + }, + { + "acc": 0.73560796, + "epoch": 0.7687414277864849, + "grad_norm": 6.9375, + "learning_rate": 7.242349176674797e-06, + "loss": 1.06835432, + "memory(GiB)": 302.58, + "step": 137460, + "train_speed(iter/s)": 0.123941 + }, + { + "acc": 0.74268546, + "epoch": 0.7688532772594642, + "grad_norm": 5.15625, + "learning_rate": 7.241522638912035e-06, + "loss": 0.99997683, + "memory(GiB)": 302.58, + "step": 137480, + "train_speed(iter/s)": 0.12395 + }, + { + "acc": 0.75457449, + "epoch": 0.7689651267324434, + "grad_norm": 6.4375, + "learning_rate": 7.240696024482093e-06, + "loss": 0.95023003, + "memory(GiB)": 302.58, + "step": 137500, + "train_speed(iter/s)": 0.123958 + }, + { + "acc": 0.73212876, + "epoch": 0.7690769762054227, + "grad_norm": 8.25, + "learning_rate": 7.239869333413242e-06, + "loss": 1.06953363, + "memory(GiB)": 302.58, + "step": 137520, + "train_speed(iter/s)": 0.123966 + }, + { + "acc": 0.73144617, + "epoch": 0.769188825678402, + "grad_norm": 10.1875, + "learning_rate": 7.239042565733757e-06, + "loss": 1.05477571, + "memory(GiB)": 302.58, + "step": 137540, + "train_speed(iter/s)": 0.123975 + }, + { + "acc": 0.74669733, + "epoch": 0.7693006751513812, + "grad_norm": 7.8125, + "learning_rate": 7.238215721471916e-06, + "loss": 0.98427944, + "memory(GiB)": 302.58, + "step": 137560, + "train_speed(iter/s)": 0.123984 + }, + { + "acc": 0.75229759, + "epoch": 0.7694125246243605, + "grad_norm": 8.875, + "learning_rate": 7.237388800656e-06, + "loss": 0.97504005, + "memory(GiB)": 302.58, + "step": 137580, + "train_speed(iter/s)": 0.123992 + }, + { + "acc": 0.75671115, + "epoch": 0.7695243740973398, + "grad_norm": 4.21875, + "learning_rate": 7.236561803314292e-06, + "loss": 0.94032364, + "memory(GiB)": 302.58, + "step": 137600, + "train_speed(iter/s)": 0.124001 + }, + { + "acc": 0.72775087, + "epoch": 0.769636223570319, + "grad_norm": 4.96875, + "learning_rate": 7.235734729475079e-06, + "loss": 1.07371044, + "memory(GiB)": 302.58, + "step": 137620, + "train_speed(iter/s)": 0.124009 + }, + { + "acc": 0.75122328, + "epoch": 0.7697480730432983, + "grad_norm": 10.25, + "learning_rate": 7.23490757916665e-06, + "loss": 0.96844845, + "memory(GiB)": 302.58, + "step": 137640, + "train_speed(iter/s)": 0.124018 + }, + { + "acc": 0.73214445, + "epoch": 0.7698599225162776, + "grad_norm": 4.96875, + "learning_rate": 7.234080352417294e-06, + "loss": 1.07008705, + "memory(GiB)": 302.58, + "step": 137660, + "train_speed(iter/s)": 0.124027 + }, + { + "acc": 0.74217191, + "epoch": 0.7699717719892568, + "grad_norm": 7.65625, + "learning_rate": 7.233253049255305e-06, + "loss": 1.0184516, + "memory(GiB)": 302.58, + "step": 137680, + "train_speed(iter/s)": 0.124035 + }, + { + "acc": 0.73426762, + "epoch": 0.7700836214622361, + "grad_norm": 4.6875, + "learning_rate": 7.232425669708982e-06, + "loss": 1.04543018, + "memory(GiB)": 302.58, + "step": 137700, + "train_speed(iter/s)": 0.124043 + }, + { + "acc": 0.73414936, + "epoch": 0.7701954709352153, + "grad_norm": 5.1875, + "learning_rate": 7.231598213806621e-06, + "loss": 1.0576148, + "memory(GiB)": 302.58, + "step": 137720, + "train_speed(iter/s)": 0.124052 + }, + { + "acc": 0.74348297, + "epoch": 0.7703073204081946, + "grad_norm": 8.75, + "learning_rate": 7.230770681576525e-06, + "loss": 0.99792976, + "memory(GiB)": 302.58, + "step": 137740, + "train_speed(iter/s)": 0.124061 + }, + { + "acc": 0.73893981, + "epoch": 0.7704191698811739, + "grad_norm": 9.875, + "learning_rate": 7.229943073046999e-06, + "loss": 1.0262414, + "memory(GiB)": 302.58, + "step": 137760, + "train_speed(iter/s)": 0.124069 + }, + { + "acc": 0.74922209, + "epoch": 0.7705310193541531, + "grad_norm": 6.125, + "learning_rate": 7.229115388246348e-06, + "loss": 0.98514767, + "memory(GiB)": 302.58, + "step": 137780, + "train_speed(iter/s)": 0.124077 + }, + { + "acc": 0.73004975, + "epoch": 0.7706428688271324, + "grad_norm": 6.96875, + "learning_rate": 7.228287627202884e-06, + "loss": 1.07791824, + "memory(GiB)": 302.58, + "step": 137800, + "train_speed(iter/s)": 0.124086 + }, + { + "acc": 0.75636754, + "epoch": 0.7707547183001117, + "grad_norm": 7.0625, + "learning_rate": 7.2274597899449165e-06, + "loss": 0.98547277, + "memory(GiB)": 302.58, + "step": 137820, + "train_speed(iter/s)": 0.124094 + }, + { + "acc": 0.74494195, + "epoch": 0.7708665677730909, + "grad_norm": 8.125, + "learning_rate": 7.22663187650076e-06, + "loss": 1.02773008, + "memory(GiB)": 302.58, + "step": 137840, + "train_speed(iter/s)": 0.124102 + }, + { + "acc": 0.718401, + "epoch": 0.7709784172460702, + "grad_norm": 5.875, + "learning_rate": 7.225803886898734e-06, + "loss": 1.11496487, + "memory(GiB)": 302.58, + "step": 137860, + "train_speed(iter/s)": 0.12411 + }, + { + "acc": 0.75050559, + "epoch": 0.7710902667190495, + "grad_norm": 5.4375, + "learning_rate": 7.224975821167156e-06, + "loss": 0.96105881, + "memory(GiB)": 302.58, + "step": 137880, + "train_speed(iter/s)": 0.124119 + }, + { + "acc": 0.74692926, + "epoch": 0.7712021161920287, + "grad_norm": 6.03125, + "learning_rate": 7.224147679334351e-06, + "loss": 1.00379982, + "memory(GiB)": 302.58, + "step": 137900, + "train_speed(iter/s)": 0.124128 + }, + { + "acc": 0.75887523, + "epoch": 0.771313965665008, + "grad_norm": 7.40625, + "learning_rate": 7.223319461428643e-06, + "loss": 0.9538249, + "memory(GiB)": 302.58, + "step": 137920, + "train_speed(iter/s)": 0.124136 + }, + { + "acc": 0.75920138, + "epoch": 0.7714258151379872, + "grad_norm": 8.8125, + "learning_rate": 7.222491167478359e-06, + "loss": 0.94751616, + "memory(GiB)": 302.58, + "step": 137940, + "train_speed(iter/s)": 0.124144 + }, + { + "acc": 0.74006591, + "epoch": 0.7715376646109665, + "grad_norm": 5.625, + "learning_rate": 7.22166279751183e-06, + "loss": 1.01251898, + "memory(GiB)": 302.58, + "step": 137960, + "train_speed(iter/s)": 0.124152 + }, + { + "acc": 0.74910288, + "epoch": 0.7716495140839458, + "grad_norm": 6.125, + "learning_rate": 7.2208343515573885e-06, + "loss": 0.96956806, + "memory(GiB)": 302.58, + "step": 137980, + "train_speed(iter/s)": 0.124161 + }, + { + "acc": 0.7416533, + "epoch": 0.771761363556925, + "grad_norm": 7.0625, + "learning_rate": 7.220005829643371e-06, + "loss": 0.99572697, + "memory(GiB)": 302.58, + "step": 138000, + "train_speed(iter/s)": 0.124169 + }, + { + "epoch": 0.771761363556925, + "eval_acc": 0.704807984580754, + "eval_loss": 1.020049810409546, + "eval_runtime": 7502.5427, + "eval_samples_per_second": 10.034, + "eval_steps_per_second": 10.034, + "step": 138000 + }, + { + "acc": 0.73219919, + "epoch": 0.7718732130299043, + "grad_norm": 7.59375, + "learning_rate": 7.219177231798111e-06, + "loss": 1.04849863, + "memory(GiB)": 302.58, + "step": 138020, + "train_speed(iter/s)": 0.123331 + }, + { + "acc": 0.74541845, + "epoch": 0.7719850625028836, + "grad_norm": 10.625, + "learning_rate": 7.2183485580499565e-06, + "loss": 0.99060144, + "memory(GiB)": 302.58, + "step": 138040, + "train_speed(iter/s)": 0.123339 + }, + { + "acc": 0.74280028, + "epoch": 0.7720969119758628, + "grad_norm": 4.71875, + "learning_rate": 7.217519808427247e-06, + "loss": 1.01159611, + "memory(GiB)": 302.58, + "step": 138060, + "train_speed(iter/s)": 0.123347 + }, + { + "acc": 0.73601003, + "epoch": 0.7722087614488421, + "grad_norm": 8.1875, + "learning_rate": 7.2166909829583276e-06, + "loss": 1.05186186, + "memory(GiB)": 302.58, + "step": 138080, + "train_speed(iter/s)": 0.123356 + }, + { + "acc": 0.72841291, + "epoch": 0.7723206109218214, + "grad_norm": 7.84375, + "learning_rate": 7.215862081671548e-06, + "loss": 1.07617579, + "memory(GiB)": 302.58, + "step": 138100, + "train_speed(iter/s)": 0.123363 + }, + { + "acc": 0.73207026, + "epoch": 0.7724324603948006, + "grad_norm": 6.5625, + "learning_rate": 7.215033104595257e-06, + "loss": 1.07250042, + "memory(GiB)": 302.58, + "step": 138120, + "train_speed(iter/s)": 0.123372 + }, + { + "acc": 0.73521843, + "epoch": 0.7725443098677799, + "grad_norm": 9.9375, + "learning_rate": 7.214204051757812e-06, + "loss": 1.0282238, + "memory(GiB)": 302.58, + "step": 138140, + "train_speed(iter/s)": 0.12338 + }, + { + "acc": 0.72322302, + "epoch": 0.7726561593407592, + "grad_norm": 7.21875, + "learning_rate": 7.213374923187567e-06, + "loss": 1.0926199, + "memory(GiB)": 302.58, + "step": 138160, + "train_speed(iter/s)": 0.123389 + }, + { + "acc": 0.72758374, + "epoch": 0.7727680088137384, + "grad_norm": 6.09375, + "learning_rate": 7.212545718912881e-06, + "loss": 1.06524115, + "memory(GiB)": 302.58, + "step": 138180, + "train_speed(iter/s)": 0.123397 + }, + { + "acc": 0.73901091, + "epoch": 0.7728798582867177, + "grad_norm": 6.78125, + "learning_rate": 7.211716438962116e-06, + "loss": 1.02259312, + "memory(GiB)": 302.58, + "step": 138200, + "train_speed(iter/s)": 0.123406 + }, + { + "acc": 0.72299895, + "epoch": 0.7729917077596969, + "grad_norm": 6.625, + "learning_rate": 7.210887083363636e-06, + "loss": 1.11243744, + "memory(GiB)": 302.58, + "step": 138220, + "train_speed(iter/s)": 0.123414 + }, + { + "acc": 0.74314237, + "epoch": 0.7731035572326763, + "grad_norm": 5.875, + "learning_rate": 7.2100576521458065e-06, + "loss": 0.99826517, + "memory(GiB)": 302.58, + "step": 138240, + "train_speed(iter/s)": 0.123423 + }, + { + "acc": 0.73659873, + "epoch": 0.7732154067056556, + "grad_norm": 8.0625, + "learning_rate": 7.209228145336998e-06, + "loss": 1.0533699, + "memory(GiB)": 302.58, + "step": 138260, + "train_speed(iter/s)": 0.123431 + }, + { + "acc": 0.74261398, + "epoch": 0.7733272561786348, + "grad_norm": 9.75, + "learning_rate": 7.208398562965582e-06, + "loss": 1.02639408, + "memory(GiB)": 302.58, + "step": 138280, + "train_speed(iter/s)": 0.12344 + }, + { + "acc": 0.73812175, + "epoch": 0.7734391056516141, + "grad_norm": 7.25, + "learning_rate": 7.2075689050599305e-06, + "loss": 1.02773418, + "memory(GiB)": 302.58, + "step": 138300, + "train_speed(iter/s)": 0.123448 + }, + { + "acc": 0.73931508, + "epoch": 0.7735509551245934, + "grad_norm": 6.15625, + "learning_rate": 7.206739171648422e-06, + "loss": 1.0402751, + "memory(GiB)": 302.58, + "step": 138320, + "train_speed(iter/s)": 0.123456 + }, + { + "acc": 0.74131513, + "epoch": 0.7736628045975726, + "grad_norm": 9.75, + "learning_rate": 7.205909362759439e-06, + "loss": 1.0227417, + "memory(GiB)": 302.58, + "step": 138340, + "train_speed(iter/s)": 0.123465 + }, + { + "acc": 0.75463452, + "epoch": 0.7737746540705519, + "grad_norm": 7.25, + "learning_rate": 7.205079478421361e-06, + "loss": 0.94652224, + "memory(GiB)": 302.58, + "step": 138360, + "train_speed(iter/s)": 0.123474 + }, + { + "acc": 0.7412755, + "epoch": 0.7738865035435312, + "grad_norm": 7.34375, + "learning_rate": 7.204249518662572e-06, + "loss": 1.01865807, + "memory(GiB)": 302.58, + "step": 138380, + "train_speed(iter/s)": 0.123483 + }, + { + "acc": 0.74407139, + "epoch": 0.7739983530165104, + "grad_norm": 8.375, + "learning_rate": 7.203419483511459e-06, + "loss": 1.00242615, + "memory(GiB)": 302.58, + "step": 138400, + "train_speed(iter/s)": 0.123492 + }, + { + "acc": 0.75157375, + "epoch": 0.7741102024894897, + "grad_norm": 7.15625, + "learning_rate": 7.202589372996413e-06, + "loss": 0.97784185, + "memory(GiB)": 302.58, + "step": 138420, + "train_speed(iter/s)": 0.1235 + }, + { + "acc": 0.74895549, + "epoch": 0.774222051962469, + "grad_norm": 4.9375, + "learning_rate": 7.201759187145828e-06, + "loss": 0.97924299, + "memory(GiB)": 302.58, + "step": 138440, + "train_speed(iter/s)": 0.123509 + }, + { + "acc": 0.74059911, + "epoch": 0.7743339014354482, + "grad_norm": 6.96875, + "learning_rate": 7.200928925988096e-06, + "loss": 0.99579906, + "memory(GiB)": 302.58, + "step": 138460, + "train_speed(iter/s)": 0.123518 + }, + { + "acc": 0.74223676, + "epoch": 0.7744457509084275, + "grad_norm": 5.8125, + "learning_rate": 7.200098589551616e-06, + "loss": 1.02074594, + "memory(GiB)": 302.58, + "step": 138480, + "train_speed(iter/s)": 0.123526 + }, + { + "acc": 0.75566859, + "epoch": 0.7745576003814068, + "grad_norm": 6.8125, + "learning_rate": 7.1992681778647875e-06, + "loss": 0.94911966, + "memory(GiB)": 302.58, + "step": 138500, + "train_speed(iter/s)": 0.123534 + }, + { + "acc": 0.73456964, + "epoch": 0.774669449854386, + "grad_norm": 4.8125, + "learning_rate": 7.198437690956014e-06, + "loss": 1.03231955, + "memory(GiB)": 302.58, + "step": 138520, + "train_speed(iter/s)": 0.123542 + }, + { + "acc": 0.74761567, + "epoch": 0.7747812993273653, + "grad_norm": 6.28125, + "learning_rate": 7.197607128853701e-06, + "loss": 0.98790998, + "memory(GiB)": 302.58, + "step": 138540, + "train_speed(iter/s)": 0.123551 + }, + { + "acc": 0.74442029, + "epoch": 0.7748931488003445, + "grad_norm": 7.84375, + "learning_rate": 7.196776491586255e-06, + "loss": 1.02289457, + "memory(GiB)": 302.58, + "step": 138560, + "train_speed(iter/s)": 0.123559 + }, + { + "acc": 0.73046942, + "epoch": 0.7750049982733238, + "grad_norm": 7.71875, + "learning_rate": 7.195945779182088e-06, + "loss": 1.06289406, + "memory(GiB)": 302.58, + "step": 138580, + "train_speed(iter/s)": 0.123567 + }, + { + "acc": 0.74792695, + "epoch": 0.7751168477463031, + "grad_norm": 6.125, + "learning_rate": 7.195114991669611e-06, + "loss": 0.97073793, + "memory(GiB)": 302.58, + "step": 138600, + "train_speed(iter/s)": 0.123574 + }, + { + "acc": 0.7335835, + "epoch": 0.7752286972192823, + "grad_norm": 7.75, + "learning_rate": 7.19428412907724e-06, + "loss": 1.05022821, + "memory(GiB)": 302.58, + "step": 138620, + "train_speed(iter/s)": 0.123583 + }, + { + "acc": 0.75329347, + "epoch": 0.7753405466922616, + "grad_norm": 7.125, + "learning_rate": 7.193453191433396e-06, + "loss": 0.95253344, + "memory(GiB)": 302.58, + "step": 138640, + "train_speed(iter/s)": 0.123592 + }, + { + "acc": 0.7280314, + "epoch": 0.7754523961652409, + "grad_norm": 7.8125, + "learning_rate": 7.192622178766497e-06, + "loss": 1.06623859, + "memory(GiB)": 302.58, + "step": 138660, + "train_speed(iter/s)": 0.1236 + }, + { + "acc": 0.7376039, + "epoch": 0.7755642456382201, + "grad_norm": 8.5625, + "learning_rate": 7.191791091104967e-06, + "loss": 1.03164682, + "memory(GiB)": 302.58, + "step": 138680, + "train_speed(iter/s)": 0.123609 + }, + { + "acc": 0.72514434, + "epoch": 0.7756760951111994, + "grad_norm": 8.1875, + "learning_rate": 7.190959928477232e-06, + "loss": 1.10061178, + "memory(GiB)": 302.58, + "step": 138700, + "train_speed(iter/s)": 0.123617 + }, + { + "acc": 0.7384325, + "epoch": 0.7757879445841787, + "grad_norm": 7.1875, + "learning_rate": 7.190128690911717e-06, + "loss": 1.04246588, + "memory(GiB)": 302.58, + "step": 138720, + "train_speed(iter/s)": 0.123624 + }, + { + "acc": 0.74409428, + "epoch": 0.7758997940571579, + "grad_norm": 5.5625, + "learning_rate": 7.189297378436859e-06, + "loss": 1.01522703, + "memory(GiB)": 302.58, + "step": 138740, + "train_speed(iter/s)": 0.123633 + }, + { + "acc": 0.77021084, + "epoch": 0.7760116435301372, + "grad_norm": 10.0, + "learning_rate": 7.188465991081088e-06, + "loss": 0.89781637, + "memory(GiB)": 302.58, + "step": 138760, + "train_speed(iter/s)": 0.123641 + }, + { + "acc": 0.7478528, + "epoch": 0.7761234930031164, + "grad_norm": 5.65625, + "learning_rate": 7.187634528872842e-06, + "loss": 1.00895061, + "memory(GiB)": 302.58, + "step": 138780, + "train_speed(iter/s)": 0.12365 + }, + { + "acc": 0.72002997, + "epoch": 0.7762353424760957, + "grad_norm": 5.78125, + "learning_rate": 7.1868029918405564e-06, + "loss": 1.11654081, + "memory(GiB)": 302.58, + "step": 138800, + "train_speed(iter/s)": 0.123658 + }, + { + "acc": 0.74660444, + "epoch": 0.776347191949075, + "grad_norm": 9.6875, + "learning_rate": 7.185971380012675e-06, + "loss": 1.0045577, + "memory(GiB)": 302.58, + "step": 138820, + "train_speed(iter/s)": 0.123667 + }, + { + "acc": 0.73929954, + "epoch": 0.7764590414220542, + "grad_norm": 7.5, + "learning_rate": 7.185139693417642e-06, + "loss": 1.03193054, + "memory(GiB)": 302.58, + "step": 138840, + "train_speed(iter/s)": 0.123676 + }, + { + "acc": 0.72622299, + "epoch": 0.7765708908950335, + "grad_norm": 7.03125, + "learning_rate": 7.184307932083901e-06, + "loss": 1.07869453, + "memory(GiB)": 302.58, + "step": 138860, + "train_speed(iter/s)": 0.123685 + }, + { + "acc": 0.73644495, + "epoch": 0.7766827403680128, + "grad_norm": 5.84375, + "learning_rate": 7.183476096039903e-06, + "loss": 1.04304695, + "memory(GiB)": 302.58, + "step": 138880, + "train_speed(iter/s)": 0.123693 + }, + { + "acc": 0.74224367, + "epoch": 0.776794589840992, + "grad_norm": 8.625, + "learning_rate": 7.182644185314099e-06, + "loss": 0.99655981, + "memory(GiB)": 302.58, + "step": 138900, + "train_speed(iter/s)": 0.123702 + }, + { + "acc": 0.72717299, + "epoch": 0.7769064393139713, + "grad_norm": 8.3125, + "learning_rate": 7.1818121999349426e-06, + "loss": 1.09077063, + "memory(GiB)": 302.58, + "step": 138920, + "train_speed(iter/s)": 0.123711 + }, + { + "acc": 0.71704659, + "epoch": 0.7770182887869506, + "grad_norm": 7.6875, + "learning_rate": 7.18098013993089e-06, + "loss": 1.12697525, + "memory(GiB)": 302.58, + "step": 138940, + "train_speed(iter/s)": 0.12372 + }, + { + "acc": 0.72106094, + "epoch": 0.7771301382599298, + "grad_norm": 7.875, + "learning_rate": 7.180148005330402e-06, + "loss": 1.10987577, + "memory(GiB)": 302.58, + "step": 138960, + "train_speed(iter/s)": 0.123728 + }, + { + "acc": 0.75249729, + "epoch": 0.7772419877329091, + "grad_norm": 7.0, + "learning_rate": 7.1793157961619384e-06, + "loss": 0.97313786, + "memory(GiB)": 302.58, + "step": 138980, + "train_speed(iter/s)": 0.123737 + }, + { + "acc": 0.74080038, + "epoch": 0.7773538372058884, + "grad_norm": 5.96875, + "learning_rate": 7.178483512453964e-06, + "loss": 1.0276516, + "memory(GiB)": 302.58, + "step": 139000, + "train_speed(iter/s)": 0.123745 + }, + { + "acc": 0.73681278, + "epoch": 0.7774656866788676, + "grad_norm": 6.6875, + "learning_rate": 7.177651154234947e-06, + "loss": 1.04728203, + "memory(GiB)": 302.58, + "step": 139020, + "train_speed(iter/s)": 0.123754 + }, + { + "acc": 0.74025879, + "epoch": 0.7775775361518469, + "grad_norm": 8.1875, + "learning_rate": 7.176818721533354e-06, + "loss": 1.02968912, + "memory(GiB)": 302.58, + "step": 139040, + "train_speed(iter/s)": 0.123762 + }, + { + "acc": 0.74964247, + "epoch": 0.7776893856248261, + "grad_norm": 8.0, + "learning_rate": 7.1759862143776595e-06, + "loss": 0.96560268, + "memory(GiB)": 302.58, + "step": 139060, + "train_speed(iter/s)": 0.123771 + }, + { + "acc": 0.73688974, + "epoch": 0.7778012350978054, + "grad_norm": 7.25, + "learning_rate": 7.175153632796335e-06, + "loss": 1.02660265, + "memory(GiB)": 302.58, + "step": 139080, + "train_speed(iter/s)": 0.123779 + }, + { + "acc": 0.72590322, + "epoch": 0.7779130845707847, + "grad_norm": 8.375, + "learning_rate": 7.174320976817861e-06, + "loss": 1.07551022, + "memory(GiB)": 302.58, + "step": 139100, + "train_speed(iter/s)": 0.123788 + }, + { + "acc": 0.74893494, + "epoch": 0.7780249340437639, + "grad_norm": 8.4375, + "learning_rate": 7.173488246470715e-06, + "loss": 0.97369308, + "memory(GiB)": 302.58, + "step": 139120, + "train_speed(iter/s)": 0.123796 + }, + { + "acc": 0.74050326, + "epoch": 0.7781367835167432, + "grad_norm": 5.5625, + "learning_rate": 7.172655441783377e-06, + "loss": 1.02252588, + "memory(GiB)": 302.58, + "step": 139140, + "train_speed(iter/s)": 0.123805 + }, + { + "acc": 0.74063859, + "epoch": 0.7782486329897225, + "grad_norm": 6.375, + "learning_rate": 7.171822562784334e-06, + "loss": 1.0347928, + "memory(GiB)": 302.58, + "step": 139160, + "train_speed(iter/s)": 0.123814 + }, + { + "acc": 0.74021401, + "epoch": 0.7783604824627017, + "grad_norm": 5.59375, + "learning_rate": 7.170989609502074e-06, + "loss": 1.03069382, + "memory(GiB)": 302.58, + "step": 139180, + "train_speed(iter/s)": 0.123823 + }, + { + "acc": 0.74662824, + "epoch": 0.778472331935681, + "grad_norm": 5.25, + "learning_rate": 7.170156581965084e-06, + "loss": 0.98799906, + "memory(GiB)": 302.58, + "step": 139200, + "train_speed(iter/s)": 0.123831 + }, + { + "acc": 0.74357414, + "epoch": 0.7785841814086603, + "grad_norm": 8.3125, + "learning_rate": 7.169323480201859e-06, + "loss": 1.01535263, + "memory(GiB)": 302.58, + "step": 139220, + "train_speed(iter/s)": 0.12384 + }, + { + "acc": 0.73990321, + "epoch": 0.7786960308816395, + "grad_norm": 10.625, + "learning_rate": 7.1684903042408916e-06, + "loss": 1.03774672, + "memory(GiB)": 302.58, + "step": 139240, + "train_speed(iter/s)": 0.123849 + }, + { + "acc": 0.73930478, + "epoch": 0.7788078803546188, + "grad_norm": 6.53125, + "learning_rate": 7.1676570541106795e-06, + "loss": 1.01054602, + "memory(GiB)": 302.58, + "step": 139260, + "train_speed(iter/s)": 0.123857 + }, + { + "acc": 0.75205584, + "epoch": 0.778919729827598, + "grad_norm": 7.875, + "learning_rate": 7.166823729839723e-06, + "loss": 0.97294044, + "memory(GiB)": 302.58, + "step": 139280, + "train_speed(iter/s)": 0.123865 + }, + { + "acc": 0.7507607, + "epoch": 0.7790315793005773, + "grad_norm": 7.65625, + "learning_rate": 7.165990331456523e-06, + "loss": 0.9858077, + "memory(GiB)": 302.58, + "step": 139300, + "train_speed(iter/s)": 0.123874 + }, + { + "acc": 0.74106812, + "epoch": 0.7791434287735566, + "grad_norm": 8.125, + "learning_rate": 7.165156858989586e-06, + "loss": 1.01369486, + "memory(GiB)": 302.58, + "step": 139320, + "train_speed(iter/s)": 0.123882 + }, + { + "acc": 0.73654881, + "epoch": 0.7792552782465358, + "grad_norm": 8.5, + "learning_rate": 7.164323312467419e-06, + "loss": 1.04183416, + "memory(GiB)": 302.58, + "step": 139340, + "train_speed(iter/s)": 0.12389 + }, + { + "acc": 0.75040231, + "epoch": 0.7793671277195151, + "grad_norm": 7.59375, + "learning_rate": 7.163489691918533e-06, + "loss": 0.98518963, + "memory(GiB)": 302.58, + "step": 139360, + "train_speed(iter/s)": 0.123899 + }, + { + "acc": 0.7301085, + "epoch": 0.7794789771924944, + "grad_norm": 7.53125, + "learning_rate": 7.162655997371437e-06, + "loss": 1.08843355, + "memory(GiB)": 302.58, + "step": 139380, + "train_speed(iter/s)": 0.123908 + }, + { + "acc": 0.72936902, + "epoch": 0.7795908266654736, + "grad_norm": 8.5, + "learning_rate": 7.16182222885465e-06, + "loss": 1.0726306, + "memory(GiB)": 302.58, + "step": 139400, + "train_speed(iter/s)": 0.123916 + }, + { + "acc": 0.74738879, + "epoch": 0.7797026761384529, + "grad_norm": 6.9375, + "learning_rate": 7.160988386396688e-06, + "loss": 0.98745108, + "memory(GiB)": 302.58, + "step": 139420, + "train_speed(iter/s)": 0.123924 + }, + { + "acc": 0.74371901, + "epoch": 0.7798145256114322, + "grad_norm": 9.375, + "learning_rate": 7.16015447002607e-06, + "loss": 1.00883675, + "memory(GiB)": 302.58, + "step": 139440, + "train_speed(iter/s)": 0.123933 + }, + { + "acc": 0.72492585, + "epoch": 0.7799263750844114, + "grad_norm": 6.21875, + "learning_rate": 7.159320479771319e-06, + "loss": 1.07970133, + "memory(GiB)": 302.58, + "step": 139460, + "train_speed(iter/s)": 0.123941 + }, + { + "acc": 0.73011384, + "epoch": 0.7800382245573907, + "grad_norm": 7.21875, + "learning_rate": 7.158486415660962e-06, + "loss": 1.08012695, + "memory(GiB)": 302.58, + "step": 139480, + "train_speed(iter/s)": 0.123949 + }, + { + "acc": 0.7464561, + "epoch": 0.78015007403037, + "grad_norm": 11.1875, + "learning_rate": 7.157652277723525e-06, + "loss": 1.00528574, + "memory(GiB)": 302.58, + "step": 139500, + "train_speed(iter/s)": 0.123957 + }, + { + "acc": 0.7463768, + "epoch": 0.7802619235033492, + "grad_norm": 5.5, + "learning_rate": 7.156818065987538e-06, + "loss": 0.98548279, + "memory(GiB)": 302.58, + "step": 139520, + "train_speed(iter/s)": 0.123966 + }, + { + "acc": 0.73464856, + "epoch": 0.7803737729763285, + "grad_norm": 5.53125, + "learning_rate": 7.155983780481535e-06, + "loss": 1.03431511, + "memory(GiB)": 302.58, + "step": 139540, + "train_speed(iter/s)": 0.123974 + }, + { + "acc": 0.74587946, + "epoch": 0.7804856224493077, + "grad_norm": 7.125, + "learning_rate": 7.155149421234048e-06, + "loss": 0.99611864, + "memory(GiB)": 302.58, + "step": 139560, + "train_speed(iter/s)": 0.123982 + }, + { + "acc": 0.7468039, + "epoch": 0.780597471922287, + "grad_norm": 4.84375, + "learning_rate": 7.154314988273619e-06, + "loss": 1.01350746, + "memory(GiB)": 302.58, + "step": 139580, + "train_speed(iter/s)": 0.123989 + }, + { + "acc": 0.72893109, + "epoch": 0.7807093213952663, + "grad_norm": 8.25, + "learning_rate": 7.153480481628784e-06, + "loss": 1.08140554, + "memory(GiB)": 302.58, + "step": 139600, + "train_speed(iter/s)": 0.123998 + }, + { + "acc": 0.73641534, + "epoch": 0.7808211708682455, + "grad_norm": 9.5625, + "learning_rate": 7.152645901328089e-06, + "loss": 1.03059883, + "memory(GiB)": 302.58, + "step": 139620, + "train_speed(iter/s)": 0.124006 + }, + { + "acc": 0.74559631, + "epoch": 0.7809330203412248, + "grad_norm": 7.125, + "learning_rate": 7.151811247400079e-06, + "loss": 0.99858913, + "memory(GiB)": 302.58, + "step": 139640, + "train_speed(iter/s)": 0.124015 + }, + { + "acc": 0.73271646, + "epoch": 0.7810448698142041, + "grad_norm": 6.625, + "learning_rate": 7.150976519873302e-06, + "loss": 1.04785976, + "memory(GiB)": 302.58, + "step": 139660, + "train_speed(iter/s)": 0.124022 + }, + { + "acc": 0.76150093, + "epoch": 0.7811567192871833, + "grad_norm": 6.65625, + "learning_rate": 7.150141718776307e-06, + "loss": 0.92796402, + "memory(GiB)": 302.58, + "step": 139680, + "train_speed(iter/s)": 0.124031 + }, + { + "acc": 0.74559374, + "epoch": 0.7812685687601626, + "grad_norm": 6.75, + "learning_rate": 7.1493068441376455e-06, + "loss": 1.00196047, + "memory(GiB)": 302.58, + "step": 139700, + "train_speed(iter/s)": 0.124039 + }, + { + "acc": 0.73077807, + "epoch": 0.7813804182331419, + "grad_norm": 6.6875, + "learning_rate": 7.1484718959858765e-06, + "loss": 1.05466204, + "memory(GiB)": 302.58, + "step": 139720, + "train_speed(iter/s)": 0.124047 + }, + { + "acc": 0.75062952, + "epoch": 0.7814922677061211, + "grad_norm": 5.03125, + "learning_rate": 7.147636874349555e-06, + "loss": 0.98249483, + "memory(GiB)": 302.58, + "step": 139740, + "train_speed(iter/s)": 0.124056 + }, + { + "acc": 0.73248649, + "epoch": 0.7816041171791004, + "grad_norm": 8.0625, + "learning_rate": 7.146801779257243e-06, + "loss": 1.052526, + "memory(GiB)": 302.58, + "step": 139760, + "train_speed(iter/s)": 0.124065 + }, + { + "acc": 0.74698386, + "epoch": 0.7817159666520797, + "grad_norm": 7.15625, + "learning_rate": 7.145966610737503e-06, + "loss": 0.97461462, + "memory(GiB)": 302.58, + "step": 139780, + "train_speed(iter/s)": 0.124073 + }, + { + "acc": 0.74281726, + "epoch": 0.7818278161250589, + "grad_norm": 6.375, + "learning_rate": 7.1451313688188995e-06, + "loss": 1.02456732, + "memory(GiB)": 302.58, + "step": 139800, + "train_speed(iter/s)": 0.124081 + }, + { + "acc": 0.74432025, + "epoch": 0.7819396655980382, + "grad_norm": 11.0625, + "learning_rate": 7.144296053530002e-06, + "loss": 1.00829363, + "memory(GiB)": 302.58, + "step": 139820, + "train_speed(iter/s)": 0.124089 + }, + { + "acc": 0.75249834, + "epoch": 0.7820515150710174, + "grad_norm": 5.1875, + "learning_rate": 7.1434606648993795e-06, + "loss": 0.95301504, + "memory(GiB)": 302.58, + "step": 139840, + "train_speed(iter/s)": 0.124097 + }, + { + "acc": 0.74884195, + "epoch": 0.7821633645439967, + "grad_norm": 9.375, + "learning_rate": 7.142625202955606e-06, + "loss": 1.00187397, + "memory(GiB)": 302.58, + "step": 139860, + "train_speed(iter/s)": 0.124106 + }, + { + "acc": 0.72256374, + "epoch": 0.782275214016976, + "grad_norm": 9.875, + "learning_rate": 7.1417896677272556e-06, + "loss": 1.11074533, + "memory(GiB)": 302.58, + "step": 139880, + "train_speed(iter/s)": 0.124115 + }, + { + "acc": 0.73681536, + "epoch": 0.7823870634899552, + "grad_norm": 11.0, + "learning_rate": 7.140954059242908e-06, + "loss": 1.03086834, + "memory(GiB)": 302.58, + "step": 139900, + "train_speed(iter/s)": 0.124123 + }, + { + "acc": 0.72429523, + "epoch": 0.7824989129629345, + "grad_norm": 8.6875, + "learning_rate": 7.140118377531142e-06, + "loss": 1.10187349, + "memory(GiB)": 302.58, + "step": 139920, + "train_speed(iter/s)": 0.124132 + }, + { + "acc": 0.72945375, + "epoch": 0.7826107624359138, + "grad_norm": 8.4375, + "learning_rate": 7.1392826226205425e-06, + "loss": 1.07295923, + "memory(GiB)": 302.58, + "step": 139940, + "train_speed(iter/s)": 0.12414 + }, + { + "acc": 0.74269767, + "epoch": 0.782722611908893, + "grad_norm": 8.5, + "learning_rate": 7.138446794539693e-06, + "loss": 1.0220499, + "memory(GiB)": 302.58, + "step": 139960, + "train_speed(iter/s)": 0.124149 + }, + { + "acc": 0.7496717, + "epoch": 0.7828344613818723, + "grad_norm": 6.75, + "learning_rate": 7.137610893317183e-06, + "loss": 0.95096674, + "memory(GiB)": 302.58, + "step": 139980, + "train_speed(iter/s)": 0.124157 + }, + { + "acc": 0.73978252, + "epoch": 0.7829463108548516, + "grad_norm": 6.09375, + "learning_rate": 7.136774918981602e-06, + "loss": 1.02173786, + "memory(GiB)": 302.58, + "step": 140000, + "train_speed(iter/s)": 0.124165 + }, + { + "epoch": 0.7829463108548516, + "eval_acc": 0.7049421211362041, + "eval_loss": 1.0195680856704712, + "eval_runtime": 7500.9969, + "eval_samples_per_second": 10.036, + "eval_steps_per_second": 10.036, + "step": 140000 + }, + { + "acc": 0.7371933, + "epoch": 0.7830581603278308, + "grad_norm": 4.53125, + "learning_rate": 7.135938871561542e-06, + "loss": 1.01587524, + "memory(GiB)": 302.58, + "step": 140020, + "train_speed(iter/s)": 0.123339 + }, + { + "acc": 0.74462357, + "epoch": 0.7831700098008101, + "grad_norm": 7.78125, + "learning_rate": 7.135102751085601e-06, + "loss": 1.01242914, + "memory(GiB)": 302.58, + "step": 140040, + "train_speed(iter/s)": 0.123348 + }, + { + "acc": 0.73930063, + "epoch": 0.7832818592737893, + "grad_norm": 8.9375, + "learning_rate": 7.134266557582375e-06, + "loss": 1.01924114, + "memory(GiB)": 302.58, + "step": 140060, + "train_speed(iter/s)": 0.123356 + }, + { + "acc": 0.76427145, + "epoch": 0.7833937087467686, + "grad_norm": 6.21875, + "learning_rate": 7.1334302910804665e-06, + "loss": 0.91529398, + "memory(GiB)": 302.58, + "step": 140080, + "train_speed(iter/s)": 0.123364 + }, + { + "acc": 0.74034557, + "epoch": 0.7835055582197479, + "grad_norm": 8.4375, + "learning_rate": 7.132593951608476e-06, + "loss": 1.00777006, + "memory(GiB)": 302.58, + "step": 140100, + "train_speed(iter/s)": 0.123371 + }, + { + "acc": 0.74005365, + "epoch": 0.7836174076927271, + "grad_norm": 7.46875, + "learning_rate": 7.13175753919501e-06, + "loss": 1.02572565, + "memory(GiB)": 302.58, + "step": 140120, + "train_speed(iter/s)": 0.123379 + }, + { + "acc": 0.76804757, + "epoch": 0.7837292571657064, + "grad_norm": 7.53125, + "learning_rate": 7.130921053868678e-06, + "loss": 0.9236021, + "memory(GiB)": 302.58, + "step": 140140, + "train_speed(iter/s)": 0.123388 + }, + { + "acc": 0.74323831, + "epoch": 0.7838411066386857, + "grad_norm": 8.125, + "learning_rate": 7.130084495658086e-06, + "loss": 1.01780987, + "memory(GiB)": 302.58, + "step": 140160, + "train_speed(iter/s)": 0.123397 + }, + { + "acc": 0.73323874, + "epoch": 0.7839529561116649, + "grad_norm": 5.78125, + "learning_rate": 7.129247864591852e-06, + "loss": 1.08533316, + "memory(GiB)": 302.58, + "step": 140180, + "train_speed(iter/s)": 0.123405 + }, + { + "acc": 0.75379596, + "epoch": 0.7840648055846442, + "grad_norm": 7.6875, + "learning_rate": 7.128411160698589e-06, + "loss": 0.96554251, + "memory(GiB)": 302.58, + "step": 140200, + "train_speed(iter/s)": 0.123414 + }, + { + "acc": 0.7350512, + "epoch": 0.7841766550576235, + "grad_norm": 7.5, + "learning_rate": 7.127574384006915e-06, + "loss": 1.06795139, + "memory(GiB)": 302.58, + "step": 140220, + "train_speed(iter/s)": 0.123422 + }, + { + "acc": 0.73965454, + "epoch": 0.7842885045306027, + "grad_norm": 7.75, + "learning_rate": 7.126737534545452e-06, + "loss": 1.05031395, + "memory(GiB)": 302.58, + "step": 140240, + "train_speed(iter/s)": 0.123431 + }, + { + "acc": 0.74556842, + "epoch": 0.784400354003582, + "grad_norm": 7.8125, + "learning_rate": 7.1259006123428196e-06, + "loss": 1.02305346, + "memory(GiB)": 302.58, + "step": 140260, + "train_speed(iter/s)": 0.12344 + }, + { + "acc": 0.74321795, + "epoch": 0.7845122034765613, + "grad_norm": 8.25, + "learning_rate": 7.125063617427648e-06, + "loss": 1.00629244, + "memory(GiB)": 302.58, + "step": 140280, + "train_speed(iter/s)": 0.123448 + }, + { + "acc": 0.73338723, + "epoch": 0.7846240529495405, + "grad_norm": 8.375, + "learning_rate": 7.12422654982856e-06, + "loss": 1.06732359, + "memory(GiB)": 302.58, + "step": 140300, + "train_speed(iter/s)": 0.123457 + }, + { + "acc": 0.73505177, + "epoch": 0.7847359024225198, + "grad_norm": 6.625, + "learning_rate": 7.123389409574189e-06, + "loss": 1.06580572, + "memory(GiB)": 302.58, + "step": 140320, + "train_speed(iter/s)": 0.123465 + }, + { + "acc": 0.74933944, + "epoch": 0.784847751895499, + "grad_norm": 5.875, + "learning_rate": 7.122552196693167e-06, + "loss": 0.9904129, + "memory(GiB)": 302.58, + "step": 140340, + "train_speed(iter/s)": 0.123473 + }, + { + "acc": 0.72977595, + "epoch": 0.7849596013684783, + "grad_norm": 4.6875, + "learning_rate": 7.1217149112141286e-06, + "loss": 1.09107723, + "memory(GiB)": 302.58, + "step": 140360, + "train_speed(iter/s)": 0.123482 + }, + { + "acc": 0.73416986, + "epoch": 0.7850714508414576, + "grad_norm": 9.8125, + "learning_rate": 7.120877553165714e-06, + "loss": 1.03709984, + "memory(GiB)": 302.58, + "step": 140380, + "train_speed(iter/s)": 0.12349 + }, + { + "acc": 0.74453669, + "epoch": 0.7851833003144368, + "grad_norm": 6.53125, + "learning_rate": 7.120040122576561e-06, + "loss": 0.98937178, + "memory(GiB)": 302.58, + "step": 140400, + "train_speed(iter/s)": 0.123498 + }, + { + "acc": 0.74190021, + "epoch": 0.7852951497874161, + "grad_norm": 8.0, + "learning_rate": 7.119202619475314e-06, + "loss": 1.0316824, + "memory(GiB)": 302.58, + "step": 140420, + "train_speed(iter/s)": 0.123507 + }, + { + "acc": 0.74674168, + "epoch": 0.7854069992603954, + "grad_norm": 6.9375, + "learning_rate": 7.118365043890617e-06, + "loss": 0.98483315, + "memory(GiB)": 302.58, + "step": 140440, + "train_speed(iter/s)": 0.123514 + }, + { + "acc": 0.74060144, + "epoch": 0.7855188487333746, + "grad_norm": 5.96875, + "learning_rate": 7.117527395851118e-06, + "loss": 1.03913698, + "memory(GiB)": 302.58, + "step": 140460, + "train_speed(iter/s)": 0.123522 + }, + { + "acc": 0.74109745, + "epoch": 0.7856306982063539, + "grad_norm": 4.6875, + "learning_rate": 7.1166896753854685e-06, + "loss": 1.00518646, + "memory(GiB)": 302.58, + "step": 140480, + "train_speed(iter/s)": 0.12353 + }, + { + "acc": 0.7464406, + "epoch": 0.7857425476793332, + "grad_norm": 7.75, + "learning_rate": 7.11585188252232e-06, + "loss": 0.99073648, + "memory(GiB)": 302.58, + "step": 140500, + "train_speed(iter/s)": 0.123538 + }, + { + "acc": 0.73626428, + "epoch": 0.7858543971523124, + "grad_norm": 7.96875, + "learning_rate": 7.115014017290327e-06, + "loss": 1.0234745, + "memory(GiB)": 302.58, + "step": 140520, + "train_speed(iter/s)": 0.123546 + }, + { + "acc": 0.74213462, + "epoch": 0.7859662466252917, + "grad_norm": 6.0625, + "learning_rate": 7.114176079718149e-06, + "loss": 1.01517096, + "memory(GiB)": 302.58, + "step": 140540, + "train_speed(iter/s)": 0.123554 + }, + { + "acc": 0.76423907, + "epoch": 0.786078096098271, + "grad_norm": 6.75, + "learning_rate": 7.113338069834445e-06, + "loss": 0.92193279, + "memory(GiB)": 302.58, + "step": 140560, + "train_speed(iter/s)": 0.123563 + }, + { + "acc": 0.74692535, + "epoch": 0.7861899455712502, + "grad_norm": 9.375, + "learning_rate": 7.112499987667879e-06, + "loss": 1.01020575, + "memory(GiB)": 302.58, + "step": 140580, + "train_speed(iter/s)": 0.123571 + }, + { + "acc": 0.72450161, + "epoch": 0.7863017950442295, + "grad_norm": 5.40625, + "learning_rate": 7.111661833247113e-06, + "loss": 1.10757446, + "memory(GiB)": 302.58, + "step": 140600, + "train_speed(iter/s)": 0.12358 + }, + { + "acc": 0.73967338, + "epoch": 0.7864136445172087, + "grad_norm": 7.75, + "learning_rate": 7.1108236066008175e-06, + "loss": 1.02300787, + "memory(GiB)": 302.58, + "step": 140620, + "train_speed(iter/s)": 0.123588 + }, + { + "acc": 0.75192623, + "epoch": 0.786525493990188, + "grad_norm": 7.53125, + "learning_rate": 7.109985307757661e-06, + "loss": 0.98131361, + "memory(GiB)": 302.58, + "step": 140640, + "train_speed(iter/s)": 0.123597 + }, + { + "acc": 0.7426125, + "epoch": 0.7866373434631673, + "grad_norm": 6.25, + "learning_rate": 7.109146936746316e-06, + "loss": 1.01881762, + "memory(GiB)": 302.58, + "step": 140660, + "train_speed(iter/s)": 0.123605 + }, + { + "acc": 0.73435154, + "epoch": 0.7867491929361465, + "grad_norm": 6.0, + "learning_rate": 7.1083084935954595e-06, + "loss": 1.05328121, + "memory(GiB)": 302.58, + "step": 140680, + "train_speed(iter/s)": 0.123614 + }, + { + "acc": 0.74997044, + "epoch": 0.7868610424091258, + "grad_norm": 10.25, + "learning_rate": 7.107469978333765e-06, + "loss": 0.98842058, + "memory(GiB)": 302.58, + "step": 140700, + "train_speed(iter/s)": 0.123622 + }, + { + "acc": 0.73903828, + "epoch": 0.7869728918821051, + "grad_norm": 8.5625, + "learning_rate": 7.106631390989915e-06, + "loss": 1.03791895, + "memory(GiB)": 302.58, + "step": 140720, + "train_speed(iter/s)": 0.12363 + }, + { + "acc": 0.7492909, + "epoch": 0.7870847413550843, + "grad_norm": 6.75, + "learning_rate": 7.105792731592593e-06, + "loss": 0.99117384, + "memory(GiB)": 302.58, + "step": 140740, + "train_speed(iter/s)": 0.123638 + }, + { + "acc": 0.7485836, + "epoch": 0.7871965908280636, + "grad_norm": 5.59375, + "learning_rate": 7.104954000170481e-06, + "loss": 0.99219074, + "memory(GiB)": 302.58, + "step": 140760, + "train_speed(iter/s)": 0.123647 + }, + { + "acc": 0.72408123, + "epoch": 0.7873084403010429, + "grad_norm": 8.5, + "learning_rate": 7.1041151967522685e-06, + "loss": 1.11769762, + "memory(GiB)": 302.58, + "step": 140780, + "train_speed(iter/s)": 0.123654 + }, + { + "acc": 0.73791032, + "epoch": 0.7874202897740221, + "grad_norm": 7.09375, + "learning_rate": 7.103276321366644e-06, + "loss": 1.02575779, + "memory(GiB)": 302.58, + "step": 140800, + "train_speed(iter/s)": 0.123663 + }, + { + "acc": 0.748072, + "epoch": 0.7875321392470014, + "grad_norm": 6.53125, + "learning_rate": 7.102437374042299e-06, + "loss": 1.0006938, + "memory(GiB)": 302.58, + "step": 140820, + "train_speed(iter/s)": 0.123671 + }, + { + "acc": 0.74166226, + "epoch": 0.7876439887199806, + "grad_norm": 6.96875, + "learning_rate": 7.10159835480793e-06, + "loss": 1.0166379, + "memory(GiB)": 302.58, + "step": 140840, + "train_speed(iter/s)": 0.123679 + }, + { + "acc": 0.74863958, + "epoch": 0.7877558381929599, + "grad_norm": 10.5, + "learning_rate": 7.100759263692234e-06, + "loss": 0.98575869, + "memory(GiB)": 302.58, + "step": 140860, + "train_speed(iter/s)": 0.123688 + }, + { + "acc": 0.74108262, + "epoch": 0.7878676876659392, + "grad_norm": 7.90625, + "learning_rate": 7.099920100723909e-06, + "loss": 1.02791414, + "memory(GiB)": 302.58, + "step": 140880, + "train_speed(iter/s)": 0.123696 + }, + { + "acc": 0.7460454, + "epoch": 0.7879795371389184, + "grad_norm": 8.9375, + "learning_rate": 7.0990808659316565e-06, + "loss": 0.97747889, + "memory(GiB)": 302.58, + "step": 140900, + "train_speed(iter/s)": 0.123704 + }, + { + "acc": 0.73940511, + "epoch": 0.7880913866118977, + "grad_norm": 12.0, + "learning_rate": 7.098241559344185e-06, + "loss": 1.02473583, + "memory(GiB)": 302.58, + "step": 140920, + "train_speed(iter/s)": 0.123713 + }, + { + "acc": 0.74410925, + "epoch": 0.788203236084877, + "grad_norm": 8.125, + "learning_rate": 7.097402180990198e-06, + "loss": 0.99619246, + "memory(GiB)": 302.58, + "step": 140940, + "train_speed(iter/s)": 0.123721 + }, + { + "acc": 0.74407997, + "epoch": 0.7883150855578562, + "grad_norm": 6.59375, + "learning_rate": 7.0965627308984056e-06, + "loss": 0.99449434, + "memory(GiB)": 302.58, + "step": 140960, + "train_speed(iter/s)": 0.123729 + }, + { + "acc": 0.74652038, + "epoch": 0.7884269350308355, + "grad_norm": 7.03125, + "learning_rate": 7.095723209097519e-06, + "loss": 0.98471775, + "memory(GiB)": 302.58, + "step": 140980, + "train_speed(iter/s)": 0.123738 + }, + { + "acc": 0.74144239, + "epoch": 0.7885387845038148, + "grad_norm": 5.59375, + "learning_rate": 7.094883615616255e-06, + "loss": 1.01000261, + "memory(GiB)": 302.58, + "step": 141000, + "train_speed(iter/s)": 0.123746 + }, + { + "acc": 0.73051953, + "epoch": 0.788650633976794, + "grad_norm": 8.75, + "learning_rate": 7.094043950483327e-06, + "loss": 1.05492678, + "memory(GiB)": 302.58, + "step": 141020, + "train_speed(iter/s)": 0.123755 + }, + { + "acc": 0.72971697, + "epoch": 0.7887624834497733, + "grad_norm": 7.34375, + "learning_rate": 7.093204213727456e-06, + "loss": 1.0711792, + "memory(GiB)": 302.58, + "step": 141040, + "train_speed(iter/s)": 0.123763 + }, + { + "acc": 0.74978099, + "epoch": 0.7888743329227526, + "grad_norm": 5.125, + "learning_rate": 7.092364405377363e-06, + "loss": 0.97554951, + "memory(GiB)": 302.58, + "step": 141060, + "train_speed(iter/s)": 0.123771 + }, + { + "acc": 0.74557214, + "epoch": 0.7889861823957318, + "grad_norm": 10.0625, + "learning_rate": 7.091524525461775e-06, + "loss": 0.98113079, + "memory(GiB)": 302.58, + "step": 141080, + "train_speed(iter/s)": 0.12378 + }, + { + "acc": 0.74864173, + "epoch": 0.7890980318687111, + "grad_norm": 5.3125, + "learning_rate": 7.090684574009415e-06, + "loss": 0.99533205, + "memory(GiB)": 302.58, + "step": 141100, + "train_speed(iter/s)": 0.123788 + }, + { + "acc": 0.73701153, + "epoch": 0.7892098813416903, + "grad_norm": 8.0, + "learning_rate": 7.0898445510490125e-06, + "loss": 1.02288609, + "memory(GiB)": 302.58, + "step": 141120, + "train_speed(iter/s)": 0.123796 + }, + { + "acc": 0.75024014, + "epoch": 0.7893217308146696, + "grad_norm": 6.0625, + "learning_rate": 7.089004456609301e-06, + "loss": 0.98455219, + "memory(GiB)": 302.58, + "step": 141140, + "train_speed(iter/s)": 0.123804 + }, + { + "acc": 0.75534668, + "epoch": 0.7894335802876489, + "grad_norm": 9.625, + "learning_rate": 7.088164290719011e-06, + "loss": 0.95764847, + "memory(GiB)": 302.58, + "step": 141160, + "train_speed(iter/s)": 0.123812 + }, + { + "acc": 0.7346086, + "epoch": 0.7895454297606281, + "grad_norm": 7.6875, + "learning_rate": 7.087324053406882e-06, + "loss": 1.07520628, + "memory(GiB)": 302.58, + "step": 141180, + "train_speed(iter/s)": 0.12382 + }, + { + "acc": 0.74256425, + "epoch": 0.7896572792336074, + "grad_norm": 6.21875, + "learning_rate": 7.086483744701651e-06, + "loss": 0.99479542, + "memory(GiB)": 302.58, + "step": 141200, + "train_speed(iter/s)": 0.123828 + }, + { + "acc": 0.74011636, + "epoch": 0.7897691287065867, + "grad_norm": 5.84375, + "learning_rate": 7.085643364632061e-06, + "loss": 1.01616163, + "memory(GiB)": 302.58, + "step": 141220, + "train_speed(iter/s)": 0.123837 + }, + { + "acc": 0.75239916, + "epoch": 0.7898809781795659, + "grad_norm": 8.1875, + "learning_rate": 7.084802913226855e-06, + "loss": 0.95570011, + "memory(GiB)": 302.58, + "step": 141240, + "train_speed(iter/s)": 0.123845 + }, + { + "acc": 0.73161387, + "epoch": 0.7899928276525452, + "grad_norm": 6.6875, + "learning_rate": 7.083962390514777e-06, + "loss": 1.04909668, + "memory(GiB)": 302.58, + "step": 141260, + "train_speed(iter/s)": 0.123853 + }, + { + "acc": 0.72841058, + "epoch": 0.7901046771255245, + "grad_norm": 7.5625, + "learning_rate": 7.083121796524578e-06, + "loss": 1.07846823, + "memory(GiB)": 302.58, + "step": 141280, + "train_speed(iter/s)": 0.123862 + }, + { + "acc": 0.7573235, + "epoch": 0.7902165265985037, + "grad_norm": 8.0, + "learning_rate": 7.082281131285007e-06, + "loss": 0.9470149, + "memory(GiB)": 302.58, + "step": 141300, + "train_speed(iter/s)": 0.12387 + }, + { + "acc": 0.75087171, + "epoch": 0.790328376071483, + "grad_norm": 8.9375, + "learning_rate": 7.08144039482482e-06, + "loss": 0.9738656, + "memory(GiB)": 302.58, + "step": 141320, + "train_speed(iter/s)": 0.123879 + }, + { + "acc": 0.73858438, + "epoch": 0.7904402255444622, + "grad_norm": 6.90625, + "learning_rate": 7.080599587172772e-06, + "loss": 1.03475285, + "memory(GiB)": 302.58, + "step": 141340, + "train_speed(iter/s)": 0.123887 + }, + { + "acc": 0.73436685, + "epoch": 0.7905520750174415, + "grad_norm": 4.6875, + "learning_rate": 7.079758708357619e-06, + "loss": 1.04687881, + "memory(GiB)": 302.58, + "step": 141360, + "train_speed(iter/s)": 0.123895 + }, + { + "acc": 0.73981547, + "epoch": 0.7906639244904208, + "grad_norm": 6.5625, + "learning_rate": 7.0789177584081246e-06, + "loss": 1.03235312, + "memory(GiB)": 302.58, + "step": 141380, + "train_speed(iter/s)": 0.123904 + }, + { + "acc": 0.73380494, + "epoch": 0.7907757739634, + "grad_norm": 6.53125, + "learning_rate": 7.07807673735305e-06, + "loss": 1.06034851, + "memory(GiB)": 302.58, + "step": 141400, + "train_speed(iter/s)": 0.123912 + }, + { + "acc": 0.75803027, + "epoch": 0.7908876234363793, + "grad_norm": 5.09375, + "learning_rate": 7.077235645221163e-06, + "loss": 0.93332434, + "memory(GiB)": 302.58, + "step": 141420, + "train_speed(iter/s)": 0.12392 + }, + { + "acc": 0.73443518, + "epoch": 0.7909994729093586, + "grad_norm": 7.6875, + "learning_rate": 7.0763944820412274e-06, + "loss": 1.05509272, + "memory(GiB)": 302.58, + "step": 141440, + "train_speed(iter/s)": 0.123929 + }, + { + "acc": 0.73538032, + "epoch": 0.7911113223823378, + "grad_norm": 9.3125, + "learning_rate": 7.075553247842018e-06, + "loss": 1.03092098, + "memory(GiB)": 302.58, + "step": 141460, + "train_speed(iter/s)": 0.123937 + }, + { + "acc": 0.75042458, + "epoch": 0.7912231718553171, + "grad_norm": 8.0625, + "learning_rate": 7.074711942652306e-06, + "loss": 0.97767591, + "memory(GiB)": 302.58, + "step": 141480, + "train_speed(iter/s)": 0.123945 + }, + { + "acc": 0.74368582, + "epoch": 0.7913350213282964, + "grad_norm": 10.625, + "learning_rate": 7.073870566500868e-06, + "loss": 1.01861506, + "memory(GiB)": 302.58, + "step": 141500, + "train_speed(iter/s)": 0.123953 + }, + { + "acc": 0.73758669, + "epoch": 0.7914468708012756, + "grad_norm": 7.25, + "learning_rate": 7.0730291194164795e-06, + "loss": 1.03307009, + "memory(GiB)": 302.58, + "step": 141520, + "train_speed(iter/s)": 0.12396 + }, + { + "acc": 0.73253508, + "epoch": 0.7915587202742549, + "grad_norm": 10.0625, + "learning_rate": 7.072187601427922e-06, + "loss": 1.0463212, + "memory(GiB)": 302.58, + "step": 141540, + "train_speed(iter/s)": 0.123969 + }, + { + "acc": 0.75250516, + "epoch": 0.7916705697472342, + "grad_norm": 7.96875, + "learning_rate": 7.071346012563977e-06, + "loss": 0.97023888, + "memory(GiB)": 302.58, + "step": 141560, + "train_speed(iter/s)": 0.123977 + }, + { + "acc": 0.74441276, + "epoch": 0.7917824192202134, + "grad_norm": 7.375, + "learning_rate": 7.070504352853432e-06, + "loss": 1.00283947, + "memory(GiB)": 302.58, + "step": 141580, + "train_speed(iter/s)": 0.123985 + }, + { + "acc": 0.74409885, + "epoch": 0.7918942686931927, + "grad_norm": 8.5, + "learning_rate": 7.0696626223250705e-06, + "loss": 0.98087502, + "memory(GiB)": 302.58, + "step": 141600, + "train_speed(iter/s)": 0.123993 + }, + { + "acc": 0.74659262, + "epoch": 0.792006118166172, + "grad_norm": 6.125, + "learning_rate": 7.068820821007687e-06, + "loss": 0.99761581, + "memory(GiB)": 302.58, + "step": 141620, + "train_speed(iter/s)": 0.124001 + }, + { + "acc": 0.73411226, + "epoch": 0.7921179676391512, + "grad_norm": 6.9375, + "learning_rate": 7.0679789489300695e-06, + "loss": 1.05757828, + "memory(GiB)": 302.58, + "step": 141640, + "train_speed(iter/s)": 0.124009 + }, + { + "acc": 0.73645592, + "epoch": 0.7922298171121305, + "grad_norm": 6.59375, + "learning_rate": 7.067137006121017e-06, + "loss": 1.03020735, + "memory(GiB)": 302.58, + "step": 141660, + "train_speed(iter/s)": 0.124018 + }, + { + "acc": 0.7254559, + "epoch": 0.7923416665851097, + "grad_norm": 8.25, + "learning_rate": 7.066294992609322e-06, + "loss": 1.1013485, + "memory(GiB)": 302.58, + "step": 141680, + "train_speed(iter/s)": 0.124026 + }, + { + "acc": 0.73124061, + "epoch": 0.792453516058089, + "grad_norm": 6.5, + "learning_rate": 7.065452908423787e-06, + "loss": 1.07378559, + "memory(GiB)": 302.58, + "step": 141700, + "train_speed(iter/s)": 0.124034 + }, + { + "acc": 0.7272408, + "epoch": 0.7925653655310683, + "grad_norm": 4.4375, + "learning_rate": 7.064610753593213e-06, + "loss": 1.08666658, + "memory(GiB)": 302.58, + "step": 141720, + "train_speed(iter/s)": 0.124042 + }, + { + "acc": 0.72028794, + "epoch": 0.7926772150040475, + "grad_norm": 6.71875, + "learning_rate": 7.063768528146403e-06, + "loss": 1.1288168, + "memory(GiB)": 302.58, + "step": 141740, + "train_speed(iter/s)": 0.124051 + }, + { + "acc": 0.73985023, + "epoch": 0.7927890644770268, + "grad_norm": 6.375, + "learning_rate": 7.0629262321121675e-06, + "loss": 1.03431044, + "memory(GiB)": 302.58, + "step": 141760, + "train_speed(iter/s)": 0.124059 + }, + { + "acc": 0.72988567, + "epoch": 0.7929009139500061, + "grad_norm": 4.46875, + "learning_rate": 7.0620838655193124e-06, + "loss": 1.08970757, + "memory(GiB)": 302.58, + "step": 141780, + "train_speed(iter/s)": 0.124067 + }, + { + "acc": 0.73836994, + "epoch": 0.7930127634229853, + "grad_norm": 7.78125, + "learning_rate": 7.0612414283966505e-06, + "loss": 1.027771, + "memory(GiB)": 302.58, + "step": 141800, + "train_speed(iter/s)": 0.124075 + }, + { + "acc": 0.73910656, + "epoch": 0.7931246128959646, + "grad_norm": 7.90625, + "learning_rate": 7.060398920772996e-06, + "loss": 1.00131044, + "memory(GiB)": 302.58, + "step": 141820, + "train_speed(iter/s)": 0.124084 + }, + { + "acc": 0.74573994, + "epoch": 0.7932364623689439, + "grad_norm": 9.125, + "learning_rate": 7.059556342677165e-06, + "loss": 0.99425535, + "memory(GiB)": 302.58, + "step": 141840, + "train_speed(iter/s)": 0.124092 + }, + { + "acc": 0.74962187, + "epoch": 0.7933483118419231, + "grad_norm": 10.9375, + "learning_rate": 7.058713694137977e-06, + "loss": 0.97095242, + "memory(GiB)": 302.58, + "step": 141860, + "train_speed(iter/s)": 0.1241 + }, + { + "acc": 0.74037614, + "epoch": 0.7934601613149024, + "grad_norm": 5.625, + "learning_rate": 7.057870975184251e-06, + "loss": 1.01297541, + "memory(GiB)": 302.58, + "step": 141880, + "train_speed(iter/s)": 0.124108 + }, + { + "acc": 0.73460779, + "epoch": 0.7935720107878816, + "grad_norm": 5.84375, + "learning_rate": 7.057028185844814e-06, + "loss": 1.04249582, + "memory(GiB)": 302.58, + "step": 141900, + "train_speed(iter/s)": 0.124117 + }, + { + "acc": 0.74881902, + "epoch": 0.7936838602608609, + "grad_norm": 10.0, + "learning_rate": 7.056185326148489e-06, + "loss": 1.00738459, + "memory(GiB)": 302.58, + "step": 141920, + "train_speed(iter/s)": 0.124125 + }, + { + "acc": 0.73788667, + "epoch": 0.7937957097338402, + "grad_norm": 7.03125, + "learning_rate": 7.055342396124107e-06, + "loss": 1.03110428, + "memory(GiB)": 302.58, + "step": 141940, + "train_speed(iter/s)": 0.124134 + }, + { + "acc": 0.75386553, + "epoch": 0.7939075592068194, + "grad_norm": 5.03125, + "learning_rate": 7.054499395800498e-06, + "loss": 0.96768999, + "memory(GiB)": 302.58, + "step": 141960, + "train_speed(iter/s)": 0.124142 + }, + { + "acc": 0.74052896, + "epoch": 0.7940194086797987, + "grad_norm": 6.78125, + "learning_rate": 7.053656325206494e-06, + "loss": 0.98920527, + "memory(GiB)": 302.58, + "step": 141980, + "train_speed(iter/s)": 0.12415 + }, + { + "acc": 0.74327374, + "epoch": 0.794131258152778, + "grad_norm": 4.34375, + "learning_rate": 7.052813184370933e-06, + "loss": 1.00006618, + "memory(GiB)": 302.58, + "step": 142000, + "train_speed(iter/s)": 0.124158 + }, + { + "epoch": 0.794131258152778, + "eval_acc": 0.7050658560697506, + "eval_loss": 1.0190843343734741, + "eval_runtime": 7491.9076, + "eval_samples_per_second": 10.049, + "eval_steps_per_second": 10.049, + "step": 142000 + }, + { + "acc": 0.73359504, + "epoch": 0.7942431076257572, + "grad_norm": 9.6875, + "learning_rate": 7.05196997332265e-06, + "loss": 1.07125864, + "memory(GiB)": 302.58, + "step": 142020, + "train_speed(iter/s)": 0.123345 + }, + { + "acc": 0.73768439, + "epoch": 0.7943549570987365, + "grad_norm": 7.96875, + "learning_rate": 7.051126692090488e-06, + "loss": 1.01266937, + "memory(GiB)": 302.58, + "step": 142040, + "train_speed(iter/s)": 0.123354 + }, + { + "acc": 0.73908582, + "epoch": 0.7944668065717158, + "grad_norm": 9.8125, + "learning_rate": 7.0502833407032874e-06, + "loss": 1.03783054, + "memory(GiB)": 302.58, + "step": 142060, + "train_speed(iter/s)": 0.123362 + }, + { + "acc": 0.72898612, + "epoch": 0.794578656044695, + "grad_norm": 6.03125, + "learning_rate": 7.0494399191898975e-06, + "loss": 1.06593475, + "memory(GiB)": 302.58, + "step": 142080, + "train_speed(iter/s)": 0.12337 + }, + { + "acc": 0.74736023, + "epoch": 0.7946905055176743, + "grad_norm": 7.0625, + "learning_rate": 7.048596427579164e-06, + "loss": 0.96796112, + "memory(GiB)": 302.58, + "step": 142100, + "train_speed(iter/s)": 0.123378 + }, + { + "acc": 0.72723403, + "epoch": 0.7948023549906535, + "grad_norm": 5.84375, + "learning_rate": 7.047752865899937e-06, + "loss": 1.07208328, + "memory(GiB)": 302.58, + "step": 142120, + "train_speed(iter/s)": 0.123386 + }, + { + "acc": 0.74646816, + "epoch": 0.7949142044636328, + "grad_norm": 9.8125, + "learning_rate": 7.046909234181067e-06, + "loss": 0.99240398, + "memory(GiB)": 302.58, + "step": 142140, + "train_speed(iter/s)": 0.123394 + }, + { + "acc": 0.73934641, + "epoch": 0.7950260539366121, + "grad_norm": 10.5625, + "learning_rate": 7.046065532451413e-06, + "loss": 1.01706724, + "memory(GiB)": 302.58, + "step": 142160, + "train_speed(iter/s)": 0.123402 + }, + { + "acc": 0.74827132, + "epoch": 0.7951379034095913, + "grad_norm": 6.875, + "learning_rate": 7.045221760739827e-06, + "loss": 0.98258276, + "memory(GiB)": 302.58, + "step": 142180, + "train_speed(iter/s)": 0.12341 + }, + { + "acc": 0.73048768, + "epoch": 0.7952497528825706, + "grad_norm": 7.03125, + "learning_rate": 7.044377919075174e-06, + "loss": 1.0581543, + "memory(GiB)": 302.58, + "step": 142200, + "train_speed(iter/s)": 0.123418 + }, + { + "acc": 0.75885396, + "epoch": 0.7953616023555499, + "grad_norm": 9.8125, + "learning_rate": 7.043534007486313e-06, + "loss": 0.92408714, + "memory(GiB)": 302.58, + "step": 142220, + "train_speed(iter/s)": 0.123426 + }, + { + "acc": 0.74082537, + "epoch": 0.7954734518285291, + "grad_norm": 6.21875, + "learning_rate": 7.042690026002108e-06, + "loss": 1.01302748, + "memory(GiB)": 302.58, + "step": 142240, + "train_speed(iter/s)": 0.123434 + }, + { + "acc": 0.7481133, + "epoch": 0.7955853013015084, + "grad_norm": 8.3125, + "learning_rate": 7.041845974651428e-06, + "loss": 0.97612495, + "memory(GiB)": 302.58, + "step": 142260, + "train_speed(iter/s)": 0.123442 + }, + { + "acc": 0.73466763, + "epoch": 0.7956971507744877, + "grad_norm": 12.0, + "learning_rate": 7.0410018534631405e-06, + "loss": 1.04490242, + "memory(GiB)": 302.58, + "step": 142280, + "train_speed(iter/s)": 0.123451 + }, + { + "acc": 0.72288232, + "epoch": 0.7958090002474669, + "grad_norm": 10.75, + "learning_rate": 7.040157662466119e-06, + "loss": 1.09395838, + "memory(GiB)": 302.58, + "step": 142300, + "train_speed(iter/s)": 0.12346 + }, + { + "acc": 0.73271809, + "epoch": 0.7959208497204462, + "grad_norm": 10.0625, + "learning_rate": 7.0393134016892336e-06, + "loss": 1.06622553, + "memory(GiB)": 302.58, + "step": 142320, + "train_speed(iter/s)": 0.123469 + }, + { + "acc": 0.74943676, + "epoch": 0.7960326991934255, + "grad_norm": 7.5625, + "learning_rate": 7.0384690711613655e-06, + "loss": 0.99965897, + "memory(GiB)": 302.58, + "step": 142340, + "train_speed(iter/s)": 0.123478 + }, + { + "acc": 0.7314271, + "epoch": 0.7961445486664047, + "grad_norm": 5.1875, + "learning_rate": 7.0376246709113915e-06, + "loss": 1.06646347, + "memory(GiB)": 302.58, + "step": 142360, + "train_speed(iter/s)": 0.123486 + }, + { + "acc": 0.75847926, + "epoch": 0.796256398139384, + "grad_norm": 9.3125, + "learning_rate": 7.036780200968192e-06, + "loss": 0.94324236, + "memory(GiB)": 302.58, + "step": 142380, + "train_speed(iter/s)": 0.123495 + }, + { + "acc": 0.7419909, + "epoch": 0.7963682476123632, + "grad_norm": 6.3125, + "learning_rate": 7.035935661360651e-06, + "loss": 1.00612326, + "memory(GiB)": 302.58, + "step": 142400, + "train_speed(iter/s)": 0.123503 + }, + { + "acc": 0.74339471, + "epoch": 0.7964800970853425, + "grad_norm": 5.34375, + "learning_rate": 7.035091052117655e-06, + "loss": 0.99149065, + "memory(GiB)": 302.58, + "step": 142420, + "train_speed(iter/s)": 0.123512 + }, + { + "acc": 0.74758959, + "epoch": 0.7965919465583218, + "grad_norm": 9.25, + "learning_rate": 7.034246373268092e-06, + "loss": 0.99894009, + "memory(GiB)": 302.58, + "step": 142440, + "train_speed(iter/s)": 0.12352 + }, + { + "acc": 0.74531913, + "epoch": 0.796703796031301, + "grad_norm": 6.78125, + "learning_rate": 7.033401624840852e-06, + "loss": 1.01829624, + "memory(GiB)": 302.58, + "step": 142460, + "train_speed(iter/s)": 0.123529 + }, + { + "acc": 0.73966579, + "epoch": 0.7968156455042803, + "grad_norm": 7.25, + "learning_rate": 7.0325568068648296e-06, + "loss": 1.02359819, + "memory(GiB)": 302.58, + "step": 142480, + "train_speed(iter/s)": 0.123538 + }, + { + "acc": 0.74188166, + "epoch": 0.7969274949772596, + "grad_norm": 8.875, + "learning_rate": 7.031711919368919e-06, + "loss": 0.99607849, + "memory(GiB)": 302.58, + "step": 142500, + "train_speed(iter/s)": 0.123546 + }, + { + "acc": 0.74070711, + "epoch": 0.7970393444502388, + "grad_norm": 6.84375, + "learning_rate": 7.030866962382018e-06, + "loss": 1.00464287, + "memory(GiB)": 302.58, + "step": 142520, + "train_speed(iter/s)": 0.123554 + }, + { + "acc": 0.73602448, + "epoch": 0.7971511939232181, + "grad_norm": 6.03125, + "learning_rate": 7.030021935933028e-06, + "loss": 1.00619879, + "memory(GiB)": 302.58, + "step": 142540, + "train_speed(iter/s)": 0.123562 + }, + { + "acc": 0.74428434, + "epoch": 0.7972630433961974, + "grad_norm": 5.375, + "learning_rate": 7.02917684005085e-06, + "loss": 1.01428089, + "memory(GiB)": 302.58, + "step": 142560, + "train_speed(iter/s)": 0.12357 + }, + { + "acc": 0.72659302, + "epoch": 0.7973748928691766, + "grad_norm": 8.0625, + "learning_rate": 7.02833167476439e-06, + "loss": 1.06421814, + "memory(GiB)": 302.58, + "step": 142580, + "train_speed(iter/s)": 0.123578 + }, + { + "acc": 0.74245448, + "epoch": 0.7974867423421559, + "grad_norm": 7.34375, + "learning_rate": 7.0274864401025545e-06, + "loss": 1.00922976, + "memory(GiB)": 302.58, + "step": 142600, + "train_speed(iter/s)": 0.123586 + }, + { + "acc": 0.72842278, + "epoch": 0.7975985918151351, + "grad_norm": 8.4375, + "learning_rate": 7.026641136094254e-06, + "loss": 1.07940226, + "memory(GiB)": 302.58, + "step": 142620, + "train_speed(iter/s)": 0.123594 + }, + { + "acc": 0.74746532, + "epoch": 0.7977104412881144, + "grad_norm": 6.0625, + "learning_rate": 7.025795762768401e-06, + "loss": 0.98024378, + "memory(GiB)": 302.58, + "step": 142640, + "train_speed(iter/s)": 0.123603 + }, + { + "acc": 0.72753758, + "epoch": 0.7978222907610937, + "grad_norm": 6.65625, + "learning_rate": 7.024950320153909e-06, + "loss": 1.09113674, + "memory(GiB)": 302.58, + "step": 142660, + "train_speed(iter/s)": 0.123611 + }, + { + "acc": 0.7366899, + "epoch": 0.7979341402340729, + "grad_norm": 7.9375, + "learning_rate": 7.024104808279694e-06, + "loss": 1.01959639, + "memory(GiB)": 302.58, + "step": 142680, + "train_speed(iter/s)": 0.123619 + }, + { + "acc": 0.74022374, + "epoch": 0.7980459897070522, + "grad_norm": 6.1875, + "learning_rate": 7.023259227174677e-06, + "loss": 1.0236166, + "memory(GiB)": 302.58, + "step": 142700, + "train_speed(iter/s)": 0.123627 + }, + { + "acc": 0.74657226, + "epoch": 0.7981578391800315, + "grad_norm": 6.90625, + "learning_rate": 7.022413576867779e-06, + "loss": 0.98477945, + "memory(GiB)": 302.58, + "step": 142720, + "train_speed(iter/s)": 0.123635 + }, + { + "acc": 0.73987212, + "epoch": 0.7982696886530107, + "grad_norm": 7.125, + "learning_rate": 7.021567857387925e-06, + "loss": 1.02782164, + "memory(GiB)": 302.58, + "step": 142740, + "train_speed(iter/s)": 0.123643 + }, + { + "acc": 0.74132118, + "epoch": 0.79838153812599, + "grad_norm": 6.40625, + "learning_rate": 7.020722068764038e-06, + "loss": 1.0361475, + "memory(GiB)": 302.58, + "step": 142760, + "train_speed(iter/s)": 0.123651 + }, + { + "acc": 0.71314564, + "epoch": 0.7984933875989693, + "grad_norm": 7.21875, + "learning_rate": 7.01987621102505e-06, + "loss": 1.17454262, + "memory(GiB)": 302.58, + "step": 142780, + "train_speed(iter/s)": 0.123659 + }, + { + "acc": 0.7498611, + "epoch": 0.7986052370719485, + "grad_norm": 6.03125, + "learning_rate": 7.019030284199891e-06, + "loss": 0.96353226, + "memory(GiB)": 302.58, + "step": 142800, + "train_speed(iter/s)": 0.123667 + }, + { + "acc": 0.75827532, + "epoch": 0.7987170865449278, + "grad_norm": 10.1875, + "learning_rate": 7.018184288317492e-06, + "loss": 0.93062611, + "memory(GiB)": 302.58, + "step": 142820, + "train_speed(iter/s)": 0.123676 + }, + { + "acc": 0.74436321, + "epoch": 0.798828936017907, + "grad_norm": 6.90625, + "learning_rate": 7.0173382234067935e-06, + "loss": 1.00684614, + "memory(GiB)": 302.58, + "step": 142840, + "train_speed(iter/s)": 0.123684 + }, + { + "acc": 0.72521787, + "epoch": 0.7989407854908863, + "grad_norm": 7.125, + "learning_rate": 7.016492089496729e-06, + "loss": 1.08563719, + "memory(GiB)": 302.58, + "step": 142860, + "train_speed(iter/s)": 0.123692 + }, + { + "acc": 0.7259645, + "epoch": 0.7990526349638656, + "grad_norm": 7.8125, + "learning_rate": 7.015645886616243e-06, + "loss": 1.08886633, + "memory(GiB)": 302.58, + "step": 142880, + "train_speed(iter/s)": 0.1237 + }, + { + "acc": 0.75527964, + "epoch": 0.7991644844368448, + "grad_norm": 6.28125, + "learning_rate": 7.014799614794273e-06, + "loss": 0.95835285, + "memory(GiB)": 302.58, + "step": 142900, + "train_speed(iter/s)": 0.123707 + }, + { + "acc": 0.73572388, + "epoch": 0.7992763339098241, + "grad_norm": 8.0625, + "learning_rate": 7.013953274059771e-06, + "loss": 1.04326982, + "memory(GiB)": 302.58, + "step": 142920, + "train_speed(iter/s)": 0.123715 + }, + { + "acc": 0.73970056, + "epoch": 0.7993881833828034, + "grad_norm": 9.8125, + "learning_rate": 7.013106864441679e-06, + "loss": 1.02655449, + "memory(GiB)": 302.58, + "step": 142940, + "train_speed(iter/s)": 0.123723 + }, + { + "acc": 0.74532185, + "epoch": 0.7995000328557826, + "grad_norm": 7.40625, + "learning_rate": 7.0122603859689485e-06, + "loss": 0.96604252, + "memory(GiB)": 302.58, + "step": 142960, + "train_speed(iter/s)": 0.123731 + }, + { + "acc": 0.7581841, + "epoch": 0.7996118823287619, + "grad_norm": 4.34375, + "learning_rate": 7.011413838670533e-06, + "loss": 0.95297928, + "memory(GiB)": 302.58, + "step": 142980, + "train_speed(iter/s)": 0.123739 + }, + { + "acc": 0.74762235, + "epoch": 0.7997237318017412, + "grad_norm": 5.8125, + "learning_rate": 7.010567222575386e-06, + "loss": 0.99794579, + "memory(GiB)": 302.58, + "step": 143000, + "train_speed(iter/s)": 0.123747 + }, + { + "acc": 0.74367437, + "epoch": 0.7998355812747204, + "grad_norm": 9.4375, + "learning_rate": 7.009720537712466e-06, + "loss": 1.00922928, + "memory(GiB)": 302.58, + "step": 143020, + "train_speed(iter/s)": 0.123756 + }, + { + "acc": 0.74924712, + "epoch": 0.7999474307476997, + "grad_norm": 7.21875, + "learning_rate": 7.008873784110728e-06, + "loss": 0.96722164, + "memory(GiB)": 302.58, + "step": 143040, + "train_speed(iter/s)": 0.123764 + }, + { + "acc": 0.72984362, + "epoch": 0.800059280220679, + "grad_norm": 5.40625, + "learning_rate": 7.008026961799138e-06, + "loss": 1.0537055, + "memory(GiB)": 302.58, + "step": 143060, + "train_speed(iter/s)": 0.123772 + }, + { + "acc": 0.75709434, + "epoch": 0.8001711296936582, + "grad_norm": 5.5625, + "learning_rate": 7.007180070806659e-06, + "loss": 0.9271327, + "memory(GiB)": 302.58, + "step": 143080, + "train_speed(iter/s)": 0.123781 + }, + { + "acc": 0.7514823, + "epoch": 0.8002829791666375, + "grad_norm": 6.25, + "learning_rate": 7.0063331111622565e-06, + "loss": 0.95862722, + "memory(GiB)": 302.58, + "step": 143100, + "train_speed(iter/s)": 0.123788 + }, + { + "acc": 0.75299449, + "epoch": 0.8003948286396168, + "grad_norm": 9.375, + "learning_rate": 7.0054860828949015e-06, + "loss": 0.94542484, + "memory(GiB)": 302.58, + "step": 143120, + "train_speed(iter/s)": 0.123797 + }, + { + "acc": 0.7482327, + "epoch": 0.800506678112596, + "grad_norm": 7.40625, + "learning_rate": 7.004638986033561e-06, + "loss": 1.0022049, + "memory(GiB)": 302.58, + "step": 143140, + "train_speed(iter/s)": 0.123804 + }, + { + "acc": 0.74116287, + "epoch": 0.8006185275855753, + "grad_norm": 7.4375, + "learning_rate": 7.003791820607211e-06, + "loss": 0.99985752, + "memory(GiB)": 302.58, + "step": 143160, + "train_speed(iter/s)": 0.123813 + }, + { + "acc": 0.73229737, + "epoch": 0.8007303770585545, + "grad_norm": 7.625, + "learning_rate": 7.002944586644827e-06, + "loss": 1.05338984, + "memory(GiB)": 302.58, + "step": 143180, + "train_speed(iter/s)": 0.123821 + }, + { + "acc": 0.74403167, + "epoch": 0.8008422265315338, + "grad_norm": 5.53125, + "learning_rate": 7.002097284175387e-06, + "loss": 1.01157017, + "memory(GiB)": 302.58, + "step": 143200, + "train_speed(iter/s)": 0.123829 + }, + { + "acc": 0.73105311, + "epoch": 0.8009540760045131, + "grad_norm": 6.78125, + "learning_rate": 7.001249913227872e-06, + "loss": 1.05785065, + "memory(GiB)": 302.58, + "step": 143220, + "train_speed(iter/s)": 0.123837 + }, + { + "acc": 0.73517046, + "epoch": 0.8010659254774923, + "grad_norm": 8.0625, + "learning_rate": 7.000402473831264e-06, + "loss": 1.03515682, + "memory(GiB)": 302.58, + "step": 143240, + "train_speed(iter/s)": 0.123846 + }, + { + "acc": 0.72773914, + "epoch": 0.8011777749504717, + "grad_norm": 8.625, + "learning_rate": 6.999554966014548e-06, + "loss": 1.08455505, + "memory(GiB)": 302.58, + "step": 143260, + "train_speed(iter/s)": 0.123854 + }, + { + "acc": 0.73234358, + "epoch": 0.801289624423451, + "grad_norm": 5.375, + "learning_rate": 6.998707389806712e-06, + "loss": 1.06601334, + "memory(GiB)": 302.58, + "step": 143280, + "train_speed(iter/s)": 0.123862 + }, + { + "acc": 0.72100143, + "epoch": 0.8014014738964302, + "grad_norm": 8.25, + "learning_rate": 6.997859745236744e-06, + "loss": 1.11683083, + "memory(GiB)": 302.58, + "step": 143300, + "train_speed(iter/s)": 0.123869 + }, + { + "acc": 0.73974738, + "epoch": 0.8015133233694095, + "grad_norm": 6.6875, + "learning_rate": 6.99701203233364e-06, + "loss": 1.03538532, + "memory(GiB)": 302.58, + "step": 143320, + "train_speed(iter/s)": 0.123878 + }, + { + "acc": 0.73472052, + "epoch": 0.8016251728423888, + "grad_norm": 7.375, + "learning_rate": 6.996164251126389e-06, + "loss": 1.03073397, + "memory(GiB)": 302.58, + "step": 143340, + "train_speed(iter/s)": 0.123886 + }, + { + "acc": 0.75795932, + "epoch": 0.801737022315368, + "grad_norm": 6.03125, + "learning_rate": 6.995316401643994e-06, + "loss": 0.94602795, + "memory(GiB)": 302.58, + "step": 143360, + "train_speed(iter/s)": 0.123895 + }, + { + "acc": 0.74656997, + "epoch": 0.8018488717883473, + "grad_norm": 6.59375, + "learning_rate": 6.994468483915451e-06, + "loss": 0.98668528, + "memory(GiB)": 302.58, + "step": 143380, + "train_speed(iter/s)": 0.123903 + }, + { + "acc": 0.73011079, + "epoch": 0.8019607212613266, + "grad_norm": 9.875, + "learning_rate": 6.993620497969761e-06, + "loss": 1.06673174, + "memory(GiB)": 302.58, + "step": 143400, + "train_speed(iter/s)": 0.123911 + }, + { + "acc": 0.74430966, + "epoch": 0.8020725707343058, + "grad_norm": 5.03125, + "learning_rate": 6.992772443835928e-06, + "loss": 1.00984516, + "memory(GiB)": 302.58, + "step": 143420, + "train_speed(iter/s)": 0.12392 + }, + { + "acc": 0.74575934, + "epoch": 0.8021844202072851, + "grad_norm": 6.125, + "learning_rate": 6.991924321542959e-06, + "loss": 0.98823376, + "memory(GiB)": 302.58, + "step": 143440, + "train_speed(iter/s)": 0.123928 + }, + { + "acc": 0.74524312, + "epoch": 0.8022962696802644, + "grad_norm": 7.28125, + "learning_rate": 6.991076131119861e-06, + "loss": 1.00897131, + "memory(GiB)": 302.58, + "step": 143460, + "train_speed(iter/s)": 0.123936 + }, + { + "acc": 0.74724169, + "epoch": 0.8024081191532436, + "grad_norm": 7.28125, + "learning_rate": 6.990227872595646e-06, + "loss": 0.98510799, + "memory(GiB)": 302.58, + "step": 143480, + "train_speed(iter/s)": 0.123945 + }, + { + "acc": 0.74928107, + "epoch": 0.8025199686262229, + "grad_norm": 6.59375, + "learning_rate": 6.989379545999327e-06, + "loss": 0.98103704, + "memory(GiB)": 302.58, + "step": 143500, + "train_speed(iter/s)": 0.123953 + }, + { + "acc": 0.74237318, + "epoch": 0.8026318180992021, + "grad_norm": 6.71875, + "learning_rate": 6.9885311513599205e-06, + "loss": 1.01388941, + "memory(GiB)": 302.58, + "step": 143520, + "train_speed(iter/s)": 0.123962 + }, + { + "acc": 0.74658923, + "epoch": 0.8027436675721814, + "grad_norm": 6.40625, + "learning_rate": 6.987682688706443e-06, + "loss": 0.99890003, + "memory(GiB)": 302.58, + "step": 143540, + "train_speed(iter/s)": 0.12397 + }, + { + "acc": 0.73411431, + "epoch": 0.8028555170451607, + "grad_norm": 8.8125, + "learning_rate": 6.986834158067914e-06, + "loss": 1.04557247, + "memory(GiB)": 302.58, + "step": 143560, + "train_speed(iter/s)": 0.123978 + }, + { + "acc": 0.73752136, + "epoch": 0.8029673665181399, + "grad_norm": 7.71875, + "learning_rate": 6.985985559473356e-06, + "loss": 1.03507853, + "memory(GiB)": 302.58, + "step": 143580, + "train_speed(iter/s)": 0.123987 + }, + { + "acc": 0.73184447, + "epoch": 0.8030792159911192, + "grad_norm": 6.09375, + "learning_rate": 6.985136892951795e-06, + "loss": 1.05706587, + "memory(GiB)": 302.58, + "step": 143600, + "train_speed(iter/s)": 0.123996 + }, + { + "acc": 0.74492288, + "epoch": 0.8031910654640985, + "grad_norm": 6.5625, + "learning_rate": 6.984288158532258e-06, + "loss": 0.99569597, + "memory(GiB)": 302.58, + "step": 143620, + "train_speed(iter/s)": 0.124005 + }, + { + "acc": 0.72334738, + "epoch": 0.8033029149370777, + "grad_norm": 8.375, + "learning_rate": 6.983439356243774e-06, + "loss": 1.10657644, + "memory(GiB)": 302.58, + "step": 143640, + "train_speed(iter/s)": 0.124013 + }, + { + "acc": 0.74859233, + "epoch": 0.803414764410057, + "grad_norm": 7.46875, + "learning_rate": 6.982590486115375e-06, + "loss": 0.98942299, + "memory(GiB)": 302.58, + "step": 143660, + "train_speed(iter/s)": 0.124021 + }, + { + "acc": 0.75991154, + "epoch": 0.8035266138830363, + "grad_norm": 5.5625, + "learning_rate": 6.981741548176094e-06, + "loss": 0.93931503, + "memory(GiB)": 302.58, + "step": 143680, + "train_speed(iter/s)": 0.124028 + }, + { + "acc": 0.74666505, + "epoch": 0.8036384633560155, + "grad_norm": 7.125, + "learning_rate": 6.980892542454969e-06, + "loss": 0.98821526, + "memory(GiB)": 302.58, + "step": 143700, + "train_speed(iter/s)": 0.124037 + }, + { + "acc": 0.73864017, + "epoch": 0.8037503128289948, + "grad_norm": 6.0625, + "learning_rate": 6.980043468981037e-06, + "loss": 1.03442202, + "memory(GiB)": 302.58, + "step": 143720, + "train_speed(iter/s)": 0.124045 + }, + { + "acc": 0.73790689, + "epoch": 0.803862162301974, + "grad_norm": 6.0625, + "learning_rate": 6.97919432778334e-06, + "loss": 1.04241056, + "memory(GiB)": 302.58, + "step": 143740, + "train_speed(iter/s)": 0.124053 + }, + { + "acc": 0.74075046, + "epoch": 0.8039740117749533, + "grad_norm": 7.53125, + "learning_rate": 6.978345118890921e-06, + "loss": 1.01290789, + "memory(GiB)": 302.58, + "step": 143760, + "train_speed(iter/s)": 0.12406 + }, + { + "acc": 0.7465548, + "epoch": 0.8040858612479326, + "grad_norm": 7.3125, + "learning_rate": 6.977495842332827e-06, + "loss": 0.97905226, + "memory(GiB)": 302.58, + "step": 143780, + "train_speed(iter/s)": 0.124068 + }, + { + "acc": 0.73276987, + "epoch": 0.8041977107209118, + "grad_norm": 7.71875, + "learning_rate": 6.976646498138103e-06, + "loss": 1.03162031, + "memory(GiB)": 302.58, + "step": 143800, + "train_speed(iter/s)": 0.124077 + }, + { + "acc": 0.74386716, + "epoch": 0.8043095601938911, + "grad_norm": 7.625, + "learning_rate": 6.975797086335801e-06, + "loss": 1.00595531, + "memory(GiB)": 302.58, + "step": 143820, + "train_speed(iter/s)": 0.124084 + }, + { + "acc": 0.74321742, + "epoch": 0.8044214096668704, + "grad_norm": 6.25, + "learning_rate": 6.974947606954975e-06, + "loss": 1.00083818, + "memory(GiB)": 302.58, + "step": 143840, + "train_speed(iter/s)": 0.124092 + }, + { + "acc": 0.74436731, + "epoch": 0.8045332591398496, + "grad_norm": 6.9375, + "learning_rate": 6.9740980600246775e-06, + "loss": 0.99138937, + "memory(GiB)": 302.58, + "step": 143860, + "train_speed(iter/s)": 0.1241 + }, + { + "acc": 0.7479609, + "epoch": 0.8046451086128289, + "grad_norm": 7.09375, + "learning_rate": 6.973248445573967e-06, + "loss": 0.98783703, + "memory(GiB)": 302.58, + "step": 143880, + "train_speed(iter/s)": 0.124108 + }, + { + "acc": 0.7374959, + "epoch": 0.8047569580858082, + "grad_norm": 7.125, + "learning_rate": 6.972398763631902e-06, + "loss": 1.01192875, + "memory(GiB)": 302.58, + "step": 143900, + "train_speed(iter/s)": 0.124117 + }, + { + "acc": 0.7360599, + "epoch": 0.8048688075587874, + "grad_norm": 7.71875, + "learning_rate": 6.9715490142275455e-06, + "loss": 1.0448802, + "memory(GiB)": 302.58, + "step": 143920, + "train_speed(iter/s)": 0.124124 + }, + { + "acc": 0.75398784, + "epoch": 0.8049806570317667, + "grad_norm": 7.78125, + "learning_rate": 6.970699197389961e-06, + "loss": 0.97511702, + "memory(GiB)": 302.58, + "step": 143940, + "train_speed(iter/s)": 0.124133 + }, + { + "acc": 0.73129792, + "epoch": 0.805092506504746, + "grad_norm": 4.84375, + "learning_rate": 6.969849313148215e-06, + "loss": 1.04001656, + "memory(GiB)": 302.58, + "step": 143960, + "train_speed(iter/s)": 0.124141 + }, + { + "acc": 0.73900728, + "epoch": 0.8052043559777252, + "grad_norm": 8.8125, + "learning_rate": 6.968999361531377e-06, + "loss": 1.02112741, + "memory(GiB)": 302.58, + "step": 143980, + "train_speed(iter/s)": 0.124149 + }, + { + "acc": 0.73162246, + "epoch": 0.8053162054507045, + "grad_norm": 6.75, + "learning_rate": 6.9681493425685165e-06, + "loss": 1.07638302, + "memory(GiB)": 302.58, + "step": 144000, + "train_speed(iter/s)": 0.124156 + }, + { + "epoch": 0.8053162054507045, + "eval_acc": 0.7051450760052921, + "eval_loss": 1.0186278820037842, + "eval_runtime": 7508.6388, + "eval_samples_per_second": 10.026, + "eval_steps_per_second": 10.026, + "step": 144000 + }, + { + "acc": 0.73611612, + "epoch": 0.8054280549236837, + "grad_norm": 9.6875, + "learning_rate": 6.967299256288708e-06, + "loss": 1.04053288, + "memory(GiB)": 302.58, + "step": 144020, + "train_speed(iter/s)": 0.123352 + }, + { + "acc": 0.74542723, + "epoch": 0.805539904396663, + "grad_norm": 7.625, + "learning_rate": 6.966449102721025e-06, + "loss": 0.99894094, + "memory(GiB)": 302.58, + "step": 144040, + "train_speed(iter/s)": 0.12336 + }, + { + "acc": 0.7334661, + "epoch": 0.8056517538696423, + "grad_norm": 7.125, + "learning_rate": 6.965598881894549e-06, + "loss": 1.05318356, + "memory(GiB)": 302.58, + "step": 144060, + "train_speed(iter/s)": 0.123368 + }, + { + "acc": 0.7692102, + "epoch": 0.8057636033426215, + "grad_norm": 9.5625, + "learning_rate": 6.964748593838359e-06, + "loss": 0.90044346, + "memory(GiB)": 302.58, + "step": 144080, + "train_speed(iter/s)": 0.123376 + }, + { + "acc": 0.74335093, + "epoch": 0.8058754528156008, + "grad_norm": 7.28125, + "learning_rate": 6.9638982385815365e-06, + "loss": 1.00651407, + "memory(GiB)": 302.58, + "step": 144100, + "train_speed(iter/s)": 0.123384 + }, + { + "acc": 0.73086295, + "epoch": 0.8059873022885801, + "grad_norm": 7.5, + "learning_rate": 6.963047816153166e-06, + "loss": 1.05760975, + "memory(GiB)": 302.58, + "step": 144120, + "train_speed(iter/s)": 0.123392 + }, + { + "acc": 0.75699282, + "epoch": 0.8060991517615593, + "grad_norm": 10.0, + "learning_rate": 6.9621973265823375e-06, + "loss": 0.94701519, + "memory(GiB)": 302.58, + "step": 144140, + "train_speed(iter/s)": 0.1234 + }, + { + "acc": 0.74011283, + "epoch": 0.8062110012345386, + "grad_norm": 5.90625, + "learning_rate": 6.961346769898137e-06, + "loss": 1.01886902, + "memory(GiB)": 302.58, + "step": 144160, + "train_speed(iter/s)": 0.123408 + }, + { + "acc": 0.75015006, + "epoch": 0.8063228507075179, + "grad_norm": 6.375, + "learning_rate": 6.960496146129659e-06, + "loss": 0.96540899, + "memory(GiB)": 302.58, + "step": 144180, + "train_speed(iter/s)": 0.123416 + }, + { + "acc": 0.74804039, + "epoch": 0.8064347001804971, + "grad_norm": 5.25, + "learning_rate": 6.9596454553059945e-06, + "loss": 1.00736914, + "memory(GiB)": 302.58, + "step": 144200, + "train_speed(iter/s)": 0.123423 + }, + { + "acc": 0.74465837, + "epoch": 0.8065465496534764, + "grad_norm": 4.65625, + "learning_rate": 6.9587946974562415e-06, + "loss": 1.0144495, + "memory(GiB)": 302.58, + "step": 144220, + "train_speed(iter/s)": 0.123431 + }, + { + "acc": 0.73235788, + "epoch": 0.8066583991264556, + "grad_norm": 6.40625, + "learning_rate": 6.9579438726095e-06, + "loss": 1.04973879, + "memory(GiB)": 302.58, + "step": 144240, + "train_speed(iter/s)": 0.123439 + }, + { + "acc": 0.73348618, + "epoch": 0.8067702485994349, + "grad_norm": 5.625, + "learning_rate": 6.9570929807948695e-06, + "loss": 1.07437143, + "memory(GiB)": 302.58, + "step": 144260, + "train_speed(iter/s)": 0.123447 + }, + { + "acc": 0.75314193, + "epoch": 0.8068820980724142, + "grad_norm": 9.1875, + "learning_rate": 6.956242022041453e-06, + "loss": 0.97330284, + "memory(GiB)": 302.58, + "step": 144280, + "train_speed(iter/s)": 0.123455 + }, + { + "acc": 0.73386993, + "epoch": 0.8069939475453934, + "grad_norm": 10.8125, + "learning_rate": 6.955390996378356e-06, + "loss": 1.06089706, + "memory(GiB)": 302.58, + "step": 144300, + "train_speed(iter/s)": 0.123462 + }, + { + "acc": 0.74351468, + "epoch": 0.8071057970183727, + "grad_norm": 5.71875, + "learning_rate": 6.954539903834686e-06, + "loss": 1.03010769, + "memory(GiB)": 302.58, + "step": 144320, + "train_speed(iter/s)": 0.12347 + }, + { + "acc": 0.7387857, + "epoch": 0.807217646491352, + "grad_norm": 5.375, + "learning_rate": 6.953688744439555e-06, + "loss": 1.03878937, + "memory(GiB)": 302.58, + "step": 144340, + "train_speed(iter/s)": 0.123478 + }, + { + "acc": 0.7159174, + "epoch": 0.8073294959643312, + "grad_norm": 9.9375, + "learning_rate": 6.952837518222073e-06, + "loss": 1.14276152, + "memory(GiB)": 302.58, + "step": 144360, + "train_speed(iter/s)": 0.123486 + }, + { + "acc": 0.74736247, + "epoch": 0.8074413454373105, + "grad_norm": 7.25, + "learning_rate": 6.951986225211356e-06, + "loss": 0.97701874, + "memory(GiB)": 302.58, + "step": 144380, + "train_speed(iter/s)": 0.123495 + }, + { + "acc": 0.75800772, + "epoch": 0.8075531949102898, + "grad_norm": 7.53125, + "learning_rate": 6.951134865436521e-06, + "loss": 0.95311747, + "memory(GiB)": 302.58, + "step": 144400, + "train_speed(iter/s)": 0.123503 + }, + { + "acc": 0.74344215, + "epoch": 0.807665044383269, + "grad_norm": 7.25, + "learning_rate": 6.950283438926686e-06, + "loss": 1.02089262, + "memory(GiB)": 302.58, + "step": 144420, + "train_speed(iter/s)": 0.123511 + }, + { + "acc": 0.74009242, + "epoch": 0.8077768938562483, + "grad_norm": 7.09375, + "learning_rate": 6.949431945710974e-06, + "loss": 1.02161303, + "memory(GiB)": 302.58, + "step": 144440, + "train_speed(iter/s)": 0.123519 + }, + { + "acc": 0.75293603, + "epoch": 0.8078887433292276, + "grad_norm": 9.625, + "learning_rate": 6.948580385818505e-06, + "loss": 0.95671911, + "memory(GiB)": 302.58, + "step": 144460, + "train_speed(iter/s)": 0.123528 + }, + { + "acc": 0.72851253, + "epoch": 0.8080005928022068, + "grad_norm": 5.09375, + "learning_rate": 6.9477287592784115e-06, + "loss": 1.05671949, + "memory(GiB)": 302.58, + "step": 144480, + "train_speed(iter/s)": 0.123536 + }, + { + "acc": 0.73658986, + "epoch": 0.8081124422751861, + "grad_norm": 10.125, + "learning_rate": 6.946877066119817e-06, + "loss": 1.0374403, + "memory(GiB)": 302.58, + "step": 144500, + "train_speed(iter/s)": 0.123544 + }, + { + "acc": 0.75131397, + "epoch": 0.8082242917481653, + "grad_norm": 5.9375, + "learning_rate": 6.946025306371853e-06, + "loss": 0.98586731, + "memory(GiB)": 302.58, + "step": 144520, + "train_speed(iter/s)": 0.123551 + }, + { + "acc": 0.73929458, + "epoch": 0.8083361412211446, + "grad_norm": 7.46875, + "learning_rate": 6.945173480063654e-06, + "loss": 1.02196894, + "memory(GiB)": 302.58, + "step": 144540, + "train_speed(iter/s)": 0.123559 + }, + { + "acc": 0.74583559, + "epoch": 0.8084479906941239, + "grad_norm": 6.75, + "learning_rate": 6.944321587224354e-06, + "loss": 0.99322433, + "memory(GiB)": 302.58, + "step": 144560, + "train_speed(iter/s)": 0.123567 + }, + { + "acc": 0.7337801, + "epoch": 0.8085598401671031, + "grad_norm": 6.25, + "learning_rate": 6.943469627883088e-06, + "loss": 1.08367062, + "memory(GiB)": 302.58, + "step": 144580, + "train_speed(iter/s)": 0.123576 + }, + { + "acc": 0.75033503, + "epoch": 0.8086716896400824, + "grad_norm": 14.8125, + "learning_rate": 6.942617602069001e-06, + "loss": 0.95971422, + "memory(GiB)": 302.58, + "step": 144600, + "train_speed(iter/s)": 0.123584 + }, + { + "acc": 0.74572783, + "epoch": 0.8087835391130617, + "grad_norm": 6.40625, + "learning_rate": 6.94176550981123e-06, + "loss": 1.00021639, + "memory(GiB)": 302.58, + "step": 144620, + "train_speed(iter/s)": 0.123592 + }, + { + "acc": 0.73250461, + "epoch": 0.8088953885860409, + "grad_norm": 8.6875, + "learning_rate": 6.9409133511389225e-06, + "loss": 1.0644105, + "memory(GiB)": 302.58, + "step": 144640, + "train_speed(iter/s)": 0.1236 + }, + { + "acc": 0.74864969, + "epoch": 0.8090072380590202, + "grad_norm": 6.4375, + "learning_rate": 6.940061126081224e-06, + "loss": 0.99208622, + "memory(GiB)": 302.58, + "step": 144660, + "train_speed(iter/s)": 0.123608 + }, + { + "acc": 0.74063373, + "epoch": 0.8091190875319995, + "grad_norm": 6.34375, + "learning_rate": 6.939208834667283e-06, + "loss": 1.03329153, + "memory(GiB)": 302.58, + "step": 144680, + "train_speed(iter/s)": 0.123616 + }, + { + "acc": 0.74348249, + "epoch": 0.8092309370049787, + "grad_norm": 6.21875, + "learning_rate": 6.938356476926252e-06, + "loss": 0.99779196, + "memory(GiB)": 302.58, + "step": 144700, + "train_speed(iter/s)": 0.123624 + }, + { + "acc": 0.72517967, + "epoch": 0.809342786477958, + "grad_norm": 6.0625, + "learning_rate": 6.937504052887283e-06, + "loss": 1.09382658, + "memory(GiB)": 302.58, + "step": 144720, + "train_speed(iter/s)": 0.123633 + }, + { + "acc": 0.74381027, + "epoch": 0.8094546359509373, + "grad_norm": 7.90625, + "learning_rate": 6.936651562579531e-06, + "loss": 1.02840824, + "memory(GiB)": 302.58, + "step": 144740, + "train_speed(iter/s)": 0.123641 + }, + { + "acc": 0.73922901, + "epoch": 0.8095664854239165, + "grad_norm": 5.5625, + "learning_rate": 6.935799006032153e-06, + "loss": 1.0297616, + "memory(GiB)": 302.58, + "step": 144760, + "train_speed(iter/s)": 0.123649 + }, + { + "acc": 0.73256841, + "epoch": 0.8096783348968958, + "grad_norm": 6.90625, + "learning_rate": 6.934946383274312e-06, + "loss": 1.049228, + "memory(GiB)": 302.58, + "step": 144780, + "train_speed(iter/s)": 0.123657 + }, + { + "acc": 0.72302852, + "epoch": 0.809790184369875, + "grad_norm": 8.875, + "learning_rate": 6.93409369433517e-06, + "loss": 1.09905663, + "memory(GiB)": 302.58, + "step": 144800, + "train_speed(iter/s)": 0.123666 + }, + { + "acc": 0.74520702, + "epoch": 0.8099020338428543, + "grad_norm": 6.21875, + "learning_rate": 6.933240939243891e-06, + "loss": 1.00722971, + "memory(GiB)": 302.58, + "step": 144820, + "train_speed(iter/s)": 0.123674 + }, + { + "acc": 0.73743706, + "epoch": 0.8100138833158336, + "grad_norm": 8.0, + "learning_rate": 6.9323881180296404e-06, + "loss": 1.03710623, + "memory(GiB)": 302.58, + "step": 144840, + "train_speed(iter/s)": 0.123682 + }, + { + "acc": 0.74277229, + "epoch": 0.8101257327888128, + "grad_norm": 8.6875, + "learning_rate": 6.931535230721589e-06, + "loss": 1.01328564, + "memory(GiB)": 302.58, + "step": 144860, + "train_speed(iter/s)": 0.12369 + }, + { + "acc": 0.74093552, + "epoch": 0.8102375822617921, + "grad_norm": 9.3125, + "learning_rate": 6.930682277348909e-06, + "loss": 1.01226521, + "memory(GiB)": 302.58, + "step": 144880, + "train_speed(iter/s)": 0.123699 + }, + { + "acc": 0.75379758, + "epoch": 0.8103494317347714, + "grad_norm": 10.25, + "learning_rate": 6.929829257940771e-06, + "loss": 0.95816278, + "memory(GiB)": 302.58, + "step": 144900, + "train_speed(iter/s)": 0.123707 + }, + { + "acc": 0.74392977, + "epoch": 0.8104612812077506, + "grad_norm": 7.4375, + "learning_rate": 6.928976172526355e-06, + "loss": 1.00463486, + "memory(GiB)": 302.58, + "step": 144920, + "train_speed(iter/s)": 0.123716 + }, + { + "acc": 0.73997507, + "epoch": 0.8105731306807299, + "grad_norm": 6.1875, + "learning_rate": 6.928123021134836e-06, + "loss": 1.01838837, + "memory(GiB)": 302.58, + "step": 144940, + "train_speed(iter/s)": 0.123724 + }, + { + "acc": 0.74961262, + "epoch": 0.8106849801537092, + "grad_norm": 9.25, + "learning_rate": 6.9272698037953965e-06, + "loss": 0.97159967, + "memory(GiB)": 302.58, + "step": 144960, + "train_speed(iter/s)": 0.123732 + }, + { + "acc": 0.73205109, + "epoch": 0.8107968296266884, + "grad_norm": 6.21875, + "learning_rate": 6.926416520537219e-06, + "loss": 1.06002579, + "memory(GiB)": 302.58, + "step": 144980, + "train_speed(iter/s)": 0.12374 + }, + { + "acc": 0.74503965, + "epoch": 0.8109086790996677, + "grad_norm": 7.9375, + "learning_rate": 6.9255631713894875e-06, + "loss": 1.02677946, + "memory(GiB)": 302.58, + "step": 145000, + "train_speed(iter/s)": 0.123748 + }, + { + "acc": 0.73692641, + "epoch": 0.811020528572647, + "grad_norm": 7.125, + "learning_rate": 6.9247097563813895e-06, + "loss": 1.0429369, + "memory(GiB)": 302.58, + "step": 145020, + "train_speed(iter/s)": 0.123756 + }, + { + "acc": 0.74806576, + "epoch": 0.8111323780456262, + "grad_norm": 7.125, + "learning_rate": 6.923856275542114e-06, + "loss": 0.98769169, + "memory(GiB)": 302.58, + "step": 145040, + "train_speed(iter/s)": 0.123763 + }, + { + "acc": 0.73621249, + "epoch": 0.8112442275186055, + "grad_norm": 11.375, + "learning_rate": 6.923002728900855e-06, + "loss": 1.04707985, + "memory(GiB)": 302.58, + "step": 145060, + "train_speed(iter/s)": 0.123771 + }, + { + "acc": 0.73425961, + "epoch": 0.8113560769915847, + "grad_norm": 5.9375, + "learning_rate": 6.922149116486805e-06, + "loss": 1.06230965, + "memory(GiB)": 302.58, + "step": 145080, + "train_speed(iter/s)": 0.123779 + }, + { + "acc": 0.74058018, + "epoch": 0.811467926464564, + "grad_norm": 7.5, + "learning_rate": 6.921295438329159e-06, + "loss": 1.02600813, + "memory(GiB)": 302.58, + "step": 145100, + "train_speed(iter/s)": 0.123788 + }, + { + "acc": 0.74532323, + "epoch": 0.8115797759375433, + "grad_norm": 9.3125, + "learning_rate": 6.920441694457119e-06, + "loss": 1.01115942, + "memory(GiB)": 302.58, + "step": 145120, + "train_speed(iter/s)": 0.123796 + }, + { + "acc": 0.72158637, + "epoch": 0.8116916254105225, + "grad_norm": 7.53125, + "learning_rate": 6.919587884899881e-06, + "loss": 1.12701845, + "memory(GiB)": 302.58, + "step": 145140, + "train_speed(iter/s)": 0.123804 + }, + { + "acc": 0.74136381, + "epoch": 0.8118034748835018, + "grad_norm": 7.03125, + "learning_rate": 6.918734009686651e-06, + "loss": 1.01672411, + "memory(GiB)": 302.58, + "step": 145160, + "train_speed(iter/s)": 0.123812 + }, + { + "acc": 0.74109602, + "epoch": 0.8119153243564811, + "grad_norm": 9.0, + "learning_rate": 6.917880068846634e-06, + "loss": 0.99544249, + "memory(GiB)": 302.58, + "step": 145180, + "train_speed(iter/s)": 0.12382 + }, + { + "acc": 0.73941717, + "epoch": 0.8120271738294603, + "grad_norm": 6.5625, + "learning_rate": 6.9170260624090355e-06, + "loss": 1.04110966, + "memory(GiB)": 302.58, + "step": 145200, + "train_speed(iter/s)": 0.123827 + }, + { + "acc": 0.74658279, + "epoch": 0.8121390233024396, + "grad_norm": 8.625, + "learning_rate": 6.9161719904030685e-06, + "loss": 0.9885644, + "memory(GiB)": 302.58, + "step": 145220, + "train_speed(iter/s)": 0.123836 + }, + { + "acc": 0.75034423, + "epoch": 0.8122508727754189, + "grad_norm": 5.78125, + "learning_rate": 6.915317852857945e-06, + "loss": 0.97032232, + "memory(GiB)": 302.58, + "step": 145240, + "train_speed(iter/s)": 0.123844 + }, + { + "acc": 0.75309491, + "epoch": 0.8123627222483981, + "grad_norm": 6.90625, + "learning_rate": 6.9144636498028765e-06, + "loss": 0.96973143, + "memory(GiB)": 302.58, + "step": 145260, + "train_speed(iter/s)": 0.123852 + }, + { + "acc": 0.74710965, + "epoch": 0.8124745717213774, + "grad_norm": 7.40625, + "learning_rate": 6.91360938126708e-06, + "loss": 0.99920244, + "memory(GiB)": 302.58, + "step": 145280, + "train_speed(iter/s)": 0.123859 + }, + { + "acc": 0.74274178, + "epoch": 0.8125864211943566, + "grad_norm": 9.3125, + "learning_rate": 6.912755047279775e-06, + "loss": 1.01169243, + "memory(GiB)": 302.58, + "step": 145300, + "train_speed(iter/s)": 0.123868 + }, + { + "acc": 0.74625726, + "epoch": 0.8126982706673359, + "grad_norm": 7.96875, + "learning_rate": 6.9119006478701825e-06, + "loss": 0.9971076, + "memory(GiB)": 302.58, + "step": 145320, + "train_speed(iter/s)": 0.123876 + }, + { + "acc": 0.73927531, + "epoch": 0.8128101201403152, + "grad_norm": 7.53125, + "learning_rate": 6.9110461830675245e-06, + "loss": 1.00559492, + "memory(GiB)": 302.58, + "step": 145340, + "train_speed(iter/s)": 0.123884 + }, + { + "acc": 0.73974004, + "epoch": 0.8129219696132944, + "grad_norm": 8.0625, + "learning_rate": 6.910191652901028e-06, + "loss": 1.02335882, + "memory(GiB)": 302.58, + "step": 145360, + "train_speed(iter/s)": 0.123892 + }, + { + "acc": 0.72690845, + "epoch": 0.8130338190862737, + "grad_norm": 8.1875, + "learning_rate": 6.909337057399921e-06, + "loss": 1.0719305, + "memory(GiB)": 302.58, + "step": 145380, + "train_speed(iter/s)": 0.1239 + }, + { + "acc": 0.74624696, + "epoch": 0.813145668559253, + "grad_norm": 6.5, + "learning_rate": 6.90848239659343e-06, + "loss": 1.00382538, + "memory(GiB)": 302.58, + "step": 145400, + "train_speed(iter/s)": 0.123909 + }, + { + "acc": 0.7303112, + "epoch": 0.8132575180322322, + "grad_norm": 5.25, + "learning_rate": 6.907627670510792e-06, + "loss": 1.07434158, + "memory(GiB)": 302.58, + "step": 145420, + "train_speed(iter/s)": 0.123917 + }, + { + "acc": 0.75159712, + "epoch": 0.8133693675052115, + "grad_norm": 5.78125, + "learning_rate": 6.906772879181236e-06, + "loss": 0.94100342, + "memory(GiB)": 302.58, + "step": 145440, + "train_speed(iter/s)": 0.123925 + }, + { + "acc": 0.74056492, + "epoch": 0.8134812169781908, + "grad_norm": 7.25, + "learning_rate": 6.905918022634003e-06, + "loss": 1.03265686, + "memory(GiB)": 302.58, + "step": 145460, + "train_speed(iter/s)": 0.123933 + }, + { + "acc": 0.74329047, + "epoch": 0.81359306645117, + "grad_norm": 12.625, + "learning_rate": 6.905063100898329e-06, + "loss": 1.00612154, + "memory(GiB)": 302.58, + "step": 145480, + "train_speed(iter/s)": 0.123941 + }, + { + "acc": 0.74547281, + "epoch": 0.8137049159241493, + "grad_norm": 5.28125, + "learning_rate": 6.904208114003458e-06, + "loss": 1.01347008, + "memory(GiB)": 302.58, + "step": 145500, + "train_speed(iter/s)": 0.123948 + }, + { + "acc": 0.74751377, + "epoch": 0.8138167653971285, + "grad_norm": 7.46875, + "learning_rate": 6.90335306197863e-06, + "loss": 0.98314857, + "memory(GiB)": 302.58, + "step": 145520, + "train_speed(iter/s)": 0.123956 + }, + { + "acc": 0.74028344, + "epoch": 0.8139286148701078, + "grad_norm": 6.875, + "learning_rate": 6.902497944853092e-06, + "loss": 1.03810434, + "memory(GiB)": 302.58, + "step": 145540, + "train_speed(iter/s)": 0.123964 + }, + { + "acc": 0.7350565, + "epoch": 0.8140404643430871, + "grad_norm": 8.1875, + "learning_rate": 6.901642762656092e-06, + "loss": 1.03526697, + "memory(GiB)": 302.58, + "step": 145560, + "train_speed(iter/s)": 0.123972 + }, + { + "acc": 0.75270257, + "epoch": 0.8141523138160663, + "grad_norm": 7.1875, + "learning_rate": 6.900787515416881e-06, + "loss": 0.95207195, + "memory(GiB)": 302.58, + "step": 145580, + "train_speed(iter/s)": 0.12398 + }, + { + "acc": 0.75484195, + "epoch": 0.8142641632890456, + "grad_norm": 7.46875, + "learning_rate": 6.899932203164706e-06, + "loss": 0.97626352, + "memory(GiB)": 302.58, + "step": 145600, + "train_speed(iter/s)": 0.123989 + }, + { + "acc": 0.73695498, + "epoch": 0.8143760127620249, + "grad_norm": 6.8125, + "learning_rate": 6.899076825928827e-06, + "loss": 1.03421135, + "memory(GiB)": 302.58, + "step": 145620, + "train_speed(iter/s)": 0.123996 + }, + { + "acc": 0.7481132, + "epoch": 0.8144878622350041, + "grad_norm": 7.875, + "learning_rate": 6.8982213837385005e-06, + "loss": 0.99518967, + "memory(GiB)": 302.58, + "step": 145640, + "train_speed(iter/s)": 0.124004 + }, + { + "acc": 0.7528019, + "epoch": 0.8145997117079834, + "grad_norm": 7.375, + "learning_rate": 6.897365876622981e-06, + "loss": 0.95632801, + "memory(GiB)": 302.58, + "step": 145660, + "train_speed(iter/s)": 0.124012 + }, + { + "acc": 0.74991555, + "epoch": 0.8147115611809627, + "grad_norm": 7.125, + "learning_rate": 6.896510304611534e-06, + "loss": 0.97101669, + "memory(GiB)": 302.58, + "step": 145680, + "train_speed(iter/s)": 0.12402 + }, + { + "acc": 0.75071459, + "epoch": 0.8148234106539419, + "grad_norm": 8.6875, + "learning_rate": 6.89565466773342e-06, + "loss": 0.98979521, + "memory(GiB)": 302.58, + "step": 145700, + "train_speed(iter/s)": 0.124027 + }, + { + "acc": 0.72163486, + "epoch": 0.8149352601269212, + "grad_norm": 7.0, + "learning_rate": 6.894798966017906e-06, + "loss": 1.09317904, + "memory(GiB)": 302.58, + "step": 145720, + "train_speed(iter/s)": 0.124035 + }, + { + "acc": 0.74921288, + "epoch": 0.8150471095999005, + "grad_norm": 8.75, + "learning_rate": 6.893943199494259e-06, + "loss": 0.99604073, + "memory(GiB)": 302.58, + "step": 145740, + "train_speed(iter/s)": 0.124043 + }, + { + "acc": 0.73865318, + "epoch": 0.8151589590728797, + "grad_norm": 7.625, + "learning_rate": 6.893087368191748e-06, + "loss": 1.03399868, + "memory(GiB)": 302.58, + "step": 145760, + "train_speed(iter/s)": 0.12405 + }, + { + "acc": 0.75539751, + "epoch": 0.815270808545859, + "grad_norm": 9.3125, + "learning_rate": 6.8922314721396474e-06, + "loss": 0.95911121, + "memory(GiB)": 302.58, + "step": 145780, + "train_speed(iter/s)": 0.124058 + }, + { + "acc": 0.7239471, + "epoch": 0.8153826580188382, + "grad_norm": 6.71875, + "learning_rate": 6.89137551136723e-06, + "loss": 1.08416071, + "memory(GiB)": 302.58, + "step": 145800, + "train_speed(iter/s)": 0.124066 + }, + { + "acc": 0.75064073, + "epoch": 0.8154945074918175, + "grad_norm": 8.125, + "learning_rate": 6.890519485903771e-06, + "loss": 0.98508291, + "memory(GiB)": 302.58, + "step": 145820, + "train_speed(iter/s)": 0.124074 + }, + { + "acc": 0.73367233, + "epoch": 0.8156063569647968, + "grad_norm": 4.5, + "learning_rate": 6.889663395778553e-06, + "loss": 1.03376875, + "memory(GiB)": 302.58, + "step": 145840, + "train_speed(iter/s)": 0.124082 + }, + { + "acc": 0.73954387, + "epoch": 0.815718206437776, + "grad_norm": 5.5, + "learning_rate": 6.888807241020855e-06, + "loss": 1.03902082, + "memory(GiB)": 302.58, + "step": 145860, + "train_speed(iter/s)": 0.12409 + }, + { + "acc": 0.74708877, + "epoch": 0.8158300559107553, + "grad_norm": 8.5625, + "learning_rate": 6.88795102165996e-06, + "loss": 0.9766777, + "memory(GiB)": 302.58, + "step": 145880, + "train_speed(iter/s)": 0.124097 + }, + { + "acc": 0.74577894, + "epoch": 0.8159419053837346, + "grad_norm": 9.0625, + "learning_rate": 6.887094737725152e-06, + "loss": 1.01146631, + "memory(GiB)": 302.58, + "step": 145900, + "train_speed(iter/s)": 0.124106 + }, + { + "acc": 0.75476084, + "epoch": 0.8160537548567138, + "grad_norm": 5.90625, + "learning_rate": 6.8862383892457205e-06, + "loss": 0.95257635, + "memory(GiB)": 302.58, + "step": 145920, + "train_speed(iter/s)": 0.124113 + }, + { + "acc": 0.74036975, + "epoch": 0.8161656043296931, + "grad_norm": 7.0, + "learning_rate": 6.885381976250956e-06, + "loss": 0.99997864, + "memory(GiB)": 302.58, + "step": 145940, + "train_speed(iter/s)": 0.124122 + }, + { + "acc": 0.72771831, + "epoch": 0.8162774538026724, + "grad_norm": 5.8125, + "learning_rate": 6.884525498770149e-06, + "loss": 1.07282257, + "memory(GiB)": 302.58, + "step": 145960, + "train_speed(iter/s)": 0.124129 + }, + { + "acc": 0.72170534, + "epoch": 0.8163893032756516, + "grad_norm": 6.75, + "learning_rate": 6.883668956832596e-06, + "loss": 1.1129734, + "memory(GiB)": 302.58, + "step": 145980, + "train_speed(iter/s)": 0.124136 + }, + { + "acc": 0.72017574, + "epoch": 0.8165011527486309, + "grad_norm": 8.1875, + "learning_rate": 6.882812350467589e-06, + "loss": 1.11696987, + "memory(GiB)": 302.58, + "step": 146000, + "train_speed(iter/s)": 0.124144 + }, + { + "epoch": 0.8165011527486309, + "eval_acc": 0.7052510147989939, + "eval_loss": 1.018133282661438, + "eval_runtime": 7571.1648, + "eval_samples_per_second": 9.943, + "eval_steps_per_second": 9.943, + "step": 146000 + }, + { + "acc": 0.74335413, + "epoch": 0.8166130022216102, + "grad_norm": 8.0625, + "learning_rate": 6.881955679704432e-06, + "loss": 1.02411375, + "memory(GiB)": 302.58, + "step": 146020, + "train_speed(iter/s)": 0.123344 + }, + { + "acc": 0.72648077, + "epoch": 0.8167248516945894, + "grad_norm": 5.6875, + "learning_rate": 6.881098944572421e-06, + "loss": 1.08002377, + "memory(GiB)": 302.58, + "step": 146040, + "train_speed(iter/s)": 0.123352 + }, + { + "acc": 0.74413428, + "epoch": 0.8168367011675687, + "grad_norm": 6.0, + "learning_rate": 6.880242145100862e-06, + "loss": 1.00287857, + "memory(GiB)": 302.58, + "step": 146060, + "train_speed(iter/s)": 0.12336 + }, + { + "acc": 0.74322228, + "epoch": 0.8169485506405479, + "grad_norm": 9.0625, + "learning_rate": 6.8793852813190595e-06, + "loss": 1.02811766, + "memory(GiB)": 302.58, + "step": 146080, + "train_speed(iter/s)": 0.123368 + }, + { + "acc": 0.7556983, + "epoch": 0.8170604001135272, + "grad_norm": 5.6875, + "learning_rate": 6.878528353256322e-06, + "loss": 0.92192059, + "memory(GiB)": 302.58, + "step": 146100, + "train_speed(iter/s)": 0.123376 + }, + { + "acc": 0.72319522, + "epoch": 0.8171722495865065, + "grad_norm": 8.6875, + "learning_rate": 6.877671360941958e-06, + "loss": 1.1024086, + "memory(GiB)": 302.58, + "step": 146120, + "train_speed(iter/s)": 0.123383 + }, + { + "acc": 0.75385203, + "epoch": 0.8172840990594857, + "grad_norm": 7.9375, + "learning_rate": 6.876814304405278e-06, + "loss": 0.96415787, + "memory(GiB)": 302.58, + "step": 146140, + "train_speed(iter/s)": 0.123391 + }, + { + "acc": 0.74551826, + "epoch": 0.817395948532465, + "grad_norm": 6.34375, + "learning_rate": 6.875957183675599e-06, + "loss": 0.98388128, + "memory(GiB)": 302.58, + "step": 146160, + "train_speed(iter/s)": 0.1234 + }, + { + "acc": 0.72388573, + "epoch": 0.8175077980054443, + "grad_norm": 7.15625, + "learning_rate": 6.875099998782234e-06, + "loss": 1.11691628, + "memory(GiB)": 302.58, + "step": 146180, + "train_speed(iter/s)": 0.123408 + }, + { + "acc": 0.73727965, + "epoch": 0.8176196474784235, + "grad_norm": 8.5, + "learning_rate": 6.874242749754505e-06, + "loss": 1.06606712, + "memory(GiB)": 302.58, + "step": 146200, + "train_speed(iter/s)": 0.123416 + }, + { + "acc": 0.74405112, + "epoch": 0.8177314969514028, + "grad_norm": 7.25, + "learning_rate": 6.87338543662173e-06, + "loss": 1.00808105, + "memory(GiB)": 302.58, + "step": 146220, + "train_speed(iter/s)": 0.123424 + }, + { + "acc": 0.74475093, + "epoch": 0.8178433464243821, + "grad_norm": 6.125, + "learning_rate": 6.872528059413234e-06, + "loss": 1.00365944, + "memory(GiB)": 302.58, + "step": 146240, + "train_speed(iter/s)": 0.123432 + }, + { + "acc": 0.75050807, + "epoch": 0.8179551958973613, + "grad_norm": 7.375, + "learning_rate": 6.871670618158339e-06, + "loss": 0.99824705, + "memory(GiB)": 302.58, + "step": 146260, + "train_speed(iter/s)": 0.12344 + }, + { + "acc": 0.7439702, + "epoch": 0.8180670453703406, + "grad_norm": 5.875, + "learning_rate": 6.870813112886376e-06, + "loss": 0.99712753, + "memory(GiB)": 302.58, + "step": 146280, + "train_speed(iter/s)": 0.123449 + }, + { + "acc": 0.75524755, + "epoch": 0.8181788948433198, + "grad_norm": 7.84375, + "learning_rate": 6.869955543626672e-06, + "loss": 0.9342226, + "memory(GiB)": 302.58, + "step": 146300, + "train_speed(iter/s)": 0.123457 + }, + { + "acc": 0.72858272, + "epoch": 0.8182907443162991, + "grad_norm": 8.125, + "learning_rate": 6.8690979104085564e-06, + "loss": 1.08037977, + "memory(GiB)": 302.58, + "step": 146320, + "train_speed(iter/s)": 0.123465 + }, + { + "acc": 0.73207793, + "epoch": 0.8184025937892784, + "grad_norm": 5.625, + "learning_rate": 6.868240213261368e-06, + "loss": 1.06936321, + "memory(GiB)": 302.58, + "step": 146340, + "train_speed(iter/s)": 0.123473 + }, + { + "acc": 0.73818383, + "epoch": 0.8185144432622576, + "grad_norm": 5.875, + "learning_rate": 6.867382452214439e-06, + "loss": 1.03976555, + "memory(GiB)": 302.58, + "step": 146360, + "train_speed(iter/s)": 0.123482 + }, + { + "acc": 0.73200436, + "epoch": 0.8186262927352369, + "grad_norm": 5.90625, + "learning_rate": 6.866524627297109e-06, + "loss": 1.03570652, + "memory(GiB)": 302.58, + "step": 146380, + "train_speed(iter/s)": 0.12349 + }, + { + "acc": 0.74212894, + "epoch": 0.8187381422082162, + "grad_norm": 14.125, + "learning_rate": 6.8656667385387185e-06, + "loss": 1.01240597, + "memory(GiB)": 302.58, + "step": 146400, + "train_speed(iter/s)": 0.123498 + }, + { + "acc": 0.74009237, + "epoch": 0.8188499916811954, + "grad_norm": 7.4375, + "learning_rate": 6.864808785968611e-06, + "loss": 1.05859413, + "memory(GiB)": 302.58, + "step": 146420, + "train_speed(iter/s)": 0.123506 + }, + { + "acc": 0.74497352, + "epoch": 0.8189618411541747, + "grad_norm": 5.46875, + "learning_rate": 6.863950769616128e-06, + "loss": 0.99948769, + "memory(GiB)": 302.58, + "step": 146440, + "train_speed(iter/s)": 0.123515 + }, + { + "acc": 0.7472187, + "epoch": 0.819073690627154, + "grad_norm": 5.65625, + "learning_rate": 6.863092689510619e-06, + "loss": 0.99952745, + "memory(GiB)": 302.58, + "step": 146460, + "train_speed(iter/s)": 0.123523 + }, + { + "acc": 0.75791898, + "epoch": 0.8191855401001332, + "grad_norm": 8.125, + "learning_rate": 6.862234545681431e-06, + "loss": 0.96694822, + "memory(GiB)": 302.58, + "step": 146480, + "train_speed(iter/s)": 0.123531 + }, + { + "acc": 0.73486052, + "epoch": 0.8192973895731125, + "grad_norm": 7.28125, + "learning_rate": 6.861376338157917e-06, + "loss": 1.04582005, + "memory(GiB)": 302.58, + "step": 146500, + "train_speed(iter/s)": 0.123539 + }, + { + "acc": 0.75323954, + "epoch": 0.8194092390460918, + "grad_norm": 10.125, + "learning_rate": 6.860518066969432e-06, + "loss": 0.94314404, + "memory(GiB)": 302.58, + "step": 146520, + "train_speed(iter/s)": 0.123547 + }, + { + "acc": 0.75234098, + "epoch": 0.819521088519071, + "grad_norm": 5.625, + "learning_rate": 6.859659732145327e-06, + "loss": 0.9659934, + "memory(GiB)": 302.58, + "step": 146540, + "train_speed(iter/s)": 0.123555 + }, + { + "acc": 0.72987328, + "epoch": 0.8196329379920503, + "grad_norm": 6.34375, + "learning_rate": 6.8588013337149635e-06, + "loss": 1.06698952, + "memory(GiB)": 302.58, + "step": 146560, + "train_speed(iter/s)": 0.123563 + }, + { + "acc": 0.73992434, + "epoch": 0.8197447874650295, + "grad_norm": 9.25, + "learning_rate": 6.857942871707701e-06, + "loss": 1.03115959, + "memory(GiB)": 302.58, + "step": 146580, + "train_speed(iter/s)": 0.123571 + }, + { + "acc": 0.72358088, + "epoch": 0.8198566369380088, + "grad_norm": 8.5625, + "learning_rate": 6.8570843461529e-06, + "loss": 1.09560213, + "memory(GiB)": 302.58, + "step": 146600, + "train_speed(iter/s)": 0.123578 + }, + { + "acc": 0.76561537, + "epoch": 0.8199684864109881, + "grad_norm": 9.5, + "learning_rate": 6.856225757079926e-06, + "loss": 0.90353527, + "memory(GiB)": 302.58, + "step": 146620, + "train_speed(iter/s)": 0.123587 + }, + { + "acc": 0.73517003, + "epoch": 0.8200803358839673, + "grad_norm": 8.75, + "learning_rate": 6.855367104518145e-06, + "loss": 1.0295435, + "memory(GiB)": 302.58, + "step": 146640, + "train_speed(iter/s)": 0.123595 + }, + { + "acc": 0.74500647, + "epoch": 0.8201921853569466, + "grad_norm": 5.03125, + "learning_rate": 6.854508388496927e-06, + "loss": 0.98336725, + "memory(GiB)": 302.58, + "step": 146660, + "train_speed(iter/s)": 0.123602 + }, + { + "acc": 0.73954635, + "epoch": 0.8203040348299259, + "grad_norm": 6.96875, + "learning_rate": 6.8536496090456404e-06, + "loss": 1.01288624, + "memory(GiB)": 302.58, + "step": 146680, + "train_speed(iter/s)": 0.123611 + }, + { + "acc": 0.75079913, + "epoch": 0.8204158843029051, + "grad_norm": 11.0, + "learning_rate": 6.85279076619366e-06, + "loss": 0.9781004, + "memory(GiB)": 302.58, + "step": 146700, + "train_speed(iter/s)": 0.123619 + }, + { + "acc": 0.73744841, + "epoch": 0.8205277337758844, + "grad_norm": 7.0, + "learning_rate": 6.85193185997036e-06, + "loss": 1.00518513, + "memory(GiB)": 302.58, + "step": 146720, + "train_speed(iter/s)": 0.123627 + }, + { + "acc": 0.75698528, + "epoch": 0.8206395832488637, + "grad_norm": 9.0, + "learning_rate": 6.851072890405117e-06, + "loss": 0.94403391, + "memory(GiB)": 302.58, + "step": 146740, + "train_speed(iter/s)": 0.123635 + }, + { + "acc": 0.75352554, + "epoch": 0.8207514327218429, + "grad_norm": 8.6875, + "learning_rate": 6.8502138575273126e-06, + "loss": 0.95409775, + "memory(GiB)": 302.58, + "step": 146760, + "train_speed(iter/s)": 0.123643 + }, + { + "acc": 0.74810476, + "epoch": 0.8208632821948222, + "grad_norm": 9.5, + "learning_rate": 6.849354761366327e-06, + "loss": 0.9967082, + "memory(GiB)": 302.58, + "step": 146780, + "train_speed(iter/s)": 0.12365 + }, + { + "acc": 0.75307536, + "epoch": 0.8209751316678014, + "grad_norm": 6.21875, + "learning_rate": 6.848495601951546e-06, + "loss": 0.96095886, + "memory(GiB)": 302.58, + "step": 146800, + "train_speed(iter/s)": 0.123658 + }, + { + "acc": 0.74589677, + "epoch": 0.8210869811407807, + "grad_norm": 6.65625, + "learning_rate": 6.847636379312352e-06, + "loss": 0.99480324, + "memory(GiB)": 302.58, + "step": 146820, + "train_speed(iter/s)": 0.123666 + }, + { + "acc": 0.735991, + "epoch": 0.82119883061376, + "grad_norm": 7.15625, + "learning_rate": 6.846777093478137e-06, + "loss": 1.04085464, + "memory(GiB)": 302.58, + "step": 146840, + "train_speed(iter/s)": 0.123674 + }, + { + "acc": 0.74534378, + "epoch": 0.8213106800867392, + "grad_norm": 7.5625, + "learning_rate": 6.845917744478287e-06, + "loss": 0.99199772, + "memory(GiB)": 302.58, + "step": 146860, + "train_speed(iter/s)": 0.123682 + }, + { + "acc": 0.73409319, + "epoch": 0.8214225295597185, + "grad_norm": 5.4375, + "learning_rate": 6.8450583323421996e-06, + "loss": 1.05899677, + "memory(GiB)": 302.58, + "step": 146880, + "train_speed(iter/s)": 0.123689 + }, + { + "acc": 0.74867802, + "epoch": 0.8215343790326978, + "grad_norm": 6.75, + "learning_rate": 6.844198857099265e-06, + "loss": 0.97277918, + "memory(GiB)": 302.58, + "step": 146900, + "train_speed(iter/s)": 0.123697 + }, + { + "acc": 0.74123616, + "epoch": 0.821646228505677, + "grad_norm": 6.59375, + "learning_rate": 6.843339318778881e-06, + "loss": 1.02568893, + "memory(GiB)": 302.58, + "step": 146920, + "train_speed(iter/s)": 0.123705 + }, + { + "acc": 0.7562633, + "epoch": 0.8217580779786563, + "grad_norm": 9.4375, + "learning_rate": 6.842479717410449e-06, + "loss": 0.93314123, + "memory(GiB)": 302.58, + "step": 146940, + "train_speed(iter/s)": 0.123713 + }, + { + "acc": 0.74127283, + "epoch": 0.8218699274516356, + "grad_norm": 8.6875, + "learning_rate": 6.841620053023368e-06, + "loss": 1.01181154, + "memory(GiB)": 302.58, + "step": 146960, + "train_speed(iter/s)": 0.123721 + }, + { + "acc": 0.7372828, + "epoch": 0.8219817769246148, + "grad_norm": 8.0, + "learning_rate": 6.84076032564704e-06, + "loss": 1.04339552, + "memory(GiB)": 302.58, + "step": 146980, + "train_speed(iter/s)": 0.123729 + }, + { + "acc": 0.73656559, + "epoch": 0.8220936263975941, + "grad_norm": 8.5625, + "learning_rate": 6.839900535310874e-06, + "loss": 1.03662958, + "memory(GiB)": 302.58, + "step": 147000, + "train_speed(iter/s)": 0.123737 + }, + { + "acc": 0.72728329, + "epoch": 0.8222054758705734, + "grad_norm": 7.0625, + "learning_rate": 6.839040682044274e-06, + "loss": 1.0673872, + "memory(GiB)": 302.58, + "step": 147020, + "train_speed(iter/s)": 0.123745 + }, + { + "acc": 0.74443235, + "epoch": 0.8223173253435526, + "grad_norm": 8.3125, + "learning_rate": 6.838180765876652e-06, + "loss": 1.00730038, + "memory(GiB)": 302.58, + "step": 147040, + "train_speed(iter/s)": 0.123752 + }, + { + "acc": 0.72947874, + "epoch": 0.8224291748165319, + "grad_norm": 7.25, + "learning_rate": 6.8373207868374195e-06, + "loss": 1.07024469, + "memory(GiB)": 302.58, + "step": 147060, + "train_speed(iter/s)": 0.12376 + }, + { + "acc": 0.74873714, + "epoch": 0.8225410242895111, + "grad_norm": 8.0, + "learning_rate": 6.836460744955988e-06, + "loss": 0.99076195, + "memory(GiB)": 302.58, + "step": 147080, + "train_speed(iter/s)": 0.123768 + }, + { + "acc": 0.729075, + "epoch": 0.8226528737624904, + "grad_norm": 10.8125, + "learning_rate": 6.835600640261777e-06, + "loss": 1.08464851, + "memory(GiB)": 302.58, + "step": 147100, + "train_speed(iter/s)": 0.123776 + }, + { + "acc": 0.73538003, + "epoch": 0.8227647232354697, + "grad_norm": 8.625, + "learning_rate": 6.8347404727842045e-06, + "loss": 1.05804443, + "memory(GiB)": 302.58, + "step": 147120, + "train_speed(iter/s)": 0.123782 + }, + { + "acc": 0.72568789, + "epoch": 0.8228765727084489, + "grad_norm": 8.75, + "learning_rate": 6.83388024255269e-06, + "loss": 1.07398672, + "memory(GiB)": 302.58, + "step": 147140, + "train_speed(iter/s)": 0.12379 + }, + { + "acc": 0.75559916, + "epoch": 0.8229884221814282, + "grad_norm": 9.0625, + "learning_rate": 6.833019949596655e-06, + "loss": 0.96221952, + "memory(GiB)": 302.58, + "step": 147160, + "train_speed(iter/s)": 0.123798 + }, + { + "acc": 0.73065434, + "epoch": 0.8231002716544075, + "grad_norm": 5.8125, + "learning_rate": 6.832159593945527e-06, + "loss": 1.05126429, + "memory(GiB)": 302.58, + "step": 147180, + "train_speed(iter/s)": 0.123806 + }, + { + "acc": 0.75321279, + "epoch": 0.8232121211273867, + "grad_norm": 9.625, + "learning_rate": 6.831299175628729e-06, + "loss": 0.96979103, + "memory(GiB)": 302.58, + "step": 147200, + "train_speed(iter/s)": 0.123814 + }, + { + "acc": 0.74100556, + "epoch": 0.823323970600366, + "grad_norm": 5.9375, + "learning_rate": 6.830438694675694e-06, + "loss": 0.99186802, + "memory(GiB)": 302.58, + "step": 147220, + "train_speed(iter/s)": 0.123822 + }, + { + "acc": 0.74621086, + "epoch": 0.8234358200733453, + "grad_norm": 7.84375, + "learning_rate": 6.829578151115851e-06, + "loss": 1.01196108, + "memory(GiB)": 302.58, + "step": 147240, + "train_speed(iter/s)": 0.12383 + }, + { + "acc": 0.74098291, + "epoch": 0.8235476695463245, + "grad_norm": 8.0, + "learning_rate": 6.8287175449786335e-06, + "loss": 1.00692091, + "memory(GiB)": 302.58, + "step": 147260, + "train_speed(iter/s)": 0.123838 + }, + { + "acc": 0.74143591, + "epoch": 0.8236595190193038, + "grad_norm": 6.84375, + "learning_rate": 6.8278568762934784e-06, + "loss": 1.00984192, + "memory(GiB)": 302.58, + "step": 147280, + "train_speed(iter/s)": 0.123846 + }, + { + "acc": 0.74138699, + "epoch": 0.823771368492283, + "grad_norm": 6.5, + "learning_rate": 6.826996145089822e-06, + "loss": 1.00067368, + "memory(GiB)": 302.58, + "step": 147300, + "train_speed(iter/s)": 0.123854 + }, + { + "acc": 0.74041486, + "epoch": 0.8238832179652623, + "grad_norm": 7.4375, + "learning_rate": 6.826135351397103e-06, + "loss": 1.01665936, + "memory(GiB)": 302.58, + "step": 147320, + "train_speed(iter/s)": 0.123862 + }, + { + "acc": 0.73210287, + "epoch": 0.8239950674382416, + "grad_norm": 7.125, + "learning_rate": 6.825274495244764e-06, + "loss": 1.07846241, + "memory(GiB)": 302.58, + "step": 147340, + "train_speed(iter/s)": 0.12387 + }, + { + "acc": 0.73195229, + "epoch": 0.8241069169112208, + "grad_norm": 8.1875, + "learning_rate": 6.82441357666225e-06, + "loss": 1.05909967, + "memory(GiB)": 302.58, + "step": 147360, + "train_speed(iter/s)": 0.123877 + }, + { + "acc": 0.75470576, + "epoch": 0.8242187663842001, + "grad_norm": 7.75, + "learning_rate": 6.823552595679007e-06, + "loss": 0.94788504, + "memory(GiB)": 302.58, + "step": 147380, + "train_speed(iter/s)": 0.123886 + }, + { + "acc": 0.72628136, + "epoch": 0.8243306158571794, + "grad_norm": 6.9375, + "learning_rate": 6.822691552324483e-06, + "loss": 1.06947069, + "memory(GiB)": 302.58, + "step": 147400, + "train_speed(iter/s)": 0.123893 + }, + { + "acc": 0.73917809, + "epoch": 0.8244424653301586, + "grad_norm": 9.75, + "learning_rate": 6.821830446628129e-06, + "loss": 1.03863201, + "memory(GiB)": 302.58, + "step": 147420, + "train_speed(iter/s)": 0.123901 + }, + { + "acc": 0.73566875, + "epoch": 0.8245543148031379, + "grad_norm": 5.5625, + "learning_rate": 6.8209692786193955e-06, + "loss": 1.0263648, + "memory(GiB)": 302.58, + "step": 147440, + "train_speed(iter/s)": 0.123909 + }, + { + "acc": 0.74483247, + "epoch": 0.8246661642761172, + "grad_norm": 10.4375, + "learning_rate": 6.82010804832774e-06, + "loss": 1.01766644, + "memory(GiB)": 302.58, + "step": 147460, + "train_speed(iter/s)": 0.123917 + }, + { + "acc": 0.73036141, + "epoch": 0.8247780137490964, + "grad_norm": 7.40625, + "learning_rate": 6.819246755782616e-06, + "loss": 1.07836008, + "memory(GiB)": 302.58, + "step": 147480, + "train_speed(iter/s)": 0.123925 + }, + { + "acc": 0.73125944, + "epoch": 0.8248898632220757, + "grad_norm": 7.90625, + "learning_rate": 6.818385401013485e-06, + "loss": 1.06964016, + "memory(GiB)": 302.58, + "step": 147500, + "train_speed(iter/s)": 0.123933 + }, + { + "acc": 0.73706136, + "epoch": 0.825001712695055, + "grad_norm": 8.9375, + "learning_rate": 6.817523984049809e-06, + "loss": 1.04064112, + "memory(GiB)": 302.58, + "step": 147520, + "train_speed(iter/s)": 0.12394 + }, + { + "acc": 0.73045015, + "epoch": 0.8251135621680342, + "grad_norm": 11.5, + "learning_rate": 6.816662504921047e-06, + "loss": 1.08050079, + "memory(GiB)": 302.58, + "step": 147540, + "train_speed(iter/s)": 0.123949 + }, + { + "acc": 0.75696759, + "epoch": 0.8252254116410135, + "grad_norm": 6.75, + "learning_rate": 6.815800963656669e-06, + "loss": 0.96363354, + "memory(GiB)": 302.58, + "step": 147560, + "train_speed(iter/s)": 0.123956 + }, + { + "acc": 0.73284531, + "epoch": 0.8253372611139927, + "grad_norm": 7.875, + "learning_rate": 6.814939360286139e-06, + "loss": 1.06916485, + "memory(GiB)": 302.58, + "step": 147580, + "train_speed(iter/s)": 0.123964 + }, + { + "acc": 0.74022961, + "epoch": 0.825449110586972, + "grad_norm": 7.65625, + "learning_rate": 6.814077694838927e-06, + "loss": 1.0273159, + "memory(GiB)": 302.58, + "step": 147600, + "train_speed(iter/s)": 0.123972 + }, + { + "acc": 0.7331491, + "epoch": 0.8255609600599513, + "grad_norm": 8.625, + "learning_rate": 6.813215967344507e-06, + "loss": 1.05966778, + "memory(GiB)": 302.58, + "step": 147620, + "train_speed(iter/s)": 0.12398 + }, + { + "acc": 0.74244056, + "epoch": 0.8256728095329305, + "grad_norm": 5.25, + "learning_rate": 6.8123541778323525e-06, + "loss": 1.00547085, + "memory(GiB)": 302.58, + "step": 147640, + "train_speed(iter/s)": 0.123988 + }, + { + "acc": 0.75958991, + "epoch": 0.8257846590059098, + "grad_norm": 7.34375, + "learning_rate": 6.811492326331936e-06, + "loss": 0.94426622, + "memory(GiB)": 302.58, + "step": 147660, + "train_speed(iter/s)": 0.123996 + }, + { + "acc": 0.71904502, + "epoch": 0.8258965084788891, + "grad_norm": 6.90625, + "learning_rate": 6.810630412872738e-06, + "loss": 1.12543325, + "memory(GiB)": 302.58, + "step": 147680, + "train_speed(iter/s)": 0.124004 + }, + { + "acc": 0.74395533, + "epoch": 0.8260083579518683, + "grad_norm": 9.0625, + "learning_rate": 6.809768437484239e-06, + "loss": 0.99875507, + "memory(GiB)": 302.58, + "step": 147700, + "train_speed(iter/s)": 0.124011 + }, + { + "acc": 0.74335175, + "epoch": 0.8261202074248476, + "grad_norm": 9.1875, + "learning_rate": 6.808906400195922e-06, + "loss": 1.00869303, + "memory(GiB)": 302.58, + "step": 147720, + "train_speed(iter/s)": 0.124019 + }, + { + "acc": 0.7332459, + "epoch": 0.8262320568978269, + "grad_norm": 7.0625, + "learning_rate": 6.808044301037269e-06, + "loss": 1.0747695, + "memory(GiB)": 302.58, + "step": 147740, + "train_speed(iter/s)": 0.124027 + }, + { + "acc": 0.74933791, + "epoch": 0.8263439063708061, + "grad_norm": 5.65625, + "learning_rate": 6.807182140037768e-06, + "loss": 0.97401667, + "memory(GiB)": 302.58, + "step": 147760, + "train_speed(iter/s)": 0.124035 + }, + { + "acc": 0.72704463, + "epoch": 0.8264557558437854, + "grad_norm": 7.875, + "learning_rate": 6.806319917226907e-06, + "loss": 1.08216562, + "memory(GiB)": 302.58, + "step": 147780, + "train_speed(iter/s)": 0.124042 + }, + { + "acc": 0.74896941, + "epoch": 0.8265676053167647, + "grad_norm": 5.375, + "learning_rate": 6.805457632634178e-06, + "loss": 0.98745852, + "memory(GiB)": 302.58, + "step": 147800, + "train_speed(iter/s)": 0.12405 + }, + { + "acc": 0.73606033, + "epoch": 0.8266794547897439, + "grad_norm": 3.90625, + "learning_rate": 6.804595286289072e-06, + "loss": 1.0416667, + "memory(GiB)": 302.58, + "step": 147820, + "train_speed(iter/s)": 0.124058 + }, + { + "acc": 0.75377574, + "epoch": 0.8267913042627232, + "grad_norm": 7.9375, + "learning_rate": 6.8037328782210856e-06, + "loss": 0.96707525, + "memory(GiB)": 302.58, + "step": 147840, + "train_speed(iter/s)": 0.124065 + }, + { + "acc": 0.74963369, + "epoch": 0.8269031537357024, + "grad_norm": 6.5, + "learning_rate": 6.802870408459716e-06, + "loss": 0.9782465, + "memory(GiB)": 302.58, + "step": 147860, + "train_speed(iter/s)": 0.124073 + }, + { + "acc": 0.74414186, + "epoch": 0.8270150032086817, + "grad_norm": 8.4375, + "learning_rate": 6.8020078770344615e-06, + "loss": 1.02342749, + "memory(GiB)": 302.58, + "step": 147880, + "train_speed(iter/s)": 0.124081 + }, + { + "acc": 0.72255435, + "epoch": 0.827126852681661, + "grad_norm": 6.25, + "learning_rate": 6.8011452839748215e-06, + "loss": 1.08962679, + "memory(GiB)": 302.58, + "step": 147900, + "train_speed(iter/s)": 0.124089 + }, + { + "acc": 0.74810772, + "epoch": 0.8272387021546402, + "grad_norm": 9.5625, + "learning_rate": 6.800282629310303e-06, + "loss": 0.98254147, + "memory(GiB)": 302.58, + "step": 147920, + "train_speed(iter/s)": 0.124098 + }, + { + "acc": 0.75361447, + "epoch": 0.8273505516276195, + "grad_norm": 6.71875, + "learning_rate": 6.799419913070411e-06, + "loss": 0.94580364, + "memory(GiB)": 302.58, + "step": 147940, + "train_speed(iter/s)": 0.124105 + }, + { + "acc": 0.7626318, + "epoch": 0.8274624011005988, + "grad_norm": 7.03125, + "learning_rate": 6.798557135284651e-06, + "loss": 0.93098288, + "memory(GiB)": 302.58, + "step": 147960, + "train_speed(iter/s)": 0.124113 + }, + { + "acc": 0.74235148, + "epoch": 0.827574250573578, + "grad_norm": 7.0, + "learning_rate": 6.797694295982534e-06, + "loss": 1.01243277, + "memory(GiB)": 302.58, + "step": 147980, + "train_speed(iter/s)": 0.12412 + }, + { + "acc": 0.72627392, + "epoch": 0.8276861000465573, + "grad_norm": 5.53125, + "learning_rate": 6.796831395193573e-06, + "loss": 1.07589035, + "memory(GiB)": 302.58, + "step": 148000, + "train_speed(iter/s)": 0.124127 + }, + { + "epoch": 0.8276861000465573, + "eval_acc": 0.7052875930144088, + "eval_loss": 1.0177102088928223, + "eval_runtime": 7502.3647, + "eval_samples_per_second": 10.035, + "eval_steps_per_second": 10.035, + "step": 148000 + }, + { + "acc": 0.73608599, + "epoch": 0.8277979495195366, + "grad_norm": 6.1875, + "learning_rate": 6.79596843294728e-06, + "loss": 1.04196415, + "memory(GiB)": 302.58, + "step": 148020, + "train_speed(iter/s)": 0.123346 + }, + { + "acc": 0.73717232, + "epoch": 0.8279097989925158, + "grad_norm": 6.3125, + "learning_rate": 6.79510540927317e-06, + "loss": 1.05411301, + "memory(GiB)": 302.58, + "step": 148040, + "train_speed(iter/s)": 0.123353 + }, + { + "acc": 0.75050921, + "epoch": 0.8280216484654951, + "grad_norm": 5.34375, + "learning_rate": 6.794242324200764e-06, + "loss": 0.97978525, + "memory(GiB)": 302.58, + "step": 148060, + "train_speed(iter/s)": 0.123361 + }, + { + "acc": 0.73647852, + "epoch": 0.8281334979384743, + "grad_norm": 6.84375, + "learning_rate": 6.793379177759581e-06, + "loss": 1.03147211, + "memory(GiB)": 302.58, + "step": 148080, + "train_speed(iter/s)": 0.123369 + }, + { + "acc": 0.73747315, + "epoch": 0.8282453474114536, + "grad_norm": 10.1875, + "learning_rate": 6.792515969979144e-06, + "loss": 1.06549845, + "memory(GiB)": 302.58, + "step": 148100, + "train_speed(iter/s)": 0.123376 + }, + { + "acc": 0.73474808, + "epoch": 0.8283571968844329, + "grad_norm": 7.375, + "learning_rate": 6.791652700888978e-06, + "loss": 1.05602884, + "memory(GiB)": 302.58, + "step": 148120, + "train_speed(iter/s)": 0.123383 + }, + { + "acc": 0.72742553, + "epoch": 0.8284690463574121, + "grad_norm": 7.28125, + "learning_rate": 6.7907893705186065e-06, + "loss": 1.07078457, + "memory(GiB)": 302.58, + "step": 148140, + "train_speed(iter/s)": 0.123391 + }, + { + "acc": 0.74135122, + "epoch": 0.8285808958303914, + "grad_norm": 4.5625, + "learning_rate": 6.78992597889756e-06, + "loss": 1.02799978, + "memory(GiB)": 302.58, + "step": 148160, + "train_speed(iter/s)": 0.123399 + }, + { + "acc": 0.73803344, + "epoch": 0.8286927453033707, + "grad_norm": 5.03125, + "learning_rate": 6.78906252605537e-06, + "loss": 1.03964243, + "memory(GiB)": 302.58, + "step": 148180, + "train_speed(iter/s)": 0.123406 + }, + { + "acc": 0.73181849, + "epoch": 0.8288045947763499, + "grad_norm": 5.3125, + "learning_rate": 6.788199012021569e-06, + "loss": 1.04529972, + "memory(GiB)": 302.58, + "step": 148200, + "train_speed(iter/s)": 0.123415 + }, + { + "acc": 0.74121327, + "epoch": 0.8289164442493292, + "grad_norm": 8.75, + "learning_rate": 6.7873354368256915e-06, + "loss": 1.02526512, + "memory(GiB)": 302.58, + "step": 148220, + "train_speed(iter/s)": 0.123423 + }, + { + "acc": 0.7446908, + "epoch": 0.8290282937223085, + "grad_norm": 8.8125, + "learning_rate": 6.786471800497274e-06, + "loss": 1.00964203, + "memory(GiB)": 302.58, + "step": 148240, + "train_speed(iter/s)": 0.123431 + }, + { + "acc": 0.74562006, + "epoch": 0.8291401431952877, + "grad_norm": 7.375, + "learning_rate": 6.785608103065857e-06, + "loss": 1.01335592, + "memory(GiB)": 302.58, + "step": 148260, + "train_speed(iter/s)": 0.123439 + }, + { + "acc": 0.74662018, + "epoch": 0.8292519926682671, + "grad_norm": 5.34375, + "learning_rate": 6.784744344560981e-06, + "loss": 1.00758209, + "memory(GiB)": 302.58, + "step": 148280, + "train_speed(iter/s)": 0.123446 + }, + { + "acc": 0.74821534, + "epoch": 0.8293638421412464, + "grad_norm": 7.9375, + "learning_rate": 6.783880525012189e-06, + "loss": 0.96481581, + "memory(GiB)": 302.58, + "step": 148300, + "train_speed(iter/s)": 0.123454 + }, + { + "acc": 0.7564868, + "epoch": 0.8294756916142256, + "grad_norm": 5.5, + "learning_rate": 6.783016644449024e-06, + "loss": 0.95039482, + "memory(GiB)": 302.58, + "step": 148320, + "train_speed(iter/s)": 0.123462 + }, + { + "acc": 0.74551959, + "epoch": 0.8295875410872049, + "grad_norm": 5.96875, + "learning_rate": 6.782152702901037e-06, + "loss": 0.99602118, + "memory(GiB)": 302.58, + "step": 148340, + "train_speed(iter/s)": 0.12347 + }, + { + "acc": 0.73756971, + "epoch": 0.8296993905601842, + "grad_norm": 6.84375, + "learning_rate": 6.7812887003977775e-06, + "loss": 1.04313679, + "memory(GiB)": 302.58, + "step": 148360, + "train_speed(iter/s)": 0.123477 + }, + { + "acc": 0.70837903, + "epoch": 0.8298112400331634, + "grad_norm": 9.625, + "learning_rate": 6.780424636968797e-06, + "loss": 1.16358805, + "memory(GiB)": 302.58, + "step": 148380, + "train_speed(iter/s)": 0.123485 + }, + { + "acc": 0.7345593, + "epoch": 0.8299230895061427, + "grad_norm": 6.4375, + "learning_rate": 6.779560512643648e-06, + "loss": 1.05671234, + "memory(GiB)": 302.58, + "step": 148400, + "train_speed(iter/s)": 0.123492 + }, + { + "acc": 0.74067659, + "epoch": 0.830034938979122, + "grad_norm": 10.125, + "learning_rate": 6.778696327451886e-06, + "loss": 1.02340183, + "memory(GiB)": 302.58, + "step": 148420, + "train_speed(iter/s)": 0.1235 + }, + { + "acc": 0.73762045, + "epoch": 0.8301467884521012, + "grad_norm": 9.125, + "learning_rate": 6.7778320814230705e-06, + "loss": 1.04146547, + "memory(GiB)": 302.58, + "step": 148440, + "train_speed(iter/s)": 0.123508 + }, + { + "acc": 0.73328733, + "epoch": 0.8302586379250805, + "grad_norm": 4.84375, + "learning_rate": 6.776967774586759e-06, + "loss": 1.05132666, + "memory(GiB)": 302.58, + "step": 148460, + "train_speed(iter/s)": 0.123516 + }, + { + "acc": 0.74563556, + "epoch": 0.8303704873980597, + "grad_norm": 7.6875, + "learning_rate": 6.776103406972515e-06, + "loss": 1.00854759, + "memory(GiB)": 302.58, + "step": 148480, + "train_speed(iter/s)": 0.123523 + }, + { + "acc": 0.748175, + "epoch": 0.830482336871039, + "grad_norm": 8.9375, + "learning_rate": 6.775238978609904e-06, + "loss": 0.96504564, + "memory(GiB)": 302.58, + "step": 148500, + "train_speed(iter/s)": 0.123531 + }, + { + "acc": 0.73010392, + "epoch": 0.8305941863440183, + "grad_norm": 6.09375, + "learning_rate": 6.774374489528489e-06, + "loss": 1.09060402, + "memory(GiB)": 302.58, + "step": 148520, + "train_speed(iter/s)": 0.123539 + }, + { + "acc": 0.74833026, + "epoch": 0.8307060358169975, + "grad_norm": 5.5625, + "learning_rate": 6.773509939757842e-06, + "loss": 0.98770409, + "memory(GiB)": 302.58, + "step": 148540, + "train_speed(iter/s)": 0.123546 + }, + { + "acc": 0.75425601, + "epoch": 0.8308178852899768, + "grad_norm": 8.8125, + "learning_rate": 6.77264532932753e-06, + "loss": 0.96827955, + "memory(GiB)": 302.58, + "step": 148560, + "train_speed(iter/s)": 0.123554 + }, + { + "acc": 0.75654693, + "epoch": 0.8309297347629561, + "grad_norm": 8.5625, + "learning_rate": 6.771780658267127e-06, + "loss": 0.95674915, + "memory(GiB)": 302.58, + "step": 148580, + "train_speed(iter/s)": 0.123561 + }, + { + "acc": 0.7318912, + "epoch": 0.8310415842359353, + "grad_norm": 6.4375, + "learning_rate": 6.770915926606208e-06, + "loss": 1.05360689, + "memory(GiB)": 302.58, + "step": 148600, + "train_speed(iter/s)": 0.123569 + }, + { + "acc": 0.73982592, + "epoch": 0.8311534337089146, + "grad_norm": 6.5625, + "learning_rate": 6.770051134374348e-06, + "loss": 1.02284088, + "memory(GiB)": 302.58, + "step": 148620, + "train_speed(iter/s)": 0.123577 + }, + { + "acc": 0.73476448, + "epoch": 0.8312652831818939, + "grad_norm": 8.625, + "learning_rate": 6.7691862816011274e-06, + "loss": 1.04022846, + "memory(GiB)": 302.58, + "step": 148640, + "train_speed(iter/s)": 0.123585 + }, + { + "acc": 0.72151403, + "epoch": 0.8313771326548731, + "grad_norm": 7.5625, + "learning_rate": 6.768321368316127e-06, + "loss": 1.11474428, + "memory(GiB)": 302.58, + "step": 148660, + "train_speed(iter/s)": 0.123593 + }, + { + "acc": 0.72849574, + "epoch": 0.8314889821278524, + "grad_norm": 4.28125, + "learning_rate": 6.767456394548926e-06, + "loss": 1.07350845, + "memory(GiB)": 302.58, + "step": 148680, + "train_speed(iter/s)": 0.1236 + }, + { + "acc": 0.72788682, + "epoch": 0.8316008316008316, + "grad_norm": 6.96875, + "learning_rate": 6.7665913603291135e-06, + "loss": 1.07188263, + "memory(GiB)": 302.58, + "step": 148700, + "train_speed(iter/s)": 0.123608 + }, + { + "acc": 0.73203111, + "epoch": 0.8317126810738109, + "grad_norm": 6.09375, + "learning_rate": 6.765726265686275e-06, + "loss": 1.06572886, + "memory(GiB)": 302.58, + "step": 148720, + "train_speed(iter/s)": 0.123616 + }, + { + "acc": 0.74774265, + "epoch": 0.8318245305467902, + "grad_norm": 6.28125, + "learning_rate": 6.764861110649999e-06, + "loss": 0.98388538, + "memory(GiB)": 302.58, + "step": 148740, + "train_speed(iter/s)": 0.123624 + }, + { + "acc": 0.72889237, + "epoch": 0.8319363800197694, + "grad_norm": 6.125, + "learning_rate": 6.7639958952498756e-06, + "loss": 1.08555756, + "memory(GiB)": 302.58, + "step": 148760, + "train_speed(iter/s)": 0.123632 + }, + { + "acc": 0.73593602, + "epoch": 0.8320482294927487, + "grad_norm": 6.4375, + "learning_rate": 6.763130619515499e-06, + "loss": 1.04948626, + "memory(GiB)": 302.58, + "step": 148780, + "train_speed(iter/s)": 0.123639 + }, + { + "acc": 0.74115963, + "epoch": 0.832160078965728, + "grad_norm": 7.28125, + "learning_rate": 6.762265283476465e-06, + "loss": 1.01536922, + "memory(GiB)": 302.58, + "step": 148800, + "train_speed(iter/s)": 0.123647 + }, + { + "acc": 0.75151405, + "epoch": 0.8322719284387072, + "grad_norm": 8.4375, + "learning_rate": 6.761399887162372e-06, + "loss": 0.97115469, + "memory(GiB)": 302.58, + "step": 148820, + "train_speed(iter/s)": 0.123655 + }, + { + "acc": 0.74699483, + "epoch": 0.8323837779116865, + "grad_norm": 7.90625, + "learning_rate": 6.760534430602816e-06, + "loss": 0.98974791, + "memory(GiB)": 302.58, + "step": 148840, + "train_speed(iter/s)": 0.123662 + }, + { + "acc": 0.7424489, + "epoch": 0.8324956273846658, + "grad_norm": 10.4375, + "learning_rate": 6.7596689138274005e-06, + "loss": 1.00888157, + "memory(GiB)": 302.58, + "step": 148860, + "train_speed(iter/s)": 0.12367 + }, + { + "acc": 0.74772806, + "epoch": 0.832607476857645, + "grad_norm": 6.4375, + "learning_rate": 6.758803336865729e-06, + "loss": 0.98004856, + "memory(GiB)": 302.58, + "step": 148880, + "train_speed(iter/s)": 0.123678 + }, + { + "acc": 0.73408165, + "epoch": 0.8327193263306243, + "grad_norm": 7.8125, + "learning_rate": 6.757937699747405e-06, + "loss": 1.05458755, + "memory(GiB)": 302.58, + "step": 148900, + "train_speed(iter/s)": 0.123686 + }, + { + "acc": 0.7489881, + "epoch": 0.8328311758036036, + "grad_norm": 6.03125, + "learning_rate": 6.757072002502037e-06, + "loss": 0.98934717, + "memory(GiB)": 302.58, + "step": 148920, + "train_speed(iter/s)": 0.123694 + }, + { + "acc": 0.72166071, + "epoch": 0.8329430252765828, + "grad_norm": 9.5, + "learning_rate": 6.756206245159237e-06, + "loss": 1.09937372, + "memory(GiB)": 302.58, + "step": 148940, + "train_speed(iter/s)": 0.123701 + }, + { + "acc": 0.74379706, + "epoch": 0.8330548747495621, + "grad_norm": 5.75, + "learning_rate": 6.7553404277486125e-06, + "loss": 1.00565643, + "memory(GiB)": 302.58, + "step": 148960, + "train_speed(iter/s)": 0.123709 + }, + { + "acc": 0.75168376, + "epoch": 0.8331667242225413, + "grad_norm": 8.75, + "learning_rate": 6.754474550299781e-06, + "loss": 0.99069595, + "memory(GiB)": 302.58, + "step": 148980, + "train_speed(iter/s)": 0.123717 + }, + { + "acc": 0.74648738, + "epoch": 0.8332785736955206, + "grad_norm": 8.8125, + "learning_rate": 6.753608612842356e-06, + "loss": 0.97613182, + "memory(GiB)": 302.58, + "step": 149000, + "train_speed(iter/s)": 0.123725 + }, + { + "acc": 0.72913194, + "epoch": 0.8333904231684999, + "grad_norm": 8.0625, + "learning_rate": 6.7527426154059565e-06, + "loss": 1.08510046, + "memory(GiB)": 302.58, + "step": 149020, + "train_speed(iter/s)": 0.123733 + }, + { + "acc": 0.73812175, + "epoch": 0.8335022726414791, + "grad_norm": 5.0625, + "learning_rate": 6.7518765580202004e-06, + "loss": 1.04805775, + "memory(GiB)": 302.58, + "step": 149040, + "train_speed(iter/s)": 0.123741 + }, + { + "acc": 0.72846355, + "epoch": 0.8336141221144584, + "grad_norm": 5.875, + "learning_rate": 6.751010440714711e-06, + "loss": 1.08225431, + "memory(GiB)": 302.58, + "step": 149060, + "train_speed(iter/s)": 0.123749 + }, + { + "acc": 0.7334918, + "epoch": 0.8337259715874377, + "grad_norm": 7.21875, + "learning_rate": 6.750144263519114e-06, + "loss": 1.03974705, + "memory(GiB)": 302.58, + "step": 149080, + "train_speed(iter/s)": 0.123756 + }, + { + "acc": 0.76066012, + "epoch": 0.8338378210604169, + "grad_norm": 6.65625, + "learning_rate": 6.749278026463032e-06, + "loss": 0.93467665, + "memory(GiB)": 302.58, + "step": 149100, + "train_speed(iter/s)": 0.123764 + }, + { + "acc": 0.74444218, + "epoch": 0.8339496705333962, + "grad_norm": 7.03125, + "learning_rate": 6.748411729576095e-06, + "loss": 0.98484697, + "memory(GiB)": 302.58, + "step": 149120, + "train_speed(iter/s)": 0.123772 + }, + { + "acc": 0.73398623, + "epoch": 0.8340615200063755, + "grad_norm": 5.3125, + "learning_rate": 6.747545372887934e-06, + "loss": 1.04857683, + "memory(GiB)": 302.58, + "step": 149140, + "train_speed(iter/s)": 0.12378 + }, + { + "acc": 0.74792933, + "epoch": 0.8341733694793547, + "grad_norm": 9.8125, + "learning_rate": 6.746678956428179e-06, + "loss": 1.01312847, + "memory(GiB)": 302.58, + "step": 149160, + "train_speed(iter/s)": 0.123788 + }, + { + "acc": 0.75700355, + "epoch": 0.834285218952334, + "grad_norm": 7.53125, + "learning_rate": 6.7458124802264655e-06, + "loss": 0.97006445, + "memory(GiB)": 302.58, + "step": 149180, + "train_speed(iter/s)": 0.123796 + }, + { + "acc": 0.75232954, + "epoch": 0.8343970684253132, + "grad_norm": 6.59375, + "learning_rate": 6.7449459443124275e-06, + "loss": 0.97914228, + "memory(GiB)": 302.58, + "step": 149200, + "train_speed(iter/s)": 0.123804 + }, + { + "acc": 0.7311614, + "epoch": 0.8345089178982925, + "grad_norm": 8.25, + "learning_rate": 6.744079348715708e-06, + "loss": 1.06576986, + "memory(GiB)": 302.58, + "step": 149220, + "train_speed(iter/s)": 0.123812 + }, + { + "acc": 0.73214087, + "epoch": 0.8346207673712718, + "grad_norm": 8.75, + "learning_rate": 6.743212693465943e-06, + "loss": 1.05962496, + "memory(GiB)": 302.58, + "step": 149240, + "train_speed(iter/s)": 0.12382 + }, + { + "acc": 0.74593244, + "epoch": 0.834732616844251, + "grad_norm": 5.9375, + "learning_rate": 6.742345978592776e-06, + "loss": 1.01542864, + "memory(GiB)": 302.58, + "step": 149260, + "train_speed(iter/s)": 0.123828 + }, + { + "acc": 0.76192541, + "epoch": 0.8348444663172303, + "grad_norm": 8.25, + "learning_rate": 6.741479204125852e-06, + "loss": 0.92944098, + "memory(GiB)": 302.58, + "step": 149280, + "train_speed(iter/s)": 0.123836 + }, + { + "acc": 0.74885564, + "epoch": 0.8349563157902096, + "grad_norm": 6.65625, + "learning_rate": 6.740612370094818e-06, + "loss": 0.9991643, + "memory(GiB)": 302.58, + "step": 149300, + "train_speed(iter/s)": 0.123844 + }, + { + "acc": 0.75125127, + "epoch": 0.8350681652631888, + "grad_norm": 8.6875, + "learning_rate": 6.7397454765293205e-06, + "loss": 0.99021444, + "memory(GiB)": 302.58, + "step": 149320, + "train_speed(iter/s)": 0.123852 + }, + { + "acc": 0.74062171, + "epoch": 0.8351800147361681, + "grad_norm": 8.8125, + "learning_rate": 6.738878523459011e-06, + "loss": 1.01128607, + "memory(GiB)": 302.58, + "step": 149340, + "train_speed(iter/s)": 0.123859 + }, + { + "acc": 0.75374203, + "epoch": 0.8352918642091474, + "grad_norm": 6.96875, + "learning_rate": 6.738011510913542e-06, + "loss": 0.97076025, + "memory(GiB)": 302.58, + "step": 149360, + "train_speed(iter/s)": 0.123865 + }, + { + "acc": 0.7414413, + "epoch": 0.8354037136821266, + "grad_norm": 9.5, + "learning_rate": 6.7371444389225695e-06, + "loss": 1.02610655, + "memory(GiB)": 302.58, + "step": 149380, + "train_speed(iter/s)": 0.123873 + }, + { + "acc": 0.75398312, + "epoch": 0.8355155631551059, + "grad_norm": 6.9375, + "learning_rate": 6.736277307515748e-06, + "loss": 0.96324892, + "memory(GiB)": 302.58, + "step": 149400, + "train_speed(iter/s)": 0.123881 + }, + { + "acc": 0.73756671, + "epoch": 0.8356274126280852, + "grad_norm": 6.78125, + "learning_rate": 6.735410116722738e-06, + "loss": 1.02429323, + "memory(GiB)": 302.58, + "step": 149420, + "train_speed(iter/s)": 0.123889 + }, + { + "acc": 0.7386693, + "epoch": 0.8357392621010644, + "grad_norm": 7.6875, + "learning_rate": 6.734542866573198e-06, + "loss": 1.04324226, + "memory(GiB)": 302.58, + "step": 149440, + "train_speed(iter/s)": 0.123897 + }, + { + "acc": 0.74082875, + "epoch": 0.8358511115740437, + "grad_norm": 8.5625, + "learning_rate": 6.733675557096792e-06, + "loss": 1.02187538, + "memory(GiB)": 302.58, + "step": 149460, + "train_speed(iter/s)": 0.123905 + }, + { + "acc": 0.76462879, + "epoch": 0.835962961047023, + "grad_norm": 5.4375, + "learning_rate": 6.732808188323184e-06, + "loss": 0.92019558, + "memory(GiB)": 302.58, + "step": 149480, + "train_speed(iter/s)": 0.123912 + }, + { + "acc": 0.73364916, + "epoch": 0.8360748105200022, + "grad_norm": 8.125, + "learning_rate": 6.731940760282043e-06, + "loss": 1.05172663, + "memory(GiB)": 302.58, + "step": 149500, + "train_speed(iter/s)": 0.12392 + }, + { + "acc": 0.75446639, + "epoch": 0.8361866599929815, + "grad_norm": 7.21875, + "learning_rate": 6.7310732730030355e-06, + "loss": 0.94374151, + "memory(GiB)": 302.58, + "step": 149520, + "train_speed(iter/s)": 0.123928 + }, + { + "acc": 0.73065376, + "epoch": 0.8362985094659607, + "grad_norm": 7.5, + "learning_rate": 6.730205726515833e-06, + "loss": 1.06199408, + "memory(GiB)": 302.58, + "step": 149540, + "train_speed(iter/s)": 0.123936 + }, + { + "acc": 0.74095631, + "epoch": 0.83641035893894, + "grad_norm": 6.9375, + "learning_rate": 6.729338120850109e-06, + "loss": 1.00564909, + "memory(GiB)": 302.58, + "step": 149560, + "train_speed(iter/s)": 0.123944 + }, + { + "acc": 0.73495927, + "epoch": 0.8365222084119193, + "grad_norm": 5.75, + "learning_rate": 6.728470456035535e-06, + "loss": 1.04898567, + "memory(GiB)": 302.58, + "step": 149580, + "train_speed(iter/s)": 0.123952 + }, + { + "acc": 0.75335765, + "epoch": 0.8366340578848985, + "grad_norm": 7.4375, + "learning_rate": 6.727602732101793e-06, + "loss": 0.96833982, + "memory(GiB)": 302.58, + "step": 149600, + "train_speed(iter/s)": 0.12396 + }, + { + "acc": 0.75256357, + "epoch": 0.8367459073578778, + "grad_norm": 8.0625, + "learning_rate": 6.72673494907856e-06, + "loss": 0.99334249, + "memory(GiB)": 302.58, + "step": 149620, + "train_speed(iter/s)": 0.123967 + }, + { + "acc": 0.73175988, + "epoch": 0.8368577568308571, + "grad_norm": 5.375, + "learning_rate": 6.725867106995514e-06, + "loss": 1.06204443, + "memory(GiB)": 302.58, + "step": 149640, + "train_speed(iter/s)": 0.123975 + }, + { + "acc": 0.72014313, + "epoch": 0.8369696063038363, + "grad_norm": 6.5, + "learning_rate": 6.724999205882341e-06, + "loss": 1.11018314, + "memory(GiB)": 302.58, + "step": 149660, + "train_speed(iter/s)": 0.123983 + }, + { + "acc": 0.72687106, + "epoch": 0.8370814557768156, + "grad_norm": 7.71875, + "learning_rate": 6.724131245768727e-06, + "loss": 1.09494066, + "memory(GiB)": 302.58, + "step": 149680, + "train_speed(iter/s)": 0.123991 + }, + { + "acc": 0.7508359, + "epoch": 0.8371933052497949, + "grad_norm": 8.0625, + "learning_rate": 6.723263226684357e-06, + "loss": 0.96801176, + "memory(GiB)": 302.58, + "step": 149700, + "train_speed(iter/s)": 0.123999 + }, + { + "acc": 0.74427361, + "epoch": 0.8373051547227741, + "grad_norm": 5.34375, + "learning_rate": 6.722395148658919e-06, + "loss": 1.00314388, + "memory(GiB)": 302.58, + "step": 149720, + "train_speed(iter/s)": 0.124007 + }, + { + "acc": 0.73454342, + "epoch": 0.8374170041957534, + "grad_norm": 7.90625, + "learning_rate": 6.721527011722106e-06, + "loss": 1.05396461, + "memory(GiB)": 302.58, + "step": 149740, + "train_speed(iter/s)": 0.124014 + }, + { + "acc": 0.7588769, + "epoch": 0.8375288536687326, + "grad_norm": 8.875, + "learning_rate": 6.72065881590361e-06, + "loss": 0.93274212, + "memory(GiB)": 302.58, + "step": 149760, + "train_speed(iter/s)": 0.124022 + }, + { + "acc": 0.7265842, + "epoch": 0.8376407031417119, + "grad_norm": 5.90625, + "learning_rate": 6.719790561233127e-06, + "loss": 1.0896493, + "memory(GiB)": 302.58, + "step": 149780, + "train_speed(iter/s)": 0.124029 + }, + { + "acc": 0.74709301, + "epoch": 0.8377525526146912, + "grad_norm": 6.4375, + "learning_rate": 6.7189222477403535e-06, + "loss": 0.99258547, + "memory(GiB)": 302.58, + "step": 149800, + "train_speed(iter/s)": 0.124037 + }, + { + "acc": 0.74636884, + "epoch": 0.8378644020876704, + "grad_norm": 7.46875, + "learning_rate": 6.718053875454989e-06, + "loss": 0.99546814, + "memory(GiB)": 302.58, + "step": 149820, + "train_speed(iter/s)": 0.124045 + }, + { + "acc": 0.7372716, + "epoch": 0.8379762515606497, + "grad_norm": 5.96875, + "learning_rate": 6.717185444406733e-06, + "loss": 1.0548687, + "memory(GiB)": 302.58, + "step": 149840, + "train_speed(iter/s)": 0.124052 + }, + { + "acc": 0.73565779, + "epoch": 0.838088101033629, + "grad_norm": 9.5, + "learning_rate": 6.716316954625291e-06, + "loss": 1.04942703, + "memory(GiB)": 302.58, + "step": 149860, + "train_speed(iter/s)": 0.124059 + }, + { + "acc": 0.73960018, + "epoch": 0.8381999505066082, + "grad_norm": 4.25, + "learning_rate": 6.715448406140366e-06, + "loss": 1.02339516, + "memory(GiB)": 302.58, + "step": 149880, + "train_speed(iter/s)": 0.124067 + }, + { + "acc": 0.74902716, + "epoch": 0.8383117999795875, + "grad_norm": 6.6875, + "learning_rate": 6.714579798981665e-06, + "loss": 1.00083208, + "memory(GiB)": 302.58, + "step": 149900, + "train_speed(iter/s)": 0.124075 + }, + { + "acc": 0.74634647, + "epoch": 0.8384236494525668, + "grad_norm": 6.15625, + "learning_rate": 6.7137111331788995e-06, + "loss": 0.99974146, + "memory(GiB)": 302.58, + "step": 149920, + "train_speed(iter/s)": 0.124083 + }, + { + "acc": 0.74875097, + "epoch": 0.838535498925546, + "grad_norm": 5.71875, + "learning_rate": 6.712842408761779e-06, + "loss": 0.98550301, + "memory(GiB)": 302.58, + "step": 149940, + "train_speed(iter/s)": 0.124091 + }, + { + "acc": 0.74582524, + "epoch": 0.8386473483985253, + "grad_norm": 7.15625, + "learning_rate": 6.711973625760018e-06, + "loss": 0.97416868, + "memory(GiB)": 302.58, + "step": 149960, + "train_speed(iter/s)": 0.124099 + }, + { + "acc": 0.74529362, + "epoch": 0.8387591978715045, + "grad_norm": 12.8125, + "learning_rate": 6.71110478420333e-06, + "loss": 1.00471458, + "memory(GiB)": 302.58, + "step": 149980, + "train_speed(iter/s)": 0.124106 + }, + { + "acc": 0.74754138, + "epoch": 0.8388710473444838, + "grad_norm": 8.375, + "learning_rate": 6.710235884121431e-06, + "loss": 0.99010859, + "memory(GiB)": 302.58, + "step": 150000, + "train_speed(iter/s)": 0.124114 + }, + { + "epoch": 0.8388710473444838, + "eval_acc": 0.7053885035359105, + "eval_loss": 1.0172706842422485, + "eval_runtime": 7515.3529, + "eval_samples_per_second": 10.017, + "eval_steps_per_second": 10.017, + "step": 150000 + }, + { + "acc": 0.73857269, + "epoch": 0.8389828968174631, + "grad_norm": 6.1875, + "learning_rate": 6.709366925544043e-06, + "loss": 1.03085423, + "memory(GiB)": 302.58, + "step": 150020, + "train_speed(iter/s)": 0.123341 + }, + { + "acc": 0.73656602, + "epoch": 0.8390947462904423, + "grad_norm": 7.40625, + "learning_rate": 6.708497908500885e-06, + "loss": 1.0489728, + "memory(GiB)": 302.58, + "step": 150040, + "train_speed(iter/s)": 0.123349 + }, + { + "acc": 0.7426825, + "epoch": 0.8392065957634216, + "grad_norm": 5.8125, + "learning_rate": 6.707628833021681e-06, + "loss": 1.00916862, + "memory(GiB)": 302.58, + "step": 150060, + "train_speed(iter/s)": 0.123356 + }, + { + "acc": 0.73647733, + "epoch": 0.8393184452364009, + "grad_norm": 8.25, + "learning_rate": 6.7067596991361575e-06, + "loss": 1.0538455, + "memory(GiB)": 302.58, + "step": 150080, + "train_speed(iter/s)": 0.123364 + }, + { + "acc": 0.76690335, + "epoch": 0.8394302947093801, + "grad_norm": 7.71875, + "learning_rate": 6.70589050687404e-06, + "loss": 0.90177889, + "memory(GiB)": 302.58, + "step": 150100, + "train_speed(iter/s)": 0.123372 + }, + { + "acc": 0.74364829, + "epoch": 0.8395421441823594, + "grad_norm": 8.6875, + "learning_rate": 6.705021256265057e-06, + "loss": 1.01894741, + "memory(GiB)": 302.58, + "step": 150120, + "train_speed(iter/s)": 0.12338 + }, + { + "acc": 0.72781811, + "epoch": 0.8396539936553387, + "grad_norm": 9.125, + "learning_rate": 6.7041519473389415e-06, + "loss": 1.08192396, + "memory(GiB)": 302.58, + "step": 150140, + "train_speed(iter/s)": 0.123388 + }, + { + "acc": 0.73804188, + "epoch": 0.8397658431283179, + "grad_norm": 8.5, + "learning_rate": 6.703282580125425e-06, + "loss": 1.03335028, + "memory(GiB)": 302.58, + "step": 150160, + "train_speed(iter/s)": 0.123396 + }, + { + "acc": 0.73135514, + "epoch": 0.8398776926012972, + "grad_norm": 9.0625, + "learning_rate": 6.702413154654243e-06, + "loss": 1.05984421, + "memory(GiB)": 302.58, + "step": 150180, + "train_speed(iter/s)": 0.123403 + }, + { + "acc": 0.73877139, + "epoch": 0.8399895420742765, + "grad_norm": 6.78125, + "learning_rate": 6.7015436709551325e-06, + "loss": 1.00916138, + "memory(GiB)": 302.58, + "step": 150200, + "train_speed(iter/s)": 0.123411 + }, + { + "acc": 0.7467165, + "epoch": 0.8401013915472557, + "grad_norm": 9.75, + "learning_rate": 6.7006741290578334e-06, + "loss": 0.98701763, + "memory(GiB)": 302.58, + "step": 150220, + "train_speed(iter/s)": 0.123419 + }, + { + "acc": 0.74261518, + "epoch": 0.840213241020235, + "grad_norm": 7.3125, + "learning_rate": 6.6998045289920875e-06, + "loss": 1.01222515, + "memory(GiB)": 302.58, + "step": 150240, + "train_speed(iter/s)": 0.123427 + }, + { + "acc": 0.74050288, + "epoch": 0.8403250904932142, + "grad_norm": 8.375, + "learning_rate": 6.6989348707876365e-06, + "loss": 1.02453718, + "memory(GiB)": 302.58, + "step": 150260, + "train_speed(iter/s)": 0.123435 + }, + { + "acc": 0.74150805, + "epoch": 0.8404369399661935, + "grad_norm": 7.90625, + "learning_rate": 6.698065154474226e-06, + "loss": 1.02636662, + "memory(GiB)": 302.58, + "step": 150280, + "train_speed(iter/s)": 0.123442 + }, + { + "acc": 0.74828429, + "epoch": 0.8405487894391728, + "grad_norm": 7.5, + "learning_rate": 6.697195380081603e-06, + "loss": 0.97467632, + "memory(GiB)": 302.58, + "step": 150300, + "train_speed(iter/s)": 0.12345 + }, + { + "acc": 0.73957586, + "epoch": 0.840660638912152, + "grad_norm": 7.125, + "learning_rate": 6.696325547639515e-06, + "loss": 1.01678305, + "memory(GiB)": 302.58, + "step": 150320, + "train_speed(iter/s)": 0.123458 + }, + { + "acc": 0.74437041, + "epoch": 0.8407724883851313, + "grad_norm": 5.46875, + "learning_rate": 6.695455657177714e-06, + "loss": 1.00546465, + "memory(GiB)": 302.58, + "step": 150340, + "train_speed(iter/s)": 0.123466 + }, + { + "acc": 0.73617444, + "epoch": 0.8408843378581106, + "grad_norm": 7.25, + "learning_rate": 6.694585708725954e-06, + "loss": 1.03913698, + "memory(GiB)": 302.58, + "step": 150360, + "train_speed(iter/s)": 0.123475 + }, + { + "acc": 0.74260082, + "epoch": 0.8409961873310898, + "grad_norm": 8.75, + "learning_rate": 6.6937157023139896e-06, + "loss": 1.00534229, + "memory(GiB)": 302.58, + "step": 150380, + "train_speed(iter/s)": 0.123482 + }, + { + "acc": 0.74354687, + "epoch": 0.8411080368040691, + "grad_norm": 7.78125, + "learning_rate": 6.692845637971579e-06, + "loss": 1.00553064, + "memory(GiB)": 302.58, + "step": 150400, + "train_speed(iter/s)": 0.123489 + }, + { + "acc": 0.7265687, + "epoch": 0.8412198862770484, + "grad_norm": 8.6875, + "learning_rate": 6.691975515728479e-06, + "loss": 1.07727947, + "memory(GiB)": 302.58, + "step": 150420, + "train_speed(iter/s)": 0.123497 + }, + { + "acc": 0.74339452, + "epoch": 0.8413317357500276, + "grad_norm": 7.5625, + "learning_rate": 6.691105335614449e-06, + "loss": 0.98995323, + "memory(GiB)": 302.58, + "step": 150440, + "train_speed(iter/s)": 0.123505 + }, + { + "acc": 0.72928433, + "epoch": 0.8414435852230069, + "grad_norm": 7.0625, + "learning_rate": 6.690235097659256e-06, + "loss": 1.08959846, + "memory(GiB)": 302.58, + "step": 150460, + "train_speed(iter/s)": 0.123513 + }, + { + "acc": 0.73663163, + "epoch": 0.8415554346959861, + "grad_norm": 5.53125, + "learning_rate": 6.689364801892663e-06, + "loss": 1.05239639, + "memory(GiB)": 302.58, + "step": 150480, + "train_speed(iter/s)": 0.12352 + }, + { + "acc": 0.75024285, + "epoch": 0.8416672841689654, + "grad_norm": 6.21875, + "learning_rate": 6.688494448344437e-06, + "loss": 0.97002621, + "memory(GiB)": 302.58, + "step": 150500, + "train_speed(iter/s)": 0.123528 + }, + { + "acc": 0.74800377, + "epoch": 0.8417791336419447, + "grad_norm": 5.53125, + "learning_rate": 6.687624037044346e-06, + "loss": 0.98536787, + "memory(GiB)": 302.58, + "step": 150520, + "train_speed(iter/s)": 0.123536 + }, + { + "acc": 0.73316236, + "epoch": 0.8418909831149239, + "grad_norm": 5.375, + "learning_rate": 6.686753568022163e-06, + "loss": 1.07233467, + "memory(GiB)": 302.58, + "step": 150540, + "train_speed(iter/s)": 0.123543 + }, + { + "acc": 0.72552814, + "epoch": 0.8420028325879032, + "grad_norm": 10.625, + "learning_rate": 6.685883041307659e-06, + "loss": 1.11135426, + "memory(GiB)": 302.58, + "step": 150560, + "train_speed(iter/s)": 0.123551 + }, + { + "acc": 0.74151745, + "epoch": 0.8421146820608825, + "grad_norm": 6.8125, + "learning_rate": 6.685012456930609e-06, + "loss": 1.00738716, + "memory(GiB)": 302.58, + "step": 150580, + "train_speed(iter/s)": 0.123558 + }, + { + "acc": 0.73786621, + "epoch": 0.8422265315338617, + "grad_norm": 6.65625, + "learning_rate": 6.68414181492079e-06, + "loss": 1.02654543, + "memory(GiB)": 302.58, + "step": 150600, + "train_speed(iter/s)": 0.123566 + }, + { + "acc": 0.729457, + "epoch": 0.842338381006841, + "grad_norm": 6.3125, + "learning_rate": 6.683271115307979e-06, + "loss": 1.07085934, + "memory(GiB)": 302.58, + "step": 150620, + "train_speed(iter/s)": 0.123572 + }, + { + "acc": 0.75772777, + "epoch": 0.8424502304798203, + "grad_norm": 9.1875, + "learning_rate": 6.682400358121961e-06, + "loss": 0.95283346, + "memory(GiB)": 302.58, + "step": 150640, + "train_speed(iter/s)": 0.12358 + }, + { + "acc": 0.74379697, + "epoch": 0.8425620799527995, + "grad_norm": 6.40625, + "learning_rate": 6.681529543392515e-06, + "loss": 1.00150318, + "memory(GiB)": 302.58, + "step": 150660, + "train_speed(iter/s)": 0.123588 + }, + { + "acc": 0.76462665, + "epoch": 0.8426739294257788, + "grad_norm": 7.15625, + "learning_rate": 6.680658671149426e-06, + "loss": 0.91871986, + "memory(GiB)": 302.58, + "step": 150680, + "train_speed(iter/s)": 0.123595 + }, + { + "acc": 0.74439998, + "epoch": 0.842785778898758, + "grad_norm": 6.0625, + "learning_rate": 6.679787741422482e-06, + "loss": 1.01265821, + "memory(GiB)": 302.58, + "step": 150700, + "train_speed(iter/s)": 0.123603 + }, + { + "acc": 0.74482064, + "epoch": 0.8428976283717373, + "grad_norm": 7.0625, + "learning_rate": 6.67891675424147e-06, + "loss": 0.98905754, + "memory(GiB)": 302.58, + "step": 150720, + "train_speed(iter/s)": 0.12361 + }, + { + "acc": 0.75402832, + "epoch": 0.8430094778447166, + "grad_norm": 7.6875, + "learning_rate": 6.678045709636183e-06, + "loss": 0.97259769, + "memory(GiB)": 302.58, + "step": 150740, + "train_speed(iter/s)": 0.123618 + }, + { + "acc": 0.73800468, + "epoch": 0.8431213273176958, + "grad_norm": 8.5625, + "learning_rate": 6.677174607636409e-06, + "loss": 1.0205267, + "memory(GiB)": 302.58, + "step": 150760, + "train_speed(iter/s)": 0.123626 + }, + { + "acc": 0.73418345, + "epoch": 0.8432331767906751, + "grad_norm": 9.75, + "learning_rate": 6.676303448271947e-06, + "loss": 1.05119801, + "memory(GiB)": 302.58, + "step": 150780, + "train_speed(iter/s)": 0.123634 + }, + { + "acc": 0.72727356, + "epoch": 0.8433450262636544, + "grad_norm": 9.8125, + "learning_rate": 6.675432231572592e-06, + "loss": 1.08056097, + "memory(GiB)": 302.58, + "step": 150800, + "train_speed(iter/s)": 0.123641 + }, + { + "acc": 0.75338502, + "epoch": 0.8434568757366336, + "grad_norm": 8.1875, + "learning_rate": 6.6745609575681416e-06, + "loss": 0.9739089, + "memory(GiB)": 302.58, + "step": 150820, + "train_speed(iter/s)": 0.123648 + }, + { + "acc": 0.74675212, + "epoch": 0.8435687252096129, + "grad_norm": 8.625, + "learning_rate": 6.673689626288396e-06, + "loss": 1.03436661, + "memory(GiB)": 302.58, + "step": 150840, + "train_speed(iter/s)": 0.123655 + }, + { + "acc": 0.74080791, + "epoch": 0.8436805746825922, + "grad_norm": 7.53125, + "learning_rate": 6.672818237763158e-06, + "loss": 1.02775192, + "memory(GiB)": 302.58, + "step": 150860, + "train_speed(iter/s)": 0.123663 + }, + { + "acc": 0.74201965, + "epoch": 0.8437924241555714, + "grad_norm": 8.0, + "learning_rate": 6.671946792022233e-06, + "loss": 1.01549854, + "memory(GiB)": 302.58, + "step": 150880, + "train_speed(iter/s)": 0.123671 + }, + { + "acc": 0.73949108, + "epoch": 0.8439042736285507, + "grad_norm": 9.9375, + "learning_rate": 6.671075289095426e-06, + "loss": 1.00500345, + "memory(GiB)": 302.58, + "step": 150900, + "train_speed(iter/s)": 0.123679 + }, + { + "acc": 0.71507683, + "epoch": 0.84401612310153, + "grad_norm": 6.6875, + "learning_rate": 6.670203729012543e-06, + "loss": 1.15101051, + "memory(GiB)": 302.58, + "step": 150920, + "train_speed(iter/s)": 0.123686 + }, + { + "acc": 0.74328637, + "epoch": 0.8441279725745092, + "grad_norm": 8.625, + "learning_rate": 6.669332111803398e-06, + "loss": 0.98701801, + "memory(GiB)": 302.58, + "step": 150940, + "train_speed(iter/s)": 0.123694 + }, + { + "acc": 0.74549761, + "epoch": 0.8442398220474885, + "grad_norm": 7.9375, + "learning_rate": 6.668460437497802e-06, + "loss": 0.98678255, + "memory(GiB)": 302.58, + "step": 150960, + "train_speed(iter/s)": 0.123703 + }, + { + "acc": 0.72683172, + "epoch": 0.8443516715204678, + "grad_norm": 8.3125, + "learning_rate": 6.667588706125568e-06, + "loss": 1.07755861, + "memory(GiB)": 302.58, + "step": 150980, + "train_speed(iter/s)": 0.12371 + }, + { + "acc": 0.75133901, + "epoch": 0.844463520993447, + "grad_norm": 12.5625, + "learning_rate": 6.666716917716514e-06, + "loss": 0.9647439, + "memory(GiB)": 302.58, + "step": 151000, + "train_speed(iter/s)": 0.123719 + }, + { + "acc": 0.735186, + "epoch": 0.8445753704664263, + "grad_norm": 5.75, + "learning_rate": 6.665845072300454e-06, + "loss": 1.03397741, + "memory(GiB)": 302.58, + "step": 151020, + "train_speed(iter/s)": 0.123726 + }, + { + "acc": 0.73538518, + "epoch": 0.8446872199394055, + "grad_norm": 6.71875, + "learning_rate": 6.664973169907211e-06, + "loss": 1.05060692, + "memory(GiB)": 302.58, + "step": 151040, + "train_speed(iter/s)": 0.123734 + }, + { + "acc": 0.75531373, + "epoch": 0.8447990694123848, + "grad_norm": 11.0, + "learning_rate": 6.664101210566606e-06, + "loss": 0.96132145, + "memory(GiB)": 302.58, + "step": 151060, + "train_speed(iter/s)": 0.123742 + }, + { + "acc": 0.74902544, + "epoch": 0.8449109188853641, + "grad_norm": 9.6875, + "learning_rate": 6.663229194308464e-06, + "loss": 0.97745533, + "memory(GiB)": 302.58, + "step": 151080, + "train_speed(iter/s)": 0.123749 + }, + { + "acc": 0.74397407, + "epoch": 0.8450227683583433, + "grad_norm": 7.125, + "learning_rate": 6.662357121162608e-06, + "loss": 1.01342268, + "memory(GiB)": 302.58, + "step": 151100, + "train_speed(iter/s)": 0.123756 + }, + { + "acc": 0.73939629, + "epoch": 0.8451346178313226, + "grad_norm": 10.4375, + "learning_rate": 6.661484991158869e-06, + "loss": 1.00189114, + "memory(GiB)": 302.58, + "step": 151120, + "train_speed(iter/s)": 0.123764 + }, + { + "acc": 0.73885818, + "epoch": 0.8452464673043019, + "grad_norm": 7.40625, + "learning_rate": 6.660612804327074e-06, + "loss": 1.01719341, + "memory(GiB)": 302.58, + "step": 151140, + "train_speed(iter/s)": 0.123773 + }, + { + "acc": 0.7455317, + "epoch": 0.8453583167772811, + "grad_norm": 10.6875, + "learning_rate": 6.659740560697055e-06, + "loss": 1.002036, + "memory(GiB)": 302.58, + "step": 151160, + "train_speed(iter/s)": 0.12378 + }, + { + "acc": 0.73782706, + "epoch": 0.8454701662502604, + "grad_norm": 7.59375, + "learning_rate": 6.658868260298647e-06, + "loss": 1.03606777, + "memory(GiB)": 302.58, + "step": 151180, + "train_speed(iter/s)": 0.123788 + }, + { + "acc": 0.72957602, + "epoch": 0.8455820157232397, + "grad_norm": 7.78125, + "learning_rate": 6.657995903161682e-06, + "loss": 1.08996391, + "memory(GiB)": 302.58, + "step": 151200, + "train_speed(iter/s)": 0.123795 + }, + { + "acc": 0.72846222, + "epoch": 0.8456938651962189, + "grad_norm": 8.75, + "learning_rate": 6.657123489316003e-06, + "loss": 1.08859463, + "memory(GiB)": 302.58, + "step": 151220, + "train_speed(iter/s)": 0.123803 + }, + { + "acc": 0.74697008, + "epoch": 0.8458057146691982, + "grad_norm": 10.6875, + "learning_rate": 6.656251018791444e-06, + "loss": 0.99018679, + "memory(GiB)": 302.58, + "step": 151240, + "train_speed(iter/s)": 0.123811 + }, + { + "acc": 0.74067569, + "epoch": 0.8459175641421774, + "grad_norm": 6.84375, + "learning_rate": 6.65537849161785e-06, + "loss": 0.99360008, + "memory(GiB)": 302.58, + "step": 151260, + "train_speed(iter/s)": 0.123819 + }, + { + "acc": 0.72585912, + "epoch": 0.8460294136151567, + "grad_norm": 7.03125, + "learning_rate": 6.654505907825061e-06, + "loss": 1.08277206, + "memory(GiB)": 302.58, + "step": 151280, + "train_speed(iter/s)": 0.123826 + }, + { + "acc": 0.74952798, + "epoch": 0.846141263088136, + "grad_norm": 10.5625, + "learning_rate": 6.6536332674429235e-06, + "loss": 0.99531374, + "memory(GiB)": 302.58, + "step": 151300, + "train_speed(iter/s)": 0.123834 + }, + { + "acc": 0.74572206, + "epoch": 0.8462531125611152, + "grad_norm": 5.96875, + "learning_rate": 6.652760570501287e-06, + "loss": 0.98349066, + "memory(GiB)": 302.58, + "step": 151320, + "train_speed(iter/s)": 0.123842 + }, + { + "acc": 0.72972026, + "epoch": 0.8463649620340945, + "grad_norm": 7.34375, + "learning_rate": 6.651887817029995e-06, + "loss": 1.06323576, + "memory(GiB)": 302.58, + "step": 151340, + "train_speed(iter/s)": 0.123849 + }, + { + "acc": 0.73492665, + "epoch": 0.8464768115070738, + "grad_norm": 8.5625, + "learning_rate": 6.651015007058903e-06, + "loss": 1.05821114, + "memory(GiB)": 302.58, + "step": 151360, + "train_speed(iter/s)": 0.123857 + }, + { + "acc": 0.75282674, + "epoch": 0.846588660980053, + "grad_norm": 6.59375, + "learning_rate": 6.650142140617863e-06, + "loss": 0.96601601, + "memory(GiB)": 302.58, + "step": 151380, + "train_speed(iter/s)": 0.123864 + }, + { + "acc": 0.74036684, + "epoch": 0.8467005104530323, + "grad_norm": 8.8125, + "learning_rate": 6.649269217736729e-06, + "loss": 1.0102704, + "memory(GiB)": 302.58, + "step": 151400, + "train_speed(iter/s)": 0.123871 + }, + { + "acc": 0.74846883, + "epoch": 0.8468123599260116, + "grad_norm": 8.5625, + "learning_rate": 6.648396238445358e-06, + "loss": 1.00170956, + "memory(GiB)": 302.58, + "step": 151420, + "train_speed(iter/s)": 0.123879 + }, + { + "acc": 0.75521598, + "epoch": 0.8469242093989908, + "grad_norm": 7.28125, + "learning_rate": 6.647523202773609e-06, + "loss": 0.96927233, + "memory(GiB)": 302.58, + "step": 151440, + "train_speed(iter/s)": 0.123887 + }, + { + "acc": 0.73223066, + "epoch": 0.8470360588719701, + "grad_norm": 6.03125, + "learning_rate": 6.646650110751342e-06, + "loss": 1.07116756, + "memory(GiB)": 302.58, + "step": 151460, + "train_speed(iter/s)": 0.123895 + }, + { + "acc": 0.74318442, + "epoch": 0.8471479083449494, + "grad_norm": 6.0625, + "learning_rate": 6.64577696240842e-06, + "loss": 1.01265392, + "memory(GiB)": 302.58, + "step": 151480, + "train_speed(iter/s)": 0.123902 + }, + { + "acc": 0.74895844, + "epoch": 0.8472597578179286, + "grad_norm": 7.65625, + "learning_rate": 6.644903757774705e-06, + "loss": 0.96823225, + "memory(GiB)": 302.58, + "step": 151500, + "train_speed(iter/s)": 0.12391 + }, + { + "acc": 0.74331431, + "epoch": 0.8473716072909079, + "grad_norm": 4.65625, + "learning_rate": 6.64403049688007e-06, + "loss": 1.00461674, + "memory(GiB)": 302.58, + "step": 151520, + "train_speed(iter/s)": 0.123918 + }, + { + "acc": 0.74827728, + "epoch": 0.8474834567638871, + "grad_norm": 8.0625, + "learning_rate": 6.643157179754376e-06, + "loss": 1.00276728, + "memory(GiB)": 302.58, + "step": 151540, + "train_speed(iter/s)": 0.123926 + }, + { + "acc": 0.74621034, + "epoch": 0.8475953062368664, + "grad_norm": 6.5, + "learning_rate": 6.642283806427497e-06, + "loss": 1.01276493, + "memory(GiB)": 302.58, + "step": 151560, + "train_speed(iter/s)": 0.123934 + }, + { + "acc": 0.72674618, + "epoch": 0.8477071557098457, + "grad_norm": 4.78125, + "learning_rate": 6.641410376929304e-06, + "loss": 1.08998957, + "memory(GiB)": 302.58, + "step": 151580, + "train_speed(iter/s)": 0.123941 + }, + { + "acc": 0.73406692, + "epoch": 0.8478190051828249, + "grad_norm": 7.46875, + "learning_rate": 6.640536891289673e-06, + "loss": 1.04213219, + "memory(GiB)": 302.58, + "step": 151600, + "train_speed(iter/s)": 0.123949 + }, + { + "acc": 0.74095531, + "epoch": 0.8479308546558042, + "grad_norm": 7.1875, + "learning_rate": 6.639663349538477e-06, + "loss": 1.03860188, + "memory(GiB)": 302.58, + "step": 151620, + "train_speed(iter/s)": 0.123957 + }, + { + "acc": 0.73991585, + "epoch": 0.8480427041287835, + "grad_norm": 7.3125, + "learning_rate": 6.638789751705593e-06, + "loss": 1.02566824, + "memory(GiB)": 302.58, + "step": 151640, + "train_speed(iter/s)": 0.123965 + }, + { + "acc": 0.74869304, + "epoch": 0.8481545536017627, + "grad_norm": 6.46875, + "learning_rate": 6.637916097820905e-06, + "loss": 0.9683754, + "memory(GiB)": 302.58, + "step": 151660, + "train_speed(iter/s)": 0.123972 + }, + { + "acc": 0.73530154, + "epoch": 0.848266403074742, + "grad_norm": 5.90625, + "learning_rate": 6.637042387914294e-06, + "loss": 1.06503611, + "memory(GiB)": 302.58, + "step": 151680, + "train_speed(iter/s)": 0.12398 + }, + { + "acc": 0.74459167, + "epoch": 0.8483782525477213, + "grad_norm": 6.21875, + "learning_rate": 6.636168622015642e-06, + "loss": 0.99198103, + "memory(GiB)": 302.58, + "step": 151700, + "train_speed(iter/s)": 0.123987 + }, + { + "acc": 0.7475059, + "epoch": 0.8484901020207005, + "grad_norm": 8.0625, + "learning_rate": 6.635294800154834e-06, + "loss": 0.98873434, + "memory(GiB)": 302.58, + "step": 151720, + "train_speed(iter/s)": 0.123995 + }, + { + "acc": 0.74875836, + "epoch": 0.8486019514936798, + "grad_norm": 7.71875, + "learning_rate": 6.634420922361758e-06, + "loss": 0.95822811, + "memory(GiB)": 302.58, + "step": 151740, + "train_speed(iter/s)": 0.124003 + }, + { + "acc": 0.72986188, + "epoch": 0.848713800966659, + "grad_norm": 8.625, + "learning_rate": 6.633546988666304e-06, + "loss": 1.0956748, + "memory(GiB)": 302.58, + "step": 151760, + "train_speed(iter/s)": 0.124011 + }, + { + "acc": 0.73514543, + "epoch": 0.8488256504396383, + "grad_norm": 7.15625, + "learning_rate": 6.632672999098364e-06, + "loss": 1.04960022, + "memory(GiB)": 302.58, + "step": 151780, + "train_speed(iter/s)": 0.124018 + }, + { + "acc": 0.74180579, + "epoch": 0.8489374999126176, + "grad_norm": 10.0625, + "learning_rate": 6.63179895368783e-06, + "loss": 1.02656631, + "memory(GiB)": 302.58, + "step": 151800, + "train_speed(iter/s)": 0.124025 + }, + { + "acc": 0.7381546, + "epoch": 0.8490493493855968, + "grad_norm": 7.53125, + "learning_rate": 6.630924852464598e-06, + "loss": 1.01581268, + "memory(GiB)": 302.58, + "step": 151820, + "train_speed(iter/s)": 0.124033 + }, + { + "acc": 0.7417995, + "epoch": 0.8491611988585761, + "grad_norm": 6.90625, + "learning_rate": 6.630050695458564e-06, + "loss": 0.99452038, + "memory(GiB)": 302.58, + "step": 151840, + "train_speed(iter/s)": 0.124041 + }, + { + "acc": 0.73663597, + "epoch": 0.8492730483315554, + "grad_norm": 9.0, + "learning_rate": 6.629176482699629e-06, + "loss": 1.04597349, + "memory(GiB)": 302.58, + "step": 151860, + "train_speed(iter/s)": 0.124048 + }, + { + "acc": 0.72490954, + "epoch": 0.8493848978045346, + "grad_norm": 11.8125, + "learning_rate": 6.62830221421769e-06, + "loss": 1.09976645, + "memory(GiB)": 302.58, + "step": 151880, + "train_speed(iter/s)": 0.124056 + }, + { + "acc": 0.74978957, + "epoch": 0.8494967472775139, + "grad_norm": 8.9375, + "learning_rate": 6.627427890042653e-06, + "loss": 0.99421291, + "memory(GiB)": 302.58, + "step": 151900, + "train_speed(iter/s)": 0.124064 + }, + { + "acc": 0.74441562, + "epoch": 0.8496085967504932, + "grad_norm": 6.875, + "learning_rate": 6.626553510204421e-06, + "loss": 1.00654163, + "memory(GiB)": 302.58, + "step": 151920, + "train_speed(iter/s)": 0.124072 + }, + { + "acc": 0.74817653, + "epoch": 0.8497204462234724, + "grad_norm": 7.78125, + "learning_rate": 6.625679074732902e-06, + "loss": 0.97938719, + "memory(GiB)": 302.58, + "step": 151940, + "train_speed(iter/s)": 0.12408 + }, + { + "acc": 0.74111996, + "epoch": 0.8498322956964517, + "grad_norm": 8.9375, + "learning_rate": 6.624804583658005e-06, + "loss": 1.0322094, + "memory(GiB)": 302.58, + "step": 151960, + "train_speed(iter/s)": 0.124088 + }, + { + "acc": 0.73252401, + "epoch": 0.849944145169431, + "grad_norm": 6.25, + "learning_rate": 6.623930037009637e-06, + "loss": 1.0430728, + "memory(GiB)": 302.58, + "step": 151980, + "train_speed(iter/s)": 0.124095 + }, + { + "acc": 0.74738903, + "epoch": 0.8500559946424102, + "grad_norm": 5.25, + "learning_rate": 6.623055434817714e-06, + "loss": 0.98838024, + "memory(GiB)": 302.58, + "step": 152000, + "train_speed(iter/s)": 0.124103 + }, + { + "epoch": 0.8500559946424102, + "eval_acc": 0.7055355065525781, + "eval_loss": 1.0168380737304688, + "eval_runtime": 7498.6536, + "eval_samples_per_second": 10.04, + "eval_steps_per_second": 10.04, + "step": 152000 + }, + { + "acc": 0.74984274, + "epoch": 0.8501678441153895, + "grad_norm": 7.28125, + "learning_rate": 6.622180777112147e-06, + "loss": 0.96359272, + "memory(GiB)": 302.58, + "step": 152020, + "train_speed(iter/s)": 0.123342 + }, + { + "acc": 0.76203661, + "epoch": 0.8502796935883687, + "grad_norm": 7.65625, + "learning_rate": 6.621306063922854e-06, + "loss": 0.91931925, + "memory(GiB)": 302.58, + "step": 152040, + "train_speed(iter/s)": 0.12335 + }, + { + "acc": 0.74457245, + "epoch": 0.850391543061348, + "grad_norm": 9.0625, + "learning_rate": 6.620431295279752e-06, + "loss": 0.97326813, + "memory(GiB)": 302.58, + "step": 152060, + "train_speed(iter/s)": 0.123358 + }, + { + "acc": 0.75289617, + "epoch": 0.8505033925343273, + "grad_norm": 7.46875, + "learning_rate": 6.619556471212764e-06, + "loss": 0.96339998, + "memory(GiB)": 302.58, + "step": 152080, + "train_speed(iter/s)": 0.123366 + }, + { + "acc": 0.74255557, + "epoch": 0.8506152420073065, + "grad_norm": 7.78125, + "learning_rate": 6.618681591751807e-06, + "loss": 1.01217833, + "memory(GiB)": 302.58, + "step": 152100, + "train_speed(iter/s)": 0.123373 + }, + { + "acc": 0.74351373, + "epoch": 0.8507270914802858, + "grad_norm": 8.5625, + "learning_rate": 6.617806656926807e-06, + "loss": 1.0130682, + "memory(GiB)": 302.58, + "step": 152120, + "train_speed(iter/s)": 0.123381 + }, + { + "acc": 0.7418745, + "epoch": 0.8508389409532651, + "grad_norm": 7.4375, + "learning_rate": 6.61693166676769e-06, + "loss": 1.02036257, + "memory(GiB)": 302.58, + "step": 152140, + "train_speed(iter/s)": 0.123388 + }, + { + "acc": 0.72265534, + "epoch": 0.8509507904262443, + "grad_norm": 5.90625, + "learning_rate": 6.616056621304383e-06, + "loss": 1.09111633, + "memory(GiB)": 302.58, + "step": 152160, + "train_speed(iter/s)": 0.123396 + }, + { + "acc": 0.74450135, + "epoch": 0.8510626398992236, + "grad_norm": 4.4375, + "learning_rate": 6.6151815205668155e-06, + "loss": 1.00952587, + "memory(GiB)": 302.58, + "step": 152180, + "train_speed(iter/s)": 0.123404 + }, + { + "acc": 0.73572497, + "epoch": 0.8511744893722029, + "grad_norm": 7.5625, + "learning_rate": 6.614306364584917e-06, + "loss": 1.05386143, + "memory(GiB)": 302.58, + "step": 152200, + "train_speed(iter/s)": 0.123411 + }, + { + "acc": 0.74236984, + "epoch": 0.8512863388451821, + "grad_norm": 9.6875, + "learning_rate": 6.613431153388623e-06, + "loss": 1.01600914, + "memory(GiB)": 302.58, + "step": 152220, + "train_speed(iter/s)": 0.123419 + }, + { + "acc": 0.74381576, + "epoch": 0.8513981883181614, + "grad_norm": 9.0, + "learning_rate": 6.612555887007868e-06, + "loss": 1.01160345, + "memory(GiB)": 302.58, + "step": 152240, + "train_speed(iter/s)": 0.123427 + }, + { + "acc": 0.75596356, + "epoch": 0.8515100377911407, + "grad_norm": 6.59375, + "learning_rate": 6.611680565472587e-06, + "loss": 0.96692495, + "memory(GiB)": 302.58, + "step": 152260, + "train_speed(iter/s)": 0.123434 + }, + { + "acc": 0.73727541, + "epoch": 0.8516218872641199, + "grad_norm": 7.5, + "learning_rate": 6.610805188812721e-06, + "loss": 1.04127331, + "memory(GiB)": 302.58, + "step": 152280, + "train_speed(iter/s)": 0.123442 + }, + { + "acc": 0.7347291, + "epoch": 0.8517337367370992, + "grad_norm": 5.875, + "learning_rate": 6.60992975705821e-06, + "loss": 1.02135086, + "memory(GiB)": 302.58, + "step": 152300, + "train_speed(iter/s)": 0.12345 + }, + { + "acc": 0.73459373, + "epoch": 0.8518455862100784, + "grad_norm": 7.40625, + "learning_rate": 6.609054270238996e-06, + "loss": 1.04433317, + "memory(GiB)": 302.58, + "step": 152320, + "train_speed(iter/s)": 0.123458 + }, + { + "acc": 0.74016876, + "epoch": 0.8519574356830577, + "grad_norm": 6.15625, + "learning_rate": 6.608178728385023e-06, + "loss": 1.00616093, + "memory(GiB)": 302.58, + "step": 152340, + "train_speed(iter/s)": 0.123466 + }, + { + "acc": 0.74733148, + "epoch": 0.852069285156037, + "grad_norm": 7.71875, + "learning_rate": 6.607303131526239e-06, + "loss": 0.98465624, + "memory(GiB)": 302.58, + "step": 152360, + "train_speed(iter/s)": 0.123474 + }, + { + "acc": 0.7200006, + "epoch": 0.8521811346290162, + "grad_norm": 7.375, + "learning_rate": 6.606427479692592e-06, + "loss": 1.12581482, + "memory(GiB)": 302.58, + "step": 152380, + "train_speed(iter/s)": 0.123482 + }, + { + "acc": 0.73421545, + "epoch": 0.8522929841019955, + "grad_norm": 5.90625, + "learning_rate": 6.605551772914031e-06, + "loss": 1.0764328, + "memory(GiB)": 302.58, + "step": 152400, + "train_speed(iter/s)": 0.12349 + }, + { + "acc": 0.74603667, + "epoch": 0.8524048335749748, + "grad_norm": 6.03125, + "learning_rate": 6.604676011220508e-06, + "loss": 0.95750008, + "memory(GiB)": 302.58, + "step": 152420, + "train_speed(iter/s)": 0.123498 + }, + { + "acc": 0.73552723, + "epoch": 0.852516683047954, + "grad_norm": 7.75, + "learning_rate": 6.603800194641976e-06, + "loss": 1.04938354, + "memory(GiB)": 302.58, + "step": 152440, + "train_speed(iter/s)": 0.123506 + }, + { + "acc": 0.74510741, + "epoch": 0.8526285325209333, + "grad_norm": 7.71875, + "learning_rate": 6.602924323208394e-06, + "loss": 0.98771791, + "memory(GiB)": 302.58, + "step": 152460, + "train_speed(iter/s)": 0.123514 + }, + { + "acc": 0.74822173, + "epoch": 0.8527403819939126, + "grad_norm": 7.40625, + "learning_rate": 6.602048396949715e-06, + "loss": 0.99344158, + "memory(GiB)": 302.58, + "step": 152480, + "train_speed(iter/s)": 0.123522 + }, + { + "acc": 0.74524789, + "epoch": 0.8528522314668918, + "grad_norm": 7.75, + "learning_rate": 6.6011724158959025e-06, + "loss": 1.00281601, + "memory(GiB)": 302.58, + "step": 152500, + "train_speed(iter/s)": 0.12353 + }, + { + "acc": 0.73929672, + "epoch": 0.8529640809398711, + "grad_norm": 6.3125, + "learning_rate": 6.600296380076916e-06, + "loss": 1.01748629, + "memory(GiB)": 302.58, + "step": 152520, + "train_speed(iter/s)": 0.123538 + }, + { + "acc": 0.75312986, + "epoch": 0.8530759304128503, + "grad_norm": 8.25, + "learning_rate": 6.5994202895227185e-06, + "loss": 0.97781811, + "memory(GiB)": 302.58, + "step": 152540, + "train_speed(iter/s)": 0.123546 + }, + { + "acc": 0.74285302, + "epoch": 0.8531877798858296, + "grad_norm": 5.96875, + "learning_rate": 6.598544144263276e-06, + "loss": 1.02868176, + "memory(GiB)": 302.58, + "step": 152560, + "train_speed(iter/s)": 0.123554 + }, + { + "acc": 0.74199991, + "epoch": 0.8532996293588089, + "grad_norm": 6.375, + "learning_rate": 6.597667944328555e-06, + "loss": 1.01251688, + "memory(GiB)": 302.58, + "step": 152580, + "train_speed(iter/s)": 0.123562 + }, + { + "acc": 0.72982426, + "epoch": 0.8534114788317881, + "grad_norm": 5.34375, + "learning_rate": 6.596791689748524e-06, + "loss": 1.06637001, + "memory(GiB)": 302.58, + "step": 152600, + "train_speed(iter/s)": 0.12357 + }, + { + "acc": 0.72470837, + "epoch": 0.8535233283047674, + "grad_norm": 6.71875, + "learning_rate": 6.5959153805531525e-06, + "loss": 1.10897245, + "memory(GiB)": 302.58, + "step": 152620, + "train_speed(iter/s)": 0.123577 + }, + { + "acc": 0.74364982, + "epoch": 0.8536351777777467, + "grad_norm": 8.5625, + "learning_rate": 6.595039016772417e-06, + "loss": 1.00069838, + "memory(GiB)": 302.58, + "step": 152640, + "train_speed(iter/s)": 0.123585 + }, + { + "acc": 0.74454484, + "epoch": 0.8537470272507259, + "grad_norm": 9.1875, + "learning_rate": 6.5941625984362876e-06, + "loss": 0.99943256, + "memory(GiB)": 302.58, + "step": 152660, + "train_speed(iter/s)": 0.123594 + }, + { + "acc": 0.75195265, + "epoch": 0.8538588767237052, + "grad_norm": 6.78125, + "learning_rate": 6.593286125574743e-06, + "loss": 0.97182522, + "memory(GiB)": 302.58, + "step": 152680, + "train_speed(iter/s)": 0.123602 + }, + { + "acc": 0.73422403, + "epoch": 0.8539707261966845, + "grad_norm": 9.0625, + "learning_rate": 6.59240959821776e-06, + "loss": 1.05121279, + "memory(GiB)": 302.58, + "step": 152700, + "train_speed(iter/s)": 0.12361 + }, + { + "acc": 0.74185882, + "epoch": 0.8540825756696637, + "grad_norm": 7.53125, + "learning_rate": 6.591533016395319e-06, + "loss": 1.02149534, + "memory(GiB)": 302.58, + "step": 152720, + "train_speed(iter/s)": 0.123618 + }, + { + "acc": 0.71504512, + "epoch": 0.854194425142643, + "grad_norm": 7.40625, + "learning_rate": 6.590656380137405e-06, + "loss": 1.12959471, + "memory(GiB)": 302.58, + "step": 152740, + "train_speed(iter/s)": 0.123626 + }, + { + "acc": 0.74478059, + "epoch": 0.8543062746156223, + "grad_norm": 8.3125, + "learning_rate": 6.5897796894739966e-06, + "loss": 1.00723896, + "memory(GiB)": 302.58, + "step": 152760, + "train_speed(iter/s)": 0.123634 + }, + { + "acc": 0.74318295, + "epoch": 0.8544181240886015, + "grad_norm": 7.4375, + "learning_rate": 6.588902944435081e-06, + "loss": 1.00090361, + "memory(GiB)": 302.58, + "step": 152780, + "train_speed(iter/s)": 0.123641 + }, + { + "acc": 0.73058081, + "epoch": 0.8545299735615808, + "grad_norm": 7.65625, + "learning_rate": 6.588026145050649e-06, + "loss": 1.04482918, + "memory(GiB)": 302.58, + "step": 152800, + "train_speed(iter/s)": 0.123649 + }, + { + "acc": 0.73485017, + "epoch": 0.85464182303456, + "grad_norm": 4.625, + "learning_rate": 6.587149291350685e-06, + "loss": 1.05529318, + "memory(GiB)": 302.58, + "step": 152820, + "train_speed(iter/s)": 0.123657 + }, + { + "acc": 0.7397913, + "epoch": 0.8547536725075393, + "grad_norm": 7.96875, + "learning_rate": 6.586272383365184e-06, + "loss": 1.04420061, + "memory(GiB)": 302.58, + "step": 152840, + "train_speed(iter/s)": 0.123664 + }, + { + "acc": 0.7400938, + "epoch": 0.8548655219805186, + "grad_norm": 5.15625, + "learning_rate": 6.585395421124138e-06, + "loss": 1.05177851, + "memory(GiB)": 302.58, + "step": 152860, + "train_speed(iter/s)": 0.123672 + }, + { + "acc": 0.73640056, + "epoch": 0.8549773714534978, + "grad_norm": 6.5625, + "learning_rate": 6.58451840465754e-06, + "loss": 1.03857479, + "memory(GiB)": 302.58, + "step": 152880, + "train_speed(iter/s)": 0.123679 + }, + { + "acc": 0.7353004, + "epoch": 0.8550892209264771, + "grad_norm": 9.6875, + "learning_rate": 6.58364133399539e-06, + "loss": 1.03600178, + "memory(GiB)": 302.58, + "step": 152900, + "train_speed(iter/s)": 0.123687 + }, + { + "acc": 0.7256536, + "epoch": 0.8552010703994564, + "grad_norm": 6.375, + "learning_rate": 6.582764209167683e-06, + "loss": 1.10432491, + "memory(GiB)": 302.58, + "step": 152920, + "train_speed(iter/s)": 0.123694 + }, + { + "acc": 0.74964943, + "epoch": 0.8553129198724356, + "grad_norm": 8.125, + "learning_rate": 6.5818870302044214e-06, + "loss": 0.96716213, + "memory(GiB)": 302.58, + "step": 152940, + "train_speed(iter/s)": 0.123702 + }, + { + "acc": 0.73644304, + "epoch": 0.8554247693454149, + "grad_norm": 4.625, + "learning_rate": 6.581009797135608e-06, + "loss": 1.04160995, + "memory(GiB)": 302.58, + "step": 152960, + "train_speed(iter/s)": 0.123709 + }, + { + "acc": 0.73696704, + "epoch": 0.8555366188183942, + "grad_norm": 5.65625, + "learning_rate": 6.580132509991246e-06, + "loss": 1.04433956, + "memory(GiB)": 302.58, + "step": 152980, + "train_speed(iter/s)": 0.123716 + }, + { + "acc": 0.74146476, + "epoch": 0.8556484682913734, + "grad_norm": 7.9375, + "learning_rate": 6.579255168801342e-06, + "loss": 1.03162632, + "memory(GiB)": 302.58, + "step": 153000, + "train_speed(iter/s)": 0.123724 + }, + { + "acc": 0.7491662, + "epoch": 0.8557603177643527, + "grad_norm": 7.75, + "learning_rate": 6.578377773595903e-06, + "loss": 0.97784185, + "memory(GiB)": 302.58, + "step": 153020, + "train_speed(iter/s)": 0.123732 + }, + { + "acc": 0.73380675, + "epoch": 0.855872167237332, + "grad_norm": 6.25, + "learning_rate": 6.577500324404941e-06, + "loss": 1.0654952, + "memory(GiB)": 302.58, + "step": 153040, + "train_speed(iter/s)": 0.123739 + }, + { + "acc": 0.72790294, + "epoch": 0.8559840167103112, + "grad_norm": 7.03125, + "learning_rate": 6.576622821258462e-06, + "loss": 1.081001, + "memory(GiB)": 302.58, + "step": 153060, + "train_speed(iter/s)": 0.123746 + }, + { + "acc": 0.74241529, + "epoch": 0.8560958661832905, + "grad_norm": 4.46875, + "learning_rate": 6.5757452641864864e-06, + "loss": 1.01694593, + "memory(GiB)": 302.58, + "step": 153080, + "train_speed(iter/s)": 0.123754 + }, + { + "acc": 0.75138664, + "epoch": 0.8562077156562697, + "grad_norm": 8.5625, + "learning_rate": 6.574867653219026e-06, + "loss": 0.96485786, + "memory(GiB)": 302.58, + "step": 153100, + "train_speed(iter/s)": 0.123761 + }, + { + "acc": 0.73067045, + "epoch": 0.856319565129249, + "grad_norm": 5.0, + "learning_rate": 6.573989988386098e-06, + "loss": 1.07466421, + "memory(GiB)": 302.58, + "step": 153120, + "train_speed(iter/s)": 0.123769 + }, + { + "acc": 0.74233646, + "epoch": 0.8564314146022283, + "grad_norm": 4.875, + "learning_rate": 6.573112269717721e-06, + "loss": 1.02209387, + "memory(GiB)": 302.58, + "step": 153140, + "train_speed(iter/s)": 0.123777 + }, + { + "acc": 0.73320241, + "epoch": 0.8565432640752075, + "grad_norm": 6.15625, + "learning_rate": 6.5722344972439155e-06, + "loss": 1.05963182, + "memory(GiB)": 302.58, + "step": 153160, + "train_speed(iter/s)": 0.123784 + }, + { + "acc": 0.74250355, + "epoch": 0.8566551135481868, + "grad_norm": 7.09375, + "learning_rate": 6.5713566709947056e-06, + "loss": 1.00542183, + "memory(GiB)": 302.58, + "step": 153180, + "train_speed(iter/s)": 0.123792 + }, + { + "acc": 0.74026299, + "epoch": 0.8567669630211661, + "grad_norm": 8.9375, + "learning_rate": 6.570478791000113e-06, + "loss": 1.04354258, + "memory(GiB)": 302.58, + "step": 153200, + "train_speed(iter/s)": 0.1238 + }, + { + "acc": 0.73931737, + "epoch": 0.8568788124941453, + "grad_norm": 7.28125, + "learning_rate": 6.569600857290168e-06, + "loss": 1.01283712, + "memory(GiB)": 302.58, + "step": 153220, + "train_speed(iter/s)": 0.123807 + }, + { + "acc": 0.76235838, + "epoch": 0.8569906619671246, + "grad_norm": 7.28125, + "learning_rate": 6.568722869894897e-06, + "loss": 0.93809433, + "memory(GiB)": 302.58, + "step": 153240, + "train_speed(iter/s)": 0.123815 + }, + { + "acc": 0.7383584, + "epoch": 0.8571025114401039, + "grad_norm": 5.875, + "learning_rate": 6.567844828844329e-06, + "loss": 1.03208971, + "memory(GiB)": 302.58, + "step": 153260, + "train_speed(iter/s)": 0.123822 + }, + { + "acc": 0.75440345, + "epoch": 0.8572143609130831, + "grad_norm": 6.625, + "learning_rate": 6.566966734168496e-06, + "loss": 0.95262413, + "memory(GiB)": 302.58, + "step": 153280, + "train_speed(iter/s)": 0.123829 + }, + { + "acc": 0.73991947, + "epoch": 0.8573262103860624, + "grad_norm": 6.34375, + "learning_rate": 6.566088585897432e-06, + "loss": 1.02258635, + "memory(GiB)": 302.58, + "step": 153300, + "train_speed(iter/s)": 0.123837 + }, + { + "acc": 0.7500833, + "epoch": 0.8574380598590418, + "grad_norm": 7.15625, + "learning_rate": 6.565210384061172e-06, + "loss": 0.98561373, + "memory(GiB)": 302.58, + "step": 153320, + "train_speed(iter/s)": 0.123844 + }, + { + "acc": 0.73210316, + "epoch": 0.857549909332021, + "grad_norm": 5.59375, + "learning_rate": 6.564332128689755e-06, + "loss": 1.0740531, + "memory(GiB)": 302.58, + "step": 153340, + "train_speed(iter/s)": 0.123852 + }, + { + "acc": 0.75767531, + "epoch": 0.8576617588050003, + "grad_norm": 7.46875, + "learning_rate": 6.563453819813216e-06, + "loss": 0.94243307, + "memory(GiB)": 302.58, + "step": 153360, + "train_speed(iter/s)": 0.123859 + }, + { + "acc": 0.73292384, + "epoch": 0.8577736082779795, + "grad_norm": 7.625, + "learning_rate": 6.562575457461601e-06, + "loss": 1.06665974, + "memory(GiB)": 302.58, + "step": 153380, + "train_speed(iter/s)": 0.123867 + }, + { + "acc": 0.74535246, + "epoch": 0.8578854577509588, + "grad_norm": 8.875, + "learning_rate": 6.5616970416649505e-06, + "loss": 0.99814329, + "memory(GiB)": 302.58, + "step": 153400, + "train_speed(iter/s)": 0.123875 + }, + { + "acc": 0.73234296, + "epoch": 0.8579973072239381, + "grad_norm": 6.625, + "learning_rate": 6.560818572453309e-06, + "loss": 1.04764891, + "memory(GiB)": 302.58, + "step": 153420, + "train_speed(iter/s)": 0.123882 + }, + { + "acc": 0.73886747, + "epoch": 0.8581091566969173, + "grad_norm": 8.9375, + "learning_rate": 6.559940049856723e-06, + "loss": 1.02548504, + "memory(GiB)": 302.58, + "step": 153440, + "train_speed(iter/s)": 0.123889 + }, + { + "acc": 0.74486909, + "epoch": 0.8582210061698966, + "grad_norm": 10.875, + "learning_rate": 6.559061473905242e-06, + "loss": 0.99653378, + "memory(GiB)": 302.58, + "step": 153460, + "train_speed(iter/s)": 0.123897 + }, + { + "acc": 0.75120378, + "epoch": 0.8583328556428759, + "grad_norm": 6.84375, + "learning_rate": 6.558182844628913e-06, + "loss": 0.998563, + "memory(GiB)": 302.58, + "step": 153480, + "train_speed(iter/s)": 0.123905 + }, + { + "acc": 0.73954382, + "epoch": 0.8584447051158551, + "grad_norm": 9.5, + "learning_rate": 6.55730416205779e-06, + "loss": 1.04093304, + "memory(GiB)": 302.58, + "step": 153500, + "train_speed(iter/s)": 0.123912 + }, + { + "acc": 0.74944606, + "epoch": 0.8585565545888344, + "grad_norm": 7.71875, + "learning_rate": 6.556425426221927e-06, + "loss": 0.97412119, + "memory(GiB)": 302.58, + "step": 153520, + "train_speed(iter/s)": 0.12392 + }, + { + "acc": 0.72166896, + "epoch": 0.8586684040618137, + "grad_norm": 7.84375, + "learning_rate": 6.55554663715138e-06, + "loss": 1.10261717, + "memory(GiB)": 302.58, + "step": 153540, + "train_speed(iter/s)": 0.123928 + }, + { + "acc": 0.73263612, + "epoch": 0.8587802535347929, + "grad_norm": 8.125, + "learning_rate": 6.554667794876206e-06, + "loss": 1.07010193, + "memory(GiB)": 302.58, + "step": 153560, + "train_speed(iter/s)": 0.123935 + }, + { + "acc": 0.76175671, + "epoch": 0.8588921030077722, + "grad_norm": 7.34375, + "learning_rate": 6.553788899426462e-06, + "loss": 0.93476171, + "memory(GiB)": 302.58, + "step": 153580, + "train_speed(iter/s)": 0.123943 + }, + { + "acc": 0.74542522, + "epoch": 0.8590039524807515, + "grad_norm": 8.0, + "learning_rate": 6.552909950832211e-06, + "loss": 0.98360186, + "memory(GiB)": 302.58, + "step": 153600, + "train_speed(iter/s)": 0.123951 + }, + { + "acc": 0.73350058, + "epoch": 0.8591158019537307, + "grad_norm": 7.65625, + "learning_rate": 6.5520309491235165e-06, + "loss": 1.05517998, + "memory(GiB)": 302.58, + "step": 153620, + "train_speed(iter/s)": 0.123958 + }, + { + "acc": 0.72937441, + "epoch": 0.85922765142671, + "grad_norm": 5.46875, + "learning_rate": 6.5511518943304405e-06, + "loss": 1.06147499, + "memory(GiB)": 302.58, + "step": 153640, + "train_speed(iter/s)": 0.123965 + }, + { + "acc": 0.75244598, + "epoch": 0.8593395008996892, + "grad_norm": 7.46875, + "learning_rate": 6.550272786483053e-06, + "loss": 0.99496717, + "memory(GiB)": 302.58, + "step": 153660, + "train_speed(iter/s)": 0.123973 + }, + { + "acc": 0.74759717, + "epoch": 0.8594513503726685, + "grad_norm": 9.25, + "learning_rate": 6.54939362561142e-06, + "loss": 0.98447046, + "memory(GiB)": 302.58, + "step": 153680, + "train_speed(iter/s)": 0.123981 + }, + { + "acc": 0.74015598, + "epoch": 0.8595631998456478, + "grad_norm": 5.9375, + "learning_rate": 6.5485144117456125e-06, + "loss": 1.02062998, + "memory(GiB)": 302.58, + "step": 153700, + "train_speed(iter/s)": 0.123989 + }, + { + "acc": 0.7382453, + "epoch": 0.859675049318627, + "grad_norm": 6.59375, + "learning_rate": 6.5476351449157004e-06, + "loss": 1.02562199, + "memory(GiB)": 302.58, + "step": 153720, + "train_speed(iter/s)": 0.123997 + }, + { + "acc": 0.74207549, + "epoch": 0.8597868987916063, + "grad_norm": 7.5, + "learning_rate": 6.54675582515176e-06, + "loss": 1.017383, + "memory(GiB)": 302.58, + "step": 153740, + "train_speed(iter/s)": 0.124004 + }, + { + "acc": 0.75088367, + "epoch": 0.8598987482645856, + "grad_norm": 6.0625, + "learning_rate": 6.545876452483865e-06, + "loss": 0.98107691, + "memory(GiB)": 302.58, + "step": 153760, + "train_speed(iter/s)": 0.124012 + }, + { + "acc": 0.74841132, + "epoch": 0.8600105977375648, + "grad_norm": 8.625, + "learning_rate": 6.544997026942094e-06, + "loss": 0.99135361, + "memory(GiB)": 302.58, + "step": 153780, + "train_speed(iter/s)": 0.12402 + }, + { + "acc": 0.73945432, + "epoch": 0.8601224472105441, + "grad_norm": 5.28125, + "learning_rate": 6.5441175485565255e-06, + "loss": 1.00016451, + "memory(GiB)": 302.58, + "step": 153800, + "train_speed(iter/s)": 0.124027 + }, + { + "acc": 0.73627858, + "epoch": 0.8602342966835234, + "grad_norm": 6.78125, + "learning_rate": 6.543238017357241e-06, + "loss": 1.02624464, + "memory(GiB)": 302.58, + "step": 153820, + "train_speed(iter/s)": 0.124035 + }, + { + "acc": 0.73853421, + "epoch": 0.8603461461565026, + "grad_norm": 7.4375, + "learning_rate": 6.542358433374321e-06, + "loss": 1.03439236, + "memory(GiB)": 302.58, + "step": 153840, + "train_speed(iter/s)": 0.124043 + }, + { + "acc": 0.73191514, + "epoch": 0.8604579956294819, + "grad_norm": 8.0, + "learning_rate": 6.5414787966378545e-06, + "loss": 1.06282902, + "memory(GiB)": 302.58, + "step": 153860, + "train_speed(iter/s)": 0.12405 + }, + { + "acc": 0.75509572, + "epoch": 0.8605698451024612, + "grad_norm": 5.90625, + "learning_rate": 6.540599107177924e-06, + "loss": 0.96644058, + "memory(GiB)": 302.58, + "step": 153880, + "train_speed(iter/s)": 0.124058 + }, + { + "acc": 0.74556031, + "epoch": 0.8606816945754404, + "grad_norm": 6.46875, + "learning_rate": 6.539719365024618e-06, + "loss": 0.9959815, + "memory(GiB)": 302.58, + "step": 153900, + "train_speed(iter/s)": 0.124065 + }, + { + "acc": 0.74755163, + "epoch": 0.8607935440484197, + "grad_norm": 7.96875, + "learning_rate": 6.538839570208029e-06, + "loss": 0.98351307, + "memory(GiB)": 302.58, + "step": 153920, + "train_speed(iter/s)": 0.124073 + }, + { + "acc": 0.73652554, + "epoch": 0.8609053935213989, + "grad_norm": 6.65625, + "learning_rate": 6.537959722758246e-06, + "loss": 1.03019638, + "memory(GiB)": 302.58, + "step": 153940, + "train_speed(iter/s)": 0.12408 + }, + { + "acc": 0.73472915, + "epoch": 0.8610172429943782, + "grad_norm": 8.5625, + "learning_rate": 6.5370798227053645e-06, + "loss": 1.05412445, + "memory(GiB)": 302.58, + "step": 153960, + "train_speed(iter/s)": 0.124087 + }, + { + "acc": 0.75681605, + "epoch": 0.8611290924673575, + "grad_norm": 7.59375, + "learning_rate": 6.536199870079478e-06, + "loss": 0.93505516, + "memory(GiB)": 302.58, + "step": 153980, + "train_speed(iter/s)": 0.124095 + }, + { + "acc": 0.72938161, + "epoch": 0.8612409419403367, + "grad_norm": 5.1875, + "learning_rate": 6.535319864910685e-06, + "loss": 1.08480711, + "memory(GiB)": 302.58, + "step": 154000, + "train_speed(iter/s)": 0.124102 + }, + { + "epoch": 0.8612409419403367, + "eval_acc": 0.7056105855580727, + "eval_loss": 1.0163832902908325, + "eval_runtime": 7508.3542, + "eval_samples_per_second": 10.027, + "eval_steps_per_second": 10.027, + "step": 154000 + }, + { + "acc": 0.74790211, + "epoch": 0.861352791413316, + "grad_norm": 11.375, + "learning_rate": 6.534439807229086e-06, + "loss": 0.97955294, + "memory(GiB)": 302.58, + "step": 154020, + "train_speed(iter/s)": 0.123351 + }, + { + "acc": 0.75769787, + "epoch": 0.8614646408862953, + "grad_norm": 5.875, + "learning_rate": 6.53355969706478e-06, + "loss": 0.93537502, + "memory(GiB)": 302.58, + "step": 154040, + "train_speed(iter/s)": 0.123358 + }, + { + "acc": 0.74754539, + "epoch": 0.8615764903592745, + "grad_norm": 6.625, + "learning_rate": 6.532679534447868e-06, + "loss": 0.98654966, + "memory(GiB)": 302.58, + "step": 154060, + "train_speed(iter/s)": 0.123366 + }, + { + "acc": 0.73129516, + "epoch": 0.8616883398322538, + "grad_norm": 6.15625, + "learning_rate": 6.531799319408455e-06, + "loss": 1.06262293, + "memory(GiB)": 302.58, + "step": 154080, + "train_speed(iter/s)": 0.123374 + }, + { + "acc": 0.75219378, + "epoch": 0.861800189305233, + "grad_norm": 7.875, + "learning_rate": 6.5309190519766495e-06, + "loss": 0.97324438, + "memory(GiB)": 302.58, + "step": 154100, + "train_speed(iter/s)": 0.123381 + }, + { + "acc": 0.75380869, + "epoch": 0.8619120387782123, + "grad_norm": 5.34375, + "learning_rate": 6.530038732182559e-06, + "loss": 0.96224852, + "memory(GiB)": 302.58, + "step": 154120, + "train_speed(iter/s)": 0.123389 + }, + { + "acc": 0.74658427, + "epoch": 0.8620238882511916, + "grad_norm": 4.5625, + "learning_rate": 6.529158360056291e-06, + "loss": 1.00708942, + "memory(GiB)": 302.58, + "step": 154140, + "train_speed(iter/s)": 0.123395 + }, + { + "acc": 0.73867106, + "epoch": 0.8621357377241708, + "grad_norm": 7.125, + "learning_rate": 6.528277935627959e-06, + "loss": 1.02868786, + "memory(GiB)": 302.58, + "step": 154160, + "train_speed(iter/s)": 0.123403 + }, + { + "acc": 0.75012298, + "epoch": 0.8622475871971501, + "grad_norm": 7.59375, + "learning_rate": 6.527397458927676e-06, + "loss": 0.97821016, + "memory(GiB)": 302.58, + "step": 154180, + "train_speed(iter/s)": 0.12341 + }, + { + "acc": 0.72817187, + "epoch": 0.8623594366701294, + "grad_norm": 6.0, + "learning_rate": 6.526516929985553e-06, + "loss": 1.06993093, + "memory(GiB)": 302.58, + "step": 154200, + "train_speed(iter/s)": 0.123418 + }, + { + "acc": 0.73303871, + "epoch": 0.8624712861431086, + "grad_norm": 6.875, + "learning_rate": 6.525636348831714e-06, + "loss": 1.01741486, + "memory(GiB)": 302.58, + "step": 154220, + "train_speed(iter/s)": 0.123425 + }, + { + "acc": 0.74004388, + "epoch": 0.8625831356160879, + "grad_norm": 6.6875, + "learning_rate": 6.524755715496273e-06, + "loss": 1.02712803, + "memory(GiB)": 302.58, + "step": 154240, + "train_speed(iter/s)": 0.123433 + }, + { + "acc": 0.71854115, + "epoch": 0.8626949850890672, + "grad_norm": 5.4375, + "learning_rate": 6.5238750300093535e-06, + "loss": 1.12654858, + "memory(GiB)": 302.58, + "step": 154260, + "train_speed(iter/s)": 0.123441 + }, + { + "acc": 0.7373178, + "epoch": 0.8628068345620464, + "grad_norm": 4.21875, + "learning_rate": 6.5229942924010744e-06, + "loss": 1.02446718, + "memory(GiB)": 302.58, + "step": 154280, + "train_speed(iter/s)": 0.123448 + }, + { + "acc": 0.72522206, + "epoch": 0.8629186840350257, + "grad_norm": 6.53125, + "learning_rate": 6.522113502701562e-06, + "loss": 1.10875835, + "memory(GiB)": 302.58, + "step": 154300, + "train_speed(iter/s)": 0.123455 + }, + { + "acc": 0.73075247, + "epoch": 0.863030533508005, + "grad_norm": 5.78125, + "learning_rate": 6.521232660940941e-06, + "loss": 1.08261957, + "memory(GiB)": 302.58, + "step": 154320, + "train_speed(iter/s)": 0.123462 + }, + { + "acc": 0.74929528, + "epoch": 0.8631423829809842, + "grad_norm": 10.0, + "learning_rate": 6.520351767149337e-06, + "loss": 0.99666805, + "memory(GiB)": 302.58, + "step": 154340, + "train_speed(iter/s)": 0.12347 + }, + { + "acc": 0.75195479, + "epoch": 0.8632542324539635, + "grad_norm": 5.96875, + "learning_rate": 6.5194708213568846e-06, + "loss": 0.97894878, + "memory(GiB)": 302.58, + "step": 154360, + "train_speed(iter/s)": 0.123478 + }, + { + "acc": 0.75003977, + "epoch": 0.8633660819269428, + "grad_norm": 9.75, + "learning_rate": 6.5185898235937106e-06, + "loss": 1.00235262, + "memory(GiB)": 302.58, + "step": 154380, + "train_speed(iter/s)": 0.123485 + }, + { + "acc": 0.75036602, + "epoch": 0.863477931399922, + "grad_norm": 4.34375, + "learning_rate": 6.51770877388995e-06, + "loss": 0.96250486, + "memory(GiB)": 302.58, + "step": 154400, + "train_speed(iter/s)": 0.123492 + }, + { + "acc": 0.736199, + "epoch": 0.8635897808729013, + "grad_norm": 8.375, + "learning_rate": 6.516827672275738e-06, + "loss": 1.02102842, + "memory(GiB)": 302.58, + "step": 154420, + "train_speed(iter/s)": 0.1235 + }, + { + "acc": 0.73453422, + "epoch": 0.8637016303458805, + "grad_norm": 6.9375, + "learning_rate": 6.515946518781208e-06, + "loss": 1.04198503, + "memory(GiB)": 302.58, + "step": 154440, + "train_speed(iter/s)": 0.123508 + }, + { + "acc": 0.7338501, + "epoch": 0.8638134798188598, + "grad_norm": 8.5625, + "learning_rate": 6.5150653134365e-06, + "loss": 1.04007044, + "memory(GiB)": 302.58, + "step": 154460, + "train_speed(iter/s)": 0.123515 + }, + { + "acc": 0.75263524, + "epoch": 0.8639253292918391, + "grad_norm": 8.125, + "learning_rate": 6.514184056271754e-06, + "loss": 0.98686037, + "memory(GiB)": 302.58, + "step": 154480, + "train_speed(iter/s)": 0.123523 + }, + { + "acc": 0.72683663, + "epoch": 0.8640371787648183, + "grad_norm": 4.25, + "learning_rate": 6.513302747317113e-06, + "loss": 1.08104715, + "memory(GiB)": 302.58, + "step": 154500, + "train_speed(iter/s)": 0.123531 + }, + { + "acc": 0.7454422, + "epoch": 0.8641490282377976, + "grad_norm": 10.875, + "learning_rate": 6.512421386602719e-06, + "loss": 1.00243883, + "memory(GiB)": 302.58, + "step": 154520, + "train_speed(iter/s)": 0.123537 + }, + { + "acc": 0.74343376, + "epoch": 0.8642608777107769, + "grad_norm": 7.84375, + "learning_rate": 6.5115399741587185e-06, + "loss": 1.01181717, + "memory(GiB)": 302.58, + "step": 154540, + "train_speed(iter/s)": 0.123545 + }, + { + "acc": 0.73300328, + "epoch": 0.8643727271837561, + "grad_norm": 8.8125, + "learning_rate": 6.5106585100152574e-06, + "loss": 1.06152, + "memory(GiB)": 302.58, + "step": 154560, + "train_speed(iter/s)": 0.123553 + }, + { + "acc": 0.74852424, + "epoch": 0.8644845766567354, + "grad_norm": 7.6875, + "learning_rate": 6.509776994202486e-06, + "loss": 0.99998064, + "memory(GiB)": 302.58, + "step": 154580, + "train_speed(iter/s)": 0.12356 + }, + { + "acc": 0.74903741, + "epoch": 0.8645964261297147, + "grad_norm": 9.0, + "learning_rate": 6.5088954267505514e-06, + "loss": 0.96752672, + "memory(GiB)": 302.58, + "step": 154600, + "train_speed(iter/s)": 0.123567 + }, + { + "acc": 0.72571402, + "epoch": 0.8647082756026939, + "grad_norm": 5.59375, + "learning_rate": 6.50801380768961e-06, + "loss": 1.0792964, + "memory(GiB)": 302.58, + "step": 154620, + "train_speed(iter/s)": 0.123575 + }, + { + "acc": 0.74179511, + "epoch": 0.8648201250756732, + "grad_norm": 5.71875, + "learning_rate": 6.507132137049814e-06, + "loss": 1.00161295, + "memory(GiB)": 302.58, + "step": 154640, + "train_speed(iter/s)": 0.123583 + }, + { + "acc": 0.72833652, + "epoch": 0.8649319745486524, + "grad_norm": 5.71875, + "learning_rate": 6.506250414861321e-06, + "loss": 1.07022591, + "memory(GiB)": 302.58, + "step": 154660, + "train_speed(iter/s)": 0.12359 + }, + { + "acc": 0.73711848, + "epoch": 0.8650438240216317, + "grad_norm": 5.65625, + "learning_rate": 6.505368641154288e-06, + "loss": 1.05215502, + "memory(GiB)": 302.58, + "step": 154680, + "train_speed(iter/s)": 0.123598 + }, + { + "acc": 0.73649077, + "epoch": 0.865155673494611, + "grad_norm": 6.21875, + "learning_rate": 6.504486815958874e-06, + "loss": 1.03173304, + "memory(GiB)": 302.58, + "step": 154700, + "train_speed(iter/s)": 0.123606 + }, + { + "acc": 0.7382575, + "epoch": 0.8652675229675902, + "grad_norm": 7.09375, + "learning_rate": 6.5036049393052394e-06, + "loss": 1.0292119, + "memory(GiB)": 302.58, + "step": 154720, + "train_speed(iter/s)": 0.123614 + }, + { + "acc": 0.74512086, + "epoch": 0.8653793724405695, + "grad_norm": 6.09375, + "learning_rate": 6.502723011223548e-06, + "loss": 0.9968441, + "memory(GiB)": 302.58, + "step": 154740, + "train_speed(iter/s)": 0.123621 + }, + { + "acc": 0.75230627, + "epoch": 0.8654912219135488, + "grad_norm": 4.40625, + "learning_rate": 6.5018410317439664e-06, + "loss": 0.9649147, + "memory(GiB)": 302.58, + "step": 154760, + "train_speed(iter/s)": 0.123629 + }, + { + "acc": 0.75220003, + "epoch": 0.865603071386528, + "grad_norm": 9.125, + "learning_rate": 6.500959000896656e-06, + "loss": 0.98345442, + "memory(GiB)": 302.58, + "step": 154780, + "train_speed(iter/s)": 0.123637 + }, + { + "acc": 0.73816853, + "epoch": 0.8657149208595073, + "grad_norm": 5.9375, + "learning_rate": 6.500076918711791e-06, + "loss": 1.03416185, + "memory(GiB)": 302.58, + "step": 154800, + "train_speed(iter/s)": 0.123644 + }, + { + "acc": 0.7238821, + "epoch": 0.8658267703324866, + "grad_norm": 7.6875, + "learning_rate": 6.499194785219538e-06, + "loss": 1.10482349, + "memory(GiB)": 302.58, + "step": 154820, + "train_speed(iter/s)": 0.123652 + }, + { + "acc": 0.74226546, + "epoch": 0.8659386198054658, + "grad_norm": 7.6875, + "learning_rate": 6.498312600450071e-06, + "loss": 1.01121006, + "memory(GiB)": 302.58, + "step": 154840, + "train_speed(iter/s)": 0.12366 + }, + { + "acc": 0.75902472, + "epoch": 0.8660504692784451, + "grad_norm": 7.5625, + "learning_rate": 6.49743036443356e-06, + "loss": 0.93999872, + "memory(GiB)": 302.58, + "step": 154860, + "train_speed(iter/s)": 0.123668 + }, + { + "acc": 0.74786358, + "epoch": 0.8661623187514244, + "grad_norm": 6.375, + "learning_rate": 6.496548077200183e-06, + "loss": 0.97556915, + "memory(GiB)": 302.58, + "step": 154880, + "train_speed(iter/s)": 0.123675 + }, + { + "acc": 0.74415207, + "epoch": 0.8662741682244036, + "grad_norm": 6.21875, + "learning_rate": 6.495665738780117e-06, + "loss": 0.98361311, + "memory(GiB)": 302.58, + "step": 154900, + "train_speed(iter/s)": 0.123682 + }, + { + "acc": 0.74586701, + "epoch": 0.8663860176973829, + "grad_norm": 7.3125, + "learning_rate": 6.494783349203538e-06, + "loss": 0.99158278, + "memory(GiB)": 302.58, + "step": 154920, + "train_speed(iter/s)": 0.12369 + }, + { + "acc": 0.72240577, + "epoch": 0.8664978671703621, + "grad_norm": 4.96875, + "learning_rate": 6.493900908500631e-06, + "loss": 1.11737213, + "memory(GiB)": 302.58, + "step": 154940, + "train_speed(iter/s)": 0.123697 + }, + { + "acc": 0.73611121, + "epoch": 0.8666097166433414, + "grad_norm": 4.84375, + "learning_rate": 6.493018416701576e-06, + "loss": 1.05076551, + "memory(GiB)": 302.58, + "step": 154960, + "train_speed(iter/s)": 0.123705 + }, + { + "acc": 0.74257493, + "epoch": 0.8667215661163207, + "grad_norm": 6.53125, + "learning_rate": 6.492135873836554e-06, + "loss": 0.9916853, + "memory(GiB)": 302.58, + "step": 154980, + "train_speed(iter/s)": 0.123712 + }, + { + "acc": 0.74726729, + "epoch": 0.8668334155892999, + "grad_norm": 8.1875, + "learning_rate": 6.491253279935757e-06, + "loss": 1.00332355, + "memory(GiB)": 302.58, + "step": 155000, + "train_speed(iter/s)": 0.123719 + }, + { + "acc": 0.74790225, + "epoch": 0.8669452650622792, + "grad_norm": 6.625, + "learning_rate": 6.490370635029368e-06, + "loss": 0.99598045, + "memory(GiB)": 302.58, + "step": 155020, + "train_speed(iter/s)": 0.123727 + }, + { + "acc": 0.75820851, + "epoch": 0.8670571145352585, + "grad_norm": 8.0625, + "learning_rate": 6.4894879391475776e-06, + "loss": 0.95913162, + "memory(GiB)": 302.58, + "step": 155040, + "train_speed(iter/s)": 0.123734 + }, + { + "acc": 0.74515896, + "epoch": 0.8671689640082377, + "grad_norm": 8.6875, + "learning_rate": 6.488605192320575e-06, + "loss": 1.00159388, + "memory(GiB)": 302.58, + "step": 155060, + "train_speed(iter/s)": 0.123742 + }, + { + "acc": 0.74400673, + "epoch": 0.867280813481217, + "grad_norm": 7.6875, + "learning_rate": 6.487722394578555e-06, + "loss": 0.98862391, + "memory(GiB)": 302.58, + "step": 155080, + "train_speed(iter/s)": 0.123749 + }, + { + "acc": 0.75223136, + "epoch": 0.8673926629541963, + "grad_norm": 7.4375, + "learning_rate": 6.486839545951713e-06, + "loss": 0.97973328, + "memory(GiB)": 302.58, + "step": 155100, + "train_speed(iter/s)": 0.123756 + }, + { + "acc": 0.74652991, + "epoch": 0.8675045124271755, + "grad_norm": 7.15625, + "learning_rate": 6.485956646470244e-06, + "loss": 0.99698238, + "memory(GiB)": 302.58, + "step": 155120, + "train_speed(iter/s)": 0.123764 + }, + { + "acc": 0.73247046, + "epoch": 0.8676163619001548, + "grad_norm": 5.59375, + "learning_rate": 6.485073696164346e-06, + "loss": 1.03610382, + "memory(GiB)": 302.58, + "step": 155140, + "train_speed(iter/s)": 0.123771 + }, + { + "acc": 0.74423752, + "epoch": 0.867728211373134, + "grad_norm": 10.5, + "learning_rate": 6.484190695064217e-06, + "loss": 0.99800758, + "memory(GiB)": 302.58, + "step": 155160, + "train_speed(iter/s)": 0.123778 + }, + { + "acc": 0.75041528, + "epoch": 0.8678400608461133, + "grad_norm": 6.3125, + "learning_rate": 6.483307643200061e-06, + "loss": 0.9984005, + "memory(GiB)": 302.58, + "step": 155180, + "train_speed(iter/s)": 0.123786 + }, + { + "acc": 0.74373426, + "epoch": 0.8679519103190926, + "grad_norm": 7.59375, + "learning_rate": 6.482424540602079e-06, + "loss": 1.0150774, + "memory(GiB)": 302.58, + "step": 155200, + "train_speed(iter/s)": 0.123794 + }, + { + "acc": 0.75082989, + "epoch": 0.8680637597920718, + "grad_norm": 6.3125, + "learning_rate": 6.481541387300477e-06, + "loss": 0.99401264, + "memory(GiB)": 302.58, + "step": 155220, + "train_speed(iter/s)": 0.123801 + }, + { + "acc": 0.75512762, + "epoch": 0.8681756092650511, + "grad_norm": 5.84375, + "learning_rate": 6.480658183325463e-06, + "loss": 0.9304224, + "memory(GiB)": 302.58, + "step": 155240, + "train_speed(iter/s)": 0.123809 + }, + { + "acc": 0.74328732, + "epoch": 0.8682874587380304, + "grad_norm": 4.28125, + "learning_rate": 6.4797749287072444e-06, + "loss": 0.99980659, + "memory(GiB)": 302.58, + "step": 155260, + "train_speed(iter/s)": 0.123816 + }, + { + "acc": 0.74181294, + "epoch": 0.8683993082110096, + "grad_norm": 9.125, + "learning_rate": 6.4788916234760305e-06, + "loss": 1.01324644, + "memory(GiB)": 302.58, + "step": 155280, + "train_speed(iter/s)": 0.123824 + }, + { + "acc": 0.73804564, + "epoch": 0.8685111576839889, + "grad_norm": 10.25, + "learning_rate": 6.478008267662033e-06, + "loss": 1.04910727, + "memory(GiB)": 302.58, + "step": 155300, + "train_speed(iter/s)": 0.123831 + }, + { + "acc": 0.73932667, + "epoch": 0.8686230071569682, + "grad_norm": 6.875, + "learning_rate": 6.477124861295468e-06, + "loss": 1.04672346, + "memory(GiB)": 302.58, + "step": 155320, + "train_speed(iter/s)": 0.123838 + }, + { + "acc": 0.74666634, + "epoch": 0.8687348566299474, + "grad_norm": 7.15625, + "learning_rate": 6.476241404406547e-06, + "loss": 0.98671646, + "memory(GiB)": 302.58, + "step": 155340, + "train_speed(iter/s)": 0.123845 + }, + { + "acc": 0.74308205, + "epoch": 0.8688467061029267, + "grad_norm": 7.375, + "learning_rate": 6.47535789702549e-06, + "loss": 1.02635155, + "memory(GiB)": 302.58, + "step": 155360, + "train_speed(iter/s)": 0.123853 + }, + { + "acc": 0.74947367, + "epoch": 0.868958555575906, + "grad_norm": 8.125, + "learning_rate": 6.4744743391825125e-06, + "loss": 0.97389708, + "memory(GiB)": 302.58, + "step": 155380, + "train_speed(iter/s)": 0.123861 + }, + { + "acc": 0.72706394, + "epoch": 0.8690704050488852, + "grad_norm": 10.0, + "learning_rate": 6.473590730907839e-06, + "loss": 1.06686563, + "memory(GiB)": 302.58, + "step": 155400, + "train_speed(iter/s)": 0.123868 + }, + { + "acc": 0.74885435, + "epoch": 0.8691822545218645, + "grad_norm": 6.6875, + "learning_rate": 6.47270707223169e-06, + "loss": 0.98910933, + "memory(GiB)": 302.58, + "step": 155420, + "train_speed(iter/s)": 0.123876 + }, + { + "acc": 0.76341081, + "epoch": 0.8692941039948437, + "grad_norm": 6.65625, + "learning_rate": 6.4718233631842896e-06, + "loss": 0.91328325, + "memory(GiB)": 302.58, + "step": 155440, + "train_speed(iter/s)": 0.123883 + }, + { + "acc": 0.76648974, + "epoch": 0.869405953467823, + "grad_norm": 6.84375, + "learning_rate": 6.470939603795863e-06, + "loss": 0.90063772, + "memory(GiB)": 302.58, + "step": 155460, + "train_speed(iter/s)": 0.123891 + }, + { + "acc": 0.75508499, + "epoch": 0.8695178029408023, + "grad_norm": 7.0625, + "learning_rate": 6.470055794096637e-06, + "loss": 0.95877171, + "memory(GiB)": 302.58, + "step": 155480, + "train_speed(iter/s)": 0.123899 + }, + { + "acc": 0.74347034, + "epoch": 0.8696296524137815, + "grad_norm": 7.625, + "learning_rate": 6.469171934116841e-06, + "loss": 1.00580091, + "memory(GiB)": 302.58, + "step": 155500, + "train_speed(iter/s)": 0.123907 + }, + { + "acc": 0.75158482, + "epoch": 0.8697415018867608, + "grad_norm": 6.65625, + "learning_rate": 6.468288023886708e-06, + "loss": 0.9792881, + "memory(GiB)": 302.58, + "step": 155520, + "train_speed(iter/s)": 0.123915 + }, + { + "acc": 0.73599954, + "epoch": 0.8698533513597401, + "grad_norm": 7.25, + "learning_rate": 6.467404063436467e-06, + "loss": 1.03797216, + "memory(GiB)": 302.58, + "step": 155540, + "train_speed(iter/s)": 0.123923 + }, + { + "acc": 0.73194489, + "epoch": 0.8699652008327193, + "grad_norm": 7.53125, + "learning_rate": 6.466520052796354e-06, + "loss": 1.07620344, + "memory(GiB)": 302.58, + "step": 155560, + "train_speed(iter/s)": 0.12393 + }, + { + "acc": 0.74465871, + "epoch": 0.8700770503056986, + "grad_norm": 5.90625, + "learning_rate": 6.465635991996607e-06, + "loss": 1.00371714, + "memory(GiB)": 302.58, + "step": 155580, + "train_speed(iter/s)": 0.123937 + }, + { + "acc": 0.74051995, + "epoch": 0.8701888997786779, + "grad_norm": 6.84375, + "learning_rate": 6.46475188106746e-06, + "loss": 1.01692972, + "memory(GiB)": 302.58, + "step": 155600, + "train_speed(iter/s)": 0.123945 + }, + { + "acc": 0.74975572, + "epoch": 0.8703007492516571, + "grad_norm": 9.0625, + "learning_rate": 6.463867720039156e-06, + "loss": 0.98748779, + "memory(GiB)": 302.58, + "step": 155620, + "train_speed(iter/s)": 0.123952 + }, + { + "acc": 0.72942047, + "epoch": 0.8704125987246364, + "grad_norm": 6.78125, + "learning_rate": 6.462983508941932e-06, + "loss": 1.07640305, + "memory(GiB)": 302.58, + "step": 155640, + "train_speed(iter/s)": 0.12396 + }, + { + "acc": 0.73876343, + "epoch": 0.8705244481976157, + "grad_norm": 10.9375, + "learning_rate": 6.462099247806033e-06, + "loss": 1.01277094, + "memory(GiB)": 302.58, + "step": 155660, + "train_speed(iter/s)": 0.123967 + }, + { + "acc": 0.74499946, + "epoch": 0.8706362976705949, + "grad_norm": 6.375, + "learning_rate": 6.461214936661704e-06, + "loss": 1.01247816, + "memory(GiB)": 302.58, + "step": 155680, + "train_speed(iter/s)": 0.123974 + }, + { + "acc": 0.75814166, + "epoch": 0.8707481471435742, + "grad_norm": 5.25, + "learning_rate": 6.460330575539191e-06, + "loss": 0.95197868, + "memory(GiB)": 302.58, + "step": 155700, + "train_speed(iter/s)": 0.123981 + }, + { + "acc": 0.75226526, + "epoch": 0.8708599966165534, + "grad_norm": 7.625, + "learning_rate": 6.459446164468743e-06, + "loss": 0.94584332, + "memory(GiB)": 302.58, + "step": 155720, + "train_speed(iter/s)": 0.123989 + }, + { + "acc": 0.74172568, + "epoch": 0.8709718460895327, + "grad_norm": 9.0, + "learning_rate": 6.458561703480606e-06, + "loss": 1.00128326, + "memory(GiB)": 302.58, + "step": 155740, + "train_speed(iter/s)": 0.123996 + }, + { + "acc": 0.73268666, + "epoch": 0.871083695562512, + "grad_norm": 6.0, + "learning_rate": 6.457677192605034e-06, + "loss": 1.05157604, + "memory(GiB)": 302.58, + "step": 155760, + "train_speed(iter/s)": 0.124004 + }, + { + "acc": 0.74659534, + "epoch": 0.8711955450354912, + "grad_norm": 6.40625, + "learning_rate": 6.456792631872282e-06, + "loss": 0.98823967, + "memory(GiB)": 302.58, + "step": 155780, + "train_speed(iter/s)": 0.124011 + }, + { + "acc": 0.7408793, + "epoch": 0.8713073945084705, + "grad_norm": 6.53125, + "learning_rate": 6.4559080213126006e-06, + "loss": 1.04729214, + "memory(GiB)": 302.58, + "step": 155800, + "train_speed(iter/s)": 0.124018 + }, + { + "acc": 0.75803413, + "epoch": 0.8714192439814498, + "grad_norm": 7.34375, + "learning_rate": 6.4550233609562494e-06, + "loss": 0.91718674, + "memory(GiB)": 302.58, + "step": 155820, + "train_speed(iter/s)": 0.124026 + }, + { + "acc": 0.73998442, + "epoch": 0.871531093454429, + "grad_norm": 7.96875, + "learning_rate": 6.4541386508334855e-06, + "loss": 1.03077946, + "memory(GiB)": 302.58, + "step": 155840, + "train_speed(iter/s)": 0.124034 + }, + { + "acc": 0.73978028, + "epoch": 0.8716429429274083, + "grad_norm": 6.71875, + "learning_rate": 6.453253890974566e-06, + "loss": 1.02290401, + "memory(GiB)": 302.58, + "step": 155860, + "train_speed(iter/s)": 0.124041 + }, + { + "acc": 0.75201578, + "epoch": 0.8717547924003876, + "grad_norm": 5.6875, + "learning_rate": 6.452369081409759e-06, + "loss": 0.96860676, + "memory(GiB)": 302.58, + "step": 155880, + "train_speed(iter/s)": 0.124049 + }, + { + "acc": 0.75108986, + "epoch": 0.8718666418733668, + "grad_norm": 4.96875, + "learning_rate": 6.451484222169323e-06, + "loss": 0.97524033, + "memory(GiB)": 302.58, + "step": 155900, + "train_speed(iter/s)": 0.124056 + }, + { + "acc": 0.731352, + "epoch": 0.8719784913463461, + "grad_norm": 5.875, + "learning_rate": 6.450599313283524e-06, + "loss": 1.06258945, + "memory(GiB)": 302.58, + "step": 155920, + "train_speed(iter/s)": 0.124064 + }, + { + "acc": 0.73871479, + "epoch": 0.8720903408193253, + "grad_norm": 5.375, + "learning_rate": 6.4497143547826276e-06, + "loss": 1.02301903, + "memory(GiB)": 302.58, + "step": 155940, + "train_speed(iter/s)": 0.124071 + }, + { + "acc": 0.73078699, + "epoch": 0.8722021902923046, + "grad_norm": 6.5, + "learning_rate": 6.448829346696905e-06, + "loss": 1.07745247, + "memory(GiB)": 302.58, + "step": 155960, + "train_speed(iter/s)": 0.124078 + }, + { + "acc": 0.74955621, + "epoch": 0.8723140397652839, + "grad_norm": 6.5, + "learning_rate": 6.447944289056625e-06, + "loss": 0.98321381, + "memory(GiB)": 302.58, + "step": 155980, + "train_speed(iter/s)": 0.124086 + }, + { + "acc": 0.7531414, + "epoch": 0.8724258892382631, + "grad_norm": 9.375, + "learning_rate": 6.447059181892057e-06, + "loss": 0.97242947, + "memory(GiB)": 302.58, + "step": 156000, + "train_speed(iter/s)": 0.124094 + }, + { + "epoch": 0.8724258892382631, + "eval_acc": 0.7056749671609458, + "eval_loss": 1.0160021781921387, + "eval_runtime": 7508.6444, + "eval_samples_per_second": 10.026, + "eval_steps_per_second": 10.026, + "step": 156000 + }, + { + "acc": 0.7382441, + "epoch": 0.8725377387112424, + "grad_norm": 6.8125, + "learning_rate": 6.446174025233478e-06, + "loss": 1.02365465, + "memory(GiB)": 302.58, + "step": 156020, + "train_speed(iter/s)": 0.123352 + }, + { + "acc": 0.73348861, + "epoch": 0.8726495881842217, + "grad_norm": 7.875, + "learning_rate": 6.4452888191111605e-06, + "loss": 1.05462799, + "memory(GiB)": 302.58, + "step": 156040, + "train_speed(iter/s)": 0.123358 + }, + { + "acc": 0.74840207, + "epoch": 0.8727614376572009, + "grad_norm": 7.5625, + "learning_rate": 6.444403563555383e-06, + "loss": 0.99491911, + "memory(GiB)": 302.58, + "step": 156060, + "train_speed(iter/s)": 0.123366 + }, + { + "acc": 0.73139582, + "epoch": 0.8728732871301802, + "grad_norm": 5.84375, + "learning_rate": 6.443518258596424e-06, + "loss": 1.03669119, + "memory(GiB)": 302.58, + "step": 156080, + "train_speed(iter/s)": 0.123373 + }, + { + "acc": 0.74220142, + "epoch": 0.8729851366031595, + "grad_norm": 8.3125, + "learning_rate": 6.442632904264563e-06, + "loss": 1.00269604, + "memory(GiB)": 302.58, + "step": 156100, + "train_speed(iter/s)": 0.123381 + }, + { + "acc": 0.7586309, + "epoch": 0.8730969860761387, + "grad_norm": 7.84375, + "learning_rate": 6.441747500590082e-06, + "loss": 0.9475894, + "memory(GiB)": 302.58, + "step": 156120, + "train_speed(iter/s)": 0.123389 + }, + { + "acc": 0.73200197, + "epoch": 0.873208835549118, + "grad_norm": 7.03125, + "learning_rate": 6.440862047603265e-06, + "loss": 1.08028736, + "memory(GiB)": 302.58, + "step": 156140, + "train_speed(iter/s)": 0.123396 + }, + { + "acc": 0.74457006, + "epoch": 0.8733206850220973, + "grad_norm": 5.34375, + "learning_rate": 6.439976545334398e-06, + "loss": 0.96894283, + "memory(GiB)": 302.58, + "step": 156160, + "train_speed(iter/s)": 0.123403 + }, + { + "acc": 0.73741474, + "epoch": 0.8734325344950765, + "grad_norm": 7.25, + "learning_rate": 6.4390909938137655e-06, + "loss": 1.02533779, + "memory(GiB)": 302.58, + "step": 156180, + "train_speed(iter/s)": 0.123411 + }, + { + "acc": 0.73340573, + "epoch": 0.8735443839680558, + "grad_norm": 9.1875, + "learning_rate": 6.438205393071658e-06, + "loss": 1.04461126, + "memory(GiB)": 302.58, + "step": 156200, + "train_speed(iter/s)": 0.123419 + }, + { + "acc": 0.73185954, + "epoch": 0.873656233441035, + "grad_norm": 7.8125, + "learning_rate": 6.437319743138366e-06, + "loss": 1.04278727, + "memory(GiB)": 302.58, + "step": 156220, + "train_speed(iter/s)": 0.123427 + }, + { + "acc": 0.73655591, + "epoch": 0.8737680829140143, + "grad_norm": 7.15625, + "learning_rate": 6.436434044044182e-06, + "loss": 1.05748978, + "memory(GiB)": 302.58, + "step": 156240, + "train_speed(iter/s)": 0.123435 + }, + { + "acc": 0.74591546, + "epoch": 0.8738799323869936, + "grad_norm": 7.6875, + "learning_rate": 6.4355482958193985e-06, + "loss": 1.00202484, + "memory(GiB)": 302.58, + "step": 156260, + "train_speed(iter/s)": 0.123442 + }, + { + "acc": 0.73069034, + "epoch": 0.8739917818599728, + "grad_norm": 7.4375, + "learning_rate": 6.43466249849431e-06, + "loss": 1.0606473, + "memory(GiB)": 302.58, + "step": 156280, + "train_speed(iter/s)": 0.12345 + }, + { + "acc": 0.75755172, + "epoch": 0.8741036313329521, + "grad_norm": 5.65625, + "learning_rate": 6.433776652099217e-06, + "loss": 0.9319828, + "memory(GiB)": 302.58, + "step": 156300, + "train_speed(iter/s)": 0.123458 + }, + { + "acc": 0.75032296, + "epoch": 0.8742154808059314, + "grad_norm": 7.3125, + "learning_rate": 6.432890756664415e-06, + "loss": 0.98043509, + "memory(GiB)": 302.58, + "step": 156320, + "train_speed(iter/s)": 0.123465 + }, + { + "acc": 0.73212862, + "epoch": 0.8743273302789106, + "grad_norm": 7.09375, + "learning_rate": 6.432004812220205e-06, + "loss": 1.07492418, + "memory(GiB)": 302.58, + "step": 156340, + "train_speed(iter/s)": 0.123472 + }, + { + "acc": 0.74757452, + "epoch": 0.8744391797518899, + "grad_norm": 9.1875, + "learning_rate": 6.43111881879689e-06, + "loss": 0.97392931, + "memory(GiB)": 302.58, + "step": 156360, + "train_speed(iter/s)": 0.12348 + }, + { + "acc": 0.74230886, + "epoch": 0.8745510292248692, + "grad_norm": 5.3125, + "learning_rate": 6.430232776424774e-06, + "loss": 1.00171671, + "memory(GiB)": 302.58, + "step": 156380, + "train_speed(iter/s)": 0.123487 + }, + { + "acc": 0.74198194, + "epoch": 0.8746628786978484, + "grad_norm": 9.0, + "learning_rate": 6.429346685134161e-06, + "loss": 1.00864449, + "memory(GiB)": 302.58, + "step": 156400, + "train_speed(iter/s)": 0.123495 + }, + { + "acc": 0.76005535, + "epoch": 0.8747747281708277, + "grad_norm": 8.75, + "learning_rate": 6.4284605449553595e-06, + "loss": 0.92462826, + "memory(GiB)": 302.58, + "step": 156420, + "train_speed(iter/s)": 0.123503 + }, + { + "acc": 0.73633714, + "epoch": 0.874886577643807, + "grad_norm": 8.875, + "learning_rate": 6.427574355918679e-06, + "loss": 1.05685081, + "memory(GiB)": 302.58, + "step": 156440, + "train_speed(iter/s)": 0.123511 + }, + { + "acc": 0.74453669, + "epoch": 0.8749984271167862, + "grad_norm": 8.5625, + "learning_rate": 6.426688118054427e-06, + "loss": 0.9940177, + "memory(GiB)": 302.58, + "step": 156460, + "train_speed(iter/s)": 0.123519 + }, + { + "acc": 0.75285501, + "epoch": 0.8751102765897655, + "grad_norm": 7.125, + "learning_rate": 6.425801831392917e-06, + "loss": 0.93496895, + "memory(GiB)": 302.58, + "step": 156480, + "train_speed(iter/s)": 0.123526 + }, + { + "acc": 0.74629393, + "epoch": 0.8752221260627447, + "grad_norm": 10.5, + "learning_rate": 6.424915495964464e-06, + "loss": 1.01076107, + "memory(GiB)": 302.58, + "step": 156500, + "train_speed(iter/s)": 0.123533 + }, + { + "acc": 0.73287983, + "epoch": 0.875333975535724, + "grad_norm": 9.625, + "learning_rate": 6.424029111799383e-06, + "loss": 1.06168737, + "memory(GiB)": 302.58, + "step": 156520, + "train_speed(iter/s)": 0.12354 + }, + { + "acc": 0.72265744, + "epoch": 0.8754458250087033, + "grad_norm": 9.9375, + "learning_rate": 6.423142678927992e-06, + "loss": 1.12092209, + "memory(GiB)": 302.58, + "step": 156540, + "train_speed(iter/s)": 0.123548 + }, + { + "acc": 0.75663624, + "epoch": 0.8755576744816825, + "grad_norm": 7.9375, + "learning_rate": 6.422256197380607e-06, + "loss": 0.95356541, + "memory(GiB)": 302.58, + "step": 156560, + "train_speed(iter/s)": 0.123555 + }, + { + "acc": 0.7235826, + "epoch": 0.8756695239546618, + "grad_norm": 4.34375, + "learning_rate": 6.421369667187551e-06, + "loss": 1.07961912, + "memory(GiB)": 302.58, + "step": 156580, + "train_speed(iter/s)": 0.123562 + }, + { + "acc": 0.74682236, + "epoch": 0.8757813734276411, + "grad_norm": 6.84375, + "learning_rate": 6.420483088379146e-06, + "loss": 1.00627909, + "memory(GiB)": 302.58, + "step": 156600, + "train_speed(iter/s)": 0.123569 + }, + { + "acc": 0.73254089, + "epoch": 0.8758932229006203, + "grad_norm": 5.21875, + "learning_rate": 6.419596460985716e-06, + "loss": 1.04445848, + "memory(GiB)": 302.58, + "step": 156620, + "train_speed(iter/s)": 0.123576 + }, + { + "acc": 0.74103441, + "epoch": 0.8760050723735996, + "grad_norm": 6.25, + "learning_rate": 6.418709785037582e-06, + "loss": 1.02096071, + "memory(GiB)": 302.58, + "step": 156640, + "train_speed(iter/s)": 0.123583 + }, + { + "acc": 0.77230635, + "epoch": 0.8761169218465789, + "grad_norm": 6.3125, + "learning_rate": 6.417823060565077e-06, + "loss": 0.88505421, + "memory(GiB)": 302.58, + "step": 156660, + "train_speed(iter/s)": 0.123591 + }, + { + "acc": 0.73379254, + "epoch": 0.8762287713195581, + "grad_norm": 8.9375, + "learning_rate": 6.416936287598527e-06, + "loss": 1.05704784, + "memory(GiB)": 302.58, + "step": 156680, + "train_speed(iter/s)": 0.123598 + }, + { + "acc": 0.72785287, + "epoch": 0.8763406207925374, + "grad_norm": 7.84375, + "learning_rate": 6.416049466168264e-06, + "loss": 1.05974245, + "memory(GiB)": 302.58, + "step": 156700, + "train_speed(iter/s)": 0.123605 + }, + { + "acc": 0.76504736, + "epoch": 0.8764524702655166, + "grad_norm": 6.25, + "learning_rate": 6.415162596304618e-06, + "loss": 0.91589937, + "memory(GiB)": 302.58, + "step": 156720, + "train_speed(iter/s)": 0.123613 + }, + { + "acc": 0.74770541, + "epoch": 0.8765643197384959, + "grad_norm": 6.15625, + "learning_rate": 6.414275678037925e-06, + "loss": 0.98721294, + "memory(GiB)": 302.58, + "step": 156740, + "train_speed(iter/s)": 0.123621 + }, + { + "acc": 0.73555551, + "epoch": 0.8766761692114752, + "grad_norm": 9.6875, + "learning_rate": 6.4133887113985185e-06, + "loss": 1.04190016, + "memory(GiB)": 302.58, + "step": 156760, + "train_speed(iter/s)": 0.123628 + }, + { + "acc": 0.76116614, + "epoch": 0.8767880186844544, + "grad_norm": 11.3125, + "learning_rate": 6.412501696416735e-06, + "loss": 0.94657335, + "memory(GiB)": 302.58, + "step": 156780, + "train_speed(iter/s)": 0.123635 + }, + { + "acc": 0.73906221, + "epoch": 0.8768998681574337, + "grad_norm": 7.21875, + "learning_rate": 6.411614633122917e-06, + "loss": 1.01713915, + "memory(GiB)": 302.58, + "step": 156800, + "train_speed(iter/s)": 0.123643 + }, + { + "acc": 0.74513865, + "epoch": 0.877011717630413, + "grad_norm": 9.25, + "learning_rate": 6.4107275215474015e-06, + "loss": 0.98787661, + "memory(GiB)": 302.58, + "step": 156820, + "train_speed(iter/s)": 0.12365 + }, + { + "acc": 0.73663225, + "epoch": 0.8771235671033922, + "grad_norm": 5.90625, + "learning_rate": 6.409840361720532e-06, + "loss": 1.03908701, + "memory(GiB)": 302.58, + "step": 156840, + "train_speed(iter/s)": 0.123658 + }, + { + "acc": 0.74202323, + "epoch": 0.8772354165763715, + "grad_norm": 7.15625, + "learning_rate": 6.40895315367265e-06, + "loss": 1.02204046, + "memory(GiB)": 302.58, + "step": 156860, + "train_speed(iter/s)": 0.123666 + }, + { + "acc": 0.73934031, + "epoch": 0.8773472660493508, + "grad_norm": 9.0625, + "learning_rate": 6.4080658974341035e-06, + "loss": 1.02893028, + "memory(GiB)": 302.58, + "step": 156880, + "train_speed(iter/s)": 0.123673 + }, + { + "acc": 0.74260187, + "epoch": 0.87745911552233, + "grad_norm": 4.75, + "learning_rate": 6.407178593035239e-06, + "loss": 0.99335928, + "memory(GiB)": 302.58, + "step": 156900, + "train_speed(iter/s)": 0.123681 + }, + { + "acc": 0.74981279, + "epoch": 0.8775709649953093, + "grad_norm": 7.90625, + "learning_rate": 6.406291240506404e-06, + "loss": 0.998139, + "memory(GiB)": 302.58, + "step": 156920, + "train_speed(iter/s)": 0.123688 + }, + { + "acc": 0.73282127, + "epoch": 0.8776828144682886, + "grad_norm": 5.6875, + "learning_rate": 6.4054038398779486e-06, + "loss": 1.0577383, + "memory(GiB)": 302.58, + "step": 156940, + "train_speed(iter/s)": 0.123696 + }, + { + "acc": 0.74784946, + "epoch": 0.8777946639412678, + "grad_norm": 9.0625, + "learning_rate": 6.404516391180227e-06, + "loss": 0.98868237, + "memory(GiB)": 302.58, + "step": 156960, + "train_speed(iter/s)": 0.123704 + }, + { + "acc": 0.74492745, + "epoch": 0.8779065134142471, + "grad_norm": 7.25, + "learning_rate": 6.4036288944435894e-06, + "loss": 0.99730167, + "memory(GiB)": 302.58, + "step": 156980, + "train_speed(iter/s)": 0.123711 + }, + { + "acc": 0.74422479, + "epoch": 0.8780183628872263, + "grad_norm": 5.375, + "learning_rate": 6.402741349698394e-06, + "loss": 1.01851587, + "memory(GiB)": 302.58, + "step": 157000, + "train_speed(iter/s)": 0.123718 + }, + { + "acc": 0.74634519, + "epoch": 0.8781302123602056, + "grad_norm": 6.5, + "learning_rate": 6.401853756974998e-06, + "loss": 1.01084566, + "memory(GiB)": 302.58, + "step": 157020, + "train_speed(iter/s)": 0.123725 + }, + { + "acc": 0.7399837, + "epoch": 0.8782420618331849, + "grad_norm": 6.71875, + "learning_rate": 6.400966116303756e-06, + "loss": 1.01898708, + "memory(GiB)": 302.58, + "step": 157040, + "train_speed(iter/s)": 0.123733 + }, + { + "acc": 0.73040061, + "epoch": 0.8783539113061641, + "grad_norm": 6.40625, + "learning_rate": 6.400078427715031e-06, + "loss": 1.06403399, + "memory(GiB)": 302.58, + "step": 157060, + "train_speed(iter/s)": 0.123739 + }, + { + "acc": 0.75978608, + "epoch": 0.8784657607791434, + "grad_norm": 8.1875, + "learning_rate": 6.399190691239183e-06, + "loss": 0.96009264, + "memory(GiB)": 302.58, + "step": 157080, + "train_speed(iter/s)": 0.123747 + }, + { + "acc": 0.74087019, + "epoch": 0.8785776102521227, + "grad_norm": 5.125, + "learning_rate": 6.39830290690658e-06, + "loss": 1.01883984, + "memory(GiB)": 302.58, + "step": 157100, + "train_speed(iter/s)": 0.123755 + }, + { + "acc": 0.74440875, + "epoch": 0.8786894597251019, + "grad_norm": 6.5625, + "learning_rate": 6.39741507474758e-06, + "loss": 1.01648569, + "memory(GiB)": 302.58, + "step": 157120, + "train_speed(iter/s)": 0.123762 + }, + { + "acc": 0.74624877, + "epoch": 0.8788013091980812, + "grad_norm": 7.125, + "learning_rate": 6.396527194792554e-06, + "loss": 0.99480219, + "memory(GiB)": 302.58, + "step": 157140, + "train_speed(iter/s)": 0.12377 + }, + { + "acc": 0.72616372, + "epoch": 0.8789131586710605, + "grad_norm": 8.875, + "learning_rate": 6.395639267071871e-06, + "loss": 1.09096355, + "memory(GiB)": 302.58, + "step": 157160, + "train_speed(iter/s)": 0.123777 + }, + { + "acc": 0.76090651, + "epoch": 0.8790250081440397, + "grad_norm": 6.5, + "learning_rate": 6.3947512916159e-06, + "loss": 0.93081522, + "memory(GiB)": 302.58, + "step": 157180, + "train_speed(iter/s)": 0.123784 + }, + { + "acc": 0.73295798, + "epoch": 0.879136857617019, + "grad_norm": 7.9375, + "learning_rate": 6.393863268455012e-06, + "loss": 1.09014368, + "memory(GiB)": 302.58, + "step": 157200, + "train_speed(iter/s)": 0.123791 + }, + { + "acc": 0.73281293, + "epoch": 0.8792487070899982, + "grad_norm": 8.25, + "learning_rate": 6.392975197619579e-06, + "loss": 1.07649345, + "memory(GiB)": 302.58, + "step": 157220, + "train_speed(iter/s)": 0.123799 + }, + { + "acc": 0.74851394, + "epoch": 0.8793605565629775, + "grad_norm": 7.46875, + "learning_rate": 6.392087079139979e-06, + "loss": 0.98495808, + "memory(GiB)": 302.58, + "step": 157240, + "train_speed(iter/s)": 0.123806 + }, + { + "acc": 0.73779511, + "epoch": 0.8794724060359568, + "grad_norm": 7.15625, + "learning_rate": 6.391198913046586e-06, + "loss": 1.01590233, + "memory(GiB)": 302.58, + "step": 157260, + "train_speed(iter/s)": 0.123813 + }, + { + "acc": 0.74845567, + "epoch": 0.879584255508936, + "grad_norm": 7.09375, + "learning_rate": 6.390310699369779e-06, + "loss": 0.97979012, + "memory(GiB)": 302.58, + "step": 157280, + "train_speed(iter/s)": 0.123821 + }, + { + "acc": 0.74230657, + "epoch": 0.8796961049819153, + "grad_norm": 5.78125, + "learning_rate": 6.389422438139937e-06, + "loss": 1.01717091, + "memory(GiB)": 302.58, + "step": 157300, + "train_speed(iter/s)": 0.123828 + }, + { + "acc": 0.73452034, + "epoch": 0.8798079544548946, + "grad_norm": 7.03125, + "learning_rate": 6.3885341293874425e-06, + "loss": 1.04697018, + "memory(GiB)": 302.58, + "step": 157320, + "train_speed(iter/s)": 0.123835 + }, + { + "acc": 0.75385652, + "epoch": 0.8799198039278738, + "grad_norm": 4.8125, + "learning_rate": 6.387645773142676e-06, + "loss": 0.97513885, + "memory(GiB)": 302.58, + "step": 157340, + "train_speed(iter/s)": 0.123843 + }, + { + "acc": 0.73751745, + "epoch": 0.8800316534008531, + "grad_norm": 10.0625, + "learning_rate": 6.386757369436025e-06, + "loss": 1.02386599, + "memory(GiB)": 302.58, + "step": 157360, + "train_speed(iter/s)": 0.12385 + }, + { + "acc": 0.73765965, + "epoch": 0.8801435028738324, + "grad_norm": 5.78125, + "learning_rate": 6.385868918297876e-06, + "loss": 1.02235317, + "memory(GiB)": 302.58, + "step": 157380, + "train_speed(iter/s)": 0.123858 + }, + { + "acc": 0.73115335, + "epoch": 0.8802553523468116, + "grad_norm": 6.9375, + "learning_rate": 6.3849804197586155e-06, + "loss": 1.05209579, + "memory(GiB)": 302.58, + "step": 157400, + "train_speed(iter/s)": 0.123865 + }, + { + "acc": 0.73367763, + "epoch": 0.8803672018197909, + "grad_norm": 6.40625, + "learning_rate": 6.384091873848631e-06, + "loss": 1.06071186, + "memory(GiB)": 302.58, + "step": 157420, + "train_speed(iter/s)": 0.123872 + }, + { + "acc": 0.7234129, + "epoch": 0.8804790512927702, + "grad_norm": 8.1875, + "learning_rate": 6.383203280598318e-06, + "loss": 1.08259773, + "memory(GiB)": 302.58, + "step": 157440, + "train_speed(iter/s)": 0.12388 + }, + { + "acc": 0.71636519, + "epoch": 0.8805909007657494, + "grad_norm": 6.0625, + "learning_rate": 6.382314640038065e-06, + "loss": 1.12742624, + "memory(GiB)": 302.58, + "step": 157460, + "train_speed(iter/s)": 0.123887 + }, + { + "acc": 0.73567772, + "epoch": 0.8807027502387287, + "grad_norm": 5.875, + "learning_rate": 6.381425952198269e-06, + "loss": 1.02215958, + "memory(GiB)": 302.58, + "step": 157480, + "train_speed(iter/s)": 0.123895 + }, + { + "acc": 0.74995337, + "epoch": 0.880814599711708, + "grad_norm": 10.125, + "learning_rate": 6.380537217109324e-06, + "loss": 0.96290483, + "memory(GiB)": 302.58, + "step": 157500, + "train_speed(iter/s)": 0.123903 + }, + { + "acc": 0.73885851, + "epoch": 0.8809264491846872, + "grad_norm": 9.9375, + "learning_rate": 6.379648434801629e-06, + "loss": 1.04200392, + "memory(GiB)": 302.58, + "step": 157520, + "train_speed(iter/s)": 0.12391 + }, + { + "acc": 0.75022244, + "epoch": 0.8810382986576665, + "grad_norm": 7.71875, + "learning_rate": 6.378759605305583e-06, + "loss": 0.97400494, + "memory(GiB)": 302.58, + "step": 157540, + "train_speed(iter/s)": 0.123918 + }, + { + "acc": 0.73871708, + "epoch": 0.8811501481306457, + "grad_norm": 5.40625, + "learning_rate": 6.377870728651587e-06, + "loss": 1.03225698, + "memory(GiB)": 302.58, + "step": 157560, + "train_speed(iter/s)": 0.123925 + }, + { + "acc": 0.74364471, + "epoch": 0.881261997603625, + "grad_norm": 5.84375, + "learning_rate": 6.376981804870042e-06, + "loss": 0.99321327, + "memory(GiB)": 302.58, + "step": 157580, + "train_speed(iter/s)": 0.123933 + }, + { + "acc": 0.74772687, + "epoch": 0.8813738470766043, + "grad_norm": 5.375, + "learning_rate": 6.376092833991354e-06, + "loss": 0.99551315, + "memory(GiB)": 302.58, + "step": 157600, + "train_speed(iter/s)": 0.12394 + }, + { + "acc": 0.72289824, + "epoch": 0.8814856965495835, + "grad_norm": 8.875, + "learning_rate": 6.375203816045928e-06, + "loss": 1.09405384, + "memory(GiB)": 302.58, + "step": 157620, + "train_speed(iter/s)": 0.123947 + }, + { + "acc": 0.74842386, + "epoch": 0.8815975460225628, + "grad_norm": 11.3125, + "learning_rate": 6.374314751064168e-06, + "loss": 0.98422909, + "memory(GiB)": 302.58, + "step": 157640, + "train_speed(iter/s)": 0.123954 + }, + { + "acc": 0.74758172, + "epoch": 0.8817093954955421, + "grad_norm": 8.875, + "learning_rate": 6.373425639076488e-06, + "loss": 1.00104122, + "memory(GiB)": 302.58, + "step": 157660, + "train_speed(iter/s)": 0.123962 + }, + { + "acc": 0.71973162, + "epoch": 0.8818212449685213, + "grad_norm": 5.34375, + "learning_rate": 6.372536480113296e-06, + "loss": 1.1118578, + "memory(GiB)": 302.58, + "step": 157680, + "train_speed(iter/s)": 0.12397 + }, + { + "acc": 0.75127783, + "epoch": 0.8819330944415006, + "grad_norm": 8.125, + "learning_rate": 6.371647274205004e-06, + "loss": 0.97366743, + "memory(GiB)": 302.58, + "step": 157700, + "train_speed(iter/s)": 0.123977 + }, + { + "acc": 0.73655009, + "epoch": 0.8820449439144799, + "grad_norm": 6.78125, + "learning_rate": 6.370758021382026e-06, + "loss": 1.02580576, + "memory(GiB)": 302.58, + "step": 157720, + "train_speed(iter/s)": 0.123985 + }, + { + "acc": 0.76337852, + "epoch": 0.8821567933874591, + "grad_norm": 6.625, + "learning_rate": 6.369868721674777e-06, + "loss": 0.89425011, + "memory(GiB)": 302.58, + "step": 157740, + "train_speed(iter/s)": 0.123992 + }, + { + "acc": 0.75902457, + "epoch": 0.8822686428604384, + "grad_norm": 7.15625, + "learning_rate": 6.368979375113673e-06, + "loss": 0.929389, + "memory(GiB)": 302.58, + "step": 157760, + "train_speed(iter/s)": 0.123999 + }, + { + "acc": 0.74904633, + "epoch": 0.8823804923334176, + "grad_norm": 7.03125, + "learning_rate": 6.368089981729135e-06, + "loss": 1.00865498, + "memory(GiB)": 302.58, + "step": 157780, + "train_speed(iter/s)": 0.124007 + }, + { + "acc": 0.73658271, + "epoch": 0.8824923418063969, + "grad_norm": 5.53125, + "learning_rate": 6.36720054155158e-06, + "loss": 1.04286118, + "memory(GiB)": 302.58, + "step": 157800, + "train_speed(iter/s)": 0.124014 + }, + { + "acc": 0.71802974, + "epoch": 0.8826041912793762, + "grad_norm": 7.90625, + "learning_rate": 6.3663110546114324e-06, + "loss": 1.10652809, + "memory(GiB)": 302.58, + "step": 157820, + "train_speed(iter/s)": 0.124022 + }, + { + "acc": 0.74387341, + "epoch": 0.8827160407523554, + "grad_norm": 7.21875, + "learning_rate": 6.365421520939115e-06, + "loss": 0.98788738, + "memory(GiB)": 302.58, + "step": 157840, + "train_speed(iter/s)": 0.124029 + }, + { + "acc": 0.74000158, + "epoch": 0.8828278902253347, + "grad_norm": 7.03125, + "learning_rate": 6.364531940565051e-06, + "loss": 1.01639938, + "memory(GiB)": 302.58, + "step": 157860, + "train_speed(iter/s)": 0.124037 + }, + { + "acc": 0.74257288, + "epoch": 0.882939739698314, + "grad_norm": 5.21875, + "learning_rate": 6.363642313519667e-06, + "loss": 1.00867081, + "memory(GiB)": 302.58, + "step": 157880, + "train_speed(iter/s)": 0.124044 + }, + { + "acc": 0.73242793, + "epoch": 0.8830515891712932, + "grad_norm": 11.0625, + "learning_rate": 6.3627526398333925e-06, + "loss": 1.05355186, + "memory(GiB)": 302.58, + "step": 157900, + "train_speed(iter/s)": 0.124051 + }, + { + "acc": 0.73715091, + "epoch": 0.8831634386442725, + "grad_norm": 7.125, + "learning_rate": 6.361862919536656e-06, + "loss": 1.0471508, + "memory(GiB)": 302.58, + "step": 157920, + "train_speed(iter/s)": 0.124059 + }, + { + "acc": 0.75118809, + "epoch": 0.8832752881172518, + "grad_norm": 6.625, + "learning_rate": 6.360973152659891e-06, + "loss": 1.00108366, + "memory(GiB)": 302.58, + "step": 157940, + "train_speed(iter/s)": 0.124066 + }, + { + "acc": 0.72530651, + "epoch": 0.883387137590231, + "grad_norm": 9.125, + "learning_rate": 6.360083339233527e-06, + "loss": 1.08842087, + "memory(GiB)": 302.58, + "step": 157960, + "train_speed(iter/s)": 0.124074 + }, + { + "acc": 0.74676619, + "epoch": 0.8834989870632103, + "grad_norm": 8.1875, + "learning_rate": 6.359193479288e-06, + "loss": 0.97445698, + "memory(GiB)": 302.58, + "step": 157980, + "train_speed(iter/s)": 0.124081 + }, + { + "acc": 0.73037691, + "epoch": 0.8836108365361895, + "grad_norm": 7.125, + "learning_rate": 6.358303572853748e-06, + "loss": 1.05970039, + "memory(GiB)": 302.58, + "step": 158000, + "train_speed(iter/s)": 0.124088 + }, + { + "epoch": 0.8836108365361895, + "eval_acc": 0.705804223334555, + "eval_loss": 1.0156697034835815, + "eval_runtime": 7502.3728, + "eval_samples_per_second": 10.035, + "eval_steps_per_second": 10.035, + "step": 158000 + }, + { + "acc": 0.73846307, + "epoch": 0.8837226860091688, + "grad_norm": 7.03125, + "learning_rate": 6.357413619961206e-06, + "loss": 1.03406992, + "memory(GiB)": 302.58, + "step": 158020, + "train_speed(iter/s)": 0.123356 + }, + { + "acc": 0.74725752, + "epoch": 0.8838345354821481, + "grad_norm": 10.1875, + "learning_rate": 6.356523620640813e-06, + "loss": 1.00408497, + "memory(GiB)": 302.58, + "step": 158040, + "train_speed(iter/s)": 0.123364 + }, + { + "acc": 0.74580588, + "epoch": 0.8839463849551273, + "grad_norm": 7.65625, + "learning_rate": 6.35563357492301e-06, + "loss": 1.00727615, + "memory(GiB)": 302.58, + "step": 158060, + "train_speed(iter/s)": 0.123371 + }, + { + "acc": 0.73693724, + "epoch": 0.8840582344281066, + "grad_norm": 8.3125, + "learning_rate": 6.354743482838243e-06, + "loss": 1.03867779, + "memory(GiB)": 302.58, + "step": 158080, + "train_speed(iter/s)": 0.123378 + }, + { + "acc": 0.76894054, + "epoch": 0.8841700839010859, + "grad_norm": 8.6875, + "learning_rate": 6.3538533444169516e-06, + "loss": 0.86978846, + "memory(GiB)": 302.58, + "step": 158100, + "train_speed(iter/s)": 0.123386 + }, + { + "acc": 0.74410162, + "epoch": 0.8842819333740651, + "grad_norm": 5.0625, + "learning_rate": 6.352963159689582e-06, + "loss": 1.01246958, + "memory(GiB)": 302.58, + "step": 158120, + "train_speed(iter/s)": 0.123392 + }, + { + "acc": 0.72848487, + "epoch": 0.8843937828470444, + "grad_norm": 7.9375, + "learning_rate": 6.3520729286865835e-06, + "loss": 1.07994604, + "memory(GiB)": 302.58, + "step": 158140, + "train_speed(iter/s)": 0.1234 + }, + { + "acc": 0.74866071, + "epoch": 0.8845056323200237, + "grad_norm": 7.15625, + "learning_rate": 6.351182651438403e-06, + "loss": 0.96742973, + "memory(GiB)": 302.58, + "step": 158160, + "train_speed(iter/s)": 0.123407 + }, + { + "acc": 0.74660487, + "epoch": 0.8846174817930029, + "grad_norm": 5.40625, + "learning_rate": 6.350292327975491e-06, + "loss": 0.99828901, + "memory(GiB)": 302.58, + "step": 158180, + "train_speed(iter/s)": 0.123414 + }, + { + "acc": 0.7485642, + "epoch": 0.8847293312659822, + "grad_norm": 7.78125, + "learning_rate": 6.349401958328301e-06, + "loss": 0.97335491, + "memory(GiB)": 302.58, + "step": 158200, + "train_speed(iter/s)": 0.123421 + }, + { + "acc": 0.74465866, + "epoch": 0.8848411807389615, + "grad_norm": 4.96875, + "learning_rate": 6.348511542527284e-06, + "loss": 1.00490532, + "memory(GiB)": 302.58, + "step": 158220, + "train_speed(iter/s)": 0.123428 + }, + { + "acc": 0.73350158, + "epoch": 0.8849530302119407, + "grad_norm": 6.6875, + "learning_rate": 6.347621080602896e-06, + "loss": 1.04009552, + "memory(GiB)": 302.58, + "step": 158240, + "train_speed(iter/s)": 0.123436 + }, + { + "acc": 0.72725129, + "epoch": 0.88506487968492, + "grad_norm": 8.75, + "learning_rate": 6.3467305725855955e-06, + "loss": 1.06496038, + "memory(GiB)": 302.58, + "step": 158260, + "train_speed(iter/s)": 0.123443 + }, + { + "acc": 0.73332281, + "epoch": 0.8851767291578992, + "grad_norm": 8.625, + "learning_rate": 6.3458400185058375e-06, + "loss": 1.03357592, + "memory(GiB)": 302.58, + "step": 158280, + "train_speed(iter/s)": 0.123451 + }, + { + "acc": 0.74963217, + "epoch": 0.8852885786308785, + "grad_norm": 7.8125, + "learning_rate": 6.344949418394084e-06, + "loss": 0.99663754, + "memory(GiB)": 302.58, + "step": 158300, + "train_speed(iter/s)": 0.123458 + }, + { + "acc": 0.74549985, + "epoch": 0.8854004281038578, + "grad_norm": 8.4375, + "learning_rate": 6.3440587722807956e-06, + "loss": 1.00422792, + "memory(GiB)": 302.58, + "step": 158320, + "train_speed(iter/s)": 0.123466 + }, + { + "acc": 0.74408135, + "epoch": 0.8855122775768371, + "grad_norm": 9.0, + "learning_rate": 6.343168080196434e-06, + "loss": 0.98554249, + "memory(GiB)": 302.58, + "step": 158340, + "train_speed(iter/s)": 0.123474 + }, + { + "acc": 0.74701838, + "epoch": 0.8856241270498164, + "grad_norm": 4.9375, + "learning_rate": 6.342277342171467e-06, + "loss": 0.99195223, + "memory(GiB)": 302.58, + "step": 158360, + "train_speed(iter/s)": 0.123481 + }, + { + "acc": 0.73297734, + "epoch": 0.8857359765227957, + "grad_norm": 9.875, + "learning_rate": 6.341386558236357e-06, + "loss": 1.05929947, + "memory(GiB)": 302.58, + "step": 158380, + "train_speed(iter/s)": 0.123488 + }, + { + "acc": 0.74535465, + "epoch": 0.8858478259957749, + "grad_norm": 8.4375, + "learning_rate": 6.340495728421574e-06, + "loss": 1.02236061, + "memory(GiB)": 302.58, + "step": 158400, + "train_speed(iter/s)": 0.123495 + }, + { + "acc": 0.74415326, + "epoch": 0.8859596754687542, + "grad_norm": 5.5625, + "learning_rate": 6.339604852757586e-06, + "loss": 0.98702421, + "memory(GiB)": 302.58, + "step": 158420, + "train_speed(iter/s)": 0.123502 + }, + { + "acc": 0.72997084, + "epoch": 0.8860715249417335, + "grad_norm": 7.0, + "learning_rate": 6.3387139312748645e-06, + "loss": 1.06577711, + "memory(GiB)": 302.58, + "step": 158440, + "train_speed(iter/s)": 0.12351 + }, + { + "acc": 0.75245585, + "epoch": 0.8861833744147127, + "grad_norm": 10.0, + "learning_rate": 6.337822964003882e-06, + "loss": 0.97450094, + "memory(GiB)": 302.58, + "step": 158460, + "train_speed(iter/s)": 0.123517 + }, + { + "acc": 0.72993698, + "epoch": 0.886295223887692, + "grad_norm": 6.21875, + "learning_rate": 6.336931950975112e-06, + "loss": 1.06332912, + "memory(GiB)": 302.58, + "step": 158480, + "train_speed(iter/s)": 0.123524 + }, + { + "acc": 0.74261351, + "epoch": 0.8864070733606713, + "grad_norm": 7.65625, + "learning_rate": 6.33604089221903e-06, + "loss": 0.99641829, + "memory(GiB)": 302.58, + "step": 158500, + "train_speed(iter/s)": 0.123531 + }, + { + "acc": 0.74345117, + "epoch": 0.8865189228336505, + "grad_norm": 10.625, + "learning_rate": 6.3351497877661125e-06, + "loss": 1.01124125, + "memory(GiB)": 302.58, + "step": 158520, + "train_speed(iter/s)": 0.123539 + }, + { + "acc": 0.73456116, + "epoch": 0.8866307723066298, + "grad_norm": 6.59375, + "learning_rate": 6.3342586376468395e-06, + "loss": 1.06096134, + "memory(GiB)": 302.58, + "step": 158540, + "train_speed(iter/s)": 0.123546 + }, + { + "acc": 0.74253097, + "epoch": 0.886742621779609, + "grad_norm": 7.3125, + "learning_rate": 6.33336744189169e-06, + "loss": 1.01447573, + "memory(GiB)": 302.58, + "step": 158560, + "train_speed(iter/s)": 0.123554 + }, + { + "acc": 0.75814514, + "epoch": 0.8868544712525883, + "grad_norm": 7.6875, + "learning_rate": 6.3324762005311456e-06, + "loss": 0.94757662, + "memory(GiB)": 302.58, + "step": 158580, + "train_speed(iter/s)": 0.123561 + }, + { + "acc": 0.74046459, + "epoch": 0.8869663207255676, + "grad_norm": 8.6875, + "learning_rate": 6.331584913595691e-06, + "loss": 1.01065044, + "memory(GiB)": 302.58, + "step": 158600, + "train_speed(iter/s)": 0.123569 + }, + { + "acc": 0.75079288, + "epoch": 0.8870781701985468, + "grad_norm": 6.9375, + "learning_rate": 6.330693581115809e-06, + "loss": 0.98397093, + "memory(GiB)": 302.58, + "step": 158620, + "train_speed(iter/s)": 0.123576 + }, + { + "acc": 0.75531211, + "epoch": 0.8871900196715261, + "grad_norm": 6.25, + "learning_rate": 6.329802203121987e-06, + "loss": 0.94738846, + "memory(GiB)": 302.58, + "step": 158640, + "train_speed(iter/s)": 0.123583 + }, + { + "acc": 0.73515992, + "epoch": 0.8873018691445054, + "grad_norm": 5.625, + "learning_rate": 6.328910779644712e-06, + "loss": 1.03513193, + "memory(GiB)": 302.58, + "step": 158660, + "train_speed(iter/s)": 0.123591 + }, + { + "acc": 0.72771416, + "epoch": 0.8874137186174846, + "grad_norm": 7.125, + "learning_rate": 6.3280193107144776e-06, + "loss": 1.09002647, + "memory(GiB)": 302.58, + "step": 158680, + "train_speed(iter/s)": 0.123599 + }, + { + "acc": 0.72912498, + "epoch": 0.8875255680904639, + "grad_norm": 7.34375, + "learning_rate": 6.327127796361769e-06, + "loss": 1.0642766, + "memory(GiB)": 302.58, + "step": 158700, + "train_speed(iter/s)": 0.123606 + }, + { + "acc": 0.7604702, + "epoch": 0.8876374175634432, + "grad_norm": 7.4375, + "learning_rate": 6.326236236617082e-06, + "loss": 0.92425289, + "memory(GiB)": 302.58, + "step": 158720, + "train_speed(iter/s)": 0.123614 + }, + { + "acc": 0.73874545, + "epoch": 0.8877492670364224, + "grad_norm": 7.375, + "learning_rate": 6.325344631510912e-06, + "loss": 1.04565182, + "memory(GiB)": 302.58, + "step": 158740, + "train_speed(iter/s)": 0.123621 + }, + { + "acc": 0.73895955, + "epoch": 0.8878611165094017, + "grad_norm": 7.15625, + "learning_rate": 6.324452981073751e-06, + "loss": 1.04312525, + "memory(GiB)": 302.58, + "step": 158760, + "train_speed(iter/s)": 0.123629 + }, + { + "acc": 0.76016078, + "epoch": 0.887972965982381, + "grad_norm": 9.875, + "learning_rate": 6.323561285336098e-06, + "loss": 0.90667915, + "memory(GiB)": 302.58, + "step": 158780, + "train_speed(iter/s)": 0.123636 + }, + { + "acc": 0.74552398, + "epoch": 0.8880848154553602, + "grad_norm": 5.75, + "learning_rate": 6.322669544328453e-06, + "loss": 0.99748974, + "memory(GiB)": 302.58, + "step": 158800, + "train_speed(iter/s)": 0.123643 + }, + { + "acc": 0.76387539, + "epoch": 0.8881966649283395, + "grad_norm": 10.0, + "learning_rate": 6.321777758081317e-06, + "loss": 0.91148767, + "memory(GiB)": 302.58, + "step": 158820, + "train_speed(iter/s)": 0.12365 + }, + { + "acc": 0.73702779, + "epoch": 0.8883085144013187, + "grad_norm": 9.0, + "learning_rate": 6.320885926625189e-06, + "loss": 1.04139423, + "memory(GiB)": 302.58, + "step": 158840, + "train_speed(iter/s)": 0.123658 + }, + { + "acc": 0.74175878, + "epoch": 0.888420363874298, + "grad_norm": 6.15625, + "learning_rate": 6.319994049990574e-06, + "loss": 0.99381542, + "memory(GiB)": 302.58, + "step": 158860, + "train_speed(iter/s)": 0.123666 + }, + { + "acc": 0.74298358, + "epoch": 0.8885322133472773, + "grad_norm": 9.875, + "learning_rate": 6.3191021282079765e-06, + "loss": 1.01669922, + "memory(GiB)": 302.58, + "step": 158880, + "train_speed(iter/s)": 0.123673 + }, + { + "acc": 0.7388855, + "epoch": 0.8886440628202565, + "grad_norm": 11.375, + "learning_rate": 6.318210161307905e-06, + "loss": 1.03945513, + "memory(GiB)": 302.58, + "step": 158900, + "train_speed(iter/s)": 0.12368 + }, + { + "acc": 0.73597941, + "epoch": 0.8887559122932358, + "grad_norm": 6.5625, + "learning_rate": 6.317318149320864e-06, + "loss": 1.05004387, + "memory(GiB)": 302.58, + "step": 158920, + "train_speed(iter/s)": 0.123688 + }, + { + "acc": 0.73358474, + "epoch": 0.8888677617662151, + "grad_norm": 4.9375, + "learning_rate": 6.316426092277366e-06, + "loss": 1.06216145, + "memory(GiB)": 302.58, + "step": 158940, + "train_speed(iter/s)": 0.123695 + }, + { + "acc": 0.74148116, + "epoch": 0.8889796112391943, + "grad_norm": 10.9375, + "learning_rate": 6.315533990207922e-06, + "loss": 1.01771307, + "memory(GiB)": 302.58, + "step": 158960, + "train_speed(iter/s)": 0.123703 + }, + { + "acc": 0.74982018, + "epoch": 0.8890914607121736, + "grad_norm": 5.96875, + "learning_rate": 6.3146418431430435e-06, + "loss": 0.97352686, + "memory(GiB)": 302.58, + "step": 158980, + "train_speed(iter/s)": 0.123711 + }, + { + "acc": 0.74789424, + "epoch": 0.8892033101851529, + "grad_norm": 8.5, + "learning_rate": 6.313749651113243e-06, + "loss": 1.01765108, + "memory(GiB)": 302.58, + "step": 159000, + "train_speed(iter/s)": 0.123718 + }, + { + "acc": 0.72908688, + "epoch": 0.8893151596581321, + "grad_norm": 7.6875, + "learning_rate": 6.312857414149043e-06, + "loss": 1.07083473, + "memory(GiB)": 302.58, + "step": 159020, + "train_speed(iter/s)": 0.123725 + }, + { + "acc": 0.72959313, + "epoch": 0.8894270091311114, + "grad_norm": 9.625, + "learning_rate": 6.311965132280954e-06, + "loss": 1.06809912, + "memory(GiB)": 302.58, + "step": 159040, + "train_speed(iter/s)": 0.123732 + }, + { + "acc": 0.73712893, + "epoch": 0.8895388586040907, + "grad_norm": 7.84375, + "learning_rate": 6.311072805539497e-06, + "loss": 1.04588318, + "memory(GiB)": 302.58, + "step": 159060, + "train_speed(iter/s)": 0.12374 + }, + { + "acc": 0.75367045, + "epoch": 0.8896507080770699, + "grad_norm": 7.59375, + "learning_rate": 6.310180433955193e-06, + "loss": 0.95070515, + "memory(GiB)": 302.58, + "step": 159080, + "train_speed(iter/s)": 0.123747 + }, + { + "acc": 0.73280272, + "epoch": 0.8897625575500492, + "grad_norm": 7.09375, + "learning_rate": 6.309288017558566e-06, + "loss": 1.04395514, + "memory(GiB)": 302.58, + "step": 159100, + "train_speed(iter/s)": 0.123754 + }, + { + "acc": 0.71226387, + "epoch": 0.8898744070230284, + "grad_norm": 6.6875, + "learning_rate": 6.308395556380135e-06, + "loss": 1.12817097, + "memory(GiB)": 302.58, + "step": 159120, + "train_speed(iter/s)": 0.123761 + }, + { + "acc": 0.74100089, + "epoch": 0.8899862564960077, + "grad_norm": 6.8125, + "learning_rate": 6.3075030504504276e-06, + "loss": 1.04056492, + "memory(GiB)": 302.58, + "step": 159140, + "train_speed(iter/s)": 0.123769 + }, + { + "acc": 0.74447594, + "epoch": 0.890098105968987, + "grad_norm": 7.84375, + "learning_rate": 6.30661049979997e-06, + "loss": 0.99820862, + "memory(GiB)": 302.58, + "step": 159160, + "train_speed(iter/s)": 0.123776 + }, + { + "acc": 0.72961273, + "epoch": 0.8902099554419662, + "grad_norm": 5.78125, + "learning_rate": 6.305717904459289e-06, + "loss": 1.08074961, + "memory(GiB)": 302.58, + "step": 159180, + "train_speed(iter/s)": 0.123783 + }, + { + "acc": 0.72960258, + "epoch": 0.8903218049149455, + "grad_norm": 9.5625, + "learning_rate": 6.304825264458918e-06, + "loss": 1.07484245, + "memory(GiB)": 302.58, + "step": 159200, + "train_speed(iter/s)": 0.123791 + }, + { + "acc": 0.74280086, + "epoch": 0.8904336543879248, + "grad_norm": 7.15625, + "learning_rate": 6.303932579829383e-06, + "loss": 1.00597525, + "memory(GiB)": 302.58, + "step": 159220, + "train_speed(iter/s)": 0.123798 + }, + { + "acc": 0.73568459, + "epoch": 0.890545503860904, + "grad_norm": 5.53125, + "learning_rate": 6.3030398506012206e-06, + "loss": 1.04420624, + "memory(GiB)": 302.58, + "step": 159240, + "train_speed(iter/s)": 0.123805 + }, + { + "acc": 0.7563838, + "epoch": 0.8906573533338833, + "grad_norm": 8.5, + "learning_rate": 6.302147076804963e-06, + "loss": 0.94670229, + "memory(GiB)": 302.58, + "step": 159260, + "train_speed(iter/s)": 0.123813 + }, + { + "acc": 0.74603844, + "epoch": 0.8907692028068626, + "grad_norm": 6.375, + "learning_rate": 6.301254258471146e-06, + "loss": 1.00978622, + "memory(GiB)": 302.58, + "step": 159280, + "train_speed(iter/s)": 0.12382 + }, + { + "acc": 0.74685316, + "epoch": 0.8908810522798418, + "grad_norm": 5.90625, + "learning_rate": 6.300361395630307e-06, + "loss": 0.98300123, + "memory(GiB)": 302.58, + "step": 159300, + "train_speed(iter/s)": 0.123827 + }, + { + "acc": 0.7411181, + "epoch": 0.8909929017528211, + "grad_norm": 7.25, + "learning_rate": 6.299468488312987e-06, + "loss": 1.02778769, + "memory(GiB)": 302.58, + "step": 159320, + "train_speed(iter/s)": 0.123835 + }, + { + "acc": 0.72370682, + "epoch": 0.8911047512258004, + "grad_norm": 8.0, + "learning_rate": 6.298575536549722e-06, + "loss": 1.08818026, + "memory(GiB)": 302.58, + "step": 159340, + "train_speed(iter/s)": 0.123842 + }, + { + "acc": 0.75089016, + "epoch": 0.8912166006987796, + "grad_norm": 5.96875, + "learning_rate": 6.297682540371055e-06, + "loss": 0.9788126, + "memory(GiB)": 302.58, + "step": 159360, + "train_speed(iter/s)": 0.123849 + }, + { + "acc": 0.75709128, + "epoch": 0.8913284501717589, + "grad_norm": 5.375, + "learning_rate": 6.296789499807533e-06, + "loss": 0.95452871, + "memory(GiB)": 302.58, + "step": 159380, + "train_speed(iter/s)": 0.123857 + }, + { + "acc": 0.74537144, + "epoch": 0.8914402996447381, + "grad_norm": 7.9375, + "learning_rate": 6.295896414889697e-06, + "loss": 1.01121893, + "memory(GiB)": 302.58, + "step": 159400, + "train_speed(iter/s)": 0.123865 + }, + { + "acc": 0.74756293, + "epoch": 0.8915521491177174, + "grad_norm": 7.125, + "learning_rate": 6.295003285648094e-06, + "loss": 1.00469389, + "memory(GiB)": 302.58, + "step": 159420, + "train_speed(iter/s)": 0.123872 + }, + { + "acc": 0.75693054, + "epoch": 0.8916639985906967, + "grad_norm": 5.78125, + "learning_rate": 6.294110112113273e-06, + "loss": 0.93912544, + "memory(GiB)": 302.58, + "step": 159440, + "train_speed(iter/s)": 0.123879 + }, + { + "acc": 0.73284154, + "epoch": 0.8917758480636759, + "grad_norm": 7.15625, + "learning_rate": 6.293216894315783e-06, + "loss": 1.05959311, + "memory(GiB)": 302.58, + "step": 159460, + "train_speed(iter/s)": 0.123887 + }, + { + "acc": 0.74138875, + "epoch": 0.8918876975366552, + "grad_norm": 6.1875, + "learning_rate": 6.2923236322861735e-06, + "loss": 1.02246695, + "memory(GiB)": 302.58, + "step": 159480, + "train_speed(iter/s)": 0.123894 + }, + { + "acc": 0.73931227, + "epoch": 0.8919995470096345, + "grad_norm": 7.59375, + "learning_rate": 6.291430326054998e-06, + "loss": 1.03640995, + "memory(GiB)": 302.58, + "step": 159500, + "train_speed(iter/s)": 0.123901 + }, + { + "acc": 0.72692327, + "epoch": 0.8921113964826137, + "grad_norm": 5.875, + "learning_rate": 6.29053697565281e-06, + "loss": 1.07976923, + "memory(GiB)": 302.58, + "step": 159520, + "train_speed(iter/s)": 0.123909 + }, + { + "acc": 0.74648509, + "epoch": 0.892223245955593, + "grad_norm": 10.0, + "learning_rate": 6.289643581110166e-06, + "loss": 0.9955781, + "memory(GiB)": 302.58, + "step": 159540, + "train_speed(iter/s)": 0.123916 + }, + { + "acc": 0.73523369, + "epoch": 0.8923350954285723, + "grad_norm": 7.375, + "learning_rate": 6.288750142457623e-06, + "loss": 1.04930916, + "memory(GiB)": 302.58, + "step": 159560, + "train_speed(iter/s)": 0.123923 + }, + { + "acc": 0.73187914, + "epoch": 0.8924469449015515, + "grad_norm": 7.0, + "learning_rate": 6.287856659725737e-06, + "loss": 1.05167589, + "memory(GiB)": 302.58, + "step": 159580, + "train_speed(iter/s)": 0.123931 + }, + { + "acc": 0.74137301, + "epoch": 0.8925587943745308, + "grad_norm": 5.75, + "learning_rate": 6.286963132945071e-06, + "loss": 1.02452059, + "memory(GiB)": 302.58, + "step": 159600, + "train_speed(iter/s)": 0.123938 + }, + { + "acc": 0.73514881, + "epoch": 0.89267064384751, + "grad_norm": 5.78125, + "learning_rate": 6.286069562146183e-06, + "loss": 1.0415947, + "memory(GiB)": 302.58, + "step": 159620, + "train_speed(iter/s)": 0.123945 + }, + { + "acc": 0.74030738, + "epoch": 0.8927824933204893, + "grad_norm": 7.90625, + "learning_rate": 6.28517594735964e-06, + "loss": 1.03356771, + "memory(GiB)": 302.58, + "step": 159640, + "train_speed(iter/s)": 0.123953 + }, + { + "acc": 0.72809711, + "epoch": 0.8928943427934686, + "grad_norm": 9.1875, + "learning_rate": 6.284282288616002e-06, + "loss": 1.08637733, + "memory(GiB)": 302.58, + "step": 159660, + "train_speed(iter/s)": 0.12396 + }, + { + "acc": 0.73254795, + "epoch": 0.8930061922664478, + "grad_norm": 6.65625, + "learning_rate": 6.28338858594584e-06, + "loss": 1.05447216, + "memory(GiB)": 302.58, + "step": 159680, + "train_speed(iter/s)": 0.123967 + }, + { + "acc": 0.74844246, + "epoch": 0.8931180417394271, + "grad_norm": 7.625, + "learning_rate": 6.282494839379718e-06, + "loss": 0.98132429, + "memory(GiB)": 302.58, + "step": 159700, + "train_speed(iter/s)": 0.123974 + }, + { + "acc": 0.75333004, + "epoch": 0.8932298912124064, + "grad_norm": 4.21875, + "learning_rate": 6.281601048948205e-06, + "loss": 0.98593521, + "memory(GiB)": 302.58, + "step": 159720, + "train_speed(iter/s)": 0.123981 + }, + { + "acc": 0.7507391, + "epoch": 0.8933417406853856, + "grad_norm": 8.1875, + "learning_rate": 6.280707214681873e-06, + "loss": 0.97762403, + "memory(GiB)": 302.58, + "step": 159740, + "train_speed(iter/s)": 0.123989 + }, + { + "acc": 0.75516329, + "epoch": 0.8934535901583649, + "grad_norm": 7.34375, + "learning_rate": 6.279813336611293e-06, + "loss": 0.939781, + "memory(GiB)": 302.58, + "step": 159760, + "train_speed(iter/s)": 0.123996 + }, + { + "acc": 0.74867544, + "epoch": 0.8935654396313442, + "grad_norm": 7.3125, + "learning_rate": 6.27891941476704e-06, + "loss": 0.96568651, + "memory(GiB)": 302.58, + "step": 159780, + "train_speed(iter/s)": 0.124003 + }, + { + "acc": 0.74889255, + "epoch": 0.8936772891043234, + "grad_norm": 7.0, + "learning_rate": 6.2780254491796855e-06, + "loss": 0.9570632, + "memory(GiB)": 302.58, + "step": 159800, + "train_speed(iter/s)": 0.12401 + }, + { + "acc": 0.73746219, + "epoch": 0.8937891385773027, + "grad_norm": 8.8125, + "learning_rate": 6.277131439879809e-06, + "loss": 1.04274406, + "memory(GiB)": 302.58, + "step": 159820, + "train_speed(iter/s)": 0.124017 + }, + { + "acc": 0.74134603, + "epoch": 0.893900988050282, + "grad_norm": 7.0625, + "learning_rate": 6.276237386897989e-06, + "loss": 1.01359549, + "memory(GiB)": 302.58, + "step": 159840, + "train_speed(iter/s)": 0.124025 + }, + { + "acc": 0.74284148, + "epoch": 0.8940128375232612, + "grad_norm": 6.1875, + "learning_rate": 6.275343290264801e-06, + "loss": 1.03165665, + "memory(GiB)": 302.58, + "step": 159860, + "train_speed(iter/s)": 0.124032 + }, + { + "acc": 0.73281732, + "epoch": 0.8941246869962405, + "grad_norm": 5.875, + "learning_rate": 6.27444915001083e-06, + "loss": 1.0482584, + "memory(GiB)": 302.58, + "step": 159880, + "train_speed(iter/s)": 0.12404 + }, + { + "acc": 0.7321238, + "epoch": 0.8942365364692197, + "grad_norm": 7.3125, + "learning_rate": 6.273554966166657e-06, + "loss": 1.06985025, + "memory(GiB)": 302.58, + "step": 159900, + "train_speed(iter/s)": 0.124047 + }, + { + "acc": 0.731569, + "epoch": 0.894348385942199, + "grad_norm": 7.78125, + "learning_rate": 6.272660738762866e-06, + "loss": 1.05817966, + "memory(GiB)": 302.58, + "step": 159920, + "train_speed(iter/s)": 0.124054 + }, + { + "acc": 0.73802581, + "epoch": 0.8944602354151783, + "grad_norm": 10.375, + "learning_rate": 6.271766467830041e-06, + "loss": 1.01736307, + "memory(GiB)": 302.58, + "step": 159940, + "train_speed(iter/s)": 0.124062 + }, + { + "acc": 0.74458513, + "epoch": 0.8945720848881575, + "grad_norm": 9.25, + "learning_rate": 6.27087215339877e-06, + "loss": 0.99664602, + "memory(GiB)": 302.58, + "step": 159960, + "train_speed(iter/s)": 0.124069 + }, + { + "acc": 0.73959103, + "epoch": 0.8946839343611368, + "grad_norm": 8.1875, + "learning_rate": 6.269977795499643e-06, + "loss": 1.04946928, + "memory(GiB)": 302.58, + "step": 159980, + "train_speed(iter/s)": 0.124076 + }, + { + "acc": 0.74090333, + "epoch": 0.8947957838341161, + "grad_norm": 7.375, + "learning_rate": 6.269083394163249e-06, + "loss": 1.03062162, + "memory(GiB)": 302.58, + "step": 160000, + "train_speed(iter/s)": 0.124083 + }, + { + "epoch": 0.8947957838341161, + "eval_acc": 0.7058054064574255, + "eval_loss": 1.0152391195297241, + "eval_runtime": 7491.2372, + "eval_samples_per_second": 10.049, + "eval_steps_per_second": 10.049, + "step": 160000 + }, + { + "acc": 0.73823147, + "epoch": 0.8949076333070953, + "grad_norm": 9.75, + "learning_rate": 6.2681889494201775e-06, + "loss": 1.03225279, + "memory(GiB)": 302.58, + "step": 160020, + "train_speed(iter/s)": 0.123362 + }, + { + "acc": 0.74360151, + "epoch": 0.8950194827800746, + "grad_norm": 8.125, + "learning_rate": 6.267294461301022e-06, + "loss": 1.01123896, + "memory(GiB)": 302.58, + "step": 160040, + "train_speed(iter/s)": 0.123369 + }, + { + "acc": 0.74060078, + "epoch": 0.8951313322530539, + "grad_norm": 11.75, + "learning_rate": 6.266399929836377e-06, + "loss": 1.04689789, + "memory(GiB)": 302.58, + "step": 160060, + "train_speed(iter/s)": 0.123376 + }, + { + "acc": 0.73389316, + "epoch": 0.8952431817260331, + "grad_norm": 6.09375, + "learning_rate": 6.2655053550568414e-06, + "loss": 1.06236801, + "memory(GiB)": 302.58, + "step": 160080, + "train_speed(iter/s)": 0.123383 + }, + { + "acc": 0.73893476, + "epoch": 0.8953550311990124, + "grad_norm": 6.71875, + "learning_rate": 6.264610736993007e-06, + "loss": 1.02450123, + "memory(GiB)": 302.58, + "step": 160100, + "train_speed(iter/s)": 0.12339 + }, + { + "acc": 0.74505363, + "epoch": 0.8954668806719916, + "grad_norm": 6.1875, + "learning_rate": 6.263716075675478e-06, + "loss": 0.9926878, + "memory(GiB)": 302.58, + "step": 160120, + "train_speed(iter/s)": 0.123397 + }, + { + "acc": 0.74937215, + "epoch": 0.8955787301449709, + "grad_norm": 7.15625, + "learning_rate": 6.26282137113485e-06, + "loss": 0.98131895, + "memory(GiB)": 302.58, + "step": 160140, + "train_speed(iter/s)": 0.123405 + }, + { + "acc": 0.75336637, + "epoch": 0.8956905796179502, + "grad_norm": 8.875, + "learning_rate": 6.261926623401728e-06, + "loss": 0.96544809, + "memory(GiB)": 302.58, + "step": 160160, + "train_speed(iter/s)": 0.123412 + }, + { + "acc": 0.74095297, + "epoch": 0.8958024290909294, + "grad_norm": 4.96875, + "learning_rate": 6.261031832506715e-06, + "loss": 1.01416922, + "memory(GiB)": 302.58, + "step": 160180, + "train_speed(iter/s)": 0.123419 + }, + { + "acc": 0.74172449, + "epoch": 0.8959142785639087, + "grad_norm": 5.5, + "learning_rate": 6.260136998480412e-06, + "loss": 1.03439474, + "memory(GiB)": 302.58, + "step": 160200, + "train_speed(iter/s)": 0.123426 + }, + { + "acc": 0.74714236, + "epoch": 0.896026128036888, + "grad_norm": 4.75, + "learning_rate": 6.2592421213534295e-06, + "loss": 1.02242079, + "memory(GiB)": 302.58, + "step": 160220, + "train_speed(iter/s)": 0.123433 + }, + { + "acc": 0.74331121, + "epoch": 0.8961379775098672, + "grad_norm": 9.5625, + "learning_rate": 6.258347201156372e-06, + "loss": 1.00017509, + "memory(GiB)": 302.58, + "step": 160240, + "train_speed(iter/s)": 0.12344 + }, + { + "acc": 0.75677147, + "epoch": 0.8962498269828465, + "grad_norm": 4.96875, + "learning_rate": 6.2574522379198524e-06, + "loss": 0.95967646, + "memory(GiB)": 302.58, + "step": 160260, + "train_speed(iter/s)": 0.123446 + }, + { + "acc": 0.76044455, + "epoch": 0.8963616764558258, + "grad_norm": 8.625, + "learning_rate": 6.256557231674477e-06, + "loss": 0.94290104, + "memory(GiB)": 302.58, + "step": 160280, + "train_speed(iter/s)": 0.123454 + }, + { + "acc": 0.72863832, + "epoch": 0.896473525928805, + "grad_norm": 6.21875, + "learning_rate": 6.255662182450862e-06, + "loss": 1.0934226, + "memory(GiB)": 302.58, + "step": 160300, + "train_speed(iter/s)": 0.123462 + }, + { + "acc": 0.73916936, + "epoch": 0.8965853754017843, + "grad_norm": 6.3125, + "learning_rate": 6.254767090279616e-06, + "loss": 1.03957739, + "memory(GiB)": 302.58, + "step": 160320, + "train_speed(iter/s)": 0.12347 + }, + { + "acc": 0.75415444, + "epoch": 0.8966972248747636, + "grad_norm": 9.0625, + "learning_rate": 6.2538719551913595e-06, + "loss": 0.9740448, + "memory(GiB)": 302.58, + "step": 160340, + "train_speed(iter/s)": 0.123477 + }, + { + "acc": 0.73984318, + "epoch": 0.8968090743477428, + "grad_norm": 6.03125, + "learning_rate": 6.252976777216705e-06, + "loss": 1.03416662, + "memory(GiB)": 302.58, + "step": 160360, + "train_speed(iter/s)": 0.123485 + }, + { + "acc": 0.75652785, + "epoch": 0.8969209238207221, + "grad_norm": 7.625, + "learning_rate": 6.252081556386271e-06, + "loss": 0.96145945, + "memory(GiB)": 302.58, + "step": 160380, + "train_speed(iter/s)": 0.123492 + }, + { + "acc": 0.74773216, + "epoch": 0.8970327732937013, + "grad_norm": 4.90625, + "learning_rate": 6.251186292730678e-06, + "loss": 1.00018883, + "memory(GiB)": 302.58, + "step": 160400, + "train_speed(iter/s)": 0.123499 + }, + { + "acc": 0.74495139, + "epoch": 0.8971446227666806, + "grad_norm": 7.375, + "learning_rate": 6.250290986280547e-06, + "loss": 0.98990374, + "memory(GiB)": 302.58, + "step": 160420, + "train_speed(iter/s)": 0.123506 + }, + { + "acc": 0.73576336, + "epoch": 0.8972564722396599, + "grad_norm": 7.3125, + "learning_rate": 6.249395637066499e-06, + "loss": 1.04283419, + "memory(GiB)": 302.58, + "step": 160440, + "train_speed(iter/s)": 0.123514 + }, + { + "acc": 0.73513465, + "epoch": 0.8973683217126391, + "grad_norm": 7.5625, + "learning_rate": 6.248500245119159e-06, + "loss": 1.03500996, + "memory(GiB)": 302.58, + "step": 160460, + "train_speed(iter/s)": 0.123521 + }, + { + "acc": 0.74073377, + "epoch": 0.8974801711856184, + "grad_norm": 7.59375, + "learning_rate": 6.2476048104691525e-06, + "loss": 0.99963493, + "memory(GiB)": 302.58, + "step": 160480, + "train_speed(iter/s)": 0.123528 + }, + { + "acc": 0.73453722, + "epoch": 0.8975920206585977, + "grad_norm": 8.4375, + "learning_rate": 6.246709333147103e-06, + "loss": 1.0655818, + "memory(GiB)": 302.58, + "step": 160500, + "train_speed(iter/s)": 0.123535 + }, + { + "acc": 0.7255477, + "epoch": 0.8977038701315769, + "grad_norm": 6.75, + "learning_rate": 6.245813813183642e-06, + "loss": 1.07159319, + "memory(GiB)": 302.58, + "step": 160520, + "train_speed(iter/s)": 0.123543 + }, + { + "acc": 0.75991149, + "epoch": 0.8978157196045562, + "grad_norm": 8.9375, + "learning_rate": 6.2449182506094e-06, + "loss": 0.93849106, + "memory(GiB)": 302.58, + "step": 160540, + "train_speed(iter/s)": 0.12355 + }, + { + "acc": 0.73196788, + "epoch": 0.8979275690775355, + "grad_norm": 10.1875, + "learning_rate": 6.244022645455005e-06, + "loss": 1.06606464, + "memory(GiB)": 302.58, + "step": 160560, + "train_speed(iter/s)": 0.123557 + }, + { + "acc": 0.73547921, + "epoch": 0.8980394185505147, + "grad_norm": 8.3125, + "learning_rate": 6.24312699775109e-06, + "loss": 1.03064346, + "memory(GiB)": 302.58, + "step": 160580, + "train_speed(iter/s)": 0.123564 + }, + { + "acc": 0.7361249, + "epoch": 0.898151268023494, + "grad_norm": 8.5, + "learning_rate": 6.242231307528291e-06, + "loss": 1.0361968, + "memory(GiB)": 302.58, + "step": 160600, + "train_speed(iter/s)": 0.123571 + }, + { + "acc": 0.72773938, + "epoch": 0.8982631174964733, + "grad_norm": 6.78125, + "learning_rate": 6.241335574817242e-06, + "loss": 1.07987719, + "memory(GiB)": 302.58, + "step": 160620, + "train_speed(iter/s)": 0.123579 + }, + { + "acc": 0.75254393, + "epoch": 0.8983749669694525, + "grad_norm": 8.5, + "learning_rate": 6.2404397996485795e-06, + "loss": 0.99623432, + "memory(GiB)": 302.58, + "step": 160640, + "train_speed(iter/s)": 0.123586 + }, + { + "acc": 0.74918923, + "epoch": 0.8984868164424318, + "grad_norm": 8.875, + "learning_rate": 6.239543982052944e-06, + "loss": 0.98997145, + "memory(GiB)": 302.58, + "step": 160660, + "train_speed(iter/s)": 0.123594 + }, + { + "acc": 0.75360236, + "epoch": 0.898598665915411, + "grad_norm": 7.0625, + "learning_rate": 6.2386481220609725e-06, + "loss": 0.95991688, + "memory(GiB)": 302.58, + "step": 160680, + "train_speed(iter/s)": 0.123601 + }, + { + "acc": 0.75109267, + "epoch": 0.8987105153883903, + "grad_norm": 6.90625, + "learning_rate": 6.237752219703308e-06, + "loss": 0.97110634, + "memory(GiB)": 302.58, + "step": 160700, + "train_speed(iter/s)": 0.123609 + }, + { + "acc": 0.7390048, + "epoch": 0.8988223648613696, + "grad_norm": 7.96875, + "learning_rate": 6.236856275010593e-06, + "loss": 0.99067392, + "memory(GiB)": 302.58, + "step": 160720, + "train_speed(iter/s)": 0.123616 + }, + { + "acc": 0.71767898, + "epoch": 0.8989342143343488, + "grad_norm": 7.90625, + "learning_rate": 6.235960288013472e-06, + "loss": 1.12247686, + "memory(GiB)": 302.58, + "step": 160740, + "train_speed(iter/s)": 0.123623 + }, + { + "acc": 0.72437096, + "epoch": 0.8990460638073281, + "grad_norm": 5.40625, + "learning_rate": 6.235064258742589e-06, + "loss": 1.10744305, + "memory(GiB)": 302.58, + "step": 160760, + "train_speed(iter/s)": 0.123631 + }, + { + "acc": 0.74736214, + "epoch": 0.8991579132803074, + "grad_norm": 5.6875, + "learning_rate": 6.234168187228591e-06, + "loss": 0.95784998, + "memory(GiB)": 302.58, + "step": 160780, + "train_speed(iter/s)": 0.123638 + }, + { + "acc": 0.75958362, + "epoch": 0.8992697627532866, + "grad_norm": 6.375, + "learning_rate": 6.23327207350213e-06, + "loss": 0.92986984, + "memory(GiB)": 302.58, + "step": 160800, + "train_speed(iter/s)": 0.123646 + }, + { + "acc": 0.74214082, + "epoch": 0.8993816122262659, + "grad_norm": 7.125, + "learning_rate": 6.232375917593853e-06, + "loss": 1.00652122, + "memory(GiB)": 302.58, + "step": 160820, + "train_speed(iter/s)": 0.123653 + }, + { + "acc": 0.7457345, + "epoch": 0.8994934616992452, + "grad_norm": 9.6875, + "learning_rate": 6.23147971953441e-06, + "loss": 0.99523754, + "memory(GiB)": 302.58, + "step": 160840, + "train_speed(iter/s)": 0.123661 + }, + { + "acc": 0.74895172, + "epoch": 0.8996053111722244, + "grad_norm": 7.125, + "learning_rate": 6.230583479354457e-06, + "loss": 0.98582468, + "memory(GiB)": 302.58, + "step": 160860, + "train_speed(iter/s)": 0.123668 + }, + { + "acc": 0.74425826, + "epoch": 0.8997171606452037, + "grad_norm": 6.96875, + "learning_rate": 6.229687197084646e-06, + "loss": 1.00416107, + "memory(GiB)": 302.58, + "step": 160880, + "train_speed(iter/s)": 0.123675 + }, + { + "acc": 0.72633548, + "epoch": 0.899829010118183, + "grad_norm": 7.15625, + "learning_rate": 6.228790872755636e-06, + "loss": 1.0944663, + "memory(GiB)": 302.58, + "step": 160900, + "train_speed(iter/s)": 0.123681 + }, + { + "acc": 0.73458877, + "epoch": 0.8999408595911622, + "grad_norm": 7.59375, + "learning_rate": 6.22789450639808e-06, + "loss": 1.05289602, + "memory(GiB)": 302.58, + "step": 160920, + "train_speed(iter/s)": 0.123688 + }, + { + "acc": 0.74987364, + "epoch": 0.9000527090641415, + "grad_norm": 7.8125, + "learning_rate": 6.226998098042637e-06, + "loss": 0.96701756, + "memory(GiB)": 302.58, + "step": 160940, + "train_speed(iter/s)": 0.123696 + }, + { + "acc": 0.73232694, + "epoch": 0.9001645585371207, + "grad_norm": 7.59375, + "learning_rate": 6.2261016477199705e-06, + "loss": 1.06246586, + "memory(GiB)": 302.58, + "step": 160960, + "train_speed(iter/s)": 0.123704 + }, + { + "acc": 0.73675194, + "epoch": 0.9002764080101, + "grad_norm": 6.71875, + "learning_rate": 6.22520515546074e-06, + "loss": 1.04511223, + "memory(GiB)": 302.58, + "step": 160980, + "train_speed(iter/s)": 0.123711 + }, + { + "acc": 0.75260735, + "epoch": 0.9003882574830793, + "grad_norm": 5.375, + "learning_rate": 6.224308621295607e-06, + "loss": 0.98408842, + "memory(GiB)": 302.58, + "step": 161000, + "train_speed(iter/s)": 0.123718 + }, + { + "acc": 0.74919419, + "epoch": 0.9005001069560585, + "grad_norm": 8.4375, + "learning_rate": 6.223412045255237e-06, + "loss": 0.99133062, + "memory(GiB)": 302.58, + "step": 161020, + "train_speed(iter/s)": 0.123726 + }, + { + "acc": 0.75105228, + "epoch": 0.9006119564290378, + "grad_norm": 5.0625, + "learning_rate": 6.222515427370297e-06, + "loss": 0.94869976, + "memory(GiB)": 302.58, + "step": 161040, + "train_speed(iter/s)": 0.123733 + }, + { + "acc": 0.73466816, + "epoch": 0.9007238059020171, + "grad_norm": 5.78125, + "learning_rate": 6.221618767671451e-06, + "loss": 1.06801481, + "memory(GiB)": 302.58, + "step": 161060, + "train_speed(iter/s)": 0.123741 + }, + { + "acc": 0.73508325, + "epoch": 0.9008356553749963, + "grad_norm": 4.9375, + "learning_rate": 6.220722066189371e-06, + "loss": 1.06316385, + "memory(GiB)": 302.58, + "step": 161080, + "train_speed(iter/s)": 0.123748 + }, + { + "acc": 0.74021602, + "epoch": 0.9009475048479756, + "grad_norm": 5.125, + "learning_rate": 6.219825322954724e-06, + "loss": 1.00056362, + "memory(GiB)": 302.58, + "step": 161100, + "train_speed(iter/s)": 0.123755 + }, + { + "acc": 0.73990235, + "epoch": 0.9010593543209549, + "grad_norm": 7.21875, + "learning_rate": 6.218928537998185e-06, + "loss": 1.03580198, + "memory(GiB)": 302.58, + "step": 161120, + "train_speed(iter/s)": 0.123763 + }, + { + "acc": 0.73740168, + "epoch": 0.9011712037939341, + "grad_norm": 8.5625, + "learning_rate": 6.218031711350424e-06, + "loss": 1.03299465, + "memory(GiB)": 302.58, + "step": 161140, + "train_speed(iter/s)": 0.12377 + }, + { + "acc": 0.72690873, + "epoch": 0.9012830532669134, + "grad_norm": 4.6875, + "learning_rate": 6.217134843042117e-06, + "loss": 1.10412607, + "memory(GiB)": 302.58, + "step": 161160, + "train_speed(iter/s)": 0.123777 + }, + { + "acc": 0.71822486, + "epoch": 0.9013949027398926, + "grad_norm": 8.125, + "learning_rate": 6.216237933103937e-06, + "loss": 1.08704472, + "memory(GiB)": 302.58, + "step": 161180, + "train_speed(iter/s)": 0.123784 + }, + { + "acc": 0.73163443, + "epoch": 0.9015067522128719, + "grad_norm": 6.15625, + "learning_rate": 6.215340981566564e-06, + "loss": 1.07667303, + "memory(GiB)": 302.58, + "step": 161200, + "train_speed(iter/s)": 0.123792 + }, + { + "acc": 0.74214272, + "epoch": 0.9016186016858512, + "grad_norm": 9.0625, + "learning_rate": 6.214443988460676e-06, + "loss": 1.01022654, + "memory(GiB)": 302.58, + "step": 161220, + "train_speed(iter/s)": 0.123798 + }, + { + "acc": 0.74217086, + "epoch": 0.9017304511588304, + "grad_norm": 9.0625, + "learning_rate": 6.213546953816952e-06, + "loss": 1.02750568, + "memory(GiB)": 302.58, + "step": 161240, + "train_speed(iter/s)": 0.123805 + }, + { + "acc": 0.7496223, + "epoch": 0.9018423006318097, + "grad_norm": 7.6875, + "learning_rate": 6.2126498776660745e-06, + "loss": 0.96604261, + "memory(GiB)": 302.58, + "step": 161260, + "train_speed(iter/s)": 0.123812 + }, + { + "acc": 0.74745374, + "epoch": 0.901954150104789, + "grad_norm": 5.5, + "learning_rate": 6.2117527600387265e-06, + "loss": 0.99760628, + "memory(GiB)": 302.58, + "step": 161280, + "train_speed(iter/s)": 0.12382 + }, + { + "acc": 0.73480887, + "epoch": 0.9020659995777682, + "grad_norm": 10.1875, + "learning_rate": 6.2108556009655904e-06, + "loss": 1.02938337, + "memory(GiB)": 302.58, + "step": 161300, + "train_speed(iter/s)": 0.123827 + }, + { + "acc": 0.73179045, + "epoch": 0.9021778490507475, + "grad_norm": 8.6875, + "learning_rate": 6.209958400477355e-06, + "loss": 1.05957851, + "memory(GiB)": 302.58, + "step": 161320, + "train_speed(iter/s)": 0.123835 + }, + { + "acc": 0.7429255, + "epoch": 0.9022896985237268, + "grad_norm": 5.84375, + "learning_rate": 6.209061158604706e-06, + "loss": 1.02036943, + "memory(GiB)": 302.58, + "step": 161340, + "train_speed(iter/s)": 0.123843 + }, + { + "acc": 0.7268012, + "epoch": 0.902401547996706, + "grad_norm": 7.4375, + "learning_rate": 6.2081638753783294e-06, + "loss": 1.08048153, + "memory(GiB)": 302.58, + "step": 161360, + "train_speed(iter/s)": 0.12385 + }, + { + "acc": 0.75468597, + "epoch": 0.9025133974696853, + "grad_norm": 5.78125, + "learning_rate": 6.207266550828917e-06, + "loss": 0.9428421, + "memory(GiB)": 302.58, + "step": 161380, + "train_speed(iter/s)": 0.123857 + }, + { + "acc": 0.7364975, + "epoch": 0.9026252469426645, + "grad_norm": 6.71875, + "learning_rate": 6.206369184987162e-06, + "loss": 1.04102325, + "memory(GiB)": 302.58, + "step": 161400, + "train_speed(iter/s)": 0.123865 + }, + { + "acc": 0.73274441, + "epoch": 0.9027370964156438, + "grad_norm": 4.5625, + "learning_rate": 6.205471777883756e-06, + "loss": 1.06214399, + "memory(GiB)": 302.58, + "step": 161420, + "train_speed(iter/s)": 0.123872 + }, + { + "acc": 0.7403295, + "epoch": 0.9028489458886231, + "grad_norm": 7.875, + "learning_rate": 6.204574329549392e-06, + "loss": 1.02110262, + "memory(GiB)": 302.58, + "step": 161440, + "train_speed(iter/s)": 0.123879 + }, + { + "acc": 0.7438992, + "epoch": 0.9029607953616023, + "grad_norm": 8.1875, + "learning_rate": 6.203676840014767e-06, + "loss": 1.02483768, + "memory(GiB)": 302.58, + "step": 161460, + "train_speed(iter/s)": 0.123885 + }, + { + "acc": 0.7225553, + "epoch": 0.9030726448345816, + "grad_norm": 7.75, + "learning_rate": 6.202779309310576e-06, + "loss": 1.08512669, + "memory(GiB)": 302.58, + "step": 161480, + "train_speed(iter/s)": 0.123893 + }, + { + "acc": 0.7490418, + "epoch": 0.9031844943075609, + "grad_norm": 5.71875, + "learning_rate": 6.201881737467519e-06, + "loss": 0.99469137, + "memory(GiB)": 302.58, + "step": 161500, + "train_speed(iter/s)": 0.1239 + }, + { + "acc": 0.73173618, + "epoch": 0.9032963437805401, + "grad_norm": 8.5, + "learning_rate": 6.2009841245162945e-06, + "loss": 1.06232958, + "memory(GiB)": 302.58, + "step": 161520, + "train_speed(iter/s)": 0.123907 + }, + { + "acc": 0.73825374, + "epoch": 0.9034081932535194, + "grad_norm": 10.6875, + "learning_rate": 6.200086470487606e-06, + "loss": 1.03390684, + "memory(GiB)": 302.58, + "step": 161540, + "train_speed(iter/s)": 0.123915 + }, + { + "acc": 0.72520108, + "epoch": 0.9035200427264987, + "grad_norm": 7.78125, + "learning_rate": 6.199188775412155e-06, + "loss": 1.09102335, + "memory(GiB)": 302.58, + "step": 161560, + "train_speed(iter/s)": 0.123922 + }, + { + "acc": 0.74198895, + "epoch": 0.9036318921994779, + "grad_norm": 7.28125, + "learning_rate": 6.198291039320645e-06, + "loss": 1.0097827, + "memory(GiB)": 302.58, + "step": 161580, + "train_speed(iter/s)": 0.123929 + }, + { + "acc": 0.71622162, + "epoch": 0.9037437416724572, + "grad_norm": 7.9375, + "learning_rate": 6.1973932622437825e-06, + "loss": 1.13271809, + "memory(GiB)": 302.58, + "step": 161600, + "train_speed(iter/s)": 0.123936 + }, + { + "acc": 0.74344578, + "epoch": 0.9038555911454365, + "grad_norm": 6.5, + "learning_rate": 6.196495444212274e-06, + "loss": 1.00523167, + "memory(GiB)": 302.58, + "step": 161620, + "train_speed(iter/s)": 0.123943 + }, + { + "acc": 0.74561176, + "epoch": 0.9039674406184157, + "grad_norm": 4.5625, + "learning_rate": 6.195597585256826e-06, + "loss": 1.0136631, + "memory(GiB)": 302.58, + "step": 161640, + "train_speed(iter/s)": 0.12395 + }, + { + "acc": 0.74959993, + "epoch": 0.904079290091395, + "grad_norm": 7.65625, + "learning_rate": 6.19469968540815e-06, + "loss": 0.98940849, + "memory(GiB)": 302.58, + "step": 161660, + "train_speed(iter/s)": 0.123957 + }, + { + "acc": 0.74514012, + "epoch": 0.9041911395643742, + "grad_norm": 7.6875, + "learning_rate": 6.193801744696956e-06, + "loss": 0.99039526, + "memory(GiB)": 302.58, + "step": 161680, + "train_speed(iter/s)": 0.123964 + }, + { + "acc": 0.74335117, + "epoch": 0.9043029890373535, + "grad_norm": 8.6875, + "learning_rate": 6.1929037631539565e-06, + "loss": 1.01669588, + "memory(GiB)": 302.58, + "step": 161700, + "train_speed(iter/s)": 0.123972 + }, + { + "acc": 0.74038019, + "epoch": 0.9044148385103328, + "grad_norm": 6.15625, + "learning_rate": 6.192005740809868e-06, + "loss": 1.02045813, + "memory(GiB)": 302.58, + "step": 161720, + "train_speed(iter/s)": 0.123979 + }, + { + "acc": 0.7496881, + "epoch": 0.904526687983312, + "grad_norm": 6.3125, + "learning_rate": 6.191107677695402e-06, + "loss": 0.97590246, + "memory(GiB)": 302.58, + "step": 161740, + "train_speed(iter/s)": 0.123986 + }, + { + "acc": 0.76084189, + "epoch": 0.9046385374562913, + "grad_norm": 10.6875, + "learning_rate": 6.190209573841276e-06, + "loss": 0.94259071, + "memory(GiB)": 302.58, + "step": 161760, + "train_speed(iter/s)": 0.123994 + }, + { + "acc": 0.75390358, + "epoch": 0.9047503869292706, + "grad_norm": 8.375, + "learning_rate": 6.189311429278209e-06, + "loss": 0.96866589, + "memory(GiB)": 302.58, + "step": 161780, + "train_speed(iter/s)": 0.124001 + }, + { + "acc": 0.73178496, + "epoch": 0.9048622364022498, + "grad_norm": 8.625, + "learning_rate": 6.18841324403692e-06, + "loss": 1.07509136, + "memory(GiB)": 302.58, + "step": 161800, + "train_speed(iter/s)": 0.124008 + }, + { + "acc": 0.72789493, + "epoch": 0.9049740858752291, + "grad_norm": 9.4375, + "learning_rate": 6.187515018148129e-06, + "loss": 1.08303738, + "memory(GiB)": 302.58, + "step": 161820, + "train_speed(iter/s)": 0.124015 + }, + { + "acc": 0.73783607, + "epoch": 0.9050859353482084, + "grad_norm": 9.375, + "learning_rate": 6.186616751642559e-06, + "loss": 1.04097223, + "memory(GiB)": 302.58, + "step": 161840, + "train_speed(iter/s)": 0.124022 + }, + { + "acc": 0.73443336, + "epoch": 0.9051977848211876, + "grad_norm": 7.46875, + "learning_rate": 6.185718444550934e-06, + "loss": 1.0719182, + "memory(GiB)": 302.58, + "step": 161860, + "train_speed(iter/s)": 0.124029 + }, + { + "acc": 0.7518013, + "epoch": 0.9053096342941669, + "grad_norm": 5.75, + "learning_rate": 6.184820096903979e-06, + "loss": 0.96067524, + "memory(GiB)": 302.58, + "step": 161880, + "train_speed(iter/s)": 0.124036 + }, + { + "acc": 0.73969307, + "epoch": 0.9054214837671462, + "grad_norm": 6.90625, + "learning_rate": 6.183921708732419e-06, + "loss": 1.02517395, + "memory(GiB)": 302.58, + "step": 161900, + "train_speed(iter/s)": 0.124043 + }, + { + "acc": 0.73314042, + "epoch": 0.9055333332401254, + "grad_norm": 7.84375, + "learning_rate": 6.183023280066982e-06, + "loss": 1.05245047, + "memory(GiB)": 302.58, + "step": 161920, + "train_speed(iter/s)": 0.124051 + }, + { + "acc": 0.7404767, + "epoch": 0.9056451827131047, + "grad_norm": 6.8125, + "learning_rate": 6.182124810938398e-06, + "loss": 1.01617231, + "memory(GiB)": 302.58, + "step": 161940, + "train_speed(iter/s)": 0.124058 + }, + { + "acc": 0.73796511, + "epoch": 0.9057570321860839, + "grad_norm": 7.40625, + "learning_rate": 6.181226301377396e-06, + "loss": 1.04624071, + "memory(GiB)": 302.58, + "step": 161960, + "train_speed(iter/s)": 0.124065 + }, + { + "acc": 0.73186698, + "epoch": 0.9058688816590632, + "grad_norm": 8.5625, + "learning_rate": 6.180327751414711e-06, + "loss": 1.04673748, + "memory(GiB)": 302.58, + "step": 161980, + "train_speed(iter/s)": 0.124073 + }, + { + "acc": 0.73119273, + "epoch": 0.9059807311320425, + "grad_norm": 8.875, + "learning_rate": 6.179429161081073e-06, + "loss": 1.08260565, + "memory(GiB)": 302.58, + "step": 162000, + "train_speed(iter/s)": 0.12408 + }, + { + "epoch": 0.9059807311320425, + "eval_acc": 0.7059151411036703, + "eval_loss": 1.0149575471878052, + "eval_runtime": 7498.8674, + "eval_samples_per_second": 10.039, + "eval_steps_per_second": 10.039, + "step": 162000 + }, + { + "acc": 0.74908361, + "epoch": 0.9060925806050217, + "grad_norm": 7.09375, + "learning_rate": 6.1785305304072185e-06, + "loss": 0.99007044, + "memory(GiB)": 302.58, + "step": 162020, + "train_speed(iter/s)": 0.123367 + }, + { + "acc": 0.73256626, + "epoch": 0.906204430078001, + "grad_norm": 6.09375, + "learning_rate": 6.177631859423883e-06, + "loss": 1.05402136, + "memory(GiB)": 302.58, + "step": 162040, + "train_speed(iter/s)": 0.123374 + }, + { + "acc": 0.73184338, + "epoch": 0.9063162795509803, + "grad_norm": 6.28125, + "learning_rate": 6.176733148161801e-06, + "loss": 1.04078226, + "memory(GiB)": 302.58, + "step": 162060, + "train_speed(iter/s)": 0.123381 + }, + { + "acc": 0.74846358, + "epoch": 0.9064281290239595, + "grad_norm": 7.8125, + "learning_rate": 6.1758343966517165e-06, + "loss": 0.96481152, + "memory(GiB)": 302.58, + "step": 162080, + "train_speed(iter/s)": 0.123388 + }, + { + "acc": 0.73179874, + "epoch": 0.9065399784969388, + "grad_norm": 7.40625, + "learning_rate": 6.174935604924367e-06, + "loss": 1.07444572, + "memory(GiB)": 302.58, + "step": 162100, + "train_speed(iter/s)": 0.123395 + }, + { + "acc": 0.73466258, + "epoch": 0.9066518279699181, + "grad_norm": 5.5625, + "learning_rate": 6.174036773010492e-06, + "loss": 1.04632263, + "memory(GiB)": 302.58, + "step": 162120, + "train_speed(iter/s)": 0.123403 + }, + { + "acc": 0.73014669, + "epoch": 0.9067636774428973, + "grad_norm": 7.6875, + "learning_rate": 6.17313790094084e-06, + "loss": 1.04745789, + "memory(GiB)": 302.58, + "step": 162140, + "train_speed(iter/s)": 0.12341 + }, + { + "acc": 0.74425626, + "epoch": 0.9068755269158766, + "grad_norm": 8.1875, + "learning_rate": 6.17223898874615e-06, + "loss": 1.03379383, + "memory(GiB)": 302.58, + "step": 162160, + "train_speed(iter/s)": 0.123417 + }, + { + "acc": 0.75122733, + "epoch": 0.9069873763888558, + "grad_norm": 6.84375, + "learning_rate": 6.17134003645717e-06, + "loss": 0.99098473, + "memory(GiB)": 302.58, + "step": 162180, + "train_speed(iter/s)": 0.123425 + }, + { + "acc": 0.74191933, + "epoch": 0.9070992258618351, + "grad_norm": 7.375, + "learning_rate": 6.170441044104647e-06, + "loss": 1.00651722, + "memory(GiB)": 302.58, + "step": 162200, + "train_speed(iter/s)": 0.123432 + }, + { + "acc": 0.74202294, + "epoch": 0.9072110753348144, + "grad_norm": 9.1875, + "learning_rate": 6.169542011719328e-06, + "loss": 1.02560177, + "memory(GiB)": 302.58, + "step": 162220, + "train_speed(iter/s)": 0.123439 + }, + { + "acc": 0.74819899, + "epoch": 0.9073229248077936, + "grad_norm": 8.0, + "learning_rate": 6.168642939331964e-06, + "loss": 0.97925844, + "memory(GiB)": 302.58, + "step": 162240, + "train_speed(iter/s)": 0.123447 + }, + { + "acc": 0.73861122, + "epoch": 0.9074347742807729, + "grad_norm": 9.5, + "learning_rate": 6.167743826973305e-06, + "loss": 1.00612202, + "memory(GiB)": 302.58, + "step": 162260, + "train_speed(iter/s)": 0.123454 + }, + { + "acc": 0.72169704, + "epoch": 0.9075466237537522, + "grad_norm": 8.4375, + "learning_rate": 6.166844674674105e-06, + "loss": 1.08741245, + "memory(GiB)": 302.58, + "step": 162280, + "train_speed(iter/s)": 0.123461 + }, + { + "acc": 0.73379397, + "epoch": 0.9076584732267314, + "grad_norm": 7.625, + "learning_rate": 6.165945482465118e-06, + "loss": 1.03975096, + "memory(GiB)": 302.58, + "step": 162300, + "train_speed(iter/s)": 0.123468 + }, + { + "acc": 0.74839153, + "epoch": 0.9077703226997107, + "grad_norm": 7.15625, + "learning_rate": 6.165046250377098e-06, + "loss": 0.99585037, + "memory(GiB)": 302.58, + "step": 162320, + "train_speed(iter/s)": 0.123476 + }, + { + "acc": 0.74523888, + "epoch": 0.90788217217269, + "grad_norm": 8.875, + "learning_rate": 6.164146978440802e-06, + "loss": 0.98718262, + "memory(GiB)": 302.58, + "step": 162340, + "train_speed(iter/s)": 0.123482 + }, + { + "acc": 0.73195238, + "epoch": 0.9079940216456692, + "grad_norm": 6.0, + "learning_rate": 6.1632476666869865e-06, + "loss": 1.04407234, + "memory(GiB)": 302.58, + "step": 162360, + "train_speed(iter/s)": 0.123489 + }, + { + "acc": 0.75230012, + "epoch": 0.9081058711186485, + "grad_norm": 7.4375, + "learning_rate": 6.162348315146413e-06, + "loss": 0.95482664, + "memory(GiB)": 302.58, + "step": 162380, + "train_speed(iter/s)": 0.123496 + }, + { + "acc": 0.74503098, + "epoch": 0.9082177205916278, + "grad_norm": 8.6875, + "learning_rate": 6.161448923849842e-06, + "loss": 1.00070601, + "memory(GiB)": 302.58, + "step": 162400, + "train_speed(iter/s)": 0.123503 + }, + { + "acc": 0.75062709, + "epoch": 0.908329570064607, + "grad_norm": 7.0, + "learning_rate": 6.160549492828035e-06, + "loss": 0.96258602, + "memory(GiB)": 302.58, + "step": 162420, + "train_speed(iter/s)": 0.12351 + }, + { + "acc": 0.73001194, + "epoch": 0.9084414195375863, + "grad_norm": 8.9375, + "learning_rate": 6.159650022111755e-06, + "loss": 1.06887341, + "memory(GiB)": 302.58, + "step": 162440, + "train_speed(iter/s)": 0.123517 + }, + { + "acc": 0.73983507, + "epoch": 0.9085532690105655, + "grad_norm": 5.59375, + "learning_rate": 6.158750511731767e-06, + "loss": 1.0305562, + "memory(GiB)": 302.58, + "step": 162460, + "train_speed(iter/s)": 0.123524 + }, + { + "acc": 0.74104338, + "epoch": 0.9086651184835448, + "grad_norm": 7.15625, + "learning_rate": 6.157850961718837e-06, + "loss": 1.01300707, + "memory(GiB)": 302.58, + "step": 162480, + "train_speed(iter/s)": 0.123532 + }, + { + "acc": 0.74760146, + "epoch": 0.9087769679565241, + "grad_norm": 6.34375, + "learning_rate": 6.1569513721037335e-06, + "loss": 0.99194183, + "memory(GiB)": 302.58, + "step": 162500, + "train_speed(iter/s)": 0.123539 + }, + { + "acc": 0.74990416, + "epoch": 0.9088888174295033, + "grad_norm": 5.40625, + "learning_rate": 6.156051742917224e-06, + "loss": 0.99506779, + "memory(GiB)": 302.58, + "step": 162520, + "train_speed(iter/s)": 0.123546 + }, + { + "acc": 0.75977435, + "epoch": 0.9090006669024826, + "grad_norm": 6.78125, + "learning_rate": 6.155152074190078e-06, + "loss": 0.94146824, + "memory(GiB)": 302.58, + "step": 162540, + "train_speed(iter/s)": 0.123553 + }, + { + "acc": 0.74919491, + "epoch": 0.9091125163754619, + "grad_norm": 9.6875, + "learning_rate": 6.154252365953071e-06, + "loss": 0.98938732, + "memory(GiB)": 302.58, + "step": 162560, + "train_speed(iter/s)": 0.123561 + }, + { + "acc": 0.75209436, + "epoch": 0.9092243658484411, + "grad_norm": 8.25, + "learning_rate": 6.153352618236971e-06, + "loss": 0.96842785, + "memory(GiB)": 302.58, + "step": 162580, + "train_speed(iter/s)": 0.123567 + }, + { + "acc": 0.74486117, + "epoch": 0.9093362153214204, + "grad_norm": 8.8125, + "learning_rate": 6.152452831072554e-06, + "loss": 0.98405972, + "memory(GiB)": 302.58, + "step": 162600, + "train_speed(iter/s)": 0.123574 + }, + { + "acc": 0.75988679, + "epoch": 0.9094480647943997, + "grad_norm": 6.40625, + "learning_rate": 6.151553004490597e-06, + "loss": 0.93540545, + "memory(GiB)": 302.58, + "step": 162620, + "train_speed(iter/s)": 0.123581 + }, + { + "acc": 0.73953872, + "epoch": 0.9095599142673789, + "grad_norm": 6.75, + "learning_rate": 6.150653138521875e-06, + "loss": 1.02245922, + "memory(GiB)": 302.58, + "step": 162640, + "train_speed(iter/s)": 0.123588 + }, + { + "acc": 0.73825569, + "epoch": 0.9096717637403582, + "grad_norm": 5.6875, + "learning_rate": 6.1497532331971664e-06, + "loss": 1.02762527, + "memory(GiB)": 302.58, + "step": 162660, + "train_speed(iter/s)": 0.123595 + }, + { + "acc": 0.73629308, + "epoch": 0.9097836132133374, + "grad_norm": 8.0625, + "learning_rate": 6.148853288547253e-06, + "loss": 1.03429775, + "memory(GiB)": 302.58, + "step": 162680, + "train_speed(iter/s)": 0.123603 + }, + { + "acc": 0.76221533, + "epoch": 0.9098954626863167, + "grad_norm": 8.375, + "learning_rate": 6.147953304602914e-06, + "loss": 0.91400013, + "memory(GiB)": 302.58, + "step": 162700, + "train_speed(iter/s)": 0.12361 + }, + { + "acc": 0.7487771, + "epoch": 0.910007312159296, + "grad_norm": 8.75, + "learning_rate": 6.147053281394931e-06, + "loss": 1.01417532, + "memory(GiB)": 302.58, + "step": 162720, + "train_speed(iter/s)": 0.123617 + }, + { + "acc": 0.74841342, + "epoch": 0.9101191616322752, + "grad_norm": 6.34375, + "learning_rate": 6.1461532189540895e-06, + "loss": 0.97760839, + "memory(GiB)": 302.58, + "step": 162740, + "train_speed(iter/s)": 0.123624 + }, + { + "acc": 0.73355865, + "epoch": 0.9102310111052545, + "grad_norm": 6.84375, + "learning_rate": 6.1452531173111715e-06, + "loss": 1.05728703, + "memory(GiB)": 302.58, + "step": 162760, + "train_speed(iter/s)": 0.123631 + }, + { + "acc": 0.7387712, + "epoch": 0.9103428605782338, + "grad_norm": 8.6875, + "learning_rate": 6.1443529764969674e-06, + "loss": 1.03560667, + "memory(GiB)": 302.58, + "step": 162780, + "train_speed(iter/s)": 0.123638 + }, + { + "acc": 0.74075246, + "epoch": 0.910454710051213, + "grad_norm": 5.28125, + "learning_rate": 6.143452796542262e-06, + "loss": 1.00653915, + "memory(GiB)": 302.58, + "step": 162800, + "train_speed(iter/s)": 0.123644 + }, + { + "acc": 0.73886747, + "epoch": 0.9105665595241923, + "grad_norm": 7.5625, + "learning_rate": 6.142552577477846e-06, + "loss": 1.02755508, + "memory(GiB)": 302.58, + "step": 162820, + "train_speed(iter/s)": 0.123651 + }, + { + "acc": 0.74732175, + "epoch": 0.9106784089971716, + "grad_norm": 6.625, + "learning_rate": 6.141652319334508e-06, + "loss": 0.992418, + "memory(GiB)": 302.58, + "step": 162840, + "train_speed(iter/s)": 0.123659 + }, + { + "acc": 0.74738822, + "epoch": 0.9107902584701508, + "grad_norm": 7.25, + "learning_rate": 6.140752022143041e-06, + "loss": 0.98010302, + "memory(GiB)": 302.58, + "step": 162860, + "train_speed(iter/s)": 0.123666 + }, + { + "acc": 0.7511591, + "epoch": 0.9109021079431301, + "grad_norm": 9.1875, + "learning_rate": 6.139851685934239e-06, + "loss": 0.97590714, + "memory(GiB)": 302.58, + "step": 162880, + "train_speed(iter/s)": 0.123673 + }, + { + "acc": 0.75355434, + "epoch": 0.9110139574161094, + "grad_norm": 6.5625, + "learning_rate": 6.138951310738893e-06, + "loss": 0.96530371, + "memory(GiB)": 302.58, + "step": 162900, + "train_speed(iter/s)": 0.123681 + }, + { + "acc": 0.72371616, + "epoch": 0.9111258068890886, + "grad_norm": 5.5, + "learning_rate": 6.138050896587801e-06, + "loss": 1.10542002, + "memory(GiB)": 302.58, + "step": 162920, + "train_speed(iter/s)": 0.123688 + }, + { + "acc": 0.73819561, + "epoch": 0.9112376563620679, + "grad_norm": 7.8125, + "learning_rate": 6.1371504435117606e-06, + "loss": 1.0415102, + "memory(GiB)": 302.58, + "step": 162940, + "train_speed(iter/s)": 0.123695 + }, + { + "acc": 0.73490491, + "epoch": 0.9113495058350471, + "grad_norm": 8.1875, + "learning_rate": 6.1362499515415686e-06, + "loss": 1.05165195, + "memory(GiB)": 302.58, + "step": 162960, + "train_speed(iter/s)": 0.123702 + }, + { + "acc": 0.73027864, + "epoch": 0.9114613553080264, + "grad_norm": 6.75, + "learning_rate": 6.135349420708025e-06, + "loss": 1.07095213, + "memory(GiB)": 302.58, + "step": 162980, + "train_speed(iter/s)": 0.12371 + }, + { + "acc": 0.72838273, + "epoch": 0.9115732047810057, + "grad_norm": 8.625, + "learning_rate": 6.134448851041932e-06, + "loss": 1.08888559, + "memory(GiB)": 302.58, + "step": 163000, + "train_speed(iter/s)": 0.123716 + }, + { + "acc": 0.74392033, + "epoch": 0.9116850542539849, + "grad_norm": 5.03125, + "learning_rate": 6.13354824257409e-06, + "loss": 0.9886692, + "memory(GiB)": 302.58, + "step": 163020, + "train_speed(iter/s)": 0.123723 + }, + { + "acc": 0.7504734, + "epoch": 0.9117969037269642, + "grad_norm": 7.53125, + "learning_rate": 6.132647595335305e-06, + "loss": 0.98702927, + "memory(GiB)": 302.58, + "step": 163040, + "train_speed(iter/s)": 0.123731 + }, + { + "acc": 0.74820824, + "epoch": 0.9119087531999435, + "grad_norm": 7.90625, + "learning_rate": 6.13174690935638e-06, + "loss": 0.99497671, + "memory(GiB)": 302.58, + "step": 163060, + "train_speed(iter/s)": 0.123738 + }, + { + "acc": 0.75029645, + "epoch": 0.9120206026729227, + "grad_norm": 6.8125, + "learning_rate": 6.1308461846681214e-06, + "loss": 0.98208075, + "memory(GiB)": 302.58, + "step": 163080, + "train_speed(iter/s)": 0.123745 + }, + { + "acc": 0.75668082, + "epoch": 0.912132452145902, + "grad_norm": 6.5, + "learning_rate": 6.129945421301338e-06, + "loss": 0.95354395, + "memory(GiB)": 302.58, + "step": 163100, + "train_speed(iter/s)": 0.123752 + }, + { + "acc": 0.73659582, + "epoch": 0.9122443016188813, + "grad_norm": 8.0625, + "learning_rate": 6.129044619286838e-06, + "loss": 1.04964333, + "memory(GiB)": 302.58, + "step": 163120, + "train_speed(iter/s)": 0.123759 + }, + { + "acc": 0.73793211, + "epoch": 0.9123561510918605, + "grad_norm": 6.8125, + "learning_rate": 6.128143778655433e-06, + "loss": 1.01754389, + "memory(GiB)": 302.58, + "step": 163140, + "train_speed(iter/s)": 0.123767 + }, + { + "acc": 0.72371807, + "epoch": 0.9124680005648398, + "grad_norm": 9.1875, + "learning_rate": 6.127242899437933e-06, + "loss": 1.08847246, + "memory(GiB)": 302.58, + "step": 163160, + "train_speed(iter/s)": 0.123774 + }, + { + "acc": 0.74240026, + "epoch": 0.912579850037819, + "grad_norm": 9.1875, + "learning_rate": 6.126341981665151e-06, + "loss": 1.01227827, + "memory(GiB)": 302.58, + "step": 163180, + "train_speed(iter/s)": 0.123781 + }, + { + "acc": 0.74810362, + "epoch": 0.9126916995107983, + "grad_norm": 5.875, + "learning_rate": 6.125441025367902e-06, + "loss": 0.96545191, + "memory(GiB)": 302.58, + "step": 163200, + "train_speed(iter/s)": 0.123788 + }, + { + "acc": 0.75468884, + "epoch": 0.9128035489837776, + "grad_norm": 9.6875, + "learning_rate": 6.124540030577e-06, + "loss": 0.94979734, + "memory(GiB)": 302.58, + "step": 163220, + "train_speed(iter/s)": 0.123796 + }, + { + "acc": 0.74360094, + "epoch": 0.9129153984567568, + "grad_norm": 5.4375, + "learning_rate": 6.123638997323264e-06, + "loss": 1.00063524, + "memory(GiB)": 302.58, + "step": 163240, + "train_speed(iter/s)": 0.123802 + }, + { + "acc": 0.74580679, + "epoch": 0.9130272479297361, + "grad_norm": 6.03125, + "learning_rate": 6.122737925637512e-06, + "loss": 1.01144962, + "memory(GiB)": 302.58, + "step": 163260, + "train_speed(iter/s)": 0.12381 + }, + { + "acc": 0.73695407, + "epoch": 0.9131390974027154, + "grad_norm": 6.46875, + "learning_rate": 6.121836815550564e-06, + "loss": 1.02680645, + "memory(GiB)": 302.58, + "step": 163280, + "train_speed(iter/s)": 0.123817 + }, + { + "acc": 0.75583372, + "epoch": 0.9132509468756946, + "grad_norm": 8.4375, + "learning_rate": 6.120935667093237e-06, + "loss": 0.95092869, + "memory(GiB)": 302.58, + "step": 163300, + "train_speed(iter/s)": 0.123824 + }, + { + "acc": 0.75178494, + "epoch": 0.9133627963486739, + "grad_norm": 6.8125, + "learning_rate": 6.120034480296358e-06, + "loss": 0.96414156, + "memory(GiB)": 302.58, + "step": 163320, + "train_speed(iter/s)": 0.123831 + }, + { + "acc": 0.72785687, + "epoch": 0.9134746458216532, + "grad_norm": 7.96875, + "learning_rate": 6.119133255190746e-06, + "loss": 1.06924829, + "memory(GiB)": 302.58, + "step": 163340, + "train_speed(iter/s)": 0.123838 + }, + { + "acc": 0.75558367, + "epoch": 0.9135864952946325, + "grad_norm": 8.0625, + "learning_rate": 6.118231991807229e-06, + "loss": 0.96853514, + "memory(GiB)": 302.58, + "step": 163360, + "train_speed(iter/s)": 0.123845 + }, + { + "acc": 0.73588319, + "epoch": 0.9136983447676118, + "grad_norm": 7.34375, + "learning_rate": 6.11733069017663e-06, + "loss": 1.03414211, + "memory(GiB)": 302.58, + "step": 163380, + "train_speed(iter/s)": 0.123852 + }, + { + "acc": 0.73085666, + "epoch": 0.9138101942405911, + "grad_norm": 5.5, + "learning_rate": 6.1164293503297825e-06, + "loss": 1.05639105, + "memory(GiB)": 302.58, + "step": 163400, + "train_speed(iter/s)": 0.123859 + }, + { + "acc": 0.73722682, + "epoch": 0.9139220437135703, + "grad_norm": 7.71875, + "learning_rate": 6.115527972297509e-06, + "loss": 1.0374156, + "memory(GiB)": 302.58, + "step": 163420, + "train_speed(iter/s)": 0.123867 + }, + { + "acc": 0.73973808, + "epoch": 0.9140338931865496, + "grad_norm": 7.21875, + "learning_rate": 6.114626556110642e-06, + "loss": 1.03148642, + "memory(GiB)": 302.58, + "step": 163440, + "train_speed(iter/s)": 0.123873 + }, + { + "acc": 0.73667603, + "epoch": 0.9141457426595289, + "grad_norm": 5.71875, + "learning_rate": 6.113725101800012e-06, + "loss": 1.03032303, + "memory(GiB)": 302.58, + "step": 163460, + "train_speed(iter/s)": 0.123881 + }, + { + "acc": 0.74326296, + "epoch": 0.9142575921325081, + "grad_norm": 8.3125, + "learning_rate": 6.112823609396452e-06, + "loss": 0.99399452, + "memory(GiB)": 302.58, + "step": 163480, + "train_speed(iter/s)": 0.123887 + }, + { + "acc": 0.74175777, + "epoch": 0.9143694416054874, + "grad_norm": 5.71875, + "learning_rate": 6.111922078930797e-06, + "loss": 1.01235161, + "memory(GiB)": 302.58, + "step": 163500, + "train_speed(iter/s)": 0.123894 + }, + { + "acc": 0.73302536, + "epoch": 0.9144812910784667, + "grad_norm": 6.875, + "learning_rate": 6.11102051043388e-06, + "loss": 1.05288839, + "memory(GiB)": 302.58, + "step": 163520, + "train_speed(iter/s)": 0.123901 + }, + { + "acc": 0.74744244, + "epoch": 0.9145931405514459, + "grad_norm": 5.59375, + "learning_rate": 6.110118903936538e-06, + "loss": 0.99481401, + "memory(GiB)": 302.58, + "step": 163540, + "train_speed(iter/s)": 0.123908 + }, + { + "acc": 0.73678298, + "epoch": 0.9147049900244252, + "grad_norm": 7.21875, + "learning_rate": 6.109217259469611e-06, + "loss": 1.03667278, + "memory(GiB)": 302.58, + "step": 163560, + "train_speed(iter/s)": 0.123914 + }, + { + "acc": 0.75201054, + "epoch": 0.9148168394974044, + "grad_norm": 5.84375, + "learning_rate": 6.108315577063935e-06, + "loss": 0.94824724, + "memory(GiB)": 302.58, + "step": 163580, + "train_speed(iter/s)": 0.123922 + }, + { + "acc": 0.7462378, + "epoch": 0.9149286889703837, + "grad_norm": 8.3125, + "learning_rate": 6.107413856750353e-06, + "loss": 1.00042915, + "memory(GiB)": 302.58, + "step": 163600, + "train_speed(iter/s)": 0.123929 + }, + { + "acc": 0.75420814, + "epoch": 0.915040538443363, + "grad_norm": 7.0625, + "learning_rate": 6.106512098559706e-06, + "loss": 0.96413374, + "memory(GiB)": 302.58, + "step": 163620, + "train_speed(iter/s)": 0.123936 + }, + { + "acc": 0.74649782, + "epoch": 0.9151523879163422, + "grad_norm": 6.59375, + "learning_rate": 6.105610302522835e-06, + "loss": 0.99105682, + "memory(GiB)": 302.58, + "step": 163640, + "train_speed(iter/s)": 0.123943 + }, + { + "acc": 0.74450874, + "epoch": 0.9152642373893215, + "grad_norm": 8.375, + "learning_rate": 6.104708468670586e-06, + "loss": 1.00079155, + "memory(GiB)": 302.58, + "step": 163660, + "train_speed(iter/s)": 0.12395 + }, + { + "acc": 0.75019798, + "epoch": 0.9153760868623008, + "grad_norm": 7.75, + "learning_rate": 6.103806597033806e-06, + "loss": 0.9715354, + "memory(GiB)": 302.58, + "step": 163680, + "train_speed(iter/s)": 0.123957 + }, + { + "acc": 0.73087869, + "epoch": 0.91548793633528, + "grad_norm": 7.71875, + "learning_rate": 6.102904687643339e-06, + "loss": 1.07289915, + "memory(GiB)": 302.58, + "step": 163700, + "train_speed(iter/s)": 0.123964 + }, + { + "acc": 0.75747161, + "epoch": 0.9155997858082593, + "grad_norm": 9.6875, + "learning_rate": 6.1020027405300355e-06, + "loss": 0.96005926, + "memory(GiB)": 302.58, + "step": 163720, + "train_speed(iter/s)": 0.123972 + }, + { + "acc": 0.74299746, + "epoch": 0.9157116352812386, + "grad_norm": 8.125, + "learning_rate": 6.101100755724743e-06, + "loss": 0.99915323, + "memory(GiB)": 302.58, + "step": 163740, + "train_speed(iter/s)": 0.123979 + }, + { + "acc": 0.73427472, + "epoch": 0.9158234847542178, + "grad_norm": 8.0625, + "learning_rate": 6.100198733258314e-06, + "loss": 1.05535564, + "memory(GiB)": 302.58, + "step": 163760, + "train_speed(iter/s)": 0.123986 + }, + { + "acc": 0.73694825, + "epoch": 0.9159353342271971, + "grad_norm": 6.78125, + "learning_rate": 6.099296673161599e-06, + "loss": 1.03148413, + "memory(GiB)": 302.58, + "step": 163780, + "train_speed(iter/s)": 0.123993 + }, + { + "acc": 0.72953057, + "epoch": 0.9160471837001763, + "grad_norm": 10.0625, + "learning_rate": 6.098394575465451e-06, + "loss": 1.10449457, + "memory(GiB)": 302.58, + "step": 163800, + "train_speed(iter/s)": 0.124 + }, + { + "acc": 0.73583813, + "epoch": 0.9161590331731556, + "grad_norm": 7.65625, + "learning_rate": 6.097492440200727e-06, + "loss": 1.02712021, + "memory(GiB)": 302.58, + "step": 163820, + "train_speed(iter/s)": 0.124007 + }, + { + "acc": 0.74884267, + "epoch": 0.9162708826461349, + "grad_norm": 5.9375, + "learning_rate": 6.096590267398281e-06, + "loss": 0.98685589, + "memory(GiB)": 302.58, + "step": 163840, + "train_speed(iter/s)": 0.124014 + }, + { + "acc": 0.73968306, + "epoch": 0.9163827321191141, + "grad_norm": 8.375, + "learning_rate": 6.095688057088971e-06, + "loss": 1.04356079, + "memory(GiB)": 302.58, + "step": 163860, + "train_speed(iter/s)": 0.124021 + }, + { + "acc": 0.74556942, + "epoch": 0.9164945815920934, + "grad_norm": 7.65625, + "learning_rate": 6.094785809303655e-06, + "loss": 0.98143196, + "memory(GiB)": 302.58, + "step": 163880, + "train_speed(iter/s)": 0.124028 + }, + { + "acc": 0.74615335, + "epoch": 0.9166064310650727, + "grad_norm": 6.5625, + "learning_rate": 6.093883524073193e-06, + "loss": 0.99767265, + "memory(GiB)": 302.58, + "step": 163900, + "train_speed(iter/s)": 0.124035 + }, + { + "acc": 0.72843523, + "epoch": 0.9167182805380519, + "grad_norm": 4.6875, + "learning_rate": 6.092981201428445e-06, + "loss": 1.09341173, + "memory(GiB)": 302.58, + "step": 163920, + "train_speed(iter/s)": 0.124042 + }, + { + "acc": 0.74977307, + "epoch": 0.9168301300110312, + "grad_norm": 6.46875, + "learning_rate": 6.092078841400275e-06, + "loss": 0.98578205, + "memory(GiB)": 302.58, + "step": 163940, + "train_speed(iter/s)": 0.124048 + }, + { + "acc": 0.73520374, + "epoch": 0.9169419794840105, + "grad_norm": 7.3125, + "learning_rate": 6.091176444019545e-06, + "loss": 1.04049711, + "memory(GiB)": 302.58, + "step": 163960, + "train_speed(iter/s)": 0.124056 + }, + { + "acc": 0.73779345, + "epoch": 0.9170538289569897, + "grad_norm": 5.28125, + "learning_rate": 6.090274009317122e-06, + "loss": 1.02576971, + "memory(GiB)": 302.58, + "step": 163980, + "train_speed(iter/s)": 0.124062 + }, + { + "acc": 0.74325972, + "epoch": 0.917165678429969, + "grad_norm": 6.15625, + "learning_rate": 6.089371537323868e-06, + "loss": 0.99254456, + "memory(GiB)": 302.58, + "step": 164000, + "train_speed(iter/s)": 0.124069 + }, + { + "epoch": 0.917165678429969, + "eval_acc": 0.7059611843020497, + "eval_loss": 1.0146350860595703, + "eval_runtime": 7505.1428, + "eval_samples_per_second": 10.031, + "eval_steps_per_second": 10.031, + "step": 164000 + }, + { + "acc": 0.75412016, + "epoch": 0.9172775279029483, + "grad_norm": 7.28125, + "learning_rate": 6.088469028070656e-06, + "loss": 0.96883307, + "memory(GiB)": 302.58, + "step": 164020, + "train_speed(iter/s)": 0.123364 + }, + { + "acc": 0.73109179, + "epoch": 0.9173893773759275, + "grad_norm": 5.1875, + "learning_rate": 6.087566481588351e-06, + "loss": 1.07322426, + "memory(GiB)": 302.58, + "step": 164040, + "train_speed(iter/s)": 0.123371 + }, + { + "acc": 0.73541627, + "epoch": 0.9175012268489068, + "grad_norm": 5.71875, + "learning_rate": 6.086663897907823e-06, + "loss": 1.0426199, + "memory(GiB)": 302.58, + "step": 164060, + "train_speed(iter/s)": 0.123378 + }, + { + "acc": 0.73538966, + "epoch": 0.917613076321886, + "grad_norm": 6.5625, + "learning_rate": 6.085761277059945e-06, + "loss": 1.05294399, + "memory(GiB)": 302.58, + "step": 164080, + "train_speed(iter/s)": 0.123385 + }, + { + "acc": 0.75797844, + "epoch": 0.9177249257948653, + "grad_norm": 7.90625, + "learning_rate": 6.0848586190755866e-06, + "loss": 0.96010418, + "memory(GiB)": 302.58, + "step": 164100, + "train_speed(iter/s)": 0.123392 + }, + { + "acc": 0.73487287, + "epoch": 0.9178367752678446, + "grad_norm": 6.25, + "learning_rate": 6.083955923985625e-06, + "loss": 1.04691896, + "memory(GiB)": 302.58, + "step": 164120, + "train_speed(iter/s)": 0.1234 + }, + { + "acc": 0.74278393, + "epoch": 0.9179486247408238, + "grad_norm": 7.875, + "learning_rate": 6.0830531918209346e-06, + "loss": 1.02472458, + "memory(GiB)": 302.58, + "step": 164140, + "train_speed(iter/s)": 0.123407 + }, + { + "acc": 0.7426455, + "epoch": 0.9180604742138031, + "grad_norm": 6.375, + "learning_rate": 6.0821504226123895e-06, + "loss": 1.01215248, + "memory(GiB)": 302.58, + "step": 164160, + "train_speed(iter/s)": 0.123414 + }, + { + "acc": 0.72622848, + "epoch": 0.9181723236867824, + "grad_norm": 8.375, + "learning_rate": 6.081247616390869e-06, + "loss": 1.09261923, + "memory(GiB)": 302.58, + "step": 164180, + "train_speed(iter/s)": 0.123422 + }, + { + "acc": 0.7341188, + "epoch": 0.9182841731597616, + "grad_norm": 4.53125, + "learning_rate": 6.080344773187251e-06, + "loss": 1.0704752, + "memory(GiB)": 302.58, + "step": 164200, + "train_speed(iter/s)": 0.123429 + }, + { + "acc": 0.73093562, + "epoch": 0.9183960226327409, + "grad_norm": 6.625, + "learning_rate": 6.079441893032416e-06, + "loss": 1.09193449, + "memory(GiB)": 302.58, + "step": 164220, + "train_speed(iter/s)": 0.123436 + }, + { + "acc": 0.7396574, + "epoch": 0.9185078721057202, + "grad_norm": 5.28125, + "learning_rate": 6.0785389759572454e-06, + "loss": 1.01572571, + "memory(GiB)": 302.58, + "step": 164240, + "train_speed(iter/s)": 0.123444 + }, + { + "acc": 0.75372467, + "epoch": 0.9186197215786994, + "grad_norm": 7.03125, + "learning_rate": 6.077636021992622e-06, + "loss": 0.94633417, + "memory(GiB)": 302.58, + "step": 164260, + "train_speed(iter/s)": 0.123451 + }, + { + "acc": 0.73448682, + "epoch": 0.9187315710516787, + "grad_norm": 4.9375, + "learning_rate": 6.076733031169431e-06, + "loss": 1.04325943, + "memory(GiB)": 302.58, + "step": 164280, + "train_speed(iter/s)": 0.123458 + }, + { + "acc": 0.73279142, + "epoch": 0.918843420524658, + "grad_norm": 5.28125, + "learning_rate": 6.075830003518554e-06, + "loss": 1.06051531, + "memory(GiB)": 302.58, + "step": 164300, + "train_speed(iter/s)": 0.123465 + }, + { + "acc": 0.73226228, + "epoch": 0.9189552699976372, + "grad_norm": 8.8125, + "learning_rate": 6.074926939070881e-06, + "loss": 1.0565073, + "memory(GiB)": 302.58, + "step": 164320, + "train_speed(iter/s)": 0.123473 + }, + { + "acc": 0.74693894, + "epoch": 0.9190671194706165, + "grad_norm": 6.90625, + "learning_rate": 6.074023837857297e-06, + "loss": 1.00552435, + "memory(GiB)": 302.58, + "step": 164340, + "train_speed(iter/s)": 0.123479 + }, + { + "acc": 0.74351263, + "epoch": 0.9191789689435957, + "grad_norm": 7.21875, + "learning_rate": 6.073120699908694e-06, + "loss": 1.0145443, + "memory(GiB)": 302.58, + "step": 164360, + "train_speed(iter/s)": 0.123486 + }, + { + "acc": 0.7441227, + "epoch": 0.919290818416575, + "grad_norm": 6.625, + "learning_rate": 6.0722175252559564e-06, + "loss": 1.01506824, + "memory(GiB)": 302.58, + "step": 164380, + "train_speed(iter/s)": 0.123494 + }, + { + "acc": 0.73419666, + "epoch": 0.9194026678895543, + "grad_norm": 5.8125, + "learning_rate": 6.071314313929982e-06, + "loss": 1.0734108, + "memory(GiB)": 302.58, + "step": 164400, + "train_speed(iter/s)": 0.123501 + }, + { + "acc": 0.73793082, + "epoch": 0.9195145173625335, + "grad_norm": 6.5, + "learning_rate": 6.070411065961662e-06, + "loss": 1.02767305, + "memory(GiB)": 302.58, + "step": 164420, + "train_speed(iter/s)": 0.123508 + }, + { + "acc": 0.75870833, + "epoch": 0.9196263668355128, + "grad_norm": 6.15625, + "learning_rate": 6.0695077813818884e-06, + "loss": 0.93437366, + "memory(GiB)": 302.58, + "step": 164440, + "train_speed(iter/s)": 0.123515 + }, + { + "acc": 0.73646712, + "epoch": 0.9197382163084921, + "grad_norm": 8.25, + "learning_rate": 6.068604460221558e-06, + "loss": 1.02845592, + "memory(GiB)": 302.58, + "step": 164460, + "train_speed(iter/s)": 0.123522 + }, + { + "acc": 0.74173222, + "epoch": 0.9198500657814713, + "grad_norm": 5.46875, + "learning_rate": 6.067701102511566e-06, + "loss": 1.02794952, + "memory(GiB)": 302.58, + "step": 164480, + "train_speed(iter/s)": 0.123529 + }, + { + "acc": 0.73391113, + "epoch": 0.9199619152544506, + "grad_norm": 7.75, + "learning_rate": 6.06679770828281e-06, + "loss": 1.05462809, + "memory(GiB)": 302.58, + "step": 164500, + "train_speed(iter/s)": 0.123536 + }, + { + "acc": 0.74793267, + "epoch": 0.9200737647274299, + "grad_norm": 5.8125, + "learning_rate": 6.06589427756619e-06, + "loss": 0.98595514, + "memory(GiB)": 302.58, + "step": 164520, + "train_speed(iter/s)": 0.123543 + }, + { + "acc": 0.7432147, + "epoch": 0.9201856142004091, + "grad_norm": 6.25, + "learning_rate": 6.0649908103926056e-06, + "loss": 0.99920282, + "memory(GiB)": 302.58, + "step": 164540, + "train_speed(iter/s)": 0.123551 + }, + { + "acc": 0.74679637, + "epoch": 0.9202974636733884, + "grad_norm": 6.09375, + "learning_rate": 6.0640873067929605e-06, + "loss": 0.97135029, + "memory(GiB)": 302.58, + "step": 164560, + "train_speed(iter/s)": 0.123558 + }, + { + "acc": 0.7392508, + "epoch": 0.9204093131463676, + "grad_norm": 7.84375, + "learning_rate": 6.0631837667981545e-06, + "loss": 1.00935488, + "memory(GiB)": 302.58, + "step": 164580, + "train_speed(iter/s)": 0.123565 + }, + { + "acc": 0.76223149, + "epoch": 0.9205211626193469, + "grad_norm": 5.28125, + "learning_rate": 6.062280190439092e-06, + "loss": 0.91546507, + "memory(GiB)": 302.58, + "step": 164600, + "train_speed(iter/s)": 0.123572 + }, + { + "acc": 0.74519548, + "epoch": 0.9206330120923262, + "grad_norm": 8.375, + "learning_rate": 6.061376577746678e-06, + "loss": 0.99370756, + "memory(GiB)": 302.58, + "step": 164620, + "train_speed(iter/s)": 0.123579 + }, + { + "acc": 0.71996231, + "epoch": 0.9207448615653054, + "grad_norm": 7.375, + "learning_rate": 6.0604729287518195e-06, + "loss": 1.11046534, + "memory(GiB)": 302.58, + "step": 164640, + "train_speed(iter/s)": 0.123587 + }, + { + "acc": 0.73442888, + "epoch": 0.9208567110382847, + "grad_norm": 5.90625, + "learning_rate": 6.0595692434854235e-06, + "loss": 1.04029036, + "memory(GiB)": 302.58, + "step": 164660, + "train_speed(iter/s)": 0.123594 + }, + { + "acc": 0.7374516, + "epoch": 0.920968560511264, + "grad_norm": 7.0625, + "learning_rate": 6.058665521978401e-06, + "loss": 1.02389431, + "memory(GiB)": 302.58, + "step": 164680, + "train_speed(iter/s)": 0.123601 + }, + { + "acc": 0.72468352, + "epoch": 0.9210804099842432, + "grad_norm": 3.984375, + "learning_rate": 6.057761764261661e-06, + "loss": 1.09872751, + "memory(GiB)": 302.58, + "step": 164700, + "train_speed(iter/s)": 0.123608 + }, + { + "acc": 0.73618555, + "epoch": 0.9211922594572225, + "grad_norm": 6.6875, + "learning_rate": 6.056857970366113e-06, + "loss": 1.02239199, + "memory(GiB)": 302.58, + "step": 164720, + "train_speed(iter/s)": 0.123615 + }, + { + "acc": 0.74972348, + "epoch": 0.9213041089302018, + "grad_norm": 6.90625, + "learning_rate": 6.055954140322673e-06, + "loss": 0.98147535, + "memory(GiB)": 302.58, + "step": 164740, + "train_speed(iter/s)": 0.123622 + }, + { + "acc": 0.74931407, + "epoch": 0.921415958403181, + "grad_norm": 7.5625, + "learning_rate": 6.055050274162252e-06, + "loss": 0.98797827, + "memory(GiB)": 302.58, + "step": 164760, + "train_speed(iter/s)": 0.123629 + }, + { + "acc": 0.75257339, + "epoch": 0.9215278078761603, + "grad_norm": 6.53125, + "learning_rate": 6.054146371915767e-06, + "loss": 0.96613522, + "memory(GiB)": 302.58, + "step": 164780, + "train_speed(iter/s)": 0.123636 + }, + { + "acc": 0.72963786, + "epoch": 0.9216396573491396, + "grad_norm": 5.28125, + "learning_rate": 6.053242433614132e-06, + "loss": 1.0804594, + "memory(GiB)": 302.58, + "step": 164800, + "train_speed(iter/s)": 0.123643 + }, + { + "acc": 0.76332741, + "epoch": 0.9217515068221188, + "grad_norm": 9.0, + "learning_rate": 6.052338459288267e-06, + "loss": 0.9382019, + "memory(GiB)": 302.58, + "step": 164820, + "train_speed(iter/s)": 0.12365 + }, + { + "acc": 0.7504077, + "epoch": 0.9218633562950981, + "grad_norm": 6.9375, + "learning_rate": 6.051434448969089e-06, + "loss": 0.97313004, + "memory(GiB)": 302.58, + "step": 164840, + "train_speed(iter/s)": 0.123658 + }, + { + "acc": 0.74955001, + "epoch": 0.9219752057680773, + "grad_norm": 9.9375, + "learning_rate": 6.0505304026875204e-06, + "loss": 0.99090185, + "memory(GiB)": 302.58, + "step": 164860, + "train_speed(iter/s)": 0.123665 + }, + { + "acc": 0.73993936, + "epoch": 0.9220870552410566, + "grad_norm": 6.40625, + "learning_rate": 6.04962632047448e-06, + "loss": 1.03776169, + "memory(GiB)": 302.58, + "step": 164880, + "train_speed(iter/s)": 0.123672 + }, + { + "acc": 0.73799586, + "epoch": 0.9221989047140359, + "grad_norm": 4.71875, + "learning_rate": 6.048722202360891e-06, + "loss": 1.02893801, + "memory(GiB)": 302.58, + "step": 164900, + "train_speed(iter/s)": 0.123679 + }, + { + "acc": 0.74242725, + "epoch": 0.9223107541870151, + "grad_norm": 7.0625, + "learning_rate": 6.047818048377679e-06, + "loss": 1.01594, + "memory(GiB)": 302.58, + "step": 164920, + "train_speed(iter/s)": 0.123686 + }, + { + "acc": 0.72532258, + "epoch": 0.9224226036599944, + "grad_norm": 5.625, + "learning_rate": 6.046913858555764e-06, + "loss": 1.08545485, + "memory(GiB)": 302.58, + "step": 164940, + "train_speed(iter/s)": 0.123693 + }, + { + "acc": 0.75954285, + "epoch": 0.9225344531329737, + "grad_norm": 8.625, + "learning_rate": 6.046009632926078e-06, + "loss": 0.93033371, + "memory(GiB)": 302.58, + "step": 164960, + "train_speed(iter/s)": 0.1237 + }, + { + "acc": 0.7573278, + "epoch": 0.9226463026059529, + "grad_norm": 11.5, + "learning_rate": 6.045105371519544e-06, + "loss": 0.95521774, + "memory(GiB)": 302.58, + "step": 164980, + "train_speed(iter/s)": 0.123707 + }, + { + "acc": 0.73537722, + "epoch": 0.9227581520789322, + "grad_norm": 5.125, + "learning_rate": 6.044201074367093e-06, + "loss": 1.03214254, + "memory(GiB)": 302.58, + "step": 165000, + "train_speed(iter/s)": 0.123714 + }, + { + "acc": 0.73034253, + "epoch": 0.9228700015519115, + "grad_norm": 5.71875, + "learning_rate": 6.043296741499656e-06, + "loss": 1.07116137, + "memory(GiB)": 302.58, + "step": 165020, + "train_speed(iter/s)": 0.123721 + }, + { + "acc": 0.74211884, + "epoch": 0.9229818510248907, + "grad_norm": 6.75, + "learning_rate": 6.04239237294816e-06, + "loss": 1.01457262, + "memory(GiB)": 302.58, + "step": 165040, + "train_speed(iter/s)": 0.123727 + }, + { + "acc": 0.74701548, + "epoch": 0.92309370049787, + "grad_norm": 6.53125, + "learning_rate": 6.04148796874354e-06, + "loss": 0.984935, + "memory(GiB)": 302.58, + "step": 165060, + "train_speed(iter/s)": 0.123735 + }, + { + "acc": 0.72118063, + "epoch": 0.9232055499708492, + "grad_norm": 7.53125, + "learning_rate": 6.040583528916728e-06, + "loss": 1.1012804, + "memory(GiB)": 302.58, + "step": 165080, + "train_speed(iter/s)": 0.123742 + }, + { + "acc": 0.74710574, + "epoch": 0.9233173994438285, + "grad_norm": 7.0, + "learning_rate": 6.039679053498662e-06, + "loss": 0.9879158, + "memory(GiB)": 302.58, + "step": 165100, + "train_speed(iter/s)": 0.12375 + }, + { + "acc": 0.7358129, + "epoch": 0.9234292489168078, + "grad_norm": 6.09375, + "learning_rate": 6.038774542520274e-06, + "loss": 1.01815567, + "memory(GiB)": 302.58, + "step": 165120, + "train_speed(iter/s)": 0.123757 + }, + { + "acc": 0.75836039, + "epoch": 0.923541098389787, + "grad_norm": 6.375, + "learning_rate": 6.037869996012503e-06, + "loss": 0.93777113, + "memory(GiB)": 302.58, + "step": 165140, + "train_speed(iter/s)": 0.123764 + }, + { + "acc": 0.73399224, + "epoch": 0.9236529478627663, + "grad_norm": 6.4375, + "learning_rate": 6.0369654140062885e-06, + "loss": 1.04893923, + "memory(GiB)": 302.58, + "step": 165160, + "train_speed(iter/s)": 0.123772 + }, + { + "acc": 0.72483344, + "epoch": 0.9237647973357456, + "grad_norm": 7.78125, + "learning_rate": 6.036060796532567e-06, + "loss": 1.09205999, + "memory(GiB)": 302.58, + "step": 165180, + "train_speed(iter/s)": 0.123779 + }, + { + "acc": 0.73490839, + "epoch": 0.9238766468087248, + "grad_norm": 7.53125, + "learning_rate": 6.035156143622281e-06, + "loss": 1.06131277, + "memory(GiB)": 302.58, + "step": 165200, + "train_speed(iter/s)": 0.123786 + }, + { + "acc": 0.74550509, + "epoch": 0.9239884962817041, + "grad_norm": 8.1875, + "learning_rate": 6.034251455306372e-06, + "loss": 1.00213852, + "memory(GiB)": 302.58, + "step": 165220, + "train_speed(iter/s)": 0.123792 + }, + { + "acc": 0.7336225, + "epoch": 0.9241003457546834, + "grad_norm": 5.46875, + "learning_rate": 6.033346731615786e-06, + "loss": 1.05441465, + "memory(GiB)": 302.58, + "step": 165240, + "train_speed(iter/s)": 0.123799 + }, + { + "acc": 0.75112276, + "epoch": 0.9242121952276626, + "grad_norm": 7.0625, + "learning_rate": 6.032441972581463e-06, + "loss": 0.99024124, + "memory(GiB)": 302.58, + "step": 165260, + "train_speed(iter/s)": 0.123807 + }, + { + "acc": 0.75123196, + "epoch": 0.9243240447006419, + "grad_norm": 7.8125, + "learning_rate": 6.031537178234353e-06, + "loss": 0.94918537, + "memory(GiB)": 302.58, + "step": 165280, + "train_speed(iter/s)": 0.123814 + }, + { + "acc": 0.74502444, + "epoch": 0.9244358941736212, + "grad_norm": 10.625, + "learning_rate": 6.0306323486053985e-06, + "loss": 1.04122076, + "memory(GiB)": 302.58, + "step": 165300, + "train_speed(iter/s)": 0.123821 + }, + { + "acc": 0.75212135, + "epoch": 0.9245477436466004, + "grad_norm": 7.90625, + "learning_rate": 6.0297274837255505e-06, + "loss": 0.94583406, + "memory(GiB)": 302.58, + "step": 165320, + "train_speed(iter/s)": 0.123828 + }, + { + "acc": 0.73707795, + "epoch": 0.9246595931195797, + "grad_norm": 6.625, + "learning_rate": 6.0288225836257575e-06, + "loss": 1.0437356, + "memory(GiB)": 302.58, + "step": 165340, + "train_speed(iter/s)": 0.123835 + }, + { + "acc": 0.72775774, + "epoch": 0.924771442592559, + "grad_norm": 5.90625, + "learning_rate": 6.027917648336969e-06, + "loss": 1.08901043, + "memory(GiB)": 302.58, + "step": 165360, + "train_speed(iter/s)": 0.123842 + }, + { + "acc": 0.74365478, + "epoch": 0.9248832920655382, + "grad_norm": 4.90625, + "learning_rate": 6.027012677890137e-06, + "loss": 1.03333302, + "memory(GiB)": 302.58, + "step": 165380, + "train_speed(iter/s)": 0.123847 + }, + { + "acc": 0.73434677, + "epoch": 0.9249951415385175, + "grad_norm": 8.0625, + "learning_rate": 6.026107672316217e-06, + "loss": 1.04104137, + "memory(GiB)": 302.58, + "step": 165400, + "train_speed(iter/s)": 0.123854 + }, + { + "acc": 0.74107451, + "epoch": 0.9251069910114967, + "grad_norm": 6.6875, + "learning_rate": 6.02520263164616e-06, + "loss": 1.0119977, + "memory(GiB)": 302.58, + "step": 165420, + "train_speed(iter/s)": 0.123861 + }, + { + "acc": 0.73203464, + "epoch": 0.925218840484476, + "grad_norm": 6.84375, + "learning_rate": 6.024297555910922e-06, + "loss": 1.05115404, + "memory(GiB)": 302.58, + "step": 165440, + "train_speed(iter/s)": 0.123867 + }, + { + "acc": 0.73317461, + "epoch": 0.9253306899574553, + "grad_norm": 8.125, + "learning_rate": 6.023392445141459e-06, + "loss": 1.04445019, + "memory(GiB)": 302.58, + "step": 165460, + "train_speed(iter/s)": 0.123875 + }, + { + "acc": 0.73582211, + "epoch": 0.9254425394304345, + "grad_norm": 6.84375, + "learning_rate": 6.022487299368731e-06, + "loss": 1.03426571, + "memory(GiB)": 302.58, + "step": 165480, + "train_speed(iter/s)": 0.123881 + }, + { + "acc": 0.73908925, + "epoch": 0.9255543889034138, + "grad_norm": 6.46875, + "learning_rate": 6.021582118623695e-06, + "loss": 1.02346973, + "memory(GiB)": 302.58, + "step": 165500, + "train_speed(iter/s)": 0.123888 + }, + { + "acc": 0.74185681, + "epoch": 0.9256662383763931, + "grad_norm": 7.0, + "learning_rate": 6.020676902937308e-06, + "loss": 1.02234287, + "memory(GiB)": 302.58, + "step": 165520, + "train_speed(iter/s)": 0.123895 + }, + { + "acc": 0.73942494, + "epoch": 0.9257780878493723, + "grad_norm": 5.71875, + "learning_rate": 6.019771652340537e-06, + "loss": 1.02470112, + "memory(GiB)": 302.58, + "step": 165540, + "train_speed(iter/s)": 0.123902 + }, + { + "acc": 0.75174017, + "epoch": 0.9258899373223516, + "grad_norm": 5.15625, + "learning_rate": 6.018866366864342e-06, + "loss": 0.98061609, + "memory(GiB)": 302.58, + "step": 165560, + "train_speed(iter/s)": 0.123909 + }, + { + "acc": 0.72547169, + "epoch": 0.9260017867953309, + "grad_norm": 5.59375, + "learning_rate": 6.017961046539686e-06, + "loss": 1.07896185, + "memory(GiB)": 302.58, + "step": 165580, + "train_speed(iter/s)": 0.123916 + }, + { + "acc": 0.74804292, + "epoch": 0.9261136362683101, + "grad_norm": 6.40625, + "learning_rate": 6.017055691397535e-06, + "loss": 0.98207006, + "memory(GiB)": 302.58, + "step": 165600, + "train_speed(iter/s)": 0.123924 + }, + { + "acc": 0.76149206, + "epoch": 0.9262254857412894, + "grad_norm": 12.8125, + "learning_rate": 6.016150301468854e-06, + "loss": 0.93706417, + "memory(GiB)": 302.58, + "step": 165620, + "train_speed(iter/s)": 0.123931 + }, + { + "acc": 0.74244595, + "epoch": 0.9263373352142686, + "grad_norm": 6.34375, + "learning_rate": 6.01524487678461e-06, + "loss": 1.00738688, + "memory(GiB)": 302.58, + "step": 165640, + "train_speed(iter/s)": 0.123938 + }, + { + "acc": 0.73223386, + "epoch": 0.9264491846872479, + "grad_norm": 6.5625, + "learning_rate": 6.014339417375772e-06, + "loss": 1.05650797, + "memory(GiB)": 302.58, + "step": 165660, + "train_speed(iter/s)": 0.123945 + }, + { + "acc": 0.74190755, + "epoch": 0.9265610341602272, + "grad_norm": 8.6875, + "learning_rate": 6.01343392327331e-06, + "loss": 1.00102701, + "memory(GiB)": 302.58, + "step": 165680, + "train_speed(iter/s)": 0.123952 + }, + { + "acc": 0.74979596, + "epoch": 0.9266728836332064, + "grad_norm": 5.09375, + "learning_rate": 6.012528394508193e-06, + "loss": 0.9897995, + "memory(GiB)": 302.58, + "step": 165700, + "train_speed(iter/s)": 0.123959 + }, + { + "acc": 0.72646322, + "epoch": 0.9267847331061857, + "grad_norm": 7.96875, + "learning_rate": 6.011622831111396e-06, + "loss": 1.10987101, + "memory(GiB)": 302.58, + "step": 165720, + "train_speed(iter/s)": 0.123966 + }, + { + "acc": 0.74378767, + "epoch": 0.926896582579165, + "grad_norm": 7.375, + "learning_rate": 6.01071723311389e-06, + "loss": 1.02485437, + "memory(GiB)": 302.58, + "step": 165740, + "train_speed(iter/s)": 0.123973 + }, + { + "acc": 0.73078518, + "epoch": 0.9270084320521442, + "grad_norm": 8.8125, + "learning_rate": 6.00981160054665e-06, + "loss": 1.06844673, + "memory(GiB)": 302.58, + "step": 165760, + "train_speed(iter/s)": 0.12398 + }, + { + "acc": 0.74112668, + "epoch": 0.9271202815251235, + "grad_norm": 10.6875, + "learning_rate": 6.0089059334406515e-06, + "loss": 1.03919744, + "memory(GiB)": 302.58, + "step": 165780, + "train_speed(iter/s)": 0.123987 + }, + { + "acc": 0.72477674, + "epoch": 0.9272321309981028, + "grad_norm": 7.3125, + "learning_rate": 6.008000231826869e-06, + "loss": 1.091294, + "memory(GiB)": 302.58, + "step": 165800, + "train_speed(iter/s)": 0.123994 + }, + { + "acc": 0.73343635, + "epoch": 0.927343980471082, + "grad_norm": 6.125, + "learning_rate": 6.007094495736284e-06, + "loss": 1.0951314, + "memory(GiB)": 302.58, + "step": 165820, + "train_speed(iter/s)": 0.124001 + }, + { + "acc": 0.74199028, + "epoch": 0.9274558299440613, + "grad_norm": 6.40625, + "learning_rate": 6.006188725199873e-06, + "loss": 0.99856186, + "memory(GiB)": 302.58, + "step": 165840, + "train_speed(iter/s)": 0.124008 + }, + { + "acc": 0.7415585, + "epoch": 0.9275676794170405, + "grad_norm": 7.8125, + "learning_rate": 6.005282920248618e-06, + "loss": 1.01427422, + "memory(GiB)": 302.58, + "step": 165860, + "train_speed(iter/s)": 0.124015 + }, + { + "acc": 0.75291057, + "epoch": 0.9276795288900198, + "grad_norm": 5.5, + "learning_rate": 6.004377080913498e-06, + "loss": 0.95886497, + "memory(GiB)": 302.58, + "step": 165880, + "train_speed(iter/s)": 0.124022 + }, + { + "acc": 0.72803106, + "epoch": 0.9277913783629991, + "grad_norm": 8.75, + "learning_rate": 6.003471207225498e-06, + "loss": 1.07236757, + "memory(GiB)": 302.58, + "step": 165900, + "train_speed(iter/s)": 0.124029 + }, + { + "acc": 0.75420151, + "epoch": 0.9279032278359783, + "grad_norm": 9.125, + "learning_rate": 6.002565299215602e-06, + "loss": 0.97712202, + "memory(GiB)": 302.58, + "step": 165920, + "train_speed(iter/s)": 0.124037 + }, + { + "acc": 0.74568095, + "epoch": 0.9280150773089576, + "grad_norm": 8.625, + "learning_rate": 6.001659356914792e-06, + "loss": 0.9872838, + "memory(GiB)": 302.58, + "step": 165940, + "train_speed(iter/s)": 0.124044 + }, + { + "acc": 0.73788028, + "epoch": 0.9281269267819369, + "grad_norm": 9.3125, + "learning_rate": 6.000753380354055e-06, + "loss": 1.02428484, + "memory(GiB)": 302.58, + "step": 165960, + "train_speed(iter/s)": 0.124051 + }, + { + "acc": 0.74130116, + "epoch": 0.9282387762549161, + "grad_norm": 9.375, + "learning_rate": 5.99984736956438e-06, + "loss": 1.02186337, + "memory(GiB)": 302.58, + "step": 165980, + "train_speed(iter/s)": 0.124058 + }, + { + "acc": 0.75504851, + "epoch": 0.9283506257278954, + "grad_norm": 8.875, + "learning_rate": 5.998941324576756e-06, + "loss": 0.94458714, + "memory(GiB)": 302.58, + "step": 166000, + "train_speed(iter/s)": 0.124065 + }, + { + "epoch": 0.9283506257278954, + "eval_acc": 0.7060209813037994, + "eval_loss": 1.0142680406570435, + "eval_runtime": 7496.881, + "eval_samples_per_second": 10.042, + "eval_steps_per_second": 10.042, + "step": 166000 + }, + { + "acc": 0.74963322, + "epoch": 0.9284624752008747, + "grad_norm": 5.28125, + "learning_rate": 5.998035245422169e-06, + "loss": 0.97644176, + "memory(GiB)": 302.58, + "step": 166020, + "train_speed(iter/s)": 0.123369 + }, + { + "acc": 0.76077032, + "epoch": 0.9285743246738539, + "grad_norm": 6.8125, + "learning_rate": 5.997129132131614e-06, + "loss": 0.92846985, + "memory(GiB)": 302.58, + "step": 166040, + "train_speed(iter/s)": 0.123375 + }, + { + "acc": 0.73830495, + "epoch": 0.9286861741468332, + "grad_norm": 8.6875, + "learning_rate": 5.99622298473608e-06, + "loss": 1.0278882, + "memory(GiB)": 302.58, + "step": 166060, + "train_speed(iter/s)": 0.123382 + }, + { + "acc": 0.74575505, + "epoch": 0.9287980236198125, + "grad_norm": 5.6875, + "learning_rate": 5.995316803266561e-06, + "loss": 1.01568832, + "memory(GiB)": 302.58, + "step": 166080, + "train_speed(iter/s)": 0.123389 + }, + { + "acc": 0.73546638, + "epoch": 0.9289098730927917, + "grad_norm": 7.75, + "learning_rate": 5.994410587754049e-06, + "loss": 1.02314949, + "memory(GiB)": 302.58, + "step": 166100, + "train_speed(iter/s)": 0.123396 + }, + { + "acc": 0.75585155, + "epoch": 0.929021722565771, + "grad_norm": 6.6875, + "learning_rate": 5.993504338229544e-06, + "loss": 0.97237072, + "memory(GiB)": 302.58, + "step": 166120, + "train_speed(iter/s)": 0.123403 + }, + { + "acc": 0.72578344, + "epoch": 0.9291335720387502, + "grad_norm": 6.46875, + "learning_rate": 5.992598054724041e-06, + "loss": 1.09823456, + "memory(GiB)": 302.58, + "step": 166140, + "train_speed(iter/s)": 0.12341 + }, + { + "acc": 0.73870296, + "epoch": 0.9292454215117295, + "grad_norm": 6.28125, + "learning_rate": 5.991691737268537e-06, + "loss": 1.05604954, + "memory(GiB)": 302.58, + "step": 166160, + "train_speed(iter/s)": 0.123417 + }, + { + "acc": 0.73126845, + "epoch": 0.9293572709847088, + "grad_norm": 4.8125, + "learning_rate": 5.990785385894029e-06, + "loss": 1.05964937, + "memory(GiB)": 302.58, + "step": 166180, + "train_speed(iter/s)": 0.123423 + }, + { + "acc": 0.75554953, + "epoch": 0.929469120457688, + "grad_norm": 7.8125, + "learning_rate": 5.989879000631521e-06, + "loss": 0.93716135, + "memory(GiB)": 302.58, + "step": 166200, + "train_speed(iter/s)": 0.12343 + }, + { + "acc": 0.73895035, + "epoch": 0.9295809699306673, + "grad_norm": 8.3125, + "learning_rate": 5.988972581512011e-06, + "loss": 1.04036312, + "memory(GiB)": 302.58, + "step": 166220, + "train_speed(iter/s)": 0.123437 + }, + { + "acc": 0.75809431, + "epoch": 0.9296928194036466, + "grad_norm": 7.65625, + "learning_rate": 5.9880661285665035e-06, + "loss": 0.95506096, + "memory(GiB)": 302.58, + "step": 166240, + "train_speed(iter/s)": 0.123444 + }, + { + "acc": 0.75080962, + "epoch": 0.9298046688766258, + "grad_norm": 5.34375, + "learning_rate": 5.987159641826002e-06, + "loss": 0.98305407, + "memory(GiB)": 302.58, + "step": 166260, + "train_speed(iter/s)": 0.123451 + }, + { + "acc": 0.73210444, + "epoch": 0.9299165183496051, + "grad_norm": 7.34375, + "learning_rate": 5.986253121321509e-06, + "loss": 1.05425081, + "memory(GiB)": 302.58, + "step": 166280, + "train_speed(iter/s)": 0.123458 + }, + { + "acc": 0.74486237, + "epoch": 0.9300283678225844, + "grad_norm": 7.53125, + "learning_rate": 5.985346567084033e-06, + "loss": 1.00294333, + "memory(GiB)": 302.58, + "step": 166300, + "train_speed(iter/s)": 0.123466 + }, + { + "acc": 0.72779336, + "epoch": 0.9301402172955636, + "grad_norm": 5.09375, + "learning_rate": 5.984439979144579e-06, + "loss": 1.08000956, + "memory(GiB)": 302.58, + "step": 166320, + "train_speed(iter/s)": 0.123473 + }, + { + "acc": 0.73282623, + "epoch": 0.9302520667685429, + "grad_norm": 7.03125, + "learning_rate": 5.983533357534158e-06, + "loss": 1.0617506, + "memory(GiB)": 302.58, + "step": 166340, + "train_speed(iter/s)": 0.123479 + }, + { + "acc": 0.75779982, + "epoch": 0.9303639162415221, + "grad_norm": 8.25, + "learning_rate": 5.982626702283776e-06, + "loss": 0.96293268, + "memory(GiB)": 302.58, + "step": 166360, + "train_speed(iter/s)": 0.123486 + }, + { + "acc": 0.73828831, + "epoch": 0.9304757657145014, + "grad_norm": 6.5, + "learning_rate": 5.981720013424444e-06, + "loss": 1.04985809, + "memory(GiB)": 302.58, + "step": 166380, + "train_speed(iter/s)": 0.123493 + }, + { + "acc": 0.7342803, + "epoch": 0.9305876151874807, + "grad_norm": 6.5625, + "learning_rate": 5.980813290987176e-06, + "loss": 1.02789001, + "memory(GiB)": 302.58, + "step": 166400, + "train_speed(iter/s)": 0.1235 + }, + { + "acc": 0.74359517, + "epoch": 0.9306994646604599, + "grad_norm": 5.65625, + "learning_rate": 5.979906535002984e-06, + "loss": 1.03789978, + "memory(GiB)": 302.58, + "step": 166420, + "train_speed(iter/s)": 0.123507 + }, + { + "acc": 0.74786692, + "epoch": 0.9308113141334392, + "grad_norm": 6.3125, + "learning_rate": 5.97899974550288e-06, + "loss": 0.97809296, + "memory(GiB)": 302.58, + "step": 166440, + "train_speed(iter/s)": 0.123514 + }, + { + "acc": 0.74446573, + "epoch": 0.9309231636064185, + "grad_norm": 6.8125, + "learning_rate": 5.978092922517882e-06, + "loss": 1.02317657, + "memory(GiB)": 302.58, + "step": 166460, + "train_speed(iter/s)": 0.123521 + }, + { + "acc": 0.75363464, + "epoch": 0.9310350130793977, + "grad_norm": 7.46875, + "learning_rate": 5.977186066079004e-06, + "loss": 0.96422482, + "memory(GiB)": 302.58, + "step": 166480, + "train_speed(iter/s)": 0.123529 + }, + { + "acc": 0.75201383, + "epoch": 0.931146862552377, + "grad_norm": 5.875, + "learning_rate": 5.976279176217262e-06, + "loss": 0.98677111, + "memory(GiB)": 302.58, + "step": 166500, + "train_speed(iter/s)": 0.123535 + }, + { + "acc": 0.74491501, + "epoch": 0.9312587120253563, + "grad_norm": 5.40625, + "learning_rate": 5.975372252963678e-06, + "loss": 1.00258608, + "memory(GiB)": 302.58, + "step": 166520, + "train_speed(iter/s)": 0.123542 + }, + { + "acc": 0.74807963, + "epoch": 0.9313705614983355, + "grad_norm": 9.4375, + "learning_rate": 5.9744652963492686e-06, + "loss": 0.98642502, + "memory(GiB)": 302.58, + "step": 166540, + "train_speed(iter/s)": 0.123549 + }, + { + "acc": 0.73315611, + "epoch": 0.9314824109713148, + "grad_norm": 7.90625, + "learning_rate": 5.973558306405059e-06, + "loss": 1.0363018, + "memory(GiB)": 302.58, + "step": 166560, + "train_speed(iter/s)": 0.123556 + }, + { + "acc": 0.73639379, + "epoch": 0.931594260444294, + "grad_norm": 8.9375, + "learning_rate": 5.972651283162066e-06, + "loss": 1.04279318, + "memory(GiB)": 302.58, + "step": 166580, + "train_speed(iter/s)": 0.123563 + }, + { + "acc": 0.73935351, + "epoch": 0.9317061099172733, + "grad_norm": 10.625, + "learning_rate": 5.971744226651316e-06, + "loss": 1.01843939, + "memory(GiB)": 302.58, + "step": 166600, + "train_speed(iter/s)": 0.12357 + }, + { + "acc": 0.74882226, + "epoch": 0.9318179593902526, + "grad_norm": 6.84375, + "learning_rate": 5.970837136903832e-06, + "loss": 0.99071589, + "memory(GiB)": 302.58, + "step": 166620, + "train_speed(iter/s)": 0.123578 + }, + { + "acc": 0.74993901, + "epoch": 0.9319298088632318, + "grad_norm": 8.3125, + "learning_rate": 5.96993001395064e-06, + "loss": 0.99012136, + "memory(GiB)": 302.58, + "step": 166640, + "train_speed(iter/s)": 0.123585 + }, + { + "acc": 0.71667137, + "epoch": 0.9320416583362111, + "grad_norm": 7.625, + "learning_rate": 5.969022857822765e-06, + "loss": 1.11375179, + "memory(GiB)": 302.58, + "step": 166660, + "train_speed(iter/s)": 0.123592 + }, + { + "acc": 0.7582026, + "epoch": 0.9321535078091904, + "grad_norm": 4.34375, + "learning_rate": 5.968115668551236e-06, + "loss": 0.95663376, + "memory(GiB)": 302.58, + "step": 166680, + "train_speed(iter/s)": 0.123599 + }, + { + "acc": 0.72696285, + "epoch": 0.9322653572821696, + "grad_norm": 6.6875, + "learning_rate": 5.967208446167081e-06, + "loss": 1.08969603, + "memory(GiB)": 302.58, + "step": 166700, + "train_speed(iter/s)": 0.123606 + }, + { + "acc": 0.74707866, + "epoch": 0.9323772067551489, + "grad_norm": 11.0, + "learning_rate": 5.96630119070133e-06, + "loss": 1.00186472, + "memory(GiB)": 302.58, + "step": 166720, + "train_speed(iter/s)": 0.123613 + }, + { + "acc": 0.74468336, + "epoch": 0.9324890562281282, + "grad_norm": 7.75, + "learning_rate": 5.965393902185015e-06, + "loss": 1.01062288, + "memory(GiB)": 302.58, + "step": 166740, + "train_speed(iter/s)": 0.12362 + }, + { + "acc": 0.73554864, + "epoch": 0.9326009057011074, + "grad_norm": 7.75, + "learning_rate": 5.964486580649167e-06, + "loss": 1.04634304, + "memory(GiB)": 302.58, + "step": 166760, + "train_speed(iter/s)": 0.123627 + }, + { + "acc": 0.74611464, + "epoch": 0.9327127551740867, + "grad_norm": 7.96875, + "learning_rate": 5.96357922612482e-06, + "loss": 1.0012166, + "memory(GiB)": 302.58, + "step": 166780, + "train_speed(iter/s)": 0.123634 + }, + { + "acc": 0.76055212, + "epoch": 0.932824604647066, + "grad_norm": 6.25, + "learning_rate": 5.9626718386430075e-06, + "loss": 0.94050722, + "memory(GiB)": 302.58, + "step": 166800, + "train_speed(iter/s)": 0.12364 + }, + { + "acc": 0.7481976, + "epoch": 0.9329364541200452, + "grad_norm": 6.375, + "learning_rate": 5.961764418234766e-06, + "loss": 1.00682735, + "memory(GiB)": 302.58, + "step": 166820, + "train_speed(iter/s)": 0.123647 + }, + { + "acc": 0.73936124, + "epoch": 0.9330483035930245, + "grad_norm": 7.9375, + "learning_rate": 5.960856964931131e-06, + "loss": 1.03796177, + "memory(GiB)": 302.58, + "step": 166840, + "train_speed(iter/s)": 0.123654 + }, + { + "acc": 0.75263362, + "epoch": 0.9331601530660038, + "grad_norm": 6.84375, + "learning_rate": 5.9599494787631415e-06, + "loss": 0.98274813, + "memory(GiB)": 302.58, + "step": 166860, + "train_speed(iter/s)": 0.123661 + }, + { + "acc": 0.73278775, + "epoch": 0.933272002538983, + "grad_norm": 4.84375, + "learning_rate": 5.959041959761836e-06, + "loss": 1.02923822, + "memory(GiB)": 302.58, + "step": 166880, + "train_speed(iter/s)": 0.123668 + }, + { + "acc": 0.73825078, + "epoch": 0.9333838520119623, + "grad_norm": 6.6875, + "learning_rate": 5.958134407958255e-06, + "loss": 1.0326561, + "memory(GiB)": 302.58, + "step": 166900, + "train_speed(iter/s)": 0.123675 + }, + { + "acc": 0.74985414, + "epoch": 0.9334957014849415, + "grad_norm": 6.84375, + "learning_rate": 5.957226823383438e-06, + "loss": 0.97006741, + "memory(GiB)": 302.58, + "step": 166920, + "train_speed(iter/s)": 0.123682 + }, + { + "acc": 0.75160332, + "epoch": 0.9336075509579208, + "grad_norm": 8.625, + "learning_rate": 5.956319206068428e-06, + "loss": 0.97844191, + "memory(GiB)": 302.58, + "step": 166940, + "train_speed(iter/s)": 0.123689 + }, + { + "acc": 0.74381351, + "epoch": 0.9337194004309001, + "grad_norm": 8.375, + "learning_rate": 5.955411556044269e-06, + "loss": 1.01811008, + "memory(GiB)": 302.58, + "step": 166960, + "train_speed(iter/s)": 0.123696 + }, + { + "acc": 0.72990656, + "epoch": 0.9338312499038793, + "grad_norm": 5.53125, + "learning_rate": 5.954503873342007e-06, + "loss": 1.06684961, + "memory(GiB)": 302.58, + "step": 166980, + "train_speed(iter/s)": 0.123703 + }, + { + "acc": 0.75089388, + "epoch": 0.9339430993768586, + "grad_norm": 7.78125, + "learning_rate": 5.953596157992685e-06, + "loss": 0.98436785, + "memory(GiB)": 302.58, + "step": 167000, + "train_speed(iter/s)": 0.12371 + }, + { + "acc": 0.73693728, + "epoch": 0.9340549488498379, + "grad_norm": 7.4375, + "learning_rate": 5.95268841002735e-06, + "loss": 1.03754158, + "memory(GiB)": 302.58, + "step": 167020, + "train_speed(iter/s)": 0.123717 + }, + { + "acc": 0.74750395, + "epoch": 0.9341667983228171, + "grad_norm": 8.8125, + "learning_rate": 5.951780629477051e-06, + "loss": 1.0137167, + "memory(GiB)": 302.58, + "step": 167040, + "train_speed(iter/s)": 0.123724 + }, + { + "acc": 0.72496829, + "epoch": 0.9342786477957964, + "grad_norm": 6.125, + "learning_rate": 5.950872816372836e-06, + "loss": 1.10062408, + "memory(GiB)": 302.58, + "step": 167060, + "train_speed(iter/s)": 0.123731 + }, + { + "acc": 0.73909912, + "epoch": 0.9343904972687757, + "grad_norm": 5.1875, + "learning_rate": 5.949964970745755e-06, + "loss": 1.04037247, + "memory(GiB)": 302.58, + "step": 167080, + "train_speed(iter/s)": 0.123738 + }, + { + "acc": 0.75044155, + "epoch": 0.9345023467417549, + "grad_norm": 4.75, + "learning_rate": 5.949057092626861e-06, + "loss": 0.98931074, + "memory(GiB)": 302.58, + "step": 167100, + "train_speed(iter/s)": 0.123745 + }, + { + "acc": 0.72921238, + "epoch": 0.9346141962147342, + "grad_norm": 7.5, + "learning_rate": 5.948149182047205e-06, + "loss": 1.06873236, + "memory(GiB)": 302.58, + "step": 167120, + "train_speed(iter/s)": 0.123751 + }, + { + "acc": 0.73768511, + "epoch": 0.9347260456877134, + "grad_norm": 7.15625, + "learning_rate": 5.9472412390378384e-06, + "loss": 1.02109947, + "memory(GiB)": 302.58, + "step": 167140, + "train_speed(iter/s)": 0.123759 + }, + { + "acc": 0.74857802, + "epoch": 0.9348378951606927, + "grad_norm": 7.3125, + "learning_rate": 5.94633326362982e-06, + "loss": 0.98401289, + "memory(GiB)": 302.58, + "step": 167160, + "train_speed(iter/s)": 0.123766 + }, + { + "acc": 0.76179824, + "epoch": 0.934949744633672, + "grad_norm": 5.125, + "learning_rate": 5.945425255854203e-06, + "loss": 0.92694206, + "memory(GiB)": 302.58, + "step": 167180, + "train_speed(iter/s)": 0.123773 + }, + { + "acc": 0.73362446, + "epoch": 0.9350615941066512, + "grad_norm": 5.84375, + "learning_rate": 5.944517215742044e-06, + "loss": 1.05214643, + "memory(GiB)": 302.58, + "step": 167200, + "train_speed(iter/s)": 0.12378 + }, + { + "acc": 0.72679205, + "epoch": 0.9351734435796305, + "grad_norm": 6.21875, + "learning_rate": 5.943609143324402e-06, + "loss": 1.07143869, + "memory(GiB)": 302.58, + "step": 167220, + "train_speed(iter/s)": 0.123787 + }, + { + "acc": 0.73142376, + "epoch": 0.9352852930526098, + "grad_norm": 6.65625, + "learning_rate": 5.942701038632333e-06, + "loss": 1.0605032, + "memory(GiB)": 302.58, + "step": 167240, + "train_speed(iter/s)": 0.123794 + }, + { + "acc": 0.75614243, + "epoch": 0.935397142525589, + "grad_norm": 7.21875, + "learning_rate": 5.9417929016969014e-06, + "loss": 0.95602798, + "memory(GiB)": 302.58, + "step": 167260, + "train_speed(iter/s)": 0.123801 + }, + { + "acc": 0.74598088, + "epoch": 0.9355089919985683, + "grad_norm": 5.6875, + "learning_rate": 5.940884732549165e-06, + "loss": 0.99506321, + "memory(GiB)": 302.58, + "step": 167280, + "train_speed(iter/s)": 0.123808 + }, + { + "acc": 0.74872003, + "epoch": 0.9356208414715476, + "grad_norm": 6.375, + "learning_rate": 5.939976531220188e-06, + "loss": 1.02183895, + "memory(GiB)": 302.58, + "step": 167300, + "train_speed(iter/s)": 0.123816 + }, + { + "acc": 0.72712049, + "epoch": 0.9357326909445268, + "grad_norm": 4.71875, + "learning_rate": 5.939068297741033e-06, + "loss": 1.0765193, + "memory(GiB)": 302.58, + "step": 167320, + "train_speed(iter/s)": 0.123823 + }, + { + "acc": 0.73944817, + "epoch": 0.9358445404175061, + "grad_norm": 6.8125, + "learning_rate": 5.938160032142765e-06, + "loss": 1.00730867, + "memory(GiB)": 302.58, + "step": 167340, + "train_speed(iter/s)": 0.12383 + }, + { + "acc": 0.74328947, + "epoch": 0.9359563898904854, + "grad_norm": 7.75, + "learning_rate": 5.937251734456449e-06, + "loss": 1.0335783, + "memory(GiB)": 302.58, + "step": 167360, + "train_speed(iter/s)": 0.123837 + }, + { + "acc": 0.75578189, + "epoch": 0.9360682393634646, + "grad_norm": 8.1875, + "learning_rate": 5.93634340471315e-06, + "loss": 0.93356972, + "memory(GiB)": 302.58, + "step": 167380, + "train_speed(iter/s)": 0.123844 + }, + { + "acc": 0.74329839, + "epoch": 0.9361800888364439, + "grad_norm": 8.0, + "learning_rate": 5.935435042943939e-06, + "loss": 1.01666183, + "memory(GiB)": 302.58, + "step": 167400, + "train_speed(iter/s)": 0.123851 + }, + { + "acc": 0.7507062, + "epoch": 0.9362919383094231, + "grad_norm": 7.90625, + "learning_rate": 5.934526649179883e-06, + "loss": 0.9765708, + "memory(GiB)": 302.58, + "step": 167420, + "train_speed(iter/s)": 0.123859 + }, + { + "acc": 0.74438667, + "epoch": 0.9364037877824024, + "grad_norm": 4.0, + "learning_rate": 5.933618223452052e-06, + "loss": 0.98817301, + "memory(GiB)": 302.58, + "step": 167440, + "train_speed(iter/s)": 0.123866 + }, + { + "acc": 0.7417953, + "epoch": 0.9365156372553817, + "grad_norm": 6.125, + "learning_rate": 5.932709765791518e-06, + "loss": 1.01337233, + "memory(GiB)": 302.58, + "step": 167460, + "train_speed(iter/s)": 0.123872 + }, + { + "acc": 0.7249126, + "epoch": 0.9366274867283609, + "grad_norm": 7.59375, + "learning_rate": 5.931801276229353e-06, + "loss": 1.11081142, + "memory(GiB)": 302.58, + "step": 167480, + "train_speed(iter/s)": 0.123879 + }, + { + "acc": 0.73908782, + "epoch": 0.9367393362013402, + "grad_norm": 4.59375, + "learning_rate": 5.9308927547966286e-06, + "loss": 1.02657337, + "memory(GiB)": 302.58, + "step": 167500, + "train_speed(iter/s)": 0.123886 + }, + { + "acc": 0.73051443, + "epoch": 0.9368511856743195, + "grad_norm": 6.09375, + "learning_rate": 5.929984201524419e-06, + "loss": 1.07659225, + "memory(GiB)": 302.58, + "step": 167520, + "train_speed(iter/s)": 0.123893 + }, + { + "acc": 0.73229265, + "epoch": 0.9369630351472987, + "grad_norm": 5.40625, + "learning_rate": 5.929075616443803e-06, + "loss": 1.05512362, + "memory(GiB)": 302.58, + "step": 167540, + "train_speed(iter/s)": 0.1239 + }, + { + "acc": 0.73753672, + "epoch": 0.937074884620278, + "grad_norm": 7.3125, + "learning_rate": 5.928166999585855e-06, + "loss": 1.02952995, + "memory(GiB)": 302.58, + "step": 167560, + "train_speed(iter/s)": 0.123907 + }, + { + "acc": 0.73408823, + "epoch": 0.9371867340932573, + "grad_norm": 8.625, + "learning_rate": 5.927258350981651e-06, + "loss": 1.05473652, + "memory(GiB)": 302.58, + "step": 167580, + "train_speed(iter/s)": 0.123914 + }, + { + "acc": 0.74511261, + "epoch": 0.9372985835662365, + "grad_norm": 7.21875, + "learning_rate": 5.9263496706622725e-06, + "loss": 1.00899477, + "memory(GiB)": 302.58, + "step": 167600, + "train_speed(iter/s)": 0.12392 + }, + { + "acc": 0.74598918, + "epoch": 0.9374104330392158, + "grad_norm": 4.09375, + "learning_rate": 5.925440958658797e-06, + "loss": 0.99139652, + "memory(GiB)": 302.58, + "step": 167620, + "train_speed(iter/s)": 0.123927 + }, + { + "acc": 0.7465313, + "epoch": 0.937522282512195, + "grad_norm": 4.46875, + "learning_rate": 5.924532215002307e-06, + "loss": 1.00653038, + "memory(GiB)": 302.58, + "step": 167640, + "train_speed(iter/s)": 0.123934 + }, + { + "acc": 0.73802943, + "epoch": 0.9376341319851743, + "grad_norm": 5.25, + "learning_rate": 5.9236234397238815e-06, + "loss": 1.03021259, + "memory(GiB)": 302.58, + "step": 167660, + "train_speed(iter/s)": 0.123941 + }, + { + "acc": 0.73696713, + "epoch": 0.9377459814581536, + "grad_norm": 6.75, + "learning_rate": 5.922714632854607e-06, + "loss": 1.02983103, + "memory(GiB)": 302.58, + "step": 167680, + "train_speed(iter/s)": 0.123949 + }, + { + "acc": 0.73108354, + "epoch": 0.9378578309311328, + "grad_norm": 6.5, + "learning_rate": 5.9218057944255666e-06, + "loss": 1.0617671, + "memory(GiB)": 302.58, + "step": 167700, + "train_speed(iter/s)": 0.123956 + }, + { + "acc": 0.74645, + "epoch": 0.9379696804041121, + "grad_norm": 6.09375, + "learning_rate": 5.920896924467845e-06, + "loss": 0.99225864, + "memory(GiB)": 302.58, + "step": 167720, + "train_speed(iter/s)": 0.123963 + }, + { + "acc": 0.74433103, + "epoch": 0.9380815298770914, + "grad_norm": 5.78125, + "learning_rate": 5.919988023012529e-06, + "loss": 1.03798513, + "memory(GiB)": 302.58, + "step": 167740, + "train_speed(iter/s)": 0.12397 + }, + { + "acc": 0.76210608, + "epoch": 0.9381933793500706, + "grad_norm": 6.96875, + "learning_rate": 5.919079090090704e-06, + "loss": 0.94407101, + "memory(GiB)": 302.58, + "step": 167760, + "train_speed(iter/s)": 0.123977 + }, + { + "acc": 0.75264115, + "epoch": 0.9383052288230499, + "grad_norm": 6.125, + "learning_rate": 5.91817012573346e-06, + "loss": 0.95397911, + "memory(GiB)": 302.58, + "step": 167780, + "train_speed(iter/s)": 0.123984 + }, + { + "acc": 0.73356905, + "epoch": 0.9384170782960292, + "grad_norm": 7.03125, + "learning_rate": 5.917261129971887e-06, + "loss": 1.05525503, + "memory(GiB)": 302.58, + "step": 167800, + "train_speed(iter/s)": 0.123991 + }, + { + "acc": 0.76515484, + "epoch": 0.9385289277690084, + "grad_norm": 7.84375, + "learning_rate": 5.916352102837074e-06, + "loss": 0.91811647, + "memory(GiB)": 302.58, + "step": 167820, + "train_speed(iter/s)": 0.123998 + }, + { + "acc": 0.75202513, + "epoch": 0.9386407772419877, + "grad_norm": 3.875, + "learning_rate": 5.915443044360114e-06, + "loss": 0.96603441, + "memory(GiB)": 302.58, + "step": 167840, + "train_speed(iter/s)": 0.124005 + }, + { + "acc": 0.74984374, + "epoch": 0.938752626714967, + "grad_norm": 7.21875, + "learning_rate": 5.9145339545721e-06, + "loss": 0.97734404, + "memory(GiB)": 302.58, + "step": 167860, + "train_speed(iter/s)": 0.124012 + }, + { + "acc": 0.74101124, + "epoch": 0.9388644761879462, + "grad_norm": 7.5625, + "learning_rate": 5.913624833504124e-06, + "loss": 1.03423882, + "memory(GiB)": 302.58, + "step": 167880, + "train_speed(iter/s)": 0.124018 + }, + { + "acc": 0.74254718, + "epoch": 0.9389763256609255, + "grad_norm": 9.6875, + "learning_rate": 5.912715681187281e-06, + "loss": 1.01093798, + "memory(GiB)": 302.58, + "step": 167900, + "train_speed(iter/s)": 0.124026 + }, + { + "acc": 0.75503573, + "epoch": 0.9390881751339047, + "grad_norm": 6.34375, + "learning_rate": 5.9118064976526694e-06, + "loss": 0.96189146, + "memory(GiB)": 302.58, + "step": 167920, + "train_speed(iter/s)": 0.124033 + }, + { + "acc": 0.73494534, + "epoch": 0.939200024606884, + "grad_norm": 10.3125, + "learning_rate": 5.910897282931383e-06, + "loss": 1.06447659, + "memory(GiB)": 302.58, + "step": 167940, + "train_speed(iter/s)": 0.12404 + }, + { + "acc": 0.74118471, + "epoch": 0.9393118740798633, + "grad_norm": 7.03125, + "learning_rate": 5.90998803705452e-06, + "loss": 1.01454277, + "memory(GiB)": 302.58, + "step": 167960, + "train_speed(iter/s)": 0.124046 + }, + { + "acc": 0.74009843, + "epoch": 0.9394237235528425, + "grad_norm": 9.875, + "learning_rate": 5.909078760053182e-06, + "loss": 1.01557913, + "memory(GiB)": 302.58, + "step": 167980, + "train_speed(iter/s)": 0.124053 + }, + { + "acc": 0.72639985, + "epoch": 0.9395355730258218, + "grad_norm": 7.09375, + "learning_rate": 5.908169451958468e-06, + "loss": 1.10626783, + "memory(GiB)": 302.58, + "step": 168000, + "train_speed(iter/s)": 0.12406 + }, + { + "epoch": 0.9395355730258218, + "eval_acc": 0.7061014336589978, + "eval_loss": 1.0140541791915894, + "eval_runtime": 7496.9003, + "eval_samples_per_second": 10.042, + "eval_steps_per_second": 10.042, + "step": 168000 + }, + { + "acc": 0.74148598, + "epoch": 0.9396474224988011, + "grad_norm": 5.9375, + "learning_rate": 5.907260112801478e-06, + "loss": 1.02038832, + "memory(GiB)": 302.58, + "step": 168020, + "train_speed(iter/s)": 0.123372 + }, + { + "acc": 0.75643601, + "epoch": 0.9397592719717803, + "grad_norm": 5.90625, + "learning_rate": 5.906350742613317e-06, + "loss": 0.95248365, + "memory(GiB)": 302.58, + "step": 168040, + "train_speed(iter/s)": 0.123379 + }, + { + "acc": 0.74669118, + "epoch": 0.9398711214447596, + "grad_norm": 5.625, + "learning_rate": 5.905441341425085e-06, + "loss": 0.98426971, + "memory(GiB)": 302.58, + "step": 168060, + "train_speed(iter/s)": 0.123386 + }, + { + "acc": 0.73292084, + "epoch": 0.9399829709177389, + "grad_norm": 8.4375, + "learning_rate": 5.90453190926789e-06, + "loss": 1.05705576, + "memory(GiB)": 302.58, + "step": 168080, + "train_speed(iter/s)": 0.123392 + }, + { + "acc": 0.7395659, + "epoch": 0.9400948203907181, + "grad_norm": 6.1875, + "learning_rate": 5.903622446172833e-06, + "loss": 1.02912645, + "memory(GiB)": 302.58, + "step": 168100, + "train_speed(iter/s)": 0.123399 + }, + { + "acc": 0.76097651, + "epoch": 0.9402066698636974, + "grad_norm": 7.40625, + "learning_rate": 5.902712952171023e-06, + "loss": 0.93616524, + "memory(GiB)": 302.58, + "step": 168120, + "train_speed(iter/s)": 0.123406 + }, + { + "acc": 0.74366722, + "epoch": 0.9403185193366767, + "grad_norm": 6.9375, + "learning_rate": 5.9018034272935696e-06, + "loss": 0.99644079, + "memory(GiB)": 302.58, + "step": 168140, + "train_speed(iter/s)": 0.123412 + }, + { + "acc": 0.74495196, + "epoch": 0.9404303688096559, + "grad_norm": 6.46875, + "learning_rate": 5.900893871571579e-06, + "loss": 1.02336445, + "memory(GiB)": 302.58, + "step": 168160, + "train_speed(iter/s)": 0.123419 + }, + { + "acc": 0.76011939, + "epoch": 0.9405422182826352, + "grad_norm": 7.03125, + "learning_rate": 5.8999842850361614e-06, + "loss": 0.91060534, + "memory(GiB)": 302.58, + "step": 168180, + "train_speed(iter/s)": 0.123426 + }, + { + "acc": 0.74697356, + "epoch": 0.9406540677556144, + "grad_norm": 4.5, + "learning_rate": 5.899074667718428e-06, + "loss": 0.99044514, + "memory(GiB)": 302.58, + "step": 168200, + "train_speed(iter/s)": 0.123433 + }, + { + "acc": 0.74480767, + "epoch": 0.9407659172285937, + "grad_norm": 9.75, + "learning_rate": 5.898165019649489e-06, + "loss": 0.99253531, + "memory(GiB)": 302.58, + "step": 168220, + "train_speed(iter/s)": 0.123439 + }, + { + "acc": 0.75151734, + "epoch": 0.940877766701573, + "grad_norm": 7.375, + "learning_rate": 5.897255340860457e-06, + "loss": 0.99741917, + "memory(GiB)": 302.58, + "step": 168240, + "train_speed(iter/s)": 0.123446 + }, + { + "acc": 0.73342876, + "epoch": 0.9409896161745522, + "grad_norm": 4.125, + "learning_rate": 5.89634563138245e-06, + "loss": 1.05974693, + "memory(GiB)": 302.58, + "step": 168260, + "train_speed(iter/s)": 0.123453 + }, + { + "acc": 0.75064325, + "epoch": 0.9411014656475315, + "grad_norm": 6.84375, + "learning_rate": 5.895435891246578e-06, + "loss": 0.97624846, + "memory(GiB)": 302.58, + "step": 168280, + "train_speed(iter/s)": 0.12346 + }, + { + "acc": 0.75788217, + "epoch": 0.9412133151205108, + "grad_norm": 5.6875, + "learning_rate": 5.894526120483961e-06, + "loss": 0.96613722, + "memory(GiB)": 302.58, + "step": 168300, + "train_speed(iter/s)": 0.123466 + }, + { + "acc": 0.73828306, + "epoch": 0.94132516459349, + "grad_norm": 6.40625, + "learning_rate": 5.893616319125713e-06, + "loss": 0.99820251, + "memory(GiB)": 302.58, + "step": 168320, + "train_speed(iter/s)": 0.123473 + }, + { + "acc": 0.73436046, + "epoch": 0.9414370140664693, + "grad_norm": 9.5, + "learning_rate": 5.892706487202954e-06, + "loss": 1.0485055, + "memory(GiB)": 302.58, + "step": 168340, + "train_speed(iter/s)": 0.12348 + }, + { + "acc": 0.72846327, + "epoch": 0.9415488635394486, + "grad_norm": 14.5, + "learning_rate": 5.891796624746801e-06, + "loss": 1.09558067, + "memory(GiB)": 302.58, + "step": 168360, + "train_speed(iter/s)": 0.123486 + }, + { + "acc": 0.75261564, + "epoch": 0.9416607130124278, + "grad_norm": 5.09375, + "learning_rate": 5.890886731788377e-06, + "loss": 0.96984091, + "memory(GiB)": 302.58, + "step": 168380, + "train_speed(iter/s)": 0.123493 + }, + { + "acc": 0.74242921, + "epoch": 0.9417725624854072, + "grad_norm": 5.375, + "learning_rate": 5.889976808358801e-06, + "loss": 1.00359983, + "memory(GiB)": 302.58, + "step": 168400, + "train_speed(iter/s)": 0.1235 + }, + { + "acc": 0.739323, + "epoch": 0.9418844119583865, + "grad_norm": 7.71875, + "learning_rate": 5.889066854489196e-06, + "loss": 1.03463917, + "memory(GiB)": 302.58, + "step": 168420, + "train_speed(iter/s)": 0.123508 + }, + { + "acc": 0.76361551, + "epoch": 0.9419962614313657, + "grad_norm": 8.375, + "learning_rate": 5.888156870210686e-06, + "loss": 0.92327232, + "memory(GiB)": 302.58, + "step": 168440, + "train_speed(iter/s)": 0.123514 + }, + { + "acc": 0.74318318, + "epoch": 0.942108110904345, + "grad_norm": 7.1875, + "learning_rate": 5.887246855554395e-06, + "loss": 1.01456432, + "memory(GiB)": 302.58, + "step": 168460, + "train_speed(iter/s)": 0.123521 + }, + { + "acc": 0.72700768, + "epoch": 0.9422199603773243, + "grad_norm": 9.3125, + "learning_rate": 5.886336810551448e-06, + "loss": 1.08658361, + "memory(GiB)": 302.58, + "step": 168480, + "train_speed(iter/s)": 0.123528 + }, + { + "acc": 0.74346566, + "epoch": 0.9423318098503035, + "grad_norm": 5.875, + "learning_rate": 5.885426735232972e-06, + "loss": 0.99319229, + "memory(GiB)": 302.58, + "step": 168500, + "train_speed(iter/s)": 0.123535 + }, + { + "acc": 0.75518975, + "epoch": 0.9424436593232828, + "grad_norm": 10.5, + "learning_rate": 5.884516629630094e-06, + "loss": 0.97088957, + "memory(GiB)": 302.58, + "step": 168520, + "train_speed(iter/s)": 0.123543 + }, + { + "acc": 0.7464592, + "epoch": 0.942555508796262, + "grad_norm": 6.15625, + "learning_rate": 5.883606493773941e-06, + "loss": 0.99901743, + "memory(GiB)": 302.58, + "step": 168540, + "train_speed(iter/s)": 0.123549 + }, + { + "acc": 0.7369678, + "epoch": 0.9426673582692413, + "grad_norm": 5.96875, + "learning_rate": 5.882696327695645e-06, + "loss": 1.03374872, + "memory(GiB)": 302.58, + "step": 168560, + "train_speed(iter/s)": 0.123556 + }, + { + "acc": 0.74416533, + "epoch": 0.9427792077422206, + "grad_norm": 7.59375, + "learning_rate": 5.881786131426336e-06, + "loss": 1.01655636, + "memory(GiB)": 302.58, + "step": 168580, + "train_speed(iter/s)": 0.123563 + }, + { + "acc": 0.73040318, + "epoch": 0.9428910572151998, + "grad_norm": 6.09375, + "learning_rate": 5.8808759049971455e-06, + "loss": 1.0691721, + "memory(GiB)": 302.58, + "step": 168600, + "train_speed(iter/s)": 0.12357 + }, + { + "acc": 0.7573544, + "epoch": 0.9430029066881791, + "grad_norm": 5.3125, + "learning_rate": 5.8799656484392056e-06, + "loss": 0.96560688, + "memory(GiB)": 302.58, + "step": 168620, + "train_speed(iter/s)": 0.123577 + }, + { + "acc": 0.74233494, + "epoch": 0.9431147561611584, + "grad_norm": 8.125, + "learning_rate": 5.87905536178365e-06, + "loss": 1.00949097, + "memory(GiB)": 302.58, + "step": 168640, + "train_speed(iter/s)": 0.123583 + }, + { + "acc": 0.72942867, + "epoch": 0.9432266056341376, + "grad_norm": 5.03125, + "learning_rate": 5.878145045061614e-06, + "loss": 1.0736762, + "memory(GiB)": 302.58, + "step": 168660, + "train_speed(iter/s)": 0.12359 + }, + { + "acc": 0.74183021, + "epoch": 0.9433384551071169, + "grad_norm": 6.78125, + "learning_rate": 5.877234698304232e-06, + "loss": 1.00610971, + "memory(GiB)": 302.58, + "step": 168680, + "train_speed(iter/s)": 0.123597 + }, + { + "acc": 0.73055034, + "epoch": 0.9434503045800962, + "grad_norm": 7.28125, + "learning_rate": 5.876324321542643e-06, + "loss": 1.04664087, + "memory(GiB)": 302.58, + "step": 168700, + "train_speed(iter/s)": 0.123604 + }, + { + "acc": 0.76087384, + "epoch": 0.9435621540530754, + "grad_norm": 7.1875, + "learning_rate": 5.875413914807981e-06, + "loss": 0.91062307, + "memory(GiB)": 302.58, + "step": 168720, + "train_speed(iter/s)": 0.123612 + }, + { + "acc": 0.73583155, + "epoch": 0.9436740035260547, + "grad_norm": 9.5625, + "learning_rate": 5.8745034781313904e-06, + "loss": 1.03862095, + "memory(GiB)": 302.58, + "step": 168740, + "train_speed(iter/s)": 0.12362 + }, + { + "acc": 0.74615183, + "epoch": 0.943785852999034, + "grad_norm": 6.5625, + "learning_rate": 5.873593011544006e-06, + "loss": 0.98839121, + "memory(GiB)": 302.58, + "step": 168760, + "train_speed(iter/s)": 0.123627 + }, + { + "acc": 0.74139085, + "epoch": 0.9438977024720132, + "grad_norm": 8.0625, + "learning_rate": 5.8726825150769715e-06, + "loss": 1.01345377, + "memory(GiB)": 302.58, + "step": 168780, + "train_speed(iter/s)": 0.123634 + }, + { + "acc": 0.72640061, + "epoch": 0.9440095519449925, + "grad_norm": 8.3125, + "learning_rate": 5.871771988761427e-06, + "loss": 1.10935736, + "memory(GiB)": 302.58, + "step": 168800, + "train_speed(iter/s)": 0.123641 + }, + { + "acc": 0.76204062, + "epoch": 0.9441214014179717, + "grad_norm": 6.8125, + "learning_rate": 5.870861432628516e-06, + "loss": 0.93158865, + "memory(GiB)": 302.58, + "step": 168820, + "train_speed(iter/s)": 0.123649 + }, + { + "acc": 0.75466933, + "epoch": 0.944233250890951, + "grad_norm": 8.0625, + "learning_rate": 5.869950846709384e-06, + "loss": 0.9679677, + "memory(GiB)": 302.58, + "step": 168840, + "train_speed(iter/s)": 0.123656 + }, + { + "acc": 0.74127245, + "epoch": 0.9443451003639303, + "grad_norm": 6.9375, + "learning_rate": 5.869040231035172e-06, + "loss": 1.02524815, + "memory(GiB)": 302.58, + "step": 168860, + "train_speed(iter/s)": 0.123663 + }, + { + "acc": 0.74207788, + "epoch": 0.9444569498369095, + "grad_norm": 9.375, + "learning_rate": 5.868129585637031e-06, + "loss": 0.98934174, + "memory(GiB)": 302.58, + "step": 168880, + "train_speed(iter/s)": 0.123671 + }, + { + "acc": 0.74738526, + "epoch": 0.9445687993098888, + "grad_norm": 11.3125, + "learning_rate": 5.867218910546104e-06, + "loss": 0.98161449, + "memory(GiB)": 302.58, + "step": 168900, + "train_speed(iter/s)": 0.123678 + }, + { + "acc": 0.74856644, + "epoch": 0.9446806487828681, + "grad_norm": 8.75, + "learning_rate": 5.866308205793542e-06, + "loss": 0.98566036, + "memory(GiB)": 302.58, + "step": 168920, + "train_speed(iter/s)": 0.123685 + }, + { + "acc": 0.72876763, + "epoch": 0.9447924982558473, + "grad_norm": 6.3125, + "learning_rate": 5.86539747141049e-06, + "loss": 1.06148996, + "memory(GiB)": 302.58, + "step": 168940, + "train_speed(iter/s)": 0.123691 + }, + { + "acc": 0.73578491, + "epoch": 0.9449043477288266, + "grad_norm": 8.0625, + "learning_rate": 5.864486707428101e-06, + "loss": 1.0634388, + "memory(GiB)": 302.58, + "step": 168960, + "train_speed(iter/s)": 0.123698 + }, + { + "acc": 0.73100085, + "epoch": 0.9450161972018059, + "grad_norm": 6.75, + "learning_rate": 5.863575913877526e-06, + "loss": 1.07797375, + "memory(GiB)": 302.58, + "step": 168980, + "train_speed(iter/s)": 0.123705 + }, + { + "acc": 0.75141001, + "epoch": 0.9451280466747851, + "grad_norm": 8.8125, + "learning_rate": 5.862665090789917e-06, + "loss": 0.97378044, + "memory(GiB)": 302.58, + "step": 169000, + "train_speed(iter/s)": 0.123712 + }, + { + "acc": 0.76404214, + "epoch": 0.9452398961477644, + "grad_norm": 7.375, + "learning_rate": 5.861754238196424e-06, + "loss": 0.91015825, + "memory(GiB)": 302.58, + "step": 169020, + "train_speed(iter/s)": 0.123719 + }, + { + "acc": 0.74129329, + "epoch": 0.9453517456207436, + "grad_norm": 6.40625, + "learning_rate": 5.860843356128206e-06, + "loss": 0.9924324, + "memory(GiB)": 302.58, + "step": 169040, + "train_speed(iter/s)": 0.123726 + }, + { + "acc": 0.73587732, + "epoch": 0.9454635950937229, + "grad_norm": 5.03125, + "learning_rate": 5.8599324446164155e-06, + "loss": 1.02833881, + "memory(GiB)": 302.58, + "step": 169060, + "train_speed(iter/s)": 0.123732 + }, + { + "acc": 0.7360796, + "epoch": 0.9455754445667022, + "grad_norm": 6.96875, + "learning_rate": 5.859021503692208e-06, + "loss": 1.05528126, + "memory(GiB)": 302.58, + "step": 169080, + "train_speed(iter/s)": 0.123739 + }, + { + "acc": 0.7162725, + "epoch": 0.9456872940396814, + "grad_norm": 6.6875, + "learning_rate": 5.858110533386741e-06, + "loss": 1.1260087, + "memory(GiB)": 302.58, + "step": 169100, + "train_speed(iter/s)": 0.123746 + }, + { + "acc": 0.73188519, + "epoch": 0.9457991435126607, + "grad_norm": 5.46875, + "learning_rate": 5.857199533731172e-06, + "loss": 1.05745745, + "memory(GiB)": 302.58, + "step": 169120, + "train_speed(iter/s)": 0.123752 + }, + { + "acc": 0.74424872, + "epoch": 0.94591099298564, + "grad_norm": 6.5, + "learning_rate": 5.856288504756662e-06, + "loss": 1.01161146, + "memory(GiB)": 302.58, + "step": 169140, + "train_speed(iter/s)": 0.123759 + }, + { + "acc": 0.75614357, + "epoch": 0.9460228424586192, + "grad_norm": 7.4375, + "learning_rate": 5.85537744649437e-06, + "loss": 0.96240339, + "memory(GiB)": 302.58, + "step": 169160, + "train_speed(iter/s)": 0.123766 + }, + { + "acc": 0.7325407, + "epoch": 0.9461346919315985, + "grad_norm": 6.59375, + "learning_rate": 5.854466358975457e-06, + "loss": 1.05333891, + "memory(GiB)": 302.58, + "step": 169180, + "train_speed(iter/s)": 0.123772 + }, + { + "acc": 0.74485617, + "epoch": 0.9462465414045778, + "grad_norm": 9.375, + "learning_rate": 5.853555242231085e-06, + "loss": 0.99235125, + "memory(GiB)": 302.58, + "step": 169200, + "train_speed(iter/s)": 0.123779 + }, + { + "acc": 0.73939166, + "epoch": 0.946358390877557, + "grad_norm": 5.65625, + "learning_rate": 5.852644096292417e-06, + "loss": 1.03174152, + "memory(GiB)": 302.58, + "step": 169220, + "train_speed(iter/s)": 0.123786 + }, + { + "acc": 0.73024192, + "epoch": 0.9464702403505363, + "grad_norm": 6.125, + "learning_rate": 5.851732921190616e-06, + "loss": 1.07375154, + "memory(GiB)": 302.58, + "step": 169240, + "train_speed(iter/s)": 0.123793 + }, + { + "acc": 0.74208212, + "epoch": 0.9465820898235155, + "grad_norm": 8.6875, + "learning_rate": 5.8508217169568495e-06, + "loss": 1.00742359, + "memory(GiB)": 302.58, + "step": 169260, + "train_speed(iter/s)": 0.123799 + }, + { + "acc": 0.74252849, + "epoch": 0.9466939392964948, + "grad_norm": 9.0625, + "learning_rate": 5.8499104836222835e-06, + "loss": 1.00999708, + "memory(GiB)": 302.58, + "step": 169280, + "train_speed(iter/s)": 0.123806 + }, + { + "acc": 0.74970946, + "epoch": 0.9468057887694741, + "grad_norm": 7.125, + "learning_rate": 5.848999221218082e-06, + "loss": 0.99282179, + "memory(GiB)": 302.58, + "step": 169300, + "train_speed(iter/s)": 0.123814 + }, + { + "acc": 0.74619017, + "epoch": 0.9469176382424533, + "grad_norm": 8.125, + "learning_rate": 5.848087929775417e-06, + "loss": 1.0089098, + "memory(GiB)": 302.58, + "step": 169320, + "train_speed(iter/s)": 0.123821 + }, + { + "acc": 0.73556728, + "epoch": 0.9470294877154326, + "grad_norm": 10.0625, + "learning_rate": 5.847176609325454e-06, + "loss": 1.03709393, + "memory(GiB)": 302.58, + "step": 169340, + "train_speed(iter/s)": 0.123828 + }, + { + "acc": 0.74388466, + "epoch": 0.9471413371884119, + "grad_norm": 6.875, + "learning_rate": 5.846265259899365e-06, + "loss": 1.00430832, + "memory(GiB)": 302.58, + "step": 169360, + "train_speed(iter/s)": 0.123834 + }, + { + "acc": 0.76036634, + "epoch": 0.9472531866613911, + "grad_norm": 7.9375, + "learning_rate": 5.84535388152832e-06, + "loss": 0.93687725, + "memory(GiB)": 302.58, + "step": 169380, + "train_speed(iter/s)": 0.123842 + }, + { + "acc": 0.74907966, + "epoch": 0.9473650361343704, + "grad_norm": 5.90625, + "learning_rate": 5.844442474243492e-06, + "loss": 0.98451738, + "memory(GiB)": 302.58, + "step": 169400, + "train_speed(iter/s)": 0.123849 + }, + { + "acc": 0.74498382, + "epoch": 0.9474768856073497, + "grad_norm": 9.8125, + "learning_rate": 5.843531038076056e-06, + "loss": 0.99975872, + "memory(GiB)": 302.58, + "step": 169420, + "train_speed(iter/s)": 0.123856 + }, + { + "acc": 0.74540572, + "epoch": 0.9475887350803289, + "grad_norm": 9.9375, + "learning_rate": 5.842619573057181e-06, + "loss": 1.00094471, + "memory(GiB)": 302.58, + "step": 169440, + "train_speed(iter/s)": 0.123863 + }, + { + "acc": 0.73913112, + "epoch": 0.9477005845533082, + "grad_norm": 5.0625, + "learning_rate": 5.8417080792180454e-06, + "loss": 1.0183197, + "memory(GiB)": 302.58, + "step": 169460, + "train_speed(iter/s)": 0.12387 + }, + { + "acc": 0.7466558, + "epoch": 0.9478124340262875, + "grad_norm": 6.78125, + "learning_rate": 5.840796556589826e-06, + "loss": 0.98441925, + "memory(GiB)": 302.58, + "step": 169480, + "train_speed(iter/s)": 0.123877 + }, + { + "acc": 0.74980211, + "epoch": 0.9479242834992667, + "grad_norm": 4.875, + "learning_rate": 5.839885005203697e-06, + "loss": 0.98491125, + "memory(GiB)": 302.58, + "step": 169500, + "train_speed(iter/s)": 0.123884 + }, + { + "acc": 0.72027116, + "epoch": 0.948036132972246, + "grad_norm": 7.1875, + "learning_rate": 5.838973425090837e-06, + "loss": 1.11546764, + "memory(GiB)": 302.58, + "step": 169520, + "train_speed(iter/s)": 0.123891 + }, + { + "acc": 0.74719872, + "epoch": 0.9481479824452252, + "grad_norm": 9.125, + "learning_rate": 5.838061816282427e-06, + "loss": 1.00429716, + "memory(GiB)": 302.58, + "step": 169540, + "train_speed(iter/s)": 0.123898 + }, + { + "acc": 0.76132712, + "epoch": 0.9482598319182045, + "grad_norm": 6.5625, + "learning_rate": 5.837150178809644e-06, + "loss": 0.93884268, + "memory(GiB)": 302.58, + "step": 169560, + "train_speed(iter/s)": 0.123905 + }, + { + "acc": 0.75429425, + "epoch": 0.9483716813911838, + "grad_norm": 6.03125, + "learning_rate": 5.836238512703671e-06, + "loss": 0.96278687, + "memory(GiB)": 302.58, + "step": 169580, + "train_speed(iter/s)": 0.123912 + }, + { + "acc": 0.75267868, + "epoch": 0.948483530864163, + "grad_norm": 5.0, + "learning_rate": 5.83532681799569e-06, + "loss": 0.96823788, + "memory(GiB)": 302.58, + "step": 169600, + "train_speed(iter/s)": 0.123919 + }, + { + "acc": 0.74373326, + "epoch": 0.9485953803371423, + "grad_norm": 6.03125, + "learning_rate": 5.8344150947168824e-06, + "loss": 1.01190729, + "memory(GiB)": 302.58, + "step": 169620, + "train_speed(iter/s)": 0.123925 + }, + { + "acc": 0.7556531, + "epoch": 0.9487072298101216, + "grad_norm": 4.25, + "learning_rate": 5.8335033428984315e-06, + "loss": 0.93270054, + "memory(GiB)": 302.58, + "step": 169640, + "train_speed(iter/s)": 0.123932 + }, + { + "acc": 0.74796782, + "epoch": 0.9488190792831008, + "grad_norm": 7.15625, + "learning_rate": 5.832591562571525e-06, + "loss": 0.98110304, + "memory(GiB)": 302.58, + "step": 169660, + "train_speed(iter/s)": 0.123939 + }, + { + "acc": 0.7320477, + "epoch": 0.9489309287560801, + "grad_norm": 6.6875, + "learning_rate": 5.831679753767345e-06, + "loss": 1.06605997, + "memory(GiB)": 302.58, + "step": 169680, + "train_speed(iter/s)": 0.123946 + }, + { + "acc": 0.74182014, + "epoch": 0.9490427782290594, + "grad_norm": 9.9375, + "learning_rate": 5.830767916517082e-06, + "loss": 0.99788904, + "memory(GiB)": 302.58, + "step": 169700, + "train_speed(iter/s)": 0.123952 + }, + { + "acc": 0.72733703, + "epoch": 0.9491546277020386, + "grad_norm": 5.625, + "learning_rate": 5.829856050851922e-06, + "loss": 1.08198519, + "memory(GiB)": 302.58, + "step": 169720, + "train_speed(iter/s)": 0.123959 + }, + { + "acc": 0.74482217, + "epoch": 0.9492664771750179, + "grad_norm": 6.84375, + "learning_rate": 5.828944156803053e-06, + "loss": 1.01504002, + "memory(GiB)": 302.58, + "step": 169740, + "train_speed(iter/s)": 0.123965 + }, + { + "acc": 0.75569229, + "epoch": 0.9493783266479972, + "grad_norm": 9.625, + "learning_rate": 5.828032234401666e-06, + "loss": 0.96853399, + "memory(GiB)": 302.58, + "step": 169760, + "train_speed(iter/s)": 0.123972 + }, + { + "acc": 0.75442901, + "epoch": 0.9494901761209764, + "grad_norm": 10.0625, + "learning_rate": 5.827120283678951e-06, + "loss": 0.94081602, + "memory(GiB)": 302.58, + "step": 169780, + "train_speed(iter/s)": 0.123979 + }, + { + "acc": 0.7562067, + "epoch": 0.9496020255939557, + "grad_norm": 6.78125, + "learning_rate": 5.826208304666099e-06, + "loss": 0.96628551, + "memory(GiB)": 302.58, + "step": 169800, + "train_speed(iter/s)": 0.123986 + }, + { + "acc": 0.75216384, + "epoch": 0.9497138750669349, + "grad_norm": 5.09375, + "learning_rate": 5.825296297394303e-06, + "loss": 0.98049135, + "memory(GiB)": 302.58, + "step": 169820, + "train_speed(iter/s)": 0.123993 + }, + { + "acc": 0.76938562, + "epoch": 0.9498257245399142, + "grad_norm": 6.78125, + "learning_rate": 5.8243842618947565e-06, + "loss": 0.91104498, + "memory(GiB)": 302.58, + "step": 169840, + "train_speed(iter/s)": 0.124 + }, + { + "acc": 0.76206045, + "epoch": 0.9499375740128935, + "grad_norm": 6.40625, + "learning_rate": 5.823472198198655e-06, + "loss": 0.92335758, + "memory(GiB)": 302.58, + "step": 169860, + "train_speed(iter/s)": 0.124006 + }, + { + "acc": 0.72531281, + "epoch": 0.9500494234858727, + "grad_norm": 8.125, + "learning_rate": 5.822560106337193e-06, + "loss": 1.07228804, + "memory(GiB)": 302.58, + "step": 169880, + "train_speed(iter/s)": 0.124013 + }, + { + "acc": 0.74059634, + "epoch": 0.950161272958852, + "grad_norm": 6.625, + "learning_rate": 5.821647986341567e-06, + "loss": 1.01670513, + "memory(GiB)": 302.58, + "step": 169900, + "train_speed(iter/s)": 0.12402 + }, + { + "acc": 0.74037981, + "epoch": 0.9502731224318313, + "grad_norm": 5.84375, + "learning_rate": 5.820735838242975e-06, + "loss": 1.02521362, + "memory(GiB)": 302.58, + "step": 169920, + "train_speed(iter/s)": 0.124027 + }, + { + "acc": 0.7571043, + "epoch": 0.9503849719048105, + "grad_norm": 7.09375, + "learning_rate": 5.819823662072615e-06, + "loss": 0.96241179, + "memory(GiB)": 302.58, + "step": 169940, + "train_speed(iter/s)": 0.124035 + }, + { + "acc": 0.73189225, + "epoch": 0.9504968213777898, + "grad_norm": 7.3125, + "learning_rate": 5.818911457861683e-06, + "loss": 1.05205202, + "memory(GiB)": 302.58, + "step": 169960, + "train_speed(iter/s)": 0.124042 + }, + { + "acc": 0.75056901, + "epoch": 0.950608670850769, + "grad_norm": 7.5, + "learning_rate": 5.817999225641385e-06, + "loss": 0.99062223, + "memory(GiB)": 302.58, + "step": 169980, + "train_speed(iter/s)": 0.124049 + }, + { + "acc": 0.73262749, + "epoch": 0.9507205203237483, + "grad_norm": 7.375, + "learning_rate": 5.817086965442921e-06, + "loss": 1.04799728, + "memory(GiB)": 302.58, + "step": 170000, + "train_speed(iter/s)": 0.124056 + }, + { + "epoch": 0.9507205203237483, + "eval_acc": 0.7061549206721045, + "eval_loss": 1.0136560201644897, + "eval_runtime": 7498.4881, + "eval_samples_per_second": 10.04, + "eval_steps_per_second": 10.04, + "step": 170000 + }, + { + "acc": 0.74653797, + "epoch": 0.9508323697967276, + "grad_norm": 8.4375, + "learning_rate": 5.8161746772974905e-06, + "loss": 0.99627905, + "memory(GiB)": 302.58, + "step": 170020, + "train_speed(iter/s)": 0.123376 + }, + { + "acc": 0.75534139, + "epoch": 0.9509442192697068, + "grad_norm": 6.25, + "learning_rate": 5.815262361236298e-06, + "loss": 0.95586767, + "memory(GiB)": 302.58, + "step": 170040, + "train_speed(iter/s)": 0.123383 + }, + { + "acc": 0.72952747, + "epoch": 0.9510560687426861, + "grad_norm": 7.3125, + "learning_rate": 5.814350017290548e-06, + "loss": 1.07328548, + "memory(GiB)": 302.58, + "step": 170060, + "train_speed(iter/s)": 0.12339 + }, + { + "acc": 0.74143424, + "epoch": 0.9511679182156654, + "grad_norm": 4.59375, + "learning_rate": 5.8134376454914445e-06, + "loss": 1.03424425, + "memory(GiB)": 302.58, + "step": 170080, + "train_speed(iter/s)": 0.123397 + }, + { + "acc": 0.75698738, + "epoch": 0.9512797676886446, + "grad_norm": 6.21875, + "learning_rate": 5.812525245870193e-06, + "loss": 0.94202776, + "memory(GiB)": 302.58, + "step": 170100, + "train_speed(iter/s)": 0.123404 + }, + { + "acc": 0.74862156, + "epoch": 0.9513916171616239, + "grad_norm": 6.65625, + "learning_rate": 5.8116128184580035e-06, + "loss": 0.9489893, + "memory(GiB)": 302.58, + "step": 170120, + "train_speed(iter/s)": 0.123411 + }, + { + "acc": 0.72909703, + "epoch": 0.9515034666346032, + "grad_norm": 7.59375, + "learning_rate": 5.810700363286082e-06, + "loss": 1.085709, + "memory(GiB)": 302.58, + "step": 170140, + "train_speed(iter/s)": 0.123417 + }, + { + "acc": 0.73451767, + "epoch": 0.9516153161075824, + "grad_norm": 5.71875, + "learning_rate": 5.809787880385637e-06, + "loss": 1.06389122, + "memory(GiB)": 302.58, + "step": 170160, + "train_speed(iter/s)": 0.123424 + }, + { + "acc": 0.74358649, + "epoch": 0.9517271655805617, + "grad_norm": 6.75, + "learning_rate": 5.808875369787878e-06, + "loss": 1.00261002, + "memory(GiB)": 302.58, + "step": 170180, + "train_speed(iter/s)": 0.123431 + }, + { + "acc": 0.73769317, + "epoch": 0.951839015053541, + "grad_norm": 6.71875, + "learning_rate": 5.807962831524019e-06, + "loss": 1.05569801, + "memory(GiB)": 302.58, + "step": 170200, + "train_speed(iter/s)": 0.123438 + }, + { + "acc": 0.75895271, + "epoch": 0.9519508645265202, + "grad_norm": 12.375, + "learning_rate": 5.807050265625267e-06, + "loss": 0.95328655, + "memory(GiB)": 302.58, + "step": 170220, + "train_speed(iter/s)": 0.123446 + }, + { + "acc": 0.73073616, + "epoch": 0.9520627139994995, + "grad_norm": 7.53125, + "learning_rate": 5.8061376721228355e-06, + "loss": 1.05641537, + "memory(GiB)": 302.58, + "step": 170240, + "train_speed(iter/s)": 0.123452 + }, + { + "acc": 0.75152025, + "epoch": 0.9521745634724788, + "grad_norm": 7.0, + "learning_rate": 5.805225051047942e-06, + "loss": 0.96926746, + "memory(GiB)": 302.58, + "step": 170260, + "train_speed(iter/s)": 0.123459 + }, + { + "acc": 0.73493948, + "epoch": 0.952286412945458, + "grad_norm": 9.75, + "learning_rate": 5.804312402431797e-06, + "loss": 1.05438375, + "memory(GiB)": 302.58, + "step": 170280, + "train_speed(iter/s)": 0.123467 + }, + { + "acc": 0.74136448, + "epoch": 0.9523982624184373, + "grad_norm": 8.5, + "learning_rate": 5.803399726305618e-06, + "loss": 1.02312326, + "memory(GiB)": 302.58, + "step": 170300, + "train_speed(iter/s)": 0.123474 + }, + { + "acc": 0.73758879, + "epoch": 0.9525101118914165, + "grad_norm": 9.1875, + "learning_rate": 5.802487022700621e-06, + "loss": 1.02935371, + "memory(GiB)": 302.58, + "step": 170320, + "train_speed(iter/s)": 0.123481 + }, + { + "acc": 0.73995643, + "epoch": 0.9526219613643958, + "grad_norm": 7.1875, + "learning_rate": 5.801574291648023e-06, + "loss": 1.02833652, + "memory(GiB)": 302.58, + "step": 170340, + "train_speed(iter/s)": 0.123488 + }, + { + "acc": 0.75061817, + "epoch": 0.9527338108373751, + "grad_norm": 9.875, + "learning_rate": 5.800661533179043e-06, + "loss": 0.99510202, + "memory(GiB)": 302.58, + "step": 170360, + "train_speed(iter/s)": 0.123495 + }, + { + "acc": 0.75702415, + "epoch": 0.9528456603103543, + "grad_norm": 5.34375, + "learning_rate": 5.7997487473249e-06, + "loss": 0.94815168, + "memory(GiB)": 302.58, + "step": 170380, + "train_speed(iter/s)": 0.123502 + }, + { + "acc": 0.72648587, + "epoch": 0.9529575097833336, + "grad_norm": 6.4375, + "learning_rate": 5.798835934116812e-06, + "loss": 1.09149628, + "memory(GiB)": 302.58, + "step": 170400, + "train_speed(iter/s)": 0.123508 + }, + { + "acc": 0.75037041, + "epoch": 0.9530693592563129, + "grad_norm": 6.0625, + "learning_rate": 5.7979230935860035e-06, + "loss": 0.98659182, + "memory(GiB)": 302.58, + "step": 170420, + "train_speed(iter/s)": 0.123514 + }, + { + "acc": 0.7543931, + "epoch": 0.9531812087292921, + "grad_norm": 4.96875, + "learning_rate": 5.797010225763695e-06, + "loss": 0.97470484, + "memory(GiB)": 302.58, + "step": 170440, + "train_speed(iter/s)": 0.123521 + }, + { + "acc": 0.73540053, + "epoch": 0.9532930582022714, + "grad_norm": 8.375, + "learning_rate": 5.79609733068111e-06, + "loss": 1.04513521, + "memory(GiB)": 302.58, + "step": 170460, + "train_speed(iter/s)": 0.123528 + }, + { + "acc": 0.76160436, + "epoch": 0.9534049076752507, + "grad_norm": 9.4375, + "learning_rate": 5.795184408369471e-06, + "loss": 0.92040462, + "memory(GiB)": 302.58, + "step": 170480, + "train_speed(iter/s)": 0.123535 + }, + { + "acc": 0.74424081, + "epoch": 0.9535167571482299, + "grad_norm": 5.96875, + "learning_rate": 5.794271458860005e-06, + "loss": 1.00263529, + "memory(GiB)": 302.58, + "step": 170500, + "train_speed(iter/s)": 0.123542 + }, + { + "acc": 0.74649787, + "epoch": 0.9536286066212092, + "grad_norm": 6.4375, + "learning_rate": 5.793358482183935e-06, + "loss": 0.99034195, + "memory(GiB)": 302.58, + "step": 170520, + "train_speed(iter/s)": 0.123549 + }, + { + "acc": 0.75915852, + "epoch": 0.9537404560941884, + "grad_norm": 7.46875, + "learning_rate": 5.792445478372489e-06, + "loss": 0.95026178, + "memory(GiB)": 302.58, + "step": 170540, + "train_speed(iter/s)": 0.123556 + }, + { + "acc": 0.75342765, + "epoch": 0.9538523055671677, + "grad_norm": 6.65625, + "learning_rate": 5.7915324474568955e-06, + "loss": 0.99015036, + "memory(GiB)": 302.58, + "step": 170560, + "train_speed(iter/s)": 0.123563 + }, + { + "acc": 0.75093899, + "epoch": 0.953964155040147, + "grad_norm": 8.125, + "learning_rate": 5.790619389468383e-06, + "loss": 0.96890478, + "memory(GiB)": 302.58, + "step": 170580, + "train_speed(iter/s)": 0.12357 + }, + { + "acc": 0.73271112, + "epoch": 0.9540760045131262, + "grad_norm": 7.875, + "learning_rate": 5.789706304438181e-06, + "loss": 1.04147978, + "memory(GiB)": 302.58, + "step": 170600, + "train_speed(iter/s)": 0.123577 + }, + { + "acc": 0.73762341, + "epoch": 0.9541878539861055, + "grad_norm": 6.125, + "learning_rate": 5.788793192397517e-06, + "loss": 1.06294355, + "memory(GiB)": 302.58, + "step": 170620, + "train_speed(iter/s)": 0.123583 + }, + { + "acc": 0.75082746, + "epoch": 0.9542997034590848, + "grad_norm": 10.3125, + "learning_rate": 5.787880053377627e-06, + "loss": 0.96403294, + "memory(GiB)": 302.58, + "step": 170640, + "train_speed(iter/s)": 0.12359 + }, + { + "acc": 0.75712738, + "epoch": 0.954411552932064, + "grad_norm": 6.125, + "learning_rate": 5.786966887409739e-06, + "loss": 0.94187012, + "memory(GiB)": 302.58, + "step": 170660, + "train_speed(iter/s)": 0.123597 + }, + { + "acc": 0.74232383, + "epoch": 0.9545234024050433, + "grad_norm": 9.0, + "learning_rate": 5.786053694525089e-06, + "loss": 1.0185791, + "memory(GiB)": 302.58, + "step": 170680, + "train_speed(iter/s)": 0.123603 + }, + { + "acc": 0.75297956, + "epoch": 0.9546352518780226, + "grad_norm": 7.1875, + "learning_rate": 5.7851404747549105e-06, + "loss": 0.97103214, + "memory(GiB)": 302.58, + "step": 170700, + "train_speed(iter/s)": 0.12361 + }, + { + "acc": 0.72906976, + "epoch": 0.9547471013510018, + "grad_norm": 4.78125, + "learning_rate": 5.784227228130437e-06, + "loss": 1.06969881, + "memory(GiB)": 302.58, + "step": 170720, + "train_speed(iter/s)": 0.123616 + }, + { + "acc": 0.75276847, + "epoch": 0.9548589508239811, + "grad_norm": 7.0625, + "learning_rate": 5.7833139546829065e-06, + "loss": 0.95734539, + "memory(GiB)": 302.58, + "step": 170740, + "train_speed(iter/s)": 0.123623 + }, + { + "acc": 0.76594176, + "epoch": 0.9549708002969604, + "grad_norm": 9.0, + "learning_rate": 5.7824006544435555e-06, + "loss": 0.90964575, + "memory(GiB)": 302.58, + "step": 170760, + "train_speed(iter/s)": 0.123629 + }, + { + "acc": 0.74626179, + "epoch": 0.9550826497699396, + "grad_norm": 6.40625, + "learning_rate": 5.78148732744362e-06, + "loss": 1.0072257, + "memory(GiB)": 302.58, + "step": 170780, + "train_speed(iter/s)": 0.123636 + }, + { + "acc": 0.74088168, + "epoch": 0.9551944992429189, + "grad_norm": 5.03125, + "learning_rate": 5.78057397371434e-06, + "loss": 1.01855993, + "memory(GiB)": 302.58, + "step": 170800, + "train_speed(iter/s)": 0.123643 + }, + { + "acc": 0.73818364, + "epoch": 0.9553063487158981, + "grad_norm": 7.65625, + "learning_rate": 5.779660593286954e-06, + "loss": 1.05171165, + "memory(GiB)": 302.58, + "step": 170820, + "train_speed(iter/s)": 0.123649 + }, + { + "acc": 0.72900906, + "epoch": 0.9554181981888774, + "grad_norm": 10.8125, + "learning_rate": 5.778747186192706e-06, + "loss": 1.07991209, + "memory(GiB)": 302.58, + "step": 170840, + "train_speed(iter/s)": 0.123656 + }, + { + "acc": 0.74144387, + "epoch": 0.9555300476618567, + "grad_norm": 5.375, + "learning_rate": 5.777833752462834e-06, + "loss": 1.01070461, + "memory(GiB)": 302.58, + "step": 170860, + "train_speed(iter/s)": 0.123663 + }, + { + "acc": 0.73994226, + "epoch": 0.9556418971348359, + "grad_norm": 6.875, + "learning_rate": 5.7769202921285805e-06, + "loss": 1.02750282, + "memory(GiB)": 302.58, + "step": 170880, + "train_speed(iter/s)": 0.12367 + }, + { + "acc": 0.74953761, + "epoch": 0.9557537466078152, + "grad_norm": 6.34375, + "learning_rate": 5.7760068052211885e-06, + "loss": 0.97639494, + "memory(GiB)": 302.58, + "step": 170900, + "train_speed(iter/s)": 0.123677 + }, + { + "acc": 0.74253073, + "epoch": 0.9558655960807945, + "grad_norm": 6.8125, + "learning_rate": 5.775093291771905e-06, + "loss": 1.02932587, + "memory(GiB)": 302.58, + "step": 170920, + "train_speed(iter/s)": 0.123684 + }, + { + "acc": 0.72891474, + "epoch": 0.9559774455537737, + "grad_norm": 7.625, + "learning_rate": 5.774179751811972e-06, + "loss": 1.06333227, + "memory(GiB)": 302.58, + "step": 170940, + "train_speed(iter/s)": 0.123691 + }, + { + "acc": 0.74309125, + "epoch": 0.956089295026753, + "grad_norm": 9.1875, + "learning_rate": 5.773266185372637e-06, + "loss": 1.00956287, + "memory(GiB)": 302.58, + "step": 170960, + "train_speed(iter/s)": 0.123698 + }, + { + "acc": 0.73145895, + "epoch": 0.9562011444997323, + "grad_norm": 5.5, + "learning_rate": 5.7723525924851466e-06, + "loss": 1.05282269, + "memory(GiB)": 302.58, + "step": 170980, + "train_speed(iter/s)": 0.123705 + }, + { + "acc": 0.72932868, + "epoch": 0.9563129939727115, + "grad_norm": 8.1875, + "learning_rate": 5.7714389731807485e-06, + "loss": 1.07805748, + "memory(GiB)": 302.58, + "step": 171000, + "train_speed(iter/s)": 0.123712 + }, + { + "acc": 0.75313263, + "epoch": 0.9564248434456908, + "grad_norm": 6.40625, + "learning_rate": 5.770525327490691e-06, + "loss": 0.94968491, + "memory(GiB)": 302.58, + "step": 171020, + "train_speed(iter/s)": 0.123718 + }, + { + "acc": 0.72240829, + "epoch": 0.95653669291867, + "grad_norm": 6.34375, + "learning_rate": 5.769611655446225e-06, + "loss": 1.09395943, + "memory(GiB)": 302.58, + "step": 171040, + "train_speed(iter/s)": 0.123726 + }, + { + "acc": 0.73405643, + "epoch": 0.9566485423916493, + "grad_norm": 7.5, + "learning_rate": 5.768697957078599e-06, + "loss": 1.05660563, + "memory(GiB)": 302.58, + "step": 171060, + "train_speed(iter/s)": 0.123732 + }, + { + "acc": 0.74543905, + "epoch": 0.9567603918646286, + "grad_norm": 7.90625, + "learning_rate": 5.767784232419067e-06, + "loss": 1.00831146, + "memory(GiB)": 302.58, + "step": 171080, + "train_speed(iter/s)": 0.123739 + }, + { + "acc": 0.74850926, + "epoch": 0.9568722413376078, + "grad_norm": 14.1875, + "learning_rate": 5.766870481498876e-06, + "loss": 0.99551601, + "memory(GiB)": 302.58, + "step": 171100, + "train_speed(iter/s)": 0.123745 + }, + { + "acc": 0.75549645, + "epoch": 0.9569840908105871, + "grad_norm": 7.71875, + "learning_rate": 5.765956704349287e-06, + "loss": 0.96539202, + "memory(GiB)": 302.58, + "step": 171120, + "train_speed(iter/s)": 0.123752 + }, + { + "acc": 0.74761906, + "epoch": 0.9570959402835664, + "grad_norm": 8.1875, + "learning_rate": 5.765042901001547e-06, + "loss": 0.99189606, + "memory(GiB)": 302.58, + "step": 171140, + "train_speed(iter/s)": 0.123758 + }, + { + "acc": 0.76033106, + "epoch": 0.9572077897565456, + "grad_norm": 9.8125, + "learning_rate": 5.764129071486916e-06, + "loss": 0.92293758, + "memory(GiB)": 302.58, + "step": 171160, + "train_speed(iter/s)": 0.123765 + }, + { + "acc": 0.74075704, + "epoch": 0.9573196392295249, + "grad_norm": 5.90625, + "learning_rate": 5.7632152158366474e-06, + "loss": 1.0091032, + "memory(GiB)": 302.58, + "step": 171180, + "train_speed(iter/s)": 0.123771 + }, + { + "acc": 0.74582214, + "epoch": 0.9574314887025042, + "grad_norm": 6.25, + "learning_rate": 5.762301334081998e-06, + "loss": 0.99987974, + "memory(GiB)": 302.58, + "step": 171200, + "train_speed(iter/s)": 0.123778 + }, + { + "acc": 0.72042785, + "epoch": 0.9575433381754834, + "grad_norm": 6.5, + "learning_rate": 5.7613874262542254e-06, + "loss": 1.10950365, + "memory(GiB)": 302.58, + "step": 171220, + "train_speed(iter/s)": 0.123785 + }, + { + "acc": 0.74498687, + "epoch": 0.9576551876484627, + "grad_norm": 9.9375, + "learning_rate": 5.760473492384589e-06, + "loss": 1.02098408, + "memory(GiB)": 302.58, + "step": 171240, + "train_speed(iter/s)": 0.123791 + }, + { + "acc": 0.72413788, + "epoch": 0.957767037121442, + "grad_norm": 8.875, + "learning_rate": 5.759559532504346e-06, + "loss": 1.07558403, + "memory(GiB)": 302.58, + "step": 171260, + "train_speed(iter/s)": 0.123798 + }, + { + "acc": 0.73975339, + "epoch": 0.9578788865944212, + "grad_norm": 10.0625, + "learning_rate": 5.758645546644761e-06, + "loss": 1.03116035, + "memory(GiB)": 302.58, + "step": 171280, + "train_speed(iter/s)": 0.123804 + }, + { + "acc": 0.73163619, + "epoch": 0.9579907360674005, + "grad_norm": 5.0, + "learning_rate": 5.757731534837092e-06, + "loss": 1.03919792, + "memory(GiB)": 302.58, + "step": 171300, + "train_speed(iter/s)": 0.123811 + }, + { + "acc": 0.76761394, + "epoch": 0.9581025855403797, + "grad_norm": 12.0, + "learning_rate": 5.756817497112601e-06, + "loss": 0.90030975, + "memory(GiB)": 302.58, + "step": 171320, + "train_speed(iter/s)": 0.123819 + }, + { + "acc": 0.73250022, + "epoch": 0.958214435013359, + "grad_norm": 7.875, + "learning_rate": 5.755903433502553e-06, + "loss": 1.05632648, + "memory(GiB)": 302.58, + "step": 171340, + "train_speed(iter/s)": 0.123825 + }, + { + "acc": 0.7265347, + "epoch": 0.9583262844863383, + "grad_norm": 8.4375, + "learning_rate": 5.75498934403821e-06, + "loss": 1.08367319, + "memory(GiB)": 302.58, + "step": 171360, + "train_speed(iter/s)": 0.123832 + }, + { + "acc": 0.75880799, + "epoch": 0.9584381339593175, + "grad_norm": 10.8125, + "learning_rate": 5.754075228750837e-06, + "loss": 0.95364094, + "memory(GiB)": 302.58, + "step": 171380, + "train_speed(iter/s)": 0.123839 + }, + { + "acc": 0.74713407, + "epoch": 0.9585499834322968, + "grad_norm": 6.375, + "learning_rate": 5.753161087671701e-06, + "loss": 1.0040431, + "memory(GiB)": 302.58, + "step": 171400, + "train_speed(iter/s)": 0.123847 + }, + { + "acc": 0.73267894, + "epoch": 0.9586618329052761, + "grad_norm": 6.6875, + "learning_rate": 5.752246920832066e-06, + "loss": 1.05960398, + "memory(GiB)": 302.58, + "step": 171420, + "train_speed(iter/s)": 0.123854 + }, + { + "acc": 0.73142519, + "epoch": 0.9587736823782553, + "grad_norm": 7.90625, + "learning_rate": 5.751332728263202e-06, + "loss": 1.06260557, + "memory(GiB)": 302.58, + "step": 171440, + "train_speed(iter/s)": 0.12386 + }, + { + "acc": 0.7529325, + "epoch": 0.9588855318512346, + "grad_norm": 6.65625, + "learning_rate": 5.750418509996377e-06, + "loss": 0.965798, + "memory(GiB)": 302.58, + "step": 171460, + "train_speed(iter/s)": 0.123867 + }, + { + "acc": 0.74978023, + "epoch": 0.9589973813242139, + "grad_norm": 5.25, + "learning_rate": 5.74950426606286e-06, + "loss": 0.98638029, + "memory(GiB)": 302.58, + "step": 171480, + "train_speed(iter/s)": 0.123874 + }, + { + "acc": 0.74648943, + "epoch": 0.9591092307971931, + "grad_norm": 8.375, + "learning_rate": 5.7485899964939185e-06, + "loss": 1.01580305, + "memory(GiB)": 302.58, + "step": 171500, + "train_speed(iter/s)": 0.12388 + }, + { + "acc": 0.73863435, + "epoch": 0.9592210802701724, + "grad_norm": 7.25, + "learning_rate": 5.7476757013208255e-06, + "loss": 1.03890963, + "memory(GiB)": 302.58, + "step": 171520, + "train_speed(iter/s)": 0.123887 + }, + { + "acc": 0.76392374, + "epoch": 0.9593329297431517, + "grad_norm": 7.3125, + "learning_rate": 5.746761380574852e-06, + "loss": 0.9043149, + "memory(GiB)": 302.58, + "step": 171540, + "train_speed(iter/s)": 0.123894 + }, + { + "acc": 0.74616022, + "epoch": 0.9594447792161309, + "grad_norm": 5.875, + "learning_rate": 5.745847034287273e-06, + "loss": 0.99251814, + "memory(GiB)": 302.58, + "step": 171560, + "train_speed(iter/s)": 0.123901 + }, + { + "acc": 0.72716618, + "epoch": 0.9595566286891102, + "grad_norm": 9.4375, + "learning_rate": 5.7449326624893605e-06, + "loss": 1.10200167, + "memory(GiB)": 302.58, + "step": 171580, + "train_speed(iter/s)": 0.123908 + }, + { + "acc": 0.7347888, + "epoch": 0.9596684781620894, + "grad_norm": 7.75, + "learning_rate": 5.7440182652123885e-06, + "loss": 1.043822, + "memory(GiB)": 302.58, + "step": 171600, + "train_speed(iter/s)": 0.123914 + }, + { + "acc": 0.74530244, + "epoch": 0.9597803276350687, + "grad_norm": 8.9375, + "learning_rate": 5.743103842487633e-06, + "loss": 0.98773613, + "memory(GiB)": 302.58, + "step": 171620, + "train_speed(iter/s)": 0.123921 + }, + { + "acc": 0.73834925, + "epoch": 0.959892177108048, + "grad_norm": 7.875, + "learning_rate": 5.7421893943463705e-06, + "loss": 1.01251926, + "memory(GiB)": 302.58, + "step": 171640, + "train_speed(iter/s)": 0.123928 + }, + { + "acc": 0.74357958, + "epoch": 0.9600040265810272, + "grad_norm": 8.5625, + "learning_rate": 5.741274920819876e-06, + "loss": 1.02682104, + "memory(GiB)": 302.58, + "step": 171660, + "train_speed(iter/s)": 0.123935 + }, + { + "acc": 0.72945514, + "epoch": 0.9601158760540065, + "grad_norm": 7.9375, + "learning_rate": 5.740360421939429e-06, + "loss": 1.05385952, + "memory(GiB)": 302.58, + "step": 171680, + "train_speed(iter/s)": 0.123942 + }, + { + "acc": 0.73633986, + "epoch": 0.9602277255269858, + "grad_norm": 5.84375, + "learning_rate": 5.739445897736307e-06, + "loss": 1.04057503, + "memory(GiB)": 302.58, + "step": 171700, + "train_speed(iter/s)": 0.123949 + }, + { + "acc": 0.73616853, + "epoch": 0.960339574999965, + "grad_norm": 5.0625, + "learning_rate": 5.7385313482417915e-06, + "loss": 1.03351402, + "memory(GiB)": 302.58, + "step": 171720, + "train_speed(iter/s)": 0.123956 + }, + { + "acc": 0.72006946, + "epoch": 0.9604514244729443, + "grad_norm": 5.9375, + "learning_rate": 5.737616773487162e-06, + "loss": 1.13655176, + "memory(GiB)": 302.58, + "step": 171740, + "train_speed(iter/s)": 0.123963 + }, + { + "acc": 0.73954325, + "epoch": 0.9605632739459236, + "grad_norm": 8.1875, + "learning_rate": 5.7367021735037e-06, + "loss": 1.02311211, + "memory(GiB)": 302.58, + "step": 171760, + "train_speed(iter/s)": 0.12397 + }, + { + "acc": 0.7558444, + "epoch": 0.9606751234189028, + "grad_norm": 5.5, + "learning_rate": 5.735787548322688e-06, + "loss": 0.94852123, + "memory(GiB)": 302.58, + "step": 171780, + "train_speed(iter/s)": 0.123976 + }, + { + "acc": 0.74927216, + "epoch": 0.9607869728918821, + "grad_norm": 6.46875, + "learning_rate": 5.734872897975408e-06, + "loss": 0.98596697, + "memory(GiB)": 302.58, + "step": 171800, + "train_speed(iter/s)": 0.123983 + }, + { + "acc": 0.74580498, + "epoch": 0.9608988223648613, + "grad_norm": 8.5625, + "learning_rate": 5.733958222493143e-06, + "loss": 1.00892305, + "memory(GiB)": 302.58, + "step": 171820, + "train_speed(iter/s)": 0.123989 + }, + { + "acc": 0.73841143, + "epoch": 0.9610106718378406, + "grad_norm": 8.0, + "learning_rate": 5.733043521907182e-06, + "loss": 1.04446793, + "memory(GiB)": 302.58, + "step": 171840, + "train_speed(iter/s)": 0.123996 + }, + { + "acc": 0.74483514, + "epoch": 0.9611225213108199, + "grad_norm": 9.5, + "learning_rate": 5.732128796248808e-06, + "loss": 0.99580927, + "memory(GiB)": 302.58, + "step": 171860, + "train_speed(iter/s)": 0.124003 + }, + { + "acc": 0.75008888, + "epoch": 0.9612343707837991, + "grad_norm": 7.0, + "learning_rate": 5.731214045549307e-06, + "loss": 0.98507032, + "memory(GiB)": 302.58, + "step": 171880, + "train_speed(iter/s)": 0.124009 + }, + { + "acc": 0.74903879, + "epoch": 0.9613462202567784, + "grad_norm": 6.125, + "learning_rate": 5.7302992698399685e-06, + "loss": 0.98767843, + "memory(GiB)": 302.58, + "step": 171900, + "train_speed(iter/s)": 0.124016 + }, + { + "acc": 0.7485877, + "epoch": 0.9614580697297577, + "grad_norm": 7.9375, + "learning_rate": 5.729384469152077e-06, + "loss": 0.98988047, + "memory(GiB)": 302.58, + "step": 171920, + "train_speed(iter/s)": 0.124023 + }, + { + "acc": 0.7565516, + "epoch": 0.9615699192027369, + "grad_norm": 9.0625, + "learning_rate": 5.728469643516925e-06, + "loss": 0.95296621, + "memory(GiB)": 302.58, + "step": 171940, + "train_speed(iter/s)": 0.12403 + }, + { + "acc": 0.75558448, + "epoch": 0.9616817686757162, + "grad_norm": 7.21875, + "learning_rate": 5.7275547929658e-06, + "loss": 0.95064411, + "memory(GiB)": 302.58, + "step": 171960, + "train_speed(iter/s)": 0.124037 + }, + { + "acc": 0.74371986, + "epoch": 0.9617936181486955, + "grad_norm": 8.125, + "learning_rate": 5.7266399175299955e-06, + "loss": 0.99233904, + "memory(GiB)": 302.58, + "step": 171980, + "train_speed(iter/s)": 0.124043 + }, + { + "acc": 0.74110713, + "epoch": 0.9619054676216747, + "grad_norm": 8.125, + "learning_rate": 5.725725017240802e-06, + "loss": 1.01719551, + "memory(GiB)": 302.58, + "step": 172000, + "train_speed(iter/s)": 0.12405 + }, + { + "epoch": 0.9619054676216747, + "eval_acc": 0.7061409696815891, + "eval_loss": 1.0134668350219727, + "eval_runtime": 7541.4413, + "eval_samples_per_second": 9.983, + "eval_steps_per_second": 9.983, + "step": 172000 + }, + { + "acc": 0.75298066, + "epoch": 0.962017317094654, + "grad_norm": 11.1875, + "learning_rate": 5.724810092129512e-06, + "loss": 0.97033806, + "memory(GiB)": 302.58, + "step": 172020, + "train_speed(iter/s)": 0.123374 + }, + { + "acc": 0.74176469, + "epoch": 0.9621291665676333, + "grad_norm": 8.4375, + "learning_rate": 5.723895142227417e-06, + "loss": 1.00842896, + "memory(GiB)": 302.58, + "step": 172040, + "train_speed(iter/s)": 0.123381 + }, + { + "acc": 0.73088055, + "epoch": 0.9622410160406125, + "grad_norm": 7.0625, + "learning_rate": 5.722980167565814e-06, + "loss": 1.05630207, + "memory(GiB)": 302.58, + "step": 172060, + "train_speed(iter/s)": 0.123387 + }, + { + "acc": 0.74155707, + "epoch": 0.9623528655135918, + "grad_norm": 8.625, + "learning_rate": 5.722065168175996e-06, + "loss": 1.03286915, + "memory(GiB)": 302.58, + "step": 172080, + "train_speed(iter/s)": 0.123394 + }, + { + "acc": 0.74675689, + "epoch": 0.962464714986571, + "grad_norm": 8.625, + "learning_rate": 5.7211501440892594e-06, + "loss": 0.99352427, + "memory(GiB)": 302.58, + "step": 172100, + "train_speed(iter/s)": 0.1234 + }, + { + "acc": 0.75021853, + "epoch": 0.9625765644595503, + "grad_norm": 9.25, + "learning_rate": 5.720235095336901e-06, + "loss": 1.00197659, + "memory(GiB)": 302.58, + "step": 172120, + "train_speed(iter/s)": 0.123407 + }, + { + "acc": 0.74981799, + "epoch": 0.9626884139325296, + "grad_norm": 8.5625, + "learning_rate": 5.719320021950219e-06, + "loss": 0.9745657, + "memory(GiB)": 302.58, + "step": 172140, + "train_speed(iter/s)": 0.123414 + }, + { + "acc": 0.72764115, + "epoch": 0.9628002634055088, + "grad_norm": 6.78125, + "learning_rate": 5.718404923960511e-06, + "loss": 1.08807821, + "memory(GiB)": 302.58, + "step": 172160, + "train_speed(iter/s)": 0.123421 + }, + { + "acc": 0.74831858, + "epoch": 0.9629121128784881, + "grad_norm": 6.5625, + "learning_rate": 5.717489801399077e-06, + "loss": 1.00269699, + "memory(GiB)": 302.58, + "step": 172180, + "train_speed(iter/s)": 0.123427 + }, + { + "acc": 0.74023485, + "epoch": 0.9630239623514674, + "grad_norm": 5.90625, + "learning_rate": 5.7165746542972164e-06, + "loss": 1.0319315, + "memory(GiB)": 302.58, + "step": 172200, + "train_speed(iter/s)": 0.123434 + }, + { + "acc": 0.74564776, + "epoch": 0.9631358118244466, + "grad_norm": 8.125, + "learning_rate": 5.715659482686231e-06, + "loss": 0.99226933, + "memory(GiB)": 302.58, + "step": 172220, + "train_speed(iter/s)": 0.12344 + }, + { + "acc": 0.73820434, + "epoch": 0.9632476612974259, + "grad_norm": 6.375, + "learning_rate": 5.714744286597422e-06, + "loss": 1.03689528, + "memory(GiB)": 302.58, + "step": 172240, + "train_speed(iter/s)": 0.123447 + }, + { + "acc": 0.74518566, + "epoch": 0.9633595107704052, + "grad_norm": 6.3125, + "learning_rate": 5.713829066062091e-06, + "loss": 1.03867569, + "memory(GiB)": 302.58, + "step": 172260, + "train_speed(iter/s)": 0.123454 + }, + { + "acc": 0.73239284, + "epoch": 0.9634713602433844, + "grad_norm": 9.5625, + "learning_rate": 5.712913821111543e-06, + "loss": 1.04932613, + "memory(GiB)": 302.58, + "step": 172280, + "train_speed(iter/s)": 0.123461 + }, + { + "acc": 0.74041014, + "epoch": 0.9635832097163637, + "grad_norm": 6.5625, + "learning_rate": 5.7119985517770825e-06, + "loss": 1.01923409, + "memory(GiB)": 302.58, + "step": 172300, + "train_speed(iter/s)": 0.123468 + }, + { + "acc": 0.73786068, + "epoch": 0.963695059189343, + "grad_norm": 8.3125, + "learning_rate": 5.711083258090014e-06, + "loss": 1.03662634, + "memory(GiB)": 302.58, + "step": 172320, + "train_speed(iter/s)": 0.123475 + }, + { + "acc": 0.7552618, + "epoch": 0.9638069086623222, + "grad_norm": 10.3125, + "learning_rate": 5.710167940081641e-06, + "loss": 0.97496567, + "memory(GiB)": 302.58, + "step": 172340, + "train_speed(iter/s)": 0.123482 + }, + { + "acc": 0.74791894, + "epoch": 0.9639187581353015, + "grad_norm": 7.1875, + "learning_rate": 5.709252597783275e-06, + "loss": 0.98925514, + "memory(GiB)": 302.58, + "step": 172360, + "train_speed(iter/s)": 0.123489 + }, + { + "acc": 0.73909173, + "epoch": 0.9640306076082807, + "grad_norm": 7.3125, + "learning_rate": 5.708337231226221e-06, + "loss": 1.02347622, + "memory(GiB)": 302.58, + "step": 172380, + "train_speed(iter/s)": 0.123495 + }, + { + "acc": 0.75566812, + "epoch": 0.96414245708126, + "grad_norm": 9.0, + "learning_rate": 5.7074218404417855e-06, + "loss": 0.97428141, + "memory(GiB)": 302.58, + "step": 172400, + "train_speed(iter/s)": 0.123502 + }, + { + "acc": 0.73708725, + "epoch": 0.9642543065542393, + "grad_norm": 5.59375, + "learning_rate": 5.706506425461281e-06, + "loss": 1.02760019, + "memory(GiB)": 302.58, + "step": 172420, + "train_speed(iter/s)": 0.123508 + }, + { + "acc": 0.73494058, + "epoch": 0.9643661560272185, + "grad_norm": 10.0, + "learning_rate": 5.705590986316017e-06, + "loss": 1.064536, + "memory(GiB)": 302.58, + "step": 172440, + "train_speed(iter/s)": 0.123515 + }, + { + "acc": 0.73938546, + "epoch": 0.9644780055001978, + "grad_norm": 6.21875, + "learning_rate": 5.704675523037304e-06, + "loss": 1.02271442, + "memory(GiB)": 302.58, + "step": 172460, + "train_speed(iter/s)": 0.123522 + }, + { + "acc": 0.74113345, + "epoch": 0.9645898549731771, + "grad_norm": 7.5, + "learning_rate": 5.703760035656453e-06, + "loss": 0.99513578, + "memory(GiB)": 302.58, + "step": 172480, + "train_speed(iter/s)": 0.123528 + }, + { + "acc": 0.75202527, + "epoch": 0.9647017044461563, + "grad_norm": 10.3125, + "learning_rate": 5.702844524204778e-06, + "loss": 0.98121929, + "memory(GiB)": 302.58, + "step": 172500, + "train_speed(iter/s)": 0.123535 + }, + { + "acc": 0.73699951, + "epoch": 0.9648135539191356, + "grad_norm": 8.125, + "learning_rate": 5.701928988713592e-06, + "loss": 1.0451334, + "memory(GiB)": 302.58, + "step": 172520, + "train_speed(iter/s)": 0.123541 + }, + { + "acc": 0.76049018, + "epoch": 0.9649254033921149, + "grad_norm": 7.0, + "learning_rate": 5.701013429214206e-06, + "loss": 0.94779692, + "memory(GiB)": 302.58, + "step": 172540, + "train_speed(iter/s)": 0.123548 + }, + { + "acc": 0.73961735, + "epoch": 0.9650372528650941, + "grad_norm": 7.8125, + "learning_rate": 5.700097845737941e-06, + "loss": 1.02214317, + "memory(GiB)": 302.58, + "step": 172560, + "train_speed(iter/s)": 0.123555 + }, + { + "acc": 0.75832791, + "epoch": 0.9651491023380734, + "grad_norm": 8.625, + "learning_rate": 5.699182238316107e-06, + "loss": 0.94593725, + "memory(GiB)": 302.58, + "step": 172580, + "train_speed(iter/s)": 0.123561 + }, + { + "acc": 0.7509335, + "epoch": 0.9652609518110526, + "grad_norm": 6.75, + "learning_rate": 5.698266606980026e-06, + "loss": 0.98457127, + "memory(GiB)": 302.58, + "step": 172600, + "train_speed(iter/s)": 0.123568 + }, + { + "acc": 0.72882509, + "epoch": 0.9653728012840319, + "grad_norm": 6.0625, + "learning_rate": 5.697350951761013e-06, + "loss": 1.0759573, + "memory(GiB)": 302.58, + "step": 172620, + "train_speed(iter/s)": 0.123575 + }, + { + "acc": 0.75529923, + "epoch": 0.9654846507570112, + "grad_norm": 8.875, + "learning_rate": 5.696435272690385e-06, + "loss": 0.9709795, + "memory(GiB)": 302.58, + "step": 172640, + "train_speed(iter/s)": 0.123581 + }, + { + "acc": 0.74984264, + "epoch": 0.9655965002299904, + "grad_norm": 5.84375, + "learning_rate": 5.695519569799461e-06, + "loss": 0.9918191, + "memory(GiB)": 302.58, + "step": 172660, + "train_speed(iter/s)": 0.123588 + }, + { + "acc": 0.74156899, + "epoch": 0.9657083497029697, + "grad_norm": 10.3125, + "learning_rate": 5.694603843119564e-06, + "loss": 1.0207346, + "memory(GiB)": 302.58, + "step": 172680, + "train_speed(iter/s)": 0.123595 + }, + { + "acc": 0.73902297, + "epoch": 0.965820199175949, + "grad_norm": 5.65625, + "learning_rate": 5.693688092682013e-06, + "loss": 1.03758373, + "memory(GiB)": 302.58, + "step": 172700, + "train_speed(iter/s)": 0.123602 + }, + { + "acc": 0.7385757, + "epoch": 0.9659320486489282, + "grad_norm": 8.4375, + "learning_rate": 5.69277231851813e-06, + "loss": 1.0405261, + "memory(GiB)": 302.58, + "step": 172720, + "train_speed(iter/s)": 0.123608 + }, + { + "acc": 0.76152782, + "epoch": 0.9660438981219075, + "grad_norm": 8.0, + "learning_rate": 5.691856520659236e-06, + "loss": 0.92749691, + "memory(GiB)": 302.58, + "step": 172740, + "train_speed(iter/s)": 0.123615 + }, + { + "acc": 0.76452641, + "epoch": 0.9661557475948868, + "grad_norm": 9.5625, + "learning_rate": 5.690940699136656e-06, + "loss": 0.92720165, + "memory(GiB)": 302.58, + "step": 172760, + "train_speed(iter/s)": 0.123622 + }, + { + "acc": 0.73313327, + "epoch": 0.966267597067866, + "grad_norm": 6.53125, + "learning_rate": 5.69002485398171e-06, + "loss": 1.05827398, + "memory(GiB)": 302.58, + "step": 172780, + "train_speed(iter/s)": 0.123629 + }, + { + "acc": 0.73540058, + "epoch": 0.9663794465408453, + "grad_norm": 7.6875, + "learning_rate": 5.689108985225729e-06, + "loss": 1.02683477, + "memory(GiB)": 302.58, + "step": 172800, + "train_speed(iter/s)": 0.123635 + }, + { + "acc": 0.75083041, + "epoch": 0.9664912960138246, + "grad_norm": 6.59375, + "learning_rate": 5.688193092900036e-06, + "loss": 0.97484102, + "memory(GiB)": 302.58, + "step": 172820, + "train_speed(iter/s)": 0.123642 + }, + { + "acc": 0.74918857, + "epoch": 0.9666031454868038, + "grad_norm": 5.4375, + "learning_rate": 5.687277177035956e-06, + "loss": 0.97529507, + "memory(GiB)": 302.58, + "step": 172840, + "train_speed(iter/s)": 0.123649 + }, + { + "acc": 0.73609338, + "epoch": 0.9667149949597831, + "grad_norm": 8.625, + "learning_rate": 5.686361237664817e-06, + "loss": 1.04044733, + "memory(GiB)": 302.58, + "step": 172860, + "train_speed(iter/s)": 0.123656 + }, + { + "acc": 0.74415669, + "epoch": 0.9668268444327623, + "grad_norm": 7.5, + "learning_rate": 5.685445274817948e-06, + "loss": 0.98253412, + "memory(GiB)": 302.58, + "step": 172880, + "train_speed(iter/s)": 0.123663 + }, + { + "acc": 0.72507386, + "epoch": 0.9669386939057416, + "grad_norm": 9.5, + "learning_rate": 5.6845292885266765e-06, + "loss": 1.10224047, + "memory(GiB)": 302.58, + "step": 172900, + "train_speed(iter/s)": 0.12367 + }, + { + "acc": 0.76115308, + "epoch": 0.9670505433787209, + "grad_norm": 7.25, + "learning_rate": 5.6836132788223345e-06, + "loss": 0.92273836, + "memory(GiB)": 302.58, + "step": 172920, + "train_speed(iter/s)": 0.123677 + }, + { + "acc": 0.73753862, + "epoch": 0.9671623928517001, + "grad_norm": 8.875, + "learning_rate": 5.6826972457362485e-06, + "loss": 1.0214056, + "memory(GiB)": 302.58, + "step": 172940, + "train_speed(iter/s)": 0.123684 + }, + { + "acc": 0.73763118, + "epoch": 0.9672742423246794, + "grad_norm": 6.84375, + "learning_rate": 5.681781189299753e-06, + "loss": 1.0574892, + "memory(GiB)": 302.58, + "step": 172960, + "train_speed(iter/s)": 0.123691 + }, + { + "acc": 0.74460554, + "epoch": 0.9673860917976587, + "grad_norm": 8.125, + "learning_rate": 5.680865109544178e-06, + "loss": 1.02368717, + "memory(GiB)": 302.58, + "step": 172980, + "train_speed(iter/s)": 0.123697 + }, + { + "acc": 0.75206776, + "epoch": 0.9674979412706379, + "grad_norm": 11.0, + "learning_rate": 5.679949006500858e-06, + "loss": 0.95420198, + "memory(GiB)": 302.58, + "step": 173000, + "train_speed(iter/s)": 0.123703 + }, + { + "acc": 0.74704204, + "epoch": 0.9676097907436172, + "grad_norm": 5.1875, + "learning_rate": 5.679032880201126e-06, + "loss": 1.01261272, + "memory(GiB)": 302.58, + "step": 173020, + "train_speed(iter/s)": 0.12371 + }, + { + "acc": 0.71980772, + "epoch": 0.9677216402165965, + "grad_norm": 5.875, + "learning_rate": 5.678116730676316e-06, + "loss": 1.11336231, + "memory(GiB)": 302.58, + "step": 173040, + "train_speed(iter/s)": 0.123717 + }, + { + "acc": 0.75185709, + "epoch": 0.9678334896895757, + "grad_norm": 5.375, + "learning_rate": 5.6772005579577646e-06, + "loss": 0.97711029, + "memory(GiB)": 302.58, + "step": 173060, + "train_speed(iter/s)": 0.123723 + }, + { + "acc": 0.73528113, + "epoch": 0.967945339162555, + "grad_norm": 7.28125, + "learning_rate": 5.676284362076806e-06, + "loss": 1.04262104, + "memory(GiB)": 302.58, + "step": 173080, + "train_speed(iter/s)": 0.12373 + }, + { + "acc": 0.74382834, + "epoch": 0.9680571886355342, + "grad_norm": 9.6875, + "learning_rate": 5.675368143064777e-06, + "loss": 1.00871611, + "memory(GiB)": 302.58, + "step": 173100, + "train_speed(iter/s)": 0.123736 + }, + { + "acc": 0.74126234, + "epoch": 0.9681690381085135, + "grad_norm": 9.4375, + "learning_rate": 5.674451900953017e-06, + "loss": 1.03289032, + "memory(GiB)": 302.58, + "step": 173120, + "train_speed(iter/s)": 0.123743 + }, + { + "acc": 0.75752444, + "epoch": 0.9682808875814928, + "grad_norm": 9.875, + "learning_rate": 5.673535635772863e-06, + "loss": 0.95325737, + "memory(GiB)": 302.58, + "step": 173140, + "train_speed(iter/s)": 0.12375 + }, + { + "acc": 0.75012417, + "epoch": 0.968392737054472, + "grad_norm": 7.03125, + "learning_rate": 5.672619347555655e-06, + "loss": 0.98110447, + "memory(GiB)": 302.58, + "step": 173160, + "train_speed(iter/s)": 0.123757 + }, + { + "acc": 0.75448856, + "epoch": 0.9685045865274513, + "grad_norm": 5.90625, + "learning_rate": 5.671703036332732e-06, + "loss": 0.97145023, + "memory(GiB)": 302.58, + "step": 173180, + "train_speed(iter/s)": 0.123764 + }, + { + "acc": 0.72518997, + "epoch": 0.9686164360004306, + "grad_norm": 8.125, + "learning_rate": 5.670786702135436e-06, + "loss": 1.05818501, + "memory(GiB)": 302.58, + "step": 173200, + "train_speed(iter/s)": 0.12377 + }, + { + "acc": 0.72418242, + "epoch": 0.9687282854734098, + "grad_norm": 5.46875, + "learning_rate": 5.6698703449951076e-06, + "loss": 1.09489174, + "memory(GiB)": 302.58, + "step": 173220, + "train_speed(iter/s)": 0.123777 + }, + { + "acc": 0.74206195, + "epoch": 0.9688401349463891, + "grad_norm": 6.5625, + "learning_rate": 5.668953964943088e-06, + "loss": 1.00684404, + "memory(GiB)": 302.58, + "step": 173240, + "train_speed(iter/s)": 0.123783 + }, + { + "acc": 0.74207067, + "epoch": 0.9689519844193684, + "grad_norm": 8.3125, + "learning_rate": 5.668037562010722e-06, + "loss": 1.00990324, + "memory(GiB)": 302.58, + "step": 173260, + "train_speed(iter/s)": 0.12379 + }, + { + "acc": 0.74162388, + "epoch": 0.9690638338923476, + "grad_norm": 5.96875, + "learning_rate": 5.6671211362293544e-06, + "loss": 1.02645454, + "memory(GiB)": 302.58, + "step": 173280, + "train_speed(iter/s)": 0.123797 + }, + { + "acc": 0.75297813, + "epoch": 0.9691756833653269, + "grad_norm": 7.40625, + "learning_rate": 5.666204687630328e-06, + "loss": 0.9775712, + "memory(GiB)": 302.58, + "step": 173300, + "train_speed(iter/s)": 0.123803 + }, + { + "acc": 0.73385773, + "epoch": 0.9692875328383062, + "grad_norm": 6.40625, + "learning_rate": 5.665288216244988e-06, + "loss": 1.04914284, + "memory(GiB)": 302.58, + "step": 173320, + "train_speed(iter/s)": 0.12381 + }, + { + "acc": 0.75885825, + "epoch": 0.9693993823112854, + "grad_norm": 6.46875, + "learning_rate": 5.664371722104682e-06, + "loss": 0.92375937, + "memory(GiB)": 302.58, + "step": 173340, + "train_speed(iter/s)": 0.123817 + }, + { + "acc": 0.74159579, + "epoch": 0.9695112317842647, + "grad_norm": 7.34375, + "learning_rate": 5.6634552052407565e-06, + "loss": 1.01075478, + "memory(GiB)": 302.58, + "step": 173360, + "train_speed(iter/s)": 0.123823 + }, + { + "acc": 0.74485345, + "epoch": 0.969623081257244, + "grad_norm": 6.15625, + "learning_rate": 5.662538665684558e-06, + "loss": 1.00050812, + "memory(GiB)": 302.58, + "step": 173380, + "train_speed(iter/s)": 0.12383 + }, + { + "acc": 0.75234847, + "epoch": 0.9697349307302232, + "grad_norm": 7.21875, + "learning_rate": 5.661622103467438e-06, + "loss": 0.96206808, + "memory(GiB)": 302.58, + "step": 173400, + "train_speed(iter/s)": 0.123837 + }, + { + "acc": 0.75618963, + "epoch": 0.9698467802032026, + "grad_norm": 9.75, + "learning_rate": 5.660705518620743e-06, + "loss": 0.96268015, + "memory(GiB)": 302.58, + "step": 173420, + "train_speed(iter/s)": 0.123843 + }, + { + "acc": 0.71200757, + "epoch": 0.9699586296761818, + "grad_norm": 5.90625, + "learning_rate": 5.659788911175824e-06, + "loss": 1.16172657, + "memory(GiB)": 302.58, + "step": 173440, + "train_speed(iter/s)": 0.12385 + }, + { + "acc": 0.73050876, + "epoch": 0.9700704791491611, + "grad_norm": 6.0, + "learning_rate": 5.658872281164033e-06, + "loss": 1.07056065, + "memory(GiB)": 302.58, + "step": 173460, + "train_speed(iter/s)": 0.123857 + }, + { + "acc": 0.75777912, + "epoch": 0.9701823286221404, + "grad_norm": 9.8125, + "learning_rate": 5.6579556286167216e-06, + "loss": 0.94138937, + "memory(GiB)": 302.58, + "step": 173480, + "train_speed(iter/s)": 0.123864 + }, + { + "acc": 0.72987604, + "epoch": 0.9702941780951196, + "grad_norm": 7.40625, + "learning_rate": 5.657038953565241e-06, + "loss": 1.0717968, + "memory(GiB)": 302.58, + "step": 173500, + "train_speed(iter/s)": 0.12387 + }, + { + "acc": 0.73306217, + "epoch": 0.9704060275680989, + "grad_norm": 8.75, + "learning_rate": 5.656122256040943e-06, + "loss": 1.0636548, + "memory(GiB)": 302.58, + "step": 173520, + "train_speed(iter/s)": 0.123877 + }, + { + "acc": 0.73275495, + "epoch": 0.9705178770410782, + "grad_norm": 5.71875, + "learning_rate": 5.655205536075184e-06, + "loss": 1.05434561, + "memory(GiB)": 302.58, + "step": 173540, + "train_speed(iter/s)": 0.123883 + }, + { + "acc": 0.74495759, + "epoch": 0.9706297265140574, + "grad_norm": 9.125, + "learning_rate": 5.6542887936993186e-06, + "loss": 0.98986902, + "memory(GiB)": 302.58, + "step": 173560, + "train_speed(iter/s)": 0.12389 + }, + { + "acc": 0.74390354, + "epoch": 0.9707415759870367, + "grad_norm": 6.6875, + "learning_rate": 5.6533720289447015e-06, + "loss": 1.00560961, + "memory(GiB)": 302.58, + "step": 173580, + "train_speed(iter/s)": 0.123897 + }, + { + "acc": 0.73183432, + "epoch": 0.970853425460016, + "grad_norm": 7.8125, + "learning_rate": 5.65245524184269e-06, + "loss": 1.06060839, + "memory(GiB)": 302.58, + "step": 173600, + "train_speed(iter/s)": 0.123904 + }, + { + "acc": 0.75538859, + "epoch": 0.9709652749329952, + "grad_norm": 6.25, + "learning_rate": 5.651538432424639e-06, + "loss": 0.95433474, + "memory(GiB)": 302.58, + "step": 173620, + "train_speed(iter/s)": 0.12391 + }, + { + "acc": 0.74427309, + "epoch": 0.9710771244059745, + "grad_norm": 10.8125, + "learning_rate": 5.650621600721909e-06, + "loss": 1.00803289, + "memory(GiB)": 302.58, + "step": 173640, + "train_speed(iter/s)": 0.123917 + }, + { + "acc": 0.7192359, + "epoch": 0.9711889738789538, + "grad_norm": 6.34375, + "learning_rate": 5.649704746765857e-06, + "loss": 1.12002668, + "memory(GiB)": 302.58, + "step": 173660, + "train_speed(iter/s)": 0.123924 + }, + { + "acc": 0.758179, + "epoch": 0.971300823351933, + "grad_norm": 7.6875, + "learning_rate": 5.648787870587842e-06, + "loss": 0.94841366, + "memory(GiB)": 302.58, + "step": 173680, + "train_speed(iter/s)": 0.123931 + }, + { + "acc": 0.74780612, + "epoch": 0.9714126728249123, + "grad_norm": 8.375, + "learning_rate": 5.647870972219224e-06, + "loss": 0.99668608, + "memory(GiB)": 302.58, + "step": 173700, + "train_speed(iter/s)": 0.123937 + }, + { + "acc": 0.72640853, + "epoch": 0.9715245222978915, + "grad_norm": 7.53125, + "learning_rate": 5.646954051691366e-06, + "loss": 1.06892786, + "memory(GiB)": 302.58, + "step": 173720, + "train_speed(iter/s)": 0.123944 + }, + { + "acc": 0.73691139, + "epoch": 0.9716363717708708, + "grad_norm": 8.4375, + "learning_rate": 5.646037109035627e-06, + "loss": 1.03478584, + "memory(GiB)": 302.58, + "step": 173740, + "train_speed(iter/s)": 0.123951 + }, + { + "acc": 0.72814999, + "epoch": 0.9717482212438501, + "grad_norm": 4.0625, + "learning_rate": 5.64512014428337e-06, + "loss": 1.08589802, + "memory(GiB)": 302.58, + "step": 173760, + "train_speed(iter/s)": 0.123958 + }, + { + "acc": 0.75367188, + "epoch": 0.9718600707168293, + "grad_norm": 8.375, + "learning_rate": 5.644203157465959e-06, + "loss": 0.97015648, + "memory(GiB)": 302.58, + "step": 173780, + "train_speed(iter/s)": 0.123965 + }, + { + "acc": 0.74406404, + "epoch": 0.9719719201898086, + "grad_norm": 7.34375, + "learning_rate": 5.643286148614757e-06, + "loss": 1.02046289, + "memory(GiB)": 302.58, + "step": 173800, + "train_speed(iter/s)": 0.12397 + }, + { + "acc": 0.71423469, + "epoch": 0.9720837696627879, + "grad_norm": 5.90625, + "learning_rate": 5.642369117761129e-06, + "loss": 1.1736063, + "memory(GiB)": 302.58, + "step": 173820, + "train_speed(iter/s)": 0.123977 + }, + { + "acc": 0.7444768, + "epoch": 0.9721956191357671, + "grad_norm": 8.375, + "learning_rate": 5.6414520649364405e-06, + "loss": 0.99399672, + "memory(GiB)": 302.58, + "step": 173840, + "train_speed(iter/s)": 0.123984 + }, + { + "acc": 0.74673786, + "epoch": 0.9723074686087464, + "grad_norm": 9.625, + "learning_rate": 5.6405349901720575e-06, + "loss": 0.99190254, + "memory(GiB)": 302.58, + "step": 173860, + "train_speed(iter/s)": 0.12399 + }, + { + "acc": 0.72366347, + "epoch": 0.9724193180817257, + "grad_norm": 6.71875, + "learning_rate": 5.639617893499346e-06, + "loss": 1.09275789, + "memory(GiB)": 302.58, + "step": 173880, + "train_speed(iter/s)": 0.123998 + }, + { + "acc": 0.76257467, + "epoch": 0.9725311675547049, + "grad_norm": 6.5625, + "learning_rate": 5.638700774949676e-06, + "loss": 0.9171134, + "memory(GiB)": 302.58, + "step": 173900, + "train_speed(iter/s)": 0.124005 + }, + { + "acc": 0.74734025, + "epoch": 0.9726430170276842, + "grad_norm": 8.3125, + "learning_rate": 5.637783634554412e-06, + "loss": 1.00703497, + "memory(GiB)": 302.58, + "step": 173920, + "train_speed(iter/s)": 0.124012 + }, + { + "acc": 0.72332621, + "epoch": 0.9727548665006635, + "grad_norm": 9.3125, + "learning_rate": 5.636866472344928e-06, + "loss": 1.13206415, + "memory(GiB)": 302.58, + "step": 173940, + "train_speed(iter/s)": 0.124018 + }, + { + "acc": 0.75648761, + "epoch": 0.9728667159736427, + "grad_norm": 6.03125, + "learning_rate": 5.635949288352588e-06, + "loss": 0.96749382, + "memory(GiB)": 302.58, + "step": 173960, + "train_speed(iter/s)": 0.124025 + }, + { + "acc": 0.74805465, + "epoch": 0.972978565446622, + "grad_norm": 6.71875, + "learning_rate": 5.635032082608768e-06, + "loss": 0.98714638, + "memory(GiB)": 302.58, + "step": 173980, + "train_speed(iter/s)": 0.124031 + }, + { + "acc": 0.73546901, + "epoch": 0.9730904149196012, + "grad_norm": 5.90625, + "learning_rate": 5.634114855144837e-06, + "loss": 1.05560484, + "memory(GiB)": 302.58, + "step": 174000, + "train_speed(iter/s)": 0.124038 + }, + { + "epoch": 0.9730904149196012, + "eval_acc": 0.7062223093789404, + "eval_loss": 1.0131474733352661, + "eval_runtime": 7579.7942, + "eval_samples_per_second": 9.932, + "eval_steps_per_second": 9.932, + "step": 174000 + }, + { + "acc": 0.74375844, + "epoch": 0.9732022643925805, + "grad_norm": 7.78125, + "learning_rate": 5.633197605992166e-06, + "loss": 1.01510239, + "memory(GiB)": 302.58, + "step": 174020, + "train_speed(iter/s)": 0.123366 + }, + { + "acc": 0.74758978, + "epoch": 0.9733141138655598, + "grad_norm": 7.84375, + "learning_rate": 5.63228033518213e-06, + "loss": 0.99746561, + "memory(GiB)": 302.58, + "step": 174040, + "train_speed(iter/s)": 0.123373 + }, + { + "acc": 0.76235099, + "epoch": 0.973425963338539, + "grad_norm": 7.96875, + "learning_rate": 5.631363042746101e-06, + "loss": 0.91284561, + "memory(GiB)": 302.58, + "step": 174060, + "train_speed(iter/s)": 0.12338 + }, + { + "acc": 0.74692883, + "epoch": 0.9735378128115183, + "grad_norm": 8.25, + "learning_rate": 5.630445728715454e-06, + "loss": 0.9901885, + "memory(GiB)": 302.58, + "step": 174080, + "train_speed(iter/s)": 0.123386 + }, + { + "acc": 0.7487884, + "epoch": 0.9736496622844976, + "grad_norm": 6.875, + "learning_rate": 5.629528393121565e-06, + "loss": 0.99346571, + "memory(GiB)": 302.58, + "step": 174100, + "train_speed(iter/s)": 0.123393 + }, + { + "acc": 0.74596124, + "epoch": 0.9737615117574768, + "grad_norm": 6.4375, + "learning_rate": 5.628611035995808e-06, + "loss": 1.00115585, + "memory(GiB)": 302.58, + "step": 174120, + "train_speed(iter/s)": 0.1234 + }, + { + "acc": 0.74675126, + "epoch": 0.9738733612304561, + "grad_norm": 8.125, + "learning_rate": 5.627693657369559e-06, + "loss": 0.98825665, + "memory(GiB)": 302.58, + "step": 174140, + "train_speed(iter/s)": 0.123406 + }, + { + "acc": 0.72341905, + "epoch": 0.9739852107034354, + "grad_norm": 6.25, + "learning_rate": 5.626776257274199e-06, + "loss": 1.10255747, + "memory(GiB)": 302.58, + "step": 174160, + "train_speed(iter/s)": 0.123413 + }, + { + "acc": 0.74132152, + "epoch": 0.9740970601764146, + "grad_norm": 5.0, + "learning_rate": 5.625858835741103e-06, + "loss": 0.99920759, + "memory(GiB)": 302.58, + "step": 174180, + "train_speed(iter/s)": 0.12342 + }, + { + "acc": 0.72201023, + "epoch": 0.9742089096493939, + "grad_norm": 9.5625, + "learning_rate": 5.62494139280165e-06, + "loss": 1.09833126, + "memory(GiB)": 302.58, + "step": 174200, + "train_speed(iter/s)": 0.123427 + }, + { + "acc": 0.74322972, + "epoch": 0.9743207591223731, + "grad_norm": 7.625, + "learning_rate": 5.62402392848722e-06, + "loss": 1.00352936, + "memory(GiB)": 302.58, + "step": 174220, + "train_speed(iter/s)": 0.123434 + }, + { + "acc": 0.73702817, + "epoch": 0.9744326085953524, + "grad_norm": 6.34375, + "learning_rate": 5.623106442829193e-06, + "loss": 1.0335331, + "memory(GiB)": 302.58, + "step": 174240, + "train_speed(iter/s)": 0.12344 + }, + { + "acc": 0.7565671, + "epoch": 0.9745444580683317, + "grad_norm": 10.5625, + "learning_rate": 5.622188935858948e-06, + "loss": 0.96940327, + "memory(GiB)": 302.58, + "step": 174260, + "train_speed(iter/s)": 0.123447 + }, + { + "acc": 0.75385418, + "epoch": 0.9746563075413109, + "grad_norm": 7.34375, + "learning_rate": 5.62127140760787e-06, + "loss": 0.96378279, + "memory(GiB)": 302.58, + "step": 174280, + "train_speed(iter/s)": 0.123454 + }, + { + "acc": 0.73881431, + "epoch": 0.9747681570142902, + "grad_norm": 5.1875, + "learning_rate": 5.62035385810734e-06, + "loss": 1.02149982, + "memory(GiB)": 302.58, + "step": 174300, + "train_speed(iter/s)": 0.12346 + }, + { + "acc": 0.74771004, + "epoch": 0.9748800064872695, + "grad_norm": 6.65625, + "learning_rate": 5.619436287388741e-06, + "loss": 0.98677578, + "memory(GiB)": 302.58, + "step": 174320, + "train_speed(iter/s)": 0.123466 + }, + { + "acc": 0.73535414, + "epoch": 0.9749918559602487, + "grad_norm": 6.75, + "learning_rate": 5.618518695483456e-06, + "loss": 1.03468132, + "memory(GiB)": 302.58, + "step": 174340, + "train_speed(iter/s)": 0.123473 + }, + { + "acc": 0.73556852, + "epoch": 0.975103705433228, + "grad_norm": 6.3125, + "learning_rate": 5.617601082422871e-06, + "loss": 1.04182377, + "memory(GiB)": 302.58, + "step": 174360, + "train_speed(iter/s)": 0.123479 + }, + { + "acc": 0.73447928, + "epoch": 0.9752155549062073, + "grad_norm": 6.125, + "learning_rate": 5.616683448238371e-06, + "loss": 1.01348152, + "memory(GiB)": 302.58, + "step": 174380, + "train_speed(iter/s)": 0.123486 + }, + { + "acc": 0.73372164, + "epoch": 0.9753274043791865, + "grad_norm": 7.3125, + "learning_rate": 5.615765792961339e-06, + "loss": 1.05999746, + "memory(GiB)": 302.58, + "step": 174400, + "train_speed(iter/s)": 0.123493 + }, + { + "acc": 0.73658609, + "epoch": 0.9754392538521658, + "grad_norm": 8.0625, + "learning_rate": 5.614848116623167e-06, + "loss": 1.03875294, + "memory(GiB)": 302.58, + "step": 174420, + "train_speed(iter/s)": 0.123499 + }, + { + "acc": 0.7432106, + "epoch": 0.975551103325145, + "grad_norm": 6.75, + "learning_rate": 5.61393041925524e-06, + "loss": 1.02736874, + "memory(GiB)": 302.58, + "step": 174440, + "train_speed(iter/s)": 0.123506 + }, + { + "acc": 0.75024109, + "epoch": 0.9756629527981243, + "grad_norm": 8.8125, + "learning_rate": 5.613012700888946e-06, + "loss": 0.99304142, + "memory(GiB)": 302.58, + "step": 174460, + "train_speed(iter/s)": 0.123513 + }, + { + "acc": 0.74119678, + "epoch": 0.9757748022711036, + "grad_norm": 6.90625, + "learning_rate": 5.612094961555672e-06, + "loss": 1.01069393, + "memory(GiB)": 302.58, + "step": 174480, + "train_speed(iter/s)": 0.12352 + }, + { + "acc": 0.74338794, + "epoch": 0.9758866517440828, + "grad_norm": 5.09375, + "learning_rate": 5.611177201286811e-06, + "loss": 1.0220273, + "memory(GiB)": 302.58, + "step": 174500, + "train_speed(iter/s)": 0.123527 + }, + { + "acc": 0.74835324, + "epoch": 0.9759985012170621, + "grad_norm": 7.125, + "learning_rate": 5.610259420113751e-06, + "loss": 1.00875635, + "memory(GiB)": 302.58, + "step": 174520, + "train_speed(iter/s)": 0.123533 + }, + { + "acc": 0.72974615, + "epoch": 0.9761103506900414, + "grad_norm": 6.625, + "learning_rate": 5.609341618067883e-06, + "loss": 1.05313129, + "memory(GiB)": 302.58, + "step": 174540, + "train_speed(iter/s)": 0.12354 + }, + { + "acc": 0.74651403, + "epoch": 0.9762222001630206, + "grad_norm": 6.15625, + "learning_rate": 5.6084237951806005e-06, + "loss": 0.96094799, + "memory(GiB)": 302.58, + "step": 174560, + "train_speed(iter/s)": 0.123546 + }, + { + "acc": 0.736903, + "epoch": 0.9763340496359999, + "grad_norm": 8.375, + "learning_rate": 5.607505951483295e-06, + "loss": 1.0634284, + "memory(GiB)": 302.58, + "step": 174580, + "train_speed(iter/s)": 0.123553 + }, + { + "acc": 0.74593587, + "epoch": 0.9764458991089792, + "grad_norm": 6.5625, + "learning_rate": 5.606588087007359e-06, + "loss": 1.00306673, + "memory(GiB)": 302.58, + "step": 174600, + "train_speed(iter/s)": 0.123559 + }, + { + "acc": 0.74680796, + "epoch": 0.9765577485819584, + "grad_norm": 5.90625, + "learning_rate": 5.605670201784188e-06, + "loss": 0.97034245, + "memory(GiB)": 302.58, + "step": 174620, + "train_speed(iter/s)": 0.123565 + }, + { + "acc": 0.73509531, + "epoch": 0.9766695980549377, + "grad_norm": 7.65625, + "learning_rate": 5.604752295845175e-06, + "loss": 1.05062256, + "memory(GiB)": 302.58, + "step": 174640, + "train_speed(iter/s)": 0.123572 + }, + { + "acc": 0.73421159, + "epoch": 0.976781447527917, + "grad_norm": 9.875, + "learning_rate": 5.603834369221714e-06, + "loss": 1.04771185, + "memory(GiB)": 302.58, + "step": 174660, + "train_speed(iter/s)": 0.123579 + }, + { + "acc": 0.75331297, + "epoch": 0.9768932970008962, + "grad_norm": 7.96875, + "learning_rate": 5.602916421945206e-06, + "loss": 0.97240734, + "memory(GiB)": 302.58, + "step": 174680, + "train_speed(iter/s)": 0.123585 + }, + { + "acc": 0.7493566, + "epoch": 0.9770051464738755, + "grad_norm": 8.4375, + "learning_rate": 5.601998454047042e-06, + "loss": 0.97004757, + "memory(GiB)": 302.58, + "step": 174700, + "train_speed(iter/s)": 0.123592 + }, + { + "acc": 0.75965929, + "epoch": 0.9771169959468547, + "grad_norm": 5.96875, + "learning_rate": 5.6010804655586235e-06, + "loss": 0.94373541, + "memory(GiB)": 302.58, + "step": 174720, + "train_speed(iter/s)": 0.123599 + }, + { + "acc": 0.75088158, + "epoch": 0.977228845419834, + "grad_norm": 8.25, + "learning_rate": 5.6001624565113465e-06, + "loss": 0.99437933, + "memory(GiB)": 302.58, + "step": 174740, + "train_speed(iter/s)": 0.123606 + }, + { + "acc": 0.74099836, + "epoch": 0.9773406948928133, + "grad_norm": 5.78125, + "learning_rate": 5.599244426936611e-06, + "loss": 1.02440653, + "memory(GiB)": 302.58, + "step": 174760, + "train_speed(iter/s)": 0.123612 + }, + { + "acc": 0.73645339, + "epoch": 0.9774525443657925, + "grad_norm": 6.46875, + "learning_rate": 5.598326376865816e-06, + "loss": 1.0290122, + "memory(GiB)": 302.58, + "step": 174780, + "train_speed(iter/s)": 0.123618 + }, + { + "acc": 0.73053865, + "epoch": 0.9775643938387718, + "grad_norm": 7.625, + "learning_rate": 5.597408306330362e-06, + "loss": 1.0536788, + "memory(GiB)": 302.58, + "step": 174800, + "train_speed(iter/s)": 0.123625 + }, + { + "acc": 0.75112448, + "epoch": 0.9776762433117511, + "grad_norm": 7.84375, + "learning_rate": 5.596490215361649e-06, + "loss": 0.98581343, + "memory(GiB)": 302.58, + "step": 174820, + "train_speed(iter/s)": 0.123632 + }, + { + "acc": 0.74269633, + "epoch": 0.9777880927847303, + "grad_norm": 9.4375, + "learning_rate": 5.595572103991078e-06, + "loss": 1.02540998, + "memory(GiB)": 302.58, + "step": 174840, + "train_speed(iter/s)": 0.123639 + }, + { + "acc": 0.74557319, + "epoch": 0.9778999422577096, + "grad_norm": 6.59375, + "learning_rate": 5.594653972250054e-06, + "loss": 1.01333942, + "memory(GiB)": 302.58, + "step": 174860, + "train_speed(iter/s)": 0.123646 + }, + { + "acc": 0.73257117, + "epoch": 0.9780117917306889, + "grad_norm": 8.75, + "learning_rate": 5.593735820169979e-06, + "loss": 1.05924511, + "memory(GiB)": 302.58, + "step": 174880, + "train_speed(iter/s)": 0.123653 + }, + { + "acc": 0.72900643, + "epoch": 0.9781236412036681, + "grad_norm": 7.53125, + "learning_rate": 5.592817647782255e-06, + "loss": 1.06632805, + "memory(GiB)": 302.58, + "step": 174900, + "train_speed(iter/s)": 0.12366 + }, + { + "acc": 0.76443434, + "epoch": 0.9782354906766474, + "grad_norm": 6.78125, + "learning_rate": 5.591899455118288e-06, + "loss": 0.89891882, + "memory(GiB)": 302.58, + "step": 174920, + "train_speed(iter/s)": 0.123666 + }, + { + "acc": 0.75396929, + "epoch": 0.9783473401496267, + "grad_norm": 6.125, + "learning_rate": 5.590981242209483e-06, + "loss": 0.95127068, + "memory(GiB)": 302.58, + "step": 174940, + "train_speed(iter/s)": 0.123673 + }, + { + "acc": 0.7421042, + "epoch": 0.9784591896226059, + "grad_norm": 8.75, + "learning_rate": 5.590063009087246e-06, + "loss": 1.01214638, + "memory(GiB)": 302.58, + "step": 174960, + "train_speed(iter/s)": 0.12368 + }, + { + "acc": 0.730936, + "epoch": 0.9785710390955852, + "grad_norm": 5.1875, + "learning_rate": 5.589144755782981e-06, + "loss": 1.07603836, + "memory(GiB)": 302.58, + "step": 174980, + "train_speed(iter/s)": 0.123686 + }, + { + "acc": 0.73531251, + "epoch": 0.9786828885685644, + "grad_norm": 4.625, + "learning_rate": 5.588226482328099e-06, + "loss": 1.03322573, + "memory(GiB)": 302.58, + "step": 175000, + "train_speed(iter/s)": 0.123693 + }, + { + "acc": 0.74396849, + "epoch": 0.9787947380415437, + "grad_norm": 5.65625, + "learning_rate": 5.587308188754007e-06, + "loss": 1.00358391, + "memory(GiB)": 302.58, + "step": 175020, + "train_speed(iter/s)": 0.123699 + }, + { + "acc": 0.7444521, + "epoch": 0.978906587514523, + "grad_norm": 7.5625, + "learning_rate": 5.586389875092112e-06, + "loss": 0.99274826, + "memory(GiB)": 302.58, + "step": 175040, + "train_speed(iter/s)": 0.123706 + }, + { + "acc": 0.72786055, + "epoch": 0.9790184369875022, + "grad_norm": 7.0, + "learning_rate": 5.585471541373824e-06, + "loss": 1.08421965, + "memory(GiB)": 302.58, + "step": 175060, + "train_speed(iter/s)": 0.123712 + }, + { + "acc": 0.74630461, + "epoch": 0.9791302864604815, + "grad_norm": 6.96875, + "learning_rate": 5.584553187630551e-06, + "loss": 0.99240093, + "memory(GiB)": 302.58, + "step": 175080, + "train_speed(iter/s)": 0.123719 + }, + { + "acc": 0.73281035, + "epoch": 0.9792421359334608, + "grad_norm": 8.25, + "learning_rate": 5.5836348138937055e-06, + "loss": 1.06855774, + "memory(GiB)": 302.58, + "step": 175100, + "train_speed(iter/s)": 0.123725 + }, + { + "acc": 0.74319386, + "epoch": 0.97935398540644, + "grad_norm": 4.4375, + "learning_rate": 5.5827164201947005e-06, + "loss": 1.02495947, + "memory(GiB)": 302.58, + "step": 175120, + "train_speed(iter/s)": 0.123732 + }, + { + "acc": 0.74081783, + "epoch": 0.9794658348794193, + "grad_norm": 9.0625, + "learning_rate": 5.581798006564946e-06, + "loss": 1.02052507, + "memory(GiB)": 302.58, + "step": 175140, + "train_speed(iter/s)": 0.123739 + }, + { + "acc": 0.72542324, + "epoch": 0.9795776843523986, + "grad_norm": 9.0625, + "learning_rate": 5.580879573035855e-06, + "loss": 1.0750246, + "memory(GiB)": 302.58, + "step": 175160, + "train_speed(iter/s)": 0.123746 + }, + { + "acc": 0.73277392, + "epoch": 0.9796895338253778, + "grad_norm": 7.53125, + "learning_rate": 5.579961119638839e-06, + "loss": 1.06300392, + "memory(GiB)": 302.58, + "step": 175180, + "train_speed(iter/s)": 0.123752 + }, + { + "acc": 0.73963752, + "epoch": 0.9798013832983571, + "grad_norm": 4.375, + "learning_rate": 5.579042646405316e-06, + "loss": 1.05516939, + "memory(GiB)": 302.58, + "step": 175200, + "train_speed(iter/s)": 0.123758 + }, + { + "acc": 0.73868575, + "epoch": 0.9799132327713364, + "grad_norm": 8.0625, + "learning_rate": 5.578124153366697e-06, + "loss": 1.03573103, + "memory(GiB)": 302.58, + "step": 175220, + "train_speed(iter/s)": 0.123765 + }, + { + "acc": 0.75675397, + "epoch": 0.9800250822443156, + "grad_norm": 7.1875, + "learning_rate": 5.577205640554398e-06, + "loss": 0.92702923, + "memory(GiB)": 302.58, + "step": 175240, + "train_speed(iter/s)": 0.123772 + }, + { + "acc": 0.74224215, + "epoch": 0.9801369317172949, + "grad_norm": 7.5625, + "learning_rate": 5.576287107999837e-06, + "loss": 1.016541, + "memory(GiB)": 302.58, + "step": 175260, + "train_speed(iter/s)": 0.123778 + }, + { + "acc": 0.73837051, + "epoch": 0.9802487811902741, + "grad_norm": 4.96875, + "learning_rate": 5.575368555734428e-06, + "loss": 1.02768078, + "memory(GiB)": 302.58, + "step": 175280, + "train_speed(iter/s)": 0.123784 + }, + { + "acc": 0.75117559, + "epoch": 0.9803606306632534, + "grad_norm": 6.9375, + "learning_rate": 5.5744499837895915e-06, + "loss": 0.9941781, + "memory(GiB)": 302.58, + "step": 175300, + "train_speed(iter/s)": 0.123792 + }, + { + "acc": 0.75087962, + "epoch": 0.9804724801362327, + "grad_norm": 8.5625, + "learning_rate": 5.573531392196744e-06, + "loss": 0.97234621, + "memory(GiB)": 302.58, + "step": 175320, + "train_speed(iter/s)": 0.123798 + }, + { + "acc": 0.73516221, + "epoch": 0.9805843296092119, + "grad_norm": 7.0, + "learning_rate": 5.572612780987304e-06, + "loss": 1.04793196, + "memory(GiB)": 302.58, + "step": 175340, + "train_speed(iter/s)": 0.123804 + }, + { + "acc": 0.7413547, + "epoch": 0.9806961790821912, + "grad_norm": 6.21875, + "learning_rate": 5.571694150192692e-06, + "loss": 1.00785322, + "memory(GiB)": 302.58, + "step": 175360, + "train_speed(iter/s)": 0.123811 + }, + { + "acc": 0.74622908, + "epoch": 0.9808080285551705, + "grad_norm": 7.625, + "learning_rate": 5.570775499844326e-06, + "loss": 0.98696489, + "memory(GiB)": 302.58, + "step": 175380, + "train_speed(iter/s)": 0.123818 + }, + { + "acc": 0.74160352, + "epoch": 0.9809198780281497, + "grad_norm": 7.78125, + "learning_rate": 5.569856829973629e-06, + "loss": 1.032973, + "memory(GiB)": 302.58, + "step": 175400, + "train_speed(iter/s)": 0.123825 + }, + { + "acc": 0.75905142, + "epoch": 0.981031727501129, + "grad_norm": 10.6875, + "learning_rate": 5.56893814061202e-06, + "loss": 0.91903086, + "memory(GiB)": 302.58, + "step": 175420, + "train_speed(iter/s)": 0.123832 + }, + { + "acc": 0.73693652, + "epoch": 0.9811435769741083, + "grad_norm": 7.0, + "learning_rate": 5.568019431790924e-06, + "loss": 1.03252935, + "memory(GiB)": 302.58, + "step": 175440, + "train_speed(iter/s)": 0.123838 + }, + { + "acc": 0.73672342, + "epoch": 0.9812554264470875, + "grad_norm": 6.4375, + "learning_rate": 5.567100703541762e-06, + "loss": 1.03911266, + "memory(GiB)": 302.58, + "step": 175460, + "train_speed(iter/s)": 0.123845 + }, + { + "acc": 0.7438714, + "epoch": 0.9813672759200668, + "grad_norm": 9.75, + "learning_rate": 5.566181955895958e-06, + "loss": 1.00077133, + "memory(GiB)": 302.58, + "step": 175480, + "train_speed(iter/s)": 0.123852 + }, + { + "acc": 0.75472913, + "epoch": 0.981479125393046, + "grad_norm": 9.1875, + "learning_rate": 5.565263188884935e-06, + "loss": 0.96865368, + "memory(GiB)": 302.58, + "step": 175500, + "train_speed(iter/s)": 0.123858 + }, + { + "acc": 0.7344697, + "epoch": 0.9815909748660253, + "grad_norm": 8.375, + "learning_rate": 5.564344402540119e-06, + "loss": 1.04907761, + "memory(GiB)": 302.58, + "step": 175520, + "train_speed(iter/s)": 0.123864 + }, + { + "acc": 0.74304733, + "epoch": 0.9817028243390046, + "grad_norm": 9.5625, + "learning_rate": 5.563425596892934e-06, + "loss": 1.01535892, + "memory(GiB)": 302.58, + "step": 175540, + "train_speed(iter/s)": 0.123871 + }, + { + "acc": 0.73574877, + "epoch": 0.9818146738119838, + "grad_norm": 7.1875, + "learning_rate": 5.5625067719748085e-06, + "loss": 1.04404812, + "memory(GiB)": 302.58, + "step": 175560, + "train_speed(iter/s)": 0.123878 + }, + { + "acc": 0.73405628, + "epoch": 0.9819265232849631, + "grad_norm": 9.5, + "learning_rate": 5.561587927817167e-06, + "loss": 1.07087622, + "memory(GiB)": 302.58, + "step": 175580, + "train_speed(iter/s)": 0.123884 + }, + { + "acc": 0.74131722, + "epoch": 0.9820383727579424, + "grad_norm": 6.53125, + "learning_rate": 5.560669064451437e-06, + "loss": 1.02738867, + "memory(GiB)": 302.58, + "step": 175600, + "train_speed(iter/s)": 0.12389 + }, + { + "acc": 0.75830798, + "epoch": 0.9821502222309216, + "grad_norm": 9.9375, + "learning_rate": 5.559750181909049e-06, + "loss": 0.94861517, + "memory(GiB)": 302.58, + "step": 175620, + "train_speed(iter/s)": 0.123897 + }, + { + "acc": 0.73505406, + "epoch": 0.9822620717039009, + "grad_norm": 9.0625, + "learning_rate": 5.558831280221428e-06, + "loss": 1.05508537, + "memory(GiB)": 302.58, + "step": 175640, + "train_speed(iter/s)": 0.123903 + }, + { + "acc": 0.73908195, + "epoch": 0.9823739211768802, + "grad_norm": 11.625, + "learning_rate": 5.557912359420007e-06, + "loss": 1.03625336, + "memory(GiB)": 302.58, + "step": 175660, + "train_speed(iter/s)": 0.12391 + }, + { + "acc": 0.75195637, + "epoch": 0.9824857706498594, + "grad_norm": 6.6875, + "learning_rate": 5.556993419536212e-06, + "loss": 0.9648737, + "memory(GiB)": 302.58, + "step": 175680, + "train_speed(iter/s)": 0.123916 + }, + { + "acc": 0.75179024, + "epoch": 0.9825976201228387, + "grad_norm": 12.0, + "learning_rate": 5.556074460601477e-06, + "loss": 0.98093891, + "memory(GiB)": 302.58, + "step": 175700, + "train_speed(iter/s)": 0.123923 + }, + { + "acc": 0.72619443, + "epoch": 0.982709469595818, + "grad_norm": 5.03125, + "learning_rate": 5.555155482647232e-06, + "loss": 1.10293179, + "memory(GiB)": 302.58, + "step": 175720, + "train_speed(iter/s)": 0.123929 + }, + { + "acc": 0.75108213, + "epoch": 0.9828213190687972, + "grad_norm": 6.5, + "learning_rate": 5.5542364857049095e-06, + "loss": 0.97763233, + "memory(GiB)": 302.58, + "step": 175740, + "train_speed(iter/s)": 0.123936 + }, + { + "acc": 0.75541978, + "epoch": 0.9829331685417765, + "grad_norm": 8.6875, + "learning_rate": 5.553317469805942e-06, + "loss": 0.96589127, + "memory(GiB)": 302.58, + "step": 175760, + "train_speed(iter/s)": 0.123943 + }, + { + "acc": 0.74091644, + "epoch": 0.9830450180147557, + "grad_norm": 7.625, + "learning_rate": 5.552398434981762e-06, + "loss": 1.02150965, + "memory(GiB)": 302.58, + "step": 175780, + "train_speed(iter/s)": 0.123949 + }, + { + "acc": 0.74711146, + "epoch": 0.983156867487735, + "grad_norm": 8.4375, + "learning_rate": 5.551479381263805e-06, + "loss": 0.9916523, + "memory(GiB)": 302.58, + "step": 175800, + "train_speed(iter/s)": 0.123956 + }, + { + "acc": 0.71646981, + "epoch": 0.9832687169607143, + "grad_norm": 9.6875, + "learning_rate": 5.550560308683502e-06, + "loss": 1.13577824, + "memory(GiB)": 302.58, + "step": 175820, + "train_speed(iter/s)": 0.123962 + }, + { + "acc": 0.73642111, + "epoch": 0.9833805664336935, + "grad_norm": 7.5, + "learning_rate": 5.549641217272292e-06, + "loss": 1.03769655, + "memory(GiB)": 302.58, + "step": 175840, + "train_speed(iter/s)": 0.123969 + }, + { + "acc": 0.7475266, + "epoch": 0.9834924159066728, + "grad_norm": 7.6875, + "learning_rate": 5.5487221070616095e-06, + "loss": 0.99921932, + "memory(GiB)": 302.58, + "step": 175860, + "train_speed(iter/s)": 0.123975 + }, + { + "acc": 0.75701723, + "epoch": 0.9836042653796521, + "grad_norm": 5.90625, + "learning_rate": 5.5478029780828916e-06, + "loss": 0.95456238, + "memory(GiB)": 302.58, + "step": 175880, + "train_speed(iter/s)": 0.123982 + }, + { + "acc": 0.73714991, + "epoch": 0.9837161148526313, + "grad_norm": 6.75, + "learning_rate": 5.546883830367574e-06, + "loss": 1.02839317, + "memory(GiB)": 302.58, + "step": 175900, + "train_speed(iter/s)": 0.123989 + }, + { + "acc": 0.73594069, + "epoch": 0.9838279643256106, + "grad_norm": 6.53125, + "learning_rate": 5.545964663947096e-06, + "loss": 1.03605213, + "memory(GiB)": 302.58, + "step": 175920, + "train_speed(iter/s)": 0.123995 + }, + { + "acc": 0.74526682, + "epoch": 0.9839398137985899, + "grad_norm": 6.84375, + "learning_rate": 5.545045478852894e-06, + "loss": 0.99962826, + "memory(GiB)": 302.58, + "step": 175940, + "train_speed(iter/s)": 0.124002 + }, + { + "acc": 0.73426285, + "epoch": 0.9840516632715691, + "grad_norm": 7.84375, + "learning_rate": 5.544126275116409e-06, + "loss": 1.0473793, + "memory(GiB)": 302.58, + "step": 175960, + "train_speed(iter/s)": 0.124008 + }, + { + "acc": 0.75011339, + "epoch": 0.9841635127445484, + "grad_norm": 4.78125, + "learning_rate": 5.54320705276908e-06, + "loss": 1.00174017, + "memory(GiB)": 302.58, + "step": 175980, + "train_speed(iter/s)": 0.124015 + }, + { + "acc": 0.7349196, + "epoch": 0.9842753622175276, + "grad_norm": 6.5625, + "learning_rate": 5.542287811842347e-06, + "loss": 1.04506378, + "memory(GiB)": 302.58, + "step": 176000, + "train_speed(iter/s)": 0.124021 + }, + { + "epoch": 0.9842753622175276, + "eval_acc": 0.7063220367775717, + "eval_loss": 1.0130083560943604, + "eval_runtime": 7557.0883, + "eval_samples_per_second": 9.962, + "eval_steps_per_second": 9.962, + "step": 176000 + }, + { + "acc": 0.75773973, + "epoch": 0.9843872116905069, + "grad_norm": 6.25, + "learning_rate": 5.541368552367653e-06, + "loss": 0.95659714, + "memory(GiB)": 302.58, + "step": 176020, + "train_speed(iter/s)": 0.123359 + }, + { + "acc": 0.75126343, + "epoch": 0.9844990611634862, + "grad_norm": 4.75, + "learning_rate": 5.540449274376436e-06, + "loss": 0.97611065, + "memory(GiB)": 302.58, + "step": 176040, + "train_speed(iter/s)": 0.123366 + }, + { + "acc": 0.73775854, + "epoch": 0.9846109106364654, + "grad_norm": 4.75, + "learning_rate": 5.539529977900142e-06, + "loss": 1.02235994, + "memory(GiB)": 302.58, + "step": 176060, + "train_speed(iter/s)": 0.123372 + }, + { + "acc": 0.74663281, + "epoch": 0.9847227601094447, + "grad_norm": 8.625, + "learning_rate": 5.538610662970211e-06, + "loss": 0.99012079, + "memory(GiB)": 302.58, + "step": 176080, + "train_speed(iter/s)": 0.123378 + }, + { + "acc": 0.73687849, + "epoch": 0.984834609582424, + "grad_norm": 6.15625, + "learning_rate": 5.5376913296180864e-06, + "loss": 1.04332771, + "memory(GiB)": 302.58, + "step": 176100, + "train_speed(iter/s)": 0.123385 + }, + { + "acc": 0.7330092, + "epoch": 0.9849464590554032, + "grad_norm": 5.0, + "learning_rate": 5.536771977875214e-06, + "loss": 1.05899181, + "memory(GiB)": 302.58, + "step": 176120, + "train_speed(iter/s)": 0.123391 + }, + { + "acc": 0.73245893, + "epoch": 0.9850583085283825, + "grad_norm": 7.40625, + "learning_rate": 5.535852607773038e-06, + "loss": 1.05713072, + "memory(GiB)": 302.58, + "step": 176140, + "train_speed(iter/s)": 0.123398 + }, + { + "acc": 0.74642477, + "epoch": 0.9851701580013618, + "grad_norm": 5.75, + "learning_rate": 5.534933219343004e-06, + "loss": 0.99389563, + "memory(GiB)": 302.58, + "step": 176160, + "train_speed(iter/s)": 0.123405 + }, + { + "acc": 0.75011005, + "epoch": 0.985282007474341, + "grad_norm": 7.9375, + "learning_rate": 5.534013812616557e-06, + "loss": 0.98087454, + "memory(GiB)": 302.58, + "step": 176180, + "train_speed(iter/s)": 0.123411 + }, + { + "acc": 0.74466553, + "epoch": 0.9853938569473203, + "grad_norm": 8.125, + "learning_rate": 5.5330943876251456e-06, + "loss": 0.99088783, + "memory(GiB)": 302.58, + "step": 176200, + "train_speed(iter/s)": 0.123418 + }, + { + "acc": 0.75266323, + "epoch": 0.9855057064202996, + "grad_norm": 6.6875, + "learning_rate": 5.5321749444002145e-06, + "loss": 0.97008295, + "memory(GiB)": 302.58, + "step": 176220, + "train_speed(iter/s)": 0.123425 + }, + { + "acc": 0.74424076, + "epoch": 0.9856175558932788, + "grad_norm": 5.0, + "learning_rate": 5.531255482973214e-06, + "loss": 0.98715458, + "memory(GiB)": 302.58, + "step": 176240, + "train_speed(iter/s)": 0.123432 + }, + { + "acc": 0.75619855, + "epoch": 0.9857294053662581, + "grad_norm": 5.71875, + "learning_rate": 5.530336003375589e-06, + "loss": 0.94132614, + "memory(GiB)": 302.58, + "step": 176260, + "train_speed(iter/s)": 0.123439 + }, + { + "acc": 0.72968855, + "epoch": 0.9858412548392373, + "grad_norm": 7.03125, + "learning_rate": 5.5294165056387924e-06, + "loss": 1.05981245, + "memory(GiB)": 302.58, + "step": 176280, + "train_speed(iter/s)": 0.123445 + }, + { + "acc": 0.72664595, + "epoch": 0.9859531043122166, + "grad_norm": 7.4375, + "learning_rate": 5.528496989794274e-06, + "loss": 1.06043129, + "memory(GiB)": 302.58, + "step": 176300, + "train_speed(iter/s)": 0.123452 + }, + { + "acc": 0.74460702, + "epoch": 0.9860649537851959, + "grad_norm": 8.125, + "learning_rate": 5.527577455873482e-06, + "loss": 0.99555197, + "memory(GiB)": 302.58, + "step": 176320, + "train_speed(iter/s)": 0.123459 + }, + { + "acc": 0.7606297, + "epoch": 0.9861768032581751, + "grad_norm": 6.46875, + "learning_rate": 5.526657903907867e-06, + "loss": 0.9388319, + "memory(GiB)": 302.58, + "step": 176340, + "train_speed(iter/s)": 0.123466 + }, + { + "acc": 0.73289604, + "epoch": 0.9862886527311544, + "grad_norm": 8.6875, + "learning_rate": 5.525738333928883e-06, + "loss": 1.04472904, + "memory(GiB)": 302.58, + "step": 176360, + "train_speed(iter/s)": 0.123473 + }, + { + "acc": 0.74571595, + "epoch": 0.9864005022041337, + "grad_norm": 8.4375, + "learning_rate": 5.5248187459679805e-06, + "loss": 0.99618101, + "memory(GiB)": 302.58, + "step": 176380, + "train_speed(iter/s)": 0.123479 + }, + { + "acc": 0.75060596, + "epoch": 0.9865123516771129, + "grad_norm": 7.6875, + "learning_rate": 5.523899140056611e-06, + "loss": 0.98132973, + "memory(GiB)": 302.58, + "step": 176400, + "train_speed(iter/s)": 0.123486 + }, + { + "acc": 0.77133131, + "epoch": 0.9866242011500922, + "grad_norm": 7.96875, + "learning_rate": 5.522979516226231e-06, + "loss": 0.89588079, + "memory(GiB)": 302.58, + "step": 176420, + "train_speed(iter/s)": 0.123492 + }, + { + "acc": 0.75826902, + "epoch": 0.9867360506230715, + "grad_norm": 7.4375, + "learning_rate": 5.522059874508294e-06, + "loss": 0.93264732, + "memory(GiB)": 302.58, + "step": 176440, + "train_speed(iter/s)": 0.123499 + }, + { + "acc": 0.73548555, + "epoch": 0.9868479000960507, + "grad_norm": 9.6875, + "learning_rate": 5.521140214934253e-06, + "loss": 1.03707705, + "memory(GiB)": 302.58, + "step": 176460, + "train_speed(iter/s)": 0.123506 + }, + { + "acc": 0.74097352, + "epoch": 0.98695974956903, + "grad_norm": 7.46875, + "learning_rate": 5.520220537535566e-06, + "loss": 1.01770582, + "memory(GiB)": 302.58, + "step": 176480, + "train_speed(iter/s)": 0.123512 + }, + { + "acc": 0.73962793, + "epoch": 0.9870715990420093, + "grad_norm": 6.0, + "learning_rate": 5.519300842343685e-06, + "loss": 1.0095521, + "memory(GiB)": 302.58, + "step": 176500, + "train_speed(iter/s)": 0.123519 + }, + { + "acc": 0.7397552, + "epoch": 0.9871834485149885, + "grad_norm": 6.75, + "learning_rate": 5.518381129390069e-06, + "loss": 1.03451891, + "memory(GiB)": 302.58, + "step": 176520, + "train_speed(iter/s)": 0.123525 + }, + { + "acc": 0.72647662, + "epoch": 0.9872952979879678, + "grad_norm": 8.4375, + "learning_rate": 5.517461398706173e-06, + "loss": 1.09670658, + "memory(GiB)": 302.58, + "step": 176540, + "train_speed(iter/s)": 0.123532 + }, + { + "acc": 0.75348454, + "epoch": 0.987407147460947, + "grad_norm": 5.0, + "learning_rate": 5.516541650323457e-06, + "loss": 0.94634695, + "memory(GiB)": 302.58, + "step": 176560, + "train_speed(iter/s)": 0.123538 + }, + { + "acc": 0.75030808, + "epoch": 0.9875189969339263, + "grad_norm": 6.8125, + "learning_rate": 5.51562188427338e-06, + "loss": 0.97223597, + "memory(GiB)": 302.58, + "step": 176580, + "train_speed(iter/s)": 0.123544 + }, + { + "acc": 0.76121922, + "epoch": 0.9876308464069056, + "grad_norm": 8.875, + "learning_rate": 5.5147021005874e-06, + "loss": 0.9407094, + "memory(GiB)": 302.58, + "step": 176600, + "train_speed(iter/s)": 0.123552 + }, + { + "acc": 0.72705393, + "epoch": 0.9877426958798848, + "grad_norm": 7.375, + "learning_rate": 5.513782299296975e-06, + "loss": 1.08443327, + "memory(GiB)": 302.58, + "step": 176620, + "train_speed(iter/s)": 0.123558 + }, + { + "acc": 0.72927055, + "epoch": 0.9878545453528641, + "grad_norm": 8.25, + "learning_rate": 5.512862480433567e-06, + "loss": 1.0795681, + "memory(GiB)": 302.58, + "step": 176640, + "train_speed(iter/s)": 0.123564 + }, + { + "acc": 0.73965068, + "epoch": 0.9879663948258434, + "grad_norm": 8.25, + "learning_rate": 5.511942644028635e-06, + "loss": 1.02613125, + "memory(GiB)": 302.58, + "step": 176660, + "train_speed(iter/s)": 0.123571 + }, + { + "acc": 0.73751936, + "epoch": 0.9880782442988226, + "grad_norm": 4.90625, + "learning_rate": 5.511022790113642e-06, + "loss": 1.03901482, + "memory(GiB)": 302.58, + "step": 176680, + "train_speed(iter/s)": 0.123577 + }, + { + "acc": 0.73021488, + "epoch": 0.9881900937718019, + "grad_norm": 6.34375, + "learning_rate": 5.510102918720048e-06, + "loss": 1.07618237, + "memory(GiB)": 302.58, + "step": 176700, + "train_speed(iter/s)": 0.123584 + }, + { + "acc": 0.76004629, + "epoch": 0.9883019432447812, + "grad_norm": 7.78125, + "learning_rate": 5.509183029879318e-06, + "loss": 0.92933559, + "memory(GiB)": 302.58, + "step": 176720, + "train_speed(iter/s)": 0.123591 + }, + { + "acc": 0.75333638, + "epoch": 0.9884137927177604, + "grad_norm": 8.75, + "learning_rate": 5.5082631236229135e-06, + "loss": 0.98741503, + "memory(GiB)": 302.58, + "step": 176740, + "train_speed(iter/s)": 0.123598 + }, + { + "acc": 0.76981311, + "epoch": 0.9885256421907397, + "grad_norm": 5.34375, + "learning_rate": 5.507343199982298e-06, + "loss": 0.88898411, + "memory(GiB)": 302.58, + "step": 176760, + "train_speed(iter/s)": 0.123605 + }, + { + "acc": 0.71716199, + "epoch": 0.988637491663719, + "grad_norm": 5.59375, + "learning_rate": 5.506423258988937e-06, + "loss": 1.09769764, + "memory(GiB)": 302.58, + "step": 176780, + "train_speed(iter/s)": 0.123612 + }, + { + "acc": 0.75392094, + "epoch": 0.9887493411366982, + "grad_norm": 7.15625, + "learning_rate": 5.505503300674294e-06, + "loss": 0.96273603, + "memory(GiB)": 302.58, + "step": 176800, + "train_speed(iter/s)": 0.123618 + }, + { + "acc": 0.75565648, + "epoch": 0.9888611906096775, + "grad_norm": 7.46875, + "learning_rate": 5.504583325069836e-06, + "loss": 0.95049715, + "memory(GiB)": 302.58, + "step": 176820, + "train_speed(iter/s)": 0.123625 + }, + { + "acc": 0.75493965, + "epoch": 0.9889730400826567, + "grad_norm": 7.25, + "learning_rate": 5.5036633322070275e-06, + "loss": 0.96652632, + "memory(GiB)": 302.58, + "step": 176840, + "train_speed(iter/s)": 0.123631 + }, + { + "acc": 0.74096665, + "epoch": 0.989084889555636, + "grad_norm": 7.34375, + "learning_rate": 5.502743322117336e-06, + "loss": 1.0464653, + "memory(GiB)": 302.58, + "step": 176860, + "train_speed(iter/s)": 0.123637 + }, + { + "acc": 0.72513762, + "epoch": 0.9891967390286153, + "grad_norm": 6.625, + "learning_rate": 5.501823294832229e-06, + "loss": 1.09782591, + "memory(GiB)": 302.58, + "step": 176880, + "train_speed(iter/s)": 0.123643 + }, + { + "acc": 0.72674761, + "epoch": 0.9893085885015945, + "grad_norm": 9.0625, + "learning_rate": 5.500903250383175e-06, + "loss": 1.08971233, + "memory(GiB)": 302.58, + "step": 176900, + "train_speed(iter/s)": 0.123649 + }, + { + "acc": 0.74239302, + "epoch": 0.9894204379745738, + "grad_norm": 7.15625, + "learning_rate": 5.499983188801641e-06, + "loss": 1.0141202, + "memory(GiB)": 302.58, + "step": 176920, + "train_speed(iter/s)": 0.123656 + }, + { + "acc": 0.75025892, + "epoch": 0.9895322874475531, + "grad_norm": 10.1875, + "learning_rate": 5.499063110119096e-06, + "loss": 0.98503389, + "memory(GiB)": 302.58, + "step": 176940, + "train_speed(iter/s)": 0.123662 + }, + { + "acc": 0.73988295, + "epoch": 0.9896441369205323, + "grad_norm": 7.03125, + "learning_rate": 5.49814301436701e-06, + "loss": 1.02517614, + "memory(GiB)": 302.58, + "step": 176960, + "train_speed(iter/s)": 0.123669 + }, + { + "acc": 0.77263575, + "epoch": 0.9897559863935116, + "grad_norm": 8.9375, + "learning_rate": 5.497222901576853e-06, + "loss": 0.89829998, + "memory(GiB)": 302.58, + "step": 176980, + "train_speed(iter/s)": 0.123675 + }, + { + "acc": 0.73638906, + "epoch": 0.9898678358664909, + "grad_norm": 6.75, + "learning_rate": 5.496302771780096e-06, + "loss": 1.04770069, + "memory(GiB)": 302.58, + "step": 177000, + "train_speed(iter/s)": 0.123682 + }, + { + "acc": 0.72910976, + "epoch": 0.9899796853394701, + "grad_norm": 6.5, + "learning_rate": 5.495382625008212e-06, + "loss": 1.04801035, + "memory(GiB)": 302.58, + "step": 177020, + "train_speed(iter/s)": 0.123689 + }, + { + "acc": 0.73867197, + "epoch": 0.9900915348124494, + "grad_norm": 6.0625, + "learning_rate": 5.494462461292671e-06, + "loss": 1.0248333, + "memory(GiB)": 302.58, + "step": 177040, + "train_speed(iter/s)": 0.123695 + }, + { + "acc": 0.75125165, + "epoch": 0.9902033842854286, + "grad_norm": 9.375, + "learning_rate": 5.493542280664946e-06, + "loss": 0.99126654, + "memory(GiB)": 302.58, + "step": 177060, + "train_speed(iter/s)": 0.123702 + }, + { + "acc": 0.73835878, + "epoch": 0.9903152337584079, + "grad_norm": 5.9375, + "learning_rate": 5.492622083156509e-06, + "loss": 1.03331909, + "memory(GiB)": 302.58, + "step": 177080, + "train_speed(iter/s)": 0.123708 + }, + { + "acc": 0.73981705, + "epoch": 0.9904270832313872, + "grad_norm": 10.5625, + "learning_rate": 5.491701868798836e-06, + "loss": 1.02978649, + "memory(GiB)": 302.58, + "step": 177100, + "train_speed(iter/s)": 0.123715 + }, + { + "acc": 0.7356082, + "epoch": 0.9905389327043664, + "grad_norm": 6.71875, + "learning_rate": 5.490781637623398e-06, + "loss": 1.0419467, + "memory(GiB)": 302.58, + "step": 177120, + "train_speed(iter/s)": 0.123721 + }, + { + "acc": 0.75600834, + "epoch": 0.9906507821773457, + "grad_norm": 5.25, + "learning_rate": 5.4898613896616735e-06, + "loss": 0.98266859, + "memory(GiB)": 302.58, + "step": 177140, + "train_speed(iter/s)": 0.123728 + }, + { + "acc": 0.74975457, + "epoch": 0.990762631650325, + "grad_norm": 5.1875, + "learning_rate": 5.488941124945135e-06, + "loss": 0.97560568, + "memory(GiB)": 302.58, + "step": 177160, + "train_speed(iter/s)": 0.123735 + }, + { + "acc": 0.7525671, + "epoch": 0.9908744811233042, + "grad_norm": 5.90625, + "learning_rate": 5.48802084350526e-06, + "loss": 0.96534185, + "memory(GiB)": 302.58, + "step": 177180, + "train_speed(iter/s)": 0.123741 + }, + { + "acc": 0.73361731, + "epoch": 0.9909863305962835, + "grad_norm": 5.125, + "learning_rate": 5.487100545373526e-06, + "loss": 1.0567152, + "memory(GiB)": 302.58, + "step": 177200, + "train_speed(iter/s)": 0.123748 + }, + { + "acc": 0.75386872, + "epoch": 0.9910981800692628, + "grad_norm": 7.9375, + "learning_rate": 5.486180230581407e-06, + "loss": 0.97378244, + "memory(GiB)": 302.58, + "step": 177220, + "train_speed(iter/s)": 0.123754 + }, + { + "acc": 0.74799895, + "epoch": 0.991210029542242, + "grad_norm": 8.25, + "learning_rate": 5.485259899160384e-06, + "loss": 0.99507704, + "memory(GiB)": 302.58, + "step": 177240, + "train_speed(iter/s)": 0.123761 + }, + { + "acc": 0.73496742, + "epoch": 0.9913218790152213, + "grad_norm": 6.4375, + "learning_rate": 5.484339551141933e-06, + "loss": 1.03774853, + "memory(GiB)": 302.58, + "step": 177260, + "train_speed(iter/s)": 0.123767 + }, + { + "acc": 0.76590381, + "epoch": 0.9914337284882005, + "grad_norm": 5.34375, + "learning_rate": 5.483419186557533e-06, + "loss": 0.89014244, + "memory(GiB)": 302.58, + "step": 177280, + "train_speed(iter/s)": 0.123774 + }, + { + "acc": 0.7435956, + "epoch": 0.9915455779611798, + "grad_norm": 11.5, + "learning_rate": 5.482498805438664e-06, + "loss": 1.02128239, + "memory(GiB)": 302.58, + "step": 177300, + "train_speed(iter/s)": 0.12378 + }, + { + "acc": 0.75832376, + "epoch": 0.9916574274341591, + "grad_norm": 6.125, + "learning_rate": 5.481578407816807e-06, + "loss": 0.95294046, + "memory(GiB)": 302.58, + "step": 177320, + "train_speed(iter/s)": 0.123786 + }, + { + "acc": 0.74150891, + "epoch": 0.9917692769071383, + "grad_norm": 8.3125, + "learning_rate": 5.48065799372344e-06, + "loss": 1.03024826, + "memory(GiB)": 302.58, + "step": 177340, + "train_speed(iter/s)": 0.123793 + }, + { + "acc": 0.73303185, + "epoch": 0.9918811263801176, + "grad_norm": 8.5, + "learning_rate": 5.479737563190047e-06, + "loss": 1.06116877, + "memory(GiB)": 302.58, + "step": 177360, + "train_speed(iter/s)": 0.123799 + }, + { + "acc": 0.74556928, + "epoch": 0.9919929758530969, + "grad_norm": 8.5, + "learning_rate": 5.478817116248108e-06, + "loss": 1.02831869, + "memory(GiB)": 302.58, + "step": 177380, + "train_speed(iter/s)": 0.123806 + }, + { + "acc": 0.74058843, + "epoch": 0.9921048253260761, + "grad_norm": 7.8125, + "learning_rate": 5.477896652929104e-06, + "loss": 1.01679029, + "memory(GiB)": 302.58, + "step": 177400, + "train_speed(iter/s)": 0.123813 + }, + { + "acc": 0.7428473, + "epoch": 0.9922166747990554, + "grad_norm": 6.375, + "learning_rate": 5.47697617326452e-06, + "loss": 1.0070364, + "memory(GiB)": 302.58, + "step": 177420, + "train_speed(iter/s)": 0.12382 + }, + { + "acc": 0.71024504, + "epoch": 0.9923285242720347, + "grad_norm": 10.9375, + "learning_rate": 5.476055677285839e-06, + "loss": 1.1719677, + "memory(GiB)": 302.58, + "step": 177440, + "train_speed(iter/s)": 0.123826 + }, + { + "acc": 0.73098588, + "epoch": 0.9924403737450139, + "grad_norm": 7.53125, + "learning_rate": 5.4751351650245435e-06, + "loss": 1.07083559, + "memory(GiB)": 302.58, + "step": 177460, + "train_speed(iter/s)": 0.123833 + }, + { + "acc": 0.75557036, + "epoch": 0.9925522232179932, + "grad_norm": 8.875, + "learning_rate": 5.474214636512119e-06, + "loss": 0.94717674, + "memory(GiB)": 302.58, + "step": 177480, + "train_speed(iter/s)": 0.12384 + }, + { + "acc": 0.73431678, + "epoch": 0.9926640726909725, + "grad_norm": 5.09375, + "learning_rate": 5.473294091780049e-06, + "loss": 1.0441247, + "memory(GiB)": 302.58, + "step": 177500, + "train_speed(iter/s)": 0.123846 + }, + { + "acc": 0.77265844, + "epoch": 0.9927759221639517, + "grad_norm": 6.96875, + "learning_rate": 5.472373530859822e-06, + "loss": 0.88312855, + "memory(GiB)": 302.58, + "step": 177520, + "train_speed(iter/s)": 0.123853 + }, + { + "acc": 0.74459901, + "epoch": 0.992887771636931, + "grad_norm": 5.90625, + "learning_rate": 5.471452953782921e-06, + "loss": 0.99832773, + "memory(GiB)": 302.58, + "step": 177540, + "train_speed(iter/s)": 0.12386 + }, + { + "acc": 0.74720259, + "epoch": 0.9929996211099102, + "grad_norm": 6.4375, + "learning_rate": 5.470532360580835e-06, + "loss": 0.98741522, + "memory(GiB)": 302.58, + "step": 177560, + "train_speed(iter/s)": 0.123866 + }, + { + "acc": 0.74316306, + "epoch": 0.9931114705828895, + "grad_norm": 6.5625, + "learning_rate": 5.46961175128505e-06, + "loss": 1.0265192, + "memory(GiB)": 302.58, + "step": 177580, + "train_speed(iter/s)": 0.123873 + }, + { + "acc": 0.72958217, + "epoch": 0.9932233200558688, + "grad_norm": 6.15625, + "learning_rate": 5.468691125927054e-06, + "loss": 1.06269913, + "memory(GiB)": 302.58, + "step": 177600, + "train_speed(iter/s)": 0.123879 + }, + { + "acc": 0.73301983, + "epoch": 0.993335169528848, + "grad_norm": 6.09375, + "learning_rate": 5.4677704845383365e-06, + "loss": 1.03052073, + "memory(GiB)": 302.58, + "step": 177620, + "train_speed(iter/s)": 0.123886 + }, + { + "acc": 0.75296745, + "epoch": 0.9934470190018273, + "grad_norm": 7.25, + "learning_rate": 5.466849827150383e-06, + "loss": 0.95882807, + "memory(GiB)": 302.58, + "step": 177640, + "train_speed(iter/s)": 0.123893 + }, + { + "acc": 0.74463305, + "epoch": 0.9935588684748066, + "grad_norm": 8.9375, + "learning_rate": 5.465929153794687e-06, + "loss": 0.9995821, + "memory(GiB)": 302.58, + "step": 177660, + "train_speed(iter/s)": 0.1239 + }, + { + "acc": 0.72644372, + "epoch": 0.9936707179477858, + "grad_norm": 6.9375, + "learning_rate": 5.465008464502735e-06, + "loss": 1.08446007, + "memory(GiB)": 302.58, + "step": 177680, + "train_speed(iter/s)": 0.123906 + }, + { + "acc": 0.7556386, + "epoch": 0.9937825674207651, + "grad_norm": 4.25, + "learning_rate": 5.4640877593060195e-06, + "loss": 0.955723, + "memory(GiB)": 302.58, + "step": 177700, + "train_speed(iter/s)": 0.123913 + }, + { + "acc": 0.72726746, + "epoch": 0.9938944168937444, + "grad_norm": 5.28125, + "learning_rate": 5.463167038236031e-06, + "loss": 1.0598527, + "memory(GiB)": 302.58, + "step": 177720, + "train_speed(iter/s)": 0.123919 + }, + { + "acc": 0.74857593, + "epoch": 0.9940062663667236, + "grad_norm": 7.625, + "learning_rate": 5.462246301324261e-06, + "loss": 0.99051924, + "memory(GiB)": 302.58, + "step": 177740, + "train_speed(iter/s)": 0.123926 + }, + { + "acc": 0.74057899, + "epoch": 0.9941181158397029, + "grad_norm": 9.8125, + "learning_rate": 5.461325548602204e-06, + "loss": 1.03634567, + "memory(GiB)": 302.58, + "step": 177760, + "train_speed(iter/s)": 0.123932 + }, + { + "acc": 0.73427162, + "epoch": 0.9942299653126822, + "grad_norm": 9.6875, + "learning_rate": 5.460404780101347e-06, + "loss": 1.05802002, + "memory(GiB)": 302.58, + "step": 177780, + "train_speed(iter/s)": 0.123939 + }, + { + "acc": 0.73678703, + "epoch": 0.9943418147856614, + "grad_norm": 6.125, + "learning_rate": 5.459483995853188e-06, + "loss": 1.03775425, + "memory(GiB)": 302.58, + "step": 177800, + "train_speed(iter/s)": 0.123945 + }, + { + "acc": 0.73886046, + "epoch": 0.9944536642586407, + "grad_norm": 8.0, + "learning_rate": 5.4585631958892196e-06, + "loss": 1.02944298, + "memory(GiB)": 302.58, + "step": 177820, + "train_speed(iter/s)": 0.123952 + }, + { + "acc": 0.74391432, + "epoch": 0.9945655137316199, + "grad_norm": 7.40625, + "learning_rate": 5.457642380240934e-06, + "loss": 1.02338009, + "memory(GiB)": 302.58, + "step": 177840, + "train_speed(iter/s)": 0.123958 + }, + { + "acc": 0.74362397, + "epoch": 0.9946773632045992, + "grad_norm": 9.5, + "learning_rate": 5.456721548939829e-06, + "loss": 1.00716906, + "memory(GiB)": 302.58, + "step": 177860, + "train_speed(iter/s)": 0.123965 + }, + { + "acc": 0.75468707, + "epoch": 0.9947892126775785, + "grad_norm": 6.375, + "learning_rate": 5.455800702017401e-06, + "loss": 0.94718456, + "memory(GiB)": 302.58, + "step": 177880, + "train_speed(iter/s)": 0.123971 + }, + { + "acc": 0.73240042, + "epoch": 0.9949010621505577, + "grad_norm": 9.4375, + "learning_rate": 5.454879839505142e-06, + "loss": 1.02633476, + "memory(GiB)": 302.58, + "step": 177900, + "train_speed(iter/s)": 0.123978 + }, + { + "acc": 0.75363851, + "epoch": 0.995012911623537, + "grad_norm": 9.0, + "learning_rate": 5.45395896143455e-06, + "loss": 0.97347479, + "memory(GiB)": 302.58, + "step": 177920, + "train_speed(iter/s)": 0.123985 + }, + { + "acc": 0.74593973, + "epoch": 0.9951247610965163, + "grad_norm": 5.28125, + "learning_rate": 5.453038067837122e-06, + "loss": 1.00357418, + "memory(GiB)": 302.58, + "step": 177940, + "train_speed(iter/s)": 0.123991 + }, + { + "acc": 0.7500021, + "epoch": 0.9952366105694955, + "grad_norm": 7.15625, + "learning_rate": 5.452117158744357e-06, + "loss": 0.98159389, + "memory(GiB)": 302.58, + "step": 177960, + "train_speed(iter/s)": 0.123997 + }, + { + "acc": 0.75703335, + "epoch": 0.9953484600424748, + "grad_norm": 5.0625, + "learning_rate": 5.451196234187749e-06, + "loss": 0.97339745, + "memory(GiB)": 302.58, + "step": 177980, + "train_speed(iter/s)": 0.124003 + }, + { + "acc": 0.74308281, + "epoch": 0.9954603095154541, + "grad_norm": 8.8125, + "learning_rate": 5.4502752941988e-06, + "loss": 1.00031109, + "memory(GiB)": 302.58, + "step": 178000, + "train_speed(iter/s)": 0.12401 + }, + { + "epoch": 0.9954603095154541, + "eval_acc": 0.7062827965356979, + "eval_loss": 1.0127134323120117, + "eval_runtime": 7505.9288, + "eval_samples_per_second": 10.03, + "eval_steps_per_second": 10.03, + "step": 178000 + }, + { + "acc": 0.74393878, + "epoch": 0.9955721589884333, + "grad_norm": 8.1875, + "learning_rate": 5.449354338809009e-06, + "loss": 1.03379679, + "memory(GiB)": 302.58, + "step": 178020, + "train_speed(iter/s)": 0.12336 + }, + { + "acc": 0.74448428, + "epoch": 0.9956840084614126, + "grad_norm": 6.0625, + "learning_rate": 5.448433368049874e-06, + "loss": 1.02470236, + "memory(GiB)": 302.58, + "step": 178040, + "train_speed(iter/s)": 0.123366 + }, + { + "acc": 0.74750204, + "epoch": 0.9957958579343918, + "grad_norm": 8.6875, + "learning_rate": 5.447512381952897e-06, + "loss": 1.01028118, + "memory(GiB)": 302.58, + "step": 178060, + "train_speed(iter/s)": 0.123373 + }, + { + "acc": 0.7517642, + "epoch": 0.9959077074073711, + "grad_norm": 6.75, + "learning_rate": 5.446591380549576e-06, + "loss": 0.98024883, + "memory(GiB)": 302.58, + "step": 178080, + "train_speed(iter/s)": 0.12338 + }, + { + "acc": 0.74193354, + "epoch": 0.9960195568803504, + "grad_norm": 9.875, + "learning_rate": 5.445670363871415e-06, + "loss": 1.02792397, + "memory(GiB)": 302.58, + "step": 178100, + "train_speed(iter/s)": 0.123387 + }, + { + "acc": 0.73731747, + "epoch": 0.9961314063533296, + "grad_norm": 7.5, + "learning_rate": 5.444749331949912e-06, + "loss": 1.04041042, + "memory(GiB)": 302.58, + "step": 178120, + "train_speed(iter/s)": 0.123394 + }, + { + "acc": 0.73878608, + "epoch": 0.9962432558263089, + "grad_norm": 7.5625, + "learning_rate": 5.443828284816574e-06, + "loss": 1.02983856, + "memory(GiB)": 302.58, + "step": 178140, + "train_speed(iter/s)": 0.123401 + }, + { + "acc": 0.74111204, + "epoch": 0.9963551052992882, + "grad_norm": 12.0, + "learning_rate": 5.4429072225029e-06, + "loss": 1.00852633, + "memory(GiB)": 302.58, + "step": 178160, + "train_speed(iter/s)": 0.123407 + }, + { + "acc": 0.74507422, + "epoch": 0.9964669547722674, + "grad_norm": 8.5, + "learning_rate": 5.441986145040396e-06, + "loss": 1.00049248, + "memory(GiB)": 302.58, + "step": 178180, + "train_speed(iter/s)": 0.123414 + }, + { + "acc": 0.75391202, + "epoch": 0.9965788042452467, + "grad_norm": 7.09375, + "learning_rate": 5.441065052460563e-06, + "loss": 0.97573776, + "memory(GiB)": 302.58, + "step": 178200, + "train_speed(iter/s)": 0.123421 + }, + { + "acc": 0.72944846, + "epoch": 0.996690653718226, + "grad_norm": 6.4375, + "learning_rate": 5.440143944794908e-06, + "loss": 1.05173531, + "memory(GiB)": 302.58, + "step": 178220, + "train_speed(iter/s)": 0.123428 + }, + { + "acc": 0.74134126, + "epoch": 0.9968025031912052, + "grad_norm": 6.5, + "learning_rate": 5.439222822074933e-06, + "loss": 1.0186532, + "memory(GiB)": 302.58, + "step": 178240, + "train_speed(iter/s)": 0.123434 + }, + { + "acc": 0.7492547, + "epoch": 0.9969143526641845, + "grad_norm": 7.28125, + "learning_rate": 5.438301684332144e-06, + "loss": 0.9853097, + "memory(GiB)": 302.58, + "step": 178260, + "train_speed(iter/s)": 0.123441 + }, + { + "acc": 0.75988145, + "epoch": 0.9970262021371638, + "grad_norm": 7.8125, + "learning_rate": 5.43738053159805e-06, + "loss": 0.93589382, + "memory(GiB)": 302.58, + "step": 178280, + "train_speed(iter/s)": 0.123448 + }, + { + "acc": 0.75327125, + "epoch": 0.997138051610143, + "grad_norm": 9.125, + "learning_rate": 5.436459363904154e-06, + "loss": 0.95943584, + "memory(GiB)": 302.58, + "step": 178300, + "train_speed(iter/s)": 0.123455 + }, + { + "acc": 0.74001102, + "epoch": 0.9972499010831223, + "grad_norm": 6.46875, + "learning_rate": 5.435538181281964e-06, + "loss": 1.01490526, + "memory(GiB)": 302.58, + "step": 178320, + "train_speed(iter/s)": 0.123462 + }, + { + "acc": 0.74687152, + "epoch": 0.9973617505561015, + "grad_norm": 11.125, + "learning_rate": 5.434616983762987e-06, + "loss": 0.99614401, + "memory(GiB)": 302.58, + "step": 178340, + "train_speed(iter/s)": 0.123469 + }, + { + "acc": 0.74298663, + "epoch": 0.9974736000290808, + "grad_norm": 9.875, + "learning_rate": 5.433695771378731e-06, + "loss": 1.00473909, + "memory(GiB)": 302.58, + "step": 178360, + "train_speed(iter/s)": 0.123476 + }, + { + "acc": 0.7515594, + "epoch": 0.9975854495020601, + "grad_norm": 6.84375, + "learning_rate": 5.432774544160704e-06, + "loss": 0.966611, + "memory(GiB)": 302.58, + "step": 178380, + "train_speed(iter/s)": 0.123483 + }, + { + "acc": 0.75807033, + "epoch": 0.9976972989750393, + "grad_norm": 8.625, + "learning_rate": 5.431853302140414e-06, + "loss": 0.97194347, + "memory(GiB)": 302.58, + "step": 178400, + "train_speed(iter/s)": 0.123489 + }, + { + "acc": 0.73171954, + "epoch": 0.9978091484480186, + "grad_norm": 6.40625, + "learning_rate": 5.430932045349373e-06, + "loss": 1.05267096, + "memory(GiB)": 302.58, + "step": 178420, + "train_speed(iter/s)": 0.123495 + }, + { + "acc": 0.73898873, + "epoch": 0.997920997920998, + "grad_norm": 5.90625, + "learning_rate": 5.43001077381909e-06, + "loss": 1.03760653, + "memory(GiB)": 302.58, + "step": 178440, + "train_speed(iter/s)": 0.123502 + }, + { + "acc": 0.72810087, + "epoch": 0.9980328473939772, + "grad_norm": 5.75, + "learning_rate": 5.429089487581074e-06, + "loss": 1.06556683, + "memory(GiB)": 302.58, + "step": 178460, + "train_speed(iter/s)": 0.123508 + }, + { + "acc": 0.74280815, + "epoch": 0.9981446968669565, + "grad_norm": 6.90625, + "learning_rate": 5.428168186666838e-06, + "loss": 0.9992506, + "memory(GiB)": 302.58, + "step": 178480, + "train_speed(iter/s)": 0.123515 + }, + { + "acc": 0.73746929, + "epoch": 0.9982565463399358, + "grad_norm": 7.28125, + "learning_rate": 5.4272468711078914e-06, + "loss": 1.01484871, + "memory(GiB)": 302.58, + "step": 178500, + "train_speed(iter/s)": 0.123522 + }, + { + "acc": 0.75497808, + "epoch": 0.998368395812915, + "grad_norm": 4.21875, + "learning_rate": 5.4263255409357486e-06, + "loss": 0.97887583, + "memory(GiB)": 302.58, + "step": 178520, + "train_speed(iter/s)": 0.123528 + }, + { + "acc": 0.75159278, + "epoch": 0.9984802452858943, + "grad_norm": 6.625, + "learning_rate": 5.42540419618192e-06, + "loss": 0.98420858, + "memory(GiB)": 302.58, + "step": 178540, + "train_speed(iter/s)": 0.123535 + }, + { + "acc": 0.71988111, + "epoch": 0.9985920947588736, + "grad_norm": 6.96875, + "learning_rate": 5.424482836877918e-06, + "loss": 1.13673172, + "memory(GiB)": 302.58, + "step": 178560, + "train_speed(iter/s)": 0.12354 + }, + { + "acc": 0.74967961, + "epoch": 0.9987039442318528, + "grad_norm": 7.3125, + "learning_rate": 5.423561463055257e-06, + "loss": 0.97103415, + "memory(GiB)": 302.58, + "step": 178580, + "train_speed(iter/s)": 0.123547 + }, + { + "acc": 0.74728055, + "epoch": 0.9988157937048321, + "grad_norm": 7.75, + "learning_rate": 5.422640074745451e-06, + "loss": 0.9981122, + "memory(GiB)": 302.58, + "step": 178600, + "train_speed(iter/s)": 0.123553 + }, + { + "acc": 0.75114803, + "epoch": 0.9989276431778114, + "grad_norm": 7.53125, + "learning_rate": 5.421718671980015e-06, + "loss": 0.99080257, + "memory(GiB)": 302.58, + "step": 178620, + "train_speed(iter/s)": 0.12356 + }, + { + "acc": 0.74724636, + "epoch": 0.9990394926507906, + "grad_norm": 7.53125, + "learning_rate": 5.420797254790463e-06, + "loss": 0.98402376, + "memory(GiB)": 302.58, + "step": 178640, + "train_speed(iter/s)": 0.123567 + }, + { + "acc": 0.73372474, + "epoch": 0.9991513421237699, + "grad_norm": 7.5625, + "learning_rate": 5.419875823208311e-06, + "loss": 1.03952236, + "memory(GiB)": 302.58, + "step": 178660, + "train_speed(iter/s)": 0.123573 + }, + { + "acc": 0.74684625, + "epoch": 0.9992631915967491, + "grad_norm": 4.6875, + "learning_rate": 5.418954377265074e-06, + "loss": 0.98816032, + "memory(GiB)": 302.58, + "step": 178680, + "train_speed(iter/s)": 0.123579 + }, + { + "acc": 0.74433022, + "epoch": 0.9993750410697284, + "grad_norm": 6.40625, + "learning_rate": 5.418032916992268e-06, + "loss": 1.02608013, + "memory(GiB)": 302.58, + "step": 178700, + "train_speed(iter/s)": 0.123586 + }, + { + "acc": 0.72781448, + "epoch": 0.9994868905427077, + "grad_norm": 9.625, + "learning_rate": 5.417111442421411e-06, + "loss": 1.08840027, + "memory(GiB)": 302.58, + "step": 178720, + "train_speed(iter/s)": 0.123592 + }, + { + "acc": 0.72095556, + "epoch": 0.9995987400156869, + "grad_norm": 6.71875, + "learning_rate": 5.416189953584021e-06, + "loss": 1.10004778, + "memory(GiB)": 302.58, + "step": 178740, + "train_speed(iter/s)": 0.123598 + }, + { + "acc": 0.76015601, + "epoch": 0.9997105894886662, + "grad_norm": 7.09375, + "learning_rate": 5.415268450511615e-06, + "loss": 0.94428396, + "memory(GiB)": 302.58, + "step": 178760, + "train_speed(iter/s)": 0.123604 + }, + { + "acc": 0.733815, + "epoch": 0.9998224389616455, + "grad_norm": 10.25, + "learning_rate": 5.4143469332357105e-06, + "loss": 1.03857689, + "memory(GiB)": 302.58, + "step": 178780, + "train_speed(iter/s)": 0.123611 + }, + { + "acc": 0.74811401, + "epoch": 0.9999342884346247, + "grad_norm": 7.9375, + "learning_rate": 5.413425401787826e-06, + "loss": 0.98611126, + "memory(GiB)": 302.58, + "step": 178800, + "train_speed(iter/s)": 0.123617 + }, + { + "acc": 0.73809161, + "epoch": 1.000046137907604, + "grad_norm": 6.09375, + "learning_rate": 5.412503856199484e-06, + "loss": 1.03082952, + "memory(GiB)": 302.58, + "step": 178820, + "train_speed(iter/s)": 0.123623 + }, + { + "acc": 0.76311617, + "epoch": 1.0001579873805833, + "grad_norm": 6.90625, + "learning_rate": 5.4115822965022e-06, + "loss": 0.94399261, + "memory(GiB)": 302.58, + "step": 178840, + "train_speed(iter/s)": 0.12363 + }, + { + "acc": 0.73969164, + "epoch": 1.0002698368535625, + "grad_norm": 6.78125, + "learning_rate": 5.410660722727497e-06, + "loss": 1.02329664, + "memory(GiB)": 302.58, + "step": 178860, + "train_speed(iter/s)": 0.123636 + }, + { + "acc": 0.73998647, + "epoch": 1.0003816863265418, + "grad_norm": 4.71875, + "learning_rate": 5.409739134906896e-06, + "loss": 1.03647356, + "memory(GiB)": 302.58, + "step": 178880, + "train_speed(iter/s)": 0.123643 + }, + { + "acc": 0.76209321, + "epoch": 1.000493535799521, + "grad_norm": 7.46875, + "learning_rate": 5.408817533071917e-06, + "loss": 0.93397493, + "memory(GiB)": 302.58, + "step": 178900, + "train_speed(iter/s)": 0.12365 + }, + { + "acc": 0.74362578, + "epoch": 1.0006053852725003, + "grad_norm": 8.0625, + "learning_rate": 5.407895917254083e-06, + "loss": 0.99422235, + "memory(GiB)": 302.58, + "step": 178920, + "train_speed(iter/s)": 0.123656 + }, + { + "acc": 0.75113444, + "epoch": 1.0007172347454796, + "grad_norm": 6.90625, + "learning_rate": 5.406974287484913e-06, + "loss": 0.98065224, + "memory(GiB)": 302.58, + "step": 178940, + "train_speed(iter/s)": 0.123663 + }, + { + "acc": 0.75978837, + "epoch": 1.0008290842184588, + "grad_norm": 8.625, + "learning_rate": 5.406052643795934e-06, + "loss": 0.94002476, + "memory(GiB)": 302.58, + "step": 178960, + "train_speed(iter/s)": 0.12367 + }, + { + "acc": 0.7469233, + "epoch": 1.000940933691438, + "grad_norm": 8.125, + "learning_rate": 5.405130986218664e-06, + "loss": 1.00519686, + "memory(GiB)": 302.58, + "step": 178980, + "train_speed(iter/s)": 0.123676 + }, + { + "acc": 0.73989944, + "epoch": 1.0010527831644174, + "grad_norm": 4.28125, + "learning_rate": 5.404209314784632e-06, + "loss": 1.02819395, + "memory(GiB)": 302.58, + "step": 179000, + "train_speed(iter/s)": 0.123683 + }, + { + "acc": 0.74331818, + "epoch": 1.0011646326373966, + "grad_norm": 6.59375, + "learning_rate": 5.40328762952536e-06, + "loss": 1.04068336, + "memory(GiB)": 302.58, + "step": 179020, + "train_speed(iter/s)": 0.123689 + }, + { + "acc": 0.75770268, + "epoch": 1.001276482110376, + "grad_norm": 7.28125, + "learning_rate": 5.40236593047237e-06, + "loss": 0.95776892, + "memory(GiB)": 302.58, + "step": 179040, + "train_speed(iter/s)": 0.123696 + }, + { + "acc": 0.75369325, + "epoch": 1.0013883315833552, + "grad_norm": 6.65625, + "learning_rate": 5.401444217657191e-06, + "loss": 0.96924305, + "memory(GiB)": 302.58, + "step": 179060, + "train_speed(iter/s)": 0.123702 + }, + { + "acc": 0.75520291, + "epoch": 1.0015001810563344, + "grad_norm": 6.1875, + "learning_rate": 5.400522491111347e-06, + "loss": 0.95893555, + "memory(GiB)": 302.58, + "step": 179080, + "train_speed(iter/s)": 0.123709 + }, + { + "acc": 0.74131036, + "epoch": 1.0016120305293137, + "grad_norm": 7.9375, + "learning_rate": 5.399600750866364e-06, + "loss": 1.04389687, + "memory(GiB)": 302.58, + "step": 179100, + "train_speed(iter/s)": 0.123715 + }, + { + "acc": 0.74645958, + "epoch": 1.001723880002293, + "grad_norm": 5.5, + "learning_rate": 5.398678996953768e-06, + "loss": 1.01193018, + "memory(GiB)": 302.58, + "step": 179120, + "train_speed(iter/s)": 0.123721 + }, + { + "acc": 0.75686703, + "epoch": 1.0018357294752722, + "grad_norm": 6.21875, + "learning_rate": 5.397757229405085e-06, + "loss": 0.96799316, + "memory(GiB)": 302.58, + "step": 179140, + "train_speed(iter/s)": 0.123727 + }, + { + "acc": 0.77491121, + "epoch": 1.0019475789482515, + "grad_norm": 5.71875, + "learning_rate": 5.396835448251844e-06, + "loss": 0.86999016, + "memory(GiB)": 302.58, + "step": 179160, + "train_speed(iter/s)": 0.123734 + }, + { + "acc": 0.75840693, + "epoch": 1.0020594284212307, + "grad_norm": 6.8125, + "learning_rate": 5.395913653525573e-06, + "loss": 0.93745012, + "memory(GiB)": 302.58, + "step": 179180, + "train_speed(iter/s)": 0.12374 + }, + { + "acc": 0.77175193, + "epoch": 1.00217127789421, + "grad_norm": 5.78125, + "learning_rate": 5.3949918452577995e-06, + "loss": 0.89039125, + "memory(GiB)": 302.58, + "step": 179200, + "train_speed(iter/s)": 0.123747 + }, + { + "acc": 0.75617337, + "epoch": 1.0022831273671893, + "grad_norm": 7.03125, + "learning_rate": 5.3940700234800525e-06, + "loss": 0.95641708, + "memory(GiB)": 302.58, + "step": 179220, + "train_speed(iter/s)": 0.123753 + }, + { + "acc": 0.73685808, + "epoch": 1.0023949768401685, + "grad_norm": 9.625, + "learning_rate": 5.3931481882238615e-06, + "loss": 1.0382225, + "memory(GiB)": 302.58, + "step": 179240, + "train_speed(iter/s)": 0.12376 + }, + { + "acc": 0.75380583, + "epoch": 1.0025068263131478, + "grad_norm": 8.0625, + "learning_rate": 5.392226339520756e-06, + "loss": 0.95647097, + "memory(GiB)": 302.58, + "step": 179260, + "train_speed(iter/s)": 0.123767 + }, + { + "acc": 0.74480815, + "epoch": 1.002618675786127, + "grad_norm": 8.0625, + "learning_rate": 5.391304477402262e-06, + "loss": 0.98270111, + "memory(GiB)": 302.58, + "step": 179280, + "train_speed(iter/s)": 0.123774 + }, + { + "acc": 0.76651268, + "epoch": 1.0027305252591063, + "grad_norm": 8.625, + "learning_rate": 5.390382601899918e-06, + "loss": 0.91390066, + "memory(GiB)": 302.58, + "step": 179300, + "train_speed(iter/s)": 0.123781 + }, + { + "acc": 0.75646396, + "epoch": 1.0028423747320856, + "grad_norm": 9.25, + "learning_rate": 5.389460713045252e-06, + "loss": 0.950173, + "memory(GiB)": 302.58, + "step": 179320, + "train_speed(iter/s)": 0.123786 + }, + { + "acc": 0.74692864, + "epoch": 1.0029542242050649, + "grad_norm": 5.84375, + "learning_rate": 5.388538810869792e-06, + "loss": 1.01984863, + "memory(GiB)": 302.58, + "step": 179340, + "train_speed(iter/s)": 0.123793 + }, + { + "acc": 0.7549561, + "epoch": 1.0030660736780441, + "grad_norm": 8.1875, + "learning_rate": 5.387616895405075e-06, + "loss": 0.95823402, + "memory(GiB)": 302.58, + "step": 179360, + "train_speed(iter/s)": 0.1238 + }, + { + "acc": 0.74804525, + "epoch": 1.0031779231510234, + "grad_norm": 6.34375, + "learning_rate": 5.38669496668263e-06, + "loss": 0.99839993, + "memory(GiB)": 302.58, + "step": 179380, + "train_speed(iter/s)": 0.123806 + }, + { + "acc": 0.73347993, + "epoch": 1.0032897726240027, + "grad_norm": 6.625, + "learning_rate": 5.385773024733989e-06, + "loss": 1.06769514, + "memory(GiB)": 302.58, + "step": 179400, + "train_speed(iter/s)": 0.123812 + }, + { + "acc": 0.76220117, + "epoch": 1.003401622096982, + "grad_norm": 6.75, + "learning_rate": 5.384851069590689e-06, + "loss": 0.94574594, + "memory(GiB)": 302.58, + "step": 179420, + "train_speed(iter/s)": 0.123819 + }, + { + "acc": 0.74286876, + "epoch": 1.0035134715699612, + "grad_norm": 5.5625, + "learning_rate": 5.383929101284262e-06, + "loss": 1.03369083, + "memory(GiB)": 302.58, + "step": 179440, + "train_speed(iter/s)": 0.123824 + }, + { + "acc": 0.75028944, + "epoch": 1.0036253210429404, + "grad_norm": 9.5, + "learning_rate": 5.383007119846243e-06, + "loss": 0.99103756, + "memory(GiB)": 302.58, + "step": 179460, + "train_speed(iter/s)": 0.123831 + }, + { + "acc": 0.74583998, + "epoch": 1.0037371705159197, + "grad_norm": 6.0, + "learning_rate": 5.382085125308165e-06, + "loss": 1.02905188, + "memory(GiB)": 302.58, + "step": 179480, + "train_speed(iter/s)": 0.123837 + }, + { + "acc": 0.75252967, + "epoch": 1.003849019988899, + "grad_norm": 6.5625, + "learning_rate": 5.381163117701565e-06, + "loss": 0.95856838, + "memory(GiB)": 302.58, + "step": 179500, + "train_speed(iter/s)": 0.123844 + }, + { + "acc": 0.73869319, + "epoch": 1.0039608694618782, + "grad_norm": 10.625, + "learning_rate": 5.380241097057978e-06, + "loss": 1.01246729, + "memory(GiB)": 302.58, + "step": 179520, + "train_speed(iter/s)": 0.12385 + }, + { + "acc": 0.76461234, + "epoch": 1.0040727189348575, + "grad_norm": 7.875, + "learning_rate": 5.379319063408939e-06, + "loss": 0.9318532, + "memory(GiB)": 302.58, + "step": 179540, + "train_speed(iter/s)": 0.123857 + }, + { + "acc": 0.74509931, + "epoch": 1.0041845684078368, + "grad_norm": 7.03125, + "learning_rate": 5.378397016785984e-06, + "loss": 1.01060247, + "memory(GiB)": 302.58, + "step": 179560, + "train_speed(iter/s)": 0.123863 + }, + { + "acc": 0.75722227, + "epoch": 1.004296417880816, + "grad_norm": 7.5625, + "learning_rate": 5.377474957220654e-06, + "loss": 0.93064384, + "memory(GiB)": 302.58, + "step": 179580, + "train_speed(iter/s)": 0.123869 + }, + { + "acc": 0.76262894, + "epoch": 1.0044082673537953, + "grad_norm": 7.84375, + "learning_rate": 5.376552884744481e-06, + "loss": 0.93915482, + "memory(GiB)": 302.58, + "step": 179600, + "train_speed(iter/s)": 0.123876 + }, + { + "acc": 0.74534154, + "epoch": 1.0045201168267746, + "grad_norm": 4.6875, + "learning_rate": 5.375630799389008e-06, + "loss": 0.98774567, + "memory(GiB)": 302.58, + "step": 179620, + "train_speed(iter/s)": 0.123883 + }, + { + "acc": 0.75460868, + "epoch": 1.0046319662997538, + "grad_norm": 9.0625, + "learning_rate": 5.374708701185769e-06, + "loss": 0.96769991, + "memory(GiB)": 302.58, + "step": 179640, + "train_speed(iter/s)": 0.123889 + }, + { + "acc": 0.7685914, + "epoch": 1.004743815772733, + "grad_norm": 6.09375, + "learning_rate": 5.373786590166305e-06, + "loss": 0.9169878, + "memory(GiB)": 302.58, + "step": 179660, + "train_speed(iter/s)": 0.123896 + }, + { + "acc": 0.73779283, + "epoch": 1.0048556652457123, + "grad_norm": 7.1875, + "learning_rate": 5.372864466362152e-06, + "loss": 1.03767757, + "memory(GiB)": 302.58, + "step": 179680, + "train_speed(iter/s)": 0.123902 + }, + { + "acc": 0.75761518, + "epoch": 1.0049675147186916, + "grad_norm": 6.0, + "learning_rate": 5.371942329804853e-06, + "loss": 0.97337818, + "memory(GiB)": 302.58, + "step": 179700, + "train_speed(iter/s)": 0.123909 + }, + { + "acc": 0.76295614, + "epoch": 1.0050793641916709, + "grad_norm": 8.125, + "learning_rate": 5.371020180525946e-06, + "loss": 0.90097418, + "memory(GiB)": 302.58, + "step": 179720, + "train_speed(iter/s)": 0.123916 + }, + { + "acc": 0.75539732, + "epoch": 1.0051912136646501, + "grad_norm": 7.71875, + "learning_rate": 5.370098018556975e-06, + "loss": 0.96084156, + "memory(GiB)": 302.58, + "step": 179740, + "train_speed(iter/s)": 0.123922 + }, + { + "acc": 0.75818734, + "epoch": 1.0053030631376294, + "grad_norm": 9.25, + "learning_rate": 5.369175843929477e-06, + "loss": 0.94303541, + "memory(GiB)": 302.58, + "step": 179760, + "train_speed(iter/s)": 0.123928 + }, + { + "acc": 0.73782949, + "epoch": 1.0054149126106087, + "grad_norm": 6.125, + "learning_rate": 5.368253656674994e-06, + "loss": 1.04996405, + "memory(GiB)": 302.58, + "step": 179780, + "train_speed(iter/s)": 0.123935 + }, + { + "acc": 0.74759603, + "epoch": 1.005526762083588, + "grad_norm": 7.6875, + "learning_rate": 5.36733145682507e-06, + "loss": 0.97013607, + "memory(GiB)": 302.58, + "step": 179800, + "train_speed(iter/s)": 0.123942 + }, + { + "acc": 0.75735259, + "epoch": 1.0056386115565672, + "grad_norm": 6.9375, + "learning_rate": 5.366409244411244e-06, + "loss": 0.9442872, + "memory(GiB)": 302.58, + "step": 179820, + "train_speed(iter/s)": 0.123949 + }, + { + "acc": 0.75689869, + "epoch": 1.0057504610295465, + "grad_norm": 5.71875, + "learning_rate": 5.3654870194650585e-06, + "loss": 0.95633907, + "memory(GiB)": 302.58, + "step": 179840, + "train_speed(iter/s)": 0.123955 + }, + { + "acc": 0.77362418, + "epoch": 1.0058623105025257, + "grad_norm": 9.125, + "learning_rate": 5.36456478201806e-06, + "loss": 0.8730711, + "memory(GiB)": 302.58, + "step": 179860, + "train_speed(iter/s)": 0.123961 + }, + { + "acc": 0.74775605, + "epoch": 1.005974159975505, + "grad_norm": 7.3125, + "learning_rate": 5.363642532101791e-06, + "loss": 0.97604284, + "memory(GiB)": 302.58, + "step": 179880, + "train_speed(iter/s)": 0.123968 + }, + { + "acc": 0.75105081, + "epoch": 1.0060860094484843, + "grad_norm": 6.6875, + "learning_rate": 5.362720269747792e-06, + "loss": 0.96992512, + "memory(GiB)": 302.58, + "step": 179900, + "train_speed(iter/s)": 0.123975 + }, + { + "acc": 0.74448123, + "epoch": 1.0061978589214635, + "grad_norm": 7.78125, + "learning_rate": 5.361797994987611e-06, + "loss": 1.00341063, + "memory(GiB)": 302.58, + "step": 179920, + "train_speed(iter/s)": 0.123981 + }, + { + "acc": 0.73617687, + "epoch": 1.0063097083944428, + "grad_norm": 9.6875, + "learning_rate": 5.360875707852791e-06, + "loss": 1.03428154, + "memory(GiB)": 302.58, + "step": 179940, + "train_speed(iter/s)": 0.123987 + }, + { + "acc": 0.73438878, + "epoch": 1.006421557867422, + "grad_norm": 9.25, + "learning_rate": 5.359953408374878e-06, + "loss": 1.04647226, + "memory(GiB)": 302.58, + "step": 179960, + "train_speed(iter/s)": 0.123994 + }, + { + "acc": 0.74999847, + "epoch": 1.0065334073404013, + "grad_norm": 6.625, + "learning_rate": 5.359031096585415e-06, + "loss": 0.98540878, + "memory(GiB)": 302.58, + "step": 179980, + "train_speed(iter/s)": 0.124001 + }, + { + "acc": 0.75699873, + "epoch": 1.0066452568133806, + "grad_norm": 7.8125, + "learning_rate": 5.358108772515952e-06, + "loss": 0.92541752, + "memory(GiB)": 302.58, + "step": 180000, + "train_speed(iter/s)": 0.124007 + }, + { + "epoch": 1.0066452568133806, + "eval_acc": 0.7062743174884589, + "eval_loss": 1.0142908096313477, + "eval_runtime": 7498.1467, + "eval_samples_per_second": 10.04, + "eval_steps_per_second": 10.04, + "step": 180000 + }, + { + "acc": 0.74566622, + "epoch": 1.0067571062863598, + "grad_norm": 6.96875, + "learning_rate": 5.357186436198033e-06, + "loss": 1.00105934, + "memory(GiB)": 302.58, + "step": 180020, + "train_speed(iter/s)": 0.123367 + }, + { + "acc": 0.73622527, + "epoch": 1.006868955759339, + "grad_norm": 5.25, + "learning_rate": 5.356264087663205e-06, + "loss": 1.06849279, + "memory(GiB)": 302.58, + "step": 180040, + "train_speed(iter/s)": 0.123374 + }, + { + "acc": 0.75459781, + "epoch": 1.0069808052323184, + "grad_norm": 8.875, + "learning_rate": 5.355341726943016e-06, + "loss": 0.94793243, + "memory(GiB)": 302.58, + "step": 180060, + "train_speed(iter/s)": 0.12338 + }, + { + "acc": 0.74196944, + "epoch": 1.0070926547052976, + "grad_norm": 5.625, + "learning_rate": 5.354419354069013e-06, + "loss": 1.02335024, + "memory(GiB)": 302.58, + "step": 180080, + "train_speed(iter/s)": 0.123387 + }, + { + "acc": 0.74710126, + "epoch": 1.007204504178277, + "grad_norm": 6.3125, + "learning_rate": 5.353496969072745e-06, + "loss": 0.98545141, + "memory(GiB)": 302.58, + "step": 180100, + "train_speed(iter/s)": 0.123394 + }, + { + "acc": 0.76459622, + "epoch": 1.0073163536512562, + "grad_norm": 10.1875, + "learning_rate": 5.352574571985758e-06, + "loss": 0.92619057, + "memory(GiB)": 302.58, + "step": 180120, + "train_speed(iter/s)": 0.1234 + }, + { + "acc": 0.75498719, + "epoch": 1.0074282031242354, + "grad_norm": 5.84375, + "learning_rate": 5.3516521628396035e-06, + "loss": 0.96466217, + "memory(GiB)": 302.58, + "step": 180140, + "train_speed(iter/s)": 0.123407 + }, + { + "acc": 0.7437746, + "epoch": 1.0075400525972147, + "grad_norm": 6.71875, + "learning_rate": 5.35072974166583e-06, + "loss": 1.00045938, + "memory(GiB)": 302.58, + "step": 180160, + "train_speed(iter/s)": 0.123413 + }, + { + "acc": 0.75856047, + "epoch": 1.007651902070194, + "grad_norm": 9.25, + "learning_rate": 5.349807308495988e-06, + "loss": 0.94392214, + "memory(GiB)": 302.58, + "step": 180180, + "train_speed(iter/s)": 0.12342 + }, + { + "acc": 0.76024175, + "epoch": 1.0077637515431732, + "grad_norm": 6.71875, + "learning_rate": 5.348884863361625e-06, + "loss": 0.92437401, + "memory(GiB)": 302.58, + "step": 180200, + "train_speed(iter/s)": 0.123426 + }, + { + "acc": 0.76083903, + "epoch": 1.0078756010161525, + "grad_norm": 5.5, + "learning_rate": 5.347962406294296e-06, + "loss": 0.94178333, + "memory(GiB)": 302.58, + "step": 180220, + "train_speed(iter/s)": 0.123433 + }, + { + "acc": 0.75476246, + "epoch": 1.0079874504891317, + "grad_norm": 7.375, + "learning_rate": 5.347039937325548e-06, + "loss": 0.955828, + "memory(GiB)": 302.58, + "step": 180240, + "train_speed(iter/s)": 0.123439 + }, + { + "acc": 0.74536901, + "epoch": 1.008099299962111, + "grad_norm": 5.40625, + "learning_rate": 5.346117456486932e-06, + "loss": 1.01692219, + "memory(GiB)": 302.58, + "step": 180260, + "train_speed(iter/s)": 0.123446 + }, + { + "acc": 0.74958296, + "epoch": 1.0082111494350903, + "grad_norm": 7.375, + "learning_rate": 5.345194963810003e-06, + "loss": 0.99065084, + "memory(GiB)": 302.58, + "step": 180280, + "train_speed(iter/s)": 0.123451 + }, + { + "acc": 0.74627943, + "epoch": 1.0083229989080695, + "grad_norm": 5.5625, + "learning_rate": 5.344272459326311e-06, + "loss": 0.99154005, + "memory(GiB)": 302.58, + "step": 180300, + "train_speed(iter/s)": 0.123457 + }, + { + "acc": 0.74492388, + "epoch": 1.0084348483810488, + "grad_norm": 6.625, + "learning_rate": 5.34334994306741e-06, + "loss": 0.99837151, + "memory(GiB)": 302.58, + "step": 180320, + "train_speed(iter/s)": 0.123464 + }, + { + "acc": 0.75945911, + "epoch": 1.008546697854028, + "grad_norm": 7.53125, + "learning_rate": 5.342427415064852e-06, + "loss": 0.9463131, + "memory(GiB)": 302.58, + "step": 180340, + "train_speed(iter/s)": 0.12347 + }, + { + "acc": 0.75992837, + "epoch": 1.0086585473270073, + "grad_norm": 6.5625, + "learning_rate": 5.341504875350191e-06, + "loss": 0.93406162, + "memory(GiB)": 302.58, + "step": 180360, + "train_speed(iter/s)": 0.123477 + }, + { + "acc": 0.74850264, + "epoch": 1.0087703967999866, + "grad_norm": 8.6875, + "learning_rate": 5.34058232395498e-06, + "loss": 0.96820393, + "memory(GiB)": 302.58, + "step": 180380, + "train_speed(iter/s)": 0.123483 + }, + { + "acc": 0.75431175, + "epoch": 1.0088822462729659, + "grad_norm": 7.5625, + "learning_rate": 5.3396597609107735e-06, + "loss": 0.94381762, + "memory(GiB)": 302.58, + "step": 180400, + "train_speed(iter/s)": 0.12349 + }, + { + "acc": 0.74199595, + "epoch": 1.0089940957459451, + "grad_norm": 9.8125, + "learning_rate": 5.338737186249125e-06, + "loss": 1.01635914, + "memory(GiB)": 302.58, + "step": 180420, + "train_speed(iter/s)": 0.123496 + }, + { + "acc": 0.73789167, + "epoch": 1.0091059452189244, + "grad_norm": 6.75, + "learning_rate": 5.337814600001593e-06, + "loss": 1.03601446, + "memory(GiB)": 302.58, + "step": 180440, + "train_speed(iter/s)": 0.123503 + }, + { + "acc": 0.76646404, + "epoch": 1.0092177946919036, + "grad_norm": 7.09375, + "learning_rate": 5.33689200219973e-06, + "loss": 0.90819578, + "memory(GiB)": 302.58, + "step": 180460, + "train_speed(iter/s)": 0.123509 + }, + { + "acc": 0.72857924, + "epoch": 1.009329644164883, + "grad_norm": 7.59375, + "learning_rate": 5.335969392875091e-06, + "loss": 1.06473751, + "memory(GiB)": 302.58, + "step": 180480, + "train_speed(iter/s)": 0.123516 + }, + { + "acc": 0.75166626, + "epoch": 1.0094414936378622, + "grad_norm": 7.75, + "learning_rate": 5.335046772059235e-06, + "loss": 0.96991777, + "memory(GiB)": 302.58, + "step": 180500, + "train_speed(iter/s)": 0.123522 + }, + { + "acc": 0.73156638, + "epoch": 1.0095533431108414, + "grad_norm": 7.125, + "learning_rate": 5.334124139783716e-06, + "loss": 1.06312065, + "memory(GiB)": 302.58, + "step": 180520, + "train_speed(iter/s)": 0.123529 + }, + { + "acc": 0.74421763, + "epoch": 1.0096651925838207, + "grad_norm": 7.875, + "learning_rate": 5.333201496080092e-06, + "loss": 0.95856266, + "memory(GiB)": 302.58, + "step": 180540, + "train_speed(iter/s)": 0.123535 + }, + { + "acc": 0.77492523, + "epoch": 1.0097770420568, + "grad_norm": 6.1875, + "learning_rate": 5.33227884097992e-06, + "loss": 0.87248163, + "memory(GiB)": 302.58, + "step": 180560, + "train_speed(iter/s)": 0.123542 + }, + { + "acc": 0.76117473, + "epoch": 1.0098888915297792, + "grad_norm": 9.625, + "learning_rate": 5.331356174514758e-06, + "loss": 0.94292727, + "memory(GiB)": 302.58, + "step": 180580, + "train_speed(iter/s)": 0.123548 + }, + { + "acc": 0.767945, + "epoch": 1.0100007410027585, + "grad_norm": 6.8125, + "learning_rate": 5.330433496716165e-06, + "loss": 0.89882774, + "memory(GiB)": 302.58, + "step": 180600, + "train_speed(iter/s)": 0.123555 + }, + { + "acc": 0.7398272, + "epoch": 1.0101125904757378, + "grad_norm": 6.75, + "learning_rate": 5.3295108076156975e-06, + "loss": 1.04796572, + "memory(GiB)": 302.58, + "step": 180620, + "train_speed(iter/s)": 0.123561 + }, + { + "acc": 0.74753909, + "epoch": 1.010224439948717, + "grad_norm": 5.90625, + "learning_rate": 5.3285881072449165e-06, + "loss": 0.98233519, + "memory(GiB)": 302.58, + "step": 180640, + "train_speed(iter/s)": 0.123567 + }, + { + "acc": 0.74326015, + "epoch": 1.0103362894216963, + "grad_norm": 5.8125, + "learning_rate": 5.32766539563538e-06, + "loss": 1.0486681, + "memory(GiB)": 302.58, + "step": 180660, + "train_speed(iter/s)": 0.123574 + }, + { + "acc": 0.76140604, + "epoch": 1.0104481388946756, + "grad_norm": 8.6875, + "learning_rate": 5.326742672818649e-06, + "loss": 0.9299118, + "memory(GiB)": 302.58, + "step": 180680, + "train_speed(iter/s)": 0.12358 + }, + { + "acc": 0.73795099, + "epoch": 1.0105599883676548, + "grad_norm": 8.0, + "learning_rate": 5.32581993882628e-06, + "loss": 1.03376331, + "memory(GiB)": 302.58, + "step": 180700, + "train_speed(iter/s)": 0.123587 + }, + { + "acc": 0.76403208, + "epoch": 1.010671837840634, + "grad_norm": 3.765625, + "learning_rate": 5.324897193689837e-06, + "loss": 0.94425793, + "memory(GiB)": 302.58, + "step": 180720, + "train_speed(iter/s)": 0.123594 + }, + { + "acc": 0.76013651, + "epoch": 1.0107836873136133, + "grad_norm": 5.84375, + "learning_rate": 5.323974437440881e-06, + "loss": 0.92550879, + "memory(GiB)": 302.58, + "step": 180740, + "train_speed(iter/s)": 0.1236 + }, + { + "acc": 0.78025861, + "epoch": 1.0108955367865926, + "grad_norm": 7.1875, + "learning_rate": 5.323051670110972e-06, + "loss": 0.83781223, + "memory(GiB)": 302.58, + "step": 180760, + "train_speed(iter/s)": 0.123606 + }, + { + "acc": 0.75199642, + "epoch": 1.0110073862595719, + "grad_norm": 9.625, + "learning_rate": 5.32212889173167e-06, + "loss": 0.98214006, + "memory(GiB)": 302.58, + "step": 180780, + "train_speed(iter/s)": 0.123612 + }, + { + "acc": 0.74755936, + "epoch": 1.0111192357325511, + "grad_norm": 7.3125, + "learning_rate": 5.3212061023345385e-06, + "loss": 0.99790497, + "memory(GiB)": 302.58, + "step": 180800, + "train_speed(iter/s)": 0.123618 + }, + { + "acc": 0.7398963, + "epoch": 1.0112310852055304, + "grad_norm": 8.125, + "learning_rate": 5.320283301951141e-06, + "loss": 1.0209404, + "memory(GiB)": 302.58, + "step": 180820, + "train_speed(iter/s)": 0.123625 + }, + { + "acc": 0.73967886, + "epoch": 1.0113429346785097, + "grad_norm": 7.0625, + "learning_rate": 5.319360490613037e-06, + "loss": 1.04287758, + "memory(GiB)": 302.58, + "step": 180840, + "train_speed(iter/s)": 0.123631 + }, + { + "acc": 0.74032221, + "epoch": 1.011454784151489, + "grad_norm": 5.25, + "learning_rate": 5.3184376683517936e-06, + "loss": 1.03600655, + "memory(GiB)": 302.58, + "step": 180860, + "train_speed(iter/s)": 0.123638 + }, + { + "acc": 0.74058194, + "epoch": 1.0115666336244682, + "grad_norm": 12.375, + "learning_rate": 5.3175148351989715e-06, + "loss": 1.01856384, + "memory(GiB)": 302.58, + "step": 180880, + "train_speed(iter/s)": 0.123644 + }, + { + "acc": 0.74626241, + "epoch": 1.0116784830974475, + "grad_norm": 8.3125, + "learning_rate": 5.316591991186135e-06, + "loss": 0.99699926, + "memory(GiB)": 302.58, + "step": 180900, + "train_speed(iter/s)": 0.12365 + }, + { + "acc": 0.73163242, + "epoch": 1.0117903325704267, + "grad_norm": 8.875, + "learning_rate": 5.315669136344848e-06, + "loss": 1.0666153, + "memory(GiB)": 302.58, + "step": 180920, + "train_speed(iter/s)": 0.123657 + }, + { + "acc": 0.74140921, + "epoch": 1.011902182043406, + "grad_norm": 9.0625, + "learning_rate": 5.314746270706677e-06, + "loss": 1.01968784, + "memory(GiB)": 302.58, + "step": 180940, + "train_speed(iter/s)": 0.123663 + }, + { + "acc": 0.74717832, + "epoch": 1.0120140315163852, + "grad_norm": 7.03125, + "learning_rate": 5.313823394303185e-06, + "loss": 0.99370651, + "memory(GiB)": 302.58, + "step": 180960, + "train_speed(iter/s)": 0.12367 + }, + { + "acc": 0.74752698, + "epoch": 1.0121258809893645, + "grad_norm": 6.15625, + "learning_rate": 5.312900507165937e-06, + "loss": 1.01397448, + "memory(GiB)": 302.58, + "step": 180980, + "train_speed(iter/s)": 0.123676 + }, + { + "acc": 0.75063896, + "epoch": 1.0122377304623438, + "grad_norm": 7.15625, + "learning_rate": 5.3119776093264996e-06, + "loss": 0.98232574, + "memory(GiB)": 302.58, + "step": 181000, + "train_speed(iter/s)": 0.123682 + }, + { + "acc": 0.73452849, + "epoch": 1.012349579935323, + "grad_norm": 7.03125, + "learning_rate": 5.311054700816439e-06, + "loss": 1.05225935, + "memory(GiB)": 302.58, + "step": 181020, + "train_speed(iter/s)": 0.123688 + }, + { + "acc": 0.76708851, + "epoch": 1.0124614294083023, + "grad_norm": 7.53125, + "learning_rate": 5.3101317816673215e-06, + "loss": 0.88837194, + "memory(GiB)": 302.58, + "step": 181040, + "train_speed(iter/s)": 0.123695 + }, + { + "acc": 0.76085262, + "epoch": 1.0125732788812816, + "grad_norm": 6.4375, + "learning_rate": 5.3092088519107145e-06, + "loss": 0.92316818, + "memory(GiB)": 302.58, + "step": 181060, + "train_speed(iter/s)": 0.123702 + }, + { + "acc": 0.75572639, + "epoch": 1.0126851283542608, + "grad_norm": 8.875, + "learning_rate": 5.308285911578184e-06, + "loss": 0.96584177, + "memory(GiB)": 302.58, + "step": 181080, + "train_speed(iter/s)": 0.123708 + }, + { + "acc": 0.73423419, + "epoch": 1.01279697782724, + "grad_norm": 4.3125, + "learning_rate": 5.307362960701296e-06, + "loss": 1.0703146, + "memory(GiB)": 302.58, + "step": 181100, + "train_speed(iter/s)": 0.123714 + }, + { + "acc": 0.76953268, + "epoch": 1.0129088273002194, + "grad_norm": 4.96875, + "learning_rate": 5.306439999311622e-06, + "loss": 0.90709505, + "memory(GiB)": 302.58, + "step": 181120, + "train_speed(iter/s)": 0.123721 + }, + { + "acc": 0.74163289, + "epoch": 1.0130206767731986, + "grad_norm": 9.3125, + "learning_rate": 5.305517027440727e-06, + "loss": 1.01243305, + "memory(GiB)": 302.58, + "step": 181140, + "train_speed(iter/s)": 0.123728 + }, + { + "acc": 0.73910298, + "epoch": 1.013132526246178, + "grad_norm": 8.0625, + "learning_rate": 5.304594045120182e-06, + "loss": 1.04085484, + "memory(GiB)": 302.58, + "step": 181160, + "train_speed(iter/s)": 0.123734 + }, + { + "acc": 0.74721785, + "epoch": 1.0132443757191572, + "grad_norm": 8.25, + "learning_rate": 5.303671052381554e-06, + "loss": 0.982234, + "memory(GiB)": 302.58, + "step": 181180, + "train_speed(iter/s)": 0.12374 + }, + { + "acc": 0.75820198, + "epoch": 1.0133562251921364, + "grad_norm": 6.25, + "learning_rate": 5.302748049256414e-06, + "loss": 0.92104654, + "memory(GiB)": 302.58, + "step": 181200, + "train_speed(iter/s)": 0.123747 + }, + { + "acc": 0.76118965, + "epoch": 1.0134680746651157, + "grad_norm": 7.5625, + "learning_rate": 5.30182503577633e-06, + "loss": 0.92515059, + "memory(GiB)": 302.58, + "step": 181220, + "train_speed(iter/s)": 0.123754 + }, + { + "acc": 0.75776691, + "epoch": 1.013579924138095, + "grad_norm": 7.71875, + "learning_rate": 5.3009020119728725e-06, + "loss": 0.95491829, + "memory(GiB)": 302.58, + "step": 181240, + "train_speed(iter/s)": 0.12376 + }, + { + "acc": 0.76425514, + "epoch": 1.0136917736110742, + "grad_norm": 6.1875, + "learning_rate": 5.299978977877614e-06, + "loss": 0.91047716, + "memory(GiB)": 302.58, + "step": 181260, + "train_speed(iter/s)": 0.123767 + }, + { + "acc": 0.73973622, + "epoch": 1.0138036230840535, + "grad_norm": 7.65625, + "learning_rate": 5.29905593352212e-06, + "loss": 1.02536411, + "memory(GiB)": 302.58, + "step": 181280, + "train_speed(iter/s)": 0.123773 + }, + { + "acc": 0.77858658, + "epoch": 1.0139154725570327, + "grad_norm": 8.3125, + "learning_rate": 5.298132878937967e-06, + "loss": 0.85009327, + "memory(GiB)": 302.58, + "step": 181300, + "train_speed(iter/s)": 0.12378 + }, + { + "acc": 0.73974438, + "epoch": 1.014027322030012, + "grad_norm": 9.625, + "learning_rate": 5.297209814156724e-06, + "loss": 1.02123814, + "memory(GiB)": 302.58, + "step": 181320, + "train_speed(iter/s)": 0.123786 + }, + { + "acc": 0.74699287, + "epoch": 1.0141391715029913, + "grad_norm": 5.625, + "learning_rate": 5.296286739209962e-06, + "loss": 0.98892221, + "memory(GiB)": 302.58, + "step": 181340, + "train_speed(iter/s)": 0.123792 + }, + { + "acc": 0.75142021, + "epoch": 1.0142510209759705, + "grad_norm": 7.125, + "learning_rate": 5.295363654129255e-06, + "loss": 0.96485453, + "memory(GiB)": 302.58, + "step": 181360, + "train_speed(iter/s)": 0.123799 + }, + { + "acc": 0.76148467, + "epoch": 1.0143628704489498, + "grad_norm": 8.5625, + "learning_rate": 5.294440558946174e-06, + "loss": 0.90949917, + "memory(GiB)": 302.58, + "step": 181380, + "train_speed(iter/s)": 0.123805 + }, + { + "acc": 0.73936439, + "epoch": 1.014474719921929, + "grad_norm": 7.84375, + "learning_rate": 5.293517453692292e-06, + "loss": 1.04339867, + "memory(GiB)": 302.58, + "step": 181400, + "train_speed(iter/s)": 0.123811 + }, + { + "acc": 0.73587666, + "epoch": 1.0145865693949083, + "grad_norm": 5.9375, + "learning_rate": 5.292594338399183e-06, + "loss": 1.05535641, + "memory(GiB)": 302.58, + "step": 181420, + "train_speed(iter/s)": 0.123818 + }, + { + "acc": 0.74898949, + "epoch": 1.0146984188678876, + "grad_norm": 7.28125, + "learning_rate": 5.2916712130984195e-06, + "loss": 0.97843237, + "memory(GiB)": 302.58, + "step": 181440, + "train_speed(iter/s)": 0.123824 + }, + { + "acc": 0.73877063, + "epoch": 1.0148102683408669, + "grad_norm": 7.15625, + "learning_rate": 5.290748077821576e-06, + "loss": 1.04558134, + "memory(GiB)": 302.58, + "step": 181460, + "train_speed(iter/s)": 0.123831 + }, + { + "acc": 0.742554, + "epoch": 1.0149221178138461, + "grad_norm": 5.5, + "learning_rate": 5.289824932600226e-06, + "loss": 1.00353422, + "memory(GiB)": 302.58, + "step": 181480, + "train_speed(iter/s)": 0.123838 + }, + { + "acc": 0.73585591, + "epoch": 1.0150339672868254, + "grad_norm": 7.375, + "learning_rate": 5.288901777465945e-06, + "loss": 1.03346367, + "memory(GiB)": 302.58, + "step": 181500, + "train_speed(iter/s)": 0.123844 + }, + { + "acc": 0.75524974, + "epoch": 1.0151458167598046, + "grad_norm": 6.59375, + "learning_rate": 5.287978612450306e-06, + "loss": 0.95558128, + "memory(GiB)": 302.58, + "step": 181520, + "train_speed(iter/s)": 0.12385 + }, + { + "acc": 0.75470157, + "epoch": 1.015257666232784, + "grad_norm": 5.65625, + "learning_rate": 5.2870554375848845e-06, + "loss": 0.99199343, + "memory(GiB)": 302.58, + "step": 181540, + "train_speed(iter/s)": 0.123857 + }, + { + "acc": 0.76243591, + "epoch": 1.0153695157057632, + "grad_norm": 9.75, + "learning_rate": 5.286132252901258e-06, + "loss": 0.9223568, + "memory(GiB)": 302.58, + "step": 181560, + "train_speed(iter/s)": 0.123864 + }, + { + "acc": 0.74620304, + "epoch": 1.0154813651787424, + "grad_norm": 6.0, + "learning_rate": 5.285209058431001e-06, + "loss": 1.00934162, + "memory(GiB)": 302.58, + "step": 181580, + "train_speed(iter/s)": 0.12387 + }, + { + "acc": 0.77351179, + "epoch": 1.0155932146517217, + "grad_norm": 8.75, + "learning_rate": 5.284285854205689e-06, + "loss": 0.86851406, + "memory(GiB)": 302.58, + "step": 181600, + "train_speed(iter/s)": 0.123877 + }, + { + "acc": 0.73054204, + "epoch": 1.015705064124701, + "grad_norm": 7.15625, + "learning_rate": 5.283362640256902e-06, + "loss": 1.0649663, + "memory(GiB)": 302.58, + "step": 181620, + "train_speed(iter/s)": 0.123883 + }, + { + "acc": 0.75237908, + "epoch": 1.0158169135976802, + "grad_norm": 7.09375, + "learning_rate": 5.282439416616212e-06, + "loss": 0.95879545, + "memory(GiB)": 302.58, + "step": 181640, + "train_speed(iter/s)": 0.123889 + }, + { + "acc": 0.75640492, + "epoch": 1.0159287630706595, + "grad_norm": 6.0625, + "learning_rate": 5.2815161833152e-06, + "loss": 0.96798859, + "memory(GiB)": 302.58, + "step": 181660, + "train_speed(iter/s)": 0.123895 + }, + { + "acc": 0.74294128, + "epoch": 1.0160406125436388, + "grad_norm": 7.625, + "learning_rate": 5.280592940385441e-06, + "loss": 1.00981054, + "memory(GiB)": 302.58, + "step": 181680, + "train_speed(iter/s)": 0.123902 + }, + { + "acc": 0.74225807, + "epoch": 1.016152462016618, + "grad_norm": 6.1875, + "learning_rate": 5.279669687858513e-06, + "loss": 1.03206682, + "memory(GiB)": 302.58, + "step": 181700, + "train_speed(iter/s)": 0.123908 + }, + { + "acc": 0.74551096, + "epoch": 1.0162643114895973, + "grad_norm": 8.25, + "learning_rate": 5.278746425765994e-06, + "loss": 1.0267828, + "memory(GiB)": 302.58, + "step": 181720, + "train_speed(iter/s)": 0.123914 + }, + { + "acc": 0.74320021, + "epoch": 1.0163761609625765, + "grad_norm": 7.84375, + "learning_rate": 5.277823154139465e-06, + "loss": 1.01651754, + "memory(GiB)": 302.58, + "step": 181740, + "train_speed(iter/s)": 0.12392 + }, + { + "acc": 0.74846268, + "epoch": 1.0164880104355558, + "grad_norm": 5.78125, + "learning_rate": 5.2768998730105025e-06, + "loss": 0.97640829, + "memory(GiB)": 302.58, + "step": 181760, + "train_speed(iter/s)": 0.123927 + }, + { + "acc": 0.75894938, + "epoch": 1.016599859908535, + "grad_norm": 7.5, + "learning_rate": 5.275976582410686e-06, + "loss": 0.92815771, + "memory(GiB)": 302.58, + "step": 181780, + "train_speed(iter/s)": 0.123933 + }, + { + "acc": 0.74655576, + "epoch": 1.0167117093815143, + "grad_norm": 5.59375, + "learning_rate": 5.275053282371597e-06, + "loss": 0.98986263, + "memory(GiB)": 302.58, + "step": 181800, + "train_speed(iter/s)": 0.12394 + }, + { + "acc": 0.76535964, + "epoch": 1.0168235588544936, + "grad_norm": 4.75, + "learning_rate": 5.274129972924813e-06, + "loss": 0.90802307, + "memory(GiB)": 302.58, + "step": 181820, + "train_speed(iter/s)": 0.123946 + }, + { + "acc": 0.77281084, + "epoch": 1.0169354083274729, + "grad_norm": 7.5625, + "learning_rate": 5.273206654101914e-06, + "loss": 0.88824825, + "memory(GiB)": 302.58, + "step": 181840, + "train_speed(iter/s)": 0.123952 + }, + { + "acc": 0.74566817, + "epoch": 1.0170472578004521, + "grad_norm": 4.875, + "learning_rate": 5.2722833259344805e-06, + "loss": 0.99765873, + "memory(GiB)": 302.58, + "step": 181860, + "train_speed(iter/s)": 0.123958 + }, + { + "acc": 0.75331903, + "epoch": 1.0171591072734314, + "grad_norm": 6.625, + "learning_rate": 5.271359988454095e-06, + "loss": 0.9526022, + "memory(GiB)": 302.58, + "step": 181880, + "train_speed(iter/s)": 0.123965 + }, + { + "acc": 0.73793287, + "epoch": 1.0172709567464107, + "grad_norm": 7.5625, + "learning_rate": 5.270436641692338e-06, + "loss": 1.0458374, + "memory(GiB)": 302.58, + "step": 181900, + "train_speed(iter/s)": 0.123971 + }, + { + "acc": 0.74102125, + "epoch": 1.01738280621939, + "grad_norm": 9.8125, + "learning_rate": 5.26951328568079e-06, + "loss": 1.01578579, + "memory(GiB)": 302.58, + "step": 181920, + "train_speed(iter/s)": 0.123978 + }, + { + "acc": 0.75851555, + "epoch": 1.0174946556923692, + "grad_norm": 5.28125, + "learning_rate": 5.268589920451031e-06, + "loss": 0.92872095, + "memory(GiB)": 302.58, + "step": 181940, + "train_speed(iter/s)": 0.123984 + }, + { + "acc": 0.74509349, + "epoch": 1.0176065051653485, + "grad_norm": 5.3125, + "learning_rate": 5.267666546034648e-06, + "loss": 1.0204917, + "memory(GiB)": 302.58, + "step": 181960, + "train_speed(iter/s)": 0.12399 + }, + { + "acc": 0.7434217, + "epoch": 1.0177183546383277, + "grad_norm": 5.90625, + "learning_rate": 5.266743162463217e-06, + "loss": 1.00958471, + "memory(GiB)": 302.58, + "step": 181980, + "train_speed(iter/s)": 0.123997 + }, + { + "acc": 0.76587605, + "epoch": 1.017830204111307, + "grad_norm": 5.71875, + "learning_rate": 5.265819769768326e-06, + "loss": 0.90026436, + "memory(GiB)": 302.58, + "step": 182000, + "train_speed(iter/s)": 0.124003 + }, + { + "epoch": 1.017830204111307, + "eval_acc": 0.706327853798352, + "eval_loss": 1.014357328414917, + "eval_runtime": 7528.602, + "eval_samples_per_second": 10.0, + "eval_steps_per_second": 10.0, + "step": 182000 + }, + { + "acc": 0.74405422, + "epoch": 1.0179420535842862, + "grad_norm": 6.78125, + "learning_rate": 5.2648963679815564e-06, + "loss": 1.00658026, + "memory(GiB)": 302.58, + "step": 182020, + "train_speed(iter/s)": 0.123367 + }, + { + "acc": 0.76856999, + "epoch": 1.0180539030572655, + "grad_norm": 6.3125, + "learning_rate": 5.263972957134491e-06, + "loss": 0.91906233, + "memory(GiB)": 302.58, + "step": 182040, + "train_speed(iter/s)": 0.123374 + }, + { + "acc": 0.75086646, + "epoch": 1.0181657525302448, + "grad_norm": 7.75, + "learning_rate": 5.263049537258713e-06, + "loss": 0.97072239, + "memory(GiB)": 302.58, + "step": 182060, + "train_speed(iter/s)": 0.12338 + }, + { + "acc": 0.75438766, + "epoch": 1.018277602003224, + "grad_norm": 7.5625, + "learning_rate": 5.262126108385807e-06, + "loss": 0.94343901, + "memory(GiB)": 302.58, + "step": 182080, + "train_speed(iter/s)": 0.123386 + }, + { + "acc": 0.75668197, + "epoch": 1.0183894514762033, + "grad_norm": 8.625, + "learning_rate": 5.261202670547356e-06, + "loss": 0.95574808, + "memory(GiB)": 302.58, + "step": 182100, + "train_speed(iter/s)": 0.123392 + }, + { + "acc": 0.74086628, + "epoch": 1.0185013009491826, + "grad_norm": 7.96875, + "learning_rate": 5.260279223774945e-06, + "loss": 1.01279755, + "memory(GiB)": 302.58, + "step": 182120, + "train_speed(iter/s)": 0.123399 + }, + { + "acc": 0.74175372, + "epoch": 1.0186131504221618, + "grad_norm": 6.0, + "learning_rate": 5.259355768100161e-06, + "loss": 1.02186003, + "memory(GiB)": 302.58, + "step": 182140, + "train_speed(iter/s)": 0.123405 + }, + { + "acc": 0.74990606, + "epoch": 1.018724999895141, + "grad_norm": 8.875, + "learning_rate": 5.258432303554587e-06, + "loss": 0.98300638, + "memory(GiB)": 302.58, + "step": 182160, + "train_speed(iter/s)": 0.12341 + }, + { + "acc": 0.76273489, + "epoch": 1.0188368493681204, + "grad_norm": 8.9375, + "learning_rate": 5.257508830169807e-06, + "loss": 0.94344072, + "memory(GiB)": 302.58, + "step": 182180, + "train_speed(iter/s)": 0.123417 + }, + { + "acc": 0.76402183, + "epoch": 1.0189486988410996, + "grad_norm": 6.71875, + "learning_rate": 5.256585347977411e-06, + "loss": 0.91371441, + "memory(GiB)": 302.58, + "step": 182200, + "train_speed(iter/s)": 0.123423 + }, + { + "acc": 0.76190047, + "epoch": 1.0190605483140789, + "grad_norm": 7.84375, + "learning_rate": 5.2556618570089815e-06, + "loss": 0.94451962, + "memory(GiB)": 302.58, + "step": 182220, + "train_speed(iter/s)": 0.123429 + }, + { + "acc": 0.76034441, + "epoch": 1.0191723977870581, + "grad_norm": 7.875, + "learning_rate": 5.254738357296105e-06, + "loss": 0.95032616, + "memory(GiB)": 302.58, + "step": 182240, + "train_speed(iter/s)": 0.123435 + }, + { + "acc": 0.77104564, + "epoch": 1.0192842472600374, + "grad_norm": 12.4375, + "learning_rate": 5.25381484887037e-06, + "loss": 0.90776806, + "memory(GiB)": 302.58, + "step": 182260, + "train_speed(iter/s)": 0.123441 + }, + { + "acc": 0.7503861, + "epoch": 1.0193960967330167, + "grad_norm": 5.0625, + "learning_rate": 5.2528913317633615e-06, + "loss": 0.98596859, + "memory(GiB)": 302.58, + "step": 182280, + "train_speed(iter/s)": 0.123448 + }, + { + "acc": 0.73601303, + "epoch": 1.019507946205996, + "grad_norm": 7.4375, + "learning_rate": 5.251967806006667e-06, + "loss": 1.05376644, + "memory(GiB)": 302.58, + "step": 182300, + "train_speed(iter/s)": 0.123454 + }, + { + "acc": 0.75350246, + "epoch": 1.0196197956789752, + "grad_norm": 6.8125, + "learning_rate": 5.251044271631877e-06, + "loss": 0.96009359, + "memory(GiB)": 302.58, + "step": 182320, + "train_speed(iter/s)": 0.123461 + }, + { + "acc": 0.7458189, + "epoch": 1.0197316451519545, + "grad_norm": 8.875, + "learning_rate": 5.2501207286705745e-06, + "loss": 1.00592213, + "memory(GiB)": 302.58, + "step": 182340, + "train_speed(iter/s)": 0.123467 + }, + { + "acc": 0.75561547, + "epoch": 1.0198434946249337, + "grad_norm": 7.875, + "learning_rate": 5.249197177154351e-06, + "loss": 0.96655579, + "memory(GiB)": 302.58, + "step": 182360, + "train_speed(iter/s)": 0.123473 + }, + { + "acc": 0.74161158, + "epoch": 1.019955344097913, + "grad_norm": 9.5, + "learning_rate": 5.248273617114794e-06, + "loss": 1.02293558, + "memory(GiB)": 302.58, + "step": 182380, + "train_speed(iter/s)": 0.12348 + }, + { + "acc": 0.74747505, + "epoch": 1.0200671935708923, + "grad_norm": 6.90625, + "learning_rate": 5.247350048583491e-06, + "loss": 1.01808138, + "memory(GiB)": 302.58, + "step": 182400, + "train_speed(iter/s)": 0.123486 + }, + { + "acc": 0.75267954, + "epoch": 1.0201790430438715, + "grad_norm": 6.875, + "learning_rate": 5.246426471592033e-06, + "loss": 0.98650589, + "memory(GiB)": 302.58, + "step": 182420, + "train_speed(iter/s)": 0.123492 + }, + { + "acc": 0.73704891, + "epoch": 1.0202908925168508, + "grad_norm": 11.0625, + "learning_rate": 5.245502886172008e-06, + "loss": 1.03506174, + "memory(GiB)": 302.58, + "step": 182440, + "train_speed(iter/s)": 0.123498 + }, + { + "acc": 0.74028282, + "epoch": 1.02040274198983, + "grad_norm": 8.3125, + "learning_rate": 5.244579292355006e-06, + "loss": 1.03993006, + "memory(GiB)": 302.58, + "step": 182460, + "train_speed(iter/s)": 0.123504 + }, + { + "acc": 0.75443969, + "epoch": 1.0205145914628093, + "grad_norm": 6.6875, + "learning_rate": 5.243655690172616e-06, + "loss": 0.94413099, + "memory(GiB)": 302.58, + "step": 182480, + "train_speed(iter/s)": 0.12351 + }, + { + "acc": 0.75522718, + "epoch": 1.0206264409357886, + "grad_norm": 7.78125, + "learning_rate": 5.242732079656431e-06, + "loss": 0.96151552, + "memory(GiB)": 302.58, + "step": 182500, + "train_speed(iter/s)": 0.123517 + }, + { + "acc": 0.74192033, + "epoch": 1.0207382904087678, + "grad_norm": 7.125, + "learning_rate": 5.241808460838037e-06, + "loss": 1.00995474, + "memory(GiB)": 302.58, + "step": 182520, + "train_speed(iter/s)": 0.123523 + }, + { + "acc": 0.76358299, + "epoch": 1.020850139881747, + "grad_norm": 5.96875, + "learning_rate": 5.240884833749027e-06, + "loss": 0.93470325, + "memory(GiB)": 302.58, + "step": 182540, + "train_speed(iter/s)": 0.12353 + }, + { + "acc": 0.73979039, + "epoch": 1.0209619893547264, + "grad_norm": 4.71875, + "learning_rate": 5.239961198420992e-06, + "loss": 1.0268301, + "memory(GiB)": 302.58, + "step": 182560, + "train_speed(iter/s)": 0.123536 + }, + { + "acc": 0.77463045, + "epoch": 1.0210738388277056, + "grad_norm": 10.0625, + "learning_rate": 5.239037554885524e-06, + "loss": 0.85268641, + "memory(GiB)": 302.58, + "step": 182580, + "train_speed(iter/s)": 0.123542 + }, + { + "acc": 0.76753697, + "epoch": 1.021185688300685, + "grad_norm": 8.5, + "learning_rate": 5.238113903174213e-06, + "loss": 0.92217045, + "memory(GiB)": 302.58, + "step": 182600, + "train_speed(iter/s)": 0.123549 + }, + { + "acc": 0.76786666, + "epoch": 1.0212975377736642, + "grad_norm": 6.375, + "learning_rate": 5.23719024331865e-06, + "loss": 0.92795143, + "memory(GiB)": 302.58, + "step": 182620, + "train_speed(iter/s)": 0.123555 + }, + { + "acc": 0.7495523, + "epoch": 1.0214093872466434, + "grad_norm": 5.46875, + "learning_rate": 5.236266575350432e-06, + "loss": 1.01121225, + "memory(GiB)": 302.58, + "step": 182640, + "train_speed(iter/s)": 0.123561 + }, + { + "acc": 0.76308107, + "epoch": 1.0215212367196227, + "grad_norm": 9.1875, + "learning_rate": 5.235342899301146e-06, + "loss": 0.91701212, + "memory(GiB)": 302.58, + "step": 182660, + "train_speed(iter/s)": 0.123567 + }, + { + "acc": 0.77792544, + "epoch": 1.021633086192602, + "grad_norm": 7.84375, + "learning_rate": 5.234419215202387e-06, + "loss": 0.85695724, + "memory(GiB)": 302.58, + "step": 182680, + "train_speed(iter/s)": 0.123573 + }, + { + "acc": 0.76996355, + "epoch": 1.0217449356655812, + "grad_norm": 5.6875, + "learning_rate": 5.233495523085745e-06, + "loss": 0.89791746, + "memory(GiB)": 302.58, + "step": 182700, + "train_speed(iter/s)": 0.123579 + }, + { + "acc": 0.74236264, + "epoch": 1.0218567851385605, + "grad_norm": 6.96875, + "learning_rate": 5.232571822982818e-06, + "loss": 0.99183369, + "memory(GiB)": 302.58, + "step": 182720, + "train_speed(iter/s)": 0.123586 + }, + { + "acc": 0.75354614, + "epoch": 1.0219686346115398, + "grad_norm": 9.3125, + "learning_rate": 5.231648114925197e-06, + "loss": 0.9746068, + "memory(GiB)": 302.58, + "step": 182740, + "train_speed(iter/s)": 0.123592 + }, + { + "acc": 0.75812621, + "epoch": 1.022080484084519, + "grad_norm": 7.1875, + "learning_rate": 5.230724398944477e-06, + "loss": 0.95787563, + "memory(GiB)": 302.58, + "step": 182760, + "train_speed(iter/s)": 0.123598 + }, + { + "acc": 0.74901528, + "epoch": 1.0221923335574983, + "grad_norm": 5.75, + "learning_rate": 5.22980067507225e-06, + "loss": 0.97042074, + "memory(GiB)": 302.58, + "step": 182780, + "train_speed(iter/s)": 0.123605 + }, + { + "acc": 0.74912691, + "epoch": 1.0223041830304775, + "grad_norm": 8.125, + "learning_rate": 5.228876943340111e-06, + "loss": 1.00804462, + "memory(GiB)": 302.58, + "step": 182800, + "train_speed(iter/s)": 0.123612 + }, + { + "acc": 0.74219117, + "epoch": 1.0224160325034568, + "grad_norm": 6.5625, + "learning_rate": 5.227953203779654e-06, + "loss": 1.01772079, + "memory(GiB)": 302.58, + "step": 182820, + "train_speed(iter/s)": 0.123618 + }, + { + "acc": 0.74785013, + "epoch": 1.022527881976436, + "grad_norm": 8.25, + "learning_rate": 5.227029456422476e-06, + "loss": 0.9653429, + "memory(GiB)": 302.58, + "step": 182840, + "train_speed(iter/s)": 0.123625 + }, + { + "acc": 0.75500226, + "epoch": 1.0226397314494153, + "grad_norm": 6.71875, + "learning_rate": 5.226105701300169e-06, + "loss": 0.95618954, + "memory(GiB)": 302.58, + "step": 182860, + "train_speed(iter/s)": 0.123631 + }, + { + "acc": 0.75934606, + "epoch": 1.0227515809223946, + "grad_norm": 6.5, + "learning_rate": 5.225181938444331e-06, + "loss": 0.96873875, + "memory(GiB)": 302.58, + "step": 182880, + "train_speed(iter/s)": 0.123638 + }, + { + "acc": 0.74442487, + "epoch": 1.0228634303953739, + "grad_norm": 8.6875, + "learning_rate": 5.224258167886558e-06, + "loss": 0.99503498, + "memory(GiB)": 302.58, + "step": 182900, + "train_speed(iter/s)": 0.123645 + }, + { + "acc": 0.76236763, + "epoch": 1.0229752798683531, + "grad_norm": 6.28125, + "learning_rate": 5.223334389658443e-06, + "loss": 0.91596127, + "memory(GiB)": 302.58, + "step": 182920, + "train_speed(iter/s)": 0.123652 + }, + { + "acc": 0.75717211, + "epoch": 1.0230871293413324, + "grad_norm": 9.625, + "learning_rate": 5.222410603791584e-06, + "loss": 0.93247967, + "memory(GiB)": 302.58, + "step": 182940, + "train_speed(iter/s)": 0.123658 + }, + { + "acc": 0.7665936, + "epoch": 1.0231989788143117, + "grad_norm": 9.0, + "learning_rate": 5.221486810317577e-06, + "loss": 0.91639757, + "memory(GiB)": 302.58, + "step": 182960, + "train_speed(iter/s)": 0.123665 + }, + { + "acc": 0.75135937, + "epoch": 1.023310828287291, + "grad_norm": 6.5, + "learning_rate": 5.22056300926802e-06, + "loss": 0.97319708, + "memory(GiB)": 302.58, + "step": 182980, + "train_speed(iter/s)": 0.123671 + }, + { + "acc": 0.74962244, + "epoch": 1.0234226777602702, + "grad_norm": 6.15625, + "learning_rate": 5.219639200674507e-06, + "loss": 0.98160057, + "memory(GiB)": 302.58, + "step": 183000, + "train_speed(iter/s)": 0.123677 + }, + { + "acc": 0.73890944, + "epoch": 1.0235345272332494, + "grad_norm": 7.40625, + "learning_rate": 5.218715384568638e-06, + "loss": 1.03211823, + "memory(GiB)": 302.58, + "step": 183020, + "train_speed(iter/s)": 0.123684 + }, + { + "acc": 0.74201655, + "epoch": 1.0236463767062287, + "grad_norm": 10.25, + "learning_rate": 5.2177915609820095e-06, + "loss": 1.00944414, + "memory(GiB)": 302.58, + "step": 183040, + "train_speed(iter/s)": 0.12369 + }, + { + "acc": 0.7459281, + "epoch": 1.023758226179208, + "grad_norm": 6.6875, + "learning_rate": 5.216867729946218e-06, + "loss": 1.02461958, + "memory(GiB)": 302.58, + "step": 183060, + "train_speed(iter/s)": 0.123696 + }, + { + "acc": 0.75471344, + "epoch": 1.0238700756521872, + "grad_norm": 8.375, + "learning_rate": 5.215943891492864e-06, + "loss": 0.96376505, + "memory(GiB)": 302.58, + "step": 183080, + "train_speed(iter/s)": 0.123703 + }, + { + "acc": 0.74682541, + "epoch": 1.0239819251251665, + "grad_norm": 6.84375, + "learning_rate": 5.2150200456535425e-06, + "loss": 1.00386763, + "memory(GiB)": 302.58, + "step": 183100, + "train_speed(iter/s)": 0.123709 + }, + { + "acc": 0.74525647, + "epoch": 1.0240937745981458, + "grad_norm": 9.625, + "learning_rate": 5.2140961924598545e-06, + "loss": 1.0082221, + "memory(GiB)": 302.58, + "step": 183120, + "train_speed(iter/s)": 0.123715 + }, + { + "acc": 0.78026485, + "epoch": 1.024205624071125, + "grad_norm": 6.03125, + "learning_rate": 5.2131723319433965e-06, + "loss": 0.85867233, + "memory(GiB)": 302.58, + "step": 183140, + "train_speed(iter/s)": 0.123722 + }, + { + "acc": 0.74797411, + "epoch": 1.0243174735441043, + "grad_norm": 6.0625, + "learning_rate": 5.21224846413577e-06, + "loss": 1.01550837, + "memory(GiB)": 302.58, + "step": 183160, + "train_speed(iter/s)": 0.123728 + }, + { + "acc": 0.75240479, + "epoch": 1.0244293230170836, + "grad_norm": 8.9375, + "learning_rate": 5.211324589068574e-06, + "loss": 0.97552509, + "memory(GiB)": 302.58, + "step": 183180, + "train_speed(iter/s)": 0.123735 + }, + { + "acc": 0.76694698, + "epoch": 1.0245411724900628, + "grad_norm": 6.71875, + "learning_rate": 5.210400706773405e-06, + "loss": 0.92265863, + "memory(GiB)": 302.58, + "step": 183200, + "train_speed(iter/s)": 0.123741 + }, + { + "acc": 0.7574018, + "epoch": 1.024653021963042, + "grad_norm": 7.1875, + "learning_rate": 5.209476817281866e-06, + "loss": 0.94159737, + "memory(GiB)": 302.58, + "step": 183220, + "train_speed(iter/s)": 0.123748 + }, + { + "acc": 0.75321989, + "epoch": 1.0247648714360214, + "grad_norm": 6.71875, + "learning_rate": 5.208552920625556e-06, + "loss": 0.97294731, + "memory(GiB)": 302.58, + "step": 183240, + "train_speed(iter/s)": 0.123754 + }, + { + "acc": 0.7477644, + "epoch": 1.0248767209090006, + "grad_norm": 5.75, + "learning_rate": 5.207629016836073e-06, + "loss": 1.00630045, + "memory(GiB)": 302.58, + "step": 183260, + "train_speed(iter/s)": 0.123761 + }, + { + "acc": 0.75092988, + "epoch": 1.0249885703819799, + "grad_norm": 7.125, + "learning_rate": 5.20670510594502e-06, + "loss": 0.96448374, + "memory(GiB)": 302.58, + "step": 183280, + "train_speed(iter/s)": 0.123767 + }, + { + "acc": 0.74853978, + "epoch": 1.0251004198549591, + "grad_norm": 6.4375, + "learning_rate": 5.205781187983997e-06, + "loss": 0.97679081, + "memory(GiB)": 302.58, + "step": 183300, + "train_speed(iter/s)": 0.123773 + }, + { + "acc": 0.75256495, + "epoch": 1.0252122693279384, + "grad_norm": 8.0, + "learning_rate": 5.204857262984605e-06, + "loss": 0.98089724, + "memory(GiB)": 302.58, + "step": 183320, + "train_speed(iter/s)": 0.123779 + }, + { + "acc": 0.74694619, + "epoch": 1.0253241188009177, + "grad_norm": 7.9375, + "learning_rate": 5.203933330978446e-06, + "loss": 1.00849266, + "memory(GiB)": 302.58, + "step": 183340, + "train_speed(iter/s)": 0.123786 + }, + { + "acc": 0.73930087, + "epoch": 1.025435968273897, + "grad_norm": 4.6875, + "learning_rate": 5.203009391997119e-06, + "loss": 1.03341885, + "memory(GiB)": 302.58, + "step": 183360, + "train_speed(iter/s)": 0.123792 + }, + { + "acc": 0.72948513, + "epoch": 1.0255478177468762, + "grad_norm": 8.375, + "learning_rate": 5.202085446072228e-06, + "loss": 1.06389055, + "memory(GiB)": 302.58, + "step": 183380, + "train_speed(iter/s)": 0.123799 + }, + { + "acc": 0.7635015, + "epoch": 1.0256596672198555, + "grad_norm": 5.9375, + "learning_rate": 5.201161493235373e-06, + "loss": 0.92234621, + "memory(GiB)": 302.58, + "step": 183400, + "train_speed(iter/s)": 0.123804 + }, + { + "acc": 0.73364387, + "epoch": 1.0257715166928347, + "grad_norm": 7.84375, + "learning_rate": 5.2002375335181575e-06, + "loss": 1.0597147, + "memory(GiB)": 302.58, + "step": 183420, + "train_speed(iter/s)": 0.123811 + }, + { + "acc": 0.74377718, + "epoch": 1.025883366165814, + "grad_norm": 8.5, + "learning_rate": 5.1993135669521826e-06, + "loss": 1.02151613, + "memory(GiB)": 302.58, + "step": 183440, + "train_speed(iter/s)": 0.123817 + }, + { + "acc": 0.7537756, + "epoch": 1.0259952156387933, + "grad_norm": 7.0, + "learning_rate": 5.198389593569052e-06, + "loss": 0.9571949, + "memory(GiB)": 302.58, + "step": 183460, + "train_speed(iter/s)": 0.123823 + }, + { + "acc": 0.74372177, + "epoch": 1.0261070651117725, + "grad_norm": 7.6875, + "learning_rate": 5.197465613400369e-06, + "loss": 1.01105947, + "memory(GiB)": 302.58, + "step": 183480, + "train_speed(iter/s)": 0.12383 + }, + { + "acc": 0.7571557, + "epoch": 1.0262189145847518, + "grad_norm": 9.0, + "learning_rate": 5.196541626477736e-06, + "loss": 0.95278959, + "memory(GiB)": 302.58, + "step": 183500, + "train_speed(iter/s)": 0.123836 + }, + { + "acc": 0.7405757, + "epoch": 1.026330764057731, + "grad_norm": 8.0625, + "learning_rate": 5.195617632832756e-06, + "loss": 1.01705732, + "memory(GiB)": 302.58, + "step": 183520, + "train_speed(iter/s)": 0.123843 + }, + { + "acc": 0.73950448, + "epoch": 1.0264426135307103, + "grad_norm": 5.53125, + "learning_rate": 5.194693632497032e-06, + "loss": 1.02629051, + "memory(GiB)": 302.58, + "step": 183540, + "train_speed(iter/s)": 0.123848 + }, + { + "acc": 0.74690094, + "epoch": 1.0265544630036896, + "grad_norm": 8.4375, + "learning_rate": 5.19376962550217e-06, + "loss": 1.02383862, + "memory(GiB)": 302.58, + "step": 183560, + "train_speed(iter/s)": 0.123855 + }, + { + "acc": 0.75931301, + "epoch": 1.0266663124766688, + "grad_norm": 8.5625, + "learning_rate": 5.19284561187977e-06, + "loss": 0.94384003, + "memory(GiB)": 302.58, + "step": 183580, + "train_speed(iter/s)": 0.123861 + }, + { + "acc": 0.76537514, + "epoch": 1.026778161949648, + "grad_norm": 7.46875, + "learning_rate": 5.19192159166144e-06, + "loss": 0.89166622, + "memory(GiB)": 302.58, + "step": 183600, + "train_speed(iter/s)": 0.123867 + }, + { + "acc": 0.74581661, + "epoch": 1.0268900114226274, + "grad_norm": 6.90625, + "learning_rate": 5.190997564878782e-06, + "loss": 1.01142645, + "memory(GiB)": 302.58, + "step": 183620, + "train_speed(iter/s)": 0.123874 + }, + { + "acc": 0.772157, + "epoch": 1.0270018608956066, + "grad_norm": 7.78125, + "learning_rate": 5.1900735315634034e-06, + "loss": 0.88620586, + "memory(GiB)": 302.58, + "step": 183640, + "train_speed(iter/s)": 0.12388 + }, + { + "acc": 0.74039149, + "epoch": 1.027113710368586, + "grad_norm": 6.6875, + "learning_rate": 5.189149491746906e-06, + "loss": 1.02512102, + "memory(GiB)": 302.58, + "step": 183660, + "train_speed(iter/s)": 0.123886 + }, + { + "acc": 0.74605055, + "epoch": 1.0272255598415652, + "grad_norm": 6.46875, + "learning_rate": 5.188225445460897e-06, + "loss": 1.00417252, + "memory(GiB)": 302.58, + "step": 183680, + "train_speed(iter/s)": 0.123892 + }, + { + "acc": 0.76029654, + "epoch": 1.0273374093145444, + "grad_norm": 5.1875, + "learning_rate": 5.187301392736981e-06, + "loss": 0.94357777, + "memory(GiB)": 302.58, + "step": 183700, + "train_speed(iter/s)": 0.123899 + }, + { + "acc": 0.735256, + "epoch": 1.0274492587875237, + "grad_norm": 12.1875, + "learning_rate": 5.186377333606763e-06, + "loss": 1.0525301, + "memory(GiB)": 302.58, + "step": 183720, + "train_speed(iter/s)": 0.123905 + }, + { + "acc": 0.74205079, + "epoch": 1.027561108260503, + "grad_norm": 8.875, + "learning_rate": 5.185453268101849e-06, + "loss": 1.01106033, + "memory(GiB)": 302.58, + "step": 183740, + "train_speed(iter/s)": 0.123911 + }, + { + "acc": 0.75779629, + "epoch": 1.0276729577334822, + "grad_norm": 6.15625, + "learning_rate": 5.1845291962538445e-06, + "loss": 0.95860319, + "memory(GiB)": 302.58, + "step": 183760, + "train_speed(iter/s)": 0.123917 + }, + { + "acc": 0.73078609, + "epoch": 1.0277848072064615, + "grad_norm": 6.0, + "learning_rate": 5.183605118094358e-06, + "loss": 1.04705276, + "memory(GiB)": 302.58, + "step": 183780, + "train_speed(iter/s)": 0.123924 + }, + { + "acc": 0.75060759, + "epoch": 1.0278966566794407, + "grad_norm": 9.0625, + "learning_rate": 5.182681033654994e-06, + "loss": 0.98282642, + "memory(GiB)": 302.58, + "step": 183800, + "train_speed(iter/s)": 0.123931 + }, + { + "acc": 0.74302278, + "epoch": 1.02800850615242, + "grad_norm": 7.125, + "learning_rate": 5.181756942967359e-06, + "loss": 0.99778805, + "memory(GiB)": 302.58, + "step": 183820, + "train_speed(iter/s)": 0.123937 + }, + { + "acc": 0.75807042, + "epoch": 1.0281203556253993, + "grad_norm": 7.40625, + "learning_rate": 5.180832846063059e-06, + "loss": 0.95252266, + "memory(GiB)": 302.58, + "step": 183840, + "train_speed(iter/s)": 0.123943 + }, + { + "acc": 0.75730147, + "epoch": 1.0282322050983785, + "grad_norm": 7.59375, + "learning_rate": 5.179908742973704e-06, + "loss": 0.96866865, + "memory(GiB)": 302.58, + "step": 183860, + "train_speed(iter/s)": 0.12395 + }, + { + "acc": 0.73206139, + "epoch": 1.0283440545713578, + "grad_norm": 4.5625, + "learning_rate": 5.178984633730898e-06, + "loss": 1.06971569, + "memory(GiB)": 302.58, + "step": 183880, + "train_speed(iter/s)": 0.123956 + }, + { + "acc": 0.75391359, + "epoch": 1.028455904044337, + "grad_norm": 6.75, + "learning_rate": 5.178060518366252e-06, + "loss": 0.97640171, + "memory(GiB)": 302.58, + "step": 183900, + "train_speed(iter/s)": 0.123962 + }, + { + "acc": 0.74570804, + "epoch": 1.0285677535173163, + "grad_norm": 5.0625, + "learning_rate": 5.17713639691137e-06, + "loss": 1.00590239, + "memory(GiB)": 302.58, + "step": 183920, + "train_speed(iter/s)": 0.123969 + }, + { + "acc": 0.75860963, + "epoch": 1.0286796029902956, + "grad_norm": 9.9375, + "learning_rate": 5.176212269397862e-06, + "loss": 0.92729645, + "memory(GiB)": 302.58, + "step": 183940, + "train_speed(iter/s)": 0.123975 + }, + { + "acc": 0.73918705, + "epoch": 1.0287914524632749, + "grad_norm": 9.5, + "learning_rate": 5.175288135857336e-06, + "loss": 1.03451223, + "memory(GiB)": 302.58, + "step": 183960, + "train_speed(iter/s)": 0.123981 + }, + { + "acc": 0.75034881, + "epoch": 1.0289033019362541, + "grad_norm": 7.21875, + "learning_rate": 5.174363996321398e-06, + "loss": 0.99315252, + "memory(GiB)": 302.58, + "step": 183980, + "train_speed(iter/s)": 0.123988 + }, + { + "acc": 0.75992188, + "epoch": 1.0290151514092334, + "grad_norm": 5.5, + "learning_rate": 5.173439850821661e-06, + "loss": 0.94532242, + "memory(GiB)": 302.58, + "step": 184000, + "train_speed(iter/s)": 0.123994 + }, + { + "epoch": 1.0290151514092334, + "eval_acc": 0.7063592558512081, + "eval_loss": 1.014323115348816, + "eval_runtime": 7494.1614, + "eval_samples_per_second": 10.046, + "eval_steps_per_second": 10.046, + "step": 184000 + }, + { + "acc": 0.77220511, + "epoch": 1.0291270008822127, + "grad_norm": 8.125, + "learning_rate": 5.17251569938973e-06, + "loss": 0.88162203, + "memory(GiB)": 302.58, + "step": 184020, + "train_speed(iter/s)": 0.123368 + }, + { + "acc": 0.75654893, + "epoch": 1.029238850355192, + "grad_norm": 6.9375, + "learning_rate": 5.171591542057216e-06, + "loss": 0.95289459, + "memory(GiB)": 302.58, + "step": 184040, + "train_speed(iter/s)": 0.123375 + }, + { + "acc": 0.74456162, + "epoch": 1.0293506998281712, + "grad_norm": 7.53125, + "learning_rate": 5.170667378855725e-06, + "loss": 0.99042158, + "memory(GiB)": 302.58, + "step": 184060, + "train_speed(iter/s)": 0.123381 + }, + { + "acc": 0.7506537, + "epoch": 1.0294625493011504, + "grad_norm": 9.3125, + "learning_rate": 5.1697432098168694e-06, + "loss": 0.99218159, + "memory(GiB)": 302.58, + "step": 184080, + "train_speed(iter/s)": 0.123388 + }, + { + "acc": 0.75453362, + "epoch": 1.0295743987741297, + "grad_norm": 8.0, + "learning_rate": 5.168819034972259e-06, + "loss": 0.96491423, + "memory(GiB)": 302.58, + "step": 184100, + "train_speed(iter/s)": 0.123394 + }, + { + "acc": 0.7521924, + "epoch": 1.029686248247109, + "grad_norm": 7.0, + "learning_rate": 5.167894854353503e-06, + "loss": 0.9659523, + "memory(GiB)": 302.58, + "step": 184120, + "train_speed(iter/s)": 0.1234 + }, + { + "acc": 0.76387672, + "epoch": 1.0297980977200882, + "grad_norm": 8.8125, + "learning_rate": 5.166970667992207e-06, + "loss": 0.91414499, + "memory(GiB)": 302.58, + "step": 184140, + "train_speed(iter/s)": 0.123406 + }, + { + "acc": 0.73714771, + "epoch": 1.0299099471930675, + "grad_norm": 8.125, + "learning_rate": 5.1660464759199865e-06, + "loss": 1.0303545, + "memory(GiB)": 302.58, + "step": 184160, + "train_speed(iter/s)": 0.123413 + }, + { + "acc": 0.75440922, + "epoch": 1.0300217966660468, + "grad_norm": 4.53125, + "learning_rate": 5.165122278168451e-06, + "loss": 0.95155306, + "memory(GiB)": 302.58, + "step": 184180, + "train_speed(iter/s)": 0.123419 + }, + { + "acc": 0.73412571, + "epoch": 1.030133646139026, + "grad_norm": 8.5625, + "learning_rate": 5.16419807476921e-06, + "loss": 1.04170485, + "memory(GiB)": 302.58, + "step": 184200, + "train_speed(iter/s)": 0.123425 + }, + { + "acc": 0.75530114, + "epoch": 1.0302454956120053, + "grad_norm": 6.09375, + "learning_rate": 5.163273865753873e-06, + "loss": 0.9402544, + "memory(GiB)": 302.58, + "step": 184220, + "train_speed(iter/s)": 0.123432 + }, + { + "acc": 0.74442167, + "epoch": 1.0303573450849846, + "grad_norm": 8.625, + "learning_rate": 5.162349651154053e-06, + "loss": 0.99226656, + "memory(GiB)": 302.58, + "step": 184240, + "train_speed(iter/s)": 0.123439 + }, + { + "acc": 0.75489159, + "epoch": 1.0304691945579638, + "grad_norm": 9.25, + "learning_rate": 5.16142543100136e-06, + "loss": 0.97533007, + "memory(GiB)": 302.58, + "step": 184260, + "train_speed(iter/s)": 0.123445 + }, + { + "acc": 0.76040444, + "epoch": 1.030581044030943, + "grad_norm": 7.8125, + "learning_rate": 5.160501205327404e-06, + "loss": 0.94354773, + "memory(GiB)": 302.58, + "step": 184280, + "train_speed(iter/s)": 0.123451 + }, + { + "acc": 0.76875558, + "epoch": 1.0306928935039223, + "grad_norm": 8.0, + "learning_rate": 5.1595769741637995e-06, + "loss": 0.9049324, + "memory(GiB)": 302.58, + "step": 184300, + "train_speed(iter/s)": 0.123457 + }, + { + "acc": 0.73935804, + "epoch": 1.0308047429769016, + "grad_norm": 6.78125, + "learning_rate": 5.1586527375421555e-06, + "loss": 1.04283342, + "memory(GiB)": 302.58, + "step": 184320, + "train_speed(iter/s)": 0.123464 + }, + { + "acc": 0.74917493, + "epoch": 1.0309165924498809, + "grad_norm": 8.125, + "learning_rate": 5.157728495494086e-06, + "loss": 0.99767504, + "memory(GiB)": 302.58, + "step": 184340, + "train_speed(iter/s)": 0.123469 + }, + { + "acc": 0.73642759, + "epoch": 1.0310284419228601, + "grad_norm": 6.46875, + "learning_rate": 5.1568042480512e-06, + "loss": 1.03607111, + "memory(GiB)": 302.58, + "step": 184360, + "train_speed(iter/s)": 0.123475 + }, + { + "acc": 0.74232979, + "epoch": 1.0311402913958394, + "grad_norm": 8.9375, + "learning_rate": 5.155879995245113e-06, + "loss": 1.00474577, + "memory(GiB)": 302.58, + "step": 184380, + "train_speed(iter/s)": 0.123482 + }, + { + "acc": 0.74171343, + "epoch": 1.0312521408688187, + "grad_norm": 6.1875, + "learning_rate": 5.1549557371074364e-06, + "loss": 0.99612265, + "memory(GiB)": 302.58, + "step": 184400, + "train_speed(iter/s)": 0.123488 + }, + { + "acc": 0.76777005, + "epoch": 1.031363990341798, + "grad_norm": 6.25, + "learning_rate": 5.1540314736697805e-06, + "loss": 0.90686054, + "memory(GiB)": 302.58, + "step": 184420, + "train_speed(iter/s)": 0.123495 + }, + { + "acc": 0.75375795, + "epoch": 1.0314758398147772, + "grad_norm": 4.5, + "learning_rate": 5.15310720496376e-06, + "loss": 0.97645359, + "memory(GiB)": 302.58, + "step": 184440, + "train_speed(iter/s)": 0.123501 + }, + { + "acc": 0.74837284, + "epoch": 1.0315876892877565, + "grad_norm": 7.15625, + "learning_rate": 5.152182931020989e-06, + "loss": 1.00181532, + "memory(GiB)": 302.58, + "step": 184460, + "train_speed(iter/s)": 0.123507 + }, + { + "acc": 0.74934149, + "epoch": 1.0316995387607357, + "grad_norm": 5.1875, + "learning_rate": 5.151258651873078e-06, + "loss": 1.00443048, + "memory(GiB)": 302.58, + "step": 184480, + "train_speed(iter/s)": 0.123514 + }, + { + "acc": 0.72658701, + "epoch": 1.031811388233715, + "grad_norm": 6.15625, + "learning_rate": 5.150334367551642e-06, + "loss": 1.07264605, + "memory(GiB)": 302.58, + "step": 184500, + "train_speed(iter/s)": 0.12352 + }, + { + "acc": 0.73933039, + "epoch": 1.0319232377066943, + "grad_norm": 6.6875, + "learning_rate": 5.1494100780882936e-06, + "loss": 1.02136669, + "memory(GiB)": 302.58, + "step": 184520, + "train_speed(iter/s)": 0.123526 + }, + { + "acc": 0.77025027, + "epoch": 1.0320350871796735, + "grad_norm": 9.9375, + "learning_rate": 5.148485783514648e-06, + "loss": 0.88949347, + "memory(GiB)": 302.58, + "step": 184540, + "train_speed(iter/s)": 0.123532 + }, + { + "acc": 0.75818434, + "epoch": 1.0321469366526528, + "grad_norm": 8.4375, + "learning_rate": 5.147561483862316e-06, + "loss": 0.96831388, + "memory(GiB)": 302.58, + "step": 184560, + "train_speed(iter/s)": 0.123539 + }, + { + "acc": 0.75417542, + "epoch": 1.032258786125632, + "grad_norm": 10.0625, + "learning_rate": 5.1466371791629135e-06, + "loss": 0.97656717, + "memory(GiB)": 302.58, + "step": 184580, + "train_speed(iter/s)": 0.123545 + }, + { + "acc": 0.75003324, + "epoch": 1.0323706355986113, + "grad_norm": 7.46875, + "learning_rate": 5.1457128694480556e-06, + "loss": 0.97177906, + "memory(GiB)": 302.58, + "step": 184600, + "train_speed(iter/s)": 0.123552 + }, + { + "acc": 0.71550846, + "epoch": 1.0324824850715906, + "grad_norm": 6.84375, + "learning_rate": 5.144788554749355e-06, + "loss": 1.13578501, + "memory(GiB)": 302.58, + "step": 184620, + "train_speed(iter/s)": 0.123558 + }, + { + "acc": 0.77416453, + "epoch": 1.0325943345445698, + "grad_norm": 8.3125, + "learning_rate": 5.143864235098427e-06, + "loss": 0.88087044, + "memory(GiB)": 302.58, + "step": 184640, + "train_speed(iter/s)": 0.123564 + }, + { + "acc": 0.77332411, + "epoch": 1.032706184017549, + "grad_norm": 6.90625, + "learning_rate": 5.1429399105268855e-06, + "loss": 0.89603481, + "memory(GiB)": 302.58, + "step": 184660, + "train_speed(iter/s)": 0.123571 + }, + { + "acc": 0.7454195, + "epoch": 1.0328180334905284, + "grad_norm": 4.53125, + "learning_rate": 5.142015581066346e-06, + "loss": 1.00786324, + "memory(GiB)": 302.58, + "step": 184680, + "train_speed(iter/s)": 0.123577 + }, + { + "acc": 0.73421736, + "epoch": 1.0329298829635076, + "grad_norm": 7.5625, + "learning_rate": 5.141091246748422e-06, + "loss": 1.04976387, + "memory(GiB)": 302.58, + "step": 184700, + "train_speed(iter/s)": 0.123583 + }, + { + "acc": 0.77417049, + "epoch": 1.033041732436487, + "grad_norm": 6.96875, + "learning_rate": 5.140166907604731e-06, + "loss": 0.88245096, + "memory(GiB)": 302.58, + "step": 184720, + "train_speed(iter/s)": 0.12359 + }, + { + "acc": 0.75071182, + "epoch": 1.0331535819094662, + "grad_norm": 7.34375, + "learning_rate": 5.139242563666888e-06, + "loss": 0.98461866, + "memory(GiB)": 302.58, + "step": 184740, + "train_speed(iter/s)": 0.123597 + }, + { + "acc": 0.76122937, + "epoch": 1.0332654313824454, + "grad_norm": 6.5, + "learning_rate": 5.138318214966507e-06, + "loss": 0.92927742, + "memory(GiB)": 302.58, + "step": 184760, + "train_speed(iter/s)": 0.123603 + }, + { + "acc": 0.75891552, + "epoch": 1.0333772808554247, + "grad_norm": 5.78125, + "learning_rate": 5.137393861535205e-06, + "loss": 0.95573292, + "memory(GiB)": 302.58, + "step": 184780, + "train_speed(iter/s)": 0.123609 + }, + { + "acc": 0.74076767, + "epoch": 1.033489130328404, + "grad_norm": 5.90625, + "learning_rate": 5.136469503404598e-06, + "loss": 1.0300437, + "memory(GiB)": 302.58, + "step": 184800, + "train_speed(iter/s)": 0.123616 + }, + { + "acc": 0.76176963, + "epoch": 1.0336009798013832, + "grad_norm": 8.75, + "learning_rate": 5.1355451406062995e-06, + "loss": 0.9381547, + "memory(GiB)": 302.58, + "step": 184820, + "train_speed(iter/s)": 0.123622 + }, + { + "acc": 0.75819955, + "epoch": 1.0337128292743625, + "grad_norm": 8.875, + "learning_rate": 5.134620773171929e-06, + "loss": 0.96151752, + "memory(GiB)": 302.58, + "step": 184840, + "train_speed(iter/s)": 0.123628 + }, + { + "acc": 0.76597786, + "epoch": 1.0338246787473417, + "grad_norm": 10.125, + "learning_rate": 5.133696401133099e-06, + "loss": 0.91218109, + "memory(GiB)": 302.58, + "step": 184860, + "train_speed(iter/s)": 0.123634 + }, + { + "acc": 0.74413152, + "epoch": 1.033936528220321, + "grad_norm": 5.15625, + "learning_rate": 5.13277202452143e-06, + "loss": 1.02333899, + "memory(GiB)": 302.58, + "step": 184880, + "train_speed(iter/s)": 0.12364 + }, + { + "acc": 0.76889725, + "epoch": 1.0340483776933003, + "grad_norm": 7.03125, + "learning_rate": 5.131847643368536e-06, + "loss": 0.9011344, + "memory(GiB)": 302.58, + "step": 184900, + "train_speed(iter/s)": 0.123647 + }, + { + "acc": 0.7607471, + "epoch": 1.0341602271662795, + "grad_norm": 5.25, + "learning_rate": 5.130923257706035e-06, + "loss": 0.92609043, + "memory(GiB)": 302.58, + "step": 184920, + "train_speed(iter/s)": 0.123652 + }, + { + "acc": 0.75329785, + "epoch": 1.0342720766392588, + "grad_norm": 5.125, + "learning_rate": 5.129998867565543e-06, + "loss": 0.96493149, + "memory(GiB)": 302.58, + "step": 184940, + "train_speed(iter/s)": 0.123659 + }, + { + "acc": 0.75615778, + "epoch": 1.034383926112238, + "grad_norm": 8.0625, + "learning_rate": 5.129074472978678e-06, + "loss": 0.94521065, + "memory(GiB)": 302.58, + "step": 184960, + "train_speed(iter/s)": 0.123665 + }, + { + "acc": 0.73748455, + "epoch": 1.0344957755852173, + "grad_norm": 6.09375, + "learning_rate": 5.1281500739770575e-06, + "loss": 1.03807278, + "memory(GiB)": 302.58, + "step": 184980, + "train_speed(iter/s)": 0.123672 + }, + { + "acc": 0.7454484, + "epoch": 1.0346076250581966, + "grad_norm": 6.625, + "learning_rate": 5.127225670592296e-06, + "loss": 1.01629305, + "memory(GiB)": 302.58, + "step": 185000, + "train_speed(iter/s)": 0.123678 + }, + { + "acc": 0.76196361, + "epoch": 1.0347194745311759, + "grad_norm": 5.9375, + "learning_rate": 5.126301262856014e-06, + "loss": 0.94165049, + "memory(GiB)": 302.58, + "step": 185020, + "train_speed(iter/s)": 0.123684 + }, + { + "acc": 0.74618034, + "epoch": 1.0348313240041551, + "grad_norm": 7.125, + "learning_rate": 5.125376850799829e-06, + "loss": 0.99894543, + "memory(GiB)": 302.58, + "step": 185040, + "train_speed(iter/s)": 0.12369 + }, + { + "acc": 0.72655663, + "epoch": 1.0349431734771344, + "grad_norm": 5.25, + "learning_rate": 5.124452434455358e-06, + "loss": 1.08123016, + "memory(GiB)": 302.58, + "step": 185060, + "train_speed(iter/s)": 0.123697 + }, + { + "acc": 0.76099062, + "epoch": 1.0350550229501136, + "grad_norm": 6.53125, + "learning_rate": 5.123528013854218e-06, + "loss": 0.92522144, + "memory(GiB)": 302.58, + "step": 185080, + "train_speed(iter/s)": 0.123703 + }, + { + "acc": 0.75126729, + "epoch": 1.035166872423093, + "grad_norm": 6.90625, + "learning_rate": 5.12260358902803e-06, + "loss": 0.98368635, + "memory(GiB)": 302.58, + "step": 185100, + "train_speed(iter/s)": 0.123709 + }, + { + "acc": 0.756885, + "epoch": 1.0352787218960722, + "grad_norm": 7.6875, + "learning_rate": 5.12167916000841e-06, + "loss": 0.97845516, + "memory(GiB)": 302.58, + "step": 185120, + "train_speed(iter/s)": 0.123715 + }, + { + "acc": 0.75522594, + "epoch": 1.0353905713690514, + "grad_norm": 8.625, + "learning_rate": 5.120754726826976e-06, + "loss": 0.96552753, + "memory(GiB)": 302.58, + "step": 185140, + "train_speed(iter/s)": 0.123721 + }, + { + "acc": 0.74381876, + "epoch": 1.0355024208420307, + "grad_norm": 9.0625, + "learning_rate": 5.119830289515347e-06, + "loss": 1.01187611, + "memory(GiB)": 302.58, + "step": 185160, + "train_speed(iter/s)": 0.123728 + }, + { + "acc": 0.74001207, + "epoch": 1.03561427031501, + "grad_norm": 6.09375, + "learning_rate": 5.118905848105143e-06, + "loss": 1.00508194, + "memory(GiB)": 302.58, + "step": 185180, + "train_speed(iter/s)": 0.123735 + }, + { + "acc": 0.75327559, + "epoch": 1.0357261197879892, + "grad_norm": 8.625, + "learning_rate": 5.117981402627983e-06, + "loss": 0.97866383, + "memory(GiB)": 302.58, + "step": 185200, + "train_speed(iter/s)": 0.123742 + }, + { + "acc": 0.7630477, + "epoch": 1.0358379692609685, + "grad_norm": 9.0625, + "learning_rate": 5.117056953115483e-06, + "loss": 0.91803217, + "memory(GiB)": 302.58, + "step": 185220, + "train_speed(iter/s)": 0.123748 + }, + { + "acc": 0.75515256, + "epoch": 1.0359498187339478, + "grad_norm": 7.0625, + "learning_rate": 5.116132499599266e-06, + "loss": 0.94678946, + "memory(GiB)": 302.58, + "step": 185240, + "train_speed(iter/s)": 0.123754 + }, + { + "acc": 0.74982014, + "epoch": 1.036061668206927, + "grad_norm": 6.53125, + "learning_rate": 5.1152080421109485e-06, + "loss": 0.99316101, + "memory(GiB)": 302.58, + "step": 185260, + "train_speed(iter/s)": 0.123761 + }, + { + "acc": 0.7353354, + "epoch": 1.0361735176799063, + "grad_norm": 7.8125, + "learning_rate": 5.114283580682149e-06, + "loss": 1.05863733, + "memory(GiB)": 302.58, + "step": 185280, + "train_speed(iter/s)": 0.123767 + }, + { + "acc": 0.75901847, + "epoch": 1.0362853671528856, + "grad_norm": 6.5625, + "learning_rate": 5.113359115344491e-06, + "loss": 0.96639967, + "memory(GiB)": 302.58, + "step": 185300, + "train_speed(iter/s)": 0.123773 + }, + { + "acc": 0.75314956, + "epoch": 1.0363972166258648, + "grad_norm": 10.1875, + "learning_rate": 5.112434646129589e-06, + "loss": 0.96668348, + "memory(GiB)": 302.58, + "step": 185320, + "train_speed(iter/s)": 0.123779 + }, + { + "acc": 0.7507782, + "epoch": 1.036509066098844, + "grad_norm": 8.6875, + "learning_rate": 5.1115101730690675e-06, + "loss": 0.99202719, + "memory(GiB)": 302.58, + "step": 185340, + "train_speed(iter/s)": 0.123785 + }, + { + "acc": 0.75404396, + "epoch": 1.0366209155718233, + "grad_norm": 7.65625, + "learning_rate": 5.1105856961945444e-06, + "loss": 0.97449903, + "memory(GiB)": 302.58, + "step": 185360, + "train_speed(iter/s)": 0.123791 + }, + { + "acc": 0.73086605, + "epoch": 1.0367327650448026, + "grad_norm": 10.0, + "learning_rate": 5.10966121553764e-06, + "loss": 1.07267122, + "memory(GiB)": 302.58, + "step": 185380, + "train_speed(iter/s)": 0.123797 + }, + { + "acc": 0.72664742, + "epoch": 1.0368446145177819, + "grad_norm": 5.5, + "learning_rate": 5.108736731129974e-06, + "loss": 1.08822479, + "memory(GiB)": 302.58, + "step": 185400, + "train_speed(iter/s)": 0.123803 + }, + { + "acc": 0.74277177, + "epoch": 1.0369564639907611, + "grad_norm": 6.125, + "learning_rate": 5.107812243003167e-06, + "loss": 0.99084635, + "memory(GiB)": 302.58, + "step": 185420, + "train_speed(iter/s)": 0.12381 + }, + { + "acc": 0.76543508, + "epoch": 1.0370683134637404, + "grad_norm": 9.125, + "learning_rate": 5.1068877511888395e-06, + "loss": 0.90355024, + "memory(GiB)": 302.58, + "step": 185440, + "train_speed(iter/s)": 0.123817 + }, + { + "acc": 0.7863306, + "epoch": 1.0371801629367197, + "grad_norm": 8.25, + "learning_rate": 5.105963255718613e-06, + "loss": 0.83741503, + "memory(GiB)": 302.58, + "step": 185460, + "train_speed(iter/s)": 0.123824 + }, + { + "acc": 0.7483973, + "epoch": 1.037292012409699, + "grad_norm": 5.8125, + "learning_rate": 5.1050387566241055e-06, + "loss": 0.98219891, + "memory(GiB)": 302.58, + "step": 185480, + "train_speed(iter/s)": 0.12383 + }, + { + "acc": 0.74777694, + "epoch": 1.0374038618826782, + "grad_norm": 10.375, + "learning_rate": 5.10411425393694e-06, + "loss": 0.97709665, + "memory(GiB)": 302.58, + "step": 185500, + "train_speed(iter/s)": 0.123836 + }, + { + "acc": 0.75859385, + "epoch": 1.0375157113556575, + "grad_norm": 6.65625, + "learning_rate": 5.103189747688737e-06, + "loss": 0.93769732, + "memory(GiB)": 302.58, + "step": 185520, + "train_speed(iter/s)": 0.123843 + }, + { + "acc": 0.75415006, + "epoch": 1.0376275608286367, + "grad_norm": 4.6875, + "learning_rate": 5.102265237911118e-06, + "loss": 0.955896, + "memory(GiB)": 302.58, + "step": 185540, + "train_speed(iter/s)": 0.123849 + }, + { + "acc": 0.74654975, + "epoch": 1.037739410301616, + "grad_norm": 7.78125, + "learning_rate": 5.101340724635703e-06, + "loss": 0.9970808, + "memory(GiB)": 302.58, + "step": 185560, + "train_speed(iter/s)": 0.123856 + }, + { + "acc": 0.74879279, + "epoch": 1.0378512597745952, + "grad_norm": 5.59375, + "learning_rate": 5.100416207894113e-06, + "loss": 1.00665989, + "memory(GiB)": 302.58, + "step": 185580, + "train_speed(iter/s)": 0.123862 + }, + { + "acc": 0.75640473, + "epoch": 1.0379631092475745, + "grad_norm": 6.4375, + "learning_rate": 5.0994916877179715e-06, + "loss": 0.94770174, + "memory(GiB)": 302.58, + "step": 185600, + "train_speed(iter/s)": 0.123868 + }, + { + "acc": 0.77288947, + "epoch": 1.0380749587205538, + "grad_norm": 9.5, + "learning_rate": 5.098567164138899e-06, + "loss": 0.89032936, + "memory(GiB)": 302.58, + "step": 185620, + "train_speed(iter/s)": 0.123874 + }, + { + "acc": 0.74072943, + "epoch": 1.038186808193533, + "grad_norm": 5.96875, + "learning_rate": 5.097642637188515e-06, + "loss": 1.02497244, + "memory(GiB)": 302.58, + "step": 185640, + "train_speed(iter/s)": 0.12388 + }, + { + "acc": 0.74337435, + "epoch": 1.0382986576665123, + "grad_norm": 8.0625, + "learning_rate": 5.096718106898446e-06, + "loss": 1.01507368, + "memory(GiB)": 302.58, + "step": 185660, + "train_speed(iter/s)": 0.123886 + }, + { + "acc": 0.75208583, + "epoch": 1.0384105071394916, + "grad_norm": 5.59375, + "learning_rate": 5.0957935733003095e-06, + "loss": 0.96493168, + "memory(GiB)": 302.58, + "step": 185680, + "train_speed(iter/s)": 0.123893 + }, + { + "acc": 0.74572406, + "epoch": 1.0385223566124708, + "grad_norm": 6.0625, + "learning_rate": 5.094869036425728e-06, + "loss": 1.0116209, + "memory(GiB)": 302.58, + "step": 185700, + "train_speed(iter/s)": 0.123899 + }, + { + "acc": 0.73818383, + "epoch": 1.03863420608545, + "grad_norm": 6.71875, + "learning_rate": 5.093944496306327e-06, + "loss": 1.02592592, + "memory(GiB)": 302.58, + "step": 185720, + "train_speed(iter/s)": 0.123905 + }, + { + "acc": 0.73066258, + "epoch": 1.0387460555584294, + "grad_norm": 8.0625, + "learning_rate": 5.093019952973724e-06, + "loss": 1.05754862, + "memory(GiB)": 302.58, + "step": 185740, + "train_speed(iter/s)": 0.123912 + }, + { + "acc": 0.75231819, + "epoch": 1.0388579050314086, + "grad_norm": 5.8125, + "learning_rate": 5.0920954064595454e-06, + "loss": 0.96345482, + "memory(GiB)": 302.58, + "step": 185760, + "train_speed(iter/s)": 0.123918 + }, + { + "acc": 0.75904908, + "epoch": 1.0389697545043879, + "grad_norm": 7.09375, + "learning_rate": 5.091170856795412e-06, + "loss": 0.937603, + "memory(GiB)": 302.58, + "step": 185780, + "train_speed(iter/s)": 0.123924 + }, + { + "acc": 0.7435771, + "epoch": 1.0390816039773672, + "grad_norm": 7.75, + "learning_rate": 5.0902463040129454e-06, + "loss": 1.00162745, + "memory(GiB)": 302.58, + "step": 185800, + "train_speed(iter/s)": 0.123931 + }, + { + "acc": 0.74932857, + "epoch": 1.0391934534503466, + "grad_norm": 7.375, + "learning_rate": 5.08932174814377e-06, + "loss": 1.01009846, + "memory(GiB)": 302.58, + "step": 185820, + "train_speed(iter/s)": 0.123937 + }, + { + "acc": 0.74009738, + "epoch": 1.0393053029233257, + "grad_norm": 6.40625, + "learning_rate": 5.088397189219505e-06, + "loss": 1.03845425, + "memory(GiB)": 302.58, + "step": 185840, + "train_speed(iter/s)": 0.123944 + }, + { + "acc": 0.75079436, + "epoch": 1.0394171523963052, + "grad_norm": 8.0, + "learning_rate": 5.087472627271778e-06, + "loss": 0.96889915, + "memory(GiB)": 302.58, + "step": 185860, + "train_speed(iter/s)": 0.12395 + }, + { + "acc": 0.75159407, + "epoch": 1.0395290018692842, + "grad_norm": 7.0, + "learning_rate": 5.086548062332209e-06, + "loss": 0.98495703, + "memory(GiB)": 302.58, + "step": 185880, + "train_speed(iter/s)": 0.123957 + }, + { + "acc": 0.7305954, + "epoch": 1.0396408513422637, + "grad_norm": 6.40625, + "learning_rate": 5.085623494432422e-06, + "loss": 1.0771163, + "memory(GiB)": 302.58, + "step": 185900, + "train_speed(iter/s)": 0.123963 + }, + { + "acc": 0.74614515, + "epoch": 1.0397527008152427, + "grad_norm": 9.5, + "learning_rate": 5.0846989236040414e-06, + "loss": 0.99341803, + "memory(GiB)": 302.58, + "step": 185920, + "train_speed(iter/s)": 0.123969 + }, + { + "acc": 0.75792394, + "epoch": 1.0398645502882222, + "grad_norm": 6.1875, + "learning_rate": 5.083774349878686e-06, + "loss": 0.94132214, + "memory(GiB)": 302.58, + "step": 185940, + "train_speed(iter/s)": 0.123976 + }, + { + "acc": 0.73033409, + "epoch": 1.0399763997612013, + "grad_norm": 6.1875, + "learning_rate": 5.082849773287984e-06, + "loss": 1.08314238, + "memory(GiB)": 302.58, + "step": 185960, + "train_speed(iter/s)": 0.123982 + }, + { + "acc": 0.75606203, + "epoch": 1.0400882492341808, + "grad_norm": 6.0625, + "learning_rate": 5.0819251938635575e-06, + "loss": 0.93848867, + "memory(GiB)": 302.58, + "step": 185980, + "train_speed(iter/s)": 0.123988 + }, + { + "acc": 0.75275788, + "epoch": 1.0402000987071598, + "grad_norm": 8.375, + "learning_rate": 5.081000611637029e-06, + "loss": 0.97639694, + "memory(GiB)": 302.58, + "step": 186000, + "train_speed(iter/s)": 0.123995 + }, + { + "epoch": 1.0402000987071598, + "eval_acc": 0.7063771505846254, + "eval_loss": 1.0143986940383911, + "eval_runtime": 7504.7069, + "eval_samples_per_second": 10.031, + "eval_steps_per_second": 10.031, + "step": 186000 + }, + { + "acc": 0.73895073, + "epoch": 1.0403119481801393, + "grad_norm": 6.4375, + "learning_rate": 5.080076026640021e-06, + "loss": 1.03372002, + "memory(GiB)": 302.58, + "step": 186020, + "train_speed(iter/s)": 0.123374 + }, + { + "acc": 0.73876643, + "epoch": 1.0404237976531183, + "grad_norm": 6.53125, + "learning_rate": 5.079151438904162e-06, + "loss": 1.03628416, + "memory(GiB)": 302.58, + "step": 186040, + "train_speed(iter/s)": 0.12338 + }, + { + "acc": 0.76441755, + "epoch": 1.0405356471260978, + "grad_norm": 10.125, + "learning_rate": 5.078226848461072e-06, + "loss": 0.91742649, + "memory(GiB)": 302.58, + "step": 186060, + "train_speed(iter/s)": 0.123387 + }, + { + "acc": 0.76148562, + "epoch": 1.0406474965990768, + "grad_norm": 7.03125, + "learning_rate": 5.077302255342376e-06, + "loss": 0.94316921, + "memory(GiB)": 302.58, + "step": 186080, + "train_speed(iter/s)": 0.123393 + }, + { + "acc": 0.74476643, + "epoch": 1.0407593460720563, + "grad_norm": 10.5, + "learning_rate": 5.076377659579696e-06, + "loss": 1.00837774, + "memory(GiB)": 302.58, + "step": 186100, + "train_speed(iter/s)": 0.1234 + }, + { + "acc": 0.75461469, + "epoch": 1.0408711955450354, + "grad_norm": 10.5, + "learning_rate": 5.075453061204659e-06, + "loss": 0.97846107, + "memory(GiB)": 302.58, + "step": 186120, + "train_speed(iter/s)": 0.123406 + }, + { + "acc": 0.75554976, + "epoch": 1.0409830450180149, + "grad_norm": 8.875, + "learning_rate": 5.074528460248888e-06, + "loss": 0.96549082, + "memory(GiB)": 302.58, + "step": 186140, + "train_speed(iter/s)": 0.123412 + }, + { + "acc": 0.7580677, + "epoch": 1.0410948944909941, + "grad_norm": 8.875, + "learning_rate": 5.073603856744007e-06, + "loss": 0.94210138, + "memory(GiB)": 302.58, + "step": 186160, + "train_speed(iter/s)": 0.123419 + }, + { + "acc": 0.76601658, + "epoch": 1.0412067439639734, + "grad_norm": 7.40625, + "learning_rate": 5.072679250721641e-06, + "loss": 0.89338322, + "memory(GiB)": 302.58, + "step": 186180, + "train_speed(iter/s)": 0.123425 + }, + { + "acc": 0.73623829, + "epoch": 1.0413185934369527, + "grad_norm": 6.28125, + "learning_rate": 5.071754642213415e-06, + "loss": 1.04332619, + "memory(GiB)": 302.58, + "step": 186200, + "train_speed(iter/s)": 0.123431 + }, + { + "acc": 0.73784862, + "epoch": 1.041430442909932, + "grad_norm": 7.40625, + "learning_rate": 5.070830031250952e-06, + "loss": 1.02022333, + "memory(GiB)": 302.58, + "step": 186220, + "train_speed(iter/s)": 0.123438 + }, + { + "acc": 0.74667768, + "epoch": 1.0415422923829112, + "grad_norm": 4.53125, + "learning_rate": 5.069905417865877e-06, + "loss": 0.98472281, + "memory(GiB)": 302.58, + "step": 186240, + "train_speed(iter/s)": 0.123445 + }, + { + "acc": 0.75306973, + "epoch": 1.0416541418558904, + "grad_norm": 9.0625, + "learning_rate": 5.068980802089816e-06, + "loss": 0.97170343, + "memory(GiB)": 302.58, + "step": 186260, + "train_speed(iter/s)": 0.123451 + }, + { + "acc": 0.74285874, + "epoch": 1.0417659913288697, + "grad_norm": 6.96875, + "learning_rate": 5.068056183954391e-06, + "loss": 1.02375565, + "memory(GiB)": 302.58, + "step": 186280, + "train_speed(iter/s)": 0.123458 + }, + { + "acc": 0.73742213, + "epoch": 1.041877840801849, + "grad_norm": 5.75, + "learning_rate": 5.0671315634912295e-06, + "loss": 1.05910501, + "memory(GiB)": 302.58, + "step": 186300, + "train_speed(iter/s)": 0.123463 + }, + { + "acc": 0.7623858, + "epoch": 1.0419896902748282, + "grad_norm": 8.125, + "learning_rate": 5.066206940731957e-06, + "loss": 0.94404736, + "memory(GiB)": 302.58, + "step": 186320, + "train_speed(iter/s)": 0.12347 + }, + { + "acc": 0.75530586, + "epoch": 1.0421015397478075, + "grad_norm": 4.25, + "learning_rate": 5.0652823157081965e-06, + "loss": 0.97857084, + "memory(GiB)": 302.58, + "step": 186340, + "train_speed(iter/s)": 0.123476 + }, + { + "acc": 0.73309989, + "epoch": 1.0422133892207868, + "grad_norm": 9.4375, + "learning_rate": 5.064357688451574e-06, + "loss": 1.04900961, + "memory(GiB)": 302.58, + "step": 186360, + "train_speed(iter/s)": 0.123483 + }, + { + "acc": 0.74040418, + "epoch": 1.042325238693766, + "grad_norm": 9.625, + "learning_rate": 5.063433058993713e-06, + "loss": 1.01992912, + "memory(GiB)": 302.58, + "step": 186380, + "train_speed(iter/s)": 0.123489 + }, + { + "acc": 0.74367242, + "epoch": 1.0424370881667453, + "grad_norm": 6.25, + "learning_rate": 5.062508427366241e-06, + "loss": 1.0134346, + "memory(GiB)": 302.58, + "step": 186400, + "train_speed(iter/s)": 0.123495 + }, + { + "acc": 0.76721125, + "epoch": 1.0425489376397246, + "grad_norm": 6.125, + "learning_rate": 5.061583793600783e-06, + "loss": 0.89959278, + "memory(GiB)": 302.58, + "step": 186420, + "train_speed(iter/s)": 0.123501 + }, + { + "acc": 0.73747087, + "epoch": 1.0426607871127038, + "grad_norm": 5.40625, + "learning_rate": 5.060659157728963e-06, + "loss": 1.03282385, + "memory(GiB)": 302.58, + "step": 186440, + "train_speed(iter/s)": 0.123507 + }, + { + "acc": 0.74477072, + "epoch": 1.042772636585683, + "grad_norm": 11.25, + "learning_rate": 5.059734519782409e-06, + "loss": 1.0053113, + "memory(GiB)": 302.58, + "step": 186460, + "train_speed(iter/s)": 0.123513 + }, + { + "acc": 0.75886774, + "epoch": 1.0428844860586624, + "grad_norm": 8.625, + "learning_rate": 5.058809879792745e-06, + "loss": 0.94182959, + "memory(GiB)": 302.58, + "step": 186480, + "train_speed(iter/s)": 0.12352 + }, + { + "acc": 0.74421382, + "epoch": 1.0429963355316416, + "grad_norm": 6.40625, + "learning_rate": 5.0578852377915955e-06, + "loss": 1.01452808, + "memory(GiB)": 302.58, + "step": 186500, + "train_speed(iter/s)": 0.123526 + }, + { + "acc": 0.76131544, + "epoch": 1.0431081850046209, + "grad_norm": 8.0625, + "learning_rate": 5.056960593810588e-06, + "loss": 0.94810724, + "memory(GiB)": 302.58, + "step": 186520, + "train_speed(iter/s)": 0.123532 + }, + { + "acc": 0.75414853, + "epoch": 1.0432200344776001, + "grad_norm": 8.1875, + "learning_rate": 5.056035947881347e-06, + "loss": 0.97156982, + "memory(GiB)": 302.58, + "step": 186540, + "train_speed(iter/s)": 0.123538 + }, + { + "acc": 0.74283214, + "epoch": 1.0433318839505794, + "grad_norm": 5.5, + "learning_rate": 5.055111300035499e-06, + "loss": 1.00839844, + "memory(GiB)": 302.58, + "step": 186560, + "train_speed(iter/s)": 0.123544 + }, + { + "acc": 0.7516098, + "epoch": 1.0434437334235587, + "grad_norm": 8.125, + "learning_rate": 5.054186650304669e-06, + "loss": 0.98609419, + "memory(GiB)": 302.58, + "step": 186580, + "train_speed(iter/s)": 0.123551 + }, + { + "acc": 0.73066368, + "epoch": 1.043555582896538, + "grad_norm": 7.8125, + "learning_rate": 5.0532619987204845e-06, + "loss": 1.0606595, + "memory(GiB)": 302.58, + "step": 186600, + "train_speed(iter/s)": 0.123557 + }, + { + "acc": 0.74144487, + "epoch": 1.0436674323695172, + "grad_norm": 6.8125, + "learning_rate": 5.05233734531457e-06, + "loss": 1.02564106, + "memory(GiB)": 302.58, + "step": 186620, + "train_speed(iter/s)": 0.123562 + }, + { + "acc": 0.75894856, + "epoch": 1.0437792818424965, + "grad_norm": 6.71875, + "learning_rate": 5.051412690118555e-06, + "loss": 0.94705439, + "memory(GiB)": 302.58, + "step": 186640, + "train_speed(iter/s)": 0.123569 + }, + { + "acc": 0.74293332, + "epoch": 1.0438911313154757, + "grad_norm": 6.75, + "learning_rate": 5.05048803316406e-06, + "loss": 1.0036994, + "memory(GiB)": 302.58, + "step": 186660, + "train_speed(iter/s)": 0.123575 + }, + { + "acc": 0.75379076, + "epoch": 1.044002980788455, + "grad_norm": 7.5625, + "learning_rate": 5.049563374482717e-06, + "loss": 0.94739256, + "memory(GiB)": 302.58, + "step": 186680, + "train_speed(iter/s)": 0.123581 + }, + { + "acc": 0.74844112, + "epoch": 1.0441148302614343, + "grad_norm": 8.3125, + "learning_rate": 5.0486387141061475e-06, + "loss": 0.9911911, + "memory(GiB)": 302.58, + "step": 186700, + "train_speed(iter/s)": 0.123587 + }, + { + "acc": 0.78059874, + "epoch": 1.0442266797344135, + "grad_norm": 7.375, + "learning_rate": 5.047714052065979e-06, + "loss": 0.83627157, + "memory(GiB)": 302.58, + "step": 186720, + "train_speed(iter/s)": 0.123594 + }, + { + "acc": 0.74964552, + "epoch": 1.0443385292073928, + "grad_norm": 9.1875, + "learning_rate": 5.04678938839384e-06, + "loss": 0.95886831, + "memory(GiB)": 302.58, + "step": 186740, + "train_speed(iter/s)": 0.123601 + }, + { + "acc": 0.7542099, + "epoch": 1.044450378680372, + "grad_norm": 7.90625, + "learning_rate": 5.0458647231213556e-06, + "loss": 0.96002226, + "memory(GiB)": 302.58, + "step": 186760, + "train_speed(iter/s)": 0.123607 + }, + { + "acc": 0.76204896, + "epoch": 1.0445622281533513, + "grad_norm": 8.3125, + "learning_rate": 5.044940056280152e-06, + "loss": 0.93602304, + "memory(GiB)": 302.58, + "step": 186780, + "train_speed(iter/s)": 0.123613 + }, + { + "acc": 0.75252123, + "epoch": 1.0446740776263306, + "grad_norm": 6.21875, + "learning_rate": 5.044015387901856e-06, + "loss": 0.98290367, + "memory(GiB)": 302.58, + "step": 186800, + "train_speed(iter/s)": 0.123619 + }, + { + "acc": 0.75062714, + "epoch": 1.0447859270993098, + "grad_norm": 6.9375, + "learning_rate": 5.043090718018094e-06, + "loss": 0.98170328, + "memory(GiB)": 302.58, + "step": 186820, + "train_speed(iter/s)": 0.123626 + }, + { + "acc": 0.73595791, + "epoch": 1.044897776572289, + "grad_norm": 6.96875, + "learning_rate": 5.042166046660493e-06, + "loss": 1.04466276, + "memory(GiB)": 302.58, + "step": 186840, + "train_speed(iter/s)": 0.123631 + }, + { + "acc": 0.73717904, + "epoch": 1.0450096260452684, + "grad_norm": 8.0625, + "learning_rate": 5.041241373860679e-06, + "loss": 1.01460686, + "memory(GiB)": 302.58, + "step": 186860, + "train_speed(iter/s)": 0.123637 + }, + { + "acc": 0.75031095, + "epoch": 1.0451214755182476, + "grad_norm": 12.0, + "learning_rate": 5.0403166996502794e-06, + "loss": 0.94929066, + "memory(GiB)": 302.58, + "step": 186880, + "train_speed(iter/s)": 0.123644 + }, + { + "acc": 0.73011417, + "epoch": 1.045233324991227, + "grad_norm": 8.5, + "learning_rate": 5.039392024060921e-06, + "loss": 1.07481146, + "memory(GiB)": 302.58, + "step": 186900, + "train_speed(iter/s)": 0.12365 + }, + { + "acc": 0.75962706, + "epoch": 1.0453451744642062, + "grad_norm": 5.59375, + "learning_rate": 5.0384673471242305e-06, + "loss": 0.96015701, + "memory(GiB)": 302.58, + "step": 186920, + "train_speed(iter/s)": 0.123657 + }, + { + "acc": 0.74295936, + "epoch": 1.0454570239371854, + "grad_norm": 5.4375, + "learning_rate": 5.037542668871836e-06, + "loss": 1.006989, + "memory(GiB)": 302.58, + "step": 186940, + "train_speed(iter/s)": 0.123663 + }, + { + "acc": 0.73882513, + "epoch": 1.0455688734101647, + "grad_norm": 6.53125, + "learning_rate": 5.036617989335362e-06, + "loss": 1.01668892, + "memory(GiB)": 302.58, + "step": 186960, + "train_speed(iter/s)": 0.123669 + }, + { + "acc": 0.76985755, + "epoch": 1.045680722883144, + "grad_norm": 8.0625, + "learning_rate": 5.035693308546437e-06, + "loss": 0.89270506, + "memory(GiB)": 302.58, + "step": 186980, + "train_speed(iter/s)": 0.123676 + }, + { + "acc": 0.75731411, + "epoch": 1.0457925723561232, + "grad_norm": 7.21875, + "learning_rate": 5.034768626536686e-06, + "loss": 0.95362644, + "memory(GiB)": 302.58, + "step": 187000, + "train_speed(iter/s)": 0.123681 + }, + { + "acc": 0.7358212, + "epoch": 1.0459044218291025, + "grad_norm": 5.5, + "learning_rate": 5.033843943337738e-06, + "loss": 1.04166946, + "memory(GiB)": 302.58, + "step": 187020, + "train_speed(iter/s)": 0.123688 + }, + { + "acc": 0.74356818, + "epoch": 1.0460162713020817, + "grad_norm": 4.71875, + "learning_rate": 5.0329192589812215e-06, + "loss": 0.99520645, + "memory(GiB)": 302.58, + "step": 187040, + "train_speed(iter/s)": 0.123694 + }, + { + "acc": 0.74020147, + "epoch": 1.046128120775061, + "grad_norm": 7.1875, + "learning_rate": 5.031994573498762e-06, + "loss": 1.03143139, + "memory(GiB)": 302.58, + "step": 187060, + "train_speed(iter/s)": 0.1237 + }, + { + "acc": 0.76723847, + "epoch": 1.0462399702480403, + "grad_norm": 5.78125, + "learning_rate": 5.0310698869219854e-06, + "loss": 0.9083848, + "memory(GiB)": 302.58, + "step": 187080, + "train_speed(iter/s)": 0.123707 + }, + { + "acc": 0.75548635, + "epoch": 1.0463518197210195, + "grad_norm": 10.3125, + "learning_rate": 5.0301451992825216e-06, + "loss": 0.93359022, + "memory(GiB)": 302.58, + "step": 187100, + "train_speed(iter/s)": 0.123714 + }, + { + "acc": 0.76099138, + "epoch": 1.0464636691939988, + "grad_norm": 10.875, + "learning_rate": 5.029220510611995e-06, + "loss": 0.95251255, + "memory(GiB)": 302.58, + "step": 187120, + "train_speed(iter/s)": 0.12372 + }, + { + "acc": 0.74785051, + "epoch": 1.046575518666978, + "grad_norm": 8.875, + "learning_rate": 5.028295820942034e-06, + "loss": 0.99906015, + "memory(GiB)": 302.58, + "step": 187140, + "train_speed(iter/s)": 0.123726 + }, + { + "acc": 0.74835553, + "epoch": 1.0466873681399573, + "grad_norm": 6.6875, + "learning_rate": 5.027371130304267e-06, + "loss": 1.00186663, + "memory(GiB)": 302.58, + "step": 187160, + "train_speed(iter/s)": 0.123733 + }, + { + "acc": 0.76075501, + "epoch": 1.0467992176129366, + "grad_norm": 7.625, + "learning_rate": 5.02644643873032e-06, + "loss": 0.92434053, + "memory(GiB)": 302.58, + "step": 187180, + "train_speed(iter/s)": 0.12374 + }, + { + "acc": 0.72586546, + "epoch": 1.0469110670859159, + "grad_norm": 5.78125, + "learning_rate": 5.0255217462518215e-06, + "loss": 1.0660512, + "memory(GiB)": 302.58, + "step": 187200, + "train_speed(iter/s)": 0.123746 + }, + { + "acc": 0.73420272, + "epoch": 1.0470229165588951, + "grad_norm": 8.0, + "learning_rate": 5.024597052900398e-06, + "loss": 1.06897078, + "memory(GiB)": 302.58, + "step": 187220, + "train_speed(iter/s)": 0.123752 + }, + { + "acc": 0.74639988, + "epoch": 1.0471347660318744, + "grad_norm": 6.6875, + "learning_rate": 5.023672358707677e-06, + "loss": 0.9990345, + "memory(GiB)": 302.58, + "step": 187240, + "train_speed(iter/s)": 0.123759 + }, + { + "acc": 0.76187449, + "epoch": 1.0472466155048537, + "grad_norm": 8.375, + "learning_rate": 5.022747663705287e-06, + "loss": 0.93039503, + "memory(GiB)": 302.58, + "step": 187260, + "train_speed(iter/s)": 0.123766 + }, + { + "acc": 0.76150646, + "epoch": 1.047358464977833, + "grad_norm": 7.3125, + "learning_rate": 5.021822967924856e-06, + "loss": 0.94919596, + "memory(GiB)": 302.58, + "step": 187280, + "train_speed(iter/s)": 0.123771 + }, + { + "acc": 0.74020548, + "epoch": 1.0474703144508122, + "grad_norm": 10.25, + "learning_rate": 5.020898271398007e-06, + "loss": 1.01108685, + "memory(GiB)": 302.58, + "step": 187300, + "train_speed(iter/s)": 0.123777 + }, + { + "acc": 0.73987126, + "epoch": 1.0475821639237914, + "grad_norm": 7.0, + "learning_rate": 5.019973574156373e-06, + "loss": 1.01957827, + "memory(GiB)": 302.58, + "step": 187320, + "train_speed(iter/s)": 0.123783 + }, + { + "acc": 0.73306875, + "epoch": 1.0476940133967707, + "grad_norm": 9.5625, + "learning_rate": 5.019048876231579e-06, + "loss": 1.05979652, + "memory(GiB)": 302.58, + "step": 187340, + "train_speed(iter/s)": 0.12379 + }, + { + "acc": 0.75437374, + "epoch": 1.04780586286975, + "grad_norm": 4.875, + "learning_rate": 5.018124177655254e-06, + "loss": 0.97453671, + "memory(GiB)": 302.58, + "step": 187360, + "train_speed(iter/s)": 0.123795 + }, + { + "acc": 0.75112438, + "epoch": 1.0479177123427292, + "grad_norm": 6.71875, + "learning_rate": 5.017199478459024e-06, + "loss": 0.96457071, + "memory(GiB)": 302.58, + "step": 187380, + "train_speed(iter/s)": 0.123802 + }, + { + "acc": 0.77145495, + "epoch": 1.0480295618157085, + "grad_norm": 5.5, + "learning_rate": 5.016274778674517e-06, + "loss": 0.91414394, + "memory(GiB)": 302.58, + "step": 187400, + "train_speed(iter/s)": 0.123808 + }, + { + "acc": 0.75357699, + "epoch": 1.0481414112886878, + "grad_norm": 7.40625, + "learning_rate": 5.015350078333362e-06, + "loss": 0.94633541, + "memory(GiB)": 302.58, + "step": 187420, + "train_speed(iter/s)": 0.123815 + }, + { + "acc": 0.74505672, + "epoch": 1.048253260761667, + "grad_norm": 7.71875, + "learning_rate": 5.014425377467183e-06, + "loss": 1.00679359, + "memory(GiB)": 302.58, + "step": 187440, + "train_speed(iter/s)": 0.12382 + }, + { + "acc": 0.75434237, + "epoch": 1.0483651102346463, + "grad_norm": 6.40625, + "learning_rate": 5.013500676107612e-06, + "loss": 0.95330915, + "memory(GiB)": 302.58, + "step": 187460, + "train_speed(iter/s)": 0.123826 + }, + { + "acc": 0.751507, + "epoch": 1.0484769597076256, + "grad_norm": 7.96875, + "learning_rate": 5.0125759742862755e-06, + "loss": 0.9735877, + "memory(GiB)": 302.58, + "step": 187480, + "train_speed(iter/s)": 0.123832 + }, + { + "acc": 0.75678086, + "epoch": 1.0485888091806048, + "grad_norm": 9.125, + "learning_rate": 5.011651272034801e-06, + "loss": 0.96594772, + "memory(GiB)": 302.58, + "step": 187500, + "train_speed(iter/s)": 0.123839 + }, + { + "acc": 0.75018597, + "epoch": 1.048700658653584, + "grad_norm": 8.125, + "learning_rate": 5.010726569384816e-06, + "loss": 0.98432102, + "memory(GiB)": 302.58, + "step": 187520, + "train_speed(iter/s)": 0.123845 + }, + { + "acc": 0.76972947, + "epoch": 1.0488125081265633, + "grad_norm": 7.6875, + "learning_rate": 5.009801866367948e-06, + "loss": 0.90542383, + "memory(GiB)": 302.58, + "step": 187540, + "train_speed(iter/s)": 0.123852 + }, + { + "acc": 0.75639472, + "epoch": 1.0489243575995426, + "grad_norm": 11.375, + "learning_rate": 5.008877163015825e-06, + "loss": 0.96562729, + "memory(GiB)": 302.58, + "step": 187560, + "train_speed(iter/s)": 0.123858 + }, + { + "acc": 0.73596997, + "epoch": 1.0490362070725219, + "grad_norm": 6.15625, + "learning_rate": 5.0079524593600735e-06, + "loss": 1.05358171, + "memory(GiB)": 302.58, + "step": 187580, + "train_speed(iter/s)": 0.123864 + }, + { + "acc": 0.74760122, + "epoch": 1.0491480565455011, + "grad_norm": 7.3125, + "learning_rate": 5.0070277554323244e-06, + "loss": 1.00233927, + "memory(GiB)": 302.58, + "step": 187600, + "train_speed(iter/s)": 0.123871 + }, + { + "acc": 0.75060987, + "epoch": 1.0492599060184804, + "grad_norm": 6.5625, + "learning_rate": 5.0061030512642035e-06, + "loss": 0.98005857, + "memory(GiB)": 302.58, + "step": 187620, + "train_speed(iter/s)": 0.123877 + }, + { + "acc": 0.75323529, + "epoch": 1.0493717554914597, + "grad_norm": 7.28125, + "learning_rate": 5.005178346887339e-06, + "loss": 0.95878277, + "memory(GiB)": 302.58, + "step": 187640, + "train_speed(iter/s)": 0.123883 + }, + { + "acc": 0.75620537, + "epoch": 1.049483604964439, + "grad_norm": 7.1875, + "learning_rate": 5.004253642333359e-06, + "loss": 0.96931, + "memory(GiB)": 302.58, + "step": 187660, + "train_speed(iter/s)": 0.12389 + }, + { + "acc": 0.74374952, + "epoch": 1.0495954544374182, + "grad_norm": 5.125, + "learning_rate": 5.00332893763389e-06, + "loss": 1.00737553, + "memory(GiB)": 302.58, + "step": 187680, + "train_speed(iter/s)": 0.123896 + }, + { + "acc": 0.74091134, + "epoch": 1.0497073039103975, + "grad_norm": 7.875, + "learning_rate": 5.0024042328205605e-06, + "loss": 1.03638391, + "memory(GiB)": 302.58, + "step": 187700, + "train_speed(iter/s)": 0.123902 + }, + { + "acc": 0.75340824, + "epoch": 1.0498191533833767, + "grad_norm": 7.15625, + "learning_rate": 5.001479527924999e-06, + "loss": 0.99127312, + "memory(GiB)": 302.58, + "step": 187720, + "train_speed(iter/s)": 0.123908 + }, + { + "acc": 0.74323168, + "epoch": 1.049931002856356, + "grad_norm": 8.5, + "learning_rate": 5.000554822978834e-06, + "loss": 1.01692553, + "memory(GiB)": 302.58, + "step": 187740, + "train_speed(iter/s)": 0.123914 + }, + { + "acc": 0.74377789, + "epoch": 1.0500428523293353, + "grad_norm": 6.09375, + "learning_rate": 4.999630118013691e-06, + "loss": 1.01214514, + "memory(GiB)": 302.58, + "step": 187760, + "train_speed(iter/s)": 0.123921 + }, + { + "acc": 0.74171176, + "epoch": 1.0501547018023145, + "grad_norm": 7.78125, + "learning_rate": 4.998705413061199e-06, + "loss": 1.0084053, + "memory(GiB)": 302.58, + "step": 187780, + "train_speed(iter/s)": 0.123927 + }, + { + "acc": 0.76982059, + "epoch": 1.0502665512752938, + "grad_norm": 6.75, + "learning_rate": 4.997780708152987e-06, + "loss": 0.88806458, + "memory(GiB)": 302.58, + "step": 187800, + "train_speed(iter/s)": 0.123933 + }, + { + "acc": 0.75688329, + "epoch": 1.050378400748273, + "grad_norm": 7.34375, + "learning_rate": 4.996856003320682e-06, + "loss": 0.95683403, + "memory(GiB)": 302.58, + "step": 187820, + "train_speed(iter/s)": 0.123939 + }, + { + "acc": 0.75109425, + "epoch": 1.0504902502212523, + "grad_norm": 4.6875, + "learning_rate": 4.99593129859591e-06, + "loss": 0.98720903, + "memory(GiB)": 302.58, + "step": 187840, + "train_speed(iter/s)": 0.123945 + }, + { + "acc": 0.77470374, + "epoch": 1.0506020996942316, + "grad_norm": 7.59375, + "learning_rate": 4.995006594010302e-06, + "loss": 0.86930714, + "memory(GiB)": 302.58, + "step": 187860, + "train_speed(iter/s)": 0.123951 + }, + { + "acc": 0.75556669, + "epoch": 1.0507139491672108, + "grad_norm": 6.78125, + "learning_rate": 4.994081889595484e-06, + "loss": 0.9562252, + "memory(GiB)": 302.58, + "step": 187880, + "train_speed(iter/s)": 0.123958 + }, + { + "acc": 0.76818275, + "epoch": 1.05082579864019, + "grad_norm": 6.90625, + "learning_rate": 4.993157185383084e-06, + "loss": 0.90580864, + "memory(GiB)": 302.58, + "step": 187900, + "train_speed(iter/s)": 0.123964 + }, + { + "acc": 0.7560246, + "epoch": 1.0509376481131694, + "grad_norm": 6.96875, + "learning_rate": 4.99223248140473e-06, + "loss": 0.9528161, + "memory(GiB)": 302.58, + "step": 187920, + "train_speed(iter/s)": 0.12397 + }, + { + "acc": 0.75368834, + "epoch": 1.0510494975861486, + "grad_norm": 8.25, + "learning_rate": 4.99130777769205e-06, + "loss": 0.96508007, + "memory(GiB)": 302.58, + "step": 187940, + "train_speed(iter/s)": 0.123977 + }, + { + "acc": 0.75447483, + "epoch": 1.051161347059128, + "grad_norm": 5.59375, + "learning_rate": 4.990383074276671e-06, + "loss": 0.95276804, + "memory(GiB)": 302.58, + "step": 187960, + "train_speed(iter/s)": 0.123983 + }, + { + "acc": 0.74852352, + "epoch": 1.0512731965321072, + "grad_norm": 8.9375, + "learning_rate": 4.989458371190221e-06, + "loss": 0.97974024, + "memory(GiB)": 302.58, + "step": 187980, + "train_speed(iter/s)": 0.123989 + }, + { + "acc": 0.76270123, + "epoch": 1.0513850460050864, + "grad_norm": 8.3125, + "learning_rate": 4.988533668464329e-06, + "loss": 0.94214821, + "memory(GiB)": 302.58, + "step": 188000, + "train_speed(iter/s)": 0.123995 + }, + { + "epoch": 1.0513850460050864, + "eval_acc": 0.7064133344257502, + "eval_loss": 1.0142465829849243, + "eval_runtime": 7542.2256, + "eval_samples_per_second": 9.982, + "eval_steps_per_second": 9.982, + "step": 188000 + }, + { + "acc": 0.74902029, + "epoch": 1.0514968954780657, + "grad_norm": 6.8125, + "learning_rate": 4.987608966130622e-06, + "loss": 0.99929733, + "memory(GiB)": 302.58, + "step": 188020, + "train_speed(iter/s)": 0.123379 + }, + { + "acc": 0.74929161, + "epoch": 1.051608744951045, + "grad_norm": 8.4375, + "learning_rate": 4.986684264220727e-06, + "loss": 0.98408861, + "memory(GiB)": 302.58, + "step": 188040, + "train_speed(iter/s)": 0.123385 + }, + { + "acc": 0.74026971, + "epoch": 1.0517205944240242, + "grad_norm": 7.375, + "learning_rate": 4.985759562766274e-06, + "loss": 1.02098932, + "memory(GiB)": 302.58, + "step": 188060, + "train_speed(iter/s)": 0.123391 + }, + { + "acc": 0.75374422, + "epoch": 1.0518324438970035, + "grad_norm": 6.15625, + "learning_rate": 4.984834861798887e-06, + "loss": 0.97500572, + "memory(GiB)": 302.58, + "step": 188080, + "train_speed(iter/s)": 0.123397 + }, + { + "acc": 0.74033117, + "epoch": 1.0519442933699827, + "grad_norm": 8.125, + "learning_rate": 4.983910161350197e-06, + "loss": 1.02558756, + "memory(GiB)": 302.58, + "step": 188100, + "train_speed(iter/s)": 0.123403 + }, + { + "acc": 0.76098843, + "epoch": 1.052056142842962, + "grad_norm": 6.75, + "learning_rate": 4.9829854614518295e-06, + "loss": 0.91893473, + "memory(GiB)": 302.58, + "step": 188120, + "train_speed(iter/s)": 0.123409 + }, + { + "acc": 0.73630004, + "epoch": 1.0521679923159413, + "grad_norm": 7.5625, + "learning_rate": 4.9820607621354125e-06, + "loss": 1.03344126, + "memory(GiB)": 302.58, + "step": 188140, + "train_speed(iter/s)": 0.123415 + }, + { + "acc": 0.75828524, + "epoch": 1.0522798417889205, + "grad_norm": 8.6875, + "learning_rate": 4.981136063432576e-06, + "loss": 0.93078051, + "memory(GiB)": 302.58, + "step": 188160, + "train_speed(iter/s)": 0.123422 + }, + { + "acc": 0.75664821, + "epoch": 1.0523916912618998, + "grad_norm": 7.65625, + "learning_rate": 4.980211365374945e-06, + "loss": 0.96984186, + "memory(GiB)": 302.58, + "step": 188180, + "train_speed(iter/s)": 0.123428 + }, + { + "acc": 0.74916816, + "epoch": 1.052503540734879, + "grad_norm": 8.1875, + "learning_rate": 4.979286667994148e-06, + "loss": 0.98974066, + "memory(GiB)": 302.58, + "step": 188200, + "train_speed(iter/s)": 0.123435 + }, + { + "acc": 0.76152449, + "epoch": 1.0526153902078583, + "grad_norm": 9.125, + "learning_rate": 4.978361971321814e-06, + "loss": 0.9178793, + "memory(GiB)": 302.58, + "step": 188220, + "train_speed(iter/s)": 0.123441 + }, + { + "acc": 0.75531473, + "epoch": 1.0527272396808376, + "grad_norm": 7.59375, + "learning_rate": 4.977437275389568e-06, + "loss": 0.93944578, + "memory(GiB)": 302.58, + "step": 188240, + "train_speed(iter/s)": 0.123447 + }, + { + "acc": 0.75608959, + "epoch": 1.0528390891538169, + "grad_norm": 9.75, + "learning_rate": 4.976512580229039e-06, + "loss": 0.93664551, + "memory(GiB)": 302.58, + "step": 188260, + "train_speed(iter/s)": 0.123453 + }, + { + "acc": 0.75538092, + "epoch": 1.0529509386267961, + "grad_norm": 7.59375, + "learning_rate": 4.9755878858718555e-06, + "loss": 0.97198896, + "memory(GiB)": 302.58, + "step": 188280, + "train_speed(iter/s)": 0.123459 + }, + { + "acc": 0.74561687, + "epoch": 1.0530627880997754, + "grad_norm": 5.4375, + "learning_rate": 4.974663192349643e-06, + "loss": 0.97374535, + "memory(GiB)": 302.58, + "step": 188300, + "train_speed(iter/s)": 0.123465 + }, + { + "acc": 0.76244621, + "epoch": 1.0531746375727546, + "grad_norm": 5.78125, + "learning_rate": 4.973738499694028e-06, + "loss": 0.92975502, + "memory(GiB)": 302.58, + "step": 188320, + "train_speed(iter/s)": 0.123471 + }, + { + "acc": 0.75390892, + "epoch": 1.053286487045734, + "grad_norm": 7.03125, + "learning_rate": 4.972813807936641e-06, + "loss": 0.96431856, + "memory(GiB)": 302.58, + "step": 188340, + "train_speed(iter/s)": 0.123477 + }, + { + "acc": 0.76767988, + "epoch": 1.0533983365187132, + "grad_norm": 7.5625, + "learning_rate": 4.971889117109108e-06, + "loss": 0.90281353, + "memory(GiB)": 302.58, + "step": 188360, + "train_speed(iter/s)": 0.123483 + }, + { + "acc": 0.74292965, + "epoch": 1.0535101859916924, + "grad_norm": 6.71875, + "learning_rate": 4.970964427243056e-06, + "loss": 1.03062944, + "memory(GiB)": 302.58, + "step": 188380, + "train_speed(iter/s)": 0.12349 + }, + { + "acc": 0.76353288, + "epoch": 1.0536220354646717, + "grad_norm": 9.375, + "learning_rate": 4.970039738370113e-06, + "loss": 0.91865578, + "memory(GiB)": 302.58, + "step": 188400, + "train_speed(iter/s)": 0.123496 + }, + { + "acc": 0.73798127, + "epoch": 1.053733884937651, + "grad_norm": 7.5625, + "learning_rate": 4.969115050521906e-06, + "loss": 1.04348097, + "memory(GiB)": 302.58, + "step": 188420, + "train_speed(iter/s)": 0.123502 + }, + { + "acc": 0.75436692, + "epoch": 1.0538457344106302, + "grad_norm": 7.65625, + "learning_rate": 4.968190363730062e-06, + "loss": 0.97039557, + "memory(GiB)": 302.58, + "step": 188440, + "train_speed(iter/s)": 0.123508 + }, + { + "acc": 0.751755, + "epoch": 1.0539575838836095, + "grad_norm": 6.96875, + "learning_rate": 4.967265678026208e-06, + "loss": 0.9768733, + "memory(GiB)": 302.58, + "step": 188460, + "train_speed(iter/s)": 0.123514 + }, + { + "acc": 0.74530611, + "epoch": 1.0540694333565888, + "grad_norm": 7.59375, + "learning_rate": 4.966340993441972e-06, + "loss": 1.00522966, + "memory(GiB)": 302.58, + "step": 188480, + "train_speed(iter/s)": 0.12352 + }, + { + "acc": 0.75819583, + "epoch": 1.054181282829568, + "grad_norm": 10.125, + "learning_rate": 4.965416310008981e-06, + "loss": 0.94907904, + "memory(GiB)": 302.58, + "step": 188500, + "train_speed(iter/s)": 0.123526 + }, + { + "acc": 0.76554756, + "epoch": 1.0542931323025473, + "grad_norm": 9.625, + "learning_rate": 4.964491627758862e-06, + "loss": 0.91172934, + "memory(GiB)": 302.58, + "step": 188520, + "train_speed(iter/s)": 0.123532 + }, + { + "acc": 0.75725665, + "epoch": 1.0544049817755266, + "grad_norm": 8.75, + "learning_rate": 4.963566946723242e-06, + "loss": 0.94255838, + "memory(GiB)": 302.58, + "step": 188540, + "train_speed(iter/s)": 0.123539 + }, + { + "acc": 0.75258565, + "epoch": 1.0545168312485058, + "grad_norm": 8.3125, + "learning_rate": 4.9626422669337475e-06, + "loss": 0.98025084, + "memory(GiB)": 302.58, + "step": 188560, + "train_speed(iter/s)": 0.123545 + }, + { + "acc": 0.73818617, + "epoch": 1.054628680721485, + "grad_norm": 6.8125, + "learning_rate": 4.961717588422005e-06, + "loss": 1.01918221, + "memory(GiB)": 302.58, + "step": 188580, + "train_speed(iter/s)": 0.123552 + }, + { + "acc": 0.75638561, + "epoch": 1.0547405301944643, + "grad_norm": 6.5, + "learning_rate": 4.960792911219643e-06, + "loss": 0.94347658, + "memory(GiB)": 302.58, + "step": 188600, + "train_speed(iter/s)": 0.123558 + }, + { + "acc": 0.76232677, + "epoch": 1.0548523796674436, + "grad_norm": 9.6875, + "learning_rate": 4.959868235358288e-06, + "loss": 0.94551229, + "memory(GiB)": 302.58, + "step": 188620, + "train_speed(iter/s)": 0.123564 + }, + { + "acc": 0.74418578, + "epoch": 1.0549642291404229, + "grad_norm": 7.75, + "learning_rate": 4.958943560869567e-06, + "loss": 1.0048502, + "memory(GiB)": 302.58, + "step": 188640, + "train_speed(iter/s)": 0.12357 + }, + { + "acc": 0.75132608, + "epoch": 1.0550760786134021, + "grad_norm": 6.21875, + "learning_rate": 4.958018887785107e-06, + "loss": 0.98763275, + "memory(GiB)": 302.58, + "step": 188660, + "train_speed(iter/s)": 0.123576 + }, + { + "acc": 0.75603352, + "epoch": 1.0551879280863814, + "grad_norm": 9.375, + "learning_rate": 4.957094216136533e-06, + "loss": 0.95776949, + "memory(GiB)": 302.58, + "step": 188680, + "train_speed(iter/s)": 0.123583 + }, + { + "acc": 0.74107728, + "epoch": 1.0552997775593607, + "grad_norm": 9.625, + "learning_rate": 4.9561695459554735e-06, + "loss": 1.02851963, + "memory(GiB)": 302.58, + "step": 188700, + "train_speed(iter/s)": 0.123589 + }, + { + "acc": 0.76351976, + "epoch": 1.05541162703234, + "grad_norm": 7.21875, + "learning_rate": 4.955244877273553e-06, + "loss": 0.94010057, + "memory(GiB)": 302.58, + "step": 188720, + "train_speed(iter/s)": 0.123595 + }, + { + "acc": 0.75915084, + "epoch": 1.0555234765053192, + "grad_norm": 7.3125, + "learning_rate": 4.9543202101224e-06, + "loss": 0.96124077, + "memory(GiB)": 302.58, + "step": 188740, + "train_speed(iter/s)": 0.123601 + }, + { + "acc": 0.75043559, + "epoch": 1.0556353259782985, + "grad_norm": 7.3125, + "learning_rate": 4.953395544533642e-06, + "loss": 0.98674383, + "memory(GiB)": 302.58, + "step": 188760, + "train_speed(iter/s)": 0.123607 + }, + { + "acc": 0.76040497, + "epoch": 1.0557471754512777, + "grad_norm": 9.5625, + "learning_rate": 4.952470880538903e-06, + "loss": 0.9337719, + "memory(GiB)": 302.58, + "step": 188780, + "train_speed(iter/s)": 0.123613 + }, + { + "acc": 0.74773493, + "epoch": 1.055859024924257, + "grad_norm": 9.0, + "learning_rate": 4.951546218169812e-06, + "loss": 1.01113262, + "memory(GiB)": 302.58, + "step": 188800, + "train_speed(iter/s)": 0.123619 + }, + { + "acc": 0.75994072, + "epoch": 1.0559708743972362, + "grad_norm": 7.1875, + "learning_rate": 4.950621557457994e-06, + "loss": 0.92155123, + "memory(GiB)": 302.58, + "step": 188820, + "train_speed(iter/s)": 0.123626 + }, + { + "acc": 0.75421619, + "epoch": 1.0560827238702155, + "grad_norm": 5.71875, + "learning_rate": 4.949696898435074e-06, + "loss": 0.9597517, + "memory(GiB)": 302.58, + "step": 188840, + "train_speed(iter/s)": 0.123632 + }, + { + "acc": 0.75059047, + "epoch": 1.0561945733431948, + "grad_norm": 5.96875, + "learning_rate": 4.948772241132679e-06, + "loss": 1.00009756, + "memory(GiB)": 302.58, + "step": 188860, + "train_speed(iter/s)": 0.123638 + }, + { + "acc": 0.75920501, + "epoch": 1.056306422816174, + "grad_norm": 5.34375, + "learning_rate": 4.947847585582437e-06, + "loss": 0.93373995, + "memory(GiB)": 302.58, + "step": 188880, + "train_speed(iter/s)": 0.123644 + }, + { + "acc": 0.74619846, + "epoch": 1.0564182722891533, + "grad_norm": 6.78125, + "learning_rate": 4.9469229318159725e-06, + "loss": 0.99080753, + "memory(GiB)": 302.58, + "step": 188900, + "train_speed(iter/s)": 0.12365 + }, + { + "acc": 0.77072821, + "epoch": 1.0565301217621326, + "grad_norm": 5.96875, + "learning_rate": 4.945998279864913e-06, + "loss": 0.89729853, + "memory(GiB)": 302.58, + "step": 188920, + "train_speed(iter/s)": 0.123656 + }, + { + "acc": 0.75219398, + "epoch": 1.0566419712351118, + "grad_norm": 8.5, + "learning_rate": 4.945073629760882e-06, + "loss": 0.96241426, + "memory(GiB)": 302.58, + "step": 188940, + "train_speed(iter/s)": 0.123662 + }, + { + "acc": 0.75432296, + "epoch": 1.056753820708091, + "grad_norm": 6.21875, + "learning_rate": 4.944148981535509e-06, + "loss": 0.95769291, + "memory(GiB)": 302.58, + "step": 188960, + "train_speed(iter/s)": 0.123668 + }, + { + "acc": 0.77081747, + "epoch": 1.0568656701810704, + "grad_norm": 8.4375, + "learning_rate": 4.943224335220417e-06, + "loss": 0.92116919, + "memory(GiB)": 302.58, + "step": 188980, + "train_speed(iter/s)": 0.123674 + }, + { + "acc": 0.75349526, + "epoch": 1.0569775196540496, + "grad_norm": 6.875, + "learning_rate": 4.942299690847232e-06, + "loss": 0.97847595, + "memory(GiB)": 302.58, + "step": 189000, + "train_speed(iter/s)": 0.12368 + }, + { + "acc": 0.75584102, + "epoch": 1.0570893691270289, + "grad_norm": 6.1875, + "learning_rate": 4.941375048447581e-06, + "loss": 0.92791357, + "memory(GiB)": 302.58, + "step": 189020, + "train_speed(iter/s)": 0.123686 + }, + { + "acc": 0.75285001, + "epoch": 1.0572012186000082, + "grad_norm": 7.65625, + "learning_rate": 4.940450408053089e-06, + "loss": 0.97041883, + "memory(GiB)": 302.58, + "step": 189040, + "train_speed(iter/s)": 0.123693 + }, + { + "acc": 0.74762325, + "epoch": 1.0573130680729874, + "grad_norm": 7.03125, + "learning_rate": 4.939525769695382e-06, + "loss": 0.98967295, + "memory(GiB)": 302.58, + "step": 189060, + "train_speed(iter/s)": 0.123699 + }, + { + "acc": 0.7578454, + "epoch": 1.0574249175459667, + "grad_norm": 6.53125, + "learning_rate": 4.9386011334060865e-06, + "loss": 0.96278944, + "memory(GiB)": 302.58, + "step": 189080, + "train_speed(iter/s)": 0.123705 + }, + { + "acc": 0.7461494, + "epoch": 1.057536767018946, + "grad_norm": 8.625, + "learning_rate": 4.937676499216826e-06, + "loss": 0.99718304, + "memory(GiB)": 302.58, + "step": 189100, + "train_speed(iter/s)": 0.123711 + }, + { + "acc": 0.75517354, + "epoch": 1.0576486164919252, + "grad_norm": 5.0, + "learning_rate": 4.936751867159227e-06, + "loss": 0.96584759, + "memory(GiB)": 302.58, + "step": 189120, + "train_speed(iter/s)": 0.123717 + }, + { + "acc": 0.75089278, + "epoch": 1.0577604659649045, + "grad_norm": 8.25, + "learning_rate": 4.935827237264914e-06, + "loss": 1.01297874, + "memory(GiB)": 302.58, + "step": 189140, + "train_speed(iter/s)": 0.123722 + }, + { + "acc": 0.74612603, + "epoch": 1.0578723154378837, + "grad_norm": 8.25, + "learning_rate": 4.9349026095655135e-06, + "loss": 0.99136229, + "memory(GiB)": 302.58, + "step": 189160, + "train_speed(iter/s)": 0.123729 + }, + { + "acc": 0.75549254, + "epoch": 1.057984164910863, + "grad_norm": 6.125, + "learning_rate": 4.9339779840926504e-06, + "loss": 0.96850834, + "memory(GiB)": 302.58, + "step": 189180, + "train_speed(iter/s)": 0.123735 + }, + { + "acc": 0.75621042, + "epoch": 1.0580960143838423, + "grad_norm": 7.375, + "learning_rate": 4.933053360877949e-06, + "loss": 0.9308383, + "memory(GiB)": 302.58, + "step": 189200, + "train_speed(iter/s)": 0.123741 + }, + { + "acc": 0.75295668, + "epoch": 1.0582078638568215, + "grad_norm": 8.75, + "learning_rate": 4.932128739953036e-06, + "loss": 0.95419846, + "memory(GiB)": 302.58, + "step": 189220, + "train_speed(iter/s)": 0.123747 + }, + { + "acc": 0.73634973, + "epoch": 1.0583197133298008, + "grad_norm": 5.8125, + "learning_rate": 4.931204121349535e-06, + "loss": 1.06578865, + "memory(GiB)": 302.58, + "step": 189240, + "train_speed(iter/s)": 0.123754 + }, + { + "acc": 0.74410372, + "epoch": 1.05843156280278, + "grad_norm": 8.625, + "learning_rate": 4.9302795050990695e-06, + "loss": 1.00054207, + "memory(GiB)": 302.58, + "step": 189260, + "train_speed(iter/s)": 0.12376 + }, + { + "acc": 0.76642547, + "epoch": 1.0585434122757593, + "grad_norm": 10.75, + "learning_rate": 4.9293548912332675e-06, + "loss": 0.91712713, + "memory(GiB)": 302.58, + "step": 189280, + "train_speed(iter/s)": 0.123766 + }, + { + "acc": 0.73526516, + "epoch": 1.0586552617487386, + "grad_norm": 9.5, + "learning_rate": 4.928430279783752e-06, + "loss": 1.04160843, + "memory(GiB)": 302.58, + "step": 189300, + "train_speed(iter/s)": 0.123772 + }, + { + "acc": 0.76624818, + "epoch": 1.0587671112217178, + "grad_norm": 9.5, + "learning_rate": 4.927505670782149e-06, + "loss": 0.92306547, + "memory(GiB)": 302.58, + "step": 189320, + "train_speed(iter/s)": 0.123778 + }, + { + "acc": 0.75391779, + "epoch": 1.0588789606946971, + "grad_norm": 6.5625, + "learning_rate": 4.926581064260081e-06, + "loss": 0.96038456, + "memory(GiB)": 302.58, + "step": 189340, + "train_speed(iter/s)": 0.123784 + }, + { + "acc": 0.73725352, + "epoch": 1.0589908101676764, + "grad_norm": 7.09375, + "learning_rate": 4.9256564602491735e-06, + "loss": 1.03313704, + "memory(GiB)": 302.58, + "step": 189360, + "train_speed(iter/s)": 0.12379 + }, + { + "acc": 0.74452786, + "epoch": 1.0591026596406556, + "grad_norm": 5.6875, + "learning_rate": 4.9247318587810506e-06, + "loss": 1.00867853, + "memory(GiB)": 302.58, + "step": 189380, + "train_speed(iter/s)": 0.123797 + }, + { + "acc": 0.74682307, + "epoch": 1.059214509113635, + "grad_norm": 4.71875, + "learning_rate": 4.923807259887335e-06, + "loss": 1.01831837, + "memory(GiB)": 302.58, + "step": 189400, + "train_speed(iter/s)": 0.123803 + }, + { + "acc": 0.75563307, + "epoch": 1.0593263585866142, + "grad_norm": 7.0625, + "learning_rate": 4.922882663599655e-06, + "loss": 0.94350653, + "memory(GiB)": 302.58, + "step": 189420, + "train_speed(iter/s)": 0.123809 + }, + { + "acc": 0.73837776, + "epoch": 1.0594382080595934, + "grad_norm": 8.0625, + "learning_rate": 4.921958069949631e-06, + "loss": 1.02228489, + "memory(GiB)": 302.58, + "step": 189440, + "train_speed(iter/s)": 0.123815 + }, + { + "acc": 0.75445948, + "epoch": 1.0595500575325727, + "grad_norm": 6.96875, + "learning_rate": 4.921033478968889e-06, + "loss": 0.96371517, + "memory(GiB)": 302.58, + "step": 189460, + "train_speed(iter/s)": 0.123822 + }, + { + "acc": 0.76215138, + "epoch": 1.059661907005552, + "grad_norm": 6.375, + "learning_rate": 4.920108890689053e-06, + "loss": 0.91014805, + "memory(GiB)": 302.58, + "step": 189480, + "train_speed(iter/s)": 0.123828 + }, + { + "acc": 0.74299388, + "epoch": 1.0597737564785312, + "grad_norm": 9.3125, + "learning_rate": 4.919184305141746e-06, + "loss": 1.03226032, + "memory(GiB)": 302.58, + "step": 189500, + "train_speed(iter/s)": 0.123835 + }, + { + "acc": 0.74219089, + "epoch": 1.0598856059515105, + "grad_norm": 8.4375, + "learning_rate": 4.9182597223585925e-06, + "loss": 1.03143759, + "memory(GiB)": 302.58, + "step": 189520, + "train_speed(iter/s)": 0.123841 + }, + { + "acc": 0.73763161, + "epoch": 1.0599974554244898, + "grad_norm": 6.3125, + "learning_rate": 4.917335142371216e-06, + "loss": 1.03496046, + "memory(GiB)": 302.58, + "step": 189540, + "train_speed(iter/s)": 0.123848 + }, + { + "acc": 0.74280529, + "epoch": 1.060109304897469, + "grad_norm": 8.4375, + "learning_rate": 4.916410565211239e-06, + "loss": 0.97962704, + "memory(GiB)": 302.58, + "step": 189560, + "train_speed(iter/s)": 0.123855 + }, + { + "acc": 0.74035034, + "epoch": 1.0602211543704483, + "grad_norm": 8.0625, + "learning_rate": 4.915485990910286e-06, + "loss": 1.02369385, + "memory(GiB)": 302.58, + "step": 189580, + "train_speed(iter/s)": 0.123861 + }, + { + "acc": 0.74956365, + "epoch": 1.0603330038434275, + "grad_norm": 9.25, + "learning_rate": 4.914561419499979e-06, + "loss": 0.98719254, + "memory(GiB)": 302.58, + "step": 189600, + "train_speed(iter/s)": 0.123867 + }, + { + "acc": 0.73291097, + "epoch": 1.0604448533164068, + "grad_norm": 5.34375, + "learning_rate": 4.913636851011944e-06, + "loss": 1.04657383, + "memory(GiB)": 302.58, + "step": 189620, + "train_speed(iter/s)": 0.123874 + }, + { + "acc": 0.7637537, + "epoch": 1.060556702789386, + "grad_norm": 8.625, + "learning_rate": 4.912712285477801e-06, + "loss": 0.91965561, + "memory(GiB)": 302.58, + "step": 189640, + "train_speed(iter/s)": 0.123881 + }, + { + "acc": 0.74712296, + "epoch": 1.0606685522623653, + "grad_norm": 6.53125, + "learning_rate": 4.911787722929176e-06, + "loss": 0.99079504, + "memory(GiB)": 302.58, + "step": 189660, + "train_speed(iter/s)": 0.123888 + }, + { + "acc": 0.75160251, + "epoch": 1.0607804017353446, + "grad_norm": 7.1875, + "learning_rate": 4.910863163397691e-06, + "loss": 0.97980433, + "memory(GiB)": 302.58, + "step": 189680, + "train_speed(iter/s)": 0.123894 + }, + { + "acc": 0.73372078, + "epoch": 1.0608922512083239, + "grad_norm": 5.5, + "learning_rate": 4.9099386069149665e-06, + "loss": 1.0582696, + "memory(GiB)": 302.58, + "step": 189700, + "train_speed(iter/s)": 0.1239 + }, + { + "acc": 0.74907589, + "epoch": 1.0610041006813031, + "grad_norm": 9.9375, + "learning_rate": 4.909014053512629e-06, + "loss": 0.98129168, + "memory(GiB)": 302.58, + "step": 189720, + "train_speed(iter/s)": 0.123907 + }, + { + "acc": 0.76547151, + "epoch": 1.0611159501542824, + "grad_norm": 9.0, + "learning_rate": 4.908089503222297e-06, + "loss": 0.92798815, + "memory(GiB)": 302.58, + "step": 189740, + "train_speed(iter/s)": 0.123914 + }, + { + "acc": 0.75310388, + "epoch": 1.0612277996272617, + "grad_norm": 4.4375, + "learning_rate": 4.907164956075598e-06, + "loss": 0.98744936, + "memory(GiB)": 302.58, + "step": 189760, + "train_speed(iter/s)": 0.12392 + }, + { + "acc": 0.76089697, + "epoch": 1.061339649100241, + "grad_norm": 8.625, + "learning_rate": 4.9062404121041505e-06, + "loss": 0.97045765, + "memory(GiB)": 302.58, + "step": 189780, + "train_speed(iter/s)": 0.123927 + }, + { + "acc": 0.74422913, + "epoch": 1.0614514985732202, + "grad_norm": 6.15625, + "learning_rate": 4.905315871339579e-06, + "loss": 1.02415743, + "memory(GiB)": 302.58, + "step": 189800, + "train_speed(iter/s)": 0.123933 + }, + { + "acc": 0.76586056, + "epoch": 1.0615633480461995, + "grad_norm": 5.375, + "learning_rate": 4.904391333813505e-06, + "loss": 0.92306185, + "memory(GiB)": 302.58, + "step": 189820, + "train_speed(iter/s)": 0.12394 + }, + { + "acc": 0.76424713, + "epoch": 1.0616751975191787, + "grad_norm": 6.6875, + "learning_rate": 4.90346679955755e-06, + "loss": 0.91807079, + "memory(GiB)": 302.58, + "step": 189840, + "train_speed(iter/s)": 0.123946 + }, + { + "acc": 0.75228434, + "epoch": 1.061787046992158, + "grad_norm": 5.90625, + "learning_rate": 4.902542268603337e-06, + "loss": 0.97357979, + "memory(GiB)": 302.58, + "step": 189860, + "train_speed(iter/s)": 0.123952 + }, + { + "acc": 0.73359823, + "epoch": 1.0618988964651372, + "grad_norm": 8.625, + "learning_rate": 4.901617740982486e-06, + "loss": 1.04449081, + "memory(GiB)": 302.58, + "step": 189880, + "train_speed(iter/s)": 0.123959 + }, + { + "acc": 0.74772167, + "epoch": 1.0620107459381165, + "grad_norm": 7.75, + "learning_rate": 4.900693216726622e-06, + "loss": 1.00083551, + "memory(GiB)": 302.58, + "step": 189900, + "train_speed(iter/s)": 0.123965 + }, + { + "acc": 0.75339947, + "epoch": 1.0621225954110958, + "grad_norm": 6.0, + "learning_rate": 4.899768695867364e-06, + "loss": 0.96078682, + "memory(GiB)": 302.58, + "step": 189920, + "train_speed(iter/s)": 0.123971 + }, + { + "acc": 0.74873142, + "epoch": 1.062234444884075, + "grad_norm": 7.0625, + "learning_rate": 4.898844178436334e-06, + "loss": 0.97150278, + "memory(GiB)": 302.58, + "step": 189940, + "train_speed(iter/s)": 0.123977 + }, + { + "acc": 0.76354065, + "epoch": 1.0623462943570543, + "grad_norm": 5.0625, + "learning_rate": 4.897919664465155e-06, + "loss": 0.90976658, + "memory(GiB)": 302.58, + "step": 189960, + "train_speed(iter/s)": 0.123984 + }, + { + "acc": 0.74761472, + "epoch": 1.0624581438300336, + "grad_norm": 4.34375, + "learning_rate": 4.896995153985447e-06, + "loss": 0.98684387, + "memory(GiB)": 302.58, + "step": 189980, + "train_speed(iter/s)": 0.123989 + }, + { + "acc": 0.74046922, + "epoch": 1.0625699933030128, + "grad_norm": 7.9375, + "learning_rate": 4.896070647028831e-06, + "loss": 1.02995968, + "memory(GiB)": 302.58, + "step": 190000, + "train_speed(iter/s)": 0.123995 + }, + { + "epoch": 1.0625699933030128, + "eval_acc": 0.7063700025506158, + "eval_loss": 1.0141493082046509, + "eval_runtime": 7494.6925, + "eval_samples_per_second": 10.045, + "eval_steps_per_second": 10.045, + "step": 190000 + }, + { + "acc": 0.74469886, + "epoch": 1.062681842775992, + "grad_norm": 10.875, + "learning_rate": 4.895146143626927e-06, + "loss": 1.0151062, + "memory(GiB)": 302.58, + "step": 190020, + "train_speed(iter/s)": 0.123388 + }, + { + "acc": 0.73918843, + "epoch": 1.0627936922489714, + "grad_norm": 6.84375, + "learning_rate": 4.89422164381136e-06, + "loss": 1.0405591, + "memory(GiB)": 302.58, + "step": 190040, + "train_speed(iter/s)": 0.123395 + }, + { + "acc": 0.75975828, + "epoch": 1.0629055417219506, + "grad_norm": 6.15625, + "learning_rate": 4.893297147613747e-06, + "loss": 0.93935308, + "memory(GiB)": 302.58, + "step": 190060, + "train_speed(iter/s)": 0.123401 + }, + { + "acc": 0.7465723, + "epoch": 1.0630173911949299, + "grad_norm": 8.375, + "learning_rate": 4.89237265506571e-06, + "loss": 0.99927912, + "memory(GiB)": 302.58, + "step": 190080, + "train_speed(iter/s)": 0.123408 + }, + { + "acc": 0.76025953, + "epoch": 1.0631292406679091, + "grad_norm": 7.25, + "learning_rate": 4.89144816619887e-06, + "loss": 0.93919725, + "memory(GiB)": 302.58, + "step": 190100, + "train_speed(iter/s)": 0.123413 + }, + { + "acc": 0.75082235, + "epoch": 1.0632410901408884, + "grad_norm": 10.875, + "learning_rate": 4.890523681044846e-06, + "loss": 0.97911453, + "memory(GiB)": 302.58, + "step": 190120, + "train_speed(iter/s)": 0.12342 + }, + { + "acc": 0.74806833, + "epoch": 1.0633529396138677, + "grad_norm": 7.125, + "learning_rate": 4.8895991996352585e-06, + "loss": 0.99368744, + "memory(GiB)": 302.58, + "step": 190140, + "train_speed(iter/s)": 0.123425 + }, + { + "acc": 0.76144099, + "epoch": 1.063464789086847, + "grad_norm": 7.90625, + "learning_rate": 4.8886747220017295e-06, + "loss": 0.94013624, + "memory(GiB)": 302.58, + "step": 190160, + "train_speed(iter/s)": 0.123431 + }, + { + "acc": 0.75289555, + "epoch": 1.0635766385598262, + "grad_norm": 6.6875, + "learning_rate": 4.887750248175878e-06, + "loss": 0.95830364, + "memory(GiB)": 302.58, + "step": 190180, + "train_speed(iter/s)": 0.123437 + }, + { + "acc": 0.75569153, + "epoch": 1.0636884880328055, + "grad_norm": 6.71875, + "learning_rate": 4.886825778189324e-06, + "loss": 0.95862427, + "memory(GiB)": 302.58, + "step": 190200, + "train_speed(iter/s)": 0.123443 + }, + { + "acc": 0.73178835, + "epoch": 1.0638003375057847, + "grad_norm": 7.15625, + "learning_rate": 4.885901312073687e-06, + "loss": 1.04812336, + "memory(GiB)": 302.58, + "step": 190220, + "train_speed(iter/s)": 0.123449 + }, + { + "acc": 0.75881686, + "epoch": 1.063912186978764, + "grad_norm": 8.9375, + "learning_rate": 4.884976849860588e-06, + "loss": 0.94074736, + "memory(GiB)": 302.58, + "step": 190240, + "train_speed(iter/s)": 0.123455 + }, + { + "acc": 0.76183867, + "epoch": 1.0640240364517433, + "grad_norm": 7.3125, + "learning_rate": 4.884052391581645e-06, + "loss": 0.95502872, + "memory(GiB)": 302.58, + "step": 190260, + "train_speed(iter/s)": 0.123461 + }, + { + "acc": 0.74262695, + "epoch": 1.0641358859247225, + "grad_norm": 7.25, + "learning_rate": 4.883127937268476e-06, + "loss": 1.03535795, + "memory(GiB)": 302.58, + "step": 190280, + "train_speed(iter/s)": 0.123467 + }, + { + "acc": 0.73336091, + "epoch": 1.0642477353977018, + "grad_norm": 6.375, + "learning_rate": 4.8822034869527035e-06, + "loss": 1.05744972, + "memory(GiB)": 302.58, + "step": 190300, + "train_speed(iter/s)": 0.123473 + }, + { + "acc": 0.76008472, + "epoch": 1.064359584870681, + "grad_norm": 10.875, + "learning_rate": 4.881279040665945e-06, + "loss": 0.94824114, + "memory(GiB)": 302.58, + "step": 190320, + "train_speed(iter/s)": 0.123479 + }, + { + "acc": 0.75176039, + "epoch": 1.0644714343436603, + "grad_norm": 7.09375, + "learning_rate": 4.880354598439821e-06, + "loss": 0.98035078, + "memory(GiB)": 302.58, + "step": 190340, + "train_speed(iter/s)": 0.123485 + }, + { + "acc": 0.74063778, + "epoch": 1.0645832838166396, + "grad_norm": 6.15625, + "learning_rate": 4.879430160305949e-06, + "loss": 1.02312956, + "memory(GiB)": 302.58, + "step": 190360, + "train_speed(iter/s)": 0.123491 + }, + { + "acc": 0.73913894, + "epoch": 1.0646951332896188, + "grad_norm": 7.5625, + "learning_rate": 4.878505726295947e-06, + "loss": 1.04684649, + "memory(GiB)": 302.58, + "step": 190380, + "train_speed(iter/s)": 0.123497 + }, + { + "acc": 0.74554563, + "epoch": 1.064806982762598, + "grad_norm": 6.46875, + "learning_rate": 4.877581296441434e-06, + "loss": 0.99105892, + "memory(GiB)": 302.58, + "step": 190400, + "train_speed(iter/s)": 0.123504 + }, + { + "acc": 0.74295874, + "epoch": 1.0649188322355774, + "grad_norm": 6.3125, + "learning_rate": 4.876656870774029e-06, + "loss": 1.01805897, + "memory(GiB)": 302.58, + "step": 190420, + "train_speed(iter/s)": 0.12351 + }, + { + "acc": 0.72980285, + "epoch": 1.0650306817085566, + "grad_norm": 5.71875, + "learning_rate": 4.87573244932535e-06, + "loss": 1.09050655, + "memory(GiB)": 302.58, + "step": 190440, + "train_speed(iter/s)": 0.123516 + }, + { + "acc": 0.74569345, + "epoch": 1.065142531181536, + "grad_norm": 6.0625, + "learning_rate": 4.874808032127016e-06, + "loss": 0.99015303, + "memory(GiB)": 302.58, + "step": 190460, + "train_speed(iter/s)": 0.123522 + }, + { + "acc": 0.74275327, + "epoch": 1.0652543806545152, + "grad_norm": 7.46875, + "learning_rate": 4.873883619210644e-06, + "loss": 1.01017418, + "memory(GiB)": 302.58, + "step": 190480, + "train_speed(iter/s)": 0.123528 + }, + { + "acc": 0.74925442, + "epoch": 1.0653662301274944, + "grad_norm": 6.15625, + "learning_rate": 4.872959210607853e-06, + "loss": 0.96958847, + "memory(GiB)": 302.58, + "step": 190500, + "train_speed(iter/s)": 0.123533 + }, + { + "acc": 0.75450358, + "epoch": 1.0654780796004737, + "grad_norm": 6.96875, + "learning_rate": 4.872034806350259e-06, + "loss": 0.98796864, + "memory(GiB)": 302.58, + "step": 190520, + "train_speed(iter/s)": 0.123539 + }, + { + "acc": 0.73998432, + "epoch": 1.065589929073453, + "grad_norm": 9.3125, + "learning_rate": 4.87111040646948e-06, + "loss": 1.02252054, + "memory(GiB)": 302.58, + "step": 190540, + "train_speed(iter/s)": 0.123545 + }, + { + "acc": 0.75959711, + "epoch": 1.0657017785464322, + "grad_norm": 10.75, + "learning_rate": 4.870186010997133e-06, + "loss": 0.94249172, + "memory(GiB)": 302.58, + "step": 190560, + "train_speed(iter/s)": 0.123551 + }, + { + "acc": 0.75707278, + "epoch": 1.0658136280194115, + "grad_norm": 9.75, + "learning_rate": 4.869261619964837e-06, + "loss": 0.97093229, + "memory(GiB)": 302.58, + "step": 190580, + "train_speed(iter/s)": 0.123557 + }, + { + "acc": 0.7565917, + "epoch": 1.0659254774923907, + "grad_norm": 7.46875, + "learning_rate": 4.868337233404209e-06, + "loss": 0.94826574, + "memory(GiB)": 302.58, + "step": 190600, + "train_speed(iter/s)": 0.123563 + }, + { + "acc": 0.75205922, + "epoch": 1.06603732696537, + "grad_norm": 8.875, + "learning_rate": 4.8674128513468646e-06, + "loss": 0.96496363, + "memory(GiB)": 302.58, + "step": 190620, + "train_speed(iter/s)": 0.123569 + }, + { + "acc": 0.76588244, + "epoch": 1.0661491764383493, + "grad_norm": 5.65625, + "learning_rate": 4.866488473824419e-06, + "loss": 0.92061367, + "memory(GiB)": 302.58, + "step": 190640, + "train_speed(iter/s)": 0.123575 + }, + { + "acc": 0.75195737, + "epoch": 1.0662610259113285, + "grad_norm": 6.5625, + "learning_rate": 4.865564100868493e-06, + "loss": 0.9548111, + "memory(GiB)": 302.58, + "step": 190660, + "train_speed(iter/s)": 0.123581 + }, + { + "acc": 0.74815931, + "epoch": 1.0663728753843078, + "grad_norm": 9.3125, + "learning_rate": 4.864639732510701e-06, + "loss": 1.01995058, + "memory(GiB)": 302.58, + "step": 190680, + "train_speed(iter/s)": 0.123587 + }, + { + "acc": 0.73135371, + "epoch": 1.066484724857287, + "grad_norm": 6.625, + "learning_rate": 4.86371536878266e-06, + "loss": 1.06843033, + "memory(GiB)": 302.58, + "step": 190700, + "train_speed(iter/s)": 0.123593 + }, + { + "acc": 0.7511086, + "epoch": 1.0665965743302663, + "grad_norm": 7.6875, + "learning_rate": 4.862791009715984e-06, + "loss": 1.00310068, + "memory(GiB)": 302.58, + "step": 190720, + "train_speed(iter/s)": 0.123599 + }, + { + "acc": 0.76178203, + "epoch": 1.0667084238032456, + "grad_norm": 10.875, + "learning_rate": 4.861866655342292e-06, + "loss": 0.94237461, + "memory(GiB)": 302.58, + "step": 190740, + "train_speed(iter/s)": 0.123605 + }, + { + "acc": 0.72807178, + "epoch": 1.0668202732762249, + "grad_norm": 8.5625, + "learning_rate": 4.860942305693197e-06, + "loss": 1.07276297, + "memory(GiB)": 302.58, + "step": 190760, + "train_speed(iter/s)": 0.123611 + }, + { + "acc": 0.75812869, + "epoch": 1.0669321227492041, + "grad_norm": 6.8125, + "learning_rate": 4.860017960800317e-06, + "loss": 0.92707157, + "memory(GiB)": 302.58, + "step": 190780, + "train_speed(iter/s)": 0.123617 + }, + { + "acc": 0.75870843, + "epoch": 1.0670439722221834, + "grad_norm": 7.65625, + "learning_rate": 4.859093620695267e-06, + "loss": 0.95147972, + "memory(GiB)": 302.58, + "step": 190800, + "train_speed(iter/s)": 0.123623 + }, + { + "acc": 0.75048327, + "epoch": 1.0671558216951627, + "grad_norm": 5.25, + "learning_rate": 4.858169285409661e-06, + "loss": 0.95087814, + "memory(GiB)": 302.58, + "step": 190820, + "train_speed(iter/s)": 0.12363 + }, + { + "acc": 0.74918332, + "epoch": 1.067267671168142, + "grad_norm": 10.25, + "learning_rate": 4.857244954975116e-06, + "loss": 0.98250246, + "memory(GiB)": 302.58, + "step": 190840, + "train_speed(iter/s)": 0.123636 + }, + { + "acc": 0.74870749, + "epoch": 1.0673795206411212, + "grad_norm": 8.75, + "learning_rate": 4.856320629423246e-06, + "loss": 1.00202961, + "memory(GiB)": 302.58, + "step": 190860, + "train_speed(iter/s)": 0.123642 + }, + { + "acc": 0.72654481, + "epoch": 1.0674913701141004, + "grad_norm": 6.625, + "learning_rate": 4.8553963087856644e-06, + "loss": 1.08741283, + "memory(GiB)": 302.58, + "step": 190880, + "train_speed(iter/s)": 0.123648 + }, + { + "acc": 0.74446473, + "epoch": 1.0676032195870797, + "grad_norm": 5.65625, + "learning_rate": 4.8544719930939895e-06, + "loss": 1.0363966, + "memory(GiB)": 302.58, + "step": 190900, + "train_speed(iter/s)": 0.123654 + }, + { + "acc": 0.75083523, + "epoch": 1.067715069060059, + "grad_norm": 6.625, + "learning_rate": 4.8535476823798335e-06, + "loss": 0.966681, + "memory(GiB)": 302.58, + "step": 190920, + "train_speed(iter/s)": 0.12366 + }, + { + "acc": 0.74454446, + "epoch": 1.0678269185330382, + "grad_norm": 8.8125, + "learning_rate": 4.852623376674812e-06, + "loss": 0.99920578, + "memory(GiB)": 302.58, + "step": 190940, + "train_speed(iter/s)": 0.123666 + }, + { + "acc": 0.74614587, + "epoch": 1.0679387680060175, + "grad_norm": 7.25, + "learning_rate": 4.851699076010537e-06, + "loss": 1.01976843, + "memory(GiB)": 302.58, + "step": 190960, + "train_speed(iter/s)": 0.123672 + }, + { + "acc": 0.76187916, + "epoch": 1.0680506174789968, + "grad_norm": 8.375, + "learning_rate": 4.850774780418625e-06, + "loss": 0.93591089, + "memory(GiB)": 302.58, + "step": 190980, + "train_speed(iter/s)": 0.123678 + }, + { + "acc": 0.73951387, + "epoch": 1.068162466951976, + "grad_norm": 5.0, + "learning_rate": 4.849850489930688e-06, + "loss": 1.03574982, + "memory(GiB)": 302.58, + "step": 191000, + "train_speed(iter/s)": 0.123684 + }, + { + "acc": 0.74963326, + "epoch": 1.0682743164249553, + "grad_norm": 9.125, + "learning_rate": 4.848926204578339e-06, + "loss": 0.99242001, + "memory(GiB)": 302.58, + "step": 191020, + "train_speed(iter/s)": 0.12369 + }, + { + "acc": 0.74546537, + "epoch": 1.0683861658979346, + "grad_norm": 4.53125, + "learning_rate": 4.848001924393195e-06, + "loss": 0.99471197, + "memory(GiB)": 302.58, + "step": 191040, + "train_speed(iter/s)": 0.123697 + }, + { + "acc": 0.74229612, + "epoch": 1.0684980153709138, + "grad_norm": 8.1875, + "learning_rate": 4.847077649406866e-06, + "loss": 0.99994173, + "memory(GiB)": 302.58, + "step": 191060, + "train_speed(iter/s)": 0.123703 + }, + { + "acc": 0.7399631, + "epoch": 1.068609864843893, + "grad_norm": 8.625, + "learning_rate": 4.846153379650967e-06, + "loss": 1.02709932, + "memory(GiB)": 302.58, + "step": 191080, + "train_speed(iter/s)": 0.123709 + }, + { + "acc": 0.75927491, + "epoch": 1.0687217143168724, + "grad_norm": 7.84375, + "learning_rate": 4.84522911515711e-06, + "loss": 0.94688292, + "memory(GiB)": 302.58, + "step": 191100, + "train_speed(iter/s)": 0.123716 + }, + { + "acc": 0.75349226, + "epoch": 1.0688335637898516, + "grad_norm": 7.1875, + "learning_rate": 4.844304855956908e-06, + "loss": 0.948909, + "memory(GiB)": 302.58, + "step": 191120, + "train_speed(iter/s)": 0.123723 + }, + { + "acc": 0.74588857, + "epoch": 1.0689454132628309, + "grad_norm": 6.875, + "learning_rate": 4.843380602081974e-06, + "loss": 1.00236053, + "memory(GiB)": 302.58, + "step": 191140, + "train_speed(iter/s)": 0.123729 + }, + { + "acc": 0.75967579, + "epoch": 1.0690572627358101, + "grad_norm": 9.5625, + "learning_rate": 4.842456353563918e-06, + "loss": 0.95296364, + "memory(GiB)": 302.58, + "step": 191160, + "train_speed(iter/s)": 0.123736 + }, + { + "acc": 0.76334114, + "epoch": 1.0691691122087894, + "grad_norm": 6.65625, + "learning_rate": 4.841532110434357e-06, + "loss": 0.90396786, + "memory(GiB)": 302.58, + "step": 191180, + "train_speed(iter/s)": 0.123742 + }, + { + "acc": 0.73737712, + "epoch": 1.0692809616817687, + "grad_norm": 6.8125, + "learning_rate": 4.840607872724899e-06, + "loss": 1.05961609, + "memory(GiB)": 302.58, + "step": 191200, + "train_speed(iter/s)": 0.123749 + }, + { + "acc": 0.74471722, + "epoch": 1.069392811154748, + "grad_norm": 6.375, + "learning_rate": 4.839683640467157e-06, + "loss": 0.99766455, + "memory(GiB)": 302.58, + "step": 191220, + "train_speed(iter/s)": 0.123755 + }, + { + "acc": 0.73406963, + "epoch": 1.0695046606277272, + "grad_norm": 4.53125, + "learning_rate": 4.8387594136927436e-06, + "loss": 1.03531122, + "memory(GiB)": 302.58, + "step": 191240, + "train_speed(iter/s)": 0.123761 + }, + { + "acc": 0.74991169, + "epoch": 1.0696165101007065, + "grad_norm": 5.65625, + "learning_rate": 4.837835192433269e-06, + "loss": 0.98957033, + "memory(GiB)": 302.58, + "step": 191260, + "train_speed(iter/s)": 0.123768 + }, + { + "acc": 0.74187093, + "epoch": 1.0697283595736857, + "grad_norm": 6.8125, + "learning_rate": 4.836910976720345e-06, + "loss": 1.03029633, + "memory(GiB)": 302.58, + "step": 191280, + "train_speed(iter/s)": 0.123774 + }, + { + "acc": 0.74514894, + "epoch": 1.069840209046665, + "grad_norm": 4.9375, + "learning_rate": 4.835986766585582e-06, + "loss": 1.00158415, + "memory(GiB)": 302.58, + "step": 191300, + "train_speed(iter/s)": 0.12378 + }, + { + "acc": 0.73477678, + "epoch": 1.0699520585196443, + "grad_norm": 6.09375, + "learning_rate": 4.8350625620605925e-06, + "loss": 1.07401543, + "memory(GiB)": 302.58, + "step": 191320, + "train_speed(iter/s)": 0.123787 + }, + { + "acc": 0.73687439, + "epoch": 1.0700639079926235, + "grad_norm": 6.78125, + "learning_rate": 4.834138363176987e-06, + "loss": 1.03579693, + "memory(GiB)": 302.58, + "step": 191340, + "train_speed(iter/s)": 0.123793 + }, + { + "acc": 0.76638827, + "epoch": 1.0701757574656028, + "grad_norm": 7.28125, + "learning_rate": 4.833214169966375e-06, + "loss": 0.89908094, + "memory(GiB)": 302.58, + "step": 191360, + "train_speed(iter/s)": 0.123799 + }, + { + "acc": 0.73746357, + "epoch": 1.070287606938582, + "grad_norm": 9.75, + "learning_rate": 4.832289982460367e-06, + "loss": 1.04609718, + "memory(GiB)": 302.58, + "step": 191380, + "train_speed(iter/s)": 0.123805 + }, + { + "acc": 0.75066004, + "epoch": 1.0703994564115613, + "grad_norm": 7.3125, + "learning_rate": 4.831365800690574e-06, + "loss": 0.9894165, + "memory(GiB)": 302.58, + "step": 191400, + "train_speed(iter/s)": 0.123811 + }, + { + "acc": 0.75362024, + "epoch": 1.0705113058845406, + "grad_norm": 7.9375, + "learning_rate": 4.8304416246886045e-06, + "loss": 0.96225996, + "memory(GiB)": 302.58, + "step": 191420, + "train_speed(iter/s)": 0.123817 + }, + { + "acc": 0.7483602, + "epoch": 1.0706231553575198, + "grad_norm": 4.0625, + "learning_rate": 4.829517454486069e-06, + "loss": 0.98400688, + "memory(GiB)": 302.58, + "step": 191440, + "train_speed(iter/s)": 0.123823 + }, + { + "acc": 0.74222937, + "epoch": 1.070735004830499, + "grad_norm": 5.71875, + "learning_rate": 4.828593290114578e-06, + "loss": 1.02995796, + "memory(GiB)": 302.58, + "step": 191460, + "train_speed(iter/s)": 0.123829 + }, + { + "acc": 0.74712362, + "epoch": 1.0708468543034784, + "grad_norm": 4.90625, + "learning_rate": 4.82766913160574e-06, + "loss": 0.99179401, + "memory(GiB)": 302.58, + "step": 191480, + "train_speed(iter/s)": 0.123835 + }, + { + "acc": 0.74311051, + "epoch": 1.0709587037764576, + "grad_norm": 7.1875, + "learning_rate": 4.8267449789911634e-06, + "loss": 0.99547768, + "memory(GiB)": 302.58, + "step": 191500, + "train_speed(iter/s)": 0.123841 + }, + { + "acc": 0.7355587, + "epoch": 1.071070553249437, + "grad_norm": 5.5625, + "learning_rate": 4.825820832302458e-06, + "loss": 1.01636686, + "memory(GiB)": 302.58, + "step": 191520, + "train_speed(iter/s)": 0.123848 + }, + { + "acc": 0.74026065, + "epoch": 1.0711824027224162, + "grad_norm": 8.375, + "learning_rate": 4.824896691571232e-06, + "loss": 1.03497448, + "memory(GiB)": 302.58, + "step": 191540, + "train_speed(iter/s)": 0.123854 + }, + { + "acc": 0.75304704, + "epoch": 1.0712942521953954, + "grad_norm": 6.8125, + "learning_rate": 4.823972556829096e-06, + "loss": 0.96763077, + "memory(GiB)": 302.58, + "step": 191560, + "train_speed(iter/s)": 0.123861 + }, + { + "acc": 0.7428834, + "epoch": 1.0714061016683747, + "grad_norm": 9.5, + "learning_rate": 4.823048428107655e-06, + "loss": 1.01354809, + "memory(GiB)": 302.58, + "step": 191580, + "train_speed(iter/s)": 0.123867 + }, + { + "acc": 0.74455733, + "epoch": 1.071517951141354, + "grad_norm": 5.78125, + "learning_rate": 4.8221243054385195e-06, + "loss": 0.99949532, + "memory(GiB)": 302.58, + "step": 191600, + "train_speed(iter/s)": 0.123874 + }, + { + "acc": 0.75496387, + "epoch": 1.0716298006143332, + "grad_norm": 10.375, + "learning_rate": 4.821200188853297e-06, + "loss": 0.95339975, + "memory(GiB)": 302.58, + "step": 191620, + "train_speed(iter/s)": 0.12388 + }, + { + "acc": 0.74102778, + "epoch": 1.0717416500873125, + "grad_norm": 7.0625, + "learning_rate": 4.820276078383595e-06, + "loss": 1.02815571, + "memory(GiB)": 302.58, + "step": 191640, + "train_speed(iter/s)": 0.123886 + }, + { + "acc": 0.76341963, + "epoch": 1.0718534995602917, + "grad_norm": 10.5, + "learning_rate": 4.8193519740610216e-06, + "loss": 0.92980175, + "memory(GiB)": 302.58, + "step": 191660, + "train_speed(iter/s)": 0.123892 + }, + { + "acc": 0.76493297, + "epoch": 1.071965349033271, + "grad_norm": 5.53125, + "learning_rate": 4.818427875917182e-06, + "loss": 0.91644707, + "memory(GiB)": 302.58, + "step": 191680, + "train_speed(iter/s)": 0.123898 + }, + { + "acc": 0.75993609, + "epoch": 1.0720771985062503, + "grad_norm": 6.875, + "learning_rate": 4.817503783983685e-06, + "loss": 0.9447032, + "memory(GiB)": 302.58, + "step": 191700, + "train_speed(iter/s)": 0.123905 + }, + { + "acc": 0.75329099, + "epoch": 1.0721890479792295, + "grad_norm": 8.9375, + "learning_rate": 4.816579698292138e-06, + "loss": 0.95717258, + "memory(GiB)": 302.58, + "step": 191720, + "train_speed(iter/s)": 0.123911 + }, + { + "acc": 0.75722213, + "epoch": 1.0723008974522088, + "grad_norm": 8.1875, + "learning_rate": 4.8156556188741465e-06, + "loss": 0.96738501, + "memory(GiB)": 302.58, + "step": 191740, + "train_speed(iter/s)": 0.123917 + }, + { + "acc": 0.74324613, + "epoch": 1.072412746925188, + "grad_norm": 12.4375, + "learning_rate": 4.814731545761318e-06, + "loss": 1.00719719, + "memory(GiB)": 302.58, + "step": 191760, + "train_speed(iter/s)": 0.123923 + }, + { + "acc": 0.74890876, + "epoch": 1.0725245963981673, + "grad_norm": 6.53125, + "learning_rate": 4.8138074789852575e-06, + "loss": 0.98177261, + "memory(GiB)": 302.58, + "step": 191780, + "train_speed(iter/s)": 0.123929 + }, + { + "acc": 0.7532639, + "epoch": 1.0726364458711466, + "grad_norm": 5.96875, + "learning_rate": 4.812883418577572e-06, + "loss": 0.97057924, + "memory(GiB)": 302.58, + "step": 191800, + "train_speed(iter/s)": 0.123935 + }, + { + "acc": 0.75274086, + "epoch": 1.0727482953441259, + "grad_norm": 8.3125, + "learning_rate": 4.811959364569868e-06, + "loss": 0.97354155, + "memory(GiB)": 302.58, + "step": 191820, + "train_speed(iter/s)": 0.123941 + }, + { + "acc": 0.75027704, + "epoch": 1.0728601448171051, + "grad_norm": 7.21875, + "learning_rate": 4.8110353169937475e-06, + "loss": 0.97924414, + "memory(GiB)": 302.58, + "step": 191840, + "train_speed(iter/s)": 0.123947 + }, + { + "acc": 0.74429741, + "epoch": 1.0729719942900844, + "grad_norm": 5.71875, + "learning_rate": 4.81011127588082e-06, + "loss": 1.00134468, + "memory(GiB)": 302.58, + "step": 191860, + "train_speed(iter/s)": 0.123953 + }, + { + "acc": 0.75615568, + "epoch": 1.0730838437630636, + "grad_norm": 7.96875, + "learning_rate": 4.809187241262688e-06, + "loss": 0.95214367, + "memory(GiB)": 302.58, + "step": 191880, + "train_speed(iter/s)": 0.123959 + }, + { + "acc": 0.76024036, + "epoch": 1.073195693236043, + "grad_norm": 6.15625, + "learning_rate": 4.808263213170956e-06, + "loss": 0.93661079, + "memory(GiB)": 302.58, + "step": 191900, + "train_speed(iter/s)": 0.123965 + }, + { + "acc": 0.7414907, + "epoch": 1.0733075427090222, + "grad_norm": 7.0, + "learning_rate": 4.807339191637232e-06, + "loss": 1.02180681, + "memory(GiB)": 302.58, + "step": 191920, + "train_speed(iter/s)": 0.123971 + }, + { + "acc": 0.75033445, + "epoch": 1.0734193921820014, + "grad_norm": 5.125, + "learning_rate": 4.80641517669312e-06, + "loss": 0.98732061, + "memory(GiB)": 302.58, + "step": 191940, + "train_speed(iter/s)": 0.123977 + }, + { + "acc": 0.7518775, + "epoch": 1.0735312416549807, + "grad_norm": 9.1875, + "learning_rate": 4.805491168370221e-06, + "loss": 0.95480127, + "memory(GiB)": 302.58, + "step": 191960, + "train_speed(iter/s)": 0.123983 + }, + { + "acc": 0.75711627, + "epoch": 1.07364309112796, + "grad_norm": 6.21875, + "learning_rate": 4.804567166700142e-06, + "loss": 0.95671415, + "memory(GiB)": 302.58, + "step": 191980, + "train_speed(iter/s)": 0.123989 + }, + { + "acc": 0.74852509, + "epoch": 1.0737549406009392, + "grad_norm": 6.46875, + "learning_rate": 4.803643171714484e-06, + "loss": 0.9822917, + "memory(GiB)": 302.58, + "step": 192000, + "train_speed(iter/s)": 0.123995 + }, + { + "epoch": 1.0737549406009392, + "eval_acc": 0.7064164894200716, + "eval_loss": 1.0142232179641724, + "eval_runtime": 7524.9382, + "eval_samples_per_second": 10.004, + "eval_steps_per_second": 10.004, + "step": 192000 + }, + { + "acc": 0.75887632, + "epoch": 1.0738667900739185, + "grad_norm": 7.15625, + "learning_rate": 4.802719183444854e-06, + "loss": 0.92532654, + "memory(GiB)": 302.58, + "step": 192020, + "train_speed(iter/s)": 0.123393 + }, + { + "acc": 0.73977094, + "epoch": 1.0739786395468978, + "grad_norm": 8.9375, + "learning_rate": 4.801795201922851e-06, + "loss": 1.04686966, + "memory(GiB)": 302.58, + "step": 192040, + "train_speed(iter/s)": 0.123398 + }, + { + "acc": 0.75331717, + "epoch": 1.074090489019877, + "grad_norm": 7.1875, + "learning_rate": 4.8008712271800826e-06, + "loss": 0.95574722, + "memory(GiB)": 302.58, + "step": 192060, + "train_speed(iter/s)": 0.123404 + }, + { + "acc": 0.73172092, + "epoch": 1.0742023384928563, + "grad_norm": 8.1875, + "learning_rate": 4.7999472592481485e-06, + "loss": 1.06318455, + "memory(GiB)": 302.58, + "step": 192080, + "train_speed(iter/s)": 0.123411 + }, + { + "acc": 0.75308971, + "epoch": 1.0743141879658356, + "grad_norm": 8.8125, + "learning_rate": 4.799023298158654e-06, + "loss": 0.98160744, + "memory(GiB)": 302.58, + "step": 192100, + "train_speed(iter/s)": 0.123417 + }, + { + "acc": 0.75398817, + "epoch": 1.0744260374388148, + "grad_norm": 9.9375, + "learning_rate": 4.798099343943199e-06, + "loss": 0.98525543, + "memory(GiB)": 302.58, + "step": 192120, + "train_speed(iter/s)": 0.123423 + }, + { + "acc": 0.75384078, + "epoch": 1.074537886911794, + "grad_norm": 6.8125, + "learning_rate": 4.797175396633386e-06, + "loss": 0.97586365, + "memory(GiB)": 302.58, + "step": 192140, + "train_speed(iter/s)": 0.123428 + }, + { + "acc": 0.74672389, + "epoch": 1.0746497363847733, + "grad_norm": 7.3125, + "learning_rate": 4.7962514562608195e-06, + "loss": 1.01661367, + "memory(GiB)": 302.58, + "step": 192160, + "train_speed(iter/s)": 0.123434 + }, + { + "acc": 0.74464431, + "epoch": 1.0747615858577526, + "grad_norm": 6.96875, + "learning_rate": 4.7953275228570976e-06, + "loss": 1.02506266, + "memory(GiB)": 302.58, + "step": 192180, + "train_speed(iter/s)": 0.12344 + }, + { + "acc": 0.74905605, + "epoch": 1.0748734353307319, + "grad_norm": 7.28125, + "learning_rate": 4.7944035964538234e-06, + "loss": 0.98047113, + "memory(GiB)": 302.58, + "step": 192200, + "train_speed(iter/s)": 0.123446 + }, + { + "acc": 0.74631124, + "epoch": 1.0749852848037111, + "grad_norm": 8.8125, + "learning_rate": 4.793479677082599e-06, + "loss": 1.00151768, + "memory(GiB)": 302.58, + "step": 192220, + "train_speed(iter/s)": 0.123452 + }, + { + "acc": 0.7473495, + "epoch": 1.0750971342766904, + "grad_norm": 10.0625, + "learning_rate": 4.792555764775025e-06, + "loss": 1.00528841, + "memory(GiB)": 302.58, + "step": 192240, + "train_speed(iter/s)": 0.123457 + }, + { + "acc": 0.76312032, + "epoch": 1.0752089837496697, + "grad_norm": 11.0, + "learning_rate": 4.7916318595627006e-06, + "loss": 0.92725945, + "memory(GiB)": 302.58, + "step": 192260, + "train_speed(iter/s)": 0.123463 + }, + { + "acc": 0.73602962, + "epoch": 1.075320833222649, + "grad_norm": 6.75, + "learning_rate": 4.790707961477227e-06, + "loss": 1.04741383, + "memory(GiB)": 302.58, + "step": 192280, + "train_speed(iter/s)": 0.12347 + }, + { + "acc": 0.74306555, + "epoch": 1.0754326826956282, + "grad_norm": 8.4375, + "learning_rate": 4.789784070550206e-06, + "loss": 1.0302063, + "memory(GiB)": 302.58, + "step": 192300, + "train_speed(iter/s)": 0.123475 + }, + { + "acc": 0.76107278, + "epoch": 1.0755445321686075, + "grad_norm": 7.40625, + "learning_rate": 4.788860186813235e-06, + "loss": 0.94909477, + "memory(GiB)": 302.58, + "step": 192320, + "train_speed(iter/s)": 0.123482 + }, + { + "acc": 0.75646605, + "epoch": 1.0756563816415867, + "grad_norm": 9.5625, + "learning_rate": 4.787936310297915e-06, + "loss": 0.93424301, + "memory(GiB)": 302.58, + "step": 192340, + "train_speed(iter/s)": 0.123488 + }, + { + "acc": 0.74461651, + "epoch": 1.075768231114566, + "grad_norm": 8.0625, + "learning_rate": 4.787012441035846e-06, + "loss": 0.98763409, + "memory(GiB)": 302.58, + "step": 192360, + "train_speed(iter/s)": 0.123494 + }, + { + "acc": 0.74337258, + "epoch": 1.0758800805875453, + "grad_norm": 8.1875, + "learning_rate": 4.786088579058628e-06, + "loss": 0.9939147, + "memory(GiB)": 302.58, + "step": 192380, + "train_speed(iter/s)": 0.1235 + }, + { + "acc": 0.75293036, + "epoch": 1.0759919300605245, + "grad_norm": 6.28125, + "learning_rate": 4.785164724397858e-06, + "loss": 0.98942289, + "memory(GiB)": 302.58, + "step": 192400, + "train_speed(iter/s)": 0.123506 + }, + { + "acc": 0.75261292, + "epoch": 1.0761037795335038, + "grad_norm": 7.28125, + "learning_rate": 4.784240877085136e-06, + "loss": 0.97770033, + "memory(GiB)": 302.58, + "step": 192420, + "train_speed(iter/s)": 0.123512 + }, + { + "acc": 0.74772844, + "epoch": 1.076215629006483, + "grad_norm": 9.0, + "learning_rate": 4.78331703715206e-06, + "loss": 0.9862793, + "memory(GiB)": 302.58, + "step": 192440, + "train_speed(iter/s)": 0.123518 + }, + { + "acc": 0.75284543, + "epoch": 1.0763274784794623, + "grad_norm": 6.84375, + "learning_rate": 4.782393204630228e-06, + "loss": 0.98652925, + "memory(GiB)": 302.58, + "step": 192460, + "train_speed(iter/s)": 0.123525 + }, + { + "acc": 0.75149736, + "epoch": 1.0764393279524416, + "grad_norm": 7.375, + "learning_rate": 4.781469379551239e-06, + "loss": 0.98858299, + "memory(GiB)": 302.58, + "step": 192480, + "train_speed(iter/s)": 0.123531 + }, + { + "acc": 0.73415103, + "epoch": 1.0765511774254208, + "grad_norm": 6.1875, + "learning_rate": 4.78054556194669e-06, + "loss": 1.04194212, + "memory(GiB)": 302.58, + "step": 192500, + "train_speed(iter/s)": 0.123537 + }, + { + "acc": 0.76250429, + "epoch": 1.0766630268984, + "grad_norm": 6.21875, + "learning_rate": 4.779621751848179e-06, + "loss": 0.93671122, + "memory(GiB)": 302.58, + "step": 192520, + "train_speed(iter/s)": 0.123544 + }, + { + "acc": 0.74071522, + "epoch": 1.0767748763713794, + "grad_norm": 7.03125, + "learning_rate": 4.778697949287302e-06, + "loss": 1.02062349, + "memory(GiB)": 302.58, + "step": 192540, + "train_speed(iter/s)": 0.12355 + }, + { + "acc": 0.74594903, + "epoch": 1.0768867258443586, + "grad_norm": 9.25, + "learning_rate": 4.777774154295658e-06, + "loss": 0.99367294, + "memory(GiB)": 302.58, + "step": 192560, + "train_speed(iter/s)": 0.123556 + }, + { + "acc": 0.72371745, + "epoch": 1.076998575317338, + "grad_norm": 7.84375, + "learning_rate": 4.776850366904841e-06, + "loss": 1.10417366, + "memory(GiB)": 302.58, + "step": 192580, + "train_speed(iter/s)": 0.123561 + }, + { + "acc": 0.7536087, + "epoch": 1.0771104247903172, + "grad_norm": 10.4375, + "learning_rate": 4.77592658714645e-06, + "loss": 0.97776346, + "memory(GiB)": 302.58, + "step": 192600, + "train_speed(iter/s)": 0.123567 + }, + { + "acc": 0.74764552, + "epoch": 1.0772222742632964, + "grad_norm": 6.53125, + "learning_rate": 4.77500281505208e-06, + "loss": 0.99595137, + "memory(GiB)": 302.58, + "step": 192620, + "train_speed(iter/s)": 0.123573 + }, + { + "acc": 0.7529315, + "epoch": 1.0773341237362757, + "grad_norm": 4.9375, + "learning_rate": 4.774079050653328e-06, + "loss": 0.96950417, + "memory(GiB)": 302.58, + "step": 192640, + "train_speed(iter/s)": 0.123579 + }, + { + "acc": 0.76491146, + "epoch": 1.077445973209255, + "grad_norm": 8.75, + "learning_rate": 4.773155293981788e-06, + "loss": 0.90313129, + "memory(GiB)": 302.58, + "step": 192660, + "train_speed(iter/s)": 0.123585 + }, + { + "acc": 0.75310192, + "epoch": 1.0775578226822342, + "grad_norm": 6.96875, + "learning_rate": 4.772231545069057e-06, + "loss": 0.95938854, + "memory(GiB)": 302.58, + "step": 192680, + "train_speed(iter/s)": 0.123591 + }, + { + "acc": 0.75913267, + "epoch": 1.0776696721552135, + "grad_norm": 8.375, + "learning_rate": 4.771307803946729e-06, + "loss": 0.95467978, + "memory(GiB)": 302.58, + "step": 192700, + "train_speed(iter/s)": 0.123597 + }, + { + "acc": 0.75192013, + "epoch": 1.0777815216281927, + "grad_norm": 5.875, + "learning_rate": 4.770384070646398e-06, + "loss": 0.9841094, + "memory(GiB)": 302.58, + "step": 192720, + "train_speed(iter/s)": 0.123603 + }, + { + "acc": 0.73640914, + "epoch": 1.077893371101172, + "grad_norm": 6.03125, + "learning_rate": 4.769460345199661e-06, + "loss": 1.04255114, + "memory(GiB)": 302.58, + "step": 192740, + "train_speed(iter/s)": 0.123609 + }, + { + "acc": 0.74123549, + "epoch": 1.0780052205741513, + "grad_norm": 7.625, + "learning_rate": 4.768536627638112e-06, + "loss": 1.02643566, + "memory(GiB)": 302.58, + "step": 192760, + "train_speed(iter/s)": 0.123615 + }, + { + "acc": 0.75277491, + "epoch": 1.0781170700471305, + "grad_norm": 6.25, + "learning_rate": 4.7676129179933435e-06, + "loss": 0.9733532, + "memory(GiB)": 302.58, + "step": 192780, + "train_speed(iter/s)": 0.123621 + }, + { + "acc": 0.74158216, + "epoch": 1.0782289195201098, + "grad_norm": 8.8125, + "learning_rate": 4.76668921629695e-06, + "loss": 1.02280817, + "memory(GiB)": 302.58, + "step": 192800, + "train_speed(iter/s)": 0.123627 + }, + { + "acc": 0.76080627, + "epoch": 1.078340768993089, + "grad_norm": 9.3125, + "learning_rate": 4.765765522580525e-06, + "loss": 0.933218, + "memory(GiB)": 302.58, + "step": 192820, + "train_speed(iter/s)": 0.123633 + }, + { + "acc": 0.75894365, + "epoch": 1.0784526184660683, + "grad_norm": 7.28125, + "learning_rate": 4.764841836875663e-06, + "loss": 0.93131542, + "memory(GiB)": 302.58, + "step": 192840, + "train_speed(iter/s)": 0.123639 + }, + { + "acc": 0.73650651, + "epoch": 1.0785644679390476, + "grad_norm": 8.625, + "learning_rate": 4.7639181592139535e-06, + "loss": 1.04561768, + "memory(GiB)": 302.58, + "step": 192860, + "train_speed(iter/s)": 0.123645 + }, + { + "acc": 0.76110735, + "epoch": 1.0786763174120269, + "grad_norm": 5.5, + "learning_rate": 4.762994489626993e-06, + "loss": 0.94527683, + "memory(GiB)": 302.58, + "step": 192880, + "train_speed(iter/s)": 0.123651 + }, + { + "acc": 0.75368505, + "epoch": 1.0787881668850061, + "grad_norm": 8.4375, + "learning_rate": 4.7620708281463725e-06, + "loss": 0.95373011, + "memory(GiB)": 302.58, + "step": 192900, + "train_speed(iter/s)": 0.123657 + }, + { + "acc": 0.74485612, + "epoch": 1.0789000163579854, + "grad_norm": 8.75, + "learning_rate": 4.761147174803684e-06, + "loss": 1.02586756, + "memory(GiB)": 302.58, + "step": 192920, + "train_speed(iter/s)": 0.123663 + }, + { + "acc": 0.73451705, + "epoch": 1.0790118658309646, + "grad_norm": 9.3125, + "learning_rate": 4.76022352963052e-06, + "loss": 1.06863346, + "memory(GiB)": 302.58, + "step": 192940, + "train_speed(iter/s)": 0.123669 + }, + { + "acc": 0.75413218, + "epoch": 1.079123715303944, + "grad_norm": 7.84375, + "learning_rate": 4.759299892658472e-06, + "loss": 0.97865515, + "memory(GiB)": 302.58, + "step": 192960, + "train_speed(iter/s)": 0.123675 + }, + { + "acc": 0.75532112, + "epoch": 1.0792355647769232, + "grad_norm": 5.46875, + "learning_rate": 4.758376263919129e-06, + "loss": 0.9767972, + "memory(GiB)": 302.58, + "step": 192980, + "train_speed(iter/s)": 0.123681 + }, + { + "acc": 0.75407295, + "epoch": 1.0793474142499024, + "grad_norm": 7.1875, + "learning_rate": 4.757452643444085e-06, + "loss": 0.9745822, + "memory(GiB)": 302.58, + "step": 193000, + "train_speed(iter/s)": 0.123687 + }, + { + "acc": 0.74656029, + "epoch": 1.0794592637228817, + "grad_norm": 7.71875, + "learning_rate": 4.75652903126493e-06, + "loss": 1.02201347, + "memory(GiB)": 302.58, + "step": 193020, + "train_speed(iter/s)": 0.123694 + }, + { + "acc": 0.76059141, + "epoch": 1.079571113195861, + "grad_norm": 7.5625, + "learning_rate": 4.755605427413253e-06, + "loss": 0.95109005, + "memory(GiB)": 302.58, + "step": 193040, + "train_speed(iter/s)": 0.1237 + }, + { + "acc": 0.75964508, + "epoch": 1.0796829626688402, + "grad_norm": 9.0625, + "learning_rate": 4.754681831920646e-06, + "loss": 0.93497953, + "memory(GiB)": 302.58, + "step": 193060, + "train_speed(iter/s)": 0.123706 + }, + { + "acc": 0.73924174, + "epoch": 1.0797948121418195, + "grad_norm": 7.03125, + "learning_rate": 4.7537582448186995e-06, + "loss": 1.04833946, + "memory(GiB)": 302.58, + "step": 193080, + "train_speed(iter/s)": 0.123712 + }, + { + "acc": 0.74412451, + "epoch": 1.0799066616147988, + "grad_norm": 8.4375, + "learning_rate": 4.752834666139001e-06, + "loss": 1.01908417, + "memory(GiB)": 302.58, + "step": 193100, + "train_speed(iter/s)": 0.123718 + }, + { + "acc": 0.75009413, + "epoch": 1.080018511087778, + "grad_norm": 8.625, + "learning_rate": 4.751911095913141e-06, + "loss": 0.99218092, + "memory(GiB)": 302.58, + "step": 193120, + "train_speed(iter/s)": 0.123724 + }, + { + "acc": 0.75072875, + "epoch": 1.0801303605607573, + "grad_norm": 5.125, + "learning_rate": 4.7509875341727064e-06, + "loss": 0.98028898, + "memory(GiB)": 302.58, + "step": 193140, + "train_speed(iter/s)": 0.12373 + }, + { + "acc": 0.77089033, + "epoch": 1.0802422100337365, + "grad_norm": 8.5625, + "learning_rate": 4.7500639809492874e-06, + "loss": 0.9039423, + "memory(GiB)": 302.58, + "step": 193160, + "train_speed(iter/s)": 0.123736 + }, + { + "acc": 0.74364734, + "epoch": 1.0803540595067158, + "grad_norm": 10.5625, + "learning_rate": 4.749140436274475e-06, + "loss": 1.01624737, + "memory(GiB)": 302.58, + "step": 193180, + "train_speed(iter/s)": 0.123742 + }, + { + "acc": 0.74356446, + "epoch": 1.080465908979695, + "grad_norm": 7.625, + "learning_rate": 4.7482169001798535e-06, + "loss": 1.01447115, + "memory(GiB)": 302.58, + "step": 193200, + "train_speed(iter/s)": 0.123748 + }, + { + "acc": 0.75729709, + "epoch": 1.0805777584526743, + "grad_norm": 7.5, + "learning_rate": 4.747293372697014e-06, + "loss": 0.94538279, + "memory(GiB)": 302.58, + "step": 193220, + "train_speed(iter/s)": 0.123754 + }, + { + "acc": 0.75146275, + "epoch": 1.0806896079256536, + "grad_norm": 6.5625, + "learning_rate": 4.746369853857541e-06, + "loss": 0.98097219, + "memory(GiB)": 302.58, + "step": 193240, + "train_speed(iter/s)": 0.12376 + }, + { + "acc": 0.75780206, + "epoch": 1.0808014573986329, + "grad_norm": 5.9375, + "learning_rate": 4.745446343693025e-06, + "loss": 0.93167505, + "memory(GiB)": 302.58, + "step": 193260, + "train_speed(iter/s)": 0.123766 + }, + { + "acc": 0.74753246, + "epoch": 1.0809133068716121, + "grad_norm": 5.59375, + "learning_rate": 4.74452284223505e-06, + "loss": 0.99852676, + "memory(GiB)": 302.58, + "step": 193280, + "train_speed(iter/s)": 0.123772 + }, + { + "acc": 0.74652972, + "epoch": 1.0810251563445914, + "grad_norm": 9.75, + "learning_rate": 4.743599349515204e-06, + "loss": 0.96550426, + "memory(GiB)": 302.58, + "step": 193300, + "train_speed(iter/s)": 0.123778 + }, + { + "acc": 0.76108108, + "epoch": 1.0811370058175707, + "grad_norm": 6.78125, + "learning_rate": 4.742675865565073e-06, + "loss": 0.94644165, + "memory(GiB)": 302.58, + "step": 193320, + "train_speed(iter/s)": 0.123784 + }, + { + "acc": 0.74483352, + "epoch": 1.08124885529055, + "grad_norm": 6.71875, + "learning_rate": 4.741752390416244e-06, + "loss": 1.00897923, + "memory(GiB)": 302.58, + "step": 193340, + "train_speed(iter/s)": 0.12379 + }, + { + "acc": 0.74799442, + "epoch": 1.0813607047635292, + "grad_norm": 6.6875, + "learning_rate": 4.740828924100302e-06, + "loss": 0.99003496, + "memory(GiB)": 302.58, + "step": 193360, + "train_speed(iter/s)": 0.123796 + }, + { + "acc": 0.76529288, + "epoch": 1.0814725542365085, + "grad_norm": 8.4375, + "learning_rate": 4.739905466648833e-06, + "loss": 0.92415934, + "memory(GiB)": 302.58, + "step": 193380, + "train_speed(iter/s)": 0.123802 + }, + { + "acc": 0.75909319, + "epoch": 1.0815844037094877, + "grad_norm": 4.5, + "learning_rate": 4.738982018093421e-06, + "loss": 0.94201412, + "memory(GiB)": 302.58, + "step": 193400, + "train_speed(iter/s)": 0.123808 + }, + { + "acc": 0.74756742, + "epoch": 1.081696253182467, + "grad_norm": 7.625, + "learning_rate": 4.738058578465652e-06, + "loss": 0.99411488, + "memory(GiB)": 302.58, + "step": 193420, + "train_speed(iter/s)": 0.123814 + }, + { + "acc": 0.7468493, + "epoch": 1.0818081026554462, + "grad_norm": 6.34375, + "learning_rate": 4.737135147797111e-06, + "loss": 0.99280624, + "memory(GiB)": 302.58, + "step": 193440, + "train_speed(iter/s)": 0.12382 + }, + { + "acc": 0.78045225, + "epoch": 1.0819199521284255, + "grad_norm": 7.625, + "learning_rate": 4.736211726119379e-06, + "loss": 0.87122326, + "memory(GiB)": 302.58, + "step": 193460, + "train_speed(iter/s)": 0.123826 + }, + { + "acc": 0.74905167, + "epoch": 1.0820318016014048, + "grad_norm": 10.125, + "learning_rate": 4.735288313464044e-06, + "loss": 0.99107389, + "memory(GiB)": 302.58, + "step": 193480, + "train_speed(iter/s)": 0.123831 + }, + { + "acc": 0.75127916, + "epoch": 1.082143651074384, + "grad_norm": 7.875, + "learning_rate": 4.734364909862689e-06, + "loss": 0.98064985, + "memory(GiB)": 302.58, + "step": 193500, + "train_speed(iter/s)": 0.123837 + }, + { + "acc": 0.75524669, + "epoch": 1.0822555005473633, + "grad_norm": 6.03125, + "learning_rate": 4.733441515346895e-06, + "loss": 0.94500799, + "memory(GiB)": 302.58, + "step": 193520, + "train_speed(iter/s)": 0.123844 + }, + { + "acc": 0.76722651, + "epoch": 1.0823673500203426, + "grad_norm": 8.75, + "learning_rate": 4.732518129948247e-06, + "loss": 0.93454208, + "memory(GiB)": 302.58, + "step": 193540, + "train_speed(iter/s)": 0.12385 + }, + { + "acc": 0.7410049, + "epoch": 1.0824791994933218, + "grad_norm": 9.25, + "learning_rate": 4.7315947536983275e-06, + "loss": 1.0181715, + "memory(GiB)": 302.58, + "step": 193560, + "train_speed(iter/s)": 0.123856 + }, + { + "acc": 0.74877949, + "epoch": 1.082591048966301, + "grad_norm": 8.0625, + "learning_rate": 4.730671386628718e-06, + "loss": 0.99718189, + "memory(GiB)": 302.58, + "step": 193580, + "train_speed(iter/s)": 0.123861 + }, + { + "acc": 0.76530833, + "epoch": 1.0827028984392804, + "grad_norm": 7.8125, + "learning_rate": 4.729748028771e-06, + "loss": 0.90774584, + "memory(GiB)": 302.58, + "step": 193600, + "train_speed(iter/s)": 0.123867 + }, + { + "acc": 0.75640545, + "epoch": 1.0828147479122596, + "grad_norm": 4.09375, + "learning_rate": 4.728824680156758e-06, + "loss": 0.95099144, + "memory(GiB)": 302.58, + "step": 193620, + "train_speed(iter/s)": 0.123873 + }, + { + "acc": 0.76022372, + "epoch": 1.0829265973852389, + "grad_norm": 7.71875, + "learning_rate": 4.727901340817571e-06, + "loss": 0.9469451, + "memory(GiB)": 302.58, + "step": 193640, + "train_speed(iter/s)": 0.123879 + }, + { + "acc": 0.73905745, + "epoch": 1.0830384468582182, + "grad_norm": 7.96875, + "learning_rate": 4.726978010785022e-06, + "loss": 1.02119837, + "memory(GiB)": 302.58, + "step": 193660, + "train_speed(iter/s)": 0.123885 + }, + { + "acc": 0.73574963, + "epoch": 1.0831502963311974, + "grad_norm": 7.71875, + "learning_rate": 4.72605469009069e-06, + "loss": 1.03644714, + "memory(GiB)": 302.58, + "step": 193680, + "train_speed(iter/s)": 0.123891 + }, + { + "acc": 0.7634985, + "epoch": 1.0832621458041767, + "grad_norm": 6.0625, + "learning_rate": 4.7251313787661565e-06, + "loss": 0.9506053, + "memory(GiB)": 302.58, + "step": 193700, + "train_speed(iter/s)": 0.123897 + }, + { + "acc": 0.75193563, + "epoch": 1.083373995277156, + "grad_norm": 8.125, + "learning_rate": 4.724208076843001e-06, + "loss": 0.9784091, + "memory(GiB)": 302.58, + "step": 193720, + "train_speed(iter/s)": 0.123903 + }, + { + "acc": 0.76064262, + "epoch": 1.0834858447501352, + "grad_norm": 7.59375, + "learning_rate": 4.723284784352803e-06, + "loss": 0.94740963, + "memory(GiB)": 302.58, + "step": 193740, + "train_speed(iter/s)": 0.123909 + }, + { + "acc": 0.7455245, + "epoch": 1.0835976942231145, + "grad_norm": 7.8125, + "learning_rate": 4.722361501327144e-06, + "loss": 0.98115816, + "memory(GiB)": 302.58, + "step": 193760, + "train_speed(iter/s)": 0.123915 + }, + { + "acc": 0.75052466, + "epoch": 1.0837095436960937, + "grad_norm": 5.625, + "learning_rate": 4.721438227797602e-06, + "loss": 0.97231388, + "memory(GiB)": 302.58, + "step": 193780, + "train_speed(iter/s)": 0.123921 + }, + { + "acc": 0.75371881, + "epoch": 1.083821393169073, + "grad_norm": 6.71875, + "learning_rate": 4.720514963795756e-06, + "loss": 0.94112082, + "memory(GiB)": 302.58, + "step": 193800, + "train_speed(iter/s)": 0.123927 + }, + { + "acc": 0.73738451, + "epoch": 1.0839332426420523, + "grad_norm": 9.25, + "learning_rate": 4.719591709353184e-06, + "loss": 1.039501, + "memory(GiB)": 302.58, + "step": 193820, + "train_speed(iter/s)": 0.123934 + }, + { + "acc": 0.7303525, + "epoch": 1.0840450921150315, + "grad_norm": 5.84375, + "learning_rate": 4.718668464501465e-06, + "loss": 1.07698088, + "memory(GiB)": 302.58, + "step": 193840, + "train_speed(iter/s)": 0.12394 + }, + { + "acc": 0.74666929, + "epoch": 1.0841569415880108, + "grad_norm": 7.875, + "learning_rate": 4.717745229272177e-06, + "loss": 1.00807514, + "memory(GiB)": 302.58, + "step": 193860, + "train_speed(iter/s)": 0.123946 + }, + { + "acc": 0.75695529, + "epoch": 1.08426879106099, + "grad_norm": 8.0, + "learning_rate": 4.716822003696895e-06, + "loss": 0.9397397, + "memory(GiB)": 302.58, + "step": 193880, + "train_speed(iter/s)": 0.123952 + }, + { + "acc": 0.73716478, + "epoch": 1.0843806405339693, + "grad_norm": 7.9375, + "learning_rate": 4.7158987878072e-06, + "loss": 1.01930122, + "memory(GiB)": 302.58, + "step": 193900, + "train_speed(iter/s)": 0.123958 + }, + { + "acc": 0.74936604, + "epoch": 1.0844924900069486, + "grad_norm": 6.84375, + "learning_rate": 4.714975581634668e-06, + "loss": 0.98615704, + "memory(GiB)": 302.58, + "step": 193920, + "train_speed(iter/s)": 0.123964 + }, + { + "acc": 0.75196047, + "epoch": 1.0846043394799278, + "grad_norm": 7.03125, + "learning_rate": 4.714052385210875e-06, + "loss": 0.97665377, + "memory(GiB)": 302.58, + "step": 193940, + "train_speed(iter/s)": 0.12397 + }, + { + "acc": 0.75521774, + "epoch": 1.0847161889529071, + "grad_norm": 4.84375, + "learning_rate": 4.713129198567396e-06, + "loss": 0.96192436, + "memory(GiB)": 302.58, + "step": 193960, + "train_speed(iter/s)": 0.123976 + }, + { + "acc": 0.75073643, + "epoch": 1.0848280384258864, + "grad_norm": 5.25, + "learning_rate": 4.712206021735809e-06, + "loss": 0.99246626, + "memory(GiB)": 302.58, + "step": 193980, + "train_speed(iter/s)": 0.123982 + }, + { + "acc": 0.77142959, + "epoch": 1.0849398878988656, + "grad_norm": 6.375, + "learning_rate": 4.711282854747687e-06, + "loss": 0.88974514, + "memory(GiB)": 302.58, + "step": 194000, + "train_speed(iter/s)": 0.123988 + }, + { + "epoch": 1.0849398878988656, + "eval_acc": 0.7064305390041595, + "eval_loss": 1.0141032934188843, + "eval_runtime": 7501.6737, + "eval_samples_per_second": 10.035, + "eval_steps_per_second": 10.035, + "step": 194000 + }, + { + "acc": 0.74510427, + "epoch": 1.085051737371845, + "grad_norm": 6.1875, + "learning_rate": 4.710359697634608e-06, + "loss": 0.9952075, + "memory(GiB)": 302.58, + "step": 194020, + "train_speed(iter/s)": 0.123393 + }, + { + "acc": 0.74906182, + "epoch": 1.0851635868448242, + "grad_norm": 9.0, + "learning_rate": 4.7094365504281455e-06, + "loss": 0.98104248, + "memory(GiB)": 302.58, + "step": 194040, + "train_speed(iter/s)": 0.123399 + }, + { + "acc": 0.75538239, + "epoch": 1.0852754363178034, + "grad_norm": 6.03125, + "learning_rate": 4.708513413159875e-06, + "loss": 0.96734753, + "memory(GiB)": 302.58, + "step": 194060, + "train_speed(iter/s)": 0.123405 + }, + { + "acc": 0.73779612, + "epoch": 1.0853872857907827, + "grad_norm": 6.15625, + "learning_rate": 4.70759028586137e-06, + "loss": 1.03884201, + "memory(GiB)": 302.58, + "step": 194080, + "train_speed(iter/s)": 0.123411 + }, + { + "acc": 0.74749718, + "epoch": 1.085499135263762, + "grad_norm": 5.96875, + "learning_rate": 4.706667168564204e-06, + "loss": 0.978228, + "memory(GiB)": 302.58, + "step": 194100, + "train_speed(iter/s)": 0.123417 + }, + { + "acc": 0.75314755, + "epoch": 1.0856109847367412, + "grad_norm": 7.46875, + "learning_rate": 4.7057440612999515e-06, + "loss": 0.95529976, + "memory(GiB)": 302.58, + "step": 194120, + "train_speed(iter/s)": 0.123423 + }, + { + "acc": 0.75547853, + "epoch": 1.0857228342097205, + "grad_norm": 4.65625, + "learning_rate": 4.704820964100183e-06, + "loss": 0.97043848, + "memory(GiB)": 302.58, + "step": 194140, + "train_speed(iter/s)": 0.123429 + }, + { + "acc": 0.75832944, + "epoch": 1.0858346836826998, + "grad_norm": 5.71875, + "learning_rate": 4.703897876996475e-06, + "loss": 0.93751268, + "memory(GiB)": 302.58, + "step": 194160, + "train_speed(iter/s)": 0.123435 + }, + { + "acc": 0.74952826, + "epoch": 1.085946533155679, + "grad_norm": 8.375, + "learning_rate": 4.7029748000204e-06, + "loss": 0.98931646, + "memory(GiB)": 302.58, + "step": 194180, + "train_speed(iter/s)": 0.123441 + }, + { + "acc": 0.74622049, + "epoch": 1.0860583826286583, + "grad_norm": 7.03125, + "learning_rate": 4.702051733203527e-06, + "loss": 0.9859911, + "memory(GiB)": 302.58, + "step": 194200, + "train_speed(iter/s)": 0.123447 + }, + { + "acc": 0.73422799, + "epoch": 1.0861702321016375, + "grad_norm": 5.15625, + "learning_rate": 4.70112867657743e-06, + "loss": 1.04652119, + "memory(GiB)": 302.58, + "step": 194220, + "train_speed(iter/s)": 0.123453 + }, + { + "acc": 0.74609013, + "epoch": 1.0862820815746168, + "grad_norm": 6.46875, + "learning_rate": 4.70020563017368e-06, + "loss": 1.00053444, + "memory(GiB)": 302.58, + "step": 194240, + "train_speed(iter/s)": 0.123459 + }, + { + "acc": 0.75372953, + "epoch": 1.086393931047596, + "grad_norm": 7.34375, + "learning_rate": 4.699282594023848e-06, + "loss": 0.96147251, + "memory(GiB)": 302.58, + "step": 194260, + "train_speed(iter/s)": 0.123465 + }, + { + "acc": 0.74194412, + "epoch": 1.0865057805205753, + "grad_norm": 9.0625, + "learning_rate": 4.6983595681595036e-06, + "loss": 1.00693808, + "memory(GiB)": 302.58, + "step": 194280, + "train_speed(iter/s)": 0.123471 + }, + { + "acc": 0.75289712, + "epoch": 1.0866176299935546, + "grad_norm": 6.875, + "learning_rate": 4.69743655261222e-06, + "loss": 0.9631465, + "memory(GiB)": 302.58, + "step": 194300, + "train_speed(iter/s)": 0.123477 + }, + { + "acc": 0.73376503, + "epoch": 1.0867294794665339, + "grad_norm": 10.0625, + "learning_rate": 4.696513547413565e-06, + "loss": 1.03928957, + "memory(GiB)": 302.58, + "step": 194320, + "train_speed(iter/s)": 0.123483 + }, + { + "acc": 0.76226015, + "epoch": 1.0868413289395131, + "grad_norm": 5.59375, + "learning_rate": 4.695590552595109e-06, + "loss": 0.9371769, + "memory(GiB)": 302.58, + "step": 194340, + "train_speed(iter/s)": 0.123489 + }, + { + "acc": 0.7370048, + "epoch": 1.0869531784124924, + "grad_norm": 5.6875, + "learning_rate": 4.694667568188423e-06, + "loss": 1.05695305, + "memory(GiB)": 302.58, + "step": 194360, + "train_speed(iter/s)": 0.123495 + }, + { + "acc": 0.74919791, + "epoch": 1.0870650278854717, + "grad_norm": 5.59375, + "learning_rate": 4.693744594225072e-06, + "loss": 0.98745289, + "memory(GiB)": 302.58, + "step": 194380, + "train_speed(iter/s)": 0.123501 + }, + { + "acc": 0.74660368, + "epoch": 1.087176877358451, + "grad_norm": 7.25, + "learning_rate": 4.6928216307366265e-06, + "loss": 0.99442148, + "memory(GiB)": 302.58, + "step": 194400, + "train_speed(iter/s)": 0.123507 + }, + { + "acc": 0.75424824, + "epoch": 1.0872887268314302, + "grad_norm": 7.3125, + "learning_rate": 4.6918986777546565e-06, + "loss": 0.97640429, + "memory(GiB)": 302.58, + "step": 194420, + "train_speed(iter/s)": 0.123513 + }, + { + "acc": 0.75857997, + "epoch": 1.0874005763044094, + "grad_norm": 6.65625, + "learning_rate": 4.690975735310729e-06, + "loss": 0.9229373, + "memory(GiB)": 302.58, + "step": 194440, + "train_speed(iter/s)": 0.123519 + }, + { + "acc": 0.74923954, + "epoch": 1.0875124257773887, + "grad_norm": 6.4375, + "learning_rate": 4.690052803436411e-06, + "loss": 0.99163666, + "memory(GiB)": 302.58, + "step": 194460, + "train_speed(iter/s)": 0.123525 + }, + { + "acc": 0.76165447, + "epoch": 1.087624275250368, + "grad_norm": 6.5625, + "learning_rate": 4.689129882163269e-06, + "loss": 0.91778612, + "memory(GiB)": 302.58, + "step": 194480, + "train_speed(iter/s)": 0.123531 + }, + { + "acc": 0.73502669, + "epoch": 1.0877361247233472, + "grad_norm": 9.0, + "learning_rate": 4.688206971522873e-06, + "loss": 1.08283014, + "memory(GiB)": 302.58, + "step": 194500, + "train_speed(iter/s)": 0.123537 + }, + { + "acc": 0.74339533, + "epoch": 1.0878479741963265, + "grad_norm": 7.125, + "learning_rate": 4.687284071546786e-06, + "loss": 1.02158051, + "memory(GiB)": 302.58, + "step": 194520, + "train_speed(iter/s)": 0.123543 + }, + { + "acc": 0.76339006, + "epoch": 1.0879598236693058, + "grad_norm": 10.0, + "learning_rate": 4.686361182266575e-06, + "loss": 0.92596388, + "memory(GiB)": 302.58, + "step": 194540, + "train_speed(iter/s)": 0.123549 + }, + { + "acc": 0.76499639, + "epoch": 1.088071673142285, + "grad_norm": 8.25, + "learning_rate": 4.685438303713807e-06, + "loss": 0.92343655, + "memory(GiB)": 302.58, + "step": 194560, + "train_speed(iter/s)": 0.123555 + }, + { + "acc": 0.75204954, + "epoch": 1.0881835226152643, + "grad_norm": 5.84375, + "learning_rate": 4.684515435920046e-06, + "loss": 0.96570425, + "memory(GiB)": 302.58, + "step": 194580, + "train_speed(iter/s)": 0.123561 + }, + { + "acc": 0.74932575, + "epoch": 1.0882953720882436, + "grad_norm": 6.09375, + "learning_rate": 4.683592578916858e-06, + "loss": 1.00711174, + "memory(GiB)": 302.58, + "step": 194600, + "train_speed(iter/s)": 0.123567 + }, + { + "acc": 0.77294664, + "epoch": 1.0884072215612228, + "grad_norm": 8.625, + "learning_rate": 4.682669732735806e-06, + "loss": 0.89286213, + "memory(GiB)": 302.58, + "step": 194620, + "train_speed(iter/s)": 0.123573 + }, + { + "acc": 0.75870442, + "epoch": 1.088519071034202, + "grad_norm": 9.6875, + "learning_rate": 4.681746897408456e-06, + "loss": 0.95093136, + "memory(GiB)": 302.58, + "step": 194640, + "train_speed(iter/s)": 0.12358 + }, + { + "acc": 0.74580445, + "epoch": 1.0886309205071814, + "grad_norm": 9.0625, + "learning_rate": 4.680824072966371e-06, + "loss": 1.01047983, + "memory(GiB)": 302.58, + "step": 194660, + "train_speed(iter/s)": 0.123585 + }, + { + "acc": 0.73443985, + "epoch": 1.0887427699801606, + "grad_norm": 9.25, + "learning_rate": 4.679901259441116e-06, + "loss": 1.05602942, + "memory(GiB)": 302.58, + "step": 194680, + "train_speed(iter/s)": 0.123591 + }, + { + "acc": 0.7632431, + "epoch": 1.0888546194531399, + "grad_norm": 8.3125, + "learning_rate": 4.678978456864251e-06, + "loss": 0.91714716, + "memory(GiB)": 302.58, + "step": 194700, + "train_speed(iter/s)": 0.123597 + }, + { + "acc": 0.75861716, + "epoch": 1.0889664689261191, + "grad_norm": 7.40625, + "learning_rate": 4.678055665267342e-06, + "loss": 0.96492929, + "memory(GiB)": 302.58, + "step": 194720, + "train_speed(iter/s)": 0.123603 + }, + { + "acc": 0.75123906, + "epoch": 1.0890783183990984, + "grad_norm": 10.0625, + "learning_rate": 4.6771328846819495e-06, + "loss": 1.00240088, + "memory(GiB)": 302.58, + "step": 194740, + "train_speed(iter/s)": 0.123609 + }, + { + "acc": 0.75063114, + "epoch": 1.0891901678720777, + "grad_norm": 7.21875, + "learning_rate": 4.676210115139636e-06, + "loss": 0.96049309, + "memory(GiB)": 302.58, + "step": 194760, + "train_speed(iter/s)": 0.123615 + }, + { + "acc": 0.75504813, + "epoch": 1.089302017345057, + "grad_norm": 9.8125, + "learning_rate": 4.675287356671963e-06, + "loss": 0.96339016, + "memory(GiB)": 302.58, + "step": 194780, + "train_speed(iter/s)": 0.123621 + }, + { + "acc": 0.73823504, + "epoch": 1.0894138668180362, + "grad_norm": 10.6875, + "learning_rate": 4.674364609310493e-06, + "loss": 1.04693909, + "memory(GiB)": 302.58, + "step": 194800, + "train_speed(iter/s)": 0.123627 + }, + { + "acc": 0.74542117, + "epoch": 1.0895257162910155, + "grad_norm": 5.34375, + "learning_rate": 4.673441873086786e-06, + "loss": 1.00407639, + "memory(GiB)": 302.58, + "step": 194820, + "train_speed(iter/s)": 0.123633 + }, + { + "acc": 0.75922012, + "epoch": 1.0896375657639947, + "grad_norm": 8.75, + "learning_rate": 4.6725191480324026e-06, + "loss": 0.9616643, + "memory(GiB)": 302.58, + "step": 194840, + "train_speed(iter/s)": 0.123639 + }, + { + "acc": 0.74518104, + "epoch": 1.089749415236974, + "grad_norm": 6.21875, + "learning_rate": 4.671596434178902e-06, + "loss": 0.98518896, + "memory(GiB)": 302.58, + "step": 194860, + "train_speed(iter/s)": 0.123645 + }, + { + "acc": 0.7668458, + "epoch": 1.0898612647099533, + "grad_norm": 9.4375, + "learning_rate": 4.670673731557844e-06, + "loss": 0.89539032, + "memory(GiB)": 302.58, + "step": 194880, + "train_speed(iter/s)": 0.123651 + }, + { + "acc": 0.74699216, + "epoch": 1.0899731141829325, + "grad_norm": 8.375, + "learning_rate": 4.669751040200789e-06, + "loss": 0.98701468, + "memory(GiB)": 302.58, + "step": 194900, + "train_speed(iter/s)": 0.123657 + }, + { + "acc": 0.75801768, + "epoch": 1.0900849636559118, + "grad_norm": 8.4375, + "learning_rate": 4.668828360139297e-06, + "loss": 0.95713387, + "memory(GiB)": 302.58, + "step": 194920, + "train_speed(iter/s)": 0.123662 + }, + { + "acc": 0.7381175, + "epoch": 1.090196813128891, + "grad_norm": 6.59375, + "learning_rate": 4.667905691404924e-06, + "loss": 1.03950243, + "memory(GiB)": 302.58, + "step": 194940, + "train_speed(iter/s)": 0.123668 + }, + { + "acc": 0.76299558, + "epoch": 1.0903086626018703, + "grad_norm": 7.84375, + "learning_rate": 4.66698303402923e-06, + "loss": 0.93883305, + "memory(GiB)": 302.58, + "step": 194960, + "train_speed(iter/s)": 0.123674 + }, + { + "acc": 0.74795861, + "epoch": 1.0904205120748496, + "grad_norm": 5.625, + "learning_rate": 4.666060388043772e-06, + "loss": 0.97992706, + "memory(GiB)": 302.58, + "step": 194980, + "train_speed(iter/s)": 0.123679 + }, + { + "acc": 0.74721837, + "epoch": 1.0905323615478288, + "grad_norm": 10.1875, + "learning_rate": 4.665137753480108e-06, + "loss": 0.98286572, + "memory(GiB)": 302.58, + "step": 195000, + "train_speed(iter/s)": 0.123685 + }, + { + "acc": 0.760253, + "epoch": 1.090644211020808, + "grad_norm": 7.78125, + "learning_rate": 4.664215130369793e-06, + "loss": 0.9319088, + "memory(GiB)": 302.58, + "step": 195020, + "train_speed(iter/s)": 0.123691 + }, + { + "acc": 0.75387301, + "epoch": 1.0907560604937874, + "grad_norm": 6.125, + "learning_rate": 4.663292518744387e-06, + "loss": 0.96556969, + "memory(GiB)": 302.58, + "step": 195040, + "train_speed(iter/s)": 0.123697 + }, + { + "acc": 0.75175581, + "epoch": 1.0908679099667666, + "grad_norm": 9.9375, + "learning_rate": 4.662369918635444e-06, + "loss": 0.97816143, + "memory(GiB)": 302.58, + "step": 195060, + "train_speed(iter/s)": 0.123703 + }, + { + "acc": 0.74145508, + "epoch": 1.090979759439746, + "grad_norm": 8.25, + "learning_rate": 4.66144733007452e-06, + "loss": 1.0221899, + "memory(GiB)": 302.58, + "step": 195080, + "train_speed(iter/s)": 0.123709 + }, + { + "acc": 0.74302778, + "epoch": 1.0910916089127252, + "grad_norm": 8.875, + "learning_rate": 4.660524753093172e-06, + "loss": 1.01474886, + "memory(GiB)": 302.58, + "step": 195100, + "train_speed(iter/s)": 0.123715 + }, + { + "acc": 0.75039754, + "epoch": 1.0912034583857044, + "grad_norm": 9.0, + "learning_rate": 4.6596021877229534e-06, + "loss": 0.99763947, + "memory(GiB)": 302.58, + "step": 195120, + "train_speed(iter/s)": 0.123721 + }, + { + "acc": 0.74071927, + "epoch": 1.0913153078586837, + "grad_norm": 8.625, + "learning_rate": 4.658679633995419e-06, + "loss": 1.02497482, + "memory(GiB)": 302.58, + "step": 195140, + "train_speed(iter/s)": 0.123727 + }, + { + "acc": 0.73612213, + "epoch": 1.091427157331663, + "grad_norm": 5.6875, + "learning_rate": 4.6577570919421226e-06, + "loss": 1.04108467, + "memory(GiB)": 302.58, + "step": 195160, + "train_speed(iter/s)": 0.123733 + }, + { + "acc": 0.75674605, + "epoch": 1.0915390068046422, + "grad_norm": 8.4375, + "learning_rate": 4.65683456159462e-06, + "loss": 0.98934784, + "memory(GiB)": 302.58, + "step": 195180, + "train_speed(iter/s)": 0.123739 + }, + { + "acc": 0.75947871, + "epoch": 1.0916508562776215, + "grad_norm": 8.3125, + "learning_rate": 4.655912042984463e-06, + "loss": 0.95017214, + "memory(GiB)": 302.58, + "step": 195200, + "train_speed(iter/s)": 0.123744 + }, + { + "acc": 0.74757833, + "epoch": 1.0917627057506007, + "grad_norm": 5.9375, + "learning_rate": 4.654989536143206e-06, + "loss": 1.01250086, + "memory(GiB)": 302.58, + "step": 195220, + "train_speed(iter/s)": 0.123751 + }, + { + "acc": 0.74413376, + "epoch": 1.09187455522358, + "grad_norm": 10.875, + "learning_rate": 4.654067041102401e-06, + "loss": 1.00506001, + "memory(GiB)": 302.58, + "step": 195240, + "train_speed(iter/s)": 0.123757 + }, + { + "acc": 0.74155283, + "epoch": 1.0919864046965593, + "grad_norm": 5.90625, + "learning_rate": 4.653144557893599e-06, + "loss": 1.01218119, + "memory(GiB)": 302.58, + "step": 195260, + "train_speed(iter/s)": 0.123763 + }, + { + "acc": 0.74730811, + "epoch": 1.0920982541695385, + "grad_norm": 5.3125, + "learning_rate": 4.652222086548354e-06, + "loss": 1.00428534, + "memory(GiB)": 302.58, + "step": 195280, + "train_speed(iter/s)": 0.123769 + }, + { + "acc": 0.75148797, + "epoch": 1.0922101036425178, + "grad_norm": 8.375, + "learning_rate": 4.651299627098214e-06, + "loss": 0.97532549, + "memory(GiB)": 302.58, + "step": 195300, + "train_speed(iter/s)": 0.123775 + }, + { + "acc": 0.73270826, + "epoch": 1.092321953115497, + "grad_norm": 9.625, + "learning_rate": 4.650377179574735e-06, + "loss": 1.05341387, + "memory(GiB)": 302.58, + "step": 195320, + "train_speed(iter/s)": 0.123781 + }, + { + "acc": 0.75028281, + "epoch": 1.0924338025884763, + "grad_norm": 8.125, + "learning_rate": 4.649454744009464e-06, + "loss": 1.00095282, + "memory(GiB)": 302.58, + "step": 195340, + "train_speed(iter/s)": 0.123787 + }, + { + "acc": 0.75683036, + "epoch": 1.0925456520614556, + "grad_norm": 7.21875, + "learning_rate": 4.648532320433954e-06, + "loss": 0.96328373, + "memory(GiB)": 302.58, + "step": 195360, + "train_speed(iter/s)": 0.123793 + }, + { + "acc": 0.74212103, + "epoch": 1.0926575015344349, + "grad_norm": 7.375, + "learning_rate": 4.647609908879751e-06, + "loss": 1.0319066, + "memory(GiB)": 302.58, + "step": 195380, + "train_speed(iter/s)": 0.123799 + }, + { + "acc": 0.74738908, + "epoch": 1.0927693510074141, + "grad_norm": 8.5625, + "learning_rate": 4.646687509378407e-06, + "loss": 0.98928833, + "memory(GiB)": 302.58, + "step": 195400, + "train_speed(iter/s)": 0.123805 + }, + { + "acc": 0.76481915, + "epoch": 1.0928812004803934, + "grad_norm": 9.25, + "learning_rate": 4.645765121961472e-06, + "loss": 0.90265856, + "memory(GiB)": 302.58, + "step": 195420, + "train_speed(iter/s)": 0.123811 + }, + { + "acc": 0.76344795, + "epoch": 1.0929930499533727, + "grad_norm": 5.875, + "learning_rate": 4.644842746660491e-06, + "loss": 0.93392115, + "memory(GiB)": 302.58, + "step": 195440, + "train_speed(iter/s)": 0.123817 + }, + { + "acc": 0.74555497, + "epoch": 1.093104899426352, + "grad_norm": 5.125, + "learning_rate": 4.643920383507015e-06, + "loss": 0.99107428, + "memory(GiB)": 302.58, + "step": 195460, + "train_speed(iter/s)": 0.123823 + }, + { + "acc": 0.75407772, + "epoch": 1.0932167488993312, + "grad_norm": 4.53125, + "learning_rate": 4.642998032532591e-06, + "loss": 0.98897915, + "memory(GiB)": 302.58, + "step": 195480, + "train_speed(iter/s)": 0.123829 + }, + { + "acc": 0.74057035, + "epoch": 1.0933285983723104, + "grad_norm": 7.5625, + "learning_rate": 4.642075693768767e-06, + "loss": 1.02089367, + "memory(GiB)": 302.58, + "step": 195500, + "train_speed(iter/s)": 0.123835 + }, + { + "acc": 0.75503554, + "epoch": 1.0934404478452897, + "grad_norm": 10.125, + "learning_rate": 4.6411533672470885e-06, + "loss": 0.97197123, + "memory(GiB)": 302.58, + "step": 195520, + "train_speed(iter/s)": 0.123841 + }, + { + "acc": 0.75060716, + "epoch": 1.093552297318269, + "grad_norm": 8.1875, + "learning_rate": 4.640231052999103e-06, + "loss": 0.9608552, + "memory(GiB)": 302.58, + "step": 195540, + "train_speed(iter/s)": 0.123847 + }, + { + "acc": 0.76357498, + "epoch": 1.0936641467912482, + "grad_norm": 10.9375, + "learning_rate": 4.639308751056356e-06, + "loss": 0.92351036, + "memory(GiB)": 302.58, + "step": 195560, + "train_speed(iter/s)": 0.123853 + }, + { + "acc": 0.73497391, + "epoch": 1.0937759962642275, + "grad_norm": 5.3125, + "learning_rate": 4.638386461450393e-06, + "loss": 1.04891186, + "memory(GiB)": 302.58, + "step": 195580, + "train_speed(iter/s)": 0.123859 + }, + { + "acc": 0.75418749, + "epoch": 1.0938878457372068, + "grad_norm": 8.625, + "learning_rate": 4.637464184212761e-06, + "loss": 0.97803268, + "memory(GiB)": 302.58, + "step": 195600, + "train_speed(iter/s)": 0.123864 + }, + { + "acc": 0.74728341, + "epoch": 1.093999695210186, + "grad_norm": 7.4375, + "learning_rate": 4.636541919375002e-06, + "loss": 0.99596987, + "memory(GiB)": 302.58, + "step": 195620, + "train_speed(iter/s)": 0.12387 + }, + { + "acc": 0.75938616, + "epoch": 1.0941115446831653, + "grad_norm": 8.1875, + "learning_rate": 4.635619666968663e-06, + "loss": 0.93487644, + "memory(GiB)": 302.58, + "step": 195640, + "train_speed(iter/s)": 0.123876 + }, + { + "acc": 0.74104395, + "epoch": 1.0942233941561446, + "grad_norm": 7.21875, + "learning_rate": 4.634697427025284e-06, + "loss": 1.01510944, + "memory(GiB)": 302.58, + "step": 195660, + "train_speed(iter/s)": 0.123882 + }, + { + "acc": 0.73839879, + "epoch": 1.0943352436291238, + "grad_norm": 5.6875, + "learning_rate": 4.633775199576414e-06, + "loss": 1.02346525, + "memory(GiB)": 302.58, + "step": 195680, + "train_speed(iter/s)": 0.123888 + }, + { + "acc": 0.7375545, + "epoch": 1.094447093102103, + "grad_norm": 7.4375, + "learning_rate": 4.632852984653593e-06, + "loss": 1.03397264, + "memory(GiB)": 302.58, + "step": 195700, + "train_speed(iter/s)": 0.123894 + }, + { + "acc": 0.77129493, + "epoch": 1.0945589425750826, + "grad_norm": 7.25, + "learning_rate": 4.631930782288365e-06, + "loss": 0.89271307, + "memory(GiB)": 302.58, + "step": 195720, + "train_speed(iter/s)": 0.1239 + }, + { + "acc": 0.75676928, + "epoch": 1.0946707920480616, + "grad_norm": 7.1875, + "learning_rate": 4.63100859251227e-06, + "loss": 0.93729305, + "memory(GiB)": 302.58, + "step": 195740, + "train_speed(iter/s)": 0.123907 + }, + { + "acc": 0.76030998, + "epoch": 1.094782641521041, + "grad_norm": 7.8125, + "learning_rate": 4.630086415356851e-06, + "loss": 0.94690428, + "memory(GiB)": 302.58, + "step": 195760, + "train_speed(iter/s)": 0.123913 + }, + { + "acc": 0.75377302, + "epoch": 1.0948944909940201, + "grad_norm": 7.34375, + "learning_rate": 4.629164250853651e-06, + "loss": 0.97363625, + "memory(GiB)": 302.58, + "step": 195780, + "train_speed(iter/s)": 0.123918 + }, + { + "acc": 0.74287548, + "epoch": 1.0950063404669996, + "grad_norm": 7.15625, + "learning_rate": 4.6282420990342095e-06, + "loss": 1.0286375, + "memory(GiB)": 302.58, + "step": 195800, + "train_speed(iter/s)": 0.123924 + }, + { + "acc": 0.75987167, + "epoch": 1.0951181899399787, + "grad_norm": 7.09375, + "learning_rate": 4.627319959930066e-06, + "loss": 0.94608107, + "memory(GiB)": 302.58, + "step": 195820, + "train_speed(iter/s)": 0.123931 + }, + { + "acc": 0.75868082, + "epoch": 1.0952300394129582, + "grad_norm": 7.75, + "learning_rate": 4.626397833572762e-06, + "loss": 0.94566097, + "memory(GiB)": 302.58, + "step": 195840, + "train_speed(iter/s)": 0.123936 + }, + { + "acc": 0.73953633, + "epoch": 1.0953418888859372, + "grad_norm": 6.125, + "learning_rate": 4.625475719993837e-06, + "loss": 1.04266815, + "memory(GiB)": 302.58, + "step": 195860, + "train_speed(iter/s)": 0.123942 + }, + { + "acc": 0.7506072, + "epoch": 1.0954537383589167, + "grad_norm": 4.03125, + "learning_rate": 4.624553619224831e-06, + "loss": 1.00151854, + "memory(GiB)": 302.58, + "step": 195880, + "train_speed(iter/s)": 0.123948 + }, + { + "acc": 0.76506696, + "epoch": 1.0955655878318957, + "grad_norm": 9.375, + "learning_rate": 4.623631531297279e-06, + "loss": 0.90467024, + "memory(GiB)": 302.58, + "step": 195900, + "train_speed(iter/s)": 0.123954 + }, + { + "acc": 0.75097585, + "epoch": 1.0956774373048752, + "grad_norm": 8.1875, + "learning_rate": 4.6227094562427234e-06, + "loss": 0.97319727, + "memory(GiB)": 302.58, + "step": 195920, + "train_speed(iter/s)": 0.12396 + }, + { + "acc": 0.7442471, + "epoch": 1.0957892867778543, + "grad_norm": 5.40625, + "learning_rate": 4.621787394092701e-06, + "loss": 1.02081194, + "memory(GiB)": 302.58, + "step": 195940, + "train_speed(iter/s)": 0.123966 + }, + { + "acc": 0.74361167, + "epoch": 1.0959011362508337, + "grad_norm": 5.78125, + "learning_rate": 4.620865344878749e-06, + "loss": 1.00450096, + "memory(GiB)": 302.58, + "step": 195960, + "train_speed(iter/s)": 0.123972 + }, + { + "acc": 0.74503908, + "epoch": 1.0960129857238128, + "grad_norm": 9.375, + "learning_rate": 4.6199433086324045e-06, + "loss": 1.0178421, + "memory(GiB)": 302.58, + "step": 195980, + "train_speed(iter/s)": 0.123978 + }, + { + "acc": 0.73138032, + "epoch": 1.0961248351967923, + "grad_norm": 6.1875, + "learning_rate": 4.619021285385204e-06, + "loss": 1.08556023, + "memory(GiB)": 302.58, + "step": 196000, + "train_speed(iter/s)": 0.123984 + }, + { + "epoch": 1.0961248351967923, + "eval_acc": 0.7064417293746437, + "eval_loss": 1.0137450695037842, + "eval_runtime": 7505.2084, + "eval_samples_per_second": 10.031, + "eval_steps_per_second": 10.031, + "step": 196000 + }, + { + "acc": 0.7506763, + "epoch": 1.0962366846697713, + "grad_norm": 7.5, + "learning_rate": 4.618099275168684e-06, + "loss": 0.98681726, + "memory(GiB)": 302.58, + "step": 196020, + "train_speed(iter/s)": 0.123395 + }, + { + "acc": 0.74040823, + "epoch": 1.0963485341427508, + "grad_norm": 4.625, + "learning_rate": 4.617177278014378e-06, + "loss": 1.01493168, + "memory(GiB)": 302.58, + "step": 196040, + "train_speed(iter/s)": 0.123401 + }, + { + "acc": 0.7450314, + "epoch": 1.0964603836157298, + "grad_norm": 9.0, + "learning_rate": 4.616255293953824e-06, + "loss": 1.00815859, + "memory(GiB)": 302.58, + "step": 196060, + "train_speed(iter/s)": 0.123406 + }, + { + "acc": 0.73622565, + "epoch": 1.0965722330887093, + "grad_norm": 7.375, + "learning_rate": 4.615333323018556e-06, + "loss": 1.03927317, + "memory(GiB)": 302.58, + "step": 196080, + "train_speed(iter/s)": 0.123413 + }, + { + "acc": 0.75058079, + "epoch": 1.0966840825616884, + "grad_norm": 8.25, + "learning_rate": 4.614411365240108e-06, + "loss": 0.97255507, + "memory(GiB)": 302.58, + "step": 196100, + "train_speed(iter/s)": 0.123419 + }, + { + "acc": 0.76238036, + "epoch": 1.0967959320346679, + "grad_norm": 9.4375, + "learning_rate": 4.613489420650014e-06, + "loss": 0.93706264, + "memory(GiB)": 302.58, + "step": 196120, + "train_speed(iter/s)": 0.123425 + }, + { + "acc": 0.76283193, + "epoch": 1.096907781507647, + "grad_norm": 9.6875, + "learning_rate": 4.612567489279808e-06, + "loss": 0.92783623, + "memory(GiB)": 302.58, + "step": 196140, + "train_speed(iter/s)": 0.123431 + }, + { + "acc": 0.75401649, + "epoch": 1.0970196309806264, + "grad_norm": 7.75, + "learning_rate": 4.611645571161021e-06, + "loss": 0.99129572, + "memory(GiB)": 302.58, + "step": 196160, + "train_speed(iter/s)": 0.123437 + }, + { + "acc": 0.74951086, + "epoch": 1.0971314804536054, + "grad_norm": 6.15625, + "learning_rate": 4.610723666325187e-06, + "loss": 0.97599745, + "memory(GiB)": 302.58, + "step": 196180, + "train_speed(iter/s)": 0.123443 + }, + { + "acc": 0.75059857, + "epoch": 1.097243329926585, + "grad_norm": 7.21875, + "learning_rate": 4.609801774803839e-06, + "loss": 0.98927422, + "memory(GiB)": 302.58, + "step": 196200, + "train_speed(iter/s)": 0.123449 + }, + { + "acc": 0.7662128, + "epoch": 1.097355179399564, + "grad_norm": 9.625, + "learning_rate": 4.608879896628508e-06, + "loss": 0.90659828, + "memory(GiB)": 302.58, + "step": 196220, + "train_speed(iter/s)": 0.123455 + }, + { + "acc": 0.74622569, + "epoch": 1.0974670288725434, + "grad_norm": 7.6875, + "learning_rate": 4.607958031830723e-06, + "loss": 1.00793324, + "memory(GiB)": 302.58, + "step": 196240, + "train_speed(iter/s)": 0.123461 + }, + { + "acc": 0.76460514, + "epoch": 1.0975788783455225, + "grad_norm": 7.5, + "learning_rate": 4.607036180442018e-06, + "loss": 0.90604401, + "memory(GiB)": 302.58, + "step": 196260, + "train_speed(iter/s)": 0.123467 + }, + { + "acc": 0.75690141, + "epoch": 1.097690727818502, + "grad_norm": 7.75, + "learning_rate": 4.606114342493921e-06, + "loss": 0.96583338, + "memory(GiB)": 302.58, + "step": 196280, + "train_speed(iter/s)": 0.123472 + }, + { + "acc": 0.74714942, + "epoch": 1.097802577291481, + "grad_norm": 6.59375, + "learning_rate": 4.605192518017963e-06, + "loss": 0.99111919, + "memory(GiB)": 302.58, + "step": 196300, + "train_speed(iter/s)": 0.123478 + }, + { + "acc": 0.74793372, + "epoch": 1.0979144267644605, + "grad_norm": 7.34375, + "learning_rate": 4.604270707045671e-06, + "loss": 0.98842821, + "memory(GiB)": 302.58, + "step": 196320, + "train_speed(iter/s)": 0.123484 + }, + { + "acc": 0.74539776, + "epoch": 1.0980262762374395, + "grad_norm": 5.25, + "learning_rate": 4.603348909608577e-06, + "loss": 1.01533918, + "memory(GiB)": 302.58, + "step": 196340, + "train_speed(iter/s)": 0.12349 + }, + { + "acc": 0.73946748, + "epoch": 1.098138125710419, + "grad_norm": 5.9375, + "learning_rate": 4.602427125738209e-06, + "loss": 1.01562901, + "memory(GiB)": 302.58, + "step": 196360, + "train_speed(iter/s)": 0.123496 + }, + { + "acc": 0.74445076, + "epoch": 1.0982499751833983, + "grad_norm": 6.0, + "learning_rate": 4.6015053554660925e-06, + "loss": 0.98986149, + "memory(GiB)": 302.58, + "step": 196380, + "train_speed(iter/s)": 0.123502 + }, + { + "acc": 0.74477654, + "epoch": 1.0983618246563776, + "grad_norm": 7.84375, + "learning_rate": 4.600583598823757e-06, + "loss": 1.00990858, + "memory(GiB)": 302.58, + "step": 196400, + "train_speed(iter/s)": 0.123508 + }, + { + "acc": 0.72862301, + "epoch": 1.0984736741293568, + "grad_norm": 6.6875, + "learning_rate": 4.59966185584273e-06, + "loss": 1.06092691, + "memory(GiB)": 302.58, + "step": 196420, + "train_speed(iter/s)": 0.123514 + }, + { + "acc": 0.74972868, + "epoch": 1.098585523602336, + "grad_norm": 10.4375, + "learning_rate": 4.598740126554534e-06, + "loss": 1.01611128, + "memory(GiB)": 302.58, + "step": 196440, + "train_speed(iter/s)": 0.12352 + }, + { + "acc": 0.76696849, + "epoch": 1.0986973730753153, + "grad_norm": 7.40625, + "learning_rate": 4.5978184109907e-06, + "loss": 0.90342379, + "memory(GiB)": 302.58, + "step": 196460, + "train_speed(iter/s)": 0.123526 + }, + { + "acc": 0.75188432, + "epoch": 1.0988092225482946, + "grad_norm": 7.375, + "learning_rate": 4.596896709182751e-06, + "loss": 0.97043047, + "memory(GiB)": 302.58, + "step": 196480, + "train_speed(iter/s)": 0.123532 + }, + { + "acc": 0.75157614, + "epoch": 1.0989210720212739, + "grad_norm": 5.90625, + "learning_rate": 4.595975021162214e-06, + "loss": 0.97296019, + "memory(GiB)": 302.58, + "step": 196500, + "train_speed(iter/s)": 0.123538 + }, + { + "acc": 0.7404808, + "epoch": 1.0990329214942531, + "grad_norm": 4.75, + "learning_rate": 4.595053346960612e-06, + "loss": 1.02944632, + "memory(GiB)": 302.58, + "step": 196520, + "train_speed(iter/s)": 0.123544 + }, + { + "acc": 0.74135146, + "epoch": 1.0991447709672324, + "grad_norm": 8.5625, + "learning_rate": 4.59413168660947e-06, + "loss": 1.02119904, + "memory(GiB)": 302.58, + "step": 196540, + "train_speed(iter/s)": 0.12355 + }, + { + "acc": 0.74090185, + "epoch": 1.0992566204402117, + "grad_norm": 7.90625, + "learning_rate": 4.59321004014031e-06, + "loss": 1.01108284, + "memory(GiB)": 302.58, + "step": 196560, + "train_speed(iter/s)": 0.123556 + }, + { + "acc": 0.74937458, + "epoch": 1.099368469913191, + "grad_norm": 6.5625, + "learning_rate": 4.592288407584656e-06, + "loss": 1.00145111, + "memory(GiB)": 302.58, + "step": 196580, + "train_speed(iter/s)": 0.123561 + }, + { + "acc": 0.75814748, + "epoch": 1.0994803193861702, + "grad_norm": 8.125, + "learning_rate": 4.591366788974032e-06, + "loss": 0.93647308, + "memory(GiB)": 302.58, + "step": 196600, + "train_speed(iter/s)": 0.123567 + }, + { + "acc": 0.72991037, + "epoch": 1.0995921688591495, + "grad_norm": 7.15625, + "learning_rate": 4.59044518433996e-06, + "loss": 1.06214676, + "memory(GiB)": 302.58, + "step": 196620, + "train_speed(iter/s)": 0.123573 + }, + { + "acc": 0.74761252, + "epoch": 1.0997040183321287, + "grad_norm": 6.40625, + "learning_rate": 4.589523593713962e-06, + "loss": 1.00963316, + "memory(GiB)": 302.58, + "step": 196640, + "train_speed(iter/s)": 0.123579 + }, + { + "acc": 0.75545917, + "epoch": 1.099815867805108, + "grad_norm": 9.25, + "learning_rate": 4.588602017127558e-06, + "loss": 0.95472441, + "memory(GiB)": 302.58, + "step": 196660, + "train_speed(iter/s)": 0.123585 + }, + { + "acc": 0.75158119, + "epoch": 1.0999277172780872, + "grad_norm": 6.90625, + "learning_rate": 4.587680454612269e-06, + "loss": 0.97094336, + "memory(GiB)": 302.58, + "step": 196680, + "train_speed(iter/s)": 0.123591 + }, + { + "acc": 0.7565351, + "epoch": 1.1000395667510665, + "grad_norm": 4.40625, + "learning_rate": 4.5867589061996165e-06, + "loss": 0.9595829, + "memory(GiB)": 302.58, + "step": 196700, + "train_speed(iter/s)": 0.123597 + }, + { + "acc": 0.76483159, + "epoch": 1.1001514162240458, + "grad_norm": 6.625, + "learning_rate": 4.585837371921119e-06, + "loss": 0.90805912, + "memory(GiB)": 302.58, + "step": 196720, + "train_speed(iter/s)": 0.123603 + }, + { + "acc": 0.74217167, + "epoch": 1.100263265697025, + "grad_norm": 7.75, + "learning_rate": 4.584915851808297e-06, + "loss": 1.03259373, + "memory(GiB)": 302.58, + "step": 196740, + "train_speed(iter/s)": 0.123609 + }, + { + "acc": 0.74063854, + "epoch": 1.1003751151700043, + "grad_norm": 7.25, + "learning_rate": 4.583994345892669e-06, + "loss": 1.02848864, + "memory(GiB)": 302.58, + "step": 196760, + "train_speed(iter/s)": 0.123615 + }, + { + "acc": 0.75799356, + "epoch": 1.1004869646429836, + "grad_norm": 8.0625, + "learning_rate": 4.583072854205755e-06, + "loss": 0.94109449, + "memory(GiB)": 302.58, + "step": 196780, + "train_speed(iter/s)": 0.123621 + }, + { + "acc": 0.75557027, + "epoch": 1.1005988141159628, + "grad_norm": 10.0, + "learning_rate": 4.58215137677907e-06, + "loss": 0.9595768, + "memory(GiB)": 302.58, + "step": 196800, + "train_speed(iter/s)": 0.123627 + }, + { + "acc": 0.73402948, + "epoch": 1.100710663588942, + "grad_norm": 7.15625, + "learning_rate": 4.581229913644133e-06, + "loss": 1.05372772, + "memory(GiB)": 302.58, + "step": 196820, + "train_speed(iter/s)": 0.123633 + }, + { + "acc": 0.72616072, + "epoch": 1.1008225130619214, + "grad_norm": 8.625, + "learning_rate": 4.580308464832462e-06, + "loss": 1.09879913, + "memory(GiB)": 302.58, + "step": 196840, + "train_speed(iter/s)": 0.123639 + }, + { + "acc": 0.75420737, + "epoch": 1.1009343625349006, + "grad_norm": 8.3125, + "learning_rate": 4.57938703037557e-06, + "loss": 0.96636515, + "memory(GiB)": 302.58, + "step": 196860, + "train_speed(iter/s)": 0.123645 + }, + { + "acc": 0.7634316, + "epoch": 1.1010462120078799, + "grad_norm": 6.40625, + "learning_rate": 4.578465610304977e-06, + "loss": 0.9217041, + "memory(GiB)": 302.58, + "step": 196880, + "train_speed(iter/s)": 0.123651 + }, + { + "acc": 0.76390634, + "epoch": 1.1011580614808592, + "grad_norm": 7.6875, + "learning_rate": 4.5775442046521955e-06, + "loss": 0.92752266, + "memory(GiB)": 302.58, + "step": 196900, + "train_speed(iter/s)": 0.123657 + }, + { + "acc": 0.75434351, + "epoch": 1.1012699109538384, + "grad_norm": 6.53125, + "learning_rate": 4.57662281344874e-06, + "loss": 0.96870356, + "memory(GiB)": 302.58, + "step": 196920, + "train_speed(iter/s)": 0.123663 + }, + { + "acc": 0.76290226, + "epoch": 1.1013817604268177, + "grad_norm": 6.34375, + "learning_rate": 4.57570143672613e-06, + "loss": 0.92089634, + "memory(GiB)": 302.58, + "step": 196940, + "train_speed(iter/s)": 0.123669 + }, + { + "acc": 0.74836617, + "epoch": 1.101493609899797, + "grad_norm": 7.78125, + "learning_rate": 4.574780074515875e-06, + "loss": 0.99593077, + "memory(GiB)": 302.58, + "step": 196960, + "train_speed(iter/s)": 0.123675 + }, + { + "acc": 0.7398654, + "epoch": 1.1016054593727762, + "grad_norm": 6.90625, + "learning_rate": 4.573858726849491e-06, + "loss": 1.02717915, + "memory(GiB)": 302.58, + "step": 196980, + "train_speed(iter/s)": 0.123681 + }, + { + "acc": 0.75837178, + "epoch": 1.1017173088457555, + "grad_norm": 7.125, + "learning_rate": 4.5729373937584895e-06, + "loss": 0.9539835, + "memory(GiB)": 302.58, + "step": 197000, + "train_speed(iter/s)": 0.123687 + }, + { + "acc": 0.75503964, + "epoch": 1.1018291583187347, + "grad_norm": 9.625, + "learning_rate": 4.572016075274383e-06, + "loss": 0.93469486, + "memory(GiB)": 302.58, + "step": 197020, + "train_speed(iter/s)": 0.123692 + }, + { + "acc": 0.7522552, + "epoch": 1.101941007791714, + "grad_norm": 7.90625, + "learning_rate": 4.571094771428684e-06, + "loss": 0.95756445, + "memory(GiB)": 302.58, + "step": 197040, + "train_speed(iter/s)": 0.123698 + }, + { + "acc": 0.75976119, + "epoch": 1.1020528572646933, + "grad_norm": 8.0625, + "learning_rate": 4.570173482252903e-06, + "loss": 0.94209843, + "memory(GiB)": 302.58, + "step": 197060, + "train_speed(iter/s)": 0.123704 + }, + { + "acc": 0.75351062, + "epoch": 1.1021647067376725, + "grad_norm": 6.625, + "learning_rate": 4.5692522077785524e-06, + "loss": 0.96193466, + "memory(GiB)": 302.58, + "step": 197080, + "train_speed(iter/s)": 0.123711 + }, + { + "acc": 0.74885364, + "epoch": 1.1022765562106518, + "grad_norm": 6.125, + "learning_rate": 4.568330948037143e-06, + "loss": 0.99111471, + "memory(GiB)": 302.58, + "step": 197100, + "train_speed(iter/s)": 0.123717 + }, + { + "acc": 0.75306926, + "epoch": 1.102388405683631, + "grad_norm": 6.625, + "learning_rate": 4.567409703060183e-06, + "loss": 0.98080149, + "memory(GiB)": 302.58, + "step": 197120, + "train_speed(iter/s)": 0.123722 + }, + { + "acc": 0.76055899, + "epoch": 1.1025002551566103, + "grad_norm": 7.75, + "learning_rate": 4.566488472879184e-06, + "loss": 0.95451899, + "memory(GiB)": 302.58, + "step": 197140, + "train_speed(iter/s)": 0.123728 + }, + { + "acc": 0.74116774, + "epoch": 1.1026121046295896, + "grad_norm": 7.6875, + "learning_rate": 4.565567257525653e-06, + "loss": 1.02555084, + "memory(GiB)": 302.58, + "step": 197160, + "train_speed(iter/s)": 0.123734 + }, + { + "acc": 0.73591719, + "epoch": 1.1027239541025688, + "grad_norm": 7.0625, + "learning_rate": 4.564646057031098e-06, + "loss": 1.04627523, + "memory(GiB)": 302.58, + "step": 197180, + "train_speed(iter/s)": 0.12374 + }, + { + "acc": 0.75602355, + "epoch": 1.1028358035755481, + "grad_norm": 8.6875, + "learning_rate": 4.563724871427029e-06, + "loss": 0.94735918, + "memory(GiB)": 302.58, + "step": 197200, + "train_speed(iter/s)": 0.123746 + }, + { + "acc": 0.75261583, + "epoch": 1.1029476530485274, + "grad_norm": 5.8125, + "learning_rate": 4.5628037007449545e-06, + "loss": 0.98660431, + "memory(GiB)": 302.58, + "step": 197220, + "train_speed(iter/s)": 0.123752 + }, + { + "acc": 0.7472012, + "epoch": 1.1030595025215066, + "grad_norm": 8.125, + "learning_rate": 4.561882545016378e-06, + "loss": 0.99255972, + "memory(GiB)": 302.58, + "step": 197240, + "train_speed(iter/s)": 0.123758 + }, + { + "acc": 0.74628577, + "epoch": 1.103171351994486, + "grad_norm": 6.375, + "learning_rate": 4.560961404272809e-06, + "loss": 1.00410357, + "memory(GiB)": 302.58, + "step": 197260, + "train_speed(iter/s)": 0.123764 + }, + { + "acc": 0.75997124, + "epoch": 1.1032832014674652, + "grad_norm": 8.1875, + "learning_rate": 4.560040278545751e-06, + "loss": 0.93530874, + "memory(GiB)": 302.58, + "step": 197280, + "train_speed(iter/s)": 0.12377 + }, + { + "acc": 0.75134182, + "epoch": 1.1033950509404444, + "grad_norm": 12.6875, + "learning_rate": 4.559119167866712e-06, + "loss": 0.99673338, + "memory(GiB)": 302.58, + "step": 197300, + "train_speed(iter/s)": 0.123776 + }, + { + "acc": 0.73836799, + "epoch": 1.1035069004134237, + "grad_norm": 7.46875, + "learning_rate": 4.558198072267193e-06, + "loss": 1.03911457, + "memory(GiB)": 302.58, + "step": 197320, + "train_speed(iter/s)": 0.123782 + }, + { + "acc": 0.75507803, + "epoch": 1.103618749886403, + "grad_norm": 7.15625, + "learning_rate": 4.557276991778703e-06, + "loss": 0.9650075, + "memory(GiB)": 302.58, + "step": 197340, + "train_speed(iter/s)": 0.123788 + }, + { + "acc": 0.739469, + "epoch": 1.1037305993593822, + "grad_norm": 4.3125, + "learning_rate": 4.556355926432743e-06, + "loss": 1.01308498, + "memory(GiB)": 302.58, + "step": 197360, + "train_speed(iter/s)": 0.123793 + }, + { + "acc": 0.76826339, + "epoch": 1.1038424488323615, + "grad_norm": 7.75, + "learning_rate": 4.555434876260817e-06, + "loss": 0.89954109, + "memory(GiB)": 302.58, + "step": 197380, + "train_speed(iter/s)": 0.123799 + }, + { + "acc": 0.75178561, + "epoch": 1.1039542983053408, + "grad_norm": 7.125, + "learning_rate": 4.5545138412944285e-06, + "loss": 0.96051683, + "memory(GiB)": 302.58, + "step": 197400, + "train_speed(iter/s)": 0.123805 + }, + { + "acc": 0.74521236, + "epoch": 1.10406614777832, + "grad_norm": 9.125, + "learning_rate": 4.5535928215650786e-06, + "loss": 0.99029484, + "memory(GiB)": 302.58, + "step": 197420, + "train_speed(iter/s)": 0.123811 + }, + { + "acc": 0.76535935, + "epoch": 1.1041779972512993, + "grad_norm": 7.34375, + "learning_rate": 4.55267181710427e-06, + "loss": 0.92440577, + "memory(GiB)": 302.58, + "step": 197440, + "train_speed(iter/s)": 0.123817 + }, + { + "acc": 0.75196686, + "epoch": 1.1042898467242785, + "grad_norm": 8.5, + "learning_rate": 4.551750827943503e-06, + "loss": 0.97971401, + "memory(GiB)": 302.58, + "step": 197460, + "train_speed(iter/s)": 0.123823 + }, + { + "acc": 0.76439633, + "epoch": 1.1044016961972578, + "grad_norm": 7.6875, + "learning_rate": 4.550829854114279e-06, + "loss": 0.916959, + "memory(GiB)": 302.58, + "step": 197480, + "train_speed(iter/s)": 0.123828 + }, + { + "acc": 0.7595561, + "epoch": 1.104513545670237, + "grad_norm": 7.40625, + "learning_rate": 4.5499088956481e-06, + "loss": 0.95046625, + "memory(GiB)": 302.58, + "step": 197500, + "train_speed(iter/s)": 0.123834 + }, + { + "acc": 0.76421962, + "epoch": 1.1046253951432163, + "grad_norm": 6.1875, + "learning_rate": 4.548987952576462e-06, + "loss": 0.9221137, + "memory(GiB)": 302.58, + "step": 197520, + "train_speed(iter/s)": 0.12384 + }, + { + "acc": 0.75529017, + "epoch": 1.1047372446161956, + "grad_norm": 7.15625, + "learning_rate": 4.5480670249308675e-06, + "loss": 0.96241083, + "memory(GiB)": 302.58, + "step": 197540, + "train_speed(iter/s)": 0.123846 + }, + { + "acc": 0.75682554, + "epoch": 1.1048490940891749, + "grad_norm": 7.40625, + "learning_rate": 4.547146112742814e-06, + "loss": 0.94120216, + "memory(GiB)": 302.58, + "step": 197560, + "train_speed(iter/s)": 0.123852 + }, + { + "acc": 0.75640326, + "epoch": 1.1049609435621541, + "grad_norm": 6.375, + "learning_rate": 4.546225216043799e-06, + "loss": 0.96451435, + "memory(GiB)": 302.58, + "step": 197580, + "train_speed(iter/s)": 0.123858 + }, + { + "acc": 0.76138821, + "epoch": 1.1050727930351334, + "grad_norm": 8.25, + "learning_rate": 4.545304334865319e-06, + "loss": 0.95849304, + "memory(GiB)": 302.58, + "step": 197600, + "train_speed(iter/s)": 0.123863 + }, + { + "acc": 0.75577631, + "epoch": 1.1051846425081127, + "grad_norm": 6.53125, + "learning_rate": 4.544383469238874e-06, + "loss": 0.95184755, + "memory(GiB)": 302.58, + "step": 197620, + "train_speed(iter/s)": 0.12387 + }, + { + "acc": 0.76958961, + "epoch": 1.105296491981092, + "grad_norm": 6.4375, + "learning_rate": 4.543462619195958e-06, + "loss": 0.87964373, + "memory(GiB)": 302.58, + "step": 197640, + "train_speed(iter/s)": 0.123876 + }, + { + "acc": 0.7555284, + "epoch": 1.1054083414540712, + "grad_norm": 6.5, + "learning_rate": 4.54254178476807e-06, + "loss": 0.95559597, + "memory(GiB)": 302.58, + "step": 197660, + "train_speed(iter/s)": 0.123882 + }, + { + "acc": 0.77018976, + "epoch": 1.1055201909270505, + "grad_norm": 12.5625, + "learning_rate": 4.541620965986702e-06, + "loss": 0.90687618, + "memory(GiB)": 302.58, + "step": 197680, + "train_speed(iter/s)": 0.123888 + }, + { + "acc": 0.7603981, + "epoch": 1.1056320404000297, + "grad_norm": 4.375, + "learning_rate": 4.540700162883351e-06, + "loss": 0.93344946, + "memory(GiB)": 302.58, + "step": 197700, + "train_speed(iter/s)": 0.123894 + }, + { + "acc": 0.75270872, + "epoch": 1.105743889873009, + "grad_norm": 7.25, + "learning_rate": 4.539779375489511e-06, + "loss": 0.96697826, + "memory(GiB)": 302.58, + "step": 197720, + "train_speed(iter/s)": 0.1239 + }, + { + "acc": 0.75112252, + "epoch": 1.1058557393459882, + "grad_norm": 7.46875, + "learning_rate": 4.538858603836674e-06, + "loss": 0.96202068, + "memory(GiB)": 302.58, + "step": 197740, + "train_speed(iter/s)": 0.123906 + }, + { + "acc": 0.74009676, + "epoch": 1.1059675888189675, + "grad_norm": 6.84375, + "learning_rate": 4.537937847956337e-06, + "loss": 1.02849016, + "memory(GiB)": 302.58, + "step": 197760, + "train_speed(iter/s)": 0.123912 + }, + { + "acc": 0.75802169, + "epoch": 1.1060794382919468, + "grad_norm": 8.125, + "learning_rate": 4.5370171078799896e-06, + "loss": 0.94073067, + "memory(GiB)": 302.58, + "step": 197780, + "train_speed(iter/s)": 0.123918 + }, + { + "acc": 0.76802487, + "epoch": 1.106191287764926, + "grad_norm": 8.25, + "learning_rate": 4.536096383639126e-06, + "loss": 0.91636267, + "memory(GiB)": 302.58, + "step": 197800, + "train_speed(iter/s)": 0.123924 + }, + { + "acc": 0.75507774, + "epoch": 1.1063031372379053, + "grad_norm": 6.84375, + "learning_rate": 4.535175675265237e-06, + "loss": 0.9694006, + "memory(GiB)": 302.58, + "step": 197820, + "train_speed(iter/s)": 0.12393 + }, + { + "acc": 0.75633388, + "epoch": 1.1064149867108846, + "grad_norm": 6.46875, + "learning_rate": 4.534254982789814e-06, + "loss": 0.9396471, + "memory(GiB)": 302.58, + "step": 197840, + "train_speed(iter/s)": 0.123936 + }, + { + "acc": 0.74864254, + "epoch": 1.1065268361838638, + "grad_norm": 7.625, + "learning_rate": 4.533334306244347e-06, + "loss": 1.00144033, + "memory(GiB)": 302.58, + "step": 197860, + "train_speed(iter/s)": 0.123942 + }, + { + "acc": 0.74696145, + "epoch": 1.106638685656843, + "grad_norm": 3.828125, + "learning_rate": 4.532413645660325e-06, + "loss": 1.00815678, + "memory(GiB)": 302.58, + "step": 197880, + "train_speed(iter/s)": 0.123947 + }, + { + "acc": 0.74731822, + "epoch": 1.1067505351298224, + "grad_norm": 8.375, + "learning_rate": 4.531493001069241e-06, + "loss": 0.99715776, + "memory(GiB)": 302.58, + "step": 197900, + "train_speed(iter/s)": 0.123953 + }, + { + "acc": 0.76337996, + "epoch": 1.1068623846028016, + "grad_norm": 7.5, + "learning_rate": 4.5305723725025815e-06, + "loss": 0.92523651, + "memory(GiB)": 302.58, + "step": 197920, + "train_speed(iter/s)": 0.123959 + }, + { + "acc": 0.762076, + "epoch": 1.1069742340757809, + "grad_norm": 6.5625, + "learning_rate": 4.529651759991836e-06, + "loss": 0.90914612, + "memory(GiB)": 302.58, + "step": 197940, + "train_speed(iter/s)": 0.123965 + }, + { + "acc": 0.73781576, + "epoch": 1.1070860835487601, + "grad_norm": 7.875, + "learning_rate": 4.52873116356849e-06, + "loss": 1.05677576, + "memory(GiB)": 302.58, + "step": 197960, + "train_speed(iter/s)": 0.123971 + }, + { + "acc": 0.74401708, + "epoch": 1.1071979330217394, + "grad_norm": 6.875, + "learning_rate": 4.527810583264035e-06, + "loss": 0.99886274, + "memory(GiB)": 302.58, + "step": 197980, + "train_speed(iter/s)": 0.123976 + }, + { + "acc": 0.75647039, + "epoch": 1.1073097824947187, + "grad_norm": 7.28125, + "learning_rate": 4.526890019109953e-06, + "loss": 0.96388111, + "memory(GiB)": 302.58, + "step": 198000, + "train_speed(iter/s)": 0.123982 + }, + { + "epoch": 1.1073097824947187, + "eval_acc": 0.7064943783423837, + "eval_loss": 1.0136460065841675, + "eval_runtime": 7497.7164, + "eval_samples_per_second": 10.041, + "eval_steps_per_second": 10.041, + "step": 198000 + }, + { + "acc": 0.75363584, + "epoch": 1.107421631967698, + "grad_norm": 8.625, + "learning_rate": 4.525969471137733e-06, + "loss": 0.97107506, + "memory(GiB)": 302.58, + "step": 198020, + "train_speed(iter/s)": 0.123399 + }, + { + "acc": 0.76370678, + "epoch": 1.1075334814406772, + "grad_norm": 7.125, + "learning_rate": 4.525048939378861e-06, + "loss": 0.93195534, + "memory(GiB)": 302.58, + "step": 198040, + "train_speed(iter/s)": 0.123404 + }, + { + "acc": 0.75510826, + "epoch": 1.1076453309136565, + "grad_norm": 10.25, + "learning_rate": 4.5241284238648205e-06, + "loss": 0.97194691, + "memory(GiB)": 302.58, + "step": 198060, + "train_speed(iter/s)": 0.12341 + }, + { + "acc": 0.76271682, + "epoch": 1.1077571803866357, + "grad_norm": 6.9375, + "learning_rate": 4.523207924627098e-06, + "loss": 0.92278252, + "memory(GiB)": 302.58, + "step": 198080, + "train_speed(iter/s)": 0.123416 + }, + { + "acc": 0.7453588, + "epoch": 1.107869029859615, + "grad_norm": 7.375, + "learning_rate": 4.522287441697175e-06, + "loss": 1.00272675, + "memory(GiB)": 302.58, + "step": 198100, + "train_speed(iter/s)": 0.123422 + }, + { + "acc": 0.75580139, + "epoch": 1.1079808793325943, + "grad_norm": 8.5625, + "learning_rate": 4.521366975106538e-06, + "loss": 0.95894051, + "memory(GiB)": 302.58, + "step": 198120, + "train_speed(iter/s)": 0.123428 + }, + { + "acc": 0.74068713, + "epoch": 1.1080927288055735, + "grad_norm": 10.3125, + "learning_rate": 4.520446524886666e-06, + "loss": 1.01609383, + "memory(GiB)": 302.58, + "step": 198140, + "train_speed(iter/s)": 0.123434 + }, + { + "acc": 0.74198027, + "epoch": 1.1082045782785528, + "grad_norm": 5.59375, + "learning_rate": 4.519526091069044e-06, + "loss": 1.03227224, + "memory(GiB)": 302.58, + "step": 198160, + "train_speed(iter/s)": 0.123439 + }, + { + "acc": 0.76103277, + "epoch": 1.108316427751532, + "grad_norm": 8.3125, + "learning_rate": 4.518605673685154e-06, + "loss": 0.92686739, + "memory(GiB)": 302.58, + "step": 198180, + "train_speed(iter/s)": 0.123445 + }, + { + "acc": 0.74638529, + "epoch": 1.1084282772245113, + "grad_norm": 6.09375, + "learning_rate": 4.517685272766475e-06, + "loss": 1.00185862, + "memory(GiB)": 302.58, + "step": 198200, + "train_speed(iter/s)": 0.123451 + }, + { + "acc": 0.75012097, + "epoch": 1.1085401266974906, + "grad_norm": 7.1875, + "learning_rate": 4.51676488834449e-06, + "loss": 0.98471432, + "memory(GiB)": 302.58, + "step": 198220, + "train_speed(iter/s)": 0.123457 + }, + { + "acc": 0.73960552, + "epoch": 1.1086519761704698, + "grad_norm": 10.0625, + "learning_rate": 4.515844520450679e-06, + "loss": 1.02192688, + "memory(GiB)": 302.58, + "step": 198240, + "train_speed(iter/s)": 0.123463 + }, + { + "acc": 0.76839719, + "epoch": 1.108763825643449, + "grad_norm": 6.6875, + "learning_rate": 4.51492416911652e-06, + "loss": 0.8792861, + "memory(GiB)": 302.58, + "step": 198260, + "train_speed(iter/s)": 0.123469 + }, + { + "acc": 0.75114479, + "epoch": 1.1088756751164284, + "grad_norm": 6.0, + "learning_rate": 4.514003834373493e-06, + "loss": 0.98537512, + "memory(GiB)": 302.58, + "step": 198280, + "train_speed(iter/s)": 0.123475 + }, + { + "acc": 0.74123921, + "epoch": 1.1089875245894076, + "grad_norm": 8.25, + "learning_rate": 4.513083516253076e-06, + "loss": 1.02882147, + "memory(GiB)": 302.58, + "step": 198300, + "train_speed(iter/s)": 0.123481 + }, + { + "acc": 0.74847836, + "epoch": 1.109099374062387, + "grad_norm": 6.8125, + "learning_rate": 4.512163214786746e-06, + "loss": 0.97676039, + "memory(GiB)": 302.58, + "step": 198320, + "train_speed(iter/s)": 0.123487 + }, + { + "acc": 0.74609365, + "epoch": 1.1092112235353662, + "grad_norm": 9.375, + "learning_rate": 4.51124293000598e-06, + "loss": 0.97942219, + "memory(GiB)": 302.58, + "step": 198340, + "train_speed(iter/s)": 0.123492 + }, + { + "acc": 0.7543941, + "epoch": 1.1093230730083454, + "grad_norm": 9.625, + "learning_rate": 4.510322661942257e-06, + "loss": 0.95508842, + "memory(GiB)": 302.58, + "step": 198360, + "train_speed(iter/s)": 0.123498 + }, + { + "acc": 0.74119763, + "epoch": 1.1094349224813247, + "grad_norm": 6.34375, + "learning_rate": 4.509402410627051e-06, + "loss": 1.03007717, + "memory(GiB)": 302.58, + "step": 198380, + "train_speed(iter/s)": 0.123504 + }, + { + "acc": 0.73208838, + "epoch": 1.109546771954304, + "grad_norm": 5.125, + "learning_rate": 4.508482176091839e-06, + "loss": 1.05133133, + "memory(GiB)": 302.58, + "step": 198400, + "train_speed(iter/s)": 0.123511 + }, + { + "acc": 0.74234905, + "epoch": 1.1096586214272832, + "grad_norm": 6.59375, + "learning_rate": 4.507561958368095e-06, + "loss": 1.02848511, + "memory(GiB)": 302.58, + "step": 198420, + "train_speed(iter/s)": 0.123517 + }, + { + "acc": 0.75673261, + "epoch": 1.1097704709002625, + "grad_norm": 10.5625, + "learning_rate": 4.506641757487294e-06, + "loss": 0.9411953, + "memory(GiB)": 302.58, + "step": 198440, + "train_speed(iter/s)": 0.123523 + }, + { + "acc": 0.7548017, + "epoch": 1.1098823203732417, + "grad_norm": 8.4375, + "learning_rate": 4.505721573480908e-06, + "loss": 0.95792999, + "memory(GiB)": 302.58, + "step": 198460, + "train_speed(iter/s)": 0.123529 + }, + { + "acc": 0.74877872, + "epoch": 1.109994169846221, + "grad_norm": 5.09375, + "learning_rate": 4.504801406380412e-06, + "loss": 0.98737383, + "memory(GiB)": 302.58, + "step": 198480, + "train_speed(iter/s)": 0.123535 + }, + { + "acc": 0.72581382, + "epoch": 1.1101060193192003, + "grad_norm": 6.3125, + "learning_rate": 4.5038812562172775e-06, + "loss": 1.09029055, + "memory(GiB)": 302.58, + "step": 198500, + "train_speed(iter/s)": 0.123541 + }, + { + "acc": 0.75109868, + "epoch": 1.1102178687921795, + "grad_norm": 7.09375, + "learning_rate": 4.502961123022978e-06, + "loss": 0.97306843, + "memory(GiB)": 302.58, + "step": 198520, + "train_speed(iter/s)": 0.123547 + }, + { + "acc": 0.73500657, + "epoch": 1.1103297182651588, + "grad_norm": 7.3125, + "learning_rate": 4.5020410068289845e-06, + "loss": 1.06407242, + "memory(GiB)": 302.58, + "step": 198540, + "train_speed(iter/s)": 0.123553 + }, + { + "acc": 0.74864497, + "epoch": 1.110441567738138, + "grad_norm": 8.375, + "learning_rate": 4.501120907666767e-06, + "loss": 0.97179661, + "memory(GiB)": 302.58, + "step": 198560, + "train_speed(iter/s)": 0.123559 + }, + { + "acc": 0.75723014, + "epoch": 1.1105534172111173, + "grad_norm": 6.0625, + "learning_rate": 4.500200825567796e-06, + "loss": 0.93313036, + "memory(GiB)": 302.58, + "step": 198580, + "train_speed(iter/s)": 0.123565 + }, + { + "acc": 0.75662708, + "epoch": 1.1106652666840966, + "grad_norm": 5.84375, + "learning_rate": 4.499280760563543e-06, + "loss": 0.93051167, + "memory(GiB)": 302.58, + "step": 198600, + "train_speed(iter/s)": 0.123571 + }, + { + "acc": 0.73722229, + "epoch": 1.1107771161570759, + "grad_norm": 6.75, + "learning_rate": 4.498360712685473e-06, + "loss": 1.04008293, + "memory(GiB)": 302.58, + "step": 198620, + "train_speed(iter/s)": 0.123577 + }, + { + "acc": 0.7348074, + "epoch": 1.1108889656300551, + "grad_norm": 10.75, + "learning_rate": 4.497440681965059e-06, + "loss": 1.06148987, + "memory(GiB)": 302.58, + "step": 198640, + "train_speed(iter/s)": 0.123583 + }, + { + "acc": 0.76494737, + "epoch": 1.1110008151030344, + "grad_norm": 5.75, + "learning_rate": 4.496520668433768e-06, + "loss": 0.91124411, + "memory(GiB)": 302.58, + "step": 198660, + "train_speed(iter/s)": 0.123589 + }, + { + "acc": 0.75707984, + "epoch": 1.1111126645760137, + "grad_norm": 4.84375, + "learning_rate": 4.495600672123066e-06, + "loss": 0.9715476, + "memory(GiB)": 302.58, + "step": 198680, + "train_speed(iter/s)": 0.123595 + }, + { + "acc": 0.75407677, + "epoch": 1.111224514048993, + "grad_norm": 7.1875, + "learning_rate": 4.494680693064421e-06, + "loss": 0.95908461, + "memory(GiB)": 302.58, + "step": 198700, + "train_speed(iter/s)": 0.123601 + }, + { + "acc": 0.75829372, + "epoch": 1.1113363635219722, + "grad_norm": 7.4375, + "learning_rate": 4.493760731289298e-06, + "loss": 0.96166582, + "memory(GiB)": 302.58, + "step": 198720, + "train_speed(iter/s)": 0.123607 + }, + { + "acc": 0.76401429, + "epoch": 1.1114482129949514, + "grad_norm": 7.5625, + "learning_rate": 4.492840786829162e-06, + "loss": 0.90884953, + "memory(GiB)": 302.58, + "step": 198740, + "train_speed(iter/s)": 0.123613 + }, + { + "acc": 0.75989294, + "epoch": 1.1115600624679307, + "grad_norm": 8.25, + "learning_rate": 4.491920859715481e-06, + "loss": 0.94591475, + "memory(GiB)": 302.58, + "step": 198760, + "train_speed(iter/s)": 0.123619 + }, + { + "acc": 0.74927349, + "epoch": 1.11167191194091, + "grad_norm": 9.1875, + "learning_rate": 4.491000949979717e-06, + "loss": 0.9851903, + "memory(GiB)": 302.58, + "step": 198780, + "train_speed(iter/s)": 0.123625 + }, + { + "acc": 0.76898537, + "epoch": 1.1117837614138892, + "grad_norm": 6.34375, + "learning_rate": 4.490081057653335e-06, + "loss": 0.91510496, + "memory(GiB)": 302.58, + "step": 198800, + "train_speed(iter/s)": 0.123631 + }, + { + "acc": 0.76791687, + "epoch": 1.1118956108868685, + "grad_norm": 7.25, + "learning_rate": 4.489161182767797e-06, + "loss": 0.90089197, + "memory(GiB)": 302.58, + "step": 198820, + "train_speed(iter/s)": 0.123637 + }, + { + "acc": 0.74591684, + "epoch": 1.1120074603598478, + "grad_norm": 7.46875, + "learning_rate": 4.488241325354567e-06, + "loss": 1.00085335, + "memory(GiB)": 302.58, + "step": 198840, + "train_speed(iter/s)": 0.123642 + }, + { + "acc": 0.75528846, + "epoch": 1.112119309832827, + "grad_norm": 6.40625, + "learning_rate": 4.487321485445107e-06, + "loss": 0.94297981, + "memory(GiB)": 302.58, + "step": 198860, + "train_speed(iter/s)": 0.123648 + }, + { + "acc": 0.75523705, + "epoch": 1.1122311593058063, + "grad_norm": 7.4375, + "learning_rate": 4.486401663070875e-06, + "loss": 0.97191696, + "memory(GiB)": 302.58, + "step": 198880, + "train_speed(iter/s)": 0.123654 + }, + { + "acc": 0.7243361, + "epoch": 1.1123430087787856, + "grad_norm": 5.65625, + "learning_rate": 4.485481858263337e-06, + "loss": 1.09700394, + "memory(GiB)": 302.58, + "step": 198900, + "train_speed(iter/s)": 0.12366 + }, + { + "acc": 0.75690727, + "epoch": 1.1124548582517648, + "grad_norm": 8.4375, + "learning_rate": 4.484562071053951e-06, + "loss": 0.96845922, + "memory(GiB)": 302.58, + "step": 198920, + "train_speed(iter/s)": 0.123666 + }, + { + "acc": 0.74662604, + "epoch": 1.112566707724744, + "grad_norm": 8.9375, + "learning_rate": 4.4836423014741764e-06, + "loss": 1.00042295, + "memory(GiB)": 302.58, + "step": 198940, + "train_speed(iter/s)": 0.123672 + }, + { + "acc": 0.76014471, + "epoch": 1.1126785571977234, + "grad_norm": 9.25, + "learning_rate": 4.482722549555472e-06, + "loss": 0.96463032, + "memory(GiB)": 302.58, + "step": 198960, + "train_speed(iter/s)": 0.123678 + }, + { + "acc": 0.74373484, + "epoch": 1.1127904066707026, + "grad_norm": 8.125, + "learning_rate": 4.481802815329297e-06, + "loss": 1.0160449, + "memory(GiB)": 302.58, + "step": 198980, + "train_speed(iter/s)": 0.123684 + }, + { + "acc": 0.74405289, + "epoch": 1.1129022561436819, + "grad_norm": 4.71875, + "learning_rate": 4.480883098827109e-06, + "loss": 1.00295019, + "memory(GiB)": 302.58, + "step": 199000, + "train_speed(iter/s)": 0.12369 + }, + { + "acc": 0.75969701, + "epoch": 1.1130141056166611, + "grad_norm": 7.15625, + "learning_rate": 4.4799634000803645e-06, + "loss": 0.93508129, + "memory(GiB)": 302.58, + "step": 199020, + "train_speed(iter/s)": 0.123696 + }, + { + "acc": 0.7573843, + "epoch": 1.1131259550896404, + "grad_norm": 5.375, + "learning_rate": 4.4790437191205215e-06, + "loss": 0.93324938, + "memory(GiB)": 302.58, + "step": 199040, + "train_speed(iter/s)": 0.123702 + }, + { + "acc": 0.75977144, + "epoch": 1.1132378045626197, + "grad_norm": 8.0, + "learning_rate": 4.478124055979034e-06, + "loss": 0.95592127, + "memory(GiB)": 302.58, + "step": 199060, + "train_speed(iter/s)": 0.123708 + }, + { + "acc": 0.74934278, + "epoch": 1.113349654035599, + "grad_norm": 7.53125, + "learning_rate": 4.477204410687359e-06, + "loss": 1.01550646, + "memory(GiB)": 302.58, + "step": 199080, + "train_speed(iter/s)": 0.123713 + }, + { + "acc": 0.75824366, + "epoch": 1.1134615035085782, + "grad_norm": 9.4375, + "learning_rate": 4.476284783276952e-06, + "loss": 0.94240141, + "memory(GiB)": 302.58, + "step": 199100, + "train_speed(iter/s)": 0.123719 + }, + { + "acc": 0.77721162, + "epoch": 1.1135733529815575, + "grad_norm": 5.34375, + "learning_rate": 4.475365173779266e-06, + "loss": 0.85996323, + "memory(GiB)": 302.58, + "step": 199120, + "train_speed(iter/s)": 0.123726 + }, + { + "acc": 0.74921908, + "epoch": 1.1136852024545367, + "grad_norm": 9.0625, + "learning_rate": 4.474445582225754e-06, + "loss": 0.97027483, + "memory(GiB)": 302.58, + "step": 199140, + "train_speed(iter/s)": 0.123732 + }, + { + "acc": 0.74392195, + "epoch": 1.113797051927516, + "grad_norm": 6.34375, + "learning_rate": 4.473526008647868e-06, + "loss": 1.0093112, + "memory(GiB)": 302.58, + "step": 199160, + "train_speed(iter/s)": 0.123738 + }, + { + "acc": 0.74151735, + "epoch": 1.1139089014004953, + "grad_norm": 10.8125, + "learning_rate": 4.4726064530770645e-06, + "loss": 1.0144269, + "memory(GiB)": 302.58, + "step": 199180, + "train_speed(iter/s)": 0.123743 + }, + { + "acc": 0.75246739, + "epoch": 1.1140207508734745, + "grad_norm": 9.0, + "learning_rate": 4.471686915544791e-06, + "loss": 0.97460527, + "memory(GiB)": 302.58, + "step": 199200, + "train_speed(iter/s)": 0.123749 + }, + { + "acc": 0.75676446, + "epoch": 1.1141326003464538, + "grad_norm": 9.8125, + "learning_rate": 4.470767396082501e-06, + "loss": 0.94619236, + "memory(GiB)": 302.58, + "step": 199220, + "train_speed(iter/s)": 0.123755 + }, + { + "acc": 0.75284119, + "epoch": 1.114244449819433, + "grad_norm": 6.40625, + "learning_rate": 4.469847894721644e-06, + "loss": 0.95736856, + "memory(GiB)": 302.58, + "step": 199240, + "train_speed(iter/s)": 0.123761 + }, + { + "acc": 0.73573399, + "epoch": 1.1143562992924123, + "grad_norm": 4.21875, + "learning_rate": 4.468928411493671e-06, + "loss": 1.02118177, + "memory(GiB)": 302.58, + "step": 199260, + "train_speed(iter/s)": 0.123767 + }, + { + "acc": 0.75369968, + "epoch": 1.1144681487653916, + "grad_norm": 5.59375, + "learning_rate": 4.468008946430029e-06, + "loss": 0.96078329, + "memory(GiB)": 302.58, + "step": 199280, + "train_speed(iter/s)": 0.123773 + }, + { + "acc": 0.73959708, + "epoch": 1.1145799982383708, + "grad_norm": 9.3125, + "learning_rate": 4.467089499562167e-06, + "loss": 1.04329386, + "memory(GiB)": 302.58, + "step": 199300, + "train_speed(iter/s)": 0.123779 + }, + { + "acc": 0.75119138, + "epoch": 1.11469184771135, + "grad_norm": 5.78125, + "learning_rate": 4.466170070921536e-06, + "loss": 0.97115717, + "memory(GiB)": 302.58, + "step": 199320, + "train_speed(iter/s)": 0.123786 + }, + { + "acc": 0.73181596, + "epoch": 1.1148036971843294, + "grad_norm": 8.5, + "learning_rate": 4.465250660539581e-06, + "loss": 1.07676601, + "memory(GiB)": 302.58, + "step": 199340, + "train_speed(iter/s)": 0.123791 + }, + { + "acc": 0.75867796, + "epoch": 1.1149155466573086, + "grad_norm": 9.5, + "learning_rate": 4.4643312684477485e-06, + "loss": 0.93959284, + "memory(GiB)": 302.58, + "step": 199360, + "train_speed(iter/s)": 0.123797 + }, + { + "acc": 0.75119486, + "epoch": 1.115027396130288, + "grad_norm": 6.5625, + "learning_rate": 4.463411894677485e-06, + "loss": 0.98360577, + "memory(GiB)": 302.58, + "step": 199380, + "train_speed(iter/s)": 0.123802 + }, + { + "acc": 0.74500766, + "epoch": 1.1151392456032672, + "grad_norm": 7.25, + "learning_rate": 4.462492539260237e-06, + "loss": 1.01423492, + "memory(GiB)": 302.58, + "step": 199400, + "train_speed(iter/s)": 0.123809 + }, + { + "acc": 0.75267997, + "epoch": 1.1152510950762464, + "grad_norm": 8.875, + "learning_rate": 4.461573202227446e-06, + "loss": 0.98182745, + "memory(GiB)": 302.58, + "step": 199420, + "train_speed(iter/s)": 0.123814 + }, + { + "acc": 0.75709996, + "epoch": 1.1153629445492257, + "grad_norm": 7.125, + "learning_rate": 4.460653883610561e-06, + "loss": 0.9571394, + "memory(GiB)": 302.58, + "step": 199440, + "train_speed(iter/s)": 0.12382 + }, + { + "acc": 0.73916311, + "epoch": 1.115474794022205, + "grad_norm": 6.34375, + "learning_rate": 4.459734583441023e-06, + "loss": 1.01124601, + "memory(GiB)": 302.58, + "step": 199460, + "train_speed(iter/s)": 0.123826 + }, + { + "acc": 0.75850086, + "epoch": 1.1155866434951842, + "grad_norm": 6.0625, + "learning_rate": 4.4588153017502754e-06, + "loss": 0.96678972, + "memory(GiB)": 302.58, + "step": 199480, + "train_speed(iter/s)": 0.123832 + }, + { + "acc": 0.75626812, + "epoch": 1.1156984929681635, + "grad_norm": 11.4375, + "learning_rate": 4.4578960385697605e-06, + "loss": 0.94647827, + "memory(GiB)": 302.58, + "step": 199500, + "train_speed(iter/s)": 0.123838 + }, + { + "acc": 0.74350581, + "epoch": 1.1158103424411427, + "grad_norm": 4.90625, + "learning_rate": 4.45697679393092e-06, + "loss": 1.0046051, + "memory(GiB)": 302.58, + "step": 199520, + "train_speed(iter/s)": 0.123844 + }, + { + "acc": 0.75636215, + "epoch": 1.115922191914122, + "grad_norm": 9.5, + "learning_rate": 4.456057567865195e-06, + "loss": 0.94295874, + "memory(GiB)": 302.58, + "step": 199540, + "train_speed(iter/s)": 0.12385 + }, + { + "acc": 0.74935551, + "epoch": 1.1160340413871013, + "grad_norm": 8.125, + "learning_rate": 4.455138360404026e-06, + "loss": 0.98024712, + "memory(GiB)": 302.58, + "step": 199560, + "train_speed(iter/s)": 0.123856 + }, + { + "acc": 0.75622234, + "epoch": 1.1161458908600805, + "grad_norm": 6.96875, + "learning_rate": 4.454219171578854e-06, + "loss": 0.93840723, + "memory(GiB)": 302.58, + "step": 199580, + "train_speed(iter/s)": 0.123863 + }, + { + "acc": 0.74446383, + "epoch": 1.1162577403330598, + "grad_norm": 6.03125, + "learning_rate": 4.4533000014211165e-06, + "loss": 0.98859978, + "memory(GiB)": 302.58, + "step": 199600, + "train_speed(iter/s)": 0.123869 + }, + { + "acc": 0.74203091, + "epoch": 1.116369589806039, + "grad_norm": 5.75, + "learning_rate": 4.452380849962251e-06, + "loss": 1.01944895, + "memory(GiB)": 302.58, + "step": 199620, + "train_speed(iter/s)": 0.123874 + }, + { + "acc": 0.7589376, + "epoch": 1.1164814392790183, + "grad_norm": 8.4375, + "learning_rate": 4.451461717233698e-06, + "loss": 0.94259062, + "memory(GiB)": 302.58, + "step": 199640, + "train_speed(iter/s)": 0.12388 + }, + { + "acc": 0.73841505, + "epoch": 1.1165932887519976, + "grad_norm": 7.65625, + "learning_rate": 4.450542603266894e-06, + "loss": 1.03725872, + "memory(GiB)": 302.58, + "step": 199660, + "train_speed(iter/s)": 0.123886 + }, + { + "acc": 0.77034945, + "epoch": 1.1167051382249769, + "grad_norm": 8.0625, + "learning_rate": 4.449623508093274e-06, + "loss": 0.89292746, + "memory(GiB)": 302.58, + "step": 199680, + "train_speed(iter/s)": 0.123892 + }, + { + "acc": 0.75860982, + "epoch": 1.1168169876979561, + "grad_norm": 5.75, + "learning_rate": 4.448704431744276e-06, + "loss": 0.96299763, + "memory(GiB)": 302.58, + "step": 199700, + "train_speed(iter/s)": 0.123897 + }, + { + "acc": 0.73666039, + "epoch": 1.1169288371709354, + "grad_norm": 6.78125, + "learning_rate": 4.447785374251335e-06, + "loss": 1.03818979, + "memory(GiB)": 302.58, + "step": 199720, + "train_speed(iter/s)": 0.123903 + }, + { + "acc": 0.74993072, + "epoch": 1.1170406866439146, + "grad_norm": 9.5625, + "learning_rate": 4.446866335645886e-06, + "loss": 1.00628595, + "memory(GiB)": 302.58, + "step": 199740, + "train_speed(iter/s)": 0.123908 + }, + { + "acc": 0.75348144, + "epoch": 1.117152536116894, + "grad_norm": 5.9375, + "learning_rate": 4.4459473159593595e-06, + "loss": 0.95524969, + "memory(GiB)": 302.58, + "step": 199760, + "train_speed(iter/s)": 0.123914 + }, + { + "acc": 0.75390973, + "epoch": 1.1172643855898732, + "grad_norm": 6.6875, + "learning_rate": 4.4450283152231925e-06, + "loss": 0.95458031, + "memory(GiB)": 302.58, + "step": 199780, + "train_speed(iter/s)": 0.12392 + }, + { + "acc": 0.77168641, + "epoch": 1.1173762350628524, + "grad_norm": 10.0, + "learning_rate": 4.444109333468818e-06, + "loss": 0.87700987, + "memory(GiB)": 302.58, + "step": 199800, + "train_speed(iter/s)": 0.123926 + }, + { + "acc": 0.73386536, + "epoch": 1.1174880845358317, + "grad_norm": 4.78125, + "learning_rate": 4.443190370727667e-06, + "loss": 1.06187668, + "memory(GiB)": 302.58, + "step": 199820, + "train_speed(iter/s)": 0.123931 + }, + { + "acc": 0.74262967, + "epoch": 1.117599934008811, + "grad_norm": 9.3125, + "learning_rate": 4.44227142703117e-06, + "loss": 1.01480999, + "memory(GiB)": 302.58, + "step": 199840, + "train_speed(iter/s)": 0.123937 + }, + { + "acc": 0.7438777, + "epoch": 1.1177117834817902, + "grad_norm": 8.0, + "learning_rate": 4.4413525024107595e-06, + "loss": 1.003895, + "memory(GiB)": 302.58, + "step": 199860, + "train_speed(iter/s)": 0.123943 + }, + { + "acc": 0.75470448, + "epoch": 1.1178236329547695, + "grad_norm": 8.625, + "learning_rate": 4.4404335968978644e-06, + "loss": 0.945222, + "memory(GiB)": 302.58, + "step": 199880, + "train_speed(iter/s)": 0.123949 + }, + { + "acc": 0.73576913, + "epoch": 1.1179354824277488, + "grad_norm": 6.09375, + "learning_rate": 4.439514710523914e-06, + "loss": 1.04896326, + "memory(GiB)": 302.58, + "step": 199900, + "train_speed(iter/s)": 0.123955 + }, + { + "acc": 0.75679035, + "epoch": 1.118047331900728, + "grad_norm": 6.59375, + "learning_rate": 4.438595843320338e-06, + "loss": 0.95526323, + "memory(GiB)": 302.58, + "step": 199920, + "train_speed(iter/s)": 0.12396 + }, + { + "acc": 0.74757433, + "epoch": 1.1181591813737073, + "grad_norm": 10.3125, + "learning_rate": 4.437676995318565e-06, + "loss": 1.01001749, + "memory(GiB)": 302.58, + "step": 199940, + "train_speed(iter/s)": 0.123966 + }, + { + "acc": 0.74403481, + "epoch": 1.1182710308466866, + "grad_norm": 5.84375, + "learning_rate": 4.436758166550022e-06, + "loss": 1.01208334, + "memory(GiB)": 302.58, + "step": 199960, + "train_speed(iter/s)": 0.123972 + }, + { + "acc": 0.75548158, + "epoch": 1.1183828803196658, + "grad_norm": 4.375, + "learning_rate": 4.435839357046136e-06, + "loss": 0.967521, + "memory(GiB)": 302.58, + "step": 199980, + "train_speed(iter/s)": 0.123977 + }, + { + "acc": 0.75449972, + "epoch": 1.118494729792645, + "grad_norm": 5.65625, + "learning_rate": 4.434920566838332e-06, + "loss": 0.97388372, + "memory(GiB)": 302.58, + "step": 200000, + "train_speed(iter/s)": 0.123983 + }, + { + "epoch": 1.118494729792645, + "eval_acc": 0.7064977305238503, + "eval_loss": 1.013623833656311, + "eval_runtime": 7480.343, + "eval_samples_per_second": 10.064, + "eval_steps_per_second": 10.064, + "step": 200000 + }, + { + "acc": 0.75119066, + "epoch": 1.1186065792656243, + "grad_norm": 6.625, + "learning_rate": 4.434001795958037e-06, + "loss": 0.98020725, + "memory(GiB)": 302.58, + "step": 200020, + "train_speed(iter/s)": 0.123407 + }, + { + "acc": 0.74533687, + "epoch": 1.1187184287386036, + "grad_norm": 9.0625, + "learning_rate": 4.433083044436674e-06, + "loss": 1.00100965, + "memory(GiB)": 302.58, + "step": 200040, + "train_speed(iter/s)": 0.123412 + }, + { + "acc": 0.75268879, + "epoch": 1.1188302782115829, + "grad_norm": 7.625, + "learning_rate": 4.4321643123056694e-06, + "loss": 0.99229116, + "memory(GiB)": 302.58, + "step": 200060, + "train_speed(iter/s)": 0.123418 + }, + { + "acc": 0.74756503, + "epoch": 1.1189421276845621, + "grad_norm": 7.59375, + "learning_rate": 4.431245599596447e-06, + "loss": 1.00124063, + "memory(GiB)": 302.58, + "step": 200080, + "train_speed(iter/s)": 0.123424 + }, + { + "acc": 0.75135536, + "epoch": 1.1190539771575414, + "grad_norm": 7.53125, + "learning_rate": 4.430326906340427e-06, + "loss": 0.97686615, + "memory(GiB)": 302.58, + "step": 200100, + "train_speed(iter/s)": 0.12343 + }, + { + "acc": 0.74370394, + "epoch": 1.1191658266305207, + "grad_norm": 8.75, + "learning_rate": 4.429408232569034e-06, + "loss": 1.02158947, + "memory(GiB)": 302.58, + "step": 200120, + "train_speed(iter/s)": 0.123436 + }, + { + "acc": 0.74602942, + "epoch": 1.1192776761035, + "grad_norm": 7.625, + "learning_rate": 4.428489578313689e-06, + "loss": 0.98489933, + "memory(GiB)": 302.58, + "step": 200140, + "train_speed(iter/s)": 0.123441 + }, + { + "acc": 0.75091715, + "epoch": 1.1193895255764792, + "grad_norm": 6.40625, + "learning_rate": 4.4275709436058114e-06, + "loss": 0.96042728, + "memory(GiB)": 302.58, + "step": 200160, + "train_speed(iter/s)": 0.123447 + }, + { + "acc": 0.75919509, + "epoch": 1.1195013750494585, + "grad_norm": 9.1875, + "learning_rate": 4.426652328476823e-06, + "loss": 0.95179653, + "memory(GiB)": 302.58, + "step": 200180, + "train_speed(iter/s)": 0.123453 + }, + { + "acc": 0.75919948, + "epoch": 1.1196132245224377, + "grad_norm": 7.1875, + "learning_rate": 4.425733732958143e-06, + "loss": 0.93345919, + "memory(GiB)": 302.58, + "step": 200200, + "train_speed(iter/s)": 0.123459 + }, + { + "acc": 0.75542808, + "epoch": 1.119725073995417, + "grad_norm": 7.6875, + "learning_rate": 4.424815157081191e-06, + "loss": 0.95739708, + "memory(GiB)": 302.58, + "step": 200220, + "train_speed(iter/s)": 0.123465 + }, + { + "acc": 0.75114923, + "epoch": 1.1198369234683963, + "grad_norm": 7.71875, + "learning_rate": 4.423896600877384e-06, + "loss": 0.98508282, + "memory(GiB)": 302.58, + "step": 200240, + "train_speed(iter/s)": 0.12347 + }, + { + "acc": 0.75063763, + "epoch": 1.1199487729413755, + "grad_norm": 6.5, + "learning_rate": 4.422978064378141e-06, + "loss": 0.96784172, + "memory(GiB)": 302.58, + "step": 200260, + "train_speed(iter/s)": 0.123476 + }, + { + "acc": 0.75362616, + "epoch": 1.1200606224143548, + "grad_norm": 6.75, + "learning_rate": 4.422059547614877e-06, + "loss": 0.95679226, + "memory(GiB)": 302.58, + "step": 200280, + "train_speed(iter/s)": 0.123482 + }, + { + "acc": 0.74416566, + "epoch": 1.120172471887334, + "grad_norm": 7.6875, + "learning_rate": 4.421141050619009e-06, + "loss": 1.01642017, + "memory(GiB)": 302.58, + "step": 200300, + "train_speed(iter/s)": 0.123488 + }, + { + "acc": 0.74993763, + "epoch": 1.1202843213603133, + "grad_norm": 7.875, + "learning_rate": 4.420222573421951e-06, + "loss": 0.9835043, + "memory(GiB)": 302.58, + "step": 200320, + "train_speed(iter/s)": 0.123494 + }, + { + "acc": 0.74513278, + "epoch": 1.1203961708332926, + "grad_norm": 6.34375, + "learning_rate": 4.419304116055122e-06, + "loss": 1.00123825, + "memory(GiB)": 302.58, + "step": 200340, + "train_speed(iter/s)": 0.1235 + }, + { + "acc": 0.74616427, + "epoch": 1.1205080203062718, + "grad_norm": 5.96875, + "learning_rate": 4.418385678549933e-06, + "loss": 1.02020607, + "memory(GiB)": 302.58, + "step": 200360, + "train_speed(iter/s)": 0.123506 + }, + { + "acc": 0.76142759, + "epoch": 1.120619869779251, + "grad_norm": 5.4375, + "learning_rate": 4.4174672609377975e-06, + "loss": 0.92407103, + "memory(GiB)": 302.58, + "step": 200380, + "train_speed(iter/s)": 0.123511 + }, + { + "acc": 0.75600181, + "epoch": 1.1207317192522304, + "grad_norm": 6.75, + "learning_rate": 4.416548863250129e-06, + "loss": 0.95954332, + "memory(GiB)": 302.58, + "step": 200400, + "train_speed(iter/s)": 0.123517 + }, + { + "acc": 0.75339975, + "epoch": 1.1208435687252096, + "grad_norm": 8.625, + "learning_rate": 4.415630485518339e-06, + "loss": 0.96582537, + "memory(GiB)": 302.58, + "step": 200420, + "train_speed(iter/s)": 0.123523 + }, + { + "acc": 0.74752555, + "epoch": 1.120955418198189, + "grad_norm": 6.75, + "learning_rate": 4.414712127773839e-06, + "loss": 1.00179195, + "memory(GiB)": 302.58, + "step": 200440, + "train_speed(iter/s)": 0.12353 + }, + { + "acc": 0.74792962, + "epoch": 1.1210672676711682, + "grad_norm": 6.53125, + "learning_rate": 4.413793790048042e-06, + "loss": 0.99568863, + "memory(GiB)": 302.58, + "step": 200460, + "train_speed(iter/s)": 0.123535 + }, + { + "acc": 0.74498053, + "epoch": 1.1211791171441474, + "grad_norm": 6.84375, + "learning_rate": 4.4128754723723545e-06, + "loss": 1.0050993, + "memory(GiB)": 302.58, + "step": 200480, + "train_speed(iter/s)": 0.123541 + }, + { + "acc": 0.73776932, + "epoch": 1.1212909666171267, + "grad_norm": 6.34375, + "learning_rate": 4.411957174778189e-06, + "loss": 1.06684799, + "memory(GiB)": 302.58, + "step": 200500, + "train_speed(iter/s)": 0.123547 + }, + { + "acc": 0.72268138, + "epoch": 1.121402816090106, + "grad_norm": 8.375, + "learning_rate": 4.411038897296952e-06, + "loss": 1.10892544, + "memory(GiB)": 302.58, + "step": 200520, + "train_speed(iter/s)": 0.123553 + }, + { + "acc": 0.74058685, + "epoch": 1.1215146655630852, + "grad_norm": 8.0, + "learning_rate": 4.410120639960052e-06, + "loss": 1.01704912, + "memory(GiB)": 302.58, + "step": 200540, + "train_speed(iter/s)": 0.123559 + }, + { + "acc": 0.75914431, + "epoch": 1.1216265150360645, + "grad_norm": 5.34375, + "learning_rate": 4.409202402798897e-06, + "loss": 0.93341761, + "memory(GiB)": 302.58, + "step": 200560, + "train_speed(iter/s)": 0.123565 + }, + { + "acc": 0.74846997, + "epoch": 1.1217383645090437, + "grad_norm": 6.4375, + "learning_rate": 4.408284185844892e-06, + "loss": 0.98068151, + "memory(GiB)": 302.58, + "step": 200580, + "train_speed(iter/s)": 0.12357 + }, + { + "acc": 0.75453968, + "epoch": 1.121850213982023, + "grad_norm": 8.1875, + "learning_rate": 4.407365989129446e-06, + "loss": 0.95809641, + "memory(GiB)": 302.58, + "step": 200600, + "train_speed(iter/s)": 0.123576 + }, + { + "acc": 0.74717159, + "epoch": 1.1219620634550023, + "grad_norm": 7.8125, + "learning_rate": 4.406447812683961e-06, + "loss": 1.01604443, + "memory(GiB)": 302.58, + "step": 200620, + "train_speed(iter/s)": 0.123582 + }, + { + "acc": 0.74687228, + "epoch": 1.1220739129279815, + "grad_norm": 5.40625, + "learning_rate": 4.405529656539843e-06, + "loss": 0.99853029, + "memory(GiB)": 302.58, + "step": 200640, + "train_speed(iter/s)": 0.123588 + }, + { + "acc": 0.75470219, + "epoch": 1.1221857624009608, + "grad_norm": 5.90625, + "learning_rate": 4.404611520728497e-06, + "loss": 0.96722784, + "memory(GiB)": 302.58, + "step": 200660, + "train_speed(iter/s)": 0.123594 + }, + { + "acc": 0.75915828, + "epoch": 1.12229761187394, + "grad_norm": 9.25, + "learning_rate": 4.403693405281322e-06, + "loss": 0.92805634, + "memory(GiB)": 302.58, + "step": 200680, + "train_speed(iter/s)": 0.123599 + }, + { + "acc": 0.75440435, + "epoch": 1.1224094613469193, + "grad_norm": 4.25, + "learning_rate": 4.402775310229726e-06, + "loss": 0.95906277, + "memory(GiB)": 302.58, + "step": 200700, + "train_speed(iter/s)": 0.123605 + }, + { + "acc": 0.73832874, + "epoch": 1.1225213108198986, + "grad_norm": 8.125, + "learning_rate": 4.401857235605108e-06, + "loss": 1.02338018, + "memory(GiB)": 302.58, + "step": 200720, + "train_speed(iter/s)": 0.123611 + }, + { + "acc": 0.73671045, + "epoch": 1.1226331602928779, + "grad_norm": 7.78125, + "learning_rate": 4.400939181438868e-06, + "loss": 1.06498289, + "memory(GiB)": 302.58, + "step": 200740, + "train_speed(iter/s)": 0.123617 + }, + { + "acc": 0.75627365, + "epoch": 1.1227450097658571, + "grad_norm": 5.53125, + "learning_rate": 4.400021147762409e-06, + "loss": 0.95505972, + "memory(GiB)": 302.58, + "step": 200760, + "train_speed(iter/s)": 0.123623 + }, + { + "acc": 0.76643171, + "epoch": 1.1228568592388364, + "grad_norm": 7.09375, + "learning_rate": 4.3991031346071275e-06, + "loss": 0.94066067, + "memory(GiB)": 302.58, + "step": 200780, + "train_speed(iter/s)": 0.123628 + }, + { + "acc": 0.76417127, + "epoch": 1.1229687087118156, + "grad_norm": 7.0625, + "learning_rate": 4.398185142004426e-06, + "loss": 0.9174942, + "memory(GiB)": 302.58, + "step": 200800, + "train_speed(iter/s)": 0.123635 + }, + { + "acc": 0.73413358, + "epoch": 1.123080558184795, + "grad_norm": 10.9375, + "learning_rate": 4.3972671699857e-06, + "loss": 1.0452796, + "memory(GiB)": 302.58, + "step": 200820, + "train_speed(iter/s)": 0.123641 + }, + { + "acc": 0.74144521, + "epoch": 1.1231924076577742, + "grad_norm": 7.28125, + "learning_rate": 4.396349218582349e-06, + "loss": 1.01685686, + "memory(GiB)": 302.58, + "step": 200840, + "train_speed(iter/s)": 0.123646 + }, + { + "acc": 0.75645227, + "epoch": 1.1233042571307534, + "grad_norm": 5.21875, + "learning_rate": 4.395431287825767e-06, + "loss": 0.97388096, + "memory(GiB)": 302.58, + "step": 200860, + "train_speed(iter/s)": 0.123652 + }, + { + "acc": 0.75978985, + "epoch": 1.1234161066037327, + "grad_norm": 6.875, + "learning_rate": 4.394513377747354e-06, + "loss": 0.93966503, + "memory(GiB)": 302.58, + "step": 200880, + "train_speed(iter/s)": 0.123658 + }, + { + "acc": 0.75260258, + "epoch": 1.123527956076712, + "grad_norm": 6.46875, + "learning_rate": 4.393595488378503e-06, + "loss": 0.97658978, + "memory(GiB)": 302.58, + "step": 200900, + "train_speed(iter/s)": 0.123664 + }, + { + "acc": 0.73044124, + "epoch": 1.1236398055496912, + "grad_norm": 7.59375, + "learning_rate": 4.392677619750607e-06, + "loss": 1.06137247, + "memory(GiB)": 302.58, + "step": 200920, + "train_speed(iter/s)": 0.123669 + }, + { + "acc": 0.73356609, + "epoch": 1.1237516550226705, + "grad_norm": 5.09375, + "learning_rate": 4.391759771895065e-06, + "loss": 1.05174408, + "memory(GiB)": 302.58, + "step": 200940, + "train_speed(iter/s)": 0.123675 + }, + { + "acc": 0.75145273, + "epoch": 1.1238635044956498, + "grad_norm": 7.9375, + "learning_rate": 4.390841944843265e-06, + "loss": 0.98406439, + "memory(GiB)": 302.58, + "step": 200960, + "train_speed(iter/s)": 0.12368 + }, + { + "acc": 0.74037981, + "epoch": 1.123975353968629, + "grad_norm": 6.625, + "learning_rate": 4.389924138626604e-06, + "loss": 1.02293701, + "memory(GiB)": 302.58, + "step": 200980, + "train_speed(iter/s)": 0.123686 + }, + { + "acc": 0.74307671, + "epoch": 1.1240872034416083, + "grad_norm": 8.75, + "learning_rate": 4.389006353276472e-06, + "loss": 1.00743618, + "memory(GiB)": 302.58, + "step": 201000, + "train_speed(iter/s)": 0.123692 + }, + { + "acc": 0.75310092, + "epoch": 1.1241990529145875, + "grad_norm": 6.09375, + "learning_rate": 4.388088588824259e-06, + "loss": 0.96164703, + "memory(GiB)": 302.58, + "step": 201020, + "train_speed(iter/s)": 0.123698 + }, + { + "acc": 0.75244641, + "epoch": 1.1243109023875668, + "grad_norm": 7.59375, + "learning_rate": 4.3871708453013575e-06, + "loss": 0.96302032, + "memory(GiB)": 302.58, + "step": 201040, + "train_speed(iter/s)": 0.123703 + }, + { + "acc": 0.75300126, + "epoch": 1.124422751860546, + "grad_norm": 6.96875, + "learning_rate": 4.386253122739155e-06, + "loss": 0.97227335, + "memory(GiB)": 302.58, + "step": 201060, + "train_speed(iter/s)": 0.123709 + }, + { + "acc": 0.75798616, + "epoch": 1.1245346013335253, + "grad_norm": 9.9375, + "learning_rate": 4.385335421169042e-06, + "loss": 0.94256411, + "memory(GiB)": 302.58, + "step": 201080, + "train_speed(iter/s)": 0.123715 + }, + { + "acc": 0.75834441, + "epoch": 1.1246464508065046, + "grad_norm": 4.375, + "learning_rate": 4.384417740622408e-06, + "loss": 0.94508686, + "memory(GiB)": 302.58, + "step": 201100, + "train_speed(iter/s)": 0.123722 + }, + { + "acc": 0.74843183, + "epoch": 1.1247583002794839, + "grad_norm": 10.0625, + "learning_rate": 4.383500081130639e-06, + "loss": 1.00276747, + "memory(GiB)": 302.58, + "step": 201120, + "train_speed(iter/s)": 0.123727 + }, + { + "acc": 0.72873526, + "epoch": 1.1248701497524631, + "grad_norm": 5.25, + "learning_rate": 4.382582442725122e-06, + "loss": 1.08237505, + "memory(GiB)": 302.58, + "step": 201140, + "train_speed(iter/s)": 0.123733 + }, + { + "acc": 0.76739855, + "epoch": 1.1249819992254424, + "grad_norm": 7.03125, + "learning_rate": 4.381664825437243e-06, + "loss": 0.92570181, + "memory(GiB)": 302.58, + "step": 201160, + "train_speed(iter/s)": 0.123739 + }, + { + "acc": 0.74960136, + "epoch": 1.1250938486984217, + "grad_norm": 4.84375, + "learning_rate": 4.380747229298386e-06, + "loss": 0.96159916, + "memory(GiB)": 302.58, + "step": 201180, + "train_speed(iter/s)": 0.123744 + }, + { + "acc": 0.73638759, + "epoch": 1.125205698171401, + "grad_norm": 7.53125, + "learning_rate": 4.379829654339939e-06, + "loss": 1.04885464, + "memory(GiB)": 302.58, + "step": 201200, + "train_speed(iter/s)": 0.123749 + }, + { + "acc": 0.74310274, + "epoch": 1.1253175476443802, + "grad_norm": 7.15625, + "learning_rate": 4.378912100593285e-06, + "loss": 1.02687044, + "memory(GiB)": 302.58, + "step": 201220, + "train_speed(iter/s)": 0.123755 + }, + { + "acc": 0.74311461, + "epoch": 1.1254293971173595, + "grad_norm": 5.1875, + "learning_rate": 4.377994568089806e-06, + "loss": 1.02359924, + "memory(GiB)": 302.58, + "step": 201240, + "train_speed(iter/s)": 0.123761 + }, + { + "acc": 0.7486948, + "epoch": 1.1255412465903387, + "grad_norm": 6.5625, + "learning_rate": 4.377077056860885e-06, + "loss": 0.97791519, + "memory(GiB)": 302.58, + "step": 201260, + "train_speed(iter/s)": 0.123766 + }, + { + "acc": 0.73071079, + "epoch": 1.125653096063318, + "grad_norm": 8.0, + "learning_rate": 4.376159566937904e-06, + "loss": 1.06505785, + "memory(GiB)": 302.58, + "step": 201280, + "train_speed(iter/s)": 0.123772 + }, + { + "acc": 0.7538373, + "epoch": 1.1257649455362972, + "grad_norm": 10.5, + "learning_rate": 4.375242098352244e-06, + "loss": 0.96067104, + "memory(GiB)": 302.58, + "step": 201300, + "train_speed(iter/s)": 0.123778 + }, + { + "acc": 0.75154095, + "epoch": 1.1258767950092765, + "grad_norm": 5.34375, + "learning_rate": 4.374324651135284e-06, + "loss": 0.97535524, + "memory(GiB)": 302.58, + "step": 201320, + "train_speed(iter/s)": 0.123783 + }, + { + "acc": 0.7449235, + "epoch": 1.1259886444822558, + "grad_norm": 7.3125, + "learning_rate": 4.373407225318406e-06, + "loss": 1.01702614, + "memory(GiB)": 302.58, + "step": 201340, + "train_speed(iter/s)": 0.123789 + }, + { + "acc": 0.7489996, + "epoch": 1.126100493955235, + "grad_norm": 10.6875, + "learning_rate": 4.372489820932988e-06, + "loss": 0.97431049, + "memory(GiB)": 302.58, + "step": 201360, + "train_speed(iter/s)": 0.123795 + }, + { + "acc": 0.75404792, + "epoch": 1.1262123434282143, + "grad_norm": 4.5625, + "learning_rate": 4.3715724380104075e-06, + "loss": 0.96682234, + "memory(GiB)": 302.58, + "step": 201380, + "train_speed(iter/s)": 0.123801 + }, + { + "acc": 0.74380188, + "epoch": 1.1263241929011936, + "grad_norm": 7.5, + "learning_rate": 4.3706550765820425e-06, + "loss": 1.02380276, + "memory(GiB)": 302.58, + "step": 201400, + "train_speed(iter/s)": 0.123806 + }, + { + "acc": 0.73935027, + "epoch": 1.1264360423741728, + "grad_norm": 7.0, + "learning_rate": 4.36973773667927e-06, + "loss": 1.02248926, + "memory(GiB)": 302.58, + "step": 201420, + "train_speed(iter/s)": 0.123812 + }, + { + "acc": 0.75218477, + "epoch": 1.126547891847152, + "grad_norm": 5.96875, + "learning_rate": 4.368820418333465e-06, + "loss": 0.9600709, + "memory(GiB)": 302.58, + "step": 201440, + "train_speed(iter/s)": 0.123818 + }, + { + "acc": 0.75059814, + "epoch": 1.1266597413201314, + "grad_norm": 6.15625, + "learning_rate": 4.367903121576002e-06, + "loss": 0.97054453, + "memory(GiB)": 302.58, + "step": 201460, + "train_speed(iter/s)": 0.123824 + }, + { + "acc": 0.75351171, + "epoch": 1.1267715907931106, + "grad_norm": 6.75, + "learning_rate": 4.366985846438258e-06, + "loss": 0.96079283, + "memory(GiB)": 302.58, + "step": 201480, + "train_speed(iter/s)": 0.123829 + }, + { + "acc": 0.77529669, + "epoch": 1.1268834402660899, + "grad_norm": 7.8125, + "learning_rate": 4.366068592951604e-06, + "loss": 0.86589632, + "memory(GiB)": 302.58, + "step": 201500, + "train_speed(iter/s)": 0.123835 + }, + { + "acc": 0.75111613, + "epoch": 1.1269952897390692, + "grad_norm": 7.09375, + "learning_rate": 4.365151361147415e-06, + "loss": 0.95619211, + "memory(GiB)": 302.58, + "step": 201520, + "train_speed(iter/s)": 0.123841 + }, + { + "acc": 0.74497705, + "epoch": 1.1271071392120484, + "grad_norm": 7.84375, + "learning_rate": 4.364234151057063e-06, + "loss": 1.02072897, + "memory(GiB)": 302.58, + "step": 201540, + "train_speed(iter/s)": 0.123847 + }, + { + "acc": 0.77062974, + "epoch": 1.1272189886850277, + "grad_norm": 6.875, + "learning_rate": 4.363316962711919e-06, + "loss": 0.87112074, + "memory(GiB)": 302.58, + "step": 201560, + "train_speed(iter/s)": 0.123853 + }, + { + "acc": 0.76591864, + "epoch": 1.127330838158007, + "grad_norm": 6.78125, + "learning_rate": 4.362399796143353e-06, + "loss": 0.91621313, + "memory(GiB)": 302.58, + "step": 201580, + "train_speed(iter/s)": 0.123858 + }, + { + "acc": 0.76625762, + "epoch": 1.1274426876309862, + "grad_norm": 6.28125, + "learning_rate": 4.361482651382736e-06, + "loss": 0.88503752, + "memory(GiB)": 302.58, + "step": 201600, + "train_speed(iter/s)": 0.123864 + }, + { + "acc": 0.7420136, + "epoch": 1.1275545371039655, + "grad_norm": 6.1875, + "learning_rate": 4.360565528461437e-06, + "loss": 1.00462828, + "memory(GiB)": 302.58, + "step": 201620, + "train_speed(iter/s)": 0.12387 + }, + { + "acc": 0.73230076, + "epoch": 1.1276663865769447, + "grad_norm": 9.125, + "learning_rate": 4.359648427410824e-06, + "loss": 1.06542883, + "memory(GiB)": 302.58, + "step": 201640, + "train_speed(iter/s)": 0.123875 + }, + { + "acc": 0.75991201, + "epoch": 1.127778236049924, + "grad_norm": 6.75, + "learning_rate": 4.358731348262266e-06, + "loss": 0.95056286, + "memory(GiB)": 302.58, + "step": 201660, + "train_speed(iter/s)": 0.123881 + }, + { + "acc": 0.74742866, + "epoch": 1.1278900855229033, + "grad_norm": 6.8125, + "learning_rate": 4.35781429104713e-06, + "loss": 0.99749584, + "memory(GiB)": 302.58, + "step": 201680, + "train_speed(iter/s)": 0.123887 + }, + { + "acc": 0.76019936, + "epoch": 1.1280019349958825, + "grad_norm": 5.34375, + "learning_rate": 4.356897255796781e-06, + "loss": 0.93338623, + "memory(GiB)": 302.58, + "step": 201700, + "train_speed(iter/s)": 0.123893 + }, + { + "acc": 0.75161734, + "epoch": 1.1281137844688618, + "grad_norm": 7.03125, + "learning_rate": 4.355980242542584e-06, + "loss": 0.9640666, + "memory(GiB)": 302.58, + "step": 201720, + "train_speed(iter/s)": 0.123898 + }, + { + "acc": 0.760497, + "epoch": 1.128225633941841, + "grad_norm": 8.875, + "learning_rate": 4.355063251315905e-06, + "loss": 0.94546022, + "memory(GiB)": 302.58, + "step": 201740, + "train_speed(iter/s)": 0.123905 + }, + { + "acc": 0.7559175, + "epoch": 1.1283374834148203, + "grad_norm": 5.96875, + "learning_rate": 4.354146282148108e-06, + "loss": 0.9741931, + "memory(GiB)": 302.58, + "step": 201760, + "train_speed(iter/s)": 0.123911 + }, + { + "acc": 0.75034881, + "epoch": 1.1284493328877996, + "grad_norm": 5.9375, + "learning_rate": 4.353229335070557e-06, + "loss": 0.98770304, + "memory(GiB)": 302.58, + "step": 201780, + "train_speed(iter/s)": 0.123917 + }, + { + "acc": 0.74809141, + "epoch": 1.1285611823607788, + "grad_norm": 6.875, + "learning_rate": 4.3523124101146135e-06, + "loss": 0.99315872, + "memory(GiB)": 302.58, + "step": 201800, + "train_speed(iter/s)": 0.123923 + }, + { + "acc": 0.75138817, + "epoch": 1.1286730318337581, + "grad_norm": 7.09375, + "learning_rate": 4.351395507311639e-06, + "loss": 0.98557043, + "memory(GiB)": 302.58, + "step": 201820, + "train_speed(iter/s)": 0.123929 + }, + { + "acc": 0.75236688, + "epoch": 1.1287848813067374, + "grad_norm": 5.96875, + "learning_rate": 4.350478626692995e-06, + "loss": 0.98241634, + "memory(GiB)": 302.58, + "step": 201840, + "train_speed(iter/s)": 0.123935 + }, + { + "acc": 0.77767839, + "epoch": 1.1288967307797166, + "grad_norm": 8.125, + "learning_rate": 4.349561768290042e-06, + "loss": 0.86445446, + "memory(GiB)": 302.58, + "step": 201860, + "train_speed(iter/s)": 0.12394 + }, + { + "acc": 0.74864068, + "epoch": 1.129008580252696, + "grad_norm": 6.3125, + "learning_rate": 4.348644932134137e-06, + "loss": 1.00397797, + "memory(GiB)": 302.58, + "step": 201880, + "train_speed(iter/s)": 0.123946 + }, + { + "acc": 0.76689043, + "epoch": 1.1291204297256752, + "grad_norm": 6.5625, + "learning_rate": 4.347728118256642e-06, + "loss": 0.9129303, + "memory(GiB)": 302.58, + "step": 201900, + "train_speed(iter/s)": 0.123952 + }, + { + "acc": 0.74883289, + "epoch": 1.1292322791986544, + "grad_norm": 7.78125, + "learning_rate": 4.3468113266889125e-06, + "loss": 1.00705843, + "memory(GiB)": 302.58, + "step": 201920, + "train_speed(iter/s)": 0.123957 + }, + { + "acc": 0.75296421, + "epoch": 1.1293441286716337, + "grad_norm": 8.0625, + "learning_rate": 4.345894557462308e-06, + "loss": 0.97711506, + "memory(GiB)": 302.58, + "step": 201940, + "train_speed(iter/s)": 0.123963 + }, + { + "acc": 0.7633152, + "epoch": 1.129455978144613, + "grad_norm": 9.3125, + "learning_rate": 4.344977810608184e-06, + "loss": 0.93096752, + "memory(GiB)": 302.58, + "step": 201960, + "train_speed(iter/s)": 0.123969 + }, + { + "acc": 0.74686995, + "epoch": 1.1295678276175922, + "grad_norm": 8.625, + "learning_rate": 4.3440610861578955e-06, + "loss": 0.97947292, + "memory(GiB)": 302.58, + "step": 201980, + "train_speed(iter/s)": 0.123975 + }, + { + "acc": 0.75617371, + "epoch": 1.1296796770905715, + "grad_norm": 8.6875, + "learning_rate": 4.343144384142799e-06, + "loss": 0.96264381, + "memory(GiB)": 302.58, + "step": 202000, + "train_speed(iter/s)": 0.123981 + }, + { + "epoch": 1.1296796770905715, + "eval_acc": 0.70654165396042, + "eval_loss": 1.0132790803909302, + "eval_runtime": 7511.7113, + "eval_samples_per_second": 10.022, + "eval_steps_per_second": 10.022, + "step": 202000 + }, + { + "acc": 0.76643448, + "epoch": 1.1297915265635508, + "grad_norm": 8.875, + "learning_rate": 4.3422277045942455e-06, + "loss": 0.9237958, + "memory(GiB)": 302.58, + "step": 202020, + "train_speed(iter/s)": 0.123408 + }, + { + "acc": 0.74911671, + "epoch": 1.12990337603653, + "grad_norm": 5.125, + "learning_rate": 4.341311047543591e-06, + "loss": 0.98296175, + "memory(GiB)": 302.58, + "step": 202040, + "train_speed(iter/s)": 0.123414 + }, + { + "acc": 0.74885607, + "epoch": 1.1300152255095093, + "grad_norm": 9.625, + "learning_rate": 4.340394413022187e-06, + "loss": 0.97998028, + "memory(GiB)": 302.58, + "step": 202060, + "train_speed(iter/s)": 0.12342 + }, + { + "acc": 0.74935684, + "epoch": 1.1301270749824885, + "grad_norm": 9.1875, + "learning_rate": 4.339477801061386e-06, + "loss": 0.97285852, + "memory(GiB)": 302.58, + "step": 202080, + "train_speed(iter/s)": 0.123426 + }, + { + "acc": 0.72325211, + "epoch": 1.1302389244554678, + "grad_norm": 5.4375, + "learning_rate": 4.3385612116925395e-06, + "loss": 1.10125971, + "memory(GiB)": 302.58, + "step": 202100, + "train_speed(iter/s)": 0.123432 + }, + { + "acc": 0.75964074, + "epoch": 1.130350773928447, + "grad_norm": 5.625, + "learning_rate": 4.337644644946996e-06, + "loss": 0.92476139, + "memory(GiB)": 302.58, + "step": 202120, + "train_speed(iter/s)": 0.123437 + }, + { + "acc": 0.77193098, + "epoch": 1.1304626234014263, + "grad_norm": 7.46875, + "learning_rate": 4.336728100856106e-06, + "loss": 0.89012728, + "memory(GiB)": 302.58, + "step": 202140, + "train_speed(iter/s)": 0.123443 + }, + { + "acc": 0.74951363, + "epoch": 1.1305744728744056, + "grad_norm": 8.5625, + "learning_rate": 4.3358115794512185e-06, + "loss": 0.97604866, + "memory(GiB)": 302.58, + "step": 202160, + "train_speed(iter/s)": 0.123449 + }, + { + "acc": 0.74851341, + "epoch": 1.1306863223473849, + "grad_norm": 6.75, + "learning_rate": 4.334895080763681e-06, + "loss": 0.97617645, + "memory(GiB)": 302.58, + "step": 202180, + "train_speed(iter/s)": 0.123455 + }, + { + "acc": 0.74281011, + "epoch": 1.1307981718203641, + "grad_norm": 7.28125, + "learning_rate": 4.333978604824839e-06, + "loss": 1.0030652, + "memory(GiB)": 302.58, + "step": 202200, + "train_speed(iter/s)": 0.12346 + }, + { + "acc": 0.74525485, + "epoch": 1.1309100212933434, + "grad_norm": 7.6875, + "learning_rate": 4.333062151666043e-06, + "loss": 0.98385258, + "memory(GiB)": 302.58, + "step": 202220, + "train_speed(iter/s)": 0.123466 + }, + { + "acc": 0.75105705, + "epoch": 1.1310218707663227, + "grad_norm": 5.9375, + "learning_rate": 4.332145721318635e-06, + "loss": 0.98742552, + "memory(GiB)": 302.58, + "step": 202240, + "train_speed(iter/s)": 0.123472 + }, + { + "acc": 0.76230302, + "epoch": 1.131133720239302, + "grad_norm": 8.3125, + "learning_rate": 4.3312293138139615e-06, + "loss": 0.93441505, + "memory(GiB)": 302.58, + "step": 202260, + "train_speed(iter/s)": 0.123477 + }, + { + "acc": 0.75460558, + "epoch": 1.1312455697122812, + "grad_norm": 7.0, + "learning_rate": 4.330312929183366e-06, + "loss": 0.97234402, + "memory(GiB)": 302.58, + "step": 202280, + "train_speed(iter/s)": 0.123483 + }, + { + "acc": 0.75587139, + "epoch": 1.1313574191852604, + "grad_norm": 6.75, + "learning_rate": 4.329396567458192e-06, + "loss": 0.95003052, + "memory(GiB)": 302.58, + "step": 202300, + "train_speed(iter/s)": 0.123489 + }, + { + "acc": 0.74213219, + "epoch": 1.1314692686582397, + "grad_norm": 7.96875, + "learning_rate": 4.328480228669782e-06, + "loss": 1.02791929, + "memory(GiB)": 302.58, + "step": 202320, + "train_speed(iter/s)": 0.123495 + }, + { + "acc": 0.74610972, + "epoch": 1.131581118131219, + "grad_norm": 6.28125, + "learning_rate": 4.327563912849476e-06, + "loss": 1.02267742, + "memory(GiB)": 302.58, + "step": 202340, + "train_speed(iter/s)": 0.123501 + }, + { + "acc": 0.76094451, + "epoch": 1.1316929676041982, + "grad_norm": 8.5625, + "learning_rate": 4.326647620028618e-06, + "loss": 0.93573923, + "memory(GiB)": 302.58, + "step": 202360, + "train_speed(iter/s)": 0.123506 + }, + { + "acc": 0.75564399, + "epoch": 1.1318048170771775, + "grad_norm": 7.875, + "learning_rate": 4.325731350238546e-06, + "loss": 0.97662373, + "memory(GiB)": 302.58, + "step": 202380, + "train_speed(iter/s)": 0.123512 + }, + { + "acc": 0.76318226, + "epoch": 1.1319166665501568, + "grad_norm": 8.75, + "learning_rate": 4.3248151035106e-06, + "loss": 0.92566557, + "memory(GiB)": 302.58, + "step": 202400, + "train_speed(iter/s)": 0.123517 + }, + { + "acc": 0.76036, + "epoch": 1.132028516023136, + "grad_norm": 7.6875, + "learning_rate": 4.3238988798761184e-06, + "loss": 0.94449873, + "memory(GiB)": 302.58, + "step": 202420, + "train_speed(iter/s)": 0.123524 + }, + { + "acc": 0.76238232, + "epoch": 1.1321403654961153, + "grad_norm": 7.03125, + "learning_rate": 4.3229826793664395e-06, + "loss": 0.93053007, + "memory(GiB)": 302.58, + "step": 202440, + "train_speed(iter/s)": 0.123529 + }, + { + "acc": 0.73942599, + "epoch": 1.1322522149690946, + "grad_norm": 8.5625, + "learning_rate": 4.322066502012899e-06, + "loss": 1.03988733, + "memory(GiB)": 302.58, + "step": 202460, + "train_speed(iter/s)": 0.123536 + }, + { + "acc": 0.73927159, + "epoch": 1.1323640644420738, + "grad_norm": 7.53125, + "learning_rate": 4.321150347846833e-06, + "loss": 1.01687326, + "memory(GiB)": 302.58, + "step": 202480, + "train_speed(iter/s)": 0.123541 + }, + { + "acc": 0.7664185, + "epoch": 1.132475913915053, + "grad_norm": 7.1875, + "learning_rate": 4.320234216899579e-06, + "loss": 0.9209815, + "memory(GiB)": 302.58, + "step": 202500, + "train_speed(iter/s)": 0.123547 + }, + { + "acc": 0.75057693, + "epoch": 1.1325877633880324, + "grad_norm": 8.9375, + "learning_rate": 4.3193181092024695e-06, + "loss": 0.98359346, + "memory(GiB)": 302.58, + "step": 202520, + "train_speed(iter/s)": 0.123553 + }, + { + "acc": 0.74330745, + "epoch": 1.1326996128610116, + "grad_norm": 9.125, + "learning_rate": 4.31840202478684e-06, + "loss": 1.02155962, + "memory(GiB)": 302.58, + "step": 202540, + "train_speed(iter/s)": 0.123558 + }, + { + "acc": 0.76101837, + "epoch": 1.1328114623339909, + "grad_norm": 7.9375, + "learning_rate": 4.317485963684021e-06, + "loss": 0.92882977, + "memory(GiB)": 302.58, + "step": 202560, + "train_speed(iter/s)": 0.123563 + }, + { + "acc": 0.7478404, + "epoch": 1.1329233118069701, + "grad_norm": 9.4375, + "learning_rate": 4.316569925925348e-06, + "loss": 0.97560844, + "memory(GiB)": 302.58, + "step": 202580, + "train_speed(iter/s)": 0.123569 + }, + { + "acc": 0.75007715, + "epoch": 1.1330351612799494, + "grad_norm": 5.96875, + "learning_rate": 4.3156539115421505e-06, + "loss": 0.99581337, + "memory(GiB)": 302.58, + "step": 202600, + "train_speed(iter/s)": 0.123575 + }, + { + "acc": 0.75453134, + "epoch": 1.1331470107529287, + "grad_norm": 7.78125, + "learning_rate": 4.314737920565758e-06, + "loss": 0.97835064, + "memory(GiB)": 302.58, + "step": 202620, + "train_speed(iter/s)": 0.12358 + }, + { + "acc": 0.7440105, + "epoch": 1.133258860225908, + "grad_norm": 7.6875, + "learning_rate": 4.313821953027502e-06, + "loss": 1.01170826, + "memory(GiB)": 302.58, + "step": 202640, + "train_speed(iter/s)": 0.123585 + }, + { + "acc": 0.72984176, + "epoch": 1.1333707096988872, + "grad_norm": 5.53125, + "learning_rate": 4.3129060089587104e-06, + "loss": 1.06967907, + "memory(GiB)": 302.58, + "step": 202660, + "train_speed(iter/s)": 0.123591 + }, + { + "acc": 0.75007882, + "epoch": 1.1334825591718665, + "grad_norm": 8.6875, + "learning_rate": 4.3119900883907134e-06, + "loss": 0.98044977, + "memory(GiB)": 302.58, + "step": 202680, + "train_speed(iter/s)": 0.123597 + }, + { + "acc": 0.74169626, + "epoch": 1.1335944086448457, + "grad_norm": 7.1875, + "learning_rate": 4.311074191354837e-06, + "loss": 1.01589737, + "memory(GiB)": 302.58, + "step": 202700, + "train_speed(iter/s)": 0.123603 + }, + { + "acc": 0.77259936, + "epoch": 1.133706258117825, + "grad_norm": 10.125, + "learning_rate": 4.310158317882407e-06, + "loss": 0.89378834, + "memory(GiB)": 302.58, + "step": 202720, + "train_speed(iter/s)": 0.123609 + }, + { + "acc": 0.75435071, + "epoch": 1.1338181075908043, + "grad_norm": 7.3125, + "learning_rate": 4.309242468004748e-06, + "loss": 0.93566809, + "memory(GiB)": 302.58, + "step": 202740, + "train_speed(iter/s)": 0.123615 + }, + { + "acc": 0.75077844, + "epoch": 1.1339299570637835, + "grad_norm": 5.65625, + "learning_rate": 4.30832664175319e-06, + "loss": 0.96612616, + "memory(GiB)": 302.58, + "step": 202760, + "train_speed(iter/s)": 0.123621 + }, + { + "acc": 0.77815113, + "epoch": 1.1340418065367628, + "grad_norm": 5.8125, + "learning_rate": 4.307410839159053e-06, + "loss": 0.86377525, + "memory(GiB)": 302.58, + "step": 202780, + "train_speed(iter/s)": 0.123627 + }, + { + "acc": 0.75631618, + "epoch": 1.134153656009742, + "grad_norm": 7.8125, + "learning_rate": 4.3064950602536614e-06, + "loss": 0.95598793, + "memory(GiB)": 302.58, + "step": 202800, + "train_speed(iter/s)": 0.123632 + }, + { + "acc": 0.74943194, + "epoch": 1.1342655054827213, + "grad_norm": 7.625, + "learning_rate": 4.305579305068338e-06, + "loss": 1.01457863, + "memory(GiB)": 302.58, + "step": 202820, + "train_speed(iter/s)": 0.123638 + }, + { + "acc": 0.75011554, + "epoch": 1.1343773549557006, + "grad_norm": 6.71875, + "learning_rate": 4.304663573634404e-06, + "loss": 0.99759998, + "memory(GiB)": 302.58, + "step": 202840, + "train_speed(iter/s)": 0.123643 + }, + { + "acc": 0.74838514, + "epoch": 1.1344892044286798, + "grad_norm": 6.9375, + "learning_rate": 4.303747865983182e-06, + "loss": 1.00702238, + "memory(GiB)": 302.58, + "step": 202860, + "train_speed(iter/s)": 0.123648 + }, + { + "acc": 0.74578772, + "epoch": 1.134601053901659, + "grad_norm": 7.84375, + "learning_rate": 4.3028321821459884e-06, + "loss": 1.00662699, + "memory(GiB)": 302.58, + "step": 202880, + "train_speed(iter/s)": 0.123654 + }, + { + "acc": 0.74725108, + "epoch": 1.1347129033746384, + "grad_norm": 9.4375, + "learning_rate": 4.301916522154146e-06, + "loss": 0.96664248, + "memory(GiB)": 302.58, + "step": 202900, + "train_speed(iter/s)": 0.12366 + }, + { + "acc": 0.74910207, + "epoch": 1.1348247528476176, + "grad_norm": 7.71875, + "learning_rate": 4.301000886038972e-06, + "loss": 0.96363993, + "memory(GiB)": 302.58, + "step": 202920, + "train_speed(iter/s)": 0.123666 + }, + { + "acc": 0.75044765, + "epoch": 1.134936602320597, + "grad_norm": 5.15625, + "learning_rate": 4.300085273831785e-06, + "loss": 0.95976734, + "memory(GiB)": 302.58, + "step": 202940, + "train_speed(iter/s)": 0.123672 + }, + { + "acc": 0.73628168, + "epoch": 1.1350484517935762, + "grad_norm": 5.0, + "learning_rate": 4.299169685563901e-06, + "loss": 1.03763227, + "memory(GiB)": 302.58, + "step": 202960, + "train_speed(iter/s)": 0.123677 + }, + { + "acc": 0.75571432, + "epoch": 1.1351603012665554, + "grad_norm": 8.0625, + "learning_rate": 4.298254121266635e-06, + "loss": 0.95819235, + "memory(GiB)": 302.58, + "step": 202980, + "train_speed(iter/s)": 0.123683 + }, + { + "acc": 0.75089149, + "epoch": 1.1352721507395347, + "grad_norm": 4.40625, + "learning_rate": 4.297338580971303e-06, + "loss": 0.99014711, + "memory(GiB)": 302.58, + "step": 203000, + "train_speed(iter/s)": 0.123689 + }, + { + "acc": 0.74175, + "epoch": 1.135384000212514, + "grad_norm": 7.0, + "learning_rate": 4.29642306470922e-06, + "loss": 1.02112379, + "memory(GiB)": 302.58, + "step": 203020, + "train_speed(iter/s)": 0.123694 + }, + { + "acc": 0.75395107, + "epoch": 1.1354958496854932, + "grad_norm": 8.625, + "learning_rate": 4.295507572511699e-06, + "loss": 0.96457481, + "memory(GiB)": 302.58, + "step": 203040, + "train_speed(iter/s)": 0.1237 + }, + { + "acc": 0.74730277, + "epoch": 1.1356076991584725, + "grad_norm": 8.0625, + "learning_rate": 4.294592104410053e-06, + "loss": 0.98978291, + "memory(GiB)": 302.58, + "step": 203060, + "train_speed(iter/s)": 0.123705 + }, + { + "acc": 0.74942627, + "epoch": 1.1357195486314517, + "grad_norm": 8.125, + "learning_rate": 4.293676660435595e-06, + "loss": 0.9889245, + "memory(GiB)": 302.58, + "step": 203080, + "train_speed(iter/s)": 0.123711 + }, + { + "acc": 0.75116596, + "epoch": 1.135831398104431, + "grad_norm": 8.0, + "learning_rate": 4.292761240619634e-06, + "loss": 0.96955957, + "memory(GiB)": 302.58, + "step": 203100, + "train_speed(iter/s)": 0.123717 + }, + { + "acc": 0.76805172, + "epoch": 1.1359432475774103, + "grad_norm": 6.21875, + "learning_rate": 4.291845844993481e-06, + "loss": 0.92048006, + "memory(GiB)": 302.58, + "step": 203120, + "train_speed(iter/s)": 0.123723 + }, + { + "acc": 0.74626579, + "epoch": 1.1360550970503895, + "grad_norm": 8.75, + "learning_rate": 4.290930473588446e-06, + "loss": 0.99515543, + "memory(GiB)": 302.58, + "step": 203140, + "train_speed(iter/s)": 0.123728 + }, + { + "acc": 0.74818187, + "epoch": 1.1361669465233688, + "grad_norm": 7.03125, + "learning_rate": 4.290015126435836e-06, + "loss": 0.99065828, + "memory(GiB)": 302.58, + "step": 203160, + "train_speed(iter/s)": 0.123734 + }, + { + "acc": 0.76649714, + "epoch": 1.136278795996348, + "grad_norm": 7.125, + "learning_rate": 4.289099803566959e-06, + "loss": 0.90180254, + "memory(GiB)": 302.58, + "step": 203180, + "train_speed(iter/s)": 0.12374 + }, + { + "acc": 0.75393491, + "epoch": 1.1363906454693273, + "grad_norm": 7.125, + "learning_rate": 4.2881845050131255e-06, + "loss": 0.96880131, + "memory(GiB)": 302.58, + "step": 203200, + "train_speed(iter/s)": 0.123746 + }, + { + "acc": 0.75252337, + "epoch": 1.1365024949423066, + "grad_norm": 9.1875, + "learning_rate": 4.287269230805638e-06, + "loss": 0.98221846, + "memory(GiB)": 302.58, + "step": 203220, + "train_speed(iter/s)": 0.123752 + }, + { + "acc": 0.76380253, + "epoch": 1.1366143444152859, + "grad_norm": 5.75, + "learning_rate": 4.286353980975803e-06, + "loss": 0.93009539, + "memory(GiB)": 302.58, + "step": 203240, + "train_speed(iter/s)": 0.123758 + }, + { + "acc": 0.74070501, + "epoch": 1.1367261938882651, + "grad_norm": 6.90625, + "learning_rate": 4.285438755554925e-06, + "loss": 0.99743881, + "memory(GiB)": 302.58, + "step": 203260, + "train_speed(iter/s)": 0.123764 + }, + { + "acc": 0.74529686, + "epoch": 1.1368380433612444, + "grad_norm": 5.1875, + "learning_rate": 4.284523554574307e-06, + "loss": 1.01283302, + "memory(GiB)": 302.58, + "step": 203280, + "train_speed(iter/s)": 0.12377 + }, + { + "acc": 0.76650949, + "epoch": 1.1369498928342237, + "grad_norm": 5.4375, + "learning_rate": 4.2836083780652525e-06, + "loss": 0.90192947, + "memory(GiB)": 302.58, + "step": 203300, + "train_speed(iter/s)": 0.123776 + }, + { + "acc": 0.76005611, + "epoch": 1.137061742307203, + "grad_norm": 7.6875, + "learning_rate": 4.282693226059064e-06, + "loss": 0.95628948, + "memory(GiB)": 302.58, + "step": 203320, + "train_speed(iter/s)": 0.123782 + }, + { + "acc": 0.73929367, + "epoch": 1.1371735917801822, + "grad_norm": 9.75, + "learning_rate": 4.281778098587041e-06, + "loss": 1.01485138, + "memory(GiB)": 302.58, + "step": 203340, + "train_speed(iter/s)": 0.123788 + }, + { + "acc": 0.74938979, + "epoch": 1.1372854412531614, + "grad_norm": 9.5625, + "learning_rate": 4.280862995680482e-06, + "loss": 0.99370298, + "memory(GiB)": 302.58, + "step": 203360, + "train_speed(iter/s)": 0.123793 + }, + { + "acc": 0.73604712, + "epoch": 1.1373972907261407, + "grad_norm": 7.6875, + "learning_rate": 4.279947917370692e-06, + "loss": 1.06847324, + "memory(GiB)": 302.58, + "step": 203380, + "train_speed(iter/s)": 0.123799 + }, + { + "acc": 0.75738592, + "epoch": 1.13750914019912, + "grad_norm": 6.6875, + "learning_rate": 4.279032863688966e-06, + "loss": 0.94495115, + "memory(GiB)": 302.58, + "step": 203400, + "train_speed(iter/s)": 0.123805 + }, + { + "acc": 0.74730811, + "epoch": 1.1376209896720992, + "grad_norm": 6.375, + "learning_rate": 4.278117834666601e-06, + "loss": 1.00885839, + "memory(GiB)": 302.58, + "step": 203420, + "train_speed(iter/s)": 0.12381 + }, + { + "acc": 0.75838637, + "epoch": 1.1377328391450785, + "grad_norm": 4.84375, + "learning_rate": 4.277202830334896e-06, + "loss": 0.96812849, + "memory(GiB)": 302.58, + "step": 203440, + "train_speed(iter/s)": 0.123816 + }, + { + "acc": 0.74628773, + "epoch": 1.1378446886180578, + "grad_norm": 7.65625, + "learning_rate": 4.2762878507251455e-06, + "loss": 0.97648659, + "memory(GiB)": 302.58, + "step": 203460, + "train_speed(iter/s)": 0.123821 + }, + { + "acc": 0.74057865, + "epoch": 1.137956538091037, + "grad_norm": 11.125, + "learning_rate": 4.275372895868645e-06, + "loss": 1.03975792, + "memory(GiB)": 302.58, + "step": 203480, + "train_speed(iter/s)": 0.123827 + }, + { + "acc": 0.76543894, + "epoch": 1.1380683875640163, + "grad_norm": 6.5, + "learning_rate": 4.27445796579669e-06, + "loss": 0.89616385, + "memory(GiB)": 302.58, + "step": 203500, + "train_speed(iter/s)": 0.123832 + }, + { + "acc": 0.73695331, + "epoch": 1.1381802370369956, + "grad_norm": 7.25, + "learning_rate": 4.273543060540574e-06, + "loss": 1.02870617, + "memory(GiB)": 302.58, + "step": 203520, + "train_speed(iter/s)": 0.123838 + }, + { + "acc": 0.74549866, + "epoch": 1.1382920865099748, + "grad_norm": 11.8125, + "learning_rate": 4.272628180131588e-06, + "loss": 0.98197784, + "memory(GiB)": 302.58, + "step": 203540, + "train_speed(iter/s)": 0.123844 + }, + { + "acc": 0.72300687, + "epoch": 1.138403935982954, + "grad_norm": 6.75, + "learning_rate": 4.271713324601025e-06, + "loss": 1.11257486, + "memory(GiB)": 302.58, + "step": 203560, + "train_speed(iter/s)": 0.123849 + }, + { + "acc": 0.77009435, + "epoch": 1.1385157854559333, + "grad_norm": 5.84375, + "learning_rate": 4.270798493980175e-06, + "loss": 0.89688396, + "memory(GiB)": 302.58, + "step": 203580, + "train_speed(iter/s)": 0.123854 + }, + { + "acc": 0.7357307, + "epoch": 1.1386276349289126, + "grad_norm": 5.75, + "learning_rate": 4.269883688300329e-06, + "loss": 1.04105721, + "memory(GiB)": 302.58, + "step": 203600, + "train_speed(iter/s)": 0.12386 + }, + { + "acc": 0.73718214, + "epoch": 1.1387394844018919, + "grad_norm": 10.8125, + "learning_rate": 4.268968907592776e-06, + "loss": 1.04948425, + "memory(GiB)": 302.58, + "step": 203620, + "train_speed(iter/s)": 0.123866 + }, + { + "acc": 0.74376283, + "epoch": 1.1388513338748711, + "grad_norm": 5.3125, + "learning_rate": 4.268054151888804e-06, + "loss": 1.00199366, + "memory(GiB)": 302.58, + "step": 203640, + "train_speed(iter/s)": 0.123871 + }, + { + "acc": 0.74073057, + "epoch": 1.1389631833478504, + "grad_norm": 10.0, + "learning_rate": 4.267139421219702e-06, + "loss": 1.04163876, + "memory(GiB)": 302.58, + "step": 203660, + "train_speed(iter/s)": 0.123877 + }, + { + "acc": 0.7337142, + "epoch": 1.1390750328208297, + "grad_norm": 4.78125, + "learning_rate": 4.266224715616754e-06, + "loss": 1.06300077, + "memory(GiB)": 302.58, + "step": 203680, + "train_speed(iter/s)": 0.123882 + }, + { + "acc": 0.76711655, + "epoch": 1.139186882293809, + "grad_norm": 6.90625, + "learning_rate": 4.265310035111249e-06, + "loss": 0.93212366, + "memory(GiB)": 302.58, + "step": 203700, + "train_speed(iter/s)": 0.123888 + }, + { + "acc": 0.74994202, + "epoch": 1.1392987317667882, + "grad_norm": 7.1875, + "learning_rate": 4.26439537973447e-06, + "loss": 0.98731117, + "memory(GiB)": 302.58, + "step": 203720, + "train_speed(iter/s)": 0.123894 + }, + { + "acc": 0.7455482, + "epoch": 1.1394105812397675, + "grad_norm": 5.8125, + "learning_rate": 4.263480749517702e-06, + "loss": 1.00167351, + "memory(GiB)": 302.58, + "step": 203740, + "train_speed(iter/s)": 0.123899 + }, + { + "acc": 0.73471632, + "epoch": 1.1395224307127467, + "grad_norm": 4.59375, + "learning_rate": 4.262566144492226e-06, + "loss": 1.03547249, + "memory(GiB)": 302.58, + "step": 203760, + "train_speed(iter/s)": 0.123905 + }, + { + "acc": 0.75352993, + "epoch": 1.139634280185726, + "grad_norm": 7.4375, + "learning_rate": 4.2616515646893275e-06, + "loss": 0.96248264, + "memory(GiB)": 302.58, + "step": 203780, + "train_speed(iter/s)": 0.12391 + }, + { + "acc": 0.75970821, + "epoch": 1.1397461296587053, + "grad_norm": 6.25, + "learning_rate": 4.260737010140286e-06, + "loss": 0.92131224, + "memory(GiB)": 302.58, + "step": 203800, + "train_speed(iter/s)": 0.123916 + }, + { + "acc": 0.75841761, + "epoch": 1.1398579791316845, + "grad_norm": 7.96875, + "learning_rate": 4.259822480876384e-06, + "loss": 0.94339371, + "memory(GiB)": 302.58, + "step": 203820, + "train_speed(iter/s)": 0.123922 + }, + { + "acc": 0.75485873, + "epoch": 1.1399698286046638, + "grad_norm": 6.71875, + "learning_rate": 4.2589079769289005e-06, + "loss": 0.94672623, + "memory(GiB)": 302.58, + "step": 203840, + "train_speed(iter/s)": 0.123928 + }, + { + "acc": 0.74953203, + "epoch": 1.140081678077643, + "grad_norm": 8.25, + "learning_rate": 4.2579934983291134e-06, + "loss": 0.98940182, + "memory(GiB)": 302.58, + "step": 203860, + "train_speed(iter/s)": 0.123934 + }, + { + "acc": 0.74877853, + "epoch": 1.1401935275506223, + "grad_norm": 6.6875, + "learning_rate": 4.257079045108302e-06, + "loss": 0.98952036, + "memory(GiB)": 302.58, + "step": 203880, + "train_speed(iter/s)": 0.12394 + }, + { + "acc": 0.73763056, + "epoch": 1.1403053770236016, + "grad_norm": 4.59375, + "learning_rate": 4.256164617297741e-06, + "loss": 1.02267885, + "memory(GiB)": 302.58, + "step": 203900, + "train_speed(iter/s)": 0.123946 + }, + { + "acc": 0.76311474, + "epoch": 1.1404172264965808, + "grad_norm": 5.625, + "learning_rate": 4.25525021492871e-06, + "loss": 0.91722679, + "memory(GiB)": 302.58, + "step": 203920, + "train_speed(iter/s)": 0.123951 + }, + { + "acc": 0.7628356, + "epoch": 1.14052907596956, + "grad_norm": 8.125, + "learning_rate": 4.2543358380324836e-06, + "loss": 0.9308629, + "memory(GiB)": 302.58, + "step": 203940, + "train_speed(iter/s)": 0.123957 + }, + { + "acc": 0.75078831, + "epoch": 1.1406409254425394, + "grad_norm": 6.53125, + "learning_rate": 4.253421486640336e-06, + "loss": 0.98362045, + "memory(GiB)": 302.58, + "step": 203960, + "train_speed(iter/s)": 0.123962 + }, + { + "acc": 0.77048111, + "epoch": 1.1407527749155186, + "grad_norm": 7.125, + "learning_rate": 4.252507160783541e-06, + "loss": 0.89239769, + "memory(GiB)": 302.58, + "step": 203980, + "train_speed(iter/s)": 0.123968 + }, + { + "acc": 0.74929237, + "epoch": 1.140864624388498, + "grad_norm": 6.0625, + "learning_rate": 4.251592860493371e-06, + "loss": 0.99271946, + "memory(GiB)": 302.58, + "step": 204000, + "train_speed(iter/s)": 0.123974 + }, + { + "epoch": 1.140864624388498, + "eval_acc": 0.7065196182969558, + "eval_loss": 1.0136148929595947, + "eval_runtime": 7512.2971, + "eval_samples_per_second": 10.021, + "eval_steps_per_second": 10.021, + "step": 204000 + }, + { + "acc": 0.75677638, + "epoch": 1.1409764738614772, + "grad_norm": 6.34375, + "learning_rate": 4.250678585801099e-06, + "loss": 0.96120596, + "memory(GiB)": 302.58, + "step": 204020, + "train_speed(iter/s)": 0.123406 + }, + { + "acc": 0.73785782, + "epoch": 1.1410883233344564, + "grad_norm": 9.1875, + "learning_rate": 4.249764336737993e-06, + "loss": 1.04228668, + "memory(GiB)": 302.58, + "step": 204040, + "train_speed(iter/s)": 0.123412 + }, + { + "acc": 0.75184689, + "epoch": 1.1412001728074357, + "grad_norm": 8.0625, + "learning_rate": 4.2488501133353275e-06, + "loss": 0.98887882, + "memory(GiB)": 302.58, + "step": 204060, + "train_speed(iter/s)": 0.123418 + }, + { + "acc": 0.74052868, + "epoch": 1.141312022280415, + "grad_norm": 6.28125, + "learning_rate": 4.24793591562437e-06, + "loss": 1.02504854, + "memory(GiB)": 302.58, + "step": 204080, + "train_speed(iter/s)": 0.123423 + }, + { + "acc": 0.7434319, + "epoch": 1.1414238717533942, + "grad_norm": 5.0, + "learning_rate": 4.247021743636389e-06, + "loss": 1.0004262, + "memory(GiB)": 302.58, + "step": 204100, + "train_speed(iter/s)": 0.123429 + }, + { + "acc": 0.76349664, + "epoch": 1.1415357212263735, + "grad_norm": 7.53125, + "learning_rate": 4.246107597402652e-06, + "loss": 0.91872711, + "memory(GiB)": 302.58, + "step": 204120, + "train_speed(iter/s)": 0.123435 + }, + { + "acc": 0.74983988, + "epoch": 1.1416475706993527, + "grad_norm": 8.6875, + "learning_rate": 4.245193476954426e-06, + "loss": 0.97088003, + "memory(GiB)": 302.58, + "step": 204140, + "train_speed(iter/s)": 0.123441 + }, + { + "acc": 0.7129992, + "epoch": 1.141759420172332, + "grad_norm": 7.65625, + "learning_rate": 4.2442793823229755e-06, + "loss": 1.14758863, + "memory(GiB)": 302.58, + "step": 204160, + "train_speed(iter/s)": 0.123447 + }, + { + "acc": 0.76667533, + "epoch": 1.1418712696453113, + "grad_norm": 8.9375, + "learning_rate": 4.243365313539566e-06, + "loss": 0.89922409, + "memory(GiB)": 302.58, + "step": 204180, + "train_speed(iter/s)": 0.123453 + }, + { + "acc": 0.75250592, + "epoch": 1.1419831191182905, + "grad_norm": 5.78125, + "learning_rate": 4.242451270635463e-06, + "loss": 0.95973167, + "memory(GiB)": 302.58, + "step": 204200, + "train_speed(iter/s)": 0.123459 + }, + { + "acc": 0.75151062, + "epoch": 1.1420949685912698, + "grad_norm": 8.25, + "learning_rate": 4.2415372536419285e-06, + "loss": 0.95911036, + "memory(GiB)": 302.58, + "step": 204220, + "train_speed(iter/s)": 0.123465 + }, + { + "acc": 0.76051941, + "epoch": 1.142206818064249, + "grad_norm": 6.125, + "learning_rate": 4.240623262590225e-06, + "loss": 0.95904922, + "memory(GiB)": 302.58, + "step": 204240, + "train_speed(iter/s)": 0.12347 + }, + { + "acc": 0.74216485, + "epoch": 1.1423186675372283, + "grad_norm": 8.0625, + "learning_rate": 4.239709297511614e-06, + "loss": 1.0276226, + "memory(GiB)": 302.58, + "step": 204260, + "train_speed(iter/s)": 0.123475 + }, + { + "acc": 0.74493418, + "epoch": 1.1424305170102076, + "grad_norm": 6.84375, + "learning_rate": 4.238795358437356e-06, + "loss": 1.00833626, + "memory(GiB)": 302.58, + "step": 204280, + "train_speed(iter/s)": 0.123481 + }, + { + "acc": 0.75040789, + "epoch": 1.1425423664831869, + "grad_norm": 6.78125, + "learning_rate": 4.237881445398709e-06, + "loss": 0.96050711, + "memory(GiB)": 302.58, + "step": 204300, + "train_speed(iter/s)": 0.123486 + }, + { + "acc": 0.75004573, + "epoch": 1.1426542159561661, + "grad_norm": 6.0625, + "learning_rate": 4.236967558426934e-06, + "loss": 0.99052486, + "memory(GiB)": 302.58, + "step": 204320, + "train_speed(iter/s)": 0.123492 + }, + { + "acc": 0.75281477, + "epoch": 1.1427660654291454, + "grad_norm": 6.375, + "learning_rate": 4.236053697553289e-06, + "loss": 0.97305584, + "memory(GiB)": 302.58, + "step": 204340, + "train_speed(iter/s)": 0.123497 + }, + { + "acc": 0.74546604, + "epoch": 1.1428779149021246, + "grad_norm": 8.5625, + "learning_rate": 4.235139862809029e-06, + "loss": 1.00105886, + "memory(GiB)": 302.58, + "step": 204360, + "train_speed(iter/s)": 0.123503 + }, + { + "acc": 0.75245528, + "epoch": 1.142989764375104, + "grad_norm": 7.75, + "learning_rate": 4.2342260542254115e-06, + "loss": 0.97703972, + "memory(GiB)": 302.58, + "step": 204380, + "train_speed(iter/s)": 0.123509 + }, + { + "acc": 0.74404082, + "epoch": 1.1431016138480832, + "grad_norm": 7.03125, + "learning_rate": 4.233312271833691e-06, + "loss": 1.01629362, + "memory(GiB)": 302.58, + "step": 204400, + "train_speed(iter/s)": 0.123514 + }, + { + "acc": 0.74981613, + "epoch": 1.1432134633210624, + "grad_norm": 7.9375, + "learning_rate": 4.232398515665122e-06, + "loss": 0.98992071, + "memory(GiB)": 302.58, + "step": 204420, + "train_speed(iter/s)": 0.12352 + }, + { + "acc": 0.73623209, + "epoch": 1.1433253127940417, + "grad_norm": 11.1875, + "learning_rate": 4.231484785750956e-06, + "loss": 1.04939022, + "memory(GiB)": 302.58, + "step": 204440, + "train_speed(iter/s)": 0.123526 + }, + { + "acc": 0.74062743, + "epoch": 1.143437162267021, + "grad_norm": 7.78125, + "learning_rate": 4.2305710821224485e-06, + "loss": 1.03227119, + "memory(GiB)": 302.58, + "step": 204460, + "train_speed(iter/s)": 0.123532 + }, + { + "acc": 0.76545491, + "epoch": 1.1435490117400002, + "grad_norm": 10.5625, + "learning_rate": 4.22965740481085e-06, + "loss": 0.93552914, + "memory(GiB)": 302.58, + "step": 204480, + "train_speed(iter/s)": 0.123537 + }, + { + "acc": 0.75388999, + "epoch": 1.1436608612129795, + "grad_norm": 8.4375, + "learning_rate": 4.22874375384741e-06, + "loss": 0.98322926, + "memory(GiB)": 302.58, + "step": 204500, + "train_speed(iter/s)": 0.123543 + }, + { + "acc": 0.76935568, + "epoch": 1.1437727106859588, + "grad_norm": 5.625, + "learning_rate": 4.2278301292633806e-06, + "loss": 0.88255758, + "memory(GiB)": 302.58, + "step": 204520, + "train_speed(iter/s)": 0.123549 + }, + { + "acc": 0.75523186, + "epoch": 1.143884560158938, + "grad_norm": 9.125, + "learning_rate": 4.226916531090009e-06, + "loss": 0.94191723, + "memory(GiB)": 302.58, + "step": 204540, + "train_speed(iter/s)": 0.123554 + }, + { + "acc": 0.74892726, + "epoch": 1.1439964096319173, + "grad_norm": 6.0625, + "learning_rate": 4.226002959358543e-06, + "loss": 0.98585482, + "memory(GiB)": 302.58, + "step": 204560, + "train_speed(iter/s)": 0.12356 + }, + { + "acc": 0.73896613, + "epoch": 1.1441082591048966, + "grad_norm": 5.78125, + "learning_rate": 4.225089414100231e-06, + "loss": 1.03782864, + "memory(GiB)": 302.58, + "step": 204580, + "train_speed(iter/s)": 0.123565 + }, + { + "acc": 0.74645171, + "epoch": 1.1442201085778758, + "grad_norm": 7.46875, + "learning_rate": 4.2241758953463175e-06, + "loss": 0.97768631, + "memory(GiB)": 302.58, + "step": 204600, + "train_speed(iter/s)": 0.123571 + }, + { + "acc": 0.75283289, + "epoch": 1.144331958050855, + "grad_norm": 9.5625, + "learning_rate": 4.223262403128049e-06, + "loss": 0.95417938, + "memory(GiB)": 302.58, + "step": 204620, + "train_speed(iter/s)": 0.123577 + }, + { + "acc": 0.75306454, + "epoch": 1.1444438075238343, + "grad_norm": 7.46875, + "learning_rate": 4.222348937476667e-06, + "loss": 0.94047604, + "memory(GiB)": 302.58, + "step": 204640, + "train_speed(iter/s)": 0.123582 + }, + { + "acc": 0.74750109, + "epoch": 1.1445556569968136, + "grad_norm": 7.875, + "learning_rate": 4.22143549842342e-06, + "loss": 0.97771683, + "memory(GiB)": 302.58, + "step": 204660, + "train_speed(iter/s)": 0.123588 + }, + { + "acc": 0.76016436, + "epoch": 1.1446675064697929, + "grad_norm": 6.9375, + "learning_rate": 4.220522085999546e-06, + "loss": 0.94338293, + "memory(GiB)": 302.58, + "step": 204680, + "train_speed(iter/s)": 0.123594 + }, + { + "acc": 0.74173393, + "epoch": 1.1447793559427721, + "grad_norm": 9.3125, + "learning_rate": 4.219608700236289e-06, + "loss": 1.03670149, + "memory(GiB)": 302.58, + "step": 204700, + "train_speed(iter/s)": 0.123599 + }, + { + "acc": 0.73914418, + "epoch": 1.1448912054157514, + "grad_norm": 9.0, + "learning_rate": 4.21869534116489e-06, + "loss": 1.02458248, + "memory(GiB)": 302.58, + "step": 204720, + "train_speed(iter/s)": 0.123605 + }, + { + "acc": 0.75790181, + "epoch": 1.1450030548887307, + "grad_norm": 6.75, + "learning_rate": 4.217782008816587e-06, + "loss": 0.94911804, + "memory(GiB)": 302.58, + "step": 204740, + "train_speed(iter/s)": 0.123611 + }, + { + "acc": 0.75437632, + "epoch": 1.14511490436171, + "grad_norm": 7.5625, + "learning_rate": 4.21686870322262e-06, + "loss": 0.95939274, + "memory(GiB)": 302.58, + "step": 204760, + "train_speed(iter/s)": 0.123617 + }, + { + "acc": 0.75100384, + "epoch": 1.1452267538346892, + "grad_norm": 9.0, + "learning_rate": 4.2159554244142246e-06, + "loss": 0.96736345, + "memory(GiB)": 302.58, + "step": 204780, + "train_speed(iter/s)": 0.123623 + }, + { + "acc": 0.75481186, + "epoch": 1.1453386033076685, + "grad_norm": 9.25, + "learning_rate": 4.215042172422642e-06, + "loss": 0.98819046, + "memory(GiB)": 302.58, + "step": 204800, + "train_speed(iter/s)": 0.123628 + }, + { + "acc": 0.74880214, + "epoch": 1.1454504527806477, + "grad_norm": 6.125, + "learning_rate": 4.214128947279105e-06, + "loss": 1.00294828, + "memory(GiB)": 302.58, + "step": 204820, + "train_speed(iter/s)": 0.123634 + }, + { + "acc": 0.76271648, + "epoch": 1.145562302253627, + "grad_norm": 6.25, + "learning_rate": 4.213215749014851e-06, + "loss": 0.92882528, + "memory(GiB)": 302.58, + "step": 204840, + "train_speed(iter/s)": 0.12364 + }, + { + "acc": 0.75077066, + "epoch": 1.1456741517266062, + "grad_norm": 6.78125, + "learning_rate": 4.212302577661112e-06, + "loss": 0.96449203, + "memory(GiB)": 302.58, + "step": 204860, + "train_speed(iter/s)": 0.123645 + }, + { + "acc": 0.73800921, + "epoch": 1.1457860011995855, + "grad_norm": 4.9375, + "learning_rate": 4.2113894332491235e-06, + "loss": 1.03099823, + "memory(GiB)": 302.58, + "step": 204880, + "train_speed(iter/s)": 0.123651 + }, + { + "acc": 0.73454309, + "epoch": 1.1458978506725648, + "grad_norm": 6.25, + "learning_rate": 4.210476315810117e-06, + "loss": 1.07583542, + "memory(GiB)": 302.58, + "step": 204900, + "train_speed(iter/s)": 0.123657 + }, + { + "acc": 0.73168764, + "epoch": 1.146009700145544, + "grad_norm": 5.25, + "learning_rate": 4.209563225375323e-06, + "loss": 1.07779922, + "memory(GiB)": 302.58, + "step": 204920, + "train_speed(iter/s)": 0.123662 + }, + { + "acc": 0.73526134, + "epoch": 1.1461215496185233, + "grad_norm": 5.46875, + "learning_rate": 4.208650161975973e-06, + "loss": 1.0410965, + "memory(GiB)": 302.58, + "step": 204940, + "train_speed(iter/s)": 0.123668 + }, + { + "acc": 0.73681326, + "epoch": 1.1462333990915026, + "grad_norm": 7.125, + "learning_rate": 4.207737125643298e-06, + "loss": 1.02855263, + "memory(GiB)": 302.58, + "step": 204960, + "train_speed(iter/s)": 0.123674 + }, + { + "acc": 0.75732689, + "epoch": 1.1463452485644818, + "grad_norm": 9.4375, + "learning_rate": 4.206824116408525e-06, + "loss": 0.94962177, + "memory(GiB)": 302.58, + "step": 204980, + "train_speed(iter/s)": 0.12368 + }, + { + "acc": 0.74063005, + "epoch": 1.146457098037461, + "grad_norm": 6.46875, + "learning_rate": 4.205911134302883e-06, + "loss": 1.03766403, + "memory(GiB)": 302.58, + "step": 205000, + "train_speed(iter/s)": 0.123686 + }, + { + "acc": 0.73462634, + "epoch": 1.1465689475104404, + "grad_norm": 7.4375, + "learning_rate": 4.204998179357598e-06, + "loss": 1.04059105, + "memory(GiB)": 302.58, + "step": 205020, + "train_speed(iter/s)": 0.123692 + }, + { + "acc": 0.76061063, + "epoch": 1.1466807969834196, + "grad_norm": 6.28125, + "learning_rate": 4.204085251603895e-06, + "loss": 0.92123775, + "memory(GiB)": 302.58, + "step": 205040, + "train_speed(iter/s)": 0.123698 + }, + { + "acc": 0.74938917, + "epoch": 1.146792646456399, + "grad_norm": 6.28125, + "learning_rate": 4.203172351073e-06, + "loss": 0.99725409, + "memory(GiB)": 302.58, + "step": 205060, + "train_speed(iter/s)": 0.123703 + }, + { + "acc": 0.74165254, + "epoch": 1.1469044959293782, + "grad_norm": 6.03125, + "learning_rate": 4.202259477796138e-06, + "loss": 1.04799566, + "memory(GiB)": 302.58, + "step": 205080, + "train_speed(iter/s)": 0.123709 + }, + { + "acc": 0.74729877, + "epoch": 1.1470163454023574, + "grad_norm": 8.0, + "learning_rate": 4.2013466318045316e-06, + "loss": 0.98854694, + "memory(GiB)": 302.58, + "step": 205100, + "train_speed(iter/s)": 0.123714 + }, + { + "acc": 0.75344253, + "epoch": 1.1471281948753367, + "grad_norm": 6.8125, + "learning_rate": 4.200433813129403e-06, + "loss": 0.95811214, + "memory(GiB)": 302.58, + "step": 205120, + "train_speed(iter/s)": 0.12372 + }, + { + "acc": 0.73593345, + "epoch": 1.147240044348316, + "grad_norm": 6.1875, + "learning_rate": 4.1995210218019734e-06, + "loss": 1.06188755, + "memory(GiB)": 302.58, + "step": 205140, + "train_speed(iter/s)": 0.123726 + }, + { + "acc": 0.74585004, + "epoch": 1.1473518938212952, + "grad_norm": 6.5, + "learning_rate": 4.1986082578534625e-06, + "loss": 0.98379278, + "memory(GiB)": 302.58, + "step": 205160, + "train_speed(iter/s)": 0.123732 + }, + { + "acc": 0.74314575, + "epoch": 1.1474637432942745, + "grad_norm": 9.625, + "learning_rate": 4.19769552131509e-06, + "loss": 1.0228446, + "memory(GiB)": 302.58, + "step": 205180, + "train_speed(iter/s)": 0.123738 + }, + { + "acc": 0.74585252, + "epoch": 1.1475755927672537, + "grad_norm": 7.6875, + "learning_rate": 4.1967828122180755e-06, + "loss": 0.99274092, + "memory(GiB)": 302.58, + "step": 205200, + "train_speed(iter/s)": 0.123743 + }, + { + "acc": 0.76280932, + "epoch": 1.147687442240233, + "grad_norm": 7.0625, + "learning_rate": 4.195870130593635e-06, + "loss": 0.90498123, + "memory(GiB)": 302.58, + "step": 205220, + "train_speed(iter/s)": 0.123749 + }, + { + "acc": 0.75387616, + "epoch": 1.1477992917132123, + "grad_norm": 9.6875, + "learning_rate": 4.194957476472987e-06, + "loss": 0.97284231, + "memory(GiB)": 302.58, + "step": 205240, + "train_speed(iter/s)": 0.123755 + }, + { + "acc": 0.74269519, + "epoch": 1.1479111411861915, + "grad_norm": 6.9375, + "learning_rate": 4.194044849887346e-06, + "loss": 1.03518858, + "memory(GiB)": 302.58, + "step": 205260, + "train_speed(iter/s)": 0.12376 + }, + { + "acc": 0.74923916, + "epoch": 1.1480229906591708, + "grad_norm": 9.4375, + "learning_rate": 4.193132250867926e-06, + "loss": 0.98523521, + "memory(GiB)": 302.58, + "step": 205280, + "train_speed(iter/s)": 0.123766 + }, + { + "acc": 0.77373514, + "epoch": 1.14813484013215, + "grad_norm": 8.5, + "learning_rate": 4.192219679445943e-06, + "loss": 0.88098755, + "memory(GiB)": 302.58, + "step": 205300, + "train_speed(iter/s)": 0.123772 + }, + { + "acc": 0.73709669, + "epoch": 1.1482466896051293, + "grad_norm": 6.34375, + "learning_rate": 4.191307135652606e-06, + "loss": 1.02995911, + "memory(GiB)": 302.58, + "step": 205320, + "train_speed(iter/s)": 0.123777 + }, + { + "acc": 0.74750066, + "epoch": 1.1483585390781086, + "grad_norm": 4.96875, + "learning_rate": 4.19039461951913e-06, + "loss": 1.00367975, + "memory(GiB)": 302.58, + "step": 205340, + "train_speed(iter/s)": 0.123783 + }, + { + "acc": 0.74560881, + "epoch": 1.1484703885510879, + "grad_norm": 10.875, + "learning_rate": 4.189482131076726e-06, + "loss": 1.0167367, + "memory(GiB)": 302.58, + "step": 205360, + "train_speed(iter/s)": 0.123788 + }, + { + "acc": 0.76307654, + "epoch": 1.1485822380240671, + "grad_norm": 8.25, + "learning_rate": 4.188569670356603e-06, + "loss": 0.93494339, + "memory(GiB)": 302.58, + "step": 205380, + "train_speed(iter/s)": 0.123794 + }, + { + "acc": 0.74519453, + "epoch": 1.1486940874970464, + "grad_norm": 6.125, + "learning_rate": 4.18765723738997e-06, + "loss": 1.02348766, + "memory(GiB)": 302.58, + "step": 205400, + "train_speed(iter/s)": 0.1238 + }, + { + "acc": 0.73429532, + "epoch": 1.1488059369700256, + "grad_norm": 4.6875, + "learning_rate": 4.186744832208036e-06, + "loss": 1.04768267, + "memory(GiB)": 302.58, + "step": 205420, + "train_speed(iter/s)": 0.123804 + }, + { + "acc": 0.74987206, + "epoch": 1.148917786443005, + "grad_norm": 5.75, + "learning_rate": 4.185832454842007e-06, + "loss": 1.01533117, + "memory(GiB)": 302.58, + "step": 205440, + "train_speed(iter/s)": 0.12381 + }, + { + "acc": 0.75941138, + "epoch": 1.1490296359159844, + "grad_norm": 5.09375, + "learning_rate": 4.184920105323088e-06, + "loss": 0.94271431, + "memory(GiB)": 302.58, + "step": 205460, + "train_speed(iter/s)": 0.123816 + }, + { + "acc": 0.77505016, + "epoch": 1.1491414853889634, + "grad_norm": 7.5625, + "learning_rate": 4.1840077836824875e-06, + "loss": 0.87304506, + "memory(GiB)": 302.58, + "step": 205480, + "train_speed(iter/s)": 0.123821 + }, + { + "acc": 0.74960833, + "epoch": 1.149253334861943, + "grad_norm": 8.125, + "learning_rate": 4.1830954899514084e-06, + "loss": 0.99821672, + "memory(GiB)": 302.58, + "step": 205500, + "train_speed(iter/s)": 0.123827 + }, + { + "acc": 0.74430003, + "epoch": 1.149365184334922, + "grad_norm": 5.625, + "learning_rate": 4.1821832241610545e-06, + "loss": 1.00692549, + "memory(GiB)": 302.58, + "step": 205520, + "train_speed(iter/s)": 0.123832 + }, + { + "acc": 0.75587201, + "epoch": 1.1494770338079014, + "grad_norm": 7.3125, + "learning_rate": 4.181270986342627e-06, + "loss": 0.93744326, + "memory(GiB)": 302.58, + "step": 205540, + "train_speed(iter/s)": 0.123838 + }, + { + "acc": 0.74690447, + "epoch": 1.1495888832808805, + "grad_norm": 7.0, + "learning_rate": 4.180358776527328e-06, + "loss": 0.98934851, + "memory(GiB)": 302.58, + "step": 205560, + "train_speed(iter/s)": 0.123844 + }, + { + "acc": 0.75044723, + "epoch": 1.14970073275386, + "grad_norm": 8.25, + "learning_rate": 4.1794465947463576e-06, + "loss": 0.98517704, + "memory(GiB)": 302.58, + "step": 205580, + "train_speed(iter/s)": 0.123849 + }, + { + "acc": 0.7454195, + "epoch": 1.149812582226839, + "grad_norm": 8.625, + "learning_rate": 4.178534441030914e-06, + "loss": 1.00046091, + "memory(GiB)": 302.58, + "step": 205600, + "train_speed(iter/s)": 0.123855 + }, + { + "acc": 0.74447699, + "epoch": 1.1499244316998185, + "grad_norm": 5.78125, + "learning_rate": 4.1776223154121995e-06, + "loss": 1.01095915, + "memory(GiB)": 302.58, + "step": 205620, + "train_speed(iter/s)": 0.12386 + }, + { + "acc": 0.74756784, + "epoch": 1.1500362811727975, + "grad_norm": 9.5, + "learning_rate": 4.176710217921409e-06, + "loss": 0.99053783, + "memory(GiB)": 302.58, + "step": 205640, + "train_speed(iter/s)": 0.123866 + }, + { + "acc": 0.74948249, + "epoch": 1.150148130645777, + "grad_norm": 7.25, + "learning_rate": 4.1757981485897394e-06, + "loss": 0.97462654, + "memory(GiB)": 302.58, + "step": 205660, + "train_speed(iter/s)": 0.123872 + }, + { + "acc": 0.74322362, + "epoch": 1.150259980118756, + "grad_norm": 7.34375, + "learning_rate": 4.174886107448385e-06, + "loss": 0.9987525, + "memory(GiB)": 302.58, + "step": 205680, + "train_speed(iter/s)": 0.123877 + }, + { + "acc": 0.75328078, + "epoch": 1.1503718295917356, + "grad_norm": 4.28125, + "learning_rate": 4.173974094528543e-06, + "loss": 0.94247494, + "memory(GiB)": 302.58, + "step": 205700, + "train_speed(iter/s)": 0.123883 + }, + { + "acc": 0.75291371, + "epoch": 1.1504836790647146, + "grad_norm": 5.375, + "learning_rate": 4.1730621098614075e-06, + "loss": 0.97449112, + "memory(GiB)": 302.58, + "step": 205720, + "train_speed(iter/s)": 0.123889 + }, + { + "acc": 0.73732266, + "epoch": 1.150595528537694, + "grad_norm": 9.75, + "learning_rate": 4.172150153478169e-06, + "loss": 1.05177383, + "memory(GiB)": 302.58, + "step": 205740, + "train_speed(iter/s)": 0.123895 + }, + { + "acc": 0.75431533, + "epoch": 1.1507073780106731, + "grad_norm": 6.65625, + "learning_rate": 4.171238225410022e-06, + "loss": 0.96974726, + "memory(GiB)": 302.58, + "step": 205760, + "train_speed(iter/s)": 0.1239 + }, + { + "acc": 0.76780066, + "epoch": 1.1508192274836526, + "grad_norm": 7.3125, + "learning_rate": 4.170326325688154e-06, + "loss": 0.91357679, + "memory(GiB)": 302.58, + "step": 205780, + "train_speed(iter/s)": 0.123906 + }, + { + "acc": 0.74730453, + "epoch": 1.1509310769566317, + "grad_norm": 9.1875, + "learning_rate": 4.169414454343756e-06, + "loss": 1.01494827, + "memory(GiB)": 302.58, + "step": 205800, + "train_speed(iter/s)": 0.123911 + }, + { + "acc": 0.75380764, + "epoch": 1.1510429264296111, + "grad_norm": 7.4375, + "learning_rate": 4.1685026114080186e-06, + "loss": 0.9682642, + "memory(GiB)": 302.58, + "step": 205820, + "train_speed(iter/s)": 0.123916 + }, + { + "acc": 0.76749129, + "epoch": 1.1511547759025902, + "grad_norm": 6.40625, + "learning_rate": 4.167590796912128e-06, + "loss": 0.90802336, + "memory(GiB)": 302.58, + "step": 205840, + "train_speed(iter/s)": 0.123922 + }, + { + "acc": 0.76586061, + "epoch": 1.1512666253755697, + "grad_norm": 5.3125, + "learning_rate": 4.166679010887272e-06, + "loss": 0.90392466, + "memory(GiB)": 302.58, + "step": 205860, + "train_speed(iter/s)": 0.123928 + }, + { + "acc": 0.76277523, + "epoch": 1.1513784748485487, + "grad_norm": 7.78125, + "learning_rate": 4.165767253364636e-06, + "loss": 0.93576117, + "memory(GiB)": 302.58, + "step": 205880, + "train_speed(iter/s)": 0.123933 + }, + { + "acc": 0.74714007, + "epoch": 1.1514903243215282, + "grad_norm": 8.9375, + "learning_rate": 4.164855524375406e-06, + "loss": 0.98853178, + "memory(GiB)": 302.58, + "step": 205900, + "train_speed(iter/s)": 0.123939 + }, + { + "acc": 0.74062214, + "epoch": 1.1516021737945072, + "grad_norm": 8.0625, + "learning_rate": 4.163943823950763e-06, + "loss": 1.03524818, + "memory(GiB)": 302.58, + "step": 205920, + "train_speed(iter/s)": 0.123944 + }, + { + "acc": 0.7490778, + "epoch": 1.1517140232674867, + "grad_norm": 7.46875, + "learning_rate": 4.163032152121894e-06, + "loss": 0.99155464, + "memory(GiB)": 302.58, + "step": 205940, + "train_speed(iter/s)": 0.12395 + }, + { + "acc": 0.75814104, + "epoch": 1.1518258727404658, + "grad_norm": 9.4375, + "learning_rate": 4.162120508919979e-06, + "loss": 0.94889736, + "memory(GiB)": 302.58, + "step": 205960, + "train_speed(iter/s)": 0.123955 + }, + { + "acc": 0.75983458, + "epoch": 1.1519377222134453, + "grad_norm": 7.0625, + "learning_rate": 4.1612088943762e-06, + "loss": 0.94240875, + "memory(GiB)": 302.58, + "step": 205980, + "train_speed(iter/s)": 0.123961 + }, + { + "acc": 0.75746522, + "epoch": 1.1520495716864243, + "grad_norm": 10.0625, + "learning_rate": 4.160297308521736e-06, + "loss": 0.95416889, + "memory(GiB)": 302.58, + "step": 206000, + "train_speed(iter/s)": 0.123966 + }, + { + "epoch": 1.1520495716864243, + "eval_acc": 0.7065483583233532, + "eval_loss": 1.013310194015503, + "eval_runtime": 7548.7533, + "eval_samples_per_second": 9.973, + "eval_steps_per_second": 9.973, + "step": 206000 + }, + { + "acc": 0.76035213, + "epoch": 1.1521614211594038, + "grad_norm": 5.09375, + "learning_rate": 4.159385751387767e-06, + "loss": 0.93156271, + "memory(GiB)": 302.58, + "step": 206020, + "train_speed(iter/s)": 0.123401 + }, + { + "acc": 0.74229221, + "epoch": 1.1522732706323828, + "grad_norm": 7.875, + "learning_rate": 4.158474223005471e-06, + "loss": 1.02242432, + "memory(GiB)": 302.58, + "step": 206040, + "train_speed(iter/s)": 0.123407 + }, + { + "acc": 0.76823902, + "epoch": 1.1523851201053623, + "grad_norm": 9.0625, + "learning_rate": 4.157562723406024e-06, + "loss": 0.92366619, + "memory(GiB)": 302.58, + "step": 206060, + "train_speed(iter/s)": 0.123413 + }, + { + "acc": 0.77067432, + "epoch": 1.1524969695783414, + "grad_norm": 9.375, + "learning_rate": 4.156651252620603e-06, + "loss": 0.87901974, + "memory(GiB)": 302.58, + "step": 206080, + "train_speed(iter/s)": 0.123418 + }, + { + "acc": 0.74727812, + "epoch": 1.1526088190513208, + "grad_norm": 10.5625, + "learning_rate": 4.155739810680384e-06, + "loss": 0.99787149, + "memory(GiB)": 302.58, + "step": 206100, + "train_speed(iter/s)": 0.123424 + }, + { + "acc": 0.7383604, + "epoch": 1.1527206685242999, + "grad_norm": 8.5, + "learning_rate": 4.154828397616541e-06, + "loss": 1.0342968, + "memory(GiB)": 302.58, + "step": 206120, + "train_speed(iter/s)": 0.12343 + }, + { + "acc": 0.75751634, + "epoch": 1.1528325179972794, + "grad_norm": 6.625, + "learning_rate": 4.153917013460247e-06, + "loss": 0.9764699, + "memory(GiB)": 302.58, + "step": 206140, + "train_speed(iter/s)": 0.123436 + }, + { + "acc": 0.73491464, + "epoch": 1.1529443674702584, + "grad_norm": 7.8125, + "learning_rate": 4.153005658242673e-06, + "loss": 1.03556023, + "memory(GiB)": 302.58, + "step": 206160, + "train_speed(iter/s)": 0.123441 + }, + { + "acc": 0.74029903, + "epoch": 1.153056216943238, + "grad_norm": 5.65625, + "learning_rate": 4.152094331994992e-06, + "loss": 1.03236179, + "memory(GiB)": 302.58, + "step": 206180, + "train_speed(iter/s)": 0.123447 + }, + { + "acc": 0.74827337, + "epoch": 1.153168066416217, + "grad_norm": 7.125, + "learning_rate": 4.151183034748372e-06, + "loss": 1.00329428, + "memory(GiB)": 302.58, + "step": 206200, + "train_speed(iter/s)": 0.123453 + }, + { + "acc": 0.7428019, + "epoch": 1.1532799158891964, + "grad_norm": 5.8125, + "learning_rate": 4.150271766533985e-06, + "loss": 1.01559248, + "memory(GiB)": 302.58, + "step": 206220, + "train_speed(iter/s)": 0.123458 + }, + { + "acc": 0.76176376, + "epoch": 1.1533917653621755, + "grad_norm": 6.59375, + "learning_rate": 4.149360527382998e-06, + "loss": 0.9135747, + "memory(GiB)": 302.58, + "step": 206240, + "train_speed(iter/s)": 0.123464 + }, + { + "acc": 0.75394659, + "epoch": 1.153503614835155, + "grad_norm": 7.40625, + "learning_rate": 4.148449317326579e-06, + "loss": 0.9649703, + "memory(GiB)": 302.58, + "step": 206260, + "train_speed(iter/s)": 0.12347 + }, + { + "acc": 0.75959058, + "epoch": 1.153615464308134, + "grad_norm": 6.96875, + "learning_rate": 4.147538136395891e-06, + "loss": 0.92669926, + "memory(GiB)": 302.58, + "step": 206280, + "train_speed(iter/s)": 0.123475 + }, + { + "acc": 0.75481834, + "epoch": 1.1537273137811135, + "grad_norm": 12.625, + "learning_rate": 4.146626984622104e-06, + "loss": 0.99104204, + "memory(GiB)": 302.58, + "step": 206300, + "train_speed(iter/s)": 0.123481 + }, + { + "acc": 0.75158911, + "epoch": 1.1538391632540925, + "grad_norm": 6.3125, + "learning_rate": 4.14571586203638e-06, + "loss": 0.97915697, + "memory(GiB)": 302.58, + "step": 206320, + "train_speed(iter/s)": 0.123487 + }, + { + "acc": 0.75494351, + "epoch": 1.153951012727072, + "grad_norm": 6.84375, + "learning_rate": 4.144804768669881e-06, + "loss": 0.9741087, + "memory(GiB)": 302.58, + "step": 206340, + "train_speed(iter/s)": 0.123492 + }, + { + "acc": 0.76919045, + "epoch": 1.154062862200051, + "grad_norm": 5.59375, + "learning_rate": 4.143893704553772e-06, + "loss": 0.90600996, + "memory(GiB)": 302.58, + "step": 206360, + "train_speed(iter/s)": 0.123497 + }, + { + "acc": 0.74607148, + "epoch": 1.1541747116730305, + "grad_norm": 7.40625, + "learning_rate": 4.142982669719213e-06, + "loss": 0.97935658, + "memory(GiB)": 302.58, + "step": 206380, + "train_speed(iter/s)": 0.123503 + }, + { + "acc": 0.75264273, + "epoch": 1.1542865611460096, + "grad_norm": 8.75, + "learning_rate": 4.142071664197363e-06, + "loss": 0.9814167, + "memory(GiB)": 302.58, + "step": 206400, + "train_speed(iter/s)": 0.123508 + }, + { + "acc": 0.73469052, + "epoch": 1.154398410618989, + "grad_norm": 6.5, + "learning_rate": 4.1411606880193835e-06, + "loss": 1.06014643, + "memory(GiB)": 302.58, + "step": 206420, + "train_speed(iter/s)": 0.123514 + }, + { + "acc": 0.75735607, + "epoch": 1.154510260091968, + "grad_norm": 9.4375, + "learning_rate": 4.140249741216432e-06, + "loss": 0.94656048, + "memory(GiB)": 302.58, + "step": 206440, + "train_speed(iter/s)": 0.123519 + }, + { + "acc": 0.76502433, + "epoch": 1.1546221095649476, + "grad_norm": 6.71875, + "learning_rate": 4.1393388238196655e-06, + "loss": 0.9174161, + "memory(GiB)": 302.58, + "step": 206460, + "train_speed(iter/s)": 0.123525 + }, + { + "acc": 0.76344433, + "epoch": 1.1547339590379266, + "grad_norm": 7.46875, + "learning_rate": 4.1384279358602385e-06, + "loss": 0.92581434, + "memory(GiB)": 302.58, + "step": 206480, + "train_speed(iter/s)": 0.123531 + }, + { + "acc": 0.74876184, + "epoch": 1.1548458085109061, + "grad_norm": 7.25, + "learning_rate": 4.1375170773693095e-06, + "loss": 0.98658447, + "memory(GiB)": 302.58, + "step": 206500, + "train_speed(iter/s)": 0.123536 + }, + { + "acc": 0.75071812, + "epoch": 1.1549576579838852, + "grad_norm": 9.5, + "learning_rate": 4.136606248378032e-06, + "loss": 0.97128696, + "memory(GiB)": 302.58, + "step": 206520, + "train_speed(iter/s)": 0.123542 + }, + { + "acc": 0.77319107, + "epoch": 1.1550695074568647, + "grad_norm": 11.625, + "learning_rate": 4.135695448917557e-06, + "loss": 0.87989035, + "memory(GiB)": 302.58, + "step": 206540, + "train_speed(iter/s)": 0.123548 + }, + { + "acc": 0.74925227, + "epoch": 1.1551813569298437, + "grad_norm": 6.25, + "learning_rate": 4.13478467901904e-06, + "loss": 0.97921019, + "memory(GiB)": 302.58, + "step": 206560, + "train_speed(iter/s)": 0.123554 + }, + { + "acc": 0.76251287, + "epoch": 1.1552932064028232, + "grad_norm": 7.0, + "learning_rate": 4.133873938713629e-06, + "loss": 0.93230772, + "memory(GiB)": 302.58, + "step": 206580, + "train_speed(iter/s)": 0.123559 + }, + { + "acc": 0.74907532, + "epoch": 1.1554050558758022, + "grad_norm": 8.0, + "learning_rate": 4.132963228032477e-06, + "loss": 0.99090281, + "memory(GiB)": 302.58, + "step": 206600, + "train_speed(iter/s)": 0.123565 + }, + { + "acc": 0.75971837, + "epoch": 1.1555169053487817, + "grad_norm": 7.125, + "learning_rate": 4.13205254700673e-06, + "loss": 0.9584857, + "memory(GiB)": 302.58, + "step": 206620, + "train_speed(iter/s)": 0.123571 + }, + { + "acc": 0.73977818, + "epoch": 1.1556287548217608, + "grad_norm": 6.875, + "learning_rate": 4.13114189566754e-06, + "loss": 1.03340073, + "memory(GiB)": 302.58, + "step": 206640, + "train_speed(iter/s)": 0.123576 + }, + { + "acc": 0.76016798, + "epoch": 1.1557406042947402, + "grad_norm": 6.5, + "learning_rate": 4.130231274046052e-06, + "loss": 0.92865133, + "memory(GiB)": 302.58, + "step": 206660, + "train_speed(iter/s)": 0.123581 + }, + { + "acc": 0.76182365, + "epoch": 1.1558524537677193, + "grad_norm": 6.21875, + "learning_rate": 4.129320682173411e-06, + "loss": 0.92592363, + "memory(GiB)": 302.58, + "step": 206680, + "train_speed(iter/s)": 0.123587 + }, + { + "acc": 0.74385142, + "epoch": 1.1559643032406988, + "grad_norm": 5.28125, + "learning_rate": 4.128410120080765e-06, + "loss": 1.00098753, + "memory(GiB)": 302.58, + "step": 206700, + "train_speed(iter/s)": 0.123593 + }, + { + "acc": 0.75503945, + "epoch": 1.156076152713678, + "grad_norm": 6.375, + "learning_rate": 4.127499587799256e-06, + "loss": 0.96532001, + "memory(GiB)": 302.58, + "step": 206720, + "train_speed(iter/s)": 0.123598 + }, + { + "acc": 0.75372114, + "epoch": 1.1561880021866573, + "grad_norm": 8.9375, + "learning_rate": 4.126589085360027e-06, + "loss": 0.95978212, + "memory(GiB)": 302.58, + "step": 206740, + "train_speed(iter/s)": 0.123604 + }, + { + "acc": 0.7424047, + "epoch": 1.1562998516596366, + "grad_norm": 8.0, + "learning_rate": 4.125678612794221e-06, + "loss": 1.04345846, + "memory(GiB)": 302.58, + "step": 206760, + "train_speed(iter/s)": 0.123609 + }, + { + "acc": 0.75520234, + "epoch": 1.1564117011326158, + "grad_norm": 9.75, + "learning_rate": 4.12476817013298e-06, + "loss": 0.96306992, + "memory(GiB)": 302.58, + "step": 206780, + "train_speed(iter/s)": 0.123615 + }, + { + "acc": 0.75900183, + "epoch": 1.156523550605595, + "grad_norm": 8.9375, + "learning_rate": 4.123857757407443e-06, + "loss": 0.95156898, + "memory(GiB)": 302.58, + "step": 206800, + "train_speed(iter/s)": 0.12362 + }, + { + "acc": 0.74519005, + "epoch": 1.1566354000785743, + "grad_norm": 9.1875, + "learning_rate": 4.122947374648748e-06, + "loss": 1.01419735, + "memory(GiB)": 302.58, + "step": 206820, + "train_speed(iter/s)": 0.123626 + }, + { + "acc": 0.74969678, + "epoch": 1.1567472495515536, + "grad_norm": 8.125, + "learning_rate": 4.122037021888034e-06, + "loss": 0.99395304, + "memory(GiB)": 302.58, + "step": 206840, + "train_speed(iter/s)": 0.123631 + }, + { + "acc": 0.74902883, + "epoch": 1.1568590990245329, + "grad_norm": 7.46875, + "learning_rate": 4.121126699156438e-06, + "loss": 0.98333015, + "memory(GiB)": 302.58, + "step": 206860, + "train_speed(iter/s)": 0.123637 + }, + { + "acc": 0.74517956, + "epoch": 1.1569709484975121, + "grad_norm": 7.125, + "learning_rate": 4.1202164064850945e-06, + "loss": 1.01220846, + "memory(GiB)": 302.58, + "step": 206880, + "train_speed(iter/s)": 0.123643 + }, + { + "acc": 0.74684234, + "epoch": 1.1570827979704914, + "grad_norm": 9.5, + "learning_rate": 4.119306143905141e-06, + "loss": 0.99226017, + "memory(GiB)": 302.58, + "step": 206900, + "train_speed(iter/s)": 0.123648 + }, + { + "acc": 0.75800495, + "epoch": 1.1571946474434707, + "grad_norm": 7.625, + "learning_rate": 4.11839591144771e-06, + "loss": 0.95419359, + "memory(GiB)": 302.58, + "step": 206920, + "train_speed(iter/s)": 0.123654 + }, + { + "acc": 0.74509659, + "epoch": 1.15730649691645, + "grad_norm": 8.25, + "learning_rate": 4.117485709143931e-06, + "loss": 1.00226936, + "memory(GiB)": 302.58, + "step": 206940, + "train_speed(iter/s)": 0.12366 + }, + { + "acc": 0.73451209, + "epoch": 1.1574183463894292, + "grad_norm": 6.96875, + "learning_rate": 4.116575537024943e-06, + "loss": 1.0639061, + "memory(GiB)": 302.58, + "step": 206960, + "train_speed(iter/s)": 0.123665 + }, + { + "acc": 0.75490742, + "epoch": 1.1575301958624085, + "grad_norm": 8.25, + "learning_rate": 4.115665395121871e-06, + "loss": 0.96535625, + "memory(GiB)": 302.58, + "step": 206980, + "train_speed(iter/s)": 0.123671 + }, + { + "acc": 0.74967246, + "epoch": 1.1576420453353877, + "grad_norm": 9.9375, + "learning_rate": 4.114755283465849e-06, + "loss": 1.00539827, + "memory(GiB)": 302.58, + "step": 207000, + "train_speed(iter/s)": 0.123677 + }, + { + "acc": 0.75090108, + "epoch": 1.157753894808367, + "grad_norm": 6.5, + "learning_rate": 4.113845202088001e-06, + "loss": 0.98269615, + "memory(GiB)": 302.58, + "step": 207020, + "train_speed(iter/s)": 0.123683 + }, + { + "acc": 0.74785671, + "epoch": 1.1578657442813463, + "grad_norm": 7.53125, + "learning_rate": 4.112935151019459e-06, + "loss": 0.98128395, + "memory(GiB)": 302.58, + "step": 207040, + "train_speed(iter/s)": 0.123689 + }, + { + "acc": 0.7451314, + "epoch": 1.1579775937543255, + "grad_norm": 6.65625, + "learning_rate": 4.112025130291346e-06, + "loss": 0.98128929, + "memory(GiB)": 302.58, + "step": 207060, + "train_speed(iter/s)": 0.123694 + }, + { + "acc": 0.76210418, + "epoch": 1.1580894432273048, + "grad_norm": 8.3125, + "learning_rate": 4.111115139934789e-06, + "loss": 0.92996521, + "memory(GiB)": 302.58, + "step": 207080, + "train_speed(iter/s)": 0.1237 + }, + { + "acc": 0.76022491, + "epoch": 1.158201292700284, + "grad_norm": 9.875, + "learning_rate": 4.110205179980914e-06, + "loss": 0.93080568, + "memory(GiB)": 302.58, + "step": 207100, + "train_speed(iter/s)": 0.123705 + }, + { + "acc": 0.76561522, + "epoch": 1.1583131421732633, + "grad_norm": 6.8125, + "learning_rate": 4.109295250460843e-06, + "loss": 0.91433535, + "memory(GiB)": 302.58, + "step": 207120, + "train_speed(iter/s)": 0.123711 + }, + { + "acc": 0.74652286, + "epoch": 1.1584249916462426, + "grad_norm": 7.34375, + "learning_rate": 4.108385351405699e-06, + "loss": 0.99713421, + "memory(GiB)": 302.58, + "step": 207140, + "train_speed(iter/s)": 0.123717 + }, + { + "acc": 0.75546966, + "epoch": 1.1585368411192218, + "grad_norm": 9.25, + "learning_rate": 4.1074754828466025e-06, + "loss": 0.97565451, + "memory(GiB)": 302.58, + "step": 207160, + "train_speed(iter/s)": 0.123723 + }, + { + "acc": 0.75140586, + "epoch": 1.158648690592201, + "grad_norm": 8.875, + "learning_rate": 4.1065656448146755e-06, + "loss": 0.99592972, + "memory(GiB)": 302.58, + "step": 207180, + "train_speed(iter/s)": 0.123728 + }, + { + "acc": 0.7447773, + "epoch": 1.1587605400651804, + "grad_norm": 8.3125, + "learning_rate": 4.105655837341036e-06, + "loss": 1.01636353, + "memory(GiB)": 302.58, + "step": 207200, + "train_speed(iter/s)": 0.123734 + }, + { + "acc": 0.7574122, + "epoch": 1.1588723895381596, + "grad_norm": 7.1875, + "learning_rate": 4.104746060456802e-06, + "loss": 0.94745188, + "memory(GiB)": 302.58, + "step": 207220, + "train_speed(iter/s)": 0.123739 + }, + { + "acc": 0.75815511, + "epoch": 1.158984239011139, + "grad_norm": 5.65625, + "learning_rate": 4.103836314193093e-06, + "loss": 0.95048227, + "memory(GiB)": 302.58, + "step": 207240, + "train_speed(iter/s)": 0.123745 + }, + { + "acc": 0.76108742, + "epoch": 1.1590960884841182, + "grad_norm": 8.4375, + "learning_rate": 4.1029265985810225e-06, + "loss": 0.94909163, + "memory(GiB)": 302.58, + "step": 207260, + "train_speed(iter/s)": 0.12375 + }, + { + "acc": 0.74354239, + "epoch": 1.1592079379570974, + "grad_norm": 7.09375, + "learning_rate": 4.102016913651708e-06, + "loss": 1.04950638, + "memory(GiB)": 302.58, + "step": 207280, + "train_speed(iter/s)": 0.123756 + }, + { + "acc": 0.77316575, + "epoch": 1.1593197874300767, + "grad_norm": 8.25, + "learning_rate": 4.1011072594362615e-06, + "loss": 0.86827745, + "memory(GiB)": 302.58, + "step": 207300, + "train_speed(iter/s)": 0.123761 + }, + { + "acc": 0.75662193, + "epoch": 1.159431636903056, + "grad_norm": 7.65625, + "learning_rate": 4.100197635965797e-06, + "loss": 0.95115194, + "memory(GiB)": 302.58, + "step": 207320, + "train_speed(iter/s)": 0.123767 + }, + { + "acc": 0.76275191, + "epoch": 1.1595434863760352, + "grad_norm": 6.9375, + "learning_rate": 4.099288043271428e-06, + "loss": 0.9285037, + "memory(GiB)": 302.58, + "step": 207340, + "train_speed(iter/s)": 0.123773 + }, + { + "acc": 0.76835971, + "epoch": 1.1596553358490145, + "grad_norm": 8.6875, + "learning_rate": 4.098378481384261e-06, + "loss": 0.90899019, + "memory(GiB)": 302.58, + "step": 207360, + "train_speed(iter/s)": 0.123778 + }, + { + "acc": 0.7538579, + "epoch": 1.1597671853219937, + "grad_norm": 7.90625, + "learning_rate": 4.097468950335412e-06, + "loss": 0.96884451, + "memory(GiB)": 302.58, + "step": 207380, + "train_speed(iter/s)": 0.123784 + }, + { + "acc": 0.74411697, + "epoch": 1.159879034794973, + "grad_norm": 5.84375, + "learning_rate": 4.0965594501559845e-06, + "loss": 1.02108812, + "memory(GiB)": 302.58, + "step": 207400, + "train_speed(iter/s)": 0.123789 + }, + { + "acc": 0.76070137, + "epoch": 1.1599908842679523, + "grad_norm": 8.375, + "learning_rate": 4.09564998087709e-06, + "loss": 0.93808918, + "memory(GiB)": 302.58, + "step": 207420, + "train_speed(iter/s)": 0.123794 + }, + { + "acc": 0.76263537, + "epoch": 1.1601027337409315, + "grad_norm": 7.59375, + "learning_rate": 4.094740542529833e-06, + "loss": 0.95166225, + "memory(GiB)": 302.58, + "step": 207440, + "train_speed(iter/s)": 0.1238 + }, + { + "acc": 0.75034108, + "epoch": 1.1602145832139108, + "grad_norm": 8.5, + "learning_rate": 4.093831135145321e-06, + "loss": 0.99332781, + "memory(GiB)": 302.58, + "step": 207460, + "train_speed(iter/s)": 0.123806 + }, + { + "acc": 0.76316104, + "epoch": 1.16032643268689, + "grad_norm": 7.75, + "learning_rate": 4.092921758754656e-06, + "loss": 0.91757565, + "memory(GiB)": 302.58, + "step": 207480, + "train_speed(iter/s)": 0.123812 + }, + { + "acc": 0.74324489, + "epoch": 1.1604382821598693, + "grad_norm": 8.8125, + "learning_rate": 4.092012413388944e-06, + "loss": 1.04876928, + "memory(GiB)": 302.58, + "step": 207500, + "train_speed(iter/s)": 0.123817 + }, + { + "acc": 0.761443, + "epoch": 1.1605501316328486, + "grad_norm": 8.375, + "learning_rate": 4.0911030990792865e-06, + "loss": 0.9319725, + "memory(GiB)": 302.58, + "step": 207520, + "train_speed(iter/s)": 0.123823 + }, + { + "acc": 0.72885618, + "epoch": 1.1606619811058279, + "grad_norm": 7.03125, + "learning_rate": 4.090193815856785e-06, + "loss": 1.09534273, + "memory(GiB)": 302.58, + "step": 207540, + "train_speed(iter/s)": 0.123828 + }, + { + "acc": 0.73716488, + "epoch": 1.1607738305788071, + "grad_norm": 7.90625, + "learning_rate": 4.0892845637525405e-06, + "loss": 1.03346138, + "memory(GiB)": 302.58, + "step": 207560, + "train_speed(iter/s)": 0.123834 + }, + { + "acc": 0.7473186, + "epoch": 1.1608856800517864, + "grad_norm": 9.125, + "learning_rate": 4.088375342797651e-06, + "loss": 1.02150908, + "memory(GiB)": 302.58, + "step": 207580, + "train_speed(iter/s)": 0.123839 + }, + { + "acc": 0.76107702, + "epoch": 1.1609975295247656, + "grad_norm": 8.9375, + "learning_rate": 4.087466153023216e-06, + "loss": 0.93440142, + "memory(GiB)": 302.58, + "step": 207600, + "train_speed(iter/s)": 0.123845 + }, + { + "acc": 0.75690827, + "epoch": 1.161109378997745, + "grad_norm": 8.3125, + "learning_rate": 4.0865569944603315e-06, + "loss": 0.95014248, + "memory(GiB)": 302.58, + "step": 207620, + "train_speed(iter/s)": 0.12385 + }, + { + "acc": 0.75510206, + "epoch": 1.1612212284707242, + "grad_norm": 8.8125, + "learning_rate": 4.085647867140095e-06, + "loss": 0.96651974, + "memory(GiB)": 302.58, + "step": 207640, + "train_speed(iter/s)": 0.123856 + }, + { + "acc": 0.76547561, + "epoch": 1.1613330779437034, + "grad_norm": 6.28125, + "learning_rate": 4.084738771093601e-06, + "loss": 0.92246962, + "memory(GiB)": 302.58, + "step": 207660, + "train_speed(iter/s)": 0.123861 + }, + { + "acc": 0.73967819, + "epoch": 1.1614449274166827, + "grad_norm": 5.34375, + "learning_rate": 4.083829706351942e-06, + "loss": 1.00726547, + "memory(GiB)": 302.58, + "step": 207680, + "train_speed(iter/s)": 0.123867 + }, + { + "acc": 0.74843621, + "epoch": 1.161556776889662, + "grad_norm": 7.375, + "learning_rate": 4.082920672946213e-06, + "loss": 0.99471502, + "memory(GiB)": 302.58, + "step": 207700, + "train_speed(iter/s)": 0.123872 + }, + { + "acc": 0.76256394, + "epoch": 1.1616686263626412, + "grad_norm": 6.59375, + "learning_rate": 4.0820116709075046e-06, + "loss": 0.91389141, + "memory(GiB)": 302.58, + "step": 207720, + "train_speed(iter/s)": 0.123877 + }, + { + "acc": 0.75147238, + "epoch": 1.1617804758356205, + "grad_norm": 7.5625, + "learning_rate": 4.0811027002669075e-06, + "loss": 0.98243628, + "memory(GiB)": 302.58, + "step": 207740, + "train_speed(iter/s)": 0.123883 + }, + { + "acc": 0.72345066, + "epoch": 1.1618923253085998, + "grad_norm": 6.25, + "learning_rate": 4.080193761055512e-06, + "loss": 1.10744123, + "memory(GiB)": 302.58, + "step": 207760, + "train_speed(iter/s)": 0.123888 + }, + { + "acc": 0.74747472, + "epoch": 1.162004174781579, + "grad_norm": 8.8125, + "learning_rate": 4.079284853304407e-06, + "loss": 1.01000023, + "memory(GiB)": 302.58, + "step": 207780, + "train_speed(iter/s)": 0.123894 + }, + { + "acc": 0.75087175, + "epoch": 1.1621160242545583, + "grad_norm": 9.75, + "learning_rate": 4.0783759770446795e-06, + "loss": 0.99200401, + "memory(GiB)": 302.58, + "step": 207800, + "train_speed(iter/s)": 0.1239 + }, + { + "acc": 0.76293864, + "epoch": 1.1622278737275376, + "grad_norm": 7.1875, + "learning_rate": 4.0774671323074155e-06, + "loss": 0.92268229, + "memory(GiB)": 302.58, + "step": 207820, + "train_speed(iter/s)": 0.123906 + }, + { + "acc": 0.75060143, + "epoch": 1.1623397232005168, + "grad_norm": 8.25, + "learning_rate": 4.076558319123702e-06, + "loss": 0.96296358, + "memory(GiB)": 302.58, + "step": 207840, + "train_speed(iter/s)": 0.123911 + }, + { + "acc": 0.75737991, + "epoch": 1.162451572673496, + "grad_norm": 4.28125, + "learning_rate": 4.075649537524622e-06, + "loss": 0.94329233, + "memory(GiB)": 302.58, + "step": 207860, + "train_speed(iter/s)": 0.123917 + }, + { + "acc": 0.74143267, + "epoch": 1.1625634221464753, + "grad_norm": 7.625, + "learning_rate": 4.074740787541257e-06, + "loss": 1.01791773, + "memory(GiB)": 302.58, + "step": 207880, + "train_speed(iter/s)": 0.123922 + }, + { + "acc": 0.74399142, + "epoch": 1.1626752716194546, + "grad_norm": 8.625, + "learning_rate": 4.0738320692046915e-06, + "loss": 1.01566467, + "memory(GiB)": 302.58, + "step": 207900, + "train_speed(iter/s)": 0.123928 + }, + { + "acc": 0.7429348, + "epoch": 1.1627871210924339, + "grad_norm": 9.0625, + "learning_rate": 4.072923382546007e-06, + "loss": 1.0121295, + "memory(GiB)": 302.58, + "step": 207920, + "train_speed(iter/s)": 0.123934 + }, + { + "acc": 0.74573803, + "epoch": 1.1628989705654131, + "grad_norm": 7.78125, + "learning_rate": 4.072014727596282e-06, + "loss": 1.02808161, + "memory(GiB)": 302.58, + "step": 207940, + "train_speed(iter/s)": 0.12394 + }, + { + "acc": 0.73583684, + "epoch": 1.1630108200383924, + "grad_norm": 7.59375, + "learning_rate": 4.071106104386596e-06, + "loss": 1.05327253, + "memory(GiB)": 302.58, + "step": 207960, + "train_speed(iter/s)": 0.123945 + }, + { + "acc": 0.7477406, + "epoch": 1.1631226695113717, + "grad_norm": 9.625, + "learning_rate": 4.070197512948028e-06, + "loss": 0.99298563, + "memory(GiB)": 302.58, + "step": 207980, + "train_speed(iter/s)": 0.123951 + }, + { + "acc": 0.75892625, + "epoch": 1.163234518984351, + "grad_norm": 7.125, + "learning_rate": 4.069288953311652e-06, + "loss": 0.94181547, + "memory(GiB)": 302.58, + "step": 208000, + "train_speed(iter/s)": 0.123956 + }, + { + "epoch": 1.163234518984351, + "eval_acc": 0.7066120004744323, + "eval_loss": 1.013183832168579, + "eval_runtime": 7498.0447, + "eval_samples_per_second": 10.04, + "eval_steps_per_second": 10.04, + "step": 208000 + }, + { + "acc": 0.74701943, + "epoch": 1.1633463684573302, + "grad_norm": 6.8125, + "learning_rate": 4.0683804255085445e-06, + "loss": 0.98101845, + "memory(GiB)": 302.58, + "step": 208020, + "train_speed(iter/s)": 0.123401 + }, + { + "acc": 0.7324965, + "epoch": 1.1634582179303095, + "grad_norm": 8.0, + "learning_rate": 4.067471929569781e-06, + "loss": 1.049296, + "memory(GiB)": 302.58, + "step": 208040, + "train_speed(iter/s)": 0.123407 + }, + { + "acc": 0.74052291, + "epoch": 1.1635700674032887, + "grad_norm": 7.21875, + "learning_rate": 4.0665634655264345e-06, + "loss": 1.01740379, + "memory(GiB)": 302.58, + "step": 208060, + "train_speed(iter/s)": 0.123413 + }, + { + "acc": 0.72897453, + "epoch": 1.163681916876268, + "grad_norm": 6.15625, + "learning_rate": 4.065655033409577e-06, + "loss": 1.08715839, + "memory(GiB)": 302.58, + "step": 208080, + "train_speed(iter/s)": 0.123419 + }, + { + "acc": 0.75281754, + "epoch": 1.1637937663492472, + "grad_norm": 6.25, + "learning_rate": 4.064746633250281e-06, + "loss": 0.97949619, + "memory(GiB)": 302.58, + "step": 208100, + "train_speed(iter/s)": 0.123424 + }, + { + "acc": 0.73947606, + "epoch": 1.1639056158222265, + "grad_norm": 5.65625, + "learning_rate": 4.063838265079615e-06, + "loss": 1.04115105, + "memory(GiB)": 302.58, + "step": 208120, + "train_speed(iter/s)": 0.12343 + }, + { + "acc": 0.74061818, + "epoch": 1.1640174652952058, + "grad_norm": 8.5625, + "learning_rate": 4.0629299289286495e-06, + "loss": 1.01472673, + "memory(GiB)": 302.58, + "step": 208140, + "train_speed(iter/s)": 0.123436 + }, + { + "acc": 0.74405422, + "epoch": 1.164129314768185, + "grad_norm": 5.25, + "learning_rate": 4.0620216248284515e-06, + "loss": 1.01309366, + "memory(GiB)": 302.58, + "step": 208160, + "train_speed(iter/s)": 0.123441 + }, + { + "acc": 0.76283274, + "epoch": 1.1642411642411643, + "grad_norm": 7.78125, + "learning_rate": 4.061113352810087e-06, + "loss": 0.92341938, + "memory(GiB)": 302.58, + "step": 208180, + "train_speed(iter/s)": 0.123446 + }, + { + "acc": 0.75316129, + "epoch": 1.1643530137141436, + "grad_norm": 7.65625, + "learning_rate": 4.060205112904623e-06, + "loss": 0.96708212, + "memory(GiB)": 302.58, + "step": 208200, + "train_speed(iter/s)": 0.123452 + }, + { + "acc": 0.74844885, + "epoch": 1.1644648631871228, + "grad_norm": 7.84375, + "learning_rate": 4.059296905143124e-06, + "loss": 0.98120975, + "memory(GiB)": 302.58, + "step": 208220, + "train_speed(iter/s)": 0.123457 + }, + { + "acc": 0.74197755, + "epoch": 1.164576712660102, + "grad_norm": 12.0625, + "learning_rate": 4.058388729556656e-06, + "loss": 1.01853762, + "memory(GiB)": 302.58, + "step": 208240, + "train_speed(iter/s)": 0.123463 + }, + { + "acc": 0.75609961, + "epoch": 1.1646885621330814, + "grad_norm": 8.25, + "learning_rate": 4.057480586176278e-06, + "loss": 0.93654051, + "memory(GiB)": 302.58, + "step": 208260, + "train_speed(iter/s)": 0.123469 + }, + { + "acc": 0.74060411, + "epoch": 1.1648004116060606, + "grad_norm": 7.375, + "learning_rate": 4.0565724750330525e-06, + "loss": 1.02616806, + "memory(GiB)": 302.58, + "step": 208280, + "train_speed(iter/s)": 0.123474 + }, + { + "acc": 0.74835753, + "epoch": 1.16491226107904, + "grad_norm": 7.65625, + "learning_rate": 4.05566439615804e-06, + "loss": 1.00516977, + "memory(GiB)": 302.58, + "step": 208300, + "train_speed(iter/s)": 0.12348 + }, + { + "acc": 0.75889044, + "epoch": 1.1650241105520192, + "grad_norm": 7.625, + "learning_rate": 4.054756349582301e-06, + "loss": 0.93896942, + "memory(GiB)": 302.58, + "step": 208320, + "train_speed(iter/s)": 0.123485 + }, + { + "acc": 0.7331594, + "epoch": 1.1651359600249984, + "grad_norm": 6.5, + "learning_rate": 4.053848335336891e-06, + "loss": 1.04572926, + "memory(GiB)": 302.58, + "step": 208340, + "train_speed(iter/s)": 0.123491 + }, + { + "acc": 0.76048861, + "epoch": 1.1652478094979777, + "grad_norm": 8.125, + "learning_rate": 4.052940353452867e-06, + "loss": 0.94081211, + "memory(GiB)": 302.58, + "step": 208360, + "train_speed(iter/s)": 0.123497 + }, + { + "acc": 0.74861422, + "epoch": 1.165359658970957, + "grad_norm": 6.9375, + "learning_rate": 4.052032403961286e-06, + "loss": 0.97161808, + "memory(GiB)": 302.58, + "step": 208380, + "train_speed(iter/s)": 0.123502 + }, + { + "acc": 0.76127753, + "epoch": 1.1654715084439362, + "grad_norm": 9.5625, + "learning_rate": 4.051124486893204e-06, + "loss": 0.9208827, + "memory(GiB)": 302.58, + "step": 208400, + "train_speed(iter/s)": 0.123507 + }, + { + "acc": 0.76467991, + "epoch": 1.1655833579169155, + "grad_norm": 8.6875, + "learning_rate": 4.050216602279673e-06, + "loss": 0.92080021, + "memory(GiB)": 302.58, + "step": 208420, + "train_speed(iter/s)": 0.123513 + }, + { + "acc": 0.7603478, + "epoch": 1.1656952073898947, + "grad_norm": 5.34375, + "learning_rate": 4.049308750151746e-06, + "loss": 0.91961718, + "memory(GiB)": 302.58, + "step": 208440, + "train_speed(iter/s)": 0.123519 + }, + { + "acc": 0.74295073, + "epoch": 1.165807056862874, + "grad_norm": 5.875, + "learning_rate": 4.048400930540475e-06, + "loss": 1.0083828, + "memory(GiB)": 302.58, + "step": 208460, + "train_speed(iter/s)": 0.123524 + }, + { + "acc": 0.74938359, + "epoch": 1.1659189063358533, + "grad_norm": 6.875, + "learning_rate": 4.047493143476908e-06, + "loss": 0.96611776, + "memory(GiB)": 302.58, + "step": 208480, + "train_speed(iter/s)": 0.12353 + }, + { + "acc": 0.75403075, + "epoch": 1.1660307558088325, + "grad_norm": 4.9375, + "learning_rate": 4.046585388992096e-06, + "loss": 0.94927835, + "memory(GiB)": 302.58, + "step": 208500, + "train_speed(iter/s)": 0.123535 + }, + { + "acc": 0.76546068, + "epoch": 1.1661426052818118, + "grad_norm": 10.4375, + "learning_rate": 4.0456776671170875e-06, + "loss": 0.91875982, + "memory(GiB)": 302.58, + "step": 208520, + "train_speed(iter/s)": 0.123541 + }, + { + "acc": 0.74239845, + "epoch": 1.166254454754791, + "grad_norm": 5.125, + "learning_rate": 4.044769977882929e-06, + "loss": 1.02741337, + "memory(GiB)": 302.58, + "step": 208540, + "train_speed(iter/s)": 0.123547 + }, + { + "acc": 0.75656562, + "epoch": 1.1663663042277703, + "grad_norm": 7.71875, + "learning_rate": 4.043862321320666e-06, + "loss": 0.95516052, + "memory(GiB)": 302.58, + "step": 208560, + "train_speed(iter/s)": 0.123552 + }, + { + "acc": 0.76411271, + "epoch": 1.1664781537007496, + "grad_norm": 6.8125, + "learning_rate": 4.042954697461343e-06, + "loss": 0.91069107, + "memory(GiB)": 302.58, + "step": 208580, + "train_speed(iter/s)": 0.123558 + }, + { + "acc": 0.73625674, + "epoch": 1.1665900031737289, + "grad_norm": 5.21875, + "learning_rate": 4.042047106336005e-06, + "loss": 1.03611679, + "memory(GiB)": 302.58, + "step": 208600, + "train_speed(iter/s)": 0.123563 + }, + { + "acc": 0.75672398, + "epoch": 1.1667018526467081, + "grad_norm": 5.46875, + "learning_rate": 4.041139547975694e-06, + "loss": 0.95986118, + "memory(GiB)": 302.58, + "step": 208620, + "train_speed(iter/s)": 0.123569 + }, + { + "acc": 0.74761372, + "epoch": 1.1668137021196874, + "grad_norm": 7.75, + "learning_rate": 4.040232022411448e-06, + "loss": 1.00391855, + "memory(GiB)": 302.58, + "step": 208640, + "train_speed(iter/s)": 0.123575 + }, + { + "acc": 0.74451914, + "epoch": 1.1669255515926666, + "grad_norm": 7.96875, + "learning_rate": 4.039324529674313e-06, + "loss": 1.01480608, + "memory(GiB)": 302.58, + "step": 208660, + "train_speed(iter/s)": 0.12358 + }, + { + "acc": 0.74114771, + "epoch": 1.167037401065646, + "grad_norm": 6.4375, + "learning_rate": 4.038417069795324e-06, + "loss": 1.03073416, + "memory(GiB)": 302.58, + "step": 208680, + "train_speed(iter/s)": 0.123585 + }, + { + "acc": 0.74721928, + "epoch": 1.1671492505386252, + "grad_norm": 7.65625, + "learning_rate": 4.037509642805521e-06, + "loss": 1.01527205, + "memory(GiB)": 302.58, + "step": 208700, + "train_speed(iter/s)": 0.123591 + }, + { + "acc": 0.74485569, + "epoch": 1.1672611000116044, + "grad_norm": 5.40625, + "learning_rate": 4.0366022487359405e-06, + "loss": 1.01240492, + "memory(GiB)": 302.58, + "step": 208720, + "train_speed(iter/s)": 0.123597 + }, + { + "acc": 0.75653, + "epoch": 1.1673729494845837, + "grad_norm": 5.71875, + "learning_rate": 4.035694887617617e-06, + "loss": 0.96911879, + "memory(GiB)": 302.58, + "step": 208740, + "train_speed(iter/s)": 0.123602 + }, + { + "acc": 0.74239345, + "epoch": 1.167484798957563, + "grad_norm": 8.1875, + "learning_rate": 4.034787559481587e-06, + "loss": 1.02366695, + "memory(GiB)": 302.58, + "step": 208760, + "train_speed(iter/s)": 0.123608 + }, + { + "acc": 0.76001434, + "epoch": 1.1675966484305422, + "grad_norm": 8.1875, + "learning_rate": 4.033880264358881e-06, + "loss": 0.94085207, + "memory(GiB)": 302.58, + "step": 208780, + "train_speed(iter/s)": 0.123614 + }, + { + "acc": 0.74393287, + "epoch": 1.1677084979035215, + "grad_norm": 5.46875, + "learning_rate": 4.032973002280536e-06, + "loss": 1.02184553, + "memory(GiB)": 302.58, + "step": 208800, + "train_speed(iter/s)": 0.123619 + }, + { + "acc": 0.75071096, + "epoch": 1.1678203473765008, + "grad_norm": 5.875, + "learning_rate": 4.032065773277579e-06, + "loss": 0.97431622, + "memory(GiB)": 302.58, + "step": 208820, + "train_speed(iter/s)": 0.123625 + }, + { + "acc": 0.76772099, + "epoch": 1.16793219684948, + "grad_norm": 8.1875, + "learning_rate": 4.031158577381043e-06, + "loss": 0.88947763, + "memory(GiB)": 302.58, + "step": 208840, + "train_speed(iter/s)": 0.12363 + }, + { + "acc": 0.75868115, + "epoch": 1.1680440463224593, + "grad_norm": 6.375, + "learning_rate": 4.0302514146219555e-06, + "loss": 0.95478582, + "memory(GiB)": 302.58, + "step": 208860, + "train_speed(iter/s)": 0.123636 + }, + { + "acc": 0.7321373, + "epoch": 1.1681558957954385, + "grad_norm": 8.75, + "learning_rate": 4.029344285031344e-06, + "loss": 1.06549349, + "memory(GiB)": 302.58, + "step": 208880, + "train_speed(iter/s)": 0.123642 + }, + { + "acc": 0.72586813, + "epoch": 1.1682677452684178, + "grad_norm": 6.875, + "learning_rate": 4.028437188640236e-06, + "loss": 1.09557562, + "memory(GiB)": 302.58, + "step": 208900, + "train_speed(iter/s)": 0.123647 + }, + { + "acc": 0.73921695, + "epoch": 1.168379594741397, + "grad_norm": 9.5625, + "learning_rate": 4.027530125479657e-06, + "loss": 1.02851305, + "memory(GiB)": 302.58, + "step": 208920, + "train_speed(iter/s)": 0.123653 + }, + { + "acc": 0.75752354, + "epoch": 1.1684914442143763, + "grad_norm": 7.15625, + "learning_rate": 4.026623095580631e-06, + "loss": 0.94228487, + "memory(GiB)": 302.58, + "step": 208940, + "train_speed(iter/s)": 0.123658 + }, + { + "acc": 0.74300833, + "epoch": 1.1686032936873556, + "grad_norm": 4.875, + "learning_rate": 4.025716098974182e-06, + "loss": 0.99027824, + "memory(GiB)": 302.58, + "step": 208960, + "train_speed(iter/s)": 0.123664 + }, + { + "acc": 0.74874763, + "epoch": 1.1687151431603349, + "grad_norm": 5.71875, + "learning_rate": 4.024809135691332e-06, + "loss": 0.99362125, + "memory(GiB)": 302.58, + "step": 208980, + "train_speed(iter/s)": 0.123669 + }, + { + "acc": 0.74646015, + "epoch": 1.1688269926333141, + "grad_norm": 5.96875, + "learning_rate": 4.023902205763101e-06, + "loss": 0.99861975, + "memory(GiB)": 302.58, + "step": 209000, + "train_speed(iter/s)": 0.123675 + }, + { + "acc": 0.7321516, + "epoch": 1.1689388421062934, + "grad_norm": 8.5, + "learning_rate": 4.022995309220511e-06, + "loss": 1.06363678, + "memory(GiB)": 302.58, + "step": 209020, + "train_speed(iter/s)": 0.123681 + }, + { + "acc": 0.75928864, + "epoch": 1.1690506915792727, + "grad_norm": 5.15625, + "learning_rate": 4.022088446094578e-06, + "loss": 0.92669296, + "memory(GiB)": 302.58, + "step": 209040, + "train_speed(iter/s)": 0.123687 + }, + { + "acc": 0.77028756, + "epoch": 1.169162541052252, + "grad_norm": 7.8125, + "learning_rate": 4.021181616416322e-06, + "loss": 0.89842339, + "memory(GiB)": 302.58, + "step": 209060, + "train_speed(iter/s)": 0.123693 + }, + { + "acc": 0.76854453, + "epoch": 1.1692743905252312, + "grad_norm": 7.0625, + "learning_rate": 4.020274820216758e-06, + "loss": 0.93214388, + "memory(GiB)": 302.58, + "step": 209080, + "train_speed(iter/s)": 0.123697 + }, + { + "acc": 0.74802375, + "epoch": 1.1693862399982105, + "grad_norm": 7.4375, + "learning_rate": 4.019368057526902e-06, + "loss": 0.99839973, + "memory(GiB)": 302.58, + "step": 209100, + "train_speed(iter/s)": 0.123703 + }, + { + "acc": 0.73903131, + "epoch": 1.1694980894711897, + "grad_norm": 8.0625, + "learning_rate": 4.018461328377768e-06, + "loss": 1.05108318, + "memory(GiB)": 302.58, + "step": 209120, + "train_speed(iter/s)": 0.123709 + }, + { + "acc": 0.75907631, + "epoch": 1.169609938944169, + "grad_norm": 7.78125, + "learning_rate": 4.017554632800369e-06, + "loss": 0.95151834, + "memory(GiB)": 302.58, + "step": 209140, + "train_speed(iter/s)": 0.123715 + }, + { + "acc": 0.74814482, + "epoch": 1.1697217884171482, + "grad_norm": 8.5625, + "learning_rate": 4.0166479708257165e-06, + "loss": 1.00483055, + "memory(GiB)": 302.58, + "step": 209160, + "train_speed(iter/s)": 0.12372 + }, + { + "acc": 0.74959097, + "epoch": 1.1698336378901275, + "grad_norm": 8.8125, + "learning_rate": 4.015741342484821e-06, + "loss": 0.99260235, + "memory(GiB)": 302.58, + "step": 209180, + "train_speed(iter/s)": 0.123726 + }, + { + "acc": 0.74929328, + "epoch": 1.1699454873631068, + "grad_norm": 5.3125, + "learning_rate": 4.014834747808694e-06, + "loss": 0.97478228, + "memory(GiB)": 302.58, + "step": 209200, + "train_speed(iter/s)": 0.123732 + }, + { + "acc": 0.76560211, + "epoch": 1.170057336836086, + "grad_norm": 11.125, + "learning_rate": 4.013928186828342e-06, + "loss": 0.932864, + "memory(GiB)": 302.58, + "step": 209220, + "train_speed(iter/s)": 0.123737 + }, + { + "acc": 0.76748013, + "epoch": 1.1701691863090653, + "grad_norm": 6.53125, + "learning_rate": 4.013021659574771e-06, + "loss": 0.9241333, + "memory(GiB)": 302.58, + "step": 209240, + "train_speed(iter/s)": 0.123743 + }, + { + "acc": 0.75561385, + "epoch": 1.1702810357820446, + "grad_norm": 7.84375, + "learning_rate": 4.012115166078991e-06, + "loss": 0.9581789, + "memory(GiB)": 302.58, + "step": 209260, + "train_speed(iter/s)": 0.123749 + }, + { + "acc": 0.77012563, + "epoch": 1.1703928852550238, + "grad_norm": 5.375, + "learning_rate": 4.011208706372005e-06, + "loss": 0.88877954, + "memory(GiB)": 302.58, + "step": 209280, + "train_speed(iter/s)": 0.123754 + }, + { + "acc": 0.74079723, + "epoch": 1.170504734728003, + "grad_norm": 5.0625, + "learning_rate": 4.010302280484816e-06, + "loss": 1.02098389, + "memory(GiB)": 302.58, + "step": 209300, + "train_speed(iter/s)": 0.12376 + }, + { + "acc": 0.76087356, + "epoch": 1.1706165842009824, + "grad_norm": 6.0625, + "learning_rate": 4.009395888448426e-06, + "loss": 0.94279566, + "memory(GiB)": 302.58, + "step": 209320, + "train_speed(iter/s)": 0.123765 + }, + { + "acc": 0.74746943, + "epoch": 1.1707284336739616, + "grad_norm": 6.65625, + "learning_rate": 4.008489530293839e-06, + "loss": 0.98809767, + "memory(GiB)": 302.58, + "step": 209340, + "train_speed(iter/s)": 0.123771 + }, + { + "acc": 0.75362244, + "epoch": 1.1708402831469409, + "grad_norm": 6.625, + "learning_rate": 4.007583206052053e-06, + "loss": 0.95965443, + "memory(GiB)": 302.58, + "step": 209360, + "train_speed(iter/s)": 0.123777 + }, + { + "acc": 0.76007299, + "epoch": 1.1709521326199201, + "grad_norm": 8.875, + "learning_rate": 4.00667691575407e-06, + "loss": 0.94081039, + "memory(GiB)": 302.58, + "step": 209380, + "train_speed(iter/s)": 0.123783 + }, + { + "acc": 0.76916103, + "epoch": 1.1710639820928994, + "grad_norm": 6.65625, + "learning_rate": 4.005770659430885e-06, + "loss": 0.90849495, + "memory(GiB)": 302.58, + "step": 209400, + "train_speed(iter/s)": 0.123789 + }, + { + "acc": 0.76135297, + "epoch": 1.1711758315658787, + "grad_norm": 9.5625, + "learning_rate": 4.004864437113497e-06, + "loss": 0.92748852, + "memory(GiB)": 302.58, + "step": 209420, + "train_speed(iter/s)": 0.123795 + }, + { + "acc": 0.7738225, + "epoch": 1.171287681038858, + "grad_norm": 7.3125, + "learning_rate": 4.0039582488329e-06, + "loss": 0.87079096, + "memory(GiB)": 302.58, + "step": 209440, + "train_speed(iter/s)": 0.1238 + }, + { + "acc": 0.76588097, + "epoch": 1.1713995305118372, + "grad_norm": 7.75, + "learning_rate": 4.003052094620087e-06, + "loss": 0.90927277, + "memory(GiB)": 302.58, + "step": 209460, + "train_speed(iter/s)": 0.123806 + }, + { + "acc": 0.76031966, + "epoch": 1.1715113799848165, + "grad_norm": 9.375, + "learning_rate": 4.002145974506057e-06, + "loss": 0.94617252, + "memory(GiB)": 302.58, + "step": 209480, + "train_speed(iter/s)": 0.123812 + }, + { + "acc": 0.76181803, + "epoch": 1.1716232294577957, + "grad_norm": 6.65625, + "learning_rate": 4.0012398885217975e-06, + "loss": 0.92015057, + "memory(GiB)": 302.58, + "step": 209500, + "train_speed(iter/s)": 0.123817 + }, + { + "acc": 0.74889197, + "epoch": 1.171735078930775, + "grad_norm": 9.0625, + "learning_rate": 4.0003338366983e-06, + "loss": 1.01341257, + "memory(GiB)": 302.58, + "step": 209520, + "train_speed(iter/s)": 0.123823 + }, + { + "acc": 0.77067575, + "epoch": 1.1718469284037543, + "grad_norm": 7.5, + "learning_rate": 3.999427819066556e-06, + "loss": 0.89768505, + "memory(GiB)": 302.58, + "step": 209540, + "train_speed(iter/s)": 0.123829 + }, + { + "acc": 0.75511937, + "epoch": 1.1719587778767335, + "grad_norm": 5.0625, + "learning_rate": 3.998521835657554e-06, + "loss": 0.96445255, + "memory(GiB)": 302.58, + "step": 209560, + "train_speed(iter/s)": 0.123834 + }, + { + "acc": 0.75255413, + "epoch": 1.1720706273497128, + "grad_norm": 5.71875, + "learning_rate": 3.99761588650228e-06, + "loss": 0.95643635, + "memory(GiB)": 302.58, + "step": 209580, + "train_speed(iter/s)": 0.12384 + }, + { + "acc": 0.7508955, + "epoch": 1.172182476822692, + "grad_norm": 5.5625, + "learning_rate": 3.996709971631722e-06, + "loss": 0.99657564, + "memory(GiB)": 302.58, + "step": 209600, + "train_speed(iter/s)": 0.123845 + }, + { + "acc": 0.7447052, + "epoch": 1.1722943262956713, + "grad_norm": 8.3125, + "learning_rate": 3.995804091076864e-06, + "loss": 0.99922876, + "memory(GiB)": 302.58, + "step": 209620, + "train_speed(iter/s)": 0.123851 + }, + { + "acc": 0.74369941, + "epoch": 1.1724061757686506, + "grad_norm": 6.71875, + "learning_rate": 3.99489824486869e-06, + "loss": 1.00442162, + "memory(GiB)": 302.58, + "step": 209640, + "train_speed(iter/s)": 0.123857 + }, + { + "acc": 0.75070896, + "epoch": 1.1725180252416298, + "grad_norm": 8.0625, + "learning_rate": 3.993992433038183e-06, + "loss": 0.95339565, + "memory(GiB)": 302.58, + "step": 209660, + "train_speed(iter/s)": 0.123862 + }, + { + "acc": 0.75205927, + "epoch": 1.172629874714609, + "grad_norm": 8.5, + "learning_rate": 3.993086655616324e-06, + "loss": 0.99803667, + "memory(GiB)": 302.58, + "step": 209680, + "train_speed(iter/s)": 0.123868 + }, + { + "acc": 0.74497004, + "epoch": 1.1727417241875884, + "grad_norm": 6.96875, + "learning_rate": 3.9921809126340955e-06, + "loss": 1.00543718, + "memory(GiB)": 302.58, + "step": 209700, + "train_speed(iter/s)": 0.123873 + }, + { + "acc": 0.74068627, + "epoch": 1.1728535736605676, + "grad_norm": 8.625, + "learning_rate": 3.9912752041224755e-06, + "loss": 1.01939058, + "memory(GiB)": 302.58, + "step": 209720, + "train_speed(iter/s)": 0.123879 + }, + { + "acc": 0.75223227, + "epoch": 1.172965423133547, + "grad_norm": 8.1875, + "learning_rate": 3.990369530112442e-06, + "loss": 0.96934643, + "memory(GiB)": 302.58, + "step": 209740, + "train_speed(iter/s)": 0.123884 + }, + { + "acc": 0.76652589, + "epoch": 1.1730772726065262, + "grad_norm": 5.03125, + "learning_rate": 3.989463890634971e-06, + "loss": 0.91217718, + "memory(GiB)": 302.58, + "step": 209760, + "train_speed(iter/s)": 0.12389 + }, + { + "acc": 0.74591794, + "epoch": 1.1731891220795054, + "grad_norm": 4.8125, + "learning_rate": 3.988558285721041e-06, + "loss": 1.02055607, + "memory(GiB)": 302.58, + "step": 209780, + "train_speed(iter/s)": 0.123895 + }, + { + "acc": 0.74426565, + "epoch": 1.1733009715524847, + "grad_norm": 7.46875, + "learning_rate": 3.987652715401623e-06, + "loss": 1.02269859, + "memory(GiB)": 302.58, + "step": 209800, + "train_speed(iter/s)": 0.1239 + }, + { + "acc": 0.7710587, + "epoch": 1.173412821025464, + "grad_norm": 7.53125, + "learning_rate": 3.986747179707693e-06, + "loss": 0.90828228, + "memory(GiB)": 302.58, + "step": 209820, + "train_speed(iter/s)": 0.123906 + }, + { + "acc": 0.74950237, + "epoch": 1.1735246704984432, + "grad_norm": 4.84375, + "learning_rate": 3.985841678670222e-06, + "loss": 0.95613155, + "memory(GiB)": 302.58, + "step": 209840, + "train_speed(iter/s)": 0.123911 + }, + { + "acc": 0.75475936, + "epoch": 1.1736365199714225, + "grad_norm": 8.1875, + "learning_rate": 3.984936212320183e-06, + "loss": 0.9734643, + "memory(GiB)": 302.58, + "step": 209860, + "train_speed(iter/s)": 0.123916 + }, + { + "acc": 0.75332594, + "epoch": 1.1737483694444018, + "grad_norm": 8.0625, + "learning_rate": 3.984030780688543e-06, + "loss": 0.96833344, + "memory(GiB)": 302.58, + "step": 209880, + "train_speed(iter/s)": 0.123922 + }, + { + "acc": 0.74335227, + "epoch": 1.173860218917381, + "grad_norm": 6.75, + "learning_rate": 3.9831253838062715e-06, + "loss": 1.01550379, + "memory(GiB)": 302.58, + "step": 209900, + "train_speed(iter/s)": 0.123927 + }, + { + "acc": 0.74252772, + "epoch": 1.1739720683903603, + "grad_norm": 6.3125, + "learning_rate": 3.982220021704336e-06, + "loss": 1.01104908, + "memory(GiB)": 302.58, + "step": 209920, + "train_speed(iter/s)": 0.123933 + }, + { + "acc": 0.75104828, + "epoch": 1.1740839178633395, + "grad_norm": 9.0625, + "learning_rate": 3.981314694413704e-06, + "loss": 1.00687265, + "memory(GiB)": 302.58, + "step": 209940, + "train_speed(iter/s)": 0.123939 + }, + { + "acc": 0.73460588, + "epoch": 1.1741957673363188, + "grad_norm": 6.125, + "learning_rate": 3.9804094019653395e-06, + "loss": 1.043398, + "memory(GiB)": 302.58, + "step": 209960, + "train_speed(iter/s)": 0.123945 + }, + { + "acc": 0.7621902, + "epoch": 1.174307616809298, + "grad_norm": 16.25, + "learning_rate": 3.9795041443902064e-06, + "loss": 0.92035389, + "memory(GiB)": 302.58, + "step": 209980, + "train_speed(iter/s)": 0.12395 + }, + { + "acc": 0.75792198, + "epoch": 1.1744194662822773, + "grad_norm": 9.625, + "learning_rate": 3.9785989217192675e-06, + "loss": 0.96634312, + "memory(GiB)": 302.58, + "step": 210000, + "train_speed(iter/s)": 0.123956 + }, + { + "epoch": 1.1744194662822773, + "eval_acc": 0.7066111131322793, + "eval_loss": 1.0132148265838623, + "eval_runtime": 7504.2156, + "eval_samples_per_second": 10.032, + "eval_steps_per_second": 10.032, + "step": 210000 + }, + { + "acc": 0.75144839, + "epoch": 1.1745313157552566, + "grad_norm": 8.5, + "learning_rate": 3.977693733983484e-06, + "loss": 0.97635651, + "memory(GiB)": 302.58, + "step": 210020, + "train_speed(iter/s)": 0.123405 + }, + { + "acc": 0.75167012, + "epoch": 1.1746431652282359, + "grad_norm": 10.8125, + "learning_rate": 3.976788581213817e-06, + "loss": 0.99518404, + "memory(GiB)": 302.58, + "step": 210040, + "train_speed(iter/s)": 0.123411 + }, + { + "acc": 0.74736285, + "epoch": 1.1747550147012151, + "grad_norm": 7.09375, + "learning_rate": 3.975883463441224e-06, + "loss": 1.00254698, + "memory(GiB)": 302.58, + "step": 210060, + "train_speed(iter/s)": 0.123417 + }, + { + "acc": 0.74571099, + "epoch": 1.1748668641741944, + "grad_norm": 7.25, + "learning_rate": 3.974978380696664e-06, + "loss": 1.00425301, + "memory(GiB)": 302.58, + "step": 210080, + "train_speed(iter/s)": 0.123422 + }, + { + "acc": 0.74904027, + "epoch": 1.1749787136471737, + "grad_norm": 8.3125, + "learning_rate": 3.974073333011096e-06, + "loss": 0.9920969, + "memory(GiB)": 302.58, + "step": 210100, + "train_speed(iter/s)": 0.123427 + }, + { + "acc": 0.73630342, + "epoch": 1.175090563120153, + "grad_norm": 6.375, + "learning_rate": 3.97316832041547e-06, + "loss": 1.02670145, + "memory(GiB)": 302.58, + "step": 210120, + "train_speed(iter/s)": 0.123433 + }, + { + "acc": 0.7469388, + "epoch": 1.1752024125931322, + "grad_norm": 9.5, + "learning_rate": 3.9722633429407456e-06, + "loss": 0.99789991, + "memory(GiB)": 302.58, + "step": 210140, + "train_speed(iter/s)": 0.123439 + }, + { + "acc": 0.7557363, + "epoch": 1.1753142620661114, + "grad_norm": 8.1875, + "learning_rate": 3.971358400617873e-06, + "loss": 0.97793694, + "memory(GiB)": 302.58, + "step": 210160, + "train_speed(iter/s)": 0.123444 + }, + { + "acc": 0.74460192, + "epoch": 1.1754261115390907, + "grad_norm": 5.75, + "learning_rate": 3.970453493477805e-06, + "loss": 1.01389704, + "memory(GiB)": 302.58, + "step": 210180, + "train_speed(iter/s)": 0.123449 + }, + { + "acc": 0.74632349, + "epoch": 1.17553796101207, + "grad_norm": 6.3125, + "learning_rate": 3.969548621551491e-06, + "loss": 1.00125523, + "memory(GiB)": 302.58, + "step": 210200, + "train_speed(iter/s)": 0.123455 + }, + { + "acc": 0.7519146, + "epoch": 1.1756498104850492, + "grad_norm": 8.125, + "learning_rate": 3.9686437848698825e-06, + "loss": 0.97084341, + "memory(GiB)": 302.58, + "step": 210220, + "train_speed(iter/s)": 0.12346 + }, + { + "acc": 0.74988456, + "epoch": 1.1757616599580285, + "grad_norm": 7.96875, + "learning_rate": 3.9677389834639265e-06, + "loss": 0.9654314, + "memory(GiB)": 302.58, + "step": 210240, + "train_speed(iter/s)": 0.123466 + }, + { + "acc": 0.75492463, + "epoch": 1.1758735094310078, + "grad_norm": 4.75, + "learning_rate": 3.966834217364572e-06, + "loss": 0.95788012, + "memory(GiB)": 302.58, + "step": 210260, + "train_speed(iter/s)": 0.123471 + }, + { + "acc": 0.74427691, + "epoch": 1.175985358903987, + "grad_norm": 8.5625, + "learning_rate": 3.965929486602762e-06, + "loss": 1.0132308, + "memory(GiB)": 302.58, + "step": 210280, + "train_speed(iter/s)": 0.123477 + }, + { + "acc": 0.74391799, + "epoch": 1.1760972083769663, + "grad_norm": 9.1875, + "learning_rate": 3.965024791209443e-06, + "loss": 1.01908455, + "memory(GiB)": 302.58, + "step": 210300, + "train_speed(iter/s)": 0.123483 + }, + { + "acc": 0.75846357, + "epoch": 1.1762090578499456, + "grad_norm": 10.125, + "learning_rate": 3.964120131215557e-06, + "loss": 0.94023161, + "memory(GiB)": 302.58, + "step": 210320, + "train_speed(iter/s)": 0.123488 + }, + { + "acc": 0.75449529, + "epoch": 1.1763209073229248, + "grad_norm": 6.0, + "learning_rate": 3.963215506652047e-06, + "loss": 0.97629089, + "memory(GiB)": 302.58, + "step": 210340, + "train_speed(iter/s)": 0.123494 + }, + { + "acc": 0.75933423, + "epoch": 1.176432756795904, + "grad_norm": 8.375, + "learning_rate": 3.962310917549856e-06, + "loss": 0.93635283, + "memory(GiB)": 302.58, + "step": 210360, + "train_speed(iter/s)": 0.123499 + }, + { + "acc": 0.76676464, + "epoch": 1.1765446062688834, + "grad_norm": 8.8125, + "learning_rate": 3.961406363939921e-06, + "loss": 0.91478157, + "memory(GiB)": 302.58, + "step": 210380, + "train_speed(iter/s)": 0.123505 + }, + { + "acc": 0.75707259, + "epoch": 1.1766564557418626, + "grad_norm": 8.25, + "learning_rate": 3.960501845853183e-06, + "loss": 0.95968456, + "memory(GiB)": 302.58, + "step": 210400, + "train_speed(iter/s)": 0.123511 + }, + { + "acc": 0.73759513, + "epoch": 1.1767683052148419, + "grad_norm": 8.75, + "learning_rate": 3.959597363320577e-06, + "loss": 1.03166208, + "memory(GiB)": 302.58, + "step": 210420, + "train_speed(iter/s)": 0.123516 + }, + { + "acc": 0.74926934, + "epoch": 1.1768801546878211, + "grad_norm": 7.28125, + "learning_rate": 3.958692916373039e-06, + "loss": 0.98596334, + "memory(GiB)": 302.58, + "step": 210440, + "train_speed(iter/s)": 0.123522 + }, + { + "acc": 0.74581265, + "epoch": 1.1769920041608004, + "grad_norm": 8.3125, + "learning_rate": 3.957788505041507e-06, + "loss": 1.01054916, + "memory(GiB)": 302.58, + "step": 210460, + "train_speed(iter/s)": 0.123527 + }, + { + "acc": 0.72867503, + "epoch": 1.1771038536337797, + "grad_norm": 6.84375, + "learning_rate": 3.956884129356911e-06, + "loss": 1.07137318, + "memory(GiB)": 302.58, + "step": 210480, + "train_speed(iter/s)": 0.123533 + }, + { + "acc": 0.74000597, + "epoch": 1.177215703106759, + "grad_norm": 8.75, + "learning_rate": 3.9559797893501865e-06, + "loss": 1.02397366, + "memory(GiB)": 302.58, + "step": 210500, + "train_speed(iter/s)": 0.123538 + }, + { + "acc": 0.75613794, + "epoch": 1.1773275525797382, + "grad_norm": 8.0625, + "learning_rate": 3.955075485052264e-06, + "loss": 0.96698074, + "memory(GiB)": 302.58, + "step": 210520, + "train_speed(iter/s)": 0.123544 + }, + { + "acc": 0.74983459, + "epoch": 1.1774394020527175, + "grad_norm": 8.6875, + "learning_rate": 3.9541712164940734e-06, + "loss": 0.96144199, + "memory(GiB)": 302.58, + "step": 210540, + "train_speed(iter/s)": 0.123549 + }, + { + "acc": 0.76285954, + "epoch": 1.1775512515256967, + "grad_norm": 5.375, + "learning_rate": 3.953266983706543e-06, + "loss": 0.93285761, + "memory(GiB)": 302.58, + "step": 210560, + "train_speed(iter/s)": 0.123555 + }, + { + "acc": 0.75107131, + "epoch": 1.177663100998676, + "grad_norm": 6.0, + "learning_rate": 3.952362786720601e-06, + "loss": 0.97527609, + "memory(GiB)": 302.58, + "step": 210580, + "train_speed(iter/s)": 0.123561 + }, + { + "acc": 0.75798826, + "epoch": 1.1777749504716553, + "grad_norm": 10.5625, + "learning_rate": 3.951458625567173e-06, + "loss": 0.95214863, + "memory(GiB)": 302.58, + "step": 210600, + "train_speed(iter/s)": 0.123566 + }, + { + "acc": 0.75607777, + "epoch": 1.1778867999446345, + "grad_norm": 10.8125, + "learning_rate": 3.950554500277185e-06, + "loss": 0.94069691, + "memory(GiB)": 302.58, + "step": 210620, + "train_speed(iter/s)": 0.123572 + }, + { + "acc": 0.75952644, + "epoch": 1.1779986494176138, + "grad_norm": 6.71875, + "learning_rate": 3.949650410881561e-06, + "loss": 0.93958559, + "memory(GiB)": 302.58, + "step": 210640, + "train_speed(iter/s)": 0.123577 + }, + { + "acc": 0.75346465, + "epoch": 1.178110498890593, + "grad_norm": 6.34375, + "learning_rate": 3.948746357411224e-06, + "loss": 0.98735132, + "memory(GiB)": 302.58, + "step": 210660, + "train_speed(iter/s)": 0.123583 + }, + { + "acc": 0.75934443, + "epoch": 1.1782223483635723, + "grad_norm": 9.0625, + "learning_rate": 3.947842339897095e-06, + "loss": 0.94624615, + "memory(GiB)": 302.58, + "step": 210680, + "train_speed(iter/s)": 0.123588 + }, + { + "acc": 0.76593332, + "epoch": 1.1783341978365516, + "grad_norm": 7.09375, + "learning_rate": 3.946938358370092e-06, + "loss": 0.92965164, + "memory(GiB)": 302.58, + "step": 210700, + "train_speed(iter/s)": 0.123594 + }, + { + "acc": 0.7453021, + "epoch": 1.1784460473095308, + "grad_norm": 6.3125, + "learning_rate": 3.9460344128611385e-06, + "loss": 1.007724, + "memory(GiB)": 302.58, + "step": 210720, + "train_speed(iter/s)": 0.1236 + }, + { + "acc": 0.7427475, + "epoch": 1.17855789678251, + "grad_norm": 7.5, + "learning_rate": 3.94513050340115e-06, + "loss": 1.01590414, + "memory(GiB)": 302.58, + "step": 210740, + "train_speed(iter/s)": 0.123605 + }, + { + "acc": 0.74303017, + "epoch": 1.1786697462554894, + "grad_norm": 7.0625, + "learning_rate": 3.944226630021043e-06, + "loss": 1.02470112, + "memory(GiB)": 302.58, + "step": 210760, + "train_speed(iter/s)": 0.123611 + }, + { + "acc": 0.74721603, + "epoch": 1.1787815957284686, + "grad_norm": 6.71875, + "learning_rate": 3.943322792751734e-06, + "loss": 1.00684156, + "memory(GiB)": 302.58, + "step": 210780, + "train_speed(iter/s)": 0.123616 + }, + { + "acc": 0.75926766, + "epoch": 1.178893445201448, + "grad_norm": 8.0, + "learning_rate": 3.942418991624133e-06, + "loss": 0.94370775, + "memory(GiB)": 302.58, + "step": 210800, + "train_speed(iter/s)": 0.123622 + }, + { + "acc": 0.74952569, + "epoch": 1.1790052946744272, + "grad_norm": 8.6875, + "learning_rate": 3.941515226669159e-06, + "loss": 0.9828187, + "memory(GiB)": 302.58, + "step": 210820, + "train_speed(iter/s)": 0.123627 + }, + { + "acc": 0.75388713, + "epoch": 1.1791171441474064, + "grad_norm": 8.6875, + "learning_rate": 3.94061149791772e-06, + "loss": 0.97799091, + "memory(GiB)": 302.58, + "step": 210840, + "train_speed(iter/s)": 0.123632 + }, + { + "acc": 0.75252962, + "epoch": 1.1792289936203857, + "grad_norm": 8.125, + "learning_rate": 3.939707805400727e-06, + "loss": 0.97697802, + "memory(GiB)": 302.58, + "step": 210860, + "train_speed(iter/s)": 0.123638 + }, + { + "acc": 0.72876048, + "epoch": 1.179340843093365, + "grad_norm": 6.625, + "learning_rate": 3.938804149149089e-06, + "loss": 1.06902428, + "memory(GiB)": 302.58, + "step": 210880, + "train_speed(iter/s)": 0.123643 + }, + { + "acc": 0.74685736, + "epoch": 1.1794526925663442, + "grad_norm": 6.3125, + "learning_rate": 3.9379005291937145e-06, + "loss": 0.98904772, + "memory(GiB)": 302.58, + "step": 210900, + "train_speed(iter/s)": 0.123649 + }, + { + "acc": 0.75199466, + "epoch": 1.1795645420393235, + "grad_norm": 7.25, + "learning_rate": 3.936996945565509e-06, + "loss": 0.96608019, + "memory(GiB)": 302.58, + "step": 210920, + "train_speed(iter/s)": 0.123654 + }, + { + "acc": 0.73714747, + "epoch": 1.1796763915123027, + "grad_norm": 7.25, + "learning_rate": 3.9360933982953785e-06, + "loss": 1.02576132, + "memory(GiB)": 302.58, + "step": 210940, + "train_speed(iter/s)": 0.123659 + }, + { + "acc": 0.74729738, + "epoch": 1.179788240985282, + "grad_norm": 5.6875, + "learning_rate": 3.935189887414227e-06, + "loss": 1.00115957, + "memory(GiB)": 302.58, + "step": 210960, + "train_speed(iter/s)": 0.123665 + }, + { + "acc": 0.75892606, + "epoch": 1.1799000904582613, + "grad_norm": 8.375, + "learning_rate": 3.934286412952959e-06, + "loss": 0.95294104, + "memory(GiB)": 302.58, + "step": 210980, + "train_speed(iter/s)": 0.12367 + }, + { + "acc": 0.7481463, + "epoch": 1.1800119399312405, + "grad_norm": 9.5, + "learning_rate": 3.933382974942475e-06, + "loss": 0.9883852, + "memory(GiB)": 302.58, + "step": 211000, + "train_speed(iter/s)": 0.123676 + }, + { + "acc": 0.74809585, + "epoch": 1.1801237894042198, + "grad_norm": 8.1875, + "learning_rate": 3.932479573413674e-06, + "loss": 1.01141243, + "memory(GiB)": 302.58, + "step": 211020, + "train_speed(iter/s)": 0.123682 + }, + { + "acc": 0.76121111, + "epoch": 1.180235638877199, + "grad_norm": 7.0, + "learning_rate": 3.931576208397458e-06, + "loss": 0.92241316, + "memory(GiB)": 302.58, + "step": 211040, + "train_speed(iter/s)": 0.123687 + }, + { + "acc": 0.75008769, + "epoch": 1.1803474883501783, + "grad_norm": 7.875, + "learning_rate": 3.9306728799247235e-06, + "loss": 0.99632626, + "memory(GiB)": 302.58, + "step": 211060, + "train_speed(iter/s)": 0.123694 + }, + { + "acc": 0.75022664, + "epoch": 1.1804593378231576, + "grad_norm": 5.34375, + "learning_rate": 3.929769588026366e-06, + "loss": 0.98165741, + "memory(GiB)": 302.58, + "step": 211080, + "train_speed(iter/s)": 0.123699 + }, + { + "acc": 0.75518341, + "epoch": 1.1805711872961369, + "grad_norm": 7.0, + "learning_rate": 3.928866332733283e-06, + "loss": 0.96316137, + "memory(GiB)": 302.58, + "step": 211100, + "train_speed(iter/s)": 0.123705 + }, + { + "acc": 0.75107331, + "epoch": 1.1806830367691161, + "grad_norm": 6.71875, + "learning_rate": 3.927963114076368e-06, + "loss": 0.98312397, + "memory(GiB)": 302.58, + "step": 211120, + "train_speed(iter/s)": 0.12371 + }, + { + "acc": 0.75213022, + "epoch": 1.1807948862420954, + "grad_norm": 8.0, + "learning_rate": 3.927059932086514e-06, + "loss": 0.99290142, + "memory(GiB)": 302.58, + "step": 211140, + "train_speed(iter/s)": 0.123716 + }, + { + "acc": 0.74515424, + "epoch": 1.1809067357150747, + "grad_norm": 7.15625, + "learning_rate": 3.926156786794613e-06, + "loss": 1.01221838, + "memory(GiB)": 302.58, + "step": 211160, + "train_speed(iter/s)": 0.123721 + }, + { + "acc": 0.74894934, + "epoch": 1.181018585188054, + "grad_norm": 7.4375, + "learning_rate": 3.925253678231554e-06, + "loss": 0.98988323, + "memory(GiB)": 302.58, + "step": 211180, + "train_speed(iter/s)": 0.123726 + }, + { + "acc": 0.73841586, + "epoch": 1.1811304346610332, + "grad_norm": 8.25, + "learning_rate": 3.9243506064282275e-06, + "loss": 1.04389153, + "memory(GiB)": 302.58, + "step": 211200, + "train_speed(iter/s)": 0.123732 + }, + { + "acc": 0.74367685, + "epoch": 1.1812422841340124, + "grad_norm": 5.78125, + "learning_rate": 3.92344757141552e-06, + "loss": 1.01387444, + "memory(GiB)": 302.58, + "step": 211220, + "train_speed(iter/s)": 0.123737 + }, + { + "acc": 0.75372338, + "epoch": 1.1813541336069917, + "grad_norm": 6.46875, + "learning_rate": 3.92254457322432e-06, + "loss": 0.97929516, + "memory(GiB)": 302.58, + "step": 211240, + "train_speed(iter/s)": 0.123743 + }, + { + "acc": 0.73387389, + "epoch": 1.181465983079971, + "grad_norm": 5.1875, + "learning_rate": 3.921641611885512e-06, + "loss": 1.04152393, + "memory(GiB)": 302.58, + "step": 211260, + "train_speed(iter/s)": 0.123748 + }, + { + "acc": 0.74579272, + "epoch": 1.1815778325529502, + "grad_norm": 5.3125, + "learning_rate": 3.920738687429981e-06, + "loss": 1.02710924, + "memory(GiB)": 302.58, + "step": 211280, + "train_speed(iter/s)": 0.123754 + }, + { + "acc": 0.7573072, + "epoch": 1.1816896820259295, + "grad_norm": 5.9375, + "learning_rate": 3.919835799888608e-06, + "loss": 0.95750399, + "memory(GiB)": 302.58, + "step": 211300, + "train_speed(iter/s)": 0.123759 + }, + { + "acc": 0.74737201, + "epoch": 1.1818015314989088, + "grad_norm": 10.0625, + "learning_rate": 3.918932949292277e-06, + "loss": 1.00731192, + "memory(GiB)": 302.58, + "step": 211320, + "train_speed(iter/s)": 0.123765 + }, + { + "acc": 0.74617186, + "epoch": 1.181913380971888, + "grad_norm": 7.15625, + "learning_rate": 3.918030135671867e-06, + "loss": 0.98812475, + "memory(GiB)": 302.58, + "step": 211340, + "train_speed(iter/s)": 0.12377 + }, + { + "acc": 0.75272346, + "epoch": 1.1820252304448673, + "grad_norm": 5.90625, + "learning_rate": 3.9171273590582555e-06, + "loss": 0.98981209, + "memory(GiB)": 302.58, + "step": 211360, + "train_speed(iter/s)": 0.123776 + }, + { + "acc": 0.74733362, + "epoch": 1.1821370799178466, + "grad_norm": 7.3125, + "learning_rate": 3.916224619482323e-06, + "loss": 1.01110592, + "memory(GiB)": 302.58, + "step": 211380, + "train_speed(iter/s)": 0.123781 + }, + { + "acc": 0.75541725, + "epoch": 1.1822489293908258, + "grad_norm": 6.3125, + "learning_rate": 3.915321916974946e-06, + "loss": 0.96220093, + "memory(GiB)": 302.58, + "step": 211400, + "train_speed(iter/s)": 0.123786 + }, + { + "acc": 0.76163573, + "epoch": 1.182360778863805, + "grad_norm": 5.125, + "learning_rate": 3.914419251566998e-06, + "loss": 0.94123678, + "memory(GiB)": 302.58, + "step": 211420, + "train_speed(iter/s)": 0.123792 + }, + { + "acc": 0.7577446, + "epoch": 1.1824726283367843, + "grad_norm": 8.25, + "learning_rate": 3.913516623289354e-06, + "loss": 0.9751689, + "memory(GiB)": 302.58, + "step": 211440, + "train_speed(iter/s)": 0.123797 + }, + { + "acc": 0.74511571, + "epoch": 1.1825844778097636, + "grad_norm": 7.15625, + "learning_rate": 3.912614032172887e-06, + "loss": 0.9941412, + "memory(GiB)": 302.58, + "step": 211460, + "train_speed(iter/s)": 0.123802 + }, + { + "acc": 0.76022182, + "epoch": 1.1826963272827429, + "grad_norm": 6.03125, + "learning_rate": 3.911711478248467e-06, + "loss": 0.9215229, + "memory(GiB)": 302.58, + "step": 211480, + "train_speed(iter/s)": 0.123807 + }, + { + "acc": 0.75409522, + "epoch": 1.1828081767557221, + "grad_norm": 10.5, + "learning_rate": 3.910808961546966e-06, + "loss": 0.96361828, + "memory(GiB)": 302.58, + "step": 211500, + "train_speed(iter/s)": 0.123813 + }, + { + "acc": 0.7489707, + "epoch": 1.1829200262287014, + "grad_norm": 6.59375, + "learning_rate": 3.909906482099252e-06, + "loss": 0.99881783, + "memory(GiB)": 302.58, + "step": 211520, + "train_speed(iter/s)": 0.123818 + }, + { + "acc": 0.77261829, + "epoch": 1.1830318757016807, + "grad_norm": 7.9375, + "learning_rate": 3.9090040399361934e-06, + "loss": 0.86859617, + "memory(GiB)": 302.58, + "step": 211540, + "train_speed(iter/s)": 0.123824 + }, + { + "acc": 0.72973251, + "epoch": 1.18314372517466, + "grad_norm": 5.5, + "learning_rate": 3.9081016350886556e-06, + "loss": 1.09407597, + "memory(GiB)": 302.58, + "step": 211560, + "train_speed(iter/s)": 0.123829 + }, + { + "acc": 0.75913405, + "epoch": 1.1832555746476392, + "grad_norm": 7.65625, + "learning_rate": 3.907199267587505e-06, + "loss": 0.91896362, + "memory(GiB)": 302.58, + "step": 211580, + "train_speed(iter/s)": 0.123835 + }, + { + "acc": 0.75744863, + "epoch": 1.1833674241206185, + "grad_norm": 8.9375, + "learning_rate": 3.9062969374636036e-06, + "loss": 0.97040644, + "memory(GiB)": 302.58, + "step": 211600, + "train_speed(iter/s)": 0.12384 + }, + { + "acc": 0.73951583, + "epoch": 1.1834792735935977, + "grad_norm": 8.5625, + "learning_rate": 3.905394644747815e-06, + "loss": 1.02190971, + "memory(GiB)": 302.58, + "step": 211620, + "train_speed(iter/s)": 0.123846 + }, + { + "acc": 0.75976973, + "epoch": 1.183591123066577, + "grad_norm": 11.0, + "learning_rate": 3.904492389471001e-06, + "loss": 0.92189283, + "memory(GiB)": 302.58, + "step": 211640, + "train_speed(iter/s)": 0.123851 + }, + { + "acc": 0.72830338, + "epoch": 1.1837029725395563, + "grad_norm": 7.3125, + "learning_rate": 3.9035901716640215e-06, + "loss": 1.09331932, + "memory(GiB)": 302.58, + "step": 211660, + "train_speed(iter/s)": 0.123856 + }, + { + "acc": 0.76315598, + "epoch": 1.1838148220125355, + "grad_norm": 5.15625, + "learning_rate": 3.902687991357734e-06, + "loss": 0.92904186, + "memory(GiB)": 302.58, + "step": 211680, + "train_speed(iter/s)": 0.123862 + }, + { + "acc": 0.75407343, + "epoch": 1.1839266714855148, + "grad_norm": 9.8125, + "learning_rate": 3.9017858485829975e-06, + "loss": 0.95581484, + "memory(GiB)": 302.58, + "step": 211700, + "train_speed(iter/s)": 0.123868 + }, + { + "acc": 0.74802227, + "epoch": 1.184038520958494, + "grad_norm": 6.15625, + "learning_rate": 3.900883743370666e-06, + "loss": 0.9859746, + "memory(GiB)": 302.58, + "step": 211720, + "train_speed(iter/s)": 0.123874 + }, + { + "acc": 0.75553484, + "epoch": 1.1841503704314733, + "grad_norm": 7.65625, + "learning_rate": 3.899981675751597e-06, + "loss": 0.95620289, + "memory(GiB)": 302.58, + "step": 211740, + "train_speed(iter/s)": 0.123879 + }, + { + "acc": 0.74703579, + "epoch": 1.1842622199044526, + "grad_norm": 7.40625, + "learning_rate": 3.899079645756641e-06, + "loss": 1.00265751, + "memory(GiB)": 302.58, + "step": 211760, + "train_speed(iter/s)": 0.123885 + }, + { + "acc": 0.76391931, + "epoch": 1.1843740693774318, + "grad_norm": 8.6875, + "learning_rate": 3.898177653416654e-06, + "loss": 0.90055475, + "memory(GiB)": 302.58, + "step": 211780, + "train_speed(iter/s)": 0.12389 + }, + { + "acc": 0.75066895, + "epoch": 1.184485918850411, + "grad_norm": 7.84375, + "learning_rate": 3.8972756987624835e-06, + "loss": 1.01754379, + "memory(GiB)": 302.58, + "step": 211800, + "train_speed(iter/s)": 0.123896 + }, + { + "acc": 0.73968592, + "epoch": 1.1845977683233904, + "grad_norm": 7.34375, + "learning_rate": 3.896373781824982e-06, + "loss": 1.00878849, + "memory(GiB)": 302.58, + "step": 211820, + "train_speed(iter/s)": 0.123901 + }, + { + "acc": 0.76005931, + "epoch": 1.1847096177963696, + "grad_norm": 8.1875, + "learning_rate": 3.895471902634996e-06, + "loss": 0.94643421, + "memory(GiB)": 302.58, + "step": 211840, + "train_speed(iter/s)": 0.123907 + }, + { + "acc": 0.74120107, + "epoch": 1.184821467269349, + "grad_norm": 8.125, + "learning_rate": 3.894570061223374e-06, + "loss": 1.03026104, + "memory(GiB)": 302.58, + "step": 211860, + "train_speed(iter/s)": 0.123912 + }, + { + "acc": 0.74085808, + "epoch": 1.1849333167423282, + "grad_norm": 7.5625, + "learning_rate": 3.893668257620961e-06, + "loss": 1.02471104, + "memory(GiB)": 302.58, + "step": 211880, + "train_speed(iter/s)": 0.123917 + }, + { + "acc": 0.74413581, + "epoch": 1.1850451662153074, + "grad_norm": 8.25, + "learning_rate": 3.8927664918586e-06, + "loss": 1.03411474, + "memory(GiB)": 302.58, + "step": 211900, + "train_speed(iter/s)": 0.123923 + }, + { + "acc": 0.7462038, + "epoch": 1.1851570156882867, + "grad_norm": 9.25, + "learning_rate": 3.891864763967137e-06, + "loss": 0.98632679, + "memory(GiB)": 302.58, + "step": 211920, + "train_speed(iter/s)": 0.123928 + }, + { + "acc": 0.75328817, + "epoch": 1.185268865161266, + "grad_norm": 7.71875, + "learning_rate": 3.890963073977413e-06, + "loss": 0.96387882, + "memory(GiB)": 302.58, + "step": 211940, + "train_speed(iter/s)": 0.123934 + }, + { + "acc": 0.73638678, + "epoch": 1.1853807146342452, + "grad_norm": 4.1875, + "learning_rate": 3.890061421920267e-06, + "loss": 1.0753479, + "memory(GiB)": 302.58, + "step": 211960, + "train_speed(iter/s)": 0.12394 + }, + { + "acc": 0.75322442, + "epoch": 1.1854925641072245, + "grad_norm": 4.8125, + "learning_rate": 3.889159807826541e-06, + "loss": 0.96244144, + "memory(GiB)": 302.58, + "step": 211980, + "train_speed(iter/s)": 0.123945 + }, + { + "acc": 0.74474654, + "epoch": 1.1856044135802037, + "grad_norm": 9.25, + "learning_rate": 3.888258231727073e-06, + "loss": 1.00315313, + "memory(GiB)": 302.58, + "step": 212000, + "train_speed(iter/s)": 0.123951 + }, + { + "epoch": 1.1856044135802037, + "eval_acc": 0.7066513393098786, + "eval_loss": 1.0129332542419434, + "eval_runtime": 7507.6294, + "eval_samples_per_second": 10.028, + "eval_steps_per_second": 10.028, + "step": 212000 + }, + { + "acc": 0.71940312, + "epoch": 1.185716263053183, + "grad_norm": 5.1875, + "learning_rate": 3.887356693652699e-06, + "loss": 1.0927742, + "memory(GiB)": 302.58, + "step": 212020, + "train_speed(iter/s)": 0.123405 + }, + { + "acc": 0.7516202, + "epoch": 1.1858281125261623, + "grad_norm": 9.25, + "learning_rate": 3.886455193634253e-06, + "loss": 0.97201643, + "memory(GiB)": 302.58, + "step": 212040, + "train_speed(iter/s)": 0.123411 + }, + { + "acc": 0.73837376, + "epoch": 1.1859399619991415, + "grad_norm": 5.0625, + "learning_rate": 3.88555373170257e-06, + "loss": 1.04244614, + "memory(GiB)": 302.58, + "step": 212060, + "train_speed(iter/s)": 0.123416 + }, + { + "acc": 0.73213739, + "epoch": 1.1860518114721208, + "grad_norm": 10.4375, + "learning_rate": 3.884652307888484e-06, + "loss": 1.04006252, + "memory(GiB)": 302.58, + "step": 212080, + "train_speed(iter/s)": 0.123422 + }, + { + "acc": 0.74561868, + "epoch": 1.1861636609451, + "grad_norm": 6.5625, + "learning_rate": 3.883750922222825e-06, + "loss": 1.03068638, + "memory(GiB)": 302.58, + "step": 212100, + "train_speed(iter/s)": 0.123428 + }, + { + "acc": 0.74781265, + "epoch": 1.1862755104180793, + "grad_norm": 6.59375, + "learning_rate": 3.882849574736425e-06, + "loss": 0.98946381, + "memory(GiB)": 302.58, + "step": 212120, + "train_speed(iter/s)": 0.123433 + }, + { + "acc": 0.75330033, + "epoch": 1.1863873598910586, + "grad_norm": 6.9375, + "learning_rate": 3.881948265460112e-06, + "loss": 0.96152506, + "memory(GiB)": 302.58, + "step": 212140, + "train_speed(iter/s)": 0.123439 + }, + { + "acc": 0.752455, + "epoch": 1.1864992093640379, + "grad_norm": 7.59375, + "learning_rate": 3.881046994424712e-06, + "loss": 0.96265087, + "memory(GiB)": 302.58, + "step": 212160, + "train_speed(iter/s)": 0.123444 + }, + { + "acc": 0.76581349, + "epoch": 1.1866110588370171, + "grad_norm": 7.3125, + "learning_rate": 3.880145761661055e-06, + "loss": 0.88547325, + "memory(GiB)": 302.58, + "step": 212180, + "train_speed(iter/s)": 0.12345 + }, + { + "acc": 0.75066977, + "epoch": 1.1867229083099964, + "grad_norm": 7.15625, + "learning_rate": 3.879244567199962e-06, + "loss": 0.98392324, + "memory(GiB)": 302.58, + "step": 212200, + "train_speed(iter/s)": 0.123455 + }, + { + "acc": 0.7353888, + "epoch": 1.1868347577829756, + "grad_norm": 5.75, + "learning_rate": 3.878343411072259e-06, + "loss": 1.05950394, + "memory(GiB)": 302.58, + "step": 212220, + "train_speed(iter/s)": 0.12346 + }, + { + "acc": 0.73163137, + "epoch": 1.186946607255955, + "grad_norm": 7.53125, + "learning_rate": 3.877442293308768e-06, + "loss": 1.05332718, + "memory(GiB)": 302.58, + "step": 212240, + "train_speed(iter/s)": 0.123465 + }, + { + "acc": 0.72962132, + "epoch": 1.1870584567289342, + "grad_norm": 8.625, + "learning_rate": 3.876541213940311e-06, + "loss": 1.06347532, + "memory(GiB)": 302.58, + "step": 212260, + "train_speed(iter/s)": 0.123471 + }, + { + "acc": 0.76970215, + "epoch": 1.1871703062019134, + "grad_norm": 6.75, + "learning_rate": 3.875640172997707e-06, + "loss": 0.9065526, + "memory(GiB)": 302.58, + "step": 212280, + "train_speed(iter/s)": 0.123477 + }, + { + "acc": 0.73931293, + "epoch": 1.1872821556748927, + "grad_norm": 8.1875, + "learning_rate": 3.874739170511775e-06, + "loss": 1.03506975, + "memory(GiB)": 302.58, + "step": 212300, + "train_speed(iter/s)": 0.123482 + }, + { + "acc": 0.75101972, + "epoch": 1.187394005147872, + "grad_norm": 7.625, + "learning_rate": 3.87383820651333e-06, + "loss": 0.97457962, + "memory(GiB)": 302.58, + "step": 212320, + "train_speed(iter/s)": 0.123488 + }, + { + "acc": 0.76111917, + "epoch": 1.1875058546208512, + "grad_norm": 4.59375, + "learning_rate": 3.872937281033189e-06, + "loss": 0.95302525, + "memory(GiB)": 302.58, + "step": 212340, + "train_speed(iter/s)": 0.123493 + }, + { + "acc": 0.75362129, + "epoch": 1.1876177040938305, + "grad_norm": 6.53125, + "learning_rate": 3.872036394102167e-06, + "loss": 0.95521069, + "memory(GiB)": 302.58, + "step": 212360, + "train_speed(iter/s)": 0.123498 + }, + { + "acc": 0.75084653, + "epoch": 1.1877295535668098, + "grad_norm": 8.6875, + "learning_rate": 3.8711355457510766e-06, + "loss": 0.97315092, + "memory(GiB)": 302.58, + "step": 212380, + "train_speed(iter/s)": 0.123504 + }, + { + "acc": 0.75772972, + "epoch": 1.187841403039789, + "grad_norm": 7.0625, + "learning_rate": 3.8702347360107315e-06, + "loss": 0.95645752, + "memory(GiB)": 302.58, + "step": 212400, + "train_speed(iter/s)": 0.123509 + }, + { + "acc": 0.74811134, + "epoch": 1.1879532525127683, + "grad_norm": 7.0625, + "learning_rate": 3.86933396491194e-06, + "loss": 0.99192724, + "memory(GiB)": 302.58, + "step": 212420, + "train_speed(iter/s)": 0.123515 + }, + { + "acc": 0.77197046, + "epoch": 1.1880651019857476, + "grad_norm": 7.1875, + "learning_rate": 3.868433232485512e-06, + "loss": 0.890382, + "memory(GiB)": 302.58, + "step": 212440, + "train_speed(iter/s)": 0.12352 + }, + { + "acc": 0.74114614, + "epoch": 1.1881769514587268, + "grad_norm": 7.90625, + "learning_rate": 3.867532538762257e-06, + "loss": 1.03116446, + "memory(GiB)": 302.58, + "step": 212460, + "train_speed(iter/s)": 0.123526 + }, + { + "acc": 0.75733361, + "epoch": 1.188288800931706, + "grad_norm": 7.96875, + "learning_rate": 3.866631883772979e-06, + "loss": 0.9534337, + "memory(GiB)": 302.58, + "step": 212480, + "train_speed(iter/s)": 0.123531 + }, + { + "acc": 0.73597994, + "epoch": 1.1884006504046853, + "grad_norm": 8.0, + "learning_rate": 3.865731267548483e-06, + "loss": 1.05486803, + "memory(GiB)": 302.58, + "step": 212500, + "train_speed(iter/s)": 0.123537 + }, + { + "acc": 0.76490479, + "epoch": 1.1885124998776646, + "grad_norm": 6.65625, + "learning_rate": 3.864830690119576e-06, + "loss": 0.92573462, + "memory(GiB)": 302.58, + "step": 212520, + "train_speed(iter/s)": 0.123542 + }, + { + "acc": 0.75184956, + "epoch": 1.1886243493506439, + "grad_norm": 9.5625, + "learning_rate": 3.863930151517059e-06, + "loss": 0.97999496, + "memory(GiB)": 302.58, + "step": 212540, + "train_speed(iter/s)": 0.123547 + }, + { + "acc": 0.7349196, + "epoch": 1.1887361988236231, + "grad_norm": 7.53125, + "learning_rate": 3.863029651771733e-06, + "loss": 1.03317041, + "memory(GiB)": 302.58, + "step": 212560, + "train_speed(iter/s)": 0.123553 + }, + { + "acc": 0.73761311, + "epoch": 1.1888480482966024, + "grad_norm": 8.1875, + "learning_rate": 3.8621291909143975e-06, + "loss": 1.04418755, + "memory(GiB)": 302.58, + "step": 212580, + "train_speed(iter/s)": 0.123558 + }, + { + "acc": 0.76224604, + "epoch": 1.1889598977695817, + "grad_norm": 7.40625, + "learning_rate": 3.861228768975853e-06, + "loss": 0.91101131, + "memory(GiB)": 302.58, + "step": 212600, + "train_speed(iter/s)": 0.123564 + }, + { + "acc": 0.75122352, + "epoch": 1.189071747242561, + "grad_norm": 7.71875, + "learning_rate": 3.860328385986894e-06, + "loss": 0.96737709, + "memory(GiB)": 302.58, + "step": 212620, + "train_speed(iter/s)": 0.123569 + }, + { + "acc": 0.76269126, + "epoch": 1.1891835967155402, + "grad_norm": 6.3125, + "learning_rate": 3.859428041978318e-06, + "loss": 0.93318443, + "memory(GiB)": 302.58, + "step": 212640, + "train_speed(iter/s)": 0.123575 + }, + { + "acc": 0.75982618, + "epoch": 1.1892954461885195, + "grad_norm": 7.9375, + "learning_rate": 3.858527736980921e-06, + "loss": 0.93580856, + "memory(GiB)": 302.58, + "step": 212660, + "train_speed(iter/s)": 0.12358 + }, + { + "acc": 0.74172201, + "epoch": 1.1894072956614987, + "grad_norm": 9.25, + "learning_rate": 3.857627471025494e-06, + "loss": 1.02343025, + "memory(GiB)": 302.58, + "step": 212680, + "train_speed(iter/s)": 0.123586 + }, + { + "acc": 0.75945745, + "epoch": 1.189519145134478, + "grad_norm": 6.4375, + "learning_rate": 3.85672724414283e-06, + "loss": 0.95219297, + "memory(GiB)": 302.58, + "step": 212700, + "train_speed(iter/s)": 0.123591 + }, + { + "acc": 0.75154114, + "epoch": 1.1896309946074572, + "grad_norm": 7.78125, + "learning_rate": 3.8558270563637205e-06, + "loss": 0.98694191, + "memory(GiB)": 302.58, + "step": 212720, + "train_speed(iter/s)": 0.123597 + }, + { + "acc": 0.74715028, + "epoch": 1.1897428440804365, + "grad_norm": 6.71875, + "learning_rate": 3.854926907718953e-06, + "loss": 1.00041313, + "memory(GiB)": 302.58, + "step": 212740, + "train_speed(iter/s)": 0.123602 + }, + { + "acc": 0.74955416, + "epoch": 1.1898546935534158, + "grad_norm": 8.375, + "learning_rate": 3.854026798239316e-06, + "loss": 0.97968006, + "memory(GiB)": 302.58, + "step": 212760, + "train_speed(iter/s)": 0.123608 + }, + { + "acc": 0.74047179, + "epoch": 1.189966543026395, + "grad_norm": 6.78125, + "learning_rate": 3.853126727955595e-06, + "loss": 1.02406902, + "memory(GiB)": 302.58, + "step": 212780, + "train_speed(iter/s)": 0.123614 + }, + { + "acc": 0.7536581, + "epoch": 1.1900783924993743, + "grad_norm": 7.90625, + "learning_rate": 3.852226696898579e-06, + "loss": 0.96441641, + "memory(GiB)": 302.58, + "step": 212800, + "train_speed(iter/s)": 0.123619 + }, + { + "acc": 0.72147784, + "epoch": 1.1901902419723536, + "grad_norm": 9.25, + "learning_rate": 3.8513267050990474e-06, + "loss": 1.10114889, + "memory(GiB)": 302.58, + "step": 212820, + "train_speed(iter/s)": 0.123624 + }, + { + "acc": 0.75702496, + "epoch": 1.1903020914453328, + "grad_norm": 9.25, + "learning_rate": 3.850426752587787e-06, + "loss": 0.95306187, + "memory(GiB)": 302.58, + "step": 212840, + "train_speed(iter/s)": 0.123629 + }, + { + "acc": 0.77360368, + "epoch": 1.190413940918312, + "grad_norm": 7.3125, + "learning_rate": 3.849526839395575e-06, + "loss": 0.91109743, + "memory(GiB)": 302.58, + "step": 212860, + "train_speed(iter/s)": 0.123635 + }, + { + "acc": 0.75685511, + "epoch": 1.1905257903912914, + "grad_norm": 8.0, + "learning_rate": 3.848626965553194e-06, + "loss": 0.94338264, + "memory(GiB)": 302.58, + "step": 212880, + "train_speed(iter/s)": 0.12364 + }, + { + "acc": 0.75748262, + "epoch": 1.1906376398642706, + "grad_norm": 7.5, + "learning_rate": 3.847727131091422e-06, + "loss": 0.95650129, + "memory(GiB)": 302.58, + "step": 212900, + "train_speed(iter/s)": 0.123646 + }, + { + "acc": 0.76252966, + "epoch": 1.1907494893372499, + "grad_norm": 6.0625, + "learning_rate": 3.846827336041034e-06, + "loss": 0.94019775, + "memory(GiB)": 302.58, + "step": 212920, + "train_speed(iter/s)": 0.123652 + }, + { + "acc": 0.76220312, + "epoch": 1.1908613388102292, + "grad_norm": 7.3125, + "learning_rate": 3.845927580432809e-06, + "loss": 0.9402935, + "memory(GiB)": 302.58, + "step": 212940, + "train_speed(iter/s)": 0.123657 + }, + { + "acc": 0.76465192, + "epoch": 1.1909731882832084, + "grad_norm": 6.0625, + "learning_rate": 3.84502786429752e-06, + "loss": 0.92664032, + "memory(GiB)": 302.58, + "step": 212960, + "train_speed(iter/s)": 0.123662 + }, + { + "acc": 0.7590488, + "epoch": 1.1910850377561877, + "grad_norm": 7.9375, + "learning_rate": 3.844128187665941e-06, + "loss": 0.95652761, + "memory(GiB)": 302.58, + "step": 212980, + "train_speed(iter/s)": 0.123668 + }, + { + "acc": 0.73713984, + "epoch": 1.191196887229167, + "grad_norm": 6.21875, + "learning_rate": 3.843228550568843e-06, + "loss": 1.02857018, + "memory(GiB)": 302.58, + "step": 213000, + "train_speed(iter/s)": 0.123674 + }, + { + "acc": 0.72847509, + "epoch": 1.1913087367021462, + "grad_norm": 5.71875, + "learning_rate": 3.842328953036997e-06, + "loss": 1.06811562, + "memory(GiB)": 302.58, + "step": 213020, + "train_speed(iter/s)": 0.123679 + }, + { + "acc": 0.7404974, + "epoch": 1.1914205861751255, + "grad_norm": 5.375, + "learning_rate": 3.84142939510117e-06, + "loss": 1.0069788, + "memory(GiB)": 302.58, + "step": 213040, + "train_speed(iter/s)": 0.123685 + }, + { + "acc": 0.74460711, + "epoch": 1.1915324356481047, + "grad_norm": 8.6875, + "learning_rate": 3.840529876792132e-06, + "loss": 0.998563, + "memory(GiB)": 302.58, + "step": 213060, + "train_speed(iter/s)": 0.12369 + }, + { + "acc": 0.73779306, + "epoch": 1.191644285121084, + "grad_norm": 5.59375, + "learning_rate": 3.83963039814065e-06, + "loss": 1.06029682, + "memory(GiB)": 302.58, + "step": 213080, + "train_speed(iter/s)": 0.123696 + }, + { + "acc": 0.74689207, + "epoch": 1.1917561345940633, + "grad_norm": 4.9375, + "learning_rate": 3.838730959177487e-06, + "loss": 0.99649906, + "memory(GiB)": 302.58, + "step": 213100, + "train_speed(iter/s)": 0.123701 + }, + { + "acc": 0.74096336, + "epoch": 1.1918679840670425, + "grad_norm": 4.46875, + "learning_rate": 3.837831559933408e-06, + "loss": 1.01969891, + "memory(GiB)": 302.58, + "step": 213120, + "train_speed(iter/s)": 0.123707 + }, + { + "acc": 0.74995604, + "epoch": 1.1919798335400218, + "grad_norm": 7.1875, + "learning_rate": 3.836932200439174e-06, + "loss": 0.96473236, + "memory(GiB)": 302.58, + "step": 213140, + "train_speed(iter/s)": 0.123713 + }, + { + "acc": 0.74352355, + "epoch": 1.192091683013001, + "grad_norm": 7.125, + "learning_rate": 3.836032880725547e-06, + "loss": 1.01614342, + "memory(GiB)": 302.58, + "step": 213160, + "train_speed(iter/s)": 0.123718 + }, + { + "acc": 0.72831683, + "epoch": 1.1922035324859803, + "grad_norm": 6.3125, + "learning_rate": 3.835133600823285e-06, + "loss": 1.06632509, + "memory(GiB)": 302.58, + "step": 213180, + "train_speed(iter/s)": 0.123724 + }, + { + "acc": 0.75875659, + "epoch": 1.1923153819589596, + "grad_norm": 7.53125, + "learning_rate": 3.834234360763149e-06, + "loss": 0.94054585, + "memory(GiB)": 302.58, + "step": 213200, + "train_speed(iter/s)": 0.123729 + }, + { + "acc": 0.74951763, + "epoch": 1.1924272314319388, + "grad_norm": 5.6875, + "learning_rate": 3.833335160575894e-06, + "loss": 0.99517908, + "memory(GiB)": 302.58, + "step": 213220, + "train_speed(iter/s)": 0.123734 + }, + { + "acc": 0.73620725, + "epoch": 1.1925390809049181, + "grad_norm": 9.3125, + "learning_rate": 3.8324360002922756e-06, + "loss": 1.05035906, + "memory(GiB)": 302.58, + "step": 213240, + "train_speed(iter/s)": 0.12374 + }, + { + "acc": 0.75373874, + "epoch": 1.1926509303778974, + "grad_norm": 7.25, + "learning_rate": 3.8315368799430485e-06, + "loss": 0.94127464, + "memory(GiB)": 302.58, + "step": 213260, + "train_speed(iter/s)": 0.123746 + }, + { + "acc": 0.7432312, + "epoch": 1.1927627798508766, + "grad_norm": 7.1875, + "learning_rate": 3.830637799558966e-06, + "loss": 1.00214424, + "memory(GiB)": 302.58, + "step": 213280, + "train_speed(iter/s)": 0.123751 + }, + { + "acc": 0.74970789, + "epoch": 1.192874629323856, + "grad_norm": 6.9375, + "learning_rate": 3.829738759170778e-06, + "loss": 0.98464355, + "memory(GiB)": 302.58, + "step": 213300, + "train_speed(iter/s)": 0.123757 + }, + { + "acc": 0.7555316, + "epoch": 1.1929864787968352, + "grad_norm": 5.40625, + "learning_rate": 3.828839758809236e-06, + "loss": 0.96249266, + "memory(GiB)": 302.58, + "step": 213320, + "train_speed(iter/s)": 0.123762 + }, + { + "acc": 0.76260715, + "epoch": 1.1930983282698144, + "grad_norm": 6.65625, + "learning_rate": 3.827940798505088e-06, + "loss": 0.92711287, + "memory(GiB)": 302.58, + "step": 213340, + "train_speed(iter/s)": 0.123767 + }, + { + "acc": 0.75523543, + "epoch": 1.1932101777427937, + "grad_norm": 8.4375, + "learning_rate": 3.82704187828908e-06, + "loss": 0.96967125, + "memory(GiB)": 302.58, + "step": 213360, + "train_speed(iter/s)": 0.123773 + }, + { + "acc": 0.75691748, + "epoch": 1.193322027215773, + "grad_norm": 6.125, + "learning_rate": 3.826142998191959e-06, + "loss": 0.95897541, + "memory(GiB)": 302.58, + "step": 213380, + "train_speed(iter/s)": 0.123779 + }, + { + "acc": 0.75384316, + "epoch": 1.1934338766887522, + "grad_norm": 7.53125, + "learning_rate": 3.82524415824447e-06, + "loss": 0.949053, + "memory(GiB)": 302.58, + "step": 213400, + "train_speed(iter/s)": 0.123784 + }, + { + "acc": 0.75872779, + "epoch": 1.1935457261617315, + "grad_norm": 6.125, + "learning_rate": 3.824345358477357e-06, + "loss": 0.94814148, + "memory(GiB)": 302.58, + "step": 213420, + "train_speed(iter/s)": 0.123789 + }, + { + "acc": 0.76080513, + "epoch": 1.1936575756347108, + "grad_norm": 6.96875, + "learning_rate": 3.82344659892136e-06, + "loss": 0.94161577, + "memory(GiB)": 302.58, + "step": 213440, + "train_speed(iter/s)": 0.123795 + }, + { + "acc": 0.74977574, + "epoch": 1.19376942510769, + "grad_norm": 4.0625, + "learning_rate": 3.822547879607219e-06, + "loss": 0.98701773, + "memory(GiB)": 302.58, + "step": 213460, + "train_speed(iter/s)": 0.1238 + }, + { + "acc": 0.74323611, + "epoch": 1.1938812745806693, + "grad_norm": 10.625, + "learning_rate": 3.8216492005656754e-06, + "loss": 0.99535017, + "memory(GiB)": 302.58, + "step": 213480, + "train_speed(iter/s)": 0.123806 + }, + { + "acc": 0.73687291, + "epoch": 1.1939931240536485, + "grad_norm": 6.59375, + "learning_rate": 3.820750561827466e-06, + "loss": 1.05051413, + "memory(GiB)": 302.58, + "step": 213500, + "train_speed(iter/s)": 0.123811 + }, + { + "acc": 0.74507217, + "epoch": 1.1941049735266278, + "grad_norm": 6.65625, + "learning_rate": 3.819851963423325e-06, + "loss": 1.01358109, + "memory(GiB)": 302.58, + "step": 213520, + "train_speed(iter/s)": 0.123817 + }, + { + "acc": 0.74055495, + "epoch": 1.194216822999607, + "grad_norm": 8.3125, + "learning_rate": 3.81895340538399e-06, + "loss": 1.01057262, + "memory(GiB)": 302.58, + "step": 213540, + "train_speed(iter/s)": 0.123822 + }, + { + "acc": 0.75062099, + "epoch": 1.1943286724725863, + "grad_norm": 7.6875, + "learning_rate": 3.8180548877401936e-06, + "loss": 0.96830082, + "memory(GiB)": 302.58, + "step": 213560, + "train_speed(iter/s)": 0.123828 + }, + { + "acc": 0.74777193, + "epoch": 1.1944405219455656, + "grad_norm": 6.3125, + "learning_rate": 3.817156410522667e-06, + "loss": 1.00886421, + "memory(GiB)": 302.58, + "step": 213580, + "train_speed(iter/s)": 0.123834 + }, + { + "acc": 0.75264053, + "epoch": 1.1945523714185449, + "grad_norm": 9.3125, + "learning_rate": 3.816257973762143e-06, + "loss": 0.9772543, + "memory(GiB)": 302.58, + "step": 213600, + "train_speed(iter/s)": 0.123839 + }, + { + "acc": 0.76782408, + "epoch": 1.1946642208915241, + "grad_norm": 8.625, + "learning_rate": 3.81535957748935e-06, + "loss": 0.89522657, + "memory(GiB)": 302.58, + "step": 213620, + "train_speed(iter/s)": 0.123845 + }, + { + "acc": 0.75491066, + "epoch": 1.1947760703645034, + "grad_norm": 6.75, + "learning_rate": 3.8144612217350153e-06, + "loss": 0.97616873, + "memory(GiB)": 302.58, + "step": 213640, + "train_speed(iter/s)": 0.123851 + }, + { + "acc": 0.73767333, + "epoch": 1.1948879198374827, + "grad_norm": 7.0625, + "learning_rate": 3.813562906529865e-06, + "loss": 1.04741888, + "memory(GiB)": 302.58, + "step": 213660, + "train_speed(iter/s)": 0.123856 + }, + { + "acc": 0.75822911, + "epoch": 1.194999769310462, + "grad_norm": 6.40625, + "learning_rate": 3.8126646319046266e-06, + "loss": 0.96106625, + "memory(GiB)": 302.58, + "step": 213680, + "train_speed(iter/s)": 0.123861 + }, + { + "acc": 0.73394675, + "epoch": 1.1951116187834412, + "grad_norm": 6.9375, + "learning_rate": 3.811766397890023e-06, + "loss": 1.06266708, + "memory(GiB)": 302.58, + "step": 213700, + "train_speed(iter/s)": 0.123867 + }, + { + "acc": 0.74165888, + "epoch": 1.1952234682564205, + "grad_norm": 4.90625, + "learning_rate": 3.8108682045167757e-06, + "loss": 1.02613096, + "memory(GiB)": 302.58, + "step": 213720, + "train_speed(iter/s)": 0.123873 + }, + { + "acc": 0.75067859, + "epoch": 1.1953353177293997, + "grad_norm": 6.21875, + "learning_rate": 3.809970051815607e-06, + "loss": 0.97767582, + "memory(GiB)": 302.58, + "step": 213740, + "train_speed(iter/s)": 0.123878 + }, + { + "acc": 0.76172109, + "epoch": 1.195447167202379, + "grad_norm": 7.90625, + "learning_rate": 3.8090719398172356e-06, + "loss": 0.95724716, + "memory(GiB)": 302.58, + "step": 213760, + "train_speed(iter/s)": 0.123883 + }, + { + "acc": 0.75175161, + "epoch": 1.1955590166753582, + "grad_norm": 6.28125, + "learning_rate": 3.8081738685523793e-06, + "loss": 0.98228388, + "memory(GiB)": 302.58, + "step": 213780, + "train_speed(iter/s)": 0.123889 + }, + { + "acc": 0.76065874, + "epoch": 1.1956708661483375, + "grad_norm": 9.6875, + "learning_rate": 3.8072758380517573e-06, + "loss": 0.93117561, + "memory(GiB)": 302.58, + "step": 213800, + "train_speed(iter/s)": 0.123894 + }, + { + "acc": 0.75315804, + "epoch": 1.1957827156213168, + "grad_norm": 6.125, + "learning_rate": 3.8063778483460826e-06, + "loss": 0.96825771, + "memory(GiB)": 302.58, + "step": 213820, + "train_speed(iter/s)": 0.1239 + }, + { + "acc": 0.75676842, + "epoch": 1.195894565094296, + "grad_norm": 8.875, + "learning_rate": 3.8054798994660713e-06, + "loss": 0.96487141, + "memory(GiB)": 302.58, + "step": 213840, + "train_speed(iter/s)": 0.123906 + }, + { + "acc": 0.76843581, + "epoch": 1.1960064145672753, + "grad_norm": 8.8125, + "learning_rate": 3.8045819914424353e-06, + "loss": 0.89505949, + "memory(GiB)": 302.58, + "step": 213860, + "train_speed(iter/s)": 0.123911 + }, + { + "acc": 0.75362358, + "epoch": 1.1961182640402546, + "grad_norm": 9.625, + "learning_rate": 3.8036841243058853e-06, + "loss": 0.96892748, + "memory(GiB)": 302.58, + "step": 213880, + "train_speed(iter/s)": 0.123917 + }, + { + "acc": 0.76127882, + "epoch": 1.1962301135132338, + "grad_norm": 6.71875, + "learning_rate": 3.8027862980871316e-06, + "loss": 0.93070126, + "memory(GiB)": 302.58, + "step": 213900, + "train_speed(iter/s)": 0.123922 + }, + { + "acc": 0.74790263, + "epoch": 1.196341962986213, + "grad_norm": 5.84375, + "learning_rate": 3.801888512816882e-06, + "loss": 0.98417616, + "memory(GiB)": 302.58, + "step": 213920, + "train_speed(iter/s)": 0.123928 + }, + { + "acc": 0.76353722, + "epoch": 1.1964538124591924, + "grad_norm": 8.375, + "learning_rate": 3.8009907685258453e-06, + "loss": 0.93420172, + "memory(GiB)": 302.58, + "step": 213940, + "train_speed(iter/s)": 0.123933 + }, + { + "acc": 0.77002072, + "epoch": 1.1965656619321716, + "grad_norm": 6.5, + "learning_rate": 3.8000930652447264e-06, + "loss": 0.89192972, + "memory(GiB)": 302.58, + "step": 213960, + "train_speed(iter/s)": 0.123939 + }, + { + "acc": 0.76567793, + "epoch": 1.1966775114051509, + "grad_norm": 7.375, + "learning_rate": 3.7991954030042287e-06, + "loss": 0.90749292, + "memory(GiB)": 302.58, + "step": 213980, + "train_speed(iter/s)": 0.123944 + }, + { + "acc": 0.7453321, + "epoch": 1.1967893608781301, + "grad_norm": 6.9375, + "learning_rate": 3.7982977818350568e-06, + "loss": 0.98534784, + "memory(GiB)": 302.58, + "step": 214000, + "train_speed(iter/s)": 0.12395 + }, + { + "epoch": 1.1967893608781301, + "eval_acc": 0.706677762387321, + "eval_loss": 1.012924075126648, + "eval_runtime": 7498.0777, + "eval_samples_per_second": 10.04, + "eval_steps_per_second": 10.04, + "step": 214000 + }, + { + "acc": 0.73442035, + "epoch": 1.1969012103511094, + "grad_norm": 6.5625, + "learning_rate": 3.7974002017679112e-06, + "loss": 1.0631649, + "memory(GiB)": 302.58, + "step": 214020, + "train_speed(iter/s)": 0.12341 + }, + { + "acc": 0.76193395, + "epoch": 1.1970130598240887, + "grad_norm": 10.3125, + "learning_rate": 3.7965026628334915e-06, + "loss": 0.93999891, + "memory(GiB)": 302.58, + "step": 214040, + "train_speed(iter/s)": 0.123415 + }, + { + "acc": 0.753021, + "epoch": 1.197124909297068, + "grad_norm": 11.375, + "learning_rate": 3.7956051650624963e-06, + "loss": 0.96467991, + "memory(GiB)": 302.58, + "step": 214060, + "train_speed(iter/s)": 0.123421 + }, + { + "acc": 0.75617051, + "epoch": 1.1972367587700472, + "grad_norm": 7.875, + "learning_rate": 3.794707708485624e-06, + "loss": 0.93423367, + "memory(GiB)": 302.58, + "step": 214080, + "train_speed(iter/s)": 0.123426 + }, + { + "acc": 0.76576233, + "epoch": 1.1973486082430265, + "grad_norm": 6.5625, + "learning_rate": 3.7938102931335707e-06, + "loss": 0.93288555, + "memory(GiB)": 302.58, + "step": 214100, + "train_speed(iter/s)": 0.123432 + }, + { + "acc": 0.75052214, + "epoch": 1.1974604577160057, + "grad_norm": 9.0, + "learning_rate": 3.7929129190370303e-06, + "loss": 1.00427475, + "memory(GiB)": 302.58, + "step": 214120, + "train_speed(iter/s)": 0.123437 + }, + { + "acc": 0.74971557, + "epoch": 1.197572307188985, + "grad_norm": 4.875, + "learning_rate": 3.792015586226695e-06, + "loss": 1.00714912, + "memory(GiB)": 302.58, + "step": 214140, + "train_speed(iter/s)": 0.123443 + }, + { + "acc": 0.76935935, + "epoch": 1.1976841566619643, + "grad_norm": 7.46875, + "learning_rate": 3.791118294733257e-06, + "loss": 0.92549925, + "memory(GiB)": 302.58, + "step": 214160, + "train_speed(iter/s)": 0.123448 + }, + { + "acc": 0.75181608, + "epoch": 1.1977960061349435, + "grad_norm": 7.15625, + "learning_rate": 3.7902210445874066e-06, + "loss": 0.97584715, + "memory(GiB)": 302.58, + "step": 214180, + "train_speed(iter/s)": 0.123453 + }, + { + "acc": 0.7548656, + "epoch": 1.1979078556079228, + "grad_norm": 7.1875, + "learning_rate": 3.7893238358198314e-06, + "loss": 0.9677186, + "memory(GiB)": 302.58, + "step": 214200, + "train_speed(iter/s)": 0.123458 + }, + { + "acc": 0.74203143, + "epoch": 1.198019705080902, + "grad_norm": 8.9375, + "learning_rate": 3.7884266684612215e-06, + "loss": 1.02421818, + "memory(GiB)": 302.58, + "step": 214220, + "train_speed(iter/s)": 0.123464 + }, + { + "acc": 0.75547724, + "epoch": 1.1981315545538813, + "grad_norm": 6.0625, + "learning_rate": 3.7875295425422607e-06, + "loss": 0.95829868, + "memory(GiB)": 302.58, + "step": 214240, + "train_speed(iter/s)": 0.123469 + }, + { + "acc": 0.76084137, + "epoch": 1.1982434040268606, + "grad_norm": 7.5625, + "learning_rate": 3.7866324580936354e-06, + "loss": 0.9311018, + "memory(GiB)": 302.58, + "step": 214260, + "train_speed(iter/s)": 0.123475 + }, + { + "acc": 0.77163839, + "epoch": 1.1983552534998398, + "grad_norm": 8.5625, + "learning_rate": 3.785735415146026e-06, + "loss": 0.89071455, + "memory(GiB)": 302.58, + "step": 214280, + "train_speed(iter/s)": 0.12348 + }, + { + "acc": 0.7531189, + "epoch": 1.198467102972819, + "grad_norm": 6.1875, + "learning_rate": 3.784838413730117e-06, + "loss": 0.94092903, + "memory(GiB)": 302.58, + "step": 214300, + "train_speed(iter/s)": 0.123486 + }, + { + "acc": 0.76692386, + "epoch": 1.1985789524457984, + "grad_norm": 8.625, + "learning_rate": 3.7839414538765866e-06, + "loss": 0.91641922, + "memory(GiB)": 302.58, + "step": 214320, + "train_speed(iter/s)": 0.123491 + }, + { + "acc": 0.75266581, + "epoch": 1.1986908019187776, + "grad_norm": 7.875, + "learning_rate": 3.783044535616114e-06, + "loss": 0.96375618, + "memory(GiB)": 302.58, + "step": 214340, + "train_speed(iter/s)": 0.123497 + }, + { + "acc": 0.7537528, + "epoch": 1.198802651391757, + "grad_norm": 8.25, + "learning_rate": 3.7821476589793777e-06, + "loss": 0.97017012, + "memory(GiB)": 302.58, + "step": 214360, + "train_speed(iter/s)": 0.123502 + }, + { + "acc": 0.7509202, + "epoch": 1.1989145008647362, + "grad_norm": 8.375, + "learning_rate": 3.7812508239970536e-06, + "loss": 0.96829023, + "memory(GiB)": 302.58, + "step": 214380, + "train_speed(iter/s)": 0.123508 + }, + { + "acc": 0.73879995, + "epoch": 1.1990263503377154, + "grad_norm": 6.625, + "learning_rate": 3.7803540306998154e-06, + "loss": 1.04036379, + "memory(GiB)": 302.58, + "step": 214400, + "train_speed(iter/s)": 0.123513 + }, + { + "acc": 0.75669661, + "epoch": 1.1991381998106947, + "grad_norm": 7.71875, + "learning_rate": 3.779457279118337e-06, + "loss": 0.9379528, + "memory(GiB)": 302.58, + "step": 214420, + "train_speed(iter/s)": 0.123519 + }, + { + "acc": 0.74079199, + "epoch": 1.199250049283674, + "grad_norm": 6.90625, + "learning_rate": 3.77856056928329e-06, + "loss": 1.00742464, + "memory(GiB)": 302.58, + "step": 214440, + "train_speed(iter/s)": 0.123524 + }, + { + "acc": 0.76570458, + "epoch": 1.1993618987566532, + "grad_norm": 12.0625, + "learning_rate": 3.7776639012253426e-06, + "loss": 0.92153816, + "memory(GiB)": 302.58, + "step": 214460, + "train_speed(iter/s)": 0.123529 + }, + { + "acc": 0.74802871, + "epoch": 1.1994737482296325, + "grad_norm": 7.53125, + "learning_rate": 3.776767274975167e-06, + "loss": 0.987432, + "memory(GiB)": 302.58, + "step": 214480, + "train_speed(iter/s)": 0.123535 + }, + { + "acc": 0.74461203, + "epoch": 1.1995855977026118, + "grad_norm": 4.28125, + "learning_rate": 3.77587069056343e-06, + "loss": 1.00653954, + "memory(GiB)": 302.58, + "step": 214500, + "train_speed(iter/s)": 0.12354 + }, + { + "acc": 0.74367232, + "epoch": 1.199697447175591, + "grad_norm": 6.3125, + "learning_rate": 3.774974148020797e-06, + "loss": 1.00895309, + "memory(GiB)": 302.58, + "step": 214520, + "train_speed(iter/s)": 0.123545 + }, + { + "acc": 0.74561486, + "epoch": 1.1998092966485703, + "grad_norm": 6.65625, + "learning_rate": 3.7740776473779316e-06, + "loss": 0.97568321, + "memory(GiB)": 302.58, + "step": 214540, + "train_speed(iter/s)": 0.123551 + }, + { + "acc": 0.73968992, + "epoch": 1.1999211461215495, + "grad_norm": 8.125, + "learning_rate": 3.773181188665499e-06, + "loss": 1.01023579, + "memory(GiB)": 302.58, + "step": 214560, + "train_speed(iter/s)": 0.123556 + }, + { + "acc": 0.76722813, + "epoch": 1.2000329955945288, + "grad_norm": 8.625, + "learning_rate": 3.7722847719141597e-06, + "loss": 0.90303774, + "memory(GiB)": 302.58, + "step": 214580, + "train_speed(iter/s)": 0.123562 + }, + { + "acc": 0.76281672, + "epoch": 1.200144845067508, + "grad_norm": 8.0625, + "learning_rate": 3.771388397154574e-06, + "loss": 0.94157801, + "memory(GiB)": 302.58, + "step": 214600, + "train_speed(iter/s)": 0.123568 + }, + { + "acc": 0.74389434, + "epoch": 1.2002566945404873, + "grad_norm": 7.1875, + "learning_rate": 3.7704920644174015e-06, + "loss": 1.00793476, + "memory(GiB)": 302.58, + "step": 214620, + "train_speed(iter/s)": 0.123573 + }, + { + "acc": 0.73435564, + "epoch": 1.2003685440134666, + "grad_norm": 10.4375, + "learning_rate": 3.769595773733299e-06, + "loss": 1.05236549, + "memory(GiB)": 302.58, + "step": 214640, + "train_speed(iter/s)": 0.123578 + }, + { + "acc": 0.74239106, + "epoch": 1.2004803934864459, + "grad_norm": 7.75, + "learning_rate": 3.7686995251329205e-06, + "loss": 1.01122093, + "memory(GiB)": 302.58, + "step": 214660, + "train_speed(iter/s)": 0.123584 + }, + { + "acc": 0.75247793, + "epoch": 1.2005922429594251, + "grad_norm": 6.0625, + "learning_rate": 3.767803318646925e-06, + "loss": 0.97773952, + "memory(GiB)": 302.58, + "step": 214680, + "train_speed(iter/s)": 0.12359 + }, + { + "acc": 0.77513719, + "epoch": 1.2007040924324044, + "grad_norm": 9.25, + "learning_rate": 3.766907154305962e-06, + "loss": 0.87748775, + "memory(GiB)": 302.58, + "step": 214700, + "train_speed(iter/s)": 0.123595 + }, + { + "acc": 0.76473637, + "epoch": 1.2008159419053837, + "grad_norm": 6.5, + "learning_rate": 3.766011032140685e-06, + "loss": 0.91129503, + "memory(GiB)": 302.58, + "step": 214720, + "train_speed(iter/s)": 0.1236 + }, + { + "acc": 0.7629437, + "epoch": 1.200927791378363, + "grad_norm": 8.3125, + "learning_rate": 3.765114952181744e-06, + "loss": 0.93378572, + "memory(GiB)": 302.58, + "step": 214740, + "train_speed(iter/s)": 0.123606 + }, + { + "acc": 0.74350739, + "epoch": 1.2010396408513422, + "grad_norm": 5.90625, + "learning_rate": 3.764218914459786e-06, + "loss": 1.00085897, + "memory(GiB)": 302.58, + "step": 214760, + "train_speed(iter/s)": 0.123611 + }, + { + "acc": 0.76651206, + "epoch": 1.2011514903243214, + "grad_norm": 6.375, + "learning_rate": 3.763322919005461e-06, + "loss": 0.9045269, + "memory(GiB)": 302.58, + "step": 214780, + "train_speed(iter/s)": 0.123616 + }, + { + "acc": 0.75706091, + "epoch": 1.2012633397973007, + "grad_norm": 8.75, + "learning_rate": 3.7624269658494123e-06, + "loss": 0.95809727, + "memory(GiB)": 302.58, + "step": 214800, + "train_speed(iter/s)": 0.123622 + }, + { + "acc": 0.75643268, + "epoch": 1.20137518927028, + "grad_norm": 7.28125, + "learning_rate": 3.7615310550222863e-06, + "loss": 0.96974363, + "memory(GiB)": 302.58, + "step": 214820, + "train_speed(iter/s)": 0.123627 + }, + { + "acc": 0.74932585, + "epoch": 1.2014870387432592, + "grad_norm": 8.0625, + "learning_rate": 3.7606351865547253e-06, + "loss": 0.99139929, + "memory(GiB)": 302.58, + "step": 214840, + "train_speed(iter/s)": 0.123632 + }, + { + "acc": 0.74140625, + "epoch": 1.2015988882162385, + "grad_norm": 6.0625, + "learning_rate": 3.7597393604773715e-06, + "loss": 1.01331415, + "memory(GiB)": 302.58, + "step": 214860, + "train_speed(iter/s)": 0.123638 + }, + { + "acc": 0.7629899, + "epoch": 1.2017107376892178, + "grad_norm": 6.90625, + "learning_rate": 3.758843576820863e-06, + "loss": 0.89870615, + "memory(GiB)": 302.58, + "step": 214880, + "train_speed(iter/s)": 0.123643 + }, + { + "acc": 0.74182048, + "epoch": 1.201822587162197, + "grad_norm": 7.78125, + "learning_rate": 3.7579478356158404e-06, + "loss": 1.0062129, + "memory(GiB)": 302.58, + "step": 214900, + "train_speed(iter/s)": 0.123649 + }, + { + "acc": 0.74833035, + "epoch": 1.2019344366351763, + "grad_norm": 7.5, + "learning_rate": 3.757052136892941e-06, + "loss": 1.00267773, + "memory(GiB)": 302.58, + "step": 214920, + "train_speed(iter/s)": 0.123654 + }, + { + "acc": 0.74551735, + "epoch": 1.2020462861081556, + "grad_norm": 9.5625, + "learning_rate": 3.7561564806827977e-06, + "loss": 0.98462296, + "memory(GiB)": 302.58, + "step": 214940, + "train_speed(iter/s)": 0.12366 + }, + { + "acc": 0.73815427, + "epoch": 1.2021581355811348, + "grad_norm": 5.0625, + "learning_rate": 3.7552608670160486e-06, + "loss": 1.01620674, + "memory(GiB)": 302.58, + "step": 214960, + "train_speed(iter/s)": 0.123665 + }, + { + "acc": 0.74845958, + "epoch": 1.202269985054114, + "grad_norm": 6.75, + "learning_rate": 3.7543652959233244e-06, + "loss": 0.99603882, + "memory(GiB)": 302.58, + "step": 214980, + "train_speed(iter/s)": 0.12367 + }, + { + "acc": 0.74818649, + "epoch": 1.2023818345270934, + "grad_norm": 8.0, + "learning_rate": 3.753469767435257e-06, + "loss": 0.97471638, + "memory(GiB)": 302.58, + "step": 215000, + "train_speed(iter/s)": 0.123675 + }, + { + "acc": 0.76278996, + "epoch": 1.2024936840000726, + "grad_norm": 9.875, + "learning_rate": 3.7525742815824767e-06, + "loss": 0.94613829, + "memory(GiB)": 302.58, + "step": 215020, + "train_speed(iter/s)": 0.123681 + }, + { + "acc": 0.76213756, + "epoch": 1.2026055334730519, + "grad_norm": 6.375, + "learning_rate": 3.751678838395611e-06, + "loss": 0.95115633, + "memory(GiB)": 302.58, + "step": 215040, + "train_speed(iter/s)": 0.123687 + }, + { + "acc": 0.74004149, + "epoch": 1.2027173829460311, + "grad_norm": 7.09375, + "learning_rate": 3.7507834379052877e-06, + "loss": 1.01338511, + "memory(GiB)": 302.58, + "step": 215060, + "train_speed(iter/s)": 0.123692 + }, + { + "acc": 0.7463666, + "epoch": 1.2028292324190104, + "grad_norm": 7.03125, + "learning_rate": 3.74988808014213e-06, + "loss": 0.98980703, + "memory(GiB)": 302.58, + "step": 215080, + "train_speed(iter/s)": 0.123697 + }, + { + "acc": 0.77444906, + "epoch": 1.2029410818919897, + "grad_norm": 7.3125, + "learning_rate": 3.7489927651367657e-06, + "loss": 0.8758461, + "memory(GiB)": 302.58, + "step": 215100, + "train_speed(iter/s)": 0.123703 + }, + { + "acc": 0.75677094, + "epoch": 1.203052931364969, + "grad_norm": 5.8125, + "learning_rate": 3.7480974929198155e-06, + "loss": 0.95518398, + "memory(GiB)": 302.58, + "step": 215120, + "train_speed(iter/s)": 0.123709 + }, + { + "acc": 0.74450836, + "epoch": 1.2031647808379482, + "grad_norm": 7.65625, + "learning_rate": 3.7472022635219008e-06, + "loss": 1.00012608, + "memory(GiB)": 302.58, + "step": 215140, + "train_speed(iter/s)": 0.123714 + }, + { + "acc": 0.74947329, + "epoch": 1.2032766303109275, + "grad_norm": 7.6875, + "learning_rate": 3.7463070769736414e-06, + "loss": 0.96967926, + "memory(GiB)": 302.58, + "step": 215160, + "train_speed(iter/s)": 0.123719 + }, + { + "acc": 0.75000386, + "epoch": 1.2033884797839067, + "grad_norm": 7.125, + "learning_rate": 3.745411933305655e-06, + "loss": 0.97690382, + "memory(GiB)": 302.58, + "step": 215180, + "train_speed(iter/s)": 0.123724 + }, + { + "acc": 0.7613142, + "epoch": 1.203500329256886, + "grad_norm": 4.5, + "learning_rate": 3.7445168325485593e-06, + "loss": 0.95954514, + "memory(GiB)": 302.58, + "step": 215200, + "train_speed(iter/s)": 0.123729 + }, + { + "acc": 0.76258945, + "epoch": 1.2036121787298653, + "grad_norm": 9.5, + "learning_rate": 3.7436217747329673e-06, + "loss": 0.94484472, + "memory(GiB)": 302.58, + "step": 215220, + "train_speed(iter/s)": 0.123735 + }, + { + "acc": 0.74833875, + "epoch": 1.2037240282028445, + "grad_norm": 8.0, + "learning_rate": 3.7427267598894955e-06, + "loss": 0.99268579, + "memory(GiB)": 302.58, + "step": 215240, + "train_speed(iter/s)": 0.12374 + }, + { + "acc": 0.74781518, + "epoch": 1.2038358776758238, + "grad_norm": 6.46875, + "learning_rate": 3.7418317880487552e-06, + "loss": 0.98559217, + "memory(GiB)": 302.58, + "step": 215260, + "train_speed(iter/s)": 0.123745 + }, + { + "acc": 0.76061683, + "epoch": 1.203947727148803, + "grad_norm": 6.15625, + "learning_rate": 3.7409368592413574e-06, + "loss": 0.94123869, + "memory(GiB)": 302.58, + "step": 215280, + "train_speed(iter/s)": 0.123751 + }, + { + "acc": 0.7599791, + "epoch": 1.2040595766217823, + "grad_norm": 9.5, + "learning_rate": 3.740041973497911e-06, + "loss": 0.94181929, + "memory(GiB)": 302.58, + "step": 215300, + "train_speed(iter/s)": 0.123756 + }, + { + "acc": 0.74668531, + "epoch": 1.2041714260947616, + "grad_norm": 7.15625, + "learning_rate": 3.739147130849025e-06, + "loss": 0.97491302, + "memory(GiB)": 302.58, + "step": 215320, + "train_speed(iter/s)": 0.123761 + }, + { + "acc": 0.75721188, + "epoch": 1.2042832755677408, + "grad_norm": 8.0, + "learning_rate": 3.7382523313253037e-06, + "loss": 0.96834536, + "memory(GiB)": 302.58, + "step": 215340, + "train_speed(iter/s)": 0.123767 + }, + { + "acc": 0.74436369, + "epoch": 1.20439512504072, + "grad_norm": 9.0, + "learning_rate": 3.7373575749573552e-06, + "loss": 1.0128479, + "memory(GiB)": 302.58, + "step": 215360, + "train_speed(iter/s)": 0.123773 + }, + { + "acc": 0.75673513, + "epoch": 1.2045069745136994, + "grad_norm": 9.125, + "learning_rate": 3.736462861775781e-06, + "loss": 0.97985992, + "memory(GiB)": 302.58, + "step": 215380, + "train_speed(iter/s)": 0.123778 + }, + { + "acc": 0.74647894, + "epoch": 1.2046188239866786, + "grad_norm": 6.40625, + "learning_rate": 3.735568191811183e-06, + "loss": 1.01124897, + "memory(GiB)": 302.58, + "step": 215400, + "train_speed(iter/s)": 0.123784 + }, + { + "acc": 0.76584768, + "epoch": 1.204730673459658, + "grad_norm": 8.75, + "learning_rate": 3.734673565094163e-06, + "loss": 0.91217871, + "memory(GiB)": 302.58, + "step": 215420, + "train_speed(iter/s)": 0.123789 + }, + { + "acc": 0.75826182, + "epoch": 1.2048425229326372, + "grad_norm": 7.59375, + "learning_rate": 3.7337789816553194e-06, + "loss": 0.94364204, + "memory(GiB)": 302.58, + "step": 215440, + "train_speed(iter/s)": 0.123795 + }, + { + "acc": 0.74834447, + "epoch": 1.2049543724056164, + "grad_norm": 5.75, + "learning_rate": 3.7328844415252486e-06, + "loss": 0.99401121, + "memory(GiB)": 302.58, + "step": 215460, + "train_speed(iter/s)": 0.1238 + }, + { + "acc": 0.74435978, + "epoch": 1.2050662218785957, + "grad_norm": 6.375, + "learning_rate": 3.7319899447345478e-06, + "loss": 1.00486326, + "memory(GiB)": 302.58, + "step": 215480, + "train_speed(iter/s)": 0.123805 + }, + { + "acc": 0.75842738, + "epoch": 1.205178071351575, + "grad_norm": 6.90625, + "learning_rate": 3.731095491313812e-06, + "loss": 0.94463501, + "memory(GiB)": 302.58, + "step": 215500, + "train_speed(iter/s)": 0.123811 + }, + { + "acc": 0.76064777, + "epoch": 1.2052899208245544, + "grad_norm": 6.28125, + "learning_rate": 3.7302010812936344e-06, + "loss": 0.92729692, + "memory(GiB)": 302.58, + "step": 215520, + "train_speed(iter/s)": 0.123816 + }, + { + "acc": 0.76960115, + "epoch": 1.2054017702975335, + "grad_norm": 7.625, + "learning_rate": 3.7293067147046065e-06, + "loss": 0.92312727, + "memory(GiB)": 302.58, + "step": 215540, + "train_speed(iter/s)": 0.123821 + }, + { + "acc": 0.73475475, + "epoch": 1.205513619770513, + "grad_norm": 7.15625, + "learning_rate": 3.728412391577318e-06, + "loss": 1.0425169, + "memory(GiB)": 302.58, + "step": 215560, + "train_speed(iter/s)": 0.123826 + }, + { + "acc": 0.74798446, + "epoch": 1.205625469243492, + "grad_norm": 6.9375, + "learning_rate": 3.7275181119423577e-06, + "loss": 0.97997465, + "memory(GiB)": 302.58, + "step": 215580, + "train_speed(iter/s)": 0.123831 + }, + { + "acc": 0.74962945, + "epoch": 1.2057373187164715, + "grad_norm": 8.9375, + "learning_rate": 3.726623875830313e-06, + "loss": 0.98707275, + "memory(GiB)": 302.58, + "step": 215600, + "train_speed(iter/s)": 0.123837 + }, + { + "acc": 0.7503396, + "epoch": 1.2058491681894505, + "grad_norm": 7.34375, + "learning_rate": 3.7257296832717684e-06, + "loss": 0.97381802, + "memory(GiB)": 302.58, + "step": 215620, + "train_speed(iter/s)": 0.123842 + }, + { + "acc": 0.75495653, + "epoch": 1.20596101766243, + "grad_norm": 7.25, + "learning_rate": 3.7248355342973097e-06, + "loss": 0.96081038, + "memory(GiB)": 302.58, + "step": 215640, + "train_speed(iter/s)": 0.123848 + }, + { + "acc": 0.75508909, + "epoch": 1.206072867135409, + "grad_norm": 7.78125, + "learning_rate": 3.7239414289375198e-06, + "loss": 0.97914429, + "memory(GiB)": 302.58, + "step": 215660, + "train_speed(iter/s)": 0.123852 + }, + { + "acc": 0.73823647, + "epoch": 1.2061847166083886, + "grad_norm": 7.0625, + "learning_rate": 3.7230473672229795e-06, + "loss": 1.07261496, + "memory(GiB)": 302.58, + "step": 215680, + "train_speed(iter/s)": 0.123857 + }, + { + "acc": 0.75886497, + "epoch": 1.2062965660813676, + "grad_norm": 7.71875, + "learning_rate": 3.722153349184268e-06, + "loss": 0.94407482, + "memory(GiB)": 302.58, + "step": 215700, + "train_speed(iter/s)": 0.123863 + }, + { + "acc": 0.73954358, + "epoch": 1.206408415554347, + "grad_norm": 8.1875, + "learning_rate": 3.721259374851962e-06, + "loss": 1.04165974, + "memory(GiB)": 302.58, + "step": 215720, + "train_speed(iter/s)": 0.123868 + }, + { + "acc": 0.74384193, + "epoch": 1.2065202650273261, + "grad_norm": 7.1875, + "learning_rate": 3.7203654442566417e-06, + "loss": 1.02498989, + "memory(GiB)": 302.58, + "step": 215740, + "train_speed(iter/s)": 0.123874 + }, + { + "acc": 0.75432863, + "epoch": 1.2066321145003056, + "grad_norm": 8.6875, + "learning_rate": 3.719471557428882e-06, + "loss": 0.98125458, + "memory(GiB)": 302.58, + "step": 215760, + "train_speed(iter/s)": 0.123878 + }, + { + "acc": 0.75384188, + "epoch": 1.2067439639732847, + "grad_norm": 11.375, + "learning_rate": 3.718577714399254e-06, + "loss": 0.9612545, + "memory(GiB)": 302.58, + "step": 215780, + "train_speed(iter/s)": 0.123884 + }, + { + "acc": 0.7293304, + "epoch": 1.2068558134462641, + "grad_norm": 6.8125, + "learning_rate": 3.7176839151983325e-06, + "loss": 1.06818504, + "memory(GiB)": 302.58, + "step": 215800, + "train_speed(iter/s)": 0.123889 + }, + { + "acc": 0.75684476, + "epoch": 1.2069676629192432, + "grad_norm": 5.53125, + "learning_rate": 3.716790159856686e-06, + "loss": 0.96189804, + "memory(GiB)": 302.58, + "step": 215820, + "train_speed(iter/s)": 0.123895 + }, + { + "acc": 0.75238123, + "epoch": 1.2070795123922227, + "grad_norm": 6.96875, + "learning_rate": 3.7158964484048852e-06, + "loss": 0.99176502, + "memory(GiB)": 302.58, + "step": 215840, + "train_speed(iter/s)": 0.1239 + }, + { + "acc": 0.76194305, + "epoch": 1.2071913618652017, + "grad_norm": 5.96875, + "learning_rate": 3.715002780873499e-06, + "loss": 0.91990185, + "memory(GiB)": 302.58, + "step": 215860, + "train_speed(iter/s)": 0.123905 + }, + { + "acc": 0.75237184, + "epoch": 1.2073032113381812, + "grad_norm": 4.84375, + "learning_rate": 3.7141091572930914e-06, + "loss": 0.98147621, + "memory(GiB)": 302.58, + "step": 215880, + "train_speed(iter/s)": 0.123911 + }, + { + "acc": 0.74816322, + "epoch": 1.2074150608111602, + "grad_norm": 9.375, + "learning_rate": 3.7132155776942286e-06, + "loss": 0.9922308, + "memory(GiB)": 302.58, + "step": 215900, + "train_speed(iter/s)": 0.123916 + }, + { + "acc": 0.74741058, + "epoch": 1.2075269102841397, + "grad_norm": 9.25, + "learning_rate": 3.712322042107473e-06, + "loss": 0.9937706, + "memory(GiB)": 302.58, + "step": 215920, + "train_speed(iter/s)": 0.123921 + }, + { + "acc": 0.75732679, + "epoch": 1.2076387597571188, + "grad_norm": 6.28125, + "learning_rate": 3.7114285505633864e-06, + "loss": 0.96076164, + "memory(GiB)": 302.58, + "step": 215940, + "train_speed(iter/s)": 0.123927 + }, + { + "acc": 0.73054037, + "epoch": 1.2077506092300982, + "grad_norm": 5.53125, + "learning_rate": 3.7105351030925285e-06, + "loss": 1.05548134, + "memory(GiB)": 302.58, + "step": 215960, + "train_speed(iter/s)": 0.123932 + }, + { + "acc": 0.73975806, + "epoch": 1.2078624587030773, + "grad_norm": 5.5, + "learning_rate": 3.70964169972546e-06, + "loss": 1.02445536, + "memory(GiB)": 302.58, + "step": 215980, + "train_speed(iter/s)": 0.123938 + }, + { + "acc": 0.73187051, + "epoch": 1.2079743081760568, + "grad_norm": 7.0625, + "learning_rate": 3.708748340492737e-06, + "loss": 1.05697145, + "memory(GiB)": 302.58, + "step": 216000, + "train_speed(iter/s)": 0.123944 + }, + { + "epoch": 1.2079743081760568, + "eval_acc": 0.7066708608372428, + "eval_loss": 1.012706995010376, + "eval_runtime": 7510.9563, + "eval_samples_per_second": 10.023, + "eval_steps_per_second": 10.023, + "step": 216000 + }, + { + "acc": 0.74920335, + "epoch": 1.2080861576490358, + "grad_norm": 7.625, + "learning_rate": 3.7078550254249157e-06, + "loss": 0.98198004, + "memory(GiB)": 302.58, + "step": 216020, + "train_speed(iter/s)": 0.123407 + }, + { + "acc": 0.7515522, + "epoch": 1.2081980071220153, + "grad_norm": 6.3125, + "learning_rate": 3.7069617545525495e-06, + "loss": 0.98209362, + "memory(GiB)": 302.58, + "step": 216040, + "train_speed(iter/s)": 0.123413 + }, + { + "acc": 0.73797779, + "epoch": 1.2083098565949943, + "grad_norm": 10.3125, + "learning_rate": 3.7060685279061904e-06, + "loss": 1.03287916, + "memory(GiB)": 302.58, + "step": 216060, + "train_speed(iter/s)": 0.123418 + }, + { + "acc": 0.73574929, + "epoch": 1.2084217060679738, + "grad_norm": 6.3125, + "learning_rate": 3.705175345516392e-06, + "loss": 1.03262672, + "memory(GiB)": 302.58, + "step": 216080, + "train_speed(iter/s)": 0.123424 + }, + { + "acc": 0.73464317, + "epoch": 1.2085335555409529, + "grad_norm": 7.96875, + "learning_rate": 3.704282207413701e-06, + "loss": 1.02399197, + "memory(GiB)": 302.58, + "step": 216100, + "train_speed(iter/s)": 0.123429 + }, + { + "acc": 0.73791146, + "epoch": 1.2086454050139324, + "grad_norm": 6.96875, + "learning_rate": 3.7033891136286676e-06, + "loss": 1.05076895, + "memory(GiB)": 302.58, + "step": 216120, + "train_speed(iter/s)": 0.123435 + }, + { + "acc": 0.75074244, + "epoch": 1.2087572544869114, + "grad_norm": 6.21875, + "learning_rate": 3.7024960641918384e-06, + "loss": 0.98876324, + "memory(GiB)": 302.58, + "step": 216140, + "train_speed(iter/s)": 0.12344 + }, + { + "acc": 0.74472146, + "epoch": 1.208869103959891, + "grad_norm": 5.3125, + "learning_rate": 3.701603059133758e-06, + "loss": 1.00383043, + "memory(GiB)": 302.58, + "step": 216160, + "train_speed(iter/s)": 0.123445 + }, + { + "acc": 0.76159735, + "epoch": 1.20898095343287, + "grad_norm": 5.84375, + "learning_rate": 3.7007100984849704e-06, + "loss": 0.91529636, + "memory(GiB)": 302.58, + "step": 216180, + "train_speed(iter/s)": 0.123451 + }, + { + "acc": 0.75832992, + "epoch": 1.2090928029058494, + "grad_norm": 6.84375, + "learning_rate": 3.699817182276017e-06, + "loss": 0.94283257, + "memory(GiB)": 302.58, + "step": 216200, + "train_speed(iter/s)": 0.123457 + }, + { + "acc": 0.75636001, + "epoch": 1.2092046523788285, + "grad_norm": 9.75, + "learning_rate": 3.698924310537438e-06, + "loss": 0.96379929, + "memory(GiB)": 302.58, + "step": 216220, + "train_speed(iter/s)": 0.123462 + }, + { + "acc": 0.75502963, + "epoch": 1.209316501851808, + "grad_norm": 6.0625, + "learning_rate": 3.698031483299774e-06, + "loss": 0.96042891, + "memory(GiB)": 302.58, + "step": 216240, + "train_speed(iter/s)": 0.123467 + }, + { + "acc": 0.76025972, + "epoch": 1.209428351324787, + "grad_norm": 9.9375, + "learning_rate": 3.6971387005935616e-06, + "loss": 0.95180731, + "memory(GiB)": 302.58, + "step": 216260, + "train_speed(iter/s)": 0.123473 + }, + { + "acc": 0.74213829, + "epoch": 1.2095402007977665, + "grad_norm": 6.5, + "learning_rate": 3.6962459624493374e-06, + "loss": 1.01586866, + "memory(GiB)": 302.58, + "step": 216280, + "train_speed(iter/s)": 0.123478 + }, + { + "acc": 0.74725814, + "epoch": 1.2096520502707455, + "grad_norm": 5.875, + "learning_rate": 3.6953532688976344e-06, + "loss": 1.01436539, + "memory(GiB)": 302.58, + "step": 216300, + "train_speed(iter/s)": 0.123483 + }, + { + "acc": 0.74049449, + "epoch": 1.209763899743725, + "grad_norm": 11.25, + "learning_rate": 3.6944606199689874e-06, + "loss": 1.02445049, + "memory(GiB)": 302.58, + "step": 216320, + "train_speed(iter/s)": 0.123488 + }, + { + "acc": 0.74682126, + "epoch": 1.209875749216704, + "grad_norm": 6.6875, + "learning_rate": 3.693568015693927e-06, + "loss": 0.97559872, + "memory(GiB)": 302.58, + "step": 216340, + "train_speed(iter/s)": 0.123494 + }, + { + "acc": 0.74564776, + "epoch": 1.2099875986896835, + "grad_norm": 7.90625, + "learning_rate": 3.6926754561029816e-06, + "loss": 1.02117434, + "memory(GiB)": 302.58, + "step": 216360, + "train_speed(iter/s)": 0.123499 + }, + { + "acc": 0.75078316, + "epoch": 1.2100994481626626, + "grad_norm": 6.09375, + "learning_rate": 3.691782941226682e-06, + "loss": 0.97844009, + "memory(GiB)": 302.58, + "step": 216380, + "train_speed(iter/s)": 0.123505 + }, + { + "acc": 0.75119328, + "epoch": 1.210211297635642, + "grad_norm": 10.0, + "learning_rate": 3.690890471095554e-06, + "loss": 0.96835861, + "memory(GiB)": 302.58, + "step": 216400, + "train_speed(iter/s)": 0.12351 + }, + { + "acc": 0.74033289, + "epoch": 1.210323147108621, + "grad_norm": 8.1875, + "learning_rate": 3.6899980457401237e-06, + "loss": 1.02269354, + "memory(GiB)": 302.58, + "step": 216420, + "train_speed(iter/s)": 0.123515 + }, + { + "acc": 0.7413661, + "epoch": 1.2104349965816006, + "grad_norm": 8.3125, + "learning_rate": 3.6891056651909137e-06, + "loss": 1.01921797, + "memory(GiB)": 302.58, + "step": 216440, + "train_speed(iter/s)": 0.123521 + }, + { + "acc": 0.75596485, + "epoch": 1.2105468460545796, + "grad_norm": 5.4375, + "learning_rate": 3.688213329478446e-06, + "loss": 0.95603857, + "memory(GiB)": 302.58, + "step": 216460, + "train_speed(iter/s)": 0.123526 + }, + { + "acc": 0.76202393, + "epoch": 1.2106586955275591, + "grad_norm": 8.9375, + "learning_rate": 3.6873210386332432e-06, + "loss": 0.92352648, + "memory(GiB)": 302.58, + "step": 216480, + "train_speed(iter/s)": 0.123532 + }, + { + "acc": 0.75682702, + "epoch": 1.2107705450005382, + "grad_norm": 5.875, + "learning_rate": 3.686428792685821e-06, + "loss": 0.95749512, + "memory(GiB)": 302.58, + "step": 216500, + "train_speed(iter/s)": 0.123537 + }, + { + "acc": 0.77653031, + "epoch": 1.2108823944735176, + "grad_norm": 10.625, + "learning_rate": 3.6855365916667013e-06, + "loss": 0.8649724, + "memory(GiB)": 302.58, + "step": 216520, + "train_speed(iter/s)": 0.123543 + }, + { + "acc": 0.76969051, + "epoch": 1.2109942439464967, + "grad_norm": 5.875, + "learning_rate": 3.6846444356063983e-06, + "loss": 0.88845882, + "memory(GiB)": 302.58, + "step": 216540, + "train_speed(iter/s)": 0.123548 + }, + { + "acc": 0.75709944, + "epoch": 1.2111060934194762, + "grad_norm": 8.875, + "learning_rate": 3.6837523245354257e-06, + "loss": 0.94925919, + "memory(GiB)": 302.58, + "step": 216560, + "train_speed(iter/s)": 0.123554 + }, + { + "acc": 0.73472733, + "epoch": 1.2112179428924552, + "grad_norm": 7.3125, + "learning_rate": 3.682860258484298e-06, + "loss": 1.03764124, + "memory(GiB)": 302.58, + "step": 216580, + "train_speed(iter/s)": 0.12356 + }, + { + "acc": 0.73400826, + "epoch": 1.2113297923654347, + "grad_norm": 8.25, + "learning_rate": 3.6819682374835253e-06, + "loss": 1.03443766, + "memory(GiB)": 302.58, + "step": 216600, + "train_speed(iter/s)": 0.123565 + }, + { + "acc": 0.75456738, + "epoch": 1.2114416418384137, + "grad_norm": 6.9375, + "learning_rate": 3.6810762615636187e-06, + "loss": 0.97906742, + "memory(GiB)": 302.58, + "step": 216620, + "train_speed(iter/s)": 0.123571 + }, + { + "acc": 0.75464315, + "epoch": 1.2115534913113932, + "grad_norm": 9.1875, + "learning_rate": 3.6801843307550855e-06, + "loss": 0.9651556, + "memory(GiB)": 302.58, + "step": 216640, + "train_speed(iter/s)": 0.123576 + }, + { + "acc": 0.75580778, + "epoch": 1.2116653407843723, + "grad_norm": 6.90625, + "learning_rate": 3.6792924450884337e-06, + "loss": 0.94634838, + "memory(GiB)": 302.58, + "step": 216660, + "train_speed(iter/s)": 0.123582 + }, + { + "acc": 0.7668716, + "epoch": 1.2117771902573518, + "grad_norm": 7.90625, + "learning_rate": 3.6784006045941676e-06, + "loss": 0.9203886, + "memory(GiB)": 302.58, + "step": 216680, + "train_speed(iter/s)": 0.123587 + }, + { + "acc": 0.74960647, + "epoch": 1.2118890397303308, + "grad_norm": 8.25, + "learning_rate": 3.6775088093027923e-06, + "loss": 0.99338455, + "memory(GiB)": 302.58, + "step": 216700, + "train_speed(iter/s)": 0.123593 + }, + { + "acc": 0.7487247, + "epoch": 1.2120008892033103, + "grad_norm": 9.9375, + "learning_rate": 3.6766170592448093e-06, + "loss": 0.98336163, + "memory(GiB)": 302.58, + "step": 216720, + "train_speed(iter/s)": 0.123598 + }, + { + "acc": 0.76340246, + "epoch": 1.2121127386762893, + "grad_norm": 6.28125, + "learning_rate": 3.6757253544507186e-06, + "loss": 0.90920963, + "memory(GiB)": 302.58, + "step": 216740, + "train_speed(iter/s)": 0.123603 + }, + { + "acc": 0.75606995, + "epoch": 1.2122245881492688, + "grad_norm": 8.25, + "learning_rate": 3.6748336949510206e-06, + "loss": 0.96026611, + "memory(GiB)": 302.58, + "step": 216760, + "train_speed(iter/s)": 0.123608 + }, + { + "acc": 0.76843352, + "epoch": 1.212336437622248, + "grad_norm": 9.0625, + "learning_rate": 3.6739420807762107e-06, + "loss": 0.88866348, + "memory(GiB)": 302.58, + "step": 216780, + "train_speed(iter/s)": 0.123614 + }, + { + "acc": 0.75275502, + "epoch": 1.2124482870952273, + "grad_norm": 9.875, + "learning_rate": 3.673050511956788e-06, + "loss": 0.96443968, + "memory(GiB)": 302.58, + "step": 216800, + "train_speed(iter/s)": 0.12362 + }, + { + "acc": 0.76849303, + "epoch": 1.2125601365682066, + "grad_norm": 7.8125, + "learning_rate": 3.672158988523245e-06, + "loss": 0.91451683, + "memory(GiB)": 302.58, + "step": 216820, + "train_speed(iter/s)": 0.123625 + }, + { + "acc": 0.763762, + "epoch": 1.2126719860411859, + "grad_norm": 5.75, + "learning_rate": 3.671267510506075e-06, + "loss": 0.90139589, + "memory(GiB)": 302.58, + "step": 216840, + "train_speed(iter/s)": 0.123631 + }, + { + "acc": 0.76076522, + "epoch": 1.2127838355141651, + "grad_norm": 8.625, + "learning_rate": 3.67037607793577e-06, + "loss": 0.93727493, + "memory(GiB)": 302.58, + "step": 216860, + "train_speed(iter/s)": 0.123636 + }, + { + "acc": 0.74147043, + "epoch": 1.2128956849871444, + "grad_norm": 6.34375, + "learning_rate": 3.6694846908428185e-06, + "loss": 1.00720778, + "memory(GiB)": 302.58, + "step": 216880, + "train_speed(iter/s)": 0.123641 + }, + { + "acc": 0.73905139, + "epoch": 1.2130075344601237, + "grad_norm": 5.53125, + "learning_rate": 3.6685933492577096e-06, + "loss": 1.04013653, + "memory(GiB)": 302.58, + "step": 216900, + "train_speed(iter/s)": 0.123646 + }, + { + "acc": 0.75862498, + "epoch": 1.213119383933103, + "grad_norm": 6.40625, + "learning_rate": 3.6677020532109296e-06, + "loss": 0.93372593, + "memory(GiB)": 302.58, + "step": 216920, + "train_speed(iter/s)": 0.123652 + }, + { + "acc": 0.76806026, + "epoch": 1.2132312334060822, + "grad_norm": 5.96875, + "learning_rate": 3.666810802732964e-06, + "loss": 0.90309048, + "memory(GiB)": 302.58, + "step": 216940, + "train_speed(iter/s)": 0.123657 + }, + { + "acc": 0.75071664, + "epoch": 1.2133430828790615, + "grad_norm": 7.96875, + "learning_rate": 3.6659195978542973e-06, + "loss": 0.97268209, + "memory(GiB)": 302.58, + "step": 216960, + "train_speed(iter/s)": 0.123663 + }, + { + "acc": 0.75359287, + "epoch": 1.2134549323520407, + "grad_norm": 6.25, + "learning_rate": 3.665028438605408e-06, + "loss": 0.97386465, + "memory(GiB)": 302.58, + "step": 216980, + "train_speed(iter/s)": 0.123668 + }, + { + "acc": 0.78406878, + "epoch": 1.21356678182502, + "grad_norm": 9.5, + "learning_rate": 3.664137325016782e-06, + "loss": 0.84622059, + "memory(GiB)": 302.58, + "step": 217000, + "train_speed(iter/s)": 0.123673 + }, + { + "acc": 0.75934501, + "epoch": 1.2136786312979992, + "grad_norm": 8.3125, + "learning_rate": 3.663246257118894e-06, + "loss": 0.95939722, + "memory(GiB)": 302.58, + "step": 217020, + "train_speed(iter/s)": 0.123678 + }, + { + "acc": 0.74055777, + "epoch": 1.2137904807709785, + "grad_norm": 9.875, + "learning_rate": 3.662355234942224e-06, + "loss": 1.0273675, + "memory(GiB)": 302.58, + "step": 217040, + "train_speed(iter/s)": 0.123684 + }, + { + "acc": 0.74996691, + "epoch": 1.2139023302439578, + "grad_norm": 6.28125, + "learning_rate": 3.6614642585172468e-06, + "loss": 0.98557453, + "memory(GiB)": 302.58, + "step": 217060, + "train_speed(iter/s)": 0.123689 + }, + { + "acc": 0.74584947, + "epoch": 1.214014179716937, + "grad_norm": 8.75, + "learning_rate": 3.660573327874435e-06, + "loss": 1.01348696, + "memory(GiB)": 302.58, + "step": 217080, + "train_speed(iter/s)": 0.123695 + }, + { + "acc": 0.75238285, + "epoch": 1.2141260291899163, + "grad_norm": 8.6875, + "learning_rate": 3.6596824430442624e-06, + "loss": 0.98192501, + "memory(GiB)": 302.58, + "step": 217100, + "train_speed(iter/s)": 0.1237 + }, + { + "acc": 0.75855775, + "epoch": 1.2142378786628956, + "grad_norm": 6.78125, + "learning_rate": 3.6587916040572015e-06, + "loss": 0.94804249, + "memory(GiB)": 302.58, + "step": 217120, + "train_speed(iter/s)": 0.123705 + }, + { + "acc": 0.75388675, + "epoch": 1.2143497281358748, + "grad_norm": 7.5, + "learning_rate": 3.6579008109437196e-06, + "loss": 0.96678267, + "memory(GiB)": 302.58, + "step": 217140, + "train_speed(iter/s)": 0.123711 + }, + { + "acc": 0.73081713, + "epoch": 1.214461577608854, + "grad_norm": 6.34375, + "learning_rate": 3.6570100637342866e-06, + "loss": 1.06327419, + "memory(GiB)": 302.58, + "step": 217160, + "train_speed(iter/s)": 0.123716 + }, + { + "acc": 0.74042206, + "epoch": 1.2145734270818334, + "grad_norm": 5.6875, + "learning_rate": 3.6561193624593673e-06, + "loss": 0.99911098, + "memory(GiB)": 302.58, + "step": 217180, + "train_speed(iter/s)": 0.123722 + }, + { + "acc": 0.76599512, + "epoch": 1.2146852765548126, + "grad_norm": 6.75, + "learning_rate": 3.6552287071494274e-06, + "loss": 0.92365675, + "memory(GiB)": 302.58, + "step": 217200, + "train_speed(iter/s)": 0.123727 + }, + { + "acc": 0.76074705, + "epoch": 1.2147971260277919, + "grad_norm": 8.0, + "learning_rate": 3.65433809783493e-06, + "loss": 0.93559847, + "memory(GiB)": 302.58, + "step": 217220, + "train_speed(iter/s)": 0.123732 + }, + { + "acc": 0.737362, + "epoch": 1.2149089755007711, + "grad_norm": 6.375, + "learning_rate": 3.6534475345463353e-06, + "loss": 1.04956274, + "memory(GiB)": 302.58, + "step": 217240, + "train_speed(iter/s)": 0.123737 + }, + { + "acc": 0.73957257, + "epoch": 1.2150208249737504, + "grad_norm": 6.5, + "learning_rate": 3.6525570173141063e-06, + "loss": 1.02506847, + "memory(GiB)": 302.58, + "step": 217260, + "train_speed(iter/s)": 0.123742 + }, + { + "acc": 0.76548586, + "epoch": 1.2151326744467297, + "grad_norm": 9.625, + "learning_rate": 3.6516665461686994e-06, + "loss": 0.9074522, + "memory(GiB)": 302.58, + "step": 217280, + "train_speed(iter/s)": 0.123747 + }, + { + "acc": 0.73291068, + "epoch": 1.215244523919709, + "grad_norm": 6.59375, + "learning_rate": 3.6507761211405723e-06, + "loss": 1.04533663, + "memory(GiB)": 302.58, + "step": 217300, + "train_speed(iter/s)": 0.123753 + }, + { + "acc": 0.73593154, + "epoch": 1.2153563733926882, + "grad_norm": 5.8125, + "learning_rate": 3.6498857422601803e-06, + "loss": 1.07165756, + "memory(GiB)": 302.58, + "step": 217320, + "train_speed(iter/s)": 0.123758 + }, + { + "acc": 0.74185634, + "epoch": 1.2154682228656675, + "grad_norm": 7.21875, + "learning_rate": 3.6489954095579765e-06, + "loss": 1.02243557, + "memory(GiB)": 302.58, + "step": 217340, + "train_speed(iter/s)": 0.123764 + }, + { + "acc": 0.74049444, + "epoch": 1.2155800723386467, + "grad_norm": 4.5, + "learning_rate": 3.648105123064414e-06, + "loss": 1.01770697, + "memory(GiB)": 302.58, + "step": 217360, + "train_speed(iter/s)": 0.123769 + }, + { + "acc": 0.75177889, + "epoch": 1.215691921811626, + "grad_norm": 6.5625, + "learning_rate": 3.6472148828099418e-06, + "loss": 0.98484135, + "memory(GiB)": 302.58, + "step": 217380, + "train_speed(iter/s)": 0.123775 + }, + { + "acc": 0.76268001, + "epoch": 1.2158037712846053, + "grad_norm": 6.875, + "learning_rate": 3.6463246888250113e-06, + "loss": 0.93105011, + "memory(GiB)": 302.58, + "step": 217400, + "train_speed(iter/s)": 0.12378 + }, + { + "acc": 0.77135754, + "epoch": 1.2159156207575845, + "grad_norm": 7.25, + "learning_rate": 3.6454345411400693e-06, + "loss": 0.8594861, + "memory(GiB)": 302.58, + "step": 217420, + "train_speed(iter/s)": 0.123785 + }, + { + "acc": 0.76216588, + "epoch": 1.2160274702305638, + "grad_norm": 6.25, + "learning_rate": 3.6445444397855608e-06, + "loss": 0.9305851, + "memory(GiB)": 302.58, + "step": 217440, + "train_speed(iter/s)": 0.12379 + }, + { + "acc": 0.75925374, + "epoch": 1.216139319703543, + "grad_norm": 8.0, + "learning_rate": 3.64365438479193e-06, + "loss": 0.94377489, + "memory(GiB)": 302.58, + "step": 217460, + "train_speed(iter/s)": 0.123796 + }, + { + "acc": 0.74830074, + "epoch": 1.2162511691765223, + "grad_norm": 5.9375, + "learning_rate": 3.6427643761896202e-06, + "loss": 0.98901901, + "memory(GiB)": 302.58, + "step": 217480, + "train_speed(iter/s)": 0.123801 + }, + { + "acc": 0.74525905, + "epoch": 1.2163630186495016, + "grad_norm": 12.3125, + "learning_rate": 3.6418744140090734e-06, + "loss": 1.00856686, + "memory(GiB)": 302.58, + "step": 217500, + "train_speed(iter/s)": 0.123807 + }, + { + "acc": 0.74123979, + "epoch": 1.2164748681224808, + "grad_norm": 6.5625, + "learning_rate": 3.6409844982807265e-06, + "loss": 1.01114426, + "memory(GiB)": 302.58, + "step": 217520, + "train_speed(iter/s)": 0.123812 + }, + { + "acc": 0.756317, + "epoch": 1.21658671759546, + "grad_norm": 10.9375, + "learning_rate": 3.6400946290350197e-06, + "loss": 0.95486794, + "memory(GiB)": 302.58, + "step": 217540, + "train_speed(iter/s)": 0.123817 + }, + { + "acc": 0.73794622, + "epoch": 1.2166985670684394, + "grad_norm": 8.375, + "learning_rate": 3.639204806302389e-06, + "loss": 1.02176275, + "memory(GiB)": 302.58, + "step": 217560, + "train_speed(iter/s)": 0.123822 + }, + { + "acc": 0.75861526, + "epoch": 1.2168104165414186, + "grad_norm": 7.09375, + "learning_rate": 3.638315030113269e-06, + "loss": 0.94221697, + "memory(GiB)": 302.58, + "step": 217580, + "train_speed(iter/s)": 0.123828 + }, + { + "acc": 0.76485896, + "epoch": 1.216922266014398, + "grad_norm": 6.6875, + "learning_rate": 3.637425300498093e-06, + "loss": 0.9301507, + "memory(GiB)": 302.58, + "step": 217600, + "train_speed(iter/s)": 0.123833 + }, + { + "acc": 0.76192431, + "epoch": 1.2170341154873772, + "grad_norm": 7.21875, + "learning_rate": 3.6365356174872923e-06, + "loss": 0.93202515, + "memory(GiB)": 302.58, + "step": 217620, + "train_speed(iter/s)": 0.123838 + }, + { + "acc": 0.74079642, + "epoch": 1.2171459649603564, + "grad_norm": 7.09375, + "learning_rate": 3.6356459811112965e-06, + "loss": 1.02280865, + "memory(GiB)": 302.58, + "step": 217640, + "train_speed(iter/s)": 0.123844 + }, + { + "acc": 0.75584946, + "epoch": 1.2172578144333357, + "grad_norm": 7.78125, + "learning_rate": 3.6347563914005333e-06, + "loss": 0.96841812, + "memory(GiB)": 302.58, + "step": 217660, + "train_speed(iter/s)": 0.123849 + }, + { + "acc": 0.74442534, + "epoch": 1.217369663906315, + "grad_norm": 6.78125, + "learning_rate": 3.6338668483854323e-06, + "loss": 1.00491009, + "memory(GiB)": 302.58, + "step": 217680, + "train_speed(iter/s)": 0.123855 + }, + { + "acc": 0.77181563, + "epoch": 1.2174815133792942, + "grad_norm": 6.3125, + "learning_rate": 3.6329773520964163e-06, + "loss": 0.87956839, + "memory(GiB)": 302.58, + "step": 217700, + "train_speed(iter/s)": 0.12386 + }, + { + "acc": 0.75033922, + "epoch": 1.2175933628522735, + "grad_norm": 7.28125, + "learning_rate": 3.63208790256391e-06, + "loss": 0.9904294, + "memory(GiB)": 302.58, + "step": 217720, + "train_speed(iter/s)": 0.123865 + }, + { + "acc": 0.74512382, + "epoch": 1.2177052123252528, + "grad_norm": 11.6875, + "learning_rate": 3.631198499818335e-06, + "loss": 1.00260038, + "memory(GiB)": 302.58, + "step": 217740, + "train_speed(iter/s)": 0.123871 + }, + { + "acc": 0.75940585, + "epoch": 1.217817061798232, + "grad_norm": 6.4375, + "learning_rate": 3.630309143890111e-06, + "loss": 0.93567276, + "memory(GiB)": 302.58, + "step": 217760, + "train_speed(iter/s)": 0.123876 + }, + { + "acc": 0.74201989, + "epoch": 1.2179289112712113, + "grad_norm": 6.59375, + "learning_rate": 3.6294198348096564e-06, + "loss": 1.0423728, + "memory(GiB)": 302.58, + "step": 217780, + "train_speed(iter/s)": 0.123881 + }, + { + "acc": 0.74649687, + "epoch": 1.2180407607441905, + "grad_norm": 7.34375, + "learning_rate": 3.6285305726073917e-06, + "loss": 0.99412832, + "memory(GiB)": 302.58, + "step": 217800, + "train_speed(iter/s)": 0.123887 + }, + { + "acc": 0.73915267, + "epoch": 1.2181526102171698, + "grad_norm": 8.25, + "learning_rate": 3.6276413573137294e-06, + "loss": 1.00518732, + "memory(GiB)": 302.58, + "step": 217820, + "train_speed(iter/s)": 0.123892 + }, + { + "acc": 0.7546793, + "epoch": 1.218264459690149, + "grad_norm": 9.0, + "learning_rate": 3.626752188959084e-06, + "loss": 0.96635008, + "memory(GiB)": 302.58, + "step": 217840, + "train_speed(iter/s)": 0.123897 + }, + { + "acc": 0.74866123, + "epoch": 1.2183763091631283, + "grad_norm": 5.4375, + "learning_rate": 3.625863067573869e-06, + "loss": 0.98784494, + "memory(GiB)": 302.58, + "step": 217860, + "train_speed(iter/s)": 0.123903 + }, + { + "acc": 0.73619771, + "epoch": 1.2184881586361076, + "grad_norm": 9.1875, + "learning_rate": 3.6249739931884936e-06, + "loss": 1.05055265, + "memory(GiB)": 302.58, + "step": 217880, + "train_speed(iter/s)": 0.123908 + }, + { + "acc": 0.74977355, + "epoch": 1.2186000081090869, + "grad_norm": 6.875, + "learning_rate": 3.6240849658333675e-06, + "loss": 0.98236418, + "memory(GiB)": 302.58, + "step": 217900, + "train_speed(iter/s)": 0.123913 + }, + { + "acc": 0.76255331, + "epoch": 1.2187118575820661, + "grad_norm": 6.8125, + "learning_rate": 3.623195985538899e-06, + "loss": 0.91643267, + "memory(GiB)": 302.58, + "step": 217920, + "train_speed(iter/s)": 0.123918 + }, + { + "acc": 0.7497273, + "epoch": 1.2188237070550454, + "grad_norm": 6.46875, + "learning_rate": 3.622307052335493e-06, + "loss": 0.99583731, + "memory(GiB)": 302.58, + "step": 217940, + "train_speed(iter/s)": 0.123923 + }, + { + "acc": 0.75135665, + "epoch": 1.2189355565280247, + "grad_norm": 7.46875, + "learning_rate": 3.6214181662535553e-06, + "loss": 0.98800516, + "memory(GiB)": 302.58, + "step": 217960, + "train_speed(iter/s)": 0.123929 + }, + { + "acc": 0.74708729, + "epoch": 1.219047406001004, + "grad_norm": 8.25, + "learning_rate": 3.6205293273234866e-06, + "loss": 0.98823166, + "memory(GiB)": 302.58, + "step": 217980, + "train_speed(iter/s)": 0.123935 + }, + { + "acc": 0.75073338, + "epoch": 1.2191592554739832, + "grad_norm": 9.6875, + "learning_rate": 3.6196405355756904e-06, + "loss": 0.9960043, + "memory(GiB)": 302.58, + "step": 218000, + "train_speed(iter/s)": 0.12394 + }, + { + "epoch": 1.2191592554739832, + "eval_acc": 0.7066510928259472, + "eval_loss": 1.0128796100616455, + "eval_runtime": 7564.3065, + "eval_samples_per_second": 9.952, + "eval_steps_per_second": 9.952, + "step": 218000 + }, + { + "acc": 0.75041361, + "epoch": 1.2192711049469624, + "grad_norm": 6.5, + "learning_rate": 3.618751791040564e-06, + "loss": 0.98666878, + "memory(GiB)": 302.58, + "step": 218020, + "train_speed(iter/s)": 0.123406 + }, + { + "acc": 0.75130725, + "epoch": 1.2193829544199417, + "grad_norm": 6.96875, + "learning_rate": 3.617863093748506e-06, + "loss": 0.98695631, + "memory(GiB)": 302.58, + "step": 218040, + "train_speed(iter/s)": 0.123411 + }, + { + "acc": 0.76449656, + "epoch": 1.219494803892921, + "grad_norm": 5.78125, + "learning_rate": 3.6169744437299124e-06, + "loss": 0.9226409, + "memory(GiB)": 302.58, + "step": 218060, + "train_speed(iter/s)": 0.123416 + }, + { + "acc": 0.73848991, + "epoch": 1.2196066533659002, + "grad_norm": 6.75, + "learning_rate": 3.616085841015179e-06, + "loss": 1.04166088, + "memory(GiB)": 302.58, + "step": 218080, + "train_speed(iter/s)": 0.123422 + }, + { + "acc": 0.7578896, + "epoch": 1.2197185028388795, + "grad_norm": 6.6875, + "learning_rate": 3.6151972856346982e-06, + "loss": 0.93406639, + "memory(GiB)": 302.58, + "step": 218100, + "train_speed(iter/s)": 0.123427 + }, + { + "acc": 0.74363275, + "epoch": 1.2198303523118588, + "grad_norm": 6.5, + "learning_rate": 3.614308777618861e-06, + "loss": 1.01002817, + "memory(GiB)": 302.58, + "step": 218120, + "train_speed(iter/s)": 0.123433 + }, + { + "acc": 0.74831553, + "epoch": 1.219942201784838, + "grad_norm": 10.75, + "learning_rate": 3.6134203169980585e-06, + "loss": 0.99190054, + "memory(GiB)": 302.58, + "step": 218140, + "train_speed(iter/s)": 0.123438 + }, + { + "acc": 0.76532507, + "epoch": 1.2200540512578173, + "grad_norm": 6.875, + "learning_rate": 3.6125319038026776e-06, + "loss": 0.92342129, + "memory(GiB)": 302.58, + "step": 218160, + "train_speed(iter/s)": 0.123444 + }, + { + "acc": 0.76543474, + "epoch": 1.2201659007307966, + "grad_norm": 8.875, + "learning_rate": 3.6116435380631044e-06, + "loss": 0.92630196, + "memory(GiB)": 302.58, + "step": 218180, + "train_speed(iter/s)": 0.123449 + }, + { + "acc": 0.76372986, + "epoch": 1.2202777502037758, + "grad_norm": 8.5, + "learning_rate": 3.6107552198097238e-06, + "loss": 0.91105404, + "memory(GiB)": 302.58, + "step": 218200, + "train_speed(iter/s)": 0.123455 + }, + { + "acc": 0.75819707, + "epoch": 1.220389599676755, + "grad_norm": 8.125, + "learning_rate": 3.60986694907292e-06, + "loss": 0.95159359, + "memory(GiB)": 302.58, + "step": 218220, + "train_speed(iter/s)": 0.12346 + }, + { + "acc": 0.7479701, + "epoch": 1.2205014491497344, + "grad_norm": 9.5, + "learning_rate": 3.6089787258830754e-06, + "loss": 1.00331621, + "memory(GiB)": 302.58, + "step": 218240, + "train_speed(iter/s)": 0.123465 + }, + { + "acc": 0.75079584, + "epoch": 1.2206132986227136, + "grad_norm": 5.375, + "learning_rate": 3.6080905502705686e-06, + "loss": 0.98902225, + "memory(GiB)": 302.58, + "step": 218260, + "train_speed(iter/s)": 0.123471 + }, + { + "acc": 0.74659362, + "epoch": 1.2207251480956929, + "grad_norm": 8.875, + "learning_rate": 3.607202422265779e-06, + "loss": 1.01568155, + "memory(GiB)": 302.58, + "step": 218280, + "train_speed(iter/s)": 0.123476 + }, + { + "acc": 0.74824829, + "epoch": 1.2208369975686721, + "grad_norm": 10.4375, + "learning_rate": 3.606314341899083e-06, + "loss": 0.99235783, + "memory(GiB)": 302.58, + "step": 218300, + "train_speed(iter/s)": 0.123481 + }, + { + "acc": 0.74348664, + "epoch": 1.2209488470416514, + "grad_norm": 4.96875, + "learning_rate": 3.6054263092008568e-06, + "loss": 1.01835279, + "memory(GiB)": 302.58, + "step": 218320, + "train_speed(iter/s)": 0.123486 + }, + { + "acc": 0.75004072, + "epoch": 1.2210606965146307, + "grad_norm": 9.625, + "learning_rate": 3.604538324201472e-06, + "loss": 0.97116652, + "memory(GiB)": 302.58, + "step": 218340, + "train_speed(iter/s)": 0.123492 + }, + { + "acc": 0.77305574, + "epoch": 1.22117254598761, + "grad_norm": 7.40625, + "learning_rate": 3.603650386931302e-06, + "loss": 0.87216263, + "memory(GiB)": 302.58, + "step": 218360, + "train_speed(iter/s)": 0.123497 + }, + { + "acc": 0.76314073, + "epoch": 1.2212843954605892, + "grad_norm": 7.34375, + "learning_rate": 3.602762497420715e-06, + "loss": 0.95574112, + "memory(GiB)": 302.58, + "step": 218380, + "train_speed(iter/s)": 0.123502 + }, + { + "acc": 0.75112886, + "epoch": 1.2213962449335685, + "grad_norm": 6.0625, + "learning_rate": 3.6018746557000815e-06, + "loss": 0.98363323, + "memory(GiB)": 302.58, + "step": 218400, + "train_speed(iter/s)": 0.123507 + }, + { + "acc": 0.72639999, + "epoch": 1.2215080944065477, + "grad_norm": 7.84375, + "learning_rate": 3.600986861799769e-06, + "loss": 1.08566399, + "memory(GiB)": 302.58, + "step": 218420, + "train_speed(iter/s)": 0.123512 + }, + { + "acc": 0.74610286, + "epoch": 1.221619943879527, + "grad_norm": 9.375, + "learning_rate": 3.600099115750142e-06, + "loss": 1.00633316, + "memory(GiB)": 302.58, + "step": 218440, + "train_speed(iter/s)": 0.123518 + }, + { + "acc": 0.74150586, + "epoch": 1.2217317933525063, + "grad_norm": 6.40625, + "learning_rate": 3.599211417581564e-06, + "loss": 1.00993185, + "memory(GiB)": 302.58, + "step": 218460, + "train_speed(iter/s)": 0.123523 + }, + { + "acc": 0.74370503, + "epoch": 1.2218436428254855, + "grad_norm": 7.71875, + "learning_rate": 3.598323767324397e-06, + "loss": 1.01186848, + "memory(GiB)": 302.58, + "step": 218480, + "train_speed(iter/s)": 0.123529 + }, + { + "acc": 0.73174319, + "epoch": 1.2219554922984648, + "grad_norm": 6.03125, + "learning_rate": 3.5974361650090017e-06, + "loss": 1.07512112, + "memory(GiB)": 302.58, + "step": 218500, + "train_speed(iter/s)": 0.123534 + }, + { + "acc": 0.75105758, + "epoch": 1.222067341771444, + "grad_norm": 6.75, + "learning_rate": 3.5965486106657367e-06, + "loss": 0.98559408, + "memory(GiB)": 302.58, + "step": 218520, + "train_speed(iter/s)": 0.12354 + }, + { + "acc": 0.76064939, + "epoch": 1.2221791912444233, + "grad_norm": 9.8125, + "learning_rate": 3.59566110432496e-06, + "loss": 0.94868727, + "memory(GiB)": 302.58, + "step": 218540, + "train_speed(iter/s)": 0.123545 + }, + { + "acc": 0.75114274, + "epoch": 1.2222910407174026, + "grad_norm": 9.625, + "learning_rate": 3.5947736460170264e-06, + "loss": 0.97414742, + "memory(GiB)": 302.58, + "step": 218560, + "train_speed(iter/s)": 0.123551 + }, + { + "acc": 0.74596314, + "epoch": 1.2224028901903818, + "grad_norm": 8.125, + "learning_rate": 3.5938862357722905e-06, + "loss": 1.00876532, + "memory(GiB)": 302.58, + "step": 218580, + "train_speed(iter/s)": 0.123557 + }, + { + "acc": 0.74085398, + "epoch": 1.222514739663361, + "grad_norm": 8.875, + "learning_rate": 3.592998873621103e-06, + "loss": 1.03495636, + "memory(GiB)": 302.58, + "step": 218600, + "train_speed(iter/s)": 0.123562 + }, + { + "acc": 0.7569263, + "epoch": 1.2226265891363404, + "grad_norm": 8.4375, + "learning_rate": 3.5921115595938162e-06, + "loss": 0.94713411, + "memory(GiB)": 302.58, + "step": 218620, + "train_speed(iter/s)": 0.123568 + }, + { + "acc": 0.76030664, + "epoch": 1.2227384386093196, + "grad_norm": 6.8125, + "learning_rate": 3.591224293720779e-06, + "loss": 0.94290752, + "memory(GiB)": 302.58, + "step": 218640, + "train_speed(iter/s)": 0.123573 + }, + { + "acc": 0.74678607, + "epoch": 1.222850288082299, + "grad_norm": 6.15625, + "learning_rate": 3.590337076032336e-06, + "loss": 0.99936905, + "memory(GiB)": 302.58, + "step": 218660, + "train_speed(iter/s)": 0.123578 + }, + { + "acc": 0.75725284, + "epoch": 1.2229621375552782, + "grad_norm": 10.875, + "learning_rate": 3.589449906558836e-06, + "loss": 0.96808672, + "memory(GiB)": 302.58, + "step": 218680, + "train_speed(iter/s)": 0.123584 + }, + { + "acc": 0.747434, + "epoch": 1.2230739870282574, + "grad_norm": 6.9375, + "learning_rate": 3.5885627853306226e-06, + "loss": 0.98906517, + "memory(GiB)": 302.58, + "step": 218700, + "train_speed(iter/s)": 0.123589 + }, + { + "acc": 0.74901762, + "epoch": 1.2231858365012367, + "grad_norm": 5.75, + "learning_rate": 3.587675712378037e-06, + "loss": 1.00704737, + "memory(GiB)": 302.58, + "step": 218720, + "train_speed(iter/s)": 0.123595 + }, + { + "acc": 0.77306805, + "epoch": 1.223297685974216, + "grad_norm": 9.125, + "learning_rate": 3.586788687731421e-06, + "loss": 0.88982468, + "memory(GiB)": 302.58, + "step": 218740, + "train_speed(iter/s)": 0.1236 + }, + { + "acc": 0.75512891, + "epoch": 1.2234095354471952, + "grad_norm": 8.3125, + "learning_rate": 3.585901711421112e-06, + "loss": 0.95815077, + "memory(GiB)": 302.58, + "step": 218760, + "train_speed(iter/s)": 0.123605 + }, + { + "acc": 0.74984555, + "epoch": 1.2235213849201745, + "grad_norm": 6.46875, + "learning_rate": 3.5850147834774483e-06, + "loss": 0.96647453, + "memory(GiB)": 302.58, + "step": 218780, + "train_speed(iter/s)": 0.123611 + }, + { + "acc": 0.74032221, + "epoch": 1.2236332343931537, + "grad_norm": 6.75, + "learning_rate": 3.584127903930766e-06, + "loss": 1.03184462, + "memory(GiB)": 302.58, + "step": 218800, + "train_speed(iter/s)": 0.123617 + }, + { + "acc": 0.74048114, + "epoch": 1.223745083866133, + "grad_norm": 5.90625, + "learning_rate": 3.5832410728113997e-06, + "loss": 1.03872662, + "memory(GiB)": 302.58, + "step": 218820, + "train_speed(iter/s)": 0.123622 + }, + { + "acc": 0.75996766, + "epoch": 1.2238569333391123, + "grad_norm": 8.6875, + "learning_rate": 3.582354290149681e-06, + "loss": 0.95319433, + "memory(GiB)": 302.58, + "step": 218840, + "train_speed(iter/s)": 0.123627 + }, + { + "acc": 0.75788646, + "epoch": 1.2239687828120915, + "grad_norm": 10.3125, + "learning_rate": 3.5814675559759404e-06, + "loss": 0.95504313, + "memory(GiB)": 302.58, + "step": 218860, + "train_speed(iter/s)": 0.123632 + }, + { + "acc": 0.75678639, + "epoch": 1.2240806322850708, + "grad_norm": 6.65625, + "learning_rate": 3.580580870320508e-06, + "loss": 0.94183893, + "memory(GiB)": 302.58, + "step": 218880, + "train_speed(iter/s)": 0.123638 + }, + { + "acc": 0.75357461, + "epoch": 1.22419248175805, + "grad_norm": 8.125, + "learning_rate": 3.57969423321371e-06, + "loss": 0.95469456, + "memory(GiB)": 302.58, + "step": 218900, + "train_speed(iter/s)": 0.123643 + }, + { + "acc": 0.74468637, + "epoch": 1.2243043312310293, + "grad_norm": 6.90625, + "learning_rate": 3.578807644685873e-06, + "loss": 1.00884113, + "memory(GiB)": 302.58, + "step": 218920, + "train_speed(iter/s)": 0.123648 + }, + { + "acc": 0.73283219, + "epoch": 1.2244161807040086, + "grad_norm": 5.40625, + "learning_rate": 3.577921104767321e-06, + "loss": 1.06423473, + "memory(GiB)": 302.58, + "step": 218940, + "train_speed(iter/s)": 0.123653 + }, + { + "acc": 0.77888417, + "epoch": 1.2245280301769879, + "grad_norm": 6.28125, + "learning_rate": 3.577034613488377e-06, + "loss": 0.83663177, + "memory(GiB)": 302.58, + "step": 218960, + "train_speed(iter/s)": 0.123659 + }, + { + "acc": 0.76342454, + "epoch": 1.2246398796499671, + "grad_norm": 9.25, + "learning_rate": 3.5761481708793612e-06, + "loss": 0.92758093, + "memory(GiB)": 302.58, + "step": 218980, + "train_speed(iter/s)": 0.123663 + }, + { + "acc": 0.72818198, + "epoch": 1.2247517291229464, + "grad_norm": 4.65625, + "learning_rate": 3.5752617769705928e-06, + "loss": 1.07536373, + "memory(GiB)": 302.58, + "step": 219000, + "train_speed(iter/s)": 0.123669 + }, + { + "acc": 0.74955754, + "epoch": 1.2248635785959257, + "grad_norm": 8.125, + "learning_rate": 3.5743754317923895e-06, + "loss": 0.98695593, + "memory(GiB)": 302.58, + "step": 219020, + "train_speed(iter/s)": 0.123674 + }, + { + "acc": 0.74952989, + "epoch": 1.224975428068905, + "grad_norm": 7.84375, + "learning_rate": 3.573489135375067e-06, + "loss": 1.00324173, + "memory(GiB)": 302.58, + "step": 219040, + "train_speed(iter/s)": 0.123679 + }, + { + "acc": 0.76061983, + "epoch": 1.2250872775418842, + "grad_norm": 8.5, + "learning_rate": 3.5726028877489392e-06, + "loss": 0.94709902, + "memory(GiB)": 302.58, + "step": 219060, + "train_speed(iter/s)": 0.123685 + }, + { + "acc": 0.75438585, + "epoch": 1.2251991270148634, + "grad_norm": 7.09375, + "learning_rate": 3.5717166889443177e-06, + "loss": 0.97659883, + "memory(GiB)": 302.58, + "step": 219080, + "train_speed(iter/s)": 0.12369 + }, + { + "acc": 0.75044918, + "epoch": 1.2253109764878427, + "grad_norm": 6.03125, + "learning_rate": 3.5708305389915154e-06, + "loss": 0.99198084, + "memory(GiB)": 302.58, + "step": 219100, + "train_speed(iter/s)": 0.123696 + }, + { + "acc": 0.74258556, + "epoch": 1.225422825960822, + "grad_norm": 6.21875, + "learning_rate": 3.569944437920841e-06, + "loss": 1.01265306, + "memory(GiB)": 302.58, + "step": 219120, + "train_speed(iter/s)": 0.123701 + }, + { + "acc": 0.75430536, + "epoch": 1.2255346754338012, + "grad_norm": 10.1875, + "learning_rate": 3.569058385762601e-06, + "loss": 0.96806183, + "memory(GiB)": 302.58, + "step": 219140, + "train_speed(iter/s)": 0.123707 + }, + { + "acc": 0.74630523, + "epoch": 1.2256465249067805, + "grad_norm": 8.5625, + "learning_rate": 3.5681723825471015e-06, + "loss": 0.99901114, + "memory(GiB)": 302.58, + "step": 219160, + "train_speed(iter/s)": 0.123713 + }, + { + "acc": 0.76165667, + "epoch": 1.2257583743797598, + "grad_norm": 6.625, + "learning_rate": 3.5672864283046473e-06, + "loss": 0.95002766, + "memory(GiB)": 302.58, + "step": 219180, + "train_speed(iter/s)": 0.123718 + }, + { + "acc": 0.74588971, + "epoch": 1.225870223852739, + "grad_norm": 8.25, + "learning_rate": 3.5664005230655398e-06, + "loss": 1.01470032, + "memory(GiB)": 302.58, + "step": 219200, + "train_speed(iter/s)": 0.123724 + }, + { + "acc": 0.74999895, + "epoch": 1.2259820733257183, + "grad_norm": 6.65625, + "learning_rate": 3.5655146668600786e-06, + "loss": 0.99957066, + "memory(GiB)": 302.58, + "step": 219220, + "train_speed(iter/s)": 0.123729 + }, + { + "acc": 0.75245571, + "epoch": 1.2260939227986976, + "grad_norm": 4.84375, + "learning_rate": 3.5646288597185663e-06, + "loss": 0.96502352, + "memory(GiB)": 302.58, + "step": 219240, + "train_speed(iter/s)": 0.123734 + }, + { + "acc": 0.74557071, + "epoch": 1.2262057722716768, + "grad_norm": 5.1875, + "learning_rate": 3.563743101671299e-06, + "loss": 0.99519615, + "memory(GiB)": 302.58, + "step": 219260, + "train_speed(iter/s)": 0.12374 + }, + { + "acc": 0.74503016, + "epoch": 1.226317621744656, + "grad_norm": 10.0625, + "learning_rate": 3.5628573927485703e-06, + "loss": 1.0090354, + "memory(GiB)": 302.58, + "step": 219280, + "train_speed(iter/s)": 0.123745 + }, + { + "acc": 0.76544824, + "epoch": 1.2264294712176353, + "grad_norm": 5.875, + "learning_rate": 3.561971732980677e-06, + "loss": 0.90373983, + "memory(GiB)": 302.58, + "step": 219300, + "train_speed(iter/s)": 0.123751 + }, + { + "acc": 0.75749254, + "epoch": 1.2265413206906146, + "grad_norm": 7.125, + "learning_rate": 3.5610861223979098e-06, + "loss": 0.94546347, + "memory(GiB)": 302.58, + "step": 219320, + "train_speed(iter/s)": 0.123757 + }, + { + "acc": 0.74545679, + "epoch": 1.2266531701635939, + "grad_norm": 6.875, + "learning_rate": 3.560200561030559e-06, + "loss": 1.01123543, + "memory(GiB)": 302.58, + "step": 219340, + "train_speed(iter/s)": 0.123762 + }, + { + "acc": 0.75066652, + "epoch": 1.2267650196365731, + "grad_norm": 8.25, + "learning_rate": 3.559315048908915e-06, + "loss": 0.98453035, + "memory(GiB)": 302.58, + "step": 219360, + "train_speed(iter/s)": 0.123767 + }, + { + "acc": 0.73316875, + "epoch": 1.2268768691095524, + "grad_norm": 7.46875, + "learning_rate": 3.558429586063266e-06, + "loss": 1.04975224, + "memory(GiB)": 302.58, + "step": 219380, + "train_speed(iter/s)": 0.123773 + }, + { + "acc": 0.73334055, + "epoch": 1.2269887185825317, + "grad_norm": 5.375, + "learning_rate": 3.5575441725238945e-06, + "loss": 1.05369682, + "memory(GiB)": 302.58, + "step": 219400, + "train_speed(iter/s)": 0.123778 + }, + { + "acc": 0.7441802, + "epoch": 1.227100568055511, + "grad_norm": 5.4375, + "learning_rate": 3.556658808321087e-06, + "loss": 1.00403681, + "memory(GiB)": 302.58, + "step": 219420, + "train_speed(iter/s)": 0.123784 + }, + { + "acc": 0.73641334, + "epoch": 1.2272124175284902, + "grad_norm": 8.1875, + "learning_rate": 3.555773493485125e-06, + "loss": 1.04880028, + "memory(GiB)": 302.58, + "step": 219440, + "train_speed(iter/s)": 0.123789 + }, + { + "acc": 0.75166717, + "epoch": 1.2273242670014695, + "grad_norm": 8.1875, + "learning_rate": 3.554888228046289e-06, + "loss": 0.95284224, + "memory(GiB)": 302.58, + "step": 219460, + "train_speed(iter/s)": 0.123794 + }, + { + "acc": 0.76363354, + "epoch": 1.2274361164744487, + "grad_norm": 7.09375, + "learning_rate": 3.5540030120348564e-06, + "loss": 0.93635988, + "memory(GiB)": 302.58, + "step": 219480, + "train_speed(iter/s)": 0.123799 + }, + { + "acc": 0.74538083, + "epoch": 1.227547965947428, + "grad_norm": 6.8125, + "learning_rate": 3.5531178454811077e-06, + "loss": 1.00479736, + "memory(GiB)": 302.58, + "step": 219500, + "train_speed(iter/s)": 0.123804 + }, + { + "acc": 0.76391501, + "epoch": 1.2276598154204073, + "grad_norm": 10.125, + "learning_rate": 3.5522327284153157e-06, + "loss": 0.93547058, + "memory(GiB)": 302.58, + "step": 219520, + "train_speed(iter/s)": 0.123809 + }, + { + "acc": 0.76355171, + "epoch": 1.2277716648933865, + "grad_norm": 7.21875, + "learning_rate": 3.5513476608677555e-06, + "loss": 0.94497137, + "memory(GiB)": 302.58, + "step": 219540, + "train_speed(iter/s)": 0.123815 + }, + { + "acc": 0.74892955, + "epoch": 1.2278835143663658, + "grad_norm": 6.96875, + "learning_rate": 3.5504626428686987e-06, + "loss": 0.99295578, + "memory(GiB)": 302.58, + "step": 219560, + "train_speed(iter/s)": 0.12382 + }, + { + "acc": 0.75676723, + "epoch": 1.227995363839345, + "grad_norm": 9.25, + "learning_rate": 3.5495776744484167e-06, + "loss": 0.96214476, + "memory(GiB)": 302.58, + "step": 219580, + "train_speed(iter/s)": 0.123825 + }, + { + "acc": 0.75714202, + "epoch": 1.2281072133123243, + "grad_norm": 5.71875, + "learning_rate": 3.548692755637177e-06, + "loss": 0.94529533, + "memory(GiB)": 302.58, + "step": 219600, + "train_speed(iter/s)": 0.123831 + }, + { + "acc": 0.75085773, + "epoch": 1.2282190627853036, + "grad_norm": 7.125, + "learning_rate": 3.547807886465247e-06, + "loss": 0.9891511, + "memory(GiB)": 302.58, + "step": 219620, + "train_speed(iter/s)": 0.123836 + }, + { + "acc": 0.74253035, + "epoch": 1.2283309122582828, + "grad_norm": 9.3125, + "learning_rate": 3.5469230669628926e-06, + "loss": 1.00957947, + "memory(GiB)": 302.58, + "step": 219640, + "train_speed(iter/s)": 0.123841 + }, + { + "acc": 0.75243015, + "epoch": 1.228442761731262, + "grad_norm": 8.375, + "learning_rate": 3.5460382971603763e-06, + "loss": 0.95940886, + "memory(GiB)": 302.58, + "step": 219660, + "train_speed(iter/s)": 0.123847 + }, + { + "acc": 0.75853009, + "epoch": 1.2285546112042414, + "grad_norm": 7.5, + "learning_rate": 3.5451535770879597e-06, + "loss": 0.95051632, + "memory(GiB)": 302.58, + "step": 219680, + "train_speed(iter/s)": 0.123852 + }, + { + "acc": 0.7416563, + "epoch": 1.2286664606772206, + "grad_norm": 9.8125, + "learning_rate": 3.5442689067759052e-06, + "loss": 1.02712908, + "memory(GiB)": 302.58, + "step": 219700, + "train_speed(iter/s)": 0.123857 + }, + { + "acc": 0.73991995, + "epoch": 1.2287783101502, + "grad_norm": 8.875, + "learning_rate": 3.5433842862544697e-06, + "loss": 1.03156099, + "memory(GiB)": 302.58, + "step": 219720, + "train_speed(iter/s)": 0.123863 + }, + { + "acc": 0.75462737, + "epoch": 1.2288901596231792, + "grad_norm": 8.0, + "learning_rate": 3.5424997155539107e-06, + "loss": 0.97802629, + "memory(GiB)": 302.58, + "step": 219740, + "train_speed(iter/s)": 0.123868 + }, + { + "acc": 0.74642134, + "epoch": 1.2290020090961584, + "grad_norm": 6.5625, + "learning_rate": 3.541615194704483e-06, + "loss": 1.00307875, + "memory(GiB)": 302.58, + "step": 219760, + "train_speed(iter/s)": 0.123874 + }, + { + "acc": 0.74057555, + "epoch": 1.2291138585691377, + "grad_norm": 5.125, + "learning_rate": 3.5407307237364397e-06, + "loss": 1.03544817, + "memory(GiB)": 302.58, + "step": 219780, + "train_speed(iter/s)": 0.123879 + }, + { + "acc": 0.73097897, + "epoch": 1.229225708042117, + "grad_norm": 9.75, + "learning_rate": 3.539846302680033e-06, + "loss": 1.08619518, + "memory(GiB)": 302.58, + "step": 219800, + "train_speed(iter/s)": 0.123884 + }, + { + "acc": 0.74469218, + "epoch": 1.2293375575150962, + "grad_norm": 6.75, + "learning_rate": 3.5389619315655122e-06, + "loss": 1.03282013, + "memory(GiB)": 302.58, + "step": 219820, + "train_speed(iter/s)": 0.12389 + }, + { + "acc": 0.76040049, + "epoch": 1.2294494069880755, + "grad_norm": 5.75, + "learning_rate": 3.538077610423127e-06, + "loss": 0.95167665, + "memory(GiB)": 302.58, + "step": 219840, + "train_speed(iter/s)": 0.123895 + }, + { + "acc": 0.73630886, + "epoch": 1.2295612564610547, + "grad_norm": 6.65625, + "learning_rate": 3.537193339283123e-06, + "loss": 1.05185213, + "memory(GiB)": 302.58, + "step": 219860, + "train_speed(iter/s)": 0.1239 + }, + { + "acc": 0.76517649, + "epoch": 1.229673105934034, + "grad_norm": 9.25, + "learning_rate": 3.536309118175746e-06, + "loss": 0.95016556, + "memory(GiB)": 302.58, + "step": 219880, + "train_speed(iter/s)": 0.123905 + }, + { + "acc": 0.73988972, + "epoch": 1.2297849554070133, + "grad_norm": 6.09375, + "learning_rate": 3.535424947131237e-06, + "loss": 1.04127045, + "memory(GiB)": 302.58, + "step": 219900, + "train_speed(iter/s)": 0.12391 + }, + { + "acc": 0.75807657, + "epoch": 1.2298968048799925, + "grad_norm": 7.6875, + "learning_rate": 3.5345408261798398e-06, + "loss": 0.93679924, + "memory(GiB)": 302.58, + "step": 219920, + "train_speed(iter/s)": 0.123915 + }, + { + "acc": 0.74495983, + "epoch": 1.2300086543529718, + "grad_norm": 8.1875, + "learning_rate": 3.533656755351793e-06, + "loss": 1.01105824, + "memory(GiB)": 302.58, + "step": 219940, + "train_speed(iter/s)": 0.12392 + }, + { + "acc": 0.73980551, + "epoch": 1.230120503825951, + "grad_norm": 5.125, + "learning_rate": 3.532772734677334e-06, + "loss": 1.03451872, + "memory(GiB)": 302.58, + "step": 219960, + "train_speed(iter/s)": 0.123926 + }, + { + "acc": 0.75946059, + "epoch": 1.2302323532989303, + "grad_norm": 6.9375, + "learning_rate": 3.5318887641867004e-06, + "loss": 0.92938919, + "memory(GiB)": 302.58, + "step": 219980, + "train_speed(iter/s)": 0.123931 + }, + { + "acc": 0.74763236, + "epoch": 1.2303442027719096, + "grad_norm": 7.875, + "learning_rate": 3.5310048439101264e-06, + "loss": 0.97408895, + "memory(GiB)": 302.58, + "step": 220000, + "train_speed(iter/s)": 0.123936 + }, + { + "epoch": 1.2303442027719096, + "eval_acc": 0.7067051221037028, + "eval_loss": 1.0127238035202026, + "eval_runtime": 7496.2468, + "eval_samples_per_second": 10.043, + "eval_steps_per_second": 10.043, + "step": 220000 + }, + { + "acc": 0.73403387, + "epoch": 1.2304560522448889, + "grad_norm": 8.25, + "learning_rate": 3.530120973877845e-06, + "loss": 1.04954681, + "memory(GiB)": 302.58, + "step": 220020, + "train_speed(iter/s)": 0.123411 + }, + { + "acc": 0.74237747, + "epoch": 1.2305679017178681, + "grad_norm": 7.625, + "learning_rate": 3.5292371541200876e-06, + "loss": 1.01179781, + "memory(GiB)": 302.58, + "step": 220040, + "train_speed(iter/s)": 0.123416 + }, + { + "acc": 0.74974623, + "epoch": 1.2306797511908474, + "grad_norm": 5.9375, + "learning_rate": 3.5283533846670823e-06, + "loss": 0.97997379, + "memory(GiB)": 302.58, + "step": 220060, + "train_speed(iter/s)": 0.123421 + }, + { + "acc": 0.74861994, + "epoch": 1.2307916006638266, + "grad_norm": 8.8125, + "learning_rate": 3.5274696655490574e-06, + "loss": 0.97021103, + "memory(GiB)": 302.58, + "step": 220080, + "train_speed(iter/s)": 0.123427 + }, + { + "acc": 0.75176988, + "epoch": 1.230903450136806, + "grad_norm": 8.5625, + "learning_rate": 3.52658599679624e-06, + "loss": 0.98152676, + "memory(GiB)": 302.58, + "step": 220100, + "train_speed(iter/s)": 0.123433 + }, + { + "acc": 0.75348682, + "epoch": 1.2310152996097852, + "grad_norm": 6.25, + "learning_rate": 3.525702378438853e-06, + "loss": 0.96394949, + "memory(GiB)": 302.58, + "step": 220120, + "train_speed(iter/s)": 0.123438 + }, + { + "acc": 0.73733416, + "epoch": 1.2311271490827644, + "grad_norm": 8.8125, + "learning_rate": 3.5248188105071207e-06, + "loss": 1.03748293, + "memory(GiB)": 302.58, + "step": 220140, + "train_speed(iter/s)": 0.123443 + }, + { + "acc": 0.73953919, + "epoch": 1.2312389985557437, + "grad_norm": 5.21875, + "learning_rate": 3.523935293031262e-06, + "loss": 1.02923059, + "memory(GiB)": 302.58, + "step": 220160, + "train_speed(iter/s)": 0.123449 + }, + { + "acc": 0.74966803, + "epoch": 1.231350848028723, + "grad_norm": 5.3125, + "learning_rate": 3.523051826041497e-06, + "loss": 0.98659821, + "memory(GiB)": 302.58, + "step": 220180, + "train_speed(iter/s)": 0.123454 + }, + { + "acc": 0.74443502, + "epoch": 1.2314626975017022, + "grad_norm": 10.0, + "learning_rate": 3.522168409568043e-06, + "loss": 0.99931679, + "memory(GiB)": 302.58, + "step": 220200, + "train_speed(iter/s)": 0.12346 + }, + { + "acc": 0.7530303, + "epoch": 1.2315745469746815, + "grad_norm": 6.71875, + "learning_rate": 3.521285043641114e-06, + "loss": 0.97506676, + "memory(GiB)": 302.58, + "step": 220220, + "train_speed(iter/s)": 0.123465 + }, + { + "acc": 0.76190548, + "epoch": 1.2316863964476608, + "grad_norm": 6.71875, + "learning_rate": 3.5204017282909263e-06, + "loss": 0.94883947, + "memory(GiB)": 302.58, + "step": 220240, + "train_speed(iter/s)": 0.12347 + }, + { + "acc": 0.75240941, + "epoch": 1.23179824592064, + "grad_norm": 6.53125, + "learning_rate": 3.519518463547692e-06, + "loss": 0.95397329, + "memory(GiB)": 302.58, + "step": 220260, + "train_speed(iter/s)": 0.123476 + }, + { + "acc": 0.74580116, + "epoch": 1.2319100953936193, + "grad_norm": 6.96875, + "learning_rate": 3.5186352494416196e-06, + "loss": 0.99060364, + "memory(GiB)": 302.58, + "step": 220280, + "train_speed(iter/s)": 0.123481 + }, + { + "acc": 0.7613555, + "epoch": 1.2320219448665986, + "grad_norm": 9.3125, + "learning_rate": 3.5177520860029198e-06, + "loss": 0.92315216, + "memory(GiB)": 302.58, + "step": 220300, + "train_speed(iter/s)": 0.123486 + }, + { + "acc": 0.734377, + "epoch": 1.2321337943395778, + "grad_norm": 5.3125, + "learning_rate": 3.516868973261798e-06, + "loss": 1.04976959, + "memory(GiB)": 302.58, + "step": 220320, + "train_speed(iter/s)": 0.123492 + }, + { + "acc": 0.74472532, + "epoch": 1.232245643812557, + "grad_norm": 4.84375, + "learning_rate": 3.5159859112484607e-06, + "loss": 0.99939222, + "memory(GiB)": 302.58, + "step": 220340, + "train_speed(iter/s)": 0.123497 + }, + { + "acc": 0.74836421, + "epoch": 1.2323574932855363, + "grad_norm": 6.0, + "learning_rate": 3.51510289999311e-06, + "loss": 0.99745626, + "memory(GiB)": 302.58, + "step": 220360, + "train_speed(iter/s)": 0.123502 + }, + { + "acc": 0.75399809, + "epoch": 1.2324693427585156, + "grad_norm": 7.65625, + "learning_rate": 3.5142199395259496e-06, + "loss": 0.9536684, + "memory(GiB)": 302.58, + "step": 220380, + "train_speed(iter/s)": 0.123508 + }, + { + "acc": 0.7544425, + "epoch": 1.2325811922314949, + "grad_norm": 8.125, + "learning_rate": 3.5133370298771787e-06, + "loss": 0.96182928, + "memory(GiB)": 302.58, + "step": 220400, + "train_speed(iter/s)": 0.123513 + }, + { + "acc": 0.78239989, + "epoch": 1.2326930417044741, + "grad_norm": 8.9375, + "learning_rate": 3.512454171076995e-06, + "loss": 0.84894934, + "memory(GiB)": 302.58, + "step": 220420, + "train_speed(iter/s)": 0.123519 + }, + { + "acc": 0.73739333, + "epoch": 1.2328048911774534, + "grad_norm": 9.5625, + "learning_rate": 3.5115713631555957e-06, + "loss": 1.01137352, + "memory(GiB)": 302.58, + "step": 220440, + "train_speed(iter/s)": 0.123524 + }, + { + "acc": 0.75735264, + "epoch": 1.2329167406504327, + "grad_norm": 4.75, + "learning_rate": 3.5106886061431753e-06, + "loss": 0.96951389, + "memory(GiB)": 302.58, + "step": 220460, + "train_speed(iter/s)": 0.123529 + }, + { + "acc": 0.743221, + "epoch": 1.233028590123412, + "grad_norm": 7.6875, + "learning_rate": 3.5098059000699273e-06, + "loss": 1.00267458, + "memory(GiB)": 302.58, + "step": 220480, + "train_speed(iter/s)": 0.123535 + }, + { + "acc": 0.76296682, + "epoch": 1.2331404395963912, + "grad_norm": 7.6875, + "learning_rate": 3.5089232449660415e-06, + "loss": 0.92739697, + "memory(GiB)": 302.58, + "step": 220500, + "train_speed(iter/s)": 0.12354 + }, + { + "acc": 0.73992772, + "epoch": 1.2332522890693705, + "grad_norm": 7.625, + "learning_rate": 3.50804064086171e-06, + "loss": 1.01583176, + "memory(GiB)": 302.58, + "step": 220520, + "train_speed(iter/s)": 0.123545 + }, + { + "acc": 0.76232524, + "epoch": 1.2333641385423497, + "grad_norm": 7.09375, + "learning_rate": 3.50715808778712e-06, + "loss": 0.93141041, + "memory(GiB)": 302.58, + "step": 220540, + "train_speed(iter/s)": 0.12355 + }, + { + "acc": 0.74277902, + "epoch": 1.233475988015329, + "grad_norm": 6.46875, + "learning_rate": 3.506275585772456e-06, + "loss": 1.02868719, + "memory(GiB)": 302.58, + "step": 220560, + "train_speed(iter/s)": 0.123556 + }, + { + "acc": 0.75207019, + "epoch": 1.2335878374883082, + "grad_norm": 8.0625, + "learning_rate": 3.5053931348479043e-06, + "loss": 0.97592659, + "memory(GiB)": 302.58, + "step": 220580, + "train_speed(iter/s)": 0.123561 + }, + { + "acc": 0.73635945, + "epoch": 1.2336996869612875, + "grad_norm": 6.1875, + "learning_rate": 3.5045107350436465e-06, + "loss": 1.04261255, + "memory(GiB)": 302.58, + "step": 220600, + "train_speed(iter/s)": 0.123567 + }, + { + "acc": 0.75268636, + "epoch": 1.2338115364342668, + "grad_norm": 8.4375, + "learning_rate": 3.5036283863898635e-06, + "loss": 0.96931324, + "memory(GiB)": 302.58, + "step": 220620, + "train_speed(iter/s)": 0.123572 + }, + { + "acc": 0.75719838, + "epoch": 1.233923385907246, + "grad_norm": 9.25, + "learning_rate": 3.502746088916733e-06, + "loss": 0.94068422, + "memory(GiB)": 302.58, + "step": 220640, + "train_speed(iter/s)": 0.123577 + }, + { + "acc": 0.7650279, + "epoch": 1.2340352353802253, + "grad_norm": 7.8125, + "learning_rate": 3.501863842654436e-06, + "loss": 0.93616018, + "memory(GiB)": 302.58, + "step": 220660, + "train_speed(iter/s)": 0.123583 + }, + { + "acc": 0.75674515, + "epoch": 1.2341470848532046, + "grad_norm": 6.96875, + "learning_rate": 3.500981647633145e-06, + "loss": 0.94952478, + "memory(GiB)": 302.58, + "step": 220680, + "train_speed(iter/s)": 0.123588 + }, + { + "acc": 0.76328306, + "epoch": 1.2342589343261838, + "grad_norm": 6.21875, + "learning_rate": 3.5000995038830354e-06, + "loss": 0.91623821, + "memory(GiB)": 302.58, + "step": 220700, + "train_speed(iter/s)": 0.123593 + }, + { + "acc": 0.7590189, + "epoch": 1.234370783799163, + "grad_norm": 9.9375, + "learning_rate": 3.4992174114342793e-06, + "loss": 0.92461138, + "memory(GiB)": 302.58, + "step": 220720, + "train_speed(iter/s)": 0.123598 + }, + { + "acc": 0.75366206, + "epoch": 1.2344826332721424, + "grad_norm": 6.46875, + "learning_rate": 3.498335370317044e-06, + "loss": 0.95848989, + "memory(GiB)": 302.58, + "step": 220740, + "train_speed(iter/s)": 0.123604 + }, + { + "acc": 0.73381472, + "epoch": 1.2345944827451216, + "grad_norm": 8.75, + "learning_rate": 3.4974533805615022e-06, + "loss": 1.06818209, + "memory(GiB)": 302.58, + "step": 220760, + "train_speed(iter/s)": 0.123609 + }, + { + "acc": 0.7445972, + "epoch": 1.2347063322181009, + "grad_norm": 5.59375, + "learning_rate": 3.4965714421978204e-06, + "loss": 1.01656666, + "memory(GiB)": 302.58, + "step": 220780, + "train_speed(iter/s)": 0.123614 + }, + { + "acc": 0.7553472, + "epoch": 1.2348181816910802, + "grad_norm": 7.125, + "learning_rate": 3.4956895552561616e-06, + "loss": 0.96004744, + "memory(GiB)": 302.58, + "step": 220800, + "train_speed(iter/s)": 0.12362 + }, + { + "acc": 0.75880585, + "epoch": 1.2349300311640594, + "grad_norm": 8.5625, + "learning_rate": 3.4948077197666907e-06, + "loss": 0.94857121, + "memory(GiB)": 302.58, + "step": 220820, + "train_speed(iter/s)": 0.123625 + }, + { + "acc": 0.75785661, + "epoch": 1.2350418806370387, + "grad_norm": 7.78125, + "learning_rate": 3.493925935759567e-06, + "loss": 0.95801172, + "memory(GiB)": 302.58, + "step": 220840, + "train_speed(iter/s)": 0.12363 + }, + { + "acc": 0.76408601, + "epoch": 1.235153730110018, + "grad_norm": 9.0625, + "learning_rate": 3.493044203264953e-06, + "loss": 0.91434803, + "memory(GiB)": 302.58, + "step": 220860, + "train_speed(iter/s)": 0.123636 + }, + { + "acc": 0.75608721, + "epoch": 1.2352655795829972, + "grad_norm": 8.375, + "learning_rate": 3.492162522313006e-06, + "loss": 0.95466356, + "memory(GiB)": 302.58, + "step": 220880, + "train_speed(iter/s)": 0.123641 + }, + { + "acc": 0.74508362, + "epoch": 1.2353774290559765, + "grad_norm": 8.25, + "learning_rate": 3.4912808929338814e-06, + "loss": 1.01016273, + "memory(GiB)": 302.58, + "step": 220900, + "train_speed(iter/s)": 0.123646 + }, + { + "acc": 0.7448071, + "epoch": 1.2354892785289557, + "grad_norm": 7.0, + "learning_rate": 3.490399315157735e-06, + "loss": 1.00812845, + "memory(GiB)": 302.58, + "step": 220920, + "train_speed(iter/s)": 0.123652 + }, + { + "acc": 0.77110357, + "epoch": 1.235601128001935, + "grad_norm": 5.65625, + "learning_rate": 3.489517789014719e-06, + "loss": 0.88946762, + "memory(GiB)": 302.58, + "step": 220940, + "train_speed(iter/s)": 0.123657 + }, + { + "acc": 0.73265896, + "epoch": 1.2357129774749143, + "grad_norm": 7.125, + "learning_rate": 3.4886363145349824e-06, + "loss": 1.06954737, + "memory(GiB)": 302.58, + "step": 220960, + "train_speed(iter/s)": 0.123663 + }, + { + "acc": 0.76245122, + "epoch": 1.2358248269478935, + "grad_norm": 5.84375, + "learning_rate": 3.4877548917486783e-06, + "loss": 0.92426128, + "memory(GiB)": 302.58, + "step": 220980, + "train_speed(iter/s)": 0.123668 + }, + { + "acc": 0.75657258, + "epoch": 1.2359366764208728, + "grad_norm": 6.46875, + "learning_rate": 3.486873520685951e-06, + "loss": 0.95474396, + "memory(GiB)": 302.58, + "step": 221000, + "train_speed(iter/s)": 0.123674 + }, + { + "acc": 0.75833273, + "epoch": 1.236048525893852, + "grad_norm": 7.84375, + "learning_rate": 3.4859922013769476e-06, + "loss": 0.94485493, + "memory(GiB)": 302.58, + "step": 221020, + "train_speed(iter/s)": 0.123679 + }, + { + "acc": 0.76160126, + "epoch": 1.2361603753668313, + "grad_norm": 10.25, + "learning_rate": 3.485110933851812e-06, + "loss": 0.94670258, + "memory(GiB)": 302.58, + "step": 221040, + "train_speed(iter/s)": 0.123684 + }, + { + "acc": 0.75889602, + "epoch": 1.2362722248398106, + "grad_norm": 5.53125, + "learning_rate": 3.484229718140686e-06, + "loss": 0.94514999, + "memory(GiB)": 302.58, + "step": 221060, + "train_speed(iter/s)": 0.12369 + }, + { + "acc": 0.75205097, + "epoch": 1.2363840743127898, + "grad_norm": 10.3125, + "learning_rate": 3.48334855427371e-06, + "loss": 0.98301516, + "memory(GiB)": 302.58, + "step": 221080, + "train_speed(iter/s)": 0.123695 + }, + { + "acc": 0.75769606, + "epoch": 1.2364959237857691, + "grad_norm": 6.03125, + "learning_rate": 3.4824674422810214e-06, + "loss": 0.94413834, + "memory(GiB)": 302.58, + "step": 221100, + "train_speed(iter/s)": 0.123701 + }, + { + "acc": 0.73645787, + "epoch": 1.2366077732587484, + "grad_norm": 8.875, + "learning_rate": 3.481586382192759e-06, + "loss": 1.03294964, + "memory(GiB)": 302.58, + "step": 221120, + "train_speed(iter/s)": 0.123706 + }, + { + "acc": 0.74644737, + "epoch": 1.2367196227317276, + "grad_norm": 6.5625, + "learning_rate": 3.4807053740390573e-06, + "loss": 1.00773621, + "memory(GiB)": 302.58, + "step": 221140, + "train_speed(iter/s)": 0.123711 + }, + { + "acc": 0.75143418, + "epoch": 1.236831472204707, + "grad_norm": 7.75, + "learning_rate": 3.4798244178500497e-06, + "loss": 0.97216921, + "memory(GiB)": 302.58, + "step": 221160, + "train_speed(iter/s)": 0.123717 + }, + { + "acc": 0.76239905, + "epoch": 1.2369433216776862, + "grad_norm": 6.875, + "learning_rate": 3.4789435136558674e-06, + "loss": 0.91846104, + "memory(GiB)": 302.58, + "step": 221180, + "train_speed(iter/s)": 0.123722 + }, + { + "acc": 0.74148102, + "epoch": 1.2370551711506654, + "grad_norm": 5.75, + "learning_rate": 3.4780626614866397e-06, + "loss": 1.01591473, + "memory(GiB)": 302.58, + "step": 221200, + "train_speed(iter/s)": 0.123727 + }, + { + "acc": 0.75374579, + "epoch": 1.2371670206236447, + "grad_norm": 9.4375, + "learning_rate": 3.477181861372495e-06, + "loss": 0.95718546, + "memory(GiB)": 302.58, + "step": 221220, + "train_speed(iter/s)": 0.123732 + }, + { + "acc": 0.74840431, + "epoch": 1.237278870096624, + "grad_norm": 6.96875, + "learning_rate": 3.476301113343559e-06, + "loss": 1.00426893, + "memory(GiB)": 302.58, + "step": 221240, + "train_speed(iter/s)": 0.123737 + }, + { + "acc": 0.75657544, + "epoch": 1.2373907195696032, + "grad_norm": 9.0625, + "learning_rate": 3.4754204174299565e-06, + "loss": 0.93772402, + "memory(GiB)": 302.58, + "step": 221260, + "train_speed(iter/s)": 0.123743 + }, + { + "acc": 0.75640349, + "epoch": 1.2375025690425825, + "grad_norm": 5.75, + "learning_rate": 3.4745397736618102e-06, + "loss": 0.97581615, + "memory(GiB)": 302.58, + "step": 221280, + "train_speed(iter/s)": 0.123748 + }, + { + "acc": 0.75683317, + "epoch": 1.2376144185155618, + "grad_norm": 7.46875, + "learning_rate": 3.473659182069241e-06, + "loss": 0.95300217, + "memory(GiB)": 302.58, + "step": 221300, + "train_speed(iter/s)": 0.123753 + }, + { + "acc": 0.75204587, + "epoch": 1.237726267988541, + "grad_norm": 5.25, + "learning_rate": 3.4727786426823675e-06, + "loss": 0.97634249, + "memory(GiB)": 302.58, + "step": 221320, + "train_speed(iter/s)": 0.123758 + }, + { + "acc": 0.76284628, + "epoch": 1.2378381174615203, + "grad_norm": 8.875, + "learning_rate": 3.471898155531307e-06, + "loss": 0.93506088, + "memory(GiB)": 302.58, + "step": 221340, + "train_speed(iter/s)": 0.123763 + }, + { + "acc": 0.75958252, + "epoch": 1.2379499669344995, + "grad_norm": 6.15625, + "learning_rate": 3.471017720646176e-06, + "loss": 0.94840832, + "memory(GiB)": 302.58, + "step": 221360, + "train_speed(iter/s)": 0.123769 + }, + { + "acc": 0.7374095, + "epoch": 1.2380618164074788, + "grad_norm": 5.03125, + "learning_rate": 3.4701373380570847e-06, + "loss": 1.04342575, + "memory(GiB)": 302.58, + "step": 221380, + "train_speed(iter/s)": 0.123774 + }, + { + "acc": 0.74028172, + "epoch": 1.238173665880458, + "grad_norm": 7.59375, + "learning_rate": 3.4692570077941497e-06, + "loss": 1.00813951, + "memory(GiB)": 302.58, + "step": 221400, + "train_speed(iter/s)": 0.12378 + }, + { + "acc": 0.75451498, + "epoch": 1.2382855153534373, + "grad_norm": 7.0625, + "learning_rate": 3.4683767298874782e-06, + "loss": 0.97657919, + "memory(GiB)": 302.58, + "step": 221420, + "train_speed(iter/s)": 0.123785 + }, + { + "acc": 0.73814316, + "epoch": 1.2383973648264166, + "grad_norm": 9.0, + "learning_rate": 3.4674965043671793e-06, + "loss": 1.0616683, + "memory(GiB)": 302.58, + "step": 221440, + "train_speed(iter/s)": 0.12379 + }, + { + "acc": 0.74490333, + "epoch": 1.2385092142993959, + "grad_norm": 7.8125, + "learning_rate": 3.46661633126336e-06, + "loss": 1.01728954, + "memory(GiB)": 302.58, + "step": 221460, + "train_speed(iter/s)": 0.123795 + }, + { + "acc": 0.78478532, + "epoch": 1.2386210637723751, + "grad_norm": 8.5, + "learning_rate": 3.4657362106061236e-06, + "loss": 0.84968796, + "memory(GiB)": 302.58, + "step": 221480, + "train_speed(iter/s)": 0.1238 + }, + { + "acc": 0.75195565, + "epoch": 1.2387329132453544, + "grad_norm": 6.9375, + "learning_rate": 3.4648561424255743e-06, + "loss": 1.00043201, + "memory(GiB)": 302.58, + "step": 221500, + "train_speed(iter/s)": 0.123806 + }, + { + "acc": 0.74945388, + "epoch": 1.2388447627183337, + "grad_norm": 15.0625, + "learning_rate": 3.463976126751812e-06, + "loss": 1.00141411, + "memory(GiB)": 302.58, + "step": 221520, + "train_speed(iter/s)": 0.123811 + }, + { + "acc": 0.76409545, + "epoch": 1.238956612191313, + "grad_norm": 9.5, + "learning_rate": 3.4630961636149375e-06, + "loss": 0.91514225, + "memory(GiB)": 302.58, + "step": 221540, + "train_speed(iter/s)": 0.123816 + }, + { + "acc": 0.75274892, + "epoch": 1.2390684616642922, + "grad_norm": 10.875, + "learning_rate": 3.462216253045047e-06, + "loss": 0.96062393, + "memory(GiB)": 302.58, + "step": 221560, + "train_speed(iter/s)": 0.123822 + }, + { + "acc": 0.73686013, + "epoch": 1.2391803111372715, + "grad_norm": 9.375, + "learning_rate": 3.4613363950722373e-06, + "loss": 1.04536743, + "memory(GiB)": 302.58, + "step": 221580, + "train_speed(iter/s)": 0.123827 + }, + { + "acc": 0.76341481, + "epoch": 1.2392921606102507, + "grad_norm": 9.25, + "learning_rate": 3.460456589726602e-06, + "loss": 0.93516741, + "memory(GiB)": 302.58, + "step": 221600, + "train_speed(iter/s)": 0.123832 + }, + { + "acc": 0.75912185, + "epoch": 1.23940401008323, + "grad_norm": 7.4375, + "learning_rate": 3.4595768370382334e-06, + "loss": 0.92415514, + "memory(GiB)": 302.58, + "step": 221620, + "train_speed(iter/s)": 0.123838 + }, + { + "acc": 0.76469283, + "epoch": 1.2395158595562092, + "grad_norm": 9.5, + "learning_rate": 3.45869713703722e-06, + "loss": 0.92685719, + "memory(GiB)": 302.58, + "step": 221640, + "train_speed(iter/s)": 0.123843 + }, + { + "acc": 0.73443055, + "epoch": 1.2396277090291885, + "grad_norm": 6.09375, + "learning_rate": 3.457817489753652e-06, + "loss": 1.03832102, + "memory(GiB)": 302.58, + "step": 221660, + "train_speed(iter/s)": 0.123848 + }, + { + "acc": 0.75360293, + "epoch": 1.2397395585021678, + "grad_norm": 7.25, + "learning_rate": 3.456937895217617e-06, + "loss": 0.97405252, + "memory(GiB)": 302.58, + "step": 221680, + "train_speed(iter/s)": 0.123853 + }, + { + "acc": 0.73056622, + "epoch": 1.239851407975147, + "grad_norm": 5.9375, + "learning_rate": 3.456058353459199e-06, + "loss": 1.09924698, + "memory(GiB)": 302.58, + "step": 221700, + "train_speed(iter/s)": 0.123858 + }, + { + "acc": 0.73775392, + "epoch": 1.2399632574481263, + "grad_norm": 8.75, + "learning_rate": 3.4551788645084806e-06, + "loss": 1.03581486, + "memory(GiB)": 302.58, + "step": 221720, + "train_speed(iter/s)": 0.123863 + }, + { + "acc": 0.76350064, + "epoch": 1.2400751069211056, + "grad_norm": 8.25, + "learning_rate": 3.4542994283955446e-06, + "loss": 0.90840139, + "memory(GiB)": 302.58, + "step": 221740, + "train_speed(iter/s)": 0.123868 + }, + { + "acc": 0.74271865, + "epoch": 1.2401869563940848, + "grad_norm": 7.40625, + "learning_rate": 3.453420045150468e-06, + "loss": 1.04042158, + "memory(GiB)": 302.58, + "step": 221760, + "train_speed(iter/s)": 0.123873 + }, + { + "acc": 0.75196471, + "epoch": 1.240298805867064, + "grad_norm": 7.625, + "learning_rate": 3.4525407148033295e-06, + "loss": 0.96598854, + "memory(GiB)": 302.58, + "step": 221780, + "train_speed(iter/s)": 0.123879 + }, + { + "acc": 0.769981, + "epoch": 1.2404106553400434, + "grad_norm": 4.96875, + "learning_rate": 3.451661437384206e-06, + "loss": 0.89995699, + "memory(GiB)": 302.58, + "step": 221800, + "train_speed(iter/s)": 0.123884 + }, + { + "acc": 0.74953437, + "epoch": 1.2405225048130226, + "grad_norm": 8.125, + "learning_rate": 3.450782212923172e-06, + "loss": 0.97507582, + "memory(GiB)": 302.58, + "step": 221820, + "train_speed(iter/s)": 0.12389 + }, + { + "acc": 0.76042848, + "epoch": 1.2406343542860019, + "grad_norm": 5.875, + "learning_rate": 3.4499030414502977e-06, + "loss": 0.95574379, + "memory(GiB)": 302.58, + "step": 221840, + "train_speed(iter/s)": 0.123894 + }, + { + "acc": 0.74524918, + "epoch": 1.2407462037589811, + "grad_norm": 7.8125, + "learning_rate": 3.4490239229956556e-06, + "loss": 0.98821411, + "memory(GiB)": 302.58, + "step": 221860, + "train_speed(iter/s)": 0.1239 + }, + { + "acc": 0.7508316, + "epoch": 1.2408580532319604, + "grad_norm": 8.25, + "learning_rate": 3.4481448575893126e-06, + "loss": 1.00275173, + "memory(GiB)": 302.58, + "step": 221880, + "train_speed(iter/s)": 0.123904 + }, + { + "acc": 0.74778538, + "epoch": 1.2409699027049397, + "grad_norm": 9.375, + "learning_rate": 3.447265845261336e-06, + "loss": 1.02558527, + "memory(GiB)": 302.58, + "step": 221900, + "train_speed(iter/s)": 0.123909 + }, + { + "acc": 0.76371784, + "epoch": 1.241081752177919, + "grad_norm": 8.0, + "learning_rate": 3.446386886041791e-06, + "loss": 0.93613281, + "memory(GiB)": 302.58, + "step": 221920, + "train_speed(iter/s)": 0.123914 + }, + { + "acc": 0.7332994, + "epoch": 1.2411936016508982, + "grad_norm": 5.15625, + "learning_rate": 3.4455079799607417e-06, + "loss": 1.04230442, + "memory(GiB)": 302.58, + "step": 221940, + "train_speed(iter/s)": 0.123919 + }, + { + "acc": 0.74563012, + "epoch": 1.2413054511238775, + "grad_norm": 5.90625, + "learning_rate": 3.4446291270482477e-06, + "loss": 0.99193687, + "memory(GiB)": 302.58, + "step": 221960, + "train_speed(iter/s)": 0.123924 + }, + { + "acc": 0.74012175, + "epoch": 1.2414173005968567, + "grad_norm": 5.25, + "learning_rate": 3.4437503273343687e-06, + "loss": 1.0226963, + "memory(GiB)": 302.58, + "step": 221980, + "train_speed(iter/s)": 0.123929 + }, + { + "acc": 0.74606028, + "epoch": 1.241529150069836, + "grad_norm": 9.875, + "learning_rate": 3.442871580849164e-06, + "loss": 1.01339741, + "memory(GiB)": 302.58, + "step": 222000, + "train_speed(iter/s)": 0.123934 + }, + { + "epoch": 1.241529150069836, + "eval_acc": 0.7067297212000534, + "eval_loss": 1.0128016471862793, + "eval_runtime": 7551.4604, + "eval_samples_per_second": 9.969, + "eval_steps_per_second": 9.969, + "step": 222000 + }, + { + "acc": 0.74555063, + "epoch": 1.2416409995428153, + "grad_norm": 6.03125, + "learning_rate": 3.44199288762269e-06, + "loss": 1.02568378, + "memory(GiB)": 302.58, + "step": 222020, + "train_speed(iter/s)": 0.12341 + }, + { + "acc": 0.75694079, + "epoch": 1.2417528490157945, + "grad_norm": 5.75, + "learning_rate": 3.441114247684999e-06, + "loss": 0.94617529, + "memory(GiB)": 302.58, + "step": 222040, + "train_speed(iter/s)": 0.123415 + }, + { + "acc": 0.77067637, + "epoch": 1.2418646984887738, + "grad_norm": 5.25, + "learning_rate": 3.4402356610661437e-06, + "loss": 0.88926649, + "memory(GiB)": 302.58, + "step": 222060, + "train_speed(iter/s)": 0.12342 + }, + { + "acc": 0.74219952, + "epoch": 1.241976547961753, + "grad_norm": 5.90625, + "learning_rate": 3.4393571277961746e-06, + "loss": 1.0227356, + "memory(GiB)": 302.58, + "step": 222080, + "train_speed(iter/s)": 0.123425 + }, + { + "acc": 0.76134238, + "epoch": 1.2420883974347323, + "grad_norm": 9.9375, + "learning_rate": 3.4384786479051415e-06, + "loss": 0.96677732, + "memory(GiB)": 302.58, + "step": 222100, + "train_speed(iter/s)": 0.12343 + }, + { + "acc": 0.75932798, + "epoch": 1.2422002469077116, + "grad_norm": 6.75, + "learning_rate": 3.4376002214230876e-06, + "loss": 0.9749651, + "memory(GiB)": 302.58, + "step": 222120, + "train_speed(iter/s)": 0.123436 + }, + { + "acc": 0.76136537, + "epoch": 1.2423120963806908, + "grad_norm": 6.34375, + "learning_rate": 3.4367218483800625e-06, + "loss": 0.93522482, + "memory(GiB)": 302.58, + "step": 222140, + "train_speed(iter/s)": 0.123441 + }, + { + "acc": 0.74508376, + "epoch": 1.24242394585367, + "grad_norm": 5.96875, + "learning_rate": 3.4358435288061066e-06, + "loss": 1.00602655, + "memory(GiB)": 302.58, + "step": 222160, + "train_speed(iter/s)": 0.123447 + }, + { + "acc": 0.74869986, + "epoch": 1.2425357953266494, + "grad_norm": 5.96875, + "learning_rate": 3.4349652627312615e-06, + "loss": 0.99550629, + "memory(GiB)": 302.58, + "step": 222180, + "train_speed(iter/s)": 0.123452 + }, + { + "acc": 0.7580698, + "epoch": 1.2426476447996286, + "grad_norm": 7.75, + "learning_rate": 3.4340870501855672e-06, + "loss": 0.95186167, + "memory(GiB)": 302.58, + "step": 222200, + "train_speed(iter/s)": 0.123457 + }, + { + "acc": 0.7674058, + "epoch": 1.242759494272608, + "grad_norm": 10.5, + "learning_rate": 3.4332088911990614e-06, + "loss": 0.90236607, + "memory(GiB)": 302.58, + "step": 222220, + "train_speed(iter/s)": 0.123462 + }, + { + "acc": 0.75641084, + "epoch": 1.2428713437455872, + "grad_norm": 7.4375, + "learning_rate": 3.4323307858017797e-06, + "loss": 0.94919224, + "memory(GiB)": 302.58, + "step": 222240, + "train_speed(iter/s)": 0.123467 + }, + { + "acc": 0.75255756, + "epoch": 1.2429831932185664, + "grad_norm": 6.15625, + "learning_rate": 3.4314527340237546e-06, + "loss": 0.97422724, + "memory(GiB)": 302.58, + "step": 222260, + "train_speed(iter/s)": 0.123473 + }, + { + "acc": 0.75928183, + "epoch": 1.2430950426915457, + "grad_norm": 7.15625, + "learning_rate": 3.4305747358950214e-06, + "loss": 0.9330492, + "memory(GiB)": 302.58, + "step": 222280, + "train_speed(iter/s)": 0.123478 + }, + { + "acc": 0.73745818, + "epoch": 1.243206892164525, + "grad_norm": 7.5625, + "learning_rate": 3.4296967914456086e-06, + "loss": 1.03990269, + "memory(GiB)": 302.58, + "step": 222300, + "train_speed(iter/s)": 0.123483 + }, + { + "acc": 0.75367932, + "epoch": 1.2433187416375042, + "grad_norm": 6.84375, + "learning_rate": 3.428818900705545e-06, + "loss": 0.96656666, + "memory(GiB)": 302.58, + "step": 222320, + "train_speed(iter/s)": 0.123488 + }, + { + "acc": 0.74916987, + "epoch": 1.2434305911104835, + "grad_norm": 5.84375, + "learning_rate": 3.4279410637048566e-06, + "loss": 0.98782711, + "memory(GiB)": 302.58, + "step": 222340, + "train_speed(iter/s)": 0.123493 + }, + { + "acc": 0.75116696, + "epoch": 1.2435424405834627, + "grad_norm": 7.71875, + "learning_rate": 3.427063280473569e-06, + "loss": 0.98344164, + "memory(GiB)": 302.58, + "step": 222360, + "train_speed(iter/s)": 0.123498 + }, + { + "acc": 0.74328251, + "epoch": 1.243654290056442, + "grad_norm": 6.875, + "learning_rate": 3.4261855510417046e-06, + "loss": 1.01885271, + "memory(GiB)": 302.58, + "step": 222380, + "train_speed(iter/s)": 0.123504 + }, + { + "acc": 0.7714632, + "epoch": 1.2437661395294213, + "grad_norm": 4.8125, + "learning_rate": 3.425307875439284e-06, + "loss": 0.87562866, + "memory(GiB)": 302.58, + "step": 222400, + "train_speed(iter/s)": 0.12351 + }, + { + "acc": 0.76050315, + "epoch": 1.2438779890024005, + "grad_norm": 6.3125, + "learning_rate": 3.424430253696328e-06, + "loss": 0.9275424, + "memory(GiB)": 302.58, + "step": 222420, + "train_speed(iter/s)": 0.123515 + }, + { + "acc": 0.75559821, + "epoch": 1.2439898384753798, + "grad_norm": 5.75, + "learning_rate": 3.4235526858428536e-06, + "loss": 0.96663313, + "memory(GiB)": 302.58, + "step": 222440, + "train_speed(iter/s)": 0.12352 + }, + { + "acc": 0.75803313, + "epoch": 1.244101687948359, + "grad_norm": 6.40625, + "learning_rate": 3.4226751719088758e-06, + "loss": 0.95261402, + "memory(GiB)": 302.58, + "step": 222460, + "train_speed(iter/s)": 0.123525 + }, + { + "acc": 0.75478287, + "epoch": 1.2442135374213383, + "grad_norm": 9.3125, + "learning_rate": 3.4217977119244093e-06, + "loss": 0.95417118, + "memory(GiB)": 302.58, + "step": 222480, + "train_speed(iter/s)": 0.123531 + }, + { + "acc": 0.76677661, + "epoch": 1.2443253868943176, + "grad_norm": 8.6875, + "learning_rate": 3.4209203059194653e-06, + "loss": 0.91796608, + "memory(GiB)": 302.58, + "step": 222500, + "train_speed(iter/s)": 0.123536 + }, + { + "acc": 0.73469162, + "epoch": 1.2444372363672969, + "grad_norm": 9.375, + "learning_rate": 3.420042953924052e-06, + "loss": 1.05750256, + "memory(GiB)": 302.58, + "step": 222520, + "train_speed(iter/s)": 0.123542 + }, + { + "acc": 0.77289271, + "epoch": 1.2445490858402761, + "grad_norm": 7.96875, + "learning_rate": 3.4191656559681814e-06, + "loss": 0.86827154, + "memory(GiB)": 302.58, + "step": 222540, + "train_speed(iter/s)": 0.123547 + }, + { + "acc": 0.77017202, + "epoch": 1.2446609353132554, + "grad_norm": 5.71875, + "learning_rate": 3.4182884120818584e-06, + "loss": 0.88649979, + "memory(GiB)": 302.58, + "step": 222560, + "train_speed(iter/s)": 0.123552 + }, + { + "acc": 0.7356863, + "epoch": 1.2447727847862347, + "grad_norm": 11.875, + "learning_rate": 3.4174112222950863e-06, + "loss": 1.04341259, + "memory(GiB)": 302.58, + "step": 222580, + "train_speed(iter/s)": 0.123557 + }, + { + "acc": 0.74212384, + "epoch": 1.244884634259214, + "grad_norm": 7.78125, + "learning_rate": 3.4165340866378688e-06, + "loss": 1.01441717, + "memory(GiB)": 302.58, + "step": 222600, + "train_speed(iter/s)": 0.123562 + }, + { + "acc": 0.7537796, + "epoch": 1.2449964837321932, + "grad_norm": 7.25, + "learning_rate": 3.415657005140207e-06, + "loss": 0.96321096, + "memory(GiB)": 302.58, + "step": 222620, + "train_speed(iter/s)": 0.123567 + }, + { + "acc": 0.74464598, + "epoch": 1.2451083332051724, + "grad_norm": 5.46875, + "learning_rate": 3.414779977832099e-06, + "loss": 0.99101, + "memory(GiB)": 302.58, + "step": 222640, + "train_speed(iter/s)": 0.123572 + }, + { + "acc": 0.74912868, + "epoch": 1.2452201826781517, + "grad_norm": 6.75, + "learning_rate": 3.4139030047435415e-06, + "loss": 1.01997204, + "memory(GiB)": 302.58, + "step": 222660, + "train_speed(iter/s)": 0.123578 + }, + { + "acc": 0.76078062, + "epoch": 1.245332032151131, + "grad_norm": 7.09375, + "learning_rate": 3.4130260859045315e-06, + "loss": 0.94952374, + "memory(GiB)": 302.58, + "step": 222680, + "train_speed(iter/s)": 0.123582 + }, + { + "acc": 0.75570307, + "epoch": 1.2454438816241102, + "grad_norm": 7.59375, + "learning_rate": 3.4121492213450626e-06, + "loss": 0.96940708, + "memory(GiB)": 302.58, + "step": 222700, + "train_speed(iter/s)": 0.123587 + }, + { + "acc": 0.75568686, + "epoch": 1.2455557310970895, + "grad_norm": 12.625, + "learning_rate": 3.411272411095124e-06, + "loss": 0.97886114, + "memory(GiB)": 302.58, + "step": 222720, + "train_speed(iter/s)": 0.123592 + }, + { + "acc": 0.75393553, + "epoch": 1.2456675805700688, + "grad_norm": 8.125, + "learning_rate": 3.4103956551847074e-06, + "loss": 0.97773094, + "memory(GiB)": 302.58, + "step": 222740, + "train_speed(iter/s)": 0.123598 + }, + { + "acc": 0.73616657, + "epoch": 1.245779430043048, + "grad_norm": 5.71875, + "learning_rate": 3.409518953643799e-06, + "loss": 1.05652161, + "memory(GiB)": 302.58, + "step": 222760, + "train_speed(iter/s)": 0.123603 + }, + { + "acc": 0.74901404, + "epoch": 1.2458912795160273, + "grad_norm": 8.0625, + "learning_rate": 3.4086423065023865e-06, + "loss": 0.99610338, + "memory(GiB)": 302.58, + "step": 222780, + "train_speed(iter/s)": 0.123608 + }, + { + "acc": 0.75929794, + "epoch": 1.2460031289890066, + "grad_norm": 7.21875, + "learning_rate": 3.407765713790452e-06, + "loss": 0.93178844, + "memory(GiB)": 302.58, + "step": 222800, + "train_speed(iter/s)": 0.123613 + }, + { + "acc": 0.75520611, + "epoch": 1.2461149784619858, + "grad_norm": 4.375, + "learning_rate": 3.4068891755379796e-06, + "loss": 0.96598263, + "memory(GiB)": 302.58, + "step": 222820, + "train_speed(iter/s)": 0.123618 + }, + { + "acc": 0.74064379, + "epoch": 1.246226827934965, + "grad_norm": 5.78125, + "learning_rate": 3.4060126917749492e-06, + "loss": 1.02645855, + "memory(GiB)": 302.58, + "step": 222840, + "train_speed(iter/s)": 0.123623 + }, + { + "acc": 0.73413072, + "epoch": 1.2463386774079444, + "grad_norm": 7.25, + "learning_rate": 3.405136262531339e-06, + "loss": 1.06493254, + "memory(GiB)": 302.58, + "step": 222860, + "train_speed(iter/s)": 0.123628 + }, + { + "acc": 0.75203471, + "epoch": 1.2464505268809236, + "grad_norm": 7.28125, + "learning_rate": 3.4042598878371257e-06, + "loss": 0.98512573, + "memory(GiB)": 302.58, + "step": 222880, + "train_speed(iter/s)": 0.123634 + }, + { + "acc": 0.74524536, + "epoch": 1.2465623763539029, + "grad_norm": 8.8125, + "learning_rate": 3.403383567722285e-06, + "loss": 1.00229015, + "memory(GiB)": 302.58, + "step": 222900, + "train_speed(iter/s)": 0.123639 + }, + { + "acc": 0.74314122, + "epoch": 1.2466742258268821, + "grad_norm": 5.4375, + "learning_rate": 3.402507302216789e-06, + "loss": 1.02198982, + "memory(GiB)": 302.58, + "step": 222920, + "train_speed(iter/s)": 0.123645 + }, + { + "acc": 0.74241843, + "epoch": 1.2467860752998614, + "grad_norm": 7.125, + "learning_rate": 3.401631091350607e-06, + "loss": 1.00185976, + "memory(GiB)": 302.58, + "step": 222940, + "train_speed(iter/s)": 0.12365 + }, + { + "acc": 0.74341764, + "epoch": 1.2468979247728407, + "grad_norm": 4.375, + "learning_rate": 3.4007549351537105e-06, + "loss": 1.01732464, + "memory(GiB)": 302.58, + "step": 222960, + "train_speed(iter/s)": 0.123655 + }, + { + "acc": 0.76139321, + "epoch": 1.24700977424582, + "grad_norm": 8.5625, + "learning_rate": 3.3998788336560673e-06, + "loss": 0.94563808, + "memory(GiB)": 302.58, + "step": 222980, + "train_speed(iter/s)": 0.12366 + }, + { + "acc": 0.76041198, + "epoch": 1.2471216237187992, + "grad_norm": 6.4375, + "learning_rate": 3.399002786887642e-06, + "loss": 0.94509201, + "memory(GiB)": 302.58, + "step": 223000, + "train_speed(iter/s)": 0.123665 + }, + { + "acc": 0.74599657, + "epoch": 1.2472334731917785, + "grad_norm": 7.90625, + "learning_rate": 3.398126794878397e-06, + "loss": 1.00887938, + "memory(GiB)": 302.58, + "step": 223020, + "train_speed(iter/s)": 0.123671 + }, + { + "acc": 0.76509089, + "epoch": 1.2473453226647577, + "grad_norm": 9.0, + "learning_rate": 3.397250857658296e-06, + "loss": 0.93320217, + "memory(GiB)": 302.58, + "step": 223040, + "train_speed(iter/s)": 0.123676 + }, + { + "acc": 0.74950809, + "epoch": 1.247457172137737, + "grad_norm": 6.71875, + "learning_rate": 3.396374975257297e-06, + "loss": 0.9697031, + "memory(GiB)": 302.58, + "step": 223060, + "train_speed(iter/s)": 0.12368 + }, + { + "acc": 0.75631833, + "epoch": 1.2475690216107163, + "grad_norm": 6.375, + "learning_rate": 3.3954991477053585e-06, + "loss": 0.95649471, + "memory(GiB)": 302.58, + "step": 223080, + "train_speed(iter/s)": 0.123686 + }, + { + "acc": 0.74989829, + "epoch": 1.2476808710836955, + "grad_norm": 6.96875, + "learning_rate": 3.3946233750324374e-06, + "loss": 0.96693935, + "memory(GiB)": 302.58, + "step": 223100, + "train_speed(iter/s)": 0.123691 + }, + { + "acc": 0.75429821, + "epoch": 1.2477927205566748, + "grad_norm": 8.625, + "learning_rate": 3.393747657268488e-06, + "loss": 0.96961536, + "memory(GiB)": 302.58, + "step": 223120, + "train_speed(iter/s)": 0.123696 + }, + { + "acc": 0.76141453, + "epoch": 1.247904570029654, + "grad_norm": 7.71875, + "learning_rate": 3.392871994443461e-06, + "loss": 0.92835808, + "memory(GiB)": 302.58, + "step": 223140, + "train_speed(iter/s)": 0.123702 + }, + { + "acc": 0.75014267, + "epoch": 1.2480164195026333, + "grad_norm": 7.1875, + "learning_rate": 3.3919963865873086e-06, + "loss": 0.97189522, + "memory(GiB)": 302.58, + "step": 223160, + "train_speed(iter/s)": 0.123707 + }, + { + "acc": 0.75736108, + "epoch": 1.2481282689756126, + "grad_norm": 7.21875, + "learning_rate": 3.391120833729978e-06, + "loss": 0.96502285, + "memory(GiB)": 302.58, + "step": 223180, + "train_speed(iter/s)": 0.123712 + }, + { + "acc": 0.75516238, + "epoch": 1.2482401184485918, + "grad_norm": 5.65625, + "learning_rate": 3.3902453359014175e-06, + "loss": 0.96596231, + "memory(GiB)": 302.58, + "step": 223200, + "train_speed(iter/s)": 0.123717 + }, + { + "acc": 0.74880877, + "epoch": 1.248351967921571, + "grad_norm": 6.875, + "learning_rate": 3.3893698931315693e-06, + "loss": 0.98663063, + "memory(GiB)": 302.58, + "step": 223220, + "train_speed(iter/s)": 0.123723 + }, + { + "acc": 0.74379125, + "epoch": 1.2484638173945504, + "grad_norm": 7.46875, + "learning_rate": 3.388494505450378e-06, + "loss": 1.02136831, + "memory(GiB)": 302.58, + "step": 223240, + "train_speed(iter/s)": 0.123728 + }, + { + "acc": 0.74255815, + "epoch": 1.2485756668675296, + "grad_norm": 7.28125, + "learning_rate": 3.387619172887785e-06, + "loss": 1.02530336, + "memory(GiB)": 302.58, + "step": 223260, + "train_speed(iter/s)": 0.123733 + }, + { + "acc": 0.74919391, + "epoch": 1.248687516340509, + "grad_norm": 8.4375, + "learning_rate": 3.3867438954737292e-06, + "loss": 0.98984728, + "memory(GiB)": 302.58, + "step": 223280, + "train_speed(iter/s)": 0.123737 + }, + { + "acc": 0.74268289, + "epoch": 1.2487993658134882, + "grad_norm": 6.375, + "learning_rate": 3.3858686732381486e-06, + "loss": 1.01096573, + "memory(GiB)": 302.58, + "step": 223300, + "train_speed(iter/s)": 0.123743 + }, + { + "acc": 0.74768643, + "epoch": 1.2489112152864674, + "grad_norm": 7.65625, + "learning_rate": 3.384993506210976e-06, + "loss": 1.01643476, + "memory(GiB)": 302.58, + "step": 223320, + "train_speed(iter/s)": 0.123748 + }, + { + "acc": 0.73477993, + "epoch": 1.2490230647594467, + "grad_norm": 6.9375, + "learning_rate": 3.384118394422148e-06, + "loss": 1.04591408, + "memory(GiB)": 302.58, + "step": 223340, + "train_speed(iter/s)": 0.123753 + }, + { + "acc": 0.75893559, + "epoch": 1.249134914232426, + "grad_norm": 8.875, + "learning_rate": 3.383243337901595e-06, + "loss": 0.94523554, + "memory(GiB)": 302.58, + "step": 223360, + "train_speed(iter/s)": 0.123759 + }, + { + "acc": 0.7541625, + "epoch": 1.2492467637054052, + "grad_norm": 6.5, + "learning_rate": 3.3823683366792454e-06, + "loss": 0.93477345, + "memory(GiB)": 302.58, + "step": 223380, + "train_speed(iter/s)": 0.123763 + }, + { + "acc": 0.75343356, + "epoch": 1.2493586131783845, + "grad_norm": 5.3125, + "learning_rate": 3.3814933907850277e-06, + "loss": 0.96035528, + "memory(GiB)": 302.58, + "step": 223400, + "train_speed(iter/s)": 0.123769 + }, + { + "acc": 0.75074492, + "epoch": 1.2494704626513637, + "grad_norm": 6.03125, + "learning_rate": 3.380618500248869e-06, + "loss": 0.98009367, + "memory(GiB)": 302.58, + "step": 223420, + "train_speed(iter/s)": 0.123774 + }, + { + "acc": 0.74770393, + "epoch": 1.249582312124343, + "grad_norm": 7.84375, + "learning_rate": 3.3797436651006922e-06, + "loss": 0.97658148, + "memory(GiB)": 302.58, + "step": 223440, + "train_speed(iter/s)": 0.12378 + }, + { + "acc": 0.76739049, + "epoch": 1.2496941615973223, + "grad_norm": 6.96875, + "learning_rate": 3.37886888537042e-06, + "loss": 0.89680042, + "memory(GiB)": 302.58, + "step": 223460, + "train_speed(iter/s)": 0.123785 + }, + { + "acc": 0.74689569, + "epoch": 1.2498060110703015, + "grad_norm": 8.25, + "learning_rate": 3.3779941610879724e-06, + "loss": 0.99758472, + "memory(GiB)": 302.58, + "step": 223480, + "train_speed(iter/s)": 0.12379 + }, + { + "acc": 0.74417391, + "epoch": 1.2499178605432808, + "grad_norm": 7.9375, + "learning_rate": 3.377119492283267e-06, + "loss": 1.00733452, + "memory(GiB)": 302.58, + "step": 223500, + "train_speed(iter/s)": 0.123795 + }, + { + "acc": 0.74311814, + "epoch": 1.25002971001626, + "grad_norm": 5.4375, + "learning_rate": 3.3762448789862224e-06, + "loss": 1.03879709, + "memory(GiB)": 302.58, + "step": 223520, + "train_speed(iter/s)": 0.1238 + }, + { + "acc": 0.75049477, + "epoch": 1.2501415594892393, + "grad_norm": 6.5, + "learning_rate": 3.3753703212267496e-06, + "loss": 0.9791647, + "memory(GiB)": 302.58, + "step": 223540, + "train_speed(iter/s)": 0.123805 + }, + { + "acc": 0.74767804, + "epoch": 1.2502534089622186, + "grad_norm": 7.0, + "learning_rate": 3.3744958190347654e-06, + "loss": 0.97903261, + "memory(GiB)": 302.58, + "step": 223560, + "train_speed(iter/s)": 0.12381 + }, + { + "acc": 0.76421051, + "epoch": 1.2503652584351979, + "grad_norm": 5.65625, + "learning_rate": 3.373621372440178e-06, + "loss": 0.93143787, + "memory(GiB)": 302.58, + "step": 223580, + "train_speed(iter/s)": 0.123815 + }, + { + "acc": 0.7657649, + "epoch": 1.2504771079081771, + "grad_norm": 7.25, + "learning_rate": 3.3727469814728964e-06, + "loss": 0.91104898, + "memory(GiB)": 302.58, + "step": 223600, + "train_speed(iter/s)": 0.12382 + }, + { + "acc": 0.75562749, + "epoch": 1.2505889573811564, + "grad_norm": 6.8125, + "learning_rate": 3.371872646162828e-06, + "loss": 0.96288671, + "memory(GiB)": 302.58, + "step": 223620, + "train_speed(iter/s)": 0.123825 + }, + { + "acc": 0.74856305, + "epoch": 1.2507008068541356, + "grad_norm": 5.5, + "learning_rate": 3.3709983665398783e-06, + "loss": 0.99842901, + "memory(GiB)": 302.58, + "step": 223640, + "train_speed(iter/s)": 0.12383 + }, + { + "acc": 0.74263492, + "epoch": 1.250812656327115, + "grad_norm": 7.40625, + "learning_rate": 3.3701241426339494e-06, + "loss": 0.99265261, + "memory(GiB)": 302.58, + "step": 223660, + "train_speed(iter/s)": 0.123835 + }, + { + "acc": 0.74819384, + "epoch": 1.2509245058000942, + "grad_norm": 8.25, + "learning_rate": 3.369249974474942e-06, + "loss": 0.97193575, + "memory(GiB)": 302.58, + "step": 223680, + "train_speed(iter/s)": 0.123841 + }, + { + "acc": 0.75150099, + "epoch": 1.2510363552730734, + "grad_norm": 9.0625, + "learning_rate": 3.3683758620927574e-06, + "loss": 0.99463549, + "memory(GiB)": 302.58, + "step": 223700, + "train_speed(iter/s)": 0.123846 + }, + { + "acc": 0.76351166, + "epoch": 1.2511482047460527, + "grad_norm": 6.15625, + "learning_rate": 3.3675018055172924e-06, + "loss": 0.93012953, + "memory(GiB)": 302.58, + "step": 223720, + "train_speed(iter/s)": 0.123851 + }, + { + "acc": 0.72136631, + "epoch": 1.251260054219032, + "grad_norm": 6.15625, + "learning_rate": 3.3666278047784425e-06, + "loss": 1.09555082, + "memory(GiB)": 302.58, + "step": 223740, + "train_speed(iter/s)": 0.123856 + }, + { + "acc": 0.75592713, + "epoch": 1.2513719036920112, + "grad_norm": 6.5625, + "learning_rate": 3.3657538599061e-06, + "loss": 0.95390968, + "memory(GiB)": 302.58, + "step": 223760, + "train_speed(iter/s)": 0.123861 + }, + { + "acc": 0.76187382, + "epoch": 1.2514837531649905, + "grad_norm": 7.4375, + "learning_rate": 3.3648799709301584e-06, + "loss": 0.93386126, + "memory(GiB)": 302.58, + "step": 223780, + "train_speed(iter/s)": 0.123866 + }, + { + "acc": 0.75738435, + "epoch": 1.2515956026379698, + "grad_norm": 9.0625, + "learning_rate": 3.364006137880506e-06, + "loss": 0.94136648, + "memory(GiB)": 302.58, + "step": 223800, + "train_speed(iter/s)": 0.123871 + }, + { + "acc": 0.75218415, + "epoch": 1.251707452110949, + "grad_norm": 6.46875, + "learning_rate": 3.3631323607870305e-06, + "loss": 0.98320541, + "memory(GiB)": 302.58, + "step": 223820, + "train_speed(iter/s)": 0.123876 + }, + { + "acc": 0.74449015, + "epoch": 1.2518193015839283, + "grad_norm": 5.25, + "learning_rate": 3.36225863967962e-06, + "loss": 0.99397125, + "memory(GiB)": 302.58, + "step": 223840, + "train_speed(iter/s)": 0.123882 + }, + { + "acc": 0.74666634, + "epoch": 1.2519311510569076, + "grad_norm": 7.875, + "learning_rate": 3.3613849745881564e-06, + "loss": 0.9823988, + "memory(GiB)": 302.58, + "step": 223860, + "train_speed(iter/s)": 0.123887 + }, + { + "acc": 0.74898267, + "epoch": 1.2520430005298868, + "grad_norm": 6.875, + "learning_rate": 3.3605113655425232e-06, + "loss": 1.00644579, + "memory(GiB)": 302.58, + "step": 223880, + "train_speed(iter/s)": 0.123892 + }, + { + "acc": 0.75476604, + "epoch": 1.252154850002866, + "grad_norm": 8.125, + "learning_rate": 3.3596378125725997e-06, + "loss": 0.94411783, + "memory(GiB)": 302.58, + "step": 223900, + "train_speed(iter/s)": 0.123898 + }, + { + "acc": 0.74440074, + "epoch": 1.2522666994758453, + "grad_norm": 5.21875, + "learning_rate": 3.358764315708265e-06, + "loss": 1.00746069, + "memory(GiB)": 302.58, + "step": 223920, + "train_speed(iter/s)": 0.123903 + }, + { + "acc": 0.73980904, + "epoch": 1.2523785489488246, + "grad_norm": 7.625, + "learning_rate": 3.3578908749793935e-06, + "loss": 1.02555132, + "memory(GiB)": 302.58, + "step": 223940, + "train_speed(iter/s)": 0.123909 + }, + { + "acc": 0.75967588, + "epoch": 1.2524903984218039, + "grad_norm": 8.0625, + "learning_rate": 3.3570174904158613e-06, + "loss": 0.91742859, + "memory(GiB)": 302.58, + "step": 223960, + "train_speed(iter/s)": 0.123914 + }, + { + "acc": 0.76117444, + "epoch": 1.2526022478947831, + "grad_norm": 6.3125, + "learning_rate": 3.3561441620475417e-06, + "loss": 0.94679985, + "memory(GiB)": 302.58, + "step": 223980, + "train_speed(iter/s)": 0.12392 + }, + { + "acc": 0.74130425, + "epoch": 1.2527140973677624, + "grad_norm": 6.6875, + "learning_rate": 3.355270889904304e-06, + "loss": 1.03673019, + "memory(GiB)": 302.58, + "step": 224000, + "train_speed(iter/s)": 0.123925 + }, + { + "epoch": 1.2527140973677624, + "eval_acc": 0.706748404682051, + "eval_loss": 1.0125397443771362, + "eval_runtime": 7499.8019, + "eval_samples_per_second": 10.038, + "eval_steps_per_second": 10.038, + "step": 224000 + }, + { + "acc": 0.7529717, + "epoch": 1.2528259468407417, + "grad_norm": 6.21875, + "learning_rate": 3.354397674016017e-06, + "loss": 0.98456068, + "memory(GiB)": 302.58, + "step": 224020, + "train_speed(iter/s)": 0.123409 + }, + { + "acc": 0.74189906, + "epoch": 1.252937796313721, + "grad_norm": 6.5625, + "learning_rate": 3.3535245144125478e-06, + "loss": 1.01264505, + "memory(GiB)": 302.58, + "step": 224040, + "train_speed(iter/s)": 0.123414 + }, + { + "acc": 0.75804071, + "epoch": 1.2530496457867002, + "grad_norm": 5.90625, + "learning_rate": 3.3526514111237605e-06, + "loss": 0.93819399, + "memory(GiB)": 302.58, + "step": 224060, + "train_speed(iter/s)": 0.123419 + }, + { + "acc": 0.7548605, + "epoch": 1.2531614952596795, + "grad_norm": 6.84375, + "learning_rate": 3.3517783641795186e-06, + "loss": 0.97700319, + "memory(GiB)": 302.58, + "step": 224080, + "train_speed(iter/s)": 0.123424 + }, + { + "acc": 0.75568581, + "epoch": 1.2532733447326587, + "grad_norm": 8.0625, + "learning_rate": 3.350905373609683e-06, + "loss": 0.9364934, + "memory(GiB)": 302.58, + "step": 224100, + "train_speed(iter/s)": 0.12343 + }, + { + "acc": 0.74807487, + "epoch": 1.253385194205638, + "grad_norm": 6.1875, + "learning_rate": 3.3500324394441135e-06, + "loss": 0.99302769, + "memory(GiB)": 302.58, + "step": 224120, + "train_speed(iter/s)": 0.123435 + }, + { + "acc": 0.75639415, + "epoch": 1.2534970436786173, + "grad_norm": 6.6875, + "learning_rate": 3.3491595617126658e-06, + "loss": 0.94411221, + "memory(GiB)": 302.58, + "step": 224140, + "train_speed(iter/s)": 0.12344 + }, + { + "acc": 0.74586782, + "epoch": 1.2536088931515965, + "grad_norm": 8.75, + "learning_rate": 3.3482867404451967e-06, + "loss": 0.98860245, + "memory(GiB)": 302.58, + "step": 224160, + "train_speed(iter/s)": 0.123445 + }, + { + "acc": 0.76231141, + "epoch": 1.2537207426245758, + "grad_norm": 6.78125, + "learning_rate": 3.347413975671558e-06, + "loss": 0.92303028, + "memory(GiB)": 302.58, + "step": 224180, + "train_speed(iter/s)": 0.12345 + }, + { + "acc": 0.74782166, + "epoch": 1.253832592097555, + "grad_norm": 6.40625, + "learning_rate": 3.3465412674216013e-06, + "loss": 0.99461832, + "memory(GiB)": 302.58, + "step": 224200, + "train_speed(iter/s)": 0.123455 + }, + { + "acc": 0.7466918, + "epoch": 1.2539444415705343, + "grad_norm": 6.0, + "learning_rate": 3.3456686157251756e-06, + "loss": 0.98107281, + "memory(GiB)": 302.58, + "step": 224220, + "train_speed(iter/s)": 0.123461 + }, + { + "acc": 0.74380579, + "epoch": 1.2540562910435136, + "grad_norm": 11.0, + "learning_rate": 3.34479602061213e-06, + "loss": 0.99286518, + "memory(GiB)": 302.58, + "step": 224240, + "train_speed(iter/s)": 0.123466 + }, + { + "acc": 0.74709315, + "epoch": 1.2541681405164928, + "grad_norm": 8.3125, + "learning_rate": 3.3439234821123082e-06, + "loss": 1.01561432, + "memory(GiB)": 302.58, + "step": 224260, + "train_speed(iter/s)": 0.123471 + }, + { + "acc": 0.74652071, + "epoch": 1.254279989989472, + "grad_norm": 9.0625, + "learning_rate": 3.3430510002555557e-06, + "loss": 0.98886538, + "memory(GiB)": 302.58, + "step": 224280, + "train_speed(iter/s)": 0.123476 + }, + { + "acc": 0.71549897, + "epoch": 1.2543918394624514, + "grad_norm": 5.6875, + "learning_rate": 3.342178575071712e-06, + "loss": 1.14001808, + "memory(GiB)": 302.58, + "step": 224300, + "train_speed(iter/s)": 0.123481 + }, + { + "acc": 0.76051559, + "epoch": 1.2545036889354306, + "grad_norm": 10.3125, + "learning_rate": 3.3413062065906187e-06, + "loss": 0.93105688, + "memory(GiB)": 302.58, + "step": 224320, + "train_speed(iter/s)": 0.123487 + }, + { + "acc": 0.73613529, + "epoch": 1.25461553840841, + "grad_norm": 6.625, + "learning_rate": 3.3404338948421123e-06, + "loss": 1.04239979, + "memory(GiB)": 302.58, + "step": 224340, + "train_speed(iter/s)": 0.123492 + }, + { + "acc": 0.7538414, + "epoch": 1.2547273878813892, + "grad_norm": 6.90625, + "learning_rate": 3.339561639856028e-06, + "loss": 0.98108206, + "memory(GiB)": 302.58, + "step": 224360, + "train_speed(iter/s)": 0.123497 + }, + { + "acc": 0.74616399, + "epoch": 1.2548392373543684, + "grad_norm": 5.375, + "learning_rate": 3.338689441662202e-06, + "loss": 1.00772896, + "memory(GiB)": 302.58, + "step": 224380, + "train_speed(iter/s)": 0.123502 + }, + { + "acc": 0.75821753, + "epoch": 1.2549510868273477, + "grad_norm": 6.1875, + "learning_rate": 3.3378173002904647e-06, + "loss": 0.94211674, + "memory(GiB)": 302.58, + "step": 224400, + "train_speed(iter/s)": 0.123508 + }, + { + "acc": 0.76498294, + "epoch": 1.255062936300327, + "grad_norm": 7.03125, + "learning_rate": 3.3369452157706467e-06, + "loss": 0.92787952, + "memory(GiB)": 302.58, + "step": 224420, + "train_speed(iter/s)": 0.123513 + }, + { + "acc": 0.74272084, + "epoch": 1.2551747857733062, + "grad_norm": 6.21875, + "learning_rate": 3.3360731881325758e-06, + "loss": 1.03738728, + "memory(GiB)": 302.58, + "step": 224440, + "train_speed(iter/s)": 0.123518 + }, + { + "acc": 0.75334234, + "epoch": 1.2552866352462855, + "grad_norm": 6.875, + "learning_rate": 3.3352012174060777e-06, + "loss": 0.99215288, + "memory(GiB)": 302.58, + "step": 224460, + "train_speed(iter/s)": 0.123523 + }, + { + "acc": 0.75563903, + "epoch": 1.2553984847192647, + "grad_norm": 7.4375, + "learning_rate": 3.334329303620977e-06, + "loss": 0.96826277, + "memory(GiB)": 302.58, + "step": 224480, + "train_speed(iter/s)": 0.123528 + }, + { + "acc": 0.75369992, + "epoch": 1.255510334192244, + "grad_norm": 7.125, + "learning_rate": 3.3334574468070936e-06, + "loss": 0.97947521, + "memory(GiB)": 302.58, + "step": 224500, + "train_speed(iter/s)": 0.123533 + }, + { + "acc": 0.73840342, + "epoch": 1.2556221836652233, + "grad_norm": 7.25, + "learning_rate": 3.332585646994252e-06, + "loss": 1.01363411, + "memory(GiB)": 302.58, + "step": 224520, + "train_speed(iter/s)": 0.123538 + }, + { + "acc": 0.76394057, + "epoch": 1.2557340331382025, + "grad_norm": 5.5, + "learning_rate": 3.331713904212269e-06, + "loss": 0.93151979, + "memory(GiB)": 302.58, + "step": 224540, + "train_speed(iter/s)": 0.123544 + }, + { + "acc": 0.76111078, + "epoch": 1.2558458826111818, + "grad_norm": 7.28125, + "learning_rate": 3.3308422184909585e-06, + "loss": 0.94283581, + "memory(GiB)": 302.58, + "step": 224560, + "train_speed(iter/s)": 0.123549 + }, + { + "acc": 0.73192911, + "epoch": 1.255957732084161, + "grad_norm": 7.59375, + "learning_rate": 3.3299705898601385e-06, + "loss": 1.05026178, + "memory(GiB)": 302.58, + "step": 224580, + "train_speed(iter/s)": 0.123554 + }, + { + "acc": 0.76585507, + "epoch": 1.2560695815571403, + "grad_norm": 6.9375, + "learning_rate": 3.3290990183496197e-06, + "loss": 0.91015215, + "memory(GiB)": 302.58, + "step": 224600, + "train_speed(iter/s)": 0.123559 + }, + { + "acc": 0.76022353, + "epoch": 1.2561814310301196, + "grad_norm": 8.5, + "learning_rate": 3.328227503989213e-06, + "loss": 0.95141916, + "memory(GiB)": 302.58, + "step": 224620, + "train_speed(iter/s)": 0.123565 + }, + { + "acc": 0.76975183, + "epoch": 1.2562932805030989, + "grad_norm": 7.34375, + "learning_rate": 3.327356046808727e-06, + "loss": 0.92197247, + "memory(GiB)": 302.58, + "step": 224640, + "train_speed(iter/s)": 0.12357 + }, + { + "acc": 0.76154661, + "epoch": 1.2564051299760781, + "grad_norm": 7.625, + "learning_rate": 3.3264846468379674e-06, + "loss": 0.93279133, + "memory(GiB)": 302.58, + "step": 224660, + "train_speed(iter/s)": 0.123575 + }, + { + "acc": 0.75379591, + "epoch": 1.2565169794490574, + "grad_norm": 8.75, + "learning_rate": 3.3256133041067397e-06, + "loss": 0.95830011, + "memory(GiB)": 302.58, + "step": 224680, + "train_speed(iter/s)": 0.12358 + }, + { + "acc": 0.74429741, + "epoch": 1.2566288289220366, + "grad_norm": 8.0625, + "learning_rate": 3.324742018644845e-06, + "loss": 0.99786501, + "memory(GiB)": 302.58, + "step": 224700, + "train_speed(iter/s)": 0.123585 + }, + { + "acc": 0.73962846, + "epoch": 1.256740678395016, + "grad_norm": 8.25, + "learning_rate": 3.3238707904820865e-06, + "loss": 1.01690178, + "memory(GiB)": 302.58, + "step": 224720, + "train_speed(iter/s)": 0.12359 + }, + { + "acc": 0.74671512, + "epoch": 1.2568525278679952, + "grad_norm": 7.84375, + "learning_rate": 3.322999619648263e-06, + "loss": 1.01571484, + "memory(GiB)": 302.58, + "step": 224740, + "train_speed(iter/s)": 0.123595 + }, + { + "acc": 0.7559082, + "epoch": 1.2569643773409744, + "grad_norm": 7.90625, + "learning_rate": 3.322128506173169e-06, + "loss": 0.97441845, + "memory(GiB)": 302.58, + "step": 224760, + "train_speed(iter/s)": 0.123601 + }, + { + "acc": 0.73752704, + "epoch": 1.2570762268139537, + "grad_norm": 7.4375, + "learning_rate": 3.3212574500866e-06, + "loss": 1.03069801, + "memory(GiB)": 302.58, + "step": 224780, + "train_speed(iter/s)": 0.123606 + }, + { + "acc": 0.75414801, + "epoch": 1.257188076286933, + "grad_norm": 6.53125, + "learning_rate": 3.3203864514183505e-06, + "loss": 0.96146841, + "memory(GiB)": 302.58, + "step": 224800, + "train_speed(iter/s)": 0.123611 + }, + { + "acc": 0.76712756, + "epoch": 1.2572999257599122, + "grad_norm": 7.5, + "learning_rate": 3.3195155101982085e-06, + "loss": 0.90981474, + "memory(GiB)": 302.58, + "step": 224820, + "train_speed(iter/s)": 0.123617 + }, + { + "acc": 0.76178942, + "epoch": 1.2574117752328915, + "grad_norm": 8.6875, + "learning_rate": 3.3186446264559656e-06, + "loss": 0.94029322, + "memory(GiB)": 302.58, + "step": 224840, + "train_speed(iter/s)": 0.123622 + }, + { + "acc": 0.7642056, + "epoch": 1.2575236247058708, + "grad_norm": 7.6875, + "learning_rate": 3.317773800221409e-06, + "loss": 0.90941029, + "memory(GiB)": 302.58, + "step": 224860, + "train_speed(iter/s)": 0.123627 + }, + { + "acc": 0.77530956, + "epoch": 1.25763547417885, + "grad_norm": 12.6875, + "learning_rate": 3.316903031524321e-06, + "loss": 0.87343874, + "memory(GiB)": 302.58, + "step": 224880, + "train_speed(iter/s)": 0.123632 + }, + { + "acc": 0.76199541, + "epoch": 1.2577473236518293, + "grad_norm": 7.59375, + "learning_rate": 3.316032320394488e-06, + "loss": 0.89657488, + "memory(GiB)": 302.58, + "step": 224900, + "train_speed(iter/s)": 0.123637 + }, + { + "acc": 0.76368613, + "epoch": 1.2578591731248085, + "grad_norm": 7.03125, + "learning_rate": 3.315161666861688e-06, + "loss": 0.93250132, + "memory(GiB)": 302.58, + "step": 224920, + "train_speed(iter/s)": 0.123643 + }, + { + "acc": 0.76719446, + "epoch": 1.257971022597788, + "grad_norm": 10.625, + "learning_rate": 3.314291070955702e-06, + "loss": 0.92528973, + "memory(GiB)": 302.58, + "step": 224940, + "train_speed(iter/s)": 0.123648 + }, + { + "acc": 0.75774279, + "epoch": 1.258082872070767, + "grad_norm": 9.5, + "learning_rate": 3.313420532706306e-06, + "loss": 0.9491518, + "memory(GiB)": 302.58, + "step": 224960, + "train_speed(iter/s)": 0.123653 + }, + { + "acc": 0.73724799, + "epoch": 1.2581947215437466, + "grad_norm": 7.46875, + "learning_rate": 3.3125500521432764e-06, + "loss": 1.04411469, + "memory(GiB)": 302.58, + "step": 224980, + "train_speed(iter/s)": 0.123658 + }, + { + "acc": 0.74531674, + "epoch": 1.2583065710167256, + "grad_norm": 11.6875, + "learning_rate": 3.3116796292963854e-06, + "loss": 1.00576334, + "memory(GiB)": 302.58, + "step": 225000, + "train_speed(iter/s)": 0.123663 + }, + { + "acc": 0.73417063, + "epoch": 1.258418420489705, + "grad_norm": 9.9375, + "learning_rate": 3.310809264195405e-06, + "loss": 1.05582657, + "memory(GiB)": 302.58, + "step": 225020, + "train_speed(iter/s)": 0.123669 + }, + { + "acc": 0.76261425, + "epoch": 1.2585302699626841, + "grad_norm": 5.21875, + "learning_rate": 3.309938956870104e-06, + "loss": 0.93214741, + "memory(GiB)": 302.58, + "step": 225040, + "train_speed(iter/s)": 0.123674 + }, + { + "acc": 0.753653, + "epoch": 1.2586421194356636, + "grad_norm": 6.96875, + "learning_rate": 3.3090687073502497e-06, + "loss": 0.96896992, + "memory(GiB)": 302.58, + "step": 225060, + "train_speed(iter/s)": 0.123679 + }, + { + "acc": 0.74022613, + "epoch": 1.2587539689086427, + "grad_norm": 5.8125, + "learning_rate": 3.308198515665607e-06, + "loss": 1.01659374, + "memory(GiB)": 302.58, + "step": 225080, + "train_speed(iter/s)": 0.123684 + }, + { + "acc": 0.75963044, + "epoch": 1.2588658183816221, + "grad_norm": 5.96875, + "learning_rate": 3.3073283818459394e-06, + "loss": 0.94997587, + "memory(GiB)": 302.58, + "step": 225100, + "train_speed(iter/s)": 0.12369 + }, + { + "acc": 0.73824773, + "epoch": 1.2589776678546012, + "grad_norm": 6.6875, + "learning_rate": 3.3064583059210093e-06, + "loss": 1.03581762, + "memory(GiB)": 302.58, + "step": 225120, + "train_speed(iter/s)": 0.123695 + }, + { + "acc": 0.74274006, + "epoch": 1.2590895173275807, + "grad_norm": 8.1875, + "learning_rate": 3.3055882879205746e-06, + "loss": 1.01482353, + "memory(GiB)": 302.58, + "step": 225140, + "train_speed(iter/s)": 0.1237 + }, + { + "acc": 0.75749989, + "epoch": 1.2592013668005597, + "grad_norm": 7.25, + "learning_rate": 3.304718327874394e-06, + "loss": 0.9485631, + "memory(GiB)": 302.58, + "step": 225160, + "train_speed(iter/s)": 0.123705 + }, + { + "acc": 0.75102496, + "epoch": 1.2593132162735392, + "grad_norm": 7.65625, + "learning_rate": 3.303848425812222e-06, + "loss": 0.96333275, + "memory(GiB)": 302.58, + "step": 225180, + "train_speed(iter/s)": 0.12371 + }, + { + "acc": 0.74599557, + "epoch": 1.2594250657465182, + "grad_norm": 5.6875, + "learning_rate": 3.302978581763812e-06, + "loss": 0.99162111, + "memory(GiB)": 302.58, + "step": 225200, + "train_speed(iter/s)": 0.123715 + }, + { + "acc": 0.77487583, + "epoch": 1.2595369152194977, + "grad_norm": 5.75, + "learning_rate": 3.302108795758916e-06, + "loss": 0.89223471, + "memory(GiB)": 302.58, + "step": 225220, + "train_speed(iter/s)": 0.123721 + }, + { + "acc": 0.75344357, + "epoch": 1.2596487646924768, + "grad_norm": 9.125, + "learning_rate": 3.3012390678272816e-06, + "loss": 0.98239651, + "memory(GiB)": 302.58, + "step": 225240, + "train_speed(iter/s)": 0.123726 + }, + { + "acc": 0.74915676, + "epoch": 1.2597606141654563, + "grad_norm": 7.75, + "learning_rate": 3.3003693979986582e-06, + "loss": 0.96668844, + "memory(GiB)": 302.58, + "step": 225260, + "train_speed(iter/s)": 0.123731 + }, + { + "acc": 0.76819077, + "epoch": 1.2598724636384353, + "grad_norm": 4.59375, + "learning_rate": 3.299499786302792e-06, + "loss": 0.87948694, + "memory(GiB)": 302.58, + "step": 225280, + "train_speed(iter/s)": 0.123737 + }, + { + "acc": 0.74853892, + "epoch": 1.2599843131114148, + "grad_norm": 7.46875, + "learning_rate": 3.298630232769424e-06, + "loss": 0.98781996, + "memory(GiB)": 302.58, + "step": 225300, + "train_speed(iter/s)": 0.123742 + }, + { + "acc": 0.75510063, + "epoch": 1.2600961625843938, + "grad_norm": 5.3125, + "learning_rate": 3.297760737428298e-06, + "loss": 0.95090179, + "memory(GiB)": 302.58, + "step": 225320, + "train_speed(iter/s)": 0.123747 + }, + { + "acc": 0.76416187, + "epoch": 1.2602080120573733, + "grad_norm": 8.625, + "learning_rate": 3.2968913003091517e-06, + "loss": 0.94961166, + "memory(GiB)": 302.58, + "step": 225340, + "train_speed(iter/s)": 0.123752 + }, + { + "acc": 0.74114666, + "epoch": 1.2603198615303524, + "grad_norm": 8.5625, + "learning_rate": 3.2960219214417244e-06, + "loss": 1.02620687, + "memory(GiB)": 302.58, + "step": 225360, + "train_speed(iter/s)": 0.123758 + }, + { + "acc": 0.76794744, + "epoch": 1.2604317110033318, + "grad_norm": 6.25, + "learning_rate": 3.2951526008557487e-06, + "loss": 0.88993149, + "memory(GiB)": 302.58, + "step": 225380, + "train_speed(iter/s)": 0.123763 + }, + { + "acc": 0.74204664, + "epoch": 1.2605435604763109, + "grad_norm": 6.5625, + "learning_rate": 3.2942833385809615e-06, + "loss": 1.01533365, + "memory(GiB)": 302.58, + "step": 225400, + "train_speed(iter/s)": 0.123768 + }, + { + "acc": 0.73569741, + "epoch": 1.2606554099492904, + "grad_norm": 8.4375, + "learning_rate": 3.2934141346470916e-06, + "loss": 1.0375411, + "memory(GiB)": 302.58, + "step": 225420, + "train_speed(iter/s)": 0.123773 + }, + { + "acc": 0.76006742, + "epoch": 1.2607672594222694, + "grad_norm": 5.25, + "learning_rate": 3.2925449890838714e-06, + "loss": 0.94387455, + "memory(GiB)": 302.58, + "step": 225440, + "train_speed(iter/s)": 0.123778 + }, + { + "acc": 0.74476118, + "epoch": 1.260879108895249, + "grad_norm": 6.46875, + "learning_rate": 3.2916759019210254e-06, + "loss": 1.01119976, + "memory(GiB)": 302.58, + "step": 225460, + "train_speed(iter/s)": 0.123782 + }, + { + "acc": 0.7417593, + "epoch": 1.260990958368228, + "grad_norm": 7.96875, + "learning_rate": 3.2908068731882808e-06, + "loss": 1.02251539, + "memory(GiB)": 302.58, + "step": 225480, + "train_speed(iter/s)": 0.123788 + }, + { + "acc": 0.74359212, + "epoch": 1.2611028078412074, + "grad_norm": 4.8125, + "learning_rate": 3.289937902915361e-06, + "loss": 1.02522116, + "memory(GiB)": 302.58, + "step": 225500, + "train_speed(iter/s)": 0.123792 + }, + { + "acc": 0.74364195, + "epoch": 1.2612146573141865, + "grad_norm": 7.59375, + "learning_rate": 3.289068991131986e-06, + "loss": 1.02877054, + "memory(GiB)": 302.58, + "step": 225520, + "train_speed(iter/s)": 0.123797 + }, + { + "acc": 0.74628377, + "epoch": 1.261326506787166, + "grad_norm": 10.875, + "learning_rate": 3.2882001378678785e-06, + "loss": 1.01042824, + "memory(GiB)": 302.58, + "step": 225540, + "train_speed(iter/s)": 0.123802 + }, + { + "acc": 0.75265102, + "epoch": 1.261438356260145, + "grad_norm": 9.875, + "learning_rate": 3.287331343152754e-06, + "loss": 0.95032692, + "memory(GiB)": 302.58, + "step": 225560, + "train_speed(iter/s)": 0.123807 + }, + { + "acc": 0.76007009, + "epoch": 1.2615502057331245, + "grad_norm": 6.1875, + "learning_rate": 3.2864626070163285e-06, + "loss": 0.94332943, + "memory(GiB)": 302.58, + "step": 225580, + "train_speed(iter/s)": 0.123812 + }, + { + "acc": 0.74657764, + "epoch": 1.2616620552061035, + "grad_norm": 7.25, + "learning_rate": 3.2855939294883147e-06, + "loss": 1.00200644, + "memory(GiB)": 302.58, + "step": 225600, + "train_speed(iter/s)": 0.123818 + }, + { + "acc": 0.75313544, + "epoch": 1.261773904679083, + "grad_norm": 7.28125, + "learning_rate": 3.2847253105984256e-06, + "loss": 0.96861439, + "memory(GiB)": 302.58, + "step": 225620, + "train_speed(iter/s)": 0.123823 + }, + { + "acc": 0.74861245, + "epoch": 1.261885754152062, + "grad_norm": 5.5625, + "learning_rate": 3.283856750376368e-06, + "loss": 0.9979001, + "memory(GiB)": 302.58, + "step": 225640, + "train_speed(iter/s)": 0.123828 + }, + { + "acc": 0.76311245, + "epoch": 1.2619976036250415, + "grad_norm": 6.9375, + "learning_rate": 3.282988248851853e-06, + "loss": 0.92454634, + "memory(GiB)": 302.58, + "step": 225660, + "train_speed(iter/s)": 0.123833 + }, + { + "acc": 0.75528121, + "epoch": 1.2621094530980206, + "grad_norm": 6.625, + "learning_rate": 3.282119806054583e-06, + "loss": 0.97551737, + "memory(GiB)": 302.58, + "step": 225680, + "train_speed(iter/s)": 0.123838 + }, + { + "acc": 0.74798236, + "epoch": 1.262221302571, + "grad_norm": 6.40625, + "learning_rate": 3.2812514220142645e-06, + "loss": 0.99162951, + "memory(GiB)": 302.58, + "step": 225700, + "train_speed(iter/s)": 0.123843 + }, + { + "acc": 0.73916945, + "epoch": 1.2623331520439791, + "grad_norm": 5.6875, + "learning_rate": 3.2803830967605967e-06, + "loss": 1.0269125, + "memory(GiB)": 302.58, + "step": 225720, + "train_speed(iter/s)": 0.123848 + }, + { + "acc": 0.74784908, + "epoch": 1.2624450015169586, + "grad_norm": 5.375, + "learning_rate": 3.2795148303232777e-06, + "loss": 0.99309692, + "memory(GiB)": 302.58, + "step": 225740, + "train_speed(iter/s)": 0.123853 + }, + { + "acc": 0.76175766, + "epoch": 1.2625568509899376, + "grad_norm": 7.0625, + "learning_rate": 3.278646622732009e-06, + "loss": 0.89814835, + "memory(GiB)": 302.58, + "step": 225760, + "train_speed(iter/s)": 0.123859 + }, + { + "acc": 0.74478016, + "epoch": 1.2626687004629171, + "grad_norm": 9.25, + "learning_rate": 3.2777784740164844e-06, + "loss": 0.99400606, + "memory(GiB)": 302.58, + "step": 225780, + "train_speed(iter/s)": 0.123864 + }, + { + "acc": 0.76117716, + "epoch": 1.2627805499358962, + "grad_norm": 5.15625, + "learning_rate": 3.2769103842063964e-06, + "loss": 0.93944845, + "memory(GiB)": 302.58, + "step": 225800, + "train_speed(iter/s)": 0.123869 + }, + { + "acc": 0.73912196, + "epoch": 1.2628923994088757, + "grad_norm": 7.71875, + "learning_rate": 3.276042353331438e-06, + "loss": 1.03930044, + "memory(GiB)": 302.58, + "step": 225820, + "train_speed(iter/s)": 0.123874 + }, + { + "acc": 0.75206714, + "epoch": 1.2630042488818547, + "grad_norm": 9.75, + "learning_rate": 3.2751743814212957e-06, + "loss": 0.98217812, + "memory(GiB)": 302.58, + "step": 225840, + "train_speed(iter/s)": 0.123879 + }, + { + "acc": 0.7470706, + "epoch": 1.2631160983548342, + "grad_norm": 7.65625, + "learning_rate": 3.2743064685056603e-06, + "loss": 0.9904521, + "memory(GiB)": 302.58, + "step": 225860, + "train_speed(iter/s)": 0.123884 + }, + { + "acc": 0.73693538, + "epoch": 1.2632279478278132, + "grad_norm": 6.6875, + "learning_rate": 3.2734386146142153e-06, + "loss": 1.06672993, + "memory(GiB)": 302.58, + "step": 225880, + "train_speed(iter/s)": 0.12389 + }, + { + "acc": 0.73800344, + "epoch": 1.2633397973007927, + "grad_norm": 9.25, + "learning_rate": 3.272570819776645e-06, + "loss": 1.03994169, + "memory(GiB)": 302.58, + "step": 225900, + "train_speed(iter/s)": 0.123895 + }, + { + "acc": 0.77236032, + "epoch": 1.2634516467737718, + "grad_norm": 7.78125, + "learning_rate": 3.2717030840226295e-06, + "loss": 0.90551758, + "memory(GiB)": 302.58, + "step": 225920, + "train_speed(iter/s)": 0.1239 + }, + { + "acc": 0.75048814, + "epoch": 1.2635634962467512, + "grad_norm": 5.65625, + "learning_rate": 3.2708354073818495e-06, + "loss": 0.99837723, + "memory(GiB)": 302.58, + "step": 225940, + "train_speed(iter/s)": 0.123904 + }, + { + "acc": 0.77431273, + "epoch": 1.2636753457197303, + "grad_norm": 4.96875, + "learning_rate": 3.269967789883981e-06, + "loss": 0.86803646, + "memory(GiB)": 302.58, + "step": 225960, + "train_speed(iter/s)": 0.12391 + }, + { + "acc": 0.7447578, + "epoch": 1.2637871951927098, + "grad_norm": 6.03125, + "learning_rate": 3.2691002315586994e-06, + "loss": 1.02470064, + "memory(GiB)": 302.58, + "step": 225980, + "train_speed(iter/s)": 0.123915 + }, + { + "acc": 0.74112201, + "epoch": 1.2638990446656888, + "grad_norm": 4.8125, + "learning_rate": 3.2682327324356795e-06, + "loss": 1.0454812, + "memory(GiB)": 302.58, + "step": 226000, + "train_speed(iter/s)": 0.12392 + }, + { + "epoch": 1.2638990446656888, + "eval_acc": 0.7067521019410216, + "eval_loss": 1.0125781297683716, + "eval_runtime": 7511.1539, + "eval_samples_per_second": 10.023, + "eval_steps_per_second": 10.023, + "step": 226000 + }, + { + "acc": 0.77356715, + "epoch": 1.2640108941386683, + "grad_norm": 7.0, + "learning_rate": 3.267365292544591e-06, + "loss": 0.88440762, + "memory(GiB)": 302.58, + "step": 226020, + "train_speed(iter/s)": 0.123408 + }, + { + "acc": 0.74604549, + "epoch": 1.2641227436116473, + "grad_norm": 9.1875, + "learning_rate": 3.2664979119151036e-06, + "loss": 1.0056716, + "memory(GiB)": 302.58, + "step": 226040, + "train_speed(iter/s)": 0.123413 + }, + { + "acc": 0.75357447, + "epoch": 1.2642345930846268, + "grad_norm": 5.46875, + "learning_rate": 3.265630590576885e-06, + "loss": 0.95378675, + "memory(GiB)": 302.58, + "step": 226060, + "train_speed(iter/s)": 0.123418 + }, + { + "acc": 0.74639697, + "epoch": 1.2643464425576059, + "grad_norm": 8.5, + "learning_rate": 3.264763328559599e-06, + "loss": 0.99686155, + "memory(GiB)": 302.58, + "step": 226080, + "train_speed(iter/s)": 0.123423 + }, + { + "acc": 0.75232973, + "epoch": 1.2644582920305854, + "grad_norm": 8.25, + "learning_rate": 3.263896125892909e-06, + "loss": 0.9711338, + "memory(GiB)": 302.58, + "step": 226100, + "train_speed(iter/s)": 0.123429 + }, + { + "acc": 0.74911423, + "epoch": 1.2645701415035644, + "grad_norm": 4.25, + "learning_rate": 3.2630289826064767e-06, + "loss": 0.99075432, + "memory(GiB)": 302.58, + "step": 226120, + "train_speed(iter/s)": 0.123434 + }, + { + "acc": 0.74350452, + "epoch": 1.2646819909765439, + "grad_norm": 6.59375, + "learning_rate": 3.2621618987299615e-06, + "loss": 1.00594444, + "memory(GiB)": 302.58, + "step": 226140, + "train_speed(iter/s)": 0.123439 + }, + { + "acc": 0.75713072, + "epoch": 1.264793840449523, + "grad_norm": 8.25, + "learning_rate": 3.2612948742930195e-06, + "loss": 0.93602438, + "memory(GiB)": 302.58, + "step": 226160, + "train_speed(iter/s)": 0.123444 + }, + { + "acc": 0.75400815, + "epoch": 1.2649056899225024, + "grad_norm": 7.28125, + "learning_rate": 3.2604279093253065e-06, + "loss": 0.95535278, + "memory(GiB)": 302.58, + "step": 226180, + "train_speed(iter/s)": 0.123449 + }, + { + "acc": 0.75174999, + "epoch": 1.2650175393954814, + "grad_norm": 6.5625, + "learning_rate": 3.259561003856475e-06, + "loss": 0.98765335, + "memory(GiB)": 302.58, + "step": 226200, + "train_speed(iter/s)": 0.123455 + }, + { + "acc": 0.74021025, + "epoch": 1.265129388868461, + "grad_norm": 9.75, + "learning_rate": 3.2586941579161764e-06, + "loss": 1.02265129, + "memory(GiB)": 302.58, + "step": 226220, + "train_speed(iter/s)": 0.12346 + }, + { + "acc": 0.74542966, + "epoch": 1.26524123834144, + "grad_norm": 8.4375, + "learning_rate": 3.2578273715340588e-06, + "loss": 0.99250307, + "memory(GiB)": 302.58, + "step": 226240, + "train_speed(iter/s)": 0.123466 + }, + { + "acc": 0.73395252, + "epoch": 1.2653530878144195, + "grad_norm": 6.3125, + "learning_rate": 3.2569606447397684e-06, + "loss": 1.05630341, + "memory(GiB)": 302.58, + "step": 226260, + "train_speed(iter/s)": 0.123471 + }, + { + "acc": 0.75826955, + "epoch": 1.2654649372873985, + "grad_norm": 8.5, + "learning_rate": 3.2560939775629513e-06, + "loss": 0.94527254, + "memory(GiB)": 302.58, + "step": 226280, + "train_speed(iter/s)": 0.123476 + }, + { + "acc": 0.7530087, + "epoch": 1.265576786760378, + "grad_norm": 6.0625, + "learning_rate": 3.2552273700332504e-06, + "loss": 0.98674269, + "memory(GiB)": 302.58, + "step": 226300, + "train_speed(iter/s)": 0.123481 + }, + { + "acc": 0.76209936, + "epoch": 1.265688636233357, + "grad_norm": 7.46875, + "learning_rate": 3.2543608221803063e-06, + "loss": 0.92001524, + "memory(GiB)": 302.58, + "step": 226320, + "train_speed(iter/s)": 0.123486 + }, + { + "acc": 0.74065208, + "epoch": 1.2658004857063365, + "grad_norm": 7.0, + "learning_rate": 3.253494334033757e-06, + "loss": 1.01933527, + "memory(GiB)": 302.58, + "step": 226340, + "train_speed(iter/s)": 0.123491 + }, + { + "acc": 0.75754676, + "epoch": 1.2659123351793156, + "grad_norm": 9.3125, + "learning_rate": 3.2526279056232395e-06, + "loss": 0.93842363, + "memory(GiB)": 302.58, + "step": 226360, + "train_speed(iter/s)": 0.123497 + }, + { + "acc": 0.7453743, + "epoch": 1.266024184652295, + "grad_norm": 7.1875, + "learning_rate": 3.2517615369783884e-06, + "loss": 1.0141901, + "memory(GiB)": 302.58, + "step": 226380, + "train_speed(iter/s)": 0.123501 + }, + { + "acc": 0.75528889, + "epoch": 1.266136034125274, + "grad_norm": 7.90625, + "learning_rate": 3.250895228128835e-06, + "loss": 0.96012535, + "memory(GiB)": 302.58, + "step": 226400, + "train_speed(iter/s)": 0.123507 + }, + { + "acc": 0.73494749, + "epoch": 1.2662478835982536, + "grad_norm": 5.28125, + "learning_rate": 3.2500289791042126e-06, + "loss": 1.04512768, + "memory(GiB)": 302.58, + "step": 226420, + "train_speed(iter/s)": 0.123512 + }, + { + "acc": 0.7496254, + "epoch": 1.2663597330712326, + "grad_norm": 5.3125, + "learning_rate": 3.2491627899341483e-06, + "loss": 0.9783514, + "memory(GiB)": 302.58, + "step": 226440, + "train_speed(iter/s)": 0.123517 + }, + { + "acc": 0.75636535, + "epoch": 1.266471582544212, + "grad_norm": 7.0, + "learning_rate": 3.248296660648268e-06, + "loss": 0.95903215, + "memory(GiB)": 302.58, + "step": 226460, + "train_speed(iter/s)": 0.123522 + }, + { + "acc": 0.74157572, + "epoch": 1.2665834320171911, + "grad_norm": 9.5625, + "learning_rate": 3.247430591276196e-06, + "loss": 1.02152672, + "memory(GiB)": 302.58, + "step": 226480, + "train_speed(iter/s)": 0.123528 + }, + { + "acc": 0.75462675, + "epoch": 1.2666952814901706, + "grad_norm": 7.71875, + "learning_rate": 3.2465645818475557e-06, + "loss": 0.95967455, + "memory(GiB)": 302.58, + "step": 226500, + "train_speed(iter/s)": 0.123533 + }, + { + "acc": 0.75651536, + "epoch": 1.2668071309631497, + "grad_norm": 7.09375, + "learning_rate": 3.2456986323919656e-06, + "loss": 0.94452991, + "memory(GiB)": 302.58, + "step": 226520, + "train_speed(iter/s)": 0.123538 + }, + { + "acc": 0.75808764, + "epoch": 1.2669189804361292, + "grad_norm": 7.84375, + "learning_rate": 3.244832742939046e-06, + "loss": 0.94228506, + "memory(GiB)": 302.58, + "step": 226540, + "train_speed(iter/s)": 0.123543 + }, + { + "acc": 0.74042077, + "epoch": 1.2670308299091082, + "grad_norm": 8.625, + "learning_rate": 3.2439669135184116e-06, + "loss": 1.0024437, + "memory(GiB)": 302.58, + "step": 226560, + "train_speed(iter/s)": 0.123548 + }, + { + "acc": 0.74618402, + "epoch": 1.2671426793820877, + "grad_norm": 8.5625, + "learning_rate": 3.243101144159677e-06, + "loss": 0.98217468, + "memory(GiB)": 302.58, + "step": 226580, + "train_speed(iter/s)": 0.123554 + }, + { + "acc": 0.73436084, + "epoch": 1.2672545288550667, + "grad_norm": 8.25, + "learning_rate": 3.2422354348924543e-06, + "loss": 1.04749718, + "memory(GiB)": 302.58, + "step": 226600, + "train_speed(iter/s)": 0.123559 + }, + { + "acc": 0.75275378, + "epoch": 1.2673663783280462, + "grad_norm": 4.5, + "learning_rate": 3.2413697857463537e-06, + "loss": 0.97450094, + "memory(GiB)": 302.58, + "step": 226620, + "train_speed(iter/s)": 0.123564 + }, + { + "acc": 0.76795077, + "epoch": 1.2674782278010253, + "grad_norm": 10.75, + "learning_rate": 3.2405041967509833e-06, + "loss": 0.90375757, + "memory(GiB)": 302.58, + "step": 226640, + "train_speed(iter/s)": 0.123569 + }, + { + "acc": 0.75286465, + "epoch": 1.2675900772740047, + "grad_norm": 6.6875, + "learning_rate": 3.239638667935947e-06, + "loss": 0.97603865, + "memory(GiB)": 302.58, + "step": 226660, + "train_speed(iter/s)": 0.123574 + }, + { + "acc": 0.73960037, + "epoch": 1.2677019267469838, + "grad_norm": 8.5625, + "learning_rate": 3.238773199330851e-06, + "loss": 1.01477852, + "memory(GiB)": 302.58, + "step": 226680, + "train_speed(iter/s)": 0.123579 + }, + { + "acc": 0.74064999, + "epoch": 1.2678137762199633, + "grad_norm": 6.375, + "learning_rate": 3.2379077909652966e-06, + "loss": 1.04046535, + "memory(GiB)": 302.58, + "step": 226700, + "train_speed(iter/s)": 0.123585 + }, + { + "acc": 0.74199276, + "epoch": 1.2679256256929423, + "grad_norm": 6.25, + "learning_rate": 3.2370424428688835e-06, + "loss": 1.02823076, + "memory(GiB)": 302.58, + "step": 226720, + "train_speed(iter/s)": 0.12359 + }, + { + "acc": 0.73672576, + "epoch": 1.2680374751659218, + "grad_norm": 9.6875, + "learning_rate": 3.2361771550712086e-06, + "loss": 1.02579374, + "memory(GiB)": 302.58, + "step": 226740, + "train_speed(iter/s)": 0.123595 + }, + { + "acc": 0.76457939, + "epoch": 1.2681493246389008, + "grad_norm": 7.09375, + "learning_rate": 3.2353119276018685e-06, + "loss": 0.92211456, + "memory(GiB)": 302.58, + "step": 226760, + "train_speed(iter/s)": 0.1236 + }, + { + "acc": 0.76063819, + "epoch": 1.2682611741118803, + "grad_norm": 7.03125, + "learning_rate": 3.234446760490455e-06, + "loss": 0.92345476, + "memory(GiB)": 302.58, + "step": 226780, + "train_speed(iter/s)": 0.123605 + }, + { + "acc": 0.7496892, + "epoch": 1.2683730235848594, + "grad_norm": 7.09375, + "learning_rate": 3.233581653766561e-06, + "loss": 0.93979702, + "memory(GiB)": 302.58, + "step": 226800, + "train_speed(iter/s)": 0.123611 + }, + { + "acc": 0.75414543, + "epoch": 1.2684848730578389, + "grad_norm": 8.8125, + "learning_rate": 3.232716607459776e-06, + "loss": 0.95976429, + "memory(GiB)": 302.58, + "step": 226820, + "train_speed(iter/s)": 0.123616 + }, + { + "acc": 0.74940767, + "epoch": 1.268596722530818, + "grad_norm": 8.5625, + "learning_rate": 3.2318516215996863e-06, + "loss": 1.01098833, + "memory(GiB)": 302.58, + "step": 226840, + "train_speed(iter/s)": 0.123621 + }, + { + "acc": 0.76139736, + "epoch": 1.2687085720037974, + "grad_norm": 8.3125, + "learning_rate": 3.2309866962158787e-06, + "loss": 0.9428071, + "memory(GiB)": 302.58, + "step": 226860, + "train_speed(iter/s)": 0.123626 + }, + { + "acc": 0.75242634, + "epoch": 1.2688204214767764, + "grad_norm": 7.96875, + "learning_rate": 3.2301218313379355e-06, + "loss": 0.95747843, + "memory(GiB)": 302.58, + "step": 226880, + "train_speed(iter/s)": 0.123631 + }, + { + "acc": 0.74153047, + "epoch": 1.268932270949756, + "grad_norm": 6.40625, + "learning_rate": 3.2292570269954377e-06, + "loss": 1.03598576, + "memory(GiB)": 302.58, + "step": 226900, + "train_speed(iter/s)": 0.123635 + }, + { + "acc": 0.7550993, + "epoch": 1.269044120422735, + "grad_norm": 9.0625, + "learning_rate": 3.2283922832179635e-06, + "loss": 0.96304045, + "memory(GiB)": 302.58, + "step": 226920, + "train_speed(iter/s)": 0.12364 + }, + { + "acc": 0.75132689, + "epoch": 1.2691559698957144, + "grad_norm": 7.3125, + "learning_rate": 3.2275276000350906e-06, + "loss": 0.97488194, + "memory(GiB)": 302.58, + "step": 226940, + "train_speed(iter/s)": 0.123646 + }, + { + "acc": 0.76572785, + "epoch": 1.2692678193686935, + "grad_norm": 7.96875, + "learning_rate": 3.2266629774763957e-06, + "loss": 0.91686821, + "memory(GiB)": 302.58, + "step": 226960, + "train_speed(iter/s)": 0.123651 + }, + { + "acc": 0.75539999, + "epoch": 1.269379668841673, + "grad_norm": 7.75, + "learning_rate": 3.2257984155714495e-06, + "loss": 0.95880613, + "memory(GiB)": 302.58, + "step": 226980, + "train_speed(iter/s)": 0.123656 + }, + { + "acc": 0.75395389, + "epoch": 1.269491518314652, + "grad_norm": 8.3125, + "learning_rate": 3.2249339143498215e-06, + "loss": 0.95909595, + "memory(GiB)": 302.58, + "step": 227000, + "train_speed(iter/s)": 0.123661 + }, + { + "acc": 0.75752292, + "epoch": 1.2696033677876315, + "grad_norm": 5.75, + "learning_rate": 3.2240694738410837e-06, + "loss": 0.93923874, + "memory(GiB)": 302.58, + "step": 227020, + "train_speed(iter/s)": 0.123666 + }, + { + "acc": 0.75882931, + "epoch": 1.2697152172606105, + "grad_norm": 9.0625, + "learning_rate": 3.2232050940748014e-06, + "loss": 0.93301764, + "memory(GiB)": 302.58, + "step": 227040, + "train_speed(iter/s)": 0.123671 + }, + { + "acc": 0.73926058, + "epoch": 1.26982706673359, + "grad_norm": 4.96875, + "learning_rate": 3.2223407750805395e-06, + "loss": 1.03530025, + "memory(GiB)": 302.58, + "step": 227060, + "train_speed(iter/s)": 0.123676 + }, + { + "acc": 0.74334321, + "epoch": 1.269938916206569, + "grad_norm": 6.71875, + "learning_rate": 3.2214765168878594e-06, + "loss": 1.00829811, + "memory(GiB)": 302.58, + "step": 227080, + "train_speed(iter/s)": 0.123682 + }, + { + "acc": 0.74957581, + "epoch": 1.2700507656795486, + "grad_norm": 6.03125, + "learning_rate": 3.220612319526322e-06, + "loss": 0.99636288, + "memory(GiB)": 302.58, + "step": 227100, + "train_speed(iter/s)": 0.123687 + }, + { + "acc": 0.75330429, + "epoch": 1.2701626151525276, + "grad_norm": 9.875, + "learning_rate": 3.219748183025486e-06, + "loss": 0.95894985, + "memory(GiB)": 302.58, + "step": 227120, + "train_speed(iter/s)": 0.123692 + }, + { + "acc": 0.73845448, + "epoch": 1.270274464625507, + "grad_norm": 6.75, + "learning_rate": 3.218884107414906e-06, + "loss": 1.02465439, + "memory(GiB)": 302.58, + "step": 227140, + "train_speed(iter/s)": 0.123697 + }, + { + "acc": 0.74017658, + "epoch": 1.2703863140984861, + "grad_norm": 8.4375, + "learning_rate": 3.2180200927241378e-06, + "loss": 1.04376078, + "memory(GiB)": 302.58, + "step": 227160, + "train_speed(iter/s)": 0.123703 + }, + { + "acc": 0.75298629, + "epoch": 1.2704981635714656, + "grad_norm": 5.0, + "learning_rate": 3.2171561389827334e-06, + "loss": 0.97328377, + "memory(GiB)": 302.58, + "step": 227180, + "train_speed(iter/s)": 0.123708 + }, + { + "acc": 0.75283318, + "epoch": 1.2706100130444447, + "grad_norm": 6.34375, + "learning_rate": 3.2162922462202418e-06, + "loss": 0.96874256, + "memory(GiB)": 302.58, + "step": 227200, + "train_speed(iter/s)": 0.123713 + }, + { + "acc": 0.74320884, + "epoch": 1.2707218625174241, + "grad_norm": 8.1875, + "learning_rate": 3.2154284144662117e-06, + "loss": 1.01945829, + "memory(GiB)": 302.58, + "step": 227220, + "train_speed(iter/s)": 0.123718 + }, + { + "acc": 0.75630322, + "epoch": 1.2708337119904032, + "grad_norm": 6.90625, + "learning_rate": 3.2145646437501886e-06, + "loss": 0.93225822, + "memory(GiB)": 302.58, + "step": 227240, + "train_speed(iter/s)": 0.123724 + }, + { + "acc": 0.75946178, + "epoch": 1.2709455614633827, + "grad_norm": 7.53125, + "learning_rate": 3.2137009341017156e-06, + "loss": 0.91774492, + "memory(GiB)": 302.58, + "step": 227260, + "train_speed(iter/s)": 0.123729 + }, + { + "acc": 0.74941955, + "epoch": 1.2710574109363617, + "grad_norm": 4.9375, + "learning_rate": 3.2128372855503347e-06, + "loss": 0.9690011, + "memory(GiB)": 302.58, + "step": 227280, + "train_speed(iter/s)": 0.123734 + }, + { + "acc": 0.73278422, + "epoch": 1.2711692604093412, + "grad_norm": 7.78125, + "learning_rate": 3.2119736981255866e-06, + "loss": 1.05946398, + "memory(GiB)": 302.58, + "step": 227300, + "train_speed(iter/s)": 0.123739 + }, + { + "acc": 0.74924788, + "epoch": 1.2712811098823202, + "grad_norm": 7.53125, + "learning_rate": 3.211110171857007e-06, + "loss": 1.00255728, + "memory(GiB)": 302.58, + "step": 227320, + "train_speed(iter/s)": 0.123745 + }, + { + "acc": 0.76554413, + "epoch": 1.2713929593552997, + "grad_norm": 7.8125, + "learning_rate": 3.2102467067741326e-06, + "loss": 0.9438118, + "memory(GiB)": 302.58, + "step": 227340, + "train_speed(iter/s)": 0.12375 + }, + { + "acc": 0.75132942, + "epoch": 1.2715048088282788, + "grad_norm": 5.84375, + "learning_rate": 3.209383302906496e-06, + "loss": 0.98432045, + "memory(GiB)": 302.58, + "step": 227360, + "train_speed(iter/s)": 0.123755 + }, + { + "acc": 0.74384394, + "epoch": 1.2716166583012583, + "grad_norm": 6.34375, + "learning_rate": 3.2085199602836276e-06, + "loss": 0.99707098, + "memory(GiB)": 302.58, + "step": 227380, + "train_speed(iter/s)": 0.12376 + }, + { + "acc": 0.75413761, + "epoch": 1.2717285077742373, + "grad_norm": 8.0, + "learning_rate": 3.207656678935056e-06, + "loss": 0.98309669, + "memory(GiB)": 302.58, + "step": 227400, + "train_speed(iter/s)": 0.123765 + }, + { + "acc": 0.75982361, + "epoch": 1.2718403572472168, + "grad_norm": 7.0625, + "learning_rate": 3.2067934588903114e-06, + "loss": 0.93883963, + "memory(GiB)": 302.58, + "step": 227420, + "train_speed(iter/s)": 0.12377 + }, + { + "acc": 0.74831071, + "epoch": 1.2719522067201958, + "grad_norm": 6.28125, + "learning_rate": 3.205930300178916e-06, + "loss": 0.98905468, + "memory(GiB)": 302.58, + "step": 227440, + "train_speed(iter/s)": 0.123775 + }, + { + "acc": 0.74493294, + "epoch": 1.2720640561931753, + "grad_norm": 7.84375, + "learning_rate": 3.2050672028303933e-06, + "loss": 1.00996742, + "memory(GiB)": 302.58, + "step": 227460, + "train_speed(iter/s)": 0.12378 + }, + { + "acc": 0.76074309, + "epoch": 1.2721759056661546, + "grad_norm": 8.1875, + "learning_rate": 3.204204166874264e-06, + "loss": 0.96340532, + "memory(GiB)": 302.58, + "step": 227480, + "train_speed(iter/s)": 0.123785 + }, + { + "acc": 0.75026126, + "epoch": 1.2722877551391338, + "grad_norm": 7.90625, + "learning_rate": 3.203341192340046e-06, + "loss": 0.96824942, + "memory(GiB)": 302.58, + "step": 227500, + "train_speed(iter/s)": 0.123791 + }, + { + "acc": 0.75813317, + "epoch": 1.272399604612113, + "grad_norm": 7.5625, + "learning_rate": 3.2024782792572565e-06, + "loss": 0.9438447, + "memory(GiB)": 302.58, + "step": 227520, + "train_speed(iter/s)": 0.123796 + }, + { + "acc": 0.7502635, + "epoch": 1.2725114540850924, + "grad_norm": 8.625, + "learning_rate": 3.201615427655409e-06, + "loss": 0.99389601, + "memory(GiB)": 302.58, + "step": 227540, + "train_speed(iter/s)": 0.123801 + }, + { + "acc": 0.75063434, + "epoch": 1.2726233035580716, + "grad_norm": 8.625, + "learning_rate": 3.2007526375640176e-06, + "loss": 0.98301373, + "memory(GiB)": 302.58, + "step": 227560, + "train_speed(iter/s)": 0.123806 + }, + { + "acc": 0.77004781, + "epoch": 1.272735153031051, + "grad_norm": 8.5625, + "learning_rate": 3.19988990901259e-06, + "loss": 0.89749279, + "memory(GiB)": 302.58, + "step": 227580, + "train_speed(iter/s)": 0.123811 + }, + { + "acc": 0.76082706, + "epoch": 1.2728470025040302, + "grad_norm": 5.15625, + "learning_rate": 3.199027242030637e-06, + "loss": 0.94308996, + "memory(GiB)": 302.58, + "step": 227600, + "train_speed(iter/s)": 0.123816 + }, + { + "acc": 0.75312881, + "epoch": 1.2729588519770094, + "grad_norm": 7.75, + "learning_rate": 3.198164636647662e-06, + "loss": 0.94384747, + "memory(GiB)": 302.58, + "step": 227620, + "train_speed(iter/s)": 0.123821 + }, + { + "acc": 0.73155465, + "epoch": 1.2730707014499887, + "grad_norm": 7.5, + "learning_rate": 3.1973020928931702e-06, + "loss": 1.0629961, + "memory(GiB)": 302.58, + "step": 227640, + "train_speed(iter/s)": 0.123827 + }, + { + "acc": 0.76772375, + "epoch": 1.273182550922968, + "grad_norm": 6.5, + "learning_rate": 3.196439610796663e-06, + "loss": 0.90144386, + "memory(GiB)": 302.58, + "step": 227660, + "train_speed(iter/s)": 0.123832 + }, + { + "acc": 0.75424614, + "epoch": 1.2732944003959472, + "grad_norm": 21.625, + "learning_rate": 3.195577190387639e-06, + "loss": 1.000348, + "memory(GiB)": 302.58, + "step": 227680, + "train_speed(iter/s)": 0.123837 + }, + { + "acc": 0.75406723, + "epoch": 1.2734062498689265, + "grad_norm": 6.75, + "learning_rate": 3.1947148316955976e-06, + "loss": 0.95662031, + "memory(GiB)": 302.58, + "step": 227700, + "train_speed(iter/s)": 0.123842 + }, + { + "acc": 0.75808377, + "epoch": 1.2735180993419057, + "grad_norm": 5.78125, + "learning_rate": 3.1938525347500337e-06, + "loss": 0.9457674, + "memory(GiB)": 302.58, + "step": 227720, + "train_speed(iter/s)": 0.123847 + }, + { + "acc": 0.74823437, + "epoch": 1.273629948814885, + "grad_norm": 6.96875, + "learning_rate": 3.1929902995804406e-06, + "loss": 0.98656902, + "memory(GiB)": 302.58, + "step": 227740, + "train_speed(iter/s)": 0.123852 + }, + { + "acc": 0.75529537, + "epoch": 1.2737417982878643, + "grad_norm": 8.0625, + "learning_rate": 3.1921281262163086e-06, + "loss": 0.96433887, + "memory(GiB)": 302.58, + "step": 227760, + "train_speed(iter/s)": 0.123858 + }, + { + "acc": 0.75063138, + "epoch": 1.2738536477608435, + "grad_norm": 8.3125, + "learning_rate": 3.1912660146871275e-06, + "loss": 0.98580656, + "memory(GiB)": 302.58, + "step": 227780, + "train_speed(iter/s)": 0.123863 + }, + { + "acc": 0.74727907, + "epoch": 1.2739654972338228, + "grad_norm": 8.1875, + "learning_rate": 3.190403965022384e-06, + "loss": 0.97930326, + "memory(GiB)": 302.58, + "step": 227800, + "train_speed(iter/s)": 0.123868 + }, + { + "acc": 0.74211373, + "epoch": 1.274077346706802, + "grad_norm": 8.8125, + "learning_rate": 3.189541977251562e-06, + "loss": 1.00140152, + "memory(GiB)": 302.58, + "step": 227820, + "train_speed(iter/s)": 0.123874 + }, + { + "acc": 0.74363976, + "epoch": 1.2741891961797813, + "grad_norm": 7.90625, + "learning_rate": 3.188680051404146e-06, + "loss": 0.99690952, + "memory(GiB)": 302.58, + "step": 227840, + "train_speed(iter/s)": 0.123879 + }, + { + "acc": 0.76277771, + "epoch": 1.2743010456527606, + "grad_norm": 5.125, + "learning_rate": 3.187818187509616e-06, + "loss": 0.91765194, + "memory(GiB)": 302.58, + "step": 227860, + "train_speed(iter/s)": 0.123883 + }, + { + "acc": 0.75086064, + "epoch": 1.2744128951257399, + "grad_norm": 6.5625, + "learning_rate": 3.18695638559745e-06, + "loss": 0.96692266, + "memory(GiB)": 302.58, + "step": 227880, + "train_speed(iter/s)": 0.123889 + }, + { + "acc": 0.73484163, + "epoch": 1.2745247445987191, + "grad_norm": 9.4375, + "learning_rate": 3.186094645697125e-06, + "loss": 1.06276484, + "memory(GiB)": 302.58, + "step": 227900, + "train_speed(iter/s)": 0.123894 + }, + { + "acc": 0.75565262, + "epoch": 1.2746365940716984, + "grad_norm": 8.3125, + "learning_rate": 3.1852329678381144e-06, + "loss": 0.95752926, + "memory(GiB)": 302.58, + "step": 227920, + "train_speed(iter/s)": 0.123899 + }, + { + "acc": 0.72468529, + "epoch": 1.2747484435446776, + "grad_norm": 5.3125, + "learning_rate": 3.184371352049891e-06, + "loss": 1.08875637, + "memory(GiB)": 302.58, + "step": 227940, + "train_speed(iter/s)": 0.123904 + }, + { + "acc": 0.76549029, + "epoch": 1.274860293017657, + "grad_norm": 6.4375, + "learning_rate": 3.1835097983619235e-06, + "loss": 0.91002178, + "memory(GiB)": 302.58, + "step": 227960, + "train_speed(iter/s)": 0.123909 + }, + { + "acc": 0.74354877, + "epoch": 1.2749721424906362, + "grad_norm": 6.71875, + "learning_rate": 3.1826483068036817e-06, + "loss": 1.01170368, + "memory(GiB)": 302.58, + "step": 227980, + "train_speed(iter/s)": 0.123914 + }, + { + "acc": 0.752107, + "epoch": 1.2750839919636154, + "grad_norm": 7.5, + "learning_rate": 3.1817868774046305e-06, + "loss": 0.98353138, + "memory(GiB)": 302.58, + "step": 228000, + "train_speed(iter/s)": 0.12392 + }, + { + "epoch": 1.2750839919636154, + "eval_acc": 0.7067468271848902, + "eval_loss": 1.012528657913208, + "eval_runtime": 7510.35, + "eval_samples_per_second": 10.024, + "eval_steps_per_second": 10.024, + "step": 228000 + }, + { + "acc": 0.78317337, + "epoch": 1.2751958414365947, + "grad_norm": 9.25, + "learning_rate": 3.180925510194234e-06, + "loss": 0.85644722, + "memory(GiB)": 302.58, + "step": 228020, + "train_speed(iter/s)": 0.123412 + }, + { + "acc": 0.75310903, + "epoch": 1.275307690909574, + "grad_norm": 5.40625, + "learning_rate": 3.1800642052019525e-06, + "loss": 0.98335152, + "memory(GiB)": 302.58, + "step": 228040, + "train_speed(iter/s)": 0.123417 + }, + { + "acc": 0.75591168, + "epoch": 1.2754195403825532, + "grad_norm": 10.375, + "learning_rate": 3.179202962457246e-06, + "loss": 0.94630814, + "memory(GiB)": 302.58, + "step": 228060, + "train_speed(iter/s)": 0.123422 + }, + { + "acc": 0.7493578, + "epoch": 1.2755313898555325, + "grad_norm": 7.21875, + "learning_rate": 3.1783417819895716e-06, + "loss": 0.98011513, + "memory(GiB)": 302.58, + "step": 228080, + "train_speed(iter/s)": 0.123427 + }, + { + "acc": 0.74735818, + "epoch": 1.2756432393285118, + "grad_norm": 9.0, + "learning_rate": 3.177480663828385e-06, + "loss": 0.9854537, + "memory(GiB)": 302.58, + "step": 228100, + "train_speed(iter/s)": 0.123432 + }, + { + "acc": 0.75985818, + "epoch": 1.275755088801491, + "grad_norm": 7.71875, + "learning_rate": 3.1766196080031384e-06, + "loss": 0.92708998, + "memory(GiB)": 302.58, + "step": 228120, + "train_speed(iter/s)": 0.123437 + }, + { + "acc": 0.75685077, + "epoch": 1.2758669382744703, + "grad_norm": 8.125, + "learning_rate": 3.175758614543284e-06, + "loss": 0.93858194, + "memory(GiB)": 302.58, + "step": 228140, + "train_speed(iter/s)": 0.123442 + }, + { + "acc": 0.75566835, + "epoch": 1.2759787877474495, + "grad_norm": 6.125, + "learning_rate": 3.174897683478269e-06, + "loss": 0.96241484, + "memory(GiB)": 302.58, + "step": 228160, + "train_speed(iter/s)": 0.123447 + }, + { + "acc": 0.75928588, + "epoch": 1.2760906372204288, + "grad_norm": 5.84375, + "learning_rate": 3.1740368148375406e-06, + "loss": 0.94077492, + "memory(GiB)": 302.58, + "step": 228180, + "train_speed(iter/s)": 0.123453 + }, + { + "acc": 0.75000658, + "epoch": 1.276202486693408, + "grad_norm": 9.3125, + "learning_rate": 3.1731760086505427e-06, + "loss": 0.99301243, + "memory(GiB)": 302.58, + "step": 228200, + "train_speed(iter/s)": 0.123458 + }, + { + "acc": 0.74703178, + "epoch": 1.2763143361663873, + "grad_norm": 6.96875, + "learning_rate": 3.172315264946717e-06, + "loss": 1.02161703, + "memory(GiB)": 302.58, + "step": 228220, + "train_speed(iter/s)": 0.123463 + }, + { + "acc": 0.75460815, + "epoch": 1.2764261856393666, + "grad_norm": 7.90625, + "learning_rate": 3.171454583755506e-06, + "loss": 0.97072992, + "memory(GiB)": 302.58, + "step": 228240, + "train_speed(iter/s)": 0.123468 + }, + { + "acc": 0.76551785, + "epoch": 1.2765380351123459, + "grad_norm": 8.875, + "learning_rate": 3.1705939651063456e-06, + "loss": 0.92584743, + "memory(GiB)": 302.58, + "step": 228260, + "train_speed(iter/s)": 0.123474 + }, + { + "acc": 0.72949238, + "epoch": 1.2766498845853251, + "grad_norm": 5.15625, + "learning_rate": 3.169733409028672e-06, + "loss": 1.08472109, + "memory(GiB)": 302.58, + "step": 228280, + "train_speed(iter/s)": 0.123479 + }, + { + "acc": 0.73205762, + "epoch": 1.2767617340583044, + "grad_norm": 7.96875, + "learning_rate": 3.1688729155519203e-06, + "loss": 1.03676386, + "memory(GiB)": 302.58, + "step": 228300, + "train_speed(iter/s)": 0.123484 + }, + { + "acc": 0.76487522, + "epoch": 1.2768735835312837, + "grad_norm": 6.15625, + "learning_rate": 3.1680124847055217e-06, + "loss": 0.90930748, + "memory(GiB)": 302.58, + "step": 228320, + "train_speed(iter/s)": 0.123488 + }, + { + "acc": 0.76348748, + "epoch": 1.276985433004263, + "grad_norm": 6.46875, + "learning_rate": 3.167152116518905e-06, + "loss": 0.92874203, + "memory(GiB)": 302.58, + "step": 228340, + "train_speed(iter/s)": 0.123494 + }, + { + "acc": 0.75446301, + "epoch": 1.2770972824772422, + "grad_norm": 7.25, + "learning_rate": 3.166291811021498e-06, + "loss": 0.96194725, + "memory(GiB)": 302.58, + "step": 228360, + "train_speed(iter/s)": 0.123499 + }, + { + "acc": 0.73372102, + "epoch": 1.2772091319502215, + "grad_norm": 6.28125, + "learning_rate": 3.165431568242725e-06, + "loss": 1.05322361, + "memory(GiB)": 302.58, + "step": 228380, + "train_speed(iter/s)": 0.123504 + }, + { + "acc": 0.75289159, + "epoch": 1.2773209814232007, + "grad_norm": 9.5, + "learning_rate": 3.16457138821201e-06, + "loss": 0.98114595, + "memory(GiB)": 302.58, + "step": 228400, + "train_speed(iter/s)": 0.123509 + }, + { + "acc": 0.75379896, + "epoch": 1.27743283089618, + "grad_norm": 6.3125, + "learning_rate": 3.1637112709587726e-06, + "loss": 0.9810668, + "memory(GiB)": 302.58, + "step": 228420, + "train_speed(iter/s)": 0.123515 + }, + { + "acc": 0.75775852, + "epoch": 1.2775446803691592, + "grad_norm": 8.5, + "learning_rate": 3.1628512165124336e-06, + "loss": 0.94076643, + "memory(GiB)": 302.58, + "step": 228440, + "train_speed(iter/s)": 0.12352 + }, + { + "acc": 0.74059129, + "epoch": 1.2776565298421385, + "grad_norm": 4.96875, + "learning_rate": 3.1619912249024076e-06, + "loss": 1.02789679, + "memory(GiB)": 302.58, + "step": 228460, + "train_speed(iter/s)": 0.123525 + }, + { + "acc": 0.74349289, + "epoch": 1.2777683793151178, + "grad_norm": 7.53125, + "learning_rate": 3.1611312961581107e-06, + "loss": 1.01359282, + "memory(GiB)": 302.58, + "step": 228480, + "train_speed(iter/s)": 0.123531 + }, + { + "acc": 0.75644608, + "epoch": 1.277880228788097, + "grad_norm": 7.34375, + "learning_rate": 3.1602714303089544e-06, + "loss": 0.95147562, + "memory(GiB)": 302.58, + "step": 228500, + "train_speed(iter/s)": 0.123536 + }, + { + "acc": 0.74504166, + "epoch": 1.2779920782610763, + "grad_norm": 8.1875, + "learning_rate": 3.1594116273843485e-06, + "loss": 1.00507755, + "memory(GiB)": 302.58, + "step": 228520, + "train_speed(iter/s)": 0.123541 + }, + { + "acc": 0.76677995, + "epoch": 1.2781039277340556, + "grad_norm": 8.1875, + "learning_rate": 3.1585518874137013e-06, + "loss": 0.91602755, + "memory(GiB)": 302.58, + "step": 228540, + "train_speed(iter/s)": 0.123546 + }, + { + "acc": 0.74526753, + "epoch": 1.2782157772070348, + "grad_norm": 6.34375, + "learning_rate": 3.1576922104264173e-06, + "loss": 1.02360783, + "memory(GiB)": 302.58, + "step": 228560, + "train_speed(iter/s)": 0.123551 + }, + { + "acc": 0.73643808, + "epoch": 1.278327626680014, + "grad_norm": 5.875, + "learning_rate": 3.156832596451903e-06, + "loss": 1.04040461, + "memory(GiB)": 302.58, + "step": 228580, + "train_speed(iter/s)": 0.123556 + }, + { + "acc": 0.75470958, + "epoch": 1.2784394761529934, + "grad_norm": 8.375, + "learning_rate": 3.155973045519558e-06, + "loss": 0.93822508, + "memory(GiB)": 302.58, + "step": 228600, + "train_speed(iter/s)": 0.123561 + }, + { + "acc": 0.76195846, + "epoch": 1.2785513256259726, + "grad_norm": 5.03125, + "learning_rate": 3.155113557658783e-06, + "loss": 0.93335323, + "memory(GiB)": 302.58, + "step": 228620, + "train_speed(iter/s)": 0.123566 + }, + { + "acc": 0.75118728, + "epoch": 1.2786631750989519, + "grad_norm": 7.40625, + "learning_rate": 3.1542541328989727e-06, + "loss": 0.97601414, + "memory(GiB)": 302.58, + "step": 228640, + "train_speed(iter/s)": 0.123571 + }, + { + "acc": 0.74868197, + "epoch": 1.2787750245719312, + "grad_norm": 6.84375, + "learning_rate": 3.153394771269526e-06, + "loss": 1.00404987, + "memory(GiB)": 302.58, + "step": 228660, + "train_speed(iter/s)": 0.123577 + }, + { + "acc": 0.75064945, + "epoch": 1.2788868740449104, + "grad_norm": 8.1875, + "learning_rate": 3.1525354727998314e-06, + "loss": 0.97966089, + "memory(GiB)": 302.58, + "step": 228680, + "train_speed(iter/s)": 0.123582 + }, + { + "acc": 0.75192084, + "epoch": 1.2789987235178897, + "grad_norm": 6.5, + "learning_rate": 3.1516762375192815e-06, + "loss": 0.96934528, + "memory(GiB)": 302.58, + "step": 228700, + "train_speed(iter/s)": 0.123587 + }, + { + "acc": 0.74516029, + "epoch": 1.279110572990869, + "grad_norm": 6.71875, + "learning_rate": 3.150817065457267e-06, + "loss": 0.98999453, + "memory(GiB)": 302.58, + "step": 228720, + "train_speed(iter/s)": 0.123592 + }, + { + "acc": 0.75832181, + "epoch": 1.2792224224638482, + "grad_norm": 9.4375, + "learning_rate": 3.1499579566431715e-06, + "loss": 0.94672041, + "memory(GiB)": 302.58, + "step": 228740, + "train_speed(iter/s)": 0.123597 + }, + { + "acc": 0.73773141, + "epoch": 1.2793342719368275, + "grad_norm": 6.0, + "learning_rate": 3.14909891110638e-06, + "loss": 1.03504028, + "memory(GiB)": 302.58, + "step": 228760, + "train_speed(iter/s)": 0.123602 + }, + { + "acc": 0.75714941, + "epoch": 1.2794461214098067, + "grad_norm": 7.3125, + "learning_rate": 3.1482399288762757e-06, + "loss": 0.95671797, + "memory(GiB)": 302.58, + "step": 228780, + "train_speed(iter/s)": 0.123607 + }, + { + "acc": 0.74775052, + "epoch": 1.279557970882786, + "grad_norm": 11.0, + "learning_rate": 3.1473810099822365e-06, + "loss": 0.99285135, + "memory(GiB)": 302.58, + "step": 228800, + "train_speed(iter/s)": 0.123613 + }, + { + "acc": 0.75045118, + "epoch": 1.2796698203557653, + "grad_norm": 7.65625, + "learning_rate": 3.1465221544536416e-06, + "loss": 0.97619953, + "memory(GiB)": 302.58, + "step": 228820, + "train_speed(iter/s)": 0.123618 + }, + { + "acc": 0.76004519, + "epoch": 1.2797816698287445, + "grad_norm": 7.84375, + "learning_rate": 3.1456633623198664e-06, + "loss": 0.93247967, + "memory(GiB)": 302.58, + "step": 228840, + "train_speed(iter/s)": 0.123623 + }, + { + "acc": 0.7527482, + "epoch": 1.2798935193017238, + "grad_norm": 7.34375, + "learning_rate": 3.1448046336102844e-06, + "loss": 0.96343994, + "memory(GiB)": 302.58, + "step": 228860, + "train_speed(iter/s)": 0.123628 + }, + { + "acc": 0.75170765, + "epoch": 1.280005368774703, + "grad_norm": 7.625, + "learning_rate": 3.143945968354266e-06, + "loss": 0.98130484, + "memory(GiB)": 302.58, + "step": 228880, + "train_speed(iter/s)": 0.123633 + }, + { + "acc": 0.75592809, + "epoch": 1.2801172182476823, + "grad_norm": 9.6875, + "learning_rate": 3.1430873665811816e-06, + "loss": 0.95138874, + "memory(GiB)": 302.58, + "step": 228900, + "train_speed(iter/s)": 0.123638 + }, + { + "acc": 0.73426466, + "epoch": 1.2802290677206616, + "grad_norm": 11.3125, + "learning_rate": 3.142228828320397e-06, + "loss": 1.0558671, + "memory(GiB)": 302.58, + "step": 228920, + "train_speed(iter/s)": 0.123643 + }, + { + "acc": 0.74233351, + "epoch": 1.2803409171936408, + "grad_norm": 6.46875, + "learning_rate": 3.1413703536012774e-06, + "loss": 1.0208828, + "memory(GiB)": 302.58, + "step": 228940, + "train_speed(iter/s)": 0.123648 + }, + { + "acc": 0.74259028, + "epoch": 1.2804527666666201, + "grad_norm": 6.46875, + "learning_rate": 3.1405119424531844e-06, + "loss": 1.02413378, + "memory(GiB)": 302.58, + "step": 228960, + "train_speed(iter/s)": 0.123653 + }, + { + "acc": 0.7596447, + "epoch": 1.2805646161395994, + "grad_norm": 6.59375, + "learning_rate": 3.13965359490548e-06, + "loss": 0.95126019, + "memory(GiB)": 302.58, + "step": 228980, + "train_speed(iter/s)": 0.123658 + }, + { + "acc": 0.75191855, + "epoch": 1.2806764656125786, + "grad_norm": 5.96875, + "learning_rate": 3.1387953109875223e-06, + "loss": 0.96985149, + "memory(GiB)": 302.58, + "step": 229000, + "train_speed(iter/s)": 0.123663 + }, + { + "acc": 0.76026568, + "epoch": 1.280788315085558, + "grad_norm": 10.125, + "learning_rate": 3.137937090728666e-06, + "loss": 0.94691505, + "memory(GiB)": 302.58, + "step": 229020, + "train_speed(iter/s)": 0.123668 + }, + { + "acc": 0.75157743, + "epoch": 1.2809001645585372, + "grad_norm": 8.5625, + "learning_rate": 3.137078934158266e-06, + "loss": 0.97909584, + "memory(GiB)": 302.58, + "step": 229040, + "train_speed(iter/s)": 0.123673 + }, + { + "acc": 0.75508451, + "epoch": 1.2810120140315164, + "grad_norm": 8.1875, + "learning_rate": 3.1362208413056733e-06, + "loss": 0.9518774, + "memory(GiB)": 302.58, + "step": 229060, + "train_speed(iter/s)": 0.123679 + }, + { + "acc": 0.76513786, + "epoch": 1.2811238635044957, + "grad_norm": 6.96875, + "learning_rate": 3.1353628122002376e-06, + "loss": 0.91497755, + "memory(GiB)": 302.58, + "step": 229080, + "train_speed(iter/s)": 0.123684 + }, + { + "acc": 0.7388473, + "epoch": 1.281235712977475, + "grad_norm": 9.3125, + "learning_rate": 3.1345048468713058e-06, + "loss": 1.03851061, + "memory(GiB)": 302.58, + "step": 229100, + "train_speed(iter/s)": 0.123689 + }, + { + "acc": 0.73399434, + "epoch": 1.2813475624504542, + "grad_norm": 7.375, + "learning_rate": 3.1336469453482244e-06, + "loss": 1.05422363, + "memory(GiB)": 302.58, + "step": 229120, + "train_speed(iter/s)": 0.123693 + }, + { + "acc": 0.74339948, + "epoch": 1.2814594119234335, + "grad_norm": 7.28125, + "learning_rate": 3.1327891076603357e-06, + "loss": 1.01321001, + "memory(GiB)": 302.58, + "step": 229140, + "train_speed(iter/s)": 0.123699 + }, + { + "acc": 0.7524426, + "epoch": 1.2815712613964128, + "grad_norm": 5.53125, + "learning_rate": 3.1319313338369796e-06, + "loss": 0.95387726, + "memory(GiB)": 302.58, + "step": 229160, + "train_speed(iter/s)": 0.123703 + }, + { + "acc": 0.7326942, + "epoch": 1.281683110869392, + "grad_norm": 8.5625, + "learning_rate": 3.1310736239074955e-06, + "loss": 1.07086277, + "memory(GiB)": 302.58, + "step": 229180, + "train_speed(iter/s)": 0.123708 + }, + { + "acc": 0.75710354, + "epoch": 1.2817949603423713, + "grad_norm": 5.78125, + "learning_rate": 3.13021597790122e-06, + "loss": 0.94469175, + "memory(GiB)": 302.58, + "step": 229200, + "train_speed(iter/s)": 0.123714 + }, + { + "acc": 0.74891677, + "epoch": 1.2819068098153505, + "grad_norm": 5.375, + "learning_rate": 3.129358395847487e-06, + "loss": 0.98240423, + "memory(GiB)": 302.58, + "step": 229220, + "train_speed(iter/s)": 0.123719 + }, + { + "acc": 0.7564106, + "epoch": 1.2820186592883298, + "grad_norm": 7.96875, + "learning_rate": 3.1285008777756275e-06, + "loss": 0.95524054, + "memory(GiB)": 302.58, + "step": 229240, + "train_speed(iter/s)": 0.123724 + }, + { + "acc": 0.77145491, + "epoch": 1.282130508761309, + "grad_norm": 7.15625, + "learning_rate": 3.1276434237149733e-06, + "loss": 0.88927374, + "memory(GiB)": 302.58, + "step": 229260, + "train_speed(iter/s)": 0.123729 + }, + { + "acc": 0.76036491, + "epoch": 1.2822423582342883, + "grad_norm": 9.5625, + "learning_rate": 3.1267860336948515e-06, + "loss": 0.93964348, + "memory(GiB)": 302.58, + "step": 229280, + "train_speed(iter/s)": 0.123734 + }, + { + "acc": 0.75137525, + "epoch": 1.2823542077072676, + "grad_norm": 7.03125, + "learning_rate": 3.1259287077445865e-06, + "loss": 0.98320045, + "memory(GiB)": 302.58, + "step": 229300, + "train_speed(iter/s)": 0.123739 + }, + { + "acc": 0.75215445, + "epoch": 1.2824660571802469, + "grad_norm": 4.21875, + "learning_rate": 3.125071445893503e-06, + "loss": 0.9716382, + "memory(GiB)": 302.58, + "step": 229320, + "train_speed(iter/s)": 0.123744 + }, + { + "acc": 0.75531707, + "epoch": 1.2825779066532261, + "grad_norm": 7.125, + "learning_rate": 3.1242142481709203e-06, + "loss": 0.96407442, + "memory(GiB)": 302.58, + "step": 229340, + "train_speed(iter/s)": 0.123749 + }, + { + "acc": 0.75754485, + "epoch": 1.2826897561262054, + "grad_norm": 7.96875, + "learning_rate": 3.1233571146061585e-06, + "loss": 0.9470665, + "memory(GiB)": 302.58, + "step": 229360, + "train_speed(iter/s)": 0.123754 + }, + { + "acc": 0.75843673, + "epoch": 1.2828016055991847, + "grad_norm": 6.96875, + "learning_rate": 3.1225000452285335e-06, + "loss": 0.94968233, + "memory(GiB)": 302.58, + "step": 229380, + "train_speed(iter/s)": 0.123759 + }, + { + "acc": 0.77123876, + "epoch": 1.282913455072164, + "grad_norm": 6.46875, + "learning_rate": 3.121643040067361e-06, + "loss": 0.88410025, + "memory(GiB)": 302.58, + "step": 229400, + "train_speed(iter/s)": 0.123764 + }, + { + "acc": 0.75812244, + "epoch": 1.2830253045451432, + "grad_norm": 4.21875, + "learning_rate": 3.120786099151952e-06, + "loss": 0.96994839, + "memory(GiB)": 302.58, + "step": 229420, + "train_speed(iter/s)": 0.123769 + }, + { + "acc": 0.76606469, + "epoch": 1.2831371540181224, + "grad_norm": 5.46875, + "learning_rate": 3.1199292225116184e-06, + "loss": 0.9069418, + "memory(GiB)": 302.58, + "step": 229440, + "train_speed(iter/s)": 0.123774 + }, + { + "acc": 0.75542617, + "epoch": 1.2832490034911017, + "grad_norm": 5.03125, + "learning_rate": 3.1190724101756666e-06, + "loss": 0.95542698, + "memory(GiB)": 302.58, + "step": 229460, + "train_speed(iter/s)": 0.123779 + }, + { + "acc": 0.75481539, + "epoch": 1.283360852964081, + "grad_norm": 6.0, + "learning_rate": 3.118215662173403e-06, + "loss": 0.95026522, + "memory(GiB)": 302.58, + "step": 229480, + "train_speed(iter/s)": 0.123784 + }, + { + "acc": 0.73004298, + "epoch": 1.2834727024370602, + "grad_norm": 5.3125, + "learning_rate": 3.117358978534128e-06, + "loss": 1.07707634, + "memory(GiB)": 302.58, + "step": 229500, + "train_speed(iter/s)": 0.123789 + }, + { + "acc": 0.7502459, + "epoch": 1.2835845519100395, + "grad_norm": 8.125, + "learning_rate": 3.116502359287148e-06, + "loss": 0.99704027, + "memory(GiB)": 302.58, + "step": 229520, + "train_speed(iter/s)": 0.123794 + }, + { + "acc": 0.73433204, + "epoch": 1.2836964013830188, + "grad_norm": 8.125, + "learning_rate": 3.11564580446176e-06, + "loss": 1.07293711, + "memory(GiB)": 302.58, + "step": 229540, + "train_speed(iter/s)": 0.123799 + }, + { + "acc": 0.75205965, + "epoch": 1.283808250855998, + "grad_norm": 5.375, + "learning_rate": 3.1147893140872586e-06, + "loss": 0.95935068, + "memory(GiB)": 302.58, + "step": 229560, + "train_speed(iter/s)": 0.123804 + }, + { + "acc": 0.76089907, + "epoch": 1.2839201003289773, + "grad_norm": 5.6875, + "learning_rate": 3.113932888192942e-06, + "loss": 0.91616592, + "memory(GiB)": 302.58, + "step": 229580, + "train_speed(iter/s)": 0.123809 + }, + { + "acc": 0.73830228, + "epoch": 1.2840319498019566, + "grad_norm": 9.3125, + "learning_rate": 3.1130765268081013e-06, + "loss": 1.02576227, + "memory(GiB)": 302.58, + "step": 229600, + "train_speed(iter/s)": 0.123814 + }, + { + "acc": 0.74716105, + "epoch": 1.2841437992749358, + "grad_norm": 6.25, + "learning_rate": 3.112220229962027e-06, + "loss": 1.00361433, + "memory(GiB)": 302.58, + "step": 229620, + "train_speed(iter/s)": 0.123819 + }, + { + "acc": 0.75687299, + "epoch": 1.284255648747915, + "grad_norm": 4.875, + "learning_rate": 3.1113639976840063e-06, + "loss": 0.97413158, + "memory(GiB)": 302.58, + "step": 229640, + "train_speed(iter/s)": 0.123824 + }, + { + "acc": 0.77052221, + "epoch": 1.2843674982208944, + "grad_norm": 8.9375, + "learning_rate": 3.110507830003326e-06, + "loss": 0.9031497, + "memory(GiB)": 302.58, + "step": 229660, + "train_speed(iter/s)": 0.123829 + }, + { + "acc": 0.75138502, + "epoch": 1.2844793476938736, + "grad_norm": 8.0, + "learning_rate": 3.1096517269492686e-06, + "loss": 0.95488749, + "memory(GiB)": 302.58, + "step": 229680, + "train_speed(iter/s)": 0.123834 + }, + { + "acc": 0.7500042, + "epoch": 1.2845911971668529, + "grad_norm": 8.4375, + "learning_rate": 3.1087956885511163e-06, + "loss": 0.95751858, + "memory(GiB)": 302.58, + "step": 229700, + "train_speed(iter/s)": 0.12384 + }, + { + "acc": 0.74951959, + "epoch": 1.2847030466398321, + "grad_norm": 7.125, + "learning_rate": 3.1079397148381486e-06, + "loss": 0.96471882, + "memory(GiB)": 302.58, + "step": 229720, + "train_speed(iter/s)": 0.123844 + }, + { + "acc": 0.74672055, + "epoch": 1.2848148961128114, + "grad_norm": 6.96875, + "learning_rate": 3.1070838058396425e-06, + "loss": 0.9658987, + "memory(GiB)": 302.58, + "step": 229740, + "train_speed(iter/s)": 0.12385 + }, + { + "acc": 0.75112953, + "epoch": 1.2849267455857907, + "grad_norm": 5.09375, + "learning_rate": 3.106227961584872e-06, + "loss": 0.98481398, + "memory(GiB)": 302.58, + "step": 229760, + "train_speed(iter/s)": 0.123854 + }, + { + "acc": 0.7616354, + "epoch": 1.28503859505877, + "grad_norm": 7.625, + "learning_rate": 3.1053721821031115e-06, + "loss": 0.92913446, + "memory(GiB)": 302.58, + "step": 229780, + "train_speed(iter/s)": 0.123859 + }, + { + "acc": 0.75528293, + "epoch": 1.2851504445317492, + "grad_norm": 6.0, + "learning_rate": 3.1045164674236285e-06, + "loss": 0.97388, + "memory(GiB)": 302.58, + "step": 229800, + "train_speed(iter/s)": 0.123865 + }, + { + "acc": 0.7643002, + "epoch": 1.2852622940047285, + "grad_norm": 7.53125, + "learning_rate": 3.1036608175756932e-06, + "loss": 0.91796665, + "memory(GiB)": 302.58, + "step": 229820, + "train_speed(iter/s)": 0.12387 + }, + { + "acc": 0.75467138, + "epoch": 1.2853741434777077, + "grad_norm": 7.3125, + "learning_rate": 3.1028052325885707e-06, + "loss": 0.96722708, + "memory(GiB)": 302.58, + "step": 229840, + "train_speed(iter/s)": 0.123875 + }, + { + "acc": 0.75246124, + "epoch": 1.285485992950687, + "grad_norm": 7.71875, + "learning_rate": 3.1019497124915254e-06, + "loss": 0.9842804, + "memory(GiB)": 302.58, + "step": 229860, + "train_speed(iter/s)": 0.12388 + }, + { + "acc": 0.74730234, + "epoch": 1.2855978424236663, + "grad_norm": 7.9375, + "learning_rate": 3.101094257313819e-06, + "loss": 1.00280619, + "memory(GiB)": 302.58, + "step": 229880, + "train_speed(iter/s)": 0.123886 + }, + { + "acc": 0.75372486, + "epoch": 1.2857096918966455, + "grad_norm": 7.875, + "learning_rate": 3.10023886708471e-06, + "loss": 0.95902681, + "memory(GiB)": 302.58, + "step": 229900, + "train_speed(iter/s)": 0.123891 + }, + { + "acc": 0.75249982, + "epoch": 1.2858215413696248, + "grad_norm": 6.34375, + "learning_rate": 3.0993835418334557e-06, + "loss": 0.96625586, + "memory(GiB)": 302.58, + "step": 229920, + "train_speed(iter/s)": 0.123896 + }, + { + "acc": 0.75018492, + "epoch": 1.285933390842604, + "grad_norm": 6.65625, + "learning_rate": 3.098528281589311e-06, + "loss": 0.97901068, + "memory(GiB)": 302.58, + "step": 229940, + "train_speed(iter/s)": 0.123901 + }, + { + "acc": 0.74406738, + "epoch": 1.2860452403155833, + "grad_norm": 5.90625, + "learning_rate": 3.0976730863815286e-06, + "loss": 1.01755133, + "memory(GiB)": 302.58, + "step": 229960, + "train_speed(iter/s)": 0.123906 + }, + { + "acc": 0.74774113, + "epoch": 1.2861570897885626, + "grad_norm": 5.9375, + "learning_rate": 3.0968179562393576e-06, + "loss": 0.99511766, + "memory(GiB)": 302.58, + "step": 229980, + "train_speed(iter/s)": 0.12391 + }, + { + "acc": 0.7497138, + "epoch": 1.2862689392615418, + "grad_norm": 10.75, + "learning_rate": 3.095962891192048e-06, + "loss": 0.98021679, + "memory(GiB)": 302.58, + "step": 230000, + "train_speed(iter/s)": 0.123915 + }, + { + "epoch": 1.2862689392615418, + "eval_acc": 0.7067569330260763, + "eval_loss": 1.012474536895752, + "eval_runtime": 7516.7189, + "eval_samples_per_second": 10.015, + "eval_steps_per_second": 10.015, + "step": 230000 + }, + { + "acc": 0.73642173, + "epoch": 1.286380788734521, + "grad_norm": 6.09375, + "learning_rate": 3.095107891268846e-06, + "loss": 1.05363321, + "memory(GiB)": 302.58, + "step": 230020, + "train_speed(iter/s)": 0.123411 + }, + { + "acc": 0.75951862, + "epoch": 1.2864926382075004, + "grad_norm": 7.34375, + "learning_rate": 3.0942529564989932e-06, + "loss": 0.94044838, + "memory(GiB)": 302.58, + "step": 230040, + "train_speed(iter/s)": 0.123416 + }, + { + "acc": 0.7473454, + "epoch": 1.2866044876804796, + "grad_norm": 7.40625, + "learning_rate": 3.093398086911733e-06, + "loss": 0.97977667, + "memory(GiB)": 302.58, + "step": 230060, + "train_speed(iter/s)": 0.123422 + }, + { + "acc": 0.7384479, + "epoch": 1.286716337153459, + "grad_norm": 10.5625, + "learning_rate": 3.0925432825363037e-06, + "loss": 1.02097483, + "memory(GiB)": 302.58, + "step": 230080, + "train_speed(iter/s)": 0.123427 + }, + { + "acc": 0.75994048, + "epoch": 1.2868281866264382, + "grad_norm": 5.8125, + "learning_rate": 3.091688543401943e-06, + "loss": 0.94092007, + "memory(GiB)": 302.58, + "step": 230100, + "train_speed(iter/s)": 0.123432 + }, + { + "acc": 0.74012437, + "epoch": 1.2869400360994174, + "grad_norm": 5.15625, + "learning_rate": 3.0908338695378838e-06, + "loss": 1.03990288, + "memory(GiB)": 302.58, + "step": 230120, + "train_speed(iter/s)": 0.123437 + }, + { + "acc": 0.75321813, + "epoch": 1.2870518855723967, + "grad_norm": 6.0625, + "learning_rate": 3.0899792609733605e-06, + "loss": 0.97739611, + "memory(GiB)": 302.58, + "step": 230140, + "train_speed(iter/s)": 0.123442 + }, + { + "acc": 0.76112223, + "epoch": 1.287163735045376, + "grad_norm": 11.4375, + "learning_rate": 3.089124717737604e-06, + "loss": 0.95826073, + "memory(GiB)": 302.58, + "step": 230160, + "train_speed(iter/s)": 0.123447 + }, + { + "acc": 0.7380825, + "epoch": 1.2872755845183552, + "grad_norm": 6.59375, + "learning_rate": 3.088270239859841e-06, + "loss": 1.03747616, + "memory(GiB)": 302.58, + "step": 230180, + "train_speed(iter/s)": 0.123452 + }, + { + "acc": 0.76494327, + "epoch": 1.2873874339913345, + "grad_norm": 8.1875, + "learning_rate": 3.0874158273692973e-06, + "loss": 0.91867323, + "memory(GiB)": 302.58, + "step": 230200, + "train_speed(iter/s)": 0.123457 + }, + { + "acc": 0.75297651, + "epoch": 1.2874992834643137, + "grad_norm": 8.1875, + "learning_rate": 3.086561480295197e-06, + "loss": 0.98875732, + "memory(GiB)": 302.58, + "step": 230220, + "train_speed(iter/s)": 0.123463 + }, + { + "acc": 0.74768872, + "epoch": 1.287611132937293, + "grad_norm": 4.34375, + "learning_rate": 3.085707198666762e-06, + "loss": 0.99054174, + "memory(GiB)": 302.58, + "step": 230240, + "train_speed(iter/s)": 0.123468 + }, + { + "acc": 0.7587388, + "epoch": 1.2877229824102723, + "grad_norm": 7.1875, + "learning_rate": 3.084852982513209e-06, + "loss": 0.95388269, + "memory(GiB)": 302.58, + "step": 230260, + "train_speed(iter/s)": 0.123473 + }, + { + "acc": 0.74678707, + "epoch": 1.2878348318832515, + "grad_norm": 8.25, + "learning_rate": 3.0839988318637586e-06, + "loss": 0.99417133, + "memory(GiB)": 302.58, + "step": 230280, + "train_speed(iter/s)": 0.123478 + }, + { + "acc": 0.76338224, + "epoch": 1.2879466813562308, + "grad_norm": 9.125, + "learning_rate": 3.083144746747623e-06, + "loss": 0.94003077, + "memory(GiB)": 302.58, + "step": 230300, + "train_speed(iter/s)": 0.123483 + }, + { + "acc": 0.74326425, + "epoch": 1.28805853082921, + "grad_norm": 4.84375, + "learning_rate": 3.0822907271940163e-06, + "loss": 0.99308205, + "memory(GiB)": 302.58, + "step": 230320, + "train_speed(iter/s)": 0.123488 + }, + { + "acc": 0.74578342, + "epoch": 1.2881703803021893, + "grad_norm": 6.65625, + "learning_rate": 3.0814367732321465e-06, + "loss": 0.99158812, + "memory(GiB)": 302.58, + "step": 230340, + "train_speed(iter/s)": 0.123493 + }, + { + "acc": 0.76180172, + "epoch": 1.2882822297751686, + "grad_norm": 6.6875, + "learning_rate": 3.0805828848912235e-06, + "loss": 0.93798571, + "memory(GiB)": 302.58, + "step": 230360, + "train_speed(iter/s)": 0.123498 + }, + { + "acc": 0.76273179, + "epoch": 1.2883940792481479, + "grad_norm": 6.96875, + "learning_rate": 3.0797290622004504e-06, + "loss": 0.93280478, + "memory(GiB)": 302.58, + "step": 230380, + "train_speed(iter/s)": 0.123503 + }, + { + "acc": 0.75595217, + "epoch": 1.2885059287211271, + "grad_norm": 7.53125, + "learning_rate": 3.0788753051890336e-06, + "loss": 0.94562044, + "memory(GiB)": 302.58, + "step": 230400, + "train_speed(iter/s)": 0.123508 + }, + { + "acc": 0.7540298, + "epoch": 1.2886177781941064, + "grad_norm": 5.15625, + "learning_rate": 3.078021613886173e-06, + "loss": 0.97758198, + "memory(GiB)": 302.58, + "step": 230420, + "train_speed(iter/s)": 0.123513 + }, + { + "acc": 0.75126953, + "epoch": 1.2887296276670857, + "grad_norm": 7.34375, + "learning_rate": 3.077167988321068e-06, + "loss": 0.98334818, + "memory(GiB)": 302.58, + "step": 230440, + "train_speed(iter/s)": 0.123518 + }, + { + "acc": 0.75379138, + "epoch": 1.288841477140065, + "grad_norm": 4.25, + "learning_rate": 3.0763144285229144e-06, + "loss": 0.96888781, + "memory(GiB)": 302.58, + "step": 230460, + "train_speed(iter/s)": 0.123523 + }, + { + "acc": 0.75564671, + "epoch": 1.2889533266130442, + "grad_norm": 3.921875, + "learning_rate": 3.0754609345209075e-06, + "loss": 0.95101166, + "memory(GiB)": 302.58, + "step": 230480, + "train_speed(iter/s)": 0.123528 + }, + { + "acc": 0.75055337, + "epoch": 1.2890651760860234, + "grad_norm": 5.1875, + "learning_rate": 3.0746075063442394e-06, + "loss": 0.97357597, + "memory(GiB)": 302.58, + "step": 230500, + "train_speed(iter/s)": 0.123533 + }, + { + "acc": 0.75659308, + "epoch": 1.2891770255590027, + "grad_norm": 4.6875, + "learning_rate": 3.0737541440220985e-06, + "loss": 0.94854383, + "memory(GiB)": 302.58, + "step": 230520, + "train_speed(iter/s)": 0.123538 + }, + { + "acc": 0.74441075, + "epoch": 1.289288875031982, + "grad_norm": 8.0625, + "learning_rate": 3.072900847583675e-06, + "loss": 1.03252602, + "memory(GiB)": 302.58, + "step": 230540, + "train_speed(iter/s)": 0.123543 + }, + { + "acc": 0.75454912, + "epoch": 1.2894007245049612, + "grad_norm": 7.5, + "learning_rate": 3.0720476170581537e-06, + "loss": 0.96086302, + "memory(GiB)": 302.58, + "step": 230560, + "train_speed(iter/s)": 0.123548 + }, + { + "acc": 0.76925387, + "epoch": 1.2895125739779405, + "grad_norm": 5.4375, + "learning_rate": 3.0711944524747164e-06, + "loss": 0.89099293, + "memory(GiB)": 302.58, + "step": 230580, + "train_speed(iter/s)": 0.123553 + }, + { + "acc": 0.74697051, + "epoch": 1.2896244234509198, + "grad_norm": 6.46875, + "learning_rate": 3.0703413538625447e-06, + "loss": 1.01544867, + "memory(GiB)": 302.58, + "step": 230600, + "train_speed(iter/s)": 0.123558 + }, + { + "acc": 0.74356017, + "epoch": 1.289736272923899, + "grad_norm": 6.03125, + "learning_rate": 3.0694883212508186e-06, + "loss": 1.00505104, + "memory(GiB)": 302.58, + "step": 230620, + "train_speed(iter/s)": 0.123563 + }, + { + "acc": 0.75933919, + "epoch": 1.2898481223968783, + "grad_norm": 11.0625, + "learning_rate": 3.0686353546687125e-06, + "loss": 0.9427002, + "memory(GiB)": 302.58, + "step": 230640, + "train_speed(iter/s)": 0.123568 + }, + { + "acc": 0.74390264, + "epoch": 1.2899599718698576, + "grad_norm": 9.625, + "learning_rate": 3.067782454145401e-06, + "loss": 1.02915878, + "memory(GiB)": 302.58, + "step": 230660, + "train_speed(iter/s)": 0.123573 + }, + { + "acc": 0.73930502, + "epoch": 1.2900718213428368, + "grad_norm": 5.1875, + "learning_rate": 3.0669296197100573e-06, + "loss": 1.04035969, + "memory(GiB)": 302.58, + "step": 230680, + "train_speed(iter/s)": 0.123578 + }, + { + "acc": 0.74964986, + "epoch": 1.290183670815816, + "grad_norm": 8.125, + "learning_rate": 3.06607685139185e-06, + "loss": 0.99742956, + "memory(GiB)": 302.58, + "step": 230700, + "train_speed(iter/s)": 0.123583 + }, + { + "acc": 0.7552753, + "epoch": 1.2902955202887954, + "grad_norm": 6.96875, + "learning_rate": 3.065224149219947e-06, + "loss": 0.97742586, + "memory(GiB)": 302.58, + "step": 230720, + "train_speed(iter/s)": 0.123588 + }, + { + "acc": 0.76636133, + "epoch": 1.2904073697617746, + "grad_norm": 5.3125, + "learning_rate": 3.064371513223513e-06, + "loss": 0.91691742, + "memory(GiB)": 302.58, + "step": 230740, + "train_speed(iter/s)": 0.123592 + }, + { + "acc": 0.75491562, + "epoch": 1.2905192192347539, + "grad_norm": 7.78125, + "learning_rate": 3.0635189434317087e-06, + "loss": 0.97307358, + "memory(GiB)": 302.58, + "step": 230760, + "train_speed(iter/s)": 0.123597 + }, + { + "acc": 0.76646247, + "epoch": 1.2906310687077331, + "grad_norm": 6.1875, + "learning_rate": 3.0626664398736995e-06, + "loss": 0.90685024, + "memory(GiB)": 302.58, + "step": 230780, + "train_speed(iter/s)": 0.123602 + }, + { + "acc": 0.74959068, + "epoch": 1.2907429181807124, + "grad_norm": 8.875, + "learning_rate": 3.0618140025786402e-06, + "loss": 0.95290804, + "memory(GiB)": 302.58, + "step": 230800, + "train_speed(iter/s)": 0.123607 + }, + { + "acc": 0.74777741, + "epoch": 1.2908547676536917, + "grad_norm": 7.1875, + "learning_rate": 3.0609616315756883e-06, + "loss": 0.99731312, + "memory(GiB)": 302.58, + "step": 230820, + "train_speed(iter/s)": 0.123612 + }, + { + "acc": 0.73386455, + "epoch": 1.290966617126671, + "grad_norm": 5.71875, + "learning_rate": 3.0601093268939974e-06, + "loss": 1.05317545, + "memory(GiB)": 302.58, + "step": 230840, + "train_speed(iter/s)": 0.123617 + }, + { + "acc": 0.74150195, + "epoch": 1.2910784665996502, + "grad_norm": 6.375, + "learning_rate": 3.0592570885627177e-06, + "loss": 1.0149703, + "memory(GiB)": 302.58, + "step": 230860, + "train_speed(iter/s)": 0.123622 + }, + { + "acc": 0.74617615, + "epoch": 1.2911903160726295, + "grad_norm": 7.65625, + "learning_rate": 3.0584049166110007e-06, + "loss": 0.99185257, + "memory(GiB)": 302.58, + "step": 230880, + "train_speed(iter/s)": 0.123627 + }, + { + "acc": 0.72934361, + "epoch": 1.2913021655456087, + "grad_norm": 6.40625, + "learning_rate": 3.057552811067992e-06, + "loss": 1.08438816, + "memory(GiB)": 302.58, + "step": 230900, + "train_speed(iter/s)": 0.123632 + }, + { + "acc": 0.74573588, + "epoch": 1.291414015018588, + "grad_norm": 6.9375, + "learning_rate": 3.056700771962836e-06, + "loss": 1.01572371, + "memory(GiB)": 302.58, + "step": 230920, + "train_speed(iter/s)": 0.123637 + }, + { + "acc": 0.76736131, + "epoch": 1.2915258644915673, + "grad_norm": 7.65625, + "learning_rate": 3.055848799324676e-06, + "loss": 0.92111235, + "memory(GiB)": 302.58, + "step": 230940, + "train_speed(iter/s)": 0.123642 + }, + { + "acc": 0.73999896, + "epoch": 1.2916377139645465, + "grad_norm": 5.625, + "learning_rate": 3.0549968931826525e-06, + "loss": 1.03309469, + "memory(GiB)": 302.58, + "step": 230960, + "train_speed(iter/s)": 0.123647 + }, + { + "acc": 0.74441648, + "epoch": 1.2917495634375258, + "grad_norm": 5.0625, + "learning_rate": 3.054145053565901e-06, + "loss": 1.02606392, + "memory(GiB)": 302.58, + "step": 230980, + "train_speed(iter/s)": 0.123653 + }, + { + "acc": 0.75187469, + "epoch": 1.291861412910505, + "grad_norm": 7.15625, + "learning_rate": 3.0532932805035592e-06, + "loss": 0.97207632, + "memory(GiB)": 302.58, + "step": 231000, + "train_speed(iter/s)": 0.123658 + }, + { + "acc": 0.75816269, + "epoch": 1.2919732623834843, + "grad_norm": 8.3125, + "learning_rate": 3.0524415740247605e-06, + "loss": 0.91857309, + "memory(GiB)": 302.58, + "step": 231020, + "train_speed(iter/s)": 0.123663 + }, + { + "acc": 0.74065185, + "epoch": 1.2920851118564636, + "grad_norm": 5.375, + "learning_rate": 3.051589934158636e-06, + "loss": 1.01598864, + "memory(GiB)": 302.58, + "step": 231040, + "train_speed(iter/s)": 0.123668 + }, + { + "acc": 0.75252676, + "epoch": 1.2921969613294428, + "grad_norm": 7.34375, + "learning_rate": 3.0507383609343134e-06, + "loss": 0.96660538, + "memory(GiB)": 302.58, + "step": 231060, + "train_speed(iter/s)": 0.123673 + }, + { + "acc": 0.75971875, + "epoch": 1.292308810802422, + "grad_norm": 7.4375, + "learning_rate": 3.0498868543809202e-06, + "loss": 0.92880707, + "memory(GiB)": 302.58, + "step": 231080, + "train_speed(iter/s)": 0.123678 + }, + { + "acc": 0.73618522, + "epoch": 1.2924206602754014, + "grad_norm": 7.25, + "learning_rate": 3.0490354145275803e-06, + "loss": 1.02440767, + "memory(GiB)": 302.58, + "step": 231100, + "train_speed(iter/s)": 0.123683 + }, + { + "acc": 0.77052088, + "epoch": 1.2925325097483806, + "grad_norm": 6.625, + "learning_rate": 3.0481840414034154e-06, + "loss": 0.87287998, + "memory(GiB)": 302.58, + "step": 231120, + "train_speed(iter/s)": 0.123687 + }, + { + "acc": 0.74397078, + "epoch": 1.29264435922136, + "grad_norm": 7.375, + "learning_rate": 3.0473327350375438e-06, + "loss": 1.00630646, + "memory(GiB)": 302.58, + "step": 231140, + "train_speed(iter/s)": 0.123692 + }, + { + "acc": 0.75308876, + "epoch": 1.2927562086943392, + "grad_norm": 6.3125, + "learning_rate": 3.046481495459086e-06, + "loss": 0.96607141, + "memory(GiB)": 302.58, + "step": 231160, + "train_speed(iter/s)": 0.123697 + }, + { + "acc": 0.7503408, + "epoch": 1.2928680581673184, + "grad_norm": 5.875, + "learning_rate": 3.0456303226971558e-06, + "loss": 0.99318562, + "memory(GiB)": 302.58, + "step": 231180, + "train_speed(iter/s)": 0.123702 + }, + { + "acc": 0.74969816, + "epoch": 1.2929799076402977, + "grad_norm": 9.375, + "learning_rate": 3.0447792167808653e-06, + "loss": 0.98087978, + "memory(GiB)": 302.58, + "step": 231200, + "train_speed(iter/s)": 0.123707 + }, + { + "acc": 0.75531716, + "epoch": 1.293091757113277, + "grad_norm": 5.9375, + "learning_rate": 3.043928177739325e-06, + "loss": 0.96519489, + "memory(GiB)": 302.58, + "step": 231220, + "train_speed(iter/s)": 0.123712 + }, + { + "acc": 0.75228038, + "epoch": 1.2932036065862562, + "grad_norm": 6.34375, + "learning_rate": 3.0430772056016445e-06, + "loss": 0.96368399, + "memory(GiB)": 302.58, + "step": 231240, + "train_speed(iter/s)": 0.123717 + }, + { + "acc": 0.74546704, + "epoch": 1.2933154560592355, + "grad_norm": 8.0, + "learning_rate": 3.0422263003969264e-06, + "loss": 0.98171597, + "memory(GiB)": 302.58, + "step": 231260, + "train_speed(iter/s)": 0.123722 + }, + { + "acc": 0.75131183, + "epoch": 1.2934273055322147, + "grad_norm": 6.0625, + "learning_rate": 3.041375462154279e-06, + "loss": 0.97554579, + "memory(GiB)": 302.58, + "step": 231280, + "train_speed(iter/s)": 0.123727 + }, + { + "acc": 0.74209819, + "epoch": 1.293539155005194, + "grad_norm": 6.65625, + "learning_rate": 3.0405246909028007e-06, + "loss": 0.99775295, + "memory(GiB)": 302.58, + "step": 231300, + "train_speed(iter/s)": 0.123733 + }, + { + "acc": 0.73961267, + "epoch": 1.2936510044781733, + "grad_norm": 9.1875, + "learning_rate": 3.039673986671592e-06, + "loss": 1.03037014, + "memory(GiB)": 302.58, + "step": 231320, + "train_speed(iter/s)": 0.123738 + }, + { + "acc": 0.77095542, + "epoch": 1.2937628539511525, + "grad_norm": 6.8125, + "learning_rate": 3.038823349489749e-06, + "loss": 0.88986444, + "memory(GiB)": 302.58, + "step": 231340, + "train_speed(iter/s)": 0.123743 + }, + { + "acc": 0.76250548, + "epoch": 1.2938747034241318, + "grad_norm": 12.375, + "learning_rate": 3.0379727793863656e-06, + "loss": 0.92511835, + "memory(GiB)": 302.58, + "step": 231360, + "train_speed(iter/s)": 0.123748 + }, + { + "acc": 0.76310687, + "epoch": 1.293986552897111, + "grad_norm": 9.5625, + "learning_rate": 3.037122276390535e-06, + "loss": 0.92579041, + "memory(GiB)": 302.58, + "step": 231380, + "train_speed(iter/s)": 0.123753 + }, + { + "acc": 0.75275974, + "epoch": 1.2940984023700903, + "grad_norm": 8.0625, + "learning_rate": 3.036271840531345e-06, + "loss": 0.96980467, + "memory(GiB)": 302.58, + "step": 231400, + "train_speed(iter/s)": 0.123758 + }, + { + "acc": 0.75156832, + "epoch": 1.2942102518430696, + "grad_norm": 8.0625, + "learning_rate": 3.035421471837886e-06, + "loss": 0.95388985, + "memory(GiB)": 302.58, + "step": 231420, + "train_speed(iter/s)": 0.123764 + }, + { + "acc": 0.76813512, + "epoch": 1.2943221013160489, + "grad_norm": 8.25, + "learning_rate": 3.0345711703392426e-06, + "loss": 0.89631872, + "memory(GiB)": 302.58, + "step": 231440, + "train_speed(iter/s)": 0.123769 + }, + { + "acc": 0.73317323, + "epoch": 1.2944339507890281, + "grad_norm": 6.15625, + "learning_rate": 3.0337209360644977e-06, + "loss": 1.04925575, + "memory(GiB)": 302.58, + "step": 231460, + "train_speed(iter/s)": 0.123774 + }, + { + "acc": 0.77479258, + "epoch": 1.2945458002620074, + "grad_norm": 8.9375, + "learning_rate": 3.0328707690427307e-06, + "loss": 0.85536757, + "memory(GiB)": 302.58, + "step": 231480, + "train_speed(iter/s)": 0.123779 + }, + { + "acc": 0.75211883, + "epoch": 1.2946576497349866, + "grad_norm": 7.3125, + "learning_rate": 3.0320206693030216e-06, + "loss": 0.97714252, + "memory(GiB)": 302.58, + "step": 231500, + "train_speed(iter/s)": 0.123784 + }, + { + "acc": 0.73544493, + "epoch": 1.294769499207966, + "grad_norm": 5.90625, + "learning_rate": 3.0311706368744464e-06, + "loss": 1.0476532, + "memory(GiB)": 302.58, + "step": 231520, + "train_speed(iter/s)": 0.123789 + }, + { + "acc": 0.74281373, + "epoch": 1.2948813486809452, + "grad_norm": 8.875, + "learning_rate": 3.030320671786077e-06, + "loss": 1.02643013, + "memory(GiB)": 302.58, + "step": 231540, + "train_speed(iter/s)": 0.123794 + }, + { + "acc": 0.75690899, + "epoch": 1.2949931981539244, + "grad_norm": 6.0625, + "learning_rate": 3.0294707740669864e-06, + "loss": 0.96541815, + "memory(GiB)": 302.58, + "step": 231560, + "train_speed(iter/s)": 0.123799 + }, + { + "acc": 0.76243577, + "epoch": 1.2951050476269037, + "grad_norm": 6.3125, + "learning_rate": 3.0286209437462444e-06, + "loss": 0.92736855, + "memory(GiB)": 302.58, + "step": 231580, + "train_speed(iter/s)": 0.123804 + }, + { + "acc": 0.74209466, + "epoch": 1.295216897099883, + "grad_norm": 7.15625, + "learning_rate": 3.0277711808529174e-06, + "loss": 1.02147846, + "memory(GiB)": 302.58, + "step": 231600, + "train_speed(iter/s)": 0.12381 + }, + { + "acc": 0.76041217, + "epoch": 1.2953287465728622, + "grad_norm": 7.40625, + "learning_rate": 3.02692148541607e-06, + "loss": 0.9420001, + "memory(GiB)": 302.58, + "step": 231620, + "train_speed(iter/s)": 0.123815 + }, + { + "acc": 0.74354734, + "epoch": 1.2954405960458415, + "grad_norm": 7.5, + "learning_rate": 3.026071857464764e-06, + "loss": 1.03033361, + "memory(GiB)": 302.58, + "step": 231640, + "train_speed(iter/s)": 0.12382 + }, + { + "acc": 0.75739207, + "epoch": 1.2955524455188208, + "grad_norm": 6.96875, + "learning_rate": 3.0252222970280594e-06, + "loss": 0.96075335, + "memory(GiB)": 302.58, + "step": 231660, + "train_speed(iter/s)": 0.123824 + }, + { + "acc": 0.74357977, + "epoch": 1.2956642949918, + "grad_norm": 7.125, + "learning_rate": 3.024372804135013e-06, + "loss": 1.02348585, + "memory(GiB)": 302.58, + "step": 231680, + "train_speed(iter/s)": 0.123829 + }, + { + "acc": 0.75647078, + "epoch": 1.2957761444647793, + "grad_norm": 8.4375, + "learning_rate": 3.0235233788146825e-06, + "loss": 0.94735699, + "memory(GiB)": 302.58, + "step": 231700, + "train_speed(iter/s)": 0.123835 + }, + { + "acc": 0.74752092, + "epoch": 1.2958879939377586, + "grad_norm": 6.40625, + "learning_rate": 3.0226740210961196e-06, + "loss": 1.02025185, + "memory(GiB)": 302.58, + "step": 231720, + "train_speed(iter/s)": 0.123839 + }, + { + "acc": 0.74919748, + "epoch": 1.2959998434107378, + "grad_norm": 5.8125, + "learning_rate": 3.0218247310083747e-06, + "loss": 0.97161112, + "memory(GiB)": 302.58, + "step": 231740, + "train_speed(iter/s)": 0.123845 + }, + { + "acc": 0.74856458, + "epoch": 1.296111692883717, + "grad_norm": 9.125, + "learning_rate": 3.0209755085804976e-06, + "loss": 0.98804665, + "memory(GiB)": 302.58, + "step": 231760, + "train_speed(iter/s)": 0.12385 + }, + { + "acc": 0.75080104, + "epoch": 1.2962235423566963, + "grad_norm": 5.1875, + "learning_rate": 3.0201263538415326e-06, + "loss": 0.98466778, + "memory(GiB)": 302.58, + "step": 231780, + "train_speed(iter/s)": 0.123855 + }, + { + "acc": 0.76072855, + "epoch": 1.2963353918296756, + "grad_norm": 6.28125, + "learning_rate": 3.019277266820525e-06, + "loss": 0.92675619, + "memory(GiB)": 302.58, + "step": 231800, + "train_speed(iter/s)": 0.12386 + }, + { + "acc": 0.76437144, + "epoch": 1.2964472413026549, + "grad_norm": 7.09375, + "learning_rate": 3.0184282475465144e-06, + "loss": 0.90650826, + "memory(GiB)": 302.58, + "step": 231820, + "train_speed(iter/s)": 0.123865 + }, + { + "acc": 0.75788283, + "epoch": 1.2965590907756341, + "grad_norm": 6.09375, + "learning_rate": 3.0175792960485418e-06, + "loss": 0.93848162, + "memory(GiB)": 302.58, + "step": 231840, + "train_speed(iter/s)": 0.12387 + }, + { + "acc": 0.75392027, + "epoch": 1.2966709402486134, + "grad_norm": 9.0625, + "learning_rate": 3.016730412355643e-06, + "loss": 0.93662386, + "memory(GiB)": 302.58, + "step": 231860, + "train_speed(iter/s)": 0.123875 + }, + { + "acc": 0.75664005, + "epoch": 1.2967827897215927, + "grad_norm": 7.21875, + "learning_rate": 3.0158815964968534e-06, + "loss": 0.95918865, + "memory(GiB)": 302.58, + "step": 231880, + "train_speed(iter/s)": 0.12388 + }, + { + "acc": 0.75837207, + "epoch": 1.296894639194572, + "grad_norm": 6.84375, + "learning_rate": 3.0150328485012047e-06, + "loss": 0.94017458, + "memory(GiB)": 302.58, + "step": 231900, + "train_speed(iter/s)": 0.123886 + }, + { + "acc": 0.74898257, + "epoch": 1.2970064886675512, + "grad_norm": 7.40625, + "learning_rate": 3.014184168397727e-06, + "loss": 0.98497906, + "memory(GiB)": 302.58, + "step": 231920, + "train_speed(iter/s)": 0.123891 + }, + { + "acc": 0.74162393, + "epoch": 1.2971183381405305, + "grad_norm": 6.46875, + "learning_rate": 3.013335556215446e-06, + "loss": 1.00963879, + "memory(GiB)": 302.58, + "step": 231940, + "train_speed(iter/s)": 0.123896 + }, + { + "acc": 0.76418853, + "epoch": 1.2972301876135097, + "grad_norm": 5.40625, + "learning_rate": 3.01248701198339e-06, + "loss": 0.91987276, + "memory(GiB)": 302.58, + "step": 231960, + "train_speed(iter/s)": 0.1239 + }, + { + "acc": 0.75439219, + "epoch": 1.297342037086489, + "grad_norm": 7.5625, + "learning_rate": 3.01163853573058e-06, + "loss": 0.96693363, + "memory(GiB)": 302.58, + "step": 231980, + "train_speed(iter/s)": 0.123905 + }, + { + "acc": 0.75147309, + "epoch": 1.2974538865594683, + "grad_norm": 6.5625, + "learning_rate": 3.0107901274860376e-06, + "loss": 0.96394243, + "memory(GiB)": 302.58, + "step": 232000, + "train_speed(iter/s)": 0.123911 + }, + { + "epoch": 1.2974538865594683, + "eval_acc": 0.7067806940770601, + "eval_loss": 1.01233971118927, + "eval_runtime": 7567.0748, + "eval_samples_per_second": 9.949, + "eval_steps_per_second": 9.949, + "step": 232000 + }, + { + "acc": 0.74039993, + "epoch": 1.2975657360324475, + "grad_norm": 7.875, + "learning_rate": 3.0099417872787774e-06, + "loss": 1.03812819, + "memory(GiB)": 302.58, + "step": 232020, + "train_speed(iter/s)": 0.123408 + }, + { + "acc": 0.74980426, + "epoch": 1.2976775855054268, + "grad_norm": 5.875, + "learning_rate": 3.0090935151378207e-06, + "loss": 0.96766729, + "memory(GiB)": 302.58, + "step": 232040, + "train_speed(iter/s)": 0.123413 + }, + { + "acc": 0.74322424, + "epoch": 1.297789434978406, + "grad_norm": 7.0625, + "learning_rate": 3.0082453110921796e-06, + "loss": 1.02385693, + "memory(GiB)": 302.58, + "step": 232060, + "train_speed(iter/s)": 0.123418 + }, + { + "acc": 0.76435542, + "epoch": 1.2979012844513853, + "grad_norm": 9.0625, + "learning_rate": 3.007397175170864e-06, + "loss": 0.91522999, + "memory(GiB)": 302.58, + "step": 232080, + "train_speed(iter/s)": 0.123423 + }, + { + "acc": 0.75538416, + "epoch": 1.2980131339243646, + "grad_norm": 6.75, + "learning_rate": 3.0065491074028835e-06, + "loss": 0.96395359, + "memory(GiB)": 302.58, + "step": 232100, + "train_speed(iter/s)": 0.123428 + }, + { + "acc": 0.73607206, + "epoch": 1.2981249833973438, + "grad_norm": 8.9375, + "learning_rate": 3.005701107817245e-06, + "loss": 1.03074875, + "memory(GiB)": 302.58, + "step": 232120, + "train_speed(iter/s)": 0.123433 + }, + { + "acc": 0.75808096, + "epoch": 1.298236832870323, + "grad_norm": 6.46875, + "learning_rate": 3.00485317644295e-06, + "loss": 0.94818544, + "memory(GiB)": 302.58, + "step": 232140, + "train_speed(iter/s)": 0.123438 + }, + { + "acc": 0.76334519, + "epoch": 1.2983486823433024, + "grad_norm": 9.0, + "learning_rate": 3.0040053133090053e-06, + "loss": 0.89728565, + "memory(GiB)": 302.58, + "step": 232160, + "train_speed(iter/s)": 0.123443 + }, + { + "acc": 0.74536691, + "epoch": 1.2984605318162816, + "grad_norm": 7.9375, + "learning_rate": 3.0031575184444074e-06, + "loss": 1.00584555, + "memory(GiB)": 302.58, + "step": 232180, + "train_speed(iter/s)": 0.123448 + }, + { + "acc": 0.73926463, + "epoch": 1.298572381289261, + "grad_norm": 7.125, + "learning_rate": 3.002309791878154e-06, + "loss": 1.0212945, + "memory(GiB)": 302.58, + "step": 232200, + "train_speed(iter/s)": 0.123453 + }, + { + "acc": 0.73465438, + "epoch": 1.2986842307622402, + "grad_norm": 11.375, + "learning_rate": 3.0014621336392402e-06, + "loss": 1.07528973, + "memory(GiB)": 302.58, + "step": 232220, + "train_speed(iter/s)": 0.123458 + }, + { + "acc": 0.7603878, + "epoch": 1.2987960802352194, + "grad_norm": 9.25, + "learning_rate": 3.0006145437566592e-06, + "loss": 0.95435772, + "memory(GiB)": 302.58, + "step": 232240, + "train_speed(iter/s)": 0.123464 + }, + { + "acc": 0.74890685, + "epoch": 1.2989079297081987, + "grad_norm": 7.84375, + "learning_rate": 2.9997670222594e-06, + "loss": 0.96554632, + "memory(GiB)": 302.58, + "step": 232260, + "train_speed(iter/s)": 0.123469 + }, + { + "acc": 0.73956718, + "epoch": 1.299019779181178, + "grad_norm": 7.09375, + "learning_rate": 2.9989195691764505e-06, + "loss": 1.01495752, + "memory(GiB)": 302.58, + "step": 232280, + "train_speed(iter/s)": 0.123474 + }, + { + "acc": 0.76084671, + "epoch": 1.2991316286541572, + "grad_norm": 5.6875, + "learning_rate": 2.998072184536798e-06, + "loss": 0.92744188, + "memory(GiB)": 302.58, + "step": 232300, + "train_speed(iter/s)": 0.123478 + }, + { + "acc": 0.75559673, + "epoch": 1.2992434781271365, + "grad_norm": 9.625, + "learning_rate": 2.997224868369425e-06, + "loss": 0.97759752, + "memory(GiB)": 302.58, + "step": 232320, + "train_speed(iter/s)": 0.123484 + }, + { + "acc": 0.74481249, + "epoch": 1.2993553276001157, + "grad_norm": 8.8125, + "learning_rate": 2.9963776207033113e-06, + "loss": 1.01481524, + "memory(GiB)": 302.58, + "step": 232340, + "train_speed(iter/s)": 0.123488 + }, + { + "acc": 0.76211562, + "epoch": 1.299467177073095, + "grad_norm": 7.6875, + "learning_rate": 2.995530441567437e-06, + "loss": 0.94202251, + "memory(GiB)": 302.58, + "step": 232360, + "train_speed(iter/s)": 0.123493 + }, + { + "acc": 0.75052705, + "epoch": 1.2995790265460743, + "grad_norm": 9.375, + "learning_rate": 2.9946833309907776e-06, + "loss": 0.9789257, + "memory(GiB)": 302.58, + "step": 232380, + "train_speed(iter/s)": 0.123498 + }, + { + "acc": 0.74249959, + "epoch": 1.2996908760190535, + "grad_norm": 7.5625, + "learning_rate": 2.9938362890023063e-06, + "loss": 1.01956615, + "memory(GiB)": 302.58, + "step": 232400, + "train_speed(iter/s)": 0.123503 + }, + { + "acc": 0.75903158, + "epoch": 1.2998027254920328, + "grad_norm": 8.5625, + "learning_rate": 2.9929893156309954e-06, + "loss": 0.95405664, + "memory(GiB)": 302.58, + "step": 232420, + "train_speed(iter/s)": 0.123508 + }, + { + "acc": 0.73194504, + "epoch": 1.299914574965012, + "grad_norm": 7.90625, + "learning_rate": 2.992142410905814e-06, + "loss": 1.08184404, + "memory(GiB)": 302.58, + "step": 232440, + "train_speed(iter/s)": 0.123513 + }, + { + "acc": 0.72667894, + "epoch": 1.3000264244379913, + "grad_norm": 6.75, + "learning_rate": 2.9912955748557293e-06, + "loss": 1.08393288, + "memory(GiB)": 302.58, + "step": 232460, + "train_speed(iter/s)": 0.123518 + }, + { + "acc": 0.73956237, + "epoch": 1.3001382739109706, + "grad_norm": 6.59375, + "learning_rate": 2.9904488075097054e-06, + "loss": 1.01607494, + "memory(GiB)": 302.58, + "step": 232480, + "train_speed(iter/s)": 0.123523 + }, + { + "acc": 0.73322716, + "epoch": 1.3002501233839499, + "grad_norm": 5.78125, + "learning_rate": 2.9896021088967044e-06, + "loss": 1.04995995, + "memory(GiB)": 302.58, + "step": 232500, + "train_speed(iter/s)": 0.123527 + }, + { + "acc": 0.74743376, + "epoch": 1.3003619728569291, + "grad_norm": 8.8125, + "learning_rate": 2.9887554790456865e-06, + "loss": 0.99685373, + "memory(GiB)": 302.58, + "step": 232520, + "train_speed(iter/s)": 0.123533 + }, + { + "acc": 0.75493908, + "epoch": 1.3004738223299084, + "grad_norm": 9.8125, + "learning_rate": 2.987908917985608e-06, + "loss": 0.95748644, + "memory(GiB)": 302.58, + "step": 232540, + "train_speed(iter/s)": 0.123538 + }, + { + "acc": 0.75515451, + "epoch": 1.3005856718028876, + "grad_norm": 6.5625, + "learning_rate": 2.987062425745424e-06, + "loss": 0.93087606, + "memory(GiB)": 302.58, + "step": 232560, + "train_speed(iter/s)": 0.123543 + }, + { + "acc": 0.75165558, + "epoch": 1.300697521275867, + "grad_norm": 8.1875, + "learning_rate": 2.9862160023540887e-06, + "loss": 0.98725977, + "memory(GiB)": 302.58, + "step": 232580, + "train_speed(iter/s)": 0.123548 + }, + { + "acc": 0.74582953, + "epoch": 1.3008093707488462, + "grad_norm": 7.625, + "learning_rate": 2.9853696478405514e-06, + "loss": 0.99462128, + "memory(GiB)": 302.58, + "step": 232600, + "train_speed(iter/s)": 0.123553 + }, + { + "acc": 0.74768305, + "epoch": 1.3009212202218254, + "grad_norm": 7.40625, + "learning_rate": 2.984523362233761e-06, + "loss": 0.97970428, + "memory(GiB)": 302.58, + "step": 232620, + "train_speed(iter/s)": 0.123558 + }, + { + "acc": 0.76734433, + "epoch": 1.3010330696948047, + "grad_norm": 8.1875, + "learning_rate": 2.9836771455626623e-06, + "loss": 0.89702711, + "memory(GiB)": 302.58, + "step": 232640, + "train_speed(iter/s)": 0.123563 + }, + { + "acc": 0.75618806, + "epoch": 1.301144919167784, + "grad_norm": 10.6875, + "learning_rate": 2.982830997856199e-06, + "loss": 0.95467167, + "memory(GiB)": 302.58, + "step": 232660, + "train_speed(iter/s)": 0.123568 + }, + { + "acc": 0.75677013, + "epoch": 1.3012567686407632, + "grad_norm": 5.9375, + "learning_rate": 2.981984919143311e-06, + "loss": 0.95006351, + "memory(GiB)": 302.58, + "step": 232680, + "train_speed(iter/s)": 0.123573 + }, + { + "acc": 0.76382122, + "epoch": 1.3013686181137425, + "grad_norm": 5.78125, + "learning_rate": 2.9811389094529375e-06, + "loss": 0.92588253, + "memory(GiB)": 302.58, + "step": 232700, + "train_speed(iter/s)": 0.123578 + }, + { + "acc": 0.76492419, + "epoch": 1.3014804675867218, + "grad_norm": 8.0, + "learning_rate": 2.980292968814015e-06, + "loss": 0.90160608, + "memory(GiB)": 302.58, + "step": 232720, + "train_speed(iter/s)": 0.123583 + }, + { + "acc": 0.74423275, + "epoch": 1.301592317059701, + "grad_norm": 5.875, + "learning_rate": 2.9794470972554785e-06, + "loss": 1.0047472, + "memory(GiB)": 302.58, + "step": 232740, + "train_speed(iter/s)": 0.123588 + }, + { + "acc": 0.73717375, + "epoch": 1.3017041665326803, + "grad_norm": 7.21875, + "learning_rate": 2.9786012948062576e-06, + "loss": 1.05027809, + "memory(GiB)": 302.58, + "step": 232760, + "train_speed(iter/s)": 0.123593 + }, + { + "acc": 0.73198733, + "epoch": 1.3018160160056595, + "grad_norm": 8.5, + "learning_rate": 2.9777555614952823e-06, + "loss": 1.04834738, + "memory(GiB)": 302.58, + "step": 232780, + "train_speed(iter/s)": 0.123599 + }, + { + "acc": 0.74358597, + "epoch": 1.3019278654786388, + "grad_norm": 5.34375, + "learning_rate": 2.976909897351479e-06, + "loss": 1.01660547, + "memory(GiB)": 302.58, + "step": 232800, + "train_speed(iter/s)": 0.123604 + }, + { + "acc": 0.76364222, + "epoch": 1.302039714951618, + "grad_norm": 6.4375, + "learning_rate": 2.976064302403772e-06, + "loss": 0.94121561, + "memory(GiB)": 302.58, + "step": 232820, + "train_speed(iter/s)": 0.123609 + }, + { + "acc": 0.73591189, + "epoch": 1.3021515644245973, + "grad_norm": 8.5, + "learning_rate": 2.9752187766810838e-06, + "loss": 1.03791332, + "memory(GiB)": 302.58, + "step": 232840, + "train_speed(iter/s)": 0.123614 + }, + { + "acc": 0.74774156, + "epoch": 1.3022634138975766, + "grad_norm": 10.125, + "learning_rate": 2.974373320212334e-06, + "loss": 1.00448627, + "memory(GiB)": 302.58, + "step": 232860, + "train_speed(iter/s)": 0.123619 + }, + { + "acc": 0.74864311, + "epoch": 1.3023752633705559, + "grad_norm": 7.3125, + "learning_rate": 2.9735279330264393e-06, + "loss": 0.98534107, + "memory(GiB)": 302.58, + "step": 232880, + "train_speed(iter/s)": 0.123624 + }, + { + "acc": 0.75895348, + "epoch": 1.3024871128435351, + "grad_norm": 5.78125, + "learning_rate": 2.9726826151523157e-06, + "loss": 0.94035139, + "memory(GiB)": 302.58, + "step": 232900, + "train_speed(iter/s)": 0.123629 + }, + { + "acc": 0.74372549, + "epoch": 1.3025989623165144, + "grad_norm": 6.28125, + "learning_rate": 2.9718373666188748e-06, + "loss": 1.01405344, + "memory(GiB)": 302.58, + "step": 232920, + "train_speed(iter/s)": 0.123634 + }, + { + "acc": 0.74836459, + "epoch": 1.3027108117894937, + "grad_norm": 7.375, + "learning_rate": 2.9709921874550275e-06, + "loss": 0.96729097, + "memory(GiB)": 302.58, + "step": 232940, + "train_speed(iter/s)": 0.123639 + }, + { + "acc": 0.74469423, + "epoch": 1.302822661262473, + "grad_norm": 9.3125, + "learning_rate": 2.9701470776896798e-06, + "loss": 1.00538425, + "memory(GiB)": 302.58, + "step": 232960, + "train_speed(iter/s)": 0.123644 + }, + { + "acc": 0.75312328, + "epoch": 1.3029345107354522, + "grad_norm": 4.8125, + "learning_rate": 2.969302037351739e-06, + "loss": 0.99270811, + "memory(GiB)": 302.58, + "step": 232980, + "train_speed(iter/s)": 0.123649 + }, + { + "acc": 0.74614549, + "epoch": 1.3030463602084315, + "grad_norm": 7.78125, + "learning_rate": 2.9684570664701083e-06, + "loss": 0.99728518, + "memory(GiB)": 302.58, + "step": 233000, + "train_speed(iter/s)": 0.123653 + }, + { + "acc": 0.74683456, + "epoch": 1.3031582096814107, + "grad_norm": 7.25, + "learning_rate": 2.967612165073688e-06, + "loss": 1.02289543, + "memory(GiB)": 302.58, + "step": 233020, + "train_speed(iter/s)": 0.123658 + }, + { + "acc": 0.74362373, + "epoch": 1.30327005915439, + "grad_norm": 5.34375, + "learning_rate": 2.966767333191376e-06, + "loss": 1.02810812, + "memory(GiB)": 302.58, + "step": 233040, + "train_speed(iter/s)": 0.123663 + }, + { + "acc": 0.77116761, + "epoch": 1.3033819086273692, + "grad_norm": 7.46875, + "learning_rate": 2.9659225708520683e-06, + "loss": 0.8875905, + "memory(GiB)": 302.58, + "step": 233060, + "train_speed(iter/s)": 0.123668 + }, + { + "acc": 0.73717084, + "epoch": 1.3034937581003485, + "grad_norm": 9.4375, + "learning_rate": 2.965077878084659e-06, + "loss": 1.02426043, + "memory(GiB)": 302.58, + "step": 233080, + "train_speed(iter/s)": 0.123673 + }, + { + "acc": 0.76669192, + "epoch": 1.3036056075733278, + "grad_norm": 4.90625, + "learning_rate": 2.9642332549180377e-06, + "loss": 0.91902828, + "memory(GiB)": 302.58, + "step": 233100, + "train_speed(iter/s)": 0.123678 + }, + { + "acc": 0.73350143, + "epoch": 1.303717457046307, + "grad_norm": 10.0625, + "learning_rate": 2.963388701381095e-06, + "loss": 1.04821005, + "memory(GiB)": 302.58, + "step": 233120, + "train_speed(iter/s)": 0.123684 + }, + { + "acc": 0.74130783, + "epoch": 1.3038293065192863, + "grad_norm": 8.75, + "learning_rate": 2.9625442175027164e-06, + "loss": 1.00579805, + "memory(GiB)": 302.58, + "step": 233140, + "train_speed(iter/s)": 0.123689 + }, + { + "acc": 0.72779779, + "epoch": 1.3039411559922656, + "grad_norm": 6.09375, + "learning_rate": 2.9616998033117867e-06, + "loss": 1.09344063, + "memory(GiB)": 302.58, + "step": 233160, + "train_speed(iter/s)": 0.123694 + }, + { + "acc": 0.75749068, + "epoch": 1.3040530054652448, + "grad_norm": 5.59375, + "learning_rate": 2.960855458837186e-06, + "loss": 0.97013645, + "memory(GiB)": 302.58, + "step": 233180, + "train_speed(iter/s)": 0.123698 + }, + { + "acc": 0.74842567, + "epoch": 1.304164854938224, + "grad_norm": 9.625, + "learning_rate": 2.960011184107796e-06, + "loss": 0.98173208, + "memory(GiB)": 302.58, + "step": 233200, + "train_speed(iter/s)": 0.123703 + }, + { + "acc": 0.750313, + "epoch": 1.3042767044112034, + "grad_norm": 6.71875, + "learning_rate": 2.9591669791524915e-06, + "loss": 0.98863621, + "memory(GiB)": 302.58, + "step": 233220, + "train_speed(iter/s)": 0.123708 + }, + { + "acc": 0.75999961, + "epoch": 1.3043885538841826, + "grad_norm": 6.875, + "learning_rate": 2.9583228440001465e-06, + "loss": 0.94716387, + "memory(GiB)": 302.58, + "step": 233240, + "train_speed(iter/s)": 0.123713 + }, + { + "acc": 0.75820704, + "epoch": 1.3045004033571619, + "grad_norm": 9.0625, + "learning_rate": 2.957478778679634e-06, + "loss": 0.95801563, + "memory(GiB)": 302.58, + "step": 233260, + "train_speed(iter/s)": 0.123718 + }, + { + "acc": 0.74809656, + "epoch": 1.3046122528301412, + "grad_norm": 9.75, + "learning_rate": 2.9566347832198257e-06, + "loss": 0.993748, + "memory(GiB)": 302.58, + "step": 233280, + "train_speed(iter/s)": 0.123723 + }, + { + "acc": 0.75543499, + "epoch": 1.3047241023031204, + "grad_norm": 7.65625, + "learning_rate": 2.9557908576495852e-06, + "loss": 0.97170019, + "memory(GiB)": 302.58, + "step": 233300, + "train_speed(iter/s)": 0.123727 + }, + { + "acc": 0.75242362, + "epoch": 1.3048359517760997, + "grad_norm": 6.34375, + "learning_rate": 2.954947001997781e-06, + "loss": 0.9912055, + "memory(GiB)": 302.58, + "step": 233320, + "train_speed(iter/s)": 0.123732 + }, + { + "acc": 0.73157372, + "epoch": 1.304947801249079, + "grad_norm": 5.34375, + "learning_rate": 2.954103216293274e-06, + "loss": 1.0541007, + "memory(GiB)": 302.58, + "step": 233340, + "train_speed(iter/s)": 0.123737 + }, + { + "acc": 0.75789232, + "epoch": 1.3050596507220582, + "grad_norm": 7.125, + "learning_rate": 2.9532595005649246e-06, + "loss": 0.92850828, + "memory(GiB)": 302.58, + "step": 233360, + "train_speed(iter/s)": 0.123742 + }, + { + "acc": 0.73913579, + "epoch": 1.3051715001950375, + "grad_norm": 9.5625, + "learning_rate": 2.95241585484159e-06, + "loss": 1.03541403, + "memory(GiB)": 302.58, + "step": 233380, + "train_speed(iter/s)": 0.123747 + }, + { + "acc": 0.74976454, + "epoch": 1.3052833496680167, + "grad_norm": 5.625, + "learning_rate": 2.9515722791521256e-06, + "loss": 0.96828642, + "memory(GiB)": 302.58, + "step": 233400, + "train_speed(iter/s)": 0.123752 + }, + { + "acc": 0.74679275, + "epoch": 1.305395199140996, + "grad_norm": 8.4375, + "learning_rate": 2.9507287735253854e-06, + "loss": 0.98418856, + "memory(GiB)": 302.58, + "step": 233420, + "train_speed(iter/s)": 0.123758 + }, + { + "acc": 0.75796356, + "epoch": 1.3055070486139753, + "grad_norm": 11.375, + "learning_rate": 2.949885337990217e-06, + "loss": 0.9773592, + "memory(GiB)": 302.58, + "step": 233440, + "train_speed(iter/s)": 0.123763 + }, + { + "acc": 0.74557738, + "epoch": 1.3056188980869545, + "grad_norm": 7.3125, + "learning_rate": 2.9490419725754728e-06, + "loss": 1.00052795, + "memory(GiB)": 302.58, + "step": 233460, + "train_speed(iter/s)": 0.123768 + }, + { + "acc": 0.75455952, + "epoch": 1.3057307475599338, + "grad_norm": 7.71875, + "learning_rate": 2.9481986773099964e-06, + "loss": 0.98231459, + "memory(GiB)": 302.58, + "step": 233480, + "train_speed(iter/s)": 0.123773 + }, + { + "acc": 0.74900908, + "epoch": 1.305842597032913, + "grad_norm": 9.9375, + "learning_rate": 2.9473554522226313e-06, + "loss": 0.97042894, + "memory(GiB)": 302.58, + "step": 233500, + "train_speed(iter/s)": 0.123778 + }, + { + "acc": 0.75921626, + "epoch": 1.3059544465058923, + "grad_norm": 7.03125, + "learning_rate": 2.946512297342218e-06, + "loss": 0.93949871, + "memory(GiB)": 302.58, + "step": 233520, + "train_speed(iter/s)": 0.123783 + }, + { + "acc": 0.75803442, + "epoch": 1.3060662959788716, + "grad_norm": 9.5625, + "learning_rate": 2.945669212697596e-06, + "loss": 0.96487904, + "memory(GiB)": 302.58, + "step": 233540, + "train_speed(iter/s)": 0.123788 + }, + { + "acc": 0.75613785, + "epoch": 1.3061781454518508, + "grad_norm": 9.0625, + "learning_rate": 2.9448261983175998e-06, + "loss": 0.9517498, + "memory(GiB)": 302.58, + "step": 233560, + "train_speed(iter/s)": 0.123793 + }, + { + "acc": 0.74968448, + "epoch": 1.3062899949248301, + "grad_norm": 8.375, + "learning_rate": 2.9439832542310652e-06, + "loss": 0.98518276, + "memory(GiB)": 302.58, + "step": 233580, + "train_speed(iter/s)": 0.123798 + }, + { + "acc": 0.75891666, + "epoch": 1.3064018443978094, + "grad_norm": 5.6875, + "learning_rate": 2.9431403804668233e-06, + "loss": 0.93390923, + "memory(GiB)": 302.58, + "step": 233600, + "train_speed(iter/s)": 0.123803 + }, + { + "acc": 0.75233312, + "epoch": 1.3065136938707886, + "grad_norm": 4.90625, + "learning_rate": 2.9422975770537025e-06, + "loss": 0.96271229, + "memory(GiB)": 302.58, + "step": 233620, + "train_speed(iter/s)": 0.123808 + }, + { + "acc": 0.73224983, + "epoch": 1.306625543343768, + "grad_norm": 7.78125, + "learning_rate": 2.9414548440205293e-06, + "loss": 1.06108351, + "memory(GiB)": 302.58, + "step": 233640, + "train_speed(iter/s)": 0.123813 + }, + { + "acc": 0.75428905, + "epoch": 1.3067373928167472, + "grad_norm": 7.6875, + "learning_rate": 2.9406121813961275e-06, + "loss": 0.98316097, + "memory(GiB)": 302.58, + "step": 233660, + "train_speed(iter/s)": 0.123818 + }, + { + "acc": 0.72939515, + "epoch": 1.3068492422897264, + "grad_norm": 7.78125, + "learning_rate": 2.93976958920932e-06, + "loss": 1.086341, + "memory(GiB)": 302.58, + "step": 233680, + "train_speed(iter/s)": 0.123823 + }, + { + "acc": 0.74255152, + "epoch": 1.3069610917627057, + "grad_norm": 8.0, + "learning_rate": 2.9389270674889235e-06, + "loss": 1.0103405, + "memory(GiB)": 302.58, + "step": 233700, + "train_speed(iter/s)": 0.123828 + }, + { + "acc": 0.74091287, + "epoch": 1.307072941235685, + "grad_norm": 7.875, + "learning_rate": 2.9380846162637584e-06, + "loss": 1.02263145, + "memory(GiB)": 302.58, + "step": 233720, + "train_speed(iter/s)": 0.123833 + }, + { + "acc": 0.76969457, + "epoch": 1.3071847907086642, + "grad_norm": 7.4375, + "learning_rate": 2.9372422355626374e-06, + "loss": 0.9039629, + "memory(GiB)": 302.58, + "step": 233740, + "train_speed(iter/s)": 0.123838 + }, + { + "acc": 0.75833378, + "epoch": 1.3072966401816435, + "grad_norm": 8.625, + "learning_rate": 2.9363999254143728e-06, + "loss": 0.94561558, + "memory(GiB)": 302.58, + "step": 233760, + "train_speed(iter/s)": 0.123843 + }, + { + "acc": 0.7533885, + "epoch": 1.3074084896546228, + "grad_norm": 8.0625, + "learning_rate": 2.935557685847774e-06, + "loss": 0.95984478, + "memory(GiB)": 302.58, + "step": 233780, + "train_speed(iter/s)": 0.123847 + }, + { + "acc": 0.76984076, + "epoch": 1.307520339127602, + "grad_norm": 6.09375, + "learning_rate": 2.9347155168916487e-06, + "loss": 0.89507418, + "memory(GiB)": 302.58, + "step": 233800, + "train_speed(iter/s)": 0.123852 + }, + { + "acc": 0.74628882, + "epoch": 1.3076321886005813, + "grad_norm": 7.5, + "learning_rate": 2.933873418574802e-06, + "loss": 0.98827848, + "memory(GiB)": 302.58, + "step": 233820, + "train_speed(iter/s)": 0.123857 + }, + { + "acc": 0.74935918, + "epoch": 1.3077440380735605, + "grad_norm": 7.96875, + "learning_rate": 2.933031390926034e-06, + "loss": 0.97057056, + "memory(GiB)": 302.58, + "step": 233840, + "train_speed(iter/s)": 0.123862 + }, + { + "acc": 0.74012756, + "epoch": 1.3078558875465398, + "grad_norm": 7.8125, + "learning_rate": 2.932189433974148e-06, + "loss": 1.03513718, + "memory(GiB)": 302.58, + "step": 233860, + "train_speed(iter/s)": 0.123867 + }, + { + "acc": 0.74309654, + "epoch": 1.307967737019519, + "grad_norm": 6.5625, + "learning_rate": 2.9313475477479403e-06, + "loss": 1.01054764, + "memory(GiB)": 302.58, + "step": 233880, + "train_speed(iter/s)": 0.123873 + }, + { + "acc": 0.7346611, + "epoch": 1.3080795864924983, + "grad_norm": 8.0625, + "learning_rate": 2.9305057322762058e-06, + "loss": 1.04543743, + "memory(GiB)": 302.58, + "step": 233900, + "train_speed(iter/s)": 0.123877 + }, + { + "acc": 0.7554297, + "epoch": 1.3081914359654776, + "grad_norm": 8.375, + "learning_rate": 2.9296639875877375e-06, + "loss": 0.9677351, + "memory(GiB)": 302.58, + "step": 233920, + "train_speed(iter/s)": 0.123882 + }, + { + "acc": 0.73718348, + "epoch": 1.3083032854384569, + "grad_norm": 7.9375, + "learning_rate": 2.9288223137113257e-06, + "loss": 1.03059673, + "memory(GiB)": 302.58, + "step": 233940, + "train_speed(iter/s)": 0.123887 + }, + { + "acc": 0.7482471, + "epoch": 1.3084151349114361, + "grad_norm": 6.21875, + "learning_rate": 2.9279807106757585e-06, + "loss": 0.98474951, + "memory(GiB)": 302.58, + "step": 233960, + "train_speed(iter/s)": 0.123892 + }, + { + "acc": 0.74351749, + "epoch": 1.3085269843844154, + "grad_norm": 6.34375, + "learning_rate": 2.92713917850982e-06, + "loss": 1.00841961, + "memory(GiB)": 302.58, + "step": 233980, + "train_speed(iter/s)": 0.123897 + }, + { + "acc": 0.74952641, + "epoch": 1.3086388338573947, + "grad_norm": 7.96875, + "learning_rate": 2.926297717242296e-06, + "loss": 0.97355223, + "memory(GiB)": 302.58, + "step": 234000, + "train_speed(iter/s)": 0.123902 + }, + { + "epoch": 1.3086388338573947, + "eval_acc": 0.7068031241148146, + "eval_loss": 1.0122333765029907, + "eval_runtime": 7561.6862, + "eval_samples_per_second": 9.956, + "eval_steps_per_second": 9.956, + "step": 234000 + }, + { + "acc": 0.75514379, + "epoch": 1.308750683330374, + "grad_norm": 8.0625, + "learning_rate": 2.9254563269019647e-06, + "loss": 0.95090342, + "memory(GiB)": 302.58, + "step": 234020, + "train_speed(iter/s)": 0.123404 + }, + { + "acc": 0.75893693, + "epoch": 1.3088625328033532, + "grad_norm": 6.0, + "learning_rate": 2.924615007517606e-06, + "loss": 0.96162539, + "memory(GiB)": 302.58, + "step": 234040, + "train_speed(iter/s)": 0.123409 + }, + { + "acc": 0.75509224, + "epoch": 1.3089743822763324, + "grad_norm": 6.4375, + "learning_rate": 2.9237737591179943e-06, + "loss": 0.95771885, + "memory(GiB)": 302.58, + "step": 234060, + "train_speed(iter/s)": 0.123414 + }, + { + "acc": 0.73448029, + "epoch": 1.3090862317493117, + "grad_norm": 4.75, + "learning_rate": 2.9229325817319037e-06, + "loss": 1.04576788, + "memory(GiB)": 302.58, + "step": 234080, + "train_speed(iter/s)": 0.123419 + }, + { + "acc": 0.75504179, + "epoch": 1.309198081222291, + "grad_norm": 5.40625, + "learning_rate": 2.922091475388105e-06, + "loss": 0.97615414, + "memory(GiB)": 302.58, + "step": 234100, + "train_speed(iter/s)": 0.123424 + }, + { + "acc": 0.75023937, + "epoch": 1.3093099306952702, + "grad_norm": 8.375, + "learning_rate": 2.9212504401153664e-06, + "loss": 0.98089027, + "memory(GiB)": 302.58, + "step": 234120, + "train_speed(iter/s)": 0.123428 + }, + { + "acc": 0.74089403, + "epoch": 1.3094217801682495, + "grad_norm": 7.0, + "learning_rate": 2.920409475942454e-06, + "loss": 1.02986412, + "memory(GiB)": 302.58, + "step": 234140, + "train_speed(iter/s)": 0.123433 + }, + { + "acc": 0.72429481, + "epoch": 1.3095336296412288, + "grad_norm": 7.1875, + "learning_rate": 2.9195685828981325e-06, + "loss": 1.11629601, + "memory(GiB)": 302.58, + "step": 234160, + "train_speed(iter/s)": 0.123438 + }, + { + "acc": 0.76468706, + "epoch": 1.309645479114208, + "grad_norm": 9.3125, + "learning_rate": 2.918727761011162e-06, + "loss": 0.90545454, + "memory(GiB)": 302.58, + "step": 234180, + "train_speed(iter/s)": 0.123443 + }, + { + "acc": 0.74139848, + "epoch": 1.3097573285871873, + "grad_norm": 8.0, + "learning_rate": 2.9178870103103014e-06, + "loss": 1.00027552, + "memory(GiB)": 302.58, + "step": 234200, + "train_speed(iter/s)": 0.123448 + }, + { + "acc": 0.75573974, + "epoch": 1.3098691780601666, + "grad_norm": 7.375, + "learning_rate": 2.917046330824308e-06, + "loss": 0.95556307, + "memory(GiB)": 302.58, + "step": 234220, + "train_speed(iter/s)": 0.123453 + }, + { + "acc": 0.75494027, + "epoch": 1.3099810275331458, + "grad_norm": 5.53125, + "learning_rate": 2.9162057225819344e-06, + "loss": 0.94426889, + "memory(GiB)": 302.58, + "step": 234240, + "train_speed(iter/s)": 0.123458 + }, + { + "acc": 0.73765478, + "epoch": 1.310092877006125, + "grad_norm": 7.5625, + "learning_rate": 2.915365185611933e-06, + "loss": 1.02089949, + "memory(GiB)": 302.58, + "step": 234260, + "train_speed(iter/s)": 0.123464 + }, + { + "acc": 0.74428802, + "epoch": 1.3102047264791044, + "grad_norm": 6.78125, + "learning_rate": 2.914524719943053e-06, + "loss": 1.01982298, + "memory(GiB)": 302.58, + "step": 234280, + "train_speed(iter/s)": 0.123469 + }, + { + "acc": 0.77747617, + "epoch": 1.3103165759520836, + "grad_norm": 8.0625, + "learning_rate": 2.913684325604039e-06, + "loss": 0.86515207, + "memory(GiB)": 302.58, + "step": 234300, + "train_speed(iter/s)": 0.123474 + }, + { + "acc": 0.74457564, + "epoch": 1.3104284254250629, + "grad_norm": 5.625, + "learning_rate": 2.912844002623638e-06, + "loss": 1.00983019, + "memory(GiB)": 302.58, + "step": 234320, + "train_speed(iter/s)": 0.123479 + }, + { + "acc": 0.76271806, + "epoch": 1.3105402748980421, + "grad_norm": 8.375, + "learning_rate": 2.9120037510305893e-06, + "loss": 0.93302059, + "memory(GiB)": 302.58, + "step": 234340, + "train_speed(iter/s)": 0.123484 + }, + { + "acc": 0.74072571, + "epoch": 1.3106521243710214, + "grad_norm": 7.25, + "learning_rate": 2.9111635708536334e-06, + "loss": 1.02773561, + "memory(GiB)": 302.58, + "step": 234360, + "train_speed(iter/s)": 0.123489 + }, + { + "acc": 0.74682002, + "epoch": 1.3107639738440007, + "grad_norm": 7.28125, + "learning_rate": 2.910323462121507e-06, + "loss": 0.99352369, + "memory(GiB)": 302.58, + "step": 234380, + "train_speed(iter/s)": 0.123494 + }, + { + "acc": 0.75048461, + "epoch": 1.31087582331698, + "grad_norm": 7.6875, + "learning_rate": 2.9094834248629443e-06, + "loss": 0.98870277, + "memory(GiB)": 302.58, + "step": 234400, + "train_speed(iter/s)": 0.123499 + }, + { + "acc": 0.75644417, + "epoch": 1.3109876727899592, + "grad_norm": 4.28125, + "learning_rate": 2.908643459106678e-06, + "loss": 0.94870148, + "memory(GiB)": 302.58, + "step": 234420, + "train_speed(iter/s)": 0.123504 + }, + { + "acc": 0.74325809, + "epoch": 1.3110995222629385, + "grad_norm": 8.375, + "learning_rate": 2.907803564881436e-06, + "loss": 1.02477427, + "memory(GiB)": 302.58, + "step": 234440, + "train_speed(iter/s)": 0.123509 + }, + { + "acc": 0.7520267, + "epoch": 1.3112113717359177, + "grad_norm": 6.34375, + "learning_rate": 2.9069637422159445e-06, + "loss": 0.96812534, + "memory(GiB)": 302.58, + "step": 234460, + "train_speed(iter/s)": 0.123514 + }, + { + "acc": 0.74534297, + "epoch": 1.311323221208897, + "grad_norm": 5.5625, + "learning_rate": 2.9061239911389324e-06, + "loss": 1.01118011, + "memory(GiB)": 302.58, + "step": 234480, + "train_speed(iter/s)": 0.123519 + }, + { + "acc": 0.75185647, + "epoch": 1.3114350706818763, + "grad_norm": 8.6875, + "learning_rate": 2.9052843116791183e-06, + "loss": 0.97512484, + "memory(GiB)": 302.58, + "step": 234500, + "train_speed(iter/s)": 0.123524 + }, + { + "acc": 0.71880469, + "epoch": 1.3115469201548555, + "grad_norm": 7.28125, + "learning_rate": 2.904444703865222e-06, + "loss": 1.11260986, + "memory(GiB)": 302.58, + "step": 234520, + "train_speed(iter/s)": 0.123529 + }, + { + "acc": 0.74275684, + "epoch": 1.3116587696278348, + "grad_norm": 7.78125, + "learning_rate": 2.903605167725963e-06, + "loss": 1.0205739, + "memory(GiB)": 302.58, + "step": 234540, + "train_speed(iter/s)": 0.123534 + }, + { + "acc": 0.76754827, + "epoch": 1.311770619100814, + "grad_norm": 7.28125, + "learning_rate": 2.9027657032900548e-06, + "loss": 0.89682674, + "memory(GiB)": 302.58, + "step": 234560, + "train_speed(iter/s)": 0.123539 + }, + { + "acc": 0.73110299, + "epoch": 1.3118824685737933, + "grad_norm": 8.6875, + "learning_rate": 2.90192631058621e-06, + "loss": 1.03406096, + "memory(GiB)": 302.58, + "step": 234580, + "train_speed(iter/s)": 0.123543 + }, + { + "acc": 0.75086808, + "epoch": 1.3119943180467726, + "grad_norm": 5.3125, + "learning_rate": 2.9010869896431383e-06, + "loss": 0.98545666, + "memory(GiB)": 302.58, + "step": 234600, + "train_speed(iter/s)": 0.123548 + }, + { + "acc": 0.74154468, + "epoch": 1.3121061675197518, + "grad_norm": 6.25, + "learning_rate": 2.9002477404895462e-06, + "loss": 1.01864672, + "memory(GiB)": 302.58, + "step": 234620, + "train_speed(iter/s)": 0.123553 + }, + { + "acc": 0.75752282, + "epoch": 1.312218016992731, + "grad_norm": 7.5625, + "learning_rate": 2.89940856315414e-06, + "loss": 0.94209242, + "memory(GiB)": 302.58, + "step": 234640, + "train_speed(iter/s)": 0.123558 + }, + { + "acc": 0.74978786, + "epoch": 1.3123298664657104, + "grad_norm": 7.875, + "learning_rate": 2.898569457665622e-06, + "loss": 0.98321085, + "memory(GiB)": 302.58, + "step": 234660, + "train_speed(iter/s)": 0.123563 + }, + { + "acc": 0.74521117, + "epoch": 1.3124417159386896, + "grad_norm": 7.6875, + "learning_rate": 2.8977304240526914e-06, + "loss": 1.0106885, + "memory(GiB)": 302.58, + "step": 234680, + "train_speed(iter/s)": 0.123568 + }, + { + "acc": 0.77159991, + "epoch": 1.312553565411669, + "grad_norm": 6.5, + "learning_rate": 2.896891462344047e-06, + "loss": 0.91120644, + "memory(GiB)": 302.58, + "step": 234700, + "train_speed(iter/s)": 0.123573 + }, + { + "acc": 0.74897671, + "epoch": 1.3126654148846482, + "grad_norm": 6.78125, + "learning_rate": 2.896052572568383e-06, + "loss": 0.98203945, + "memory(GiB)": 302.58, + "step": 234720, + "train_speed(iter/s)": 0.123578 + }, + { + "acc": 0.7276032, + "epoch": 1.3127772643576274, + "grad_norm": 9.9375, + "learning_rate": 2.8952137547543925e-06, + "loss": 1.09806833, + "memory(GiB)": 302.58, + "step": 234740, + "train_speed(iter/s)": 0.123583 + }, + { + "acc": 0.77336717, + "epoch": 1.3128891138306067, + "grad_norm": 7.40625, + "learning_rate": 2.894375008930766e-06, + "loss": 0.88604488, + "memory(GiB)": 302.58, + "step": 234760, + "train_speed(iter/s)": 0.123588 + }, + { + "acc": 0.76383405, + "epoch": 1.313000963303586, + "grad_norm": 6.09375, + "learning_rate": 2.893536335126189e-06, + "loss": 0.92066479, + "memory(GiB)": 302.58, + "step": 234780, + "train_speed(iter/s)": 0.123593 + }, + { + "acc": 0.74628301, + "epoch": 1.3131128127765652, + "grad_norm": 8.3125, + "learning_rate": 2.89269773336935e-06, + "loss": 0.98140821, + "memory(GiB)": 302.58, + "step": 234800, + "train_speed(iter/s)": 0.123598 + }, + { + "acc": 0.74909587, + "epoch": 1.3132246622495445, + "grad_norm": 5.46875, + "learning_rate": 2.8918592036889314e-06, + "loss": 1.00548086, + "memory(GiB)": 302.58, + "step": 234820, + "train_speed(iter/s)": 0.123603 + }, + { + "acc": 0.7488174, + "epoch": 1.3133365117225237, + "grad_norm": 8.3125, + "learning_rate": 2.8910207461136125e-06, + "loss": 0.9945116, + "memory(GiB)": 302.58, + "step": 234840, + "train_speed(iter/s)": 0.123607 + }, + { + "acc": 0.76376972, + "epoch": 1.313448361195503, + "grad_norm": 7.8125, + "learning_rate": 2.8901823606720714e-06, + "loss": 0.92433081, + "memory(GiB)": 302.58, + "step": 234860, + "train_speed(iter/s)": 0.123612 + }, + { + "acc": 0.76551428, + "epoch": 1.3135602106684823, + "grad_norm": 8.125, + "learning_rate": 2.8893440473929833e-06, + "loss": 0.92583895, + "memory(GiB)": 302.58, + "step": 234880, + "train_speed(iter/s)": 0.123617 + }, + { + "acc": 0.73147945, + "epoch": 1.3136720601414615, + "grad_norm": 8.125, + "learning_rate": 2.888505806305022e-06, + "loss": 1.0874444, + "memory(GiB)": 302.58, + "step": 234900, + "train_speed(iter/s)": 0.123622 + }, + { + "acc": 0.74331422, + "epoch": 1.3137839096144408, + "grad_norm": 9.5625, + "learning_rate": 2.8876676374368574e-06, + "loss": 1.00705585, + "memory(GiB)": 302.58, + "step": 234920, + "train_speed(iter/s)": 0.123627 + }, + { + "acc": 0.75229158, + "epoch": 1.31389575908742, + "grad_norm": 7.71875, + "learning_rate": 2.8868295408171577e-06, + "loss": 0.95592928, + "memory(GiB)": 302.58, + "step": 234940, + "train_speed(iter/s)": 0.123633 + }, + { + "acc": 0.75313025, + "epoch": 1.3140076085603993, + "grad_norm": 6.875, + "learning_rate": 2.8859915164745876e-06, + "loss": 0.95839558, + "memory(GiB)": 302.58, + "step": 234960, + "train_speed(iter/s)": 0.123638 + }, + { + "acc": 0.76809902, + "epoch": 1.3141194580333786, + "grad_norm": 5.875, + "learning_rate": 2.885153564437812e-06, + "loss": 0.93663988, + "memory(GiB)": 302.58, + "step": 234980, + "train_speed(iter/s)": 0.123643 + }, + { + "acc": 0.74092655, + "epoch": 1.314231307506358, + "grad_norm": 6.625, + "learning_rate": 2.8843156847354904e-06, + "loss": 1.0170476, + "memory(GiB)": 302.58, + "step": 235000, + "train_speed(iter/s)": 0.123647 + }, + { + "acc": 0.7414156, + "epoch": 1.3143431569793371, + "grad_norm": 6.78125, + "learning_rate": 2.8834778773962802e-06, + "loss": 1.0069315, + "memory(GiB)": 302.58, + "step": 235020, + "train_speed(iter/s)": 0.123652 + }, + { + "acc": 0.75295019, + "epoch": 1.3144550064523166, + "grad_norm": 6.34375, + "learning_rate": 2.882640142448839e-06, + "loss": 0.95094881, + "memory(GiB)": 302.58, + "step": 235040, + "train_speed(iter/s)": 0.123657 + }, + { + "acc": 0.756849, + "epoch": 1.3145668559252957, + "grad_norm": 8.8125, + "learning_rate": 2.8818024799218157e-06, + "loss": 0.9643486, + "memory(GiB)": 302.58, + "step": 235060, + "train_speed(iter/s)": 0.123662 + }, + { + "acc": 0.75351863, + "epoch": 1.3146787053982751, + "grad_norm": 7.625, + "learning_rate": 2.8809648898438668e-06, + "loss": 0.95509663, + "memory(GiB)": 302.58, + "step": 235080, + "train_speed(iter/s)": 0.123667 + }, + { + "acc": 0.75119038, + "epoch": 1.3147905548712542, + "grad_norm": 5.84375, + "learning_rate": 2.8801273722436384e-06, + "loss": 0.98593369, + "memory(GiB)": 302.58, + "step": 235100, + "train_speed(iter/s)": 0.123673 + }, + { + "acc": 0.73734226, + "epoch": 1.3149024043442337, + "grad_norm": 5.3125, + "learning_rate": 2.8792899271497754e-06, + "loss": 1.05726528, + "memory(GiB)": 302.58, + "step": 235120, + "train_speed(iter/s)": 0.123677 + }, + { + "acc": 0.75466332, + "epoch": 1.3150142538172127, + "grad_norm": 7.65625, + "learning_rate": 2.8784525545909216e-06, + "loss": 0.95220461, + "memory(GiB)": 302.58, + "step": 235140, + "train_speed(iter/s)": 0.123683 + }, + { + "acc": 0.74760013, + "epoch": 1.3151261032901922, + "grad_norm": 7.375, + "learning_rate": 2.877615254595717e-06, + "loss": 0.97755194, + "memory(GiB)": 302.58, + "step": 235160, + "train_speed(iter/s)": 0.123688 + }, + { + "acc": 0.76190162, + "epoch": 1.3152379527631712, + "grad_norm": 8.375, + "learning_rate": 2.8767780271928014e-06, + "loss": 0.94408693, + "memory(GiB)": 302.58, + "step": 235180, + "train_speed(iter/s)": 0.123693 + }, + { + "acc": 0.75560145, + "epoch": 1.3153498022361507, + "grad_norm": 6.15625, + "learning_rate": 2.8759408724108096e-06, + "loss": 0.94204636, + "memory(GiB)": 302.58, + "step": 235200, + "train_speed(iter/s)": 0.123698 + }, + { + "acc": 0.75750251, + "epoch": 1.3154616517091298, + "grad_norm": 4.9375, + "learning_rate": 2.875103790278375e-06, + "loss": 0.95392246, + "memory(GiB)": 302.58, + "step": 235220, + "train_speed(iter/s)": 0.123702 + }, + { + "acc": 0.74315615, + "epoch": 1.3155735011821093, + "grad_norm": 9.125, + "learning_rate": 2.8742667808241287e-06, + "loss": 1.01724272, + "memory(GiB)": 302.58, + "step": 235240, + "train_speed(iter/s)": 0.123707 + }, + { + "acc": 0.74922347, + "epoch": 1.3156853506550883, + "grad_norm": 7.84375, + "learning_rate": 2.8734298440766994e-06, + "loss": 0.98896379, + "memory(GiB)": 302.58, + "step": 235260, + "train_speed(iter/s)": 0.123712 + }, + { + "acc": 0.74659228, + "epoch": 1.3157972001280678, + "grad_norm": 5.5625, + "learning_rate": 2.8725929800647122e-06, + "loss": 1.02868948, + "memory(GiB)": 302.58, + "step": 235280, + "train_speed(iter/s)": 0.123717 + }, + { + "acc": 0.75241036, + "epoch": 1.3159090496010468, + "grad_norm": 8.625, + "learning_rate": 2.871756188816791e-06, + "loss": 0.9633646, + "memory(GiB)": 302.58, + "step": 235300, + "train_speed(iter/s)": 0.123723 + }, + { + "acc": 0.75438604, + "epoch": 1.3160208990740263, + "grad_norm": 6.6875, + "learning_rate": 2.870919470361556e-06, + "loss": 0.96020536, + "memory(GiB)": 302.58, + "step": 235320, + "train_speed(iter/s)": 0.123727 + }, + { + "acc": 0.75880976, + "epoch": 1.3161327485470053, + "grad_norm": 8.75, + "learning_rate": 2.8700828247276252e-06, + "loss": 0.94263906, + "memory(GiB)": 302.58, + "step": 235340, + "train_speed(iter/s)": 0.123732 + }, + { + "acc": 0.74842682, + "epoch": 1.3162445980199848, + "grad_norm": 7.46875, + "learning_rate": 2.869246251943617e-06, + "loss": 1.0159956, + "memory(GiB)": 302.58, + "step": 235360, + "train_speed(iter/s)": 0.123737 + }, + { + "acc": 0.73316727, + "epoch": 1.3163564474929639, + "grad_norm": 7.34375, + "learning_rate": 2.8684097520381437e-06, + "loss": 1.06440725, + "memory(GiB)": 302.58, + "step": 235380, + "train_speed(iter/s)": 0.123742 + }, + { + "acc": 0.76219516, + "epoch": 1.3164682969659434, + "grad_norm": 6.84375, + "learning_rate": 2.8675733250398165e-06, + "loss": 0.9313838, + "memory(GiB)": 302.58, + "step": 235400, + "train_speed(iter/s)": 0.123747 + }, + { + "acc": 0.73701062, + "epoch": 1.3165801464389224, + "grad_norm": 6.875, + "learning_rate": 2.8667369709772423e-06, + "loss": 1.03679781, + "memory(GiB)": 302.58, + "step": 235420, + "train_speed(iter/s)": 0.123752 + }, + { + "acc": 0.74328156, + "epoch": 1.316691995911902, + "grad_norm": 7.65625, + "learning_rate": 2.8659006898790286e-06, + "loss": 1.00118046, + "memory(GiB)": 302.58, + "step": 235440, + "train_speed(iter/s)": 0.123757 + }, + { + "acc": 0.75431557, + "epoch": 1.316803845384881, + "grad_norm": 8.25, + "learning_rate": 2.8650644817737783e-06, + "loss": 0.96733866, + "memory(GiB)": 302.58, + "step": 235460, + "train_speed(iter/s)": 0.123761 + }, + { + "acc": 0.77461286, + "epoch": 1.3169156948578604, + "grad_norm": 7.3125, + "learning_rate": 2.8642283466900923e-06, + "loss": 0.89437256, + "memory(GiB)": 302.58, + "step": 235480, + "train_speed(iter/s)": 0.123766 + }, + { + "acc": 0.7540153, + "epoch": 1.3170275443308395, + "grad_norm": 8.4375, + "learning_rate": 2.86339228465657e-06, + "loss": 0.97221384, + "memory(GiB)": 302.58, + "step": 235500, + "train_speed(iter/s)": 0.123771 + }, + { + "acc": 0.77047887, + "epoch": 1.317139393803819, + "grad_norm": 8.1875, + "learning_rate": 2.862556295701805e-06, + "loss": 0.90457754, + "memory(GiB)": 302.58, + "step": 235520, + "train_speed(iter/s)": 0.123776 + }, + { + "acc": 0.75098805, + "epoch": 1.317251243276798, + "grad_norm": 7.6875, + "learning_rate": 2.8617203798543936e-06, + "loss": 0.99564362, + "memory(GiB)": 302.58, + "step": 235540, + "train_speed(iter/s)": 0.123781 + }, + { + "acc": 0.73718796, + "epoch": 1.3173630927497775, + "grad_norm": 8.8125, + "learning_rate": 2.860884537142925e-06, + "loss": 1.00771904, + "memory(GiB)": 302.58, + "step": 235560, + "train_speed(iter/s)": 0.123786 + }, + { + "acc": 0.75328798, + "epoch": 1.3174749422227565, + "grad_norm": 6.34375, + "learning_rate": 2.860048767595988e-06, + "loss": 0.96661777, + "memory(GiB)": 302.58, + "step": 235580, + "train_speed(iter/s)": 0.123791 + }, + { + "acc": 0.74827347, + "epoch": 1.317586791695736, + "grad_norm": 11.5, + "learning_rate": 2.8592130712421673e-06, + "loss": 0.98599787, + "memory(GiB)": 302.58, + "step": 235600, + "train_speed(iter/s)": 0.123795 + }, + { + "acc": 0.75840139, + "epoch": 1.317698641168715, + "grad_norm": 5.46875, + "learning_rate": 2.8583774481100494e-06, + "loss": 0.94029512, + "memory(GiB)": 302.58, + "step": 235620, + "train_speed(iter/s)": 0.1238 + }, + { + "acc": 0.74694567, + "epoch": 1.3178104906416945, + "grad_norm": 6.28125, + "learning_rate": 2.8575418982282143e-06, + "loss": 0.9989954, + "memory(GiB)": 302.58, + "step": 235640, + "train_speed(iter/s)": 0.123805 + }, + { + "acc": 0.75717969, + "epoch": 1.3179223401146736, + "grad_norm": 6.0625, + "learning_rate": 2.8567064216252395e-06, + "loss": 0.95636187, + "memory(GiB)": 302.58, + "step": 235660, + "train_speed(iter/s)": 0.12381 + }, + { + "acc": 0.76049767, + "epoch": 1.318034189587653, + "grad_norm": 9.4375, + "learning_rate": 2.8558710183297005e-06, + "loss": 0.92112312, + "memory(GiB)": 302.58, + "step": 235680, + "train_speed(iter/s)": 0.123815 + }, + { + "acc": 0.76367655, + "epoch": 1.318146039060632, + "grad_norm": 7.96875, + "learning_rate": 2.8550356883701725e-06, + "loss": 0.92945099, + "memory(GiB)": 302.58, + "step": 235700, + "train_speed(iter/s)": 0.12382 + }, + { + "acc": 0.7749135, + "epoch": 1.3182578885336116, + "grad_norm": 9.0625, + "learning_rate": 2.8542004317752246e-06, + "loss": 0.87369585, + "memory(GiB)": 302.58, + "step": 235720, + "train_speed(iter/s)": 0.123825 + }, + { + "acc": 0.7521595, + "epoch": 1.3183697380065906, + "grad_norm": 7.15625, + "learning_rate": 2.853365248573427e-06, + "loss": 0.97900553, + "memory(GiB)": 302.58, + "step": 235740, + "train_speed(iter/s)": 0.12383 + }, + { + "acc": 0.76451569, + "epoch": 1.3184815874795701, + "grad_norm": 5.5, + "learning_rate": 2.8525301387933447e-06, + "loss": 0.91383696, + "memory(GiB)": 302.58, + "step": 235760, + "train_speed(iter/s)": 0.123835 + }, + { + "acc": 0.74986157, + "epoch": 1.3185934369525492, + "grad_norm": 4.1875, + "learning_rate": 2.851695102463538e-06, + "loss": 0.98520212, + "memory(GiB)": 302.58, + "step": 235780, + "train_speed(iter/s)": 0.12384 + }, + { + "acc": 0.76164742, + "epoch": 1.3187052864255286, + "grad_norm": 4.4375, + "learning_rate": 2.8508601396125734e-06, + "loss": 0.93434753, + "memory(GiB)": 302.58, + "step": 235800, + "train_speed(iter/s)": 0.123845 + }, + { + "acc": 0.74736247, + "epoch": 1.3188171358985077, + "grad_norm": 6.5, + "learning_rate": 2.8500252502690072e-06, + "loss": 1.02336226, + "memory(GiB)": 302.58, + "step": 235820, + "train_speed(iter/s)": 0.12385 + }, + { + "acc": 0.74923763, + "epoch": 1.3189289853714872, + "grad_norm": 5.875, + "learning_rate": 2.8491904344613945e-06, + "loss": 0.99125252, + "memory(GiB)": 302.58, + "step": 235840, + "train_speed(iter/s)": 0.123855 + }, + { + "acc": 0.74977674, + "epoch": 1.3190408348444662, + "grad_norm": 7.4375, + "learning_rate": 2.848355692218289e-06, + "loss": 0.98705111, + "memory(GiB)": 302.58, + "step": 235860, + "train_speed(iter/s)": 0.12386 + }, + { + "acc": 0.75326867, + "epoch": 1.3191526843174457, + "grad_norm": 9.25, + "learning_rate": 2.8475210235682418e-06, + "loss": 0.98199177, + "memory(GiB)": 302.58, + "step": 235880, + "train_speed(iter/s)": 0.123865 + }, + { + "acc": 0.75519605, + "epoch": 1.3192645337904247, + "grad_norm": 6.59375, + "learning_rate": 2.8466864285398006e-06, + "loss": 0.96345873, + "memory(GiB)": 302.58, + "step": 235900, + "train_speed(iter/s)": 0.123869 + }, + { + "acc": 0.75257907, + "epoch": 1.3193763832634042, + "grad_norm": 7.5, + "learning_rate": 2.8458519071615117e-06, + "loss": 0.9691103, + "memory(GiB)": 302.58, + "step": 235920, + "train_speed(iter/s)": 0.123874 + }, + { + "acc": 0.74425077, + "epoch": 1.3194882327363833, + "grad_norm": 5.875, + "learning_rate": 2.845017459461916e-06, + "loss": 1.00750046, + "memory(GiB)": 302.58, + "step": 235940, + "train_speed(iter/s)": 0.123879 + }, + { + "acc": 0.77000489, + "epoch": 1.3196000822093628, + "grad_norm": 7.9375, + "learning_rate": 2.8441830854695584e-06, + "loss": 0.90297985, + "memory(GiB)": 302.58, + "step": 235960, + "train_speed(iter/s)": 0.123884 + }, + { + "acc": 0.73186126, + "epoch": 1.3197119316823418, + "grad_norm": 8.5625, + "learning_rate": 2.843348785212975e-06, + "loss": 1.05773048, + "memory(GiB)": 302.58, + "step": 235980, + "train_speed(iter/s)": 0.123889 + }, + { + "acc": 0.74953904, + "epoch": 1.3198237811553213, + "grad_norm": 7.15625, + "learning_rate": 2.8425145587207026e-06, + "loss": 1.00368986, + "memory(GiB)": 302.58, + "step": 236000, + "train_speed(iter/s)": 0.123894 + }, + { + "epoch": 1.3198237811553213, + "eval_acc": 0.7068050959862655, + "eval_loss": 1.0121543407440186, + "eval_runtime": 7524.0093, + "eval_samples_per_second": 10.006, + "eval_steps_per_second": 10.006, + "step": 236000 + }, + { + "acc": 0.74600582, + "epoch": 1.3199356306283003, + "grad_norm": 5.125, + "learning_rate": 2.8416804060212725e-06, + "loss": 0.99432831, + "memory(GiB)": 302.58, + "step": 236020, + "train_speed(iter/s)": 0.123403 + }, + { + "acc": 0.76505642, + "epoch": 1.3200474801012798, + "grad_norm": 4.625, + "learning_rate": 2.8408463271432167e-06, + "loss": 0.91793556, + "memory(GiB)": 302.58, + "step": 236040, + "train_speed(iter/s)": 0.123408 + }, + { + "acc": 0.7522706, + "epoch": 1.3201593295742589, + "grad_norm": 6.90625, + "learning_rate": 2.8400123221150634e-06, + "loss": 0.97172251, + "memory(GiB)": 302.58, + "step": 236060, + "train_speed(iter/s)": 0.123413 + }, + { + "acc": 0.75883183, + "epoch": 1.3202711790472383, + "grad_norm": 8.6875, + "learning_rate": 2.839178390965337e-06, + "loss": 0.95306559, + "memory(GiB)": 302.58, + "step": 236080, + "train_speed(iter/s)": 0.123418 + }, + { + "acc": 0.76384616, + "epoch": 1.3203830285202174, + "grad_norm": 7.59375, + "learning_rate": 2.8383445337225624e-06, + "loss": 0.92562294, + "memory(GiB)": 302.58, + "step": 236100, + "train_speed(iter/s)": 0.123423 + }, + { + "acc": 0.74522924, + "epoch": 1.3204948779931969, + "grad_norm": 6.65625, + "learning_rate": 2.837510750415259e-06, + "loss": 1.00202217, + "memory(GiB)": 302.58, + "step": 236120, + "train_speed(iter/s)": 0.123428 + }, + { + "acc": 0.76069942, + "epoch": 1.320606727466176, + "grad_norm": 8.0625, + "learning_rate": 2.836677041071944e-06, + "loss": 0.91345062, + "memory(GiB)": 302.58, + "step": 236140, + "train_speed(iter/s)": 0.123432 + }, + { + "acc": 0.74396148, + "epoch": 1.3207185769391554, + "grad_norm": 6.65625, + "learning_rate": 2.835843405721135e-06, + "loss": 0.99993954, + "memory(GiB)": 302.58, + "step": 236160, + "train_speed(iter/s)": 0.123437 + }, + { + "acc": 0.73931208, + "epoch": 1.3208304264121344, + "grad_norm": 7.40625, + "learning_rate": 2.8350098443913433e-06, + "loss": 1.01897221, + "memory(GiB)": 302.58, + "step": 236180, + "train_speed(iter/s)": 0.123442 + }, + { + "acc": 0.74745069, + "epoch": 1.320942275885114, + "grad_norm": 7.375, + "learning_rate": 2.8341763571110803e-06, + "loss": 0.99138832, + "memory(GiB)": 302.58, + "step": 236200, + "train_speed(iter/s)": 0.123447 + }, + { + "acc": 0.74651871, + "epoch": 1.321054125358093, + "grad_norm": 7.8125, + "learning_rate": 2.833342943908851e-06, + "loss": 0.98597755, + "memory(GiB)": 302.58, + "step": 236220, + "train_speed(iter/s)": 0.123451 + }, + { + "acc": 0.74403701, + "epoch": 1.3211659748310725, + "grad_norm": 7.53125, + "learning_rate": 2.8325096048131657e-06, + "loss": 1.02965479, + "memory(GiB)": 302.58, + "step": 236240, + "train_speed(iter/s)": 0.123456 + }, + { + "acc": 0.75579796, + "epoch": 1.3212778243040515, + "grad_norm": 8.75, + "learning_rate": 2.831676339852525e-06, + "loss": 0.9461772, + "memory(GiB)": 302.58, + "step": 236260, + "train_speed(iter/s)": 0.123461 + }, + { + "acc": 0.73646398, + "epoch": 1.321389673777031, + "grad_norm": 4.625, + "learning_rate": 2.8308431490554284e-06, + "loss": 1.06199083, + "memory(GiB)": 302.58, + "step": 236280, + "train_speed(iter/s)": 0.123466 + }, + { + "acc": 0.76332235, + "epoch": 1.32150152325001, + "grad_norm": 7.25, + "learning_rate": 2.830010032450374e-06, + "loss": 0.92750759, + "memory(GiB)": 302.58, + "step": 236300, + "train_speed(iter/s)": 0.123471 + }, + { + "acc": 0.73380947, + "epoch": 1.3216133727229895, + "grad_norm": 5.40625, + "learning_rate": 2.829176990065858e-06, + "loss": 1.05460443, + "memory(GiB)": 302.58, + "step": 236320, + "train_speed(iter/s)": 0.123476 + }, + { + "acc": 0.77091818, + "epoch": 1.3217252221959686, + "grad_norm": 7.1875, + "learning_rate": 2.8283440219303734e-06, + "loss": 0.89457188, + "memory(GiB)": 302.58, + "step": 236340, + "train_speed(iter/s)": 0.123481 + }, + { + "acc": 0.74689503, + "epoch": 1.321837071668948, + "grad_norm": 7.53125, + "learning_rate": 2.8275111280724084e-06, + "loss": 0.98862019, + "memory(GiB)": 302.58, + "step": 236360, + "train_speed(iter/s)": 0.123485 + }, + { + "acc": 0.76085653, + "epoch": 1.321948921141927, + "grad_norm": 7.34375, + "learning_rate": 2.826678308520452e-06, + "loss": 0.92867231, + "memory(GiB)": 302.58, + "step": 236380, + "train_speed(iter/s)": 0.12349 + }, + { + "acc": 0.74421101, + "epoch": 1.3220607706149066, + "grad_norm": 7.4375, + "learning_rate": 2.825845563302988e-06, + "loss": 0.99537354, + "memory(GiB)": 302.58, + "step": 236400, + "train_speed(iter/s)": 0.123495 + }, + { + "acc": 0.72987967, + "epoch": 1.3221726200878856, + "grad_norm": 8.625, + "learning_rate": 2.8250128924485004e-06, + "loss": 1.05874481, + "memory(GiB)": 302.58, + "step": 236420, + "train_speed(iter/s)": 0.1235 + }, + { + "acc": 0.76573515, + "epoch": 1.322284469560865, + "grad_norm": 8.125, + "learning_rate": 2.824180295985468e-06, + "loss": 0.92903709, + "memory(GiB)": 302.58, + "step": 236440, + "train_speed(iter/s)": 0.123505 + }, + { + "acc": 0.76117978, + "epoch": 1.3223963190338441, + "grad_norm": 8.6875, + "learning_rate": 2.82334777394237e-06, + "loss": 0.93698158, + "memory(GiB)": 302.58, + "step": 236460, + "train_speed(iter/s)": 0.12351 + }, + { + "acc": 0.7570745, + "epoch": 1.3225081685068236, + "grad_norm": 4.71875, + "learning_rate": 2.822515326347677e-06, + "loss": 0.92948503, + "memory(GiB)": 302.58, + "step": 236480, + "train_speed(iter/s)": 0.123515 + }, + { + "acc": 0.74513054, + "epoch": 1.3226200179798027, + "grad_norm": 11.0625, + "learning_rate": 2.8216829532298673e-06, + "loss": 1.00894089, + "memory(GiB)": 302.58, + "step": 236500, + "train_speed(iter/s)": 0.12352 + }, + { + "acc": 0.77178793, + "epoch": 1.3227318674527822, + "grad_norm": 5.15625, + "learning_rate": 2.8208506546174073e-06, + "loss": 0.88525829, + "memory(GiB)": 302.58, + "step": 236520, + "train_speed(iter/s)": 0.123525 + }, + { + "acc": 0.74911637, + "epoch": 1.3228437169257612, + "grad_norm": 9.3125, + "learning_rate": 2.8200184305387645e-06, + "loss": 0.99077835, + "memory(GiB)": 302.58, + "step": 236540, + "train_speed(iter/s)": 0.123529 + }, + { + "acc": 0.7461287, + "epoch": 1.3229555663987407, + "grad_norm": 5.75, + "learning_rate": 2.8191862810224046e-06, + "loss": 1.00933695, + "memory(GiB)": 302.58, + "step": 236560, + "train_speed(iter/s)": 0.123534 + }, + { + "acc": 0.74937949, + "epoch": 1.3230674158717197, + "grad_norm": 6.90625, + "learning_rate": 2.818354206096789e-06, + "loss": 0.98241491, + "memory(GiB)": 302.58, + "step": 236580, + "train_speed(iter/s)": 0.123539 + }, + { + "acc": 0.73835015, + "epoch": 1.3231792653446992, + "grad_norm": 8.0, + "learning_rate": 2.817522205790377e-06, + "loss": 1.04869366, + "memory(GiB)": 302.58, + "step": 236600, + "train_speed(iter/s)": 0.123544 + }, + { + "acc": 0.76383767, + "epoch": 1.3232911148176782, + "grad_norm": 9.625, + "learning_rate": 2.8166902801316263e-06, + "loss": 0.91671762, + "memory(GiB)": 302.58, + "step": 236620, + "train_speed(iter/s)": 0.123549 + }, + { + "acc": 0.72638955, + "epoch": 1.3234029642906577, + "grad_norm": 9.75, + "learning_rate": 2.815858429148991e-06, + "loss": 1.12097292, + "memory(GiB)": 302.58, + "step": 236640, + "train_speed(iter/s)": 0.123554 + }, + { + "acc": 0.76587768, + "epoch": 1.3235148137636368, + "grad_norm": 7.625, + "learning_rate": 2.815026652870923e-06, + "loss": 0.92657433, + "memory(GiB)": 302.58, + "step": 236660, + "train_speed(iter/s)": 0.123559 + }, + { + "acc": 0.75677643, + "epoch": 1.3236266632366163, + "grad_norm": 7.46875, + "learning_rate": 2.8141949513258715e-06, + "loss": 0.96286383, + "memory(GiB)": 302.58, + "step": 236680, + "train_speed(iter/s)": 0.123564 + }, + { + "acc": 0.74753766, + "epoch": 1.3237385127095953, + "grad_norm": 8.5, + "learning_rate": 2.813363324542284e-06, + "loss": 0.99785223, + "memory(GiB)": 302.58, + "step": 236700, + "train_speed(iter/s)": 0.123569 + }, + { + "acc": 0.75176754, + "epoch": 1.3238503621825748, + "grad_norm": 10.0, + "learning_rate": 2.8125317725486053e-06, + "loss": 0.98143024, + "memory(GiB)": 302.58, + "step": 236720, + "train_speed(iter/s)": 0.123573 + }, + { + "acc": 0.7388401, + "epoch": 1.3239622116555538, + "grad_norm": 8.1875, + "learning_rate": 2.811700295373275e-06, + "loss": 1.02735195, + "memory(GiB)": 302.58, + "step": 236740, + "train_speed(iter/s)": 0.123578 + }, + { + "acc": 0.76127214, + "epoch": 1.3240740611285333, + "grad_norm": 6.96875, + "learning_rate": 2.8108688930447324e-06, + "loss": 0.93733072, + "memory(GiB)": 302.58, + "step": 236760, + "train_speed(iter/s)": 0.123583 + }, + { + "acc": 0.74450688, + "epoch": 1.3241859106015124, + "grad_norm": 8.8125, + "learning_rate": 2.810037565591417e-06, + "loss": 1.01395254, + "memory(GiB)": 302.58, + "step": 236780, + "train_speed(iter/s)": 0.123588 + }, + { + "acc": 0.73909116, + "epoch": 1.3242977600744918, + "grad_norm": 11.9375, + "learning_rate": 2.80920631304176e-06, + "loss": 1.03350897, + "memory(GiB)": 302.58, + "step": 236800, + "train_speed(iter/s)": 0.123593 + }, + { + "acc": 0.7494195, + "epoch": 1.324409609547471, + "grad_norm": 8.0625, + "learning_rate": 2.808375135424195e-06, + "loss": 0.99556427, + "memory(GiB)": 302.58, + "step": 236820, + "train_speed(iter/s)": 0.123597 + }, + { + "acc": 0.76362038, + "epoch": 1.3245214590204504, + "grad_norm": 4.78125, + "learning_rate": 2.80754403276715e-06, + "loss": 0.91996813, + "memory(GiB)": 302.58, + "step": 236840, + "train_speed(iter/s)": 0.123602 + }, + { + "acc": 0.75958843, + "epoch": 1.3246333084934294, + "grad_norm": 10.375, + "learning_rate": 2.806713005099051e-06, + "loss": 0.94207659, + "memory(GiB)": 302.58, + "step": 236860, + "train_speed(iter/s)": 0.123608 + }, + { + "acc": 0.75453029, + "epoch": 1.324745157966409, + "grad_norm": 6.875, + "learning_rate": 2.8058820524483216e-06, + "loss": 0.96964407, + "memory(GiB)": 302.58, + "step": 236880, + "train_speed(iter/s)": 0.123613 + }, + { + "acc": 0.74159513, + "epoch": 1.324857007439388, + "grad_norm": 10.375, + "learning_rate": 2.8050511748433833e-06, + "loss": 1.015872, + "memory(GiB)": 302.58, + "step": 236900, + "train_speed(iter/s)": 0.123618 + }, + { + "acc": 0.76351466, + "epoch": 1.3249688569123674, + "grad_norm": 9.25, + "learning_rate": 2.804220372312655e-06, + "loss": 0.93204546, + "memory(GiB)": 302.58, + "step": 236920, + "train_speed(iter/s)": 0.123622 + }, + { + "acc": 0.76068945, + "epoch": 1.3250807063853465, + "grad_norm": 8.4375, + "learning_rate": 2.8033896448845525e-06, + "loss": 0.93115416, + "memory(GiB)": 302.58, + "step": 236940, + "train_speed(iter/s)": 0.123628 + }, + { + "acc": 0.75604582, + "epoch": 1.325192555858326, + "grad_norm": 7.59375, + "learning_rate": 2.8025589925874897e-06, + "loss": 0.98148899, + "memory(GiB)": 302.58, + "step": 236960, + "train_speed(iter/s)": 0.123632 + }, + { + "acc": 0.7323915, + "epoch": 1.325304405331305, + "grad_norm": 5.5625, + "learning_rate": 2.801728415449877e-06, + "loss": 1.05251598, + "memory(GiB)": 302.58, + "step": 236980, + "train_speed(iter/s)": 0.123637 + }, + { + "acc": 0.75268459, + "epoch": 1.3254162548042845, + "grad_norm": 7.40625, + "learning_rate": 2.800897913500123e-06, + "loss": 0.97428341, + "memory(GiB)": 302.58, + "step": 237000, + "train_speed(iter/s)": 0.123642 + }, + { + "acc": 0.73479595, + "epoch": 1.3255281042772635, + "grad_norm": 5.65625, + "learning_rate": 2.8000674867666335e-06, + "loss": 1.05784216, + "memory(GiB)": 302.58, + "step": 237020, + "train_speed(iter/s)": 0.123647 + }, + { + "acc": 0.7512054, + "epoch": 1.325639953750243, + "grad_norm": 5.875, + "learning_rate": 2.7992371352778102e-06, + "loss": 0.97352924, + "memory(GiB)": 302.58, + "step": 237040, + "train_speed(iter/s)": 0.123652 + }, + { + "acc": 0.75001955, + "epoch": 1.325751803223222, + "grad_norm": 7.625, + "learning_rate": 2.7984068590620568e-06, + "loss": 0.97490721, + "memory(GiB)": 302.58, + "step": 237060, + "train_speed(iter/s)": 0.123657 + }, + { + "acc": 0.750528, + "epoch": 1.3258636526962015, + "grad_norm": 10.625, + "learning_rate": 2.7975766581477682e-06, + "loss": 0.97939987, + "memory(GiB)": 302.58, + "step": 237080, + "train_speed(iter/s)": 0.123662 + }, + { + "acc": 0.75504928, + "epoch": 1.3259755021691806, + "grad_norm": 8.1875, + "learning_rate": 2.7967465325633435e-06, + "loss": 0.94857912, + "memory(GiB)": 302.58, + "step": 237100, + "train_speed(iter/s)": 0.123667 + }, + { + "acc": 0.75747328, + "epoch": 1.32608735164216, + "grad_norm": 5.28125, + "learning_rate": 2.795916482337173e-06, + "loss": 0.95126839, + "memory(GiB)": 302.58, + "step": 237120, + "train_speed(iter/s)": 0.123672 + }, + { + "acc": 0.7607883, + "epoch": 1.3261992011151391, + "grad_norm": 5.96875, + "learning_rate": 2.7950865074976484e-06, + "loss": 0.93687658, + "memory(GiB)": 302.58, + "step": 237140, + "train_speed(iter/s)": 0.123677 + }, + { + "acc": 0.76128631, + "epoch": 1.3263110505881186, + "grad_norm": 8.3125, + "learning_rate": 2.794256608073157e-06, + "loss": 0.93348436, + "memory(GiB)": 302.58, + "step": 237160, + "train_speed(iter/s)": 0.123681 + }, + { + "acc": 0.7482471, + "epoch": 1.3264229000610976, + "grad_norm": 7.65625, + "learning_rate": 2.7934267840920835e-06, + "loss": 0.99316044, + "memory(GiB)": 302.58, + "step": 237180, + "train_speed(iter/s)": 0.123686 + }, + { + "acc": 0.74852319, + "epoch": 1.3265347495340771, + "grad_norm": 8.375, + "learning_rate": 2.7925970355828113e-06, + "loss": 0.98761902, + "memory(GiB)": 302.58, + "step": 237200, + "train_speed(iter/s)": 0.123692 + }, + { + "acc": 0.76378736, + "epoch": 1.3266465990070562, + "grad_norm": 8.75, + "learning_rate": 2.7917673625737196e-06, + "loss": 0.93541403, + "memory(GiB)": 302.58, + "step": 237220, + "train_speed(iter/s)": 0.123696 + }, + { + "acc": 0.76548643, + "epoch": 1.3267584484800357, + "grad_norm": 9.1875, + "learning_rate": 2.7909377650931867e-06, + "loss": 0.92854881, + "memory(GiB)": 302.58, + "step": 237240, + "train_speed(iter/s)": 0.123701 + }, + { + "acc": 0.74512482, + "epoch": 1.3268702979530147, + "grad_norm": 7.25, + "learning_rate": 2.7901082431695863e-06, + "loss": 0.99965973, + "memory(GiB)": 302.58, + "step": 237260, + "train_speed(iter/s)": 0.123706 + }, + { + "acc": 0.75205264, + "epoch": 1.3269821474259942, + "grad_norm": 7.84375, + "learning_rate": 2.789278796831292e-06, + "loss": 0.97421885, + "memory(GiB)": 302.58, + "step": 237280, + "train_speed(iter/s)": 0.123711 + }, + { + "acc": 0.73038201, + "epoch": 1.3270939968989732, + "grad_norm": 7.28125, + "learning_rate": 2.7884494261066717e-06, + "loss": 1.07196903, + "memory(GiB)": 302.58, + "step": 237300, + "train_speed(iter/s)": 0.123716 + }, + { + "acc": 0.75438719, + "epoch": 1.3272058463719527, + "grad_norm": 10.0625, + "learning_rate": 2.7876201310240948e-06, + "loss": 0.96645298, + "memory(GiB)": 302.58, + "step": 237320, + "train_speed(iter/s)": 0.123721 + }, + { + "acc": 0.75083365, + "epoch": 1.3273176958449318, + "grad_norm": 7.5625, + "learning_rate": 2.786790911611924e-06, + "loss": 0.99331427, + "memory(GiB)": 302.58, + "step": 237340, + "train_speed(iter/s)": 0.123726 + }, + { + "acc": 0.74705062, + "epoch": 1.3274295453179112, + "grad_norm": 8.125, + "learning_rate": 2.7859617678985207e-06, + "loss": 1.010641, + "memory(GiB)": 302.58, + "step": 237360, + "train_speed(iter/s)": 0.123731 + }, + { + "acc": 0.74582877, + "epoch": 1.3275413947908903, + "grad_norm": 10.8125, + "learning_rate": 2.785132699912247e-06, + "loss": 1.01034088, + "memory(GiB)": 302.58, + "step": 237380, + "train_speed(iter/s)": 0.123735 + }, + { + "acc": 0.75968919, + "epoch": 1.3276532442638698, + "grad_norm": 6.59375, + "learning_rate": 2.7843037076814585e-06, + "loss": 0.9444458, + "memory(GiB)": 302.58, + "step": 237400, + "train_speed(iter/s)": 0.123741 + }, + { + "acc": 0.74349446, + "epoch": 1.3277650937368488, + "grad_norm": 9.6875, + "learning_rate": 2.7834747912345085e-06, + "loss": 1.01795673, + "memory(GiB)": 302.58, + "step": 237420, + "train_speed(iter/s)": 0.123746 + }, + { + "acc": 0.7415081, + "epoch": 1.3278769432098283, + "grad_norm": 5.65625, + "learning_rate": 2.7826459505997487e-06, + "loss": 1.01549072, + "memory(GiB)": 302.58, + "step": 237440, + "train_speed(iter/s)": 0.123751 + }, + { + "acc": 0.75507283, + "epoch": 1.3279887926828073, + "grad_norm": 8.5, + "learning_rate": 2.781817185805529e-06, + "loss": 0.96455631, + "memory(GiB)": 302.58, + "step": 237460, + "train_speed(iter/s)": 0.123756 + }, + { + "acc": 0.7578054, + "epoch": 1.3281006421557868, + "grad_norm": 8.125, + "learning_rate": 2.7809884968801946e-06, + "loss": 0.9248662, + "memory(GiB)": 302.58, + "step": 237480, + "train_speed(iter/s)": 0.12376 + }, + { + "acc": 0.75374246, + "epoch": 1.328212491628766, + "grad_norm": 7.6875, + "learning_rate": 2.780159883852091e-06, + "loss": 0.9586318, + "memory(GiB)": 302.58, + "step": 237500, + "train_speed(iter/s)": 0.123765 + }, + { + "acc": 0.7583725, + "epoch": 1.3283243411017454, + "grad_norm": 4.84375, + "learning_rate": 2.7793313467495575e-06, + "loss": 0.93604631, + "memory(GiB)": 302.58, + "step": 237520, + "train_speed(iter/s)": 0.12377 + }, + { + "acc": 0.74561677, + "epoch": 1.3284361905747246, + "grad_norm": 6.03125, + "learning_rate": 2.778502885600934e-06, + "loss": 0.99966698, + "memory(GiB)": 302.58, + "step": 237540, + "train_speed(iter/s)": 0.123776 + }, + { + "acc": 0.75990133, + "epoch": 1.3285480400477039, + "grad_norm": 5.65625, + "learning_rate": 2.777674500434555e-06, + "loss": 0.91746731, + "memory(GiB)": 302.58, + "step": 237560, + "train_speed(iter/s)": 0.12378 + }, + { + "acc": 0.75532713, + "epoch": 1.3286598895206831, + "grad_norm": 10.0625, + "learning_rate": 2.7768461912787558e-06, + "loss": 0.93617659, + "memory(GiB)": 302.58, + "step": 237580, + "train_speed(iter/s)": 0.123785 + }, + { + "acc": 0.77125697, + "epoch": 1.3287717389936624, + "grad_norm": 7.90625, + "learning_rate": 2.776017958161866e-06, + "loss": 0.90468111, + "memory(GiB)": 302.58, + "step": 237600, + "train_speed(iter/s)": 0.12379 + }, + { + "acc": 0.74889569, + "epoch": 1.3288835884666417, + "grad_norm": 7.625, + "learning_rate": 2.775189801112215e-06, + "loss": 0.9918478, + "memory(GiB)": 302.58, + "step": 237620, + "train_speed(iter/s)": 0.123795 + }, + { + "acc": 0.75786753, + "epoch": 1.328995437939621, + "grad_norm": 8.1875, + "learning_rate": 2.7743617201581242e-06, + "loss": 0.95796309, + "memory(GiB)": 302.58, + "step": 237640, + "train_speed(iter/s)": 0.1238 + }, + { + "acc": 0.74430933, + "epoch": 1.3291072874126002, + "grad_norm": 6.375, + "learning_rate": 2.773533715327922e-06, + "loss": 1.00462084, + "memory(GiB)": 302.58, + "step": 237660, + "train_speed(iter/s)": 0.123805 + }, + { + "acc": 0.75226722, + "epoch": 1.3292191368855795, + "grad_norm": 5.28125, + "learning_rate": 2.7727057866499273e-06, + "loss": 0.96571131, + "memory(GiB)": 302.58, + "step": 237680, + "train_speed(iter/s)": 0.12381 + }, + { + "acc": 0.74652686, + "epoch": 1.3293309863585587, + "grad_norm": 9.3125, + "learning_rate": 2.771877934152457e-06, + "loss": 1.00010338, + "memory(GiB)": 302.58, + "step": 237700, + "train_speed(iter/s)": 0.123814 + }, + { + "acc": 0.76825585, + "epoch": 1.329442835831538, + "grad_norm": 4.71875, + "learning_rate": 2.771050157863826e-06, + "loss": 0.90811968, + "memory(GiB)": 302.58, + "step": 237720, + "train_speed(iter/s)": 0.123819 + }, + { + "acc": 0.76475687, + "epoch": 1.3295546853045173, + "grad_norm": 6.5, + "learning_rate": 2.770222457812348e-06, + "loss": 0.9261734, + "memory(GiB)": 302.58, + "step": 237740, + "train_speed(iter/s)": 0.123824 + }, + { + "acc": 0.75249496, + "epoch": 1.3296665347774965, + "grad_norm": 5.6875, + "learning_rate": 2.769394834026332e-06, + "loss": 0.96683779, + "memory(GiB)": 302.58, + "step": 237760, + "train_speed(iter/s)": 0.123829 + }, + { + "acc": 0.75000677, + "epoch": 1.3297783842504758, + "grad_norm": 6.65625, + "learning_rate": 2.7685672865340862e-06, + "loss": 0.99717112, + "memory(GiB)": 302.58, + "step": 237780, + "train_speed(iter/s)": 0.123834 + }, + { + "acc": 0.76101823, + "epoch": 1.329890233723455, + "grad_norm": 8.8125, + "learning_rate": 2.7677398153639145e-06, + "loss": 0.95760794, + "memory(GiB)": 302.58, + "step": 237800, + "train_speed(iter/s)": 0.123839 + }, + { + "acc": 0.75883393, + "epoch": 1.3300020831964343, + "grad_norm": 8.625, + "learning_rate": 2.76691242054412e-06, + "loss": 0.93605576, + "memory(GiB)": 302.58, + "step": 237820, + "train_speed(iter/s)": 0.123844 + }, + { + "acc": 0.73589907, + "epoch": 1.3301139326694136, + "grad_norm": 5.65625, + "learning_rate": 2.766085102103001e-06, + "loss": 1.02106342, + "memory(GiB)": 302.58, + "step": 237840, + "train_speed(iter/s)": 0.123849 + }, + { + "acc": 0.74023786, + "epoch": 1.3302257821423928, + "grad_norm": 7.21875, + "learning_rate": 2.765257860068855e-06, + "loss": 1.02531805, + "memory(GiB)": 302.58, + "step": 237860, + "train_speed(iter/s)": 0.123853 + }, + { + "acc": 0.75504127, + "epoch": 1.330337631615372, + "grad_norm": 8.625, + "learning_rate": 2.7644306944699774e-06, + "loss": 0.95440884, + "memory(GiB)": 302.58, + "step": 237880, + "train_speed(iter/s)": 0.123858 + }, + { + "acc": 0.74269934, + "epoch": 1.3304494810883514, + "grad_norm": 8.5, + "learning_rate": 2.763603605334656e-06, + "loss": 1.01394091, + "memory(GiB)": 302.58, + "step": 237900, + "train_speed(iter/s)": 0.123863 + }, + { + "acc": 0.77503505, + "epoch": 1.3305613305613306, + "grad_norm": 6.40625, + "learning_rate": 2.7627765926911842e-06, + "loss": 0.86699448, + "memory(GiB)": 302.58, + "step": 237920, + "train_speed(iter/s)": 0.123868 + }, + { + "acc": 0.74464879, + "epoch": 1.33067318003431, + "grad_norm": 5.71875, + "learning_rate": 2.7619496565678482e-06, + "loss": 1.01186676, + "memory(GiB)": 302.58, + "step": 237940, + "train_speed(iter/s)": 0.123873 + }, + { + "acc": 0.74095559, + "epoch": 1.3307850295072892, + "grad_norm": 8.25, + "learning_rate": 2.76112279699293e-06, + "loss": 1.03289213, + "memory(GiB)": 302.58, + "step": 237960, + "train_speed(iter/s)": 0.123877 + }, + { + "acc": 0.76017013, + "epoch": 1.3308968789802684, + "grad_norm": 8.8125, + "learning_rate": 2.760296013994711e-06, + "loss": 0.93278208, + "memory(GiB)": 302.58, + "step": 237980, + "train_speed(iter/s)": 0.123882 + }, + { + "acc": 0.75111771, + "epoch": 1.3310087284532477, + "grad_norm": 11.625, + "learning_rate": 2.759469307601471e-06, + "loss": 0.982796, + "memory(GiB)": 302.58, + "step": 238000, + "train_speed(iter/s)": 0.123887 + }, + { + "epoch": 1.3310087284532477, + "eval_acc": 0.7068097298841752, + "eval_loss": 1.0122085809707642, + "eval_runtime": 7526.3456, + "eval_samples_per_second": 10.003, + "eval_steps_per_second": 10.003, + "step": 238000 + }, + { + "acc": 0.75060482, + "epoch": 1.331120577926227, + "grad_norm": 5.84375, + "learning_rate": 2.7586426778414843e-06, + "loss": 0.98822756, + "memory(GiB)": 302.58, + "step": 238020, + "train_speed(iter/s)": 0.1234 + }, + { + "acc": 0.75338068, + "epoch": 1.3312324273992062, + "grad_norm": 5.71875, + "learning_rate": 2.757816124743026e-06, + "loss": 0.98463631, + "memory(GiB)": 302.58, + "step": 238040, + "train_speed(iter/s)": 0.123404 + }, + { + "acc": 0.75265326, + "epoch": 1.3313442768721855, + "grad_norm": 7.0625, + "learning_rate": 2.756989648334366e-06, + "loss": 0.96904001, + "memory(GiB)": 302.58, + "step": 238060, + "train_speed(iter/s)": 0.123409 + }, + { + "acc": 0.74581733, + "epoch": 1.3314561263451647, + "grad_norm": 7.4375, + "learning_rate": 2.7561632486437718e-06, + "loss": 1.0209549, + "memory(GiB)": 302.58, + "step": 238080, + "train_speed(iter/s)": 0.123414 + }, + { + "acc": 0.75257235, + "epoch": 1.331567975818144, + "grad_norm": 6.3125, + "learning_rate": 2.75533692569951e-06, + "loss": 0.95861225, + "memory(GiB)": 302.58, + "step": 238100, + "train_speed(iter/s)": 0.123419 + }, + { + "acc": 0.76936655, + "epoch": 1.3316798252911233, + "grad_norm": 9.0625, + "learning_rate": 2.7545106795298428e-06, + "loss": 0.90919685, + "memory(GiB)": 302.58, + "step": 238120, + "train_speed(iter/s)": 0.123423 + }, + { + "acc": 0.75423703, + "epoch": 1.3317916747641025, + "grad_norm": 3.71875, + "learning_rate": 2.7536845101630305e-06, + "loss": 0.9613759, + "memory(GiB)": 302.58, + "step": 238140, + "train_speed(iter/s)": 0.123428 + }, + { + "acc": 0.76165109, + "epoch": 1.3319035242370818, + "grad_norm": 5.9375, + "learning_rate": 2.752858417627331e-06, + "loss": 0.94789572, + "memory(GiB)": 302.58, + "step": 238160, + "train_speed(iter/s)": 0.123433 + }, + { + "acc": 0.74003305, + "epoch": 1.332015373710061, + "grad_norm": 8.3125, + "learning_rate": 2.752032401950997e-06, + "loss": 1.04969597, + "memory(GiB)": 302.58, + "step": 238180, + "train_speed(iter/s)": 0.123438 + }, + { + "acc": 0.74831161, + "epoch": 1.3321272231830403, + "grad_norm": 7.75, + "learning_rate": 2.7512064631622854e-06, + "loss": 0.98714371, + "memory(GiB)": 302.58, + "step": 238200, + "train_speed(iter/s)": 0.123443 + }, + { + "acc": 0.73792696, + "epoch": 1.3322390726560196, + "grad_norm": 6.21875, + "learning_rate": 2.750380601289443e-06, + "loss": 1.04011965, + "memory(GiB)": 302.58, + "step": 238220, + "train_speed(iter/s)": 0.123448 + }, + { + "acc": 0.74990778, + "epoch": 1.3323509221289989, + "grad_norm": 9.375, + "learning_rate": 2.7495548163607178e-06, + "loss": 0.99628277, + "memory(GiB)": 302.58, + "step": 238240, + "train_speed(iter/s)": 0.123452 + }, + { + "acc": 0.76761441, + "epoch": 1.3324627716019781, + "grad_norm": 8.4375, + "learning_rate": 2.7487291084043534e-06, + "loss": 0.90015059, + "memory(GiB)": 302.58, + "step": 238260, + "train_speed(iter/s)": 0.123457 + }, + { + "acc": 0.74876752, + "epoch": 1.3325746210749574, + "grad_norm": 7.34375, + "learning_rate": 2.74790347744859e-06, + "loss": 1.00633373, + "memory(GiB)": 302.58, + "step": 238280, + "train_speed(iter/s)": 0.123462 + }, + { + "acc": 0.76322908, + "epoch": 1.3326864705479367, + "grad_norm": 8.4375, + "learning_rate": 2.7470779235216715e-06, + "loss": 0.93898411, + "memory(GiB)": 302.58, + "step": 238300, + "train_speed(iter/s)": 0.123467 + }, + { + "acc": 0.74022312, + "epoch": 1.332798320020916, + "grad_norm": 9.5, + "learning_rate": 2.7462524466518316e-06, + "loss": 1.04170895, + "memory(GiB)": 302.58, + "step": 238320, + "train_speed(iter/s)": 0.123472 + }, + { + "acc": 0.7418467, + "epoch": 1.3329101694938952, + "grad_norm": 7.34375, + "learning_rate": 2.7454270468673038e-06, + "loss": 1.01450338, + "memory(GiB)": 302.58, + "step": 238340, + "train_speed(iter/s)": 0.123477 + }, + { + "acc": 0.74495816, + "epoch": 1.3330220189668744, + "grad_norm": 7.0, + "learning_rate": 2.744601724196321e-06, + "loss": 1.00355415, + "memory(GiB)": 302.58, + "step": 238360, + "train_speed(iter/s)": 0.123481 + }, + { + "acc": 0.75842457, + "epoch": 1.3331338684398537, + "grad_norm": 6.96875, + "learning_rate": 2.7437764786671108e-06, + "loss": 0.92703028, + "memory(GiB)": 302.58, + "step": 238380, + "train_speed(iter/s)": 0.123486 + }, + { + "acc": 0.76799507, + "epoch": 1.333245717912833, + "grad_norm": 7.59375, + "learning_rate": 2.7429513103078986e-06, + "loss": 0.90830688, + "memory(GiB)": 302.58, + "step": 238400, + "train_speed(iter/s)": 0.123491 + }, + { + "acc": 0.75279374, + "epoch": 1.3333575673858122, + "grad_norm": 7.78125, + "learning_rate": 2.742126219146909e-06, + "loss": 0.97484312, + "memory(GiB)": 302.58, + "step": 238420, + "train_speed(iter/s)": 0.123496 + }, + { + "acc": 0.74854665, + "epoch": 1.3334694168587915, + "grad_norm": 7.625, + "learning_rate": 2.741301205212362e-06, + "loss": 1.00961447, + "memory(GiB)": 302.58, + "step": 238440, + "train_speed(iter/s)": 0.123501 + }, + { + "acc": 0.7481976, + "epoch": 1.3335812663317708, + "grad_norm": 8.875, + "learning_rate": 2.740476268532476e-06, + "loss": 1.02117147, + "memory(GiB)": 302.58, + "step": 238460, + "train_speed(iter/s)": 0.123505 + }, + { + "acc": 0.76211262, + "epoch": 1.33369311580475, + "grad_norm": 8.875, + "learning_rate": 2.7396514091354664e-06, + "loss": 0.94567528, + "memory(GiB)": 302.58, + "step": 238480, + "train_speed(iter/s)": 0.123511 + }, + { + "acc": 0.7720808, + "epoch": 1.3338049652777293, + "grad_norm": 7.28125, + "learning_rate": 2.738826627049547e-06, + "loss": 0.90338326, + "memory(GiB)": 302.58, + "step": 238500, + "train_speed(iter/s)": 0.123516 + }, + { + "acc": 0.7331708, + "epoch": 1.3339168147507086, + "grad_norm": 8.875, + "learning_rate": 2.738001922302923e-06, + "loss": 1.07034769, + "memory(GiB)": 302.58, + "step": 238520, + "train_speed(iter/s)": 0.12352 + }, + { + "acc": 0.75161815, + "epoch": 1.3340286642236878, + "grad_norm": 6.21875, + "learning_rate": 2.7371772949238095e-06, + "loss": 0.97005672, + "memory(GiB)": 302.58, + "step": 238540, + "train_speed(iter/s)": 0.123525 + }, + { + "acc": 0.73244381, + "epoch": 1.334140513696667, + "grad_norm": 11.625, + "learning_rate": 2.736352744940407e-06, + "loss": 1.06285181, + "memory(GiB)": 302.58, + "step": 238560, + "train_speed(iter/s)": 0.12353 + }, + { + "acc": 0.75927424, + "epoch": 1.3342523631696463, + "grad_norm": 9.4375, + "learning_rate": 2.7355282723809183e-06, + "loss": 0.95714569, + "memory(GiB)": 302.58, + "step": 238580, + "train_speed(iter/s)": 0.123535 + }, + { + "acc": 0.7578691, + "epoch": 1.3343642126426256, + "grad_norm": 6.1875, + "learning_rate": 2.7347038772735436e-06, + "loss": 0.956495, + "memory(GiB)": 302.58, + "step": 238600, + "train_speed(iter/s)": 0.12354 + }, + { + "acc": 0.75998163, + "epoch": 1.3344760621156049, + "grad_norm": 5.8125, + "learning_rate": 2.7338795596464783e-06, + "loss": 0.93314924, + "memory(GiB)": 302.58, + "step": 238620, + "train_speed(iter/s)": 0.123545 + }, + { + "acc": 0.74426537, + "epoch": 1.3345879115885841, + "grad_norm": 6.90625, + "learning_rate": 2.7330553195279187e-06, + "loss": 1.00367517, + "memory(GiB)": 302.58, + "step": 238640, + "train_speed(iter/s)": 0.12355 + }, + { + "acc": 0.74790263, + "epoch": 1.3346997610615634, + "grad_norm": 7.3125, + "learning_rate": 2.732231156946056e-06, + "loss": 1.01574087, + "memory(GiB)": 302.58, + "step": 238660, + "train_speed(iter/s)": 0.123555 + }, + { + "acc": 0.75446715, + "epoch": 1.3348116105345427, + "grad_norm": 6.59375, + "learning_rate": 2.7314070719290776e-06, + "loss": 0.96782484, + "memory(GiB)": 302.58, + "step": 238680, + "train_speed(iter/s)": 0.12356 + }, + { + "acc": 0.75057192, + "epoch": 1.334923460007522, + "grad_norm": 9.1875, + "learning_rate": 2.730583064505171e-06, + "loss": 0.99144812, + "memory(GiB)": 302.58, + "step": 238700, + "train_speed(iter/s)": 0.123564 + }, + { + "acc": 0.74338732, + "epoch": 1.3350353094805012, + "grad_norm": 6.15625, + "learning_rate": 2.7297591347025197e-06, + "loss": 1.02730465, + "memory(GiB)": 302.58, + "step": 238720, + "train_speed(iter/s)": 0.12357 + }, + { + "acc": 0.74263124, + "epoch": 1.3351471589534805, + "grad_norm": 8.875, + "learning_rate": 2.7289352825493052e-06, + "loss": 1.03058453, + "memory(GiB)": 302.58, + "step": 238740, + "train_speed(iter/s)": 0.123574 + }, + { + "acc": 0.74884882, + "epoch": 1.3352590084264597, + "grad_norm": 6.9375, + "learning_rate": 2.7281115080737054e-06, + "loss": 1.01263447, + "memory(GiB)": 302.58, + "step": 238760, + "train_speed(iter/s)": 0.123579 + }, + { + "acc": 0.74906406, + "epoch": 1.335370857899439, + "grad_norm": 7.53125, + "learning_rate": 2.7272878113038935e-06, + "loss": 0.98859692, + "memory(GiB)": 302.58, + "step": 238780, + "train_speed(iter/s)": 0.123584 + }, + { + "acc": 0.75593619, + "epoch": 1.3354827073724183, + "grad_norm": 8.25, + "learning_rate": 2.7264641922680467e-06, + "loss": 0.95509796, + "memory(GiB)": 302.58, + "step": 238800, + "train_speed(iter/s)": 0.123588 + }, + { + "acc": 0.76089373, + "epoch": 1.3355945568453975, + "grad_norm": 7.75, + "learning_rate": 2.7256406509943346e-06, + "loss": 0.94969425, + "memory(GiB)": 302.58, + "step": 238820, + "train_speed(iter/s)": 0.123593 + }, + { + "acc": 0.7613245, + "epoch": 1.3357064063183768, + "grad_norm": 4.90625, + "learning_rate": 2.7248171875109234e-06, + "loss": 0.95442028, + "memory(GiB)": 302.58, + "step": 238840, + "train_speed(iter/s)": 0.123598 + }, + { + "acc": 0.73267183, + "epoch": 1.335818255791356, + "grad_norm": 9.1875, + "learning_rate": 2.723993801845979e-06, + "loss": 1.07883282, + "memory(GiB)": 302.58, + "step": 238860, + "train_speed(iter/s)": 0.123603 + }, + { + "acc": 0.76694903, + "epoch": 1.3359301052643353, + "grad_norm": 5.375, + "learning_rate": 2.7231704940276636e-06, + "loss": 0.88896618, + "memory(GiB)": 302.58, + "step": 238880, + "train_speed(iter/s)": 0.123608 + }, + { + "acc": 0.73944993, + "epoch": 1.3360419547373146, + "grad_norm": 8.0625, + "learning_rate": 2.7223472640841376e-06, + "loss": 1.03316536, + "memory(GiB)": 302.58, + "step": 238900, + "train_speed(iter/s)": 0.123613 + }, + { + "acc": 0.74001594, + "epoch": 1.3361538042102938, + "grad_norm": 7.78125, + "learning_rate": 2.7215241120435565e-06, + "loss": 1.01288948, + "memory(GiB)": 302.58, + "step": 238920, + "train_speed(iter/s)": 0.123618 + }, + { + "acc": 0.74838939, + "epoch": 1.336265653683273, + "grad_norm": 9.25, + "learning_rate": 2.720701037934076e-06, + "loss": 0.98228645, + "memory(GiB)": 302.58, + "step": 238940, + "train_speed(iter/s)": 0.123623 + }, + { + "acc": 0.74722452, + "epoch": 1.3363775031562524, + "grad_norm": 6.15625, + "learning_rate": 2.7198780417838475e-06, + "loss": 0.99757662, + "memory(GiB)": 302.58, + "step": 238960, + "train_speed(iter/s)": 0.123628 + }, + { + "acc": 0.76054025, + "epoch": 1.3364893526292316, + "grad_norm": 10.4375, + "learning_rate": 2.71905512362102e-06, + "loss": 0.93317404, + "memory(GiB)": 302.58, + "step": 238980, + "train_speed(iter/s)": 0.123633 + }, + { + "acc": 0.74932499, + "epoch": 1.336601202102211, + "grad_norm": 6.65625, + "learning_rate": 2.7182322834737396e-06, + "loss": 0.98271751, + "memory(GiB)": 302.58, + "step": 239000, + "train_speed(iter/s)": 0.123638 + }, + { + "acc": 0.75704403, + "epoch": 1.3367130515751902, + "grad_norm": 7.5, + "learning_rate": 2.7174095213701514e-06, + "loss": 0.95046616, + "memory(GiB)": 302.58, + "step": 239020, + "train_speed(iter/s)": 0.123643 + }, + { + "acc": 0.75006623, + "epoch": 1.3368249010481694, + "grad_norm": 5.90625, + "learning_rate": 2.7165868373383943e-06, + "loss": 0.99362135, + "memory(GiB)": 302.58, + "step": 239040, + "train_speed(iter/s)": 0.123648 + }, + { + "acc": 0.73610034, + "epoch": 1.3369367505211487, + "grad_norm": 5.875, + "learning_rate": 2.715764231406606e-06, + "loss": 1.03859644, + "memory(GiB)": 302.58, + "step": 239060, + "train_speed(iter/s)": 0.123652 + }, + { + "acc": 0.74807119, + "epoch": 1.337048599994128, + "grad_norm": 7.3125, + "learning_rate": 2.7149417036029268e-06, + "loss": 1.00523033, + "memory(GiB)": 302.58, + "step": 239080, + "train_speed(iter/s)": 0.123657 + }, + { + "acc": 0.75566983, + "epoch": 1.3371604494671072, + "grad_norm": 6.1875, + "learning_rate": 2.714119253955487e-06, + "loss": 0.96734753, + "memory(GiB)": 302.58, + "step": 239100, + "train_speed(iter/s)": 0.123662 + }, + { + "acc": 0.76581059, + "epoch": 1.3372722989400865, + "grad_norm": 9.0, + "learning_rate": 2.7132968824924165e-06, + "loss": 0.90368433, + "memory(GiB)": 302.58, + "step": 239120, + "train_speed(iter/s)": 0.123666 + }, + { + "acc": 0.74539981, + "epoch": 1.3373841484130657, + "grad_norm": 5.0, + "learning_rate": 2.7124745892418435e-06, + "loss": 1.00581818, + "memory(GiB)": 302.58, + "step": 239140, + "train_speed(iter/s)": 0.123671 + }, + { + "acc": 0.74289379, + "epoch": 1.337495997886045, + "grad_norm": 8.5625, + "learning_rate": 2.7116523742318935e-06, + "loss": 1.02046909, + "memory(GiB)": 302.58, + "step": 239160, + "train_speed(iter/s)": 0.123676 + }, + { + "acc": 0.75672197, + "epoch": 1.3376078473590243, + "grad_norm": 6.3125, + "learning_rate": 2.710830237490688e-06, + "loss": 0.95759859, + "memory(GiB)": 302.58, + "step": 239180, + "train_speed(iter/s)": 0.123681 + }, + { + "acc": 0.75184798, + "epoch": 1.3377196968320035, + "grad_norm": 6.71875, + "learning_rate": 2.710008179046348e-06, + "loss": 0.95409355, + "memory(GiB)": 302.58, + "step": 239200, + "train_speed(iter/s)": 0.123686 + }, + { + "acc": 0.74468117, + "epoch": 1.3378315463049828, + "grad_norm": 6.9375, + "learning_rate": 2.7091861989269886e-06, + "loss": 1.01011791, + "memory(GiB)": 302.58, + "step": 239220, + "train_speed(iter/s)": 0.12369 + }, + { + "acc": 0.75164189, + "epoch": 1.337943395777962, + "grad_norm": 7.4375, + "learning_rate": 2.708364297160726e-06, + "loss": 0.968927, + "memory(GiB)": 302.58, + "step": 239240, + "train_speed(iter/s)": 0.123695 + }, + { + "acc": 0.75099578, + "epoch": 1.3380552452509413, + "grad_norm": 8.75, + "learning_rate": 2.7075424737756706e-06, + "loss": 0.98424501, + "memory(GiB)": 302.58, + "step": 239260, + "train_speed(iter/s)": 0.1237 + }, + { + "acc": 0.75243335, + "epoch": 1.3381670947239206, + "grad_norm": 8.1875, + "learning_rate": 2.706720728799932e-06, + "loss": 0.97944336, + "memory(GiB)": 302.58, + "step": 239280, + "train_speed(iter/s)": 0.123705 + }, + { + "acc": 0.76311512, + "epoch": 1.3382789441968999, + "grad_norm": 7.5625, + "learning_rate": 2.705899062261616e-06, + "loss": 0.9308363, + "memory(GiB)": 302.58, + "step": 239300, + "train_speed(iter/s)": 0.123709 + }, + { + "acc": 0.75366831, + "epoch": 1.3383907936698791, + "grad_norm": 7.96875, + "learning_rate": 2.7050774741888265e-06, + "loss": 0.97561455, + "memory(GiB)": 302.58, + "step": 239320, + "train_speed(iter/s)": 0.123714 + }, + { + "acc": 0.74015098, + "epoch": 1.3385026431428584, + "grad_norm": 7.0, + "learning_rate": 2.7042559646096626e-06, + "loss": 1.0349679, + "memory(GiB)": 302.58, + "step": 239340, + "train_speed(iter/s)": 0.123719 + }, + { + "acc": 0.74015121, + "epoch": 1.3386144926158376, + "grad_norm": 6.09375, + "learning_rate": 2.703434533552226e-06, + "loss": 1.03075886, + "memory(GiB)": 302.58, + "step": 239360, + "train_speed(iter/s)": 0.123724 + }, + { + "acc": 0.75664573, + "epoch": 1.338726342088817, + "grad_norm": 6.5625, + "learning_rate": 2.702613181044611e-06, + "loss": 0.96703978, + "memory(GiB)": 302.58, + "step": 239380, + "train_speed(iter/s)": 0.123729 + }, + { + "acc": 0.74694381, + "epoch": 1.3388381915617962, + "grad_norm": 8.25, + "learning_rate": 2.7017919071149103e-06, + "loss": 1.00015049, + "memory(GiB)": 302.58, + "step": 239400, + "train_speed(iter/s)": 0.123734 + }, + { + "acc": 0.75874686, + "epoch": 1.3389500410347754, + "grad_norm": 8.0, + "learning_rate": 2.7009707117912133e-06, + "loss": 0.94073086, + "memory(GiB)": 302.58, + "step": 239420, + "train_speed(iter/s)": 0.123738 + }, + { + "acc": 0.75625587, + "epoch": 1.3390618905077547, + "grad_norm": 6.5, + "learning_rate": 2.7001495951016078e-06, + "loss": 0.94733181, + "memory(GiB)": 302.58, + "step": 239440, + "train_speed(iter/s)": 0.123743 + }, + { + "acc": 0.75044188, + "epoch": 1.339173739980734, + "grad_norm": 7.34375, + "learning_rate": 2.6993285570741788e-06, + "loss": 0.9831358, + "memory(GiB)": 302.58, + "step": 239460, + "train_speed(iter/s)": 0.123748 + }, + { + "acc": 0.75205035, + "epoch": 1.3392855894537132, + "grad_norm": 7.84375, + "learning_rate": 2.6985075977370085e-06, + "loss": 0.97701778, + "memory(GiB)": 302.58, + "step": 239480, + "train_speed(iter/s)": 0.123753 + }, + { + "acc": 0.75908914, + "epoch": 1.3393974389266925, + "grad_norm": 8.4375, + "learning_rate": 2.6976867171181764e-06, + "loss": 0.95283222, + "memory(GiB)": 302.58, + "step": 239500, + "train_speed(iter/s)": 0.123758 + }, + { + "acc": 0.74674201, + "epoch": 1.3395092883996718, + "grad_norm": 8.6875, + "learning_rate": 2.6968659152457587e-06, + "loss": 1.00797129, + "memory(GiB)": 302.58, + "step": 239520, + "train_speed(iter/s)": 0.123763 + }, + { + "acc": 0.75357256, + "epoch": 1.339621137872651, + "grad_norm": 6.625, + "learning_rate": 2.696045192147828e-06, + "loss": 0.96890907, + "memory(GiB)": 302.58, + "step": 239540, + "train_speed(iter/s)": 0.123768 + }, + { + "acc": 0.76485219, + "epoch": 1.3397329873456303, + "grad_norm": 7.34375, + "learning_rate": 2.6952245478524593e-06, + "loss": 0.91482477, + "memory(GiB)": 302.58, + "step": 239560, + "train_speed(iter/s)": 0.123772 + }, + { + "acc": 0.76928964, + "epoch": 1.3398448368186096, + "grad_norm": 8.5625, + "learning_rate": 2.694403982387719e-06, + "loss": 0.90676622, + "memory(GiB)": 302.58, + "step": 239580, + "train_speed(iter/s)": 0.123777 + }, + { + "acc": 0.75791807, + "epoch": 1.3399566862915888, + "grad_norm": 9.1875, + "learning_rate": 2.6935834957816736e-06, + "loss": 0.94132566, + "memory(GiB)": 302.58, + "step": 239600, + "train_speed(iter/s)": 0.123782 + }, + { + "acc": 0.75412626, + "epoch": 1.340068535764568, + "grad_norm": 5.5, + "learning_rate": 2.6927630880623863e-06, + "loss": 0.96505795, + "memory(GiB)": 302.58, + "step": 239620, + "train_speed(iter/s)": 0.123787 + }, + { + "acc": 0.75271297, + "epoch": 1.3401803852375473, + "grad_norm": 5.15625, + "learning_rate": 2.691942759257917e-06, + "loss": 0.96365023, + "memory(GiB)": 302.58, + "step": 239640, + "train_speed(iter/s)": 0.123792 + }, + { + "acc": 0.77124424, + "epoch": 1.3402922347105266, + "grad_norm": 8.875, + "learning_rate": 2.6911225093963222e-06, + "loss": 0.87331629, + "memory(GiB)": 302.58, + "step": 239660, + "train_speed(iter/s)": 0.123797 + }, + { + "acc": 0.74283266, + "epoch": 1.3404040841835059, + "grad_norm": 6.9375, + "learning_rate": 2.6903023385056614e-06, + "loss": 0.98780584, + "memory(GiB)": 302.58, + "step": 239680, + "train_speed(iter/s)": 0.123802 + }, + { + "acc": 0.73911128, + "epoch": 1.3405159336564851, + "grad_norm": 6.1875, + "learning_rate": 2.6894822466139837e-06, + "loss": 1.02533045, + "memory(GiB)": 302.58, + "step": 239700, + "train_speed(iter/s)": 0.123807 + }, + { + "acc": 0.7277277, + "epoch": 1.3406277831294644, + "grad_norm": 6.34375, + "learning_rate": 2.6886622337493394e-06, + "loss": 1.07042227, + "memory(GiB)": 302.58, + "step": 239720, + "train_speed(iter/s)": 0.123812 + }, + { + "acc": 0.75076952, + "epoch": 1.3407396326024437, + "grad_norm": 9.5625, + "learning_rate": 2.687842299939777e-06, + "loss": 0.97293606, + "memory(GiB)": 302.58, + "step": 239740, + "train_speed(iter/s)": 0.123816 + }, + { + "acc": 0.75062437, + "epoch": 1.340851482075423, + "grad_norm": 7.90625, + "learning_rate": 2.6870224452133387e-06, + "loss": 0.98733864, + "memory(GiB)": 302.58, + "step": 239760, + "train_speed(iter/s)": 0.123821 + }, + { + "acc": 0.74614177, + "epoch": 1.3409633315484022, + "grad_norm": 7.0625, + "learning_rate": 2.686202669598067e-06, + "loss": 1.01521883, + "memory(GiB)": 302.58, + "step": 239780, + "train_speed(iter/s)": 0.123826 + }, + { + "acc": 0.73922329, + "epoch": 1.3410751810213815, + "grad_norm": 6.78125, + "learning_rate": 2.685382973122001e-06, + "loss": 1.02215147, + "memory(GiB)": 302.58, + "step": 239800, + "train_speed(iter/s)": 0.123831 + }, + { + "acc": 0.75112157, + "epoch": 1.3411870304943607, + "grad_norm": 6.03125, + "learning_rate": 2.684563355813177e-06, + "loss": 0.97145443, + "memory(GiB)": 302.58, + "step": 239820, + "train_speed(iter/s)": 0.123836 + }, + { + "acc": 0.74792576, + "epoch": 1.34129887996734, + "grad_norm": 5.1875, + "learning_rate": 2.683743817699628e-06, + "loss": 0.9776392, + "memory(GiB)": 302.58, + "step": 239840, + "train_speed(iter/s)": 0.123841 + }, + { + "acc": 0.74651184, + "epoch": 1.3414107294403192, + "grad_norm": 9.4375, + "learning_rate": 2.6829243588093847e-06, + "loss": 0.98873625, + "memory(GiB)": 302.58, + "step": 239860, + "train_speed(iter/s)": 0.123845 + }, + { + "acc": 0.73995075, + "epoch": 1.3415225789132985, + "grad_norm": 7.625, + "learning_rate": 2.682104979170476e-06, + "loss": 1.01374674, + "memory(GiB)": 302.58, + "step": 239880, + "train_speed(iter/s)": 0.12385 + }, + { + "acc": 0.73511257, + "epoch": 1.3416344283862778, + "grad_norm": 7.0, + "learning_rate": 2.681285678810927e-06, + "loss": 1.05488291, + "memory(GiB)": 302.58, + "step": 239900, + "train_speed(iter/s)": 0.123855 + }, + { + "acc": 0.75617685, + "epoch": 1.341746277859257, + "grad_norm": 7.03125, + "learning_rate": 2.6804664577587604e-06, + "loss": 0.94274235, + "memory(GiB)": 302.58, + "step": 239920, + "train_speed(iter/s)": 0.12386 + }, + { + "acc": 0.75385976, + "epoch": 1.3418581273322363, + "grad_norm": 6.5, + "learning_rate": 2.679647316041993e-06, + "loss": 0.94962502, + "memory(GiB)": 302.58, + "step": 239940, + "train_speed(iter/s)": 0.123865 + }, + { + "acc": 0.74425349, + "epoch": 1.3419699768052156, + "grad_norm": 5.3125, + "learning_rate": 2.678828253688648e-06, + "loss": 1.00177994, + "memory(GiB)": 302.58, + "step": 239960, + "train_speed(iter/s)": 0.12387 + }, + { + "acc": 0.75689631, + "epoch": 1.3420818262781948, + "grad_norm": 10.9375, + "learning_rate": 2.678009270726737e-06, + "loss": 0.94757967, + "memory(GiB)": 302.58, + "step": 239980, + "train_speed(iter/s)": 0.123875 + }, + { + "acc": 0.77045431, + "epoch": 1.342193675751174, + "grad_norm": 8.0, + "learning_rate": 2.6771903671842714e-06, + "loss": 0.9014945, + "memory(GiB)": 302.58, + "step": 240000, + "train_speed(iter/s)": 0.12388 + }, + { + "epoch": 1.342193675751174, + "eval_acc": 0.7068162370599633, + "eval_loss": 1.0123218297958374, + "eval_runtime": 7592.1927, + "eval_samples_per_second": 9.916, + "eval_steps_per_second": 9.916, + "step": 240000 + }, + { + "acc": 0.75550718, + "epoch": 1.3423055252241534, + "grad_norm": 7.8125, + "learning_rate": 2.6763715430892605e-06, + "loss": 0.96220713, + "memory(GiB)": 302.58, + "step": 240020, + "train_speed(iter/s)": 0.123392 + }, + { + "acc": 0.74911633, + "epoch": 1.3424173746971326, + "grad_norm": 4.96875, + "learning_rate": 2.675552798469711e-06, + "loss": 0.99681301, + "memory(GiB)": 302.58, + "step": 240040, + "train_speed(iter/s)": 0.123397 + }, + { + "acc": 0.74642196, + "epoch": 1.342529224170112, + "grad_norm": 8.625, + "learning_rate": 2.6747341333536265e-06, + "loss": 1.00529299, + "memory(GiB)": 302.58, + "step": 240060, + "train_speed(iter/s)": 0.123402 + }, + { + "acc": 0.7578114, + "epoch": 1.3426410736430912, + "grad_norm": 10.3125, + "learning_rate": 2.673915547769008e-06, + "loss": 0.96864758, + "memory(GiB)": 302.58, + "step": 240080, + "train_speed(iter/s)": 0.123407 + }, + { + "acc": 0.76698756, + "epoch": 1.3427529231160704, + "grad_norm": 8.0, + "learning_rate": 2.6730970417438534e-06, + "loss": 0.91235037, + "memory(GiB)": 302.58, + "step": 240100, + "train_speed(iter/s)": 0.123412 + }, + { + "acc": 0.75751786, + "epoch": 1.3428647725890497, + "grad_norm": 8.5, + "learning_rate": 2.6722786153061586e-06, + "loss": 0.96560049, + "memory(GiB)": 302.58, + "step": 240120, + "train_speed(iter/s)": 0.123417 + }, + { + "acc": 0.75773764, + "epoch": 1.342976622062029, + "grad_norm": 8.5625, + "learning_rate": 2.6714602684839155e-06, + "loss": 0.96085215, + "memory(GiB)": 302.58, + "step": 240140, + "train_speed(iter/s)": 0.123422 + }, + { + "acc": 0.76077738, + "epoch": 1.3430884715350082, + "grad_norm": 11.3125, + "learning_rate": 2.6706420013051153e-06, + "loss": 0.94171524, + "memory(GiB)": 302.58, + "step": 240160, + "train_speed(iter/s)": 0.123427 + }, + { + "acc": 0.75454197, + "epoch": 1.3432003210079875, + "grad_norm": 5.5, + "learning_rate": 2.669823813797745e-06, + "loss": 0.97993679, + "memory(GiB)": 302.58, + "step": 240180, + "train_speed(iter/s)": 0.123431 + }, + { + "acc": 0.74072828, + "epoch": 1.3433121704809667, + "grad_norm": 8.5, + "learning_rate": 2.669005705989789e-06, + "loss": 0.99060965, + "memory(GiB)": 302.58, + "step": 240200, + "train_speed(iter/s)": 0.123436 + }, + { + "acc": 0.76591783, + "epoch": 1.343424019953946, + "grad_norm": 11.6875, + "learning_rate": 2.668187677909227e-06, + "loss": 0.90678892, + "memory(GiB)": 302.58, + "step": 240220, + "train_speed(iter/s)": 0.123441 + }, + { + "acc": 0.75439878, + "epoch": 1.3435358694269253, + "grad_norm": 9.0, + "learning_rate": 2.6673697295840426e-06, + "loss": 0.94143648, + "memory(GiB)": 302.58, + "step": 240240, + "train_speed(iter/s)": 0.123446 + }, + { + "acc": 0.75557995, + "epoch": 1.3436477188999045, + "grad_norm": 4.0625, + "learning_rate": 2.6665518610422095e-06, + "loss": 0.98521471, + "memory(GiB)": 302.58, + "step": 240260, + "train_speed(iter/s)": 0.123451 + }, + { + "acc": 0.75753884, + "epoch": 1.3437595683728838, + "grad_norm": 5.96875, + "learning_rate": 2.6657340723117027e-06, + "loss": 0.94134779, + "memory(GiB)": 302.58, + "step": 240280, + "train_speed(iter/s)": 0.123456 + }, + { + "acc": 0.75889602, + "epoch": 1.343871417845863, + "grad_norm": 4.1875, + "learning_rate": 2.6649163634204924e-06, + "loss": 0.9365593, + "memory(GiB)": 302.58, + "step": 240300, + "train_speed(iter/s)": 0.123461 + }, + { + "acc": 0.76464262, + "epoch": 1.3439832673188423, + "grad_norm": 10.0, + "learning_rate": 2.6640987343965464e-06, + "loss": 0.92165604, + "memory(GiB)": 302.58, + "step": 240320, + "train_speed(iter/s)": 0.123466 + }, + { + "acc": 0.75357056, + "epoch": 1.3440951167918216, + "grad_norm": 7.75, + "learning_rate": 2.6632811852678304e-06, + "loss": 0.98097839, + "memory(GiB)": 302.58, + "step": 240340, + "train_speed(iter/s)": 0.123471 + }, + { + "acc": 0.75170541, + "epoch": 1.3442069662648009, + "grad_norm": 10.75, + "learning_rate": 2.6624637160623078e-06, + "loss": 0.97855844, + "memory(GiB)": 302.58, + "step": 240360, + "train_speed(iter/s)": 0.123476 + }, + { + "acc": 0.7353991, + "epoch": 1.3443188157377801, + "grad_norm": 8.4375, + "learning_rate": 2.661646326807938e-06, + "loss": 1.06758814, + "memory(GiB)": 302.58, + "step": 240380, + "train_speed(iter/s)": 0.12348 + }, + { + "acc": 0.73663874, + "epoch": 1.3444306652107594, + "grad_norm": 4.9375, + "learning_rate": 2.660829017532679e-06, + "loss": 1.04448938, + "memory(GiB)": 302.58, + "step": 240400, + "train_speed(iter/s)": 0.123485 + }, + { + "acc": 0.74817882, + "epoch": 1.3445425146837386, + "grad_norm": 7.34375, + "learning_rate": 2.660011788264484e-06, + "loss": 1.00229378, + "memory(GiB)": 302.58, + "step": 240420, + "train_speed(iter/s)": 0.123489 + }, + { + "acc": 0.73220067, + "epoch": 1.344654364156718, + "grad_norm": 9.5, + "learning_rate": 2.659194639031306e-06, + "loss": 1.05263271, + "memory(GiB)": 302.58, + "step": 240440, + "train_speed(iter/s)": 0.123494 + }, + { + "acc": 0.75585842, + "epoch": 1.3447662136296972, + "grad_norm": 9.75, + "learning_rate": 2.6583775698610946e-06, + "loss": 0.96485434, + "memory(GiB)": 302.58, + "step": 240460, + "train_speed(iter/s)": 0.123499 + }, + { + "acc": 0.76018324, + "epoch": 1.3448780631026764, + "grad_norm": 11.0, + "learning_rate": 2.6575605807817927e-06, + "loss": 0.93798609, + "memory(GiB)": 302.58, + "step": 240480, + "train_speed(iter/s)": 0.123504 + }, + { + "acc": 0.74944539, + "epoch": 1.3449899125756557, + "grad_norm": 6.96875, + "learning_rate": 2.6567436718213492e-06, + "loss": 0.96588669, + "memory(GiB)": 302.58, + "step": 240500, + "train_speed(iter/s)": 0.123509 + }, + { + "acc": 0.75694895, + "epoch": 1.345101762048635, + "grad_norm": 4.6875, + "learning_rate": 2.6559268430077024e-06, + "loss": 0.93925018, + "memory(GiB)": 302.58, + "step": 240520, + "train_speed(iter/s)": 0.123514 + }, + { + "acc": 0.75202351, + "epoch": 1.3452136115216142, + "grad_norm": 10.4375, + "learning_rate": 2.6551100943687894e-06, + "loss": 0.96713638, + "memory(GiB)": 302.58, + "step": 240540, + "train_speed(iter/s)": 0.123518 + }, + { + "acc": 0.74850082, + "epoch": 1.3453254609945935, + "grad_norm": 8.75, + "learning_rate": 2.6542934259325476e-06, + "loss": 0.99973087, + "memory(GiB)": 302.58, + "step": 240560, + "train_speed(iter/s)": 0.123523 + }, + { + "acc": 0.73736548, + "epoch": 1.3454373104675728, + "grad_norm": 6.9375, + "learning_rate": 2.6534768377269082e-06, + "loss": 1.03804588, + "memory(GiB)": 302.58, + "step": 240580, + "train_speed(iter/s)": 0.123528 + }, + { + "acc": 0.74766116, + "epoch": 1.345549159940552, + "grad_norm": 6.8125, + "learning_rate": 2.6526603297798014e-06, + "loss": 1.00731449, + "memory(GiB)": 302.58, + "step": 240600, + "train_speed(iter/s)": 0.123533 + }, + { + "acc": 0.76126404, + "epoch": 1.3456610094135313, + "grad_norm": 7.125, + "learning_rate": 2.6518439021191555e-06, + "loss": 0.95401487, + "memory(GiB)": 302.58, + "step": 240620, + "train_speed(iter/s)": 0.123538 + }, + { + "acc": 0.74662724, + "epoch": 1.3457728588865105, + "grad_norm": 8.4375, + "learning_rate": 2.651027554772893e-06, + "loss": 0.99300776, + "memory(GiB)": 302.58, + "step": 240640, + "train_speed(iter/s)": 0.123543 + }, + { + "acc": 0.76462817, + "epoch": 1.3458847083594898, + "grad_norm": 6.90625, + "learning_rate": 2.650211287768937e-06, + "loss": 0.91783342, + "memory(GiB)": 302.58, + "step": 240660, + "train_speed(iter/s)": 0.123548 + }, + { + "acc": 0.75455227, + "epoch": 1.345996557832469, + "grad_norm": 6.34375, + "learning_rate": 2.649395101135206e-06, + "loss": 0.95274563, + "memory(GiB)": 302.58, + "step": 240680, + "train_speed(iter/s)": 0.123552 + }, + { + "acc": 0.7608849, + "epoch": 1.3461084073054483, + "grad_norm": 5.5625, + "learning_rate": 2.6485789948996157e-06, + "loss": 0.92507954, + "memory(GiB)": 302.58, + "step": 240700, + "train_speed(iter/s)": 0.123557 + }, + { + "acc": 0.77241659, + "epoch": 1.3462202567784276, + "grad_norm": 7.75, + "learning_rate": 2.6477629690900795e-06, + "loss": 0.85806952, + "memory(GiB)": 302.58, + "step": 240720, + "train_speed(iter/s)": 0.123562 + }, + { + "acc": 0.7384275, + "epoch": 1.3463321062514069, + "grad_norm": 8.375, + "learning_rate": 2.6469470237345097e-06, + "loss": 1.01794319, + "memory(GiB)": 302.58, + "step": 240740, + "train_speed(iter/s)": 0.123567 + }, + { + "acc": 0.74397368, + "epoch": 1.3464439557243861, + "grad_norm": 8.375, + "learning_rate": 2.64613115886081e-06, + "loss": 1.01065044, + "memory(GiB)": 302.58, + "step": 240760, + "train_speed(iter/s)": 0.123572 + }, + { + "acc": 0.74077849, + "epoch": 1.3465558051973654, + "grad_norm": 5.625, + "learning_rate": 2.645315374496891e-06, + "loss": 1.02419729, + "memory(GiB)": 302.58, + "step": 240780, + "train_speed(iter/s)": 0.123576 + }, + { + "acc": 0.75505695, + "epoch": 1.3466676546703447, + "grad_norm": 9.4375, + "learning_rate": 2.644499670670653e-06, + "loss": 0.94994259, + "memory(GiB)": 302.58, + "step": 240800, + "train_speed(iter/s)": 0.123581 + }, + { + "acc": 0.74501204, + "epoch": 1.346779504143324, + "grad_norm": 6.65625, + "learning_rate": 2.6436840474099928e-06, + "loss": 1.00625973, + "memory(GiB)": 302.58, + "step": 240820, + "train_speed(iter/s)": 0.123586 + }, + { + "acc": 0.74112539, + "epoch": 1.3468913536163032, + "grad_norm": 4.75, + "learning_rate": 2.642868504742812e-06, + "loss": 1.01815767, + "memory(GiB)": 302.58, + "step": 240840, + "train_speed(iter/s)": 0.123591 + }, + { + "acc": 0.7276361, + "epoch": 1.3470032030892825, + "grad_norm": 5.125, + "learning_rate": 2.6420530426970024e-06, + "loss": 1.0969944, + "memory(GiB)": 302.58, + "step": 240860, + "train_speed(iter/s)": 0.123596 + }, + { + "acc": 0.75190411, + "epoch": 1.3471150525622617, + "grad_norm": 5.375, + "learning_rate": 2.6412376613004565e-06, + "loss": 0.98605928, + "memory(GiB)": 302.58, + "step": 240880, + "train_speed(iter/s)": 0.1236 + }, + { + "acc": 0.76419654, + "epoch": 1.347226902035241, + "grad_norm": 5.90625, + "learning_rate": 2.640422360581061e-06, + "loss": 0.91675873, + "memory(GiB)": 302.58, + "step": 240900, + "train_speed(iter/s)": 0.123605 + }, + { + "acc": 0.78317695, + "epoch": 1.3473387515082202, + "grad_norm": 8.5, + "learning_rate": 2.6396071405667033e-06, + "loss": 0.820403, + "memory(GiB)": 302.58, + "step": 240920, + "train_speed(iter/s)": 0.12361 + }, + { + "acc": 0.75240612, + "epoch": 1.3474506009811995, + "grad_norm": 6.875, + "learning_rate": 2.6387920012852664e-06, + "loss": 0.97101936, + "memory(GiB)": 302.58, + "step": 240940, + "train_speed(iter/s)": 0.123615 + }, + { + "acc": 0.74305005, + "epoch": 1.3475624504541788, + "grad_norm": 4.84375, + "learning_rate": 2.6379769427646298e-06, + "loss": 1.00314178, + "memory(GiB)": 302.58, + "step": 240960, + "train_speed(iter/s)": 0.123619 + }, + { + "acc": 0.73650455, + "epoch": 1.347674299927158, + "grad_norm": 7.40625, + "learning_rate": 2.637161965032671e-06, + "loss": 1.03129396, + "memory(GiB)": 302.58, + "step": 240980, + "train_speed(iter/s)": 0.123624 + }, + { + "acc": 0.74790692, + "epoch": 1.3477861494001373, + "grad_norm": 7.125, + "learning_rate": 2.636347068117266e-06, + "loss": 0.97545166, + "memory(GiB)": 302.58, + "step": 241000, + "train_speed(iter/s)": 0.123629 + }, + { + "acc": 0.75029688, + "epoch": 1.3478979988731166, + "grad_norm": 8.1875, + "learning_rate": 2.635532252046286e-06, + "loss": 0.97819118, + "memory(GiB)": 302.58, + "step": 241020, + "train_speed(iter/s)": 0.123634 + }, + { + "acc": 0.74659638, + "epoch": 1.3480098483460958, + "grad_norm": 7.28125, + "learning_rate": 2.6347175168476004e-06, + "loss": 0.98512878, + "memory(GiB)": 302.58, + "step": 241040, + "train_speed(iter/s)": 0.123639 + }, + { + "acc": 0.76198511, + "epoch": 1.348121697819075, + "grad_norm": 7.28125, + "learning_rate": 2.633902862549076e-06, + "loss": 0.93791428, + "memory(GiB)": 302.58, + "step": 241060, + "train_speed(iter/s)": 0.123644 + }, + { + "acc": 0.75115666, + "epoch": 1.3482335472920544, + "grad_norm": 4.8125, + "learning_rate": 2.633088289178574e-06, + "loss": 0.96614227, + "memory(GiB)": 302.58, + "step": 241080, + "train_speed(iter/s)": 0.123649 + }, + { + "acc": 0.75274391, + "epoch": 1.3483453967650336, + "grad_norm": 9.0, + "learning_rate": 2.63227379676396e-06, + "loss": 0.97165337, + "memory(GiB)": 302.58, + "step": 241100, + "train_speed(iter/s)": 0.123653 + }, + { + "acc": 0.74774446, + "epoch": 1.3484572462380129, + "grad_norm": 9.9375, + "learning_rate": 2.6314593853330907e-06, + "loss": 1.01659174, + "memory(GiB)": 302.58, + "step": 241120, + "train_speed(iter/s)": 0.123658 + }, + { + "acc": 0.74643488, + "epoch": 1.3485690957109921, + "grad_norm": 9.0, + "learning_rate": 2.6306450549138202e-06, + "loss": 1.00439024, + "memory(GiB)": 302.58, + "step": 241140, + "train_speed(iter/s)": 0.123663 + }, + { + "acc": 0.76162486, + "epoch": 1.3486809451839714, + "grad_norm": 4.9375, + "learning_rate": 2.629830805534002e-06, + "loss": 0.92113113, + "memory(GiB)": 302.58, + "step": 241160, + "train_speed(iter/s)": 0.123667 + }, + { + "acc": 0.74797077, + "epoch": 1.3487927946569507, + "grad_norm": 6.65625, + "learning_rate": 2.6290166372214854e-06, + "loss": 1.00627842, + "memory(GiB)": 302.58, + "step": 241180, + "train_speed(iter/s)": 0.123672 + }, + { + "acc": 0.76432157, + "epoch": 1.34890464412993, + "grad_norm": 8.6875, + "learning_rate": 2.6282025500041185e-06, + "loss": 0.91816559, + "memory(GiB)": 302.58, + "step": 241200, + "train_speed(iter/s)": 0.123677 + }, + { + "acc": 0.75225396, + "epoch": 1.3490164936029092, + "grad_norm": 8.3125, + "learning_rate": 2.6273885439097452e-06, + "loss": 0.97391405, + "memory(GiB)": 302.58, + "step": 241220, + "train_speed(iter/s)": 0.123682 + }, + { + "acc": 0.77212105, + "epoch": 1.3491283430758885, + "grad_norm": 8.8125, + "learning_rate": 2.6265746189662067e-06, + "loss": 0.88252659, + "memory(GiB)": 302.58, + "step": 241240, + "train_speed(iter/s)": 0.123687 + }, + { + "acc": 0.76139717, + "epoch": 1.3492401925488677, + "grad_norm": 7.84375, + "learning_rate": 2.6257607752013427e-06, + "loss": 0.90711002, + "memory(GiB)": 302.58, + "step": 241260, + "train_speed(iter/s)": 0.123692 + }, + { + "acc": 0.7625648, + "epoch": 1.349352042021847, + "grad_norm": 5.4375, + "learning_rate": 2.6249470126429877e-06, + "loss": 0.90954285, + "memory(GiB)": 302.58, + "step": 241280, + "train_speed(iter/s)": 0.123697 + }, + { + "acc": 0.7554769, + "epoch": 1.3494638914948263, + "grad_norm": 8.375, + "learning_rate": 2.6241333313189772e-06, + "loss": 0.96988754, + "memory(GiB)": 302.58, + "step": 241300, + "train_speed(iter/s)": 0.123702 + }, + { + "acc": 0.75112276, + "epoch": 1.3495757409678055, + "grad_norm": 9.375, + "learning_rate": 2.6233197312571398e-06, + "loss": 0.97759638, + "memory(GiB)": 302.58, + "step": 241320, + "train_speed(iter/s)": 0.123707 + }, + { + "acc": 0.7482275, + "epoch": 1.3496875904407848, + "grad_norm": 8.25, + "learning_rate": 2.622506212485304e-06, + "loss": 0.98680925, + "memory(GiB)": 302.58, + "step": 241340, + "train_speed(iter/s)": 0.123712 + }, + { + "acc": 0.75636473, + "epoch": 1.349799439913764, + "grad_norm": 7.75, + "learning_rate": 2.621692775031292e-06, + "loss": 0.95645676, + "memory(GiB)": 302.58, + "step": 241360, + "train_speed(iter/s)": 0.123717 + }, + { + "acc": 0.76416492, + "epoch": 1.3499112893867433, + "grad_norm": 8.5625, + "learning_rate": 2.6208794189229316e-06, + "loss": 0.92812271, + "memory(GiB)": 302.58, + "step": 241380, + "train_speed(iter/s)": 0.123722 + }, + { + "acc": 0.76224966, + "epoch": 1.3500231388597226, + "grad_norm": 10.0625, + "learning_rate": 2.620066144188038e-06, + "loss": 0.91839905, + "memory(GiB)": 302.58, + "step": 241400, + "train_speed(iter/s)": 0.123726 + }, + { + "acc": 0.74781337, + "epoch": 1.3501349883327018, + "grad_norm": 4.96875, + "learning_rate": 2.619252950854429e-06, + "loss": 0.99220867, + "memory(GiB)": 302.58, + "step": 241420, + "train_speed(iter/s)": 0.123731 + }, + { + "acc": 0.76259446, + "epoch": 1.350246837805681, + "grad_norm": 8.3125, + "learning_rate": 2.6184398389499187e-06, + "loss": 0.93035841, + "memory(GiB)": 302.58, + "step": 241440, + "train_speed(iter/s)": 0.123736 + }, + { + "acc": 0.74213133, + "epoch": 1.3503586872786604, + "grad_norm": 6.21875, + "learning_rate": 2.617626808502317e-06, + "loss": 1.01131239, + "memory(GiB)": 302.58, + "step": 241460, + "train_speed(iter/s)": 0.123741 + }, + { + "acc": 0.7568645, + "epoch": 1.3504705367516396, + "grad_norm": 6.75, + "learning_rate": 2.6168138595394334e-06, + "loss": 0.96908398, + "memory(GiB)": 302.58, + "step": 241480, + "train_speed(iter/s)": 0.123746 + }, + { + "acc": 0.7614634, + "epoch": 1.350582386224619, + "grad_norm": 4.875, + "learning_rate": 2.616000992089072e-06, + "loss": 0.91899757, + "memory(GiB)": 302.58, + "step": 241500, + "train_speed(iter/s)": 0.12375 + }, + { + "acc": 0.7599586, + "epoch": 1.3506942356975982, + "grad_norm": 7.4375, + "learning_rate": 2.6151882061790367e-06, + "loss": 0.93390627, + "memory(GiB)": 302.58, + "step": 241520, + "train_speed(iter/s)": 0.123755 + }, + { + "acc": 0.75898328, + "epoch": 1.3508060851705774, + "grad_norm": 8.8125, + "learning_rate": 2.6143755018371263e-06, + "loss": 0.94187155, + "memory(GiB)": 302.58, + "step": 241540, + "train_speed(iter/s)": 0.12376 + }, + { + "acc": 0.74279008, + "epoch": 1.3509179346435567, + "grad_norm": 7.34375, + "learning_rate": 2.6135628790911394e-06, + "loss": 1.01037407, + "memory(GiB)": 302.58, + "step": 241560, + "train_speed(iter/s)": 0.123764 + }, + { + "acc": 0.73022456, + "epoch": 1.351029784116536, + "grad_norm": 7.5625, + "learning_rate": 2.612750337968868e-06, + "loss": 1.07300911, + "memory(GiB)": 302.58, + "step": 241580, + "train_speed(iter/s)": 0.123769 + }, + { + "acc": 0.75890861, + "epoch": 1.3511416335895152, + "grad_norm": 7.0, + "learning_rate": 2.6119378784981052e-06, + "loss": 0.93134241, + "memory(GiB)": 302.58, + "step": 241600, + "train_speed(iter/s)": 0.123774 + }, + { + "acc": 0.72865672, + "epoch": 1.3512534830624945, + "grad_norm": 6.46875, + "learning_rate": 2.6111255007066394e-06, + "loss": 1.06836891, + "memory(GiB)": 302.58, + "step": 241620, + "train_speed(iter/s)": 0.123779 + }, + { + "acc": 0.76341505, + "epoch": 1.3513653325354738, + "grad_norm": 10.8125, + "learning_rate": 2.6103132046222544e-06, + "loss": 0.95092621, + "memory(GiB)": 302.58, + "step": 241640, + "train_speed(iter/s)": 0.123784 + }, + { + "acc": 0.73515162, + "epoch": 1.351477182008453, + "grad_norm": 6.4375, + "learning_rate": 2.6095009902727376e-06, + "loss": 1.02372246, + "memory(GiB)": 302.58, + "step": 241660, + "train_speed(iter/s)": 0.123789 + }, + { + "acc": 0.76188765, + "epoch": 1.3515890314814323, + "grad_norm": 8.9375, + "learning_rate": 2.608688857685867e-06, + "loss": 0.93411121, + "memory(GiB)": 302.58, + "step": 241680, + "train_speed(iter/s)": 0.123794 + }, + { + "acc": 0.77349558, + "epoch": 1.3517008809544115, + "grad_norm": 6.21875, + "learning_rate": 2.6078768068894193e-06, + "loss": 0.8779541, + "memory(GiB)": 302.58, + "step": 241700, + "train_speed(iter/s)": 0.123799 + }, + { + "acc": 0.72449818, + "epoch": 1.3518127304273908, + "grad_norm": 7.875, + "learning_rate": 2.6070648379111705e-06, + "loss": 1.11154146, + "memory(GiB)": 302.58, + "step": 241720, + "train_speed(iter/s)": 0.123804 + }, + { + "acc": 0.76126018, + "epoch": 1.35192457990037, + "grad_norm": 4.90625, + "learning_rate": 2.6062529507788927e-06, + "loss": 0.93391933, + "memory(GiB)": 302.58, + "step": 241740, + "train_speed(iter/s)": 0.123809 + }, + { + "acc": 0.75070381, + "epoch": 1.3520364293733493, + "grad_norm": 4.90625, + "learning_rate": 2.6054411455203532e-06, + "loss": 0.96937456, + "memory(GiB)": 302.58, + "step": 241760, + "train_speed(iter/s)": 0.123813 + }, + { + "acc": 0.75337911, + "epoch": 1.3521482788463286, + "grad_norm": 7.84375, + "learning_rate": 2.60462942216332e-06, + "loss": 0.95472536, + "memory(GiB)": 302.58, + "step": 241780, + "train_speed(iter/s)": 0.123818 + }, + { + "acc": 0.75338049, + "epoch": 1.3522601283193079, + "grad_norm": 9.6875, + "learning_rate": 2.603817780735556e-06, + "loss": 0.96507769, + "memory(GiB)": 302.58, + "step": 241800, + "train_speed(iter/s)": 0.123823 + }, + { + "acc": 0.75952458, + "epoch": 1.3523719777922871, + "grad_norm": 6.1875, + "learning_rate": 2.6030062212648222e-06, + "loss": 0.92812929, + "memory(GiB)": 302.58, + "step": 241820, + "train_speed(iter/s)": 0.123828 + }, + { + "acc": 0.75959792, + "epoch": 1.3524838272652664, + "grad_norm": 8.75, + "learning_rate": 2.6021947437788764e-06, + "loss": 0.93277988, + "memory(GiB)": 302.58, + "step": 241840, + "train_speed(iter/s)": 0.123833 + }, + { + "acc": 0.73747935, + "epoch": 1.3525956767382457, + "grad_norm": 7.375, + "learning_rate": 2.601383348305473e-06, + "loss": 1.03318958, + "memory(GiB)": 302.58, + "step": 241860, + "train_speed(iter/s)": 0.123838 + }, + { + "acc": 0.75972953, + "epoch": 1.352707526211225, + "grad_norm": 6.46875, + "learning_rate": 2.6005720348723653e-06, + "loss": 0.94569912, + "memory(GiB)": 302.58, + "step": 241880, + "train_speed(iter/s)": 0.123843 + }, + { + "acc": 0.7353827, + "epoch": 1.3528193756842042, + "grad_norm": 5.28125, + "learning_rate": 2.5997608035073e-06, + "loss": 1.04019375, + "memory(GiB)": 302.58, + "step": 241900, + "train_speed(iter/s)": 0.123848 + }, + { + "acc": 0.74158258, + "epoch": 1.3529312251571834, + "grad_norm": 5.8125, + "learning_rate": 2.5989496542380287e-06, + "loss": 1.0397254, + "memory(GiB)": 302.58, + "step": 241920, + "train_speed(iter/s)": 0.123852 + }, + { + "acc": 0.75250621, + "epoch": 1.3530430746301627, + "grad_norm": 6.09375, + "learning_rate": 2.5981385870922928e-06, + "loss": 0.94527655, + "memory(GiB)": 302.58, + "step": 241940, + "train_speed(iter/s)": 0.123857 + }, + { + "acc": 0.75075607, + "epoch": 1.353154924103142, + "grad_norm": 7.90625, + "learning_rate": 2.597327602097833e-06, + "loss": 0.97754002, + "memory(GiB)": 302.58, + "step": 241960, + "train_speed(iter/s)": 0.123862 + }, + { + "acc": 0.7635385, + "epoch": 1.3532667735761212, + "grad_norm": 5.4375, + "learning_rate": 2.5965166992823874e-06, + "loss": 0.94162731, + "memory(GiB)": 302.58, + "step": 241980, + "train_speed(iter/s)": 0.123867 + }, + { + "acc": 0.76414952, + "epoch": 1.3533786230491005, + "grad_norm": 10.0625, + "learning_rate": 2.5957058786736922e-06, + "loss": 0.90215387, + "memory(GiB)": 302.58, + "step": 242000, + "train_speed(iter/s)": 0.123872 + }, + { + "epoch": 1.3533786230491005, + "eval_acc": 0.7068339839030218, + "eval_loss": 1.0122684240341187, + "eval_runtime": 7535.1322, + "eval_samples_per_second": 9.991, + "eval_steps_per_second": 9.991, + "step": 242000 + }, + { + "acc": 0.74607759, + "epoch": 1.3534904725220798, + "grad_norm": 8.0, + "learning_rate": 2.5948951402994804e-06, + "loss": 1.00433702, + "memory(GiB)": 302.58, + "step": 242020, + "train_speed(iter/s)": 0.123392 + }, + { + "acc": 0.74752388, + "epoch": 1.353602321995059, + "grad_norm": 4.3125, + "learning_rate": 2.5940844841874792e-06, + "loss": 1.0119854, + "memory(GiB)": 302.58, + "step": 242040, + "train_speed(iter/s)": 0.123396 + }, + { + "acc": 0.73553972, + "epoch": 1.3537141714680383, + "grad_norm": 5.84375, + "learning_rate": 2.5932739103654187e-06, + "loss": 1.04596958, + "memory(GiB)": 302.58, + "step": 242060, + "train_speed(iter/s)": 0.123401 + }, + { + "acc": 0.75050511, + "epoch": 1.3538260209410176, + "grad_norm": 6.53125, + "learning_rate": 2.5924634188610228e-06, + "loss": 0.98846169, + "memory(GiB)": 302.58, + "step": 242080, + "train_speed(iter/s)": 0.123406 + }, + { + "acc": 0.7426455, + "epoch": 1.3539378704139968, + "grad_norm": 6.3125, + "learning_rate": 2.5916530097020114e-06, + "loss": 1.0279706, + "memory(GiB)": 302.58, + "step": 242100, + "train_speed(iter/s)": 0.123411 + }, + { + "acc": 0.75417304, + "epoch": 1.354049719886976, + "grad_norm": 10.4375, + "learning_rate": 2.5908426829161037e-06, + "loss": 0.94975252, + "memory(GiB)": 302.58, + "step": 242120, + "train_speed(iter/s)": 0.123416 + }, + { + "acc": 0.78556714, + "epoch": 1.3541615693599554, + "grad_norm": 7.1875, + "learning_rate": 2.590032438531016e-06, + "loss": 0.83483915, + "memory(GiB)": 302.58, + "step": 242140, + "train_speed(iter/s)": 0.123421 + }, + { + "acc": 0.77733769, + "epoch": 1.3542734188329346, + "grad_norm": 11.1875, + "learning_rate": 2.5892222765744608e-06, + "loss": 0.85437593, + "memory(GiB)": 302.58, + "step": 242160, + "train_speed(iter/s)": 0.123426 + }, + { + "acc": 0.73539162, + "epoch": 1.3543852683059139, + "grad_norm": 9.8125, + "learning_rate": 2.5884121970741477e-06, + "loss": 1.04650793, + "memory(GiB)": 302.58, + "step": 242180, + "train_speed(iter/s)": 0.123431 + }, + { + "acc": 0.7488625, + "epoch": 1.3544971177788931, + "grad_norm": 6.75, + "learning_rate": 2.587602200057785e-06, + "loss": 1.00097008, + "memory(GiB)": 302.58, + "step": 242200, + "train_speed(iter/s)": 0.123436 + }, + { + "acc": 0.74860778, + "epoch": 1.3546089672518724, + "grad_norm": 8.125, + "learning_rate": 2.586792285553076e-06, + "loss": 0.97441349, + "memory(GiB)": 302.58, + "step": 242220, + "train_speed(iter/s)": 0.123441 + }, + { + "acc": 0.73448839, + "epoch": 1.3547208167248517, + "grad_norm": 8.625, + "learning_rate": 2.585982453587721e-06, + "loss": 1.05367098, + "memory(GiB)": 302.58, + "step": 242240, + "train_speed(iter/s)": 0.123446 + }, + { + "acc": 0.7695344, + "epoch": 1.354832666197831, + "grad_norm": 7.5625, + "learning_rate": 2.5851727041894227e-06, + "loss": 0.89299154, + "memory(GiB)": 302.58, + "step": 242260, + "train_speed(iter/s)": 0.123451 + }, + { + "acc": 0.75193319, + "epoch": 1.3549445156708102, + "grad_norm": 6.78125, + "learning_rate": 2.5843630373858754e-06, + "loss": 0.96103849, + "memory(GiB)": 302.58, + "step": 242280, + "train_speed(iter/s)": 0.123455 + }, + { + "acc": 0.75168867, + "epoch": 1.3550563651437895, + "grad_norm": 6.46875, + "learning_rate": 2.5835534532047712e-06, + "loss": 0.98394899, + "memory(GiB)": 302.58, + "step": 242300, + "train_speed(iter/s)": 0.12346 + }, + { + "acc": 0.75270791, + "epoch": 1.3551682146167687, + "grad_norm": 6.0625, + "learning_rate": 2.5827439516738017e-06, + "loss": 0.98552351, + "memory(GiB)": 302.58, + "step": 242320, + "train_speed(iter/s)": 0.123465 + }, + { + "acc": 0.74142714, + "epoch": 1.355280064089748, + "grad_norm": 7.78125, + "learning_rate": 2.5819345328206548e-06, + "loss": 1.01693621, + "memory(GiB)": 302.58, + "step": 242340, + "train_speed(iter/s)": 0.12347 + }, + { + "acc": 0.75099616, + "epoch": 1.3553919135627273, + "grad_norm": 8.125, + "learning_rate": 2.5811251966730135e-06, + "loss": 0.98309889, + "memory(GiB)": 302.58, + "step": 242360, + "train_speed(iter/s)": 0.123474 + }, + { + "acc": 0.75226736, + "epoch": 1.3555037630357065, + "grad_norm": 6.78125, + "learning_rate": 2.58031594325856e-06, + "loss": 0.95976152, + "memory(GiB)": 302.58, + "step": 242380, + "train_speed(iter/s)": 0.123479 + }, + { + "acc": 0.75309439, + "epoch": 1.3556156125086858, + "grad_norm": 6.5625, + "learning_rate": 2.579506772604975e-06, + "loss": 0.98461294, + "memory(GiB)": 302.58, + "step": 242400, + "train_speed(iter/s)": 0.123483 + }, + { + "acc": 0.73571968, + "epoch": 1.355727461981665, + "grad_norm": 6.8125, + "learning_rate": 2.5786976847399325e-06, + "loss": 1.06925516, + "memory(GiB)": 302.58, + "step": 242420, + "train_speed(iter/s)": 0.123488 + }, + { + "acc": 0.75899386, + "epoch": 1.3558393114546443, + "grad_norm": 4.5, + "learning_rate": 2.5778886796911073e-06, + "loss": 0.93122616, + "memory(GiB)": 302.58, + "step": 242440, + "train_speed(iter/s)": 0.123493 + }, + { + "acc": 0.76175251, + "epoch": 1.3559511609276236, + "grad_norm": 7.28125, + "learning_rate": 2.5770797574861707e-06, + "loss": 0.94619684, + "memory(GiB)": 302.58, + "step": 242460, + "train_speed(iter/s)": 0.123498 + }, + { + "acc": 0.75199723, + "epoch": 1.3560630104006028, + "grad_norm": 7.40625, + "learning_rate": 2.5762709181527878e-06, + "loss": 0.97642336, + "memory(GiB)": 302.58, + "step": 242480, + "train_speed(iter/s)": 0.123503 + }, + { + "acc": 0.74420371, + "epoch": 1.356174859873582, + "grad_norm": 5.28125, + "learning_rate": 2.5754621617186253e-06, + "loss": 1.0140605, + "memory(GiB)": 302.58, + "step": 242500, + "train_speed(iter/s)": 0.123507 + }, + { + "acc": 0.76058826, + "epoch": 1.3562867093465614, + "grad_norm": 7.84375, + "learning_rate": 2.5746534882113427e-06, + "loss": 0.93130836, + "memory(GiB)": 302.58, + "step": 242520, + "train_speed(iter/s)": 0.123512 + }, + { + "acc": 0.76382546, + "epoch": 1.3563985588195406, + "grad_norm": 8.3125, + "learning_rate": 2.573844897658604e-06, + "loss": 0.9154767, + "memory(GiB)": 302.58, + "step": 242540, + "train_speed(iter/s)": 0.123517 + }, + { + "acc": 0.74607134, + "epoch": 1.35651040829252, + "grad_norm": 7.90625, + "learning_rate": 2.573036390088063e-06, + "loss": 0.98647099, + "memory(GiB)": 302.58, + "step": 242560, + "train_speed(iter/s)": 0.123522 + }, + { + "acc": 0.75332656, + "epoch": 1.3566222577654992, + "grad_norm": 6.90625, + "learning_rate": 2.5722279655273723e-06, + "loss": 0.95810108, + "memory(GiB)": 302.58, + "step": 242580, + "train_speed(iter/s)": 0.123527 + }, + { + "acc": 0.75500212, + "epoch": 1.3567341072384784, + "grad_norm": 13.6875, + "learning_rate": 2.5714196240041844e-06, + "loss": 0.94476032, + "memory(GiB)": 302.58, + "step": 242600, + "train_speed(iter/s)": 0.123532 + }, + { + "acc": 0.7469944, + "epoch": 1.3568459567114577, + "grad_norm": 7.5, + "learning_rate": 2.5706113655461453e-06, + "loss": 0.98390541, + "memory(GiB)": 302.58, + "step": 242620, + "train_speed(iter/s)": 0.123536 + }, + { + "acc": 0.74200425, + "epoch": 1.356957806184437, + "grad_norm": 7.40625, + "learning_rate": 2.569803190180901e-06, + "loss": 1.01527185, + "memory(GiB)": 302.58, + "step": 242640, + "train_speed(iter/s)": 0.123541 + }, + { + "acc": 0.74947848, + "epoch": 1.3570696556574162, + "grad_norm": 6.09375, + "learning_rate": 2.568995097936094e-06, + "loss": 0.97009382, + "memory(GiB)": 302.58, + "step": 242660, + "train_speed(iter/s)": 0.123546 + }, + { + "acc": 0.74804268, + "epoch": 1.3571815051303955, + "grad_norm": 7.53125, + "learning_rate": 2.5681870888393635e-06, + "loss": 1.02170067, + "memory(GiB)": 302.58, + "step": 242680, + "train_speed(iter/s)": 0.12355 + }, + { + "acc": 0.76028018, + "epoch": 1.3572933546033747, + "grad_norm": 6.0, + "learning_rate": 2.5673791629183444e-06, + "loss": 0.93506451, + "memory(GiB)": 302.58, + "step": 242700, + "train_speed(iter/s)": 0.123555 + }, + { + "acc": 0.73595614, + "epoch": 1.357405204076354, + "grad_norm": 7.15625, + "learning_rate": 2.5665713202006725e-06, + "loss": 1.02716923, + "memory(GiB)": 302.58, + "step": 242720, + "train_speed(iter/s)": 0.12356 + }, + { + "acc": 0.74684033, + "epoch": 1.3575170535493333, + "grad_norm": 5.75, + "learning_rate": 2.5657635607139776e-06, + "loss": 1.00677872, + "memory(GiB)": 302.58, + "step": 242740, + "train_speed(iter/s)": 0.123565 + }, + { + "acc": 0.76166692, + "epoch": 1.3576289030223125, + "grad_norm": 7.8125, + "learning_rate": 2.564955884485887e-06, + "loss": 0.9458931, + "memory(GiB)": 302.58, + "step": 242760, + "train_speed(iter/s)": 0.12357 + }, + { + "acc": 0.75930743, + "epoch": 1.3577407524952918, + "grad_norm": 7.3125, + "learning_rate": 2.5641482915440252e-06, + "loss": 0.94485207, + "memory(GiB)": 302.58, + "step": 242780, + "train_speed(iter/s)": 0.123575 + }, + { + "acc": 0.73990459, + "epoch": 1.357852601968271, + "grad_norm": 4.625, + "learning_rate": 2.563340781916017e-06, + "loss": 1.0295104, + "memory(GiB)": 302.58, + "step": 242800, + "train_speed(iter/s)": 0.123579 + }, + { + "acc": 0.76187186, + "epoch": 1.3579644514412503, + "grad_norm": 7.25, + "learning_rate": 2.5625333556294814e-06, + "loss": 0.93125553, + "memory(GiB)": 302.58, + "step": 242820, + "train_speed(iter/s)": 0.123584 + }, + { + "acc": 0.74297042, + "epoch": 1.3580763009142296, + "grad_norm": 6.625, + "learning_rate": 2.5617260127120336e-06, + "loss": 1.00636158, + "memory(GiB)": 302.58, + "step": 242840, + "train_speed(iter/s)": 0.123589 + }, + { + "acc": 0.75585818, + "epoch": 1.3581881503872089, + "grad_norm": 10.875, + "learning_rate": 2.5609187531912872e-06, + "loss": 0.97132492, + "memory(GiB)": 302.58, + "step": 242860, + "train_speed(iter/s)": 0.123594 + }, + { + "acc": 0.74020524, + "epoch": 1.3582999998601881, + "grad_norm": 5.84375, + "learning_rate": 2.5601115770948544e-06, + "loss": 1.02846174, + "memory(GiB)": 302.58, + "step": 242880, + "train_speed(iter/s)": 0.123599 + }, + { + "acc": 0.74838457, + "epoch": 1.3584118493331674, + "grad_norm": 7.6875, + "learning_rate": 2.5593044844503424e-06, + "loss": 0.97948465, + "memory(GiB)": 302.58, + "step": 242900, + "train_speed(iter/s)": 0.123603 + }, + { + "acc": 0.76722107, + "epoch": 1.3585236988061467, + "grad_norm": 9.375, + "learning_rate": 2.5584974752853552e-06, + "loss": 0.93314066, + "memory(GiB)": 302.58, + "step": 242920, + "train_speed(iter/s)": 0.123608 + }, + { + "acc": 0.75082474, + "epoch": 1.358635548279126, + "grad_norm": 9.625, + "learning_rate": 2.5576905496274973e-06, + "loss": 0.98771935, + "memory(GiB)": 302.58, + "step": 242940, + "train_speed(iter/s)": 0.123613 + }, + { + "acc": 0.73690462, + "epoch": 1.3587473977521052, + "grad_norm": 7.96875, + "learning_rate": 2.556883707504366e-06, + "loss": 1.04246454, + "memory(GiB)": 302.58, + "step": 242960, + "train_speed(iter/s)": 0.123618 + }, + { + "acc": 0.76147037, + "epoch": 1.3588592472250844, + "grad_norm": 6.96875, + "learning_rate": 2.5560769489435593e-06, + "loss": 0.93571405, + "memory(GiB)": 302.58, + "step": 242980, + "train_speed(iter/s)": 0.123623 + }, + { + "acc": 0.73221111, + "epoch": 1.3589710966980637, + "grad_norm": 6.6875, + "learning_rate": 2.5552702739726703e-06, + "loss": 1.05214481, + "memory(GiB)": 302.58, + "step": 243000, + "train_speed(iter/s)": 0.123628 + }, + { + "acc": 0.74991627, + "epoch": 1.359082946171043, + "grad_norm": 8.9375, + "learning_rate": 2.5544636826192904e-06, + "loss": 0.99419842, + "memory(GiB)": 302.58, + "step": 243020, + "train_speed(iter/s)": 0.123633 + }, + { + "acc": 0.75403504, + "epoch": 1.3591947956440222, + "grad_norm": 5.40625, + "learning_rate": 2.553657174911006e-06, + "loss": 0.97451792, + "memory(GiB)": 302.58, + "step": 243040, + "train_speed(iter/s)": 0.123637 + }, + { + "acc": 0.75677271, + "epoch": 1.3593066451170015, + "grad_norm": 6.90625, + "learning_rate": 2.552850750875403e-06, + "loss": 0.96741123, + "memory(GiB)": 302.58, + "step": 243060, + "train_speed(iter/s)": 0.123642 + }, + { + "acc": 0.75436759, + "epoch": 1.3594184945899808, + "grad_norm": 6.1875, + "learning_rate": 2.552044410540065e-06, + "loss": 0.96811495, + "memory(GiB)": 302.58, + "step": 243080, + "train_speed(iter/s)": 0.123647 + }, + { + "acc": 0.75542207, + "epoch": 1.35953034406296, + "grad_norm": 5.09375, + "learning_rate": 2.55123815393257e-06, + "loss": 0.93422585, + "memory(GiB)": 302.58, + "step": 243100, + "train_speed(iter/s)": 0.123652 + }, + { + "acc": 0.75457106, + "epoch": 1.3596421935359393, + "grad_norm": 8.8125, + "learning_rate": 2.5504319810804957e-06, + "loss": 0.95921602, + "memory(GiB)": 302.58, + "step": 243120, + "train_speed(iter/s)": 0.123657 + }, + { + "acc": 0.74649839, + "epoch": 1.3597540430089186, + "grad_norm": 8.5625, + "learning_rate": 2.5496258920114146e-06, + "loss": 0.996702, + "memory(GiB)": 302.58, + "step": 243140, + "train_speed(iter/s)": 0.123662 + }, + { + "acc": 0.76130033, + "epoch": 1.3598658924818978, + "grad_norm": 4.15625, + "learning_rate": 2.548819886752899e-06, + "loss": 0.92555141, + "memory(GiB)": 302.58, + "step": 243160, + "train_speed(iter/s)": 0.123666 + }, + { + "acc": 0.74860802, + "epoch": 1.359977741954877, + "grad_norm": 10.375, + "learning_rate": 2.5480139653325144e-06, + "loss": 0.96760569, + "memory(GiB)": 302.58, + "step": 243180, + "train_speed(iter/s)": 0.123671 + }, + { + "acc": 0.74892216, + "epoch": 1.3600895914278563, + "grad_norm": 6.34375, + "learning_rate": 2.547208127777828e-06, + "loss": 0.98779459, + "memory(GiB)": 302.58, + "step": 243200, + "train_speed(iter/s)": 0.123676 + }, + { + "acc": 0.7585609, + "epoch": 1.3602014409008356, + "grad_norm": 6.65625, + "learning_rate": 2.546402374116401e-06, + "loss": 0.9480135, + "memory(GiB)": 302.58, + "step": 243220, + "train_speed(iter/s)": 0.123681 + }, + { + "acc": 0.74640646, + "epoch": 1.3603132903738149, + "grad_norm": 11.3125, + "learning_rate": 2.5455967043757927e-06, + "loss": 1.00267639, + "memory(GiB)": 302.58, + "step": 243240, + "train_speed(iter/s)": 0.123686 + }, + { + "acc": 0.74517002, + "epoch": 1.3604251398467941, + "grad_norm": 7.28125, + "learning_rate": 2.5447911185835607e-06, + "loss": 1.01214571, + "memory(GiB)": 302.58, + "step": 243260, + "train_speed(iter/s)": 0.123691 + }, + { + "acc": 0.76105814, + "epoch": 1.3605369893197734, + "grad_norm": 7.40625, + "learning_rate": 2.543985616767256e-06, + "loss": 0.93629818, + "memory(GiB)": 302.58, + "step": 243280, + "train_speed(iter/s)": 0.123695 + }, + { + "acc": 0.74875073, + "epoch": 1.3606488387927527, + "grad_norm": 10.0625, + "learning_rate": 2.5431801989544304e-06, + "loss": 0.98832607, + "memory(GiB)": 302.58, + "step": 243300, + "train_speed(iter/s)": 0.1237 + }, + { + "acc": 0.74963632, + "epoch": 1.360760688265732, + "grad_norm": 9.75, + "learning_rate": 2.5423748651726344e-06, + "loss": 1.00060444, + "memory(GiB)": 302.58, + "step": 243320, + "train_speed(iter/s)": 0.123705 + }, + { + "acc": 0.75025535, + "epoch": 1.3608725377387112, + "grad_norm": 10.0625, + "learning_rate": 2.54156961544941e-06, + "loss": 0.9849041, + "memory(GiB)": 302.58, + "step": 243340, + "train_speed(iter/s)": 0.12371 + }, + { + "acc": 0.75475335, + "epoch": 1.3609843872116905, + "grad_norm": 7.84375, + "learning_rate": 2.5407644498123008e-06, + "loss": 0.94408979, + "memory(GiB)": 302.58, + "step": 243360, + "train_speed(iter/s)": 0.123714 + }, + { + "acc": 0.73189301, + "epoch": 1.3610962366846697, + "grad_norm": 8.5, + "learning_rate": 2.5399593682888436e-06, + "loss": 1.07387772, + "memory(GiB)": 302.58, + "step": 243380, + "train_speed(iter/s)": 0.123719 + }, + { + "acc": 0.74288507, + "epoch": 1.361208086157649, + "grad_norm": 9.0625, + "learning_rate": 2.5391543709065786e-06, + "loss": 1.04428396, + "memory(GiB)": 302.58, + "step": 243400, + "train_speed(iter/s)": 0.123723 + }, + { + "acc": 0.74574094, + "epoch": 1.3613199356306283, + "grad_norm": 8.125, + "learning_rate": 2.538349457693037e-06, + "loss": 0.98672323, + "memory(GiB)": 302.58, + "step": 243420, + "train_speed(iter/s)": 0.123728 + }, + { + "acc": 0.75639777, + "epoch": 1.3614317851036075, + "grad_norm": 5.25, + "learning_rate": 2.5375446286757497e-06, + "loss": 0.96304293, + "memory(GiB)": 302.58, + "step": 243440, + "train_speed(iter/s)": 0.123733 + }, + { + "acc": 0.75405636, + "epoch": 1.3615436345765868, + "grad_norm": 7.6875, + "learning_rate": 2.5367398838822443e-06, + "loss": 0.98118019, + "memory(GiB)": 302.58, + "step": 243460, + "train_speed(iter/s)": 0.123738 + }, + { + "acc": 0.75436649, + "epoch": 1.361655484049566, + "grad_norm": 6.4375, + "learning_rate": 2.5359352233400457e-06, + "loss": 0.97289228, + "memory(GiB)": 302.58, + "step": 243480, + "train_speed(iter/s)": 0.123742 + }, + { + "acc": 0.73139882, + "epoch": 1.3617673335225453, + "grad_norm": 8.125, + "learning_rate": 2.5351306470766755e-06, + "loss": 1.06476946, + "memory(GiB)": 302.58, + "step": 243500, + "train_speed(iter/s)": 0.123747 + }, + { + "acc": 0.76548705, + "epoch": 1.3618791829955246, + "grad_norm": 7.375, + "learning_rate": 2.534326155119653e-06, + "loss": 0.91428766, + "memory(GiB)": 302.58, + "step": 243520, + "train_speed(iter/s)": 0.123752 + }, + { + "acc": 0.75377769, + "epoch": 1.3619910324685038, + "grad_norm": 5.5, + "learning_rate": 2.5335217474964945e-06, + "loss": 0.97499828, + "memory(GiB)": 302.58, + "step": 243540, + "train_speed(iter/s)": 0.123757 + }, + { + "acc": 0.76188135, + "epoch": 1.362102881941483, + "grad_norm": 7.25, + "learning_rate": 2.532717424234714e-06, + "loss": 0.93542652, + "memory(GiB)": 302.58, + "step": 243560, + "train_speed(iter/s)": 0.123762 + }, + { + "acc": 0.7402174, + "epoch": 1.3622147314144624, + "grad_norm": 7.5625, + "learning_rate": 2.5319131853618204e-06, + "loss": 1.01504574, + "memory(GiB)": 302.58, + "step": 243580, + "train_speed(iter/s)": 0.123766 + }, + { + "acc": 0.75051079, + "epoch": 1.3623265808874416, + "grad_norm": 8.5, + "learning_rate": 2.531109030905322e-06, + "loss": 0.97238255, + "memory(GiB)": 302.58, + "step": 243600, + "train_speed(iter/s)": 0.123771 + }, + { + "acc": 0.74049654, + "epoch": 1.362438430360421, + "grad_norm": 7.8125, + "learning_rate": 2.530304960892724e-06, + "loss": 1.03306999, + "memory(GiB)": 302.58, + "step": 243620, + "train_speed(iter/s)": 0.123775 + }, + { + "acc": 0.75771461, + "epoch": 1.3625502798334002, + "grad_norm": 9.375, + "learning_rate": 2.5295009753515266e-06, + "loss": 0.96298399, + "memory(GiB)": 302.58, + "step": 243640, + "train_speed(iter/s)": 0.12378 + }, + { + "acc": 0.74410343, + "epoch": 1.3626621293063794, + "grad_norm": 12.5625, + "learning_rate": 2.5286970743092287e-06, + "loss": 1.01045246, + "memory(GiB)": 302.58, + "step": 243660, + "train_speed(iter/s)": 0.123785 + }, + { + "acc": 0.74827404, + "epoch": 1.3627739787793587, + "grad_norm": 8.0625, + "learning_rate": 2.527893257793328e-06, + "loss": 1.01526155, + "memory(GiB)": 302.58, + "step": 243680, + "train_speed(iter/s)": 0.12379 + }, + { + "acc": 0.77255797, + "epoch": 1.362885828252338, + "grad_norm": 9.6875, + "learning_rate": 2.527089525831318e-06, + "loss": 0.88895617, + "memory(GiB)": 302.58, + "step": 243700, + "train_speed(iter/s)": 0.123795 + }, + { + "acc": 0.74779081, + "epoch": 1.3629976777253172, + "grad_norm": 8.1875, + "learning_rate": 2.526285878450686e-06, + "loss": 1.00684299, + "memory(GiB)": 302.58, + "step": 243720, + "train_speed(iter/s)": 0.1238 + }, + { + "acc": 0.75106535, + "epoch": 1.3631095271982965, + "grad_norm": 8.625, + "learning_rate": 2.525482315678922e-06, + "loss": 0.9688426, + "memory(GiB)": 302.58, + "step": 243740, + "train_speed(iter/s)": 0.123804 + }, + { + "acc": 0.75184002, + "epoch": 1.3632213766712757, + "grad_norm": 7.34375, + "learning_rate": 2.524678837543509e-06, + "loss": 0.97686234, + "memory(GiB)": 302.58, + "step": 243760, + "train_speed(iter/s)": 0.123809 + }, + { + "acc": 0.75401239, + "epoch": 1.363333226144255, + "grad_norm": 9.5, + "learning_rate": 2.5238754440719287e-06, + "loss": 0.96082201, + "memory(GiB)": 302.58, + "step": 243780, + "train_speed(iter/s)": 0.123813 + }, + { + "acc": 0.73983479, + "epoch": 1.3634450756172343, + "grad_norm": 10.6875, + "learning_rate": 2.52307213529166e-06, + "loss": 1.02346029, + "memory(GiB)": 302.58, + "step": 243800, + "train_speed(iter/s)": 0.123818 + }, + { + "acc": 0.76088581, + "epoch": 1.3635569250902135, + "grad_norm": 8.4375, + "learning_rate": 2.5222689112301778e-06, + "loss": 0.94461536, + "memory(GiB)": 302.58, + "step": 243820, + "train_speed(iter/s)": 0.123823 + }, + { + "acc": 0.74511137, + "epoch": 1.3636687745631928, + "grad_norm": 6.25, + "learning_rate": 2.521465771914956e-06, + "loss": 1.00125265, + "memory(GiB)": 302.58, + "step": 243840, + "train_speed(iter/s)": 0.123828 + }, + { + "acc": 0.73863368, + "epoch": 1.363780624036172, + "grad_norm": 7.59375, + "learning_rate": 2.5206627173734645e-06, + "loss": 1.00416594, + "memory(GiB)": 302.58, + "step": 243860, + "train_speed(iter/s)": 0.123833 + }, + { + "acc": 0.73818789, + "epoch": 1.3638924735091513, + "grad_norm": 8.0625, + "learning_rate": 2.5198597476331686e-06, + "loss": 1.02901783, + "memory(GiB)": 302.58, + "step": 243880, + "train_speed(iter/s)": 0.123838 + }, + { + "acc": 0.73638191, + "epoch": 1.3640043229821306, + "grad_norm": 6.59375, + "learning_rate": 2.519056862721535e-06, + "loss": 1.03196783, + "memory(GiB)": 302.58, + "step": 243900, + "train_speed(iter/s)": 0.123842 + }, + { + "acc": 0.75104599, + "epoch": 1.3641161724551099, + "grad_norm": 7.4375, + "learning_rate": 2.5182540626660225e-06, + "loss": 0.99571171, + "memory(GiB)": 302.58, + "step": 243920, + "train_speed(iter/s)": 0.123847 + }, + { + "acc": 0.74194155, + "epoch": 1.3642280219280891, + "grad_norm": 6.34375, + "learning_rate": 2.5174513474940894e-06, + "loss": 1.03687057, + "memory(GiB)": 302.58, + "step": 243940, + "train_speed(iter/s)": 0.123852 + }, + { + "acc": 0.73748717, + "epoch": 1.3643398714010684, + "grad_norm": 7.375, + "learning_rate": 2.5166487172331934e-06, + "loss": 1.03133221, + "memory(GiB)": 302.58, + "step": 243960, + "train_speed(iter/s)": 0.123857 + }, + { + "acc": 0.7453917, + "epoch": 1.3644517208740476, + "grad_norm": 5.8125, + "learning_rate": 2.515846171910786e-06, + "loss": 1.00011463, + "memory(GiB)": 302.58, + "step": 243980, + "train_speed(iter/s)": 0.123861 + }, + { + "acc": 0.74523191, + "epoch": 1.364563570347027, + "grad_norm": 7.53125, + "learning_rate": 2.515043711554317e-06, + "loss": 0.98373365, + "memory(GiB)": 302.58, + "step": 244000, + "train_speed(iter/s)": 0.123866 + }, + { + "epoch": 1.364563570347027, + "eval_acc": 0.7068329979672963, + "eval_loss": 1.0120537281036377, + "eval_runtime": 7529.4269, + "eval_samples_per_second": 9.999, + "eval_steps_per_second": 9.999, + "step": 244000 + }, + { + "acc": 0.74321413, + "epoch": 1.3646754198200062, + "grad_norm": 6.375, + "learning_rate": 2.514241336191232e-06, + "loss": 0.99637642, + "memory(GiB)": 302.58, + "step": 244020, + "train_speed(iter/s)": 0.123391 + }, + { + "acc": 0.76713614, + "epoch": 1.3647872692929854, + "grad_norm": 7.71875, + "learning_rate": 2.5134390458489756e-06, + "loss": 0.91879206, + "memory(GiB)": 302.58, + "step": 244040, + "train_speed(iter/s)": 0.123396 + }, + { + "acc": 0.75161424, + "epoch": 1.3648991187659647, + "grad_norm": 8.625, + "learning_rate": 2.5126368405549884e-06, + "loss": 1.00166616, + "memory(GiB)": 302.58, + "step": 244060, + "train_speed(iter/s)": 0.123401 + }, + { + "acc": 0.74794583, + "epoch": 1.365010968238944, + "grad_norm": 9.8125, + "learning_rate": 2.5118347203367087e-06, + "loss": 0.99075098, + "memory(GiB)": 302.58, + "step": 244080, + "train_speed(iter/s)": 0.123406 + }, + { + "acc": 0.75189257, + "epoch": 1.3651228177119232, + "grad_norm": 5.71875, + "learning_rate": 2.511032685221571e-06, + "loss": 0.9655694, + "memory(GiB)": 302.58, + "step": 244100, + "train_speed(iter/s)": 0.123411 + }, + { + "acc": 0.73034906, + "epoch": 1.3652346671849025, + "grad_norm": 7.15625, + "learning_rate": 2.510230735237008e-06, + "loss": 1.0831727, + "memory(GiB)": 302.58, + "step": 244120, + "train_speed(iter/s)": 0.123415 + }, + { + "acc": 0.75374546, + "epoch": 1.3653465166578818, + "grad_norm": 5.4375, + "learning_rate": 2.5094288704104484e-06, + "loss": 0.96860447, + "memory(GiB)": 302.58, + "step": 244140, + "train_speed(iter/s)": 0.12342 + }, + { + "acc": 0.74571757, + "epoch": 1.365458366130861, + "grad_norm": 5.34375, + "learning_rate": 2.50862709076932e-06, + "loss": 0.99546432, + "memory(GiB)": 302.58, + "step": 244160, + "train_speed(iter/s)": 0.123425 + }, + { + "acc": 0.75746293, + "epoch": 1.3655702156038403, + "grad_norm": 10.125, + "learning_rate": 2.5078253963410436e-06, + "loss": 0.95227308, + "memory(GiB)": 302.58, + "step": 244180, + "train_speed(iter/s)": 0.12343 + }, + { + "acc": 0.75921278, + "epoch": 1.3656820650768196, + "grad_norm": 7.34375, + "learning_rate": 2.50702378715304e-06, + "loss": 0.93168392, + "memory(GiB)": 302.58, + "step": 244200, + "train_speed(iter/s)": 0.123435 + }, + { + "acc": 0.73743854, + "epoch": 1.3657939145497988, + "grad_norm": 6.40625, + "learning_rate": 2.5062222632327293e-06, + "loss": 1.04645481, + "memory(GiB)": 302.58, + "step": 244220, + "train_speed(iter/s)": 0.123439 + }, + { + "acc": 0.76877255, + "epoch": 1.365905764022778, + "grad_norm": 10.6875, + "learning_rate": 2.505420824607525e-06, + "loss": 0.8989501, + "memory(GiB)": 302.58, + "step": 244240, + "train_speed(iter/s)": 0.123444 + }, + { + "acc": 0.74192152, + "epoch": 1.3660176134957573, + "grad_norm": 7.375, + "learning_rate": 2.504619471304839e-06, + "loss": 1.00893488, + "memory(GiB)": 302.58, + "step": 244260, + "train_speed(iter/s)": 0.123449 + }, + { + "acc": 0.73985052, + "epoch": 1.3661294629687366, + "grad_norm": 16.75, + "learning_rate": 2.503818203352079e-06, + "loss": 1.06580801, + "memory(GiB)": 302.58, + "step": 244280, + "train_speed(iter/s)": 0.123454 + }, + { + "acc": 0.73684173, + "epoch": 1.3662413124417159, + "grad_norm": 4.75, + "learning_rate": 2.503017020776652e-06, + "loss": 1.05021172, + "memory(GiB)": 302.58, + "step": 244300, + "train_speed(iter/s)": 0.123459 + }, + { + "acc": 0.77146354, + "epoch": 1.3663531619146951, + "grad_norm": 7.625, + "learning_rate": 2.502215923605961e-06, + "loss": 0.89013939, + "memory(GiB)": 302.58, + "step": 244320, + "train_speed(iter/s)": 0.123463 + }, + { + "acc": 0.74961786, + "epoch": 1.3664650113876744, + "grad_norm": 7.4375, + "learning_rate": 2.501414911867405e-06, + "loss": 0.98110275, + "memory(GiB)": 302.58, + "step": 244340, + "train_speed(iter/s)": 0.123468 + }, + { + "acc": 0.74462314, + "epoch": 1.3665768608606537, + "grad_norm": 9.9375, + "learning_rate": 2.5006139855883813e-06, + "loss": 1.03184652, + "memory(GiB)": 302.58, + "step": 244360, + "train_speed(iter/s)": 0.123473 + }, + { + "acc": 0.75786524, + "epoch": 1.366688710333633, + "grad_norm": 5.875, + "learning_rate": 2.499813144796286e-06, + "loss": 0.95440054, + "memory(GiB)": 302.58, + "step": 244380, + "train_speed(iter/s)": 0.123478 + }, + { + "acc": 0.74372201, + "epoch": 1.3668005598066122, + "grad_norm": 7.9375, + "learning_rate": 2.499012389518508e-06, + "loss": 1.00587711, + "memory(GiB)": 302.58, + "step": 244400, + "train_speed(iter/s)": 0.123482 + }, + { + "acc": 0.75682116, + "epoch": 1.3669124092795915, + "grad_norm": 7.125, + "learning_rate": 2.4982117197824373e-06, + "loss": 0.94285345, + "memory(GiB)": 302.58, + "step": 244420, + "train_speed(iter/s)": 0.123487 + }, + { + "acc": 0.74600644, + "epoch": 1.3670242587525707, + "grad_norm": 6.625, + "learning_rate": 2.4974111356154586e-06, + "loss": 1.01493225, + "memory(GiB)": 302.58, + "step": 244440, + "train_speed(iter/s)": 0.123492 + }, + { + "acc": 0.75083561, + "epoch": 1.36713610822555, + "grad_norm": 6.625, + "learning_rate": 2.4966106370449545e-06, + "loss": 1.01248674, + "memory(GiB)": 302.58, + "step": 244460, + "train_speed(iter/s)": 0.123497 + }, + { + "acc": 0.76681509, + "epoch": 1.3672479576985292, + "grad_norm": 7.40625, + "learning_rate": 2.495810224098303e-06, + "loss": 0.91937857, + "memory(GiB)": 302.58, + "step": 244480, + "train_speed(iter/s)": 0.123502 + }, + { + "acc": 0.76564074, + "epoch": 1.3673598071715085, + "grad_norm": 5.59375, + "learning_rate": 2.4950098968028836e-06, + "loss": 0.91345329, + "memory(GiB)": 302.58, + "step": 244500, + "train_speed(iter/s)": 0.123506 + }, + { + "acc": 0.74986553, + "epoch": 1.3674716566444878, + "grad_norm": 4.9375, + "learning_rate": 2.4942096551860696e-06, + "loss": 0.97504835, + "memory(GiB)": 302.58, + "step": 244520, + "train_speed(iter/s)": 0.123511 + }, + { + "acc": 0.73725467, + "epoch": 1.367583506117467, + "grad_norm": 6.71875, + "learning_rate": 2.4934094992752306e-06, + "loss": 1.03453913, + "memory(GiB)": 302.58, + "step": 244540, + "train_speed(iter/s)": 0.123516 + }, + { + "acc": 0.74279952, + "epoch": 1.3676953555904463, + "grad_norm": 6.0, + "learning_rate": 2.492609429097733e-06, + "loss": 1.00763216, + "memory(GiB)": 302.58, + "step": 244560, + "train_speed(iter/s)": 0.12352 + }, + { + "acc": 0.75916295, + "epoch": 1.3678072050634256, + "grad_norm": 8.0625, + "learning_rate": 2.4918094446809454e-06, + "loss": 0.95088558, + "memory(GiB)": 302.58, + "step": 244580, + "train_speed(iter/s)": 0.123525 + }, + { + "acc": 0.75022974, + "epoch": 1.3679190545364048, + "grad_norm": 6.625, + "learning_rate": 2.491009546052229e-06, + "loss": 0.97850771, + "memory(GiB)": 302.58, + "step": 244600, + "train_speed(iter/s)": 0.12353 + }, + { + "acc": 0.74527049, + "epoch": 1.368030904009384, + "grad_norm": 7.21875, + "learning_rate": 2.4902097332389407e-06, + "loss": 1.00057802, + "memory(GiB)": 302.58, + "step": 244620, + "train_speed(iter/s)": 0.123534 + }, + { + "acc": 0.75017242, + "epoch": 1.3681427534823634, + "grad_norm": 5.09375, + "learning_rate": 2.4894100062684386e-06, + "loss": 0.98141203, + "memory(GiB)": 302.58, + "step": 244640, + "train_speed(iter/s)": 0.123539 + }, + { + "acc": 0.75023031, + "epoch": 1.3682546029553426, + "grad_norm": 7.09375, + "learning_rate": 2.488610365168075e-06, + "loss": 0.96737909, + "memory(GiB)": 302.58, + "step": 244660, + "train_speed(iter/s)": 0.123543 + }, + { + "acc": 0.76921196, + "epoch": 1.3683664524283219, + "grad_norm": 7.34375, + "learning_rate": 2.4878108099652e-06, + "loss": 0.8749403, + "memory(GiB)": 302.58, + "step": 244680, + "train_speed(iter/s)": 0.123548 + }, + { + "acc": 0.74723983, + "epoch": 1.3684783019013012, + "grad_norm": 5.96875, + "learning_rate": 2.4870113406871615e-06, + "loss": 0.98199406, + "memory(GiB)": 302.58, + "step": 244700, + "train_speed(iter/s)": 0.123553 + }, + { + "acc": 0.74194894, + "epoch": 1.3685901513742804, + "grad_norm": 6.625, + "learning_rate": 2.4862119573613037e-06, + "loss": 1.01323557, + "memory(GiB)": 302.58, + "step": 244720, + "train_speed(iter/s)": 0.123558 + }, + { + "acc": 0.74994469, + "epoch": 1.3687020008472597, + "grad_norm": 7.125, + "learning_rate": 2.485412660014968e-06, + "loss": 0.98754845, + "memory(GiB)": 302.58, + "step": 244740, + "train_speed(iter/s)": 0.123563 + }, + { + "acc": 0.7341043, + "epoch": 1.368813850320239, + "grad_norm": 6.15625, + "learning_rate": 2.4846134486754927e-06, + "loss": 1.04175701, + "memory(GiB)": 302.58, + "step": 244760, + "train_speed(iter/s)": 0.123568 + }, + { + "acc": 0.76173482, + "epoch": 1.3689256997932182, + "grad_norm": 8.4375, + "learning_rate": 2.4838143233702133e-06, + "loss": 0.92365894, + "memory(GiB)": 302.58, + "step": 244780, + "train_speed(iter/s)": 0.123573 + }, + { + "acc": 0.74583344, + "epoch": 1.3690375492661975, + "grad_norm": 9.625, + "learning_rate": 2.483015284126463e-06, + "loss": 0.98935881, + "memory(GiB)": 302.58, + "step": 244800, + "train_speed(iter/s)": 0.123578 + }, + { + "acc": 0.75550747, + "epoch": 1.3691493987391767, + "grad_norm": 8.4375, + "learning_rate": 2.482216330971569e-06, + "loss": 0.9536684, + "memory(GiB)": 302.58, + "step": 244820, + "train_speed(iter/s)": 0.123582 + }, + { + "acc": 0.75072622, + "epoch": 1.369261248212156, + "grad_norm": 6.1875, + "learning_rate": 2.4814174639328624e-06, + "loss": 0.99446392, + "memory(GiB)": 302.58, + "step": 244840, + "train_speed(iter/s)": 0.123587 + }, + { + "acc": 0.76217799, + "epoch": 1.3693730976851353, + "grad_norm": 9.1875, + "learning_rate": 2.4806186830376643e-06, + "loss": 0.92372169, + "memory(GiB)": 302.58, + "step": 244860, + "train_speed(iter/s)": 0.123592 + }, + { + "acc": 0.77261577, + "epoch": 1.3694849471581145, + "grad_norm": 9.25, + "learning_rate": 2.4798199883132964e-06, + "loss": 0.89549894, + "memory(GiB)": 302.58, + "step": 244880, + "train_speed(iter/s)": 0.123597 + }, + { + "acc": 0.74094601, + "epoch": 1.3695967966310938, + "grad_norm": 4.9375, + "learning_rate": 2.479021379787076e-06, + "loss": 1.01816177, + "memory(GiB)": 302.58, + "step": 244900, + "train_speed(iter/s)": 0.123602 + }, + { + "acc": 0.73946095, + "epoch": 1.369708646104073, + "grad_norm": 5.4375, + "learning_rate": 2.478222857486318e-06, + "loss": 1.04529285, + "memory(GiB)": 302.58, + "step": 244920, + "train_speed(iter/s)": 0.123606 + }, + { + "acc": 0.75964146, + "epoch": 1.3698204955770523, + "grad_norm": 7.625, + "learning_rate": 2.4774244214383346e-06, + "loss": 0.95311441, + "memory(GiB)": 302.58, + "step": 244940, + "train_speed(iter/s)": 0.123611 + }, + { + "acc": 0.74742517, + "epoch": 1.3699323450500316, + "grad_norm": 10.5625, + "learning_rate": 2.4766260716704354e-06, + "loss": 0.99082003, + "memory(GiB)": 302.58, + "step": 244960, + "train_speed(iter/s)": 0.123615 + }, + { + "acc": 0.73383727, + "epoch": 1.3700441945230108, + "grad_norm": 5.6875, + "learning_rate": 2.475827808209926e-06, + "loss": 1.04977169, + "memory(GiB)": 302.58, + "step": 244980, + "train_speed(iter/s)": 0.12362 + }, + { + "acc": 0.75998373, + "epoch": 1.3701560439959901, + "grad_norm": 6.65625, + "learning_rate": 2.4750296310841087e-06, + "loss": 0.93570328, + "memory(GiB)": 302.58, + "step": 245000, + "train_speed(iter/s)": 0.123625 + }, + { + "acc": 0.74683371, + "epoch": 1.3702678934689694, + "grad_norm": 8.625, + "learning_rate": 2.4742315403202848e-06, + "loss": 1.00769758, + "memory(GiB)": 302.58, + "step": 245020, + "train_speed(iter/s)": 0.123629 + }, + { + "acc": 0.74885178, + "epoch": 1.3703797429419486, + "grad_norm": 7.34375, + "learning_rate": 2.4734335359457518e-06, + "loss": 1.00250568, + "memory(GiB)": 302.58, + "step": 245040, + "train_speed(iter/s)": 0.123634 + }, + { + "acc": 0.76241884, + "epoch": 1.3704915924149281, + "grad_norm": 7.90625, + "learning_rate": 2.4726356179878024e-06, + "loss": 0.95449104, + "memory(GiB)": 302.58, + "step": 245060, + "train_speed(iter/s)": 0.123639 + }, + { + "acc": 0.72928777, + "epoch": 1.3706034418879072, + "grad_norm": 6.34375, + "learning_rate": 2.4718377864737276e-06, + "loss": 1.06030483, + "memory(GiB)": 302.58, + "step": 245080, + "train_speed(iter/s)": 0.123644 + }, + { + "acc": 0.75166593, + "epoch": 1.3707152913608867, + "grad_norm": 5.25, + "learning_rate": 2.4710400414308188e-06, + "loss": 0.97188196, + "memory(GiB)": 302.58, + "step": 245100, + "train_speed(iter/s)": 0.123649 + }, + { + "acc": 0.75186901, + "epoch": 1.3708271408338657, + "grad_norm": 6.1875, + "learning_rate": 2.47024238288636e-06, + "loss": 0.97687588, + "memory(GiB)": 302.58, + "step": 245120, + "train_speed(iter/s)": 0.123653 + }, + { + "acc": 0.75454011, + "epoch": 1.3709389903068452, + "grad_norm": 5.375, + "learning_rate": 2.4694448108676334e-06, + "loss": 0.9787775, + "memory(GiB)": 302.58, + "step": 245140, + "train_speed(iter/s)": 0.123657 + }, + { + "acc": 0.77256274, + "epoch": 1.3710508397798242, + "grad_norm": 7.4375, + "learning_rate": 2.4686473254019184e-06, + "loss": 0.87647734, + "memory(GiB)": 302.58, + "step": 245160, + "train_speed(iter/s)": 0.123662 + }, + { + "acc": 0.76300902, + "epoch": 1.3711626892528037, + "grad_norm": 7.1875, + "learning_rate": 2.4678499265164914e-06, + "loss": 0.91832275, + "memory(GiB)": 302.58, + "step": 245180, + "train_speed(iter/s)": 0.123667 + }, + { + "acc": 0.73577971, + "epoch": 1.3712745387257828, + "grad_norm": 7.78125, + "learning_rate": 2.467052614238626e-06, + "loss": 1.03388519, + "memory(GiB)": 302.58, + "step": 245200, + "train_speed(iter/s)": 0.123672 + }, + { + "acc": 0.75828619, + "epoch": 1.3713863881987622, + "grad_norm": 9.3125, + "learning_rate": 2.4662553885955933e-06, + "loss": 0.93328505, + "memory(GiB)": 302.58, + "step": 245220, + "train_speed(iter/s)": 0.123677 + }, + { + "acc": 0.74721193, + "epoch": 1.3714982376717413, + "grad_norm": 8.0625, + "learning_rate": 2.46545824961466e-06, + "loss": 1.00179176, + "memory(GiB)": 302.58, + "step": 245240, + "train_speed(iter/s)": 0.123681 + }, + { + "acc": 0.74818654, + "epoch": 1.3716100871447208, + "grad_norm": 9.375, + "learning_rate": 2.4646611973230914e-06, + "loss": 1.0016818, + "memory(GiB)": 302.58, + "step": 245260, + "train_speed(iter/s)": 0.123686 + }, + { + "acc": 0.72348824, + "epoch": 1.3717219366176998, + "grad_norm": 8.0625, + "learning_rate": 2.4638642317481497e-06, + "loss": 1.11713352, + "memory(GiB)": 302.58, + "step": 245280, + "train_speed(iter/s)": 0.123691 + }, + { + "acc": 0.74725685, + "epoch": 1.3718337860906793, + "grad_norm": 10.0, + "learning_rate": 2.4630673529170935e-06, + "loss": 1.00156288, + "memory(GiB)": 302.58, + "step": 245300, + "train_speed(iter/s)": 0.123695 + }, + { + "acc": 0.75055084, + "epoch": 1.3719456355636583, + "grad_norm": 9.875, + "learning_rate": 2.4622705608571775e-06, + "loss": 0.97986021, + "memory(GiB)": 302.58, + "step": 245320, + "train_speed(iter/s)": 0.1237 + }, + { + "acc": 0.74873781, + "epoch": 1.3720574850366378, + "grad_norm": 7.75, + "learning_rate": 2.4614738555956554e-06, + "loss": 0.99721842, + "memory(GiB)": 302.58, + "step": 245340, + "train_speed(iter/s)": 0.123705 + }, + { + "acc": 0.74368229, + "epoch": 1.3721693345096169, + "grad_norm": 5.96875, + "learning_rate": 2.460677237159775e-06, + "loss": 1.02116013, + "memory(GiB)": 302.58, + "step": 245360, + "train_speed(iter/s)": 0.12371 + }, + { + "acc": 0.74761748, + "epoch": 1.3722811839825964, + "grad_norm": 7.96875, + "learning_rate": 2.4598807055767872e-06, + "loss": 0.97605782, + "memory(GiB)": 302.58, + "step": 245380, + "train_speed(iter/s)": 0.123714 + }, + { + "acc": 0.72891231, + "epoch": 1.3723930334555754, + "grad_norm": 7.28125, + "learning_rate": 2.4590842608739334e-06, + "loss": 1.0809433, + "memory(GiB)": 302.58, + "step": 245400, + "train_speed(iter/s)": 0.123719 + }, + { + "acc": 0.7788146, + "epoch": 1.3725048829285549, + "grad_norm": 6.6875, + "learning_rate": 2.4582879030784553e-06, + "loss": 0.85270462, + "memory(GiB)": 302.58, + "step": 245420, + "train_speed(iter/s)": 0.123724 + }, + { + "acc": 0.76284981, + "epoch": 1.372616732401534, + "grad_norm": 7.96875, + "learning_rate": 2.45749163221759e-06, + "loss": 0.9122179, + "memory(GiB)": 302.58, + "step": 245440, + "train_speed(iter/s)": 0.123729 + }, + { + "acc": 0.76484537, + "epoch": 1.3727285818745134, + "grad_norm": 7.90625, + "learning_rate": 2.4566954483185735e-06, + "loss": 0.91357737, + "memory(GiB)": 302.58, + "step": 245460, + "train_speed(iter/s)": 0.123734 + }, + { + "acc": 0.75487194, + "epoch": 1.3728404313474925, + "grad_norm": 9.375, + "learning_rate": 2.4558993514086366e-06, + "loss": 0.96427507, + "memory(GiB)": 302.58, + "step": 245480, + "train_speed(iter/s)": 0.123738 + }, + { + "acc": 0.76689539, + "epoch": 1.372952280820472, + "grad_norm": 7.71875, + "learning_rate": 2.4551033415150093e-06, + "loss": 0.90831985, + "memory(GiB)": 302.58, + "step": 245500, + "train_speed(iter/s)": 0.123743 + }, + { + "acc": 0.75307069, + "epoch": 1.373064130293451, + "grad_norm": 8.75, + "learning_rate": 2.4543074186649174e-06, + "loss": 0.95549192, + "memory(GiB)": 302.58, + "step": 245520, + "train_speed(iter/s)": 0.123748 + }, + { + "acc": 0.76112909, + "epoch": 1.3731759797664305, + "grad_norm": 4.875, + "learning_rate": 2.453511582885584e-06, + "loss": 0.93532028, + "memory(GiB)": 302.58, + "step": 245540, + "train_speed(iter/s)": 0.123753 + }, + { + "acc": 0.74581676, + "epoch": 1.3732878292394095, + "grad_norm": 7.4375, + "learning_rate": 2.4527158342042297e-06, + "loss": 1.01467161, + "memory(GiB)": 302.58, + "step": 245560, + "train_speed(iter/s)": 0.123758 + }, + { + "acc": 0.73005786, + "epoch": 1.373399678712389, + "grad_norm": 6.625, + "learning_rate": 2.45192017264807e-06, + "loss": 1.06039438, + "memory(GiB)": 302.58, + "step": 245580, + "train_speed(iter/s)": 0.123763 + }, + { + "acc": 0.73921809, + "epoch": 1.373511528185368, + "grad_norm": 9.25, + "learning_rate": 2.451124598244321e-06, + "loss": 1.01959877, + "memory(GiB)": 302.58, + "step": 245600, + "train_speed(iter/s)": 0.123767 + }, + { + "acc": 0.74788542, + "epoch": 1.3736233776583475, + "grad_norm": 10.4375, + "learning_rate": 2.4503291110201927e-06, + "loss": 1.0081233, + "memory(GiB)": 302.58, + "step": 245620, + "train_speed(iter/s)": 0.123772 + }, + { + "acc": 0.7536006, + "epoch": 1.3737352271313266, + "grad_norm": 6.75, + "learning_rate": 2.4495337110028918e-06, + "loss": 0.94450426, + "memory(GiB)": 302.58, + "step": 245640, + "train_speed(iter/s)": 0.123777 + }, + { + "acc": 0.78090801, + "epoch": 1.373847076604306, + "grad_norm": 7.21875, + "learning_rate": 2.448738398219627e-06, + "loss": 0.85043964, + "memory(GiB)": 302.58, + "step": 245660, + "train_speed(iter/s)": 0.123782 + }, + { + "acc": 0.74801049, + "epoch": 1.373958926077285, + "grad_norm": 9.75, + "learning_rate": 2.4479431726975988e-06, + "loss": 0.98269444, + "memory(GiB)": 302.58, + "step": 245680, + "train_speed(iter/s)": 0.123786 + }, + { + "acc": 0.74667363, + "epoch": 1.3740707755502646, + "grad_norm": 8.5, + "learning_rate": 2.4471480344640066e-06, + "loss": 1.02331753, + "memory(GiB)": 302.58, + "step": 245700, + "train_speed(iter/s)": 0.123791 + }, + { + "acc": 0.75344143, + "epoch": 1.3741826250232436, + "grad_norm": 7.03125, + "learning_rate": 2.446352983546046e-06, + "loss": 0.95556984, + "memory(GiB)": 302.58, + "step": 245720, + "train_speed(iter/s)": 0.123796 + }, + { + "acc": 0.75613127, + "epoch": 1.374294474496223, + "grad_norm": 7.96875, + "learning_rate": 2.4455580199709113e-06, + "loss": 0.9616086, + "memory(GiB)": 302.58, + "step": 245740, + "train_speed(iter/s)": 0.123801 + }, + { + "acc": 0.73474412, + "epoch": 1.3744063239692021, + "grad_norm": 8.0625, + "learning_rate": 2.444763143765792e-06, + "loss": 1.05515642, + "memory(GiB)": 302.58, + "step": 245760, + "train_speed(iter/s)": 0.123805 + }, + { + "acc": 0.75020542, + "epoch": 1.3745181734421816, + "grad_norm": 7.34375, + "learning_rate": 2.4439683549578757e-06, + "loss": 1.00047483, + "memory(GiB)": 302.58, + "step": 245780, + "train_speed(iter/s)": 0.12381 + }, + { + "acc": 0.73552022, + "epoch": 1.3746300229151607, + "grad_norm": 10.0625, + "learning_rate": 2.443173653574345e-06, + "loss": 1.04855585, + "memory(GiB)": 302.58, + "step": 245800, + "train_speed(iter/s)": 0.123815 + }, + { + "acc": 0.74862218, + "epoch": 1.3747418723881402, + "grad_norm": 8.3125, + "learning_rate": 2.4423790396423847e-06, + "loss": 0.96821947, + "memory(GiB)": 302.58, + "step": 245820, + "train_speed(iter/s)": 0.12382 + }, + { + "acc": 0.74876037, + "epoch": 1.3748537218611192, + "grad_norm": 5.90625, + "learning_rate": 2.441584513189171e-06, + "loss": 0.99645452, + "memory(GiB)": 302.58, + "step": 245840, + "train_speed(iter/s)": 0.123824 + }, + { + "acc": 0.75685768, + "epoch": 1.3749655713340987, + "grad_norm": 6.8125, + "learning_rate": 2.4407900742418793e-06, + "loss": 0.96239214, + "memory(GiB)": 302.58, + "step": 245860, + "train_speed(iter/s)": 0.123829 + }, + { + "acc": 0.75878477, + "epoch": 1.3750774208070777, + "grad_norm": 6.0625, + "learning_rate": 2.4399957228276824e-06, + "loss": 0.92781057, + "memory(GiB)": 302.58, + "step": 245880, + "train_speed(iter/s)": 0.123833 + }, + { + "acc": 0.76784978, + "epoch": 1.3751892702800572, + "grad_norm": 8.125, + "learning_rate": 2.439201458973749e-06, + "loss": 0.91949272, + "memory(GiB)": 302.58, + "step": 245900, + "train_speed(iter/s)": 0.123838 + }, + { + "acc": 0.75335026, + "epoch": 1.3753011197530363, + "grad_norm": 8.0625, + "learning_rate": 2.438407282707246e-06, + "loss": 0.97233973, + "memory(GiB)": 302.58, + "step": 245920, + "train_speed(iter/s)": 0.123842 + }, + { + "acc": 0.75764084, + "epoch": 1.3754129692260157, + "grad_norm": 5.15625, + "learning_rate": 2.4376131940553366e-06, + "loss": 0.94047995, + "memory(GiB)": 302.58, + "step": 245940, + "train_speed(iter/s)": 0.123847 + }, + { + "acc": 0.7374608, + "epoch": 1.3755248186989948, + "grad_norm": 8.5, + "learning_rate": 2.436819193045179e-06, + "loss": 1.04506807, + "memory(GiB)": 302.58, + "step": 245960, + "train_speed(iter/s)": 0.123852 + }, + { + "acc": 0.74688282, + "epoch": 1.3756366681719743, + "grad_norm": 6.46875, + "learning_rate": 2.4360252797039347e-06, + "loss": 1.00190134, + "memory(GiB)": 302.58, + "step": 245980, + "train_speed(iter/s)": 0.123857 + }, + { + "acc": 0.75940971, + "epoch": 1.3757485176449533, + "grad_norm": 8.4375, + "learning_rate": 2.4352314540587557e-06, + "loss": 0.9315485, + "memory(GiB)": 302.58, + "step": 246000, + "train_speed(iter/s)": 0.123862 + }, + { + "epoch": 1.3757485176449533, + "eval_acc": 0.7068467024738804, + "eval_loss": 1.0121593475341797, + "eval_runtime": 7575.1154, + "eval_samples_per_second": 9.938, + "eval_steps_per_second": 9.938, + "step": 246000 + }, + { + "acc": 0.73441162, + "epoch": 1.3758603671179328, + "grad_norm": 5.8125, + "learning_rate": 2.4344377161367937e-06, + "loss": 1.05637188, + "memory(GiB)": 302.58, + "step": 246020, + "train_speed(iter/s)": 0.123388 + }, + { + "acc": 0.75721741, + "epoch": 1.3759722165909118, + "grad_norm": 5.40625, + "learning_rate": 2.4336440659651974e-06, + "loss": 0.9469367, + "memory(GiB)": 302.58, + "step": 246040, + "train_speed(iter/s)": 0.123393 + }, + { + "acc": 0.75275068, + "epoch": 1.3760840660638913, + "grad_norm": 6.0625, + "learning_rate": 2.4328505035711113e-06, + "loss": 0.98356361, + "memory(GiB)": 302.58, + "step": 246060, + "train_speed(iter/s)": 0.123397 + }, + { + "acc": 0.77216387, + "epoch": 1.3761959155368704, + "grad_norm": 6.34375, + "learning_rate": 2.432057028981678e-06, + "loss": 0.8927392, + "memory(GiB)": 302.58, + "step": 246080, + "train_speed(iter/s)": 0.123402 + }, + { + "acc": 0.75866971, + "epoch": 1.3763077650098499, + "grad_norm": 6.125, + "learning_rate": 2.4312636422240376e-06, + "loss": 0.93328133, + "memory(GiB)": 302.58, + "step": 246100, + "train_speed(iter/s)": 0.123407 + }, + { + "acc": 0.74343152, + "epoch": 1.376419614482829, + "grad_norm": 7.875, + "learning_rate": 2.430470343325326e-06, + "loss": 1.00147324, + "memory(GiB)": 302.58, + "step": 246120, + "train_speed(iter/s)": 0.123411 + }, + { + "acc": 0.75495424, + "epoch": 1.3765314639558084, + "grad_norm": 8.6875, + "learning_rate": 2.4296771323126757e-06, + "loss": 0.94789391, + "memory(GiB)": 302.58, + "step": 246140, + "train_speed(iter/s)": 0.123416 + }, + { + "acc": 0.7359086, + "epoch": 1.3766433134287874, + "grad_norm": 4.65625, + "learning_rate": 2.428884009213218e-06, + "loss": 1.04269104, + "memory(GiB)": 302.58, + "step": 246160, + "train_speed(iter/s)": 0.123421 + }, + { + "acc": 0.75835986, + "epoch": 1.376755162901767, + "grad_norm": 8.125, + "learning_rate": 2.4280909740540803e-06, + "loss": 0.96953754, + "memory(GiB)": 302.58, + "step": 246180, + "train_speed(iter/s)": 0.123425 + }, + { + "acc": 0.76515288, + "epoch": 1.376867012374746, + "grad_norm": 7.75, + "learning_rate": 2.4272980268623866e-06, + "loss": 0.95119209, + "memory(GiB)": 302.58, + "step": 246200, + "train_speed(iter/s)": 0.12343 + }, + { + "acc": 0.74322162, + "epoch": 1.3769788618477254, + "grad_norm": 6.875, + "learning_rate": 2.4265051676652578e-06, + "loss": 1.01475382, + "memory(GiB)": 302.58, + "step": 246220, + "train_speed(iter/s)": 0.123435 + }, + { + "acc": 0.7369544, + "epoch": 1.3770907113207045, + "grad_norm": 8.6875, + "learning_rate": 2.4257123964898103e-06, + "loss": 1.03558884, + "memory(GiB)": 302.58, + "step": 246240, + "train_speed(iter/s)": 0.12344 + }, + { + "acc": 0.73964438, + "epoch": 1.377202560793684, + "grad_norm": 5.46875, + "learning_rate": 2.4249197133631646e-06, + "loss": 1.02800417, + "memory(GiB)": 302.58, + "step": 246260, + "train_speed(iter/s)": 0.123445 + }, + { + "acc": 0.7566534, + "epoch": 1.377314410266663, + "grad_norm": 6.875, + "learning_rate": 2.42412711831243e-06, + "loss": 0.96277285, + "memory(GiB)": 302.58, + "step": 246280, + "train_speed(iter/s)": 0.123449 + }, + { + "acc": 0.75475779, + "epoch": 1.3774262597396425, + "grad_norm": 8.125, + "learning_rate": 2.423334611364715e-06, + "loss": 0.96523333, + "memory(GiB)": 302.58, + "step": 246300, + "train_speed(iter/s)": 0.123454 + }, + { + "acc": 0.73172097, + "epoch": 1.3775381092126215, + "grad_norm": 7.40625, + "learning_rate": 2.4225421925471272e-06, + "loss": 1.05533409, + "memory(GiB)": 302.58, + "step": 246320, + "train_speed(iter/s)": 0.123459 + }, + { + "acc": 0.75423126, + "epoch": 1.377649958685601, + "grad_norm": 4.75, + "learning_rate": 2.421749861886769e-06, + "loss": 0.95164919, + "memory(GiB)": 302.58, + "step": 246340, + "train_speed(iter/s)": 0.123463 + }, + { + "acc": 0.76273608, + "epoch": 1.37776180815858, + "grad_norm": 6.21875, + "learning_rate": 2.4209576194107414e-06, + "loss": 0.92879553, + "memory(GiB)": 302.58, + "step": 246360, + "train_speed(iter/s)": 0.123468 + }, + { + "acc": 0.76556115, + "epoch": 1.3778736576315596, + "grad_norm": 10.0, + "learning_rate": 2.420165465146141e-06, + "loss": 0.91789484, + "memory(GiB)": 302.58, + "step": 246380, + "train_speed(iter/s)": 0.123473 + }, + { + "acc": 0.75344968, + "epoch": 1.3779855071045386, + "grad_norm": 8.75, + "learning_rate": 2.4193733991200617e-06, + "loss": 0.96685658, + "memory(GiB)": 302.58, + "step": 246400, + "train_speed(iter/s)": 0.123478 + }, + { + "acc": 0.73929062, + "epoch": 1.378097356577518, + "grad_norm": 7.5, + "learning_rate": 2.4185814213595954e-06, + "loss": 1.03270531, + "memory(GiB)": 302.58, + "step": 246420, + "train_speed(iter/s)": 0.123483 + }, + { + "acc": 0.75288329, + "epoch": 1.3782092060504971, + "grad_norm": 8.375, + "learning_rate": 2.4177895318918298e-06, + "loss": 0.96996498, + "memory(GiB)": 302.58, + "step": 246440, + "train_speed(iter/s)": 0.123487 + }, + { + "acc": 0.74999056, + "epoch": 1.3783210555234766, + "grad_norm": 5.875, + "learning_rate": 2.4169977307438507e-06, + "loss": 0.97397308, + "memory(GiB)": 302.58, + "step": 246460, + "train_speed(iter/s)": 0.123492 + }, + { + "acc": 0.75843339, + "epoch": 1.3784329049964557, + "grad_norm": 7.90625, + "learning_rate": 2.416206017942739e-06, + "loss": 0.94244537, + "memory(GiB)": 302.58, + "step": 246480, + "train_speed(iter/s)": 0.123497 + }, + { + "acc": 0.76693487, + "epoch": 1.3785447544694351, + "grad_norm": 10.8125, + "learning_rate": 2.4154143935155748e-06, + "loss": 0.90258112, + "memory(GiB)": 302.58, + "step": 246500, + "train_speed(iter/s)": 0.123502 + }, + { + "acc": 0.74423041, + "epoch": 1.3786566039424142, + "grad_norm": 4.96875, + "learning_rate": 2.414622857489432e-06, + "loss": 0.991049, + "memory(GiB)": 302.58, + "step": 246520, + "train_speed(iter/s)": 0.123506 + }, + { + "acc": 0.75661964, + "epoch": 1.3787684534153937, + "grad_norm": 5.5, + "learning_rate": 2.4138314098913874e-06, + "loss": 0.98819551, + "memory(GiB)": 302.58, + "step": 246540, + "train_speed(iter/s)": 0.12351 + }, + { + "acc": 0.74435334, + "epoch": 1.3788803028883727, + "grad_norm": 7.0, + "learning_rate": 2.4130400507485093e-06, + "loss": 1.02427711, + "memory(GiB)": 302.58, + "step": 246560, + "train_speed(iter/s)": 0.123515 + }, + { + "acc": 0.74272985, + "epoch": 1.3789921523613522, + "grad_norm": 8.625, + "learning_rate": 2.4122487800878643e-06, + "loss": 1.01574039, + "memory(GiB)": 302.58, + "step": 246580, + "train_speed(iter/s)": 0.12352 + }, + { + "acc": 0.74318099, + "epoch": 1.3791040018343312, + "grad_norm": 5.25, + "learning_rate": 2.4114575979365164e-06, + "loss": 1.01057844, + "memory(GiB)": 302.58, + "step": 246600, + "train_speed(iter/s)": 0.123524 + }, + { + "acc": 0.74655595, + "epoch": 1.3792158513073107, + "grad_norm": 6.03125, + "learning_rate": 2.4106665043215266e-06, + "loss": 0.98657246, + "memory(GiB)": 302.58, + "step": 246620, + "train_speed(iter/s)": 0.123529 + }, + { + "acc": 0.75908284, + "epoch": 1.3793277007802898, + "grad_norm": 6.90625, + "learning_rate": 2.409875499269953e-06, + "loss": 0.93899717, + "memory(GiB)": 302.58, + "step": 246640, + "train_speed(iter/s)": 0.123534 + }, + { + "acc": 0.7527277, + "epoch": 1.3794395502532693, + "grad_norm": 7.0, + "learning_rate": 2.4090845828088505e-06, + "loss": 0.96982498, + "memory(GiB)": 302.58, + "step": 246660, + "train_speed(iter/s)": 0.123539 + }, + { + "acc": 0.75083413, + "epoch": 1.3795513997262483, + "grad_norm": 9.3125, + "learning_rate": 2.4082937549652706e-06, + "loss": 0.98051777, + "memory(GiB)": 302.58, + "step": 246680, + "train_speed(iter/s)": 0.123543 + }, + { + "acc": 0.74475055, + "epoch": 1.3796632491992278, + "grad_norm": 7.03125, + "learning_rate": 2.4075030157662627e-06, + "loss": 1.00915174, + "memory(GiB)": 302.58, + "step": 246700, + "train_speed(iter/s)": 0.123548 + }, + { + "acc": 0.73143926, + "epoch": 1.3797750986722068, + "grad_norm": 8.1875, + "learning_rate": 2.406712365238872e-06, + "loss": 1.06251612, + "memory(GiB)": 302.58, + "step": 246720, + "train_speed(iter/s)": 0.123552 + }, + { + "acc": 0.75315051, + "epoch": 1.3798869481451863, + "grad_norm": 8.6875, + "learning_rate": 2.4059218034101415e-06, + "loss": 0.95312052, + "memory(GiB)": 302.58, + "step": 246740, + "train_speed(iter/s)": 0.123558 + }, + { + "acc": 0.75825725, + "epoch": 1.3799987976181654, + "grad_norm": 8.4375, + "learning_rate": 2.4051313303071113e-06, + "loss": 0.93374205, + "memory(GiB)": 302.58, + "step": 246760, + "train_speed(iter/s)": 0.123562 + }, + { + "acc": 0.75036025, + "epoch": 1.3801106470911448, + "grad_norm": 8.5, + "learning_rate": 2.404340945956816e-06, + "loss": 0.98985157, + "memory(GiB)": 302.58, + "step": 246780, + "train_speed(iter/s)": 0.123567 + }, + { + "acc": 0.76053195, + "epoch": 1.3802224965641239, + "grad_norm": 7.46875, + "learning_rate": 2.403550650386292e-06, + "loss": 0.93160715, + "memory(GiB)": 302.58, + "step": 246800, + "train_speed(iter/s)": 0.123571 + }, + { + "acc": 0.75051446, + "epoch": 1.3803343460371034, + "grad_norm": 8.5, + "learning_rate": 2.4027604436225695e-06, + "loss": 0.96709747, + "memory(GiB)": 302.58, + "step": 246820, + "train_speed(iter/s)": 0.123576 + }, + { + "acc": 0.75186534, + "epoch": 1.3804461955100824, + "grad_norm": 7.96875, + "learning_rate": 2.4019703256926746e-06, + "loss": 0.95918274, + "memory(GiB)": 302.58, + "step": 246840, + "train_speed(iter/s)": 0.123581 + }, + { + "acc": 0.74550467, + "epoch": 1.380558044983062, + "grad_norm": 9.5, + "learning_rate": 2.4011802966236336e-06, + "loss": 1.01466074, + "memory(GiB)": 302.58, + "step": 246860, + "train_speed(iter/s)": 0.123586 + }, + { + "acc": 0.73073297, + "epoch": 1.380669894456041, + "grad_norm": 4.40625, + "learning_rate": 2.400390356442467e-06, + "loss": 1.06420031, + "memory(GiB)": 302.58, + "step": 246880, + "train_speed(iter/s)": 0.12359 + }, + { + "acc": 0.75107398, + "epoch": 1.3807817439290204, + "grad_norm": 7.21875, + "learning_rate": 2.399600505176193e-06, + "loss": 0.98911152, + "memory(GiB)": 302.58, + "step": 246900, + "train_speed(iter/s)": 0.123595 + }, + { + "acc": 0.74916754, + "epoch": 1.3808935934019995, + "grad_norm": 6.71875, + "learning_rate": 2.3988107428518276e-06, + "loss": 0.9908843, + "memory(GiB)": 302.58, + "step": 246920, + "train_speed(iter/s)": 0.1236 + }, + { + "acc": 0.74638915, + "epoch": 1.381005442874979, + "grad_norm": 8.5, + "learning_rate": 2.3980210694963827e-06, + "loss": 1.04252062, + "memory(GiB)": 302.58, + "step": 246940, + "train_speed(iter/s)": 0.123604 + }, + { + "acc": 0.75552115, + "epoch": 1.381117292347958, + "grad_norm": 6.125, + "learning_rate": 2.3972314851368677e-06, + "loss": 0.97408905, + "memory(GiB)": 302.58, + "step": 246960, + "train_speed(iter/s)": 0.123609 + }, + { + "acc": 0.7465528, + "epoch": 1.3812291418209375, + "grad_norm": 7.34375, + "learning_rate": 2.39644198980029e-06, + "loss": 1.00333061, + "memory(GiB)": 302.58, + "step": 246980, + "train_speed(iter/s)": 0.123614 + }, + { + "acc": 0.74255605, + "epoch": 1.3813409912939165, + "grad_norm": 9.125, + "learning_rate": 2.395652583513651e-06, + "loss": 1.02552404, + "memory(GiB)": 302.58, + "step": 247000, + "train_speed(iter/s)": 0.123619 + }, + { + "acc": 0.75091705, + "epoch": 1.381452840766896, + "grad_norm": 7.90625, + "learning_rate": 2.3948632663039524e-06, + "loss": 0.97078161, + "memory(GiB)": 302.58, + "step": 247020, + "train_speed(iter/s)": 0.123623 + }, + { + "acc": 0.72695651, + "epoch": 1.381564690239875, + "grad_norm": 7.0625, + "learning_rate": 2.39407403819819e-06, + "loss": 1.08666792, + "memory(GiB)": 302.58, + "step": 247040, + "train_speed(iter/s)": 0.123628 + }, + { + "acc": 0.74215355, + "epoch": 1.3816765397128545, + "grad_norm": 8.6875, + "learning_rate": 2.3932848992233575e-06, + "loss": 1.02430677, + "memory(GiB)": 302.58, + "step": 247060, + "train_speed(iter/s)": 0.123632 + }, + { + "acc": 0.7478137, + "epoch": 1.3817883891858336, + "grad_norm": 7.0625, + "learning_rate": 2.392495849406449e-06, + "loss": 0.98239994, + "memory(GiB)": 302.58, + "step": 247080, + "train_speed(iter/s)": 0.123637 + }, + { + "acc": 0.76117454, + "epoch": 1.381900238658813, + "grad_norm": 5.28125, + "learning_rate": 2.3917068887744497e-06, + "loss": 0.93660831, + "memory(GiB)": 302.58, + "step": 247100, + "train_speed(iter/s)": 0.123641 + }, + { + "acc": 0.754389, + "epoch": 1.382012088131792, + "grad_norm": 9.6875, + "learning_rate": 2.3909180173543447e-06, + "loss": 0.9633563, + "memory(GiB)": 302.58, + "step": 247120, + "train_speed(iter/s)": 0.123646 + }, + { + "acc": 0.76220555, + "epoch": 1.3821239376047716, + "grad_norm": 7.6875, + "learning_rate": 2.390129235173118e-06, + "loss": 0.94041672, + "memory(GiB)": 302.58, + "step": 247140, + "train_speed(iter/s)": 0.12365 + }, + { + "acc": 0.7592308, + "epoch": 1.3822357870777506, + "grad_norm": 8.9375, + "learning_rate": 2.389340542257747e-06, + "loss": 0.91857538, + "memory(GiB)": 302.58, + "step": 247160, + "train_speed(iter/s)": 0.123655 + }, + { + "acc": 0.7515512, + "epoch": 1.3823476365507301, + "grad_norm": 5.5625, + "learning_rate": 2.388551938635208e-06, + "loss": 1.00519905, + "memory(GiB)": 302.58, + "step": 247180, + "train_speed(iter/s)": 0.12366 + }, + { + "acc": 0.76546159, + "epoch": 1.3824594860237092, + "grad_norm": 7.9375, + "learning_rate": 2.3877634243324733e-06, + "loss": 0.91537933, + "memory(GiB)": 302.58, + "step": 247200, + "train_speed(iter/s)": 0.123664 + }, + { + "acc": 0.75621552, + "epoch": 1.3825713354966886, + "grad_norm": 6.96875, + "learning_rate": 2.3869749993765127e-06, + "loss": 0.96953058, + "memory(GiB)": 302.58, + "step": 247220, + "train_speed(iter/s)": 0.123669 + }, + { + "acc": 0.7646162, + "epoch": 1.3826831849696677, + "grad_norm": 8.75, + "learning_rate": 2.3861866637942926e-06, + "loss": 0.92857418, + "memory(GiB)": 302.58, + "step": 247240, + "train_speed(iter/s)": 0.123674 + }, + { + "acc": 0.75637164, + "epoch": 1.3827950344426472, + "grad_norm": 8.4375, + "learning_rate": 2.385398417612777e-06, + "loss": 0.96554155, + "memory(GiB)": 302.58, + "step": 247260, + "train_speed(iter/s)": 0.123679 + }, + { + "acc": 0.7597383, + "epoch": 1.3829068839156262, + "grad_norm": 5.40625, + "learning_rate": 2.384610260858926e-06, + "loss": 0.93036718, + "memory(GiB)": 302.58, + "step": 247280, + "train_speed(iter/s)": 0.123684 + }, + { + "acc": 0.75108514, + "epoch": 1.3830187333886057, + "grad_norm": 11.125, + "learning_rate": 2.3838221935596972e-06, + "loss": 0.96210489, + "memory(GiB)": 302.58, + "step": 247300, + "train_speed(iter/s)": 0.123689 + }, + { + "acc": 0.75159655, + "epoch": 1.3831305828615847, + "grad_norm": 5.96875, + "learning_rate": 2.3830342157420453e-06, + "loss": 0.98054895, + "memory(GiB)": 302.58, + "step": 247320, + "train_speed(iter/s)": 0.123693 + }, + { + "acc": 0.75755386, + "epoch": 1.3832424323345642, + "grad_norm": 4.875, + "learning_rate": 2.3822463274329218e-06, + "loss": 0.96666279, + "memory(GiB)": 302.58, + "step": 247340, + "train_speed(iter/s)": 0.123698 + }, + { + "acc": 0.75263147, + "epoch": 1.3833542818075433, + "grad_norm": 6.0625, + "learning_rate": 2.381458528659274e-06, + "loss": 0.96026268, + "memory(GiB)": 302.58, + "step": 247360, + "train_speed(iter/s)": 0.123702 + }, + { + "acc": 0.75461507, + "epoch": 1.3834661312805228, + "grad_norm": 4.875, + "learning_rate": 2.3806708194480453e-06, + "loss": 0.96097307, + "memory(GiB)": 302.58, + "step": 247380, + "train_speed(iter/s)": 0.123707 + }, + { + "acc": 0.74444427, + "epoch": 1.3835779807535018, + "grad_norm": 6.15625, + "learning_rate": 2.379883199826183e-06, + "loss": 0.99221964, + "memory(GiB)": 302.58, + "step": 247400, + "train_speed(iter/s)": 0.123711 + }, + { + "acc": 0.74660969, + "epoch": 1.3836898302264813, + "grad_norm": 6.59375, + "learning_rate": 2.379095669820623e-06, + "loss": 0.9886097, + "memory(GiB)": 302.58, + "step": 247420, + "train_speed(iter/s)": 0.123716 + }, + { + "acc": 0.73146958, + "epoch": 1.3838016796994603, + "grad_norm": 5.6875, + "learning_rate": 2.3783082294583023e-06, + "loss": 1.06156969, + "memory(GiB)": 302.58, + "step": 247440, + "train_speed(iter/s)": 0.123721 + }, + { + "acc": 0.75316482, + "epoch": 1.3839135291724398, + "grad_norm": 8.875, + "learning_rate": 2.3775208787661526e-06, + "loss": 0.94414539, + "memory(GiB)": 302.58, + "step": 247460, + "train_speed(iter/s)": 0.123725 + }, + { + "acc": 0.73600349, + "epoch": 1.3840253786454189, + "grad_norm": 6.09375, + "learning_rate": 2.376733617771105e-06, + "loss": 1.05904694, + "memory(GiB)": 302.58, + "step": 247480, + "train_speed(iter/s)": 0.12373 + }, + { + "acc": 0.75282969, + "epoch": 1.3841372281183983, + "grad_norm": 9.5, + "learning_rate": 2.3759464465000854e-06, + "loss": 0.96826792, + "memory(GiB)": 302.58, + "step": 247500, + "train_speed(iter/s)": 0.123735 + }, + { + "acc": 0.75023799, + "epoch": 1.3842490775913774, + "grad_norm": 7.5, + "learning_rate": 2.375159364980018e-06, + "loss": 0.98319254, + "memory(GiB)": 302.58, + "step": 247520, + "train_speed(iter/s)": 0.123739 + }, + { + "acc": 0.71914997, + "epoch": 1.3843609270643569, + "grad_norm": 8.1875, + "learning_rate": 2.3743723732378237e-06, + "loss": 1.11972828, + "memory(GiB)": 302.58, + "step": 247540, + "train_speed(iter/s)": 0.123744 + }, + { + "acc": 0.75094824, + "epoch": 1.3844727765373361, + "grad_norm": 6.875, + "learning_rate": 2.37358547130042e-06, + "loss": 0.99641104, + "memory(GiB)": 302.58, + "step": 247560, + "train_speed(iter/s)": 0.123748 + }, + { + "acc": 0.76192136, + "epoch": 1.3845846260103154, + "grad_norm": 8.1875, + "learning_rate": 2.372798659194721e-06, + "loss": 0.92510834, + "memory(GiB)": 302.58, + "step": 247580, + "train_speed(iter/s)": 0.123753 + }, + { + "acc": 0.73737922, + "epoch": 1.3846964754832947, + "grad_norm": 9.1875, + "learning_rate": 2.3720119369476393e-06, + "loss": 1.03926382, + "memory(GiB)": 302.58, + "step": 247600, + "train_speed(iter/s)": 0.123758 + }, + { + "acc": 0.75275245, + "epoch": 1.384808324956274, + "grad_norm": 6.34375, + "learning_rate": 2.3712253045860817e-06, + "loss": 0.95921984, + "memory(GiB)": 302.58, + "step": 247620, + "train_speed(iter/s)": 0.123763 + }, + { + "acc": 0.75740848, + "epoch": 1.3849201744292532, + "grad_norm": 7.1875, + "learning_rate": 2.3704387621369542e-06, + "loss": 0.94433727, + "memory(GiB)": 302.58, + "step": 247640, + "train_speed(iter/s)": 0.123768 + }, + { + "acc": 0.74174113, + "epoch": 1.3850320239022325, + "grad_norm": 5.96875, + "learning_rate": 2.3696523096271572e-06, + "loss": 1.02465153, + "memory(GiB)": 302.58, + "step": 247660, + "train_speed(iter/s)": 0.123773 + }, + { + "acc": 0.76802144, + "epoch": 1.3851438733752117, + "grad_norm": 5.625, + "learning_rate": 2.3688659470835947e-06, + "loss": 0.91473198, + "memory(GiB)": 302.58, + "step": 247680, + "train_speed(iter/s)": 0.123778 + }, + { + "acc": 0.74142208, + "epoch": 1.385255722848191, + "grad_norm": 7.21875, + "learning_rate": 2.3680796745331585e-06, + "loss": 1.01946049, + "memory(GiB)": 302.58, + "step": 247700, + "train_speed(iter/s)": 0.123782 + }, + { + "acc": 0.76088338, + "epoch": 1.3853675723211702, + "grad_norm": 5.1875, + "learning_rate": 2.3672934920027443e-06, + "loss": 0.93452606, + "memory(GiB)": 302.58, + "step": 247720, + "train_speed(iter/s)": 0.123787 + }, + { + "acc": 0.75601516, + "epoch": 1.3854794217941495, + "grad_norm": 6.46875, + "learning_rate": 2.36650739951924e-06, + "loss": 0.96365213, + "memory(GiB)": 302.58, + "step": 247740, + "train_speed(iter/s)": 0.123792 + }, + { + "acc": 0.74395847, + "epoch": 1.3855912712671288, + "grad_norm": 11.8125, + "learning_rate": 2.3657213971095333e-06, + "loss": 0.99691782, + "memory(GiB)": 302.58, + "step": 247760, + "train_speed(iter/s)": 0.123797 + }, + { + "acc": 0.74320912, + "epoch": 1.385703120740108, + "grad_norm": 8.4375, + "learning_rate": 2.364935484800508e-06, + "loss": 1.02504978, + "memory(GiB)": 302.58, + "step": 247780, + "train_speed(iter/s)": 0.123801 + }, + { + "acc": 0.74538164, + "epoch": 1.3858149702130873, + "grad_norm": 7.78125, + "learning_rate": 2.3641496626190446e-06, + "loss": 0.98690386, + "memory(GiB)": 302.58, + "step": 247800, + "train_speed(iter/s)": 0.123806 + }, + { + "acc": 0.74507256, + "epoch": 1.3859268196860666, + "grad_norm": 7.4375, + "learning_rate": 2.363363930592021e-06, + "loss": 0.99561405, + "memory(GiB)": 302.58, + "step": 247820, + "train_speed(iter/s)": 0.123811 + }, + { + "acc": 0.75091906, + "epoch": 1.3860386691590458, + "grad_norm": 7.15625, + "learning_rate": 2.362578288746311e-06, + "loss": 0.95552559, + "memory(GiB)": 302.58, + "step": 247840, + "train_speed(iter/s)": 0.123816 + }, + { + "acc": 0.74147186, + "epoch": 1.386150518632025, + "grad_norm": 7.03125, + "learning_rate": 2.361792737108787e-06, + "loss": 1.00478039, + "memory(GiB)": 302.58, + "step": 247860, + "train_speed(iter/s)": 0.12382 + }, + { + "acc": 0.74470997, + "epoch": 1.3862623681050044, + "grad_norm": 8.8125, + "learning_rate": 2.3610072757063172e-06, + "loss": 1.00354748, + "memory(GiB)": 302.58, + "step": 247880, + "train_speed(iter/s)": 0.123825 + }, + { + "acc": 0.75324345, + "epoch": 1.3863742175779836, + "grad_norm": 10.6875, + "learning_rate": 2.3602219045657665e-06, + "loss": 0.97180233, + "memory(GiB)": 302.58, + "step": 247900, + "train_speed(iter/s)": 0.12383 + }, + { + "acc": 0.73487492, + "epoch": 1.386486067050963, + "grad_norm": 5.875, + "learning_rate": 2.3594366237139965e-06, + "loss": 1.06194296, + "memory(GiB)": 302.58, + "step": 247920, + "train_speed(iter/s)": 0.123834 + }, + { + "acc": 0.75492945, + "epoch": 1.3865979165239422, + "grad_norm": 7.125, + "learning_rate": 2.3586514331778658e-06, + "loss": 0.94802761, + "memory(GiB)": 302.58, + "step": 247940, + "train_speed(iter/s)": 0.123838 + }, + { + "acc": 0.74818172, + "epoch": 1.3867097659969214, + "grad_norm": 7.125, + "learning_rate": 2.3578663329842334e-06, + "loss": 0.98766136, + "memory(GiB)": 302.58, + "step": 247960, + "train_speed(iter/s)": 0.123843 + }, + { + "acc": 0.74584365, + "epoch": 1.3868216154699007, + "grad_norm": 6.65625, + "learning_rate": 2.35708132315995e-06, + "loss": 0.98762503, + "memory(GiB)": 302.58, + "step": 247980, + "train_speed(iter/s)": 0.123848 + }, + { + "acc": 0.72850976, + "epoch": 1.38693346494288, + "grad_norm": 6.28125, + "learning_rate": 2.3562964037318657e-06, + "loss": 1.07090006, + "memory(GiB)": 302.58, + "step": 248000, + "train_speed(iter/s)": 0.123852 + }, + { + "epoch": 1.38693346494288, + "eval_acc": 0.7068409347498863, + "eval_loss": 1.012199878692627, + "eval_runtime": 7540.1628, + "eval_samples_per_second": 9.984, + "eval_steps_per_second": 9.984, + "step": 248000 + }, + { + "acc": 0.74988194, + "epoch": 1.3870453144158592, + "grad_norm": 5.3125, + "learning_rate": 2.3555115747268275e-06, + "loss": 0.98552465, + "memory(GiB)": 302.58, + "step": 248020, + "train_speed(iter/s)": 0.123384 + }, + { + "acc": 0.7398366, + "epoch": 1.3871571638888385, + "grad_norm": 8.625, + "learning_rate": 2.354726836171679e-06, + "loss": 1.04026337, + "memory(GiB)": 302.58, + "step": 248040, + "train_speed(iter/s)": 0.123389 + }, + { + "acc": 0.74854345, + "epoch": 1.3872690133618177, + "grad_norm": 8.25, + "learning_rate": 2.353942188093261e-06, + "loss": 1.00514097, + "memory(GiB)": 302.58, + "step": 248060, + "train_speed(iter/s)": 0.123393 + }, + { + "acc": 0.74686794, + "epoch": 1.387380862834797, + "grad_norm": 7.90625, + "learning_rate": 2.35315763051841e-06, + "loss": 1.00012636, + "memory(GiB)": 302.58, + "step": 248080, + "train_speed(iter/s)": 0.123398 + }, + { + "acc": 0.74863238, + "epoch": 1.3874927123077763, + "grad_norm": 7.1875, + "learning_rate": 2.352373163473962e-06, + "loss": 1.01006374, + "memory(GiB)": 302.58, + "step": 248100, + "train_speed(iter/s)": 0.123402 + }, + { + "acc": 0.75756092, + "epoch": 1.3876045617807555, + "grad_norm": 9.9375, + "learning_rate": 2.3515887869867458e-06, + "loss": 0.96571159, + "memory(GiB)": 302.58, + "step": 248120, + "train_speed(iter/s)": 0.123407 + }, + { + "acc": 0.74812732, + "epoch": 1.3877164112537348, + "grad_norm": 6.90625, + "learning_rate": 2.3508045010835917e-06, + "loss": 0.99834566, + "memory(GiB)": 302.58, + "step": 248140, + "train_speed(iter/s)": 0.123412 + }, + { + "acc": 0.74907675, + "epoch": 1.387828260726714, + "grad_norm": 6.875, + "learning_rate": 2.350020305791324e-06, + "loss": 0.9691659, + "memory(GiB)": 302.58, + "step": 248160, + "train_speed(iter/s)": 0.123416 + }, + { + "acc": 0.73422575, + "epoch": 1.3879401101996933, + "grad_norm": 4.625, + "learning_rate": 2.3492362011367643e-06, + "loss": 1.05338182, + "memory(GiB)": 302.58, + "step": 248180, + "train_speed(iter/s)": 0.123421 + }, + { + "acc": 0.74790421, + "epoch": 1.3880519596726726, + "grad_norm": 6.375, + "learning_rate": 2.34845218714673e-06, + "loss": 0.98739262, + "memory(GiB)": 302.58, + "step": 248200, + "train_speed(iter/s)": 0.123426 + }, + { + "acc": 0.75623364, + "epoch": 1.3881638091456519, + "grad_norm": 8.8125, + "learning_rate": 2.3476682638480415e-06, + "loss": 0.95526209, + "memory(GiB)": 302.58, + "step": 248220, + "train_speed(iter/s)": 0.123431 + }, + { + "acc": 0.7474606, + "epoch": 1.3882756586186311, + "grad_norm": 7.46875, + "learning_rate": 2.3468844312675077e-06, + "loss": 0.99760189, + "memory(GiB)": 302.58, + "step": 248240, + "train_speed(iter/s)": 0.123435 + }, + { + "acc": 0.74707675, + "epoch": 1.3883875080916104, + "grad_norm": 8.5625, + "learning_rate": 2.34610068943194e-06, + "loss": 1.01017103, + "memory(GiB)": 302.58, + "step": 248260, + "train_speed(iter/s)": 0.123441 + }, + { + "acc": 0.74400754, + "epoch": 1.3884993575645896, + "grad_norm": 7.15625, + "learning_rate": 2.3453170383681434e-06, + "loss": 0.99396458, + "memory(GiB)": 302.58, + "step": 248280, + "train_speed(iter/s)": 0.123445 + }, + { + "acc": 0.74664145, + "epoch": 1.388611207037569, + "grad_norm": 10.0, + "learning_rate": 2.3445334781029216e-06, + "loss": 1.01849432, + "memory(GiB)": 302.58, + "step": 248300, + "train_speed(iter/s)": 0.12345 + }, + { + "acc": 0.75889192, + "epoch": 1.3887230565105482, + "grad_norm": 7.59375, + "learning_rate": 2.3437500086630737e-06, + "loss": 0.93891964, + "memory(GiB)": 302.58, + "step": 248320, + "train_speed(iter/s)": 0.123455 + }, + { + "acc": 0.75178914, + "epoch": 1.3888349059835274, + "grad_norm": 6.5, + "learning_rate": 2.3429666300754005e-06, + "loss": 0.97555733, + "memory(GiB)": 302.58, + "step": 248340, + "train_speed(iter/s)": 0.123459 + }, + { + "acc": 0.74534388, + "epoch": 1.3889467554565067, + "grad_norm": 7.78125, + "learning_rate": 2.3421833423666934e-06, + "loss": 1.00457497, + "memory(GiB)": 302.58, + "step": 248360, + "train_speed(iter/s)": 0.123464 + }, + { + "acc": 0.7488071, + "epoch": 1.389058604929486, + "grad_norm": 7.09375, + "learning_rate": 2.3414001455637438e-06, + "loss": 1.00127382, + "memory(GiB)": 302.58, + "step": 248380, + "train_speed(iter/s)": 0.123468 + }, + { + "acc": 0.75160041, + "epoch": 1.3891704544024652, + "grad_norm": 7.8125, + "learning_rate": 2.3406170396933386e-06, + "loss": 0.97157373, + "memory(GiB)": 302.58, + "step": 248400, + "train_speed(iter/s)": 0.123473 + }, + { + "acc": 0.74745121, + "epoch": 1.3892823038754445, + "grad_norm": 6.59375, + "learning_rate": 2.3398340247822644e-06, + "loss": 0.98151836, + "memory(GiB)": 302.58, + "step": 248420, + "train_speed(iter/s)": 0.123478 + }, + { + "acc": 0.7402739, + "epoch": 1.3893941533484238, + "grad_norm": 9.0, + "learning_rate": 2.3390511008573014e-06, + "loss": 1.02888899, + "memory(GiB)": 302.58, + "step": 248440, + "train_speed(iter/s)": 0.123483 + }, + { + "acc": 0.75378284, + "epoch": 1.389506002821403, + "grad_norm": 5.90625, + "learning_rate": 2.3382682679452277e-06, + "loss": 0.93084335, + "memory(GiB)": 302.58, + "step": 248460, + "train_speed(iter/s)": 0.123487 + }, + { + "acc": 0.75803065, + "epoch": 1.3896178522943823, + "grad_norm": 7.65625, + "learning_rate": 2.33748552607282e-06, + "loss": 0.93552408, + "memory(GiB)": 302.58, + "step": 248480, + "train_speed(iter/s)": 0.123492 + }, + { + "acc": 0.75793929, + "epoch": 1.3897297017673615, + "grad_norm": 7.96875, + "learning_rate": 2.33670287526685e-06, + "loss": 0.95860672, + "memory(GiB)": 302.58, + "step": 248500, + "train_speed(iter/s)": 0.123497 + }, + { + "acc": 0.75039935, + "epoch": 1.3898415512403408, + "grad_norm": 6.28125, + "learning_rate": 2.3359203155540867e-06, + "loss": 0.98932858, + "memory(GiB)": 302.58, + "step": 248520, + "train_speed(iter/s)": 0.123502 + }, + { + "acc": 0.76787038, + "epoch": 1.38995340071332, + "grad_norm": 7.90625, + "learning_rate": 2.335137846961294e-06, + "loss": 0.91417923, + "memory(GiB)": 302.58, + "step": 248540, + "train_speed(iter/s)": 0.123507 + }, + { + "acc": 0.75996685, + "epoch": 1.3900652501862993, + "grad_norm": 7.59375, + "learning_rate": 2.3343554695152393e-06, + "loss": 0.92544203, + "memory(GiB)": 302.58, + "step": 248560, + "train_speed(iter/s)": 0.123511 + }, + { + "acc": 0.75096221, + "epoch": 1.3901770996592786, + "grad_norm": 8.875, + "learning_rate": 2.3335731832426794e-06, + "loss": 0.97551632, + "memory(GiB)": 302.58, + "step": 248580, + "train_speed(iter/s)": 0.123516 + }, + { + "acc": 0.73585944, + "epoch": 1.3902889491322579, + "grad_norm": 6.875, + "learning_rate": 2.3327909881703725e-06, + "loss": 1.06031475, + "memory(GiB)": 302.58, + "step": 248600, + "train_speed(iter/s)": 0.123521 + }, + { + "acc": 0.74685116, + "epoch": 1.3904007986052371, + "grad_norm": 7.1875, + "learning_rate": 2.332008884325071e-06, + "loss": 0.99278822, + "memory(GiB)": 302.58, + "step": 248620, + "train_speed(iter/s)": 0.123525 + }, + { + "acc": 0.75489073, + "epoch": 1.3905126480782164, + "grad_norm": 10.0, + "learning_rate": 2.331226871733525e-06, + "loss": 0.9756958, + "memory(GiB)": 302.58, + "step": 248640, + "train_speed(iter/s)": 0.12353 + }, + { + "acc": 0.7474566, + "epoch": 1.3906244975511957, + "grad_norm": 7.21875, + "learning_rate": 2.330444950422483e-06, + "loss": 1.0075757, + "memory(GiB)": 302.58, + "step": 248660, + "train_speed(iter/s)": 0.123535 + }, + { + "acc": 0.75138073, + "epoch": 1.390736347024175, + "grad_norm": 10.0625, + "learning_rate": 2.3296631204186888e-06, + "loss": 0.96850014, + "memory(GiB)": 302.58, + "step": 248680, + "train_speed(iter/s)": 0.12354 + }, + { + "acc": 0.74108925, + "epoch": 1.3908481964971542, + "grad_norm": 5.5625, + "learning_rate": 2.328881381748883e-06, + "loss": 1.01829033, + "memory(GiB)": 302.58, + "step": 248700, + "train_speed(iter/s)": 0.123544 + }, + { + "acc": 0.74162321, + "epoch": 1.3909600459701335, + "grad_norm": 6.375, + "learning_rate": 2.328099734439804e-06, + "loss": 1.01194286, + "memory(GiB)": 302.58, + "step": 248720, + "train_speed(iter/s)": 0.123549 + }, + { + "acc": 0.74710984, + "epoch": 1.3910718954431127, + "grad_norm": 4.90625, + "learning_rate": 2.3273181785181858e-06, + "loss": 1.00819216, + "memory(GiB)": 302.58, + "step": 248740, + "train_speed(iter/s)": 0.123554 + }, + { + "acc": 0.7478507, + "epoch": 1.391183744916092, + "grad_norm": 8.0, + "learning_rate": 2.3265367140107616e-06, + "loss": 0.98955221, + "memory(GiB)": 302.58, + "step": 248760, + "train_speed(iter/s)": 0.123557 + }, + { + "acc": 0.75092564, + "epoch": 1.3912955943890712, + "grad_norm": 5.875, + "learning_rate": 2.3257553409442592e-06, + "loss": 0.96789675, + "memory(GiB)": 302.58, + "step": 248780, + "train_speed(iter/s)": 0.123562 + }, + { + "acc": 0.75263052, + "epoch": 1.3914074438620505, + "grad_norm": 9.6875, + "learning_rate": 2.324974059345403e-06, + "loss": 0.96422691, + "memory(GiB)": 302.58, + "step": 248800, + "train_speed(iter/s)": 0.123567 + }, + { + "acc": 0.7511673, + "epoch": 1.3915192933350298, + "grad_norm": 5.03125, + "learning_rate": 2.324192869240915e-06, + "loss": 0.97331486, + "memory(GiB)": 302.58, + "step": 248820, + "train_speed(iter/s)": 0.123572 + }, + { + "acc": 0.76728401, + "epoch": 1.391631142808009, + "grad_norm": 9.25, + "learning_rate": 2.323411770657517e-06, + "loss": 0.89201279, + "memory(GiB)": 302.58, + "step": 248840, + "train_speed(iter/s)": 0.123577 + }, + { + "acc": 0.76240435, + "epoch": 1.3917429922809883, + "grad_norm": 8.875, + "learning_rate": 2.322630763621924e-06, + "loss": 0.94591818, + "memory(GiB)": 302.58, + "step": 248860, + "train_speed(iter/s)": 0.123581 + }, + { + "acc": 0.74844213, + "epoch": 1.3918548417539676, + "grad_norm": 7.4375, + "learning_rate": 2.3218498481608494e-06, + "loss": 0.98426189, + "memory(GiB)": 302.58, + "step": 248880, + "train_speed(iter/s)": 0.123586 + }, + { + "acc": 0.74367294, + "epoch": 1.3919666912269468, + "grad_norm": 5.0625, + "learning_rate": 2.3210690243010008e-06, + "loss": 1.03852043, + "memory(GiB)": 302.58, + "step": 248900, + "train_speed(iter/s)": 0.12359 + }, + { + "acc": 0.75591211, + "epoch": 1.392078540699926, + "grad_norm": 7.0625, + "learning_rate": 2.3202882920690873e-06, + "loss": 0.95525665, + "memory(GiB)": 302.58, + "step": 248920, + "train_speed(iter/s)": 0.123595 + }, + { + "acc": 0.74737949, + "epoch": 1.3921903901729054, + "grad_norm": 5.71875, + "learning_rate": 2.319507651491811e-06, + "loss": 0.99015684, + "memory(GiB)": 302.58, + "step": 248940, + "train_speed(iter/s)": 0.123599 + }, + { + "acc": 0.76346488, + "epoch": 1.3923022396458846, + "grad_norm": 8.375, + "learning_rate": 2.3187271025958726e-06, + "loss": 0.92075462, + "memory(GiB)": 302.58, + "step": 248960, + "train_speed(iter/s)": 0.123604 + }, + { + "acc": 0.74259348, + "epoch": 1.3924140891188639, + "grad_norm": 7.78125, + "learning_rate": 2.317946645407969e-06, + "loss": 1.008992, + "memory(GiB)": 302.58, + "step": 248980, + "train_speed(iter/s)": 0.123609 + }, + { + "acc": 0.75913243, + "epoch": 1.3925259385918431, + "grad_norm": 5.90625, + "learning_rate": 2.3171662799547952e-06, + "loss": 0.93930702, + "memory(GiB)": 302.58, + "step": 249000, + "train_speed(iter/s)": 0.123613 + }, + { + "acc": 0.76280079, + "epoch": 1.3926377880648224, + "grad_norm": 7.84375, + "learning_rate": 2.3163860062630412e-06, + "loss": 0.92993288, + "memory(GiB)": 302.58, + "step": 249020, + "train_speed(iter/s)": 0.123618 + }, + { + "acc": 0.751369, + "epoch": 1.3927496375378017, + "grad_norm": 8.375, + "learning_rate": 2.3156058243593955e-06, + "loss": 0.95090504, + "memory(GiB)": 302.58, + "step": 249040, + "train_speed(iter/s)": 0.123622 + }, + { + "acc": 0.75541296, + "epoch": 1.392861487010781, + "grad_norm": 5.5625, + "learning_rate": 2.3148257342705426e-06, + "loss": 0.94383602, + "memory(GiB)": 302.58, + "step": 249060, + "train_speed(iter/s)": 0.123627 + }, + { + "acc": 0.7464222, + "epoch": 1.3929733364837602, + "grad_norm": 6.625, + "learning_rate": 2.314045736023162e-06, + "loss": 0.99560518, + "memory(GiB)": 302.58, + "step": 249080, + "train_speed(iter/s)": 0.123632 + }, + { + "acc": 0.75322061, + "epoch": 1.3930851859567395, + "grad_norm": 6.0625, + "learning_rate": 2.3132658296439364e-06, + "loss": 0.96859798, + "memory(GiB)": 302.58, + "step": 249100, + "train_speed(iter/s)": 0.123636 + }, + { + "acc": 0.74466071, + "epoch": 1.3931970354297187, + "grad_norm": 6.25, + "learning_rate": 2.3124860151595383e-06, + "loss": 0.99391956, + "memory(GiB)": 302.58, + "step": 249120, + "train_speed(iter/s)": 0.123641 + }, + { + "acc": 0.75590448, + "epoch": 1.393308884902698, + "grad_norm": 11.625, + "learning_rate": 2.311706292596641e-06, + "loss": 0.95397739, + "memory(GiB)": 302.58, + "step": 249140, + "train_speed(iter/s)": 0.123646 + }, + { + "acc": 0.75342832, + "epoch": 1.3934207343756773, + "grad_norm": 9.25, + "learning_rate": 2.3109266619819126e-06, + "loss": 0.9828536, + "memory(GiB)": 302.58, + "step": 249160, + "train_speed(iter/s)": 0.12365 + }, + { + "acc": 0.75390906, + "epoch": 1.3935325838486565, + "grad_norm": 7.28125, + "learning_rate": 2.3101471233420193e-06, + "loss": 0.97241001, + "memory(GiB)": 302.58, + "step": 249180, + "train_speed(iter/s)": 0.123655 + }, + { + "acc": 0.73496003, + "epoch": 1.3936444333216358, + "grad_norm": 6.53125, + "learning_rate": 2.309367676703624e-06, + "loss": 1.02657213, + "memory(GiB)": 302.58, + "step": 249200, + "train_speed(iter/s)": 0.123659 + }, + { + "acc": 0.76954465, + "epoch": 1.393756282794615, + "grad_norm": 7.28125, + "learning_rate": 2.3085883220933854e-06, + "loss": 0.90290985, + "memory(GiB)": 302.58, + "step": 249220, + "train_speed(iter/s)": 0.123664 + }, + { + "acc": 0.76482916, + "epoch": 1.3938681322675943, + "grad_norm": 10.75, + "learning_rate": 2.30780905953796e-06, + "loss": 0.91085262, + "memory(GiB)": 302.58, + "step": 249240, + "train_speed(iter/s)": 0.123669 + }, + { + "acc": 0.74250193, + "epoch": 1.3939799817405736, + "grad_norm": 9.0, + "learning_rate": 2.307029889064002e-06, + "loss": 1.03193312, + "memory(GiB)": 302.58, + "step": 249260, + "train_speed(iter/s)": 0.123673 + }, + { + "acc": 0.76061697, + "epoch": 1.3940918312135528, + "grad_norm": 8.9375, + "learning_rate": 2.306250810698161e-06, + "loss": 0.95043812, + "memory(GiB)": 302.58, + "step": 249280, + "train_speed(iter/s)": 0.123678 + }, + { + "acc": 0.74606457, + "epoch": 1.394203680686532, + "grad_norm": 6.3125, + "learning_rate": 2.3054718244670837e-06, + "loss": 1.00094957, + "memory(GiB)": 302.58, + "step": 249300, + "train_speed(iter/s)": 0.123683 + }, + { + "acc": 0.73895955, + "epoch": 1.3943155301595114, + "grad_norm": 5.96875, + "learning_rate": 2.304692930397414e-06, + "loss": 1.03961315, + "memory(GiB)": 302.58, + "step": 249320, + "train_speed(iter/s)": 0.123687 + }, + { + "acc": 0.76277175, + "epoch": 1.3944273796324906, + "grad_norm": 11.625, + "learning_rate": 2.3039141285157936e-06, + "loss": 0.91454668, + "memory(GiB)": 302.58, + "step": 249340, + "train_speed(iter/s)": 0.123692 + }, + { + "acc": 0.75370545, + "epoch": 1.39453922910547, + "grad_norm": 6.0625, + "learning_rate": 2.3031354188488566e-06, + "loss": 0.98120117, + "memory(GiB)": 302.58, + "step": 249360, + "train_speed(iter/s)": 0.123697 + }, + { + "acc": 0.76371779, + "epoch": 1.3946510785784492, + "grad_norm": 5.1875, + "learning_rate": 2.3023568014232413e-06, + "loss": 0.92390451, + "memory(GiB)": 302.58, + "step": 249380, + "train_speed(iter/s)": 0.123701 + }, + { + "acc": 0.75128427, + "epoch": 1.3947629280514284, + "grad_norm": 5.46875, + "learning_rate": 2.3015782762655775e-06, + "loss": 0.96387901, + "memory(GiB)": 302.58, + "step": 249400, + "train_speed(iter/s)": 0.123706 + }, + { + "acc": 0.75637922, + "epoch": 1.3948747775244077, + "grad_norm": 5.53125, + "learning_rate": 2.300799843402493e-06, + "loss": 0.93997974, + "memory(GiB)": 302.58, + "step": 249420, + "train_speed(iter/s)": 0.123711 + }, + { + "acc": 0.76845675, + "epoch": 1.394986626997387, + "grad_norm": 6.875, + "learning_rate": 2.3000215028606133e-06, + "loss": 0.91202641, + "memory(GiB)": 302.58, + "step": 249440, + "train_speed(iter/s)": 0.123716 + }, + { + "acc": 0.77085719, + "epoch": 1.3950984764703662, + "grad_norm": 6.625, + "learning_rate": 2.2992432546665594e-06, + "loss": 0.87842121, + "memory(GiB)": 302.58, + "step": 249460, + "train_speed(iter/s)": 0.12372 + }, + { + "acc": 0.77874966, + "epoch": 1.3952103259433455, + "grad_norm": 5.59375, + "learning_rate": 2.29846509884695e-06, + "loss": 0.85038004, + "memory(GiB)": 302.58, + "step": 249480, + "train_speed(iter/s)": 0.123725 + }, + { + "acc": 0.76145773, + "epoch": 1.3953221754163248, + "grad_norm": 6.78125, + "learning_rate": 2.2976870354284e-06, + "loss": 0.92608948, + "memory(GiB)": 302.58, + "step": 249500, + "train_speed(iter/s)": 0.123729 + }, + { + "acc": 0.75556726, + "epoch": 1.395434024889304, + "grad_norm": 6.875, + "learning_rate": 2.2969090644375237e-06, + "loss": 0.94687653, + "memory(GiB)": 302.58, + "step": 249520, + "train_speed(iter/s)": 0.123734 + }, + { + "acc": 0.74775372, + "epoch": 1.3955458743622833, + "grad_norm": 6.53125, + "learning_rate": 2.2961311859009274e-06, + "loss": 0.97455597, + "memory(GiB)": 302.58, + "step": 249540, + "train_speed(iter/s)": 0.123739 + }, + { + "acc": 0.75444703, + "epoch": 1.3956577238352625, + "grad_norm": 11.6875, + "learning_rate": 2.2953533998452166e-06, + "loss": 0.96951075, + "memory(GiB)": 302.58, + "step": 249560, + "train_speed(iter/s)": 0.123743 + }, + { + "acc": 0.7420373, + "epoch": 1.3957695733082418, + "grad_norm": 8.25, + "learning_rate": 2.294575706296998e-06, + "loss": 1.01420326, + "memory(GiB)": 302.58, + "step": 249580, + "train_speed(iter/s)": 0.123748 + }, + { + "acc": 0.75765715, + "epoch": 1.395881422781221, + "grad_norm": 7.375, + "learning_rate": 2.2937981052828685e-06, + "loss": 0.9453721, + "memory(GiB)": 302.58, + "step": 249600, + "train_speed(iter/s)": 0.123752 + }, + { + "acc": 0.75857244, + "epoch": 1.3959932722542003, + "grad_norm": 10.5, + "learning_rate": 2.293020596829425e-06, + "loss": 0.9528079, + "memory(GiB)": 302.58, + "step": 249620, + "train_speed(iter/s)": 0.123757 + }, + { + "acc": 0.74313984, + "epoch": 1.3961051217271796, + "grad_norm": 6.84375, + "learning_rate": 2.2922431809632613e-06, + "loss": 1.0158041, + "memory(GiB)": 302.58, + "step": 249640, + "train_speed(iter/s)": 0.123762 + }, + { + "acc": 0.74954576, + "epoch": 1.3962169712001589, + "grad_norm": 5.96875, + "learning_rate": 2.291465857710966e-06, + "loss": 0.95512619, + "memory(GiB)": 302.58, + "step": 249660, + "train_speed(iter/s)": 0.123766 + }, + { + "acc": 0.75209293, + "epoch": 1.3963288206731381, + "grad_norm": 4.875, + "learning_rate": 2.2906886270991273e-06, + "loss": 0.97150717, + "memory(GiB)": 302.58, + "step": 249680, + "train_speed(iter/s)": 0.123771 + }, + { + "acc": 0.74912529, + "epoch": 1.3964406701461174, + "grad_norm": 6.5, + "learning_rate": 2.289911489154327e-06, + "loss": 0.98148851, + "memory(GiB)": 302.58, + "step": 249700, + "train_speed(iter/s)": 0.123776 + }, + { + "acc": 0.74690638, + "epoch": 1.3965525196190967, + "grad_norm": 9.625, + "learning_rate": 2.2891344439031484e-06, + "loss": 1.01044407, + "memory(GiB)": 302.58, + "step": 249720, + "train_speed(iter/s)": 0.12378 + }, + { + "acc": 0.7438767, + "epoch": 1.396664369092076, + "grad_norm": 3.90625, + "learning_rate": 2.2883574913721677e-06, + "loss": 1.00327091, + "memory(GiB)": 302.58, + "step": 249740, + "train_speed(iter/s)": 0.123785 + }, + { + "acc": 0.7577827, + "epoch": 1.3967762185650552, + "grad_norm": 8.625, + "learning_rate": 2.2875806315879595e-06, + "loss": 0.93287821, + "memory(GiB)": 302.58, + "step": 249760, + "train_speed(iter/s)": 0.12379 + }, + { + "acc": 0.746804, + "epoch": 1.3968880680380344, + "grad_norm": 7.71875, + "learning_rate": 2.2868038645770945e-06, + "loss": 1.01539211, + "memory(GiB)": 302.58, + "step": 249780, + "train_speed(iter/s)": 0.123795 + }, + { + "acc": 0.75037088, + "epoch": 1.3969999175110137, + "grad_norm": 8.5, + "learning_rate": 2.2860271903661414e-06, + "loss": 0.96794453, + "memory(GiB)": 302.58, + "step": 249800, + "train_speed(iter/s)": 0.123799 + }, + { + "acc": 0.74779916, + "epoch": 1.397111766983993, + "grad_norm": 7.03125, + "learning_rate": 2.2852506089816633e-06, + "loss": 0.97674389, + "memory(GiB)": 302.58, + "step": 249820, + "train_speed(iter/s)": 0.123804 + }, + { + "acc": 0.74987683, + "epoch": 1.3972236164569722, + "grad_norm": 6.28125, + "learning_rate": 2.2844741204502226e-06, + "loss": 0.96922846, + "memory(GiB)": 302.58, + "step": 249840, + "train_speed(iter/s)": 0.123809 + }, + { + "acc": 0.74701552, + "epoch": 1.3973354659299515, + "grad_norm": 7.4375, + "learning_rate": 2.2836977247983777e-06, + "loss": 1.00649824, + "memory(GiB)": 302.58, + "step": 249860, + "train_speed(iter/s)": 0.123814 + }, + { + "acc": 0.74523764, + "epoch": 1.3974473154029308, + "grad_norm": 6.1875, + "learning_rate": 2.282921422052684e-06, + "loss": 1.01550426, + "memory(GiB)": 302.58, + "step": 249880, + "train_speed(iter/s)": 0.123818 + }, + { + "acc": 0.74822483, + "epoch": 1.39755916487591, + "grad_norm": 6.78125, + "learning_rate": 2.2821452122396936e-06, + "loss": 0.99632063, + "memory(GiB)": 302.58, + "step": 249900, + "train_speed(iter/s)": 0.123823 + }, + { + "acc": 0.75349402, + "epoch": 1.3976710143488893, + "grad_norm": 8.25, + "learning_rate": 2.2813690953859545e-06, + "loss": 0.96697073, + "memory(GiB)": 302.58, + "step": 249920, + "train_speed(iter/s)": 0.123828 + }, + { + "acc": 0.73838229, + "epoch": 1.3977828638218686, + "grad_norm": 7.25, + "learning_rate": 2.280593071518013e-06, + "loss": 1.03703527, + "memory(GiB)": 302.58, + "step": 249940, + "train_speed(iter/s)": 0.123832 + }, + { + "acc": 0.75095029, + "epoch": 1.3978947132948478, + "grad_norm": 9.375, + "learning_rate": 2.27981714066241e-06, + "loss": 0.98219786, + "memory(GiB)": 302.58, + "step": 249960, + "train_speed(iter/s)": 0.123836 + }, + { + "acc": 0.76669006, + "epoch": 1.398006562767827, + "grad_norm": 7.0625, + "learning_rate": 2.2790413028456877e-06, + "loss": 0.91387062, + "memory(GiB)": 302.58, + "step": 249980, + "train_speed(iter/s)": 0.123841 + }, + { + "acc": 0.74040132, + "epoch": 1.3981184122408064, + "grad_norm": 5.0, + "learning_rate": 2.2782655580943807e-06, + "loss": 1.05039148, + "memory(GiB)": 302.58, + "step": 250000, + "train_speed(iter/s)": 0.123846 + }, + { + "epoch": 1.3981184122408064, + "eval_acc": 0.7068467517706666, + "eval_loss": 1.0121599435806274, + "eval_runtime": 7523.2711, + "eval_samples_per_second": 10.007, + "eval_steps_per_second": 10.007, + "step": 250000 + }, + { + "acc": 0.74748726, + "epoch": 1.3982302617137856, + "grad_norm": 7.09375, + "learning_rate": 2.277489906435022e-06, + "loss": 0.98719311, + "memory(GiB)": 302.58, + "step": 250020, + "train_speed(iter/s)": 0.123383 + }, + { + "acc": 0.75043578, + "epoch": 1.3983421111867649, + "grad_norm": 5.71875, + "learning_rate": 2.2767143478941417e-06, + "loss": 0.98640928, + "memory(GiB)": 302.58, + "step": 250040, + "train_speed(iter/s)": 0.123387 + }, + { + "acc": 0.76154947, + "epoch": 1.3984539606597441, + "grad_norm": 7.96875, + "learning_rate": 2.2759388824982658e-06, + "loss": 0.93134375, + "memory(GiB)": 302.58, + "step": 250060, + "train_speed(iter/s)": 0.123391 + }, + { + "acc": 0.74841423, + "epoch": 1.3985658101327234, + "grad_norm": 9.25, + "learning_rate": 2.275163510273918e-06, + "loss": 0.99298344, + "memory(GiB)": 302.58, + "step": 250080, + "train_speed(iter/s)": 0.123396 + }, + { + "acc": 0.73102541, + "epoch": 1.3986776596057027, + "grad_norm": 6.21875, + "learning_rate": 2.2743882312476178e-06, + "loss": 1.0795188, + "memory(GiB)": 302.58, + "step": 250100, + "train_speed(iter/s)": 0.123401 + }, + { + "acc": 0.74270687, + "epoch": 1.398789509078682, + "grad_norm": 9.125, + "learning_rate": 2.273613045445883e-06, + "loss": 1.02978354, + "memory(GiB)": 302.58, + "step": 250120, + "train_speed(iter/s)": 0.123405 + }, + { + "acc": 0.75867534, + "epoch": 1.3989013585516612, + "grad_norm": 8.6875, + "learning_rate": 2.2728379528952275e-06, + "loss": 0.96043377, + "memory(GiB)": 302.58, + "step": 250140, + "train_speed(iter/s)": 0.12341 + }, + { + "acc": 0.76955724, + "epoch": 1.3990132080246405, + "grad_norm": 8.125, + "learning_rate": 2.272062953622162e-06, + "loss": 0.9159399, + "memory(GiB)": 302.58, + "step": 250160, + "train_speed(iter/s)": 0.123415 + }, + { + "acc": 0.75404797, + "epoch": 1.3991250574976197, + "grad_norm": 8.5625, + "learning_rate": 2.271288047653193e-06, + "loss": 0.97072487, + "memory(GiB)": 302.58, + "step": 250180, + "train_speed(iter/s)": 0.12342 + }, + { + "acc": 0.76105294, + "epoch": 1.399236906970599, + "grad_norm": 8.875, + "learning_rate": 2.270513235014825e-06, + "loss": 0.9302309, + "memory(GiB)": 302.58, + "step": 250200, + "train_speed(iter/s)": 0.123425 + }, + { + "acc": 0.7384584, + "epoch": 1.3993487564435783, + "grad_norm": 9.125, + "learning_rate": 2.26973851573356e-06, + "loss": 1.03006287, + "memory(GiB)": 302.58, + "step": 250220, + "train_speed(iter/s)": 0.123429 + }, + { + "acc": 0.75305381, + "epoch": 1.3994606059165575, + "grad_norm": 6.90625, + "learning_rate": 2.2689638898358928e-06, + "loss": 0.98004303, + "memory(GiB)": 302.58, + "step": 250240, + "train_speed(iter/s)": 0.123434 + }, + { + "acc": 0.76564422, + "epoch": 1.3995724553895368, + "grad_norm": 5.53125, + "learning_rate": 2.268189357348323e-06, + "loss": 0.89780846, + "memory(GiB)": 302.58, + "step": 250260, + "train_speed(iter/s)": 0.123439 + }, + { + "acc": 0.75089087, + "epoch": 1.399684304862516, + "grad_norm": 8.5, + "learning_rate": 2.267414918297339e-06, + "loss": 0.98017855, + "memory(GiB)": 302.58, + "step": 250280, + "train_speed(iter/s)": 0.123444 + }, + { + "acc": 0.76420593, + "epoch": 1.3997961543354953, + "grad_norm": 5.53125, + "learning_rate": 2.2666405727094293e-06, + "loss": 0.91221399, + "memory(GiB)": 302.58, + "step": 250300, + "train_speed(iter/s)": 0.123448 + }, + { + "acc": 0.74415064, + "epoch": 1.3999080038084746, + "grad_norm": 5.71875, + "learning_rate": 2.265866320611079e-06, + "loss": 1.01246872, + "memory(GiB)": 302.58, + "step": 250320, + "train_speed(iter/s)": 0.123453 + }, + { + "acc": 0.74232712, + "epoch": 1.4000198532814538, + "grad_norm": 8.9375, + "learning_rate": 2.2650921620287715e-06, + "loss": 1.0139082, + "memory(GiB)": 302.58, + "step": 250340, + "train_speed(iter/s)": 0.123457 + }, + { + "acc": 0.76081071, + "epoch": 1.400131702754433, + "grad_norm": 7.6875, + "learning_rate": 2.2643180969889835e-06, + "loss": 0.94234962, + "memory(GiB)": 302.58, + "step": 250360, + "train_speed(iter/s)": 0.123462 + }, + { + "acc": 0.7378252, + "epoch": 1.4002435522274124, + "grad_norm": 7.78125, + "learning_rate": 2.2635441255181913e-06, + "loss": 1.01292067, + "memory(GiB)": 302.58, + "step": 250380, + "train_speed(iter/s)": 0.123467 + }, + { + "acc": 0.74983964, + "epoch": 1.4003554017003916, + "grad_norm": 9.0, + "learning_rate": 2.2627702476428666e-06, + "loss": 0.96634607, + "memory(GiB)": 302.58, + "step": 250400, + "train_speed(iter/s)": 0.123472 + }, + { + "acc": 0.75866628, + "epoch": 1.400467251173371, + "grad_norm": 6.09375, + "learning_rate": 2.2619964633894794e-06, + "loss": 0.95697317, + "memory(GiB)": 302.58, + "step": 250420, + "train_speed(iter/s)": 0.123476 + }, + { + "acc": 0.75123105, + "epoch": 1.4005791006463502, + "grad_norm": 8.8125, + "learning_rate": 2.261222772784495e-06, + "loss": 0.97979393, + "memory(GiB)": 302.58, + "step": 250440, + "train_speed(iter/s)": 0.123481 + }, + { + "acc": 0.74660702, + "epoch": 1.4006909501193294, + "grad_norm": 6.96875, + "learning_rate": 2.260449175854376e-06, + "loss": 1.00497589, + "memory(GiB)": 302.58, + "step": 250460, + "train_speed(iter/s)": 0.123486 + }, + { + "acc": 0.75392013, + "epoch": 1.4008027995923087, + "grad_norm": 8.375, + "learning_rate": 2.2596756726255825e-06, + "loss": 0.98560305, + "memory(GiB)": 302.58, + "step": 250480, + "train_speed(iter/s)": 0.12349 + }, + { + "acc": 0.77017641, + "epoch": 1.400914649065288, + "grad_norm": 6.375, + "learning_rate": 2.2589022631245684e-06, + "loss": 0.89282427, + "memory(GiB)": 302.58, + "step": 250500, + "train_speed(iter/s)": 0.123495 + }, + { + "acc": 0.74297853, + "epoch": 1.4010264985382672, + "grad_norm": 6.125, + "learning_rate": 2.2581289473777908e-06, + "loss": 1.00846634, + "memory(GiB)": 302.58, + "step": 250520, + "train_speed(iter/s)": 0.1235 + }, + { + "acc": 0.75719867, + "epoch": 1.4011383480112465, + "grad_norm": 7.90625, + "learning_rate": 2.257355725411697e-06, + "loss": 0.94350023, + "memory(GiB)": 302.58, + "step": 250540, + "train_speed(iter/s)": 0.123504 + }, + { + "acc": 0.75138035, + "epoch": 1.4012501974842257, + "grad_norm": 6.0, + "learning_rate": 2.256582597252734e-06, + "loss": 0.95678329, + "memory(GiB)": 302.58, + "step": 250560, + "train_speed(iter/s)": 0.123509 + }, + { + "acc": 0.75661383, + "epoch": 1.401362046957205, + "grad_norm": 5.625, + "learning_rate": 2.255809562927345e-06, + "loss": 0.94292984, + "memory(GiB)": 302.58, + "step": 250580, + "train_speed(iter/s)": 0.123514 + }, + { + "acc": 0.74822798, + "epoch": 1.4014738964301843, + "grad_norm": 9.8125, + "learning_rate": 2.2550366224619714e-06, + "loss": 0.98666859, + "memory(GiB)": 302.58, + "step": 250600, + "train_speed(iter/s)": 0.123519 + }, + { + "acc": 0.74688268, + "epoch": 1.4015857459031635, + "grad_norm": 4.65625, + "learning_rate": 2.254263775883049e-06, + "loss": 1.00174866, + "memory(GiB)": 302.58, + "step": 250620, + "train_speed(iter/s)": 0.123523 + }, + { + "acc": 0.75967054, + "epoch": 1.4016975953761428, + "grad_norm": 7.96875, + "learning_rate": 2.2534910232170114e-06, + "loss": 0.95537605, + "memory(GiB)": 302.58, + "step": 250640, + "train_speed(iter/s)": 0.123528 + }, + { + "acc": 0.74564743, + "epoch": 1.401809444849122, + "grad_norm": 10.125, + "learning_rate": 2.2527183644902904e-06, + "loss": 1.01066351, + "memory(GiB)": 302.58, + "step": 250660, + "train_speed(iter/s)": 0.123533 + }, + { + "acc": 0.73333111, + "epoch": 1.4019212943221013, + "grad_norm": 7.1875, + "learning_rate": 2.2519457997293125e-06, + "loss": 1.06020184, + "memory(GiB)": 302.58, + "step": 250680, + "train_speed(iter/s)": 0.123537 + }, + { + "acc": 0.74514937, + "epoch": 1.4020331437950806, + "grad_norm": 5.625, + "learning_rate": 2.2511733289605027e-06, + "loss": 0.99341764, + "memory(GiB)": 302.58, + "step": 250700, + "train_speed(iter/s)": 0.123542 + }, + { + "acc": 0.74288216, + "epoch": 1.4021449932680599, + "grad_norm": 5.3125, + "learning_rate": 2.2504009522102803e-06, + "loss": 1.01475153, + "memory(GiB)": 302.58, + "step": 250720, + "train_speed(iter/s)": 0.123546 + }, + { + "acc": 0.73957558, + "epoch": 1.4022568427410391, + "grad_norm": 6.84375, + "learning_rate": 2.249628669505065e-06, + "loss": 1.03852329, + "memory(GiB)": 302.58, + "step": 250740, + "train_speed(iter/s)": 0.12355 + }, + { + "acc": 0.7575983, + "epoch": 1.4023686922140184, + "grad_norm": 5.03125, + "learning_rate": 2.24885648087127e-06, + "loss": 0.91911154, + "memory(GiB)": 302.58, + "step": 250760, + "train_speed(iter/s)": 0.123555 + }, + { + "acc": 0.7548811, + "epoch": 1.4024805416869977, + "grad_norm": 8.3125, + "learning_rate": 2.2480843863353052e-06, + "loss": 0.95366135, + "memory(GiB)": 302.58, + "step": 250780, + "train_speed(iter/s)": 0.12356 + }, + { + "acc": 0.73947039, + "epoch": 1.402592391159977, + "grad_norm": 7.0, + "learning_rate": 2.2473123859235823e-06, + "loss": 1.0381197, + "memory(GiB)": 302.58, + "step": 250800, + "train_speed(iter/s)": 0.123564 + }, + { + "acc": 0.75970945, + "epoch": 1.4027042406329562, + "grad_norm": 10.25, + "learning_rate": 2.246540479662504e-06, + "loss": 0.94668131, + "memory(GiB)": 302.58, + "step": 250820, + "train_speed(iter/s)": 0.123569 + }, + { + "acc": 0.73861289, + "epoch": 1.4028160901059354, + "grad_norm": 6.875, + "learning_rate": 2.245768667578471e-06, + "loss": 1.02535839, + "memory(GiB)": 302.58, + "step": 250840, + "train_speed(iter/s)": 0.123574 + }, + { + "acc": 0.75323052, + "epoch": 1.4029279395789147, + "grad_norm": 7.1875, + "learning_rate": 2.2449969496978848e-06, + "loss": 0.96138344, + "memory(GiB)": 302.58, + "step": 250860, + "train_speed(iter/s)": 0.123578 + }, + { + "acc": 0.7600215, + "epoch": 1.403039789051894, + "grad_norm": 10.125, + "learning_rate": 2.2442253260471385e-06, + "loss": 0.92774982, + "memory(GiB)": 302.58, + "step": 250880, + "train_speed(iter/s)": 0.123583 + }, + { + "acc": 0.76535892, + "epoch": 1.4031516385248732, + "grad_norm": 5.75, + "learning_rate": 2.2434537966526244e-06, + "loss": 0.92800112, + "memory(GiB)": 302.58, + "step": 250900, + "train_speed(iter/s)": 0.123588 + }, + { + "acc": 0.76041784, + "epoch": 1.4032634879978525, + "grad_norm": 5.84375, + "learning_rate": 2.242682361540731e-06, + "loss": 0.93366098, + "memory(GiB)": 302.58, + "step": 250920, + "train_speed(iter/s)": 0.123593 + }, + { + "acc": 0.76815615, + "epoch": 1.4033753374708318, + "grad_norm": 6.09375, + "learning_rate": 2.2419110207378447e-06, + "loss": 0.90264473, + "memory(GiB)": 302.58, + "step": 250940, + "train_speed(iter/s)": 0.123598 + }, + { + "acc": 0.72479267, + "epoch": 1.403487186943811, + "grad_norm": 9.125, + "learning_rate": 2.2411397742703468e-06, + "loss": 1.0913475, + "memory(GiB)": 302.58, + "step": 250960, + "train_speed(iter/s)": 0.123602 + }, + { + "acc": 0.75956516, + "epoch": 1.4035990364167903, + "grad_norm": 5.75, + "learning_rate": 2.2403686221646167e-06, + "loss": 0.94617796, + "memory(GiB)": 302.58, + "step": 250980, + "train_speed(iter/s)": 0.123606 + }, + { + "acc": 0.76770649, + "epoch": 1.4037108858897696, + "grad_norm": 5.90625, + "learning_rate": 2.2395975644470303e-06, + "loss": 0.89869738, + "memory(GiB)": 302.58, + "step": 251000, + "train_speed(iter/s)": 0.123611 + }, + { + "acc": 0.74253168, + "epoch": 1.4038227353627488, + "grad_norm": 7.625, + "learning_rate": 2.2388266011439603e-06, + "loss": 1.02110157, + "memory(GiB)": 302.58, + "step": 251020, + "train_speed(iter/s)": 0.123616 + }, + { + "acc": 0.75831676, + "epoch": 1.403934584835728, + "grad_norm": 9.625, + "learning_rate": 2.238055732281776e-06, + "loss": 0.95294256, + "memory(GiB)": 302.58, + "step": 251040, + "train_speed(iter/s)": 0.12362 + }, + { + "acc": 0.76988449, + "epoch": 1.4040464343087073, + "grad_norm": 7.1875, + "learning_rate": 2.237284957886843e-06, + "loss": 0.89865513, + "memory(GiB)": 302.58, + "step": 251060, + "train_speed(iter/s)": 0.123625 + }, + { + "acc": 0.75221281, + "epoch": 1.4041582837816866, + "grad_norm": 9.1875, + "learning_rate": 2.2365142779855255e-06, + "loss": 0.98859262, + "memory(GiB)": 302.58, + "step": 251080, + "train_speed(iter/s)": 0.12363 + }, + { + "acc": 0.76955533, + "epoch": 1.4042701332546659, + "grad_norm": 5.4375, + "learning_rate": 2.235743692604182e-06, + "loss": 0.9087574, + "memory(GiB)": 302.58, + "step": 251100, + "train_speed(iter/s)": 0.123634 + }, + { + "acc": 0.75419383, + "epoch": 1.4043819827276451, + "grad_norm": 5.25, + "learning_rate": 2.2349732017691676e-06, + "loss": 0.95730829, + "memory(GiB)": 302.58, + "step": 251120, + "train_speed(iter/s)": 0.123639 + }, + { + "acc": 0.75151224, + "epoch": 1.4044938322006244, + "grad_norm": 5.125, + "learning_rate": 2.234202805506839e-06, + "loss": 0.95556755, + "memory(GiB)": 302.58, + "step": 251140, + "train_speed(iter/s)": 0.123643 + }, + { + "acc": 0.75366125, + "epoch": 1.4046056816736037, + "grad_norm": 6.84375, + "learning_rate": 2.2334325038435456e-06, + "loss": 0.97054272, + "memory(GiB)": 302.58, + "step": 251160, + "train_speed(iter/s)": 0.123648 + }, + { + "acc": 0.75274177, + "epoch": 1.404717531146583, + "grad_norm": 8.3125, + "learning_rate": 2.2326622968056324e-06, + "loss": 0.97245626, + "memory(GiB)": 302.58, + "step": 251180, + "train_speed(iter/s)": 0.123652 + }, + { + "acc": 0.76666026, + "epoch": 1.4048293806195622, + "grad_norm": 9.1875, + "learning_rate": 2.2318921844194437e-06, + "loss": 0.92042398, + "memory(GiB)": 302.58, + "step": 251200, + "train_speed(iter/s)": 0.123657 + }, + { + "acc": 0.74531388, + "epoch": 1.4049412300925415, + "grad_norm": 4.59375, + "learning_rate": 2.23112216671132e-06, + "loss": 0.99794731, + "memory(GiB)": 302.58, + "step": 251220, + "train_speed(iter/s)": 0.123661 + }, + { + "acc": 0.74414053, + "epoch": 1.4050530795655207, + "grad_norm": 6.3125, + "learning_rate": 2.2303522437075977e-06, + "loss": 1.01939983, + "memory(GiB)": 302.58, + "step": 251240, + "train_speed(iter/s)": 0.123666 + }, + { + "acc": 0.76535692, + "epoch": 1.4051649290385, + "grad_norm": 7.25, + "learning_rate": 2.2295824154346105e-06, + "loss": 0.92018318, + "memory(GiB)": 302.58, + "step": 251260, + "train_speed(iter/s)": 0.123671 + }, + { + "acc": 0.74903293, + "epoch": 1.4052767785114793, + "grad_norm": 8.375, + "learning_rate": 2.22881268191869e-06, + "loss": 0.97952099, + "memory(GiB)": 302.58, + "step": 251280, + "train_speed(iter/s)": 0.123675 + }, + { + "acc": 0.76312919, + "epoch": 1.4053886279844585, + "grad_norm": 7.6875, + "learning_rate": 2.228043043186163e-06, + "loss": 0.90175591, + "memory(GiB)": 302.58, + "step": 251300, + "train_speed(iter/s)": 0.12368 + }, + { + "acc": 0.75385108, + "epoch": 1.4055004774574378, + "grad_norm": 9.75, + "learning_rate": 2.227273499263354e-06, + "loss": 0.94360628, + "memory(GiB)": 302.58, + "step": 251320, + "train_speed(iter/s)": 0.123685 + }, + { + "acc": 0.76185665, + "epoch": 1.405612326930417, + "grad_norm": 7.625, + "learning_rate": 2.226504050176583e-06, + "loss": 0.92011366, + "memory(GiB)": 302.58, + "step": 251340, + "train_speed(iter/s)": 0.12369 + }, + { + "acc": 0.74580102, + "epoch": 1.4057241764033963, + "grad_norm": 6.5, + "learning_rate": 2.2257346959521674e-06, + "loss": 1.02055445, + "memory(GiB)": 302.58, + "step": 251360, + "train_speed(iter/s)": 0.123694 + }, + { + "acc": 0.7243865, + "epoch": 1.4058360258763756, + "grad_norm": 7.4375, + "learning_rate": 2.224965436616421e-06, + "loss": 1.10273867, + "memory(GiB)": 302.58, + "step": 251380, + "train_speed(iter/s)": 0.123699 + }, + { + "acc": 0.76656609, + "epoch": 1.4059478753493548, + "grad_norm": 9.8125, + "learning_rate": 2.2241962721956577e-06, + "loss": 0.9222683, + "memory(GiB)": 302.58, + "step": 251400, + "train_speed(iter/s)": 0.123704 + }, + { + "acc": 0.75258121, + "epoch": 1.406059724822334, + "grad_norm": 5.65625, + "learning_rate": 2.223427202716184e-06, + "loss": 0.96933098, + "memory(GiB)": 302.58, + "step": 251420, + "train_speed(iter/s)": 0.123708 + }, + { + "acc": 0.75059123, + "epoch": 1.4061715742953134, + "grad_norm": 4.78125, + "learning_rate": 2.222658228204304e-06, + "loss": 0.98772144, + "memory(GiB)": 302.58, + "step": 251440, + "train_speed(iter/s)": 0.123713 + }, + { + "acc": 0.7750998, + "epoch": 1.4062834237682926, + "grad_norm": 9.375, + "learning_rate": 2.221889348686319e-06, + "loss": 0.8677249, + "memory(GiB)": 302.58, + "step": 251460, + "train_speed(iter/s)": 0.123717 + }, + { + "acc": 0.74526415, + "epoch": 1.406395273241272, + "grad_norm": 6.1875, + "learning_rate": 2.221120564188528e-06, + "loss": 1.00434303, + "memory(GiB)": 302.58, + "step": 251480, + "train_speed(iter/s)": 0.123722 + }, + { + "acc": 0.75790672, + "epoch": 1.4065071227142512, + "grad_norm": 6.875, + "learning_rate": 2.220351874737225e-06, + "loss": 0.94568787, + "memory(GiB)": 302.58, + "step": 251500, + "train_speed(iter/s)": 0.123727 + }, + { + "acc": 0.76381478, + "epoch": 1.4066189721872304, + "grad_norm": 7.09375, + "learning_rate": 2.219583280358702e-06, + "loss": 0.91965275, + "memory(GiB)": 302.58, + "step": 251520, + "train_speed(iter/s)": 0.123732 + }, + { + "acc": 0.7378264, + "epoch": 1.4067308216602097, + "grad_norm": 8.75, + "learning_rate": 2.218814781079247e-06, + "loss": 1.0345645, + "memory(GiB)": 302.58, + "step": 251540, + "train_speed(iter/s)": 0.123736 + }, + { + "acc": 0.74635453, + "epoch": 1.406842671133189, + "grad_norm": 7.75, + "learning_rate": 2.2180463769251458e-06, + "loss": 1.00487709, + "memory(GiB)": 302.58, + "step": 251560, + "train_speed(iter/s)": 0.123741 + }, + { + "acc": 0.76033797, + "epoch": 1.4069545206061682, + "grad_norm": 8.75, + "learning_rate": 2.2172780679226794e-06, + "loss": 0.94894876, + "memory(GiB)": 302.58, + "step": 251580, + "train_speed(iter/s)": 0.123746 + }, + { + "acc": 0.75783105, + "epoch": 1.4070663700791475, + "grad_norm": 7.8125, + "learning_rate": 2.2165098540981273e-06, + "loss": 0.95365953, + "memory(GiB)": 302.58, + "step": 251600, + "train_speed(iter/s)": 0.12375 + }, + { + "acc": 0.74303732, + "epoch": 1.4071782195521267, + "grad_norm": 3.71875, + "learning_rate": 2.2157417354777644e-06, + "loss": 0.99282408, + "memory(GiB)": 302.58, + "step": 251620, + "train_speed(iter/s)": 0.123755 + }, + { + "acc": 0.76173482, + "epoch": 1.407290069025106, + "grad_norm": 9.8125, + "learning_rate": 2.214973712087862e-06, + "loss": 0.92479162, + "memory(GiB)": 302.58, + "step": 251640, + "train_speed(iter/s)": 0.12376 + }, + { + "acc": 0.75866823, + "epoch": 1.4074019184980853, + "grad_norm": 7.40625, + "learning_rate": 2.214205783954689e-06, + "loss": 0.9434782, + "memory(GiB)": 302.58, + "step": 251660, + "train_speed(iter/s)": 0.123764 + }, + { + "acc": 0.74239092, + "epoch": 1.4075137679710645, + "grad_norm": 8.8125, + "learning_rate": 2.213437951104513e-06, + "loss": 1.00853748, + "memory(GiB)": 302.58, + "step": 251680, + "train_speed(iter/s)": 0.123769 + }, + { + "acc": 0.73745909, + "epoch": 1.4076256174440438, + "grad_norm": 7.0, + "learning_rate": 2.2126702135635952e-06, + "loss": 1.03622103, + "memory(GiB)": 302.58, + "step": 251700, + "train_speed(iter/s)": 0.123774 + }, + { + "acc": 0.74685073, + "epoch": 1.407737466917023, + "grad_norm": 5.6875, + "learning_rate": 2.211902571358194e-06, + "loss": 0.98625908, + "memory(GiB)": 302.58, + "step": 251720, + "train_speed(iter/s)": 0.123778 + }, + { + "acc": 0.73940859, + "epoch": 1.4078493163900023, + "grad_norm": 7.71875, + "learning_rate": 2.2111350245145665e-06, + "loss": 1.04009485, + "memory(GiB)": 302.58, + "step": 251740, + "train_speed(iter/s)": 0.123783 + }, + { + "acc": 0.74624381, + "epoch": 1.4079611658629816, + "grad_norm": 4.625, + "learning_rate": 2.2103675730589637e-06, + "loss": 0.9950079, + "memory(GiB)": 302.58, + "step": 251760, + "train_speed(iter/s)": 0.123788 + }, + { + "acc": 0.7356122, + "epoch": 1.4080730153359609, + "grad_norm": 6.90625, + "learning_rate": 2.2096002170176355e-06, + "loss": 1.04464712, + "memory(GiB)": 302.58, + "step": 251780, + "train_speed(iter/s)": 0.123793 + }, + { + "acc": 0.7518065, + "epoch": 1.4081848648089401, + "grad_norm": 7.21875, + "learning_rate": 2.2088329564168288e-06, + "loss": 0.96971798, + "memory(GiB)": 302.58, + "step": 251800, + "train_speed(iter/s)": 0.123797 + }, + { + "acc": 0.76827531, + "epoch": 1.4082967142819194, + "grad_norm": 4.96875, + "learning_rate": 2.2080657912827847e-06, + "loss": 0.90414143, + "memory(GiB)": 302.58, + "step": 251820, + "train_speed(iter/s)": 0.123802 + }, + { + "acc": 0.74926243, + "epoch": 1.4084085637548986, + "grad_norm": 5.90625, + "learning_rate": 2.2072987216417443e-06, + "loss": 0.99522772, + "memory(GiB)": 302.58, + "step": 251840, + "train_speed(iter/s)": 0.123806 + }, + { + "acc": 0.74836655, + "epoch": 1.408520413227878, + "grad_norm": 9.5, + "learning_rate": 2.2065317475199423e-06, + "loss": 0.98319082, + "memory(GiB)": 302.58, + "step": 251860, + "train_speed(iter/s)": 0.123811 + }, + { + "acc": 0.74452882, + "epoch": 1.4086322627008572, + "grad_norm": 6.96875, + "learning_rate": 2.2057648689436127e-06, + "loss": 1.01750126, + "memory(GiB)": 302.58, + "step": 251880, + "train_speed(iter/s)": 0.123815 + }, + { + "acc": 0.75892701, + "epoch": 1.4087441121738364, + "grad_norm": 9.1875, + "learning_rate": 2.2049980859389845e-06, + "loss": 0.93093996, + "memory(GiB)": 302.58, + "step": 251900, + "train_speed(iter/s)": 0.12382 + }, + { + "acc": 0.75203342, + "epoch": 1.4088559616468157, + "grad_norm": 5.78125, + "learning_rate": 2.204231398532285e-06, + "loss": 0.96987705, + "memory(GiB)": 302.58, + "step": 251920, + "train_speed(iter/s)": 0.123824 + }, + { + "acc": 0.76767073, + "epoch": 1.408967811119795, + "grad_norm": 8.75, + "learning_rate": 2.203464806749735e-06, + "loss": 0.91587906, + "memory(GiB)": 302.58, + "step": 251940, + "train_speed(iter/s)": 0.123829 + }, + { + "acc": 0.74687939, + "epoch": 1.4090796605927742, + "grad_norm": 8.25, + "learning_rate": 2.2026983106175574e-06, + "loss": 1.01516867, + "memory(GiB)": 302.58, + "step": 251960, + "train_speed(iter/s)": 0.123834 + }, + { + "acc": 0.7434267, + "epoch": 1.4091915100657535, + "grad_norm": 8.75, + "learning_rate": 2.2019319101619683e-06, + "loss": 1.01489058, + "memory(GiB)": 302.58, + "step": 251980, + "train_speed(iter/s)": 0.123838 + }, + { + "acc": 0.75329933, + "epoch": 1.4093033595387328, + "grad_norm": 6.0, + "learning_rate": 2.2011656054091803e-06, + "loss": 0.97883863, + "memory(GiB)": 302.58, + "step": 252000, + "train_speed(iter/s)": 0.123843 + }, + { + "epoch": 1.4093033595387328, + "eval_acc": 0.7068472940353157, + "eval_loss": 1.0121749639511108, + "eval_runtime": 7532.1562, + "eval_samples_per_second": 9.995, + "eval_steps_per_second": 9.995, + "step": 252000 + }, + { + "acc": 0.7579958, + "epoch": 1.409415209011712, + "grad_norm": 10.1875, + "learning_rate": 2.2003993963854026e-06, + "loss": 0.94302225, + "memory(GiB)": 302.58, + "step": 252020, + "train_speed(iter/s)": 0.123382 + }, + { + "acc": 0.75153422, + "epoch": 1.4095270584846913, + "grad_norm": 7.15625, + "learning_rate": 2.1996332831168438e-06, + "loss": 0.97957592, + "memory(GiB)": 302.58, + "step": 252040, + "train_speed(iter/s)": 0.123387 + }, + { + "acc": 0.75738754, + "epoch": 1.4096389079576706, + "grad_norm": 9.375, + "learning_rate": 2.1988672656297063e-06, + "loss": 0.96426296, + "memory(GiB)": 302.58, + "step": 252060, + "train_speed(iter/s)": 0.123391 + }, + { + "acc": 0.73426828, + "epoch": 1.4097507574306498, + "grad_norm": 8.0625, + "learning_rate": 2.198101343950188e-06, + "loss": 1.0458252, + "memory(GiB)": 302.58, + "step": 252080, + "train_speed(iter/s)": 0.123396 + }, + { + "acc": 0.74107394, + "epoch": 1.409862606903629, + "grad_norm": 6.875, + "learning_rate": 2.1973355181044904e-06, + "loss": 1.03375711, + "memory(GiB)": 302.58, + "step": 252100, + "train_speed(iter/s)": 0.1234 + }, + { + "acc": 0.72995148, + "epoch": 1.4099744563766083, + "grad_norm": 7.25, + "learning_rate": 2.1965697881188052e-06, + "loss": 1.0683341, + "memory(GiB)": 302.58, + "step": 252120, + "train_speed(iter/s)": 0.123405 + }, + { + "acc": 0.74810281, + "epoch": 1.4100863058495876, + "grad_norm": 7.65625, + "learning_rate": 2.1958041540193228e-06, + "loss": 1.00027847, + "memory(GiB)": 302.58, + "step": 252140, + "train_speed(iter/s)": 0.123409 + }, + { + "acc": 0.74289875, + "epoch": 1.4101981553225669, + "grad_norm": 4.875, + "learning_rate": 2.19503861583223e-06, + "loss": 1.00236168, + "memory(GiB)": 302.58, + "step": 252160, + "train_speed(iter/s)": 0.123413 + }, + { + "acc": 0.75330148, + "epoch": 1.4103100047955461, + "grad_norm": 6.46875, + "learning_rate": 2.194273173583711e-06, + "loss": 0.95904074, + "memory(GiB)": 302.58, + "step": 252180, + "train_speed(iter/s)": 0.123418 + }, + { + "acc": 0.73536258, + "epoch": 1.4104218542685254, + "grad_norm": 6.875, + "learning_rate": 2.193507827299946e-06, + "loss": 1.03839865, + "memory(GiB)": 302.58, + "step": 252200, + "train_speed(iter/s)": 0.123423 + }, + { + "acc": 0.74415207, + "epoch": 1.4105337037415047, + "grad_norm": 4.78125, + "learning_rate": 2.1927425770071125e-06, + "loss": 1.00683146, + "memory(GiB)": 302.58, + "step": 252220, + "train_speed(iter/s)": 0.123428 + }, + { + "acc": 0.74482784, + "epoch": 1.410645553214484, + "grad_norm": 6.46875, + "learning_rate": 2.1919774227313846e-06, + "loss": 1.01945496, + "memory(GiB)": 302.58, + "step": 252240, + "train_speed(iter/s)": 0.123432 + }, + { + "acc": 0.74696536, + "epoch": 1.4107574026874632, + "grad_norm": 8.125, + "learning_rate": 2.1912123644989306e-06, + "loss": 0.98221598, + "memory(GiB)": 302.58, + "step": 252260, + "train_speed(iter/s)": 0.123437 + }, + { + "acc": 0.7437829, + "epoch": 1.4108692521604425, + "grad_norm": 9.9375, + "learning_rate": 2.1904474023359223e-06, + "loss": 1.01372986, + "memory(GiB)": 302.58, + "step": 252280, + "train_speed(iter/s)": 0.123442 + }, + { + "acc": 0.75195765, + "epoch": 1.4109811016334217, + "grad_norm": 8.0, + "learning_rate": 2.189682536268522e-06, + "loss": 0.96377621, + "memory(GiB)": 302.58, + "step": 252300, + "train_speed(iter/s)": 0.123446 + }, + { + "acc": 0.73777976, + "epoch": 1.411092951106401, + "grad_norm": 7.4375, + "learning_rate": 2.18891776632289e-06, + "loss": 1.04129591, + "memory(GiB)": 302.58, + "step": 252320, + "train_speed(iter/s)": 0.123451 + }, + { + "acc": 0.7669416, + "epoch": 1.4112048005793802, + "grad_norm": 10.0, + "learning_rate": 2.188153092525184e-06, + "loss": 0.91266508, + "memory(GiB)": 302.58, + "step": 252340, + "train_speed(iter/s)": 0.123456 + }, + { + "acc": 0.76637034, + "epoch": 1.4113166500523595, + "grad_norm": 6.375, + "learning_rate": 2.1873885149015582e-06, + "loss": 0.91271105, + "memory(GiB)": 302.58, + "step": 252360, + "train_speed(iter/s)": 0.12346 + }, + { + "acc": 0.75107422, + "epoch": 1.4114284995253388, + "grad_norm": 8.1875, + "learning_rate": 2.1866240334781643e-06, + "loss": 0.97041931, + "memory(GiB)": 302.58, + "step": 252380, + "train_speed(iter/s)": 0.123465 + }, + { + "acc": 0.75314221, + "epoch": 1.411540348998318, + "grad_norm": 5.3125, + "learning_rate": 2.1858596482811483e-06, + "loss": 0.97851248, + "memory(GiB)": 302.58, + "step": 252400, + "train_speed(iter/s)": 0.12347 + }, + { + "acc": 0.74500537, + "epoch": 1.4116521984712973, + "grad_norm": 6.90625, + "learning_rate": 2.185095359336657e-06, + "loss": 0.99662981, + "memory(GiB)": 302.58, + "step": 252420, + "train_speed(iter/s)": 0.123474 + }, + { + "acc": 0.73779669, + "epoch": 1.4117640479442766, + "grad_norm": 10.5, + "learning_rate": 2.1843311666708295e-06, + "loss": 1.03766146, + "memory(GiB)": 302.58, + "step": 252440, + "train_speed(iter/s)": 0.123479 + }, + { + "acc": 0.7497983, + "epoch": 1.4118758974172558, + "grad_norm": 8.6875, + "learning_rate": 2.1835670703098043e-06, + "loss": 0.98545246, + "memory(GiB)": 302.58, + "step": 252460, + "train_speed(iter/s)": 0.123484 + }, + { + "acc": 0.75097866, + "epoch": 1.411987746890235, + "grad_norm": 8.6875, + "learning_rate": 2.1828030702797165e-06, + "loss": 0.973736, + "memory(GiB)": 302.58, + "step": 252480, + "train_speed(iter/s)": 0.123489 + }, + { + "acc": 0.76255774, + "epoch": 1.4120995963632144, + "grad_norm": 6.84375, + "learning_rate": 2.1820391666066963e-06, + "loss": 0.92464733, + "memory(GiB)": 302.58, + "step": 252500, + "train_speed(iter/s)": 0.123493 + }, + { + "acc": 0.75297146, + "epoch": 1.4122114458361936, + "grad_norm": 7.4375, + "learning_rate": 2.1812753593168723e-06, + "loss": 0.96976557, + "memory(GiB)": 302.58, + "step": 252520, + "train_speed(iter/s)": 0.123498 + }, + { + "acc": 0.75011101, + "epoch": 1.4123232953091729, + "grad_norm": 7.5625, + "learning_rate": 2.180511648436367e-06, + "loss": 0.97896423, + "memory(GiB)": 302.58, + "step": 252540, + "train_speed(iter/s)": 0.123502 + }, + { + "acc": 0.73235865, + "epoch": 1.4124351447821522, + "grad_norm": 6.59375, + "learning_rate": 2.1797480339913057e-06, + "loss": 1.06596317, + "memory(GiB)": 302.58, + "step": 252560, + "train_speed(iter/s)": 0.123507 + }, + { + "acc": 0.77016821, + "epoch": 1.4125469942551314, + "grad_norm": 9.5625, + "learning_rate": 2.1789845160078045e-06, + "loss": 0.91259384, + "memory(GiB)": 302.58, + "step": 252580, + "train_speed(iter/s)": 0.123511 + }, + { + "acc": 0.75466433, + "epoch": 1.4126588437281107, + "grad_norm": 6.25, + "learning_rate": 2.1782210945119774e-06, + "loss": 0.961273, + "memory(GiB)": 302.58, + "step": 252600, + "train_speed(iter/s)": 0.123516 + }, + { + "acc": 0.77249837, + "epoch": 1.41277069320109, + "grad_norm": 4.53125, + "learning_rate": 2.177457769529937e-06, + "loss": 0.88706141, + "memory(GiB)": 302.58, + "step": 252620, + "train_speed(iter/s)": 0.12352 + }, + { + "acc": 0.74059868, + "epoch": 1.4128825426740692, + "grad_norm": 7.15625, + "learning_rate": 2.176694541087791e-06, + "loss": 1.02779541, + "memory(GiB)": 302.58, + "step": 252640, + "train_speed(iter/s)": 0.123525 + }, + { + "acc": 0.75990143, + "epoch": 1.4129943921470485, + "grad_norm": 5.65625, + "learning_rate": 2.175931409211644e-06, + "loss": 0.93912725, + "memory(GiB)": 302.58, + "step": 252660, + "train_speed(iter/s)": 0.12353 + }, + { + "acc": 0.73174396, + "epoch": 1.4131062416200277, + "grad_norm": 7.0, + "learning_rate": 2.175168373927598e-06, + "loss": 1.03676786, + "memory(GiB)": 302.58, + "step": 252680, + "train_speed(iter/s)": 0.123534 + }, + { + "acc": 0.74448905, + "epoch": 1.413218091093007, + "grad_norm": 5.875, + "learning_rate": 2.1744054352617503e-06, + "loss": 1.03209352, + "memory(GiB)": 302.58, + "step": 252700, + "train_speed(iter/s)": 0.123539 + }, + { + "acc": 0.73796544, + "epoch": 1.4133299405659863, + "grad_norm": 9.4375, + "learning_rate": 2.173642593240197e-06, + "loss": 1.05686646, + "memory(GiB)": 302.58, + "step": 252720, + "train_speed(iter/s)": 0.123544 + }, + { + "acc": 0.74733081, + "epoch": 1.4134417900389655, + "grad_norm": 6.34375, + "learning_rate": 2.172879847889029e-06, + "loss": 1.00418186, + "memory(GiB)": 302.58, + "step": 252740, + "train_speed(iter/s)": 0.123548 + }, + { + "acc": 0.75790091, + "epoch": 1.4135536395119448, + "grad_norm": 6.21875, + "learning_rate": 2.172117199234335e-06, + "loss": 0.92922096, + "memory(GiB)": 302.58, + "step": 252760, + "train_speed(iter/s)": 0.123553 + }, + { + "acc": 0.73799338, + "epoch": 1.413665488984924, + "grad_norm": 7.0625, + "learning_rate": 2.171354647302199e-06, + "loss": 1.04037247, + "memory(GiB)": 302.58, + "step": 252780, + "train_speed(iter/s)": 0.123557 + }, + { + "acc": 0.75809731, + "epoch": 1.4137773384579033, + "grad_norm": 8.0, + "learning_rate": 2.170592192118704e-06, + "loss": 0.957551, + "memory(GiB)": 302.58, + "step": 252800, + "train_speed(iter/s)": 0.123562 + }, + { + "acc": 0.74435358, + "epoch": 1.4138891879308826, + "grad_norm": 6.59375, + "learning_rate": 2.169829833709926e-06, + "loss": 0.99148951, + "memory(GiB)": 302.58, + "step": 252820, + "train_speed(iter/s)": 0.123566 + }, + { + "acc": 0.75253267, + "epoch": 1.4140010374038618, + "grad_norm": 8.375, + "learning_rate": 2.169067572101943e-06, + "loss": 0.97227802, + "memory(GiB)": 302.58, + "step": 252840, + "train_speed(iter/s)": 0.123571 + }, + { + "acc": 0.73620572, + "epoch": 1.4141128868768411, + "grad_norm": 7.5625, + "learning_rate": 2.168305407320827e-06, + "loss": 1.04545183, + "memory(GiB)": 302.58, + "step": 252860, + "train_speed(iter/s)": 0.123576 + }, + { + "acc": 0.76096754, + "epoch": 1.4142247363498204, + "grad_norm": 7.96875, + "learning_rate": 2.1675433393926444e-06, + "loss": 0.92354536, + "memory(GiB)": 302.58, + "step": 252880, + "train_speed(iter/s)": 0.123581 + }, + { + "acc": 0.75022025, + "epoch": 1.4143365858227996, + "grad_norm": 4.90625, + "learning_rate": 2.1667813683434608e-06, + "loss": 0.98188934, + "memory(GiB)": 302.58, + "step": 252900, + "train_speed(iter/s)": 0.123585 + }, + { + "acc": 0.751227, + "epoch": 1.414448435295779, + "grad_norm": 6.375, + "learning_rate": 2.166019494199339e-06, + "loss": 0.99673347, + "memory(GiB)": 302.58, + "step": 252920, + "train_speed(iter/s)": 0.12359 + }, + { + "acc": 0.76588154, + "epoch": 1.4145602847687582, + "grad_norm": 7.625, + "learning_rate": 2.1652577169863366e-06, + "loss": 0.90458813, + "memory(GiB)": 302.58, + "step": 252940, + "train_speed(iter/s)": 0.123595 + }, + { + "acc": 0.75046954, + "epoch": 1.4146721342417374, + "grad_norm": 7.1875, + "learning_rate": 2.1644960367305085e-06, + "loss": 1.01197958, + "memory(GiB)": 302.58, + "step": 252960, + "train_speed(iter/s)": 0.123599 + }, + { + "acc": 0.74072208, + "epoch": 1.4147839837147167, + "grad_norm": 6.6875, + "learning_rate": 2.1637344534579074e-06, + "loss": 1.01470766, + "memory(GiB)": 302.58, + "step": 252980, + "train_speed(iter/s)": 0.123603 + }, + { + "acc": 0.73609929, + "epoch": 1.414895833187696, + "grad_norm": 9.0, + "learning_rate": 2.1629729671945814e-06, + "loss": 1.04028492, + "memory(GiB)": 302.58, + "step": 253000, + "train_speed(iter/s)": 0.123608 + }, + { + "acc": 0.75798922, + "epoch": 1.4150076826606752, + "grad_norm": 7.4375, + "learning_rate": 2.1622115779665758e-06, + "loss": 0.94832182, + "memory(GiB)": 302.58, + "step": 253020, + "train_speed(iter/s)": 0.123612 + }, + { + "acc": 0.77389083, + "epoch": 1.4151195321336545, + "grad_norm": 9.375, + "learning_rate": 2.161450285799933e-06, + "loss": 0.86347198, + "memory(GiB)": 302.58, + "step": 253040, + "train_speed(iter/s)": 0.123617 + }, + { + "acc": 0.74807091, + "epoch": 1.4152313816066338, + "grad_norm": 4.90625, + "learning_rate": 2.160689090720691e-06, + "loss": 1.01925907, + "memory(GiB)": 302.58, + "step": 253060, + "train_speed(iter/s)": 0.123622 + }, + { + "acc": 0.75423722, + "epoch": 1.415343231079613, + "grad_norm": 9.3125, + "learning_rate": 2.159927992754884e-06, + "loss": 0.97901459, + "memory(GiB)": 302.58, + "step": 253080, + "train_speed(iter/s)": 0.123626 + }, + { + "acc": 0.74532375, + "epoch": 1.4154550805525923, + "grad_norm": 4.96875, + "learning_rate": 2.159166991928546e-06, + "loss": 1.02178202, + "memory(GiB)": 302.58, + "step": 253100, + "train_speed(iter/s)": 0.123631 + }, + { + "acc": 0.75489078, + "epoch": 1.4155669300255715, + "grad_norm": 5.625, + "learning_rate": 2.158406088267706e-06, + "loss": 0.96336622, + "memory(GiB)": 302.58, + "step": 253120, + "train_speed(iter/s)": 0.123636 + }, + { + "acc": 0.75259476, + "epoch": 1.4156787794985508, + "grad_norm": 6.84375, + "learning_rate": 2.157645281798388e-06, + "loss": 0.97301435, + "memory(GiB)": 302.58, + "step": 253140, + "train_speed(iter/s)": 0.12364 + }, + { + "acc": 0.74866538, + "epoch": 1.41579062897153, + "grad_norm": 5.34375, + "learning_rate": 2.1568845725466136e-06, + "loss": 0.98616085, + "memory(GiB)": 302.58, + "step": 253160, + "train_speed(iter/s)": 0.123644 + }, + { + "acc": 0.76755261, + "epoch": 1.4159024784445093, + "grad_norm": 7.53125, + "learning_rate": 2.1561239605384026e-06, + "loss": 0.91064577, + "memory(GiB)": 302.58, + "step": 253180, + "train_speed(iter/s)": 0.123649 + }, + { + "acc": 0.73137097, + "epoch": 1.4160143279174886, + "grad_norm": 8.0, + "learning_rate": 2.15536344579977e-06, + "loss": 1.06118298, + "memory(GiB)": 302.58, + "step": 253200, + "train_speed(iter/s)": 0.123654 + }, + { + "acc": 0.76057148, + "epoch": 1.4161261773904679, + "grad_norm": 7.125, + "learning_rate": 2.1546030283567276e-06, + "loss": 0.91964188, + "memory(GiB)": 302.58, + "step": 253220, + "train_speed(iter/s)": 0.123659 + }, + { + "acc": 0.75766234, + "epoch": 1.4162380268634471, + "grad_norm": 5.9375, + "learning_rate": 2.1538427082352837e-06, + "loss": 0.9589859, + "memory(GiB)": 302.58, + "step": 253240, + "train_speed(iter/s)": 0.123663 + }, + { + "acc": 0.73906732, + "epoch": 1.4163498763364264, + "grad_norm": 8.3125, + "learning_rate": 2.1530824854614446e-06, + "loss": 1.02520428, + "memory(GiB)": 302.58, + "step": 253260, + "train_speed(iter/s)": 0.123668 + }, + { + "acc": 0.73423338, + "epoch": 1.4164617258094057, + "grad_norm": 6.375, + "learning_rate": 2.1523223600612116e-06, + "loss": 1.05930443, + "memory(GiB)": 302.58, + "step": 253280, + "train_speed(iter/s)": 0.123672 + }, + { + "acc": 0.73133421, + "epoch": 1.416573575282385, + "grad_norm": 7.9375, + "learning_rate": 2.151562332060583e-06, + "loss": 1.06223097, + "memory(GiB)": 302.58, + "step": 253300, + "train_speed(iter/s)": 0.123677 + }, + { + "acc": 0.74842563, + "epoch": 1.4166854247553642, + "grad_norm": 7.25, + "learning_rate": 2.150802401485554e-06, + "loss": 1.00395479, + "memory(GiB)": 302.58, + "step": 253320, + "train_speed(iter/s)": 0.123681 + }, + { + "acc": 0.73867197, + "epoch": 1.4167972742283435, + "grad_norm": 7.40625, + "learning_rate": 2.150042568362119e-06, + "loss": 1.04838133, + "memory(GiB)": 302.58, + "step": 253340, + "train_speed(iter/s)": 0.123686 + }, + { + "acc": 0.75939536, + "epoch": 1.4169091237013227, + "grad_norm": 7.9375, + "learning_rate": 2.1492828327162645e-06, + "loss": 0.94567404, + "memory(GiB)": 302.58, + "step": 253360, + "train_speed(iter/s)": 0.123691 + }, + { + "acc": 0.75671296, + "epoch": 1.417020973174302, + "grad_norm": 8.5, + "learning_rate": 2.148523194573977e-06, + "loss": 0.96898813, + "memory(GiB)": 302.58, + "step": 253380, + "train_speed(iter/s)": 0.123695 + }, + { + "acc": 0.73977981, + "epoch": 1.4171328226472812, + "grad_norm": 7.46875, + "learning_rate": 2.1477636539612386e-06, + "loss": 1.02547407, + "memory(GiB)": 302.58, + "step": 253400, + "train_speed(iter/s)": 0.1237 + }, + { + "acc": 0.74962225, + "epoch": 1.4172446721202605, + "grad_norm": 7.09375, + "learning_rate": 2.1470042109040245e-06, + "loss": 1.00457954, + "memory(GiB)": 302.58, + "step": 253420, + "train_speed(iter/s)": 0.123705 + }, + { + "acc": 0.74657383, + "epoch": 1.4173565215932398, + "grad_norm": 8.5, + "learning_rate": 2.1462448654283152e-06, + "loss": 0.99570169, + "memory(GiB)": 302.58, + "step": 253440, + "train_speed(iter/s)": 0.123709 + }, + { + "acc": 0.75610428, + "epoch": 1.417468371066219, + "grad_norm": 6.375, + "learning_rate": 2.145485617560081e-06, + "loss": 0.94128895, + "memory(GiB)": 302.58, + "step": 253460, + "train_speed(iter/s)": 0.123714 + }, + { + "acc": 0.74058576, + "epoch": 1.4175802205391983, + "grad_norm": 6.59375, + "learning_rate": 2.1447264673252894e-06, + "loss": 1.01597347, + "memory(GiB)": 302.58, + "step": 253480, + "train_speed(iter/s)": 0.123718 + }, + { + "acc": 0.76424036, + "epoch": 1.4176920700121776, + "grad_norm": 6.65625, + "learning_rate": 2.143967414749907e-06, + "loss": 0.91246414, + "memory(GiB)": 302.58, + "step": 253500, + "train_speed(iter/s)": 0.123723 + }, + { + "acc": 0.74871244, + "epoch": 1.4178039194851568, + "grad_norm": 7.6875, + "learning_rate": 2.143208459859895e-06, + "loss": 0.99193439, + "memory(GiB)": 302.58, + "step": 253520, + "train_speed(iter/s)": 0.123728 + }, + { + "acc": 0.74869075, + "epoch": 1.417915768958136, + "grad_norm": 6.90625, + "learning_rate": 2.1424496026812124e-06, + "loss": 1.01347675, + "memory(GiB)": 302.58, + "step": 253540, + "train_speed(iter/s)": 0.123732 + }, + { + "acc": 0.76505642, + "epoch": 1.4180276184311154, + "grad_norm": 6.625, + "learning_rate": 2.141690843239814e-06, + "loss": 0.919382, + "memory(GiB)": 302.58, + "step": 253560, + "train_speed(iter/s)": 0.123736 + }, + { + "acc": 0.74889679, + "epoch": 1.4181394679040946, + "grad_norm": 6.0625, + "learning_rate": 2.140932181561653e-06, + "loss": 0.98297052, + "memory(GiB)": 302.58, + "step": 253580, + "train_speed(iter/s)": 0.123741 + }, + { + "acc": 0.75905423, + "epoch": 1.4182513173770739, + "grad_norm": 6.0, + "learning_rate": 2.140173617672677e-06, + "loss": 0.94233513, + "memory(GiB)": 302.58, + "step": 253600, + "train_speed(iter/s)": 0.123745 + }, + { + "acc": 0.76875958, + "epoch": 1.4183631668500531, + "grad_norm": 7.375, + "learning_rate": 2.139415151598831e-06, + "loss": 0.90709887, + "memory(GiB)": 302.58, + "step": 253620, + "train_speed(iter/s)": 0.12375 + }, + { + "acc": 0.75597706, + "epoch": 1.4184750163230324, + "grad_norm": 7.25, + "learning_rate": 2.1386567833660576e-06, + "loss": 0.9568985, + "memory(GiB)": 302.58, + "step": 253640, + "train_speed(iter/s)": 0.123754 + }, + { + "acc": 0.7433404, + "epoch": 1.4185868657960117, + "grad_norm": 5.78125, + "learning_rate": 2.137898513000296e-06, + "loss": 1.02448273, + "memory(GiB)": 302.58, + "step": 253660, + "train_speed(iter/s)": 0.123758 + }, + { + "acc": 0.74990172, + "epoch": 1.418698715268991, + "grad_norm": 9.375, + "learning_rate": 2.1371403405274784e-06, + "loss": 0.98433886, + "memory(GiB)": 302.58, + "step": 253680, + "train_speed(iter/s)": 0.123763 + }, + { + "acc": 0.74545951, + "epoch": 1.4188105647419702, + "grad_norm": 5.71875, + "learning_rate": 2.1363822659735406e-06, + "loss": 1.00330048, + "memory(GiB)": 302.58, + "step": 253700, + "train_speed(iter/s)": 0.123767 + }, + { + "acc": 0.73753734, + "epoch": 1.4189224142149495, + "grad_norm": 7.875, + "learning_rate": 2.1356242893644104e-06, + "loss": 1.02783985, + "memory(GiB)": 302.58, + "step": 253720, + "train_speed(iter/s)": 0.123771 + }, + { + "acc": 0.75011382, + "epoch": 1.4190342636879287, + "grad_norm": 5.125, + "learning_rate": 2.1348664107260114e-06, + "loss": 0.96195736, + "memory(GiB)": 302.58, + "step": 253740, + "train_speed(iter/s)": 0.123776 + }, + { + "acc": 0.74429479, + "epoch": 1.419146113160908, + "grad_norm": 9.5, + "learning_rate": 2.134108630084267e-06, + "loss": 1.00767918, + "memory(GiB)": 302.58, + "step": 253760, + "train_speed(iter/s)": 0.12378 + }, + { + "acc": 0.75753284, + "epoch": 1.4192579626338873, + "grad_norm": 7.78125, + "learning_rate": 2.133350947465094e-06, + "loss": 0.96256218, + "memory(GiB)": 302.58, + "step": 253780, + "train_speed(iter/s)": 0.123785 + }, + { + "acc": 0.76975374, + "epoch": 1.4193698121068665, + "grad_norm": 6.90625, + "learning_rate": 2.132593362894409e-06, + "loss": 0.8886343, + "memory(GiB)": 302.58, + "step": 253800, + "train_speed(iter/s)": 0.12379 + }, + { + "acc": 0.75480437, + "epoch": 1.4194816615798458, + "grad_norm": 5.84375, + "learning_rate": 2.1318358763981235e-06, + "loss": 0.95686111, + "memory(GiB)": 302.58, + "step": 253820, + "train_speed(iter/s)": 0.123794 + }, + { + "acc": 0.76149521, + "epoch": 1.419593511052825, + "grad_norm": 7.03125, + "learning_rate": 2.1310784880021452e-06, + "loss": 0.94407616, + "memory(GiB)": 302.58, + "step": 253840, + "train_speed(iter/s)": 0.123799 + }, + { + "acc": 0.75377126, + "epoch": 1.4197053605258043, + "grad_norm": 4.59375, + "learning_rate": 2.13032119773238e-06, + "loss": 0.97303467, + "memory(GiB)": 302.58, + "step": 253860, + "train_speed(iter/s)": 0.123804 + }, + { + "acc": 0.75062289, + "epoch": 1.4198172099987836, + "grad_norm": 6.59375, + "learning_rate": 2.129564005614729e-06, + "loss": 0.98565741, + "memory(GiB)": 302.58, + "step": 253880, + "train_speed(iter/s)": 0.123808 + }, + { + "acc": 0.73132257, + "epoch": 1.4199290594717628, + "grad_norm": 9.1875, + "learning_rate": 2.1288069116750916e-06, + "loss": 1.07180357, + "memory(GiB)": 302.58, + "step": 253900, + "train_speed(iter/s)": 0.123812 + }, + { + "acc": 0.749858, + "epoch": 1.420040908944742, + "grad_norm": 10.1875, + "learning_rate": 2.1280499159393618e-06, + "loss": 0.98716898, + "memory(GiB)": 302.58, + "step": 253920, + "train_speed(iter/s)": 0.123817 + }, + { + "acc": 0.75773711, + "epoch": 1.4201527584177214, + "grad_norm": 7.09375, + "learning_rate": 2.1272930184334317e-06, + "loss": 0.95506601, + "memory(GiB)": 302.58, + "step": 253940, + "train_speed(iter/s)": 0.123821 + }, + { + "acc": 0.73855958, + "epoch": 1.4202646078907006, + "grad_norm": 9.0625, + "learning_rate": 2.1265362191831875e-06, + "loss": 1.0142045, + "memory(GiB)": 302.58, + "step": 253960, + "train_speed(iter/s)": 0.123826 + }, + { + "acc": 0.74853611, + "epoch": 1.42037645736368, + "grad_norm": 6.9375, + "learning_rate": 2.125779518214518e-06, + "loss": 0.9692049, + "memory(GiB)": 302.58, + "step": 253980, + "train_speed(iter/s)": 0.123831 + }, + { + "acc": 0.74581666, + "epoch": 1.4204883068366592, + "grad_norm": 6.8125, + "learning_rate": 2.1250229155533033e-06, + "loss": 0.98808031, + "memory(GiB)": 302.58, + "step": 254000, + "train_speed(iter/s)": 0.123835 + }, + { + "epoch": 1.4204883068366592, + "eval_acc": 0.7068449770863607, + "eval_loss": 1.0120339393615723, + "eval_runtime": 7513.8835, + "eval_samples_per_second": 10.019, + "eval_steps_per_second": 10.019, + "step": 254000 + }, + { + "acc": 0.74230971, + "epoch": 1.4206001563096384, + "grad_norm": 4.6875, + "learning_rate": 2.1242664112254204e-06, + "loss": 1.01244164, + "memory(GiB)": 302.58, + "step": 254020, + "train_speed(iter/s)": 0.12338 + }, + { + "acc": 0.74971542, + "epoch": 1.4207120057826177, + "grad_norm": 7.0, + "learning_rate": 2.1235100052567453e-06, + "loss": 0.99028177, + "memory(GiB)": 302.58, + "step": 254040, + "train_speed(iter/s)": 0.123385 + }, + { + "acc": 0.74796572, + "epoch": 1.420823855255597, + "grad_norm": 7.15625, + "learning_rate": 2.122753697673149e-06, + "loss": 0.98438463, + "memory(GiB)": 302.58, + "step": 254060, + "train_speed(iter/s)": 0.123389 + }, + { + "acc": 0.76893454, + "epoch": 1.4209357047285762, + "grad_norm": 5.9375, + "learning_rate": 2.1219974885004994e-06, + "loss": 0.92024527, + "memory(GiB)": 302.58, + "step": 254080, + "train_speed(iter/s)": 0.123394 + }, + { + "acc": 0.74818859, + "epoch": 1.4210475542015555, + "grad_norm": 8.9375, + "learning_rate": 2.121241377764662e-06, + "loss": 0.98191729, + "memory(GiB)": 302.58, + "step": 254100, + "train_speed(iter/s)": 0.123398 + }, + { + "acc": 0.75493636, + "epoch": 1.4211594036745347, + "grad_norm": 8.375, + "learning_rate": 2.120485365491498e-06, + "loss": 0.97250652, + "memory(GiB)": 302.58, + "step": 254120, + "train_speed(iter/s)": 0.123403 + }, + { + "acc": 0.73871617, + "epoch": 1.421271253147514, + "grad_norm": 6.46875, + "learning_rate": 2.119729451706865e-06, + "loss": 1.03421526, + "memory(GiB)": 302.58, + "step": 254140, + "train_speed(iter/s)": 0.123408 + }, + { + "acc": 0.75526295, + "epoch": 1.4213831026204933, + "grad_norm": 8.5625, + "learning_rate": 2.118973636436618e-06, + "loss": 0.96619663, + "memory(GiB)": 302.58, + "step": 254160, + "train_speed(iter/s)": 0.123412 + }, + { + "acc": 0.74932323, + "epoch": 1.4214949520934725, + "grad_norm": 8.8125, + "learning_rate": 2.1182179197066082e-06, + "loss": 0.98073683, + "memory(GiB)": 302.58, + "step": 254180, + "train_speed(iter/s)": 0.123417 + }, + { + "acc": 0.75529599, + "epoch": 1.4216068015664518, + "grad_norm": 6.65625, + "learning_rate": 2.117462301542683e-06, + "loss": 0.93600779, + "memory(GiB)": 302.58, + "step": 254200, + "train_speed(iter/s)": 0.123422 + }, + { + "acc": 0.73705683, + "epoch": 1.421718651039431, + "grad_norm": 9.125, + "learning_rate": 2.116706781970688e-06, + "loss": 1.04738369, + "memory(GiB)": 302.58, + "step": 254220, + "train_speed(iter/s)": 0.123426 + }, + { + "acc": 0.75587883, + "epoch": 1.4218305005124103, + "grad_norm": 6.6875, + "learning_rate": 2.115951361016461e-06, + "loss": 0.96296415, + "memory(GiB)": 302.58, + "step": 254240, + "train_speed(iter/s)": 0.12343 + }, + { + "acc": 0.74118443, + "epoch": 1.4219423499853896, + "grad_norm": 8.1875, + "learning_rate": 2.115196038705845e-06, + "loss": 1.03416615, + "memory(GiB)": 302.58, + "step": 254260, + "train_speed(iter/s)": 0.123435 + }, + { + "acc": 0.75051818, + "epoch": 1.4220541994583689, + "grad_norm": 8.125, + "learning_rate": 2.1144408150646718e-06, + "loss": 0.98881474, + "memory(GiB)": 302.58, + "step": 254280, + "train_speed(iter/s)": 0.123439 + }, + { + "acc": 0.7581707, + "epoch": 1.4221660489313481, + "grad_norm": 9.8125, + "learning_rate": 2.1136856901187724e-06, + "loss": 0.92074947, + "memory(GiB)": 302.58, + "step": 254300, + "train_speed(iter/s)": 0.123444 + }, + { + "acc": 0.74304566, + "epoch": 1.4222778984043274, + "grad_norm": 7.25, + "learning_rate": 2.112930663893975e-06, + "loss": 0.99831991, + "memory(GiB)": 302.58, + "step": 254320, + "train_speed(iter/s)": 0.123449 + }, + { + "acc": 0.76076627, + "epoch": 1.4223897478773067, + "grad_norm": 5.1875, + "learning_rate": 2.112175736416103e-06, + "loss": 0.93351383, + "memory(GiB)": 302.58, + "step": 254340, + "train_speed(iter/s)": 0.123454 + }, + { + "acc": 0.73664908, + "epoch": 1.422501597350286, + "grad_norm": 7.5625, + "learning_rate": 2.1114209077109782e-06, + "loss": 1.03493996, + "memory(GiB)": 302.58, + "step": 254360, + "train_speed(iter/s)": 0.123458 + }, + { + "acc": 0.77517443, + "epoch": 1.4226134468232652, + "grad_norm": 12.875, + "learning_rate": 2.110666177804418e-06, + "loss": 0.86429501, + "memory(GiB)": 302.58, + "step": 254380, + "train_speed(iter/s)": 0.123463 + }, + { + "acc": 0.74537926, + "epoch": 1.4227252962962444, + "grad_norm": 4.96875, + "learning_rate": 2.109911546722236e-06, + "loss": 1.01045713, + "memory(GiB)": 302.58, + "step": 254400, + "train_speed(iter/s)": 0.123467 + }, + { + "acc": 0.7502069, + "epoch": 1.4228371457692237, + "grad_norm": 9.6875, + "learning_rate": 2.1091570144902434e-06, + "loss": 0.95876617, + "memory(GiB)": 302.58, + "step": 254420, + "train_speed(iter/s)": 0.123472 + }, + { + "acc": 0.73244286, + "epoch": 1.422948995242203, + "grad_norm": 7.96875, + "learning_rate": 2.108402581134248e-06, + "loss": 1.06726007, + "memory(GiB)": 302.58, + "step": 254440, + "train_speed(iter/s)": 0.123477 + }, + { + "acc": 0.74441748, + "epoch": 1.4230608447151822, + "grad_norm": 8.25, + "learning_rate": 2.1076482466800525e-06, + "loss": 0.99511366, + "memory(GiB)": 302.58, + "step": 254460, + "train_speed(iter/s)": 0.123482 + }, + { + "acc": 0.75173726, + "epoch": 1.4231726941881615, + "grad_norm": 6.0, + "learning_rate": 2.106894011153459e-06, + "loss": 0.98691216, + "memory(GiB)": 302.58, + "step": 254480, + "train_speed(iter/s)": 0.123486 + }, + { + "acc": 0.73493247, + "epoch": 1.4232845436611408, + "grad_norm": 7.90625, + "learning_rate": 2.106139874580262e-06, + "loss": 1.04549103, + "memory(GiB)": 302.58, + "step": 254500, + "train_speed(iter/s)": 0.123491 + }, + { + "acc": 0.72860074, + "epoch": 1.42339639313412, + "grad_norm": 6.28125, + "learning_rate": 2.1053858369862594e-06, + "loss": 1.03937654, + "memory(GiB)": 302.58, + "step": 254520, + "train_speed(iter/s)": 0.123495 + }, + { + "acc": 0.72360067, + "epoch": 1.4235082426070993, + "grad_norm": 4.96875, + "learning_rate": 2.10463189839724e-06, + "loss": 1.09932413, + "memory(GiB)": 302.58, + "step": 254540, + "train_speed(iter/s)": 0.1235 + }, + { + "acc": 0.74155345, + "epoch": 1.4236200920800786, + "grad_norm": 5.0, + "learning_rate": 2.10387805883899e-06, + "loss": 1.01582623, + "memory(GiB)": 302.58, + "step": 254560, + "train_speed(iter/s)": 0.123504 + }, + { + "acc": 0.73831596, + "epoch": 1.4237319415530578, + "grad_norm": 5.34375, + "learning_rate": 2.1031243183372925e-06, + "loss": 1.0577013, + "memory(GiB)": 302.58, + "step": 254580, + "train_speed(iter/s)": 0.123509 + }, + { + "acc": 0.75944467, + "epoch": 1.423843791026037, + "grad_norm": 4.46875, + "learning_rate": 2.1023706769179304e-06, + "loss": 0.96195469, + "memory(GiB)": 302.58, + "step": 254600, + "train_speed(iter/s)": 0.123514 + }, + { + "acc": 0.75858483, + "epoch": 1.4239556404990164, + "grad_norm": 9.875, + "learning_rate": 2.1016171346066793e-06, + "loss": 0.94789295, + "memory(GiB)": 302.58, + "step": 254620, + "train_speed(iter/s)": 0.123519 + }, + { + "acc": 0.73828883, + "epoch": 1.4240674899719956, + "grad_norm": 7.25, + "learning_rate": 2.1008636914293133e-06, + "loss": 1.00758657, + "memory(GiB)": 302.58, + "step": 254640, + "train_speed(iter/s)": 0.123523 + }, + { + "acc": 0.74391422, + "epoch": 1.4241793394449749, + "grad_norm": 8.625, + "learning_rate": 2.1001103474116012e-06, + "loss": 0.99806767, + "memory(GiB)": 302.58, + "step": 254660, + "train_speed(iter/s)": 0.123528 + }, + { + "acc": 0.76003513, + "epoch": 1.4242911889179541, + "grad_norm": 6.15625, + "learning_rate": 2.0993571025793115e-06, + "loss": 0.93932219, + "memory(GiB)": 302.58, + "step": 254680, + "train_speed(iter/s)": 0.123532 + }, + { + "acc": 0.78719287, + "epoch": 1.4244030383909334, + "grad_norm": 6.125, + "learning_rate": 2.0986039569582057e-06, + "loss": 0.81417322, + "memory(GiB)": 302.58, + "step": 254700, + "train_speed(iter/s)": 0.123537 + }, + { + "acc": 0.73517337, + "epoch": 1.4245148878639127, + "grad_norm": 6.21875, + "learning_rate": 2.097850910574045e-06, + "loss": 1.03325567, + "memory(GiB)": 302.58, + "step": 254720, + "train_speed(iter/s)": 0.123542 + }, + { + "acc": 0.77272263, + "epoch": 1.424626737336892, + "grad_norm": 7.09375, + "learning_rate": 2.0970979634525855e-06, + "loss": 0.87308092, + "memory(GiB)": 302.58, + "step": 254740, + "train_speed(iter/s)": 0.123546 + }, + { + "acc": 0.75325866, + "epoch": 1.4247385868098712, + "grad_norm": 7.15625, + "learning_rate": 2.096345115619581e-06, + "loss": 0.9756032, + "memory(GiB)": 302.58, + "step": 254760, + "train_speed(iter/s)": 0.123551 + }, + { + "acc": 0.7556828, + "epoch": 1.4248504362828505, + "grad_norm": 4.71875, + "learning_rate": 2.09559236710078e-06, + "loss": 0.96290379, + "memory(GiB)": 302.58, + "step": 254780, + "train_speed(iter/s)": 0.123555 + }, + { + "acc": 0.76833444, + "epoch": 1.4249622857558297, + "grad_norm": 7.34375, + "learning_rate": 2.0948397179219304e-06, + "loss": 0.89794989, + "memory(GiB)": 302.58, + "step": 254800, + "train_speed(iter/s)": 0.123559 + }, + { + "acc": 0.75769887, + "epoch": 1.425074135228809, + "grad_norm": 7.875, + "learning_rate": 2.094087168108774e-06, + "loss": 0.9387373, + "memory(GiB)": 302.58, + "step": 254820, + "train_speed(iter/s)": 0.123564 + }, + { + "acc": 0.75438194, + "epoch": 1.4251859847017883, + "grad_norm": 11.25, + "learning_rate": 2.0933347176870487e-06, + "loss": 0.98008156, + "memory(GiB)": 302.58, + "step": 254840, + "train_speed(iter/s)": 0.123568 + }, + { + "acc": 0.75784888, + "epoch": 1.4252978341747675, + "grad_norm": 9.125, + "learning_rate": 2.092582366682495e-06, + "loss": 0.93558025, + "memory(GiB)": 302.58, + "step": 254860, + "train_speed(iter/s)": 0.123573 + }, + { + "acc": 0.73345909, + "epoch": 1.4254096836477468, + "grad_norm": 9.6875, + "learning_rate": 2.091830115120844e-06, + "loss": 1.03682356, + "memory(GiB)": 302.58, + "step": 254880, + "train_speed(iter/s)": 0.123578 + }, + { + "acc": 0.76692715, + "epoch": 1.425521533120726, + "grad_norm": 12.1875, + "learning_rate": 2.0910779630278246e-06, + "loss": 0.91536541, + "memory(GiB)": 302.58, + "step": 254900, + "train_speed(iter/s)": 0.123582 + }, + { + "acc": 0.75248938, + "epoch": 1.4256333825937053, + "grad_norm": 6.65625, + "learning_rate": 2.0903259104291623e-06, + "loss": 0.96038437, + "memory(GiB)": 302.58, + "step": 254920, + "train_speed(iter/s)": 0.123587 + }, + { + "acc": 0.74097848, + "epoch": 1.4257452320666846, + "grad_norm": 7.34375, + "learning_rate": 2.0895739573505803e-06, + "loss": 1.01101685, + "memory(GiB)": 302.58, + "step": 254940, + "train_speed(iter/s)": 0.123591 + }, + { + "acc": 0.75355234, + "epoch": 1.4258570815396638, + "grad_norm": 7.78125, + "learning_rate": 2.0888221038177976e-06, + "loss": 0.94649372, + "memory(GiB)": 302.58, + "step": 254960, + "train_speed(iter/s)": 0.123596 + }, + { + "acc": 0.75351624, + "epoch": 1.425968931012643, + "grad_norm": 4.75, + "learning_rate": 2.08807034985653e-06, + "loss": 0.974541, + "memory(GiB)": 302.58, + "step": 254980, + "train_speed(iter/s)": 0.1236 + }, + { + "acc": 0.75365782, + "epoch": 1.4260807804856224, + "grad_norm": 7.125, + "learning_rate": 2.08731869549249e-06, + "loss": 0.96559534, + "memory(GiB)": 302.58, + "step": 255000, + "train_speed(iter/s)": 0.123605 + }, + { + "acc": 0.73765082, + "epoch": 1.4261926299586016, + "grad_norm": 5.5, + "learning_rate": 2.0865671407513867e-06, + "loss": 1.03785315, + "memory(GiB)": 302.58, + "step": 255020, + "train_speed(iter/s)": 0.123609 + }, + { + "acc": 0.76995392, + "epoch": 1.426304479431581, + "grad_norm": 9.125, + "learning_rate": 2.085815685658925e-06, + "loss": 0.89230089, + "memory(GiB)": 302.58, + "step": 255040, + "train_speed(iter/s)": 0.123614 + }, + { + "acc": 0.75592299, + "epoch": 1.4264163289045602, + "grad_norm": 6.1875, + "learning_rate": 2.085064330240808e-06, + "loss": 0.95741453, + "memory(GiB)": 302.58, + "step": 255060, + "train_speed(iter/s)": 0.123618 + }, + { + "acc": 0.73873119, + "epoch": 1.4265281783775394, + "grad_norm": 8.0, + "learning_rate": 2.0843130745227336e-06, + "loss": 1.04977512, + "memory(GiB)": 302.58, + "step": 255080, + "train_speed(iter/s)": 0.123622 + }, + { + "acc": 0.76396713, + "epoch": 1.4266400278505187, + "grad_norm": 6.25, + "learning_rate": 2.083561918530398e-06, + "loss": 0.91588078, + "memory(GiB)": 302.58, + "step": 255100, + "train_speed(iter/s)": 0.123627 + }, + { + "acc": 0.7322813, + "epoch": 1.4267518773234982, + "grad_norm": 5.78125, + "learning_rate": 2.08281086228949e-06, + "loss": 1.07702284, + "memory(GiB)": 302.58, + "step": 255120, + "train_speed(iter/s)": 0.123631 + }, + { + "acc": 0.76430144, + "epoch": 1.4268637267964772, + "grad_norm": 5.46875, + "learning_rate": 2.082059905825703e-06, + "loss": 0.91754036, + "memory(GiB)": 302.58, + "step": 255140, + "train_speed(iter/s)": 0.123636 + }, + { + "acc": 0.75332642, + "epoch": 1.4269755762694567, + "grad_norm": 6.96875, + "learning_rate": 2.081309049164719e-06, + "loss": 0.97912626, + "memory(GiB)": 302.58, + "step": 255160, + "train_speed(iter/s)": 0.12364 + }, + { + "acc": 0.76197538, + "epoch": 1.4270874257424357, + "grad_norm": 10.4375, + "learning_rate": 2.080558292332221e-06, + "loss": 0.92602081, + "memory(GiB)": 302.58, + "step": 255180, + "train_speed(iter/s)": 0.123645 + }, + { + "acc": 0.7273325, + "epoch": 1.4271992752154152, + "grad_norm": 5.71875, + "learning_rate": 2.0798076353538864e-06, + "loss": 1.0813446, + "memory(GiB)": 302.58, + "step": 255200, + "train_speed(iter/s)": 0.12365 + }, + { + "acc": 0.7511837, + "epoch": 1.4273111246883943, + "grad_norm": 8.75, + "learning_rate": 2.079057078255391e-06, + "loss": 0.96184206, + "memory(GiB)": 302.58, + "step": 255220, + "train_speed(iter/s)": 0.123654 + }, + { + "acc": 0.74412403, + "epoch": 1.4274229741613738, + "grad_norm": 8.4375, + "learning_rate": 2.0783066210624054e-06, + "loss": 1.02652197, + "memory(GiB)": 302.58, + "step": 255240, + "train_speed(iter/s)": 0.123659 + }, + { + "acc": 0.75352001, + "epoch": 1.4275348236343528, + "grad_norm": 5.40625, + "learning_rate": 2.0775562638005974e-06, + "loss": 0.97350979, + "memory(GiB)": 302.58, + "step": 255260, + "train_speed(iter/s)": 0.123663 + }, + { + "acc": 0.73318734, + "epoch": 1.4276466731073323, + "grad_norm": 7.96875, + "learning_rate": 2.0768060064956317e-06, + "loss": 1.06213999, + "memory(GiB)": 302.58, + "step": 255280, + "train_speed(iter/s)": 0.123668 + }, + { + "acc": 0.75320745, + "epoch": 1.4277585225803113, + "grad_norm": 8.625, + "learning_rate": 2.076055849173171e-06, + "loss": 0.96733093, + "memory(GiB)": 302.58, + "step": 255300, + "train_speed(iter/s)": 0.123672 + }, + { + "acc": 0.74808455, + "epoch": 1.4278703720532908, + "grad_norm": 7.40625, + "learning_rate": 2.075305791858871e-06, + "loss": 0.9900835, + "memory(GiB)": 302.58, + "step": 255320, + "train_speed(iter/s)": 0.123677 + }, + { + "acc": 0.76366434, + "epoch": 1.4279822215262699, + "grad_norm": 6.46875, + "learning_rate": 2.074555834578387e-06, + "loss": 0.91296368, + "memory(GiB)": 302.58, + "step": 255340, + "train_speed(iter/s)": 0.123682 + }, + { + "acc": 0.75251255, + "epoch": 1.4280940709992493, + "grad_norm": 9.75, + "learning_rate": 2.0738059773573695e-06, + "loss": 0.97350559, + "memory(GiB)": 302.58, + "step": 255360, + "train_speed(iter/s)": 0.123686 + }, + { + "acc": 0.72392879, + "epoch": 1.4282059204722284, + "grad_norm": 8.25, + "learning_rate": 2.0730562202214645e-06, + "loss": 1.13974829, + "memory(GiB)": 302.58, + "step": 255380, + "train_speed(iter/s)": 0.123691 + }, + { + "acc": 0.76578674, + "epoch": 1.4283177699452079, + "grad_norm": 5.84375, + "learning_rate": 2.07230656319632e-06, + "loss": 0.92357264, + "memory(GiB)": 302.58, + "step": 255400, + "train_speed(iter/s)": 0.123695 + }, + { + "acc": 0.74656177, + "epoch": 1.428429619418187, + "grad_norm": 6.1875, + "learning_rate": 2.0715570063075745e-06, + "loss": 1.02172508, + "memory(GiB)": 302.58, + "step": 255420, + "train_speed(iter/s)": 0.1237 + }, + { + "acc": 0.76412687, + "epoch": 1.4285414688911664, + "grad_norm": 8.8125, + "learning_rate": 2.070807549580865e-06, + "loss": 0.93287783, + "memory(GiB)": 302.58, + "step": 255440, + "train_speed(iter/s)": 0.123704 + }, + { + "acc": 0.7407166, + "epoch": 1.4286533183641454, + "grad_norm": 6.65625, + "learning_rate": 2.070058193041826e-06, + "loss": 1.03318739, + "memory(GiB)": 302.58, + "step": 255460, + "train_speed(iter/s)": 0.123709 + }, + { + "acc": 0.74435167, + "epoch": 1.428765167837125, + "grad_norm": 8.8125, + "learning_rate": 2.0693089367160868e-06, + "loss": 0.98934498, + "memory(GiB)": 302.58, + "step": 255480, + "train_speed(iter/s)": 0.123714 + }, + { + "acc": 0.76390271, + "epoch": 1.428877017310104, + "grad_norm": 6.71875, + "learning_rate": 2.068559780629275e-06, + "loss": 0.92148085, + "memory(GiB)": 302.58, + "step": 255500, + "train_speed(iter/s)": 0.123718 + }, + { + "acc": 0.74980721, + "epoch": 1.4289888667830835, + "grad_norm": 8.5, + "learning_rate": 2.067810724807014e-06, + "loss": 0.97106876, + "memory(GiB)": 302.58, + "step": 255520, + "train_speed(iter/s)": 0.123723 + }, + { + "acc": 0.7374752, + "epoch": 1.4291007162560625, + "grad_norm": 7.28125, + "learning_rate": 2.0670617692749247e-06, + "loss": 1.02976313, + "memory(GiB)": 302.58, + "step": 255540, + "train_speed(iter/s)": 0.123727 + }, + { + "acc": 0.74114861, + "epoch": 1.429212565729042, + "grad_norm": 7.625, + "learning_rate": 2.066312914058622e-06, + "loss": 1.04269285, + "memory(GiB)": 302.58, + "step": 255560, + "train_speed(iter/s)": 0.123732 + }, + { + "acc": 0.7468595, + "epoch": 1.429324415202021, + "grad_norm": 7.53125, + "learning_rate": 2.0655641591837205e-06, + "loss": 0.98962631, + "memory(GiB)": 302.58, + "step": 255580, + "train_speed(iter/s)": 0.123736 + }, + { + "acc": 0.75503864, + "epoch": 1.4294362646750005, + "grad_norm": 6.875, + "learning_rate": 2.0648155046758294e-06, + "loss": 0.97542801, + "memory(GiB)": 302.58, + "step": 255600, + "train_speed(iter/s)": 0.123741 + }, + { + "acc": 0.75124226, + "epoch": 1.4295481141479796, + "grad_norm": 6.96875, + "learning_rate": 2.064066950560556e-06, + "loss": 0.96087255, + "memory(GiB)": 302.58, + "step": 255620, + "train_speed(iter/s)": 0.123745 + }, + { + "acc": 0.75317464, + "epoch": 1.429659963620959, + "grad_norm": 6.1875, + "learning_rate": 2.0633184968635018e-06, + "loss": 0.96404886, + "memory(GiB)": 302.58, + "step": 255640, + "train_speed(iter/s)": 0.12375 + }, + { + "acc": 0.76234341, + "epoch": 1.429771813093938, + "grad_norm": 7.53125, + "learning_rate": 2.0625701436102654e-06, + "loss": 0.93467503, + "memory(GiB)": 302.58, + "step": 255660, + "train_speed(iter/s)": 0.123754 + }, + { + "acc": 0.74242435, + "epoch": 1.4298836625669176, + "grad_norm": 5.5625, + "learning_rate": 2.0618218908264457e-06, + "loss": 1.01203413, + "memory(GiB)": 302.58, + "step": 255680, + "train_speed(iter/s)": 0.123759 + }, + { + "acc": 0.75622563, + "epoch": 1.4299955120398966, + "grad_norm": 11.4375, + "learning_rate": 2.061073738537635e-06, + "loss": 0.95862617, + "memory(GiB)": 302.58, + "step": 255700, + "train_speed(iter/s)": 0.123763 + }, + { + "acc": 0.75001411, + "epoch": 1.430107361512876, + "grad_norm": 7.5625, + "learning_rate": 2.0603256867694212e-06, + "loss": 0.99151459, + "memory(GiB)": 302.58, + "step": 255720, + "train_speed(iter/s)": 0.123768 + }, + { + "acc": 0.75411663, + "epoch": 1.4302192109858551, + "grad_norm": 11.125, + "learning_rate": 2.05957773554739e-06, + "loss": 0.99144831, + "memory(GiB)": 302.58, + "step": 255740, + "train_speed(iter/s)": 0.123772 + }, + { + "acc": 0.75674877, + "epoch": 1.4303310604588346, + "grad_norm": 7.125, + "learning_rate": 2.0588298848971244e-06, + "loss": 0.94823732, + "memory(GiB)": 302.58, + "step": 255760, + "train_speed(iter/s)": 0.123777 + }, + { + "acc": 0.74666066, + "epoch": 1.4304429099318137, + "grad_norm": 13.875, + "learning_rate": 2.0580821348442026e-06, + "loss": 0.99604349, + "memory(GiB)": 302.58, + "step": 255780, + "train_speed(iter/s)": 0.123781 + }, + { + "acc": 0.73175817, + "epoch": 1.4305547594047932, + "grad_norm": 7.875, + "learning_rate": 2.0573344854142006e-06, + "loss": 1.08827543, + "memory(GiB)": 302.58, + "step": 255800, + "train_speed(iter/s)": 0.123786 + }, + { + "acc": 0.75003562, + "epoch": 1.4306666088777722, + "grad_norm": 7.1875, + "learning_rate": 2.05658693663269e-06, + "loss": 0.97781658, + "memory(GiB)": 302.58, + "step": 255820, + "train_speed(iter/s)": 0.12379 + }, + { + "acc": 0.75983286, + "epoch": 1.4307784583507517, + "grad_norm": 4.28125, + "learning_rate": 2.055839488525238e-06, + "loss": 0.93787413, + "memory(GiB)": 302.58, + "step": 255840, + "train_speed(iter/s)": 0.123795 + }, + { + "acc": 0.75166812, + "epoch": 1.4308903078237307, + "grad_norm": 6.65625, + "learning_rate": 2.0550921411174125e-06, + "loss": 1.00000296, + "memory(GiB)": 302.58, + "step": 255860, + "train_speed(iter/s)": 0.123799 + }, + { + "acc": 0.75797853, + "epoch": 1.4310021572967102, + "grad_norm": 7.46875, + "learning_rate": 2.0543448944347744e-06, + "loss": 0.9718318, + "memory(GiB)": 302.58, + "step": 255880, + "train_speed(iter/s)": 0.123804 + }, + { + "acc": 0.74873857, + "epoch": 1.4311140067696893, + "grad_norm": 6.375, + "learning_rate": 2.053597748502881e-06, + "loss": 0.96581335, + "memory(GiB)": 302.58, + "step": 255900, + "train_speed(iter/s)": 0.123808 + }, + { + "acc": 0.74205375, + "epoch": 1.4312258562426687, + "grad_norm": 7.75, + "learning_rate": 2.052850703347287e-06, + "loss": 1.03522339, + "memory(GiB)": 302.58, + "step": 255920, + "train_speed(iter/s)": 0.123813 + }, + { + "acc": 0.74588804, + "epoch": 1.4313377057156478, + "grad_norm": 5.28125, + "learning_rate": 2.0521037589935443e-06, + "loss": 1.02000551, + "memory(GiB)": 302.58, + "step": 255940, + "train_speed(iter/s)": 0.123818 + }, + { + "acc": 0.77006106, + "epoch": 1.4314495551886273, + "grad_norm": 9.0625, + "learning_rate": 2.051356915467201e-06, + "loss": 0.90838079, + "memory(GiB)": 302.58, + "step": 255960, + "train_speed(iter/s)": 0.123822 + }, + { + "acc": 0.74853067, + "epoch": 1.4315614046616063, + "grad_norm": 6.875, + "learning_rate": 2.050610172793801e-06, + "loss": 0.97417717, + "memory(GiB)": 302.58, + "step": 255980, + "train_speed(iter/s)": 0.123827 + }, + { + "acc": 0.75250144, + "epoch": 1.4316732541345858, + "grad_norm": 7.53125, + "learning_rate": 2.0498635309988836e-06, + "loss": 0.97595739, + "memory(GiB)": 302.58, + "step": 256000, + "train_speed(iter/s)": 0.123831 + }, + { + "epoch": 1.4316732541345858, + "eval_acc": 0.7068396530334432, + "eval_loss": 1.011948823928833, + "eval_runtime": 7564.2046, + "eval_samples_per_second": 9.953, + "eval_steps_per_second": 9.953, + "step": 256000 + }, + { + "acc": 0.73237157, + "epoch": 1.4317851036075648, + "grad_norm": 6.625, + "learning_rate": 2.0491169901079895e-06, + "loss": 1.07001219, + "memory(GiB)": 302.58, + "step": 256020, + "train_speed(iter/s)": 0.123377 + }, + { + "acc": 0.73842287, + "epoch": 1.4318969530805443, + "grad_norm": 7.6875, + "learning_rate": 2.0483705501466517e-06, + "loss": 1.02862921, + "memory(GiB)": 302.58, + "step": 256040, + "train_speed(iter/s)": 0.123381 + }, + { + "acc": 0.74696608, + "epoch": 1.4320088025535234, + "grad_norm": 8.375, + "learning_rate": 2.0476242111404e-06, + "loss": 0.99580116, + "memory(GiB)": 302.58, + "step": 256060, + "train_speed(iter/s)": 0.123385 + }, + { + "acc": 0.74678593, + "epoch": 1.4321206520265028, + "grad_norm": 6.1875, + "learning_rate": 2.046877973114762e-06, + "loss": 0.9998868, + "memory(GiB)": 302.58, + "step": 256080, + "train_speed(iter/s)": 0.12339 + }, + { + "acc": 0.75985804, + "epoch": 1.432232501499482, + "grad_norm": 7.78125, + "learning_rate": 2.046131836095262e-06, + "loss": 0.94417725, + "memory(GiB)": 302.58, + "step": 256100, + "train_speed(iter/s)": 0.123394 + }, + { + "acc": 0.75675211, + "epoch": 1.4323443509724614, + "grad_norm": 6.46875, + "learning_rate": 2.0453858001074195e-06, + "loss": 0.95597267, + "memory(GiB)": 302.58, + "step": 256120, + "train_speed(iter/s)": 0.123399 + }, + { + "acc": 0.75724978, + "epoch": 1.4324562004454404, + "grad_norm": 7.59375, + "learning_rate": 2.044639865176751e-06, + "loss": 0.94778233, + "memory(GiB)": 302.58, + "step": 256140, + "train_speed(iter/s)": 0.123403 + }, + { + "acc": 0.74202414, + "epoch": 1.43256804991842, + "grad_norm": 9.125, + "learning_rate": 2.043894031328771e-06, + "loss": 1.00999727, + "memory(GiB)": 302.58, + "step": 256160, + "train_speed(iter/s)": 0.123408 + }, + { + "acc": 0.76751957, + "epoch": 1.432679899391399, + "grad_norm": 10.6875, + "learning_rate": 2.043148298588988e-06, + "loss": 0.92077475, + "memory(GiB)": 302.58, + "step": 256180, + "train_speed(iter/s)": 0.123413 + }, + { + "acc": 0.75449939, + "epoch": 1.4327917488643784, + "grad_norm": 7.34375, + "learning_rate": 2.0424026669829096e-06, + "loss": 0.96232271, + "memory(GiB)": 302.58, + "step": 256200, + "train_speed(iter/s)": 0.123418 + }, + { + "acc": 0.75967193, + "epoch": 1.4329035983373575, + "grad_norm": 7.4375, + "learning_rate": 2.041657136536038e-06, + "loss": 0.93996105, + "memory(GiB)": 302.58, + "step": 256220, + "train_speed(iter/s)": 0.123422 + }, + { + "acc": 0.74084635, + "epoch": 1.433015447810337, + "grad_norm": 4.96875, + "learning_rate": 2.0409117072738733e-06, + "loss": 1.0344327, + "memory(GiB)": 302.58, + "step": 256240, + "train_speed(iter/s)": 0.123427 + }, + { + "acc": 0.74813204, + "epoch": 1.433127297283316, + "grad_norm": 7.21875, + "learning_rate": 2.040166379221909e-06, + "loss": 1.01260042, + "memory(GiB)": 302.58, + "step": 256260, + "train_speed(iter/s)": 0.123431 + }, + { + "acc": 0.75267787, + "epoch": 1.4332391467562955, + "grad_norm": 6.28125, + "learning_rate": 2.039421152405642e-06, + "loss": 0.97380209, + "memory(GiB)": 302.58, + "step": 256280, + "train_speed(iter/s)": 0.123435 + }, + { + "acc": 0.75431662, + "epoch": 1.4333509962292745, + "grad_norm": 8.6875, + "learning_rate": 2.038676026850559e-06, + "loss": 0.96389065, + "memory(GiB)": 302.58, + "step": 256300, + "train_speed(iter/s)": 0.12344 + }, + { + "acc": 0.74102917, + "epoch": 1.433462845702254, + "grad_norm": 6.0, + "learning_rate": 2.0379310025821463e-06, + "loss": 1.02278061, + "memory(GiB)": 302.58, + "step": 256320, + "train_speed(iter/s)": 0.123444 + }, + { + "acc": 0.73214211, + "epoch": 1.433574695175233, + "grad_norm": 6.40625, + "learning_rate": 2.037186079625886e-06, + "loss": 1.03800573, + "memory(GiB)": 302.58, + "step": 256340, + "train_speed(iter/s)": 0.123449 + }, + { + "acc": 0.73613868, + "epoch": 1.4336865446482125, + "grad_norm": 7.21875, + "learning_rate": 2.0364412580072557e-06, + "loss": 1.04176626, + "memory(GiB)": 302.58, + "step": 256360, + "train_speed(iter/s)": 0.123453 + }, + { + "acc": 0.75085239, + "epoch": 1.4337983941211916, + "grad_norm": 5.53125, + "learning_rate": 2.0356965377517323e-06, + "loss": 0.99296942, + "memory(GiB)": 302.58, + "step": 256380, + "train_speed(iter/s)": 0.123458 + }, + { + "acc": 0.74312391, + "epoch": 1.433910243594171, + "grad_norm": 4.75, + "learning_rate": 2.0349519188847868e-06, + "loss": 1.03974056, + "memory(GiB)": 302.58, + "step": 256400, + "train_speed(iter/s)": 0.123462 + }, + { + "acc": 0.74679656, + "epoch": 1.4340220930671501, + "grad_norm": 9.5625, + "learning_rate": 2.0342074014318873e-06, + "loss": 0.9812933, + "memory(GiB)": 302.58, + "step": 256420, + "train_speed(iter/s)": 0.123467 + }, + { + "acc": 0.74623804, + "epoch": 1.4341339425401296, + "grad_norm": 5.90625, + "learning_rate": 2.0334629854184995e-06, + "loss": 1.00183811, + "memory(GiB)": 302.58, + "step": 256440, + "train_speed(iter/s)": 0.123472 + }, + { + "acc": 0.75383048, + "epoch": 1.4342457920131086, + "grad_norm": 6.53125, + "learning_rate": 2.0327186708700835e-06, + "loss": 0.97667217, + "memory(GiB)": 302.58, + "step": 256460, + "train_speed(iter/s)": 0.123476 + }, + { + "acc": 0.75598044, + "epoch": 1.4343576414860881, + "grad_norm": 7.09375, + "learning_rate": 2.0319744578120985e-06, + "loss": 0.95936823, + "memory(GiB)": 302.58, + "step": 256480, + "train_speed(iter/s)": 0.123481 + }, + { + "acc": 0.74041395, + "epoch": 1.4344694909590672, + "grad_norm": 7.0, + "learning_rate": 2.0312303462699984e-06, + "loss": 1.0090867, + "memory(GiB)": 302.58, + "step": 256500, + "train_speed(iter/s)": 0.123485 + }, + { + "acc": 0.72822914, + "epoch": 1.4345813404320467, + "grad_norm": 6.84375, + "learning_rate": 2.030486336269234e-06, + "loss": 1.07392521, + "memory(GiB)": 302.58, + "step": 256520, + "train_speed(iter/s)": 0.12349 + }, + { + "acc": 0.75423508, + "epoch": 1.4346931899050257, + "grad_norm": 6.25, + "learning_rate": 2.0297424278352512e-06, + "loss": 0.96475058, + "memory(GiB)": 302.58, + "step": 256540, + "train_speed(iter/s)": 0.123494 + }, + { + "acc": 0.74504375, + "epoch": 1.4348050393780052, + "grad_norm": 7.9375, + "learning_rate": 2.0289986209934977e-06, + "loss": 1.00420446, + "memory(GiB)": 302.58, + "step": 256560, + "train_speed(iter/s)": 0.123498 + }, + { + "acc": 0.76275997, + "epoch": 1.4349168888509842, + "grad_norm": 7.8125, + "learning_rate": 2.028254915769412e-06, + "loss": 0.93382044, + "memory(GiB)": 302.58, + "step": 256580, + "train_speed(iter/s)": 0.123503 + }, + { + "acc": 0.74497938, + "epoch": 1.4350287383239637, + "grad_norm": 7.9375, + "learning_rate": 2.027511312188431e-06, + "loss": 1.01431732, + "memory(GiB)": 302.58, + "step": 256600, + "train_speed(iter/s)": 0.123508 + }, + { + "acc": 0.75074377, + "epoch": 1.4351405877969428, + "grad_norm": 5.40625, + "learning_rate": 2.0267678102759893e-06, + "loss": 0.97297516, + "memory(GiB)": 302.58, + "step": 256620, + "train_speed(iter/s)": 0.123512 + }, + { + "acc": 0.75671711, + "epoch": 1.4352524372699222, + "grad_norm": 7.0, + "learning_rate": 2.026024410057516e-06, + "loss": 0.94218874, + "memory(GiB)": 302.58, + "step": 256640, + "train_speed(iter/s)": 0.123516 + }, + { + "acc": 0.74990635, + "epoch": 1.4353642867429013, + "grad_norm": 8.4375, + "learning_rate": 2.025281111558438e-06, + "loss": 0.98270216, + "memory(GiB)": 302.58, + "step": 256660, + "train_speed(iter/s)": 0.123521 + }, + { + "acc": 0.73924532, + "epoch": 1.4354761362158808, + "grad_norm": 6.28125, + "learning_rate": 2.024537914804179e-06, + "loss": 1.01956215, + "memory(GiB)": 302.58, + "step": 256680, + "train_speed(iter/s)": 0.123525 + }, + { + "acc": 0.74941149, + "epoch": 1.4355879856888598, + "grad_norm": 7.5, + "learning_rate": 2.023794819820158e-06, + "loss": 0.98162069, + "memory(GiB)": 302.58, + "step": 256700, + "train_speed(iter/s)": 0.12353 + }, + { + "acc": 0.7527966, + "epoch": 1.4356998351618393, + "grad_norm": 7.03125, + "learning_rate": 2.023051826631791e-06, + "loss": 0.95860662, + "memory(GiB)": 302.58, + "step": 256720, + "train_speed(iter/s)": 0.123534 + }, + { + "acc": 0.75185328, + "epoch": 1.4358116846348183, + "grad_norm": 5.625, + "learning_rate": 2.022308935264492e-06, + "loss": 0.9899416, + "memory(GiB)": 302.58, + "step": 256740, + "train_speed(iter/s)": 0.123539 + }, + { + "acc": 0.74493313, + "epoch": 1.4359235341077978, + "grad_norm": 9.0, + "learning_rate": 2.0215661457436684e-06, + "loss": 1.00062494, + "memory(GiB)": 302.58, + "step": 256760, + "train_speed(iter/s)": 0.123543 + }, + { + "acc": 0.73828688, + "epoch": 1.4360353835807769, + "grad_norm": 8.9375, + "learning_rate": 2.020823458094728e-06, + "loss": 1.03743067, + "memory(GiB)": 302.58, + "step": 256780, + "train_speed(iter/s)": 0.123548 + }, + { + "acc": 0.74539285, + "epoch": 1.4361472330537564, + "grad_norm": 7.1875, + "learning_rate": 2.0200808723430716e-06, + "loss": 1.01288242, + "memory(GiB)": 302.58, + "step": 256800, + "train_speed(iter/s)": 0.123552 + }, + { + "acc": 0.76727753, + "epoch": 1.4362590825267354, + "grad_norm": 7.9375, + "learning_rate": 2.0193383885140967e-06, + "loss": 0.91443033, + "memory(GiB)": 302.58, + "step": 256820, + "train_speed(iter/s)": 0.123557 + }, + { + "acc": 0.75232606, + "epoch": 1.4363709319997149, + "grad_norm": 8.1875, + "learning_rate": 2.018596006633202e-06, + "loss": 0.97588396, + "memory(GiB)": 302.58, + "step": 256840, + "train_speed(iter/s)": 0.123561 + }, + { + "acc": 0.76452193, + "epoch": 1.436482781472694, + "grad_norm": 5.03125, + "learning_rate": 2.0178537267257775e-06, + "loss": 0.91766615, + "memory(GiB)": 302.58, + "step": 256860, + "train_speed(iter/s)": 0.123566 + }, + { + "acc": 0.76505837, + "epoch": 1.4365946309456734, + "grad_norm": 6.6875, + "learning_rate": 2.017111548817212e-06, + "loss": 0.91412325, + "memory(GiB)": 302.58, + "step": 256880, + "train_speed(iter/s)": 0.123571 + }, + { + "acc": 0.75214968, + "epoch": 1.4367064804186525, + "grad_norm": 6.625, + "learning_rate": 2.0163694729328904e-06, + "loss": 0.96414518, + "memory(GiB)": 302.58, + "step": 256900, + "train_speed(iter/s)": 0.123575 + }, + { + "acc": 0.75982203, + "epoch": 1.436818329891632, + "grad_norm": 6.25, + "learning_rate": 2.015627499098193e-06, + "loss": 0.94014406, + "memory(GiB)": 302.58, + "step": 256920, + "train_speed(iter/s)": 0.12358 + }, + { + "acc": 0.74927745, + "epoch": 1.436930179364611, + "grad_norm": 7.875, + "learning_rate": 2.014885627338498e-06, + "loss": 0.99050732, + "memory(GiB)": 302.58, + "step": 256940, + "train_speed(iter/s)": 0.123584 + }, + { + "acc": 0.7615572, + "epoch": 1.4370420288375905, + "grad_norm": 7.125, + "learning_rate": 2.014143857679181e-06, + "loss": 0.91986313, + "memory(GiB)": 302.58, + "step": 256960, + "train_speed(iter/s)": 0.123589 + }, + { + "acc": 0.73548293, + "epoch": 1.4371538783105695, + "grad_norm": 6.71875, + "learning_rate": 2.0134021901456115e-06, + "loss": 1.05817537, + "memory(GiB)": 302.58, + "step": 256980, + "train_speed(iter/s)": 0.123593 + }, + { + "acc": 0.75222669, + "epoch": 1.437265727783549, + "grad_norm": 8.375, + "learning_rate": 2.0126606247631573e-06, + "loss": 0.96081657, + "memory(GiB)": 302.58, + "step": 257000, + "train_speed(iter/s)": 0.123598 + }, + { + "acc": 0.75843682, + "epoch": 1.437377577256528, + "grad_norm": 9.5625, + "learning_rate": 2.0119191615571826e-06, + "loss": 0.95809593, + "memory(GiB)": 302.58, + "step": 257020, + "train_speed(iter/s)": 0.123602 + }, + { + "acc": 0.73141041, + "epoch": 1.4374894267295075, + "grad_norm": 6.6875, + "learning_rate": 2.0111778005530476e-06, + "loss": 1.05990553, + "memory(GiB)": 302.58, + "step": 257040, + "train_speed(iter/s)": 0.123607 + }, + { + "acc": 0.7573339, + "epoch": 1.4376012762024866, + "grad_norm": 10.875, + "learning_rate": 2.0104365417761085e-06, + "loss": 0.94928122, + "memory(GiB)": 302.58, + "step": 257060, + "train_speed(iter/s)": 0.123611 + }, + { + "acc": 0.76083999, + "epoch": 1.437713125675466, + "grad_norm": 12.125, + "learning_rate": 2.009695385251718e-06, + "loss": 0.94770212, + "memory(GiB)": 302.58, + "step": 257080, + "train_speed(iter/s)": 0.123616 + }, + { + "acc": 0.74587069, + "epoch": 1.437824975148445, + "grad_norm": 9.625, + "learning_rate": 2.008954331005229e-06, + "loss": 1.01064844, + "memory(GiB)": 302.58, + "step": 257100, + "train_speed(iter/s)": 0.12362 + }, + { + "acc": 0.75870457, + "epoch": 1.4379368246214246, + "grad_norm": 11.4375, + "learning_rate": 2.0082133790619863e-06, + "loss": 0.93397064, + "memory(GiB)": 302.58, + "step": 257120, + "train_speed(iter/s)": 0.123625 + }, + { + "acc": 0.75561719, + "epoch": 1.4380486740944036, + "grad_norm": 5.53125, + "learning_rate": 2.007472529447331e-06, + "loss": 0.9599762, + "memory(GiB)": 302.58, + "step": 257140, + "train_speed(iter/s)": 0.123629 + }, + { + "acc": 0.76471086, + "epoch": 1.438160523567383, + "grad_norm": 6.375, + "learning_rate": 2.0067317821866057e-06, + "loss": 0.91074657, + "memory(GiB)": 302.58, + "step": 257160, + "train_speed(iter/s)": 0.123634 + }, + { + "acc": 0.75341916, + "epoch": 1.4382723730403622, + "grad_norm": 6.09375, + "learning_rate": 2.0059911373051448e-06, + "loss": 0.96036348, + "memory(GiB)": 302.58, + "step": 257180, + "train_speed(iter/s)": 0.123639 + }, + { + "acc": 0.7489944, + "epoch": 1.4383842225133416, + "grad_norm": 9.0625, + "learning_rate": 2.005250594828281e-06, + "loss": 0.99371786, + "memory(GiB)": 302.58, + "step": 257200, + "train_speed(iter/s)": 0.123643 + }, + { + "acc": 0.76688261, + "epoch": 1.4384960719863207, + "grad_norm": 8.5625, + "learning_rate": 2.0045101547813432e-06, + "loss": 0.91097937, + "memory(GiB)": 302.58, + "step": 257220, + "train_speed(iter/s)": 0.123648 + }, + { + "acc": 0.74099989, + "epoch": 1.4386079214593002, + "grad_norm": 8.1875, + "learning_rate": 2.0037698171896565e-06, + "loss": 1.05154104, + "memory(GiB)": 302.58, + "step": 257240, + "train_speed(iter/s)": 0.123652 + }, + { + "acc": 0.75969381, + "epoch": 1.4387197709322792, + "grad_norm": 9.3125, + "learning_rate": 2.003029582078542e-06, + "loss": 0.94428625, + "memory(GiB)": 302.58, + "step": 257260, + "train_speed(iter/s)": 0.123657 + }, + { + "acc": 0.73835607, + "epoch": 1.4388316204052587, + "grad_norm": 8.25, + "learning_rate": 2.0022894494733203e-06, + "loss": 1.01811056, + "memory(GiB)": 302.58, + "step": 257280, + "train_speed(iter/s)": 0.123661 + }, + { + "acc": 0.75685234, + "epoch": 1.4389434698782377, + "grad_norm": 8.3125, + "learning_rate": 2.001549419399304e-06, + "loss": 0.94700079, + "memory(GiB)": 302.58, + "step": 257300, + "train_speed(iter/s)": 0.123666 + }, + { + "acc": 0.74308233, + "epoch": 1.4390553193512172, + "grad_norm": 7.15625, + "learning_rate": 2.0008094918818055e-06, + "loss": 1.02732639, + "memory(GiB)": 302.58, + "step": 257320, + "train_speed(iter/s)": 0.12367 + }, + { + "acc": 0.73073611, + "epoch": 1.4391671688241963, + "grad_norm": 7.34375, + "learning_rate": 2.000069666946133e-06, + "loss": 1.06871529, + "memory(GiB)": 302.58, + "step": 257340, + "train_speed(iter/s)": 0.123675 + }, + { + "acc": 0.74576588, + "epoch": 1.4392790182971757, + "grad_norm": 13.125, + "learning_rate": 1.9993299446175894e-06, + "loss": 1.00307856, + "memory(GiB)": 302.58, + "step": 257360, + "train_speed(iter/s)": 0.12368 + }, + { + "acc": 0.75065584, + "epoch": 1.4393908677701548, + "grad_norm": 7.71875, + "learning_rate": 1.9985903249214773e-06, + "loss": 0.9760664, + "memory(GiB)": 302.58, + "step": 257380, + "train_speed(iter/s)": 0.123684 + }, + { + "acc": 0.73960423, + "epoch": 1.4395027172431343, + "grad_norm": 7.3125, + "learning_rate": 1.997850807883093e-06, + "loss": 1.03087854, + "memory(GiB)": 302.58, + "step": 257400, + "train_speed(iter/s)": 0.123689 + }, + { + "acc": 0.74982486, + "epoch": 1.4396145667161133, + "grad_norm": 9.875, + "learning_rate": 1.997111393527729e-06, + "loss": 1.00182314, + "memory(GiB)": 302.58, + "step": 257420, + "train_speed(iter/s)": 0.123693 + }, + { + "acc": 0.75349388, + "epoch": 1.4397264161890928, + "grad_norm": 6.78125, + "learning_rate": 1.996372081880679e-06, + "loss": 0.97312136, + "memory(GiB)": 302.58, + "step": 257440, + "train_speed(iter/s)": 0.123698 + }, + { + "acc": 0.75449705, + "epoch": 1.4398382656620718, + "grad_norm": 5.59375, + "learning_rate": 1.9956328729672284e-06, + "loss": 0.96748238, + "memory(GiB)": 302.58, + "step": 257460, + "train_speed(iter/s)": 0.123703 + }, + { + "acc": 0.74119315, + "epoch": 1.4399501151350513, + "grad_norm": 5.5625, + "learning_rate": 1.99489376681266e-06, + "loss": 1.03783703, + "memory(GiB)": 302.58, + "step": 257480, + "train_speed(iter/s)": 0.123707 + }, + { + "acc": 0.7603395, + "epoch": 1.4400619646080304, + "grad_norm": 7.03125, + "learning_rate": 1.9941547634422535e-06, + "loss": 0.94405136, + "memory(GiB)": 302.58, + "step": 257500, + "train_speed(iter/s)": 0.123712 + }, + { + "acc": 0.74271541, + "epoch": 1.4401738140810099, + "grad_norm": 5.375, + "learning_rate": 1.9934158628812857e-06, + "loss": 1.00807428, + "memory(GiB)": 302.58, + "step": 257520, + "train_speed(iter/s)": 0.123717 + }, + { + "acc": 0.75531087, + "epoch": 1.440285663553989, + "grad_norm": 7.34375, + "learning_rate": 1.9926770651550288e-06, + "loss": 0.98863716, + "memory(GiB)": 302.58, + "step": 257540, + "train_speed(iter/s)": 0.123721 + }, + { + "acc": 0.74995613, + "epoch": 1.4403975130269684, + "grad_norm": 6.71875, + "learning_rate": 1.991938370288752e-06, + "loss": 0.98783646, + "memory(GiB)": 302.58, + "step": 257560, + "train_speed(iter/s)": 0.123726 + }, + { + "acc": 0.74724708, + "epoch": 1.4405093624999474, + "grad_norm": 5.75, + "learning_rate": 1.9911997783077215e-06, + "loss": 1.00461807, + "memory(GiB)": 302.58, + "step": 257580, + "train_speed(iter/s)": 0.12373 + }, + { + "acc": 0.74006958, + "epoch": 1.440621211972927, + "grad_norm": 5.78125, + "learning_rate": 1.9904612892371988e-06, + "loss": 1.02869806, + "memory(GiB)": 302.58, + "step": 257600, + "train_speed(iter/s)": 0.123734 + }, + { + "acc": 0.74823556, + "epoch": 1.4407330614459062, + "grad_norm": 7.9375, + "learning_rate": 1.9897229031024435e-06, + "loss": 0.97870731, + "memory(GiB)": 302.58, + "step": 257620, + "train_speed(iter/s)": 0.123739 + }, + { + "acc": 0.75238557, + "epoch": 1.4408449109188854, + "grad_norm": 6.4375, + "learning_rate": 1.98898461992871e-06, + "loss": 0.97175379, + "memory(GiB)": 302.58, + "step": 257640, + "train_speed(iter/s)": 0.123743 + }, + { + "acc": 0.73124604, + "epoch": 1.4409567603918647, + "grad_norm": 5.1875, + "learning_rate": 1.9882464397412503e-06, + "loss": 1.07086849, + "memory(GiB)": 302.58, + "step": 257660, + "train_speed(iter/s)": 0.123748 + }, + { + "acc": 0.74603524, + "epoch": 1.441068609864844, + "grad_norm": 7.125, + "learning_rate": 1.9875083625653103e-06, + "loss": 0.99786568, + "memory(GiB)": 302.58, + "step": 257680, + "train_speed(iter/s)": 0.123753 + }, + { + "acc": 0.76446157, + "epoch": 1.4411804593378232, + "grad_norm": 8.0625, + "learning_rate": 1.9867703884261384e-06, + "loss": 0.92059793, + "memory(GiB)": 302.58, + "step": 257700, + "train_speed(iter/s)": 0.123757 + }, + { + "acc": 0.75969062, + "epoch": 1.4412923088108025, + "grad_norm": 5.46875, + "learning_rate": 1.986032517348975e-06, + "loss": 0.93369036, + "memory(GiB)": 302.58, + "step": 257720, + "train_speed(iter/s)": 0.123761 + }, + { + "acc": 0.76264257, + "epoch": 1.4414041582837818, + "grad_norm": 5.78125, + "learning_rate": 1.9852947493590553e-06, + "loss": 0.91944942, + "memory(GiB)": 302.58, + "step": 257740, + "train_speed(iter/s)": 0.123766 + }, + { + "acc": 0.75546894, + "epoch": 1.441516007756761, + "grad_norm": 8.5, + "learning_rate": 1.9845570844816147e-06, + "loss": 0.93115292, + "memory(GiB)": 302.58, + "step": 257760, + "train_speed(iter/s)": 0.123771 + }, + { + "acc": 0.72545962, + "epoch": 1.4416278572297403, + "grad_norm": 6.84375, + "learning_rate": 1.983819522741884e-06, + "loss": 1.10363436, + "memory(GiB)": 302.58, + "step": 257780, + "train_speed(iter/s)": 0.123775 + }, + { + "acc": 0.75874534, + "epoch": 1.4417397067027196, + "grad_norm": 6.1875, + "learning_rate": 1.9830820641650895e-06, + "loss": 0.94028749, + "memory(GiB)": 302.58, + "step": 257800, + "train_speed(iter/s)": 0.12378 + }, + { + "acc": 0.78017478, + "epoch": 1.4418515561756988, + "grad_norm": 7.15625, + "learning_rate": 1.982344708776455e-06, + "loss": 0.85440197, + "memory(GiB)": 302.58, + "step": 257820, + "train_speed(iter/s)": 0.123785 + }, + { + "acc": 0.7461657, + "epoch": 1.441963405648678, + "grad_norm": 6.59375, + "learning_rate": 1.9816074566011996e-06, + "loss": 1.02301235, + "memory(GiB)": 302.58, + "step": 257840, + "train_speed(iter/s)": 0.123789 + }, + { + "acc": 0.75996432, + "epoch": 1.4420752551216574, + "grad_norm": 7.40625, + "learning_rate": 1.980870307664541e-06, + "loss": 0.94021149, + "memory(GiB)": 302.58, + "step": 257860, + "train_speed(iter/s)": 0.123794 + }, + { + "acc": 0.74116468, + "epoch": 1.4421871045946366, + "grad_norm": 6.9375, + "learning_rate": 1.9801332619916907e-06, + "loss": 1.01190004, + "memory(GiB)": 302.58, + "step": 257880, + "train_speed(iter/s)": 0.123798 + }, + { + "acc": 0.74712582, + "epoch": 1.4422989540676159, + "grad_norm": 6.1875, + "learning_rate": 1.9793963196078587e-06, + "loss": 0.96556988, + "memory(GiB)": 302.58, + "step": 257900, + "train_speed(iter/s)": 0.123803 + }, + { + "acc": 0.74562511, + "epoch": 1.4424108035405951, + "grad_norm": 8.4375, + "learning_rate": 1.9786594805382502e-06, + "loss": 1.03830767, + "memory(GiB)": 302.58, + "step": 257920, + "train_speed(iter/s)": 0.123808 + }, + { + "acc": 0.74281988, + "epoch": 1.4425226530135744, + "grad_norm": 7.1875, + "learning_rate": 1.9779227448080687e-06, + "loss": 1.0008379, + "memory(GiB)": 302.58, + "step": 257940, + "train_speed(iter/s)": 0.123812 + }, + { + "acc": 0.75367336, + "epoch": 1.4426345024865537, + "grad_norm": 7.1875, + "learning_rate": 1.9771861124425103e-06, + "loss": 0.95578585, + "memory(GiB)": 302.58, + "step": 257960, + "train_speed(iter/s)": 0.123817 + }, + { + "acc": 0.75482373, + "epoch": 1.442746351959533, + "grad_norm": 9.0, + "learning_rate": 1.976449583466773e-06, + "loss": 0.9635993, + "memory(GiB)": 302.58, + "step": 257980, + "train_speed(iter/s)": 0.123821 + }, + { + "acc": 0.75993018, + "epoch": 1.4428582014325122, + "grad_norm": 8.0625, + "learning_rate": 1.975713157906048e-06, + "loss": 0.94657059, + "memory(GiB)": 302.58, + "step": 258000, + "train_speed(iter/s)": 0.123826 + }, + { + "epoch": 1.4428582014325122, + "eval_acc": 0.7068553294114782, + "eval_loss": 1.0120078325271606, + "eval_runtime": 7545.6337, + "eval_samples_per_second": 9.977, + "eval_steps_per_second": 9.977, + "step": 258000 + }, + { + "acc": 0.75288639, + "epoch": 1.4429700509054915, + "grad_norm": 6.6875, + "learning_rate": 1.974976835785522e-06, + "loss": 0.98130951, + "memory(GiB)": 302.58, + "step": 258020, + "train_speed(iter/s)": 0.123376 + }, + { + "acc": 0.76200652, + "epoch": 1.4430819003784707, + "grad_norm": 7.75, + "learning_rate": 1.9742406171303808e-06, + "loss": 0.92651014, + "memory(GiB)": 302.58, + "step": 258040, + "train_speed(iter/s)": 0.12338 + }, + { + "acc": 0.75035648, + "epoch": 1.44319374985145, + "grad_norm": 6.21875, + "learning_rate": 1.973504501965805e-06, + "loss": 0.98263416, + "memory(GiB)": 302.58, + "step": 258060, + "train_speed(iter/s)": 0.123385 + }, + { + "acc": 0.74287453, + "epoch": 1.4433055993244293, + "grad_norm": 8.3125, + "learning_rate": 1.972768490316972e-06, + "loss": 1.02359991, + "memory(GiB)": 302.58, + "step": 258080, + "train_speed(iter/s)": 0.123389 + }, + { + "acc": 0.73324428, + "epoch": 1.4434174487974085, + "grad_norm": 5.6875, + "learning_rate": 1.972032582209056e-06, + "loss": 1.04332771, + "memory(GiB)": 302.58, + "step": 258100, + "train_speed(iter/s)": 0.123394 + }, + { + "acc": 0.75331721, + "epoch": 1.4435292982703878, + "grad_norm": 9.75, + "learning_rate": 1.9712967776672266e-06, + "loss": 0.96764393, + "memory(GiB)": 302.58, + "step": 258120, + "train_speed(iter/s)": 0.123398 + }, + { + "acc": 0.75686088, + "epoch": 1.443641147743367, + "grad_norm": 5.0625, + "learning_rate": 1.970561076716651e-06, + "loss": 0.94849005, + "memory(GiB)": 302.58, + "step": 258140, + "train_speed(iter/s)": 0.123403 + }, + { + "acc": 0.75496016, + "epoch": 1.4437529972163463, + "grad_norm": 6.21875, + "learning_rate": 1.969825479382493e-06, + "loss": 0.9756526, + "memory(GiB)": 302.58, + "step": 258160, + "train_speed(iter/s)": 0.123407 + }, + { + "acc": 0.76718307, + "epoch": 1.4438648466893256, + "grad_norm": 9.625, + "learning_rate": 1.969089985689912e-06, + "loss": 0.92333336, + "memory(GiB)": 302.58, + "step": 258180, + "train_speed(iter/s)": 0.123412 + }, + { + "acc": 0.73199396, + "epoch": 1.4439766961623048, + "grad_norm": 6.625, + "learning_rate": 1.9683545956640643e-06, + "loss": 1.04737673, + "memory(GiB)": 302.58, + "step": 258200, + "train_speed(iter/s)": 0.123416 + }, + { + "acc": 0.73962903, + "epoch": 1.444088545635284, + "grad_norm": 6.34375, + "learning_rate": 1.967619309330102e-06, + "loss": 1.02579517, + "memory(GiB)": 302.58, + "step": 258220, + "train_speed(iter/s)": 0.123421 + }, + { + "acc": 0.75936022, + "epoch": 1.4442003951082634, + "grad_norm": 7.28125, + "learning_rate": 1.9668841267131738e-06, + "loss": 0.95303545, + "memory(GiB)": 302.58, + "step": 258240, + "train_speed(iter/s)": 0.123425 + }, + { + "acc": 0.75640745, + "epoch": 1.4443122445812426, + "grad_norm": 5.0625, + "learning_rate": 1.9661490478384278e-06, + "loss": 0.95228462, + "memory(GiB)": 302.58, + "step": 258260, + "train_speed(iter/s)": 0.12343 + }, + { + "acc": 0.74978509, + "epoch": 1.444424094054222, + "grad_norm": 8.0625, + "learning_rate": 1.965414072731004e-06, + "loss": 0.96891546, + "memory(GiB)": 302.58, + "step": 258280, + "train_speed(iter/s)": 0.123435 + }, + { + "acc": 0.76394687, + "epoch": 1.4445359435272012, + "grad_norm": 6.53125, + "learning_rate": 1.9646792014160417e-06, + "loss": 0.91927299, + "memory(GiB)": 302.58, + "step": 258300, + "train_speed(iter/s)": 0.123439 + }, + { + "acc": 0.75140715, + "epoch": 1.4446477930001804, + "grad_norm": 7.125, + "learning_rate": 1.963944433918675e-06, + "loss": 0.96357975, + "memory(GiB)": 302.58, + "step": 258320, + "train_speed(iter/s)": 0.123443 + }, + { + "acc": 0.74862914, + "epoch": 1.4447596424731597, + "grad_norm": 8.1875, + "learning_rate": 1.9632097702640348e-06, + "loss": 0.99157524, + "memory(GiB)": 302.58, + "step": 258340, + "train_speed(iter/s)": 0.123448 + }, + { + "acc": 0.7337379, + "epoch": 1.444871491946139, + "grad_norm": 11.4375, + "learning_rate": 1.962475210477251e-06, + "loss": 1.02867994, + "memory(GiB)": 302.58, + "step": 258360, + "train_speed(iter/s)": 0.123452 + }, + { + "acc": 0.74548626, + "epoch": 1.4449833414191182, + "grad_norm": 8.5625, + "learning_rate": 1.9617407545834465e-06, + "loss": 1.00466881, + "memory(GiB)": 302.58, + "step": 258380, + "train_speed(iter/s)": 0.123457 + }, + { + "acc": 0.74264665, + "epoch": 1.4450951908920975, + "grad_norm": 5.71875, + "learning_rate": 1.961006402607743e-06, + "loss": 0.99544621, + "memory(GiB)": 302.58, + "step": 258400, + "train_speed(iter/s)": 0.123462 + }, + { + "acc": 0.74150214, + "epoch": 1.4452070403650767, + "grad_norm": 5.40625, + "learning_rate": 1.960272154575256e-06, + "loss": 1.03333549, + "memory(GiB)": 302.58, + "step": 258420, + "train_speed(iter/s)": 0.123466 + }, + { + "acc": 0.75316916, + "epoch": 1.445318889838056, + "grad_norm": 6.90625, + "learning_rate": 1.9595380105111006e-06, + "loss": 0.97675829, + "memory(GiB)": 302.58, + "step": 258440, + "train_speed(iter/s)": 0.123471 + }, + { + "acc": 0.77504935, + "epoch": 1.4454307393110353, + "grad_norm": 7.96875, + "learning_rate": 1.958803970440387e-06, + "loss": 0.88204546, + "memory(GiB)": 302.58, + "step": 258460, + "train_speed(iter/s)": 0.123475 + }, + { + "acc": 0.7644402, + "epoch": 1.4455425887840145, + "grad_norm": 9.875, + "learning_rate": 1.9580700343882196e-06, + "loss": 0.94146862, + "memory(GiB)": 302.58, + "step": 258480, + "train_speed(iter/s)": 0.12348 + }, + { + "acc": 0.73974199, + "epoch": 1.4456544382569938, + "grad_norm": 8.875, + "learning_rate": 1.9573362023797034e-06, + "loss": 1.0133399, + "memory(GiB)": 302.58, + "step": 258500, + "train_speed(iter/s)": 0.123484 + }, + { + "acc": 0.74560981, + "epoch": 1.445766287729973, + "grad_norm": 7.6875, + "learning_rate": 1.956602474439937e-06, + "loss": 0.98778477, + "memory(GiB)": 302.58, + "step": 258520, + "train_speed(iter/s)": 0.123489 + }, + { + "acc": 0.76664815, + "epoch": 1.4458781372029523, + "grad_norm": 8.8125, + "learning_rate": 1.9558688505940166e-06, + "loss": 0.91361923, + "memory(GiB)": 302.58, + "step": 258540, + "train_speed(iter/s)": 0.123493 + }, + { + "acc": 0.72765532, + "epoch": 1.4459899866759316, + "grad_norm": 7.375, + "learning_rate": 1.9551353308670316e-06, + "loss": 1.07497749, + "memory(GiB)": 302.58, + "step": 258560, + "train_speed(iter/s)": 0.123498 + }, + { + "acc": 0.72385426, + "epoch": 1.4461018361489109, + "grad_norm": 10.75, + "learning_rate": 1.954401915284076e-06, + "loss": 1.07950726, + "memory(GiB)": 302.58, + "step": 258580, + "train_speed(iter/s)": 0.123502 + }, + { + "acc": 0.74839449, + "epoch": 1.4462136856218901, + "grad_norm": 7.8125, + "learning_rate": 1.9536686038702317e-06, + "loss": 0.99519339, + "memory(GiB)": 302.58, + "step": 258600, + "train_speed(iter/s)": 0.123507 + }, + { + "acc": 0.74429274, + "epoch": 1.4463255350948694, + "grad_norm": 7.59375, + "learning_rate": 1.952935396650581e-06, + "loss": 1.0125967, + "memory(GiB)": 302.58, + "step": 258620, + "train_speed(iter/s)": 0.123511 + }, + { + "acc": 0.76047077, + "epoch": 1.4464373845678486, + "grad_norm": 5.75, + "learning_rate": 1.952202293650202e-06, + "loss": 0.95856628, + "memory(GiB)": 302.58, + "step": 258640, + "train_speed(iter/s)": 0.123516 + }, + { + "acc": 0.75148325, + "epoch": 1.446549234040828, + "grad_norm": 8.25, + "learning_rate": 1.9514692948941683e-06, + "loss": 1.00339355, + "memory(GiB)": 302.58, + "step": 258660, + "train_speed(iter/s)": 0.123521 + }, + { + "acc": 0.74992313, + "epoch": 1.4466610835138072, + "grad_norm": 6.34375, + "learning_rate": 1.9507364004075517e-06, + "loss": 0.97889013, + "memory(GiB)": 302.58, + "step": 258680, + "train_speed(iter/s)": 0.123525 + }, + { + "acc": 0.76412282, + "epoch": 1.4467729329867864, + "grad_norm": 7.46875, + "learning_rate": 1.9500036102154193e-06, + "loss": 0.93419046, + "memory(GiB)": 302.58, + "step": 258700, + "train_speed(iter/s)": 0.12353 + }, + { + "acc": 0.74407153, + "epoch": 1.4468847824597657, + "grad_norm": 4.65625, + "learning_rate": 1.949270924342834e-06, + "loss": 1.01035433, + "memory(GiB)": 302.58, + "step": 258720, + "train_speed(iter/s)": 0.123534 + }, + { + "acc": 0.75902829, + "epoch": 1.446996631932745, + "grad_norm": 9.0, + "learning_rate": 1.9485383428148576e-06, + "loss": 0.94551554, + "memory(GiB)": 302.58, + "step": 258740, + "train_speed(iter/s)": 0.123539 + }, + { + "acc": 0.73733811, + "epoch": 1.4471084814057242, + "grad_norm": 7.5625, + "learning_rate": 1.947805865656545e-06, + "loss": 1.030299, + "memory(GiB)": 302.58, + "step": 258760, + "train_speed(iter/s)": 0.123543 + }, + { + "acc": 0.75132847, + "epoch": 1.4472203308787035, + "grad_norm": 6.6875, + "learning_rate": 1.94707349289295e-06, + "loss": 0.97818623, + "memory(GiB)": 302.58, + "step": 258780, + "train_speed(iter/s)": 0.123548 + }, + { + "acc": 0.73614049, + "epoch": 1.4473321803516828, + "grad_norm": 7.84375, + "learning_rate": 1.946341224549123e-06, + "loss": 1.04794703, + "memory(GiB)": 302.58, + "step": 258800, + "train_speed(iter/s)": 0.123552 + }, + { + "acc": 0.74868569, + "epoch": 1.447444029824662, + "grad_norm": 4.96875, + "learning_rate": 1.9456090606501083e-06, + "loss": 1.00019293, + "memory(GiB)": 302.58, + "step": 258820, + "train_speed(iter/s)": 0.123557 + }, + { + "acc": 0.77365999, + "epoch": 1.4475558792976413, + "grad_norm": 7.84375, + "learning_rate": 1.9448770012209474e-06, + "loss": 0.8795125, + "memory(GiB)": 302.58, + "step": 258840, + "train_speed(iter/s)": 0.123561 + }, + { + "acc": 0.7389329, + "epoch": 1.4476677287706206, + "grad_norm": 9.8125, + "learning_rate": 1.9441450462866824e-06, + "loss": 1.03536243, + "memory(GiB)": 302.58, + "step": 258860, + "train_speed(iter/s)": 0.123566 + }, + { + "acc": 0.75401192, + "epoch": 1.4477795782435998, + "grad_norm": 9.6875, + "learning_rate": 1.943413195872347e-06, + "loss": 0.97393932, + "memory(GiB)": 302.58, + "step": 258880, + "train_speed(iter/s)": 0.12357 + }, + { + "acc": 0.75760126, + "epoch": 1.447891427716579, + "grad_norm": 9.0625, + "learning_rate": 1.9426814500029717e-06, + "loss": 0.93582211, + "memory(GiB)": 302.58, + "step": 258900, + "train_speed(iter/s)": 0.123575 + }, + { + "acc": 0.76066141, + "epoch": 1.4480032771895583, + "grad_norm": 8.0, + "learning_rate": 1.941949808703585e-06, + "loss": 0.94942732, + "memory(GiB)": 302.58, + "step": 258920, + "train_speed(iter/s)": 0.123579 + }, + { + "acc": 0.74933338, + "epoch": 1.4481151266625376, + "grad_norm": 7.78125, + "learning_rate": 1.941218271999213e-06, + "loss": 0.98509388, + "memory(GiB)": 302.58, + "step": 258940, + "train_speed(iter/s)": 0.123584 + }, + { + "acc": 0.75936308, + "epoch": 1.4482269761355169, + "grad_norm": 6.5625, + "learning_rate": 1.9404868399148745e-06, + "loss": 0.92247496, + "memory(GiB)": 302.58, + "step": 258960, + "train_speed(iter/s)": 0.123588 + }, + { + "acc": 0.74779558, + "epoch": 1.4483388256084961, + "grad_norm": 5.9375, + "learning_rate": 1.939755512475588e-06, + "loss": 1.00584698, + "memory(GiB)": 302.58, + "step": 258980, + "train_speed(iter/s)": 0.123593 + }, + { + "acc": 0.75458517, + "epoch": 1.4484506750814754, + "grad_norm": 5.625, + "learning_rate": 1.939024289706366e-06, + "loss": 0.95751553, + "memory(GiB)": 302.58, + "step": 259000, + "train_speed(iter/s)": 0.123597 + }, + { + "acc": 0.74950099, + "epoch": 1.4485625245544547, + "grad_norm": 7.125, + "learning_rate": 1.9382931716322195e-06, + "loss": 1.01077881, + "memory(GiB)": 302.58, + "step": 259020, + "train_speed(iter/s)": 0.123601 + }, + { + "acc": 0.77209454, + "epoch": 1.448674374027434, + "grad_norm": 6.65625, + "learning_rate": 1.9375621582781555e-06, + "loss": 0.88285418, + "memory(GiB)": 302.58, + "step": 259040, + "train_speed(iter/s)": 0.123606 + }, + { + "acc": 0.74900007, + "epoch": 1.4487862235004132, + "grad_norm": 7.0, + "learning_rate": 1.936831249669176e-06, + "loss": 0.97821798, + "memory(GiB)": 302.58, + "step": 259060, + "train_speed(iter/s)": 0.12361 + }, + { + "acc": 0.74752464, + "epoch": 1.4488980729733925, + "grad_norm": 7.875, + "learning_rate": 1.9361004458302805e-06, + "loss": 0.99567881, + "memory(GiB)": 302.58, + "step": 259080, + "train_speed(iter/s)": 0.123615 + }, + { + "acc": 0.75864625, + "epoch": 1.4490099224463717, + "grad_norm": 7.15625, + "learning_rate": 1.9353697467864657e-06, + "loss": 0.93981628, + "memory(GiB)": 302.58, + "step": 259100, + "train_speed(iter/s)": 0.123619 + }, + { + "acc": 0.75118675, + "epoch": 1.449121771919351, + "grad_norm": 6.53125, + "learning_rate": 1.934639152562722e-06, + "loss": 0.97733936, + "memory(GiB)": 302.58, + "step": 259120, + "train_speed(iter/s)": 0.123624 + }, + { + "acc": 0.76248436, + "epoch": 1.4492336213923303, + "grad_norm": 10.6875, + "learning_rate": 1.93390866318404e-06, + "loss": 0.92796764, + "memory(GiB)": 302.58, + "step": 259140, + "train_speed(iter/s)": 0.123628 + }, + { + "acc": 0.74543285, + "epoch": 1.4493454708653095, + "grad_norm": 8.75, + "learning_rate": 1.933178278675404e-06, + "loss": 1.00917797, + "memory(GiB)": 302.58, + "step": 259160, + "train_speed(iter/s)": 0.123633 + }, + { + "acc": 0.73949366, + "epoch": 1.4494573203382888, + "grad_norm": 5.25, + "learning_rate": 1.932447999061796e-06, + "loss": 1.01483307, + "memory(GiB)": 302.58, + "step": 259180, + "train_speed(iter/s)": 0.123637 + }, + { + "acc": 0.74553452, + "epoch": 1.449569169811268, + "grad_norm": 10.1875, + "learning_rate": 1.9317178243681933e-06, + "loss": 1.01334887, + "memory(GiB)": 302.58, + "step": 259200, + "train_speed(iter/s)": 0.123642 + }, + { + "acc": 0.75564027, + "epoch": 1.4496810192842473, + "grad_norm": 8.625, + "learning_rate": 1.93098775461957e-06, + "loss": 0.94282427, + "memory(GiB)": 302.58, + "step": 259220, + "train_speed(iter/s)": 0.123646 + }, + { + "acc": 0.77349539, + "epoch": 1.4497928687572266, + "grad_norm": 8.0, + "learning_rate": 1.9302577898408975e-06, + "loss": 0.88621798, + "memory(GiB)": 302.58, + "step": 259240, + "train_speed(iter/s)": 0.123651 + }, + { + "acc": 0.75459061, + "epoch": 1.4499047182302058, + "grad_norm": 6.25, + "learning_rate": 1.929527930057142e-06, + "loss": 0.97108555, + "memory(GiB)": 302.58, + "step": 259260, + "train_speed(iter/s)": 0.123655 + }, + { + "acc": 0.72038965, + "epoch": 1.450016567703185, + "grad_norm": 6.96875, + "learning_rate": 1.9287981752932677e-06, + "loss": 1.11332502, + "memory(GiB)": 302.58, + "step": 259280, + "train_speed(iter/s)": 0.123659 + }, + { + "acc": 0.7552681, + "epoch": 1.4501284171761644, + "grad_norm": 8.5625, + "learning_rate": 1.9280685255742344e-06, + "loss": 0.94712038, + "memory(GiB)": 302.58, + "step": 259300, + "train_speed(iter/s)": 0.123664 + }, + { + "acc": 0.7622858, + "epoch": 1.4502402666491436, + "grad_norm": 9.25, + "learning_rate": 1.927338980924998e-06, + "loss": 0.92093735, + "memory(GiB)": 302.58, + "step": 259320, + "train_speed(iter/s)": 0.123668 + }, + { + "acc": 0.75082545, + "epoch": 1.450352116122123, + "grad_norm": 10.3125, + "learning_rate": 1.9266095413705122e-06, + "loss": 0.98929434, + "memory(GiB)": 302.58, + "step": 259340, + "train_speed(iter/s)": 0.123673 + }, + { + "acc": 0.74902925, + "epoch": 1.4504639655951022, + "grad_norm": 8.75, + "learning_rate": 1.9258802069357254e-06, + "loss": 1.00196362, + "memory(GiB)": 302.58, + "step": 259360, + "train_speed(iter/s)": 0.123677 + }, + { + "acc": 0.74920931, + "epoch": 1.4505758150680814, + "grad_norm": 9.25, + "learning_rate": 1.925150977645581e-06, + "loss": 0.99348001, + "memory(GiB)": 302.58, + "step": 259380, + "train_speed(iter/s)": 0.123681 + }, + { + "acc": 0.76404543, + "epoch": 1.4506876645410607, + "grad_norm": 6.78125, + "learning_rate": 1.924421853525026e-06, + "loss": 0.91202526, + "memory(GiB)": 302.58, + "step": 259400, + "train_speed(iter/s)": 0.123686 + }, + { + "acc": 0.74683466, + "epoch": 1.45079951401404, + "grad_norm": 4.375, + "learning_rate": 1.9236928345989954e-06, + "loss": 0.99226999, + "memory(GiB)": 302.58, + "step": 259420, + "train_speed(iter/s)": 0.12369 + }, + { + "acc": 0.72753429, + "epoch": 1.4509113634870192, + "grad_norm": 6.03125, + "learning_rate": 1.922963920892424e-06, + "loss": 1.0720665, + "memory(GiB)": 302.58, + "step": 259440, + "train_speed(iter/s)": 0.123695 + }, + { + "acc": 0.76394873, + "epoch": 1.4510232129599985, + "grad_norm": 6.53125, + "learning_rate": 1.922235112430244e-06, + "loss": 0.91063623, + "memory(GiB)": 302.58, + "step": 259460, + "train_speed(iter/s)": 0.123699 + }, + { + "acc": 0.77052712, + "epoch": 1.4511350624329777, + "grad_norm": 11.3125, + "learning_rate": 1.9215064092373827e-06, + "loss": 0.89795637, + "memory(GiB)": 302.58, + "step": 259480, + "train_speed(iter/s)": 0.123704 + }, + { + "acc": 0.74174857, + "epoch": 1.451246911905957, + "grad_norm": 9.875, + "learning_rate": 1.9207778113387636e-06, + "loss": 1.00533533, + "memory(GiB)": 302.58, + "step": 259500, + "train_speed(iter/s)": 0.123708 + }, + { + "acc": 0.74724712, + "epoch": 1.4513587613789363, + "grad_norm": 10.1875, + "learning_rate": 1.9200493187593077e-06, + "loss": 0.99645462, + "memory(GiB)": 302.58, + "step": 259520, + "train_speed(iter/s)": 0.123713 + }, + { + "acc": 0.73640552, + "epoch": 1.4514706108519155, + "grad_norm": 6.3125, + "learning_rate": 1.9193209315239307e-06, + "loss": 1.05683222, + "memory(GiB)": 302.58, + "step": 259540, + "train_speed(iter/s)": 0.123717 + }, + { + "acc": 0.77046347, + "epoch": 1.4515824603248948, + "grad_norm": 7.15625, + "learning_rate": 1.918592649657547e-06, + "loss": 0.90050516, + "memory(GiB)": 302.58, + "step": 259560, + "train_speed(iter/s)": 0.123722 + }, + { + "acc": 0.74882855, + "epoch": 1.451694309797874, + "grad_norm": 7.8125, + "learning_rate": 1.9178644731850653e-06, + "loss": 0.99300194, + "memory(GiB)": 302.58, + "step": 259580, + "train_speed(iter/s)": 0.123726 + }, + { + "acc": 0.74553208, + "epoch": 1.4518061592708533, + "grad_norm": 7.96875, + "learning_rate": 1.91713640213139e-06, + "loss": 1.04222593, + "memory(GiB)": 302.58, + "step": 259600, + "train_speed(iter/s)": 0.123731 + }, + { + "acc": 0.75494337, + "epoch": 1.4519180087438326, + "grad_norm": 5.5, + "learning_rate": 1.916408436521427e-06, + "loss": 0.97023516, + "memory(GiB)": 302.58, + "step": 259620, + "train_speed(iter/s)": 0.123735 + }, + { + "acc": 0.76092362, + "epoch": 1.4520298582168119, + "grad_norm": 7.46875, + "learning_rate": 1.9156805763800734e-06, + "loss": 0.92634115, + "memory(GiB)": 302.58, + "step": 259640, + "train_speed(iter/s)": 0.12374 + }, + { + "acc": 0.74912453, + "epoch": 1.4521417076897911, + "grad_norm": 8.125, + "learning_rate": 1.9149528217322246e-06, + "loss": 0.99673986, + "memory(GiB)": 302.58, + "step": 259660, + "train_speed(iter/s)": 0.123744 + }, + { + "acc": 0.7704505, + "epoch": 1.4522535571627704, + "grad_norm": 7.1875, + "learning_rate": 1.914225172602771e-06, + "loss": 0.9008008, + "memory(GiB)": 302.58, + "step": 259680, + "train_speed(iter/s)": 0.123749 + }, + { + "acc": 0.75617485, + "epoch": 1.4523654066357496, + "grad_norm": 7.8125, + "learning_rate": 1.913497629016601e-06, + "loss": 0.94801779, + "memory(GiB)": 302.58, + "step": 259700, + "train_speed(iter/s)": 0.123753 + }, + { + "acc": 0.75754943, + "epoch": 1.452477256108729, + "grad_norm": 8.4375, + "learning_rate": 1.912770190998598e-06, + "loss": 0.941467, + "memory(GiB)": 302.58, + "step": 259720, + "train_speed(iter/s)": 0.123758 + }, + { + "acc": 0.75018668, + "epoch": 1.4525891055817082, + "grad_norm": 8.6875, + "learning_rate": 1.9120428585736458e-06, + "loss": 0.98590336, + "memory(GiB)": 302.58, + "step": 259740, + "train_speed(iter/s)": 0.123762 + }, + { + "acc": 0.74805026, + "epoch": 1.4527009550546874, + "grad_norm": 10.0, + "learning_rate": 1.911315631766619e-06, + "loss": 0.98547239, + "memory(GiB)": 302.58, + "step": 259760, + "train_speed(iter/s)": 0.123767 + }, + { + "acc": 0.75128646, + "epoch": 1.4528128045276667, + "grad_norm": 6.65625, + "learning_rate": 1.910588510602391e-06, + "loss": 0.97632771, + "memory(GiB)": 302.58, + "step": 259780, + "train_speed(iter/s)": 0.123771 + }, + { + "acc": 0.73912387, + "epoch": 1.452924654000646, + "grad_norm": 5.375, + "learning_rate": 1.9098614951058337e-06, + "loss": 1.02025766, + "memory(GiB)": 302.58, + "step": 259800, + "train_speed(iter/s)": 0.123776 + }, + { + "acc": 0.76660104, + "epoch": 1.4530365034736252, + "grad_norm": 6.9375, + "learning_rate": 1.9091345853018106e-06, + "loss": 0.90058117, + "memory(GiB)": 302.58, + "step": 259820, + "train_speed(iter/s)": 0.12378 + }, + { + "acc": 0.73828559, + "epoch": 1.4531483529466045, + "grad_norm": 6.96875, + "learning_rate": 1.9084077812151857e-06, + "loss": 1.02772207, + "memory(GiB)": 302.58, + "step": 259840, + "train_speed(iter/s)": 0.123785 + }, + { + "acc": 0.72501197, + "epoch": 1.4532602024195838, + "grad_norm": 6.25, + "learning_rate": 1.9076810828708182e-06, + "loss": 1.09873629, + "memory(GiB)": 302.58, + "step": 259860, + "train_speed(iter/s)": 0.123789 + }, + { + "acc": 0.74044347, + "epoch": 1.453372051892563, + "grad_norm": 7.125, + "learning_rate": 1.9069544902935632e-06, + "loss": 1.02273855, + "memory(GiB)": 302.58, + "step": 259880, + "train_speed(iter/s)": 0.123794 + }, + { + "acc": 0.74132142, + "epoch": 1.4534839013655423, + "grad_norm": 4.1875, + "learning_rate": 1.906228003508272e-06, + "loss": 1.02791576, + "memory(GiB)": 302.58, + "step": 259900, + "train_speed(iter/s)": 0.123798 + }, + { + "acc": 0.75698633, + "epoch": 1.4535957508385215, + "grad_norm": 5.46875, + "learning_rate": 1.9055016225397937e-06, + "loss": 0.96412382, + "memory(GiB)": 302.58, + "step": 259920, + "train_speed(iter/s)": 0.123803 + }, + { + "acc": 0.74801545, + "epoch": 1.4537076003115008, + "grad_norm": 7.8125, + "learning_rate": 1.9047753474129722e-06, + "loss": 1.01331053, + "memory(GiB)": 302.58, + "step": 259940, + "train_speed(iter/s)": 0.123807 + }, + { + "acc": 0.73892951, + "epoch": 1.45381944978448, + "grad_norm": 7.125, + "learning_rate": 1.9040491781526482e-06, + "loss": 1.05294733, + "memory(GiB)": 302.58, + "step": 259960, + "train_speed(iter/s)": 0.123811 + }, + { + "acc": 0.76119652, + "epoch": 1.4539312992574593, + "grad_norm": 9.3125, + "learning_rate": 1.9033231147836574e-06, + "loss": 0.95759821, + "memory(GiB)": 302.58, + "step": 259980, + "train_speed(iter/s)": 0.123816 + }, + { + "acc": 0.74839559, + "epoch": 1.4540431487304386, + "grad_norm": 6.59375, + "learning_rate": 1.9025971573308377e-06, + "loss": 0.97311487, + "memory(GiB)": 302.58, + "step": 260000, + "train_speed(iter/s)": 0.123821 + }, + { + "epoch": 1.4540431487304386, + "eval_acc": 0.7068650901751603, + "eval_loss": 1.0119327306747437, + "eval_runtime": 7536.5166, + "eval_samples_per_second": 9.989, + "eval_steps_per_second": 9.989, + "step": 260000 + }, + { + "acc": 0.74537973, + "epoch": 1.4541549982034179, + "grad_norm": 7.21875, + "learning_rate": 1.9018713058190164e-06, + "loss": 0.99422436, + "memory(GiB)": 302.58, + "step": 260020, + "train_speed(iter/s)": 0.123374 + }, + { + "acc": 0.74327478, + "epoch": 1.4542668476763971, + "grad_norm": 7.21875, + "learning_rate": 1.9011455602730205e-06, + "loss": 1.00598278, + "memory(GiB)": 302.58, + "step": 260040, + "train_speed(iter/s)": 0.123379 + }, + { + "acc": 0.75541859, + "epoch": 1.4543786971493764, + "grad_norm": 8.4375, + "learning_rate": 1.900419920717672e-06, + "loss": 0.9310421, + "memory(GiB)": 302.58, + "step": 260060, + "train_speed(iter/s)": 0.123383 + }, + { + "acc": 0.74683914, + "epoch": 1.4544905466223557, + "grad_norm": 6.59375, + "learning_rate": 1.8996943871777917e-06, + "loss": 1.00673161, + "memory(GiB)": 302.58, + "step": 260080, + "train_speed(iter/s)": 0.123388 + }, + { + "acc": 0.75661759, + "epoch": 1.454602396095335, + "grad_norm": 4.28125, + "learning_rate": 1.8989689596781936e-06, + "loss": 0.95744791, + "memory(GiB)": 302.58, + "step": 260100, + "train_speed(iter/s)": 0.123392 + }, + { + "acc": 0.75527515, + "epoch": 1.4547142455683142, + "grad_norm": 8.5625, + "learning_rate": 1.8982436382436903e-06, + "loss": 0.97305603, + "memory(GiB)": 302.58, + "step": 260120, + "train_speed(iter/s)": 0.123396 + }, + { + "acc": 0.74586129, + "epoch": 1.4548260950412935, + "grad_norm": 7.03125, + "learning_rate": 1.89751842289909e-06, + "loss": 0.98802376, + "memory(GiB)": 302.58, + "step": 260140, + "train_speed(iter/s)": 0.123401 + }, + { + "acc": 0.74694004, + "epoch": 1.4549379445142727, + "grad_norm": 6.9375, + "learning_rate": 1.8967933136691973e-06, + "loss": 0.99382238, + "memory(GiB)": 302.58, + "step": 260160, + "train_speed(iter/s)": 0.123405 + }, + { + "acc": 0.75133376, + "epoch": 1.455049793987252, + "grad_norm": 8.6875, + "learning_rate": 1.8960683105788136e-06, + "loss": 0.98708382, + "memory(GiB)": 302.58, + "step": 260180, + "train_speed(iter/s)": 0.12341 + }, + { + "acc": 0.7571475, + "epoch": 1.4551616434602312, + "grad_norm": 8.8125, + "learning_rate": 1.8953434136527354e-06, + "loss": 0.96300745, + "memory(GiB)": 302.58, + "step": 260200, + "train_speed(iter/s)": 0.123414 + }, + { + "acc": 0.73883729, + "epoch": 1.4552734929332105, + "grad_norm": 6.34375, + "learning_rate": 1.8946186229157576e-06, + "loss": 1.03398886, + "memory(GiB)": 302.58, + "step": 260220, + "train_speed(iter/s)": 0.123419 + }, + { + "acc": 0.73813639, + "epoch": 1.4553853424061898, + "grad_norm": 7.5, + "learning_rate": 1.8938939383926697e-06, + "loss": 1.0395566, + "memory(GiB)": 302.58, + "step": 260240, + "train_speed(iter/s)": 0.123423 + }, + { + "acc": 0.75258403, + "epoch": 1.455497191879169, + "grad_norm": 7.59375, + "learning_rate": 1.8931693601082563e-06, + "loss": 0.99701881, + "memory(GiB)": 302.58, + "step": 260260, + "train_speed(iter/s)": 0.123428 + }, + { + "acc": 0.73720851, + "epoch": 1.4556090413521483, + "grad_norm": 7.96875, + "learning_rate": 1.8924448880873042e-06, + "loss": 1.02731819, + "memory(GiB)": 302.58, + "step": 260280, + "train_speed(iter/s)": 0.123432 + }, + { + "acc": 0.76824236, + "epoch": 1.4557208908251276, + "grad_norm": 7.0, + "learning_rate": 1.8917205223545904e-06, + "loss": 0.89212427, + "memory(GiB)": 302.58, + "step": 260300, + "train_speed(iter/s)": 0.123436 + }, + { + "acc": 0.7504539, + "epoch": 1.4558327402981068, + "grad_norm": 5.59375, + "learning_rate": 1.890996262934891e-06, + "loss": 0.98839035, + "memory(GiB)": 302.58, + "step": 260320, + "train_speed(iter/s)": 0.123441 + }, + { + "acc": 0.75821676, + "epoch": 1.455944589771086, + "grad_norm": 9.0, + "learning_rate": 1.8902721098529781e-06, + "loss": 0.96471357, + "memory(GiB)": 302.58, + "step": 260340, + "train_speed(iter/s)": 0.123445 + }, + { + "acc": 0.74235144, + "epoch": 1.4560564392440654, + "grad_norm": 5.4375, + "learning_rate": 1.8895480631336193e-06, + "loss": 1.0026535, + "memory(GiB)": 302.58, + "step": 260360, + "train_speed(iter/s)": 0.12345 + }, + { + "acc": 0.73801222, + "epoch": 1.4561682887170446, + "grad_norm": 7.3125, + "learning_rate": 1.8888241228015802e-06, + "loss": 1.0367135, + "memory(GiB)": 302.58, + "step": 260380, + "train_speed(iter/s)": 0.123454 + }, + { + "acc": 0.73530593, + "epoch": 1.4562801381900239, + "grad_norm": 5.03125, + "learning_rate": 1.8881002888816209e-06, + "loss": 1.06300449, + "memory(GiB)": 302.58, + "step": 260400, + "train_speed(iter/s)": 0.123458 + }, + { + "acc": 0.74632211, + "epoch": 1.4563919876630032, + "grad_norm": 11.5, + "learning_rate": 1.8873765613984995e-06, + "loss": 1.00372715, + "memory(GiB)": 302.58, + "step": 260420, + "train_speed(iter/s)": 0.123463 + }, + { + "acc": 0.75126786, + "epoch": 1.4565038371359824, + "grad_norm": 7.40625, + "learning_rate": 1.8866529403769695e-06, + "loss": 0.97067261, + "memory(GiB)": 302.58, + "step": 260440, + "train_speed(iter/s)": 0.123468 + }, + { + "acc": 0.74455462, + "epoch": 1.4566156866089617, + "grad_norm": 6.875, + "learning_rate": 1.885929425841781e-06, + "loss": 1.00401039, + "memory(GiB)": 302.58, + "step": 260460, + "train_speed(iter/s)": 0.123472 + }, + { + "acc": 0.74188542, + "epoch": 1.456727536081941, + "grad_norm": 6.59375, + "learning_rate": 1.8852060178176806e-06, + "loss": 1.03472595, + "memory(GiB)": 302.58, + "step": 260480, + "train_speed(iter/s)": 0.123476 + }, + { + "acc": 0.76514091, + "epoch": 1.4568393855549202, + "grad_norm": 6.8125, + "learning_rate": 1.8844827163294116e-06, + "loss": 0.93290472, + "memory(GiB)": 302.58, + "step": 260500, + "train_speed(iter/s)": 0.123481 + }, + { + "acc": 0.75526724, + "epoch": 1.4569512350278995, + "grad_norm": 10.375, + "learning_rate": 1.8837595214017125e-06, + "loss": 0.94277954, + "memory(GiB)": 302.58, + "step": 260520, + "train_speed(iter/s)": 0.123485 + }, + { + "acc": 0.76414018, + "epoch": 1.4570630845008787, + "grad_norm": 6.90625, + "learning_rate": 1.883036433059317e-06, + "loss": 0.91132479, + "memory(GiB)": 302.58, + "step": 260540, + "train_speed(iter/s)": 0.12349 + }, + { + "acc": 0.74984775, + "epoch": 1.457174933973858, + "grad_norm": 5.34375, + "learning_rate": 1.8823134513269608e-06, + "loss": 0.98006945, + "memory(GiB)": 302.58, + "step": 260560, + "train_speed(iter/s)": 0.123494 + }, + { + "acc": 0.75584922, + "epoch": 1.4572867834468373, + "grad_norm": 7.0, + "learning_rate": 1.8815905762293706e-06, + "loss": 0.94446907, + "memory(GiB)": 302.58, + "step": 260580, + "train_speed(iter/s)": 0.123499 + }, + { + "acc": 0.72817559, + "epoch": 1.4573986329198165, + "grad_norm": 6.84375, + "learning_rate": 1.8808678077912711e-06, + "loss": 1.08938665, + "memory(GiB)": 302.58, + "step": 260600, + "train_speed(iter/s)": 0.123504 + }, + { + "acc": 0.74641566, + "epoch": 1.4575104823927958, + "grad_norm": 8.875, + "learning_rate": 1.8801451460373832e-06, + "loss": 1.00026665, + "memory(GiB)": 302.58, + "step": 260620, + "train_speed(iter/s)": 0.123508 + }, + { + "acc": 0.7542089, + "epoch": 1.457622331865775, + "grad_norm": 7.75, + "learning_rate": 1.879422590992424e-06, + "loss": 0.96037512, + "memory(GiB)": 302.58, + "step": 260640, + "train_speed(iter/s)": 0.123513 + }, + { + "acc": 0.73472219, + "epoch": 1.4577341813387543, + "grad_norm": 8.125, + "learning_rate": 1.8787001426811074e-06, + "loss": 1.06023102, + "memory(GiB)": 302.58, + "step": 260660, + "train_speed(iter/s)": 0.123517 + }, + { + "acc": 0.7490346, + "epoch": 1.4578460308117336, + "grad_norm": 10.4375, + "learning_rate": 1.877977801128143e-06, + "loss": 0.9927309, + "memory(GiB)": 302.58, + "step": 260680, + "train_speed(iter/s)": 0.123522 + }, + { + "acc": 0.74611979, + "epoch": 1.4579578802847128, + "grad_norm": 6.28125, + "learning_rate": 1.8772555663582375e-06, + "loss": 0.99791355, + "memory(GiB)": 302.58, + "step": 260700, + "train_speed(iter/s)": 0.123526 + }, + { + "acc": 0.76379247, + "epoch": 1.4580697297576921, + "grad_norm": 8.375, + "learning_rate": 1.8765334383960937e-06, + "loss": 0.94094181, + "memory(GiB)": 302.58, + "step": 260720, + "train_speed(iter/s)": 0.123531 + }, + { + "acc": 0.734798, + "epoch": 1.4581815792306714, + "grad_norm": 5.0, + "learning_rate": 1.8758114172664104e-06, + "loss": 1.05397205, + "memory(GiB)": 302.58, + "step": 260740, + "train_speed(iter/s)": 0.123535 + }, + { + "acc": 0.75367131, + "epoch": 1.4582934287036506, + "grad_norm": 9.125, + "learning_rate": 1.8750895029938831e-06, + "loss": 0.94550896, + "memory(GiB)": 302.58, + "step": 260760, + "train_speed(iter/s)": 0.12354 + }, + { + "acc": 0.75768194, + "epoch": 1.45840527817663, + "grad_norm": 5.6875, + "learning_rate": 1.8743676956032042e-06, + "loss": 0.94601288, + "memory(GiB)": 302.58, + "step": 260780, + "train_speed(iter/s)": 0.123545 + }, + { + "acc": 0.76449294, + "epoch": 1.4585171276496092, + "grad_norm": 8.25, + "learning_rate": 1.8736459951190588e-06, + "loss": 0.91041059, + "memory(GiB)": 302.58, + "step": 260800, + "train_speed(iter/s)": 0.123549 + }, + { + "acc": 0.74068952, + "epoch": 1.4586289771225884, + "grad_norm": 5.09375, + "learning_rate": 1.8729244015661359e-06, + "loss": 1.02450905, + "memory(GiB)": 302.58, + "step": 260820, + "train_speed(iter/s)": 0.123553 + }, + { + "acc": 0.73776698, + "epoch": 1.4587408265955677, + "grad_norm": 7.25, + "learning_rate": 1.872202914969114e-06, + "loss": 1.03690948, + "memory(GiB)": 302.58, + "step": 260840, + "train_speed(iter/s)": 0.123558 + }, + { + "acc": 0.7450593, + "epoch": 1.458852676068547, + "grad_norm": 5.4375, + "learning_rate": 1.8714815353526688e-06, + "loss": 0.99393024, + "memory(GiB)": 302.58, + "step": 260860, + "train_speed(iter/s)": 0.123562 + }, + { + "acc": 0.76018462, + "epoch": 1.4589645255415262, + "grad_norm": 6.5625, + "learning_rate": 1.8707602627414768e-06, + "loss": 0.93979197, + "memory(GiB)": 302.58, + "step": 260880, + "train_speed(iter/s)": 0.123567 + }, + { + "acc": 0.76625633, + "epoch": 1.4590763750145055, + "grad_norm": 8.1875, + "learning_rate": 1.8700390971602067e-06, + "loss": 0.91089725, + "memory(GiB)": 302.58, + "step": 260900, + "train_speed(iter/s)": 0.123572 + }, + { + "acc": 0.74141698, + "epoch": 1.4591882244874848, + "grad_norm": 6.40625, + "learning_rate": 1.8693180386335241e-06, + "loss": 1.01059761, + "memory(GiB)": 302.58, + "step": 260920, + "train_speed(iter/s)": 0.123576 + }, + { + "acc": 0.75285044, + "epoch": 1.459300073960464, + "grad_norm": 7.125, + "learning_rate": 1.8685970871860913e-06, + "loss": 0.97118902, + "memory(GiB)": 302.58, + "step": 260940, + "train_speed(iter/s)": 0.12358 + }, + { + "acc": 0.75147295, + "epoch": 1.4594119234334433, + "grad_norm": 7.1875, + "learning_rate": 1.8678762428425684e-06, + "loss": 0.98647146, + "memory(GiB)": 302.58, + "step": 260960, + "train_speed(iter/s)": 0.123585 + }, + { + "acc": 0.75882506, + "epoch": 1.4595237729064225, + "grad_norm": 7.5625, + "learning_rate": 1.8671555056276092e-06, + "loss": 0.93868065, + "memory(GiB)": 302.58, + "step": 260980, + "train_speed(iter/s)": 0.123589 + }, + { + "acc": 0.74628372, + "epoch": 1.4596356223794018, + "grad_norm": 6.71875, + "learning_rate": 1.8664348755658657e-06, + "loss": 0.99556522, + "memory(GiB)": 302.58, + "step": 261000, + "train_speed(iter/s)": 0.123594 + }, + { + "acc": 0.72963719, + "epoch": 1.459747471852381, + "grad_norm": 5.125, + "learning_rate": 1.865714352681986e-06, + "loss": 1.07814217, + "memory(GiB)": 302.58, + "step": 261020, + "train_speed(iter/s)": 0.123598 + }, + { + "acc": 0.74750896, + "epoch": 1.4598593213253603, + "grad_norm": 6.625, + "learning_rate": 1.8649939370006137e-06, + "loss": 0.97093515, + "memory(GiB)": 302.58, + "step": 261040, + "train_speed(iter/s)": 0.123602 + }, + { + "acc": 0.75706167, + "epoch": 1.4599711707983396, + "grad_norm": 7.625, + "learning_rate": 1.8642736285463898e-06, + "loss": 0.96140985, + "memory(GiB)": 302.58, + "step": 261060, + "train_speed(iter/s)": 0.123607 + }, + { + "acc": 0.74978495, + "epoch": 1.4600830202713189, + "grad_norm": 6.40625, + "learning_rate": 1.8635534273439504e-06, + "loss": 0.98468666, + "memory(GiB)": 302.58, + "step": 261080, + "train_speed(iter/s)": 0.123611 + }, + { + "acc": 0.75140243, + "epoch": 1.4601948697442981, + "grad_norm": 7.65625, + "learning_rate": 1.86283333341793e-06, + "loss": 0.98979397, + "memory(GiB)": 302.58, + "step": 261100, + "train_speed(iter/s)": 0.123616 + }, + { + "acc": 0.74098005, + "epoch": 1.4603067192172774, + "grad_norm": 5.625, + "learning_rate": 1.8621133467929565e-06, + "loss": 1.01070528, + "memory(GiB)": 302.58, + "step": 261120, + "train_speed(iter/s)": 0.12362 + }, + { + "acc": 0.76734557, + "epoch": 1.4604185686902567, + "grad_norm": 5.46875, + "learning_rate": 1.861393467493655e-06, + "loss": 0.89318037, + "memory(GiB)": 302.58, + "step": 261140, + "train_speed(iter/s)": 0.123625 + }, + { + "acc": 0.76353712, + "epoch": 1.460530418163236, + "grad_norm": 7.59375, + "learning_rate": 1.8606736955446513e-06, + "loss": 0.91479654, + "memory(GiB)": 302.58, + "step": 261160, + "train_speed(iter/s)": 0.123629 + }, + { + "acc": 0.75950813, + "epoch": 1.4606422676362152, + "grad_norm": 5.3125, + "learning_rate": 1.8599540309705616e-06, + "loss": 0.94759226, + "memory(GiB)": 302.58, + "step": 261180, + "train_speed(iter/s)": 0.123634 + }, + { + "acc": 0.73811932, + "epoch": 1.4607541171091944, + "grad_norm": 6.75, + "learning_rate": 1.8592344737960006e-06, + "loss": 1.04411688, + "memory(GiB)": 302.58, + "step": 261200, + "train_speed(iter/s)": 0.123638 + }, + { + "acc": 0.74249969, + "epoch": 1.4608659665821737, + "grad_norm": 6.75, + "learning_rate": 1.8585150240455796e-06, + "loss": 1.00262184, + "memory(GiB)": 302.58, + "step": 261220, + "train_speed(iter/s)": 0.123643 + }, + { + "acc": 0.76781802, + "epoch": 1.460977816055153, + "grad_norm": 8.6875, + "learning_rate": 1.8577956817439064e-06, + "loss": 0.90126848, + "memory(GiB)": 302.58, + "step": 261240, + "train_speed(iter/s)": 0.123648 + }, + { + "acc": 0.74956899, + "epoch": 1.4610896655281322, + "grad_norm": 5.03125, + "learning_rate": 1.8570764469155845e-06, + "loss": 1.00309153, + "memory(GiB)": 302.58, + "step": 261260, + "train_speed(iter/s)": 0.123652 + }, + { + "acc": 0.75398121, + "epoch": 1.4612015150011115, + "grad_norm": 6.40625, + "learning_rate": 1.8563573195852142e-06, + "loss": 0.98210182, + "memory(GiB)": 302.58, + "step": 261280, + "train_speed(iter/s)": 0.123656 + }, + { + "acc": 0.74031925, + "epoch": 1.4613133644740908, + "grad_norm": 5.875, + "learning_rate": 1.8556382997773914e-06, + "loss": 1.00196247, + "memory(GiB)": 302.58, + "step": 261300, + "train_speed(iter/s)": 0.123661 + }, + { + "acc": 0.75371304, + "epoch": 1.46142521394707, + "grad_norm": 7.25, + "learning_rate": 1.85491938751671e-06, + "loss": 0.9734767, + "memory(GiB)": 302.58, + "step": 261320, + "train_speed(iter/s)": 0.123666 + }, + { + "acc": 0.73613558, + "epoch": 1.4615370634200493, + "grad_norm": 10.625, + "learning_rate": 1.8542005828277576e-06, + "loss": 1.01316195, + "memory(GiB)": 302.58, + "step": 261340, + "train_speed(iter/s)": 0.12367 + }, + { + "acc": 0.76228986, + "epoch": 1.4616489128930286, + "grad_norm": 6.5625, + "learning_rate": 1.8534818857351206e-06, + "loss": 0.94846096, + "memory(GiB)": 302.58, + "step": 261360, + "train_speed(iter/s)": 0.123675 + }, + { + "acc": 0.74551597, + "epoch": 1.4617607623660078, + "grad_norm": 6.90625, + "learning_rate": 1.8527632962633802e-06, + "loss": 0.99968481, + "memory(GiB)": 302.58, + "step": 261380, + "train_speed(iter/s)": 0.123679 + }, + { + "acc": 0.7482275, + "epoch": 1.461872611838987, + "grad_norm": 8.1875, + "learning_rate": 1.8520448144371151e-06, + "loss": 1.00245857, + "memory(GiB)": 302.58, + "step": 261400, + "train_speed(iter/s)": 0.123684 + }, + { + "acc": 0.77280369, + "epoch": 1.4619844613119664, + "grad_norm": 7.84375, + "learning_rate": 1.8513264402808972e-06, + "loss": 0.89318657, + "memory(GiB)": 302.58, + "step": 261420, + "train_speed(iter/s)": 0.123689 + }, + { + "acc": 0.73328943, + "epoch": 1.4620963107849456, + "grad_norm": 7.25, + "learning_rate": 1.850608173819301e-06, + "loss": 1.04793987, + "memory(GiB)": 302.58, + "step": 261440, + "train_speed(iter/s)": 0.123693 + }, + { + "acc": 0.74404778, + "epoch": 1.4622081602579249, + "grad_norm": 8.0625, + "learning_rate": 1.8498900150768917e-06, + "loss": 1.01834583, + "memory(GiB)": 302.58, + "step": 261460, + "train_speed(iter/s)": 0.123697 + }, + { + "acc": 0.7526897, + "epoch": 1.4623200097309041, + "grad_norm": 7.3125, + "learning_rate": 1.849171964078233e-06, + "loss": 0.97977972, + "memory(GiB)": 302.58, + "step": 261480, + "train_speed(iter/s)": 0.123702 + }, + { + "acc": 0.74328246, + "epoch": 1.4624318592038834, + "grad_norm": 6.625, + "learning_rate": 1.8484540208478834e-06, + "loss": 1.01329756, + "memory(GiB)": 302.58, + "step": 261500, + "train_speed(iter/s)": 0.123706 + }, + { + "acc": 0.76089163, + "epoch": 1.4625437086768627, + "grad_norm": 11.3125, + "learning_rate": 1.8477361854104004e-06, + "loss": 0.93776302, + "memory(GiB)": 302.58, + "step": 261520, + "train_speed(iter/s)": 0.123711 + }, + { + "acc": 0.7497767, + "epoch": 1.462655558149842, + "grad_norm": 7.40625, + "learning_rate": 1.8470184577903349e-06, + "loss": 0.98896475, + "memory(GiB)": 302.58, + "step": 261540, + "train_speed(iter/s)": 0.123715 + }, + { + "acc": 0.75661783, + "epoch": 1.4627674076228212, + "grad_norm": 9.75, + "learning_rate": 1.8463008380122361e-06, + "loss": 0.95201836, + "memory(GiB)": 302.58, + "step": 261560, + "train_speed(iter/s)": 0.12372 + }, + { + "acc": 0.76984882, + "epoch": 1.4628792570958005, + "grad_norm": 7.28125, + "learning_rate": 1.8455833261006483e-06, + "loss": 0.89659004, + "memory(GiB)": 302.58, + "step": 261580, + "train_speed(iter/s)": 0.123724 + }, + { + "acc": 0.76301918, + "epoch": 1.4629911065687797, + "grad_norm": 7.03125, + "learning_rate": 1.844865922080114e-06, + "loss": 0.90679178, + "memory(GiB)": 302.58, + "step": 261600, + "train_speed(iter/s)": 0.123729 + }, + { + "acc": 0.75802145, + "epoch": 1.463102956041759, + "grad_norm": 7.125, + "learning_rate": 1.844148625975169e-06, + "loss": 0.94676123, + "memory(GiB)": 302.58, + "step": 261620, + "train_speed(iter/s)": 0.123733 + }, + { + "acc": 0.75062366, + "epoch": 1.4632148055147383, + "grad_norm": 8.8125, + "learning_rate": 1.8434314378103486e-06, + "loss": 0.98503256, + "memory(GiB)": 302.58, + "step": 261640, + "train_speed(iter/s)": 0.123737 + }, + { + "acc": 0.76142898, + "epoch": 1.4633266549877175, + "grad_norm": 7.46875, + "learning_rate": 1.8427143576101818e-06, + "loss": 0.91314306, + "memory(GiB)": 302.58, + "step": 261660, + "train_speed(iter/s)": 0.123742 + }, + { + "acc": 0.75494881, + "epoch": 1.4634385044606968, + "grad_norm": 8.5, + "learning_rate": 1.841997385399194e-06, + "loss": 0.97147865, + "memory(GiB)": 302.58, + "step": 261680, + "train_speed(iter/s)": 0.123746 + }, + { + "acc": 0.76013088, + "epoch": 1.463550353933676, + "grad_norm": 9.8125, + "learning_rate": 1.8412805212019109e-06, + "loss": 0.94319744, + "memory(GiB)": 302.58, + "step": 261700, + "train_speed(iter/s)": 0.123751 + }, + { + "acc": 0.7469687, + "epoch": 1.4636622034066553, + "grad_norm": 6.84375, + "learning_rate": 1.8405637650428499e-06, + "loss": 0.9974719, + "memory(GiB)": 302.58, + "step": 261720, + "train_speed(iter/s)": 0.123755 + }, + { + "acc": 0.76476121, + "epoch": 1.4637740528796346, + "grad_norm": 5.3125, + "learning_rate": 1.8398471169465265e-06, + "loss": 0.91202469, + "memory(GiB)": 302.58, + "step": 261740, + "train_speed(iter/s)": 0.123759 + }, + { + "acc": 0.73748946, + "epoch": 1.4638859023526138, + "grad_norm": 6.34375, + "learning_rate": 1.8391305769374523e-06, + "loss": 1.05072479, + "memory(GiB)": 302.58, + "step": 261760, + "train_speed(iter/s)": 0.123764 + }, + { + "acc": 0.74806852, + "epoch": 1.463997751825593, + "grad_norm": 6.78125, + "learning_rate": 1.8384141450401354e-06, + "loss": 1.00123854, + "memory(GiB)": 302.58, + "step": 261780, + "train_speed(iter/s)": 0.123768 + }, + { + "acc": 0.743191, + "epoch": 1.4641096012985724, + "grad_norm": 7.25, + "learning_rate": 1.8376978212790796e-06, + "loss": 1.00150127, + "memory(GiB)": 302.58, + "step": 261800, + "train_speed(iter/s)": 0.123773 + }, + { + "acc": 0.74448032, + "epoch": 1.4642214507715516, + "grad_norm": 8.625, + "learning_rate": 1.836981605678786e-06, + "loss": 1.02515192, + "memory(GiB)": 302.58, + "step": 261820, + "train_speed(iter/s)": 0.123777 + }, + { + "acc": 0.74758191, + "epoch": 1.464333300244531, + "grad_norm": 5.78125, + "learning_rate": 1.8362654982637511e-06, + "loss": 0.96436167, + "memory(GiB)": 302.58, + "step": 261840, + "train_speed(iter/s)": 0.123782 + }, + { + "acc": 0.74254532, + "epoch": 1.4644451497175102, + "grad_norm": 7.03125, + "learning_rate": 1.8355494990584683e-06, + "loss": 1.02719412, + "memory(GiB)": 302.58, + "step": 261860, + "train_speed(iter/s)": 0.123786 + }, + { + "acc": 0.7534616, + "epoch": 1.4645569991904894, + "grad_norm": 8.875, + "learning_rate": 1.8348336080874262e-06, + "loss": 0.96132107, + "memory(GiB)": 302.58, + "step": 261880, + "train_speed(iter/s)": 0.12379 + }, + { + "acc": 0.77061787, + "epoch": 1.4646688486634687, + "grad_norm": 6.3125, + "learning_rate": 1.8341178253751112e-06, + "loss": 0.90611687, + "memory(GiB)": 302.58, + "step": 261900, + "train_speed(iter/s)": 0.123794 + }, + { + "acc": 0.76762443, + "epoch": 1.464780698136448, + "grad_norm": 6.125, + "learning_rate": 1.8334021509460058e-06, + "loss": 0.91862202, + "memory(GiB)": 302.58, + "step": 261920, + "train_speed(iter/s)": 0.123799 + }, + { + "acc": 0.75925183, + "epoch": 1.4648925476094272, + "grad_norm": 6.875, + "learning_rate": 1.832686584824588e-06, + "loss": 0.94339256, + "memory(GiB)": 302.58, + "step": 261940, + "train_speed(iter/s)": 0.123803 + }, + { + "acc": 0.75812445, + "epoch": 1.4650043970824065, + "grad_norm": 6.75, + "learning_rate": 1.8319711270353297e-06, + "loss": 0.95642633, + "memory(GiB)": 302.58, + "step": 261960, + "train_speed(iter/s)": 0.123808 + }, + { + "acc": 0.76125116, + "epoch": 1.4651162465553857, + "grad_norm": 5.90625, + "learning_rate": 1.8312557776027069e-06, + "loss": 0.94259663, + "memory(GiB)": 302.58, + "step": 261980, + "train_speed(iter/s)": 0.123812 + }, + { + "acc": 0.74136348, + "epoch": 1.465228096028365, + "grad_norm": 6.125, + "learning_rate": 1.8305405365511842e-06, + "loss": 1.01650639, + "memory(GiB)": 302.58, + "step": 262000, + "train_speed(iter/s)": 0.123817 + }, + { + "epoch": 1.465228096028365, + "eval_acc": 0.7069029501070184, + "eval_loss": 1.011989951133728, + "eval_runtime": 7594.18, + "eval_samples_per_second": 9.913, + "eval_steps_per_second": 9.913, + "step": 262000 + }, + { + "acc": 0.7376071, + "epoch": 1.4653399455013443, + "grad_norm": 6.75, + "learning_rate": 1.829825403905225e-06, + "loss": 1.04828272, + "memory(GiB)": 302.58, + "step": 262020, + "train_speed(iter/s)": 0.12337 + }, + { + "acc": 0.7635685, + "epoch": 1.4654517949743235, + "grad_norm": 7.21875, + "learning_rate": 1.8291103796892894e-06, + "loss": 0.90696316, + "memory(GiB)": 302.58, + "step": 262040, + "train_speed(iter/s)": 0.123375 + }, + { + "acc": 0.7468431, + "epoch": 1.4655636444473028, + "grad_norm": 6.9375, + "learning_rate": 1.8283954639278334e-06, + "loss": 1.0158905, + "memory(GiB)": 302.58, + "step": 262060, + "train_speed(iter/s)": 0.123379 + }, + { + "acc": 0.75609016, + "epoch": 1.465675493920282, + "grad_norm": 5.46875, + "learning_rate": 1.8276806566453093e-06, + "loss": 0.96305914, + "memory(GiB)": 302.58, + "step": 262080, + "train_speed(iter/s)": 0.123384 + }, + { + "acc": 0.76398878, + "epoch": 1.4657873433932613, + "grad_norm": 7.78125, + "learning_rate": 1.8269659578661648e-06, + "loss": 0.91847048, + "memory(GiB)": 302.58, + "step": 262100, + "train_speed(iter/s)": 0.123388 + }, + { + "acc": 0.7578599, + "epoch": 1.4658991928662406, + "grad_norm": 6.9375, + "learning_rate": 1.8262513676148469e-06, + "loss": 0.94138985, + "memory(GiB)": 302.58, + "step": 262120, + "train_speed(iter/s)": 0.123393 + }, + { + "acc": 0.75781555, + "epoch": 1.4660110423392199, + "grad_norm": 8.3125, + "learning_rate": 1.8255368859157964e-06, + "loss": 0.96747761, + "memory(GiB)": 302.58, + "step": 262140, + "train_speed(iter/s)": 0.123397 + }, + { + "acc": 0.76318817, + "epoch": 1.4661228918121991, + "grad_norm": 8.25, + "learning_rate": 1.8248225127934499e-06, + "loss": 0.94058485, + "memory(GiB)": 302.58, + "step": 262160, + "train_speed(iter/s)": 0.123402 + }, + { + "acc": 0.74758286, + "epoch": 1.4662347412851784, + "grad_norm": 10.75, + "learning_rate": 1.8241082482722422e-06, + "loss": 0.99909258, + "memory(GiB)": 302.58, + "step": 262180, + "train_speed(iter/s)": 0.123407 + }, + { + "acc": 0.75012913, + "epoch": 1.4663465907581577, + "grad_norm": 5.90625, + "learning_rate": 1.8233940923766024e-06, + "loss": 0.99090233, + "memory(GiB)": 302.58, + "step": 262200, + "train_speed(iter/s)": 0.123411 + }, + { + "acc": 0.73161297, + "epoch": 1.466458440231137, + "grad_norm": 6.53125, + "learning_rate": 1.822680045130958e-06, + "loss": 1.07149315, + "memory(GiB)": 302.58, + "step": 262220, + "train_speed(iter/s)": 0.123416 + }, + { + "acc": 0.76111259, + "epoch": 1.4665702897041162, + "grad_norm": 7.21875, + "learning_rate": 1.8219661065597306e-06, + "loss": 0.94039183, + "memory(GiB)": 302.58, + "step": 262240, + "train_speed(iter/s)": 0.12342 + }, + { + "acc": 0.74820313, + "epoch": 1.4666821391770954, + "grad_norm": 5.375, + "learning_rate": 1.8212522766873396e-06, + "loss": 0.99261723, + "memory(GiB)": 302.58, + "step": 262260, + "train_speed(iter/s)": 0.123425 + }, + { + "acc": 0.76226177, + "epoch": 1.4667939886500747, + "grad_norm": 6.34375, + "learning_rate": 1.8205385555382005e-06, + "loss": 0.92686472, + "memory(GiB)": 302.58, + "step": 262280, + "train_speed(iter/s)": 0.123429 + }, + { + "acc": 0.76595597, + "epoch": 1.466905838123054, + "grad_norm": 5.28125, + "learning_rate": 1.8198249431367226e-06, + "loss": 0.89991236, + "memory(GiB)": 302.58, + "step": 262300, + "train_speed(iter/s)": 0.123434 + }, + { + "acc": 0.75898037, + "epoch": 1.4670176875960332, + "grad_norm": 10.0, + "learning_rate": 1.8191114395073179e-06, + "loss": 0.93859377, + "memory(GiB)": 302.58, + "step": 262320, + "train_speed(iter/s)": 0.123438 + }, + { + "acc": 0.7817318, + "epoch": 1.4671295370690125, + "grad_norm": 7.28125, + "learning_rate": 1.8183980446743877e-06, + "loss": 0.84854116, + "memory(GiB)": 302.58, + "step": 262340, + "train_speed(iter/s)": 0.123443 + }, + { + "acc": 0.74544477, + "epoch": 1.4672413865419918, + "grad_norm": 7.40625, + "learning_rate": 1.8176847586623331e-06, + "loss": 1.00416927, + "memory(GiB)": 302.58, + "step": 262360, + "train_speed(iter/s)": 0.123447 + }, + { + "acc": 0.76834369, + "epoch": 1.467353236014971, + "grad_norm": 4.21875, + "learning_rate": 1.8169715814955508e-06, + "loss": 0.89616375, + "memory(GiB)": 302.58, + "step": 262380, + "train_speed(iter/s)": 0.123452 + }, + { + "acc": 0.75241551, + "epoch": 1.4674650854879503, + "grad_norm": 7.5, + "learning_rate": 1.816258513198434e-06, + "loss": 0.96019497, + "memory(GiB)": 302.58, + "step": 262400, + "train_speed(iter/s)": 0.123456 + }, + { + "acc": 0.74286623, + "epoch": 1.4675769349609296, + "grad_norm": 11.375, + "learning_rate": 1.8155455537953703e-06, + "loss": 1.02257357, + "memory(GiB)": 302.58, + "step": 262420, + "train_speed(iter/s)": 0.123461 + }, + { + "acc": 0.75441041, + "epoch": 1.4676887844339088, + "grad_norm": 4.875, + "learning_rate": 1.814832703310747e-06, + "loss": 0.94692936, + "memory(GiB)": 302.58, + "step": 262440, + "train_speed(iter/s)": 0.123465 + }, + { + "acc": 0.76659355, + "epoch": 1.467800633906888, + "grad_norm": 8.5, + "learning_rate": 1.8141199617689454e-06, + "loss": 0.92475176, + "memory(GiB)": 302.58, + "step": 262460, + "train_speed(iter/s)": 0.12347 + }, + { + "acc": 0.7637373, + "epoch": 1.4679124833798673, + "grad_norm": 10.25, + "learning_rate": 1.8134073291943427e-06, + "loss": 0.92658958, + "memory(GiB)": 302.58, + "step": 262480, + "train_speed(iter/s)": 0.123474 + }, + { + "acc": 0.76438475, + "epoch": 1.4680243328528466, + "grad_norm": 7.375, + "learning_rate": 1.8126948056113142e-06, + "loss": 0.91141119, + "memory(GiB)": 302.58, + "step": 262500, + "train_speed(iter/s)": 0.123479 + }, + { + "acc": 0.76154585, + "epoch": 1.4681361823258259, + "grad_norm": 8.375, + "learning_rate": 1.8119823910442297e-06, + "loss": 0.93286371, + "memory(GiB)": 302.58, + "step": 262520, + "train_speed(iter/s)": 0.123483 + }, + { + "acc": 0.76974459, + "epoch": 1.4682480317988051, + "grad_norm": 7.125, + "learning_rate": 1.8112700855174564e-06, + "loss": 0.90435009, + "memory(GiB)": 302.58, + "step": 262540, + "train_speed(iter/s)": 0.123488 + }, + { + "acc": 0.76502771, + "epoch": 1.4683598812717844, + "grad_norm": 7.8125, + "learning_rate": 1.8105578890553554e-06, + "loss": 0.91874313, + "memory(GiB)": 302.58, + "step": 262560, + "train_speed(iter/s)": 0.123492 + }, + { + "acc": 0.74969563, + "epoch": 1.4684717307447637, + "grad_norm": 6.09375, + "learning_rate": 1.8098458016822902e-06, + "loss": 0.98182421, + "memory(GiB)": 302.58, + "step": 262580, + "train_speed(iter/s)": 0.123497 + }, + { + "acc": 0.73827791, + "epoch": 1.468583580217743, + "grad_norm": 5.90625, + "learning_rate": 1.8091338234226141e-06, + "loss": 1.05835524, + "memory(GiB)": 302.58, + "step": 262600, + "train_speed(iter/s)": 0.1235 + }, + { + "acc": 0.76408181, + "epoch": 1.4686954296907222, + "grad_norm": 6.375, + "learning_rate": 1.8084219543006793e-06, + "loss": 0.93001509, + "memory(GiB)": 302.58, + "step": 262620, + "train_speed(iter/s)": 0.123505 + }, + { + "acc": 0.73278561, + "epoch": 1.4688072791637015, + "grad_norm": 9.0625, + "learning_rate": 1.8077101943408338e-06, + "loss": 1.06717339, + "memory(GiB)": 302.58, + "step": 262640, + "train_speed(iter/s)": 0.123509 + }, + { + "acc": 0.75869989, + "epoch": 1.4689191286366807, + "grad_norm": 7.96875, + "learning_rate": 1.806998543567422e-06, + "loss": 0.93405333, + "memory(GiB)": 302.58, + "step": 262660, + "train_speed(iter/s)": 0.123514 + }, + { + "acc": 0.73628168, + "epoch": 1.46903097810966, + "grad_norm": 7.71875, + "learning_rate": 1.8062870020047851e-06, + "loss": 1.05419407, + "memory(GiB)": 302.58, + "step": 262680, + "train_speed(iter/s)": 0.123518 + }, + { + "acc": 0.75647717, + "epoch": 1.4691428275826393, + "grad_norm": 6.5, + "learning_rate": 1.8055755696772593e-06, + "loss": 0.97078648, + "memory(GiB)": 302.58, + "step": 262700, + "train_speed(iter/s)": 0.123522 + }, + { + "acc": 0.75601821, + "epoch": 1.4692546770556185, + "grad_norm": 6.0625, + "learning_rate": 1.8048642466091782e-06, + "loss": 0.94164906, + "memory(GiB)": 302.58, + "step": 262720, + "train_speed(iter/s)": 0.123527 + }, + { + "acc": 0.75810757, + "epoch": 1.4693665265285978, + "grad_norm": 8.25, + "learning_rate": 1.804153032824872e-06, + "loss": 0.97596445, + "memory(GiB)": 302.58, + "step": 262740, + "train_speed(iter/s)": 0.123531 + }, + { + "acc": 0.74919024, + "epoch": 1.469478376001577, + "grad_norm": 4.0625, + "learning_rate": 1.8034419283486655e-06, + "loss": 0.97195749, + "memory(GiB)": 302.58, + "step": 262760, + "train_speed(iter/s)": 0.123535 + }, + { + "acc": 0.75614157, + "epoch": 1.4695902254745563, + "grad_norm": 7.5625, + "learning_rate": 1.802730933204881e-06, + "loss": 0.95966673, + "memory(GiB)": 302.58, + "step": 262780, + "train_speed(iter/s)": 0.12354 + }, + { + "acc": 0.75168352, + "epoch": 1.4697020749475356, + "grad_norm": 8.6875, + "learning_rate": 1.8020200474178367e-06, + "loss": 0.97533436, + "memory(GiB)": 302.58, + "step": 262800, + "train_speed(iter/s)": 0.123544 + }, + { + "acc": 0.74104533, + "epoch": 1.4698139244205148, + "grad_norm": 5.34375, + "learning_rate": 1.8013092710118473e-06, + "loss": 1.02517471, + "memory(GiB)": 302.58, + "step": 262820, + "train_speed(iter/s)": 0.123548 + }, + { + "acc": 0.74159107, + "epoch": 1.469925773893494, + "grad_norm": 6.3125, + "learning_rate": 1.8005986040112223e-06, + "loss": 1.02360516, + "memory(GiB)": 302.58, + "step": 262840, + "train_speed(iter/s)": 0.123553 + }, + { + "acc": 0.74271908, + "epoch": 1.4700376233664734, + "grad_norm": 5.25, + "learning_rate": 1.7998880464402712e-06, + "loss": 1.0290823, + "memory(GiB)": 302.58, + "step": 262860, + "train_speed(iter/s)": 0.123557 + }, + { + "acc": 0.74685645, + "epoch": 1.4701494728394526, + "grad_norm": 8.8125, + "learning_rate": 1.7991775983232967e-06, + "loss": 0.97790594, + "memory(GiB)": 302.58, + "step": 262880, + "train_speed(iter/s)": 0.123561 + }, + { + "acc": 0.74649673, + "epoch": 1.470261322312432, + "grad_norm": 6.625, + "learning_rate": 1.7984672596845976e-06, + "loss": 1.01998758, + "memory(GiB)": 302.58, + "step": 262900, + "train_speed(iter/s)": 0.123566 + }, + { + "acc": 0.7568316, + "epoch": 1.4703731717854112, + "grad_norm": 5.625, + "learning_rate": 1.7977570305484703e-06, + "loss": 0.97757282, + "memory(GiB)": 302.58, + "step": 262920, + "train_speed(iter/s)": 0.12357 + }, + { + "acc": 0.75999794, + "epoch": 1.4704850212583904, + "grad_norm": 7.8125, + "learning_rate": 1.7970469109392063e-06, + "loss": 0.9444705, + "memory(GiB)": 302.58, + "step": 262940, + "train_speed(iter/s)": 0.123574 + }, + { + "acc": 0.75890074, + "epoch": 1.4705968707313697, + "grad_norm": 6.96875, + "learning_rate": 1.7963369008810943e-06, + "loss": 0.94777794, + "memory(GiB)": 302.58, + "step": 262960, + "train_speed(iter/s)": 0.123579 + }, + { + "acc": 0.76071568, + "epoch": 1.470708720204349, + "grad_norm": 6.625, + "learning_rate": 1.7956270003984188e-06, + "loss": 0.93095856, + "memory(GiB)": 302.58, + "step": 262980, + "train_speed(iter/s)": 0.123584 + }, + { + "acc": 0.76113648, + "epoch": 1.4708205696773282, + "grad_norm": 5.8125, + "learning_rate": 1.794917209515461e-06, + "loss": 0.93056879, + "memory(GiB)": 302.58, + "step": 263000, + "train_speed(iter/s)": 0.123588 + }, + { + "acc": 0.73277068, + "epoch": 1.4709324191503075, + "grad_norm": 7.65625, + "learning_rate": 1.7942075282564976e-06, + "loss": 1.0588933, + "memory(GiB)": 302.58, + "step": 263020, + "train_speed(iter/s)": 0.123593 + }, + { + "acc": 0.74168253, + "epoch": 1.4710442686232867, + "grad_norm": 7.40625, + "learning_rate": 1.7934979566458017e-06, + "loss": 1.00665512, + "memory(GiB)": 302.58, + "step": 263040, + "train_speed(iter/s)": 0.123597 + }, + { + "acc": 0.74783845, + "epoch": 1.471156118096266, + "grad_norm": 6.65625, + "learning_rate": 1.7927884947076435e-06, + "loss": 1.01545534, + "memory(GiB)": 302.58, + "step": 263060, + "train_speed(iter/s)": 0.123602 + }, + { + "acc": 0.76372914, + "epoch": 1.4712679675692453, + "grad_norm": 7.875, + "learning_rate": 1.7920791424662881e-06, + "loss": 0.92119379, + "memory(GiB)": 302.58, + "step": 263080, + "train_speed(iter/s)": 0.123606 + }, + { + "acc": 0.71952987, + "epoch": 1.4713798170422245, + "grad_norm": 10.9375, + "learning_rate": 1.7913698999459982e-06, + "loss": 1.11295719, + "memory(GiB)": 302.58, + "step": 263100, + "train_speed(iter/s)": 0.123611 + }, + { + "acc": 0.74194198, + "epoch": 1.4714916665152038, + "grad_norm": 6.84375, + "learning_rate": 1.7906607671710307e-06, + "loss": 1.00771666, + "memory(GiB)": 302.58, + "step": 263120, + "train_speed(iter/s)": 0.123615 + }, + { + "acc": 0.75104418, + "epoch": 1.471603515988183, + "grad_norm": 6.09375, + "learning_rate": 1.7899517441656433e-06, + "loss": 0.98325911, + "memory(GiB)": 302.58, + "step": 263140, + "train_speed(iter/s)": 0.123619 + }, + { + "acc": 0.7574482, + "epoch": 1.4717153654611623, + "grad_norm": 9.625, + "learning_rate": 1.7892428309540842e-06, + "loss": 0.97485905, + "memory(GiB)": 302.58, + "step": 263160, + "train_speed(iter/s)": 0.123624 + }, + { + "acc": 0.76709428, + "epoch": 1.4718272149341416, + "grad_norm": 8.125, + "learning_rate": 1.7885340275606022e-06, + "loss": 0.91325073, + "memory(GiB)": 302.58, + "step": 263180, + "train_speed(iter/s)": 0.123628 + }, + { + "acc": 0.76596589, + "epoch": 1.4719390644071209, + "grad_norm": 6.125, + "learning_rate": 1.7878253340094392e-06, + "loss": 0.93343296, + "memory(GiB)": 302.58, + "step": 263200, + "train_speed(iter/s)": 0.123633 + }, + { + "acc": 0.75935364, + "epoch": 1.4720509138801001, + "grad_norm": 7.03125, + "learning_rate": 1.787116750324836e-06, + "loss": 0.94674053, + "memory(GiB)": 302.58, + "step": 263220, + "train_speed(iter/s)": 0.123638 + }, + { + "acc": 0.73871856, + "epoch": 1.4721627633530794, + "grad_norm": 13.3125, + "learning_rate": 1.786408276531027e-06, + "loss": 1.0361927, + "memory(GiB)": 302.58, + "step": 263240, + "train_speed(iter/s)": 0.123642 + }, + { + "acc": 0.75800014, + "epoch": 1.4722746128260586, + "grad_norm": 8.6875, + "learning_rate": 1.785699912652245e-06, + "loss": 0.95263824, + "memory(GiB)": 302.58, + "step": 263260, + "train_speed(iter/s)": 0.123646 + }, + { + "acc": 0.75558939, + "epoch": 1.472386462299038, + "grad_norm": 7.46875, + "learning_rate": 1.7849916587127186e-06, + "loss": 0.95689211, + "memory(GiB)": 302.58, + "step": 263280, + "train_speed(iter/s)": 0.123651 + }, + { + "acc": 0.75029802, + "epoch": 1.4724983117720172, + "grad_norm": 7.125, + "learning_rate": 1.784283514736672e-06, + "loss": 0.96996508, + "memory(GiB)": 302.58, + "step": 263300, + "train_speed(iter/s)": 0.123655 + }, + { + "acc": 0.75221901, + "epoch": 1.4726101612449964, + "grad_norm": 10.375, + "learning_rate": 1.7835754807483258e-06, + "loss": 0.97232523, + "memory(GiB)": 302.58, + "step": 263320, + "train_speed(iter/s)": 0.12366 + }, + { + "acc": 0.73547606, + "epoch": 1.4727220107179757, + "grad_norm": 8.0625, + "learning_rate": 1.7828675567718973e-06, + "loss": 1.03508892, + "memory(GiB)": 302.58, + "step": 263340, + "train_speed(iter/s)": 0.123664 + }, + { + "acc": 0.74751911, + "epoch": 1.472833860190955, + "grad_norm": 14.25, + "learning_rate": 1.7821597428315978e-06, + "loss": 0.98725224, + "memory(GiB)": 302.58, + "step": 263360, + "train_speed(iter/s)": 0.123669 + }, + { + "acc": 0.76386957, + "epoch": 1.4729457096639342, + "grad_norm": 7.5, + "learning_rate": 1.78145203895164e-06, + "loss": 0.93520861, + "memory(GiB)": 302.58, + "step": 263380, + "train_speed(iter/s)": 0.123673 + }, + { + "acc": 0.75110278, + "epoch": 1.4730575591369135, + "grad_norm": 6.875, + "learning_rate": 1.7807444451562284e-06, + "loss": 0.97841349, + "memory(GiB)": 302.58, + "step": 263400, + "train_speed(iter/s)": 0.123678 + }, + { + "acc": 0.76038194, + "epoch": 1.4731694086098928, + "grad_norm": 10.875, + "learning_rate": 1.780036961469565e-06, + "loss": 0.9354557, + "memory(GiB)": 302.58, + "step": 263420, + "train_speed(iter/s)": 0.123683 + }, + { + "acc": 0.75080681, + "epoch": 1.473281258082872, + "grad_norm": 5.96875, + "learning_rate": 1.7793295879158456e-06, + "loss": 0.99447851, + "memory(GiB)": 302.58, + "step": 263440, + "train_speed(iter/s)": 0.123687 + }, + { + "acc": 0.7371449, + "epoch": 1.4733931075558513, + "grad_norm": 9.125, + "learning_rate": 1.778622324519269e-06, + "loss": 1.05651655, + "memory(GiB)": 302.58, + "step": 263460, + "train_speed(iter/s)": 0.123691 + }, + { + "acc": 0.75291953, + "epoch": 1.4735049570288306, + "grad_norm": 6.4375, + "learning_rate": 1.7779151713040232e-06, + "loss": 0.95847826, + "memory(GiB)": 302.58, + "step": 263480, + "train_speed(iter/s)": 0.123696 + }, + { + "acc": 0.75876923, + "epoch": 1.4736168065018098, + "grad_norm": 5.78125, + "learning_rate": 1.777208128294296e-06, + "loss": 0.93349457, + "memory(GiB)": 302.58, + "step": 263500, + "train_speed(iter/s)": 0.1237 + }, + { + "acc": 0.75940819, + "epoch": 1.473728655974789, + "grad_norm": 5.84375, + "learning_rate": 1.7765011955142698e-06, + "loss": 0.94357986, + "memory(GiB)": 302.58, + "step": 263520, + "train_speed(iter/s)": 0.123705 + }, + { + "acc": 0.74692926, + "epoch": 1.4738405054477683, + "grad_norm": 4.96875, + "learning_rate": 1.7757943729881238e-06, + "loss": 0.9834549, + "memory(GiB)": 302.58, + "step": 263540, + "train_speed(iter/s)": 0.123709 + }, + { + "acc": 0.7584435, + "epoch": 1.4739523549207476, + "grad_norm": 6.03125, + "learning_rate": 1.7750876607400341e-06, + "loss": 0.94391556, + "memory(GiB)": 302.58, + "step": 263560, + "train_speed(iter/s)": 0.123714 + }, + { + "acc": 0.75644832, + "epoch": 1.4740642043937269, + "grad_norm": 7.40625, + "learning_rate": 1.7743810587941724e-06, + "loss": 0.95577364, + "memory(GiB)": 302.58, + "step": 263580, + "train_speed(iter/s)": 0.123718 + }, + { + "acc": 0.74810038, + "epoch": 1.4741760538667061, + "grad_norm": 11.0625, + "learning_rate": 1.7736745671747063e-06, + "loss": 1.00261488, + "memory(GiB)": 302.58, + "step": 263600, + "train_speed(iter/s)": 0.123723 + }, + { + "acc": 0.75594358, + "epoch": 1.4742879033396854, + "grad_norm": 5.1875, + "learning_rate": 1.7729681859058006e-06, + "loss": 0.96521425, + "memory(GiB)": 302.58, + "step": 263620, + "train_speed(iter/s)": 0.123727 + }, + { + "acc": 0.73523655, + "epoch": 1.4743997528126647, + "grad_norm": 6.1875, + "learning_rate": 1.772261915011616e-06, + "loss": 1.05459614, + "memory(GiB)": 302.58, + "step": 263640, + "train_speed(iter/s)": 0.123731 + }, + { + "acc": 0.76254044, + "epoch": 1.474511602285644, + "grad_norm": 7.34375, + "learning_rate": 1.7715557545163086e-06, + "loss": 0.95001831, + "memory(GiB)": 302.58, + "step": 263660, + "train_speed(iter/s)": 0.123735 + }, + { + "acc": 0.75349398, + "epoch": 1.4746234517586232, + "grad_norm": 9.1875, + "learning_rate": 1.7708497044440315e-06, + "loss": 0.97794933, + "memory(GiB)": 302.58, + "step": 263680, + "train_speed(iter/s)": 0.12374 + }, + { + "acc": 0.75219297, + "epoch": 1.4747353012316025, + "grad_norm": 6.71875, + "learning_rate": 1.7701437648189334e-06, + "loss": 0.97561789, + "memory(GiB)": 302.58, + "step": 263700, + "train_speed(iter/s)": 0.123744 + }, + { + "acc": 0.75500083, + "epoch": 1.4748471507045817, + "grad_norm": 6.59375, + "learning_rate": 1.7694379356651587e-06, + "loss": 0.95492172, + "memory(GiB)": 302.58, + "step": 263720, + "train_speed(iter/s)": 0.123749 + }, + { + "acc": 0.75712862, + "epoch": 1.474959000177561, + "grad_norm": 5.09375, + "learning_rate": 1.7687322170068523e-06, + "loss": 0.93494492, + "memory(GiB)": 302.58, + "step": 263740, + "train_speed(iter/s)": 0.123753 + }, + { + "acc": 0.73845377, + "epoch": 1.4750708496505403, + "grad_norm": 5.9375, + "learning_rate": 1.7680266088681503e-06, + "loss": 1.06664028, + "memory(GiB)": 302.58, + "step": 263760, + "train_speed(iter/s)": 0.123758 + }, + { + "acc": 0.76436582, + "epoch": 1.4751826991235195, + "grad_norm": 6.125, + "learning_rate": 1.7673211112731864e-06, + "loss": 0.90098686, + "memory(GiB)": 302.58, + "step": 263780, + "train_speed(iter/s)": 0.123762 + }, + { + "acc": 0.76263189, + "epoch": 1.4752945485964988, + "grad_norm": 11.6875, + "learning_rate": 1.7666157242460908e-06, + "loss": 0.90225372, + "memory(GiB)": 302.58, + "step": 263800, + "train_speed(iter/s)": 0.123766 + }, + { + "acc": 0.73163795, + "epoch": 1.475406398069478, + "grad_norm": 12.5625, + "learning_rate": 1.7659104478109906e-06, + "loss": 1.08278446, + "memory(GiB)": 302.58, + "step": 263820, + "train_speed(iter/s)": 0.123771 + }, + { + "acc": 0.74681883, + "epoch": 1.4755182475424573, + "grad_norm": 9.125, + "learning_rate": 1.765205281992008e-06, + "loss": 1.00170631, + "memory(GiB)": 302.58, + "step": 263840, + "train_speed(iter/s)": 0.123775 + }, + { + "acc": 0.7407443, + "epoch": 1.4756300970154366, + "grad_norm": 6.3125, + "learning_rate": 1.7645002268132615e-06, + "loss": 1.03772621, + "memory(GiB)": 302.58, + "step": 263860, + "train_speed(iter/s)": 0.12378 + }, + { + "acc": 0.75372782, + "epoch": 1.4757419464884158, + "grad_norm": 8.5625, + "learning_rate": 1.7637952822988675e-06, + "loss": 0.95780993, + "memory(GiB)": 302.58, + "step": 263880, + "train_speed(iter/s)": 0.123784 + }, + { + "acc": 0.73504729, + "epoch": 1.475853795961395, + "grad_norm": 7.375, + "learning_rate": 1.7630904484729362e-06, + "loss": 1.03954802, + "memory(GiB)": 302.58, + "step": 263900, + "train_speed(iter/s)": 0.123788 + }, + { + "acc": 0.75065098, + "epoch": 1.4759656454343744, + "grad_norm": 7.25, + "learning_rate": 1.7623857253595755e-06, + "loss": 0.98807869, + "memory(GiB)": 302.58, + "step": 263920, + "train_speed(iter/s)": 0.123793 + }, + { + "acc": 0.76650758, + "epoch": 1.4760774949073536, + "grad_norm": 7.34375, + "learning_rate": 1.7616811129828893e-06, + "loss": 0.92131367, + "memory(GiB)": 302.58, + "step": 263940, + "train_speed(iter/s)": 0.123797 + }, + { + "acc": 0.75356402, + "epoch": 1.476189344380333, + "grad_norm": 8.9375, + "learning_rate": 1.7609766113669774e-06, + "loss": 0.97503347, + "memory(GiB)": 302.58, + "step": 263960, + "train_speed(iter/s)": 0.123802 + }, + { + "acc": 0.76389623, + "epoch": 1.4763011938533122, + "grad_norm": 9.5, + "learning_rate": 1.7602722205359346e-06, + "loss": 0.91451778, + "memory(GiB)": 302.58, + "step": 263980, + "train_speed(iter/s)": 0.123806 + }, + { + "acc": 0.76173773, + "epoch": 1.4764130433262914, + "grad_norm": 7.21875, + "learning_rate": 1.759567940513856e-06, + "loss": 0.92371693, + "memory(GiB)": 302.58, + "step": 264000, + "train_speed(iter/s)": 0.12381 + }, + { + "epoch": 1.4764130433262914, + "eval_acc": 0.7068897878650834, + "eval_loss": 1.0118995904922485, + "eval_runtime": 7532.3203, + "eval_samples_per_second": 9.995, + "eval_steps_per_second": 9.995, + "step": 264000 + }, + { + "acc": 0.77053471, + "epoch": 1.4765248927992707, + "grad_norm": 11.1875, + "learning_rate": 1.758863771324829e-06, + "loss": 0.90967035, + "memory(GiB)": 302.58, + "step": 264020, + "train_speed(iter/s)": 0.123371 + }, + { + "acc": 0.76996226, + "epoch": 1.47663674227225, + "grad_norm": 6.75, + "learning_rate": 1.758159712992939e-06, + "loss": 0.89491463, + "memory(GiB)": 302.58, + "step": 264040, + "train_speed(iter/s)": 0.123376 + }, + { + "acc": 0.7497993, + "epoch": 1.4767485917452292, + "grad_norm": 7.59375, + "learning_rate": 1.7574557655422659e-06, + "loss": 0.97720413, + "memory(GiB)": 302.58, + "step": 264060, + "train_speed(iter/s)": 0.12338 + }, + { + "acc": 0.75187273, + "epoch": 1.4768604412182085, + "grad_norm": 9.8125, + "learning_rate": 1.7567519289968876e-06, + "loss": 0.94941692, + "memory(GiB)": 302.58, + "step": 264080, + "train_speed(iter/s)": 0.123384 + }, + { + "acc": 0.75913539, + "epoch": 1.4769722906911877, + "grad_norm": 5.0625, + "learning_rate": 1.7560482033808773e-06, + "loss": 0.93540735, + "memory(GiB)": 302.58, + "step": 264100, + "train_speed(iter/s)": 0.123389 + }, + { + "acc": 0.7512053, + "epoch": 1.477084140164167, + "grad_norm": 7.34375, + "learning_rate": 1.7553445887183046e-06, + "loss": 0.95917482, + "memory(GiB)": 302.58, + "step": 264120, + "train_speed(iter/s)": 0.123393 + }, + { + "acc": 0.74763875, + "epoch": 1.4771959896371463, + "grad_norm": 6.0625, + "learning_rate": 1.7546410850332356e-06, + "loss": 0.99098778, + "memory(GiB)": 302.58, + "step": 264140, + "train_speed(iter/s)": 0.123398 + }, + { + "acc": 0.75175023, + "epoch": 1.4773078391101255, + "grad_norm": 8.4375, + "learning_rate": 1.7539376923497325e-06, + "loss": 0.98487511, + "memory(GiB)": 302.58, + "step": 264160, + "train_speed(iter/s)": 0.123402 + }, + { + "acc": 0.76022382, + "epoch": 1.4774196885831048, + "grad_norm": 9.8125, + "learning_rate": 1.753234410691853e-06, + "loss": 0.95321894, + "memory(GiB)": 302.58, + "step": 264180, + "train_speed(iter/s)": 0.123407 + }, + { + "acc": 0.73545699, + "epoch": 1.477531538056084, + "grad_norm": 6.15625, + "learning_rate": 1.752531240083652e-06, + "loss": 1.0514473, + "memory(GiB)": 302.58, + "step": 264200, + "train_speed(iter/s)": 0.123411 + }, + { + "acc": 0.74963765, + "epoch": 1.4776433875290633, + "grad_norm": 8.375, + "learning_rate": 1.7518281805491804e-06, + "loss": 0.99140062, + "memory(GiB)": 302.58, + "step": 264220, + "train_speed(iter/s)": 0.123415 + }, + { + "acc": 0.74941301, + "epoch": 1.4777552370020426, + "grad_norm": 6.40625, + "learning_rate": 1.751125232112484e-06, + "loss": 1.00114307, + "memory(GiB)": 302.58, + "step": 264240, + "train_speed(iter/s)": 0.12342 + }, + { + "acc": 0.75346527, + "epoch": 1.4778670864750219, + "grad_norm": 9.0, + "learning_rate": 1.7504223947976051e-06, + "loss": 0.97720003, + "memory(GiB)": 302.58, + "step": 264260, + "train_speed(iter/s)": 0.123424 + }, + { + "acc": 0.74789605, + "epoch": 1.4779789359480011, + "grad_norm": 5.1875, + "learning_rate": 1.7497196686285867e-06, + "loss": 1.0026926, + "memory(GiB)": 302.58, + "step": 264280, + "train_speed(iter/s)": 0.123429 + }, + { + "acc": 0.75771208, + "epoch": 1.4780907854209804, + "grad_norm": 8.1875, + "learning_rate": 1.7490170536294616e-06, + "loss": 0.93506622, + "memory(GiB)": 302.58, + "step": 264300, + "train_speed(iter/s)": 0.123433 + }, + { + "acc": 0.73995218, + "epoch": 1.4782026348939596, + "grad_norm": 5.25, + "learning_rate": 1.7483145498242615e-06, + "loss": 1.02317533, + "memory(GiB)": 302.58, + "step": 264320, + "train_speed(iter/s)": 0.123437 + }, + { + "acc": 0.73227081, + "epoch": 1.478314484366939, + "grad_norm": 7.71875, + "learning_rate": 1.7476121572370148e-06, + "loss": 1.05671005, + "memory(GiB)": 302.58, + "step": 264340, + "train_speed(iter/s)": 0.123441 + }, + { + "acc": 0.7464612, + "epoch": 1.4784263338399182, + "grad_norm": 4.96875, + "learning_rate": 1.7469098758917452e-06, + "loss": 0.99588022, + "memory(GiB)": 302.58, + "step": 264360, + "train_speed(iter/s)": 0.123446 + }, + { + "acc": 0.7570437, + "epoch": 1.4785381833128974, + "grad_norm": 6.15625, + "learning_rate": 1.7462077058124738e-06, + "loss": 0.96010618, + "memory(GiB)": 302.58, + "step": 264380, + "train_speed(iter/s)": 0.12345 + }, + { + "acc": 0.7410718, + "epoch": 1.4786500327858767, + "grad_norm": 6.53125, + "learning_rate": 1.7455056470232156e-06, + "loss": 1.02662163, + "memory(GiB)": 302.58, + "step": 264400, + "train_speed(iter/s)": 0.123455 + }, + { + "acc": 0.74720492, + "epoch": 1.478761882258856, + "grad_norm": 8.8125, + "learning_rate": 1.7448036995479846e-06, + "loss": 0.99759789, + "memory(GiB)": 302.58, + "step": 264420, + "train_speed(iter/s)": 0.123459 + }, + { + "acc": 0.73431935, + "epoch": 1.4788737317318352, + "grad_norm": 7.1875, + "learning_rate": 1.7441018634107888e-06, + "loss": 1.04614687, + "memory(GiB)": 302.58, + "step": 264440, + "train_speed(iter/s)": 0.123464 + }, + { + "acc": 0.74199567, + "epoch": 1.4789855812048145, + "grad_norm": 6.5625, + "learning_rate": 1.743400138635633e-06, + "loss": 1.02577572, + "memory(GiB)": 302.58, + "step": 264460, + "train_speed(iter/s)": 0.123468 + }, + { + "acc": 0.74818788, + "epoch": 1.4790974306777938, + "grad_norm": 7.375, + "learning_rate": 1.7426985252465195e-06, + "loss": 1.00293036, + "memory(GiB)": 302.58, + "step": 264480, + "train_speed(iter/s)": 0.123472 + }, + { + "acc": 0.76165824, + "epoch": 1.479209280150773, + "grad_norm": 7.28125, + "learning_rate": 1.7419970232674444e-06, + "loss": 0.92833986, + "memory(GiB)": 302.58, + "step": 264500, + "train_speed(iter/s)": 0.123477 + }, + { + "acc": 0.76841626, + "epoch": 1.4793211296237523, + "grad_norm": 9.5, + "learning_rate": 1.7412956327224024e-06, + "loss": 0.90477133, + "memory(GiB)": 302.58, + "step": 264520, + "train_speed(iter/s)": 0.123481 + }, + { + "acc": 0.7568109, + "epoch": 1.4794329790967315, + "grad_norm": 11.875, + "learning_rate": 1.7405943536353815e-06, + "loss": 0.96152973, + "memory(GiB)": 302.58, + "step": 264540, + "train_speed(iter/s)": 0.123486 + }, + { + "acc": 0.76754794, + "epoch": 1.4795448285697108, + "grad_norm": 7.5625, + "learning_rate": 1.7398931860303696e-06, + "loss": 0.91618538, + "memory(GiB)": 302.58, + "step": 264560, + "train_speed(iter/s)": 0.12349 + }, + { + "acc": 0.76900778, + "epoch": 1.47965667804269, + "grad_norm": 7.875, + "learning_rate": 1.739192129931349e-06, + "loss": 0.911448, + "memory(GiB)": 302.58, + "step": 264580, + "train_speed(iter/s)": 0.123495 + }, + { + "acc": 0.74875379, + "epoch": 1.4797685275156693, + "grad_norm": 8.8125, + "learning_rate": 1.7384911853622977e-06, + "loss": 1.00289116, + "memory(GiB)": 302.58, + "step": 264600, + "train_speed(iter/s)": 0.123499 + }, + { + "acc": 0.74572697, + "epoch": 1.4798803769886486, + "grad_norm": 5.625, + "learning_rate": 1.7377903523471872e-06, + "loss": 0.99350939, + "memory(GiB)": 302.58, + "step": 264620, + "train_speed(iter/s)": 0.123503 + }, + { + "acc": 0.7691072, + "epoch": 1.4799922264616279, + "grad_norm": 8.3125, + "learning_rate": 1.7370896309099933e-06, + "loss": 0.91346817, + "memory(GiB)": 302.58, + "step": 264640, + "train_speed(iter/s)": 0.123508 + }, + { + "acc": 0.75547018, + "epoch": 1.4801040759346071, + "grad_norm": 7.1875, + "learning_rate": 1.7363890210746802e-06, + "loss": 0.9631238, + "memory(GiB)": 302.58, + "step": 264660, + "train_speed(iter/s)": 0.123512 + }, + { + "acc": 0.74532852, + "epoch": 1.4802159254075864, + "grad_norm": 8.5625, + "learning_rate": 1.735688522865211e-06, + "loss": 0.98437777, + "memory(GiB)": 302.58, + "step": 264680, + "train_speed(iter/s)": 0.123516 + }, + { + "acc": 0.7486022, + "epoch": 1.4803277748805657, + "grad_norm": 6.84375, + "learning_rate": 1.7349881363055454e-06, + "loss": 0.98161583, + "memory(GiB)": 302.58, + "step": 264700, + "train_speed(iter/s)": 0.123521 + }, + { + "acc": 0.75424376, + "epoch": 1.480439624353545, + "grad_norm": 5.21875, + "learning_rate": 1.7342878614196385e-06, + "loss": 0.97862158, + "memory(GiB)": 302.58, + "step": 264720, + "train_speed(iter/s)": 0.123526 + }, + { + "acc": 0.74550562, + "epoch": 1.4805514738265242, + "grad_norm": 9.375, + "learning_rate": 1.733587698231442e-06, + "loss": 1.012432, + "memory(GiB)": 302.58, + "step": 264740, + "train_speed(iter/s)": 0.12353 + }, + { + "acc": 0.7688096, + "epoch": 1.4806633232995035, + "grad_norm": 7.0, + "learning_rate": 1.7328876467649036e-06, + "loss": 0.87128143, + "memory(GiB)": 302.58, + "step": 264760, + "train_speed(iter/s)": 0.123534 + }, + { + "acc": 0.74819255, + "epoch": 1.4807751727724827, + "grad_norm": 6.78125, + "learning_rate": 1.7321877070439679e-06, + "loss": 0.99257259, + "memory(GiB)": 302.58, + "step": 264780, + "train_speed(iter/s)": 0.123539 + }, + { + "acc": 0.75267358, + "epoch": 1.480887022245462, + "grad_norm": 7.125, + "learning_rate": 1.7314878790925744e-06, + "loss": 0.94958496, + "memory(GiB)": 302.58, + "step": 264800, + "train_speed(iter/s)": 0.123543 + }, + { + "acc": 0.7428103, + "epoch": 1.4809988717184412, + "grad_norm": 6.1875, + "learning_rate": 1.730788162934659e-06, + "loss": 1.03189116, + "memory(GiB)": 302.58, + "step": 264820, + "train_speed(iter/s)": 0.123547 + }, + { + "acc": 0.74098077, + "epoch": 1.4811107211914205, + "grad_norm": 5.40625, + "learning_rate": 1.7300885585941557e-06, + "loss": 1.03103065, + "memory(GiB)": 302.58, + "step": 264840, + "train_speed(iter/s)": 0.123552 + }, + { + "acc": 0.75166063, + "epoch": 1.4812225706643998, + "grad_norm": 10.1875, + "learning_rate": 1.7293890660949897e-06, + "loss": 0.9664113, + "memory(GiB)": 302.58, + "step": 264860, + "train_speed(iter/s)": 0.123556 + }, + { + "acc": 0.7504549, + "epoch": 1.481334420137379, + "grad_norm": 7.65625, + "learning_rate": 1.7286896854610907e-06, + "loss": 0.97619162, + "memory(GiB)": 302.58, + "step": 264880, + "train_speed(iter/s)": 0.123561 + }, + { + "acc": 0.7428, + "epoch": 1.4814462696103583, + "grad_norm": 8.125, + "learning_rate": 1.7279904167163775e-06, + "loss": 0.99442053, + "memory(GiB)": 302.58, + "step": 264900, + "train_speed(iter/s)": 0.123565 + }, + { + "acc": 0.74511051, + "epoch": 1.4815581190833376, + "grad_norm": 10.0, + "learning_rate": 1.7272912598847673e-06, + "loss": 1.01816244, + "memory(GiB)": 302.58, + "step": 264920, + "train_speed(iter/s)": 0.12357 + }, + { + "acc": 0.77250295, + "epoch": 1.4816699685563168, + "grad_norm": 7.53125, + "learning_rate": 1.7265922149901737e-06, + "loss": 0.88830032, + "memory(GiB)": 302.58, + "step": 264940, + "train_speed(iter/s)": 0.123574 + }, + { + "acc": 0.75895128, + "epoch": 1.481781818029296, + "grad_norm": 9.5625, + "learning_rate": 1.7258932820565056e-06, + "loss": 0.95942106, + "memory(GiB)": 302.58, + "step": 264960, + "train_speed(iter/s)": 0.123579 + }, + { + "acc": 0.75478678, + "epoch": 1.4818936675022754, + "grad_norm": 7.71875, + "learning_rate": 1.7251944611076693e-06, + "loss": 0.96795788, + "memory(GiB)": 302.58, + "step": 264980, + "train_speed(iter/s)": 0.123583 + }, + { + "acc": 0.75104108, + "epoch": 1.4820055169752546, + "grad_norm": 6.53125, + "learning_rate": 1.7244957521675664e-06, + "loss": 0.98363571, + "memory(GiB)": 302.58, + "step": 265000, + "train_speed(iter/s)": 0.123588 + }, + { + "acc": 0.75893703, + "epoch": 1.4821173664482339, + "grad_norm": 7.5625, + "learning_rate": 1.7237971552600957e-06, + "loss": 0.93892612, + "memory(GiB)": 302.58, + "step": 265020, + "train_speed(iter/s)": 0.123592 + }, + { + "acc": 0.73936296, + "epoch": 1.4822292159212132, + "grad_norm": 6.96875, + "learning_rate": 1.7230986704091507e-06, + "loss": 1.03292255, + "memory(GiB)": 302.58, + "step": 265040, + "train_speed(iter/s)": 0.123596 + }, + { + "acc": 0.74257374, + "epoch": 1.4823410653941924, + "grad_norm": 5.84375, + "learning_rate": 1.7224002976386217e-06, + "loss": 1.03471746, + "memory(GiB)": 302.58, + "step": 265060, + "train_speed(iter/s)": 0.1236 + }, + { + "acc": 0.74428492, + "epoch": 1.4824529148671717, + "grad_norm": 6.40625, + "learning_rate": 1.7217020369723952e-06, + "loss": 1.03412867, + "memory(GiB)": 302.58, + "step": 265080, + "train_speed(iter/s)": 0.123605 + }, + { + "acc": 0.75016546, + "epoch": 1.482564764340151, + "grad_norm": 7.28125, + "learning_rate": 1.7210038884343543e-06, + "loss": 0.98925047, + "memory(GiB)": 302.58, + "step": 265100, + "train_speed(iter/s)": 0.123609 + }, + { + "acc": 0.76748662, + "epoch": 1.4826766138131302, + "grad_norm": 9.375, + "learning_rate": 1.7203058520483784e-06, + "loss": 0.92890615, + "memory(GiB)": 302.58, + "step": 265120, + "train_speed(iter/s)": 0.123614 + }, + { + "acc": 0.7540051, + "epoch": 1.4827884632861097, + "grad_norm": 7.5, + "learning_rate": 1.71960792783834e-06, + "loss": 0.96885242, + "memory(GiB)": 302.58, + "step": 265140, + "train_speed(iter/s)": 0.123618 + }, + { + "acc": 0.75646639, + "epoch": 1.4829003127590887, + "grad_norm": 8.3125, + "learning_rate": 1.7189101158281142e-06, + "loss": 0.95913572, + "memory(GiB)": 302.58, + "step": 265160, + "train_speed(iter/s)": 0.123623 + }, + { + "acc": 0.75198407, + "epoch": 1.4830121622320682, + "grad_norm": 7.125, + "learning_rate": 1.7182124160415665e-06, + "loss": 0.992665, + "memory(GiB)": 302.58, + "step": 265180, + "train_speed(iter/s)": 0.123627 + }, + { + "acc": 0.75732083, + "epoch": 1.4831240117050473, + "grad_norm": 6.0, + "learning_rate": 1.71751482850256e-06, + "loss": 0.9818078, + "memory(GiB)": 302.58, + "step": 265200, + "train_speed(iter/s)": 0.123631 + }, + { + "acc": 0.74750705, + "epoch": 1.4832358611780267, + "grad_norm": 6.34375, + "learning_rate": 1.7168173532349546e-06, + "loss": 0.99991989, + "memory(GiB)": 302.58, + "step": 265220, + "train_speed(iter/s)": 0.123636 + }, + { + "acc": 0.75686312, + "epoch": 1.4833477106510058, + "grad_norm": 12.6875, + "learning_rate": 1.7161199902626074e-06, + "loss": 0.96606236, + "memory(GiB)": 302.58, + "step": 265240, + "train_speed(iter/s)": 0.12364 + }, + { + "acc": 0.74946275, + "epoch": 1.4834595601239853, + "grad_norm": 7.09375, + "learning_rate": 1.7154227396093688e-06, + "loss": 0.97574062, + "memory(GiB)": 302.58, + "step": 265260, + "train_speed(iter/s)": 0.123645 + }, + { + "acc": 0.74613485, + "epoch": 1.4835714095969643, + "grad_norm": 6.1875, + "learning_rate": 1.7147256012990876e-06, + "loss": 0.99206467, + "memory(GiB)": 302.58, + "step": 265280, + "train_speed(iter/s)": 0.123649 + }, + { + "acc": 0.75860682, + "epoch": 1.4836832590699438, + "grad_norm": 6.40625, + "learning_rate": 1.7140285753556086e-06, + "loss": 0.96346521, + "memory(GiB)": 302.58, + "step": 265300, + "train_speed(iter/s)": 0.123654 + }, + { + "acc": 0.74824781, + "epoch": 1.4837951085429228, + "grad_norm": 8.3125, + "learning_rate": 1.7133316618027712e-06, + "loss": 0.98074808, + "memory(GiB)": 302.58, + "step": 265320, + "train_speed(iter/s)": 0.123658 + }, + { + "acc": 0.75402007, + "epoch": 1.4839069580159023, + "grad_norm": 6.09375, + "learning_rate": 1.7126348606644132e-06, + "loss": 0.96990328, + "memory(GiB)": 302.58, + "step": 265340, + "train_speed(iter/s)": 0.123662 + }, + { + "acc": 0.75552087, + "epoch": 1.4840188074888814, + "grad_norm": 7.84375, + "learning_rate": 1.7119381719643669e-06, + "loss": 0.96577368, + "memory(GiB)": 302.58, + "step": 265360, + "train_speed(iter/s)": 0.123667 + }, + { + "acc": 0.73281174, + "epoch": 1.4841306569618609, + "grad_norm": 8.25, + "learning_rate": 1.711241595726461e-06, + "loss": 1.04331865, + "memory(GiB)": 302.58, + "step": 265380, + "train_speed(iter/s)": 0.123672 + }, + { + "acc": 0.73938951, + "epoch": 1.48424250643484, + "grad_norm": 11.0625, + "learning_rate": 1.7105451319745208e-06, + "loss": 1.04139776, + "memory(GiB)": 302.58, + "step": 265400, + "train_speed(iter/s)": 0.123676 + }, + { + "acc": 0.76563325, + "epoch": 1.4843543559078194, + "grad_norm": 6.96875, + "learning_rate": 1.7098487807323665e-06, + "loss": 0.91613159, + "memory(GiB)": 302.58, + "step": 265420, + "train_speed(iter/s)": 0.12368 + }, + { + "acc": 0.73058, + "epoch": 1.4844662053807984, + "grad_norm": 8.5, + "learning_rate": 1.709152542023818e-06, + "loss": 1.08923159, + "memory(GiB)": 302.58, + "step": 265440, + "train_speed(iter/s)": 0.123685 + }, + { + "acc": 0.75005836, + "epoch": 1.484578054853778, + "grad_norm": 7.4375, + "learning_rate": 1.7084564158726879e-06, + "loss": 1.01179705, + "memory(GiB)": 302.58, + "step": 265460, + "train_speed(iter/s)": 0.123689 + }, + { + "acc": 0.75251832, + "epoch": 1.484689904326757, + "grad_norm": 9.125, + "learning_rate": 1.707760402302785e-06, + "loss": 0.98392553, + "memory(GiB)": 302.58, + "step": 265480, + "train_speed(iter/s)": 0.123694 + }, + { + "acc": 0.73307061, + "epoch": 1.4848017537997364, + "grad_norm": 7.1875, + "learning_rate": 1.7070645013379162e-06, + "loss": 1.07648544, + "memory(GiB)": 302.58, + "step": 265500, + "train_speed(iter/s)": 0.123698 + }, + { + "acc": 0.767068, + "epoch": 1.4849136032727155, + "grad_norm": 8.8125, + "learning_rate": 1.706368713001883e-06, + "loss": 0.90052395, + "memory(GiB)": 302.58, + "step": 265520, + "train_speed(iter/s)": 0.123702 + }, + { + "acc": 0.76063247, + "epoch": 1.485025452745695, + "grad_norm": 8.5, + "learning_rate": 1.7056730373184837e-06, + "loss": 0.93262148, + "memory(GiB)": 302.58, + "step": 265540, + "train_speed(iter/s)": 0.123706 + }, + { + "acc": 0.73381858, + "epoch": 1.485137302218674, + "grad_norm": 8.1875, + "learning_rate": 1.7049774743115123e-06, + "loss": 1.05949507, + "memory(GiB)": 302.58, + "step": 265560, + "train_speed(iter/s)": 0.123711 + }, + { + "acc": 0.74287767, + "epoch": 1.4852491516916535, + "grad_norm": 4.15625, + "learning_rate": 1.7042820240047596e-06, + "loss": 1.02657204, + "memory(GiB)": 302.58, + "step": 265580, + "train_speed(iter/s)": 0.123715 + }, + { + "acc": 0.76062455, + "epoch": 1.4853610011646325, + "grad_norm": 6.5, + "learning_rate": 1.7035866864220124e-06, + "loss": 0.94339266, + "memory(GiB)": 302.58, + "step": 265600, + "train_speed(iter/s)": 0.123719 + }, + { + "acc": 0.76133986, + "epoch": 1.485472850637612, + "grad_norm": 8.375, + "learning_rate": 1.7028914615870528e-06, + "loss": 0.9727541, + "memory(GiB)": 302.58, + "step": 265620, + "train_speed(iter/s)": 0.123724 + }, + { + "acc": 0.75480862, + "epoch": 1.485584700110591, + "grad_norm": 6.5, + "learning_rate": 1.7021963495236605e-06, + "loss": 0.94927044, + "memory(GiB)": 302.58, + "step": 265640, + "train_speed(iter/s)": 0.123728 + }, + { + "acc": 0.74410229, + "epoch": 1.4856965495835706, + "grad_norm": 8.5625, + "learning_rate": 1.7015013502556093e-06, + "loss": 0.99629936, + "memory(GiB)": 302.58, + "step": 265660, + "train_speed(iter/s)": 0.123733 + }, + { + "acc": 0.76274924, + "epoch": 1.4858083990565496, + "grad_norm": 5.78125, + "learning_rate": 1.70080646380667e-06, + "loss": 0.9491353, + "memory(GiB)": 302.58, + "step": 265680, + "train_speed(iter/s)": 0.123737 + }, + { + "acc": 0.75219126, + "epoch": 1.485920248529529, + "grad_norm": 9.1875, + "learning_rate": 1.7001116902006133e-06, + "loss": 0.96137247, + "memory(GiB)": 302.58, + "step": 265700, + "train_speed(iter/s)": 0.123741 + }, + { + "acc": 0.76343055, + "epoch": 1.4860320980025081, + "grad_norm": 5.0, + "learning_rate": 1.6994170294611995e-06, + "loss": 0.91583271, + "memory(GiB)": 302.58, + "step": 265720, + "train_speed(iter/s)": 0.123746 + }, + { + "acc": 0.72842584, + "epoch": 1.4861439474754876, + "grad_norm": 7.21875, + "learning_rate": 1.69872248161219e-06, + "loss": 1.08102674, + "memory(GiB)": 302.58, + "step": 265740, + "train_speed(iter/s)": 0.12375 + }, + { + "acc": 0.76111021, + "epoch": 1.4862557969484667, + "grad_norm": 6.46875, + "learning_rate": 1.6980280466773392e-06, + "loss": 0.93026142, + "memory(GiB)": 302.58, + "step": 265760, + "train_speed(iter/s)": 0.123755 + }, + { + "acc": 0.76013584, + "epoch": 1.4863676464214461, + "grad_norm": 6.625, + "learning_rate": 1.6973337246803995e-06, + "loss": 0.93417606, + "memory(GiB)": 302.58, + "step": 265780, + "train_speed(iter/s)": 0.123759 + }, + { + "acc": 0.74217587, + "epoch": 1.4864794958944252, + "grad_norm": 7.3125, + "learning_rate": 1.6966395156451188e-06, + "loss": 1.01826553, + "memory(GiB)": 302.58, + "step": 265800, + "train_speed(iter/s)": 0.123763 + }, + { + "acc": 0.74638162, + "epoch": 1.4865913453674047, + "grad_norm": 7.1875, + "learning_rate": 1.6959454195952412e-06, + "loss": 0.99408684, + "memory(GiB)": 302.58, + "step": 265820, + "train_speed(iter/s)": 0.123768 + }, + { + "acc": 0.75392728, + "epoch": 1.4867031948403837, + "grad_norm": 5.875, + "learning_rate": 1.6952514365545075e-06, + "loss": 0.97245913, + "memory(GiB)": 302.58, + "step": 265840, + "train_speed(iter/s)": 0.123772 + }, + { + "acc": 0.73626018, + "epoch": 1.4868150443133632, + "grad_norm": 7.8125, + "learning_rate": 1.6945575665466525e-06, + "loss": 1.05261698, + "memory(GiB)": 302.58, + "step": 265860, + "train_speed(iter/s)": 0.123777 + }, + { + "acc": 0.73350067, + "epoch": 1.4869268937863422, + "grad_norm": 7.1875, + "learning_rate": 1.6938638095954107e-06, + "loss": 1.04729815, + "memory(GiB)": 302.58, + "step": 265880, + "train_speed(iter/s)": 0.123781 + }, + { + "acc": 0.74408922, + "epoch": 1.4870387432593217, + "grad_norm": 7.84375, + "learning_rate": 1.693170165724511e-06, + "loss": 1.00730486, + "memory(GiB)": 302.58, + "step": 265900, + "train_speed(iter/s)": 0.123785 + }, + { + "acc": 0.75039926, + "epoch": 1.4871505927323008, + "grad_norm": 8.0625, + "learning_rate": 1.6924766349576765e-06, + "loss": 0.97720566, + "memory(GiB)": 302.58, + "step": 265920, + "train_speed(iter/s)": 0.12379 + }, + { + "acc": 0.75252209, + "epoch": 1.4872624422052803, + "grad_norm": 11.0, + "learning_rate": 1.6917832173186293e-06, + "loss": 0.98187799, + "memory(GiB)": 302.58, + "step": 265940, + "train_speed(iter/s)": 0.123794 + }, + { + "acc": 0.73869805, + "epoch": 1.4873742916782593, + "grad_norm": 7.4375, + "learning_rate": 1.6910899128310859e-06, + "loss": 1.01951723, + "memory(GiB)": 302.58, + "step": 265960, + "train_speed(iter/s)": 0.123798 + }, + { + "acc": 0.76203914, + "epoch": 1.4874861411512388, + "grad_norm": 10.125, + "learning_rate": 1.6903967215187605e-06, + "loss": 0.91827641, + "memory(GiB)": 302.58, + "step": 265980, + "train_speed(iter/s)": 0.123803 + }, + { + "acc": 0.72380695, + "epoch": 1.4875979906242178, + "grad_norm": 8.3125, + "learning_rate": 1.689703643405361e-06, + "loss": 1.08724728, + "memory(GiB)": 302.58, + "step": 266000, + "train_speed(iter/s)": 0.123807 + }, + { + "epoch": 1.4875979906242178, + "eval_acc": 0.7068925977819009, + "eval_loss": 1.011917233467102, + "eval_runtime": 7544.3198, + "eval_samples_per_second": 9.979, + "eval_steps_per_second": 9.979, + "step": 266000 + }, + { + "acc": 0.77074785, + "epoch": 1.4877098400971973, + "grad_norm": 9.0625, + "learning_rate": 1.689010678514592e-06, + "loss": 0.87945461, + "memory(GiB)": 302.58, + "step": 266020, + "train_speed(iter/s)": 0.12337 + }, + { + "acc": 0.76068544, + "epoch": 1.4878216895701764, + "grad_norm": 6.28125, + "learning_rate": 1.6883178268701583e-06, + "loss": 0.92567387, + "memory(GiB)": 302.58, + "step": 266040, + "train_speed(iter/s)": 0.123375 + }, + { + "acc": 0.73294315, + "epoch": 1.4879335390431558, + "grad_norm": 6.1875, + "learning_rate": 1.6876250884957562e-06, + "loss": 1.07819414, + "memory(GiB)": 302.58, + "step": 266060, + "train_speed(iter/s)": 0.123379 + }, + { + "acc": 0.7528883, + "epoch": 1.4880453885161349, + "grad_norm": 6.4375, + "learning_rate": 1.6869324634150792e-06, + "loss": 0.98105602, + "memory(GiB)": 302.58, + "step": 266080, + "train_speed(iter/s)": 0.123383 + }, + { + "acc": 0.75132513, + "epoch": 1.4881572379891144, + "grad_norm": 4.96875, + "learning_rate": 1.6862399516518175e-06, + "loss": 0.97356977, + "memory(GiB)": 302.58, + "step": 266100, + "train_speed(iter/s)": 0.123387 + }, + { + "acc": 0.74997187, + "epoch": 1.4882690874620934, + "grad_norm": 10.625, + "learning_rate": 1.6855475532296567e-06, + "loss": 0.99214792, + "memory(GiB)": 302.58, + "step": 266120, + "train_speed(iter/s)": 0.123391 + }, + { + "acc": 0.76360216, + "epoch": 1.488380936935073, + "grad_norm": 6.1875, + "learning_rate": 1.6848552681722803e-06, + "loss": 0.90378866, + "memory(GiB)": 302.58, + "step": 266140, + "train_speed(iter/s)": 0.123395 + }, + { + "acc": 0.75668044, + "epoch": 1.488492786408052, + "grad_norm": 6.875, + "learning_rate": 1.684163096503365e-06, + "loss": 0.93208256, + "memory(GiB)": 302.58, + "step": 266160, + "train_speed(iter/s)": 0.1234 + }, + { + "acc": 0.75304418, + "epoch": 1.4886046358810314, + "grad_norm": 5.40625, + "learning_rate": 1.6834710382465864e-06, + "loss": 0.95675116, + "memory(GiB)": 302.58, + "step": 266180, + "train_speed(iter/s)": 0.123404 + }, + { + "acc": 0.76458573, + "epoch": 1.4887164853540105, + "grad_norm": 8.4375, + "learning_rate": 1.6827790934256144e-06, + "loss": 0.89925985, + "memory(GiB)": 302.58, + "step": 266200, + "train_speed(iter/s)": 0.123408 + }, + { + "acc": 0.75172439, + "epoch": 1.48882833482699, + "grad_norm": 8.4375, + "learning_rate": 1.682087262064116e-06, + "loss": 0.97999268, + "memory(GiB)": 302.58, + "step": 266220, + "train_speed(iter/s)": 0.123413 + }, + { + "acc": 0.76161232, + "epoch": 1.488940184299969, + "grad_norm": 6.40625, + "learning_rate": 1.6813955441857544e-06, + "loss": 0.92967291, + "memory(GiB)": 302.58, + "step": 266240, + "train_speed(iter/s)": 0.123417 + }, + { + "acc": 0.75821843, + "epoch": 1.4890520337729485, + "grad_norm": 6.59375, + "learning_rate": 1.6807039398141878e-06, + "loss": 0.94664841, + "memory(GiB)": 302.58, + "step": 266260, + "train_speed(iter/s)": 0.123421 + }, + { + "acc": 0.73536153, + "epoch": 1.4891638832459275, + "grad_norm": 8.1875, + "learning_rate": 1.6800124489730717e-06, + "loss": 1.03808222, + "memory(GiB)": 302.58, + "step": 266280, + "train_speed(iter/s)": 0.123425 + }, + { + "acc": 0.75344224, + "epoch": 1.489275732718907, + "grad_norm": 6.96875, + "learning_rate": 1.6793210716860559e-06, + "loss": 0.98056793, + "memory(GiB)": 302.58, + "step": 266300, + "train_speed(iter/s)": 0.12343 + }, + { + "acc": 0.75680728, + "epoch": 1.489387582191886, + "grad_norm": 6.71875, + "learning_rate": 1.678629807976791e-06, + "loss": 0.96207952, + "memory(GiB)": 302.58, + "step": 266320, + "train_speed(iter/s)": 0.123434 + }, + { + "acc": 0.7548955, + "epoch": 1.4894994316648655, + "grad_norm": 5.84375, + "learning_rate": 1.6779386578689177e-06, + "loss": 0.94955292, + "memory(GiB)": 302.58, + "step": 266340, + "train_speed(iter/s)": 0.123439 + }, + { + "acc": 0.74634089, + "epoch": 1.4896112811378446, + "grad_norm": 5.75, + "learning_rate": 1.6772476213860766e-06, + "loss": 0.99616861, + "memory(GiB)": 302.58, + "step": 266360, + "train_speed(iter/s)": 0.123443 + }, + { + "acc": 0.75123863, + "epoch": 1.489723130610824, + "grad_norm": 10.0625, + "learning_rate": 1.6765566985519034e-06, + "loss": 0.96295547, + "memory(GiB)": 302.58, + "step": 266380, + "train_speed(iter/s)": 0.123447 + }, + { + "acc": 0.75780659, + "epoch": 1.489834980083803, + "grad_norm": 7.25, + "learning_rate": 1.6758658893900288e-06, + "loss": 0.9404377, + "memory(GiB)": 302.58, + "step": 266400, + "train_speed(iter/s)": 0.123452 + }, + { + "acc": 0.73123598, + "epoch": 1.4899468295567826, + "grad_norm": 7.28125, + "learning_rate": 1.6751751939240818e-06, + "loss": 1.05954676, + "memory(GiB)": 302.58, + "step": 266420, + "train_speed(iter/s)": 0.123456 + }, + { + "acc": 0.76181011, + "epoch": 1.4900586790297616, + "grad_norm": 8.25, + "learning_rate": 1.6744846121776854e-06, + "loss": 0.94696751, + "memory(GiB)": 302.58, + "step": 266440, + "train_speed(iter/s)": 0.12346 + }, + { + "acc": 0.74132996, + "epoch": 1.4901705285027411, + "grad_norm": 7.84375, + "learning_rate": 1.6737941441744603e-06, + "loss": 1.03584948, + "memory(GiB)": 302.58, + "step": 266460, + "train_speed(iter/s)": 0.123464 + }, + { + "acc": 0.73877249, + "epoch": 1.4902823779757202, + "grad_norm": 5.59375, + "learning_rate": 1.6731037899380231e-06, + "loss": 1.05179691, + "memory(GiB)": 302.58, + "step": 266480, + "train_speed(iter/s)": 0.123469 + }, + { + "acc": 0.75364342, + "epoch": 1.4903942274486996, + "grad_norm": 6.03125, + "learning_rate": 1.672413549491985e-06, + "loss": 0.95500889, + "memory(GiB)": 302.58, + "step": 266500, + "train_speed(iter/s)": 0.123473 + }, + { + "acc": 0.74536119, + "epoch": 1.4905060769216787, + "grad_norm": 8.625, + "learning_rate": 1.6717234228599556e-06, + "loss": 1.0131938, + "memory(GiB)": 302.58, + "step": 266520, + "train_speed(iter/s)": 0.123477 + }, + { + "acc": 0.7430788, + "epoch": 1.4906179263946582, + "grad_norm": 6.75, + "learning_rate": 1.6710334100655384e-06, + "loss": 1.01644049, + "memory(GiB)": 302.58, + "step": 266540, + "train_speed(iter/s)": 0.123482 + }, + { + "acc": 0.74853539, + "epoch": 1.4907297758676372, + "grad_norm": 6.90625, + "learning_rate": 1.670343511132333e-06, + "loss": 0.97308292, + "memory(GiB)": 302.58, + "step": 266560, + "train_speed(iter/s)": 0.123486 + }, + { + "acc": 0.75855441, + "epoch": 1.4908416253406167, + "grad_norm": 6.0625, + "learning_rate": 1.6696537260839386e-06, + "loss": 0.94072485, + "memory(GiB)": 302.58, + "step": 266580, + "train_speed(iter/s)": 0.123491 + }, + { + "acc": 0.74022303, + "epoch": 1.4909534748135957, + "grad_norm": 6.875, + "learning_rate": 1.6689640549439478e-06, + "loss": 1.05892744, + "memory(GiB)": 302.58, + "step": 266600, + "train_speed(iter/s)": 0.123495 + }, + { + "acc": 0.76773567, + "epoch": 1.4910653242865752, + "grad_norm": 7.875, + "learning_rate": 1.6682744977359483e-06, + "loss": 0.89165649, + "memory(GiB)": 302.58, + "step": 266620, + "train_speed(iter/s)": 0.1235 + }, + { + "acc": 0.76033759, + "epoch": 1.4911771737595543, + "grad_norm": 8.6875, + "learning_rate": 1.667585054483526e-06, + "loss": 0.95479059, + "memory(GiB)": 302.58, + "step": 266640, + "train_speed(iter/s)": 0.123504 + }, + { + "acc": 0.76004124, + "epoch": 1.4912890232325338, + "grad_norm": 7.53125, + "learning_rate": 1.6668957252102612e-06, + "loss": 0.94691219, + "memory(GiB)": 302.58, + "step": 266660, + "train_speed(iter/s)": 0.123508 + }, + { + "acc": 0.75759878, + "epoch": 1.4914008727055128, + "grad_norm": 7.09375, + "learning_rate": 1.6662065099397318e-06, + "loss": 0.95511866, + "memory(GiB)": 302.58, + "step": 266680, + "train_speed(iter/s)": 0.123513 + }, + { + "acc": 0.76575212, + "epoch": 1.4915127221784923, + "grad_norm": 8.6875, + "learning_rate": 1.6655174086955106e-06, + "loss": 0.91768007, + "memory(GiB)": 302.58, + "step": 266700, + "train_speed(iter/s)": 0.123517 + }, + { + "acc": 0.76414604, + "epoch": 1.4916245716514713, + "grad_norm": 7.84375, + "learning_rate": 1.6648284215011678e-06, + "loss": 0.89999561, + "memory(GiB)": 302.58, + "step": 266720, + "train_speed(iter/s)": 0.123521 + }, + { + "acc": 0.75105619, + "epoch": 1.4917364211244508, + "grad_norm": 8.8125, + "learning_rate": 1.6641395483802686e-06, + "loss": 0.96942101, + "memory(GiB)": 302.58, + "step": 266740, + "train_speed(iter/s)": 0.123526 + }, + { + "acc": 0.75812588, + "epoch": 1.4918482705974299, + "grad_norm": 7.09375, + "learning_rate": 1.663450789356374e-06, + "loss": 0.94260712, + "memory(GiB)": 302.58, + "step": 266760, + "train_speed(iter/s)": 0.12353 + }, + { + "acc": 0.72956018, + "epoch": 1.4919601200704093, + "grad_norm": 6.875, + "learning_rate": 1.6627621444530429e-06, + "loss": 1.06618843, + "memory(GiB)": 302.58, + "step": 266780, + "train_speed(iter/s)": 0.123535 + }, + { + "acc": 0.73973689, + "epoch": 1.4920719695433884, + "grad_norm": 5.65625, + "learning_rate": 1.6620736136938282e-06, + "loss": 1.00320311, + "memory(GiB)": 302.58, + "step": 266800, + "train_speed(iter/s)": 0.123539 + }, + { + "acc": 0.76644549, + "epoch": 1.4921838190163679, + "grad_norm": 9.1875, + "learning_rate": 1.6613851971022804e-06, + "loss": 0.90022783, + "memory(GiB)": 302.58, + "step": 266820, + "train_speed(iter/s)": 0.123543 + }, + { + "acc": 0.7438798, + "epoch": 1.492295668489347, + "grad_norm": 9.75, + "learning_rate": 1.6606968947019436e-06, + "loss": 0.99725914, + "memory(GiB)": 302.58, + "step": 266840, + "train_speed(iter/s)": 0.123547 + }, + { + "acc": 0.7593204, + "epoch": 1.4924075179623264, + "grad_norm": 7.25, + "learning_rate": 1.6600087065163634e-06, + "loss": 0.94716644, + "memory(GiB)": 302.58, + "step": 266860, + "train_speed(iter/s)": 0.123552 + }, + { + "acc": 0.74520164, + "epoch": 1.4925193674353054, + "grad_norm": 7.9375, + "learning_rate": 1.6593206325690757e-06, + "loss": 1.00050955, + "memory(GiB)": 302.58, + "step": 266880, + "train_speed(iter/s)": 0.123556 + }, + { + "acc": 0.74826779, + "epoch": 1.492631216908285, + "grad_norm": 9.625, + "learning_rate": 1.6586326728836161e-06, + "loss": 0.98698883, + "memory(GiB)": 302.58, + "step": 266900, + "train_speed(iter/s)": 0.123561 + }, + { + "acc": 0.74146347, + "epoch": 1.492743066381264, + "grad_norm": 6.65625, + "learning_rate": 1.657944827483514e-06, + "loss": 1.0053544, + "memory(GiB)": 302.58, + "step": 266920, + "train_speed(iter/s)": 0.123565 + }, + { + "acc": 0.74310517, + "epoch": 1.4928549158542435, + "grad_norm": 8.125, + "learning_rate": 1.6572570963922963e-06, + "loss": 1.00543003, + "memory(GiB)": 302.58, + "step": 266940, + "train_speed(iter/s)": 0.123569 + }, + { + "acc": 0.74092879, + "epoch": 1.4929667653272225, + "grad_norm": 7.1875, + "learning_rate": 1.6565694796334852e-06, + "loss": 1.0125721, + "memory(GiB)": 302.58, + "step": 266960, + "train_speed(iter/s)": 0.123573 + }, + { + "acc": 0.75259843, + "epoch": 1.493078614800202, + "grad_norm": 8.0, + "learning_rate": 1.6558819772306001e-06, + "loss": 0.97965384, + "memory(GiB)": 302.58, + "step": 266980, + "train_speed(iter/s)": 0.123578 + }, + { + "acc": 0.75270658, + "epoch": 1.493190464273181, + "grad_norm": 7.03125, + "learning_rate": 1.6551945892071546e-06, + "loss": 0.98546629, + "memory(GiB)": 302.58, + "step": 267000, + "train_speed(iter/s)": 0.123582 + }, + { + "acc": 0.74308558, + "epoch": 1.4933023137461605, + "grad_norm": 7.6875, + "learning_rate": 1.654507315586661e-06, + "loss": 1.00629654, + "memory(GiB)": 302.58, + "step": 267020, + "train_speed(iter/s)": 0.123587 + }, + { + "acc": 0.73928227, + "epoch": 1.4934141632191396, + "grad_norm": 7.90625, + "learning_rate": 1.6538201563926248e-06, + "loss": 0.9960125, + "memory(GiB)": 302.58, + "step": 267040, + "train_speed(iter/s)": 0.123591 + }, + { + "acc": 0.74456058, + "epoch": 1.493526012692119, + "grad_norm": 7.53125, + "learning_rate": 1.6531331116485499e-06, + "loss": 1.00221596, + "memory(GiB)": 302.58, + "step": 267060, + "train_speed(iter/s)": 0.123595 + }, + { + "acc": 0.76082883, + "epoch": 1.493637862165098, + "grad_norm": 7.15625, + "learning_rate": 1.6524461813779353e-06, + "loss": 0.92415848, + "memory(GiB)": 302.58, + "step": 267080, + "train_speed(iter/s)": 0.1236 + }, + { + "acc": 0.74462233, + "epoch": 1.4937497116380776, + "grad_norm": 7.3125, + "learning_rate": 1.6517593656042747e-06, + "loss": 1.01182585, + "memory(GiB)": 302.58, + "step": 267100, + "train_speed(iter/s)": 0.123604 + }, + { + "acc": 0.76585913, + "epoch": 1.4938615611110566, + "grad_norm": 7.09375, + "learning_rate": 1.6510726643510616e-06, + "loss": 0.89682789, + "memory(GiB)": 302.58, + "step": 267120, + "train_speed(iter/s)": 0.123609 + }, + { + "acc": 0.75054049, + "epoch": 1.493973410584036, + "grad_norm": 5.53125, + "learning_rate": 1.6503860776417834e-06, + "loss": 0.96927767, + "memory(GiB)": 302.58, + "step": 267140, + "train_speed(iter/s)": 0.123613 + }, + { + "acc": 0.74868593, + "epoch": 1.4940852600570151, + "grad_norm": 5.4375, + "learning_rate": 1.6496996054999203e-06, + "loss": 0.99096909, + "memory(GiB)": 302.58, + "step": 267160, + "train_speed(iter/s)": 0.123618 + }, + { + "acc": 0.75289993, + "epoch": 1.4941971095299946, + "grad_norm": 8.1875, + "learning_rate": 1.6490132479489567e-06, + "loss": 0.98437452, + "memory(GiB)": 302.58, + "step": 267180, + "train_speed(iter/s)": 0.123622 + }, + { + "acc": 0.75591469, + "epoch": 1.4943089590029737, + "grad_norm": 5.96875, + "learning_rate": 1.6483270050123652e-06, + "loss": 0.95865688, + "memory(GiB)": 302.58, + "step": 267200, + "train_speed(iter/s)": 0.123626 + }, + { + "acc": 0.74446449, + "epoch": 1.4944208084759532, + "grad_norm": 6.375, + "learning_rate": 1.6476408767136182e-06, + "loss": 0.99101553, + "memory(GiB)": 302.58, + "step": 267220, + "train_speed(iter/s)": 0.123631 + }, + { + "acc": 0.73632078, + "epoch": 1.4945326579489322, + "grad_norm": 7.5625, + "learning_rate": 1.6469548630761827e-06, + "loss": 1.04294901, + "memory(GiB)": 302.58, + "step": 267240, + "train_speed(iter/s)": 0.123635 + }, + { + "acc": 0.74651175, + "epoch": 1.4946445074219117, + "grad_norm": 5.53125, + "learning_rate": 1.6462689641235242e-06, + "loss": 0.99963923, + "memory(GiB)": 302.58, + "step": 267260, + "train_speed(iter/s)": 0.12364 + }, + { + "acc": 0.75993576, + "epoch": 1.4947563568948907, + "grad_norm": 9.0, + "learning_rate": 1.6455831798791006e-06, + "loss": 0.93409157, + "memory(GiB)": 302.58, + "step": 267280, + "train_speed(iter/s)": 0.123644 + }, + { + "acc": 0.75840402, + "epoch": 1.4948682063678702, + "grad_norm": 6.46875, + "learning_rate": 1.6448975103663688e-06, + "loss": 0.94446859, + "memory(GiB)": 302.58, + "step": 267300, + "train_speed(iter/s)": 0.123648 + }, + { + "acc": 0.74703989, + "epoch": 1.4949800558408493, + "grad_norm": 7.25, + "learning_rate": 1.6442119556087816e-06, + "loss": 1.00289268, + "memory(GiB)": 302.58, + "step": 267320, + "train_speed(iter/s)": 0.123653 + }, + { + "acc": 0.75404673, + "epoch": 1.4950919053138287, + "grad_norm": 6.46875, + "learning_rate": 1.643526515629786e-06, + "loss": 0.95524702, + "memory(GiB)": 302.58, + "step": 267340, + "train_speed(iter/s)": 0.123657 + }, + { + "acc": 0.75654764, + "epoch": 1.4952037547868078, + "grad_norm": 5.25, + "learning_rate": 1.6428411904528262e-06, + "loss": 0.95805159, + "memory(GiB)": 302.58, + "step": 267360, + "train_speed(iter/s)": 0.123661 + }, + { + "acc": 0.75123558, + "epoch": 1.4953156042597873, + "grad_norm": 7.75, + "learning_rate": 1.6421559801013438e-06, + "loss": 0.9765728, + "memory(GiB)": 302.58, + "step": 267380, + "train_speed(iter/s)": 0.123665 + }, + { + "acc": 0.76749377, + "epoch": 1.4954274537327663, + "grad_norm": 6.375, + "learning_rate": 1.6414708845987736e-06, + "loss": 0.90702429, + "memory(GiB)": 302.58, + "step": 267400, + "train_speed(iter/s)": 0.123669 + }, + { + "acc": 0.74286122, + "epoch": 1.4955393032057458, + "grad_norm": 8.0625, + "learning_rate": 1.6407859039685492e-06, + "loss": 1.01551991, + "memory(GiB)": 302.58, + "step": 267420, + "train_speed(iter/s)": 0.123674 + }, + { + "acc": 0.76362844, + "epoch": 1.4956511526787248, + "grad_norm": 10.5, + "learning_rate": 1.640101038234097e-06, + "loss": 0.92872114, + "memory(GiB)": 302.58, + "step": 267440, + "train_speed(iter/s)": 0.123678 + }, + { + "acc": 0.74639087, + "epoch": 1.4957630021517043, + "grad_norm": 5.375, + "learning_rate": 1.6394162874188447e-06, + "loss": 1.0108448, + "memory(GiB)": 302.58, + "step": 267460, + "train_speed(iter/s)": 0.123682 + }, + { + "acc": 0.7612731, + "epoch": 1.4958748516246834, + "grad_norm": 6.125, + "learning_rate": 1.6387316515462115e-06, + "loss": 0.93322449, + "memory(GiB)": 302.58, + "step": 267480, + "train_speed(iter/s)": 0.123686 + }, + { + "acc": 0.75763307, + "epoch": 1.4959867010976629, + "grad_norm": 6.53125, + "learning_rate": 1.6380471306396146e-06, + "loss": 0.95595398, + "memory(GiB)": 302.58, + "step": 267500, + "train_speed(iter/s)": 0.123691 + }, + { + "acc": 0.74990263, + "epoch": 1.496098550570642, + "grad_norm": 8.25, + "learning_rate": 1.6373627247224654e-06, + "loss": 0.99556112, + "memory(GiB)": 302.58, + "step": 267520, + "train_speed(iter/s)": 0.123695 + }, + { + "acc": 0.74409328, + "epoch": 1.4962104000436214, + "grad_norm": 8.375, + "learning_rate": 1.636678433818174e-06, + "loss": 1.01797113, + "memory(GiB)": 302.58, + "step": 267540, + "train_speed(iter/s)": 0.1237 + }, + { + "acc": 0.73344493, + "epoch": 1.4963222495166004, + "grad_norm": 9.4375, + "learning_rate": 1.6359942579501453e-06, + "loss": 1.06944733, + "memory(GiB)": 302.58, + "step": 267560, + "train_speed(iter/s)": 0.123704 + }, + { + "acc": 0.75226007, + "epoch": 1.49643409898958, + "grad_norm": 7.5, + "learning_rate": 1.63531019714178e-06, + "loss": 0.96874437, + "memory(GiB)": 302.58, + "step": 267580, + "train_speed(iter/s)": 0.123708 + }, + { + "acc": 0.74408369, + "epoch": 1.496545948462559, + "grad_norm": 5.78125, + "learning_rate": 1.6346262514164745e-06, + "loss": 1.0201951, + "memory(GiB)": 302.58, + "step": 267600, + "train_speed(iter/s)": 0.123712 + }, + { + "acc": 0.73043399, + "epoch": 1.4966577979355384, + "grad_norm": 4.75, + "learning_rate": 1.6339424207976228e-06, + "loss": 1.05593405, + "memory(GiB)": 302.58, + "step": 267620, + "train_speed(iter/s)": 0.123717 + }, + { + "acc": 0.76995201, + "epoch": 1.4967696474085175, + "grad_norm": 7.15625, + "learning_rate": 1.633258705308614e-06, + "loss": 0.88947077, + "memory(GiB)": 302.58, + "step": 267640, + "train_speed(iter/s)": 0.123721 + }, + { + "acc": 0.7417676, + "epoch": 1.496881496881497, + "grad_norm": 9.3125, + "learning_rate": 1.6325751049728334e-06, + "loss": 1.00663004, + "memory(GiB)": 302.58, + "step": 267660, + "train_speed(iter/s)": 0.123726 + }, + { + "acc": 0.73396187, + "epoch": 1.4969933463544762, + "grad_norm": 5.59375, + "learning_rate": 1.6318916198136613e-06, + "loss": 1.05050535, + "memory(GiB)": 302.58, + "step": 267680, + "train_speed(iter/s)": 0.12373 + }, + { + "acc": 0.73402209, + "epoch": 1.4971051958274555, + "grad_norm": 8.0625, + "learning_rate": 1.6312082498544762e-06, + "loss": 1.0783267, + "memory(GiB)": 302.58, + "step": 267700, + "train_speed(iter/s)": 0.123734 + }, + { + "acc": 0.756531, + "epoch": 1.4972170453004348, + "grad_norm": 6.34375, + "learning_rate": 1.6305249951186492e-06, + "loss": 0.94694777, + "memory(GiB)": 302.58, + "step": 267720, + "train_speed(iter/s)": 0.123738 + }, + { + "acc": 0.75882254, + "epoch": 1.497328894773414, + "grad_norm": 6.625, + "learning_rate": 1.6298418556295536e-06, + "loss": 0.95171995, + "memory(GiB)": 302.58, + "step": 267740, + "train_speed(iter/s)": 0.123742 + }, + { + "acc": 0.73639622, + "epoch": 1.4974407442463933, + "grad_norm": 9.375, + "learning_rate": 1.6291588314105527e-06, + "loss": 1.06145401, + "memory(GiB)": 302.58, + "step": 267760, + "train_speed(iter/s)": 0.123746 + }, + { + "acc": 0.75306115, + "epoch": 1.4975525937193725, + "grad_norm": 7.65625, + "learning_rate": 1.6284759224850088e-06, + "loss": 0.95162287, + "memory(GiB)": 302.58, + "step": 267780, + "train_speed(iter/s)": 0.123751 + }, + { + "acc": 0.75864882, + "epoch": 1.4976644431923518, + "grad_norm": 8.0625, + "learning_rate": 1.6277931288762788e-06, + "loss": 0.94074116, + "memory(GiB)": 302.58, + "step": 267800, + "train_speed(iter/s)": 0.123755 + }, + { + "acc": 0.76801195, + "epoch": 1.497776292665331, + "grad_norm": 6.875, + "learning_rate": 1.6271104506077168e-06, + "loss": 0.90333519, + "memory(GiB)": 302.58, + "step": 267820, + "train_speed(iter/s)": 0.123759 + }, + { + "acc": 0.74993854, + "epoch": 1.4978881421383103, + "grad_norm": 9.75, + "learning_rate": 1.626427887702673e-06, + "loss": 0.98440113, + "memory(GiB)": 302.58, + "step": 267840, + "train_speed(iter/s)": 0.123764 + }, + { + "acc": 0.75347896, + "epoch": 1.4979999916112896, + "grad_norm": 7.84375, + "learning_rate": 1.6257454401844925e-06, + "loss": 0.97432604, + "memory(GiB)": 302.58, + "step": 267860, + "train_speed(iter/s)": 0.123768 + }, + { + "acc": 0.74031544, + "epoch": 1.4981118410842689, + "grad_norm": 10.3125, + "learning_rate": 1.6250631080765173e-06, + "loss": 1.01722183, + "memory(GiB)": 302.58, + "step": 267880, + "train_speed(iter/s)": 0.123772 + }, + { + "acc": 0.73419824, + "epoch": 1.4982236905572481, + "grad_norm": 6.40625, + "learning_rate": 1.6243808914020854e-06, + "loss": 1.06000338, + "memory(GiB)": 302.58, + "step": 267900, + "train_speed(iter/s)": 0.123777 + }, + { + "acc": 0.76855264, + "epoch": 1.4983355400302274, + "grad_norm": 7.15625, + "learning_rate": 1.6236987901845309e-06, + "loss": 0.8763196, + "memory(GiB)": 302.58, + "step": 267920, + "train_speed(iter/s)": 0.123781 + }, + { + "acc": 0.74732084, + "epoch": 1.4984473895032067, + "grad_norm": 5.34375, + "learning_rate": 1.6230168044471834e-06, + "loss": 1.00646553, + "memory(GiB)": 302.58, + "step": 267940, + "train_speed(iter/s)": 0.123786 + }, + { + "acc": 0.74606862, + "epoch": 1.498559238976186, + "grad_norm": 5.875, + "learning_rate": 1.62233493421337e-06, + "loss": 1.02089996, + "memory(GiB)": 302.58, + "step": 267960, + "train_speed(iter/s)": 0.12379 + }, + { + "acc": 0.74843302, + "epoch": 1.4986710884491652, + "grad_norm": 7.71875, + "learning_rate": 1.62165317950641e-06, + "loss": 0.99541512, + "memory(GiB)": 302.58, + "step": 267980, + "train_speed(iter/s)": 0.123794 + }, + { + "acc": 0.7489377, + "epoch": 1.4987829379221445, + "grad_norm": 6.5625, + "learning_rate": 1.6209715403496257e-06, + "loss": 0.97700052, + "memory(GiB)": 302.58, + "step": 268000, + "train_speed(iter/s)": 0.123799 + }, + { + "epoch": 1.4987829379221445, + "eval_acc": 0.70688520326396, + "eval_loss": 1.011939525604248, + "eval_runtime": 7536.3873, + "eval_samples_per_second": 9.989, + "eval_steps_per_second": 9.989, + "step": 268000 + }, + { + "acc": 0.75571923, + "epoch": 1.4988947873951237, + "grad_norm": 5.1875, + "learning_rate": 1.6202900167663289e-06, + "loss": 0.96554794, + "memory(GiB)": 302.58, + "step": 268020, + "train_speed(iter/s)": 0.123366 + }, + { + "acc": 0.73569593, + "epoch": 1.499006636868103, + "grad_norm": 6.5, + "learning_rate": 1.6196086087798302e-06, + "loss": 1.04874268, + "memory(GiB)": 302.58, + "step": 268040, + "train_speed(iter/s)": 0.12337 + }, + { + "acc": 0.7359767, + "epoch": 1.4991184863410822, + "grad_norm": 6.34375, + "learning_rate": 1.6189273164134366e-06, + "loss": 1.04676981, + "memory(GiB)": 302.58, + "step": 268060, + "train_speed(iter/s)": 0.123375 + }, + { + "acc": 0.75264716, + "epoch": 1.4992303358140615, + "grad_norm": 8.125, + "learning_rate": 1.6182461396904498e-06, + "loss": 0.96423397, + "memory(GiB)": 302.58, + "step": 268080, + "train_speed(iter/s)": 0.123379 + }, + { + "acc": 0.74310036, + "epoch": 1.4993421852870408, + "grad_norm": 8.0625, + "learning_rate": 1.617565078634168e-06, + "loss": 1.02112103, + "memory(GiB)": 302.58, + "step": 268100, + "train_speed(iter/s)": 0.123383 + }, + { + "acc": 0.73553309, + "epoch": 1.49945403476002, + "grad_norm": 6.34375, + "learning_rate": 1.6168841332678859e-06, + "loss": 1.06389036, + "memory(GiB)": 302.58, + "step": 268120, + "train_speed(iter/s)": 0.123388 + }, + { + "acc": 0.73444529, + "epoch": 1.4995658842329993, + "grad_norm": 8.875, + "learning_rate": 1.616203303614894e-06, + "loss": 1.08566322, + "memory(GiB)": 302.58, + "step": 268140, + "train_speed(iter/s)": 0.123392 + }, + { + "acc": 0.7463336, + "epoch": 1.4996777337059786, + "grad_norm": 9.6875, + "learning_rate": 1.615522589698479e-06, + "loss": 1.00696764, + "memory(GiB)": 302.58, + "step": 268160, + "train_speed(iter/s)": 0.123396 + }, + { + "acc": 0.7466012, + "epoch": 1.4997895831789578, + "grad_norm": 7.15625, + "learning_rate": 1.6148419915419234e-06, + "loss": 1.00003881, + "memory(GiB)": 302.58, + "step": 268180, + "train_speed(iter/s)": 0.1234 + }, + { + "acc": 0.7468154, + "epoch": 1.499901432651937, + "grad_norm": 8.3125, + "learning_rate": 1.6141615091685053e-06, + "loss": 0.99105473, + "memory(GiB)": 302.58, + "step": 268200, + "train_speed(iter/s)": 0.123405 + }, + { + "acc": 0.74974518, + "epoch": 1.5000132821249164, + "grad_norm": 6.53125, + "learning_rate": 1.6134811426015002e-06, + "loss": 0.99811621, + "memory(GiB)": 302.58, + "step": 268220, + "train_speed(iter/s)": 0.123409 + }, + { + "acc": 0.7464695, + "epoch": 1.5001251315978954, + "grad_norm": 8.875, + "learning_rate": 1.612800891864178e-06, + "loss": 0.9827322, + "memory(GiB)": 302.58, + "step": 268240, + "train_speed(iter/s)": 0.123413 + }, + { + "acc": 0.7495141, + "epoch": 1.5002369810708749, + "grad_norm": 5.625, + "learning_rate": 1.6121207569798048e-06, + "loss": 0.99810467, + "memory(GiB)": 302.58, + "step": 268260, + "train_speed(iter/s)": 0.123417 + }, + { + "acc": 0.75158992, + "epoch": 1.500348830543854, + "grad_norm": 7.25, + "learning_rate": 1.6114407379716457e-06, + "loss": 0.99868593, + "memory(GiB)": 302.58, + "step": 268280, + "train_speed(iter/s)": 0.123422 + }, + { + "acc": 0.75934305, + "epoch": 1.5004606800168334, + "grad_norm": 6.84375, + "learning_rate": 1.6107608348629577e-06, + "loss": 0.91780405, + "memory(GiB)": 302.58, + "step": 268300, + "train_speed(iter/s)": 0.123425 + }, + { + "acc": 0.75798268, + "epoch": 1.5005725294898125, + "grad_norm": 5.34375, + "learning_rate": 1.6100810476769968e-06, + "loss": 0.9559042, + "memory(GiB)": 302.58, + "step": 268320, + "train_speed(iter/s)": 0.12343 + }, + { + "acc": 0.73008094, + "epoch": 1.500684378962792, + "grad_norm": 8.8125, + "learning_rate": 1.609401376437013e-06, + "loss": 1.08026419, + "memory(GiB)": 302.58, + "step": 268340, + "train_speed(iter/s)": 0.123434 + }, + { + "acc": 0.77042346, + "epoch": 1.500796228435771, + "grad_norm": 8.8125, + "learning_rate": 1.6087218211662537e-06, + "loss": 0.91620865, + "memory(GiB)": 302.58, + "step": 268360, + "train_speed(iter/s)": 0.123438 + }, + { + "acc": 0.74625316, + "epoch": 1.5009080779087505, + "grad_norm": 6.4375, + "learning_rate": 1.6080423818879592e-06, + "loss": 1.00285149, + "memory(GiB)": 302.58, + "step": 268380, + "train_speed(iter/s)": 0.123442 + }, + { + "acc": 0.73964825, + "epoch": 1.5010199273817295, + "grad_norm": 8.6875, + "learning_rate": 1.607363058625373e-06, + "loss": 1.02218523, + "memory(GiB)": 302.58, + "step": 268400, + "train_speed(iter/s)": 0.123447 + }, + { + "acc": 0.74071083, + "epoch": 1.501131776854709, + "grad_norm": 8.375, + "learning_rate": 1.606683851401728e-06, + "loss": 1.05114126, + "memory(GiB)": 302.58, + "step": 268420, + "train_speed(iter/s)": 0.123451 + }, + { + "acc": 0.76006265, + "epoch": 1.501243626327688, + "grad_norm": 9.375, + "learning_rate": 1.606004760240255e-06, + "loss": 0.94533615, + "memory(GiB)": 302.58, + "step": 268440, + "train_speed(iter/s)": 0.123455 + }, + { + "acc": 0.74536057, + "epoch": 1.5013554758006675, + "grad_norm": 6.9375, + "learning_rate": 1.6053257851641812e-06, + "loss": 0.98582497, + "memory(GiB)": 302.58, + "step": 268460, + "train_speed(iter/s)": 0.12346 + }, + { + "acc": 0.7567349, + "epoch": 1.5014673252736466, + "grad_norm": 8.3125, + "learning_rate": 1.6046469261967302e-06, + "loss": 0.95945158, + "memory(GiB)": 302.58, + "step": 268480, + "train_speed(iter/s)": 0.123464 + }, + { + "acc": 0.75998454, + "epoch": 1.501579174746626, + "grad_norm": 7.78125, + "learning_rate": 1.6039681833611203e-06, + "loss": 0.94658508, + "memory(GiB)": 302.58, + "step": 268500, + "train_speed(iter/s)": 0.123469 + }, + { + "acc": 0.72947755, + "epoch": 1.501691024219605, + "grad_norm": 6.21875, + "learning_rate": 1.6032895566805672e-06, + "loss": 1.08765907, + "memory(GiB)": 302.58, + "step": 268520, + "train_speed(iter/s)": 0.123473 + }, + { + "acc": 0.74660568, + "epoch": 1.5018028736925846, + "grad_norm": 7.75, + "learning_rate": 1.6026110461782818e-06, + "loss": 0.99839029, + "memory(GiB)": 302.58, + "step": 268540, + "train_speed(iter/s)": 0.123477 + }, + { + "acc": 0.75225515, + "epoch": 1.5019147231655636, + "grad_norm": 8.375, + "learning_rate": 1.601932651877472e-06, + "loss": 0.97667274, + "memory(GiB)": 302.58, + "step": 268560, + "train_speed(iter/s)": 0.123482 + }, + { + "acc": 0.73378501, + "epoch": 1.5020265726385431, + "grad_norm": 8.4375, + "learning_rate": 1.6012543738013398e-06, + "loss": 1.05233221, + "memory(GiB)": 302.58, + "step": 268580, + "train_speed(iter/s)": 0.123486 + }, + { + "acc": 0.74141126, + "epoch": 1.5021384221115222, + "grad_norm": 5.53125, + "learning_rate": 1.600576211973084e-06, + "loss": 1.01386414, + "memory(GiB)": 302.58, + "step": 268600, + "train_speed(iter/s)": 0.12349 + }, + { + "acc": 0.76667566, + "epoch": 1.5022502715845016, + "grad_norm": 10.3125, + "learning_rate": 1.599898166415902e-06, + "loss": 0.90889997, + "memory(GiB)": 302.58, + "step": 268620, + "train_speed(iter/s)": 0.123495 + }, + { + "acc": 0.75988832, + "epoch": 1.5023621210574807, + "grad_norm": 6.65625, + "learning_rate": 1.5992202371529846e-06, + "loss": 0.9433485, + "memory(GiB)": 302.58, + "step": 268640, + "train_speed(iter/s)": 0.123499 + }, + { + "acc": 0.74716301, + "epoch": 1.5024739705304602, + "grad_norm": 4.0625, + "learning_rate": 1.5985424242075188e-06, + "loss": 1.00230236, + "memory(GiB)": 302.58, + "step": 268660, + "train_speed(iter/s)": 0.123504 + }, + { + "acc": 0.75554037, + "epoch": 1.5025858200034392, + "grad_norm": 7.90625, + "learning_rate": 1.5978647276026877e-06, + "loss": 0.9824151, + "memory(GiB)": 302.58, + "step": 268680, + "train_speed(iter/s)": 0.123508 + }, + { + "acc": 0.76395016, + "epoch": 1.5026976694764187, + "grad_norm": 9.5, + "learning_rate": 1.5971871473616708e-06, + "loss": 0.90209122, + "memory(GiB)": 302.58, + "step": 268700, + "train_speed(iter/s)": 0.123513 + }, + { + "acc": 0.75074358, + "epoch": 1.5028095189493977, + "grad_norm": 6.375, + "learning_rate": 1.5965096835076433e-06, + "loss": 0.97097673, + "memory(GiB)": 302.58, + "step": 268720, + "train_speed(iter/s)": 0.123517 + }, + { + "acc": 0.74893308, + "epoch": 1.5029213684223772, + "grad_norm": 7.34375, + "learning_rate": 1.5958323360637767e-06, + "loss": 1.00705185, + "memory(GiB)": 302.58, + "step": 268740, + "train_speed(iter/s)": 0.123521 + }, + { + "acc": 0.75738883, + "epoch": 1.5030332178953563, + "grad_norm": 5.8125, + "learning_rate": 1.5951551050532389e-06, + "loss": 0.98160429, + "memory(GiB)": 302.58, + "step": 268760, + "train_speed(iter/s)": 0.123526 + }, + { + "acc": 0.75619698, + "epoch": 1.5031450673683358, + "grad_norm": 5.8125, + "learning_rate": 1.594477990499193e-06, + "loss": 0.97108183, + "memory(GiB)": 302.58, + "step": 268780, + "train_speed(iter/s)": 0.123531 + }, + { + "acc": 0.75694036, + "epoch": 1.5032569168413148, + "grad_norm": 7.0625, + "learning_rate": 1.5938009924247983e-06, + "loss": 0.95428343, + "memory(GiB)": 302.58, + "step": 268800, + "train_speed(iter/s)": 0.123535 + }, + { + "acc": 0.75452442, + "epoch": 1.5033687663142943, + "grad_norm": 7.1875, + "learning_rate": 1.5931241108532102e-06, + "loss": 0.97585764, + "memory(GiB)": 302.58, + "step": 268820, + "train_speed(iter/s)": 0.12354 + }, + { + "acc": 0.75121899, + "epoch": 1.5034806157872733, + "grad_norm": 9.25, + "learning_rate": 1.5924473458075806e-06, + "loss": 0.97156, + "memory(GiB)": 302.58, + "step": 268840, + "train_speed(iter/s)": 0.123544 + }, + { + "acc": 0.75474725, + "epoch": 1.5035924652602528, + "grad_norm": 8.5625, + "learning_rate": 1.5917706973110553e-06, + "loss": 0.97820454, + "memory(GiB)": 302.58, + "step": 268860, + "train_speed(iter/s)": 0.123548 + }, + { + "acc": 0.73218741, + "epoch": 1.5037043147332319, + "grad_norm": 5.65625, + "learning_rate": 1.5910941653867807e-06, + "loss": 1.06268158, + "memory(GiB)": 302.58, + "step": 268880, + "train_speed(iter/s)": 0.123552 + }, + { + "acc": 0.74115582, + "epoch": 1.5038161642062113, + "grad_norm": 10.25, + "learning_rate": 1.5904177500578954e-06, + "loss": 1.03726482, + "memory(GiB)": 302.58, + "step": 268900, + "train_speed(iter/s)": 0.123557 + }, + { + "acc": 0.75648417, + "epoch": 1.5039280136791906, + "grad_norm": 10.3125, + "learning_rate": 1.5897414513475346e-06, + "loss": 0.93559599, + "memory(GiB)": 302.58, + "step": 268920, + "train_speed(iter/s)": 0.123561 + }, + { + "acc": 0.75290051, + "epoch": 1.5040398631521699, + "grad_norm": 7.25, + "learning_rate": 1.5890652692788293e-06, + "loss": 0.96504221, + "memory(GiB)": 302.58, + "step": 268940, + "train_speed(iter/s)": 0.123565 + }, + { + "acc": 0.76078653, + "epoch": 1.5041517126251491, + "grad_norm": 4.625, + "learning_rate": 1.5883892038749078e-06, + "loss": 0.95330915, + "memory(GiB)": 302.58, + "step": 268960, + "train_speed(iter/s)": 0.123569 + }, + { + "acc": 0.76019397, + "epoch": 1.5042635620981284, + "grad_norm": 9.8125, + "learning_rate": 1.5877132551588936e-06, + "loss": 0.93598366, + "memory(GiB)": 302.58, + "step": 268980, + "train_speed(iter/s)": 0.123574 + }, + { + "acc": 0.75472169, + "epoch": 1.5043754115711077, + "grad_norm": 8.3125, + "learning_rate": 1.587037423153906e-06, + "loss": 0.98327703, + "memory(GiB)": 302.58, + "step": 269000, + "train_speed(iter/s)": 0.123578 + }, + { + "acc": 0.76408329, + "epoch": 1.504487261044087, + "grad_norm": 7.40625, + "learning_rate": 1.5863617078830607e-06, + "loss": 0.89919949, + "memory(GiB)": 302.58, + "step": 269020, + "train_speed(iter/s)": 0.123582 + }, + { + "acc": 0.76133928, + "epoch": 1.5045991105170662, + "grad_norm": 11.375, + "learning_rate": 1.5856861093694692e-06, + "loss": 0.91827736, + "memory(GiB)": 302.58, + "step": 269040, + "train_speed(iter/s)": 0.123586 + }, + { + "acc": 0.74266415, + "epoch": 1.5047109599900454, + "grad_norm": 6.78125, + "learning_rate": 1.5850106276362392e-06, + "loss": 1.01867504, + "memory(GiB)": 302.58, + "step": 269060, + "train_speed(iter/s)": 0.123591 + }, + { + "acc": 0.72875752, + "epoch": 1.5048228094630247, + "grad_norm": 12.375, + "learning_rate": 1.584335262706475e-06, + "loss": 1.08578243, + "memory(GiB)": 302.58, + "step": 269080, + "train_speed(iter/s)": 0.123595 + }, + { + "acc": 0.75073557, + "epoch": 1.504934658936004, + "grad_norm": 5.9375, + "learning_rate": 1.5836600146032748e-06, + "loss": 0.98889771, + "memory(GiB)": 302.58, + "step": 269100, + "train_speed(iter/s)": 0.123599 + }, + { + "acc": 0.75458984, + "epoch": 1.5050465084089832, + "grad_norm": 5.96875, + "learning_rate": 1.5829848833497357e-06, + "loss": 0.98354845, + "memory(GiB)": 302.58, + "step": 269120, + "train_speed(iter/s)": 0.123604 + }, + { + "acc": 0.7557498, + "epoch": 1.5051583578819625, + "grad_norm": 8.0, + "learning_rate": 1.582309868968947e-06, + "loss": 0.94086256, + "memory(GiB)": 302.58, + "step": 269140, + "train_speed(iter/s)": 0.123608 + }, + { + "acc": 0.73986716, + "epoch": 1.5052702073549418, + "grad_norm": 7.4375, + "learning_rate": 1.581634971483999e-06, + "loss": 1.03638172, + "memory(GiB)": 302.58, + "step": 269160, + "train_speed(iter/s)": 0.123612 + }, + { + "acc": 0.76007481, + "epoch": 1.505382056827921, + "grad_norm": 8.5625, + "learning_rate": 1.5809601909179751e-06, + "loss": 0.93575191, + "memory(GiB)": 302.58, + "step": 269180, + "train_speed(iter/s)": 0.123617 + }, + { + "acc": 0.7490047, + "epoch": 1.5054939063009003, + "grad_norm": 8.5, + "learning_rate": 1.5802855272939537e-06, + "loss": 0.96985807, + "memory(GiB)": 302.58, + "step": 269200, + "train_speed(iter/s)": 0.123621 + }, + { + "acc": 0.75476441, + "epoch": 1.5056057557738796, + "grad_norm": 9.125, + "learning_rate": 1.5796109806350108e-06, + "loss": 0.94570942, + "memory(GiB)": 302.58, + "step": 269220, + "train_speed(iter/s)": 0.123626 + }, + { + "acc": 0.75455523, + "epoch": 1.5057176052468588, + "grad_norm": 7.59375, + "learning_rate": 1.578936550964218e-06, + "loss": 0.95628939, + "memory(GiB)": 302.58, + "step": 269240, + "train_speed(iter/s)": 0.12363 + }, + { + "acc": 0.75468006, + "epoch": 1.505829454719838, + "grad_norm": 8.0625, + "learning_rate": 1.5782622383046437e-06, + "loss": 0.95075445, + "memory(GiB)": 302.58, + "step": 269260, + "train_speed(iter/s)": 0.123635 + }, + { + "acc": 0.76283636, + "epoch": 1.5059413041928174, + "grad_norm": 8.4375, + "learning_rate": 1.5775880426793505e-06, + "loss": 0.95765753, + "memory(GiB)": 302.58, + "step": 269280, + "train_speed(iter/s)": 0.123639 + }, + { + "acc": 0.75417175, + "epoch": 1.5060531536657966, + "grad_norm": 7.34375, + "learning_rate": 1.5769139641113983e-06, + "loss": 0.97212191, + "memory(GiB)": 302.58, + "step": 269300, + "train_speed(iter/s)": 0.123644 + }, + { + "acc": 0.74635634, + "epoch": 1.5061650031387759, + "grad_norm": 5.6875, + "learning_rate": 1.5762400026238428e-06, + "loss": 1.01935511, + "memory(GiB)": 302.58, + "step": 269320, + "train_speed(iter/s)": 0.123648 + }, + { + "acc": 0.7545835, + "epoch": 1.5062768526117551, + "grad_norm": 4.75, + "learning_rate": 1.5755661582397358e-06, + "loss": 0.9353981, + "memory(GiB)": 302.58, + "step": 269340, + "train_speed(iter/s)": 0.123652 + }, + { + "acc": 0.76048946, + "epoch": 1.5063887020847344, + "grad_norm": 5.46875, + "learning_rate": 1.574892430982125e-06, + "loss": 0.94591312, + "memory(GiB)": 302.58, + "step": 269360, + "train_speed(iter/s)": 0.123656 + }, + { + "acc": 0.73786035, + "epoch": 1.5065005515577137, + "grad_norm": 7.03125, + "learning_rate": 1.5742188208740533e-06, + "loss": 1.05693779, + "memory(GiB)": 302.58, + "step": 269380, + "train_speed(iter/s)": 0.123661 + }, + { + "acc": 0.73816538, + "epoch": 1.506612401030693, + "grad_norm": 7.0, + "learning_rate": 1.573545327938561e-06, + "loss": 1.01378412, + "memory(GiB)": 302.58, + "step": 269400, + "train_speed(iter/s)": 0.123665 + }, + { + "acc": 0.75216222, + "epoch": 1.5067242505036722, + "grad_norm": 5.15625, + "learning_rate": 1.5728719521986819e-06, + "loss": 0.98692865, + "memory(GiB)": 302.58, + "step": 269420, + "train_speed(iter/s)": 0.123669 + }, + { + "acc": 0.74982362, + "epoch": 1.5068360999766515, + "grad_norm": 6.375, + "learning_rate": 1.5721986936774508e-06, + "loss": 0.99464903, + "memory(GiB)": 302.58, + "step": 269440, + "train_speed(iter/s)": 0.123674 + }, + { + "acc": 0.73578782, + "epoch": 1.5069479494496307, + "grad_norm": 7.46875, + "learning_rate": 1.571525552397894e-06, + "loss": 1.02819128, + "memory(GiB)": 302.58, + "step": 269460, + "train_speed(iter/s)": 0.123678 + }, + { + "acc": 0.75449452, + "epoch": 1.50705979892261, + "grad_norm": 11.25, + "learning_rate": 1.5708525283830338e-06, + "loss": 0.96029978, + "memory(GiB)": 302.58, + "step": 269480, + "train_speed(iter/s)": 0.123683 + }, + { + "acc": 0.73663316, + "epoch": 1.5071716483955893, + "grad_norm": 5.71875, + "learning_rate": 1.5701796216558912e-06, + "loss": 1.0451767, + "memory(GiB)": 302.58, + "step": 269500, + "train_speed(iter/s)": 0.123687 + }, + { + "acc": 0.74186978, + "epoch": 1.5072834978685685, + "grad_norm": 6.65625, + "learning_rate": 1.5695068322394814e-06, + "loss": 1.01434994, + "memory(GiB)": 302.58, + "step": 269520, + "train_speed(iter/s)": 0.123691 + }, + { + "acc": 0.75662837, + "epoch": 1.5073953473415478, + "grad_norm": 5.96875, + "learning_rate": 1.5688341601568158e-06, + "loss": 0.96299467, + "memory(GiB)": 302.58, + "step": 269540, + "train_speed(iter/s)": 0.123695 + }, + { + "acc": 0.75714593, + "epoch": 1.507507196814527, + "grad_norm": 8.25, + "learning_rate": 1.5681616054309012e-06, + "loss": 0.93345585, + "memory(GiB)": 302.58, + "step": 269560, + "train_speed(iter/s)": 0.1237 + }, + { + "acc": 0.7441102, + "epoch": 1.5076190462875063, + "grad_norm": 6.28125, + "learning_rate": 1.567489168084742e-06, + "loss": 1.00437498, + "memory(GiB)": 302.58, + "step": 269580, + "train_speed(iter/s)": 0.123704 + }, + { + "acc": 0.76325512, + "epoch": 1.5077308957604856, + "grad_norm": 7.625, + "learning_rate": 1.5668168481413375e-06, + "loss": 0.90810394, + "memory(GiB)": 302.58, + "step": 269600, + "train_speed(iter/s)": 0.123708 + }, + { + "acc": 0.72964725, + "epoch": 1.5078427452334648, + "grad_norm": 7.5, + "learning_rate": 1.5661446456236818e-06, + "loss": 1.08411102, + "memory(GiB)": 302.58, + "step": 269620, + "train_speed(iter/s)": 0.123713 + }, + { + "acc": 0.75989499, + "epoch": 1.507954594706444, + "grad_norm": 6.625, + "learning_rate": 1.5654725605547694e-06, + "loss": 0.95350256, + "memory(GiB)": 302.58, + "step": 269640, + "train_speed(iter/s)": 0.123717 + }, + { + "acc": 0.74632368, + "epoch": 1.5080664441794234, + "grad_norm": 7.375, + "learning_rate": 1.5648005929575854e-06, + "loss": 1.01146069, + "memory(GiB)": 302.58, + "step": 269660, + "train_speed(iter/s)": 0.123721 + }, + { + "acc": 0.76176057, + "epoch": 1.5081782936524026, + "grad_norm": 10.5, + "learning_rate": 1.5641287428551143e-06, + "loss": 0.92143946, + "memory(GiB)": 302.58, + "step": 269680, + "train_speed(iter/s)": 0.123726 + }, + { + "acc": 0.75077376, + "epoch": 1.508290143125382, + "grad_norm": 4.5, + "learning_rate": 1.563457010270335e-06, + "loss": 0.98261833, + "memory(GiB)": 302.58, + "step": 269700, + "train_speed(iter/s)": 0.12373 + }, + { + "acc": 0.75940671, + "epoch": 1.5084019925983612, + "grad_norm": 8.125, + "learning_rate": 1.5627853952262228e-06, + "loss": 0.93654871, + "memory(GiB)": 302.58, + "step": 269720, + "train_speed(iter/s)": 0.123734 + }, + { + "acc": 0.75731325, + "epoch": 1.5085138420713404, + "grad_norm": 7.71875, + "learning_rate": 1.5621138977457479e-06, + "loss": 0.95309496, + "memory(GiB)": 302.58, + "step": 269740, + "train_speed(iter/s)": 0.123739 + }, + { + "acc": 0.76348991, + "epoch": 1.5086256915443197, + "grad_norm": 5.9375, + "learning_rate": 1.56144251785188e-06, + "loss": 0.93080702, + "memory(GiB)": 302.58, + "step": 269760, + "train_speed(iter/s)": 0.123744 + }, + { + "acc": 0.74376001, + "epoch": 1.508737541017299, + "grad_norm": 8.375, + "learning_rate": 1.5607712555675813e-06, + "loss": 1.00543213, + "memory(GiB)": 302.58, + "step": 269780, + "train_speed(iter/s)": 0.123748 + }, + { + "acc": 0.76038671, + "epoch": 1.5088493904902782, + "grad_norm": 5.625, + "learning_rate": 1.560100110915812e-06, + "loss": 0.95396986, + "memory(GiB)": 302.58, + "step": 269800, + "train_speed(iter/s)": 0.123753 + }, + { + "acc": 0.74663091, + "epoch": 1.5089612399632575, + "grad_norm": 5.46875, + "learning_rate": 1.5594290839195258e-06, + "loss": 1.00918264, + "memory(GiB)": 302.58, + "step": 269820, + "train_speed(iter/s)": 0.123757 + }, + { + "acc": 0.74825368, + "epoch": 1.5090730894362367, + "grad_norm": 8.625, + "learning_rate": 1.558758174601675e-06, + "loss": 0.98348103, + "memory(GiB)": 302.58, + "step": 269840, + "train_speed(iter/s)": 0.123761 + }, + { + "acc": 0.75230575, + "epoch": 1.509184938909216, + "grad_norm": 6.75, + "learning_rate": 1.5580873829852062e-06, + "loss": 0.96699286, + "memory(GiB)": 302.58, + "step": 269860, + "train_speed(iter/s)": 0.123766 + }, + { + "acc": 0.75282092, + "epoch": 1.5092967883821953, + "grad_norm": 7.125, + "learning_rate": 1.5574167090930631e-06, + "loss": 0.93331976, + "memory(GiB)": 302.58, + "step": 269880, + "train_speed(iter/s)": 0.12377 + }, + { + "acc": 0.74322586, + "epoch": 1.5094086378551745, + "grad_norm": 6.8125, + "learning_rate": 1.5567461529481848e-06, + "loss": 1.03398676, + "memory(GiB)": 302.58, + "step": 269900, + "train_speed(iter/s)": 0.123774 + }, + { + "acc": 0.75056801, + "epoch": 1.5095204873281538, + "grad_norm": 8.625, + "learning_rate": 1.5560757145735061e-06, + "loss": 1.00367355, + "memory(GiB)": 302.58, + "step": 269920, + "train_speed(iter/s)": 0.123778 + }, + { + "acc": 0.74146781, + "epoch": 1.509632336801133, + "grad_norm": 7.71875, + "learning_rate": 1.555405393991959e-06, + "loss": 1.01200905, + "memory(GiB)": 302.58, + "step": 269940, + "train_speed(iter/s)": 0.123783 + }, + { + "acc": 0.74908586, + "epoch": 1.5097441862741123, + "grad_norm": 6.84375, + "learning_rate": 1.554735191226469e-06, + "loss": 0.99818048, + "memory(GiB)": 302.58, + "step": 269960, + "train_speed(iter/s)": 0.123787 + }, + { + "acc": 0.7509675, + "epoch": 1.5098560357470916, + "grad_norm": 10.3125, + "learning_rate": 1.5540651062999608e-06, + "loss": 0.97842617, + "memory(GiB)": 302.58, + "step": 269980, + "train_speed(iter/s)": 0.123791 + }, + { + "acc": 0.75532546, + "epoch": 1.5099678852200709, + "grad_norm": 5.78125, + "learning_rate": 1.5533951392353524e-06, + "loss": 0.94838867, + "memory(GiB)": 302.58, + "step": 270000, + "train_speed(iter/s)": 0.123795 + }, + { + "epoch": 1.5099678852200709, + "eval_acc": 0.706881801785707, + "eval_loss": 1.0118446350097656, + "eval_runtime": 7533.642, + "eval_samples_per_second": 9.993, + "eval_steps_per_second": 9.993, + "step": 270000 + }, + { + "acc": 0.73965259, + "epoch": 1.5100797346930501, + "grad_norm": 4.4375, + "learning_rate": 1.5527252900555573e-06, + "loss": 1.04998741, + "memory(GiB)": 302.58, + "step": 270020, + "train_speed(iter/s)": 0.123366 + }, + { + "acc": 0.75928211, + "epoch": 1.5101915841660294, + "grad_norm": 8.375, + "learning_rate": 1.55205555878349e-06, + "loss": 0.95311794, + "memory(GiB)": 302.58, + "step": 270040, + "train_speed(iter/s)": 0.12337 + }, + { + "acc": 0.75439954, + "epoch": 1.5103034336390087, + "grad_norm": 9.0625, + "learning_rate": 1.551385945442056e-06, + "loss": 0.96470585, + "memory(GiB)": 302.58, + "step": 270060, + "train_speed(iter/s)": 0.123374 + }, + { + "acc": 0.75286665, + "epoch": 1.510415283111988, + "grad_norm": 8.125, + "learning_rate": 1.5507164500541582e-06, + "loss": 0.97395802, + "memory(GiB)": 302.58, + "step": 270080, + "train_speed(iter/s)": 0.123379 + }, + { + "acc": 0.75807528, + "epoch": 1.5105271325849672, + "grad_norm": 7.59375, + "learning_rate": 1.5500470726426948e-06, + "loss": 0.9553977, + "memory(GiB)": 302.58, + "step": 270100, + "train_speed(iter/s)": 0.123383 + }, + { + "acc": 0.76291118, + "epoch": 1.5106389820579464, + "grad_norm": 7.96875, + "learning_rate": 1.549377813230561e-06, + "loss": 0.95470314, + "memory(GiB)": 302.58, + "step": 270120, + "train_speed(iter/s)": 0.123387 + }, + { + "acc": 0.75785918, + "epoch": 1.5107508315309257, + "grad_norm": 7.46875, + "learning_rate": 1.5487086718406475e-06, + "loss": 0.95090799, + "memory(GiB)": 302.58, + "step": 270140, + "train_speed(iter/s)": 0.123392 + }, + { + "acc": 0.75047126, + "epoch": 1.510862681003905, + "grad_norm": 8.125, + "learning_rate": 1.5480396484958415e-06, + "loss": 0.97935476, + "memory(GiB)": 302.58, + "step": 270160, + "train_speed(iter/s)": 0.123396 + }, + { + "acc": 0.7553998, + "epoch": 1.5109745304768842, + "grad_norm": 7.6875, + "learning_rate": 1.547370743219025e-06, + "loss": 0.96259451, + "memory(GiB)": 302.58, + "step": 270180, + "train_speed(iter/s)": 0.123401 + }, + { + "acc": 0.73304954, + "epoch": 1.5110863799498635, + "grad_norm": 7.96875, + "learning_rate": 1.5467019560330775e-06, + "loss": 1.06842871, + "memory(GiB)": 302.58, + "step": 270200, + "train_speed(iter/s)": 0.123405 + }, + { + "acc": 0.76226025, + "epoch": 1.5111982294228428, + "grad_norm": 9.6875, + "learning_rate": 1.546033286960873e-06, + "loss": 0.91135607, + "memory(GiB)": 302.58, + "step": 270220, + "train_speed(iter/s)": 0.123409 + }, + { + "acc": 0.73593879, + "epoch": 1.511310078895822, + "grad_norm": 5.9375, + "learning_rate": 1.545364736025282e-06, + "loss": 1.05283575, + "memory(GiB)": 302.58, + "step": 270240, + "train_speed(iter/s)": 0.123414 + }, + { + "acc": 0.76208401, + "epoch": 1.5114219283688013, + "grad_norm": 8.25, + "learning_rate": 1.544696303249172e-06, + "loss": 0.90701065, + "memory(GiB)": 302.58, + "step": 270260, + "train_speed(iter/s)": 0.123418 + }, + { + "acc": 0.75762639, + "epoch": 1.5115337778417806, + "grad_norm": 9.375, + "learning_rate": 1.5440279886554028e-06, + "loss": 0.94299698, + "memory(GiB)": 302.58, + "step": 270280, + "train_speed(iter/s)": 0.123422 + }, + { + "acc": 0.73867168, + "epoch": 1.5116456273147598, + "grad_norm": 7.78125, + "learning_rate": 1.543359792266837e-06, + "loss": 1.03229399, + "memory(GiB)": 302.58, + "step": 270300, + "train_speed(iter/s)": 0.123426 + }, + { + "acc": 0.76028132, + "epoch": 1.511757476787739, + "grad_norm": 7.0625, + "learning_rate": 1.5426917141063269e-06, + "loss": 0.93113747, + "memory(GiB)": 302.58, + "step": 270320, + "train_speed(iter/s)": 0.123431 + }, + { + "acc": 0.74257097, + "epoch": 1.5118693262607183, + "grad_norm": 6.78125, + "learning_rate": 1.5420237541967226e-06, + "loss": 1.02753096, + "memory(GiB)": 302.58, + "step": 270340, + "train_speed(iter/s)": 0.123435 + }, + { + "acc": 0.75151687, + "epoch": 1.5119811757336976, + "grad_norm": 5.8125, + "learning_rate": 1.5413559125608713e-06, + "loss": 0.99287157, + "memory(GiB)": 302.58, + "step": 270360, + "train_speed(iter/s)": 0.123439 + }, + { + "acc": 0.76266541, + "epoch": 1.5120930252066769, + "grad_norm": 7.0, + "learning_rate": 1.5406881892216147e-06, + "loss": 0.93289604, + "memory(GiB)": 302.58, + "step": 270380, + "train_speed(iter/s)": 0.123443 + }, + { + "acc": 0.75881863, + "epoch": 1.5122048746796561, + "grad_norm": 5.5, + "learning_rate": 1.5400205842017912e-06, + "loss": 0.94586935, + "memory(GiB)": 302.58, + "step": 270400, + "train_speed(iter/s)": 0.123447 + }, + { + "acc": 0.76388807, + "epoch": 1.5123167241526354, + "grad_norm": 10.0, + "learning_rate": 1.5393530975242349e-06, + "loss": 0.91312084, + "memory(GiB)": 302.58, + "step": 270420, + "train_speed(iter/s)": 0.123451 + }, + { + "acc": 0.75917082, + "epoch": 1.5124285736256147, + "grad_norm": 7.28125, + "learning_rate": 1.5386857292117762e-06, + "loss": 0.9570466, + "memory(GiB)": 302.58, + "step": 270440, + "train_speed(iter/s)": 0.123455 + }, + { + "acc": 0.75436015, + "epoch": 1.512540423098594, + "grad_norm": 6.96875, + "learning_rate": 1.538018479287241e-06, + "loss": 0.96898632, + "memory(GiB)": 302.58, + "step": 270460, + "train_speed(iter/s)": 0.12346 + }, + { + "acc": 0.75776405, + "epoch": 1.5126522725715732, + "grad_norm": 7.625, + "learning_rate": 1.5373513477734513e-06, + "loss": 0.94823923, + "memory(GiB)": 302.58, + "step": 270480, + "train_speed(iter/s)": 0.123464 + }, + { + "acc": 0.75104518, + "epoch": 1.5127641220445525, + "grad_norm": 7.9375, + "learning_rate": 1.536684334693226e-06, + "loss": 0.95867577, + "memory(GiB)": 302.58, + "step": 270500, + "train_speed(iter/s)": 0.123468 + }, + { + "acc": 0.7379631, + "epoch": 1.5128759715175317, + "grad_norm": 9.75, + "learning_rate": 1.5360174400693779e-06, + "loss": 1.03048172, + "memory(GiB)": 302.58, + "step": 270520, + "train_speed(iter/s)": 0.123473 + }, + { + "acc": 0.76037784, + "epoch": 1.512987820990511, + "grad_norm": 8.4375, + "learning_rate": 1.5353506639247173e-06, + "loss": 0.95416336, + "memory(GiB)": 302.58, + "step": 270540, + "train_speed(iter/s)": 0.123477 + }, + { + "acc": 0.73893542, + "epoch": 1.5130996704634903, + "grad_norm": 11.3125, + "learning_rate": 1.5346840062820483e-06, + "loss": 1.01871805, + "memory(GiB)": 302.58, + "step": 270560, + "train_speed(iter/s)": 0.123481 + }, + { + "acc": 0.75990672, + "epoch": 1.5132115199364695, + "grad_norm": 9.3125, + "learning_rate": 1.5340174671641762e-06, + "loss": 0.94703808, + "memory(GiB)": 302.58, + "step": 270580, + "train_speed(iter/s)": 0.123485 + }, + { + "acc": 0.75006461, + "epoch": 1.5133233694094488, + "grad_norm": 4.875, + "learning_rate": 1.5333510465938972e-06, + "loss": 0.99798851, + "memory(GiB)": 302.58, + "step": 270600, + "train_speed(iter/s)": 0.12349 + }, + { + "acc": 0.74900861, + "epoch": 1.513435218882428, + "grad_norm": 9.375, + "learning_rate": 1.5326847445940046e-06, + "loss": 0.97075472, + "memory(GiB)": 302.58, + "step": 270620, + "train_speed(iter/s)": 0.123494 + }, + { + "acc": 0.74336162, + "epoch": 1.5135470683554073, + "grad_norm": 7.1875, + "learning_rate": 1.532018561187289e-06, + "loss": 1.00634232, + "memory(GiB)": 302.58, + "step": 270640, + "train_speed(iter/s)": 0.123499 + }, + { + "acc": 0.77307105, + "epoch": 1.5136589178283866, + "grad_norm": 7.78125, + "learning_rate": 1.5313524963965344e-06, + "loss": 0.90167475, + "memory(GiB)": 302.58, + "step": 270660, + "train_speed(iter/s)": 0.123503 + }, + { + "acc": 0.75121922, + "epoch": 1.5137707673013658, + "grad_norm": 6.46875, + "learning_rate": 1.530686550244524e-06, + "loss": 0.98258181, + "memory(GiB)": 302.58, + "step": 270680, + "train_speed(iter/s)": 0.123507 + }, + { + "acc": 0.74461737, + "epoch": 1.513882616774345, + "grad_norm": 4.46875, + "learning_rate": 1.5300207227540338e-06, + "loss": 1.01767168, + "memory(GiB)": 302.58, + "step": 270700, + "train_speed(iter/s)": 0.123511 + }, + { + "acc": 0.7507452, + "epoch": 1.5139944662473244, + "grad_norm": 9.0625, + "learning_rate": 1.529355013947838e-06, + "loss": 0.97243032, + "memory(GiB)": 302.58, + "step": 270720, + "train_speed(iter/s)": 0.123515 + }, + { + "acc": 0.74305429, + "epoch": 1.5141063157203036, + "grad_norm": 9.625, + "learning_rate": 1.528689423848706e-06, + "loss": 0.98714094, + "memory(GiB)": 302.58, + "step": 270740, + "train_speed(iter/s)": 0.12352 + }, + { + "acc": 0.76250873, + "epoch": 1.514218165193283, + "grad_norm": 7.21875, + "learning_rate": 1.5280239524794027e-06, + "loss": 0.9340476, + "memory(GiB)": 302.58, + "step": 270760, + "train_speed(iter/s)": 0.123524 + }, + { + "acc": 0.76480355, + "epoch": 1.5143300146662622, + "grad_norm": 9.8125, + "learning_rate": 1.52735859986269e-06, + "loss": 0.92043018, + "memory(GiB)": 302.58, + "step": 270780, + "train_speed(iter/s)": 0.123528 + }, + { + "acc": 0.74265103, + "epoch": 1.5144418641392414, + "grad_norm": 6.9375, + "learning_rate": 1.5266933660213245e-06, + "loss": 1.00607662, + "memory(GiB)": 302.58, + "step": 270800, + "train_speed(iter/s)": 0.123532 + }, + { + "acc": 0.76710291, + "epoch": 1.5145537136122207, + "grad_norm": 9.25, + "learning_rate": 1.5260282509780595e-06, + "loss": 0.92461824, + "memory(GiB)": 302.58, + "step": 270820, + "train_speed(iter/s)": 0.123537 + }, + { + "acc": 0.7633945, + "epoch": 1.5146655630852, + "grad_norm": 9.6875, + "learning_rate": 1.5253632547556418e-06, + "loss": 0.93431854, + "memory(GiB)": 302.58, + "step": 270840, + "train_speed(iter/s)": 0.123541 + }, + { + "acc": 0.7496645, + "epoch": 1.5147774125581792, + "grad_norm": 6.46875, + "learning_rate": 1.524698377376821e-06, + "loss": 0.97468882, + "memory(GiB)": 302.58, + "step": 270860, + "train_speed(iter/s)": 0.123545 + }, + { + "acc": 0.76801562, + "epoch": 1.5148892620311585, + "grad_norm": 8.4375, + "learning_rate": 1.5240336188643352e-06, + "loss": 0.92173157, + "memory(GiB)": 302.58, + "step": 270880, + "train_speed(iter/s)": 0.12355 + }, + { + "acc": 0.75565643, + "epoch": 1.5150011115041377, + "grad_norm": 9.4375, + "learning_rate": 1.5233689792409196e-06, + "loss": 0.95359268, + "memory(GiB)": 302.58, + "step": 270900, + "train_speed(iter/s)": 0.123554 + }, + { + "acc": 0.75426984, + "epoch": 1.515112960977117, + "grad_norm": 9.0625, + "learning_rate": 1.522704458529311e-06, + "loss": 0.96548557, + "memory(GiB)": 302.58, + "step": 270920, + "train_speed(iter/s)": 0.123558 + }, + { + "acc": 0.7265542, + "epoch": 1.5152248104500963, + "grad_norm": 7.15625, + "learning_rate": 1.5220400567522364e-06, + "loss": 1.08440351, + "memory(GiB)": 302.58, + "step": 270940, + "train_speed(iter/s)": 0.123562 + }, + { + "acc": 0.76520343, + "epoch": 1.5153366599230755, + "grad_norm": 9.5, + "learning_rate": 1.5213757739324199e-06, + "loss": 0.91778421, + "memory(GiB)": 302.58, + "step": 270960, + "train_speed(iter/s)": 0.123567 + }, + { + "acc": 0.72697701, + "epoch": 1.5154485093960548, + "grad_norm": 8.9375, + "learning_rate": 1.5207116100925823e-06, + "loss": 1.10573826, + "memory(GiB)": 302.58, + "step": 270980, + "train_speed(iter/s)": 0.123571 + }, + { + "acc": 0.73856616, + "epoch": 1.515560358869034, + "grad_norm": 5.6875, + "learning_rate": 1.5200475652554403e-06, + "loss": 1.03481874, + "memory(GiB)": 302.58, + "step": 271000, + "train_speed(iter/s)": 0.123575 + }, + { + "acc": 0.73894362, + "epoch": 1.5156722083420133, + "grad_norm": 6.59375, + "learning_rate": 1.5193836394437062e-06, + "loss": 1.04361544, + "memory(GiB)": 302.58, + "step": 271020, + "train_speed(iter/s)": 0.123579 + }, + { + "acc": 0.76510925, + "epoch": 1.5157840578149926, + "grad_norm": 8.3125, + "learning_rate": 1.5187198326800877e-06, + "loss": 0.92133474, + "memory(GiB)": 302.58, + "step": 271040, + "train_speed(iter/s)": 0.123584 + }, + { + "acc": 0.76497431, + "epoch": 1.5158959072879719, + "grad_norm": 6.9375, + "learning_rate": 1.5180561449872905e-06, + "loss": 0.91852503, + "memory(GiB)": 302.58, + "step": 271060, + "train_speed(iter/s)": 0.123588 + }, + { + "acc": 0.74816623, + "epoch": 1.5160077567609511, + "grad_norm": 4.96875, + "learning_rate": 1.5173925763880144e-06, + "loss": 1.01268864, + "memory(GiB)": 302.58, + "step": 271080, + "train_speed(iter/s)": 0.123593 + }, + { + "acc": 0.75253763, + "epoch": 1.5161196062339304, + "grad_norm": 4.625, + "learning_rate": 1.5167291269049543e-06, + "loss": 0.98914499, + "memory(GiB)": 302.58, + "step": 271100, + "train_speed(iter/s)": 0.123597 + }, + { + "acc": 0.75698647, + "epoch": 1.5162314557069096, + "grad_norm": 9.5, + "learning_rate": 1.5160657965608038e-06, + "loss": 0.94426451, + "memory(GiB)": 302.58, + "step": 271120, + "train_speed(iter/s)": 0.123601 + }, + { + "acc": 0.76019859, + "epoch": 1.516343305179889, + "grad_norm": 8.5625, + "learning_rate": 1.51540258537825e-06, + "loss": 0.92686691, + "memory(GiB)": 302.58, + "step": 271140, + "train_speed(iter/s)": 0.123605 + }, + { + "acc": 0.76020575, + "epoch": 1.5164551546528682, + "grad_norm": 5.40625, + "learning_rate": 1.5147394933799759e-06, + "loss": 0.93362846, + "memory(GiB)": 302.58, + "step": 271160, + "train_speed(iter/s)": 0.12361 + }, + { + "acc": 0.73983402, + "epoch": 1.5165670041258474, + "grad_norm": 7.0625, + "learning_rate": 1.5140765205886642e-06, + "loss": 1.03305435, + "memory(GiB)": 302.58, + "step": 271180, + "train_speed(iter/s)": 0.123614 + }, + { + "acc": 0.74197965, + "epoch": 1.5166788535988267, + "grad_norm": 7.3125, + "learning_rate": 1.513413667026989e-06, + "loss": 1.03037157, + "memory(GiB)": 302.58, + "step": 271200, + "train_speed(iter/s)": 0.123618 + }, + { + "acc": 0.74748425, + "epoch": 1.516790703071806, + "grad_norm": 5.09375, + "learning_rate": 1.5127509327176216e-06, + "loss": 0.9817915, + "memory(GiB)": 302.58, + "step": 271220, + "train_speed(iter/s)": 0.123623 + }, + { + "acc": 0.74005442, + "epoch": 1.5169025525447852, + "grad_norm": 5.53125, + "learning_rate": 1.51208831768323e-06, + "loss": 1.03247404, + "memory(GiB)": 302.58, + "step": 271240, + "train_speed(iter/s)": 0.123627 + }, + { + "acc": 0.75029745, + "epoch": 1.5170144020177645, + "grad_norm": 7.4375, + "learning_rate": 1.5114258219464784e-06, + "loss": 0.9851655, + "memory(GiB)": 302.58, + "step": 271260, + "train_speed(iter/s)": 0.123632 + }, + { + "acc": 0.75965652, + "epoch": 1.5171262514907438, + "grad_norm": 7.78125, + "learning_rate": 1.5107634455300256e-06, + "loss": 0.94250612, + "memory(GiB)": 302.58, + "step": 271280, + "train_speed(iter/s)": 0.123636 + }, + { + "acc": 0.74342818, + "epoch": 1.517238100963723, + "grad_norm": 9.5625, + "learning_rate": 1.510101188456527e-06, + "loss": 1.01732254, + "memory(GiB)": 302.58, + "step": 271300, + "train_speed(iter/s)": 0.123641 + }, + { + "acc": 0.75188217, + "epoch": 1.5173499504367023, + "grad_norm": 7.65625, + "learning_rate": 1.5094390507486339e-06, + "loss": 0.97886105, + "memory(GiB)": 302.58, + "step": 271320, + "train_speed(iter/s)": 0.123645 + }, + { + "acc": 0.75756006, + "epoch": 1.5174617999096816, + "grad_norm": 6.5, + "learning_rate": 1.5087770324289936e-06, + "loss": 0.93973417, + "memory(GiB)": 302.58, + "step": 271340, + "train_speed(iter/s)": 0.123649 + }, + { + "acc": 0.74138336, + "epoch": 1.5175736493826608, + "grad_norm": 7.46875, + "learning_rate": 1.5081151335202486e-06, + "loss": 1.01650486, + "memory(GiB)": 302.58, + "step": 271360, + "train_speed(iter/s)": 0.123654 + }, + { + "acc": 0.77641039, + "epoch": 1.51768549885564, + "grad_norm": 8.8125, + "learning_rate": 1.5074533540450392e-06, + "loss": 0.86502666, + "memory(GiB)": 302.58, + "step": 271380, + "train_speed(iter/s)": 0.123658 + }, + { + "acc": 0.74697022, + "epoch": 1.5177973483286193, + "grad_norm": 7.375, + "learning_rate": 1.5067916940259991e-06, + "loss": 1.02301712, + "memory(GiB)": 302.58, + "step": 271400, + "train_speed(iter/s)": 0.123662 + }, + { + "acc": 0.76384416, + "epoch": 1.5179091978015986, + "grad_norm": 7.4375, + "learning_rate": 1.5061301534857603e-06, + "loss": 0.9385025, + "memory(GiB)": 302.58, + "step": 271420, + "train_speed(iter/s)": 0.123666 + }, + { + "acc": 0.74195633, + "epoch": 1.5180210472745779, + "grad_norm": 8.375, + "learning_rate": 1.505468732446947e-06, + "loss": 1.0087265, + "memory(GiB)": 302.58, + "step": 271440, + "train_speed(iter/s)": 0.12367 + }, + { + "acc": 0.7485764, + "epoch": 1.5181328967475571, + "grad_norm": 7.9375, + "learning_rate": 1.5048074309321852e-06, + "loss": 0.98685036, + "memory(GiB)": 302.58, + "step": 271460, + "train_speed(iter/s)": 0.123674 + }, + { + "acc": 0.7603837, + "epoch": 1.5182447462205364, + "grad_norm": 9.75, + "learning_rate": 1.5041462489640929e-06, + "loss": 0.94568405, + "memory(GiB)": 302.58, + "step": 271480, + "train_speed(iter/s)": 0.123679 + }, + { + "acc": 0.74945307, + "epoch": 1.5183565956935157, + "grad_norm": 7.84375, + "learning_rate": 1.5034851865652839e-06, + "loss": 0.96981754, + "memory(GiB)": 302.58, + "step": 271500, + "train_speed(iter/s)": 0.123683 + }, + { + "acc": 0.74336061, + "epoch": 1.518468445166495, + "grad_norm": 5.25, + "learning_rate": 1.5028242437583685e-06, + "loss": 1.0101079, + "memory(GiB)": 302.58, + "step": 271520, + "train_speed(iter/s)": 0.123687 + }, + { + "acc": 0.77604828, + "epoch": 1.5185802946394742, + "grad_norm": 7.53125, + "learning_rate": 1.502163420565953e-06, + "loss": 0.88296242, + "memory(GiB)": 302.58, + "step": 271540, + "train_speed(iter/s)": 0.123691 + }, + { + "acc": 0.74605188, + "epoch": 1.5186921441124535, + "grad_norm": 8.125, + "learning_rate": 1.50150271701064e-06, + "loss": 1.00630579, + "memory(GiB)": 302.58, + "step": 271560, + "train_speed(iter/s)": 0.123696 + }, + { + "acc": 0.74753423, + "epoch": 1.5188039935854327, + "grad_norm": 7.0625, + "learning_rate": 1.5008421331150274e-06, + "loss": 1.00387201, + "memory(GiB)": 302.58, + "step": 271580, + "train_speed(iter/s)": 0.1237 + }, + { + "acc": 0.76055336, + "epoch": 1.518915843058412, + "grad_norm": 8.375, + "learning_rate": 1.5001816689017096e-06, + "loss": 0.9424964, + "memory(GiB)": 302.58, + "step": 271600, + "train_speed(iter/s)": 0.123704 + }, + { + "acc": 0.76108131, + "epoch": 1.5190276925313912, + "grad_norm": 6.71875, + "learning_rate": 1.499521324393276e-06, + "loss": 0.93361301, + "memory(GiB)": 302.58, + "step": 271620, + "train_speed(iter/s)": 0.123709 + }, + { + "acc": 0.75491662, + "epoch": 1.5191395420043705, + "grad_norm": 7.5, + "learning_rate": 1.4988610996123131e-06, + "loss": 0.95350504, + "memory(GiB)": 302.58, + "step": 271640, + "train_speed(iter/s)": 0.123713 + }, + { + "acc": 0.7402688, + "epoch": 1.5192513914773498, + "grad_norm": 9.0625, + "learning_rate": 1.4982009945814025e-06, + "loss": 1.02522287, + "memory(GiB)": 302.58, + "step": 271660, + "train_speed(iter/s)": 0.123717 + }, + { + "acc": 0.75837779, + "epoch": 1.519363240950329, + "grad_norm": 8.8125, + "learning_rate": 1.4975410093231213e-06, + "loss": 0.94639463, + "memory(GiB)": 302.58, + "step": 271680, + "train_speed(iter/s)": 0.123721 + }, + { + "acc": 0.73929105, + "epoch": 1.5194750904233083, + "grad_norm": 8.0, + "learning_rate": 1.496881143860044e-06, + "loss": 1.03492317, + "memory(GiB)": 302.58, + "step": 271700, + "train_speed(iter/s)": 0.123725 + }, + { + "acc": 0.74884768, + "epoch": 1.5195869398962876, + "grad_norm": 10.4375, + "learning_rate": 1.4962213982147384e-06, + "loss": 1.00383472, + "memory(GiB)": 302.58, + "step": 271720, + "train_speed(iter/s)": 0.123729 + }, + { + "acc": 0.73133402, + "epoch": 1.5196987893692668, + "grad_norm": 8.0, + "learning_rate": 1.495561772409772e-06, + "loss": 1.07506981, + "memory(GiB)": 302.58, + "step": 271740, + "train_speed(iter/s)": 0.123734 + }, + { + "acc": 0.76455917, + "epoch": 1.519810638842246, + "grad_norm": 6.0, + "learning_rate": 1.4949022664677054e-06, + "loss": 0.91977072, + "memory(GiB)": 302.58, + "step": 271760, + "train_speed(iter/s)": 0.123738 + }, + { + "acc": 0.75958176, + "epoch": 1.5199224883152254, + "grad_norm": 5.1875, + "learning_rate": 1.494242880411096e-06, + "loss": 0.94092073, + "memory(GiB)": 302.58, + "step": 271780, + "train_speed(iter/s)": 0.123742 + }, + { + "acc": 0.73986673, + "epoch": 1.5200343377882046, + "grad_norm": 6.5625, + "learning_rate": 1.4935836142624966e-06, + "loss": 1.023629, + "memory(GiB)": 302.58, + "step": 271800, + "train_speed(iter/s)": 0.123746 + }, + { + "acc": 0.74599247, + "epoch": 1.520146187261184, + "grad_norm": 7.875, + "learning_rate": 1.492924468044456e-06, + "loss": 0.99358845, + "memory(GiB)": 302.58, + "step": 271820, + "train_speed(iter/s)": 0.123751 + }, + { + "acc": 0.73027215, + "epoch": 1.5202580367341632, + "grad_norm": 8.625, + "learning_rate": 1.4922654417795191e-06, + "loss": 1.0680707, + "memory(GiB)": 302.58, + "step": 271840, + "train_speed(iter/s)": 0.123755 + }, + { + "acc": 0.7481554, + "epoch": 1.5203698862071424, + "grad_norm": 6.625, + "learning_rate": 1.491606535490227e-06, + "loss": 1.01016588, + "memory(GiB)": 302.58, + "step": 271860, + "train_speed(iter/s)": 0.123759 + }, + { + "acc": 0.75525742, + "epoch": 1.5204817356801217, + "grad_norm": 7.15625, + "learning_rate": 1.490947749199116e-06, + "loss": 0.95481081, + "memory(GiB)": 302.58, + "step": 271880, + "train_speed(iter/s)": 0.123763 + }, + { + "acc": 0.74637947, + "epoch": 1.520593585153101, + "grad_norm": 9.4375, + "learning_rate": 1.4902890829287186e-06, + "loss": 1.00501575, + "memory(GiB)": 302.58, + "step": 271900, + "train_speed(iter/s)": 0.123768 + }, + { + "acc": 0.7639112, + "epoch": 1.5207054346260802, + "grad_norm": 5.15625, + "learning_rate": 1.489630536701564e-06, + "loss": 0.90612154, + "memory(GiB)": 302.58, + "step": 271920, + "train_speed(iter/s)": 0.123772 + }, + { + "acc": 0.7631403, + "epoch": 1.5208172840990595, + "grad_norm": 7.09375, + "learning_rate": 1.488972110540176e-06, + "loss": 0.92521791, + "memory(GiB)": 302.58, + "step": 271940, + "train_speed(iter/s)": 0.123776 + }, + { + "acc": 0.73767471, + "epoch": 1.5209291335720387, + "grad_norm": 5.71875, + "learning_rate": 1.4883138044670752e-06, + "loss": 1.05733356, + "memory(GiB)": 302.58, + "step": 271960, + "train_speed(iter/s)": 0.123781 + }, + { + "acc": 0.75003881, + "epoch": 1.521040983045018, + "grad_norm": 6.9375, + "learning_rate": 1.4876556185047758e-06, + "loss": 0.96273346, + "memory(GiB)": 302.58, + "step": 271980, + "train_speed(iter/s)": 0.123785 + }, + { + "acc": 0.7606895, + "epoch": 1.5211528325179973, + "grad_norm": 6.96875, + "learning_rate": 1.4869975526757929e-06, + "loss": 0.92418823, + "memory(GiB)": 302.58, + "step": 272000, + "train_speed(iter/s)": 0.123789 + }, + { + "epoch": 1.5211528325179973, + "eval_acc": 0.7068915132526029, + "eval_loss": 1.0118788480758667, + "eval_runtime": 7550.3819, + "eval_samples_per_second": 9.971, + "eval_steps_per_second": 9.971, + "step": 272000 + }, + { + "acc": 0.72642527, + "epoch": 1.5212646819909765, + "grad_norm": 5.8125, + "learning_rate": 1.4863396070026337e-06, + "loss": 1.08203659, + "memory(GiB)": 302.58, + "step": 272020, + "train_speed(iter/s)": 0.123362 + }, + { + "acc": 0.75859361, + "epoch": 1.5213765314639558, + "grad_norm": 7.46875, + "learning_rate": 1.4856817815078012e-06, + "loss": 0.9560339, + "memory(GiB)": 302.58, + "step": 272040, + "train_speed(iter/s)": 0.123367 + }, + { + "acc": 0.76731687, + "epoch": 1.521488380936935, + "grad_norm": 8.125, + "learning_rate": 1.4850240762137952e-06, + "loss": 0.92389412, + "memory(GiB)": 302.58, + "step": 272060, + "train_speed(iter/s)": 0.123371 + }, + { + "acc": 0.76845012, + "epoch": 1.5216002304099143, + "grad_norm": 6.28125, + "learning_rate": 1.4843664911431115e-06, + "loss": 0.89478788, + "memory(GiB)": 302.58, + "step": 272080, + "train_speed(iter/s)": 0.123375 + }, + { + "acc": 0.74149637, + "epoch": 1.5217120798828936, + "grad_norm": 4.71875, + "learning_rate": 1.483709026318242e-06, + "loss": 1.03862848, + "memory(GiB)": 302.58, + "step": 272100, + "train_speed(iter/s)": 0.12338 + }, + { + "acc": 0.75810146, + "epoch": 1.5218239293558729, + "grad_norm": 6.9375, + "learning_rate": 1.483051681761673e-06, + "loss": 0.9393774, + "memory(GiB)": 302.58, + "step": 272120, + "train_speed(iter/s)": 0.123384 + }, + { + "acc": 0.74636612, + "epoch": 1.5219357788288521, + "grad_norm": 7.71875, + "learning_rate": 1.482394457495887e-06, + "loss": 1.00303335, + "memory(GiB)": 302.58, + "step": 272140, + "train_speed(iter/s)": 0.123388 + }, + { + "acc": 0.74155064, + "epoch": 1.5220476283018314, + "grad_norm": 8.25, + "learning_rate": 1.4817373535433666e-06, + "loss": 1.01009283, + "memory(GiB)": 302.58, + "step": 272160, + "train_speed(iter/s)": 0.123392 + }, + { + "acc": 0.74977875, + "epoch": 1.5221594777748106, + "grad_norm": 10.9375, + "learning_rate": 1.4810803699265847e-06, + "loss": 0.98638592, + "memory(GiB)": 302.58, + "step": 272180, + "train_speed(iter/s)": 0.123396 + }, + { + "acc": 0.75051198, + "epoch": 1.52227132724779, + "grad_norm": 7.0625, + "learning_rate": 1.4804235066680123e-06, + "loss": 0.97666759, + "memory(GiB)": 302.58, + "step": 272200, + "train_speed(iter/s)": 0.1234 + }, + { + "acc": 0.74935088, + "epoch": 1.5223831767207692, + "grad_norm": 7.53125, + "learning_rate": 1.4797667637901158e-06, + "loss": 1.00114117, + "memory(GiB)": 302.58, + "step": 272220, + "train_speed(iter/s)": 0.123405 + }, + { + "acc": 0.73946843, + "epoch": 1.5224950261937484, + "grad_norm": 8.6875, + "learning_rate": 1.4791101413153586e-06, + "loss": 1.04984798, + "memory(GiB)": 302.58, + "step": 272240, + "train_speed(iter/s)": 0.123409 + }, + { + "acc": 0.75066419, + "epoch": 1.5226068756667277, + "grad_norm": 7.71875, + "learning_rate": 1.4784536392661991e-06, + "loss": 0.9734354, + "memory(GiB)": 302.58, + "step": 272260, + "train_speed(iter/s)": 0.123413 + }, + { + "acc": 0.7413878, + "epoch": 1.522718725139707, + "grad_norm": 5.6875, + "learning_rate": 1.4777972576650917e-06, + "loss": 1.03346481, + "memory(GiB)": 302.58, + "step": 272280, + "train_speed(iter/s)": 0.123418 + }, + { + "acc": 0.75163794, + "epoch": 1.5228305746126862, + "grad_norm": 9.375, + "learning_rate": 1.4771409965344868e-06, + "loss": 0.97685261, + "memory(GiB)": 302.58, + "step": 272300, + "train_speed(iter/s)": 0.123422 + }, + { + "acc": 0.7525444, + "epoch": 1.5229424240856655, + "grad_norm": 6.375, + "learning_rate": 1.4764848558968287e-06, + "loss": 0.97102728, + "memory(GiB)": 302.58, + "step": 272320, + "train_speed(iter/s)": 0.123426 + }, + { + "acc": 0.76273451, + "epoch": 1.5230542735586448, + "grad_norm": 10.0625, + "learning_rate": 1.4758288357745631e-06, + "loss": 0.94243069, + "memory(GiB)": 302.58, + "step": 272340, + "train_speed(iter/s)": 0.123431 + }, + { + "acc": 0.74962664, + "epoch": 1.523166123031624, + "grad_norm": 6.65625, + "learning_rate": 1.4751729361901256e-06, + "loss": 0.97423735, + "memory(GiB)": 302.58, + "step": 272360, + "train_speed(iter/s)": 0.123435 + }, + { + "acc": 0.75301824, + "epoch": 1.5232779725046033, + "grad_norm": 8.625, + "learning_rate": 1.474517157165951e-06, + "loss": 0.9669898, + "memory(GiB)": 302.58, + "step": 272380, + "train_speed(iter/s)": 0.123439 + }, + { + "acc": 0.73529754, + "epoch": 1.5233898219775825, + "grad_norm": 7.0625, + "learning_rate": 1.4738614987244686e-06, + "loss": 1.05820856, + "memory(GiB)": 302.58, + "step": 272400, + "train_speed(iter/s)": 0.123443 + }, + { + "acc": 0.75293331, + "epoch": 1.5235016714505618, + "grad_norm": 8.0625, + "learning_rate": 1.4732059608881045e-06, + "loss": 0.97592077, + "memory(GiB)": 302.58, + "step": 272420, + "train_speed(iter/s)": 0.123447 + }, + { + "acc": 0.74271026, + "epoch": 1.523613520923541, + "grad_norm": 7.34375, + "learning_rate": 1.4725505436792793e-06, + "loss": 1.01001463, + "memory(GiB)": 302.58, + "step": 272440, + "train_speed(iter/s)": 0.123452 + }, + { + "acc": 0.757447, + "epoch": 1.5237253703965203, + "grad_norm": 9.625, + "learning_rate": 1.4718952471204107e-06, + "loss": 0.94007816, + "memory(GiB)": 302.58, + "step": 272460, + "train_speed(iter/s)": 0.123456 + }, + { + "acc": 0.74695568, + "epoch": 1.5238372198694996, + "grad_norm": 9.3125, + "learning_rate": 1.4712400712339115e-06, + "loss": 0.99041595, + "memory(GiB)": 302.58, + "step": 272480, + "train_speed(iter/s)": 0.123461 + }, + { + "acc": 0.75332532, + "epoch": 1.5239490693424789, + "grad_norm": 7.75, + "learning_rate": 1.470585016042192e-06, + "loss": 0.9749855, + "memory(GiB)": 302.58, + "step": 272500, + "train_speed(iter/s)": 0.123465 + }, + { + "acc": 0.74726009, + "epoch": 1.5240609188154581, + "grad_norm": 6.9375, + "learning_rate": 1.469930081567656e-06, + "loss": 0.99289532, + "memory(GiB)": 302.58, + "step": 272520, + "train_speed(iter/s)": 0.123469 + }, + { + "acc": 0.74620757, + "epoch": 1.5241727682884374, + "grad_norm": 7.4375, + "learning_rate": 1.469275267832705e-06, + "loss": 0.99001856, + "memory(GiB)": 302.58, + "step": 272540, + "train_speed(iter/s)": 0.123473 + }, + { + "acc": 0.75990372, + "epoch": 1.5242846177614167, + "grad_norm": 4.78125, + "learning_rate": 1.4686205748597354e-06, + "loss": 0.92487173, + "memory(GiB)": 302.58, + "step": 272560, + "train_speed(iter/s)": 0.123478 + }, + { + "acc": 0.74918628, + "epoch": 1.524396467234396, + "grad_norm": 6.6875, + "learning_rate": 1.4679660026711394e-06, + "loss": 1.00042887, + "memory(GiB)": 302.58, + "step": 272580, + "train_speed(iter/s)": 0.123482 + }, + { + "acc": 0.7601429, + "epoch": 1.5245083167073752, + "grad_norm": 8.25, + "learning_rate": 1.467311551289305e-06, + "loss": 0.95248404, + "memory(GiB)": 302.58, + "step": 272600, + "train_speed(iter/s)": 0.123486 + }, + { + "acc": 0.75291066, + "epoch": 1.5246201661803545, + "grad_norm": 8.0, + "learning_rate": 1.4666572207366187e-06, + "loss": 0.95384874, + "memory(GiB)": 302.58, + "step": 272620, + "train_speed(iter/s)": 0.123491 + }, + { + "acc": 0.76197085, + "epoch": 1.5247320156533337, + "grad_norm": 8.4375, + "learning_rate": 1.4660030110354596e-06, + "loss": 0.91970663, + "memory(GiB)": 302.58, + "step": 272640, + "train_speed(iter/s)": 0.123495 + }, + { + "acc": 0.7580565, + "epoch": 1.524843865126313, + "grad_norm": 6.09375, + "learning_rate": 1.465348922208203e-06, + "loss": 0.95634527, + "memory(GiB)": 302.58, + "step": 272660, + "train_speed(iter/s)": 0.123499 + }, + { + "acc": 0.75238209, + "epoch": 1.5249557145992922, + "grad_norm": 5.75, + "learning_rate": 1.4646949542772214e-06, + "loss": 0.97379999, + "memory(GiB)": 302.58, + "step": 272680, + "train_speed(iter/s)": 0.123504 + }, + { + "acc": 0.76269488, + "epoch": 1.5250675640722715, + "grad_norm": 8.9375, + "learning_rate": 1.464041107264883e-06, + "loss": 0.93291883, + "memory(GiB)": 302.58, + "step": 272700, + "train_speed(iter/s)": 0.123508 + }, + { + "acc": 0.74169898, + "epoch": 1.5251794135452508, + "grad_norm": 6.625, + "learning_rate": 1.4633873811935505e-06, + "loss": 0.99821501, + "memory(GiB)": 302.58, + "step": 272720, + "train_speed(iter/s)": 0.123512 + }, + { + "acc": 0.75641203, + "epoch": 1.52529126301823, + "grad_norm": 10.0, + "learning_rate": 1.4627337760855841e-06, + "loss": 0.94580917, + "memory(GiB)": 302.58, + "step": 272740, + "train_speed(iter/s)": 0.123516 + }, + { + "acc": 0.74226551, + "epoch": 1.5254031124912093, + "grad_norm": 9.3125, + "learning_rate": 1.4620802919633387e-06, + "loss": 1.01308012, + "memory(GiB)": 302.58, + "step": 272760, + "train_speed(iter/s)": 0.12352 + }, + { + "acc": 0.7444221, + "epoch": 1.5255149619641886, + "grad_norm": 7.0625, + "learning_rate": 1.4614269288491662e-06, + "loss": 1.01477919, + "memory(GiB)": 302.58, + "step": 272780, + "train_speed(iter/s)": 0.123525 + }, + { + "acc": 0.76531105, + "epoch": 1.5256268114371678, + "grad_norm": 7.46875, + "learning_rate": 1.4607736867654127e-06, + "loss": 0.91756649, + "memory(GiB)": 302.58, + "step": 272800, + "train_speed(iter/s)": 0.123529 + }, + { + "acc": 0.75450559, + "epoch": 1.525738660910147, + "grad_norm": 6.40625, + "learning_rate": 1.460120565734422e-06, + "loss": 0.96613741, + "memory(GiB)": 302.58, + "step": 272820, + "train_speed(iter/s)": 0.123533 + }, + { + "acc": 0.77199326, + "epoch": 1.5258505103831264, + "grad_norm": 8.4375, + "learning_rate": 1.459467565778533e-06, + "loss": 0.89977293, + "memory(GiB)": 302.58, + "step": 272840, + "train_speed(iter/s)": 0.123538 + }, + { + "acc": 0.75497813, + "epoch": 1.5259623598561056, + "grad_norm": 7.625, + "learning_rate": 1.4588146869200775e-06, + "loss": 0.97218914, + "memory(GiB)": 302.58, + "step": 272860, + "train_speed(iter/s)": 0.123542 + }, + { + "acc": 0.75834465, + "epoch": 1.5260742093290849, + "grad_norm": 9.1875, + "learning_rate": 1.458161929181391e-06, + "loss": 0.93775816, + "memory(GiB)": 302.58, + "step": 272880, + "train_speed(iter/s)": 0.123546 + }, + { + "acc": 0.7504869, + "epoch": 1.5261860588020641, + "grad_norm": 11.25, + "learning_rate": 1.4575092925847967e-06, + "loss": 0.9866971, + "memory(GiB)": 302.58, + "step": 272900, + "train_speed(iter/s)": 0.12355 + }, + { + "acc": 0.76117163, + "epoch": 1.5262979082750434, + "grad_norm": 7.59375, + "learning_rate": 1.4568567771526175e-06, + "loss": 0.92572174, + "memory(GiB)": 302.58, + "step": 272920, + "train_speed(iter/s)": 0.123554 + }, + { + "acc": 0.75383363, + "epoch": 1.5264097577480227, + "grad_norm": 5.78125, + "learning_rate": 1.456204382907172e-06, + "loss": 0.97812347, + "memory(GiB)": 302.58, + "step": 272940, + "train_speed(iter/s)": 0.123559 + }, + { + "acc": 0.75649114, + "epoch": 1.526521607221002, + "grad_norm": 7.5, + "learning_rate": 1.455552109870773e-06, + "loss": 0.97131157, + "memory(GiB)": 302.58, + "step": 272960, + "train_speed(iter/s)": 0.123563 + }, + { + "acc": 0.72904406, + "epoch": 1.5266334566939812, + "grad_norm": 8.625, + "learning_rate": 1.454899958065732e-06, + "loss": 1.10353909, + "memory(GiB)": 302.58, + "step": 272980, + "train_speed(iter/s)": 0.123568 + }, + { + "acc": 0.74144773, + "epoch": 1.5267453061669605, + "grad_norm": 5.53125, + "learning_rate": 1.4542479275143528e-06, + "loss": 1.01078014, + "memory(GiB)": 302.58, + "step": 273000, + "train_speed(iter/s)": 0.123572 + }, + { + "acc": 0.77125392, + "epoch": 1.5268571556399397, + "grad_norm": 8.1875, + "learning_rate": 1.453596018238938e-06, + "loss": 0.899683, + "memory(GiB)": 302.58, + "step": 273020, + "train_speed(iter/s)": 0.123576 + }, + { + "acc": 0.74989614, + "epoch": 1.526969005112919, + "grad_norm": 7.6875, + "learning_rate": 1.4529442302617847e-06, + "loss": 0.98459816, + "memory(GiB)": 302.58, + "step": 273040, + "train_speed(iter/s)": 0.12358 + }, + { + "acc": 0.73778892, + "epoch": 1.5270808545858983, + "grad_norm": 6.03125, + "learning_rate": 1.4522925636051865e-06, + "loss": 1.02689505, + "memory(GiB)": 302.58, + "step": 273060, + "train_speed(iter/s)": 0.123584 + }, + { + "acc": 0.75291591, + "epoch": 1.5271927040588775, + "grad_norm": 6.75, + "learning_rate": 1.4516410182914314e-06, + "loss": 0.98161182, + "memory(GiB)": 302.58, + "step": 273080, + "train_speed(iter/s)": 0.123589 + }, + { + "acc": 0.75490198, + "epoch": 1.5273045535318568, + "grad_norm": 9.375, + "learning_rate": 1.4509895943428054e-06, + "loss": 0.96249084, + "memory(GiB)": 302.58, + "step": 273100, + "train_speed(iter/s)": 0.123593 + }, + { + "acc": 0.75181189, + "epoch": 1.527416403004836, + "grad_norm": 4.40625, + "learning_rate": 1.4503382917815888e-06, + "loss": 0.97854252, + "memory(GiB)": 302.58, + "step": 273120, + "train_speed(iter/s)": 0.123597 + }, + { + "acc": 0.74351106, + "epoch": 1.5275282524778153, + "grad_norm": 9.1875, + "learning_rate": 1.4496871106300569e-06, + "loss": 1.01833649, + "memory(GiB)": 302.58, + "step": 273140, + "train_speed(iter/s)": 0.123601 + }, + { + "acc": 0.75150046, + "epoch": 1.5276401019507946, + "grad_norm": 7.1875, + "learning_rate": 1.449036050910485e-06, + "loss": 0.96385336, + "memory(GiB)": 302.58, + "step": 273160, + "train_speed(iter/s)": 0.123606 + }, + { + "acc": 0.76481228, + "epoch": 1.5277519514237738, + "grad_norm": 4.625, + "learning_rate": 1.4483851126451403e-06, + "loss": 0.90212193, + "memory(GiB)": 302.58, + "step": 273180, + "train_speed(iter/s)": 0.12361 + }, + { + "acc": 0.74129395, + "epoch": 1.527863800896753, + "grad_norm": 5.25, + "learning_rate": 1.4477342958562863e-06, + "loss": 1.03216276, + "memory(GiB)": 302.58, + "step": 273200, + "train_speed(iter/s)": 0.123614 + }, + { + "acc": 0.75288606, + "epoch": 1.5279756503697324, + "grad_norm": 5.40625, + "learning_rate": 1.4470836005661832e-06, + "loss": 0.96585941, + "memory(GiB)": 302.58, + "step": 273220, + "train_speed(iter/s)": 0.123618 + }, + { + "acc": 0.75147514, + "epoch": 1.5280874998427116, + "grad_norm": 9.0, + "learning_rate": 1.4464330267970872e-06, + "loss": 0.98489571, + "memory(GiB)": 302.58, + "step": 273240, + "train_speed(iter/s)": 0.123623 + }, + { + "acc": 0.76791339, + "epoch": 1.528199349315691, + "grad_norm": 8.75, + "learning_rate": 1.4457825745712495e-06, + "loss": 0.90537663, + "memory(GiB)": 302.58, + "step": 273260, + "train_speed(iter/s)": 0.123627 + }, + { + "acc": 0.75616436, + "epoch": 1.5283111987886702, + "grad_norm": 7.15625, + "learning_rate": 1.4451322439109178e-06, + "loss": 0.97835722, + "memory(GiB)": 302.58, + "step": 273280, + "train_speed(iter/s)": 0.123632 + }, + { + "acc": 0.74147182, + "epoch": 1.5284230482616494, + "grad_norm": 8.6875, + "learning_rate": 1.444482034838336e-06, + "loss": 1.02085743, + "memory(GiB)": 302.58, + "step": 273300, + "train_speed(iter/s)": 0.123636 + }, + { + "acc": 0.75198183, + "epoch": 1.5285348977346287, + "grad_norm": 7.25, + "learning_rate": 1.4438319473757423e-06, + "loss": 0.98507786, + "memory(GiB)": 302.58, + "step": 273320, + "train_speed(iter/s)": 0.12364 + }, + { + "acc": 0.75144963, + "epoch": 1.528646747207608, + "grad_norm": 7.25, + "learning_rate": 1.443181981545373e-06, + "loss": 0.98981333, + "memory(GiB)": 302.58, + "step": 273340, + "train_speed(iter/s)": 0.123645 + }, + { + "acc": 0.76098113, + "epoch": 1.5287585966805872, + "grad_norm": 8.0625, + "learning_rate": 1.4425321373694573e-06, + "loss": 0.94627113, + "memory(GiB)": 302.58, + "step": 273360, + "train_speed(iter/s)": 0.123649 + }, + { + "acc": 0.75048032, + "epoch": 1.5288704461535665, + "grad_norm": 8.4375, + "learning_rate": 1.441882414870222e-06, + "loss": 0.96948996, + "memory(GiB)": 302.58, + "step": 273380, + "train_speed(iter/s)": 0.123653 + }, + { + "acc": 0.76685095, + "epoch": 1.5289822956265458, + "grad_norm": 5.96875, + "learning_rate": 1.4412328140698922e-06, + "loss": 0.90974216, + "memory(GiB)": 302.58, + "step": 273400, + "train_speed(iter/s)": 0.123658 + }, + { + "acc": 0.75134673, + "epoch": 1.529094145099525, + "grad_norm": 6.03125, + "learning_rate": 1.4405833349906846e-06, + "loss": 0.98425083, + "memory(GiB)": 302.58, + "step": 273420, + "train_speed(iter/s)": 0.123662 + }, + { + "acc": 0.7578125, + "epoch": 1.5292059945725043, + "grad_norm": 9.5625, + "learning_rate": 1.4399339776548137e-06, + "loss": 0.93727579, + "memory(GiB)": 302.58, + "step": 273440, + "train_speed(iter/s)": 0.123666 + }, + { + "acc": 0.75744829, + "epoch": 1.5293178440454835, + "grad_norm": 5.375, + "learning_rate": 1.4392847420844874e-06, + "loss": 0.97057219, + "memory(GiB)": 302.58, + "step": 273460, + "train_speed(iter/s)": 0.12367 + }, + { + "acc": 0.75715227, + "epoch": 1.5294296935184628, + "grad_norm": 9.625, + "learning_rate": 1.4386356283019153e-06, + "loss": 0.96198816, + "memory(GiB)": 302.58, + "step": 273480, + "train_speed(iter/s)": 0.123675 + }, + { + "acc": 0.76518135, + "epoch": 1.529541542991442, + "grad_norm": 9.1875, + "learning_rate": 1.4379866363292983e-06, + "loss": 0.9155901, + "memory(GiB)": 302.58, + "step": 273500, + "train_speed(iter/s)": 0.123679 + }, + { + "acc": 0.75181684, + "epoch": 1.5296533924644213, + "grad_norm": 7.1875, + "learning_rate": 1.4373377661888322e-06, + "loss": 0.97000427, + "memory(GiB)": 302.58, + "step": 273520, + "train_speed(iter/s)": 0.123683 + }, + { + "acc": 0.75254011, + "epoch": 1.5297652419374006, + "grad_norm": 9.0, + "learning_rate": 1.436689017902712e-06, + "loss": 0.97921658, + "memory(GiB)": 302.58, + "step": 273540, + "train_speed(iter/s)": 0.123686 + }, + { + "acc": 0.77285376, + "epoch": 1.5298770914103799, + "grad_norm": 7.15625, + "learning_rate": 1.4360403914931258e-06, + "loss": 0.87317753, + "memory(GiB)": 302.58, + "step": 273560, + "train_speed(iter/s)": 0.123691 + }, + { + "acc": 0.76832628, + "epoch": 1.5299889408833591, + "grad_norm": 8.3125, + "learning_rate": 1.4353918869822597e-06, + "loss": 0.89648037, + "memory(GiB)": 302.58, + "step": 273580, + "train_speed(iter/s)": 0.123695 + }, + { + "acc": 0.76647234, + "epoch": 1.5301007903563384, + "grad_norm": 12.625, + "learning_rate": 1.434743504392294e-06, + "loss": 0.90549536, + "memory(GiB)": 302.58, + "step": 273600, + "train_speed(iter/s)": 0.123699 + }, + { + "acc": 0.75359468, + "epoch": 1.5302126398293177, + "grad_norm": 6.21875, + "learning_rate": 1.4340952437454053e-06, + "loss": 0.95986004, + "memory(GiB)": 302.58, + "step": 273620, + "train_speed(iter/s)": 0.123703 + }, + { + "acc": 0.75291152, + "epoch": 1.530324489302297, + "grad_norm": 7.8125, + "learning_rate": 1.4334471050637665e-06, + "loss": 0.96336384, + "memory(GiB)": 302.58, + "step": 273640, + "train_speed(iter/s)": 0.123707 + }, + { + "acc": 0.76510515, + "epoch": 1.5304363387752762, + "grad_norm": 7.625, + "learning_rate": 1.4327990883695464e-06, + "loss": 0.91393232, + "memory(GiB)": 302.58, + "step": 273660, + "train_speed(iter/s)": 0.123712 + }, + { + "acc": 0.75727019, + "epoch": 1.5305481882482554, + "grad_norm": 7.46875, + "learning_rate": 1.4321511936849086e-06, + "loss": 0.93454933, + "memory(GiB)": 302.58, + "step": 273680, + "train_speed(iter/s)": 0.123716 + }, + { + "acc": 0.74097071, + "epoch": 1.5306600377212347, + "grad_norm": 8.25, + "learning_rate": 1.4315034210320134e-06, + "loss": 0.99556484, + "memory(GiB)": 302.58, + "step": 273700, + "train_speed(iter/s)": 0.12372 + }, + { + "acc": 0.76402555, + "epoch": 1.530771887194214, + "grad_norm": 6.59375, + "learning_rate": 1.4308557704330162e-06, + "loss": 0.92869749, + "memory(GiB)": 302.58, + "step": 273720, + "train_speed(iter/s)": 0.123724 + }, + { + "acc": 0.73033657, + "epoch": 1.5308837366671932, + "grad_norm": 7.0625, + "learning_rate": 1.4302082419100677e-06, + "loss": 1.07304544, + "memory(GiB)": 302.58, + "step": 273740, + "train_speed(iter/s)": 0.123729 + }, + { + "acc": 0.74808345, + "epoch": 1.5309955861401725, + "grad_norm": 8.0625, + "learning_rate": 1.4295608354853185e-06, + "loss": 0.98485193, + "memory(GiB)": 302.58, + "step": 273760, + "train_speed(iter/s)": 0.123733 + }, + { + "acc": 0.7578824, + "epoch": 1.5311074356131518, + "grad_norm": 7.125, + "learning_rate": 1.4289135511809098e-06, + "loss": 0.96673737, + "memory(GiB)": 302.58, + "step": 273780, + "train_speed(iter/s)": 0.123737 + }, + { + "acc": 0.72814665, + "epoch": 1.531219285086131, + "grad_norm": 6.34375, + "learning_rate": 1.428266389018982e-06, + "loss": 1.081388, + "memory(GiB)": 302.58, + "step": 273800, + "train_speed(iter/s)": 0.123742 + }, + { + "acc": 0.75959659, + "epoch": 1.5313311345591103, + "grad_norm": 7.0625, + "learning_rate": 1.4276193490216689e-06, + "loss": 0.93263359, + "memory(GiB)": 302.58, + "step": 273820, + "train_speed(iter/s)": 0.123746 + }, + { + "acc": 0.75365229, + "epoch": 1.5314429840320896, + "grad_norm": 6.875, + "learning_rate": 1.4269724312111021e-06, + "loss": 0.95183439, + "memory(GiB)": 302.58, + "step": 273840, + "train_speed(iter/s)": 0.12375 + }, + { + "acc": 0.77687364, + "epoch": 1.5315548335050688, + "grad_norm": 7.5625, + "learning_rate": 1.4263256356094073e-06, + "loss": 0.85904636, + "memory(GiB)": 302.58, + "step": 273860, + "train_speed(iter/s)": 0.123754 + }, + { + "acc": 0.76126065, + "epoch": 1.531666682978048, + "grad_norm": 9.75, + "learning_rate": 1.4256789622387086e-06, + "loss": 0.96570663, + "memory(GiB)": 302.58, + "step": 273880, + "train_speed(iter/s)": 0.123758 + }, + { + "acc": 0.74206553, + "epoch": 1.5317785324510274, + "grad_norm": 7.75, + "learning_rate": 1.4250324111211222e-06, + "loss": 1.00493364, + "memory(GiB)": 302.58, + "step": 273900, + "train_speed(iter/s)": 0.123763 + }, + { + "acc": 0.7608387, + "epoch": 1.5318903819240066, + "grad_norm": 9.0625, + "learning_rate": 1.4243859822787637e-06, + "loss": 0.95666723, + "memory(GiB)": 302.58, + "step": 273920, + "train_speed(iter/s)": 0.123767 + }, + { + "acc": 0.74666944, + "epoch": 1.5320022313969859, + "grad_norm": 7.0625, + "learning_rate": 1.4237396757337423e-06, + "loss": 1.0115468, + "memory(GiB)": 302.58, + "step": 273940, + "train_speed(iter/s)": 0.123771 + }, + { + "acc": 0.76761651, + "epoch": 1.5321140808699654, + "grad_norm": 5.125, + "learning_rate": 1.4230934915081645e-06, + "loss": 0.90467024, + "memory(GiB)": 302.58, + "step": 273960, + "train_speed(iter/s)": 0.123776 + }, + { + "acc": 0.73773952, + "epoch": 1.5322259303429444, + "grad_norm": 5.84375, + "learning_rate": 1.4224474296241315e-06, + "loss": 1.04116507, + "memory(GiB)": 302.58, + "step": 273980, + "train_speed(iter/s)": 0.12378 + }, + { + "acc": 0.7545043, + "epoch": 1.532337779815924, + "grad_norm": 9.0625, + "learning_rate": 1.4218014901037397e-06, + "loss": 0.96777086, + "memory(GiB)": 302.58, + "step": 274000, + "train_speed(iter/s)": 0.123784 + }, + { + "epoch": 1.532337779815924, + "eval_acc": 0.7068875695097011, + "eval_loss": 1.0118963718414307, + "eval_runtime": 7540.7773, + "eval_samples_per_second": 9.983, + "eval_steps_per_second": 9.983, + "step": 274000 + }, + { + "acc": 0.75316978, + "epoch": 1.532449629288903, + "grad_norm": 5.125, + "learning_rate": 1.4211556729690823e-06, + "loss": 0.95966215, + "memory(GiB)": 302.58, + "step": 274020, + "train_speed(iter/s)": 0.123361 + }, + { + "acc": 0.75709014, + "epoch": 1.5325614787618824, + "grad_norm": 8.875, + "learning_rate": 1.42050997824225e-06, + "loss": 0.94943876, + "memory(GiB)": 302.58, + "step": 274040, + "train_speed(iter/s)": 0.123365 + }, + { + "acc": 0.76762848, + "epoch": 1.5326733282348615, + "grad_norm": 7.4375, + "learning_rate": 1.419864405945327e-06, + "loss": 0.90255136, + "memory(GiB)": 302.58, + "step": 274060, + "train_speed(iter/s)": 0.123369 + }, + { + "acc": 0.74999523, + "epoch": 1.532785177707841, + "grad_norm": 6.75, + "learning_rate": 1.4192189561003938e-06, + "loss": 0.98371258, + "memory(GiB)": 302.58, + "step": 274080, + "train_speed(iter/s)": 0.123373 + }, + { + "acc": 0.75061288, + "epoch": 1.53289702718082, + "grad_norm": 8.3125, + "learning_rate": 1.4185736287295265e-06, + "loss": 0.98913956, + "memory(GiB)": 302.58, + "step": 274100, + "train_speed(iter/s)": 0.123378 + }, + { + "acc": 0.76433349, + "epoch": 1.5330088766537995, + "grad_norm": 7.96875, + "learning_rate": 1.4179284238547974e-06, + "loss": 0.92790403, + "memory(GiB)": 302.58, + "step": 274120, + "train_speed(iter/s)": 0.123382 + }, + { + "acc": 0.75438461, + "epoch": 1.5331207261267785, + "grad_norm": 8.8125, + "learning_rate": 1.4172833414982752e-06, + "loss": 0.98705263, + "memory(GiB)": 302.58, + "step": 274140, + "train_speed(iter/s)": 0.123386 + }, + { + "acc": 0.74892216, + "epoch": 1.533232575599758, + "grad_norm": 8.375, + "learning_rate": 1.4166383816820224e-06, + "loss": 0.98338833, + "memory(GiB)": 302.58, + "step": 274160, + "train_speed(iter/s)": 0.12339 + }, + { + "acc": 0.77399445, + "epoch": 1.533344425072737, + "grad_norm": 8.875, + "learning_rate": 1.4159935444280997e-06, + "loss": 0.86650953, + "memory(GiB)": 302.58, + "step": 274180, + "train_speed(iter/s)": 0.123394 + }, + { + "acc": 0.74950881, + "epoch": 1.5334562745457165, + "grad_norm": 8.4375, + "learning_rate": 1.4153488297585627e-06, + "loss": 1.00358267, + "memory(GiB)": 302.58, + "step": 274200, + "train_speed(iter/s)": 0.123398 + }, + { + "acc": 0.75902524, + "epoch": 1.5335681240186956, + "grad_norm": 6.5625, + "learning_rate": 1.4147042376954623e-06, + "loss": 0.94501801, + "memory(GiB)": 302.58, + "step": 274220, + "train_speed(iter/s)": 0.123402 + }, + { + "acc": 0.74394526, + "epoch": 1.533679973491675, + "grad_norm": 6.71875, + "learning_rate": 1.4140597682608454e-06, + "loss": 1.00878096, + "memory(GiB)": 302.58, + "step": 274240, + "train_speed(iter/s)": 0.123407 + }, + { + "acc": 0.75607462, + "epoch": 1.533791822964654, + "grad_norm": 8.0625, + "learning_rate": 1.4134154214767548e-06, + "loss": 0.93858604, + "memory(GiB)": 302.58, + "step": 274260, + "train_speed(iter/s)": 0.123411 + }, + { + "acc": 0.75206752, + "epoch": 1.5339036724376336, + "grad_norm": 6.71875, + "learning_rate": 1.4127711973652286e-06, + "loss": 0.96643677, + "memory(GiB)": 302.58, + "step": 274280, + "train_speed(iter/s)": 0.123416 + }, + { + "acc": 0.74209385, + "epoch": 1.5340155219106126, + "grad_norm": 7.0625, + "learning_rate": 1.4121270959483035e-06, + "loss": 1.01553192, + "memory(GiB)": 302.58, + "step": 274300, + "train_speed(iter/s)": 0.12342 + }, + { + "acc": 0.74619489, + "epoch": 1.5341273713835921, + "grad_norm": 6.03125, + "learning_rate": 1.4114831172480087e-06, + "loss": 1.00690279, + "memory(GiB)": 302.58, + "step": 274320, + "train_speed(iter/s)": 0.123424 + }, + { + "acc": 0.73285928, + "epoch": 1.5342392208565712, + "grad_norm": 10.5625, + "learning_rate": 1.4108392612863703e-06, + "loss": 1.07297049, + "memory(GiB)": 302.58, + "step": 274340, + "train_speed(iter/s)": 0.123428 + }, + { + "acc": 0.73508105, + "epoch": 1.5343510703295506, + "grad_norm": 6.375, + "learning_rate": 1.4101955280854097e-06, + "loss": 1.0389451, + "memory(GiB)": 302.58, + "step": 274360, + "train_speed(iter/s)": 0.123432 + }, + { + "acc": 0.74283299, + "epoch": 1.5344629198025297, + "grad_norm": 7.25, + "learning_rate": 1.4095519176671452e-06, + "loss": 1.02675915, + "memory(GiB)": 302.58, + "step": 274380, + "train_speed(iter/s)": 0.123436 + }, + { + "acc": 0.76416149, + "epoch": 1.5345747692755092, + "grad_norm": 6.90625, + "learning_rate": 1.40890843005359e-06, + "loss": 0.93026896, + "memory(GiB)": 302.58, + "step": 274400, + "train_speed(iter/s)": 0.123441 + }, + { + "acc": 0.77023034, + "epoch": 1.5346866187484882, + "grad_norm": 9.875, + "learning_rate": 1.4082650652667535e-06, + "loss": 0.9004262, + "memory(GiB)": 302.58, + "step": 274420, + "train_speed(iter/s)": 0.123445 + }, + { + "acc": 0.75324664, + "epoch": 1.5347984682214677, + "grad_norm": 6.65625, + "learning_rate": 1.4076218233286409e-06, + "loss": 0.98985252, + "memory(GiB)": 302.58, + "step": 274440, + "train_speed(iter/s)": 0.123449 + }, + { + "acc": 0.75129337, + "epoch": 1.5349103176944467, + "grad_norm": 8.75, + "learning_rate": 1.4069787042612532e-06, + "loss": 0.99194469, + "memory(GiB)": 302.58, + "step": 274460, + "train_speed(iter/s)": 0.123453 + }, + { + "acc": 0.76173863, + "epoch": 1.5350221671674262, + "grad_norm": 8.875, + "learning_rate": 1.406335708086587e-06, + "loss": 0.93707275, + "memory(GiB)": 302.58, + "step": 274480, + "train_speed(iter/s)": 0.123458 + }, + { + "acc": 0.74422731, + "epoch": 1.5351340166404053, + "grad_norm": 7.34375, + "learning_rate": 1.4056928348266347e-06, + "loss": 0.998598, + "memory(GiB)": 302.58, + "step": 274500, + "train_speed(iter/s)": 0.123462 + }, + { + "acc": 0.75097337, + "epoch": 1.5352458661133848, + "grad_norm": 4.65625, + "learning_rate": 1.4050500845033848e-06, + "loss": 0.98630829, + "memory(GiB)": 302.58, + "step": 274520, + "train_speed(iter/s)": 0.123466 + }, + { + "acc": 0.73995132, + "epoch": 1.5353577155863638, + "grad_norm": 8.8125, + "learning_rate": 1.4044074571388212e-06, + "loss": 1.03993616, + "memory(GiB)": 302.58, + "step": 274540, + "train_speed(iter/s)": 0.12347 + }, + { + "acc": 0.73335414, + "epoch": 1.5354695650593433, + "grad_norm": 6.125, + "learning_rate": 1.4037649527549225e-06, + "loss": 1.05113068, + "memory(GiB)": 302.58, + "step": 274560, + "train_speed(iter/s)": 0.123474 + }, + { + "acc": 0.7600358, + "epoch": 1.5355814145323223, + "grad_norm": 8.125, + "learning_rate": 1.4031225713736669e-06, + "loss": 0.93868389, + "memory(GiB)": 302.58, + "step": 274580, + "train_speed(iter/s)": 0.123479 + }, + { + "acc": 0.76373439, + "epoch": 1.5356932640053018, + "grad_norm": 6.40625, + "learning_rate": 1.4024803130170251e-06, + "loss": 0.91106997, + "memory(GiB)": 302.58, + "step": 274600, + "train_speed(iter/s)": 0.123483 + }, + { + "acc": 0.73740678, + "epoch": 1.5358051134782809, + "grad_norm": 8.1875, + "learning_rate": 1.401838177706964e-06, + "loss": 1.03881865, + "memory(GiB)": 302.58, + "step": 274620, + "train_speed(iter/s)": 0.123488 + }, + { + "acc": 0.7625864, + "epoch": 1.5359169629512603, + "grad_norm": 7.78125, + "learning_rate": 1.4011961654654448e-06, + "loss": 0.92250614, + "memory(GiB)": 302.58, + "step": 274640, + "train_speed(iter/s)": 0.123492 + }, + { + "acc": 0.76062207, + "epoch": 1.5360288124242394, + "grad_norm": 5.96875, + "learning_rate": 1.4005542763144302e-06, + "loss": 0.93181267, + "memory(GiB)": 302.58, + "step": 274660, + "train_speed(iter/s)": 0.123496 + }, + { + "acc": 0.75644464, + "epoch": 1.5361406618972189, + "grad_norm": 5.40625, + "learning_rate": 1.3999125102758727e-06, + "loss": 0.96607141, + "memory(GiB)": 302.58, + "step": 274680, + "train_speed(iter/s)": 0.123501 + }, + { + "acc": 0.74913259, + "epoch": 1.536252511370198, + "grad_norm": 6.6875, + "learning_rate": 1.3992708673717231e-06, + "loss": 0.96394815, + "memory(GiB)": 302.58, + "step": 274700, + "train_speed(iter/s)": 0.123505 + }, + { + "acc": 0.73967147, + "epoch": 1.5363643608431774, + "grad_norm": 7.15625, + "learning_rate": 1.398629347623927e-06, + "loss": 1.01175261, + "memory(GiB)": 302.58, + "step": 274720, + "train_speed(iter/s)": 0.123509 + }, + { + "acc": 0.74082389, + "epoch": 1.5364762103161564, + "grad_norm": 10.25, + "learning_rate": 1.397987951054427e-06, + "loss": 1.01874952, + "memory(GiB)": 302.58, + "step": 274740, + "train_speed(iter/s)": 0.123513 + }, + { + "acc": 0.76475601, + "epoch": 1.536588059789136, + "grad_norm": 7.46875, + "learning_rate": 1.3973466776851613e-06, + "loss": 0.90683069, + "memory(GiB)": 302.58, + "step": 274760, + "train_speed(iter/s)": 0.123517 + }, + { + "acc": 0.76602387, + "epoch": 1.536699909262115, + "grad_norm": 8.8125, + "learning_rate": 1.396705527538062e-06, + "loss": 0.93273983, + "memory(GiB)": 302.58, + "step": 274780, + "train_speed(iter/s)": 0.123521 + }, + { + "acc": 0.75061951, + "epoch": 1.5368117587350945, + "grad_norm": 6.34375, + "learning_rate": 1.3960645006350598e-06, + "loss": 0.98305044, + "memory(GiB)": 302.58, + "step": 274800, + "train_speed(iter/s)": 0.123525 + }, + { + "acc": 0.73286591, + "epoch": 1.5369236082080735, + "grad_norm": 4.71875, + "learning_rate": 1.3954235969980802e-06, + "loss": 1.06034956, + "memory(GiB)": 302.58, + "step": 274820, + "train_speed(iter/s)": 0.12353 + }, + { + "acc": 0.76101122, + "epoch": 1.537035457681053, + "grad_norm": 6.65625, + "learning_rate": 1.3947828166490423e-06, + "loss": 0.93797245, + "memory(GiB)": 302.58, + "step": 274840, + "train_speed(iter/s)": 0.123534 + }, + { + "acc": 0.77099886, + "epoch": 1.537147307154032, + "grad_norm": 7.9375, + "learning_rate": 1.3941421596098642e-06, + "loss": 0.86029224, + "memory(GiB)": 302.58, + "step": 274860, + "train_speed(iter/s)": 0.123538 + }, + { + "acc": 0.75748529, + "epoch": 1.5372591566270115, + "grad_norm": 6.78125, + "learning_rate": 1.3935016259024587e-06, + "loss": 0.94622183, + "memory(GiB)": 302.58, + "step": 274880, + "train_speed(iter/s)": 0.123542 + }, + { + "acc": 0.76122489, + "epoch": 1.5373710060999906, + "grad_norm": 9.9375, + "learning_rate": 1.3928612155487309e-06, + "loss": 0.95868969, + "memory(GiB)": 302.58, + "step": 274900, + "train_speed(iter/s)": 0.123546 + }, + { + "acc": 0.72824688, + "epoch": 1.53748285557297, + "grad_norm": 7.0, + "learning_rate": 1.39222092857059e-06, + "loss": 1.0726181, + "memory(GiB)": 302.58, + "step": 274920, + "train_speed(iter/s)": 0.12355 + }, + { + "acc": 0.75140362, + "epoch": 1.537594705045949, + "grad_norm": 8.0625, + "learning_rate": 1.3915807649899327e-06, + "loss": 0.98348637, + "memory(GiB)": 302.58, + "step": 274940, + "train_speed(iter/s)": 0.123554 + }, + { + "acc": 0.75168562, + "epoch": 1.5377065545189286, + "grad_norm": 7.59375, + "learning_rate": 1.3909407248286555e-06, + "loss": 0.97942085, + "memory(GiB)": 302.58, + "step": 274960, + "train_speed(iter/s)": 0.123559 + }, + { + "acc": 0.75616713, + "epoch": 1.5378184039919076, + "grad_norm": 7.8125, + "learning_rate": 1.3903008081086494e-06, + "loss": 0.9611701, + "memory(GiB)": 302.58, + "step": 274980, + "train_speed(iter/s)": 0.123563 + }, + { + "acc": 0.75374966, + "epoch": 1.537930253464887, + "grad_norm": 6.59375, + "learning_rate": 1.3896610148518014e-06, + "loss": 0.94869633, + "memory(GiB)": 302.58, + "step": 275000, + "train_speed(iter/s)": 0.123567 + }, + { + "acc": 0.75037498, + "epoch": 1.5380421029378661, + "grad_norm": 5.4375, + "learning_rate": 1.3890213450799955e-06, + "loss": 0.99944477, + "memory(GiB)": 302.58, + "step": 275020, + "train_speed(iter/s)": 0.123572 + }, + { + "acc": 0.75601068, + "epoch": 1.5381539524108456, + "grad_norm": 5.0625, + "learning_rate": 1.3883817988151093e-06, + "loss": 0.97312603, + "memory(GiB)": 302.58, + "step": 275040, + "train_speed(iter/s)": 0.123576 + }, + { + "acc": 0.75926161, + "epoch": 1.5382658018838247, + "grad_norm": 9.25, + "learning_rate": 1.3877423760790176e-06, + "loss": 0.94779291, + "memory(GiB)": 302.58, + "step": 275060, + "train_speed(iter/s)": 0.12358 + }, + { + "acc": 0.75495167, + "epoch": 1.5383776513568042, + "grad_norm": 4.875, + "learning_rate": 1.3871030768935911e-06, + "loss": 0.95822544, + "memory(GiB)": 302.58, + "step": 275080, + "train_speed(iter/s)": 0.123585 + }, + { + "acc": 0.74689798, + "epoch": 1.5384895008297832, + "grad_norm": 9.0, + "learning_rate": 1.3864639012806953e-06, + "loss": 1.00566263, + "memory(GiB)": 302.58, + "step": 275100, + "train_speed(iter/s)": 0.123589 + }, + { + "acc": 0.7569757, + "epoch": 1.5386013503027627, + "grad_norm": 9.75, + "learning_rate": 1.3858248492621922e-06, + "loss": 0.95357447, + "memory(GiB)": 302.58, + "step": 275120, + "train_speed(iter/s)": 0.123593 + }, + { + "acc": 0.75551968, + "epoch": 1.5387131997757417, + "grad_norm": 5.15625, + "learning_rate": 1.3851859208599395e-06, + "loss": 0.95216904, + "memory(GiB)": 302.58, + "step": 275140, + "train_speed(iter/s)": 0.123597 + }, + { + "acc": 0.7593833, + "epoch": 1.5388250492487212, + "grad_norm": 8.0625, + "learning_rate": 1.3845471160957891e-06, + "loss": 0.93488903, + "memory(GiB)": 302.58, + "step": 275160, + "train_speed(iter/s)": 0.123601 + }, + { + "acc": 0.7529716, + "epoch": 1.5389368987217003, + "grad_norm": 8.875, + "learning_rate": 1.3839084349915932e-06, + "loss": 0.96470928, + "memory(GiB)": 302.58, + "step": 275180, + "train_speed(iter/s)": 0.123605 + }, + { + "acc": 0.74922061, + "epoch": 1.5390487481946797, + "grad_norm": 7.84375, + "learning_rate": 1.3832698775691956e-06, + "loss": 0.97979689, + "memory(GiB)": 302.58, + "step": 275200, + "train_speed(iter/s)": 0.123609 + }, + { + "acc": 0.75010657, + "epoch": 1.5391605976676588, + "grad_norm": 7.59375, + "learning_rate": 1.3826314438504356e-06, + "loss": 0.96689129, + "memory(GiB)": 302.58, + "step": 275220, + "train_speed(iter/s)": 0.123613 + }, + { + "acc": 0.74957719, + "epoch": 1.5392724471406383, + "grad_norm": 8.25, + "learning_rate": 1.3819931338571512e-06, + "loss": 0.97739344, + "memory(GiB)": 302.58, + "step": 275240, + "train_speed(iter/s)": 0.123617 + }, + { + "acc": 0.76307554, + "epoch": 1.5393842966136173, + "grad_norm": 4.84375, + "learning_rate": 1.3813549476111743e-06, + "loss": 0.9233448, + "memory(GiB)": 302.58, + "step": 275260, + "train_speed(iter/s)": 0.123621 + }, + { + "acc": 0.7545579, + "epoch": 1.5394961460865968, + "grad_norm": 8.75, + "learning_rate": 1.3807168851343321e-06, + "loss": 0.96248074, + "memory(GiB)": 302.58, + "step": 275280, + "train_speed(iter/s)": 0.123625 + }, + { + "acc": 0.75760846, + "epoch": 1.5396079955595758, + "grad_norm": 12.875, + "learning_rate": 1.3800789464484487e-06, + "loss": 0.95982885, + "memory(GiB)": 302.58, + "step": 275300, + "train_speed(iter/s)": 0.123629 + }, + { + "acc": 0.74717579, + "epoch": 1.5397198450325553, + "grad_norm": 7.5625, + "learning_rate": 1.3794411315753443e-06, + "loss": 0.98343763, + "memory(GiB)": 302.58, + "step": 275320, + "train_speed(iter/s)": 0.123634 + }, + { + "acc": 0.75255795, + "epoch": 1.5398316945055344, + "grad_norm": 7.46875, + "learning_rate": 1.3788034405368334e-06, + "loss": 0.97249489, + "memory(GiB)": 302.58, + "step": 275340, + "train_speed(iter/s)": 0.123638 + }, + { + "acc": 0.73418698, + "epoch": 1.5399435439785139, + "grad_norm": 6.03125, + "learning_rate": 1.3781658733547276e-06, + "loss": 1.04732323, + "memory(GiB)": 302.58, + "step": 275360, + "train_speed(iter/s)": 0.123642 + }, + { + "acc": 0.75813851, + "epoch": 1.540055393451493, + "grad_norm": 11.25, + "learning_rate": 1.377528430050833e-06, + "loss": 0.94797773, + "memory(GiB)": 302.58, + "step": 275380, + "train_speed(iter/s)": 0.123647 + }, + { + "acc": 0.7493237, + "epoch": 1.5401672429244724, + "grad_norm": 6.4375, + "learning_rate": 1.376891110646953e-06, + "loss": 0.99288549, + "memory(GiB)": 302.58, + "step": 275400, + "train_speed(iter/s)": 0.123651 + }, + { + "acc": 0.74741464, + "epoch": 1.5402790923974514, + "grad_norm": 5.9375, + "learning_rate": 1.3762539151648852e-06, + "loss": 1.00713654, + "memory(GiB)": 302.58, + "step": 275420, + "train_speed(iter/s)": 0.123655 + }, + { + "acc": 0.7541769, + "epoch": 1.540390941870431, + "grad_norm": 8.125, + "learning_rate": 1.3756168436264228e-06, + "loss": 0.96072493, + "memory(GiB)": 302.58, + "step": 275440, + "train_speed(iter/s)": 0.123659 + }, + { + "acc": 0.74300456, + "epoch": 1.54050279134341, + "grad_norm": 6.75, + "learning_rate": 1.374979896053359e-06, + "loss": 1.00296068, + "memory(GiB)": 302.58, + "step": 275460, + "train_speed(iter/s)": 0.123663 + }, + { + "acc": 0.75757847, + "epoch": 1.5406146408163894, + "grad_norm": 7.625, + "learning_rate": 1.3743430724674767e-06, + "loss": 0.92622538, + "memory(GiB)": 302.58, + "step": 275480, + "train_speed(iter/s)": 0.123667 + }, + { + "acc": 0.74915981, + "epoch": 1.5407264902893685, + "grad_norm": 10.5, + "learning_rate": 1.373706372890558e-06, + "loss": 0.99519882, + "memory(GiB)": 302.58, + "step": 275500, + "train_speed(iter/s)": 0.123671 + }, + { + "acc": 0.7561481, + "epoch": 1.540838339762348, + "grad_norm": 8.875, + "learning_rate": 1.3730697973443801e-06, + "loss": 0.96974669, + "memory(GiB)": 302.58, + "step": 275520, + "train_speed(iter/s)": 0.123675 + }, + { + "acc": 0.75486407, + "epoch": 1.540950189235327, + "grad_norm": 5.8125, + "learning_rate": 1.372433345850716e-06, + "loss": 0.95994081, + "memory(GiB)": 302.58, + "step": 275540, + "train_speed(iter/s)": 0.123679 + }, + { + "acc": 0.73592105, + "epoch": 1.5410620387083065, + "grad_norm": 5.0625, + "learning_rate": 1.3717970184313339e-06, + "loss": 1.04324961, + "memory(GiB)": 302.58, + "step": 275560, + "train_speed(iter/s)": 0.123683 + }, + { + "acc": 0.75814047, + "epoch": 1.5411738881812855, + "grad_norm": 5.6875, + "learning_rate": 1.3711608151079985e-06, + "loss": 0.95723238, + "memory(GiB)": 302.58, + "step": 275580, + "train_speed(iter/s)": 0.123687 + }, + { + "acc": 0.7574327, + "epoch": 1.541285737654265, + "grad_norm": 9.25, + "learning_rate": 1.3705247359024703e-06, + "loss": 0.9516201, + "memory(GiB)": 302.58, + "step": 275600, + "train_speed(iter/s)": 0.123691 + }, + { + "acc": 0.75424547, + "epoch": 1.541397587127244, + "grad_norm": 7.84375, + "learning_rate": 1.3698887808365047e-06, + "loss": 0.96029596, + "memory(GiB)": 302.58, + "step": 275620, + "train_speed(iter/s)": 0.123695 + }, + { + "acc": 0.75632877, + "epoch": 1.5415094366002235, + "grad_norm": 9.0625, + "learning_rate": 1.3692529499318536e-06, + "loss": 0.96523943, + "memory(GiB)": 302.58, + "step": 275640, + "train_speed(iter/s)": 0.1237 + }, + { + "acc": 0.75111418, + "epoch": 1.5416212860732026, + "grad_norm": 8.3125, + "learning_rate": 1.368617243210264e-06, + "loss": 0.99116278, + "memory(GiB)": 302.58, + "step": 275660, + "train_speed(iter/s)": 0.123704 + }, + { + "acc": 0.7549509, + "epoch": 1.541733135546182, + "grad_norm": 5.96875, + "learning_rate": 1.3679816606934803e-06, + "loss": 0.9584095, + "memory(GiB)": 302.58, + "step": 275680, + "train_speed(iter/s)": 0.123708 + }, + { + "acc": 0.75101504, + "epoch": 1.5418449850191611, + "grad_norm": 4.96875, + "learning_rate": 1.3673462024032403e-06, + "loss": 0.98091526, + "memory(GiB)": 302.58, + "step": 275700, + "train_speed(iter/s)": 0.123713 + }, + { + "acc": 0.7721993, + "epoch": 1.5419568344921406, + "grad_norm": 6.6875, + "learning_rate": 1.3667108683612767e-06, + "loss": 0.89205866, + "memory(GiB)": 302.58, + "step": 275720, + "train_speed(iter/s)": 0.123717 + }, + { + "acc": 0.7470602, + "epoch": 1.5420686839651196, + "grad_norm": 7.28125, + "learning_rate": 1.3660756585893248e-06, + "loss": 0.98575249, + "memory(GiB)": 302.58, + "step": 275740, + "train_speed(iter/s)": 0.123721 + }, + { + "acc": 0.73925977, + "epoch": 1.5421805334380991, + "grad_norm": 7.625, + "learning_rate": 1.365440573109108e-06, + "loss": 1.02892685, + "memory(GiB)": 302.58, + "step": 275760, + "train_speed(iter/s)": 0.123726 + }, + { + "acc": 0.73771415, + "epoch": 1.5422923829110782, + "grad_norm": 6.0625, + "learning_rate": 1.364805611942348e-06, + "loss": 1.01488609, + "memory(GiB)": 302.58, + "step": 275780, + "train_speed(iter/s)": 0.12373 + }, + { + "acc": 0.76263027, + "epoch": 1.5424042323840577, + "grad_norm": 6.28125, + "learning_rate": 1.3641707751107635e-06, + "loss": 0.92331152, + "memory(GiB)": 302.58, + "step": 275800, + "train_speed(iter/s)": 0.123735 + }, + { + "acc": 0.74432874, + "epoch": 1.5425160818570367, + "grad_norm": 6.28125, + "learning_rate": 1.3635360626360672e-06, + "loss": 1.02591505, + "memory(GiB)": 302.58, + "step": 275820, + "train_speed(iter/s)": 0.123739 + }, + { + "acc": 0.726057, + "epoch": 1.5426279313300162, + "grad_norm": 10.5, + "learning_rate": 1.3629014745399682e-06, + "loss": 1.11614485, + "memory(GiB)": 302.58, + "step": 275840, + "train_speed(iter/s)": 0.123743 + }, + { + "acc": 0.7555768, + "epoch": 1.5427397808029952, + "grad_norm": 13.8125, + "learning_rate": 1.3622670108441721e-06, + "loss": 0.95472403, + "memory(GiB)": 302.58, + "step": 275860, + "train_speed(iter/s)": 0.123747 + }, + { + "acc": 0.75278287, + "epoch": 1.5428516302759747, + "grad_norm": 8.0, + "learning_rate": 1.3616326715703793e-06, + "loss": 0.97403708, + "memory(GiB)": 302.58, + "step": 275880, + "train_speed(iter/s)": 0.123751 + }, + { + "acc": 0.74254613, + "epoch": 1.5429634797489538, + "grad_norm": 5.0, + "learning_rate": 1.360998456740284e-06, + "loss": 0.99633369, + "memory(GiB)": 302.58, + "step": 275900, + "train_speed(iter/s)": 0.123755 + }, + { + "acc": 0.75040541, + "epoch": 1.5430753292219332, + "grad_norm": 8.5625, + "learning_rate": 1.3603643663755823e-06, + "loss": 0.96909342, + "memory(GiB)": 302.58, + "step": 275920, + "train_speed(iter/s)": 0.12376 + }, + { + "acc": 0.74058166, + "epoch": 1.5431871786949123, + "grad_norm": 8.4375, + "learning_rate": 1.3597304004979601e-06, + "loss": 0.9888608, + "memory(GiB)": 302.58, + "step": 275940, + "train_speed(iter/s)": 0.123764 + }, + { + "acc": 0.74024105, + "epoch": 1.5432990281678918, + "grad_norm": 6.28125, + "learning_rate": 1.3590965591291011e-06, + "loss": 1.02019129, + "memory(GiB)": 302.58, + "step": 275960, + "train_speed(iter/s)": 0.123768 + }, + { + "acc": 0.76509871, + "epoch": 1.5434108776408708, + "grad_norm": 7.96875, + "learning_rate": 1.3584628422906853e-06, + "loss": 0.90726004, + "memory(GiB)": 302.58, + "step": 275980, + "train_speed(iter/s)": 0.123772 + }, + { + "acc": 0.74947925, + "epoch": 1.5435227271138503, + "grad_norm": 8.0625, + "learning_rate": 1.3578292500043865e-06, + "loss": 0.98554573, + "memory(GiB)": 302.58, + "step": 276000, + "train_speed(iter/s)": 0.123776 + }, + { + "epoch": 1.5435227271138503, + "eval_acc": 0.7069070910370654, + "eval_loss": 1.0118564367294312, + "eval_runtime": 7523.8171, + "eval_samples_per_second": 10.006, + "eval_steps_per_second": 10.006, + "step": 276000 + }, + { + "acc": 0.76435571, + "epoch": 1.5436345765868293, + "grad_norm": 6.21875, + "learning_rate": 1.3571957822918768e-06, + "loss": 0.91733084, + "memory(GiB)": 302.58, + "step": 276020, + "train_speed(iter/s)": 0.123357 + }, + { + "acc": 0.76098962, + "epoch": 1.5437464260598088, + "grad_norm": 8.25, + "learning_rate": 1.3565624391748211e-06, + "loss": 0.94950409, + "memory(GiB)": 302.58, + "step": 276040, + "train_speed(iter/s)": 0.123361 + }, + { + "acc": 0.74820809, + "epoch": 1.5438582755327879, + "grad_norm": 7.15625, + "learning_rate": 1.3559292206748842e-06, + "loss": 0.99921207, + "memory(GiB)": 302.58, + "step": 276060, + "train_speed(iter/s)": 0.123365 + }, + { + "acc": 0.73316298, + "epoch": 1.5439701250057674, + "grad_norm": 6.4375, + "learning_rate": 1.3552961268137233e-06, + "loss": 1.06957998, + "memory(GiB)": 302.58, + "step": 276080, + "train_speed(iter/s)": 0.123369 + }, + { + "acc": 0.74706821, + "epoch": 1.5440819744787464, + "grad_norm": 8.3125, + "learning_rate": 1.3546631576129916e-06, + "loss": 0.99105043, + "memory(GiB)": 302.58, + "step": 276100, + "train_speed(iter/s)": 0.123373 + }, + { + "acc": 0.73938618, + "epoch": 1.5441938239517259, + "grad_norm": 11.25, + "learning_rate": 1.3540303130943388e-06, + "loss": 1.02996979, + "memory(GiB)": 302.58, + "step": 276120, + "train_speed(iter/s)": 0.123378 + }, + { + "acc": 0.74114852, + "epoch": 1.544305673424705, + "grad_norm": 5.875, + "learning_rate": 1.3533975932794109e-06, + "loss": 1.00537281, + "memory(GiB)": 302.58, + "step": 276140, + "train_speed(iter/s)": 0.123382 + }, + { + "acc": 0.7681006, + "epoch": 1.5444175228976844, + "grad_norm": 6.5, + "learning_rate": 1.3527649981898477e-06, + "loss": 0.92689104, + "memory(GiB)": 302.58, + "step": 276160, + "train_speed(iter/s)": 0.123386 + }, + { + "acc": 0.76341047, + "epoch": 1.5445293723706635, + "grad_norm": 11.1875, + "learning_rate": 1.3521325278472874e-06, + "loss": 0.93318262, + "memory(GiB)": 302.58, + "step": 276180, + "train_speed(iter/s)": 0.12339 + }, + { + "acc": 0.74548841, + "epoch": 1.544641221843643, + "grad_norm": 9.9375, + "learning_rate": 1.351500182273361e-06, + "loss": 0.99075117, + "memory(GiB)": 302.58, + "step": 276200, + "train_speed(iter/s)": 0.123394 + }, + { + "acc": 0.74809809, + "epoch": 1.544753071316622, + "grad_norm": 5.4375, + "learning_rate": 1.3508679614896981e-06, + "loss": 0.98744917, + "memory(GiB)": 302.58, + "step": 276220, + "train_speed(iter/s)": 0.123398 + }, + { + "acc": 0.75029173, + "epoch": 1.5448649207896015, + "grad_norm": 6.9375, + "learning_rate": 1.3502358655179215e-06, + "loss": 0.96535101, + "memory(GiB)": 302.58, + "step": 276240, + "train_speed(iter/s)": 0.123402 + }, + { + "acc": 0.72539878, + "epoch": 1.5449767702625805, + "grad_norm": 7.09375, + "learning_rate": 1.3496038943796513e-06, + "loss": 1.0944809, + "memory(GiB)": 302.58, + "step": 276260, + "train_speed(iter/s)": 0.123406 + }, + { + "acc": 0.74394269, + "epoch": 1.54508861973556, + "grad_norm": 9.1875, + "learning_rate": 1.3489720480965035e-06, + "loss": 1.01075525, + "memory(GiB)": 302.58, + "step": 276280, + "train_speed(iter/s)": 0.123411 + }, + { + "acc": 0.74682536, + "epoch": 1.545200469208539, + "grad_norm": 7.5625, + "learning_rate": 1.3483403266900886e-06, + "loss": 1.01366072, + "memory(GiB)": 302.58, + "step": 276300, + "train_speed(iter/s)": 0.123415 + }, + { + "acc": 0.75176549, + "epoch": 1.5453123186815185, + "grad_norm": 5.5, + "learning_rate": 1.3477087301820119e-06, + "loss": 0.96845703, + "memory(GiB)": 302.58, + "step": 276320, + "train_speed(iter/s)": 0.123419 + }, + { + "acc": 0.76455703, + "epoch": 1.5454241681544976, + "grad_norm": 7.75, + "learning_rate": 1.347077258593879e-06, + "loss": 0.93735132, + "memory(GiB)": 302.58, + "step": 276340, + "train_speed(iter/s)": 0.123423 + }, + { + "acc": 0.74836183, + "epoch": 1.545536017627477, + "grad_norm": 7.96875, + "learning_rate": 1.3464459119472873e-06, + "loss": 1.00234003, + "memory(GiB)": 302.58, + "step": 276360, + "train_speed(iter/s)": 0.123427 + }, + { + "acc": 0.73525977, + "epoch": 1.545647867100456, + "grad_norm": 7.34375, + "learning_rate": 1.3458146902638309e-06, + "loss": 1.02118759, + "memory(GiB)": 302.58, + "step": 276380, + "train_speed(iter/s)": 0.123432 + }, + { + "acc": 0.76907377, + "epoch": 1.5457597165734356, + "grad_norm": 6.65625, + "learning_rate": 1.3451835935650986e-06, + "loss": 0.90825415, + "memory(GiB)": 302.58, + "step": 276400, + "train_speed(iter/s)": 0.123436 + }, + { + "acc": 0.78150601, + "epoch": 1.5458715660464146, + "grad_norm": 5.5, + "learning_rate": 1.3445526218726769e-06, + "loss": 0.84559145, + "memory(GiB)": 302.58, + "step": 276420, + "train_speed(iter/s)": 0.12344 + }, + { + "acc": 0.77386951, + "epoch": 1.545983415519394, + "grad_norm": 8.0, + "learning_rate": 1.3439217752081463e-06, + "loss": 0.88406115, + "memory(GiB)": 302.58, + "step": 276440, + "train_speed(iter/s)": 0.123444 + }, + { + "acc": 0.75258913, + "epoch": 1.5460952649923732, + "grad_norm": 5.0625, + "learning_rate": 1.3432910535930844e-06, + "loss": 0.95965757, + "memory(GiB)": 302.58, + "step": 276460, + "train_speed(iter/s)": 0.123448 + }, + { + "acc": 0.74656701, + "epoch": 1.5462071144653526, + "grad_norm": 4.8125, + "learning_rate": 1.3426604570490632e-06, + "loss": 0.98643637, + "memory(GiB)": 302.58, + "step": 276480, + "train_speed(iter/s)": 0.123452 + }, + { + "acc": 0.7524394, + "epoch": 1.5463189639383317, + "grad_norm": 8.125, + "learning_rate": 1.3420299855976521e-06, + "loss": 0.9885499, + "memory(GiB)": 302.58, + "step": 276500, + "train_speed(iter/s)": 0.123456 + }, + { + "acc": 0.73783894, + "epoch": 1.5464308134113112, + "grad_norm": 7.5625, + "learning_rate": 1.341399639260414e-06, + "loss": 1.04187689, + "memory(GiB)": 302.58, + "step": 276520, + "train_speed(iter/s)": 0.12346 + }, + { + "acc": 0.75911007, + "epoch": 1.5465426628842902, + "grad_norm": 8.6875, + "learning_rate": 1.3407694180589098e-06, + "loss": 0.94500628, + "memory(GiB)": 302.58, + "step": 276540, + "train_speed(iter/s)": 0.123465 + }, + { + "acc": 0.74028311, + "epoch": 1.5466545123572697, + "grad_norm": 6.3125, + "learning_rate": 1.3401393220146941e-06, + "loss": 1.01739492, + "memory(GiB)": 302.58, + "step": 276560, + "train_speed(iter/s)": 0.123469 + }, + { + "acc": 0.75564795, + "epoch": 1.5467663618302487, + "grad_norm": 6.125, + "learning_rate": 1.3395093511493173e-06, + "loss": 0.95141897, + "memory(GiB)": 302.58, + "step": 276580, + "train_speed(iter/s)": 0.123473 + }, + { + "acc": 0.76896567, + "epoch": 1.5468782113032282, + "grad_norm": 6.25, + "learning_rate": 1.3388795054843295e-06, + "loss": 0.91654434, + "memory(GiB)": 302.58, + "step": 276600, + "train_speed(iter/s)": 0.123478 + }, + { + "acc": 0.76011252, + "epoch": 1.5469900607762073, + "grad_norm": 5.15625, + "learning_rate": 1.3382497850412717e-06, + "loss": 0.9201787, + "memory(GiB)": 302.58, + "step": 276620, + "train_speed(iter/s)": 0.123482 + }, + { + "acc": 0.74452429, + "epoch": 1.5471019102491868, + "grad_norm": 4.21875, + "learning_rate": 1.3376201898416824e-06, + "loss": 1.00363464, + "memory(GiB)": 302.58, + "step": 276640, + "train_speed(iter/s)": 0.123486 + }, + { + "acc": 0.75261407, + "epoch": 1.5472137597221658, + "grad_norm": 6.5, + "learning_rate": 1.3369907199070958e-06, + "loss": 0.97280664, + "memory(GiB)": 302.58, + "step": 276660, + "train_speed(iter/s)": 0.12349 + }, + { + "acc": 0.7541245, + "epoch": 1.5473256091951453, + "grad_norm": 7.71875, + "learning_rate": 1.3363613752590415e-06, + "loss": 0.96198406, + "memory(GiB)": 302.58, + "step": 276680, + "train_speed(iter/s)": 0.123494 + }, + { + "acc": 0.75049062, + "epoch": 1.5474374586681243, + "grad_norm": 8.9375, + "learning_rate": 1.335732155919045e-06, + "loss": 0.9671277, + "memory(GiB)": 302.58, + "step": 276700, + "train_speed(iter/s)": 0.123499 + }, + { + "acc": 0.75584521, + "epoch": 1.5475493081411038, + "grad_norm": 8.5, + "learning_rate": 1.335103061908628e-06, + "loss": 0.96225986, + "memory(GiB)": 302.58, + "step": 276720, + "train_speed(iter/s)": 0.123503 + }, + { + "acc": 0.75738335, + "epoch": 1.5476611576140828, + "grad_norm": 7.53125, + "learning_rate": 1.3344740932493077e-06, + "loss": 0.94238186, + "memory(GiB)": 302.58, + "step": 276740, + "train_speed(iter/s)": 0.123507 + }, + { + "acc": 0.74636393, + "epoch": 1.5477730070870623, + "grad_norm": 8.1875, + "learning_rate": 1.333845249962596e-06, + "loss": 1.0143796, + "memory(GiB)": 302.58, + "step": 276760, + "train_speed(iter/s)": 0.123512 + }, + { + "acc": 0.73882737, + "epoch": 1.5478848565600414, + "grad_norm": 7.0, + "learning_rate": 1.3332165320700024e-06, + "loss": 1.02639732, + "memory(GiB)": 302.58, + "step": 276780, + "train_speed(iter/s)": 0.123516 + }, + { + "acc": 0.75245914, + "epoch": 1.5479967060330209, + "grad_norm": 5.875, + "learning_rate": 1.3325879395930297e-06, + "loss": 0.94262619, + "memory(GiB)": 302.58, + "step": 276800, + "train_speed(iter/s)": 0.12352 + }, + { + "acc": 0.75447416, + "epoch": 1.548108555506, + "grad_norm": 6.90625, + "learning_rate": 1.3319594725531793e-06, + "loss": 0.94997978, + "memory(GiB)": 302.58, + "step": 276820, + "train_speed(iter/s)": 0.123524 + }, + { + "acc": 0.75435629, + "epoch": 1.5482204049789794, + "grad_norm": 7.0625, + "learning_rate": 1.331331130971945e-06, + "loss": 0.97109079, + "memory(GiB)": 302.58, + "step": 276840, + "train_speed(iter/s)": 0.123528 + }, + { + "acc": 0.74950075, + "epoch": 1.5483322544519584, + "grad_norm": 8.6875, + "learning_rate": 1.3307029148708183e-06, + "loss": 0.97351398, + "memory(GiB)": 302.58, + "step": 276860, + "train_speed(iter/s)": 0.123532 + }, + { + "acc": 0.74616771, + "epoch": 1.548444103924938, + "grad_norm": 6.15625, + "learning_rate": 1.3300748242712885e-06, + "loss": 1.00624018, + "memory(GiB)": 302.58, + "step": 276880, + "train_speed(iter/s)": 0.123536 + }, + { + "acc": 0.73663158, + "epoch": 1.548555953397917, + "grad_norm": 7.59375, + "learning_rate": 1.3294468591948368e-06, + "loss": 1.04010534, + "memory(GiB)": 302.58, + "step": 276900, + "train_speed(iter/s)": 0.12354 + }, + { + "acc": 0.75309272, + "epoch": 1.5486678028708964, + "grad_norm": 8.25, + "learning_rate": 1.3288190196629409e-06, + "loss": 0.95612812, + "memory(GiB)": 302.58, + "step": 276920, + "train_speed(iter/s)": 0.123545 + }, + { + "acc": 0.76419582, + "epoch": 1.5487796523438755, + "grad_norm": 6.71875, + "learning_rate": 1.3281913056970757e-06, + "loss": 0.91614456, + "memory(GiB)": 302.58, + "step": 276940, + "train_speed(iter/s)": 0.123549 + }, + { + "acc": 0.75471587, + "epoch": 1.548891501816855, + "grad_norm": 6.21875, + "learning_rate": 1.3275637173187112e-06, + "loss": 0.95870304, + "memory(GiB)": 302.58, + "step": 276960, + "train_speed(iter/s)": 0.123553 + }, + { + "acc": 0.75261908, + "epoch": 1.549003351289834, + "grad_norm": 9.5, + "learning_rate": 1.3269362545493125e-06, + "loss": 0.96864519, + "memory(GiB)": 302.58, + "step": 276980, + "train_speed(iter/s)": 0.123557 + }, + { + "acc": 0.74182, + "epoch": 1.5491152007628135, + "grad_norm": 8.1875, + "learning_rate": 1.3263089174103404e-06, + "loss": 1.02057419, + "memory(GiB)": 302.58, + "step": 277000, + "train_speed(iter/s)": 0.123562 + }, + { + "acc": 0.76030045, + "epoch": 1.5492270502357925, + "grad_norm": 8.4375, + "learning_rate": 1.3256817059232525e-06, + "loss": 0.91849489, + "memory(GiB)": 302.58, + "step": 277020, + "train_speed(iter/s)": 0.123566 + }, + { + "acc": 0.75090928, + "epoch": 1.549338899708772, + "grad_norm": 8.5, + "learning_rate": 1.3250546201095016e-06, + "loss": 0.98010931, + "memory(GiB)": 302.58, + "step": 277040, + "train_speed(iter/s)": 0.12357 + }, + { + "acc": 0.75837646, + "epoch": 1.549450749181751, + "grad_norm": 8.1875, + "learning_rate": 1.3244276599905354e-06, + "loss": 0.95451918, + "memory(GiB)": 302.58, + "step": 277060, + "train_speed(iter/s)": 0.123575 + }, + { + "acc": 0.75059147, + "epoch": 1.5495625986547306, + "grad_norm": 10.6875, + "learning_rate": 1.3238008255877977e-06, + "loss": 0.96946268, + "memory(GiB)": 302.58, + "step": 277080, + "train_speed(iter/s)": 0.123579 + }, + { + "acc": 0.75192852, + "epoch": 1.5496744481277096, + "grad_norm": 8.4375, + "learning_rate": 1.3231741169227291e-06, + "loss": 0.98546057, + "memory(GiB)": 302.58, + "step": 277100, + "train_speed(iter/s)": 0.123583 + }, + { + "acc": 0.7403666, + "epoch": 1.549786297600689, + "grad_norm": 6.78125, + "learning_rate": 1.3225475340167648e-06, + "loss": 1.04475784, + "memory(GiB)": 302.58, + "step": 277120, + "train_speed(iter/s)": 0.123587 + }, + { + "acc": 0.758705, + "epoch": 1.5498981470736681, + "grad_norm": 8.375, + "learning_rate": 1.3219210768913338e-06, + "loss": 0.93015089, + "memory(GiB)": 302.58, + "step": 277140, + "train_speed(iter/s)": 0.123592 + }, + { + "acc": 0.74368992, + "epoch": 1.5500099965466476, + "grad_norm": 8.0, + "learning_rate": 1.3212947455678666e-06, + "loss": 0.98928347, + "memory(GiB)": 302.58, + "step": 277160, + "train_speed(iter/s)": 0.123596 + }, + { + "acc": 0.75239754, + "epoch": 1.5501218460196267, + "grad_norm": 5.96875, + "learning_rate": 1.3206685400677837e-06, + "loss": 0.94335022, + "memory(GiB)": 302.58, + "step": 277180, + "train_speed(iter/s)": 0.1236 + }, + { + "acc": 0.75483274, + "epoch": 1.5502336954926061, + "grad_norm": 7.1875, + "learning_rate": 1.3200424604125022e-06, + "loss": 0.97630138, + "memory(GiB)": 302.58, + "step": 277200, + "train_speed(iter/s)": 0.123604 + }, + { + "acc": 0.76175528, + "epoch": 1.5503455449655852, + "grad_norm": 5.625, + "learning_rate": 1.3194165066234389e-06, + "loss": 0.9370409, + "memory(GiB)": 302.58, + "step": 277220, + "train_speed(iter/s)": 0.123608 + }, + { + "acc": 0.74012837, + "epoch": 1.5504573944385647, + "grad_norm": 7.1875, + "learning_rate": 1.3187906787220022e-06, + "loss": 1.02493763, + "memory(GiB)": 302.58, + "step": 277240, + "train_speed(iter/s)": 0.123612 + }, + { + "acc": 0.76362915, + "epoch": 1.5505692439115437, + "grad_norm": 8.5, + "learning_rate": 1.3181649767295968e-06, + "loss": 0.94080429, + "memory(GiB)": 302.58, + "step": 277260, + "train_speed(iter/s)": 0.123616 + }, + { + "acc": 0.7489809, + "epoch": 1.5506810933845232, + "grad_norm": 6.75, + "learning_rate": 1.317539400667624e-06, + "loss": 0.98191385, + "memory(GiB)": 302.58, + "step": 277280, + "train_speed(iter/s)": 0.123621 + }, + { + "acc": 0.76041231, + "epoch": 1.5507929428575022, + "grad_norm": 8.375, + "learning_rate": 1.31691395055748e-06, + "loss": 0.9471446, + "memory(GiB)": 302.58, + "step": 277300, + "train_speed(iter/s)": 0.123625 + }, + { + "acc": 0.75357862, + "epoch": 1.5509047923304817, + "grad_norm": 6.40625, + "learning_rate": 1.3162886264205582e-06, + "loss": 0.9816885, + "memory(GiB)": 302.58, + "step": 277320, + "train_speed(iter/s)": 0.123629 + }, + { + "acc": 0.75248733, + "epoch": 1.5510166418034608, + "grad_norm": 7.375, + "learning_rate": 1.315663428278246e-06, + "loss": 0.9702282, + "memory(GiB)": 302.58, + "step": 277340, + "train_speed(iter/s)": 0.123633 + }, + { + "acc": 0.74007187, + "epoch": 1.5511284912764403, + "grad_norm": 5.3125, + "learning_rate": 1.3150383561519276e-06, + "loss": 1.01878357, + "memory(GiB)": 302.58, + "step": 277360, + "train_speed(iter/s)": 0.123638 + }, + { + "acc": 0.76353951, + "epoch": 1.5512403407494193, + "grad_norm": 8.125, + "learning_rate": 1.3144134100629814e-06, + "loss": 0.92603884, + "memory(GiB)": 302.58, + "step": 277380, + "train_speed(iter/s)": 0.123642 + }, + { + "acc": 0.76231046, + "epoch": 1.5513521902223988, + "grad_norm": 5.8125, + "learning_rate": 1.3137885900327841e-06, + "loss": 0.94057732, + "memory(GiB)": 302.58, + "step": 277400, + "train_speed(iter/s)": 0.123646 + }, + { + "acc": 0.75628791, + "epoch": 1.5514640396953778, + "grad_norm": 8.625, + "learning_rate": 1.3131638960827048e-06, + "loss": 0.9599968, + "memory(GiB)": 302.58, + "step": 277420, + "train_speed(iter/s)": 0.12365 + }, + { + "acc": 0.73476105, + "epoch": 1.5515758891683573, + "grad_norm": 5.28125, + "learning_rate": 1.3125393282341115e-06, + "loss": 1.05335903, + "memory(GiB)": 302.58, + "step": 277440, + "train_speed(iter/s)": 0.123654 + }, + { + "acc": 0.75323195, + "epoch": 1.5516877386413364, + "grad_norm": 9.0625, + "learning_rate": 1.3119148865083643e-06, + "loss": 0.98488474, + "memory(GiB)": 302.58, + "step": 277460, + "train_speed(iter/s)": 0.123659 + }, + { + "acc": 0.73624625, + "epoch": 1.5517995881143158, + "grad_norm": 6.46875, + "learning_rate": 1.3112905709268236e-06, + "loss": 1.05016479, + "memory(GiB)": 302.58, + "step": 277480, + "train_speed(iter/s)": 0.123663 + }, + { + "acc": 0.7557023, + "epoch": 1.5519114375872949, + "grad_norm": 7.15625, + "learning_rate": 1.3106663815108423e-06, + "loss": 0.96625576, + "memory(GiB)": 302.58, + "step": 277500, + "train_speed(iter/s)": 0.123667 + }, + { + "acc": 0.75500183, + "epoch": 1.5520232870602744, + "grad_norm": 8.4375, + "learning_rate": 1.310042318281769e-06, + "loss": 0.96098757, + "memory(GiB)": 302.58, + "step": 277520, + "train_speed(iter/s)": 0.123671 + }, + { + "acc": 0.76565127, + "epoch": 1.5521351365332534, + "grad_norm": 7.53125, + "learning_rate": 1.3094183812609497e-06, + "loss": 0.92680607, + "memory(GiB)": 302.58, + "step": 277540, + "train_speed(iter/s)": 0.123676 + }, + { + "acc": 0.75389652, + "epoch": 1.552246986006233, + "grad_norm": 6.96875, + "learning_rate": 1.3087945704697237e-06, + "loss": 0.95827942, + "memory(GiB)": 302.58, + "step": 277560, + "train_speed(iter/s)": 0.12368 + }, + { + "acc": 0.73906894, + "epoch": 1.552358835479212, + "grad_norm": 7.75, + "learning_rate": 1.3081708859294278e-06, + "loss": 1.02342358, + "memory(GiB)": 302.58, + "step": 277580, + "train_speed(iter/s)": 0.123684 + }, + { + "acc": 0.74450188, + "epoch": 1.5524706849521914, + "grad_norm": 7.0, + "learning_rate": 1.3075473276613943e-06, + "loss": 1.00714312, + "memory(GiB)": 302.58, + "step": 277600, + "train_speed(iter/s)": 0.123689 + }, + { + "acc": 0.75178108, + "epoch": 1.5525825344251705, + "grad_norm": 7.5, + "learning_rate": 1.3069238956869506e-06, + "loss": 0.97990255, + "memory(GiB)": 302.58, + "step": 277620, + "train_speed(iter/s)": 0.123693 + }, + { + "acc": 0.74598756, + "epoch": 1.55269438389815, + "grad_norm": 9.5, + "learning_rate": 1.30630059002742e-06, + "loss": 1.01161003, + "memory(GiB)": 302.58, + "step": 277640, + "train_speed(iter/s)": 0.123697 + }, + { + "acc": 0.75694976, + "epoch": 1.552806233371129, + "grad_norm": 7.78125, + "learning_rate": 1.3056774107041214e-06, + "loss": 0.96405067, + "memory(GiB)": 302.58, + "step": 277660, + "train_speed(iter/s)": 0.123701 + }, + { + "acc": 0.75284348, + "epoch": 1.5529180828441085, + "grad_norm": 8.625, + "learning_rate": 1.30505435773837e-06, + "loss": 0.97050314, + "memory(GiB)": 302.58, + "step": 277680, + "train_speed(iter/s)": 0.123705 + }, + { + "acc": 0.75401516, + "epoch": 1.5530299323170875, + "grad_norm": 5.71875, + "learning_rate": 1.3044314311514761e-06, + "loss": 0.97074919, + "memory(GiB)": 302.58, + "step": 277700, + "train_speed(iter/s)": 0.123709 + }, + { + "acc": 0.73518863, + "epoch": 1.553141781790067, + "grad_norm": 6.6875, + "learning_rate": 1.3038086309647458e-06, + "loss": 1.05991716, + "memory(GiB)": 302.58, + "step": 277720, + "train_speed(iter/s)": 0.123714 + }, + { + "acc": 0.74030185, + "epoch": 1.553253631263046, + "grad_norm": 7.0, + "learning_rate": 1.3031859571994787e-06, + "loss": 1.04746704, + "memory(GiB)": 302.58, + "step": 277740, + "train_speed(iter/s)": 0.123718 + }, + { + "acc": 0.74344916, + "epoch": 1.5533654807360255, + "grad_norm": 5.3125, + "learning_rate": 1.3025634098769757e-06, + "loss": 1.02562189, + "memory(GiB)": 302.58, + "step": 277760, + "train_speed(iter/s)": 0.123722 + }, + { + "acc": 0.75810437, + "epoch": 1.5534773302090046, + "grad_norm": 9.5, + "learning_rate": 1.3019409890185281e-06, + "loss": 0.96414499, + "memory(GiB)": 302.58, + "step": 277780, + "train_speed(iter/s)": 0.123727 + }, + { + "acc": 0.74028506, + "epoch": 1.553589179681984, + "grad_norm": 6.78125, + "learning_rate": 1.3013186946454253e-06, + "loss": 1.0054389, + "memory(GiB)": 302.58, + "step": 277800, + "train_speed(iter/s)": 0.123731 + }, + { + "acc": 0.74686208, + "epoch": 1.553701029154963, + "grad_norm": 8.5625, + "learning_rate": 1.3006965267789512e-06, + "loss": 1.00907383, + "memory(GiB)": 302.58, + "step": 277820, + "train_speed(iter/s)": 0.123735 + }, + { + "acc": 0.75424767, + "epoch": 1.5538128786279426, + "grad_norm": 10.5, + "learning_rate": 1.3000744854403857e-06, + "loss": 0.96720333, + "memory(GiB)": 302.58, + "step": 277840, + "train_speed(iter/s)": 0.123739 + }, + { + "acc": 0.75195665, + "epoch": 1.5539247281009216, + "grad_norm": 7.5, + "learning_rate": 1.2994525706510053e-06, + "loss": 0.97069092, + "memory(GiB)": 302.58, + "step": 277860, + "train_speed(iter/s)": 0.123744 + }, + { + "acc": 0.75550389, + "epoch": 1.5540365775739011, + "grad_norm": 8.8125, + "learning_rate": 1.298830782432081e-06, + "loss": 0.9606307, + "memory(GiB)": 302.58, + "step": 277880, + "train_speed(iter/s)": 0.123748 + }, + { + "acc": 0.75056028, + "epoch": 1.5541484270468802, + "grad_norm": 10.75, + "learning_rate": 1.2982091208048803e-06, + "loss": 0.98100252, + "memory(GiB)": 302.58, + "step": 277900, + "train_speed(iter/s)": 0.123752 + }, + { + "acc": 0.74699249, + "epoch": 1.5542602765198597, + "grad_norm": 7.5625, + "learning_rate": 1.2975875857906655e-06, + "loss": 0.99607038, + "memory(GiB)": 302.58, + "step": 277920, + "train_speed(iter/s)": 0.123756 + }, + { + "acc": 0.74898586, + "epoch": 1.5543721259928387, + "grad_norm": 5.4375, + "learning_rate": 1.2969661774106956e-06, + "loss": 0.98332376, + "memory(GiB)": 302.58, + "step": 277940, + "train_speed(iter/s)": 0.12376 + }, + { + "acc": 0.74144359, + "epoch": 1.5544839754658182, + "grad_norm": 6.3125, + "learning_rate": 1.296344895686224e-06, + "loss": 1.03646669, + "memory(GiB)": 302.58, + "step": 277960, + "train_speed(iter/s)": 0.123764 + }, + { + "acc": 0.75443945, + "epoch": 1.5545958249387972, + "grad_norm": 7.78125, + "learning_rate": 1.2957237406385016e-06, + "loss": 0.95939083, + "memory(GiB)": 302.58, + "step": 277980, + "train_speed(iter/s)": 0.123769 + }, + { + "acc": 0.75648541, + "epoch": 1.5547076744117767, + "grad_norm": 8.9375, + "learning_rate": 1.2951027122887722e-06, + "loss": 0.95971222, + "memory(GiB)": 302.58, + "step": 278000, + "train_speed(iter/s)": 0.123773 + }, + { + "epoch": 1.5547076744117767, + "eval_acc": 0.706893682311199, + "eval_loss": 1.0118474960327148, + "eval_runtime": 7566.5366, + "eval_samples_per_second": 9.949, + "eval_steps_per_second": 9.949, + "step": 278000 + }, + { + "acc": 0.75080695, + "epoch": 1.5548195238847557, + "grad_norm": 6.90625, + "learning_rate": 1.2944818106582773e-06, + "loss": 0.97525921, + "memory(GiB)": 302.58, + "step": 278020, + "train_speed(iter/s)": 0.123354 + }, + { + "acc": 0.74498572, + "epoch": 1.5549313733577352, + "grad_norm": 6.53125, + "learning_rate": 1.293861035768255e-06, + "loss": 0.99594851, + "memory(GiB)": 302.58, + "step": 278040, + "train_speed(iter/s)": 0.123358 + }, + { + "acc": 0.74735889, + "epoch": 1.5550432228307143, + "grad_norm": 5.84375, + "learning_rate": 1.293240387639938e-06, + "loss": 0.99238091, + "memory(GiB)": 302.58, + "step": 278060, + "train_speed(iter/s)": 0.123362 + }, + { + "acc": 0.76308947, + "epoch": 1.5551550723036938, + "grad_norm": 7.03125, + "learning_rate": 1.2926198662945534e-06, + "loss": 0.94069281, + "memory(GiB)": 302.58, + "step": 278080, + "train_speed(iter/s)": 0.123367 + }, + { + "acc": 0.74451303, + "epoch": 1.5552669217766728, + "grad_norm": 7.03125, + "learning_rate": 1.2919994717533252e-06, + "loss": 1.00535126, + "memory(GiB)": 302.58, + "step": 278100, + "train_speed(iter/s)": 0.123371 + }, + { + "acc": 0.74334154, + "epoch": 1.5553787712496523, + "grad_norm": 6.59375, + "learning_rate": 1.2913792040374723e-06, + "loss": 1.01685724, + "memory(GiB)": 302.58, + "step": 278120, + "train_speed(iter/s)": 0.123375 + }, + { + "acc": 0.7438159, + "epoch": 1.5554906207226313, + "grad_norm": 6.5, + "learning_rate": 1.2907590631682104e-06, + "loss": 1.00925179, + "memory(GiB)": 302.58, + "step": 278140, + "train_speed(iter/s)": 0.123379 + }, + { + "acc": 0.75756392, + "epoch": 1.5556024701956108, + "grad_norm": 5.90625, + "learning_rate": 1.290139049166751e-06, + "loss": 0.94887733, + "memory(GiB)": 302.58, + "step": 278160, + "train_speed(iter/s)": 0.123383 + }, + { + "acc": 0.74599929, + "epoch": 1.5557143196685899, + "grad_norm": 5.3125, + "learning_rate": 1.289519162054299e-06, + "loss": 1.01429348, + "memory(GiB)": 302.58, + "step": 278180, + "train_speed(iter/s)": 0.123387 + }, + { + "acc": 0.76018438, + "epoch": 1.5558261691415693, + "grad_norm": 7.15625, + "learning_rate": 1.2888994018520572e-06, + "loss": 0.94150696, + "memory(GiB)": 302.58, + "step": 278200, + "train_speed(iter/s)": 0.123391 + }, + { + "acc": 0.75361304, + "epoch": 1.5559380186145484, + "grad_norm": 7.03125, + "learning_rate": 1.2882797685812244e-06, + "loss": 0.96134615, + "memory(GiB)": 302.58, + "step": 278220, + "train_speed(iter/s)": 0.123396 + }, + { + "acc": 0.74491987, + "epoch": 1.5560498680875279, + "grad_norm": 9.3125, + "learning_rate": 1.287660262262992e-06, + "loss": 0.99427643, + "memory(GiB)": 302.58, + "step": 278240, + "train_speed(iter/s)": 0.1234 + }, + { + "acc": 0.7430593, + "epoch": 1.556161717560507, + "grad_norm": 7.40625, + "learning_rate": 1.2870408829185505e-06, + "loss": 1.0106554, + "memory(GiB)": 302.58, + "step": 278260, + "train_speed(iter/s)": 0.123403 + }, + { + "acc": 0.73279748, + "epoch": 1.5562735670334864, + "grad_norm": 6.125, + "learning_rate": 1.2864216305690829e-06, + "loss": 1.06596498, + "memory(GiB)": 302.58, + "step": 278280, + "train_speed(iter/s)": 0.123408 + }, + { + "acc": 0.75713773, + "epoch": 1.5563854165064654, + "grad_norm": 6.75, + "learning_rate": 1.2858025052357726e-06, + "loss": 0.97829523, + "memory(GiB)": 302.58, + "step": 278300, + "train_speed(iter/s)": 0.123412 + }, + { + "acc": 0.75704851, + "epoch": 1.556497265979445, + "grad_norm": 9.3125, + "learning_rate": 1.285183506939794e-06, + "loss": 0.96059551, + "memory(GiB)": 302.58, + "step": 278320, + "train_speed(iter/s)": 0.123417 + }, + { + "acc": 0.74535141, + "epoch": 1.556609115452424, + "grad_norm": 5.25, + "learning_rate": 1.2845646357023183e-06, + "loss": 1.00232363, + "memory(GiB)": 302.58, + "step": 278340, + "train_speed(iter/s)": 0.123421 + }, + { + "acc": 0.75039225, + "epoch": 1.5567209649254035, + "grad_norm": 8.5625, + "learning_rate": 1.2839458915445135e-06, + "loss": 0.99364395, + "memory(GiB)": 302.58, + "step": 278360, + "train_speed(iter/s)": 0.123425 + }, + { + "acc": 0.75688806, + "epoch": 1.5568328143983825, + "grad_norm": 7.65625, + "learning_rate": 1.283327274487543e-06, + "loss": 0.9493742, + "memory(GiB)": 302.58, + "step": 278380, + "train_speed(iter/s)": 0.123429 + }, + { + "acc": 0.75011234, + "epoch": 1.556944663871362, + "grad_norm": 7.1875, + "learning_rate": 1.2827087845525631e-06, + "loss": 0.97619963, + "memory(GiB)": 302.58, + "step": 278400, + "train_speed(iter/s)": 0.123434 + }, + { + "acc": 0.74982409, + "epoch": 1.557056513344341, + "grad_norm": 8.375, + "learning_rate": 1.2820904217607316e-06, + "loss": 0.99824429, + "memory(GiB)": 302.58, + "step": 278420, + "train_speed(iter/s)": 0.123438 + }, + { + "acc": 0.74397135, + "epoch": 1.5571683628173205, + "grad_norm": 5.59375, + "learning_rate": 1.2814721861331968e-06, + "loss": 1.01782379, + "memory(GiB)": 302.58, + "step": 278440, + "train_speed(iter/s)": 0.123442 + }, + { + "acc": 0.74779534, + "epoch": 1.5572802122902996, + "grad_norm": 9.375, + "learning_rate": 1.2808540776911044e-06, + "loss": 1.00213242, + "memory(GiB)": 302.58, + "step": 278460, + "train_speed(iter/s)": 0.123446 + }, + { + "acc": 0.75986066, + "epoch": 1.557392061763279, + "grad_norm": 10.75, + "learning_rate": 1.2802360964555954e-06, + "loss": 0.9308403, + "memory(GiB)": 302.58, + "step": 278480, + "train_speed(iter/s)": 0.12345 + }, + { + "acc": 0.74390826, + "epoch": 1.557503911236258, + "grad_norm": 6.5, + "learning_rate": 1.2796182424478076e-06, + "loss": 1.02617884, + "memory(GiB)": 302.58, + "step": 278500, + "train_speed(iter/s)": 0.123454 + }, + { + "acc": 0.76016603, + "epoch": 1.5576157607092376, + "grad_norm": 6.53125, + "learning_rate": 1.2790005156888724e-06, + "loss": 0.93743277, + "memory(GiB)": 302.58, + "step": 278520, + "train_speed(iter/s)": 0.123459 + }, + { + "acc": 0.74644141, + "epoch": 1.5577276101822166, + "grad_norm": 8.0625, + "learning_rate": 1.2783829161999184e-06, + "loss": 1.00508194, + "memory(GiB)": 302.58, + "step": 278540, + "train_speed(iter/s)": 0.123463 + }, + { + "acc": 0.76291795, + "epoch": 1.557839459655196, + "grad_norm": 9.1875, + "learning_rate": 1.2777654440020698e-06, + "loss": 0.92611628, + "memory(GiB)": 302.58, + "step": 278560, + "train_speed(iter/s)": 0.123467 + }, + { + "acc": 0.76028757, + "epoch": 1.5579513091281751, + "grad_norm": 5.3125, + "learning_rate": 1.277148099116446e-06, + "loss": 0.93302546, + "memory(GiB)": 302.58, + "step": 278580, + "train_speed(iter/s)": 0.123471 + }, + { + "acc": 0.76586227, + "epoch": 1.5580631586011546, + "grad_norm": 7.6875, + "learning_rate": 1.276530881564162e-06, + "loss": 0.91172218, + "memory(GiB)": 302.58, + "step": 278600, + "train_speed(iter/s)": 0.123476 + }, + { + "acc": 0.75100045, + "epoch": 1.5581750080741337, + "grad_norm": 7.9375, + "learning_rate": 1.2759137913663272e-06, + "loss": 0.98210201, + "memory(GiB)": 302.58, + "step": 278620, + "train_speed(iter/s)": 0.12348 + }, + { + "acc": 0.7408917, + "epoch": 1.5582868575471132, + "grad_norm": 5.625, + "learning_rate": 1.275296828544051e-06, + "loss": 1.01316605, + "memory(GiB)": 302.58, + "step": 278640, + "train_speed(iter/s)": 0.123484 + }, + { + "acc": 0.74889584, + "epoch": 1.5583987070200922, + "grad_norm": 8.4375, + "learning_rate": 1.2746799931184338e-06, + "loss": 0.99714622, + "memory(GiB)": 302.58, + "step": 278660, + "train_speed(iter/s)": 0.123487 + }, + { + "acc": 0.76549501, + "epoch": 1.5585105564930717, + "grad_norm": 12.25, + "learning_rate": 1.2740632851105733e-06, + "loss": 0.92008657, + "memory(GiB)": 302.58, + "step": 278680, + "train_speed(iter/s)": 0.123492 + }, + { + "acc": 0.74015489, + "epoch": 1.5586224059660507, + "grad_norm": 5.78125, + "learning_rate": 1.2734467045415638e-06, + "loss": 1.01039639, + "memory(GiB)": 302.58, + "step": 278700, + "train_speed(iter/s)": 0.123496 + }, + { + "acc": 0.75638604, + "epoch": 1.5587342554390302, + "grad_norm": 5.3125, + "learning_rate": 1.272830251432493e-06, + "loss": 0.96126394, + "memory(GiB)": 302.58, + "step": 278720, + "train_speed(iter/s)": 0.1235 + }, + { + "acc": 0.76167126, + "epoch": 1.5588461049120093, + "grad_norm": 8.0625, + "learning_rate": 1.2722139258044463e-06, + "loss": 0.917733, + "memory(GiB)": 302.58, + "step": 278740, + "train_speed(iter/s)": 0.123505 + }, + { + "acc": 0.7652452, + "epoch": 1.5589579543849887, + "grad_norm": 8.875, + "learning_rate": 1.2715977276785036e-06, + "loss": 0.9057745, + "memory(GiB)": 302.58, + "step": 278760, + "train_speed(iter/s)": 0.123509 + }, + { + "acc": 0.73381963, + "epoch": 1.5590698038579678, + "grad_norm": 7.90625, + "learning_rate": 1.2709816570757415e-06, + "loss": 1.04875374, + "memory(GiB)": 302.58, + "step": 278780, + "train_speed(iter/s)": 0.123513 + }, + { + "acc": 0.74105644, + "epoch": 1.5591816533309473, + "grad_norm": 9.375, + "learning_rate": 1.270365714017231e-06, + "loss": 1.00895185, + "memory(GiB)": 302.58, + "step": 278800, + "train_speed(iter/s)": 0.123517 + }, + { + "acc": 0.73994403, + "epoch": 1.5592935028039263, + "grad_norm": 6.21875, + "learning_rate": 1.2697498985240392e-06, + "loss": 1.04229774, + "memory(GiB)": 302.58, + "step": 278820, + "train_speed(iter/s)": 0.123521 + }, + { + "acc": 0.74995689, + "epoch": 1.5594053522769058, + "grad_norm": 11.375, + "learning_rate": 1.2691342106172294e-06, + "loss": 0.96603689, + "memory(GiB)": 302.58, + "step": 278840, + "train_speed(iter/s)": 0.123525 + }, + { + "acc": 0.75817809, + "epoch": 1.5595172017498848, + "grad_norm": 10.5, + "learning_rate": 1.2685186503178598e-06, + "loss": 0.95475407, + "memory(GiB)": 302.58, + "step": 278860, + "train_speed(iter/s)": 0.12353 + }, + { + "acc": 0.72738099, + "epoch": 1.5596290512228643, + "grad_norm": 8.3125, + "learning_rate": 1.2679032176469846e-06, + "loss": 1.09544744, + "memory(GiB)": 302.58, + "step": 278880, + "train_speed(iter/s)": 0.123534 + }, + { + "acc": 0.74643049, + "epoch": 1.5597409006958434, + "grad_norm": 7.84375, + "learning_rate": 1.2672879126256515e-06, + "loss": 0.99874239, + "memory(GiB)": 302.58, + "step": 278900, + "train_speed(iter/s)": 0.123538 + }, + { + "acc": 0.7471189, + "epoch": 1.5598527501688229, + "grad_norm": 8.0, + "learning_rate": 1.2666727352749102e-06, + "loss": 1.00558147, + "memory(GiB)": 302.58, + "step": 278920, + "train_speed(iter/s)": 0.123543 + }, + { + "acc": 0.73977818, + "epoch": 1.559964599641802, + "grad_norm": 9.8125, + "learning_rate": 1.266057685615799e-06, + "loss": 1.03712368, + "memory(GiB)": 302.58, + "step": 278940, + "train_speed(iter/s)": 0.123547 + }, + { + "acc": 0.74604869, + "epoch": 1.5600764491147814, + "grad_norm": 6.3125, + "learning_rate": 1.265442763669355e-06, + "loss": 0.99249735, + "memory(GiB)": 302.58, + "step": 278960, + "train_speed(iter/s)": 0.123551 + }, + { + "acc": 0.7602622, + "epoch": 1.5601882985877606, + "grad_norm": 4.96875, + "learning_rate": 1.2648279694566108e-06, + "loss": 0.93654509, + "memory(GiB)": 302.58, + "step": 278980, + "train_speed(iter/s)": 0.123555 + }, + { + "acc": 0.75740118, + "epoch": 1.56030014806074, + "grad_norm": 9.9375, + "learning_rate": 1.264213302998593e-06, + "loss": 0.95115433, + "memory(GiB)": 302.58, + "step": 279000, + "train_speed(iter/s)": 0.123559 + }, + { + "acc": 0.74395504, + "epoch": 1.5604119975337192, + "grad_norm": 9.0, + "learning_rate": 1.2635987643163266e-06, + "loss": 1.03250666, + "memory(GiB)": 302.58, + "step": 279020, + "train_speed(iter/s)": 0.123564 + }, + { + "acc": 0.75923924, + "epoch": 1.5605238470066984, + "grad_norm": 6.96875, + "learning_rate": 1.2629843534308305e-06, + "loss": 0.95504608, + "memory(GiB)": 302.58, + "step": 279040, + "train_speed(iter/s)": 0.123568 + }, + { + "acc": 0.74986606, + "epoch": 1.5606356964796777, + "grad_norm": 7.78125, + "learning_rate": 1.2623700703631192e-06, + "loss": 0.98970346, + "memory(GiB)": 302.58, + "step": 279060, + "train_speed(iter/s)": 0.123572 + }, + { + "acc": 0.76547365, + "epoch": 1.560747545952657, + "grad_norm": 8.25, + "learning_rate": 1.2617559151342035e-06, + "loss": 0.90142078, + "memory(GiB)": 302.58, + "step": 279080, + "train_speed(iter/s)": 0.123576 + }, + { + "acc": 0.74145041, + "epoch": 1.5608593954256362, + "grad_norm": 5.34375, + "learning_rate": 1.2611418877650883e-06, + "loss": 1.02354012, + "memory(GiB)": 302.58, + "step": 279100, + "train_speed(iter/s)": 0.12358 + }, + { + "acc": 0.74734983, + "epoch": 1.5609712448986155, + "grad_norm": 6.84375, + "learning_rate": 1.2605279882767768e-06, + "loss": 1.01163244, + "memory(GiB)": 302.58, + "step": 279120, + "train_speed(iter/s)": 0.123584 + }, + { + "acc": 0.7456316, + "epoch": 1.5610830943715948, + "grad_norm": 7.59375, + "learning_rate": 1.2599142166902657e-06, + "loss": 1.00706768, + "memory(GiB)": 302.58, + "step": 279140, + "train_speed(iter/s)": 0.123589 + }, + { + "acc": 0.74762039, + "epoch": 1.561194943844574, + "grad_norm": 8.3125, + "learning_rate": 1.2593005730265462e-06, + "loss": 0.9942709, + "memory(GiB)": 302.58, + "step": 279160, + "train_speed(iter/s)": 0.123593 + }, + { + "acc": 0.75408287, + "epoch": 1.5613067933175533, + "grad_norm": 8.5625, + "learning_rate": 1.2586870573066095e-06, + "loss": 0.97125826, + "memory(GiB)": 302.58, + "step": 279180, + "train_speed(iter/s)": 0.123597 + }, + { + "acc": 0.75861506, + "epoch": 1.5614186427905326, + "grad_norm": 8.75, + "learning_rate": 1.2580736695514396e-06, + "loss": 0.9245491, + "memory(GiB)": 302.58, + "step": 279200, + "train_speed(iter/s)": 0.123601 + }, + { + "acc": 0.73460097, + "epoch": 1.5615304922635118, + "grad_norm": 7.875, + "learning_rate": 1.2574604097820154e-06, + "loss": 1.07364359, + "memory(GiB)": 302.58, + "step": 279220, + "train_speed(iter/s)": 0.123605 + }, + { + "acc": 0.75498896, + "epoch": 1.561642341736491, + "grad_norm": 7.1875, + "learning_rate": 1.2568472780193119e-06, + "loss": 0.95268135, + "memory(GiB)": 302.58, + "step": 279240, + "train_speed(iter/s)": 0.12361 + }, + { + "acc": 0.74585705, + "epoch": 1.5617541912094703, + "grad_norm": 7.40625, + "learning_rate": 1.2562342742843015e-06, + "loss": 1.00709057, + "memory(GiB)": 302.58, + "step": 279260, + "train_speed(iter/s)": 0.123614 + }, + { + "acc": 0.7554347, + "epoch": 1.5618660406824496, + "grad_norm": 7.21875, + "learning_rate": 1.2556213985979494e-06, + "loss": 0.95035686, + "memory(GiB)": 302.58, + "step": 279280, + "train_speed(iter/s)": 0.123618 + }, + { + "acc": 0.75101824, + "epoch": 1.5619778901554289, + "grad_norm": 6.28125, + "learning_rate": 1.2550086509812194e-06, + "loss": 0.98107433, + "memory(GiB)": 302.58, + "step": 279300, + "train_speed(iter/s)": 0.123622 + }, + { + "acc": 0.73697052, + "epoch": 1.5620897396284081, + "grad_norm": 9.4375, + "learning_rate": 1.2543960314550673e-06, + "loss": 1.04781694, + "memory(GiB)": 302.58, + "step": 279320, + "train_speed(iter/s)": 0.123626 + }, + { + "acc": 0.75065422, + "epoch": 1.5622015891013874, + "grad_norm": 8.6875, + "learning_rate": 1.2537835400404492e-06, + "loss": 0.9626133, + "memory(GiB)": 302.58, + "step": 279340, + "train_speed(iter/s)": 0.123631 + }, + { + "acc": 0.73228846, + "epoch": 1.5623134385743667, + "grad_norm": 5.28125, + "learning_rate": 1.2531711767583121e-06, + "loss": 1.06036835, + "memory(GiB)": 302.58, + "step": 279360, + "train_speed(iter/s)": 0.123635 + }, + { + "acc": 0.75885491, + "epoch": 1.562425288047346, + "grad_norm": 7.40625, + "learning_rate": 1.2525589416296018e-06, + "loss": 0.96030731, + "memory(GiB)": 302.58, + "step": 279380, + "train_speed(iter/s)": 0.123639 + }, + { + "acc": 0.75210657, + "epoch": 1.5625371375203252, + "grad_norm": 10.625, + "learning_rate": 1.251946834675259e-06, + "loss": 0.95636301, + "memory(GiB)": 302.58, + "step": 279400, + "train_speed(iter/s)": 0.123643 + }, + { + "acc": 0.75611215, + "epoch": 1.5626489869933045, + "grad_norm": 6.0, + "learning_rate": 1.2513348559162187e-06, + "loss": 0.96556425, + "memory(GiB)": 302.58, + "step": 279420, + "train_speed(iter/s)": 0.123648 + }, + { + "acc": 0.74664974, + "epoch": 1.5627608364662837, + "grad_norm": 5.96875, + "learning_rate": 1.2507230053734116e-06, + "loss": 0.98962708, + "memory(GiB)": 302.58, + "step": 279440, + "train_speed(iter/s)": 0.123652 + }, + { + "acc": 0.7486033, + "epoch": 1.562872685939263, + "grad_norm": 5.4375, + "learning_rate": 1.250111283067768e-06, + "loss": 0.97816181, + "memory(GiB)": 302.58, + "step": 279460, + "train_speed(iter/s)": 0.123656 + }, + { + "acc": 0.74014206, + "epoch": 1.5629845354122422, + "grad_norm": 6.125, + "learning_rate": 1.2494996890202092e-06, + "loss": 1.02581081, + "memory(GiB)": 302.58, + "step": 279480, + "train_speed(iter/s)": 0.12366 + }, + { + "acc": 0.76100488, + "epoch": 1.5630963848852215, + "grad_norm": 9.375, + "learning_rate": 1.248888223251653e-06, + "loss": 0.92933006, + "memory(GiB)": 302.58, + "step": 279500, + "train_speed(iter/s)": 0.123665 + }, + { + "acc": 0.74173217, + "epoch": 1.5632082343582008, + "grad_norm": 8.75, + "learning_rate": 1.2482768857830147e-06, + "loss": 1.01328487, + "memory(GiB)": 302.58, + "step": 279520, + "train_speed(iter/s)": 0.123669 + }, + { + "acc": 0.74817529, + "epoch": 1.56332008383118, + "grad_norm": 7.4375, + "learning_rate": 1.247665676635203e-06, + "loss": 0.98003483, + "memory(GiB)": 302.58, + "step": 279540, + "train_speed(iter/s)": 0.123673 + }, + { + "acc": 0.74863968, + "epoch": 1.5634319333041593, + "grad_norm": 6.0, + "learning_rate": 1.2470545958291235e-06, + "loss": 0.99445744, + "memory(GiB)": 302.58, + "step": 279560, + "train_speed(iter/s)": 0.123677 + }, + { + "acc": 0.74913769, + "epoch": 1.5635437827771386, + "grad_norm": 8.125, + "learning_rate": 1.2464436433856768e-06, + "loss": 1.00727291, + "memory(GiB)": 302.58, + "step": 279580, + "train_speed(iter/s)": 0.123681 + }, + { + "acc": 0.74151545, + "epoch": 1.5636556322501178, + "grad_norm": 5.40625, + "learning_rate": 1.2458328193257601e-06, + "loss": 1.02475462, + "memory(GiB)": 302.58, + "step": 279600, + "train_speed(iter/s)": 0.123685 + }, + { + "acc": 0.7468133, + "epoch": 1.563767481723097, + "grad_norm": 7.75, + "learning_rate": 1.245222123670265e-06, + "loss": 0.99762182, + "memory(GiB)": 302.58, + "step": 279620, + "train_speed(iter/s)": 0.12369 + }, + { + "acc": 0.75782208, + "epoch": 1.5638793311960764, + "grad_norm": 8.0, + "learning_rate": 1.2446115564400797e-06, + "loss": 0.97161503, + "memory(GiB)": 302.58, + "step": 279640, + "train_speed(iter/s)": 0.123694 + }, + { + "acc": 0.75270672, + "epoch": 1.5639911806690556, + "grad_norm": 8.875, + "learning_rate": 1.244001117656085e-06, + "loss": 0.98555079, + "memory(GiB)": 302.58, + "step": 279660, + "train_speed(iter/s)": 0.123698 + }, + { + "acc": 0.74412675, + "epoch": 1.564103030142035, + "grad_norm": 9.8125, + "learning_rate": 1.2433908073391637e-06, + "loss": 1.02094879, + "memory(GiB)": 302.58, + "step": 279680, + "train_speed(iter/s)": 0.123702 + }, + { + "acc": 0.76775198, + "epoch": 1.5642148796150142, + "grad_norm": 7.5, + "learning_rate": 1.2427806255101888e-06, + "loss": 0.91148577, + "memory(GiB)": 302.58, + "step": 279700, + "train_speed(iter/s)": 0.123707 + }, + { + "acc": 0.7498837, + "epoch": 1.5643267290879934, + "grad_norm": 8.75, + "learning_rate": 1.2421705721900307e-06, + "loss": 0.95913582, + "memory(GiB)": 302.58, + "step": 279720, + "train_speed(iter/s)": 0.123711 + }, + { + "acc": 0.7517777, + "epoch": 1.5644385785609727, + "grad_norm": 7.4375, + "learning_rate": 1.2415606473995545e-06, + "loss": 0.97915211, + "memory(GiB)": 302.58, + "step": 279740, + "train_speed(iter/s)": 0.123715 + }, + { + "acc": 0.738375, + "epoch": 1.564550428033952, + "grad_norm": 5.5625, + "learning_rate": 1.2409508511596202e-06, + "loss": 1.00998735, + "memory(GiB)": 302.58, + "step": 279760, + "train_speed(iter/s)": 0.123719 + }, + { + "acc": 0.75614538, + "epoch": 1.5646622775069312, + "grad_norm": 7.53125, + "learning_rate": 1.2403411834910877e-06, + "loss": 0.95308905, + "memory(GiB)": 302.58, + "step": 279780, + "train_speed(iter/s)": 0.123723 + }, + { + "acc": 0.75690331, + "epoch": 1.5647741269799105, + "grad_norm": 6.03125, + "learning_rate": 1.239731644414809e-06, + "loss": 0.91364346, + "memory(GiB)": 302.58, + "step": 279800, + "train_speed(iter/s)": 0.123727 + }, + { + "acc": 0.75810175, + "epoch": 1.5648859764528897, + "grad_norm": 4.40625, + "learning_rate": 1.2391222339516307e-06, + "loss": 0.93481531, + "memory(GiB)": 302.58, + "step": 279820, + "train_speed(iter/s)": 0.123731 + }, + { + "acc": 0.75298223, + "epoch": 1.564997825925869, + "grad_norm": 5.90625, + "learning_rate": 1.2385129521223982e-06, + "loss": 0.9929019, + "memory(GiB)": 302.58, + "step": 279840, + "train_speed(iter/s)": 0.123735 + }, + { + "acc": 0.75705075, + "epoch": 1.5651096753988483, + "grad_norm": 7.15625, + "learning_rate": 1.2379037989479498e-06, + "loss": 0.93651781, + "memory(GiB)": 302.58, + "step": 279860, + "train_speed(iter/s)": 0.123739 + }, + { + "acc": 0.75226736, + "epoch": 1.5652215248718275, + "grad_norm": 6.3125, + "learning_rate": 1.2372947744491205e-06, + "loss": 0.99492455, + "memory(GiB)": 302.58, + "step": 279880, + "train_speed(iter/s)": 0.123744 + }, + { + "acc": 0.74563146, + "epoch": 1.5653333743448068, + "grad_norm": 10.0, + "learning_rate": 1.2366858786467412e-06, + "loss": 1.00906363, + "memory(GiB)": 302.58, + "step": 279900, + "train_speed(iter/s)": 0.123748 + }, + { + "acc": 0.74533772, + "epoch": 1.565445223817786, + "grad_norm": 6.84375, + "learning_rate": 1.2360771115616383e-06, + "loss": 0.99997625, + "memory(GiB)": 302.58, + "step": 279920, + "train_speed(iter/s)": 0.123752 + }, + { + "acc": 0.73813868, + "epoch": 1.5655570732907653, + "grad_norm": 5.71875, + "learning_rate": 1.2354684732146328e-06, + "loss": 1.03918066, + "memory(GiB)": 302.58, + "step": 279940, + "train_speed(iter/s)": 0.123757 + }, + { + "acc": 0.74751911, + "epoch": 1.5656689227637446, + "grad_norm": 6.78125, + "learning_rate": 1.234859963626543e-06, + "loss": 0.99658089, + "memory(GiB)": 302.58, + "step": 279960, + "train_speed(iter/s)": 0.123761 + }, + { + "acc": 0.74223156, + "epoch": 1.5657807722367239, + "grad_norm": 7.46875, + "learning_rate": 1.2342515828181812e-06, + "loss": 1.0280777, + "memory(GiB)": 302.58, + "step": 279980, + "train_speed(iter/s)": 0.123765 + }, + { + "acc": 0.75928779, + "epoch": 1.5658926217097031, + "grad_norm": 10.625, + "learning_rate": 1.2336433308103562e-06, + "loss": 0.96223812, + "memory(GiB)": 302.58, + "step": 280000, + "train_speed(iter/s)": 0.123769 + }, + { + "epoch": 1.5658926217097031, + "eval_acc": 0.7068862384964717, + "eval_loss": 1.0118426084518433, + "eval_runtime": 7553.5854, + "eval_samples_per_second": 9.967, + "eval_steps_per_second": 9.967, + "step": 280000 + }, + { + "acc": 0.7567297, + "epoch": 1.5660044711826824, + "grad_norm": 7.34375, + "learning_rate": 1.2330352076238715e-06, + "loss": 0.93918104, + "memory(GiB)": 302.58, + "step": 280020, + "train_speed(iter/s)": 0.123353 + }, + { + "acc": 0.76124187, + "epoch": 1.5661163206556616, + "grad_norm": 7.75, + "learning_rate": 1.2324272132795268e-06, + "loss": 0.9406147, + "memory(GiB)": 302.58, + "step": 280040, + "train_speed(iter/s)": 0.123358 + }, + { + "acc": 0.75272088, + "epoch": 1.566228170128641, + "grad_norm": 6.34375, + "learning_rate": 1.2318193477981193e-06, + "loss": 0.9775589, + "memory(GiB)": 302.58, + "step": 280060, + "train_speed(iter/s)": 0.123362 + }, + { + "acc": 0.74455881, + "epoch": 1.5663400196016202, + "grad_norm": 6.96875, + "learning_rate": 1.2312116112004385e-06, + "loss": 1.01946669, + "memory(GiB)": 302.58, + "step": 280080, + "train_speed(iter/s)": 0.123366 + }, + { + "acc": 0.76367569, + "epoch": 1.5664518690745994, + "grad_norm": 9.1875, + "learning_rate": 1.2306040035072708e-06, + "loss": 0.92329836, + "memory(GiB)": 302.58, + "step": 280100, + "train_speed(iter/s)": 0.12337 + }, + { + "acc": 0.7668952, + "epoch": 1.5665637185475787, + "grad_norm": 7.1875, + "learning_rate": 1.2299965247393992e-06, + "loss": 0.93502979, + "memory(GiB)": 302.58, + "step": 280120, + "train_speed(iter/s)": 0.123375 + }, + { + "acc": 0.76328802, + "epoch": 1.566675568020558, + "grad_norm": 6.59375, + "learning_rate": 1.2293891749176007e-06, + "loss": 0.94775076, + "memory(GiB)": 302.58, + "step": 280140, + "train_speed(iter/s)": 0.123379 + }, + { + "acc": 0.75112133, + "epoch": 1.5667874174935372, + "grad_norm": 6.0625, + "learning_rate": 1.2287819540626484e-06, + "loss": 0.96600208, + "memory(GiB)": 302.58, + "step": 280160, + "train_speed(iter/s)": 0.123383 + }, + { + "acc": 0.73210897, + "epoch": 1.5668992669665165, + "grad_norm": 7.3125, + "learning_rate": 1.2281748621953116e-06, + "loss": 1.06910095, + "memory(GiB)": 302.58, + "step": 280180, + "train_speed(iter/s)": 0.123387 + }, + { + "acc": 0.75184636, + "epoch": 1.5670111164394958, + "grad_norm": 4.84375, + "learning_rate": 1.227567899336355e-06, + "loss": 0.96633282, + "memory(GiB)": 302.58, + "step": 280200, + "train_speed(iter/s)": 0.123391 + }, + { + "acc": 0.73872957, + "epoch": 1.567122965912475, + "grad_norm": 11.0625, + "learning_rate": 1.2269610655065377e-06, + "loss": 1.03631916, + "memory(GiB)": 302.58, + "step": 280220, + "train_speed(iter/s)": 0.123395 + }, + { + "acc": 0.7546895, + "epoch": 1.5672348153854543, + "grad_norm": 9.0, + "learning_rate": 1.2263543607266165e-06, + "loss": 0.96496668, + "memory(GiB)": 302.58, + "step": 280240, + "train_speed(iter/s)": 0.1234 + }, + { + "acc": 0.76819096, + "epoch": 1.5673466648584335, + "grad_norm": 8.25, + "learning_rate": 1.2257477850173415e-06, + "loss": 0.91656809, + "memory(GiB)": 302.58, + "step": 280260, + "train_speed(iter/s)": 0.123404 + }, + { + "acc": 0.74910245, + "epoch": 1.5674585143314128, + "grad_norm": 10.1875, + "learning_rate": 1.2251413383994603e-06, + "loss": 0.98646679, + "memory(GiB)": 302.58, + "step": 280280, + "train_speed(iter/s)": 0.123408 + }, + { + "acc": 0.75203242, + "epoch": 1.567570363804392, + "grad_norm": 5.84375, + "learning_rate": 1.2245350208937151e-06, + "loss": 1.00364237, + "memory(GiB)": 302.58, + "step": 280300, + "train_speed(iter/s)": 0.123412 + }, + { + "acc": 0.7505703, + "epoch": 1.5676822132773713, + "grad_norm": 6.5625, + "learning_rate": 1.2239288325208427e-06, + "loss": 0.99147472, + "memory(GiB)": 302.58, + "step": 280320, + "train_speed(iter/s)": 0.123416 + }, + { + "acc": 0.7483345, + "epoch": 1.5677940627503506, + "grad_norm": 8.625, + "learning_rate": 1.223322773301579e-06, + "loss": 1.00179768, + "memory(GiB)": 302.58, + "step": 280340, + "train_speed(iter/s)": 0.12342 + }, + { + "acc": 0.74818449, + "epoch": 1.5679059122233299, + "grad_norm": 6.71875, + "learning_rate": 1.2227168432566522e-06, + "loss": 1.00697718, + "memory(GiB)": 302.58, + "step": 280360, + "train_speed(iter/s)": 0.123424 + }, + { + "acc": 0.7464602, + "epoch": 1.5680177616963091, + "grad_norm": 8.875, + "learning_rate": 1.2221110424067867e-06, + "loss": 1.00144243, + "memory(GiB)": 302.58, + "step": 280380, + "train_speed(iter/s)": 0.123428 + }, + { + "acc": 0.73877487, + "epoch": 1.5681296111692884, + "grad_norm": 6.0625, + "learning_rate": 1.2215053707727025e-06, + "loss": 1.03874664, + "memory(GiB)": 302.58, + "step": 280400, + "train_speed(iter/s)": 0.123432 + }, + { + "acc": 0.73662138, + "epoch": 1.5682414606422677, + "grad_norm": 7.0, + "learning_rate": 1.2208998283751166e-06, + "loss": 1.03974934, + "memory(GiB)": 302.58, + "step": 280420, + "train_speed(iter/s)": 0.123436 + }, + { + "acc": 0.73642335, + "epoch": 1.568353310115247, + "grad_norm": 9.1875, + "learning_rate": 1.220294415234739e-06, + "loss": 1.06220675, + "memory(GiB)": 302.58, + "step": 280440, + "train_speed(iter/s)": 0.123441 + }, + { + "acc": 0.74541059, + "epoch": 1.5684651595882262, + "grad_norm": 6.65625, + "learning_rate": 1.219689131372278e-06, + "loss": 1.00128164, + "memory(GiB)": 302.58, + "step": 280460, + "train_speed(iter/s)": 0.123445 + }, + { + "acc": 0.7328259, + "epoch": 1.5685770090612055, + "grad_norm": 10.3125, + "learning_rate": 1.2190839768084356e-06, + "loss": 1.04086847, + "memory(GiB)": 302.58, + "step": 280480, + "train_speed(iter/s)": 0.123449 + }, + { + "acc": 0.75287957, + "epoch": 1.5686888585341847, + "grad_norm": 8.125, + "learning_rate": 1.2184789515639105e-06, + "loss": 0.95278215, + "memory(GiB)": 302.58, + "step": 280500, + "train_speed(iter/s)": 0.123454 + }, + { + "acc": 0.74674082, + "epoch": 1.568800708007164, + "grad_norm": 8.125, + "learning_rate": 1.217874055659396e-06, + "loss": 0.98756828, + "memory(GiB)": 302.58, + "step": 280520, + "train_speed(iter/s)": 0.123458 + }, + { + "acc": 0.74543219, + "epoch": 1.5689125574801432, + "grad_norm": 7.15625, + "learning_rate": 1.2172692891155813e-06, + "loss": 1.01282864, + "memory(GiB)": 302.58, + "step": 280540, + "train_speed(iter/s)": 0.123462 + }, + { + "acc": 0.74038196, + "epoch": 1.5690244069531225, + "grad_norm": 5.71875, + "learning_rate": 1.2166646519531517e-06, + "loss": 1.01257801, + "memory(GiB)": 302.58, + "step": 280560, + "train_speed(iter/s)": 0.123466 + }, + { + "acc": 0.75647635, + "epoch": 1.5691362564261018, + "grad_norm": 6.3125, + "learning_rate": 1.2160601441927865e-06, + "loss": 0.96361837, + "memory(GiB)": 302.58, + "step": 280580, + "train_speed(iter/s)": 0.12347 + }, + { + "acc": 0.76959219, + "epoch": 1.569248105899081, + "grad_norm": 8.5, + "learning_rate": 1.215455765855164e-06, + "loss": 0.89710894, + "memory(GiB)": 302.58, + "step": 280600, + "train_speed(iter/s)": 0.123474 + }, + { + "acc": 0.75543718, + "epoch": 1.5693599553720603, + "grad_norm": 6.5, + "learning_rate": 1.2148515169609547e-06, + "loss": 0.96237421, + "memory(GiB)": 302.58, + "step": 280620, + "train_speed(iter/s)": 0.123478 + }, + { + "acc": 0.74816499, + "epoch": 1.5694718048450396, + "grad_norm": 8.375, + "learning_rate": 1.214247397530826e-06, + "loss": 1.00394325, + "memory(GiB)": 302.58, + "step": 280640, + "train_speed(iter/s)": 0.123483 + }, + { + "acc": 0.73686719, + "epoch": 1.5695836543180188, + "grad_norm": 6.71875, + "learning_rate": 1.213643407585441e-06, + "loss": 1.03890209, + "memory(GiB)": 302.58, + "step": 280660, + "train_speed(iter/s)": 0.123487 + }, + { + "acc": 0.74443235, + "epoch": 1.569695503790998, + "grad_norm": 4.96875, + "learning_rate": 1.2130395471454576e-06, + "loss": 1.01042404, + "memory(GiB)": 302.58, + "step": 280680, + "train_speed(iter/s)": 0.123491 + }, + { + "acc": 0.77418814, + "epoch": 1.5698073532639774, + "grad_norm": 11.8125, + "learning_rate": 1.2124358162315292e-06, + "loss": 0.88539104, + "memory(GiB)": 302.58, + "step": 280700, + "train_speed(iter/s)": 0.123495 + }, + { + "acc": 0.75386238, + "epoch": 1.5699192027369566, + "grad_norm": 6.59375, + "learning_rate": 1.2118322148643064e-06, + "loss": 0.97064981, + "memory(GiB)": 302.58, + "step": 280720, + "train_speed(iter/s)": 0.123499 + }, + { + "acc": 0.75681467, + "epoch": 1.5700310522099359, + "grad_norm": 8.9375, + "learning_rate": 1.2112287430644337e-06, + "loss": 0.94556637, + "memory(GiB)": 302.58, + "step": 280740, + "train_speed(iter/s)": 0.123503 + }, + { + "acc": 0.76070714, + "epoch": 1.5701429016829151, + "grad_norm": 7.40625, + "learning_rate": 1.210625400852552e-06, + "loss": 0.93887873, + "memory(GiB)": 302.58, + "step": 280760, + "train_speed(iter/s)": 0.123507 + }, + { + "acc": 0.74845858, + "epoch": 1.5702547511558944, + "grad_norm": 9.0, + "learning_rate": 1.2100221882492973e-06, + "loss": 0.97909565, + "memory(GiB)": 302.58, + "step": 280780, + "train_speed(iter/s)": 0.123511 + }, + { + "acc": 0.75467706, + "epoch": 1.5703666006288737, + "grad_norm": 6.3125, + "learning_rate": 1.2094191052753013e-06, + "loss": 0.95721292, + "memory(GiB)": 302.58, + "step": 280800, + "train_speed(iter/s)": 0.123515 + }, + { + "acc": 0.75926394, + "epoch": 1.570478450101853, + "grad_norm": 7.125, + "learning_rate": 1.2088161519511915e-06, + "loss": 0.95480747, + "memory(GiB)": 302.58, + "step": 280820, + "train_speed(iter/s)": 0.12352 + }, + { + "acc": 0.74269629, + "epoch": 1.5705902995748322, + "grad_norm": 6.5, + "learning_rate": 1.208213328297591e-06, + "loss": 1.02832642, + "memory(GiB)": 302.58, + "step": 280840, + "train_speed(iter/s)": 0.123523 + }, + { + "acc": 0.74886222, + "epoch": 1.5707021490478115, + "grad_norm": 7.6875, + "learning_rate": 1.2076106343351169e-06, + "loss": 0.97759094, + "memory(GiB)": 302.58, + "step": 280860, + "train_speed(iter/s)": 0.123528 + }, + { + "acc": 0.74726048, + "epoch": 1.5708139985207907, + "grad_norm": 5.65625, + "learning_rate": 1.2070080700843855e-06, + "loss": 1.0047967, + "memory(GiB)": 302.58, + "step": 280880, + "train_speed(iter/s)": 0.123532 + }, + { + "acc": 0.73101134, + "epoch": 1.57092584799377, + "grad_norm": 8.875, + "learning_rate": 1.2064056355660052e-06, + "loss": 1.09690866, + "memory(GiB)": 302.58, + "step": 280900, + "train_speed(iter/s)": 0.123536 + }, + { + "acc": 0.75546951, + "epoch": 1.5710376974667493, + "grad_norm": 5.03125, + "learning_rate": 1.2058033308005801e-06, + "loss": 0.96872311, + "memory(GiB)": 302.58, + "step": 280920, + "train_speed(iter/s)": 0.12354 + }, + { + "acc": 0.74499574, + "epoch": 1.5711495469397285, + "grad_norm": 7.59375, + "learning_rate": 1.205201155808714e-06, + "loss": 1.01530495, + "memory(GiB)": 302.58, + "step": 280940, + "train_speed(iter/s)": 0.123544 + }, + { + "acc": 0.75256648, + "epoch": 1.5712613964127078, + "grad_norm": 6.15625, + "learning_rate": 1.2045991106110005e-06, + "loss": 0.97837124, + "memory(GiB)": 302.58, + "step": 280960, + "train_speed(iter/s)": 0.123548 + }, + { + "acc": 0.7603857, + "epoch": 1.571373245885687, + "grad_norm": 8.4375, + "learning_rate": 1.2039971952280333e-06, + "loss": 0.95590324, + "memory(GiB)": 302.58, + "step": 280980, + "train_speed(iter/s)": 0.123553 + }, + { + "acc": 0.75369492, + "epoch": 1.5714850953586663, + "grad_norm": 11.6875, + "learning_rate": 1.203395409680398e-06, + "loss": 0.97364893, + "memory(GiB)": 302.58, + "step": 281000, + "train_speed(iter/s)": 0.123557 + }, + { + "acc": 0.75019989, + "epoch": 1.5715969448316456, + "grad_norm": 7.96875, + "learning_rate": 1.2027937539886785e-06, + "loss": 0.99460115, + "memory(GiB)": 302.58, + "step": 281020, + "train_speed(iter/s)": 0.123561 + }, + { + "acc": 0.75132828, + "epoch": 1.5717087943046248, + "grad_norm": 5.625, + "learning_rate": 1.2021922281734538e-06, + "loss": 0.99034338, + "memory(GiB)": 302.58, + "step": 281040, + "train_speed(iter/s)": 0.123565 + }, + { + "acc": 0.72923965, + "epoch": 1.571820643777604, + "grad_norm": 7.875, + "learning_rate": 1.201590832255297e-06, + "loss": 1.06288233, + "memory(GiB)": 302.58, + "step": 281060, + "train_speed(iter/s)": 0.123569 + }, + { + "acc": 0.74695053, + "epoch": 1.5719324932505834, + "grad_norm": 10.9375, + "learning_rate": 1.2009895662547783e-06, + "loss": 1.00204401, + "memory(GiB)": 302.58, + "step": 281080, + "train_speed(iter/s)": 0.123573 + }, + { + "acc": 0.75837846, + "epoch": 1.5720443427235626, + "grad_norm": 7.15625, + "learning_rate": 1.2003884301924623e-06, + "loss": 0.94671211, + "memory(GiB)": 302.58, + "step": 281100, + "train_speed(iter/s)": 0.123578 + }, + { + "acc": 0.7617413, + "epoch": 1.572156192196542, + "grad_norm": 5.75, + "learning_rate": 1.1997874240889108e-06, + "loss": 0.948773, + "memory(GiB)": 302.58, + "step": 281120, + "train_speed(iter/s)": 0.123582 + }, + { + "acc": 0.75929589, + "epoch": 1.5722680416695212, + "grad_norm": 7.4375, + "learning_rate": 1.1991865479646798e-06, + "loss": 0.9384943, + "memory(GiB)": 302.58, + "step": 281140, + "train_speed(iter/s)": 0.123585 + }, + { + "acc": 0.74435649, + "epoch": 1.5723798911425004, + "grad_norm": 7.59375, + "learning_rate": 1.1985858018403201e-06, + "loss": 1.01038132, + "memory(GiB)": 302.58, + "step": 281160, + "train_speed(iter/s)": 0.12359 + }, + { + "acc": 0.74000959, + "epoch": 1.5724917406154797, + "grad_norm": 9.125, + "learning_rate": 1.1979851857363805e-06, + "loss": 1.01919298, + "memory(GiB)": 302.58, + "step": 281180, + "train_speed(iter/s)": 0.123594 + }, + { + "acc": 0.7661643, + "epoch": 1.572603590088459, + "grad_norm": 8.875, + "learning_rate": 1.1973846996734018e-06, + "loss": 0.92837877, + "memory(GiB)": 302.58, + "step": 281200, + "train_speed(iter/s)": 0.123598 + }, + { + "acc": 0.74838138, + "epoch": 1.5727154395614382, + "grad_norm": 6.71875, + "learning_rate": 1.1967843436719252e-06, + "loss": 0.98603678, + "memory(GiB)": 302.58, + "step": 281220, + "train_speed(iter/s)": 0.123602 + }, + { + "acc": 0.77070427, + "epoch": 1.5728272890344175, + "grad_norm": 5.8125, + "learning_rate": 1.1961841177524841e-06, + "loss": 0.90318356, + "memory(GiB)": 302.58, + "step": 281240, + "train_speed(iter/s)": 0.123606 + }, + { + "acc": 0.74149919, + "epoch": 1.5729391385073968, + "grad_norm": 7.21875, + "learning_rate": 1.195584021935608e-06, + "loss": 1.01940222, + "memory(GiB)": 302.58, + "step": 281260, + "train_speed(iter/s)": 0.123611 + }, + { + "acc": 0.74030948, + "epoch": 1.573050987980376, + "grad_norm": 6.4375, + "learning_rate": 1.1949840562418213e-06, + "loss": 1.03235922, + "memory(GiB)": 302.58, + "step": 281280, + "train_speed(iter/s)": 0.123615 + }, + { + "acc": 0.72985048, + "epoch": 1.5731628374533553, + "grad_norm": 7.75, + "learning_rate": 1.1943842206916457e-06, + "loss": 1.06051779, + "memory(GiB)": 302.58, + "step": 281300, + "train_speed(iter/s)": 0.123619 + }, + { + "acc": 0.75893598, + "epoch": 1.5732746869263345, + "grad_norm": 9.0, + "learning_rate": 1.1937845153055965e-06, + "loss": 0.94587221, + "memory(GiB)": 302.58, + "step": 281320, + "train_speed(iter/s)": 0.123623 + }, + { + "acc": 0.75066438, + "epoch": 1.5733865363993138, + "grad_norm": 9.4375, + "learning_rate": 1.1931849401041868e-06, + "loss": 0.98317719, + "memory(GiB)": 302.58, + "step": 281340, + "train_speed(iter/s)": 0.123627 + }, + { + "acc": 0.7504396, + "epoch": 1.573498385872293, + "grad_norm": 6.25, + "learning_rate": 1.1925854951079224e-06, + "loss": 0.98781967, + "memory(GiB)": 302.58, + "step": 281360, + "train_speed(iter/s)": 0.123632 + }, + { + "acc": 0.7521462, + "epoch": 1.5736102353452723, + "grad_norm": 6.78125, + "learning_rate": 1.1919861803373078e-06, + "loss": 0.96809502, + "memory(GiB)": 302.58, + "step": 281380, + "train_speed(iter/s)": 0.123635 + }, + { + "acc": 0.74259624, + "epoch": 1.5737220848182516, + "grad_norm": 5.40625, + "learning_rate": 1.1913869958128404e-06, + "loss": 1.01204309, + "memory(GiB)": 302.58, + "step": 281400, + "train_speed(iter/s)": 0.123639 + }, + { + "acc": 0.74554005, + "epoch": 1.5738339342912309, + "grad_norm": 9.0, + "learning_rate": 1.1907879415550144e-06, + "loss": 0.98247213, + "memory(GiB)": 302.58, + "step": 281420, + "train_speed(iter/s)": 0.123643 + }, + { + "acc": 0.74948516, + "epoch": 1.5739457837642101, + "grad_norm": 8.3125, + "learning_rate": 1.1901890175843195e-06, + "loss": 1.00510921, + "memory(GiB)": 302.58, + "step": 281440, + "train_speed(iter/s)": 0.123647 + }, + { + "acc": 0.74648619, + "epoch": 1.5740576332371894, + "grad_norm": 6.5625, + "learning_rate": 1.1895902239212392e-06, + "loss": 0.99683886, + "memory(GiB)": 302.58, + "step": 281460, + "train_speed(iter/s)": 0.123651 + }, + { + "acc": 0.76970406, + "epoch": 1.5741694827101687, + "grad_norm": 8.5625, + "learning_rate": 1.1889915605862578e-06, + "loss": 0.9005022, + "memory(GiB)": 302.58, + "step": 281480, + "train_speed(iter/s)": 0.123655 + }, + { + "acc": 0.73255205, + "epoch": 1.574281332183148, + "grad_norm": 12.875, + "learning_rate": 1.188393027599849e-06, + "loss": 1.0496707, + "memory(GiB)": 302.58, + "step": 281500, + "train_speed(iter/s)": 0.123659 + }, + { + "acc": 0.75721722, + "epoch": 1.5743931816561272, + "grad_norm": 8.0625, + "learning_rate": 1.187794624982485e-06, + "loss": 0.94274797, + "memory(GiB)": 302.58, + "step": 281520, + "train_speed(iter/s)": 0.123664 + }, + { + "acc": 0.76185246, + "epoch": 1.5745050311291064, + "grad_norm": 6.96875, + "learning_rate": 1.1871963527546326e-06, + "loss": 0.92674427, + "memory(GiB)": 302.58, + "step": 281540, + "train_speed(iter/s)": 0.123668 + }, + { + "acc": 0.76777759, + "epoch": 1.5746168806020857, + "grad_norm": 9.1875, + "learning_rate": 1.1865982109367558e-06, + "loss": 0.9118062, + "memory(GiB)": 302.58, + "step": 281560, + "train_speed(iter/s)": 0.123672 + }, + { + "acc": 0.74811954, + "epoch": 1.574728730075065, + "grad_norm": 8.5, + "learning_rate": 1.1860001995493114e-06, + "loss": 1.00428247, + "memory(GiB)": 302.58, + "step": 281580, + "train_speed(iter/s)": 0.123676 + }, + { + "acc": 0.74581981, + "epoch": 1.5748405795480442, + "grad_norm": 6.53125, + "learning_rate": 1.1854023186127545e-06, + "loss": 1.00272894, + "memory(GiB)": 302.58, + "step": 281600, + "train_speed(iter/s)": 0.12368 + }, + { + "acc": 0.75281005, + "epoch": 1.5749524290210235, + "grad_norm": 6.375, + "learning_rate": 1.184804568147534e-06, + "loss": 0.96668682, + "memory(GiB)": 302.58, + "step": 281620, + "train_speed(iter/s)": 0.123684 + }, + { + "acc": 0.73905396, + "epoch": 1.5750642784940028, + "grad_norm": 11.1875, + "learning_rate": 1.1842069481740947e-06, + "loss": 1.0411562, + "memory(GiB)": 302.58, + "step": 281640, + "train_speed(iter/s)": 0.123688 + }, + { + "acc": 0.73712201, + "epoch": 1.575176127966982, + "grad_norm": 7.53125, + "learning_rate": 1.1836094587128777e-06, + "loss": 1.02713795, + "memory(GiB)": 302.58, + "step": 281660, + "train_speed(iter/s)": 0.123692 + }, + { + "acc": 0.74452362, + "epoch": 1.5752879774399613, + "grad_norm": 8.375, + "learning_rate": 1.183012099784318e-06, + "loss": 1.04189873, + "memory(GiB)": 302.58, + "step": 281680, + "train_speed(iter/s)": 0.123697 + }, + { + "acc": 0.76722794, + "epoch": 1.5753998269129406, + "grad_norm": 7.34375, + "learning_rate": 1.1824148714088486e-06, + "loss": 0.91269741, + "memory(GiB)": 302.58, + "step": 281700, + "train_speed(iter/s)": 0.123701 + }, + { + "acc": 0.75134783, + "epoch": 1.5755116763859198, + "grad_norm": 5.9375, + "learning_rate": 1.1818177736068953e-06, + "loss": 0.98476887, + "memory(GiB)": 302.58, + "step": 281720, + "train_speed(iter/s)": 0.123705 + }, + { + "acc": 0.75139523, + "epoch": 1.575623525858899, + "grad_norm": 8.9375, + "learning_rate": 1.18122080639888e-06, + "loss": 0.95300598, + "memory(GiB)": 302.58, + "step": 281740, + "train_speed(iter/s)": 0.123709 + }, + { + "acc": 0.73581171, + "epoch": 1.5757353753318784, + "grad_norm": 8.3125, + "learning_rate": 1.1806239698052236e-06, + "loss": 1.04466486, + "memory(GiB)": 302.58, + "step": 281760, + "train_speed(iter/s)": 0.123713 + }, + { + "acc": 0.76452188, + "epoch": 1.5758472248048576, + "grad_norm": 7.28125, + "learning_rate": 1.1800272638463384e-06, + "loss": 0.91059875, + "memory(GiB)": 302.58, + "step": 281780, + "train_speed(iter/s)": 0.123717 + }, + { + "acc": 0.73822412, + "epoch": 1.5759590742778369, + "grad_norm": 5.875, + "learning_rate": 1.1794306885426337e-06, + "loss": 1.02876558, + "memory(GiB)": 302.58, + "step": 281800, + "train_speed(iter/s)": 0.123721 + }, + { + "acc": 0.75754051, + "epoch": 1.5760709237508161, + "grad_norm": 8.625, + "learning_rate": 1.178834243914514e-06, + "loss": 0.95648098, + "memory(GiB)": 302.58, + "step": 281820, + "train_speed(iter/s)": 0.123725 + }, + { + "acc": 0.73623443, + "epoch": 1.5761827732237954, + "grad_norm": 7.21875, + "learning_rate": 1.1782379299823793e-06, + "loss": 1.04741526, + "memory(GiB)": 302.58, + "step": 281840, + "train_speed(iter/s)": 0.123729 + }, + { + "acc": 0.75871034, + "epoch": 1.5762946226967747, + "grad_norm": 6.46875, + "learning_rate": 1.1776417467666262e-06, + "loss": 0.95052767, + "memory(GiB)": 302.58, + "step": 281860, + "train_speed(iter/s)": 0.123733 + }, + { + "acc": 0.75444231, + "epoch": 1.576406472169754, + "grad_norm": 6.65625, + "learning_rate": 1.1770456942876463e-06, + "loss": 0.9643383, + "memory(GiB)": 302.58, + "step": 281880, + "train_speed(iter/s)": 0.123737 + }, + { + "acc": 0.74647107, + "epoch": 1.5765183216427332, + "grad_norm": 9.75, + "learning_rate": 1.1764497725658252e-06, + "loss": 0.99396, + "memory(GiB)": 302.58, + "step": 281900, + "train_speed(iter/s)": 0.123741 + }, + { + "acc": 0.75681858, + "epoch": 1.5766301711157125, + "grad_norm": 8.9375, + "learning_rate": 1.1758539816215465e-06, + "loss": 0.95267487, + "memory(GiB)": 302.58, + "step": 281920, + "train_speed(iter/s)": 0.123745 + }, + { + "acc": 0.75299864, + "epoch": 1.5767420205886917, + "grad_norm": 8.0, + "learning_rate": 1.1752583214751873e-06, + "loss": 0.94049292, + "memory(GiB)": 302.58, + "step": 281940, + "train_speed(iter/s)": 0.12375 + }, + { + "acc": 0.74612207, + "epoch": 1.576853870061671, + "grad_norm": 9.6875, + "learning_rate": 1.1746627921471214e-06, + "loss": 0.96103811, + "memory(GiB)": 302.58, + "step": 281960, + "train_speed(iter/s)": 0.123754 + }, + { + "acc": 0.74644427, + "epoch": 1.5769657195346503, + "grad_norm": 6.09375, + "learning_rate": 1.174067393657718e-06, + "loss": 1.00367813, + "memory(GiB)": 302.58, + "step": 281980, + "train_speed(iter/s)": 0.123758 + }, + { + "acc": 0.75750604, + "epoch": 1.5770775690076295, + "grad_norm": 5.96875, + "learning_rate": 1.1734721260273418e-06, + "loss": 0.95579195, + "memory(GiB)": 302.58, + "step": 282000, + "train_speed(iter/s)": 0.123762 + }, + { + "epoch": 1.5770775690076295, + "eval_acc": 0.70692488717691, + "eval_loss": 1.0118335485458374, + "eval_runtime": 7532.3761, + "eval_samples_per_second": 9.995, + "eval_steps_per_second": 9.995, + "step": 282000 + }, + { + "acc": 0.74314651, + "epoch": 1.5771894184806088, + "grad_norm": 7.53125, + "learning_rate": 1.1728769892763503e-06, + "loss": 0.99097891, + "memory(GiB)": 302.58, + "step": 282020, + "train_speed(iter/s)": 0.123351 + }, + { + "acc": 0.73974805, + "epoch": 1.577301267953588, + "grad_norm": 6.84375, + "learning_rate": 1.1722819834251031e-06, + "loss": 1.02434492, + "memory(GiB)": 302.58, + "step": 282040, + "train_speed(iter/s)": 0.123355 + }, + { + "acc": 0.75166378, + "epoch": 1.5774131174265673, + "grad_norm": 7.78125, + "learning_rate": 1.1716871084939496e-06, + "loss": 0.99047251, + "memory(GiB)": 302.58, + "step": 282060, + "train_speed(iter/s)": 0.123359 + }, + { + "acc": 0.74531116, + "epoch": 1.5775249668995466, + "grad_norm": 8.125, + "learning_rate": 1.1710923645032358e-06, + "loss": 1.0017705, + "memory(GiB)": 302.58, + "step": 282080, + "train_speed(iter/s)": 0.123363 + }, + { + "acc": 0.74847651, + "epoch": 1.5776368163725258, + "grad_norm": 8.5625, + "learning_rate": 1.1704977514733045e-06, + "loss": 0.98901596, + "memory(GiB)": 302.58, + "step": 282100, + "train_speed(iter/s)": 0.123367 + }, + { + "acc": 0.73535948, + "epoch": 1.577748665845505, + "grad_norm": 6.5, + "learning_rate": 1.1699032694244932e-06, + "loss": 1.05591898, + "memory(GiB)": 302.58, + "step": 282120, + "train_speed(iter/s)": 0.123371 + }, + { + "acc": 0.73618059, + "epoch": 1.5778605153184844, + "grad_norm": 5.5, + "learning_rate": 1.169308918377135e-06, + "loss": 1.04704323, + "memory(GiB)": 302.58, + "step": 282140, + "train_speed(iter/s)": 0.123375 + }, + { + "acc": 0.73852339, + "epoch": 1.5779723647914636, + "grad_norm": 6.9375, + "learning_rate": 1.1687146983515569e-06, + "loss": 1.03306723, + "memory(GiB)": 302.58, + "step": 282160, + "train_speed(iter/s)": 0.12338 + }, + { + "acc": 0.74538808, + "epoch": 1.578084214264443, + "grad_norm": 6.90625, + "learning_rate": 1.1681206093680864e-06, + "loss": 1.01212273, + "memory(GiB)": 302.58, + "step": 282180, + "train_speed(iter/s)": 0.123384 + }, + { + "acc": 0.75062513, + "epoch": 1.5781960637374222, + "grad_norm": 6.4375, + "learning_rate": 1.1675266514470418e-06, + "loss": 0.9852519, + "memory(GiB)": 302.58, + "step": 282200, + "train_speed(iter/s)": 0.123388 + }, + { + "acc": 0.7545352, + "epoch": 1.5783079132104014, + "grad_norm": 6.25, + "learning_rate": 1.166932824608738e-06, + "loss": 0.97167873, + "memory(GiB)": 302.58, + "step": 282220, + "train_speed(iter/s)": 0.123392 + }, + { + "acc": 0.77158389, + "epoch": 1.5784197626833807, + "grad_norm": 7.59375, + "learning_rate": 1.166339128873486e-06, + "loss": 0.89831238, + "memory(GiB)": 302.58, + "step": 282240, + "train_speed(iter/s)": 0.123396 + }, + { + "acc": 0.75141706, + "epoch": 1.57853161215636, + "grad_norm": 6.84375, + "learning_rate": 1.1657455642615917e-06, + "loss": 0.99692888, + "memory(GiB)": 302.58, + "step": 282260, + "train_speed(iter/s)": 0.1234 + }, + { + "acc": 0.74453807, + "epoch": 1.5786434616293392, + "grad_norm": 6.1875, + "learning_rate": 1.1651521307933572e-06, + "loss": 1.02950506, + "memory(GiB)": 302.58, + "step": 282280, + "train_speed(iter/s)": 0.123404 + }, + { + "acc": 0.73853183, + "epoch": 1.5787553111023185, + "grad_norm": 6.40625, + "learning_rate": 1.1645588284890802e-06, + "loss": 1.03064156, + "memory(GiB)": 302.58, + "step": 282300, + "train_speed(iter/s)": 0.123408 + }, + { + "acc": 0.75817604, + "epoch": 1.5788671605752977, + "grad_norm": 6.875, + "learning_rate": 1.1639656573690527e-06, + "loss": 0.93006935, + "memory(GiB)": 302.58, + "step": 282320, + "train_speed(iter/s)": 0.123413 + }, + { + "acc": 0.75709238, + "epoch": 1.578979010048277, + "grad_norm": 7.3125, + "learning_rate": 1.1633726174535625e-06, + "loss": 0.94727459, + "memory(GiB)": 302.58, + "step": 282340, + "train_speed(iter/s)": 0.123417 + }, + { + "acc": 0.7439508, + "epoch": 1.5790908595212563, + "grad_norm": 8.25, + "learning_rate": 1.1627797087628957e-06, + "loss": 1.00965662, + "memory(GiB)": 302.58, + "step": 282360, + "train_speed(iter/s)": 0.123421 + }, + { + "acc": 0.7326838, + "epoch": 1.5792027089942355, + "grad_norm": 6.75, + "learning_rate": 1.16218693131733e-06, + "loss": 1.0551301, + "memory(GiB)": 302.58, + "step": 282380, + "train_speed(iter/s)": 0.123425 + }, + { + "acc": 0.74439831, + "epoch": 1.5793145584672148, + "grad_norm": 10.25, + "learning_rate": 1.161594285137141e-06, + "loss": 0.99140415, + "memory(GiB)": 302.58, + "step": 282400, + "train_speed(iter/s)": 0.123429 + }, + { + "acc": 0.74934535, + "epoch": 1.579426407940194, + "grad_norm": 6.1875, + "learning_rate": 1.1610017702425985e-06, + "loss": 0.97471752, + "memory(GiB)": 302.58, + "step": 282420, + "train_speed(iter/s)": 0.123433 + }, + { + "acc": 0.75261426, + "epoch": 1.5795382574131733, + "grad_norm": 7.03125, + "learning_rate": 1.1604093866539684e-06, + "loss": 0.98029652, + "memory(GiB)": 302.58, + "step": 282440, + "train_speed(iter/s)": 0.123437 + }, + { + "acc": 0.74170761, + "epoch": 1.5796501068861526, + "grad_norm": 9.0, + "learning_rate": 1.1598171343915126e-06, + "loss": 1.01333199, + "memory(GiB)": 302.58, + "step": 282460, + "train_speed(iter/s)": 0.123441 + }, + { + "acc": 0.75498052, + "epoch": 1.5797619563591319, + "grad_norm": 6.40625, + "learning_rate": 1.159225013475488e-06, + "loss": 0.95516052, + "memory(GiB)": 302.58, + "step": 282480, + "train_speed(iter/s)": 0.123446 + }, + { + "acc": 0.74693184, + "epoch": 1.5798738058321111, + "grad_norm": 7.78125, + "learning_rate": 1.158633023926146e-06, + "loss": 0.97514353, + "memory(GiB)": 302.58, + "step": 282500, + "train_speed(iter/s)": 0.12345 + }, + { + "acc": 0.74923654, + "epoch": 1.5799856553050904, + "grad_norm": 6.84375, + "learning_rate": 1.1580411657637358e-06, + "loss": 0.99094133, + "memory(GiB)": 302.58, + "step": 282520, + "train_speed(iter/s)": 0.123454 + }, + { + "acc": 0.75205879, + "epoch": 1.5800975047780697, + "grad_norm": 5.71875, + "learning_rate": 1.1574494390085e-06, + "loss": 0.98614626, + "memory(GiB)": 302.58, + "step": 282540, + "train_speed(iter/s)": 0.123458 + }, + { + "acc": 0.7463284, + "epoch": 1.580209354251049, + "grad_norm": 7.71875, + "learning_rate": 1.1568578436806777e-06, + "loss": 0.9920536, + "memory(GiB)": 302.58, + "step": 282560, + "train_speed(iter/s)": 0.123463 + }, + { + "acc": 0.74632325, + "epoch": 1.5803212037240282, + "grad_norm": 8.3125, + "learning_rate": 1.1562663798005037e-06, + "loss": 1.02132711, + "memory(GiB)": 302.58, + "step": 282580, + "train_speed(iter/s)": 0.123467 + }, + { + "acc": 0.74281678, + "epoch": 1.5804330531970074, + "grad_norm": 6.65625, + "learning_rate": 1.1556750473882073e-06, + "loss": 1.0200902, + "memory(GiB)": 302.58, + "step": 282600, + "train_speed(iter/s)": 0.123471 + }, + { + "acc": 0.75614858, + "epoch": 1.5805449026699867, + "grad_norm": 6.34375, + "learning_rate": 1.155083846464013e-06, + "loss": 0.94602728, + "memory(GiB)": 302.58, + "step": 282620, + "train_speed(iter/s)": 0.123475 + }, + { + "acc": 0.7324543, + "epoch": 1.580656752142966, + "grad_norm": 6.75, + "learning_rate": 1.1544927770481446e-06, + "loss": 1.06521149, + "memory(GiB)": 302.58, + "step": 282640, + "train_speed(iter/s)": 0.123479 + }, + { + "acc": 0.75265465, + "epoch": 1.5807686016159452, + "grad_norm": 5.40625, + "learning_rate": 1.153901839160817e-06, + "loss": 0.95509825, + "memory(GiB)": 302.58, + "step": 282660, + "train_speed(iter/s)": 0.123483 + }, + { + "acc": 0.73677812, + "epoch": 1.5808804510889245, + "grad_norm": 6.125, + "learning_rate": 1.1533110328222418e-06, + "loss": 1.04701223, + "memory(GiB)": 302.58, + "step": 282680, + "train_speed(iter/s)": 0.123487 + }, + { + "acc": 0.75489326, + "epoch": 1.5809923005619038, + "grad_norm": 5.0, + "learning_rate": 1.1527203580526276e-06, + "loss": 0.95028486, + "memory(GiB)": 302.58, + "step": 282700, + "train_speed(iter/s)": 0.123492 + }, + { + "acc": 0.74162245, + "epoch": 1.581104150034883, + "grad_norm": 9.1875, + "learning_rate": 1.1521298148721755e-06, + "loss": 1.02254286, + "memory(GiB)": 302.58, + "step": 282720, + "train_speed(iter/s)": 0.123496 + }, + { + "acc": 0.72534709, + "epoch": 1.5812159995078623, + "grad_norm": 9.25, + "learning_rate": 1.1515394033010857e-06, + "loss": 1.08714466, + "memory(GiB)": 302.58, + "step": 282740, + "train_speed(iter/s)": 0.1235 + }, + { + "acc": 0.73235798, + "epoch": 1.5813278489808416, + "grad_norm": 8.25, + "learning_rate": 1.150949123359551e-06, + "loss": 1.05584373, + "memory(GiB)": 302.58, + "step": 282760, + "train_speed(iter/s)": 0.123503 + }, + { + "acc": 0.74807701, + "epoch": 1.5814396984538208, + "grad_norm": 8.625, + "learning_rate": 1.1503589750677618e-06, + "loss": 1.00902824, + "memory(GiB)": 302.58, + "step": 282780, + "train_speed(iter/s)": 0.123508 + }, + { + "acc": 0.76742387, + "epoch": 1.5815515479268, + "grad_norm": 5.8125, + "learning_rate": 1.1497689584459021e-06, + "loss": 0.91914406, + "memory(GiB)": 302.58, + "step": 282800, + "train_speed(iter/s)": 0.123512 + }, + { + "acc": 0.76053052, + "epoch": 1.5816633973997793, + "grad_norm": 8.625, + "learning_rate": 1.1491790735141534e-06, + "loss": 0.93791265, + "memory(GiB)": 302.58, + "step": 282820, + "train_speed(iter/s)": 0.123515 + }, + { + "acc": 0.74553494, + "epoch": 1.5817752468727586, + "grad_norm": 7.625, + "learning_rate": 1.14858932029269e-06, + "loss": 1.00344, + "memory(GiB)": 302.58, + "step": 282840, + "train_speed(iter/s)": 0.123519 + }, + { + "acc": 0.75438719, + "epoch": 1.5818870963457379, + "grad_norm": 8.8125, + "learning_rate": 1.147999698801685e-06, + "loss": 0.96420612, + "memory(GiB)": 302.58, + "step": 282860, + "train_speed(iter/s)": 0.123523 + }, + { + "acc": 0.74502239, + "epoch": 1.5819989458187171, + "grad_norm": 6.5, + "learning_rate": 1.1474102090613033e-06, + "loss": 0.99553852, + "memory(GiB)": 302.58, + "step": 282880, + "train_speed(iter/s)": 0.123527 + }, + { + "acc": 0.74283628, + "epoch": 1.5821107952916964, + "grad_norm": 6.6875, + "learning_rate": 1.14682085109171e-06, + "loss": 1.00805874, + "memory(GiB)": 302.58, + "step": 282900, + "train_speed(iter/s)": 0.123531 + }, + { + "acc": 0.73800588, + "epoch": 1.5822226447646757, + "grad_norm": 8.1875, + "learning_rate": 1.1462316249130618e-06, + "loss": 1.05397577, + "memory(GiB)": 302.58, + "step": 282920, + "train_speed(iter/s)": 0.123536 + }, + { + "acc": 0.75887303, + "epoch": 1.582334494237655, + "grad_norm": 10.875, + "learning_rate": 1.1456425305455115e-06, + "loss": 0.93776484, + "memory(GiB)": 302.58, + "step": 282940, + "train_speed(iter/s)": 0.12354 + }, + { + "acc": 0.75768437, + "epoch": 1.5824463437106342, + "grad_norm": 6.8125, + "learning_rate": 1.145053568009209e-06, + "loss": 0.94509535, + "memory(GiB)": 302.58, + "step": 282960, + "train_speed(iter/s)": 0.123544 + }, + { + "acc": 0.75090981, + "epoch": 1.5825581931836135, + "grad_norm": 7.40625, + "learning_rate": 1.1444647373242979e-06, + "loss": 0.97787418, + "memory(GiB)": 302.58, + "step": 282980, + "train_speed(iter/s)": 0.123548 + }, + { + "acc": 0.75073133, + "epoch": 1.5826700426565927, + "grad_norm": 7.46875, + "learning_rate": 1.1438760385109187e-06, + "loss": 0.97384567, + "memory(GiB)": 302.58, + "step": 283000, + "train_speed(iter/s)": 0.123552 + }, + { + "acc": 0.75554376, + "epoch": 1.582781892129572, + "grad_norm": 5.46875, + "learning_rate": 1.1432874715892062e-06, + "loss": 0.97615128, + "memory(GiB)": 302.58, + "step": 283020, + "train_speed(iter/s)": 0.123556 + }, + { + "acc": 0.75291014, + "epoch": 1.5828937416025513, + "grad_norm": 6.71875, + "learning_rate": 1.142699036579291e-06, + "loss": 0.9591197, + "memory(GiB)": 302.58, + "step": 283040, + "train_speed(iter/s)": 0.12356 + }, + { + "acc": 0.75599198, + "epoch": 1.5830055910755305, + "grad_norm": 6.9375, + "learning_rate": 1.1421107335013005e-06, + "loss": 0.93631153, + "memory(GiB)": 302.58, + "step": 283060, + "train_speed(iter/s)": 0.123564 + }, + { + "acc": 0.74701991, + "epoch": 1.5831174405485098, + "grad_norm": 6.21875, + "learning_rate": 1.1415225623753562e-06, + "loss": 0.99732056, + "memory(GiB)": 302.58, + "step": 283080, + "train_speed(iter/s)": 0.123569 + }, + { + "acc": 0.74478741, + "epoch": 1.583229290021489, + "grad_norm": 6.3125, + "learning_rate": 1.1409345232215746e-06, + "loss": 0.98620234, + "memory(GiB)": 302.58, + "step": 283100, + "train_speed(iter/s)": 0.123573 + }, + { + "acc": 0.75234327, + "epoch": 1.5833411394944683, + "grad_norm": 9.0, + "learning_rate": 1.140346616060069e-06, + "loss": 0.97667837, + "memory(GiB)": 302.58, + "step": 283120, + "train_speed(iter/s)": 0.123577 + }, + { + "acc": 0.74619379, + "epoch": 1.5834529889674476, + "grad_norm": 6.0625, + "learning_rate": 1.1397588409109484e-06, + "loss": 1.00248041, + "memory(GiB)": 302.58, + "step": 283140, + "train_speed(iter/s)": 0.123581 + }, + { + "acc": 0.74612732, + "epoch": 1.5835648384404268, + "grad_norm": 8.75, + "learning_rate": 1.1391711977943137e-06, + "loss": 1.0069622, + "memory(GiB)": 302.58, + "step": 283160, + "train_speed(iter/s)": 0.123585 + }, + { + "acc": 0.74697022, + "epoch": 1.583676687913406, + "grad_norm": 8.75, + "learning_rate": 1.1385836867302685e-06, + "loss": 1.01020775, + "memory(GiB)": 302.58, + "step": 283180, + "train_speed(iter/s)": 0.123589 + }, + { + "acc": 0.75072532, + "epoch": 1.5837885373863854, + "grad_norm": 6.96875, + "learning_rate": 1.137996307738905e-06, + "loss": 0.97823782, + "memory(GiB)": 302.58, + "step": 283200, + "train_speed(iter/s)": 0.123593 + }, + { + "acc": 0.73679066, + "epoch": 1.5839003868593646, + "grad_norm": 7.78125, + "learning_rate": 1.1374090608403142e-06, + "loss": 1.06702633, + "memory(GiB)": 302.58, + "step": 283220, + "train_speed(iter/s)": 0.123597 + }, + { + "acc": 0.7525054, + "epoch": 1.584012236332344, + "grad_norm": 8.375, + "learning_rate": 1.1368219460545816e-06, + "loss": 0.96930933, + "memory(GiB)": 302.58, + "step": 283240, + "train_speed(iter/s)": 0.123601 + }, + { + "acc": 0.7563714, + "epoch": 1.5841240858053232, + "grad_norm": 8.1875, + "learning_rate": 1.1362349634017877e-06, + "loss": 0.9687582, + "memory(GiB)": 302.58, + "step": 283260, + "train_speed(iter/s)": 0.123605 + }, + { + "acc": 0.74862313, + "epoch": 1.5842359352783024, + "grad_norm": 8.75, + "learning_rate": 1.1356481129020103e-06, + "loss": 0.99615993, + "memory(GiB)": 302.58, + "step": 283280, + "train_speed(iter/s)": 0.123609 + }, + { + "acc": 0.76421871, + "epoch": 1.5843477847512817, + "grad_norm": 4.84375, + "learning_rate": 1.1350613945753203e-06, + "loss": 0.93346186, + "memory(GiB)": 302.58, + "step": 283300, + "train_speed(iter/s)": 0.123613 + }, + { + "acc": 0.74983592, + "epoch": 1.584459634224261, + "grad_norm": 6.96875, + "learning_rate": 1.1344748084417861e-06, + "loss": 1.00122337, + "memory(GiB)": 302.58, + "step": 283320, + "train_speed(iter/s)": 0.123617 + }, + { + "acc": 0.75072436, + "epoch": 1.5845714836972402, + "grad_norm": 5.8125, + "learning_rate": 1.1338883545214713e-06, + "loss": 0.98300018, + "memory(GiB)": 302.58, + "step": 283340, + "train_speed(iter/s)": 0.123622 + }, + { + "acc": 0.75366974, + "epoch": 1.5846833331702195, + "grad_norm": 11.0625, + "learning_rate": 1.1333020328344335e-06, + "loss": 0.96228437, + "memory(GiB)": 302.58, + "step": 283360, + "train_speed(iter/s)": 0.123626 + }, + { + "acc": 0.74859662, + "epoch": 1.5847951826431987, + "grad_norm": 8.6875, + "learning_rate": 1.1327158434007268e-06, + "loss": 0.9786335, + "memory(GiB)": 302.58, + "step": 283380, + "train_speed(iter/s)": 0.12363 + }, + { + "acc": 0.74405169, + "epoch": 1.584907032116178, + "grad_norm": 5.90625, + "learning_rate": 1.132129786240402e-06, + "loss": 1.01082678, + "memory(GiB)": 302.58, + "step": 283400, + "train_speed(iter/s)": 0.123634 + }, + { + "acc": 0.72723713, + "epoch": 1.5850188815891573, + "grad_norm": 8.0, + "learning_rate": 1.1315438613735007e-06, + "loss": 1.07952223, + "memory(GiB)": 302.58, + "step": 283420, + "train_speed(iter/s)": 0.123638 + }, + { + "acc": 0.74687762, + "epoch": 1.5851307310621365, + "grad_norm": 4.78125, + "learning_rate": 1.1309580688200683e-06, + "loss": 0.9959754, + "memory(GiB)": 302.58, + "step": 283440, + "train_speed(iter/s)": 0.123642 + }, + { + "acc": 0.75889649, + "epoch": 1.5852425805351158, + "grad_norm": 6.90625, + "learning_rate": 1.1303724086001377e-06, + "loss": 0.956954, + "memory(GiB)": 302.58, + "step": 283460, + "train_speed(iter/s)": 0.123647 + }, + { + "acc": 0.74440022, + "epoch": 1.585354430008095, + "grad_norm": 7.09375, + "learning_rate": 1.1297868807337408e-06, + "loss": 0.99888039, + "memory(GiB)": 302.58, + "step": 283480, + "train_speed(iter/s)": 0.123651 + }, + { + "acc": 0.74855266, + "epoch": 1.5854662794810743, + "grad_norm": 7.84375, + "learning_rate": 1.129201485240904e-06, + "loss": 0.99070215, + "memory(GiB)": 302.58, + "step": 283500, + "train_speed(iter/s)": 0.123655 + }, + { + "acc": 0.74235721, + "epoch": 1.5855781289540536, + "grad_norm": 9.5625, + "learning_rate": 1.1286162221416513e-06, + "loss": 1.03795824, + "memory(GiB)": 302.58, + "step": 283520, + "train_speed(iter/s)": 0.123659 + }, + { + "acc": 0.73231835, + "epoch": 1.5856899784270329, + "grad_norm": 6.34375, + "learning_rate": 1.128031091456e-06, + "loss": 1.07089396, + "memory(GiB)": 302.58, + "step": 283540, + "train_speed(iter/s)": 0.123663 + }, + { + "acc": 0.75932055, + "epoch": 1.5858018279000121, + "grad_norm": 9.3125, + "learning_rate": 1.1274460932039628e-06, + "loss": 0.94469776, + "memory(GiB)": 302.58, + "step": 283560, + "train_speed(iter/s)": 0.123667 + }, + { + "acc": 0.73442569, + "epoch": 1.5859136773729914, + "grad_norm": 7.59375, + "learning_rate": 1.1268612274055484e-06, + "loss": 1.03843527, + "memory(GiB)": 302.58, + "step": 283580, + "train_speed(iter/s)": 0.123671 + }, + { + "acc": 0.73588243, + "epoch": 1.5860255268459706, + "grad_norm": 6.0, + "learning_rate": 1.1262764940807618e-06, + "loss": 1.06249819, + "memory(GiB)": 302.58, + "step": 283600, + "train_speed(iter/s)": 0.123676 + }, + { + "acc": 0.74287171, + "epoch": 1.58613737631895, + "grad_norm": 6.625, + "learning_rate": 1.1256918932496024e-06, + "loss": 1.02876253, + "memory(GiB)": 302.58, + "step": 283620, + "train_speed(iter/s)": 0.12368 + }, + { + "acc": 0.73830867, + "epoch": 1.5862492257919292, + "grad_norm": 6.375, + "learning_rate": 1.1251074249320654e-06, + "loss": 1.01736059, + "memory(GiB)": 302.58, + "step": 283640, + "train_speed(iter/s)": 0.123684 + }, + { + "acc": 0.74213638, + "epoch": 1.5863610752649084, + "grad_norm": 6.34375, + "learning_rate": 1.1245230891481412e-06, + "loss": 1.02993517, + "memory(GiB)": 302.58, + "step": 283660, + "train_speed(iter/s)": 0.123688 + }, + { + "acc": 0.74415288, + "epoch": 1.5864729247378877, + "grad_norm": 9.125, + "learning_rate": 1.1239388859178164e-06, + "loss": 1.03281736, + "memory(GiB)": 302.58, + "step": 283680, + "train_speed(iter/s)": 0.123692 + }, + { + "acc": 0.74023309, + "epoch": 1.586584774210867, + "grad_norm": 8.125, + "learning_rate": 1.1233548152610719e-06, + "loss": 1.02834129, + "memory(GiB)": 302.58, + "step": 283700, + "train_speed(iter/s)": 0.123696 + }, + { + "acc": 0.75716171, + "epoch": 1.5866966236838462, + "grad_norm": 7.9375, + "learning_rate": 1.1227708771978857e-06, + "loss": 0.95961094, + "memory(GiB)": 302.58, + "step": 283720, + "train_speed(iter/s)": 0.1237 + }, + { + "acc": 0.75263534, + "epoch": 1.5868084731568255, + "grad_norm": 6.875, + "learning_rate": 1.1221870717482291e-06, + "loss": 0.96582947, + "memory(GiB)": 302.58, + "step": 283740, + "train_speed(iter/s)": 0.123704 + }, + { + "acc": 0.75295162, + "epoch": 1.5869203226298048, + "grad_norm": 8.75, + "learning_rate": 1.1216033989320702e-06, + "loss": 0.96309052, + "memory(GiB)": 302.58, + "step": 283760, + "train_speed(iter/s)": 0.123708 + }, + { + "acc": 0.77209768, + "epoch": 1.587032172102784, + "grad_norm": 5.78125, + "learning_rate": 1.1210198587693744e-06, + "loss": 0.87238293, + "memory(GiB)": 302.58, + "step": 283780, + "train_speed(iter/s)": 0.123712 + }, + { + "acc": 0.73944111, + "epoch": 1.5871440215757633, + "grad_norm": 6.0625, + "learning_rate": 1.120436451280099e-06, + "loss": 1.05533886, + "memory(GiB)": 302.58, + "step": 283800, + "train_speed(iter/s)": 0.123716 + }, + { + "acc": 0.74872794, + "epoch": 1.5872558710487426, + "grad_norm": 7.375, + "learning_rate": 1.1198531764841997e-06, + "loss": 0.99480963, + "memory(GiB)": 302.58, + "step": 283820, + "train_speed(iter/s)": 0.12372 + }, + { + "acc": 0.76597762, + "epoch": 1.5873677205217218, + "grad_norm": 7.5, + "learning_rate": 1.1192700344016244e-06, + "loss": 0.91514883, + "memory(GiB)": 302.58, + "step": 283840, + "train_speed(iter/s)": 0.123724 + }, + { + "acc": 0.75540543, + "epoch": 1.587479569994701, + "grad_norm": 6.3125, + "learning_rate": 1.1186870250523196e-06, + "loss": 0.94913559, + "memory(GiB)": 302.58, + "step": 283860, + "train_speed(iter/s)": 0.123729 + }, + { + "acc": 0.75679803, + "epoch": 1.5875914194676803, + "grad_norm": 6.59375, + "learning_rate": 1.118104148456226e-06, + "loss": 0.93358707, + "memory(GiB)": 302.58, + "step": 283880, + "train_speed(iter/s)": 0.123733 + }, + { + "acc": 0.76081033, + "epoch": 1.5877032689406596, + "grad_norm": 11.375, + "learning_rate": 1.1175214046332793e-06, + "loss": 0.92928963, + "memory(GiB)": 302.58, + "step": 283900, + "train_speed(iter/s)": 0.123737 + }, + { + "acc": 0.74490123, + "epoch": 1.5878151184136389, + "grad_norm": 7.9375, + "learning_rate": 1.1169387936034121e-06, + "loss": 0.98324251, + "memory(GiB)": 302.58, + "step": 283920, + "train_speed(iter/s)": 0.123741 + }, + { + "acc": 0.73757844, + "epoch": 1.5879269678866181, + "grad_norm": 4.71875, + "learning_rate": 1.116356315386551e-06, + "loss": 1.03441114, + "memory(GiB)": 302.58, + "step": 283940, + "train_speed(iter/s)": 0.123745 + }, + { + "acc": 0.74905992, + "epoch": 1.5880388173595974, + "grad_norm": 9.6875, + "learning_rate": 1.1157739700026182e-06, + "loss": 0.99106493, + "memory(GiB)": 302.58, + "step": 283960, + "train_speed(iter/s)": 0.123749 + }, + { + "acc": 0.75373106, + "epoch": 1.5881506668325767, + "grad_norm": 10.4375, + "learning_rate": 1.1151917574715326e-06, + "loss": 0.99232473, + "memory(GiB)": 302.58, + "step": 283980, + "train_speed(iter/s)": 0.123753 + }, + { + "acc": 0.7508008, + "epoch": 1.588262516305556, + "grad_norm": 7.4375, + "learning_rate": 1.114609677813207e-06, + "loss": 0.96296072, + "memory(GiB)": 302.58, + "step": 284000, + "train_speed(iter/s)": 0.123757 + }, + { + "epoch": 1.588262516305556, + "eval_acc": 0.7069172461750377, + "eval_loss": 1.0118013620376587, + "eval_runtime": 7582.6861, + "eval_samples_per_second": 9.928, + "eval_steps_per_second": 9.928, + "step": 284000 + }, + { + "acc": 0.77619562, + "epoch": 1.5883743657785354, + "grad_norm": 5.75, + "learning_rate": 1.1140277310475505e-06, + "loss": 0.85980148, + "memory(GiB)": 302.58, + "step": 284020, + "train_speed(iter/s)": 0.123347 + }, + { + "acc": 0.76448512, + "epoch": 1.5884862152515145, + "grad_norm": 9.375, + "learning_rate": 1.1134459171944667e-06, + "loss": 0.89550753, + "memory(GiB)": 302.58, + "step": 284040, + "train_speed(iter/s)": 0.123351 + }, + { + "acc": 0.76152873, + "epoch": 1.588598064724494, + "grad_norm": 6.125, + "learning_rate": 1.1128642362738579e-06, + "loss": 0.91854916, + "memory(GiB)": 302.58, + "step": 284060, + "train_speed(iter/s)": 0.123355 + }, + { + "acc": 0.7493547, + "epoch": 1.588709914197473, + "grad_norm": 7.25, + "learning_rate": 1.1122826883056175e-06, + "loss": 0.9852006, + "memory(GiB)": 302.58, + "step": 284080, + "train_speed(iter/s)": 0.123358 + }, + { + "acc": 0.74153275, + "epoch": 1.5888217636704525, + "grad_norm": 6.65625, + "learning_rate": 1.111701273309637e-06, + "loss": 1.03247147, + "memory(GiB)": 302.58, + "step": 284100, + "train_speed(iter/s)": 0.123363 + }, + { + "acc": 0.76038675, + "epoch": 1.5889336131434315, + "grad_norm": 6.0625, + "learning_rate": 1.1111199913058024e-06, + "loss": 0.93768167, + "memory(GiB)": 302.58, + "step": 284120, + "train_speed(iter/s)": 0.123367 + }, + { + "acc": 0.75103083, + "epoch": 1.589045462616411, + "grad_norm": 5.1875, + "learning_rate": 1.1105388423139956e-06, + "loss": 0.96251736, + "memory(GiB)": 302.58, + "step": 284140, + "train_speed(iter/s)": 0.123371 + }, + { + "acc": 0.75223656, + "epoch": 1.58915731208939, + "grad_norm": 8.125, + "learning_rate": 1.1099578263540933e-06, + "loss": 0.99429579, + "memory(GiB)": 302.58, + "step": 284160, + "train_speed(iter/s)": 0.123375 + }, + { + "acc": 0.75359674, + "epoch": 1.5892691615623695, + "grad_norm": 6.21875, + "learning_rate": 1.1093769434459684e-06, + "loss": 0.97603178, + "memory(GiB)": 302.58, + "step": 284180, + "train_speed(iter/s)": 0.123379 + }, + { + "acc": 0.75778332, + "epoch": 1.5893810110353486, + "grad_norm": 6.625, + "learning_rate": 1.1087961936094888e-06, + "loss": 0.95492058, + "memory(GiB)": 302.58, + "step": 284200, + "train_speed(iter/s)": 0.123383 + }, + { + "acc": 0.75253839, + "epoch": 1.589492860508328, + "grad_norm": 8.5625, + "learning_rate": 1.108215576864518e-06, + "loss": 0.96149044, + "memory(GiB)": 302.58, + "step": 284220, + "train_speed(iter/s)": 0.123387 + }, + { + "acc": 0.74938583, + "epoch": 1.589604709981307, + "grad_norm": 7.46875, + "learning_rate": 1.1076350932309155e-06, + "loss": 0.98413506, + "memory(GiB)": 302.58, + "step": 284240, + "train_speed(iter/s)": 0.123391 + }, + { + "acc": 0.73977156, + "epoch": 1.5897165594542866, + "grad_norm": 4.9375, + "learning_rate": 1.107054742728535e-06, + "loss": 1.01949348, + "memory(GiB)": 302.58, + "step": 284260, + "train_speed(iter/s)": 0.123395 + }, + { + "acc": 0.7447113, + "epoch": 1.5898284089272656, + "grad_norm": 5.34375, + "learning_rate": 1.1064745253772258e-06, + "loss": 1.00320845, + "memory(GiB)": 302.58, + "step": 284280, + "train_speed(iter/s)": 0.123399 + }, + { + "acc": 0.7495831, + "epoch": 1.589940258400245, + "grad_norm": 6.78125, + "learning_rate": 1.1058944411968347e-06, + "loss": 0.98453512, + "memory(GiB)": 302.58, + "step": 284300, + "train_speed(iter/s)": 0.123403 + }, + { + "acc": 0.75263176, + "epoch": 1.5900521078732242, + "grad_norm": 5.375, + "learning_rate": 1.1053144902072e-06, + "loss": 0.96191025, + "memory(GiB)": 302.58, + "step": 284320, + "train_speed(iter/s)": 0.123407 + }, + { + "acc": 0.73172545, + "epoch": 1.5901639573462036, + "grad_norm": 8.5625, + "learning_rate": 1.1047346724281605e-06, + "loss": 1.07133722, + "memory(GiB)": 302.58, + "step": 284340, + "train_speed(iter/s)": 0.123411 + }, + { + "acc": 0.75721126, + "epoch": 1.5902758068191827, + "grad_norm": 7.6875, + "learning_rate": 1.1041549878795466e-06, + "loss": 0.96341772, + "memory(GiB)": 302.58, + "step": 284360, + "train_speed(iter/s)": 0.123415 + }, + { + "acc": 0.75091829, + "epoch": 1.5903876562921622, + "grad_norm": 9.25, + "learning_rate": 1.1035754365811863e-06, + "loss": 0.97736349, + "memory(GiB)": 302.58, + "step": 284380, + "train_speed(iter/s)": 0.123419 + }, + { + "acc": 0.76803713, + "epoch": 1.5904995057651412, + "grad_norm": 8.1875, + "learning_rate": 1.1029960185529004e-06, + "loss": 0.89925251, + "memory(GiB)": 302.58, + "step": 284400, + "train_speed(iter/s)": 0.123423 + }, + { + "acc": 0.73386693, + "epoch": 1.5906113552381207, + "grad_norm": 10.8125, + "learning_rate": 1.102416733814508e-06, + "loss": 1.06306171, + "memory(GiB)": 302.58, + "step": 284420, + "train_speed(iter/s)": 0.123428 + }, + { + "acc": 0.75450664, + "epoch": 1.5907232047110997, + "grad_norm": 5.0, + "learning_rate": 1.101837582385823e-06, + "loss": 0.96867952, + "memory(GiB)": 302.58, + "step": 284440, + "train_speed(iter/s)": 0.123432 + }, + { + "acc": 0.74252481, + "epoch": 1.5908350541840792, + "grad_norm": 8.375, + "learning_rate": 1.1012585642866524e-06, + "loss": 1.0102026, + "memory(GiB)": 302.58, + "step": 284460, + "train_speed(iter/s)": 0.123436 + }, + { + "acc": 0.75778332, + "epoch": 1.5909469036570583, + "grad_norm": 7.4375, + "learning_rate": 1.1006796795368018e-06, + "loss": 0.95595169, + "memory(GiB)": 302.58, + "step": 284480, + "train_speed(iter/s)": 0.12344 + }, + { + "acc": 0.75466709, + "epoch": 1.5910587531300378, + "grad_norm": 7.46875, + "learning_rate": 1.100100928156071e-06, + "loss": 0.9714139, + "memory(GiB)": 302.58, + "step": 284500, + "train_speed(iter/s)": 0.123444 + }, + { + "acc": 0.73318224, + "epoch": 1.5911706026030168, + "grad_norm": 6.5, + "learning_rate": 1.0995223101642544e-06, + "loss": 1.06544428, + "memory(GiB)": 302.58, + "step": 284520, + "train_speed(iter/s)": 0.123447 + }, + { + "acc": 0.76350961, + "epoch": 1.5912824520759963, + "grad_norm": 5.90625, + "learning_rate": 1.0989438255811423e-06, + "loss": 0.92608433, + "memory(GiB)": 302.58, + "step": 284540, + "train_speed(iter/s)": 0.123452 + }, + { + "acc": 0.73812413, + "epoch": 1.5913943015489753, + "grad_norm": 4.8125, + "learning_rate": 1.098365474426522e-06, + "loss": 1.05534458, + "memory(GiB)": 302.58, + "step": 284560, + "train_speed(iter/s)": 0.123455 + }, + { + "acc": 0.75869908, + "epoch": 1.5915061510219548, + "grad_norm": 5.53125, + "learning_rate": 1.0977872567201724e-06, + "loss": 0.93229265, + "memory(GiB)": 302.58, + "step": 284580, + "train_speed(iter/s)": 0.12346 + }, + { + "acc": 0.74718032, + "epoch": 1.5916180004949338, + "grad_norm": 8.4375, + "learning_rate": 1.097209172481874e-06, + "loss": 0.98783789, + "memory(GiB)": 302.58, + "step": 284600, + "train_speed(iter/s)": 0.123464 + }, + { + "acc": 0.75660577, + "epoch": 1.5917298499679133, + "grad_norm": 8.5, + "learning_rate": 1.0966312217313974e-06, + "loss": 0.97049618, + "memory(GiB)": 302.58, + "step": 284620, + "train_speed(iter/s)": 0.123468 + }, + { + "acc": 0.74981375, + "epoch": 1.5918416994408924, + "grad_norm": 5.09375, + "learning_rate": 1.0960534044885097e-06, + "loss": 0.96583233, + "memory(GiB)": 302.58, + "step": 284640, + "train_speed(iter/s)": 0.123472 + }, + { + "acc": 0.77485037, + "epoch": 1.5919535489138719, + "grad_norm": 7.5, + "learning_rate": 1.0954757207729738e-06, + "loss": 0.8667737, + "memory(GiB)": 302.58, + "step": 284660, + "train_speed(iter/s)": 0.123476 + }, + { + "acc": 0.74199433, + "epoch": 1.592065398386851, + "grad_norm": 7.15625, + "learning_rate": 1.0948981706045503e-06, + "loss": 1.04886093, + "memory(GiB)": 302.58, + "step": 284680, + "train_speed(iter/s)": 0.12348 + }, + { + "acc": 0.76800823, + "epoch": 1.5921772478598304, + "grad_norm": 4.15625, + "learning_rate": 1.094320754002992e-06, + "loss": 0.90199471, + "memory(GiB)": 302.58, + "step": 284700, + "train_speed(iter/s)": 0.123484 + }, + { + "acc": 0.74892607, + "epoch": 1.5922890973328094, + "grad_norm": 4.5625, + "learning_rate": 1.0937434709880484e-06, + "loss": 0.97891674, + "memory(GiB)": 302.58, + "step": 284720, + "train_speed(iter/s)": 0.123489 + }, + { + "acc": 0.74545484, + "epoch": 1.592400946805789, + "grad_norm": 6.46875, + "learning_rate": 1.0931663215794646e-06, + "loss": 0.98697577, + "memory(GiB)": 302.58, + "step": 284740, + "train_speed(iter/s)": 0.123493 + }, + { + "acc": 0.73764858, + "epoch": 1.592512796278768, + "grad_norm": 9.75, + "learning_rate": 1.0925893057969811e-06, + "loss": 1.03703775, + "memory(GiB)": 302.58, + "step": 284760, + "train_speed(iter/s)": 0.123497 + }, + { + "acc": 0.77285004, + "epoch": 1.5926246457517474, + "grad_norm": 8.625, + "learning_rate": 1.0920124236603335e-06, + "loss": 0.86679897, + "memory(GiB)": 302.58, + "step": 284780, + "train_speed(iter/s)": 0.123501 + }, + { + "acc": 0.76174173, + "epoch": 1.5927364952247265, + "grad_norm": 6.46875, + "learning_rate": 1.0914356751892523e-06, + "loss": 0.92882595, + "memory(GiB)": 302.58, + "step": 284800, + "train_speed(iter/s)": 0.123505 + }, + { + "acc": 0.75433154, + "epoch": 1.592848344697706, + "grad_norm": 6.6875, + "learning_rate": 1.0908590604034652e-06, + "loss": 0.95908146, + "memory(GiB)": 302.58, + "step": 284820, + "train_speed(iter/s)": 0.123509 + }, + { + "acc": 0.77094774, + "epoch": 1.592960194170685, + "grad_norm": 6.53125, + "learning_rate": 1.0902825793226935e-06, + "loss": 0.88133154, + "memory(GiB)": 302.58, + "step": 284840, + "train_speed(iter/s)": 0.123513 + }, + { + "acc": 0.75744376, + "epoch": 1.5930720436436645, + "grad_norm": 5.1875, + "learning_rate": 1.0897062319666551e-06, + "loss": 0.96310844, + "memory(GiB)": 302.58, + "step": 284860, + "train_speed(iter/s)": 0.123517 + }, + { + "acc": 0.75645328, + "epoch": 1.5931838931166435, + "grad_norm": 8.3125, + "learning_rate": 1.089130018355063e-06, + "loss": 0.97217951, + "memory(GiB)": 302.58, + "step": 284880, + "train_speed(iter/s)": 0.123521 + }, + { + "acc": 0.75028367, + "epoch": 1.593295742589623, + "grad_norm": 5.625, + "learning_rate": 1.088553938507625e-06, + "loss": 0.97052097, + "memory(GiB)": 302.58, + "step": 284900, + "train_speed(iter/s)": 0.123525 + }, + { + "acc": 0.7436727, + "epoch": 1.593407592062602, + "grad_norm": 7.40625, + "learning_rate": 1.0879779924440437e-06, + "loss": 1.01710091, + "memory(GiB)": 302.58, + "step": 284920, + "train_speed(iter/s)": 0.123529 + }, + { + "acc": 0.74067764, + "epoch": 1.5935194415355816, + "grad_norm": 9.3125, + "learning_rate": 1.0874021801840217e-06, + "loss": 1.01045103, + "memory(GiB)": 302.58, + "step": 284940, + "train_speed(iter/s)": 0.123533 + }, + { + "acc": 0.7621707, + "epoch": 1.5936312910085606, + "grad_norm": 6.625, + "learning_rate": 1.0868265017472512e-06, + "loss": 0.91744366, + "memory(GiB)": 302.58, + "step": 284960, + "train_speed(iter/s)": 0.123537 + }, + { + "acc": 0.75133405, + "epoch": 1.59374314048154, + "grad_norm": 6.75, + "learning_rate": 1.0862509571534224e-06, + "loss": 0.97978277, + "memory(GiB)": 302.58, + "step": 284980, + "train_speed(iter/s)": 0.123541 + }, + { + "acc": 0.75014319, + "epoch": 1.5938549899545191, + "grad_norm": 6.09375, + "learning_rate": 1.0856755464222218e-06, + "loss": 0.99138479, + "memory(GiB)": 302.58, + "step": 285000, + "train_speed(iter/s)": 0.123546 + }, + { + "acc": 0.73594351, + "epoch": 1.5939668394274986, + "grad_norm": 9.1875, + "learning_rate": 1.0851002695733287e-06, + "loss": 1.04735327, + "memory(GiB)": 302.58, + "step": 285020, + "train_speed(iter/s)": 0.12355 + }, + { + "acc": 0.76047058, + "epoch": 1.5940786889004777, + "grad_norm": 7.28125, + "learning_rate": 1.0845251266264207e-06, + "loss": 0.94716616, + "memory(GiB)": 302.58, + "step": 285040, + "train_speed(iter/s)": 0.123554 + }, + { + "acc": 0.75692, + "epoch": 1.5941905383734571, + "grad_norm": 5.71875, + "learning_rate": 1.083950117601169e-06, + "loss": 0.9574461, + "memory(GiB)": 302.58, + "step": 285060, + "train_speed(iter/s)": 0.123558 + }, + { + "acc": 0.74231296, + "epoch": 1.5943023878464362, + "grad_norm": 4.84375, + "learning_rate": 1.0833752425172405e-06, + "loss": 0.99981699, + "memory(GiB)": 302.58, + "step": 285080, + "train_speed(iter/s)": 0.123562 + }, + { + "acc": 0.7432807, + "epoch": 1.5944142373194157, + "grad_norm": 7.09375, + "learning_rate": 1.0828005013942977e-06, + "loss": 1.00848646, + "memory(GiB)": 302.58, + "step": 285100, + "train_speed(iter/s)": 0.123566 + }, + { + "acc": 0.73588514, + "epoch": 1.5945260867923947, + "grad_norm": 6.75, + "learning_rate": 1.0822258942519998e-06, + "loss": 1.0544735, + "memory(GiB)": 302.58, + "step": 285120, + "train_speed(iter/s)": 0.12357 + }, + { + "acc": 0.7314116, + "epoch": 1.5946379362653742, + "grad_norm": 4.375, + "learning_rate": 1.0816514211099983e-06, + "loss": 1.07133417, + "memory(GiB)": 302.58, + "step": 285140, + "train_speed(iter/s)": 0.123574 + }, + { + "acc": 0.76001801, + "epoch": 1.5947497857383532, + "grad_norm": 7.15625, + "learning_rate": 1.0810770819879435e-06, + "loss": 0.95886164, + "memory(GiB)": 302.58, + "step": 285160, + "train_speed(iter/s)": 0.123578 + }, + { + "acc": 0.74932151, + "epoch": 1.5948616352113327, + "grad_norm": 7.875, + "learning_rate": 1.0805028769054788e-06, + "loss": 0.98783875, + "memory(GiB)": 302.58, + "step": 285180, + "train_speed(iter/s)": 0.123582 + }, + { + "acc": 0.75430107, + "epoch": 1.5949734846843118, + "grad_norm": 8.5, + "learning_rate": 1.0799288058822426e-06, + "loss": 0.95776949, + "memory(GiB)": 302.58, + "step": 285200, + "train_speed(iter/s)": 0.123586 + }, + { + "acc": 0.74698968, + "epoch": 1.5950853341572913, + "grad_norm": 6.28125, + "learning_rate": 1.079354868937873e-06, + "loss": 1.00055695, + "memory(GiB)": 302.58, + "step": 285220, + "train_speed(iter/s)": 0.12359 + }, + { + "acc": 0.75461502, + "epoch": 1.5951971836302703, + "grad_norm": 6.3125, + "learning_rate": 1.0787810660919989e-06, + "loss": 0.97085953, + "memory(GiB)": 302.58, + "step": 285240, + "train_speed(iter/s)": 0.123594 + }, + { + "acc": 0.75403914, + "epoch": 1.5953090331032498, + "grad_norm": 7.1875, + "learning_rate": 1.0782073973642466e-06, + "loss": 0.96147776, + "memory(GiB)": 302.58, + "step": 285260, + "train_speed(iter/s)": 0.123598 + }, + { + "acc": 0.75118504, + "epoch": 1.5954208825762288, + "grad_norm": 10.625, + "learning_rate": 1.0776338627742367e-06, + "loss": 0.97850561, + "memory(GiB)": 302.58, + "step": 285280, + "train_speed(iter/s)": 0.123602 + }, + { + "acc": 0.73736854, + "epoch": 1.5955327320492083, + "grad_norm": 7.53125, + "learning_rate": 1.077060462341586e-06, + "loss": 1.0038518, + "memory(GiB)": 302.58, + "step": 285300, + "train_speed(iter/s)": 0.123606 + }, + { + "acc": 0.76047401, + "epoch": 1.5956445815221874, + "grad_norm": 6.65625, + "learning_rate": 1.0764871960859068e-06, + "loss": 0.93830347, + "memory(GiB)": 302.58, + "step": 285320, + "train_speed(iter/s)": 0.12361 + }, + { + "acc": 0.75520353, + "epoch": 1.5957564309951668, + "grad_norm": 6.90625, + "learning_rate": 1.0759140640268068e-06, + "loss": 0.99694347, + "memory(GiB)": 302.58, + "step": 285340, + "train_speed(iter/s)": 0.123614 + }, + { + "acc": 0.74445429, + "epoch": 1.5958682804681459, + "grad_norm": 8.0, + "learning_rate": 1.075341066183888e-06, + "loss": 1.0290411, + "memory(GiB)": 302.58, + "step": 285360, + "train_speed(iter/s)": 0.123618 + }, + { + "acc": 0.75517263, + "epoch": 1.5959801299411254, + "grad_norm": 6.65625, + "learning_rate": 1.0747682025767504e-06, + "loss": 0.95491867, + "memory(GiB)": 302.58, + "step": 285380, + "train_speed(iter/s)": 0.123622 + }, + { + "acc": 0.75463138, + "epoch": 1.5960919794141044, + "grad_norm": 8.3125, + "learning_rate": 1.0741954732249866e-06, + "loss": 0.96489773, + "memory(GiB)": 302.58, + "step": 285400, + "train_speed(iter/s)": 0.123626 + }, + { + "acc": 0.75585051, + "epoch": 1.596203828887084, + "grad_norm": 8.0, + "learning_rate": 1.073622878148186e-06, + "loss": 0.96568222, + "memory(GiB)": 302.58, + "step": 285420, + "train_speed(iter/s)": 0.123631 + }, + { + "acc": 0.75407376, + "epoch": 1.596315678360063, + "grad_norm": 8.8125, + "learning_rate": 1.0730504173659334e-06, + "loss": 0.96414509, + "memory(GiB)": 302.58, + "step": 285440, + "train_speed(iter/s)": 0.123634 + }, + { + "acc": 0.74637742, + "epoch": 1.5964275278330424, + "grad_norm": 9.0, + "learning_rate": 1.0724780908978067e-06, + "loss": 0.97492418, + "memory(GiB)": 302.58, + "step": 285460, + "train_speed(iter/s)": 0.123639 + }, + { + "acc": 0.76316848, + "epoch": 1.5965393773060215, + "grad_norm": 6.8125, + "learning_rate": 1.0719058987633845e-06, + "loss": 0.93895779, + "memory(GiB)": 302.58, + "step": 285480, + "train_speed(iter/s)": 0.123643 + }, + { + "acc": 0.7401722, + "epoch": 1.596651226779001, + "grad_norm": 9.5, + "learning_rate": 1.0713338409822365e-06, + "loss": 1.03751802, + "memory(GiB)": 302.58, + "step": 285500, + "train_speed(iter/s)": 0.123647 + }, + { + "acc": 0.74044323, + "epoch": 1.59676307625198, + "grad_norm": 6.90625, + "learning_rate": 1.0707619175739286e-06, + "loss": 1.01489868, + "memory(GiB)": 302.58, + "step": 285520, + "train_speed(iter/s)": 0.123651 + }, + { + "acc": 0.74220762, + "epoch": 1.5968749257249595, + "grad_norm": 7.9375, + "learning_rate": 1.0701901285580225e-06, + "loss": 1.01605673, + "memory(GiB)": 302.58, + "step": 285540, + "train_speed(iter/s)": 0.123655 + }, + { + "acc": 0.7469449, + "epoch": 1.5969867751979385, + "grad_norm": 8.6875, + "learning_rate": 1.069618473954075e-06, + "loss": 1.01736383, + "memory(GiB)": 302.58, + "step": 285560, + "train_speed(iter/s)": 0.123659 + }, + { + "acc": 0.74502206, + "epoch": 1.597098624670918, + "grad_norm": 5.75, + "learning_rate": 1.0690469537816384e-06, + "loss": 0.99969978, + "memory(GiB)": 302.58, + "step": 285580, + "train_speed(iter/s)": 0.123663 + }, + { + "acc": 0.76874447, + "epoch": 1.597210474143897, + "grad_norm": 7.75, + "learning_rate": 1.068475568060261e-06, + "loss": 0.90365429, + "memory(GiB)": 302.58, + "step": 285600, + "train_speed(iter/s)": 0.123667 + }, + { + "acc": 0.74757051, + "epoch": 1.5973223236168765, + "grad_norm": 9.0, + "learning_rate": 1.0679043168094854e-06, + "loss": 0.98160477, + "memory(GiB)": 302.58, + "step": 285620, + "train_speed(iter/s)": 0.123671 + }, + { + "acc": 0.73414812, + "epoch": 1.5974341730898556, + "grad_norm": 7.0625, + "learning_rate": 1.0673332000488506e-06, + "loss": 1.04898434, + "memory(GiB)": 302.58, + "step": 285640, + "train_speed(iter/s)": 0.123675 + }, + { + "acc": 0.75070772, + "epoch": 1.597546022562835, + "grad_norm": 8.125, + "learning_rate": 1.0667622177978904e-06, + "loss": 0.98897018, + "memory(GiB)": 302.58, + "step": 285660, + "train_speed(iter/s)": 0.123679 + }, + { + "acc": 0.75322766, + "epoch": 1.597657872035814, + "grad_norm": 6.6875, + "learning_rate": 1.0661913700761346e-06, + "loss": 0.98932066, + "memory(GiB)": 302.58, + "step": 285680, + "train_speed(iter/s)": 0.123683 + }, + { + "acc": 0.76849012, + "epoch": 1.5977697215087936, + "grad_norm": 8.9375, + "learning_rate": 1.0656206569031074e-06, + "loss": 0.91969767, + "memory(GiB)": 302.58, + "step": 285700, + "train_speed(iter/s)": 0.123687 + }, + { + "acc": 0.75790091, + "epoch": 1.5978815709817726, + "grad_norm": 10.9375, + "learning_rate": 1.0650500782983292e-06, + "loss": 0.95106478, + "memory(GiB)": 302.58, + "step": 285720, + "train_speed(iter/s)": 0.123691 + }, + { + "acc": 0.75593424, + "epoch": 1.5979934204547521, + "grad_norm": 6.125, + "learning_rate": 1.0644796342813146e-06, + "loss": 0.96587772, + "memory(GiB)": 302.58, + "step": 285740, + "train_speed(iter/s)": 0.123696 + }, + { + "acc": 0.75659981, + "epoch": 1.5981052699277312, + "grad_norm": 8.1875, + "learning_rate": 1.063909324871577e-06, + "loss": 0.96006975, + "memory(GiB)": 302.58, + "step": 285760, + "train_speed(iter/s)": 0.1237 + }, + { + "acc": 0.74126625, + "epoch": 1.5982171194007107, + "grad_norm": 6.71875, + "learning_rate": 1.063339150088622e-06, + "loss": 1.03140497, + "memory(GiB)": 302.58, + "step": 285780, + "train_speed(iter/s)": 0.123704 + }, + { + "acc": 0.73572884, + "epoch": 1.5983289688736897, + "grad_norm": 7.4375, + "learning_rate": 1.06276910995195e-06, + "loss": 1.0542676, + "memory(GiB)": 302.58, + "step": 285800, + "train_speed(iter/s)": 0.123707 + }, + { + "acc": 0.75063195, + "epoch": 1.5984408183466692, + "grad_norm": 7.375, + "learning_rate": 1.0621992044810597e-06, + "loss": 0.99914789, + "memory(GiB)": 302.58, + "step": 285820, + "train_speed(iter/s)": 0.123711 + }, + { + "acc": 0.75596519, + "epoch": 1.5985526678196482, + "grad_norm": 5.8125, + "learning_rate": 1.061629433695443e-06, + "loss": 0.95036745, + "memory(GiB)": 302.58, + "step": 285840, + "train_speed(iter/s)": 0.123716 + }, + { + "acc": 0.74981427, + "epoch": 1.5986645172926277, + "grad_norm": 5.1875, + "learning_rate": 1.0610597976145875e-06, + "loss": 0.97830324, + "memory(GiB)": 302.58, + "step": 285860, + "train_speed(iter/s)": 0.12372 + }, + { + "acc": 0.74350777, + "epoch": 1.5987763667656067, + "grad_norm": 8.6875, + "learning_rate": 1.0604902962579772e-06, + "loss": 1.00621119, + "memory(GiB)": 302.58, + "step": 285880, + "train_speed(iter/s)": 0.123724 + }, + { + "acc": 0.75330844, + "epoch": 1.5988882162385862, + "grad_norm": 7.28125, + "learning_rate": 1.0599209296450908e-06, + "loss": 0.97023182, + "memory(GiB)": 302.58, + "step": 285900, + "train_speed(iter/s)": 0.123728 + }, + { + "acc": 0.74216824, + "epoch": 1.5990000657115653, + "grad_norm": 10.1875, + "learning_rate": 1.0593516977954006e-06, + "loss": 1.00872021, + "memory(GiB)": 302.58, + "step": 285920, + "train_speed(iter/s)": 0.123732 + }, + { + "acc": 0.75722227, + "epoch": 1.5991119151845448, + "grad_norm": 5.28125, + "learning_rate": 1.0587826007283792e-06, + "loss": 0.95235023, + "memory(GiB)": 302.58, + "step": 285940, + "train_speed(iter/s)": 0.123736 + }, + { + "acc": 0.77029052, + "epoch": 1.5992237646575238, + "grad_norm": 7.59375, + "learning_rate": 1.05821363846349e-06, + "loss": 0.87879581, + "memory(GiB)": 302.58, + "step": 285960, + "train_speed(iter/s)": 0.123741 + }, + { + "acc": 0.76585298, + "epoch": 1.5993356141305033, + "grad_norm": 5.9375, + "learning_rate": 1.0576448110201938e-06, + "loss": 0.91073875, + "memory(GiB)": 302.58, + "step": 285980, + "train_speed(iter/s)": 0.123745 + }, + { + "acc": 0.74407654, + "epoch": 1.5994474636034823, + "grad_norm": 8.3125, + "learning_rate": 1.0570761184179457e-06, + "loss": 1.01720333, + "memory(GiB)": 302.58, + "step": 286000, + "train_speed(iter/s)": 0.123749 + }, + { + "epoch": 1.5994474636034823, + "eval_acc": 0.7069166053168161, + "eval_loss": 1.011824131011963, + "eval_runtime": 7594.3582, + "eval_samples_per_second": 9.913, + "eval_steps_per_second": 9.913, + "step": 286000 + }, + { + "acc": 0.75486674, + "epoch": 1.5995593130764618, + "grad_norm": 7.53125, + "learning_rate": 1.056507560676197e-06, + "loss": 0.96597357, + "memory(GiB)": 302.58, + "step": 286020, + "train_speed(iter/s)": 0.12334 + }, + { + "acc": 0.72529826, + "epoch": 1.5996711625494409, + "grad_norm": 4.90625, + "learning_rate": 1.0559391378143946e-06, + "loss": 1.10255089, + "memory(GiB)": 302.58, + "step": 286040, + "train_speed(iter/s)": 0.123344 + }, + { + "acc": 0.76050539, + "epoch": 1.5997830120224203, + "grad_norm": 4.34375, + "learning_rate": 1.0553708498519783e-06, + "loss": 0.92951336, + "memory(GiB)": 302.58, + "step": 286060, + "train_speed(iter/s)": 0.123348 + }, + { + "acc": 0.7570138, + "epoch": 1.5998948614953994, + "grad_norm": 6.46875, + "learning_rate": 1.054802696808388e-06, + "loss": 0.93733234, + "memory(GiB)": 302.58, + "step": 286080, + "train_speed(iter/s)": 0.123352 + }, + { + "acc": 0.7583776, + "epoch": 1.6000067109683789, + "grad_norm": 6.4375, + "learning_rate": 1.054234678703056e-06, + "loss": 0.94494228, + "memory(GiB)": 302.58, + "step": 286100, + "train_speed(iter/s)": 0.123356 + }, + { + "acc": 0.76304641, + "epoch": 1.600118560441358, + "grad_norm": 6.65625, + "learning_rate": 1.0536667955554091e-06, + "loss": 0.93741522, + "memory(GiB)": 302.58, + "step": 286120, + "train_speed(iter/s)": 0.12336 + }, + { + "acc": 0.75425477, + "epoch": 1.6002304099143374, + "grad_norm": 8.25, + "learning_rate": 1.0530990473848718e-06, + "loss": 0.97053938, + "memory(GiB)": 302.58, + "step": 286140, + "train_speed(iter/s)": 0.123365 + }, + { + "acc": 0.72902627, + "epoch": 1.6003422593873164, + "grad_norm": 8.0, + "learning_rate": 1.0525314342108616e-06, + "loss": 1.07076464, + "memory(GiB)": 302.58, + "step": 286160, + "train_speed(iter/s)": 0.123369 + }, + { + "acc": 0.74638724, + "epoch": 1.600454108860296, + "grad_norm": 6.5625, + "learning_rate": 1.0519639560527945e-06, + "loss": 1.00059528, + "memory(GiB)": 302.58, + "step": 286180, + "train_speed(iter/s)": 0.123373 + }, + { + "acc": 0.73860912, + "epoch": 1.600565958333275, + "grad_norm": 5.71875, + "learning_rate": 1.0513966129300784e-06, + "loss": 1.0443861, + "memory(GiB)": 302.58, + "step": 286200, + "train_speed(iter/s)": 0.123377 + }, + { + "acc": 0.74603109, + "epoch": 1.6006778078062545, + "grad_norm": 8.5625, + "learning_rate": 1.0508294048621187e-06, + "loss": 1.00872097, + "memory(GiB)": 302.58, + "step": 286220, + "train_speed(iter/s)": 0.123381 + }, + { + "acc": 0.75157681, + "epoch": 1.6007896572792335, + "grad_norm": 8.5, + "learning_rate": 1.0502623318683159e-06, + "loss": 0.99111996, + "memory(GiB)": 302.58, + "step": 286240, + "train_speed(iter/s)": 0.123385 + }, + { + "acc": 0.74524269, + "epoch": 1.600901506752213, + "grad_norm": 5.25, + "learning_rate": 1.049695393968066e-06, + "loss": 1.02048235, + "memory(GiB)": 302.58, + "step": 286260, + "train_speed(iter/s)": 0.123389 + }, + { + "acc": 0.74247403, + "epoch": 1.601013356225192, + "grad_norm": 7.53125, + "learning_rate": 1.0491285911807593e-06, + "loss": 1.00819635, + "memory(GiB)": 302.58, + "step": 286280, + "train_speed(iter/s)": 0.123393 + }, + { + "acc": 0.75440564, + "epoch": 1.6011252056981715, + "grad_norm": 7.40625, + "learning_rate": 1.0485619235257826e-06, + "loss": 0.96254444, + "memory(GiB)": 302.58, + "step": 286300, + "train_speed(iter/s)": 0.123397 + }, + { + "acc": 0.75532422, + "epoch": 1.6012370551711506, + "grad_norm": 6.875, + "learning_rate": 1.0479953910225182e-06, + "loss": 0.96083965, + "memory(GiB)": 302.58, + "step": 286320, + "train_speed(iter/s)": 0.123401 + }, + { + "acc": 0.76268497, + "epoch": 1.60134890464413, + "grad_norm": 12.0, + "learning_rate": 1.047428993690341e-06, + "loss": 0.94915295, + "memory(GiB)": 302.58, + "step": 286340, + "train_speed(iter/s)": 0.123405 + }, + { + "acc": 0.74656405, + "epoch": 1.601460754117109, + "grad_norm": 5.4375, + "learning_rate": 1.0468627315486274e-06, + "loss": 1.00061331, + "memory(GiB)": 302.58, + "step": 286360, + "train_speed(iter/s)": 0.123409 + }, + { + "acc": 0.74734473, + "epoch": 1.6015726035900886, + "grad_norm": 7.59375, + "learning_rate": 1.0462966046167427e-06, + "loss": 0.99927654, + "memory(GiB)": 302.58, + "step": 286380, + "train_speed(iter/s)": 0.123413 + }, + { + "acc": 0.74310632, + "epoch": 1.6016844530630676, + "grad_norm": 8.0, + "learning_rate": 1.0457306129140515e-06, + "loss": 1.01330872, + "memory(GiB)": 302.58, + "step": 286400, + "train_speed(iter/s)": 0.123417 + }, + { + "acc": 0.74119129, + "epoch": 1.601796302536047, + "grad_norm": 9.625, + "learning_rate": 1.045164756459912e-06, + "loss": 1.04922857, + "memory(GiB)": 302.58, + "step": 286420, + "train_speed(iter/s)": 0.123422 + }, + { + "acc": 0.74834495, + "epoch": 1.6019081520090261, + "grad_norm": 5.84375, + "learning_rate": 1.0445990352736784e-06, + "loss": 0.99027414, + "memory(GiB)": 302.58, + "step": 286440, + "train_speed(iter/s)": 0.123426 + }, + { + "acc": 0.74633832, + "epoch": 1.6020200014820056, + "grad_norm": 6.25, + "learning_rate": 1.0440334493746996e-06, + "loss": 1.00948124, + "memory(GiB)": 302.58, + "step": 286460, + "train_speed(iter/s)": 0.12343 + }, + { + "acc": 0.74723382, + "epoch": 1.6021318509549847, + "grad_norm": 6.875, + "learning_rate": 1.043467998782321e-06, + "loss": 0.99563103, + "memory(GiB)": 302.58, + "step": 286480, + "train_speed(iter/s)": 0.123434 + }, + { + "acc": 0.7497313, + "epoch": 1.6022437004279642, + "grad_norm": 6.15625, + "learning_rate": 1.0429026835158828e-06, + "loss": 0.97642336, + "memory(GiB)": 302.58, + "step": 286500, + "train_speed(iter/s)": 0.123438 + }, + { + "acc": 0.75363765, + "epoch": 1.6023555499009432, + "grad_norm": 6.46875, + "learning_rate": 1.0423375035947208e-06, + "loss": 0.97375078, + "memory(GiB)": 302.58, + "step": 286520, + "train_speed(iter/s)": 0.123442 + }, + { + "acc": 0.74343157, + "epoch": 1.6024673993739227, + "grad_norm": 5.6875, + "learning_rate": 1.041772459038165e-06, + "loss": 0.99422579, + "memory(GiB)": 302.58, + "step": 286540, + "train_speed(iter/s)": 0.123446 + }, + { + "acc": 0.76113334, + "epoch": 1.6025792488469017, + "grad_norm": 5.6875, + "learning_rate": 1.0412075498655428e-06, + "loss": 0.91962996, + "memory(GiB)": 302.58, + "step": 286560, + "train_speed(iter/s)": 0.12345 + }, + { + "acc": 0.74951272, + "epoch": 1.6026910983198812, + "grad_norm": 6.25, + "learning_rate": 1.0406427760961747e-06, + "loss": 0.96520596, + "memory(GiB)": 302.58, + "step": 286580, + "train_speed(iter/s)": 0.123454 + }, + { + "acc": 0.74101791, + "epoch": 1.6028029477928603, + "grad_norm": 10.1875, + "learning_rate": 1.0400781377493792e-06, + "loss": 1.01263456, + "memory(GiB)": 302.58, + "step": 286600, + "train_speed(iter/s)": 0.123458 + }, + { + "acc": 0.76568189, + "epoch": 1.6029147972658397, + "grad_norm": 8.375, + "learning_rate": 1.0395136348444657e-06, + "loss": 0.92653608, + "memory(GiB)": 302.58, + "step": 286620, + "train_speed(iter/s)": 0.123462 + }, + { + "acc": 0.73792887, + "epoch": 1.6030266467388188, + "grad_norm": 6.84375, + "learning_rate": 1.0389492674007461e-06, + "loss": 1.04315643, + "memory(GiB)": 302.58, + "step": 286640, + "train_speed(iter/s)": 0.123466 + }, + { + "acc": 0.73923798, + "epoch": 1.6031384962117983, + "grad_norm": 4.8125, + "learning_rate": 1.0383850354375219e-06, + "loss": 1.04260387, + "memory(GiB)": 302.58, + "step": 286660, + "train_speed(iter/s)": 0.12347 + }, + { + "acc": 0.74810047, + "epoch": 1.6032503456847773, + "grad_norm": 5.9375, + "learning_rate": 1.0378209389740913e-06, + "loss": 1.01516638, + "memory(GiB)": 302.58, + "step": 286680, + "train_speed(iter/s)": 0.123474 + }, + { + "acc": 0.74931302, + "epoch": 1.6033621951577568, + "grad_norm": 7.625, + "learning_rate": 1.037256978029748e-06, + "loss": 0.99559345, + "memory(GiB)": 302.58, + "step": 286700, + "train_speed(iter/s)": 0.123478 + }, + { + "acc": 0.73224792, + "epoch": 1.6034740446307358, + "grad_norm": 5.65625, + "learning_rate": 1.0366931526237823e-06, + "loss": 1.06951332, + "memory(GiB)": 302.58, + "step": 286720, + "train_speed(iter/s)": 0.123482 + }, + { + "acc": 0.73649693, + "epoch": 1.6035858941037153, + "grad_norm": 7.34375, + "learning_rate": 1.0361294627754776e-06, + "loss": 1.02460947, + "memory(GiB)": 302.58, + "step": 286740, + "train_speed(iter/s)": 0.123486 + }, + { + "acc": 0.73722706, + "epoch": 1.6036977435766944, + "grad_norm": 7.25, + "learning_rate": 1.0355659085041143e-06, + "loss": 1.04302807, + "memory(GiB)": 302.58, + "step": 286760, + "train_speed(iter/s)": 0.12349 + }, + { + "acc": 0.76125298, + "epoch": 1.6038095930496739, + "grad_norm": 6.90625, + "learning_rate": 1.035002489828968e-06, + "loss": 0.95200853, + "memory(GiB)": 302.58, + "step": 286780, + "train_speed(iter/s)": 0.123494 + }, + { + "acc": 0.72849226, + "epoch": 1.603921442522653, + "grad_norm": 7.625, + "learning_rate": 1.0344392067693087e-06, + "loss": 1.09197454, + "memory(GiB)": 302.58, + "step": 286800, + "train_speed(iter/s)": 0.123498 + }, + { + "acc": 0.74331684, + "epoch": 1.6040332919956324, + "grad_norm": 7.125, + "learning_rate": 1.0338760593444036e-06, + "loss": 1.00315075, + "memory(GiB)": 302.58, + "step": 286820, + "train_speed(iter/s)": 0.123502 + }, + { + "acc": 0.76433578, + "epoch": 1.6041451414686114, + "grad_norm": 7.625, + "learning_rate": 1.0333130475735132e-06, + "loss": 0.92603083, + "memory(GiB)": 302.58, + "step": 286840, + "train_speed(iter/s)": 0.123507 + }, + { + "acc": 0.74768925, + "epoch": 1.604256990941591, + "grad_norm": 7.09375, + "learning_rate": 1.0327501714758947e-06, + "loss": 0.98621321, + "memory(GiB)": 302.58, + "step": 286860, + "train_speed(iter/s)": 0.12351 + }, + { + "acc": 0.74756846, + "epoch": 1.60436884041457, + "grad_norm": 6.96875, + "learning_rate": 1.0321874310707991e-06, + "loss": 0.98064747, + "memory(GiB)": 302.58, + "step": 286880, + "train_speed(iter/s)": 0.123514 + }, + { + "acc": 0.77299814, + "epoch": 1.6044806898875494, + "grad_norm": 8.3125, + "learning_rate": 1.0316248263774758e-06, + "loss": 0.8987772, + "memory(GiB)": 302.58, + "step": 286900, + "train_speed(iter/s)": 0.123518 + }, + { + "acc": 0.75797133, + "epoch": 1.6045925393605285, + "grad_norm": 4.84375, + "learning_rate": 1.0310623574151667e-06, + "loss": 0.97394266, + "memory(GiB)": 302.58, + "step": 286920, + "train_speed(iter/s)": 0.123522 + }, + { + "acc": 0.74719558, + "epoch": 1.604704388833508, + "grad_norm": 6.40625, + "learning_rate": 1.0305000242031105e-06, + "loss": 0.99308386, + "memory(GiB)": 302.58, + "step": 286940, + "train_speed(iter/s)": 0.123526 + }, + { + "acc": 0.77083235, + "epoch": 1.604816238306487, + "grad_norm": 6.4375, + "learning_rate": 1.0299378267605404e-06, + "loss": 0.91405373, + "memory(GiB)": 302.58, + "step": 286960, + "train_speed(iter/s)": 0.12353 + }, + { + "acc": 0.75746651, + "epoch": 1.6049280877794665, + "grad_norm": 6.5625, + "learning_rate": 1.0293757651066854e-06, + "loss": 0.95104961, + "memory(GiB)": 302.58, + "step": 286980, + "train_speed(iter/s)": 0.123534 + }, + { + "acc": 0.75323148, + "epoch": 1.6050399372524455, + "grad_norm": 8.3125, + "learning_rate": 1.02881383926077e-06, + "loss": 0.95975361, + "memory(GiB)": 302.58, + "step": 287000, + "train_speed(iter/s)": 0.123538 + }, + { + "acc": 0.75133667, + "epoch": 1.605151786725425, + "grad_norm": 5.625, + "learning_rate": 1.0282520492420128e-06, + "loss": 0.9618659, + "memory(GiB)": 302.58, + "step": 287020, + "train_speed(iter/s)": 0.123542 + }, + { + "acc": 0.7425118, + "epoch": 1.605263636198404, + "grad_norm": 7.78125, + "learning_rate": 1.0276903950696305e-06, + "loss": 1.01247883, + "memory(GiB)": 302.58, + "step": 287040, + "train_speed(iter/s)": 0.123546 + }, + { + "acc": 0.76223245, + "epoch": 1.6053754856713836, + "grad_norm": 7.25, + "learning_rate": 1.0271288767628317e-06, + "loss": 0.93364744, + "memory(GiB)": 302.58, + "step": 287060, + "train_speed(iter/s)": 0.123551 + }, + { + "acc": 0.72134471, + "epoch": 1.6054873351443626, + "grad_norm": 7.625, + "learning_rate": 1.0265674943408233e-06, + "loss": 1.12244616, + "memory(GiB)": 302.58, + "step": 287080, + "train_speed(iter/s)": 0.123555 + }, + { + "acc": 0.76102681, + "epoch": 1.605599184617342, + "grad_norm": 6.53125, + "learning_rate": 1.026006247822806e-06, + "loss": 0.95228195, + "memory(GiB)": 302.58, + "step": 287100, + "train_speed(iter/s)": 0.123559 + }, + { + "acc": 0.75369029, + "epoch": 1.6057110340903211, + "grad_norm": 5.34375, + "learning_rate": 1.025445137227976e-06, + "loss": 0.95200167, + "memory(GiB)": 302.58, + "step": 287120, + "train_speed(iter/s)": 0.123563 + }, + { + "acc": 0.73753686, + "epoch": 1.6058228835633006, + "grad_norm": 10.3125, + "learning_rate": 1.0248841625755257e-06, + "loss": 1.0407546, + "memory(GiB)": 302.58, + "step": 287140, + "train_speed(iter/s)": 0.123567 + }, + { + "acc": 0.74225712, + "epoch": 1.6059347330362796, + "grad_norm": 7.8125, + "learning_rate": 1.0243233238846407e-06, + "loss": 1.01660528, + "memory(GiB)": 302.58, + "step": 287160, + "train_speed(iter/s)": 0.123571 + }, + { + "acc": 0.75793309, + "epoch": 1.6060465825092591, + "grad_norm": 6.84375, + "learning_rate": 1.023762621174505e-06, + "loss": 0.95116367, + "memory(GiB)": 302.58, + "step": 287180, + "train_speed(iter/s)": 0.123575 + }, + { + "acc": 0.75554781, + "epoch": 1.6061584319822382, + "grad_norm": 9.3125, + "learning_rate": 1.0232020544642967e-06, + "loss": 0.96855364, + "memory(GiB)": 302.58, + "step": 287200, + "train_speed(iter/s)": 0.123579 + }, + { + "acc": 0.76026492, + "epoch": 1.6062702814552177, + "grad_norm": 7.5, + "learning_rate": 1.0226416237731861e-06, + "loss": 0.92534599, + "memory(GiB)": 302.58, + "step": 287220, + "train_speed(iter/s)": 0.123583 + }, + { + "acc": 0.77231841, + "epoch": 1.6063821309281967, + "grad_norm": 8.875, + "learning_rate": 1.0220813291203457e-06, + "loss": 0.88329515, + "memory(GiB)": 302.58, + "step": 287240, + "train_speed(iter/s)": 0.123587 + }, + { + "acc": 0.74376755, + "epoch": 1.6064939804011762, + "grad_norm": 6.5, + "learning_rate": 1.0215211705249379e-06, + "loss": 1.01254072, + "memory(GiB)": 302.58, + "step": 287260, + "train_speed(iter/s)": 0.123591 + }, + { + "acc": 0.7514751, + "epoch": 1.6066058298741552, + "grad_norm": 6.15625, + "learning_rate": 1.020961148006121e-06, + "loss": 0.9765233, + "memory(GiB)": 302.58, + "step": 287280, + "train_speed(iter/s)": 0.123596 + }, + { + "acc": 0.73335667, + "epoch": 1.6067176793471347, + "grad_norm": 7.0, + "learning_rate": 1.02040126158305e-06, + "loss": 1.05482063, + "memory(GiB)": 302.58, + "step": 287300, + "train_speed(iter/s)": 0.1236 + }, + { + "acc": 0.75561743, + "epoch": 1.6068295288201138, + "grad_norm": 7.6875, + "learning_rate": 1.019841511274875e-06, + "loss": 0.97009315, + "memory(GiB)": 302.58, + "step": 287320, + "train_speed(iter/s)": 0.123604 + }, + { + "acc": 0.75662475, + "epoch": 1.6069413782930932, + "grad_norm": 7.03125, + "learning_rate": 1.0192818971007406e-06, + "loss": 0.95116367, + "memory(GiB)": 302.58, + "step": 287340, + "train_speed(iter/s)": 0.123608 + }, + { + "acc": 0.76097226, + "epoch": 1.6070532277660723, + "grad_norm": 12.625, + "learning_rate": 1.0187224190797884e-06, + "loss": 0.95861979, + "memory(GiB)": 302.58, + "step": 287360, + "train_speed(iter/s)": 0.123612 + }, + { + "acc": 0.75846972, + "epoch": 1.6071650772390518, + "grad_norm": 8.4375, + "learning_rate": 1.0181630772311534e-06, + "loss": 0.94247065, + "memory(GiB)": 302.58, + "step": 287380, + "train_speed(iter/s)": 0.123616 + }, + { + "acc": 0.74143023, + "epoch": 1.6072769267120308, + "grad_norm": 8.4375, + "learning_rate": 1.0176038715739673e-06, + "loss": 1.02790537, + "memory(GiB)": 302.58, + "step": 287400, + "train_speed(iter/s)": 0.12362 + }, + { + "acc": 0.75981784, + "epoch": 1.6073887761850103, + "grad_norm": 10.5625, + "learning_rate": 1.0170448021273565e-06, + "loss": 0.93626814, + "memory(GiB)": 302.58, + "step": 287420, + "train_speed(iter/s)": 0.123624 + }, + { + "acc": 0.76519842, + "epoch": 1.6075006256579893, + "grad_norm": 5.875, + "learning_rate": 1.0164858689104434e-06, + "loss": 0.92435312, + "memory(GiB)": 302.58, + "step": 287440, + "train_speed(iter/s)": 0.123628 + }, + { + "acc": 0.75339699, + "epoch": 1.6076124751309688, + "grad_norm": 8.125, + "learning_rate": 1.0159270719423448e-06, + "loss": 0.96945114, + "memory(GiB)": 302.58, + "step": 287460, + "train_speed(iter/s)": 0.123632 + }, + { + "acc": 0.74557586, + "epoch": 1.6077243246039479, + "grad_norm": 10.25, + "learning_rate": 1.0153684112421736e-06, + "loss": 0.99093266, + "memory(GiB)": 302.58, + "step": 287480, + "train_speed(iter/s)": 0.123636 + }, + { + "acc": 0.76812134, + "epoch": 1.6078361740769274, + "grad_norm": 6.03125, + "learning_rate": 1.0148098868290363e-06, + "loss": 0.89794321, + "memory(GiB)": 302.58, + "step": 287500, + "train_speed(iter/s)": 0.12364 + }, + { + "acc": 0.74955773, + "epoch": 1.6079480235499064, + "grad_norm": 5.65625, + "learning_rate": 1.0142514987220386e-06, + "loss": 1.00017738, + "memory(GiB)": 302.58, + "step": 287520, + "train_speed(iter/s)": 0.123644 + }, + { + "acc": 0.75509338, + "epoch": 1.6080598730228859, + "grad_norm": 6.59375, + "learning_rate": 1.0136932469402788e-06, + "loss": 0.95402441, + "memory(GiB)": 302.58, + "step": 287540, + "train_speed(iter/s)": 0.123648 + }, + { + "acc": 0.76885366, + "epoch": 1.608171722495865, + "grad_norm": 6.75, + "learning_rate": 1.0131351315028497e-06, + "loss": 0.91426258, + "memory(GiB)": 302.58, + "step": 287560, + "train_speed(iter/s)": 0.123652 + }, + { + "acc": 0.75742965, + "epoch": 1.6082835719688444, + "grad_norm": 6.15625, + "learning_rate": 1.0125771524288409e-06, + "loss": 0.97704868, + "memory(GiB)": 302.58, + "step": 287580, + "train_speed(iter/s)": 0.123656 + }, + { + "acc": 0.74527574, + "epoch": 1.6083954214418235, + "grad_norm": 8.25, + "learning_rate": 1.0120193097373376e-06, + "loss": 1.00831175, + "memory(GiB)": 302.58, + "step": 287600, + "train_speed(iter/s)": 0.12366 + }, + { + "acc": 0.76657767, + "epoch": 1.608507270914803, + "grad_norm": 9.3125, + "learning_rate": 1.0114616034474194e-06, + "loss": 0.91548033, + "memory(GiB)": 302.58, + "step": 287620, + "train_speed(iter/s)": 0.123664 + }, + { + "acc": 0.76550345, + "epoch": 1.608619120387782, + "grad_norm": 8.125, + "learning_rate": 1.0109040335781617e-06, + "loss": 0.92164268, + "memory(GiB)": 302.58, + "step": 287640, + "train_speed(iter/s)": 0.123668 + }, + { + "acc": 0.738767, + "epoch": 1.6087309698607615, + "grad_norm": 8.75, + "learning_rate": 1.0103466001486352e-06, + "loss": 1.03358536, + "memory(GiB)": 302.58, + "step": 287660, + "train_speed(iter/s)": 0.123672 + }, + { + "acc": 0.73259625, + "epoch": 1.6088428193337405, + "grad_norm": 8.1875, + "learning_rate": 1.0097893031779054e-06, + "loss": 1.04993382, + "memory(GiB)": 302.58, + "step": 287680, + "train_speed(iter/s)": 0.123676 + }, + { + "acc": 0.75896807, + "epoch": 1.60895466880672, + "grad_norm": 7.5625, + "learning_rate": 1.0092321426850344e-06, + "loss": 0.9601511, + "memory(GiB)": 302.58, + "step": 287700, + "train_speed(iter/s)": 0.12368 + }, + { + "acc": 0.75655527, + "epoch": 1.609066518279699, + "grad_norm": 6.625, + "learning_rate": 1.0086751186890786e-06, + "loss": 0.96082649, + "memory(GiB)": 302.58, + "step": 287720, + "train_speed(iter/s)": 0.123684 + }, + { + "acc": 0.7542562, + "epoch": 1.6091783677526785, + "grad_norm": 9.1875, + "learning_rate": 1.0081182312090898e-06, + "loss": 0.95585737, + "memory(GiB)": 302.58, + "step": 287740, + "train_speed(iter/s)": 0.123689 + }, + { + "acc": 0.74287663, + "epoch": 1.6092902172256576, + "grad_norm": 6.09375, + "learning_rate": 1.0075614802641137e-06, + "loss": 1.01402731, + "memory(GiB)": 302.58, + "step": 287760, + "train_speed(iter/s)": 0.123692 + }, + { + "acc": 0.73726807, + "epoch": 1.609402066698637, + "grad_norm": 7.5625, + "learning_rate": 1.0070048658731963e-06, + "loss": 1.04616594, + "memory(GiB)": 302.58, + "step": 287780, + "train_speed(iter/s)": 0.123697 + }, + { + "acc": 0.73928013, + "epoch": 1.609513916171616, + "grad_norm": 6.5625, + "learning_rate": 1.0064483880553737e-06, + "loss": 1.04959898, + "memory(GiB)": 302.58, + "step": 287800, + "train_speed(iter/s)": 0.123701 + }, + { + "acc": 0.7618814, + "epoch": 1.6096257656445956, + "grad_norm": 4.46875, + "learning_rate": 1.0058920468296801e-06, + "loss": 0.91280298, + "memory(GiB)": 302.58, + "step": 287820, + "train_speed(iter/s)": 0.123705 + }, + { + "acc": 0.75461235, + "epoch": 1.6097376151175746, + "grad_norm": 6.5625, + "learning_rate": 1.0053358422151427e-06, + "loss": 0.96286669, + "memory(GiB)": 302.58, + "step": 287840, + "train_speed(iter/s)": 0.123709 + }, + { + "acc": 0.75315738, + "epoch": 1.6098494645905541, + "grad_norm": 9.125, + "learning_rate": 1.0047797742307868e-06, + "loss": 0.96141062, + "memory(GiB)": 302.58, + "step": 287860, + "train_speed(iter/s)": 0.123712 + }, + { + "acc": 0.75220041, + "epoch": 1.6099613140635332, + "grad_norm": 8.25, + "learning_rate": 1.0042238428956308e-06, + "loss": 0.97750025, + "memory(GiB)": 302.58, + "step": 287880, + "train_speed(iter/s)": 0.123717 + }, + { + "acc": 0.76238651, + "epoch": 1.6100731635365126, + "grad_norm": 6.78125, + "learning_rate": 1.0036680482286897e-06, + "loss": 0.91590614, + "memory(GiB)": 302.58, + "step": 287900, + "train_speed(iter/s)": 0.123721 + }, + { + "acc": 0.74731412, + "epoch": 1.6101850130094917, + "grad_norm": 7.34375, + "learning_rate": 1.0031123902489737e-06, + "loss": 0.99842854, + "memory(GiB)": 302.58, + "step": 287920, + "train_speed(iter/s)": 0.123725 + }, + { + "acc": 0.75424409, + "epoch": 1.6102968624824712, + "grad_norm": 6.71875, + "learning_rate": 1.0025568689754872e-06, + "loss": 0.98790646, + "memory(GiB)": 302.58, + "step": 287940, + "train_speed(iter/s)": 0.123729 + }, + { + "acc": 0.74551506, + "epoch": 1.6104087119554502, + "grad_norm": 5.46875, + "learning_rate": 1.0020014844272313e-06, + "loss": 0.99486961, + "memory(GiB)": 302.58, + "step": 287960, + "train_speed(iter/s)": 0.123733 + }, + { + "acc": 0.75449414, + "epoch": 1.6105205614284297, + "grad_norm": 7.5625, + "learning_rate": 1.0014462366232024e-06, + "loss": 0.95944023, + "memory(GiB)": 302.58, + "step": 287980, + "train_speed(iter/s)": 0.123737 + }, + { + "acc": 0.77543139, + "epoch": 1.6106324109014087, + "grad_norm": 6.21875, + "learning_rate": 1.0008911255823912e-06, + "loss": 0.86968918, + "memory(GiB)": 302.58, + "step": 288000, + "train_speed(iter/s)": 0.123741 + }, + { + "epoch": 1.6106324109014087, + "eval_acc": 0.7069010275323537, + "eval_loss": 1.011789083480835, + "eval_runtime": 7534.923, + "eval_samples_per_second": 9.991, + "eval_steps_per_second": 9.991, + "step": 288000 + }, + { + "acc": 0.74271097, + "epoch": 1.6107442603743882, + "grad_norm": 6.625, + "learning_rate": 1.0003361513237842e-06, + "loss": 1.01648254, + "memory(GiB)": 302.58, + "step": 288020, + "train_speed(iter/s)": 0.123339 + }, + { + "acc": 0.74175978, + "epoch": 1.6108561098473673, + "grad_norm": 7.0, + "learning_rate": 9.99781313866362e-07, + "loss": 1.01006403, + "memory(GiB)": 302.58, + "step": 288040, + "train_speed(iter/s)": 0.123343 + }, + { + "acc": 0.73866739, + "epoch": 1.6109679593203468, + "grad_norm": 8.6875, + "learning_rate": 9.992266132291046e-07, + "loss": 1.03168039, + "memory(GiB)": 302.58, + "step": 288060, + "train_speed(iter/s)": 0.123347 + }, + { + "acc": 0.75830851, + "epoch": 1.6110798087933258, + "grad_norm": 9.0625, + "learning_rate": 9.98672049430983e-07, + "loss": 0.94026279, + "memory(GiB)": 302.58, + "step": 288080, + "train_speed(iter/s)": 0.123352 + }, + { + "acc": 0.75236559, + "epoch": 1.6111916582663053, + "grad_norm": 7.28125, + "learning_rate": 9.981176224909655e-07, + "loss": 0.98072023, + "memory(GiB)": 302.58, + "step": 288100, + "train_speed(iter/s)": 0.123356 + }, + { + "acc": 0.75115619, + "epoch": 1.6113035077392843, + "grad_norm": 6.125, + "learning_rate": 9.975633324280148e-07, + "loss": 0.97537565, + "memory(GiB)": 302.58, + "step": 288120, + "train_speed(iter/s)": 0.12336 + }, + { + "acc": 0.74836679, + "epoch": 1.6114153572122638, + "grad_norm": 9.4375, + "learning_rate": 9.970091792610897e-07, + "loss": 0.99200354, + "memory(GiB)": 302.58, + "step": 288140, + "train_speed(iter/s)": 0.123364 + }, + { + "acc": 0.75129232, + "epoch": 1.6115272066852429, + "grad_norm": 6.21875, + "learning_rate": 9.964551630091435e-07, + "loss": 0.98134546, + "memory(GiB)": 302.58, + "step": 288160, + "train_speed(iter/s)": 0.123367 + }, + { + "acc": 0.74523096, + "epoch": 1.6116390561582223, + "grad_norm": 8.75, + "learning_rate": 9.95901283691126e-07, + "loss": 1.01893559, + "memory(GiB)": 302.58, + "step": 288180, + "train_speed(iter/s)": 0.123372 + }, + { + "acc": 0.74724946, + "epoch": 1.6117509056312014, + "grad_norm": 7.03125, + "learning_rate": 9.95347541325981e-07, + "loss": 0.99086475, + "memory(GiB)": 302.58, + "step": 288200, + "train_speed(iter/s)": 0.123376 + }, + { + "acc": 0.75447278, + "epoch": 1.6118627551041809, + "grad_norm": 6.125, + "learning_rate": 9.947939359326486e-07, + "loss": 0.98523321, + "memory(GiB)": 302.58, + "step": 288220, + "train_speed(iter/s)": 0.12338 + }, + { + "acc": 0.73671432, + "epoch": 1.61197460457716, + "grad_norm": 8.1875, + "learning_rate": 9.942404675300631e-07, + "loss": 1.034657, + "memory(GiB)": 302.58, + "step": 288240, + "train_speed(iter/s)": 0.123384 + }, + { + "acc": 0.74208593, + "epoch": 1.6120864540501394, + "grad_norm": 5.78125, + "learning_rate": 9.936871361371565e-07, + "loss": 1.00540667, + "memory(GiB)": 302.58, + "step": 288260, + "train_speed(iter/s)": 0.123388 + }, + { + "acc": 0.73498955, + "epoch": 1.6121983035231184, + "grad_norm": 6.25, + "learning_rate": 9.93133941772853e-07, + "loss": 1.0503376, + "memory(GiB)": 302.58, + "step": 288280, + "train_speed(iter/s)": 0.123391 + }, + { + "acc": 0.75014033, + "epoch": 1.612310152996098, + "grad_norm": 6.90625, + "learning_rate": 9.925808844560742e-07, + "loss": 0.98881874, + "memory(GiB)": 302.58, + "step": 288300, + "train_speed(iter/s)": 0.123395 + }, + { + "acc": 0.75389462, + "epoch": 1.612422002469077, + "grad_norm": 10.6875, + "learning_rate": 9.920279642057352e-07, + "loss": 0.94891005, + "memory(GiB)": 302.58, + "step": 288320, + "train_speed(iter/s)": 0.123399 + }, + { + "acc": 0.74956532, + "epoch": 1.6125338519420565, + "grad_norm": 7.5, + "learning_rate": 9.914751810407497e-07, + "loss": 1.01133938, + "memory(GiB)": 302.58, + "step": 288340, + "train_speed(iter/s)": 0.123404 + }, + { + "acc": 0.75399237, + "epoch": 1.6126457014150355, + "grad_norm": 5.625, + "learning_rate": 9.909225349800238e-07, + "loss": 0.9688612, + "memory(GiB)": 302.58, + "step": 288360, + "train_speed(iter/s)": 0.123408 + }, + { + "acc": 0.75599709, + "epoch": 1.612757550888015, + "grad_norm": 7.6875, + "learning_rate": 9.903700260424598e-07, + "loss": 0.96494236, + "memory(GiB)": 302.58, + "step": 288380, + "train_speed(iter/s)": 0.123412 + }, + { + "acc": 0.74603934, + "epoch": 1.612869400360994, + "grad_norm": 6.65625, + "learning_rate": 9.898176542469551e-07, + "loss": 1.01984491, + "memory(GiB)": 302.58, + "step": 288400, + "train_speed(iter/s)": 0.123415 + }, + { + "acc": 0.73521667, + "epoch": 1.6129812498339735, + "grad_norm": 7.5, + "learning_rate": 9.892654196124025e-07, + "loss": 1.02817774, + "memory(GiB)": 302.58, + "step": 288420, + "train_speed(iter/s)": 0.123419 + }, + { + "acc": 0.74579234, + "epoch": 1.6130930993069525, + "grad_norm": 7.875, + "learning_rate": 9.88713322157689e-07, + "loss": 1.00121832, + "memory(GiB)": 302.58, + "step": 288440, + "train_speed(iter/s)": 0.123423 + }, + { + "acc": 0.74394212, + "epoch": 1.613204948779932, + "grad_norm": 6.9375, + "learning_rate": 9.881613619017005e-07, + "loss": 0.9990797, + "memory(GiB)": 302.58, + "step": 288460, + "train_speed(iter/s)": 0.123427 + }, + { + "acc": 0.75865879, + "epoch": 1.613316798252911, + "grad_norm": 6.5, + "learning_rate": 9.87609538863315e-07, + "loss": 0.96876669, + "memory(GiB)": 302.58, + "step": 288480, + "train_speed(iter/s)": 0.123431 + }, + { + "acc": 0.74909906, + "epoch": 1.6134286477258906, + "grad_norm": 6.84375, + "learning_rate": 9.870578530614056e-07, + "loss": 0.96063385, + "memory(GiB)": 302.58, + "step": 288500, + "train_speed(iter/s)": 0.123436 + }, + { + "acc": 0.75852628, + "epoch": 1.6135404971988696, + "grad_norm": 9.1875, + "learning_rate": 9.865063045148427e-07, + "loss": 0.9540781, + "memory(GiB)": 302.58, + "step": 288520, + "train_speed(iter/s)": 0.12344 + }, + { + "acc": 0.74295835, + "epoch": 1.613652346671849, + "grad_norm": 6.78125, + "learning_rate": 9.859548932424906e-07, + "loss": 1.00226831, + "memory(GiB)": 302.58, + "step": 288540, + "train_speed(iter/s)": 0.123444 + }, + { + "acc": 0.74155197, + "epoch": 1.6137641961448281, + "grad_norm": 6.34375, + "learning_rate": 9.854036192632094e-07, + "loss": 1.0196394, + "memory(GiB)": 302.58, + "step": 288560, + "train_speed(iter/s)": 0.123448 + }, + { + "acc": 0.75208349, + "epoch": 1.6138760456178076, + "grad_norm": 6.40625, + "learning_rate": 9.848524825958545e-07, + "loss": 0.97398071, + "memory(GiB)": 302.58, + "step": 288580, + "train_speed(iter/s)": 0.123452 + }, + { + "acc": 0.76271634, + "epoch": 1.6139878950907867, + "grad_norm": 5.0625, + "learning_rate": 9.843014832592758e-07, + "loss": 0.92708139, + "memory(GiB)": 302.58, + "step": 288600, + "train_speed(iter/s)": 0.123456 + }, + { + "acc": 0.74643736, + "epoch": 1.6140997445637661, + "grad_norm": 6.6875, + "learning_rate": 9.837506212723202e-07, + "loss": 1.00953884, + "memory(GiB)": 302.58, + "step": 288620, + "train_speed(iter/s)": 0.12346 + }, + { + "acc": 0.73008637, + "epoch": 1.6142115940367452, + "grad_norm": 6.75, + "learning_rate": 9.83199896653827e-07, + "loss": 1.08662949, + "memory(GiB)": 302.58, + "step": 288640, + "train_speed(iter/s)": 0.123464 + }, + { + "acc": 0.73558674, + "epoch": 1.6143234435097247, + "grad_norm": 5.90625, + "learning_rate": 9.826493094226357e-07, + "loss": 1.0437211, + "memory(GiB)": 302.58, + "step": 288660, + "train_speed(iter/s)": 0.123468 + }, + { + "acc": 0.73783407, + "epoch": 1.6144352929827037, + "grad_norm": 15.3125, + "learning_rate": 9.820988595975772e-07, + "loss": 1.04190903, + "memory(GiB)": 302.58, + "step": 288680, + "train_speed(iter/s)": 0.123472 + }, + { + "acc": 0.74546232, + "epoch": 1.6145471424556832, + "grad_norm": 6.21875, + "learning_rate": 9.815485471974773e-07, + "loss": 0.99452934, + "memory(GiB)": 302.58, + "step": 288700, + "train_speed(iter/s)": 0.123476 + }, + { + "acc": 0.74531384, + "epoch": 1.6146589919286622, + "grad_norm": 7.09375, + "learning_rate": 9.809983722411598e-07, + "loss": 0.9870677, + "memory(GiB)": 302.58, + "step": 288720, + "train_speed(iter/s)": 0.12348 + }, + { + "acc": 0.73410249, + "epoch": 1.6147708414016417, + "grad_norm": 6.96875, + "learning_rate": 9.80448334747442e-07, + "loss": 1.03719015, + "memory(GiB)": 302.58, + "step": 288740, + "train_speed(iter/s)": 0.123484 + }, + { + "acc": 0.73999062, + "epoch": 1.6148826908746208, + "grad_norm": 6.90625, + "learning_rate": 9.798984347351365e-07, + "loss": 1.02847929, + "memory(GiB)": 302.58, + "step": 288760, + "train_speed(iter/s)": 0.123488 + }, + { + "acc": 0.75346413, + "epoch": 1.6149945403476003, + "grad_norm": 13.5, + "learning_rate": 9.793486722230516e-07, + "loss": 0.96281271, + "memory(GiB)": 302.58, + "step": 288780, + "train_speed(iter/s)": 0.123492 + }, + { + "acc": 0.73894439, + "epoch": 1.6151063898205793, + "grad_norm": 7.375, + "learning_rate": 9.787990472299918e-07, + "loss": 1.0630661, + "memory(GiB)": 302.58, + "step": 288800, + "train_speed(iter/s)": 0.123496 + }, + { + "acc": 0.76479654, + "epoch": 1.6152182392935588, + "grad_norm": 5.5625, + "learning_rate": 9.782495597747549e-07, + "loss": 0.91460724, + "memory(GiB)": 302.58, + "step": 288820, + "train_speed(iter/s)": 0.1235 + }, + { + "acc": 0.75286202, + "epoch": 1.6153300887665378, + "grad_norm": 6.96875, + "learning_rate": 9.777002098761361e-07, + "loss": 0.98400955, + "memory(GiB)": 302.58, + "step": 288840, + "train_speed(iter/s)": 0.123504 + }, + { + "acc": 0.73533759, + "epoch": 1.6154419382395173, + "grad_norm": 11.625, + "learning_rate": 9.771509975529248e-07, + "loss": 1.05220442, + "memory(GiB)": 302.58, + "step": 288860, + "train_speed(iter/s)": 0.123508 + }, + { + "acc": 0.74529634, + "epoch": 1.6155537877124964, + "grad_norm": 11.1875, + "learning_rate": 9.766019228239048e-07, + "loss": 0.98288631, + "memory(GiB)": 302.58, + "step": 288880, + "train_speed(iter/s)": 0.123512 + }, + { + "acc": 0.74492488, + "epoch": 1.6156656371854758, + "grad_norm": 7.71875, + "learning_rate": 9.76052985707857e-07, + "loss": 1.01416454, + "memory(GiB)": 302.58, + "step": 288900, + "train_speed(iter/s)": 0.123516 + }, + { + "acc": 0.75244551, + "epoch": 1.6157774866584549, + "grad_norm": 6.0, + "learning_rate": 9.755041862235559e-07, + "loss": 0.98156767, + "memory(GiB)": 302.58, + "step": 288920, + "train_speed(iter/s)": 0.12352 + }, + { + "acc": 0.74716625, + "epoch": 1.6158893361314344, + "grad_norm": 10.5625, + "learning_rate": 9.74955524389774e-07, + "loss": 0.98801308, + "memory(GiB)": 302.58, + "step": 288940, + "train_speed(iter/s)": 0.123524 + }, + { + "acc": 0.76289268, + "epoch": 1.6160011856044134, + "grad_norm": 6.625, + "learning_rate": 9.74407000225276e-07, + "loss": 0.95533743, + "memory(GiB)": 302.58, + "step": 288960, + "train_speed(iter/s)": 0.123528 + }, + { + "acc": 0.74952707, + "epoch": 1.616113035077393, + "grad_norm": 8.0625, + "learning_rate": 9.738586137488237e-07, + "loss": 0.99422235, + "memory(GiB)": 302.58, + "step": 288980, + "train_speed(iter/s)": 0.123532 + }, + { + "acc": 0.73238621, + "epoch": 1.616224884550372, + "grad_norm": 5.59375, + "learning_rate": 9.73310364979173e-07, + "loss": 1.06010799, + "memory(GiB)": 302.58, + "step": 289000, + "train_speed(iter/s)": 0.123536 + }, + { + "acc": 0.74147148, + "epoch": 1.6163367340233514, + "grad_norm": 7.90625, + "learning_rate": 9.727622539350767e-07, + "loss": 1.0098011, + "memory(GiB)": 302.58, + "step": 289020, + "train_speed(iter/s)": 0.12354 + }, + { + "acc": 0.74089704, + "epoch": 1.6164485834963307, + "grad_norm": 7.28125, + "learning_rate": 9.722142806352808e-07, + "loss": 1.03733606, + "memory(GiB)": 302.58, + "step": 289040, + "train_speed(iter/s)": 0.123544 + }, + { + "acc": 0.73724294, + "epoch": 1.61656043296931, + "grad_norm": 5.75, + "learning_rate": 9.716664450985287e-07, + "loss": 1.04970016, + "memory(GiB)": 302.58, + "step": 289060, + "train_speed(iter/s)": 0.123548 + }, + { + "acc": 0.73730226, + "epoch": 1.6166722824422892, + "grad_norm": 7.40625, + "learning_rate": 9.711187473435574e-07, + "loss": 1.03010111, + "memory(GiB)": 302.58, + "step": 289080, + "train_speed(iter/s)": 0.123552 + }, + { + "acc": 0.74368272, + "epoch": 1.6167841319152685, + "grad_norm": 5.5625, + "learning_rate": 9.705711873890999e-07, + "loss": 1.02415581, + "memory(GiB)": 302.58, + "step": 289100, + "train_speed(iter/s)": 0.123556 + }, + { + "acc": 0.75923462, + "epoch": 1.6168959813882477, + "grad_norm": 7.09375, + "learning_rate": 9.700237652538852e-07, + "loss": 0.94900084, + "memory(GiB)": 302.58, + "step": 289120, + "train_speed(iter/s)": 0.123561 + }, + { + "acc": 0.74366932, + "epoch": 1.617007830861227, + "grad_norm": 6.9375, + "learning_rate": 9.694764809566365e-07, + "loss": 1.00434513, + "memory(GiB)": 302.58, + "step": 289140, + "train_speed(iter/s)": 0.123564 + }, + { + "acc": 0.76117487, + "epoch": 1.6171196803342063, + "grad_norm": 5.15625, + "learning_rate": 9.689293345160722e-07, + "loss": 0.93477793, + "memory(GiB)": 302.58, + "step": 289160, + "train_speed(iter/s)": 0.123568 + }, + { + "acc": 0.7424614, + "epoch": 1.6172315298071855, + "grad_norm": 8.875, + "learning_rate": 9.683823259509068e-07, + "loss": 0.99698849, + "memory(GiB)": 302.58, + "step": 289180, + "train_speed(iter/s)": 0.123573 + }, + { + "acc": 0.74590244, + "epoch": 1.6173433792801648, + "grad_norm": 7.15625, + "learning_rate": 9.678354552798491e-07, + "loss": 1.0116415, + "memory(GiB)": 302.58, + "step": 289200, + "train_speed(iter/s)": 0.123577 + }, + { + "acc": 0.74806933, + "epoch": 1.617455228753144, + "grad_norm": 8.125, + "learning_rate": 9.672887225216054e-07, + "loss": 0.98173323, + "memory(GiB)": 302.58, + "step": 289220, + "train_speed(iter/s)": 0.123581 + }, + { + "acc": 0.74777989, + "epoch": 1.6175670782261233, + "grad_norm": 6.21875, + "learning_rate": 9.667421276948747e-07, + "loss": 1.00667171, + "memory(GiB)": 302.58, + "step": 289240, + "train_speed(iter/s)": 0.123585 + }, + { + "acc": 0.75211682, + "epoch": 1.6176789276991026, + "grad_norm": 10.0, + "learning_rate": 9.661956708183523e-07, + "loss": 0.96835661, + "memory(GiB)": 302.58, + "step": 289260, + "train_speed(iter/s)": 0.123589 + }, + { + "acc": 0.76387506, + "epoch": 1.6177907771720819, + "grad_norm": 6.8125, + "learning_rate": 9.65649351910729e-07, + "loss": 0.89996557, + "memory(GiB)": 302.58, + "step": 289280, + "train_speed(iter/s)": 0.123594 + }, + { + "acc": 0.74223456, + "epoch": 1.6179026266450611, + "grad_norm": 6.875, + "learning_rate": 9.651031709906899e-07, + "loss": 1.03398991, + "memory(GiB)": 302.58, + "step": 289300, + "train_speed(iter/s)": 0.123598 + }, + { + "acc": 0.76233392, + "epoch": 1.6180144761180404, + "grad_norm": 6.34375, + "learning_rate": 9.645571280769173e-07, + "loss": 0.91794205, + "memory(GiB)": 302.58, + "step": 289320, + "train_speed(iter/s)": 0.123602 + }, + { + "acc": 0.74693928, + "epoch": 1.6181263255910197, + "grad_norm": 6.8125, + "learning_rate": 9.640112231880866e-07, + "loss": 0.98900871, + "memory(GiB)": 302.58, + "step": 289340, + "train_speed(iter/s)": 0.123606 + }, + { + "acc": 0.76373215, + "epoch": 1.618238175063999, + "grad_norm": 6.75, + "learning_rate": 9.634654563428703e-07, + "loss": 0.92893438, + "memory(GiB)": 302.58, + "step": 289360, + "train_speed(iter/s)": 0.12361 + }, + { + "acc": 0.76776109, + "epoch": 1.6183500245369782, + "grad_norm": 8.1875, + "learning_rate": 9.629198275599345e-07, + "loss": 0.90649185, + "memory(GiB)": 302.58, + "step": 289380, + "train_speed(iter/s)": 0.123614 + }, + { + "acc": 0.76856623, + "epoch": 1.6184618740099574, + "grad_norm": 6.21875, + "learning_rate": 9.623743368579418e-07, + "loss": 0.870821, + "memory(GiB)": 302.58, + "step": 289400, + "train_speed(iter/s)": 0.123618 + }, + { + "acc": 0.75660787, + "epoch": 1.6185737234829367, + "grad_norm": 7.65625, + "learning_rate": 9.6182898425555e-07, + "loss": 0.94650736, + "memory(GiB)": 302.58, + "step": 289420, + "train_speed(iter/s)": 0.123622 + }, + { + "acc": 0.75849881, + "epoch": 1.618685572955916, + "grad_norm": 9.4375, + "learning_rate": 9.612837697714116e-07, + "loss": 0.93836174, + "memory(GiB)": 302.58, + "step": 289440, + "train_speed(iter/s)": 0.123626 + }, + { + "acc": 0.736484, + "epoch": 1.6187974224288952, + "grad_norm": 6.15625, + "learning_rate": 9.607386934241736e-07, + "loss": 1.05525141, + "memory(GiB)": 302.58, + "step": 289460, + "train_speed(iter/s)": 0.12363 + }, + { + "acc": 0.755972, + "epoch": 1.6189092719018745, + "grad_norm": 5.625, + "learning_rate": 9.601937552324814e-07, + "loss": 0.94887686, + "memory(GiB)": 302.58, + "step": 289480, + "train_speed(iter/s)": 0.123635 + }, + { + "acc": 0.76400113, + "epoch": 1.6190211213748538, + "grad_norm": 8.1875, + "learning_rate": 9.596489552149724e-07, + "loss": 0.90828257, + "memory(GiB)": 302.58, + "step": 289500, + "train_speed(iter/s)": 0.123639 + }, + { + "acc": 0.75206943, + "epoch": 1.619132970847833, + "grad_norm": 7.96875, + "learning_rate": 9.59104293390281e-07, + "loss": 0.99230404, + "memory(GiB)": 302.58, + "step": 289520, + "train_speed(iter/s)": 0.123643 + }, + { + "acc": 0.76787972, + "epoch": 1.6192448203208123, + "grad_norm": 6.59375, + "learning_rate": 9.58559769777036e-07, + "loss": 0.90391073, + "memory(GiB)": 302.58, + "step": 289540, + "train_speed(iter/s)": 0.123647 + }, + { + "acc": 0.74458017, + "epoch": 1.6193566697937916, + "grad_norm": 8.125, + "learning_rate": 9.580153843938622e-07, + "loss": 0.9778985, + "memory(GiB)": 302.58, + "step": 289560, + "train_speed(iter/s)": 0.123651 + }, + { + "acc": 0.75460553, + "epoch": 1.6194685192667708, + "grad_norm": 5.9375, + "learning_rate": 9.574711372593792e-07, + "loss": 0.96215611, + "memory(GiB)": 302.58, + "step": 289580, + "train_speed(iter/s)": 0.123655 + }, + { + "acc": 0.75637965, + "epoch": 1.61958036873975, + "grad_norm": 6.25, + "learning_rate": 9.569270283922011e-07, + "loss": 0.95537577, + "memory(GiB)": 302.58, + "step": 289600, + "train_speed(iter/s)": 0.123659 + }, + { + "acc": 0.74888167, + "epoch": 1.6196922182127294, + "grad_norm": 7.1875, + "learning_rate": 9.563830578109395e-07, + "loss": 0.98048897, + "memory(GiB)": 302.58, + "step": 289620, + "train_speed(iter/s)": 0.123663 + }, + { + "acc": 0.74439321, + "epoch": 1.6198040676857086, + "grad_norm": 8.625, + "learning_rate": 9.558392255341992e-07, + "loss": 1.00799923, + "memory(GiB)": 302.58, + "step": 289640, + "train_speed(iter/s)": 0.123667 + }, + { + "acc": 0.74423275, + "epoch": 1.6199159171586879, + "grad_norm": 4.5625, + "learning_rate": 9.552955315805807e-07, + "loss": 1.01691999, + "memory(GiB)": 302.58, + "step": 289660, + "train_speed(iter/s)": 0.123671 + }, + { + "acc": 0.75020924, + "epoch": 1.6200277666316671, + "grad_norm": 6.375, + "learning_rate": 9.547519759686797e-07, + "loss": 0.97285852, + "memory(GiB)": 302.58, + "step": 289680, + "train_speed(iter/s)": 0.123675 + }, + { + "acc": 0.76942105, + "epoch": 1.6201396161046464, + "grad_norm": 4.46875, + "learning_rate": 9.54208558717089e-07, + "loss": 0.90993185, + "memory(GiB)": 302.58, + "step": 289700, + "train_speed(iter/s)": 0.123679 + }, + { + "acc": 0.73340964, + "epoch": 1.6202514655776257, + "grad_norm": 6.09375, + "learning_rate": 9.536652798443946e-07, + "loss": 1.05414944, + "memory(GiB)": 302.58, + "step": 289720, + "train_speed(iter/s)": 0.123683 + }, + { + "acc": 0.75686769, + "epoch": 1.620363315050605, + "grad_norm": 4.96875, + "learning_rate": 9.531221393691787e-07, + "loss": 0.93160467, + "memory(GiB)": 302.58, + "step": 289740, + "train_speed(iter/s)": 0.123687 + }, + { + "acc": 0.76153326, + "epoch": 1.6204751645235842, + "grad_norm": 6.84375, + "learning_rate": 9.525791373100174e-07, + "loss": 0.92738991, + "memory(GiB)": 302.58, + "step": 289760, + "train_speed(iter/s)": 0.123691 + }, + { + "acc": 0.7622324, + "epoch": 1.6205870139965635, + "grad_norm": 9.1875, + "learning_rate": 9.520362736854838e-07, + "loss": 0.94489441, + "memory(GiB)": 302.58, + "step": 289780, + "train_speed(iter/s)": 0.123694 + }, + { + "acc": 0.74112291, + "epoch": 1.6206988634695427, + "grad_norm": 6.0625, + "learning_rate": 9.514935485141441e-07, + "loss": 1.02238054, + "memory(GiB)": 302.58, + "step": 289800, + "train_speed(iter/s)": 0.123698 + }, + { + "acc": 0.7311492, + "epoch": 1.620810712942522, + "grad_norm": 6.78125, + "learning_rate": 9.509509618145635e-07, + "loss": 1.07200403, + "memory(GiB)": 302.58, + "step": 289820, + "train_speed(iter/s)": 0.123702 + }, + { + "acc": 0.74593091, + "epoch": 1.6209225624155013, + "grad_norm": 5.78125, + "learning_rate": 9.504085136052993e-07, + "loss": 1.02800169, + "memory(GiB)": 302.58, + "step": 289840, + "train_speed(iter/s)": 0.123706 + }, + { + "acc": 0.73760357, + "epoch": 1.6210344118884805, + "grad_norm": 4.84375, + "learning_rate": 9.49866203904905e-07, + "loss": 1.05034952, + "memory(GiB)": 302.58, + "step": 289860, + "train_speed(iter/s)": 0.12371 + }, + { + "acc": 0.7651197, + "epoch": 1.6211462613614598, + "grad_norm": 6.59375, + "learning_rate": 9.493240327319292e-07, + "loss": 0.92114019, + "memory(GiB)": 302.58, + "step": 289880, + "train_speed(iter/s)": 0.123714 + }, + { + "acc": 0.75236206, + "epoch": 1.621258110834439, + "grad_norm": 7.3125, + "learning_rate": 9.487820001049158e-07, + "loss": 0.96721754, + "memory(GiB)": 302.58, + "step": 289900, + "train_speed(iter/s)": 0.123719 + }, + { + "acc": 0.72577305, + "epoch": 1.6213699603074183, + "grad_norm": 8.6875, + "learning_rate": 9.48240106042404e-07, + "loss": 1.09484053, + "memory(GiB)": 302.58, + "step": 289920, + "train_speed(iter/s)": 0.123722 + }, + { + "acc": 0.7509306, + "epoch": 1.6214818097803976, + "grad_norm": 6.40625, + "learning_rate": 9.476983505629283e-07, + "loss": 0.98531733, + "memory(GiB)": 302.58, + "step": 289940, + "train_speed(iter/s)": 0.123726 + }, + { + "acc": 0.74267964, + "epoch": 1.6215936592533768, + "grad_norm": 5.5625, + "learning_rate": 9.471567336850185e-07, + "loss": 1.00207758, + "memory(GiB)": 302.58, + "step": 289960, + "train_speed(iter/s)": 0.123731 + }, + { + "acc": 0.76080151, + "epoch": 1.621705508726356, + "grad_norm": 5.96875, + "learning_rate": 9.466152554272001e-07, + "loss": 0.93298035, + "memory(GiB)": 302.58, + "step": 289980, + "train_speed(iter/s)": 0.123734 + }, + { + "acc": 0.75157433, + "epoch": 1.6218173581993354, + "grad_norm": 5.3125, + "learning_rate": 9.460739158079923e-07, + "loss": 0.96915207, + "memory(GiB)": 302.58, + "step": 290000, + "train_speed(iter/s)": 0.123738 + }, + { + "epoch": 1.6218173581993354, + "eval_acc": 0.7069004359709185, + "eval_loss": 1.0118260383605957, + "eval_runtime": 7547.4374, + "eval_samples_per_second": 9.975, + "eval_steps_per_second": 9.975, + "step": 290000 + }, + { + "acc": 0.75492129, + "epoch": 1.6219292076723146, + "grad_norm": 8.3125, + "learning_rate": 9.455327148459114e-07, + "loss": 0.96668062, + "memory(GiB)": 302.58, + "step": 290020, + "train_speed(iter/s)": 0.123338 + }, + { + "acc": 0.74320993, + "epoch": 1.622041057145294, + "grad_norm": 9.75, + "learning_rate": 9.449916525594683e-07, + "loss": 1.0000576, + "memory(GiB)": 302.58, + "step": 290040, + "train_speed(iter/s)": 0.123343 + }, + { + "acc": 0.74718409, + "epoch": 1.6221529066182732, + "grad_norm": 7.09375, + "learning_rate": 9.444507289671673e-07, + "loss": 1.00198965, + "memory(GiB)": 302.58, + "step": 290060, + "train_speed(iter/s)": 0.123347 + }, + { + "acc": 0.72745543, + "epoch": 1.6222647560912524, + "grad_norm": 5.375, + "learning_rate": 9.439099440875121e-07, + "loss": 1.1039196, + "memory(GiB)": 302.58, + "step": 290080, + "train_speed(iter/s)": 0.123351 + }, + { + "acc": 0.7469893, + "epoch": 1.6223766055642317, + "grad_norm": 7.4375, + "learning_rate": 9.433692979389985e-07, + "loss": 0.9948432, + "memory(GiB)": 302.58, + "step": 290100, + "train_speed(iter/s)": 0.123355 + }, + { + "acc": 0.76452498, + "epoch": 1.622488455037211, + "grad_norm": 6.4375, + "learning_rate": 9.428287905401184e-07, + "loss": 0.92109251, + "memory(GiB)": 302.58, + "step": 290120, + "train_speed(iter/s)": 0.123359 + }, + { + "acc": 0.76722074, + "epoch": 1.6226003045101902, + "grad_norm": 9.75, + "learning_rate": 9.422884219093587e-07, + "loss": 0.92133007, + "memory(GiB)": 302.58, + "step": 290140, + "train_speed(iter/s)": 0.123363 + }, + { + "acc": 0.76871872, + "epoch": 1.6227121539831695, + "grad_norm": 8.5625, + "learning_rate": 9.41748192065201e-07, + "loss": 0.89637985, + "memory(GiB)": 302.58, + "step": 290160, + "train_speed(iter/s)": 0.123367 + }, + { + "acc": 0.75581369, + "epoch": 1.6228240034561487, + "grad_norm": 7.71875, + "learning_rate": 9.412081010261242e-07, + "loss": 0.95878334, + "memory(GiB)": 302.58, + "step": 290180, + "train_speed(iter/s)": 0.123371 + }, + { + "acc": 0.74416518, + "epoch": 1.622935852929128, + "grad_norm": 7.75, + "learning_rate": 9.406681488106001e-07, + "loss": 1.0172677, + "memory(GiB)": 302.58, + "step": 290200, + "train_speed(iter/s)": 0.123375 + }, + { + "acc": 0.73264189, + "epoch": 1.6230477024021073, + "grad_norm": 7.65625, + "learning_rate": 9.401283354370971e-07, + "loss": 1.05857525, + "memory(GiB)": 302.58, + "step": 290220, + "train_speed(iter/s)": 0.123379 + }, + { + "acc": 0.74462571, + "epoch": 1.6231595518750865, + "grad_norm": 8.25, + "learning_rate": 9.395886609240784e-07, + "loss": 1.02868595, + "memory(GiB)": 302.58, + "step": 290240, + "train_speed(iter/s)": 0.123383 + }, + { + "acc": 0.74777088, + "epoch": 1.6232714013480658, + "grad_norm": 6.84375, + "learning_rate": 9.39049125290003e-07, + "loss": 0.96349287, + "memory(GiB)": 302.58, + "step": 290260, + "train_speed(iter/s)": 0.123387 + }, + { + "acc": 0.76180005, + "epoch": 1.623383250821045, + "grad_norm": 7.46875, + "learning_rate": 9.38509728553324e-07, + "loss": 0.92405567, + "memory(GiB)": 302.58, + "step": 290280, + "train_speed(iter/s)": 0.123392 + }, + { + "acc": 0.75434589, + "epoch": 1.6234951002940243, + "grad_norm": 6.03125, + "learning_rate": 9.379704707324916e-07, + "loss": 0.9781167, + "memory(GiB)": 302.58, + "step": 290300, + "train_speed(iter/s)": 0.123396 + }, + { + "acc": 0.75923953, + "epoch": 1.6236069497670036, + "grad_norm": 5.15625, + "learning_rate": 9.374313518459488e-07, + "loss": 0.93216419, + "memory(GiB)": 302.58, + "step": 290320, + "train_speed(iter/s)": 0.1234 + }, + { + "acc": 0.73649368, + "epoch": 1.6237187992399829, + "grad_norm": 9.1875, + "learning_rate": 9.368923719121348e-07, + "loss": 1.05246267, + "memory(GiB)": 302.58, + "step": 290340, + "train_speed(iter/s)": 0.123404 + }, + { + "acc": 0.75113478, + "epoch": 1.6238306487129621, + "grad_norm": 5.4375, + "learning_rate": 9.36353530949487e-07, + "loss": 0.97619286, + "memory(GiB)": 302.58, + "step": 290360, + "train_speed(iter/s)": 0.123408 + }, + { + "acc": 0.7538393, + "epoch": 1.6239424981859414, + "grad_norm": 7.28125, + "learning_rate": 9.358148289764335e-07, + "loss": 0.96968479, + "memory(GiB)": 302.58, + "step": 290380, + "train_speed(iter/s)": 0.123412 + }, + { + "acc": 0.75620852, + "epoch": 1.6240543476589206, + "grad_norm": 7.21875, + "learning_rate": 9.352762660114001e-07, + "loss": 0.95927811, + "memory(GiB)": 302.58, + "step": 290400, + "train_speed(iter/s)": 0.123416 + }, + { + "acc": 0.73902054, + "epoch": 1.6241661971319, + "grad_norm": 4.125, + "learning_rate": 9.347378420728076e-07, + "loss": 1.03925619, + "memory(GiB)": 302.58, + "step": 290420, + "train_speed(iter/s)": 0.12342 + }, + { + "acc": 0.7326818, + "epoch": 1.6242780466048792, + "grad_norm": 5.625, + "learning_rate": 9.341995571790713e-07, + "loss": 1.08172503, + "memory(GiB)": 302.58, + "step": 290440, + "train_speed(iter/s)": 0.123424 + }, + { + "acc": 0.77492924, + "epoch": 1.6243898960778584, + "grad_norm": 6.5, + "learning_rate": 9.33661411348602e-07, + "loss": 0.86981106, + "memory(GiB)": 302.58, + "step": 290460, + "train_speed(iter/s)": 0.123428 + }, + { + "acc": 0.74896917, + "epoch": 1.6245017455508377, + "grad_norm": 8.75, + "learning_rate": 9.331234045998072e-07, + "loss": 0.9939539, + "memory(GiB)": 302.58, + "step": 290480, + "train_speed(iter/s)": 0.123432 + }, + { + "acc": 0.7533052, + "epoch": 1.624613595023817, + "grad_norm": 6.21875, + "learning_rate": 9.325855369510873e-07, + "loss": 0.95316353, + "memory(GiB)": 302.58, + "step": 290500, + "train_speed(iter/s)": 0.123436 + }, + { + "acc": 0.76233912, + "epoch": 1.6247254444967962, + "grad_norm": 7.28125, + "learning_rate": 9.320478084208396e-07, + "loss": 0.91588345, + "memory(GiB)": 302.58, + "step": 290520, + "train_speed(iter/s)": 0.12344 + }, + { + "acc": 0.7552454, + "epoch": 1.6248372939697755, + "grad_norm": 8.5, + "learning_rate": 9.315102190274555e-07, + "loss": 0.95743465, + "memory(GiB)": 302.58, + "step": 290540, + "train_speed(iter/s)": 0.123444 + }, + { + "acc": 0.7456223, + "epoch": 1.6249491434427548, + "grad_norm": 6.90625, + "learning_rate": 9.309727687893233e-07, + "loss": 0.99955635, + "memory(GiB)": 302.58, + "step": 290560, + "train_speed(iter/s)": 0.123448 + }, + { + "acc": 0.73626256, + "epoch": 1.625060992915734, + "grad_norm": 8.25, + "learning_rate": 9.304354577248248e-07, + "loss": 1.04165392, + "memory(GiB)": 302.58, + "step": 290580, + "train_speed(iter/s)": 0.123452 + }, + { + "acc": 0.75596018, + "epoch": 1.6251728423887133, + "grad_norm": 6.78125, + "learning_rate": 9.298982858523381e-07, + "loss": 0.94832296, + "memory(GiB)": 302.58, + "step": 290600, + "train_speed(iter/s)": 0.123456 + }, + { + "acc": 0.74414372, + "epoch": 1.6252846918616926, + "grad_norm": 7.65625, + "learning_rate": 9.293612531902341e-07, + "loss": 1.00514212, + "memory(GiB)": 302.58, + "step": 290620, + "train_speed(iter/s)": 0.123461 + }, + { + "acc": 0.75350876, + "epoch": 1.6253965413346718, + "grad_norm": 8.9375, + "learning_rate": 9.288243597568847e-07, + "loss": 0.9562212, + "memory(GiB)": 302.58, + "step": 290640, + "train_speed(iter/s)": 0.123465 + }, + { + "acc": 0.74439034, + "epoch": 1.625508390807651, + "grad_norm": 8.125, + "learning_rate": 9.282876055706513e-07, + "loss": 1.05072947, + "memory(GiB)": 302.58, + "step": 290660, + "train_speed(iter/s)": 0.123469 + }, + { + "acc": 0.73656797, + "epoch": 1.6256202402806303, + "grad_norm": 10.1875, + "learning_rate": 9.277509906498927e-07, + "loss": 1.02913208, + "memory(GiB)": 302.58, + "step": 290680, + "train_speed(iter/s)": 0.123473 + }, + { + "acc": 0.74086776, + "epoch": 1.6257320897536096, + "grad_norm": 6.4375, + "learning_rate": 9.272145150129635e-07, + "loss": 1.03311491, + "memory(GiB)": 302.58, + "step": 290700, + "train_speed(iter/s)": 0.123477 + }, + { + "acc": 0.7567028, + "epoch": 1.6258439392265889, + "grad_norm": 5.84375, + "learning_rate": 9.26678178678212e-07, + "loss": 0.94300823, + "memory(GiB)": 302.58, + "step": 290720, + "train_speed(iter/s)": 0.123481 + }, + { + "acc": 0.73998461, + "epoch": 1.6259557886995681, + "grad_norm": 7.15625, + "learning_rate": 9.261419816639827e-07, + "loss": 1.02609377, + "memory(GiB)": 302.58, + "step": 290740, + "train_speed(iter/s)": 0.123485 + }, + { + "acc": 0.74104757, + "epoch": 1.6260676381725474, + "grad_norm": 9.625, + "learning_rate": 9.256059239886156e-07, + "loss": 1.01864862, + "memory(GiB)": 302.58, + "step": 290760, + "train_speed(iter/s)": 0.123489 + }, + { + "acc": 0.73465466, + "epoch": 1.6261794876455267, + "grad_norm": 7.1875, + "learning_rate": 9.250700056704459e-07, + "loss": 1.07226915, + "memory(GiB)": 302.58, + "step": 290780, + "train_speed(iter/s)": 0.123493 + }, + { + "acc": 0.74225798, + "epoch": 1.626291337118506, + "grad_norm": 8.5, + "learning_rate": 9.24534226727803e-07, + "loss": 0.99523354, + "memory(GiB)": 302.58, + "step": 290800, + "train_speed(iter/s)": 0.123497 + }, + { + "acc": 0.74683614, + "epoch": 1.6264031865914852, + "grad_norm": 8.375, + "learning_rate": 9.239985871790125e-07, + "loss": 0.99809303, + "memory(GiB)": 302.58, + "step": 290820, + "train_speed(iter/s)": 0.123501 + }, + { + "acc": 0.74859495, + "epoch": 1.6265150360644645, + "grad_norm": 6.4375, + "learning_rate": 9.234630870423944e-07, + "loss": 0.9760787, + "memory(GiB)": 302.58, + "step": 290840, + "train_speed(iter/s)": 0.123505 + }, + { + "acc": 0.73152065, + "epoch": 1.6266268855374437, + "grad_norm": 7.8125, + "learning_rate": 9.229277263362657e-07, + "loss": 1.07104959, + "memory(GiB)": 302.58, + "step": 290860, + "train_speed(iter/s)": 0.123509 + }, + { + "acc": 0.77514238, + "epoch": 1.626738735010423, + "grad_norm": 9.375, + "learning_rate": 9.223925050789356e-07, + "loss": 0.88013468, + "memory(GiB)": 302.58, + "step": 290880, + "train_speed(iter/s)": 0.123513 + }, + { + "acc": 0.76229343, + "epoch": 1.6268505844834023, + "grad_norm": 6.5, + "learning_rate": 9.218574232887123e-07, + "loss": 0.93643456, + "memory(GiB)": 302.58, + "step": 290900, + "train_speed(iter/s)": 0.123517 + }, + { + "acc": 0.75501266, + "epoch": 1.6269624339563815, + "grad_norm": 8.6875, + "learning_rate": 9.213224809838972e-07, + "loss": 0.95937214, + "memory(GiB)": 302.58, + "step": 290920, + "train_speed(iter/s)": 0.123521 + }, + { + "acc": 0.74807978, + "epoch": 1.6270742834293608, + "grad_norm": 6.40625, + "learning_rate": 9.207876781827845e-07, + "loss": 0.99628935, + "memory(GiB)": 302.58, + "step": 290940, + "train_speed(iter/s)": 0.123525 + }, + { + "acc": 0.74384766, + "epoch": 1.62718613290234, + "grad_norm": 8.6875, + "learning_rate": 9.202530149036698e-07, + "loss": 1.00613937, + "memory(GiB)": 302.58, + "step": 290960, + "train_speed(iter/s)": 0.123529 + }, + { + "acc": 0.72462134, + "epoch": 1.6272979823753193, + "grad_norm": 8.25, + "learning_rate": 9.197184911648382e-07, + "loss": 1.09885416, + "memory(GiB)": 302.58, + "step": 290980, + "train_speed(iter/s)": 0.123533 + }, + { + "acc": 0.75469584, + "epoch": 1.6274098318482986, + "grad_norm": 5.65625, + "learning_rate": 9.191841069845731e-07, + "loss": 0.96672859, + "memory(GiB)": 302.58, + "step": 291000, + "train_speed(iter/s)": 0.123537 + }, + { + "acc": 0.7649229, + "epoch": 1.6275216813212778, + "grad_norm": 9.25, + "learning_rate": 9.186498623811508e-07, + "loss": 0.9300868, + "memory(GiB)": 302.58, + "step": 291020, + "train_speed(iter/s)": 0.123541 + }, + { + "acc": 0.75744653, + "epoch": 1.627633530794257, + "grad_norm": 8.875, + "learning_rate": 9.18115757372845e-07, + "loss": 0.95912809, + "memory(GiB)": 302.58, + "step": 291040, + "train_speed(iter/s)": 0.123544 + }, + { + "acc": 0.75685053, + "epoch": 1.6277453802672364, + "grad_norm": 7.9375, + "learning_rate": 9.175817919779234e-07, + "loss": 0.93720493, + "memory(GiB)": 302.58, + "step": 291060, + "train_speed(iter/s)": 0.123549 + }, + { + "acc": 0.73451743, + "epoch": 1.6278572297402156, + "grad_norm": 7.90625, + "learning_rate": 9.170479662146498e-07, + "loss": 1.06225815, + "memory(GiB)": 302.58, + "step": 291080, + "train_speed(iter/s)": 0.123553 + }, + { + "acc": 0.76548586, + "epoch": 1.627969079213195, + "grad_norm": 8.25, + "learning_rate": 9.165142801012822e-07, + "loss": 0.89717112, + "memory(GiB)": 302.58, + "step": 291100, + "train_speed(iter/s)": 0.123557 + }, + { + "acc": 0.75972824, + "epoch": 1.6280809286861742, + "grad_norm": 9.5625, + "learning_rate": 9.159807336560744e-07, + "loss": 0.95546503, + "memory(GiB)": 302.58, + "step": 291120, + "train_speed(iter/s)": 0.12356 + }, + { + "acc": 0.76025038, + "epoch": 1.6281927781591534, + "grad_norm": 5.875, + "learning_rate": 9.154473268972763e-07, + "loss": 0.94016218, + "memory(GiB)": 302.58, + "step": 291140, + "train_speed(iter/s)": 0.123565 + }, + { + "acc": 0.74353285, + "epoch": 1.6283046276321327, + "grad_norm": 6.84375, + "learning_rate": 9.149140598431305e-07, + "loss": 1.00225239, + "memory(GiB)": 302.58, + "step": 291160, + "train_speed(iter/s)": 0.123569 + }, + { + "acc": 0.75945425, + "epoch": 1.628416477105112, + "grad_norm": 7.375, + "learning_rate": 9.14380932511878e-07, + "loss": 0.96264849, + "memory(GiB)": 302.58, + "step": 291180, + "train_speed(iter/s)": 0.123572 + }, + { + "acc": 0.73850584, + "epoch": 1.6285283265780912, + "grad_norm": 8.875, + "learning_rate": 9.138479449217525e-07, + "loss": 1.04287996, + "memory(GiB)": 302.58, + "step": 291200, + "train_speed(iter/s)": 0.123576 + }, + { + "acc": 0.74532027, + "epoch": 1.6286401760510705, + "grad_norm": 8.375, + "learning_rate": 9.133150970909832e-07, + "loss": 1.00914402, + "memory(GiB)": 302.58, + "step": 291220, + "train_speed(iter/s)": 0.12358 + }, + { + "acc": 0.74285789, + "epoch": 1.6287520255240497, + "grad_norm": 10.875, + "learning_rate": 9.127823890377968e-07, + "loss": 1.01778708, + "memory(GiB)": 302.58, + "step": 291240, + "train_speed(iter/s)": 0.123584 + }, + { + "acc": 0.7480792, + "epoch": 1.628863874997029, + "grad_norm": 7.34375, + "learning_rate": 9.122498207804131e-07, + "loss": 0.99425039, + "memory(GiB)": 302.58, + "step": 291260, + "train_speed(iter/s)": 0.123588 + }, + { + "acc": 0.74688559, + "epoch": 1.6289757244700083, + "grad_norm": 9.9375, + "learning_rate": 9.117173923370476e-07, + "loss": 0.9978301, + "memory(GiB)": 302.58, + "step": 291280, + "train_speed(iter/s)": 0.123591 + }, + { + "acc": 0.75479641, + "epoch": 1.6290875739429875, + "grad_norm": 6.03125, + "learning_rate": 9.111851037259106e-07, + "loss": 0.95199108, + "memory(GiB)": 302.58, + "step": 291300, + "train_speed(iter/s)": 0.123596 + }, + { + "acc": 0.75112743, + "epoch": 1.6291994234159668, + "grad_norm": 5.6875, + "learning_rate": 9.106529549652088e-07, + "loss": 1.0002677, + "memory(GiB)": 302.58, + "step": 291320, + "train_speed(iter/s)": 0.1236 + }, + { + "acc": 0.76572447, + "epoch": 1.629311272888946, + "grad_norm": 9.0625, + "learning_rate": 9.101209460731425e-07, + "loss": 0.92205095, + "memory(GiB)": 302.58, + "step": 291340, + "train_speed(iter/s)": 0.123604 + }, + { + "acc": 0.75984936, + "epoch": 1.6294231223619253, + "grad_norm": 7.90625, + "learning_rate": 9.095890770679089e-07, + "loss": 0.94717102, + "memory(GiB)": 302.58, + "step": 291360, + "train_speed(iter/s)": 0.123608 + }, + { + "acc": 0.74356666, + "epoch": 1.6295349718349046, + "grad_norm": 6.53125, + "learning_rate": 9.090573479676989e-07, + "loss": 1.01459351, + "memory(GiB)": 302.58, + "step": 291380, + "train_speed(iter/s)": 0.123612 + }, + { + "acc": 0.74245195, + "epoch": 1.6296468213078839, + "grad_norm": 6.09375, + "learning_rate": 9.085257587906992e-07, + "loss": 1.02369061, + "memory(GiB)": 302.58, + "step": 291400, + "train_speed(iter/s)": 0.123615 + }, + { + "acc": 0.76378393, + "epoch": 1.6297586707808631, + "grad_norm": 4.46875, + "learning_rate": 9.07994309555093e-07, + "loss": 0.9375659, + "memory(GiB)": 302.58, + "step": 291420, + "train_speed(iter/s)": 0.123619 + }, + { + "acc": 0.74488978, + "epoch": 1.6298705202538424, + "grad_norm": 7.5625, + "learning_rate": 9.074630002790563e-07, + "loss": 0.99890184, + "memory(GiB)": 302.58, + "step": 291440, + "train_speed(iter/s)": 0.123623 + }, + { + "acc": 0.74108863, + "epoch": 1.6299823697268216, + "grad_norm": 6.40625, + "learning_rate": 9.069318309807628e-07, + "loss": 1.04150295, + "memory(GiB)": 302.58, + "step": 291460, + "train_speed(iter/s)": 0.123627 + }, + { + "acc": 0.75128355, + "epoch": 1.630094219199801, + "grad_norm": 8.0625, + "learning_rate": 9.064008016783787e-07, + "loss": 1.00430298, + "memory(GiB)": 302.58, + "step": 291480, + "train_speed(iter/s)": 0.123631 + }, + { + "acc": 0.75791245, + "epoch": 1.6302060686727802, + "grad_norm": 7.90625, + "learning_rate": 9.058699123900666e-07, + "loss": 0.95054274, + "memory(GiB)": 302.58, + "step": 291500, + "train_speed(iter/s)": 0.123635 + }, + { + "acc": 0.76176457, + "epoch": 1.6303179181457594, + "grad_norm": 8.5625, + "learning_rate": 9.05339163133987e-07, + "loss": 0.94425631, + "memory(GiB)": 302.58, + "step": 291520, + "train_speed(iter/s)": 0.123639 + }, + { + "acc": 0.7413548, + "epoch": 1.6304297676187387, + "grad_norm": 6.71875, + "learning_rate": 9.048085539282919e-07, + "loss": 1.01451855, + "memory(GiB)": 302.58, + "step": 291540, + "train_speed(iter/s)": 0.123643 + }, + { + "acc": 0.74760294, + "epoch": 1.630541617091718, + "grad_norm": 6.78125, + "learning_rate": 9.042780847911292e-07, + "loss": 1.0040637, + "memory(GiB)": 302.58, + "step": 291560, + "train_speed(iter/s)": 0.123647 + }, + { + "acc": 0.74966726, + "epoch": 1.6306534665646972, + "grad_norm": 7.28125, + "learning_rate": 9.037477557406437e-07, + "loss": 0.97920179, + "memory(GiB)": 302.58, + "step": 291580, + "train_speed(iter/s)": 0.123651 + }, + { + "acc": 0.75887656, + "epoch": 1.6307653160376765, + "grad_norm": 8.875, + "learning_rate": 9.032175667949738e-07, + "loss": 0.95033436, + "memory(GiB)": 302.58, + "step": 291600, + "train_speed(iter/s)": 0.123655 + }, + { + "acc": 0.77610173, + "epoch": 1.6308771655106558, + "grad_norm": 6.5, + "learning_rate": 9.026875179722538e-07, + "loss": 0.8769969, + "memory(GiB)": 302.58, + "step": 291620, + "train_speed(iter/s)": 0.123659 + }, + { + "acc": 0.73284869, + "epoch": 1.630989014983635, + "grad_norm": 6.53125, + "learning_rate": 9.021576092906125e-07, + "loss": 1.06473923, + "memory(GiB)": 302.58, + "step": 291640, + "train_speed(iter/s)": 0.123662 + }, + { + "acc": 0.75455394, + "epoch": 1.6311008644566143, + "grad_norm": 6.9375, + "learning_rate": 9.01627840768175e-07, + "loss": 0.96247778, + "memory(GiB)": 302.58, + "step": 291660, + "train_speed(iter/s)": 0.123666 + }, + { + "acc": 0.75983114, + "epoch": 1.6312127139295935, + "grad_norm": 6.75, + "learning_rate": 9.010982124230611e-07, + "loss": 0.94225044, + "memory(GiB)": 302.58, + "step": 291680, + "train_speed(iter/s)": 0.12367 + }, + { + "acc": 0.75080848, + "epoch": 1.6313245634025728, + "grad_norm": 6.78125, + "learning_rate": 9.005687242733858e-07, + "loss": 0.96779747, + "memory(GiB)": 302.58, + "step": 291700, + "train_speed(iter/s)": 0.123674 + }, + { + "acc": 0.77081213, + "epoch": 1.631436412875552, + "grad_norm": 8.125, + "learning_rate": 9.000393763372583e-07, + "loss": 0.90778885, + "memory(GiB)": 302.58, + "step": 291720, + "train_speed(iter/s)": 0.123678 + }, + { + "acc": 0.77589111, + "epoch": 1.6315482623485313, + "grad_norm": 10.75, + "learning_rate": 8.995101686327857e-07, + "loss": 0.86931992, + "memory(GiB)": 302.58, + "step": 291740, + "train_speed(iter/s)": 0.123682 + }, + { + "acc": 0.75468688, + "epoch": 1.6316601118215106, + "grad_norm": 7.90625, + "learning_rate": 8.989811011780659e-07, + "loss": 0.96551943, + "memory(GiB)": 302.58, + "step": 291760, + "train_speed(iter/s)": 0.123686 + }, + { + "acc": 0.75282512, + "epoch": 1.6317719612944899, + "grad_norm": 9.0, + "learning_rate": 8.984521739911972e-07, + "loss": 0.9886405, + "memory(GiB)": 302.58, + "step": 291780, + "train_speed(iter/s)": 0.12369 + }, + { + "acc": 0.74945993, + "epoch": 1.6318838107674691, + "grad_norm": 5.84375, + "learning_rate": 8.979233870902704e-07, + "loss": 0.97594252, + "memory(GiB)": 302.58, + "step": 291800, + "train_speed(iter/s)": 0.123694 + }, + { + "acc": 0.76741438, + "epoch": 1.6319956602404484, + "grad_norm": 7.09375, + "learning_rate": 8.973947404933708e-07, + "loss": 0.87922916, + "memory(GiB)": 302.58, + "step": 291820, + "train_speed(iter/s)": 0.123698 + }, + { + "acc": 0.753228, + "epoch": 1.6321075097134277, + "grad_norm": 5.125, + "learning_rate": 8.968662342185802e-07, + "loss": 0.96715012, + "memory(GiB)": 302.58, + "step": 291840, + "train_speed(iter/s)": 0.123702 + }, + { + "acc": 0.74844494, + "epoch": 1.632219359186407, + "grad_norm": 7.0, + "learning_rate": 8.963378682839751e-07, + "loss": 1.00861778, + "memory(GiB)": 302.58, + "step": 291860, + "train_speed(iter/s)": 0.123706 + }, + { + "acc": 0.75366292, + "epoch": 1.6323312086593862, + "grad_norm": 8.375, + "learning_rate": 8.958096427076274e-07, + "loss": 0.95728073, + "memory(GiB)": 302.58, + "step": 291880, + "train_speed(iter/s)": 0.12371 + }, + { + "acc": 0.75815058, + "epoch": 1.6324430581323655, + "grad_norm": 6.65625, + "learning_rate": 8.95281557507604e-07, + "loss": 0.95405188, + "memory(GiB)": 302.58, + "step": 291900, + "train_speed(iter/s)": 0.123714 + }, + { + "acc": 0.76635714, + "epoch": 1.6325549076053447, + "grad_norm": 5.65625, + "learning_rate": 8.947536127019668e-07, + "loss": 0.90115013, + "memory(GiB)": 302.58, + "step": 291920, + "train_speed(iter/s)": 0.123718 + }, + { + "acc": 0.75431237, + "epoch": 1.632666757078324, + "grad_norm": 7.5, + "learning_rate": 8.942258083087734e-07, + "loss": 0.94271297, + "memory(GiB)": 302.58, + "step": 291940, + "train_speed(iter/s)": 0.123722 + }, + { + "acc": 0.74433703, + "epoch": 1.6327786065513032, + "grad_norm": 10.5, + "learning_rate": 8.936981443460769e-07, + "loss": 1.02172775, + "memory(GiB)": 302.58, + "step": 291960, + "train_speed(iter/s)": 0.123726 + }, + { + "acc": 0.74349904, + "epoch": 1.6328904560242825, + "grad_norm": 7.25, + "learning_rate": 8.931706208319241e-07, + "loss": 1.00100107, + "memory(GiB)": 302.58, + "step": 291980, + "train_speed(iter/s)": 0.12373 + }, + { + "acc": 0.75570884, + "epoch": 1.6330023054972618, + "grad_norm": 7.3125, + "learning_rate": 8.926432377843586e-07, + "loss": 0.98158855, + "memory(GiB)": 302.58, + "step": 292000, + "train_speed(iter/s)": 0.123734 + }, + { + "epoch": 1.6330023054972618, + "eval_acc": 0.7069095065795927, + "eval_loss": 1.0117909908294678, + "eval_runtime": 7518.1103, + "eval_samples_per_second": 10.014, + "eval_steps_per_second": 10.014, + "step": 292000 + }, + { + "acc": 0.74113045, + "epoch": 1.633114154970241, + "grad_norm": 9.25, + "learning_rate": 8.921159952214181e-07, + "loss": 1.02816029, + "memory(GiB)": 302.58, + "step": 292020, + "train_speed(iter/s)": 0.123338 + }, + { + "acc": 0.75636873, + "epoch": 1.6332260044432203, + "grad_norm": 7.25, + "learning_rate": 8.915888931611355e-07, + "loss": 0.95524988, + "memory(GiB)": 302.58, + "step": 292040, + "train_speed(iter/s)": 0.123342 + }, + { + "acc": 0.74863272, + "epoch": 1.6333378539161996, + "grad_norm": 4.1875, + "learning_rate": 8.910619316215413e-07, + "loss": 0.9844162, + "memory(GiB)": 302.58, + "step": 292060, + "train_speed(iter/s)": 0.123346 + }, + { + "acc": 0.74736581, + "epoch": 1.6334497033891788, + "grad_norm": 6.9375, + "learning_rate": 8.905351106206584e-07, + "loss": 0.96960154, + "memory(GiB)": 302.58, + "step": 292080, + "train_speed(iter/s)": 0.12335 + }, + { + "acc": 0.75654526, + "epoch": 1.633561552862158, + "grad_norm": 7.34375, + "learning_rate": 8.90008430176505e-07, + "loss": 0.96397638, + "memory(GiB)": 302.58, + "step": 292100, + "train_speed(iter/s)": 0.123354 + }, + { + "acc": 0.73469839, + "epoch": 1.6336734023351374, + "grad_norm": 7.3125, + "learning_rate": 8.894818903070962e-07, + "loss": 1.061341, + "memory(GiB)": 302.58, + "step": 292120, + "train_speed(iter/s)": 0.123357 + }, + { + "acc": 0.74281073, + "epoch": 1.6337852518081166, + "grad_norm": 9.5625, + "learning_rate": 8.889554910304404e-07, + "loss": 1.01400118, + "memory(GiB)": 302.58, + "step": 292140, + "train_speed(iter/s)": 0.123361 + }, + { + "acc": 0.74186072, + "epoch": 1.6338971012810959, + "grad_norm": 8.25, + "learning_rate": 8.884292323645427e-07, + "loss": 1.04628925, + "memory(GiB)": 302.58, + "step": 292160, + "train_speed(iter/s)": 0.123365 + }, + { + "acc": 0.75060134, + "epoch": 1.6340089507540752, + "grad_norm": 9.6875, + "learning_rate": 8.879031143274031e-07, + "loss": 0.98369951, + "memory(GiB)": 302.58, + "step": 292180, + "train_speed(iter/s)": 0.123369 + }, + { + "acc": 0.74270706, + "epoch": 1.6341208002270544, + "grad_norm": 8.125, + "learning_rate": 8.873771369370144e-07, + "loss": 1.01570692, + "memory(GiB)": 302.58, + "step": 292200, + "train_speed(iter/s)": 0.123373 + }, + { + "acc": 0.75047994, + "epoch": 1.6342326497000337, + "grad_norm": 9.625, + "learning_rate": 8.868513002113699e-07, + "loss": 0.98342371, + "memory(GiB)": 302.58, + "step": 292220, + "train_speed(iter/s)": 0.123377 + }, + { + "acc": 0.73681598, + "epoch": 1.634344499173013, + "grad_norm": 8.1875, + "learning_rate": 8.863256041684532e-07, + "loss": 1.04580822, + "memory(GiB)": 302.58, + "step": 292240, + "train_speed(iter/s)": 0.123381 + }, + { + "acc": 0.7529942, + "epoch": 1.6344563486459922, + "grad_norm": 6.09375, + "learning_rate": 8.858000488262447e-07, + "loss": 0.9663064, + "memory(GiB)": 302.58, + "step": 292260, + "train_speed(iter/s)": 0.123385 + }, + { + "acc": 0.76952581, + "epoch": 1.6345681981189715, + "grad_norm": 7.21875, + "learning_rate": 8.852746342027207e-07, + "loss": 0.8986845, + "memory(GiB)": 302.58, + "step": 292280, + "train_speed(iter/s)": 0.123389 + }, + { + "acc": 0.76002421, + "epoch": 1.6346800475919507, + "grad_norm": 6.0, + "learning_rate": 8.847493603158514e-07, + "loss": 0.92214966, + "memory(GiB)": 302.58, + "step": 292300, + "train_speed(iter/s)": 0.123393 + }, + { + "acc": 0.75191092, + "epoch": 1.63479189706493, + "grad_norm": 7.0, + "learning_rate": 8.842242271836032e-07, + "loss": 0.97685976, + "memory(GiB)": 302.58, + "step": 292320, + "train_speed(iter/s)": 0.123397 + }, + { + "acc": 0.75615449, + "epoch": 1.6349037465379093, + "grad_norm": 9.3125, + "learning_rate": 8.836992348239371e-07, + "loss": 0.97916965, + "memory(GiB)": 302.58, + "step": 292340, + "train_speed(iter/s)": 0.123401 + }, + { + "acc": 0.75742946, + "epoch": 1.6350155960108885, + "grad_norm": 7.78125, + "learning_rate": 8.831743832548101e-07, + "loss": 0.93231192, + "memory(GiB)": 302.58, + "step": 292360, + "train_speed(iter/s)": 0.123405 + }, + { + "acc": 0.74825425, + "epoch": 1.6351274454838678, + "grad_norm": 7.6875, + "learning_rate": 8.826496724941713e-07, + "loss": 0.99362555, + "memory(GiB)": 302.58, + "step": 292380, + "train_speed(iter/s)": 0.123409 + }, + { + "acc": 0.75066023, + "epoch": 1.635239294956847, + "grad_norm": 8.875, + "learning_rate": 8.821251025599715e-07, + "loss": 0.96963253, + "memory(GiB)": 302.58, + "step": 292400, + "train_speed(iter/s)": 0.123413 + }, + { + "acc": 0.76163301, + "epoch": 1.6353511444298263, + "grad_norm": 7.65625, + "learning_rate": 8.816006734701498e-07, + "loss": 0.92970867, + "memory(GiB)": 302.58, + "step": 292420, + "train_speed(iter/s)": 0.123417 + }, + { + "acc": 0.74666247, + "epoch": 1.6354629939028056, + "grad_norm": 4.84375, + "learning_rate": 8.810763852426446e-07, + "loss": 1.00046148, + "memory(GiB)": 302.58, + "step": 292440, + "train_speed(iter/s)": 0.123421 + }, + { + "acc": 0.75729113, + "epoch": 1.6355748433757848, + "grad_norm": 11.5625, + "learning_rate": 8.805522378953879e-07, + "loss": 0.99171047, + "memory(GiB)": 302.58, + "step": 292460, + "train_speed(iter/s)": 0.123425 + }, + { + "acc": 0.75392327, + "epoch": 1.6356866928487641, + "grad_norm": 5.78125, + "learning_rate": 8.800282314463066e-07, + "loss": 0.9711916, + "memory(GiB)": 302.58, + "step": 292480, + "train_speed(iter/s)": 0.123429 + }, + { + "acc": 0.75899773, + "epoch": 1.6357985423217434, + "grad_norm": 7.5, + "learning_rate": 8.795043659133246e-07, + "loss": 0.94257259, + "memory(GiB)": 302.58, + "step": 292500, + "train_speed(iter/s)": 0.123433 + }, + { + "acc": 0.73882461, + "epoch": 1.6359103917947226, + "grad_norm": 8.0, + "learning_rate": 8.789806413143581e-07, + "loss": 1.02399921, + "memory(GiB)": 302.58, + "step": 292520, + "train_speed(iter/s)": 0.123437 + }, + { + "acc": 0.74411607, + "epoch": 1.636022241267702, + "grad_norm": 11.375, + "learning_rate": 8.784570576673218e-07, + "loss": 1.00863247, + "memory(GiB)": 302.58, + "step": 292540, + "train_speed(iter/s)": 0.12344 + }, + { + "acc": 0.74676166, + "epoch": 1.6361340907406812, + "grad_norm": 8.25, + "learning_rate": 8.77933614990123e-07, + "loss": 0.99221601, + "memory(GiB)": 302.58, + "step": 292560, + "train_speed(iter/s)": 0.123444 + }, + { + "acc": 0.75085583, + "epoch": 1.6362459402136604, + "grad_norm": 6.59375, + "learning_rate": 8.77410313300665e-07, + "loss": 0.99578476, + "memory(GiB)": 302.58, + "step": 292580, + "train_speed(iter/s)": 0.123448 + }, + { + "acc": 0.72575989, + "epoch": 1.6363577896866397, + "grad_norm": 7.0625, + "learning_rate": 8.768871526168471e-07, + "loss": 1.1087718, + "memory(GiB)": 302.58, + "step": 292600, + "train_speed(iter/s)": 0.123452 + }, + { + "acc": 0.74115491, + "epoch": 1.636469639159619, + "grad_norm": 7.21875, + "learning_rate": 8.763641329565626e-07, + "loss": 1.01661005, + "memory(GiB)": 302.58, + "step": 292620, + "train_speed(iter/s)": 0.123456 + }, + { + "acc": 0.77123461, + "epoch": 1.6365814886325982, + "grad_norm": 8.125, + "learning_rate": 8.758412543376988e-07, + "loss": 0.89135981, + "memory(GiB)": 302.58, + "step": 292640, + "train_speed(iter/s)": 0.12346 + }, + { + "acc": 0.73594255, + "epoch": 1.6366933381055775, + "grad_norm": 6.03125, + "learning_rate": 8.753185167781431e-07, + "loss": 1.04211092, + "memory(GiB)": 302.58, + "step": 292660, + "train_speed(iter/s)": 0.123464 + }, + { + "acc": 0.75527534, + "epoch": 1.6368051875785568, + "grad_norm": 6.0, + "learning_rate": 8.747959202957729e-07, + "loss": 0.96000586, + "memory(GiB)": 302.58, + "step": 292680, + "train_speed(iter/s)": 0.123468 + }, + { + "acc": 0.74455094, + "epoch": 1.636917037051536, + "grad_norm": 9.125, + "learning_rate": 8.742734649084628e-07, + "loss": 0.99228716, + "memory(GiB)": 302.58, + "step": 292700, + "train_speed(iter/s)": 0.123472 + }, + { + "acc": 0.73613558, + "epoch": 1.6370288865245153, + "grad_norm": 6.875, + "learning_rate": 8.737511506340823e-07, + "loss": 1.0684515, + "memory(GiB)": 302.58, + "step": 292720, + "train_speed(iter/s)": 0.123476 + }, + { + "acc": 0.76730523, + "epoch": 1.6371407359974945, + "grad_norm": 7.75, + "learning_rate": 8.732289774904973e-07, + "loss": 0.91546583, + "memory(GiB)": 302.58, + "step": 292740, + "train_speed(iter/s)": 0.12348 + }, + { + "acc": 0.75001755, + "epoch": 1.6372525854704738, + "grad_norm": 7.8125, + "learning_rate": 8.727069454955661e-07, + "loss": 0.96983662, + "memory(GiB)": 302.58, + "step": 292760, + "train_speed(iter/s)": 0.123484 + }, + { + "acc": 0.73704147, + "epoch": 1.637364434943453, + "grad_norm": 6.4375, + "learning_rate": 8.72185054667145e-07, + "loss": 1.04264383, + "memory(GiB)": 302.58, + "step": 292780, + "train_speed(iter/s)": 0.123488 + }, + { + "acc": 0.74523568, + "epoch": 1.6374762844164323, + "grad_norm": 6.0, + "learning_rate": 8.716633050230838e-07, + "loss": 0.98111124, + "memory(GiB)": 302.58, + "step": 292800, + "train_speed(iter/s)": 0.123492 + }, + { + "acc": 0.74816933, + "epoch": 1.6375881338894116, + "grad_norm": 6.0, + "learning_rate": 8.711416965812286e-07, + "loss": 0.96991167, + "memory(GiB)": 302.58, + "step": 292820, + "train_speed(iter/s)": 0.123496 + }, + { + "acc": 0.76735191, + "epoch": 1.6376999833623909, + "grad_norm": 8.5625, + "learning_rate": 8.70620229359419e-07, + "loss": 0.88035469, + "memory(GiB)": 302.58, + "step": 292840, + "train_speed(iter/s)": 0.123501 + }, + { + "acc": 0.75235305, + "epoch": 1.6378118328353701, + "grad_norm": 9.75, + "learning_rate": 8.70098903375492e-07, + "loss": 0.96710978, + "memory(GiB)": 302.58, + "step": 292860, + "train_speed(iter/s)": 0.123504 + }, + { + "acc": 0.75507855, + "epoch": 1.6379236823083494, + "grad_norm": 10.0, + "learning_rate": 8.695777186472776e-07, + "loss": 0.96590919, + "memory(GiB)": 302.58, + "step": 292880, + "train_speed(iter/s)": 0.123508 + }, + { + "acc": 0.74437437, + "epoch": 1.6380355317813287, + "grad_norm": 9.5625, + "learning_rate": 8.690566751926027e-07, + "loss": 1.00037317, + "memory(GiB)": 302.58, + "step": 292900, + "train_speed(iter/s)": 0.123512 + }, + { + "acc": 0.75003648, + "epoch": 1.638147381254308, + "grad_norm": 5.8125, + "learning_rate": 8.68535773029287e-07, + "loss": 0.99573107, + "memory(GiB)": 302.58, + "step": 292920, + "train_speed(iter/s)": 0.123516 + }, + { + "acc": 0.75473194, + "epoch": 1.6382592307272872, + "grad_norm": 8.0, + "learning_rate": 8.680150121751501e-07, + "loss": 0.96772261, + "memory(GiB)": 302.58, + "step": 292940, + "train_speed(iter/s)": 0.12352 + }, + { + "acc": 0.75497231, + "epoch": 1.6383710802002664, + "grad_norm": 9.9375, + "learning_rate": 8.674943926480017e-07, + "loss": 0.95269384, + "memory(GiB)": 302.58, + "step": 292960, + "train_speed(iter/s)": 0.123524 + }, + { + "acc": 0.7642437, + "epoch": 1.6384829296732457, + "grad_norm": 9.3125, + "learning_rate": 8.669739144656486e-07, + "loss": 0.92186804, + "memory(GiB)": 302.58, + "step": 292980, + "train_speed(iter/s)": 0.123528 + }, + { + "acc": 0.74136763, + "epoch": 1.638594779146225, + "grad_norm": 8.625, + "learning_rate": 8.664535776458933e-07, + "loss": 1.02933559, + "memory(GiB)": 302.58, + "step": 293000, + "train_speed(iter/s)": 0.123532 + }, + { + "acc": 0.75795126, + "epoch": 1.6387066286192042, + "grad_norm": 7.6875, + "learning_rate": 8.65933382206533e-07, + "loss": 0.96305695, + "memory(GiB)": 302.58, + "step": 293020, + "train_speed(iter/s)": 0.123536 + }, + { + "acc": 0.74760842, + "epoch": 1.6388184780921835, + "grad_norm": 7.84375, + "learning_rate": 8.654133281653593e-07, + "loss": 0.98160677, + "memory(GiB)": 302.58, + "step": 293040, + "train_speed(iter/s)": 0.12354 + }, + { + "acc": 0.75506401, + "epoch": 1.6389303275651628, + "grad_norm": 9.9375, + "learning_rate": 8.648934155401605e-07, + "loss": 0.94263401, + "memory(GiB)": 302.58, + "step": 293060, + "train_speed(iter/s)": 0.123544 + }, + { + "acc": 0.75212812, + "epoch": 1.639042177038142, + "grad_norm": 6.53125, + "learning_rate": 8.643736443487188e-07, + "loss": 0.9803793, + "memory(GiB)": 302.58, + "step": 293080, + "train_speed(iter/s)": 0.123547 + }, + { + "acc": 0.75018358, + "epoch": 1.6391540265111213, + "grad_norm": 9.4375, + "learning_rate": 8.638540146088126e-07, + "loss": 0.9894311, + "memory(GiB)": 302.58, + "step": 293100, + "train_speed(iter/s)": 0.123551 + }, + { + "acc": 0.76385756, + "epoch": 1.6392658759841006, + "grad_norm": 8.9375, + "learning_rate": 8.633345263382136e-07, + "loss": 0.92196302, + "memory(GiB)": 302.58, + "step": 293120, + "train_speed(iter/s)": 0.123555 + }, + { + "acc": 0.74415684, + "epoch": 1.6393777254570798, + "grad_norm": 9.0625, + "learning_rate": 8.628151795546918e-07, + "loss": 1.00757437, + "memory(GiB)": 302.58, + "step": 293140, + "train_speed(iter/s)": 0.123559 + }, + { + "acc": 0.74038377, + "epoch": 1.639489574930059, + "grad_norm": 10.3125, + "learning_rate": 8.62295974276009e-07, + "loss": 1.0310832, + "memory(GiB)": 302.58, + "step": 293160, + "train_speed(iter/s)": 0.123563 + }, + { + "acc": 0.75756755, + "epoch": 1.6396014244030384, + "grad_norm": 5.5, + "learning_rate": 8.617769105199231e-07, + "loss": 0.95544519, + "memory(GiB)": 302.58, + "step": 293180, + "train_speed(iter/s)": 0.123567 + }, + { + "acc": 0.76647463, + "epoch": 1.6397132738760176, + "grad_norm": 9.3125, + "learning_rate": 8.612579883041899e-07, + "loss": 0.91406536, + "memory(GiB)": 302.58, + "step": 293200, + "train_speed(iter/s)": 0.123571 + }, + { + "acc": 0.7681282, + "epoch": 1.6398251233489969, + "grad_norm": 6.03125, + "learning_rate": 8.607392076465571e-07, + "loss": 0.90471029, + "memory(GiB)": 302.58, + "step": 293220, + "train_speed(iter/s)": 0.123575 + }, + { + "acc": 0.73984694, + "epoch": 1.6399369728219761, + "grad_norm": 6.46875, + "learning_rate": 8.602205685647685e-07, + "loss": 1.0247694, + "memory(GiB)": 302.58, + "step": 293240, + "train_speed(iter/s)": 0.123579 + }, + { + "acc": 0.75140605, + "epoch": 1.6400488222949554, + "grad_norm": 7.21875, + "learning_rate": 8.597020710765635e-07, + "loss": 0.97397356, + "memory(GiB)": 302.58, + "step": 293260, + "train_speed(iter/s)": 0.123583 + }, + { + "acc": 0.74257364, + "epoch": 1.6401606717679347, + "grad_norm": 8.6875, + "learning_rate": 8.591837151996762e-07, + "loss": 1.03960934, + "memory(GiB)": 302.58, + "step": 293280, + "train_speed(iter/s)": 0.123587 + }, + { + "acc": 0.7532475, + "epoch": 1.640272521240914, + "grad_norm": 7.71875, + "learning_rate": 8.586655009518364e-07, + "loss": 0.9749753, + "memory(GiB)": 302.58, + "step": 293300, + "train_speed(iter/s)": 0.123591 + }, + { + "acc": 0.74378319, + "epoch": 1.6403843707138932, + "grad_norm": 6.90625, + "learning_rate": 8.58147428350768e-07, + "loss": 1.01601353, + "memory(GiB)": 302.58, + "step": 293320, + "train_speed(iter/s)": 0.123595 + }, + { + "acc": 0.75343456, + "epoch": 1.6404962201868725, + "grad_norm": 6.625, + "learning_rate": 8.576294974141908e-07, + "loss": 0.98049679, + "memory(GiB)": 302.58, + "step": 293340, + "train_speed(iter/s)": 0.123599 + }, + { + "acc": 0.73516941, + "epoch": 1.6406080696598517, + "grad_norm": 5.59375, + "learning_rate": 8.571117081598201e-07, + "loss": 1.04788694, + "memory(GiB)": 302.58, + "step": 293360, + "train_speed(iter/s)": 0.123602 + }, + { + "acc": 0.73601313, + "epoch": 1.640719919132831, + "grad_norm": 6.375, + "learning_rate": 8.565940606053658e-07, + "loss": 1.06535654, + "memory(GiB)": 302.58, + "step": 293380, + "train_speed(iter/s)": 0.123607 + }, + { + "acc": 0.74009066, + "epoch": 1.6408317686058103, + "grad_norm": 5.96875, + "learning_rate": 8.560765547685329e-07, + "loss": 1.02621584, + "memory(GiB)": 302.58, + "step": 293400, + "train_speed(iter/s)": 0.12361 + }, + { + "acc": 0.75307717, + "epoch": 1.6409436180787895, + "grad_norm": 10.375, + "learning_rate": 8.555591906670218e-07, + "loss": 0.96705151, + "memory(GiB)": 302.58, + "step": 293420, + "train_speed(iter/s)": 0.123614 + }, + { + "acc": 0.73475342, + "epoch": 1.6410554675517688, + "grad_norm": 10.4375, + "learning_rate": 8.550419683185268e-07, + "loss": 1.06722908, + "memory(GiB)": 302.58, + "step": 293440, + "train_speed(iter/s)": 0.123618 + }, + { + "acc": 0.74498682, + "epoch": 1.641167317024748, + "grad_norm": 7.875, + "learning_rate": 8.545248877407408e-07, + "loss": 0.98903551, + "memory(GiB)": 302.58, + "step": 293460, + "train_speed(iter/s)": 0.123622 + }, + { + "acc": 0.74561167, + "epoch": 1.6412791664977273, + "grad_norm": 7.8125, + "learning_rate": 8.540079489513492e-07, + "loss": 0.99117718, + "memory(GiB)": 302.58, + "step": 293480, + "train_speed(iter/s)": 0.123626 + }, + { + "acc": 0.76300697, + "epoch": 1.6413910159707066, + "grad_norm": 6.84375, + "learning_rate": 8.534911519680317e-07, + "loss": 0.93355312, + "memory(GiB)": 302.58, + "step": 293500, + "train_speed(iter/s)": 0.12363 + }, + { + "acc": 0.76855516, + "epoch": 1.6415028654436858, + "grad_norm": 6.46875, + "learning_rate": 8.529744968084641e-07, + "loss": 0.90717077, + "memory(GiB)": 302.58, + "step": 293520, + "train_speed(iter/s)": 0.123634 + }, + { + "acc": 0.76303778, + "epoch": 1.641614714916665, + "grad_norm": 7.71875, + "learning_rate": 8.524579834903196e-07, + "loss": 0.93402252, + "memory(GiB)": 302.58, + "step": 293540, + "train_speed(iter/s)": 0.123637 + }, + { + "acc": 0.74959431, + "epoch": 1.6417265643896444, + "grad_norm": 8.375, + "learning_rate": 8.51941612031264e-07, + "loss": 0.98832216, + "memory(GiB)": 302.58, + "step": 293560, + "train_speed(iter/s)": 0.123642 + }, + { + "acc": 0.75031128, + "epoch": 1.6418384138626236, + "grad_norm": 8.9375, + "learning_rate": 8.514253824489582e-07, + "loss": 0.98807878, + "memory(GiB)": 302.58, + "step": 293580, + "train_speed(iter/s)": 0.123646 + }, + { + "acc": 0.74438243, + "epoch": 1.641950263335603, + "grad_norm": 5.40625, + "learning_rate": 8.509092947610587e-07, + "loss": 0.99355202, + "memory(GiB)": 302.58, + "step": 293600, + "train_speed(iter/s)": 0.123649 + }, + { + "acc": 0.75771518, + "epoch": 1.6420621128085822, + "grad_norm": 5.875, + "learning_rate": 8.503933489852184e-07, + "loss": 0.94358978, + "memory(GiB)": 302.58, + "step": 293620, + "train_speed(iter/s)": 0.123654 + }, + { + "acc": 0.73404422, + "epoch": 1.6421739622815614, + "grad_norm": 6.15625, + "learning_rate": 8.498775451390834e-07, + "loss": 1.05712633, + "memory(GiB)": 302.58, + "step": 293640, + "train_speed(iter/s)": 0.123657 + }, + { + "acc": 0.74609561, + "epoch": 1.6422858117545407, + "grad_norm": 7.8125, + "learning_rate": 8.493618832402956e-07, + "loss": 1.01018534, + "memory(GiB)": 302.58, + "step": 293660, + "train_speed(iter/s)": 0.123661 + }, + { + "acc": 0.76295867, + "epoch": 1.64239766122752, + "grad_norm": 6.3125, + "learning_rate": 8.48846363306493e-07, + "loss": 0.91996536, + "memory(GiB)": 302.58, + "step": 293680, + "train_speed(iter/s)": 0.123665 + }, + { + "acc": 0.75309873, + "epoch": 1.6425095107004992, + "grad_norm": 5.375, + "learning_rate": 8.483309853553079e-07, + "loss": 0.95596924, + "memory(GiB)": 302.58, + "step": 293700, + "train_speed(iter/s)": 0.123669 + }, + { + "acc": 0.7603168, + "epoch": 1.6426213601734785, + "grad_norm": 8.5625, + "learning_rate": 8.478157494043676e-07, + "loss": 0.9559186, + "memory(GiB)": 302.58, + "step": 293720, + "train_speed(iter/s)": 0.123673 + }, + { + "acc": 0.74583397, + "epoch": 1.6427332096464577, + "grad_norm": 5.09375, + "learning_rate": 8.473006554712948e-07, + "loss": 0.98616924, + "memory(GiB)": 302.58, + "step": 293740, + "train_speed(iter/s)": 0.123677 + }, + { + "acc": 0.73332934, + "epoch": 1.642845059119437, + "grad_norm": 8.1875, + "learning_rate": 8.467857035737071e-07, + "loss": 1.0843811, + "memory(GiB)": 302.58, + "step": 293760, + "train_speed(iter/s)": 0.123681 + }, + { + "acc": 0.73676395, + "epoch": 1.6429569085924163, + "grad_norm": 5.84375, + "learning_rate": 8.462708937292185e-07, + "loss": 1.04282923, + "memory(GiB)": 302.58, + "step": 293780, + "train_speed(iter/s)": 0.123685 + }, + { + "acc": 0.75311508, + "epoch": 1.6430687580653955, + "grad_norm": 7.0, + "learning_rate": 8.457562259554347e-07, + "loss": 0.97676744, + "memory(GiB)": 302.58, + "step": 293800, + "train_speed(iter/s)": 0.123689 + }, + { + "acc": 0.76473622, + "epoch": 1.6431806075383748, + "grad_norm": 8.5625, + "learning_rate": 8.45241700269962e-07, + "loss": 0.92618256, + "memory(GiB)": 302.58, + "step": 293820, + "train_speed(iter/s)": 0.123693 + }, + { + "acc": 0.751647, + "epoch": 1.643292457011354, + "grad_norm": 9.3125, + "learning_rate": 8.447273166903979e-07, + "loss": 0.98132, + "memory(GiB)": 302.58, + "step": 293840, + "train_speed(iter/s)": 0.123698 + }, + { + "acc": 0.74256859, + "epoch": 1.6434043064843333, + "grad_norm": 7.03125, + "learning_rate": 8.442130752343353e-07, + "loss": 1.00828314, + "memory(GiB)": 302.58, + "step": 293860, + "train_speed(iter/s)": 0.123702 + }, + { + "acc": 0.75746779, + "epoch": 1.6435161559573126, + "grad_norm": 8.9375, + "learning_rate": 8.436989759193632e-07, + "loss": 0.94464321, + "memory(GiB)": 302.58, + "step": 293880, + "train_speed(iter/s)": 0.123706 + }, + { + "acc": 0.75144386, + "epoch": 1.6436280054302919, + "grad_norm": 5.71875, + "learning_rate": 8.431850187630658e-07, + "loss": 0.98378353, + "memory(GiB)": 302.58, + "step": 293900, + "train_speed(iter/s)": 0.12371 + }, + { + "acc": 0.74145775, + "epoch": 1.6437398549032711, + "grad_norm": 7.46875, + "learning_rate": 8.426712037830209e-07, + "loss": 1.02757187, + "memory(GiB)": 302.58, + "step": 293920, + "train_speed(iter/s)": 0.123714 + }, + { + "acc": 0.75775738, + "epoch": 1.6438517043762504, + "grad_norm": 5.34375, + "learning_rate": 8.421575309968039e-07, + "loss": 0.95974836, + "memory(GiB)": 302.58, + "step": 293940, + "train_speed(iter/s)": 0.123718 + }, + { + "acc": 0.75622053, + "epoch": 1.6439635538492297, + "grad_norm": 11.25, + "learning_rate": 8.416440004219833e-07, + "loss": 0.95202112, + "memory(GiB)": 302.58, + "step": 293960, + "train_speed(iter/s)": 0.123721 + }, + { + "acc": 0.75287862, + "epoch": 1.644075403322209, + "grad_norm": 7.59375, + "learning_rate": 8.411306120761237e-07, + "loss": 0.9679944, + "memory(GiB)": 302.58, + "step": 293980, + "train_speed(iter/s)": 0.123725 + }, + { + "acc": 0.75385346, + "epoch": 1.6441872527951882, + "grad_norm": 4.34375, + "learning_rate": 8.406173659767847e-07, + "loss": 0.96561747, + "memory(GiB)": 302.58, + "step": 294000, + "train_speed(iter/s)": 0.123729 + }, + { + "epoch": 1.6441872527951882, + "eval_acc": 0.7069076825985007, + "eval_loss": 1.011817455291748, + "eval_runtime": 7547.0933, + "eval_samples_per_second": 9.975, + "eval_steps_per_second": 9.975, + "step": 294000 + }, + { + "acc": 0.74363904, + "epoch": 1.6442991022681674, + "grad_norm": 3.984375, + "learning_rate": 8.401042621415206e-07, + "loss": 0.98965063, + "memory(GiB)": 302.58, + "step": 294020, + "train_speed(iter/s)": 0.123334 + }, + { + "acc": 0.7533637, + "epoch": 1.6444109517411467, + "grad_norm": 7.90625, + "learning_rate": 8.395913005878819e-07, + "loss": 0.96572294, + "memory(GiB)": 302.58, + "step": 294040, + "train_speed(iter/s)": 0.123338 + }, + { + "acc": 0.74139543, + "epoch": 1.644522801214126, + "grad_norm": 8.0625, + "learning_rate": 8.390784813334113e-07, + "loss": 1.01035376, + "memory(GiB)": 302.58, + "step": 294060, + "train_speed(iter/s)": 0.123342 + }, + { + "acc": 0.7504468, + "epoch": 1.6446346506871055, + "grad_norm": 9.1875, + "learning_rate": 8.385658043956518e-07, + "loss": 0.96587276, + "memory(GiB)": 302.58, + "step": 294080, + "train_speed(iter/s)": 0.123346 + }, + { + "acc": 0.75283599, + "epoch": 1.6447465001600845, + "grad_norm": 8.5625, + "learning_rate": 8.380532697921379e-07, + "loss": 0.95577555, + "memory(GiB)": 302.58, + "step": 294100, + "train_speed(iter/s)": 0.12335 + }, + { + "acc": 0.74648857, + "epoch": 1.644858349633064, + "grad_norm": 8.3125, + "learning_rate": 8.375408775403987e-07, + "loss": 1.00962677, + "memory(GiB)": 302.58, + "step": 294120, + "train_speed(iter/s)": 0.123355 + }, + { + "acc": 0.74595838, + "epoch": 1.644970199106043, + "grad_norm": 6.78125, + "learning_rate": 8.370286276579603e-07, + "loss": 0.99569311, + "memory(GiB)": 302.58, + "step": 294140, + "train_speed(iter/s)": 0.123359 + }, + { + "acc": 0.73523359, + "epoch": 1.6450820485790225, + "grad_norm": 8.4375, + "learning_rate": 8.365165201623438e-07, + "loss": 1.04022379, + "memory(GiB)": 302.58, + "step": 294160, + "train_speed(iter/s)": 0.123362 + }, + { + "acc": 0.75371318, + "epoch": 1.6451938980520016, + "grad_norm": 12.75, + "learning_rate": 8.360045550710639e-07, + "loss": 0.98340425, + "memory(GiB)": 302.58, + "step": 294180, + "train_speed(iter/s)": 0.123366 + }, + { + "acc": 0.77063971, + "epoch": 1.645305747524981, + "grad_norm": 7.59375, + "learning_rate": 8.354927324016321e-07, + "loss": 0.89371309, + "memory(GiB)": 302.58, + "step": 294200, + "train_speed(iter/s)": 0.12337 + }, + { + "acc": 0.75440373, + "epoch": 1.64541759699796, + "grad_norm": 8.4375, + "learning_rate": 8.34981052171554e-07, + "loss": 0.95415459, + "memory(GiB)": 302.58, + "step": 294220, + "train_speed(iter/s)": 0.123374 + }, + { + "acc": 0.73008566, + "epoch": 1.6455294464709396, + "grad_norm": 9.875, + "learning_rate": 8.344695143983311e-07, + "loss": 1.06951027, + "memory(GiB)": 302.58, + "step": 294240, + "train_speed(iter/s)": 0.123378 + }, + { + "acc": 0.77053595, + "epoch": 1.6456412959439186, + "grad_norm": 10.6875, + "learning_rate": 8.339581190994589e-07, + "loss": 0.89667797, + "memory(GiB)": 302.58, + "step": 294260, + "train_speed(iter/s)": 0.123382 + }, + { + "acc": 0.75556741, + "epoch": 1.645753145416898, + "grad_norm": 6.15625, + "learning_rate": 8.334468662924294e-07, + "loss": 0.96764336, + "memory(GiB)": 302.58, + "step": 294280, + "train_speed(iter/s)": 0.123386 + }, + { + "acc": 0.74788203, + "epoch": 1.6458649948898771, + "grad_norm": 5.78125, + "learning_rate": 8.329357559947287e-07, + "loss": 0.99283257, + "memory(GiB)": 302.58, + "step": 294300, + "train_speed(iter/s)": 0.12339 + }, + { + "acc": 0.73643684, + "epoch": 1.6459768443628566, + "grad_norm": 6.625, + "learning_rate": 8.32424788223839e-07, + "loss": 1.02984266, + "memory(GiB)": 302.58, + "step": 294320, + "train_speed(iter/s)": 0.123394 + }, + { + "acc": 0.75499749, + "epoch": 1.6460886938358357, + "grad_norm": 6.71875, + "learning_rate": 8.31913962997235e-07, + "loss": 0.96623154, + "memory(GiB)": 302.58, + "step": 294340, + "train_speed(iter/s)": 0.123399 + }, + { + "acc": 0.73823771, + "epoch": 1.6462005433088152, + "grad_norm": 10.8125, + "learning_rate": 8.314032803323912e-07, + "loss": 1.04035015, + "memory(GiB)": 302.58, + "step": 294360, + "train_speed(iter/s)": 0.123403 + }, + { + "acc": 0.75679059, + "epoch": 1.6463123927817942, + "grad_norm": 7.4375, + "learning_rate": 8.308927402467742e-07, + "loss": 0.95160942, + "memory(GiB)": 302.58, + "step": 294380, + "train_speed(iter/s)": 0.123407 + }, + { + "acc": 0.73448009, + "epoch": 1.6464242422547737, + "grad_norm": 7.65625, + "learning_rate": 8.303823427578444e-07, + "loss": 1.06040821, + "memory(GiB)": 302.58, + "step": 294400, + "train_speed(iter/s)": 0.123411 + }, + { + "acc": 0.7524539, + "epoch": 1.6465360917277527, + "grad_norm": 8.0625, + "learning_rate": 8.298720878830607e-07, + "loss": 0.97341671, + "memory(GiB)": 302.58, + "step": 294420, + "train_speed(iter/s)": 0.123415 + }, + { + "acc": 0.74079599, + "epoch": 1.6466479412007322, + "grad_norm": 8.8125, + "learning_rate": 8.293619756398746e-07, + "loss": 1.02754469, + "memory(GiB)": 302.58, + "step": 294440, + "train_speed(iter/s)": 0.123419 + }, + { + "acc": 0.74790831, + "epoch": 1.6467597906737113, + "grad_norm": 7.21875, + "learning_rate": 8.288520060457333e-07, + "loss": 1.00618267, + "memory(GiB)": 302.58, + "step": 294460, + "train_speed(iter/s)": 0.123423 + }, + { + "acc": 0.75716562, + "epoch": 1.6468716401466907, + "grad_norm": 9.75, + "learning_rate": 8.283421791180801e-07, + "loss": 0.94567165, + "memory(GiB)": 302.58, + "step": 294480, + "train_speed(iter/s)": 0.123427 + }, + { + "acc": 0.75303411, + "epoch": 1.6469834896196698, + "grad_norm": 9.375, + "learning_rate": 8.278324948743522e-07, + "loss": 0.97307453, + "memory(GiB)": 302.58, + "step": 294500, + "train_speed(iter/s)": 0.123431 + }, + { + "acc": 0.76448374, + "epoch": 1.6470953390926493, + "grad_norm": 7.03125, + "learning_rate": 8.273229533319826e-07, + "loss": 0.91529551, + "memory(GiB)": 302.58, + "step": 294520, + "train_speed(iter/s)": 0.123435 + }, + { + "acc": 0.7628561, + "epoch": 1.6472071885656283, + "grad_norm": 9.0625, + "learning_rate": 8.268135545083988e-07, + "loss": 0.92984123, + "memory(GiB)": 302.58, + "step": 294540, + "train_speed(iter/s)": 0.123439 + }, + { + "acc": 0.77377381, + "epoch": 1.6473190380386078, + "grad_norm": 7.875, + "learning_rate": 8.263042984210246e-07, + "loss": 0.8895895, + "memory(GiB)": 302.58, + "step": 294560, + "train_speed(iter/s)": 0.123443 + }, + { + "acc": 0.74952779, + "epoch": 1.6474308875115868, + "grad_norm": 7.8125, + "learning_rate": 8.257951850872781e-07, + "loss": 0.97004023, + "memory(GiB)": 302.58, + "step": 294580, + "train_speed(iter/s)": 0.123447 + }, + { + "acc": 0.73537512, + "epoch": 1.6475427369845663, + "grad_norm": 5.6875, + "learning_rate": 8.252862145245721e-07, + "loss": 1.05186453, + "memory(GiB)": 302.58, + "step": 294600, + "train_speed(iter/s)": 0.12345 + }, + { + "acc": 0.76412692, + "epoch": 1.6476545864575454, + "grad_norm": 5.09375, + "learning_rate": 8.247773867503139e-07, + "loss": 0.92574005, + "memory(GiB)": 302.58, + "step": 294620, + "train_speed(iter/s)": 0.123455 + }, + { + "acc": 0.73780794, + "epoch": 1.6477664359305249, + "grad_norm": 6.5, + "learning_rate": 8.242687017819095e-07, + "loss": 1.0359724, + "memory(GiB)": 302.58, + "step": 294640, + "train_speed(iter/s)": 0.123459 + }, + { + "acc": 0.75264702, + "epoch": 1.647878285403504, + "grad_norm": 11.375, + "learning_rate": 8.237601596367567e-07, + "loss": 0.93821154, + "memory(GiB)": 302.58, + "step": 294660, + "train_speed(iter/s)": 0.123463 + }, + { + "acc": 0.74324107, + "epoch": 1.6479901348764834, + "grad_norm": 8.8125, + "learning_rate": 8.232517603322487e-07, + "loss": 1.00209932, + "memory(GiB)": 302.58, + "step": 294680, + "train_speed(iter/s)": 0.123466 + }, + { + "acc": 0.74519491, + "epoch": 1.6481019843494624, + "grad_norm": 8.8125, + "learning_rate": 8.227435038857734e-07, + "loss": 1.00392408, + "memory(GiB)": 302.58, + "step": 294700, + "train_speed(iter/s)": 0.12347 + }, + { + "acc": 0.74517736, + "epoch": 1.648213833822442, + "grad_norm": 6.71875, + "learning_rate": 8.222353903147168e-07, + "loss": 0.99801302, + "memory(GiB)": 302.58, + "step": 294720, + "train_speed(iter/s)": 0.123474 + }, + { + "acc": 0.75224628, + "epoch": 1.648325683295421, + "grad_norm": 5.90625, + "learning_rate": 8.217274196364577e-07, + "loss": 0.97147379, + "memory(GiB)": 302.58, + "step": 294740, + "train_speed(iter/s)": 0.123479 + }, + { + "acc": 0.75503373, + "epoch": 1.6484375327684004, + "grad_norm": 4.375, + "learning_rate": 8.212195918683696e-07, + "loss": 0.97737865, + "memory(GiB)": 302.58, + "step": 294760, + "train_speed(iter/s)": 0.123482 + }, + { + "acc": 0.745893, + "epoch": 1.6485493822413795, + "grad_norm": 9.125, + "learning_rate": 8.207119070278224e-07, + "loss": 0.99010439, + "memory(GiB)": 302.58, + "step": 294780, + "train_speed(iter/s)": 0.123486 + }, + { + "acc": 0.75729003, + "epoch": 1.648661231714359, + "grad_norm": 6.9375, + "learning_rate": 8.202043651321801e-07, + "loss": 0.94419594, + "memory(GiB)": 302.58, + "step": 294800, + "train_speed(iter/s)": 0.12349 + }, + { + "acc": 0.74142294, + "epoch": 1.648773081187338, + "grad_norm": 6.65625, + "learning_rate": 8.196969661988019e-07, + "loss": 1.03003588, + "memory(GiB)": 302.58, + "step": 294820, + "train_speed(iter/s)": 0.123495 + }, + { + "acc": 0.7408112, + "epoch": 1.6488849306603175, + "grad_norm": 9.5, + "learning_rate": 8.19189710245043e-07, + "loss": 1.01632557, + "memory(GiB)": 302.58, + "step": 294840, + "train_speed(iter/s)": 0.123498 + }, + { + "acc": 0.75467148, + "epoch": 1.6489967801332965, + "grad_norm": 7.34375, + "learning_rate": 8.186825972882528e-07, + "loss": 0.96183996, + "memory(GiB)": 302.58, + "step": 294860, + "train_speed(iter/s)": 0.123502 + }, + { + "acc": 0.75280185, + "epoch": 1.649108629606276, + "grad_norm": 9.3125, + "learning_rate": 8.181756273457764e-07, + "loss": 0.98831425, + "memory(GiB)": 302.58, + "step": 294880, + "train_speed(iter/s)": 0.123507 + }, + { + "acc": 0.75173564, + "epoch": 1.649220479079255, + "grad_norm": 7.6875, + "learning_rate": 8.176688004349542e-07, + "loss": 0.99498901, + "memory(GiB)": 302.58, + "step": 294900, + "train_speed(iter/s)": 0.12351 + }, + { + "acc": 0.74789143, + "epoch": 1.6493323285522346, + "grad_norm": 9.4375, + "learning_rate": 8.171621165731203e-07, + "loss": 0.98590231, + "memory(GiB)": 302.58, + "step": 294920, + "train_speed(iter/s)": 0.123514 + }, + { + "acc": 0.74943619, + "epoch": 1.6494441780252136, + "grad_norm": 6.65625, + "learning_rate": 8.166555757776046e-07, + "loss": 0.98882589, + "memory(GiB)": 302.58, + "step": 294940, + "train_speed(iter/s)": 0.123518 + }, + { + "acc": 0.75777483, + "epoch": 1.649556027498193, + "grad_norm": 5.53125, + "learning_rate": 8.161491780657343e-07, + "loss": 0.94648361, + "memory(GiB)": 302.58, + "step": 294960, + "train_speed(iter/s)": 0.123522 + }, + { + "acc": 0.76279626, + "epoch": 1.6496678769711721, + "grad_norm": 8.9375, + "learning_rate": 8.156429234548286e-07, + "loss": 0.92924833, + "memory(GiB)": 302.58, + "step": 294980, + "train_speed(iter/s)": 0.123526 + }, + { + "acc": 0.75808225, + "epoch": 1.6497797264441516, + "grad_norm": 7.875, + "learning_rate": 8.151368119622033e-07, + "loss": 0.94048424, + "memory(GiB)": 302.58, + "step": 295000, + "train_speed(iter/s)": 0.12353 + }, + { + "acc": 0.76557565, + "epoch": 1.6498915759171306, + "grad_norm": 10.625, + "learning_rate": 8.146308436051692e-07, + "loss": 0.91924067, + "memory(GiB)": 302.58, + "step": 295020, + "train_speed(iter/s)": 0.123534 + }, + { + "acc": 0.7546083, + "epoch": 1.6500034253901101, + "grad_norm": 5.84375, + "learning_rate": 8.141250184010313e-07, + "loss": 0.95211325, + "memory(GiB)": 302.58, + "step": 295040, + "train_speed(iter/s)": 0.123538 + }, + { + "acc": 0.74823508, + "epoch": 1.6501152748630892, + "grad_norm": 7.0, + "learning_rate": 8.136193363670908e-07, + "loss": 0.9899189, + "memory(GiB)": 302.58, + "step": 295060, + "train_speed(iter/s)": 0.123541 + }, + { + "acc": 0.76161194, + "epoch": 1.6502271243360687, + "grad_norm": 6.8125, + "learning_rate": 8.131137975206438e-07, + "loss": 0.95083275, + "memory(GiB)": 302.58, + "step": 295080, + "train_speed(iter/s)": 0.123545 + }, + { + "acc": 0.75374756, + "epoch": 1.6503389738090477, + "grad_norm": 8.0, + "learning_rate": 8.12608401878981e-07, + "loss": 0.9522131, + "memory(GiB)": 302.58, + "step": 295100, + "train_speed(iter/s)": 0.123548 + }, + { + "acc": 0.74957137, + "epoch": 1.6504508232820272, + "grad_norm": 6.625, + "learning_rate": 8.12103149459389e-07, + "loss": 0.97547808, + "memory(GiB)": 302.58, + "step": 295120, + "train_speed(iter/s)": 0.123552 + }, + { + "acc": 0.74642267, + "epoch": 1.6505626727550062, + "grad_norm": 5.28125, + "learning_rate": 8.115980402791485e-07, + "loss": 1.00964394, + "memory(GiB)": 302.58, + "step": 295140, + "train_speed(iter/s)": 0.123556 + }, + { + "acc": 0.74854846, + "epoch": 1.6506745222279857, + "grad_norm": 8.1875, + "learning_rate": 8.110930743555362e-07, + "loss": 0.97654085, + "memory(GiB)": 302.58, + "step": 295160, + "train_speed(iter/s)": 0.123561 + }, + { + "acc": 0.75130548, + "epoch": 1.6507863717009648, + "grad_norm": 7.15625, + "learning_rate": 8.105882517058233e-07, + "loss": 0.97498293, + "memory(GiB)": 302.58, + "step": 295180, + "train_speed(iter/s)": 0.123565 + }, + { + "acc": 0.74275708, + "epoch": 1.6508982211739442, + "grad_norm": 6.5, + "learning_rate": 8.100835723472766e-07, + "loss": 1.02663794, + "memory(GiB)": 302.58, + "step": 295200, + "train_speed(iter/s)": 0.123568 + }, + { + "acc": 0.74045138, + "epoch": 1.6510100706469233, + "grad_norm": 7.59375, + "learning_rate": 8.095790362971561e-07, + "loss": 1.04095392, + "memory(GiB)": 302.58, + "step": 295220, + "train_speed(iter/s)": 0.123572 + }, + { + "acc": 0.74541702, + "epoch": 1.6511219201199028, + "grad_norm": 5.65625, + "learning_rate": 8.090746435727215e-07, + "loss": 0.9873064, + "memory(GiB)": 302.58, + "step": 295240, + "train_speed(iter/s)": 0.123576 + }, + { + "acc": 0.75236344, + "epoch": 1.6512337695928818, + "grad_norm": 6.21875, + "learning_rate": 8.085703941912231e-07, + "loss": 0.96487389, + "memory(GiB)": 302.58, + "step": 295260, + "train_speed(iter/s)": 0.12358 + }, + { + "acc": 0.76431041, + "epoch": 1.6513456190658613, + "grad_norm": 7.5, + "learning_rate": 8.080662881699081e-07, + "loss": 0.93796549, + "memory(GiB)": 302.58, + "step": 295280, + "train_speed(iter/s)": 0.123584 + }, + { + "acc": 0.76267042, + "epoch": 1.6514574685388403, + "grad_norm": 6.875, + "learning_rate": 8.075623255260179e-07, + "loss": 0.94379807, + "memory(GiB)": 302.58, + "step": 295300, + "train_speed(iter/s)": 0.123587 + }, + { + "acc": 0.74775128, + "epoch": 1.6515693180118198, + "grad_norm": 7.84375, + "learning_rate": 8.070585062767905e-07, + "loss": 0.98782434, + "memory(GiB)": 302.58, + "step": 295320, + "train_speed(iter/s)": 0.123591 + }, + { + "acc": 0.7432127, + "epoch": 1.6516811674847989, + "grad_norm": 5.65625, + "learning_rate": 8.065548304394572e-07, + "loss": 1.00150003, + "memory(GiB)": 302.58, + "step": 295340, + "train_speed(iter/s)": 0.123595 + }, + { + "acc": 0.72661481, + "epoch": 1.6517930169577784, + "grad_norm": 6.125, + "learning_rate": 8.060512980312457e-07, + "loss": 1.07540655, + "memory(GiB)": 302.58, + "step": 295360, + "train_speed(iter/s)": 0.123599 + }, + { + "acc": 0.75741692, + "epoch": 1.6519048664307574, + "grad_norm": 7.59375, + "learning_rate": 8.055479090693791e-07, + "loss": 0.97743902, + "memory(GiB)": 302.58, + "step": 295380, + "train_speed(iter/s)": 0.123603 + }, + { + "acc": 0.74413095, + "epoch": 1.6520167159037369, + "grad_norm": 7.625, + "learning_rate": 8.050446635710735e-07, + "loss": 1.00562172, + "memory(GiB)": 302.58, + "step": 295400, + "train_speed(iter/s)": 0.123607 + }, + { + "acc": 0.74375396, + "epoch": 1.652128565376716, + "grad_norm": 8.5625, + "learning_rate": 8.045415615535429e-07, + "loss": 1.01373348, + "memory(GiB)": 302.58, + "step": 295420, + "train_speed(iter/s)": 0.123611 + }, + { + "acc": 0.74317217, + "epoch": 1.6522404148496954, + "grad_norm": 7.8125, + "learning_rate": 8.040386030339937e-07, + "loss": 0.99407606, + "memory(GiB)": 302.58, + "step": 295440, + "train_speed(iter/s)": 0.123615 + }, + { + "acc": 0.74053912, + "epoch": 1.6523522643226745, + "grad_norm": 5.9375, + "learning_rate": 8.035357880296297e-07, + "loss": 1.02707691, + "memory(GiB)": 302.58, + "step": 295460, + "train_speed(iter/s)": 0.123619 + }, + { + "acc": 0.74029803, + "epoch": 1.652464113795654, + "grad_norm": 5.75, + "learning_rate": 8.030331165576483e-07, + "loss": 1.02730579, + "memory(GiB)": 302.58, + "step": 295480, + "train_speed(iter/s)": 0.123622 + }, + { + "acc": 0.75198884, + "epoch": 1.652575963268633, + "grad_norm": 9.5, + "learning_rate": 8.025305886352414e-07, + "loss": 0.96459408, + "memory(GiB)": 302.58, + "step": 295500, + "train_speed(iter/s)": 0.123627 + }, + { + "acc": 0.75767522, + "epoch": 1.6526878127416125, + "grad_norm": 7.34375, + "learning_rate": 8.020282042795996e-07, + "loss": 0.94512005, + "memory(GiB)": 302.58, + "step": 295520, + "train_speed(iter/s)": 0.12363 + }, + { + "acc": 0.74146242, + "epoch": 1.6527996622145915, + "grad_norm": 7.34375, + "learning_rate": 8.015259635079043e-07, + "loss": 1.04705677, + "memory(GiB)": 302.58, + "step": 295540, + "train_speed(iter/s)": 0.123634 + }, + { + "acc": 0.73973212, + "epoch": 1.652911511687571, + "grad_norm": 8.25, + "learning_rate": 8.010238663373337e-07, + "loss": 1.02566185, + "memory(GiB)": 302.58, + "step": 295560, + "train_speed(iter/s)": 0.123638 + }, + { + "acc": 0.75011449, + "epoch": 1.65302336116055, + "grad_norm": 7.25, + "learning_rate": 8.005219127850622e-07, + "loss": 0.99839067, + "memory(GiB)": 302.58, + "step": 295580, + "train_speed(iter/s)": 0.123642 + }, + { + "acc": 0.74057827, + "epoch": 1.6531352106335295, + "grad_norm": 7.1875, + "learning_rate": 8.000201028682569e-07, + "loss": 1.01076536, + "memory(GiB)": 302.58, + "step": 295600, + "train_speed(iter/s)": 0.123646 + }, + { + "acc": 0.75407429, + "epoch": 1.6532470601065086, + "grad_norm": 8.375, + "learning_rate": 7.99518436604082e-07, + "loss": 0.9759469, + "memory(GiB)": 302.58, + "step": 295620, + "train_speed(iter/s)": 0.12365 + }, + { + "acc": 0.75459938, + "epoch": 1.653358909579488, + "grad_norm": 7.0, + "learning_rate": 7.990169140096959e-07, + "loss": 0.95380955, + "memory(GiB)": 302.58, + "step": 295640, + "train_speed(iter/s)": 0.123654 + }, + { + "acc": 0.77295041, + "epoch": 1.653470759052467, + "grad_norm": 10.0, + "learning_rate": 7.985155351022527e-07, + "loss": 0.87841616, + "memory(GiB)": 302.58, + "step": 295660, + "train_speed(iter/s)": 0.123658 + }, + { + "acc": 0.76287966, + "epoch": 1.6535826085254466, + "grad_norm": 7.09375, + "learning_rate": 7.980142998989005e-07, + "loss": 0.93557262, + "memory(GiB)": 302.58, + "step": 295680, + "train_speed(iter/s)": 0.123662 + }, + { + "acc": 0.74353862, + "epoch": 1.6536944579984256, + "grad_norm": 7.84375, + "learning_rate": 7.975132084167836e-07, + "loss": 1.01361303, + "memory(GiB)": 302.58, + "step": 295700, + "train_speed(iter/s)": 0.123666 + }, + { + "acc": 0.73406987, + "epoch": 1.6538063074714051, + "grad_norm": 8.625, + "learning_rate": 7.970122606730402e-07, + "loss": 1.03507566, + "memory(GiB)": 302.58, + "step": 295720, + "train_speed(iter/s)": 0.12367 + }, + { + "acc": 0.75172462, + "epoch": 1.6539181569443842, + "grad_norm": 9.5, + "learning_rate": 7.965114566848048e-07, + "loss": 0.97492714, + "memory(GiB)": 302.58, + "step": 295740, + "train_speed(iter/s)": 0.123674 + }, + { + "acc": 0.75419416, + "epoch": 1.6540300064173636, + "grad_norm": 8.0625, + "learning_rate": 7.960107964692054e-07, + "loss": 0.97349529, + "memory(GiB)": 302.58, + "step": 295760, + "train_speed(iter/s)": 0.123677 + }, + { + "acc": 0.75760074, + "epoch": 1.6541418558903427, + "grad_norm": 8.375, + "learning_rate": 7.955102800433689e-07, + "loss": 0.96574125, + "memory(GiB)": 302.58, + "step": 295780, + "train_speed(iter/s)": 0.123681 + }, + { + "acc": 0.76462665, + "epoch": 1.6542537053633222, + "grad_norm": 7.125, + "learning_rate": 7.950099074244128e-07, + "loss": 0.89909945, + "memory(GiB)": 302.58, + "step": 295800, + "train_speed(iter/s)": 0.123685 + }, + { + "acc": 0.75249963, + "epoch": 1.6543655548363012, + "grad_norm": 9.875, + "learning_rate": 7.94509678629451e-07, + "loss": 0.98393507, + "memory(GiB)": 302.58, + "step": 295820, + "train_speed(iter/s)": 0.123689 + }, + { + "acc": 0.74618211, + "epoch": 1.6544774043092807, + "grad_norm": 6.625, + "learning_rate": 7.940095936755937e-07, + "loss": 0.9993906, + "memory(GiB)": 302.58, + "step": 295840, + "train_speed(iter/s)": 0.123693 + }, + { + "acc": 0.7460989, + "epoch": 1.6545892537822597, + "grad_norm": 7.71875, + "learning_rate": 7.935096525799452e-07, + "loss": 1.01889734, + "memory(GiB)": 302.58, + "step": 295860, + "train_speed(iter/s)": 0.123697 + }, + { + "acc": 0.75376558, + "epoch": 1.6547011032552392, + "grad_norm": 7.34375, + "learning_rate": 7.930098553596049e-07, + "loss": 0.95706253, + "memory(GiB)": 302.58, + "step": 295880, + "train_speed(iter/s)": 0.123701 + }, + { + "acc": 0.77088223, + "epoch": 1.6548129527282183, + "grad_norm": 6.15625, + "learning_rate": 7.925102020316677e-07, + "loss": 0.89422483, + "memory(GiB)": 302.58, + "step": 295900, + "train_speed(iter/s)": 0.123705 + }, + { + "acc": 0.76019278, + "epoch": 1.6549248022011978, + "grad_norm": 6.375, + "learning_rate": 7.920106926132232e-07, + "loss": 0.91519766, + "memory(GiB)": 302.58, + "step": 295920, + "train_speed(iter/s)": 0.123709 + }, + { + "acc": 0.75319085, + "epoch": 1.6550366516741768, + "grad_norm": 6.59375, + "learning_rate": 7.91511327121356e-07, + "loss": 0.96423206, + "memory(GiB)": 302.58, + "step": 295940, + "train_speed(iter/s)": 0.123713 + }, + { + "acc": 0.75600591, + "epoch": 1.6551485011471563, + "grad_norm": 9.5625, + "learning_rate": 7.910121055731452e-07, + "loss": 0.94494581, + "memory(GiB)": 302.58, + "step": 295960, + "train_speed(iter/s)": 0.123717 + }, + { + "acc": 0.75303092, + "epoch": 1.6552603506201353, + "grad_norm": 6.625, + "learning_rate": 7.905130279856676e-07, + "loss": 0.96849146, + "memory(GiB)": 302.58, + "step": 295980, + "train_speed(iter/s)": 0.123721 + }, + { + "acc": 0.75076461, + "epoch": 1.6553722000931148, + "grad_norm": 5.75, + "learning_rate": 7.900140943759927e-07, + "loss": 0.98133078, + "memory(GiB)": 302.58, + "step": 296000, + "train_speed(iter/s)": 0.123725 + }, + { + "epoch": 1.6553722000931148, + "eval_acc": 0.7069154714907319, + "eval_loss": 1.0118590593338013, + "eval_runtime": 7563.7647, + "eval_samples_per_second": 9.953, + "eval_steps_per_second": 9.953, + "step": 296000 + }, + { + "acc": 0.74852886, + "epoch": 1.6554840495660939, + "grad_norm": 8.875, + "learning_rate": 7.895153047611847e-07, + "loss": 0.98653479, + "memory(GiB)": 302.58, + "step": 296020, + "train_speed(iter/s)": 0.123332 + }, + { + "acc": 0.74221301, + "epoch": 1.6555958990390733, + "grad_norm": 5.65625, + "learning_rate": 7.890166591583048e-07, + "loss": 1.01164742, + "memory(GiB)": 302.58, + "step": 296040, + "train_speed(iter/s)": 0.123335 + }, + { + "acc": 0.73257842, + "epoch": 1.6557077485120524, + "grad_norm": 7.65625, + "learning_rate": 7.885181575844081e-07, + "loss": 1.06009035, + "memory(GiB)": 302.58, + "step": 296060, + "train_speed(iter/s)": 0.123339 + }, + { + "acc": 0.74820065, + "epoch": 1.6558195979850319, + "grad_norm": 7.46875, + "learning_rate": 7.880198000565443e-07, + "loss": 0.97890177, + "memory(GiB)": 302.58, + "step": 296080, + "train_speed(iter/s)": 0.123344 + }, + { + "acc": 0.738767, + "epoch": 1.655931447458011, + "grad_norm": 7.625, + "learning_rate": 7.875215865917579e-07, + "loss": 1.04232063, + "memory(GiB)": 302.58, + "step": 296100, + "train_speed(iter/s)": 0.123347 + }, + { + "acc": 0.74339767, + "epoch": 1.6560432969309904, + "grad_norm": 10.875, + "learning_rate": 7.870235172070923e-07, + "loss": 1.014715, + "memory(GiB)": 302.58, + "step": 296120, + "train_speed(iter/s)": 0.123351 + }, + { + "acc": 0.7447928, + "epoch": 1.6561551464039694, + "grad_norm": 6.71875, + "learning_rate": 7.865255919195808e-07, + "loss": 1.02132711, + "memory(GiB)": 302.58, + "step": 296140, + "train_speed(iter/s)": 0.123355 + }, + { + "acc": 0.75378075, + "epoch": 1.656266995876949, + "grad_norm": 7.34375, + "learning_rate": 7.860278107462549e-07, + "loss": 0.95222912, + "memory(GiB)": 302.58, + "step": 296160, + "train_speed(iter/s)": 0.123359 + }, + { + "acc": 0.76040368, + "epoch": 1.656378845349928, + "grad_norm": 9.5, + "learning_rate": 7.855301737041399e-07, + "loss": 0.93042908, + "memory(GiB)": 302.58, + "step": 296180, + "train_speed(iter/s)": 0.123363 + }, + { + "acc": 0.75592213, + "epoch": 1.6564906948229075, + "grad_norm": 5.75, + "learning_rate": 7.850326808102571e-07, + "loss": 0.93812599, + "memory(GiB)": 302.58, + "step": 296200, + "train_speed(iter/s)": 0.123367 + }, + { + "acc": 0.76214476, + "epoch": 1.6566025442958865, + "grad_norm": 6.53125, + "learning_rate": 7.845353320816213e-07, + "loss": 0.92951097, + "memory(GiB)": 302.58, + "step": 296220, + "train_speed(iter/s)": 0.123371 + }, + { + "acc": 0.75777836, + "epoch": 1.656714393768866, + "grad_norm": 8.125, + "learning_rate": 7.840381275352443e-07, + "loss": 0.95443306, + "memory(GiB)": 302.58, + "step": 296240, + "train_speed(iter/s)": 0.123375 + }, + { + "acc": 0.7605268, + "epoch": 1.656826243241845, + "grad_norm": 4.1875, + "learning_rate": 7.835410671881322e-07, + "loss": 0.92687397, + "memory(GiB)": 302.58, + "step": 296260, + "train_speed(iter/s)": 0.123379 + }, + { + "acc": 0.75857329, + "epoch": 1.6569380927148245, + "grad_norm": 9.5, + "learning_rate": 7.830441510572851e-07, + "loss": 0.96296473, + "memory(GiB)": 302.58, + "step": 296280, + "train_speed(iter/s)": 0.123383 + }, + { + "acc": 0.75060868, + "epoch": 1.6570499421878035, + "grad_norm": 6.5625, + "learning_rate": 7.825473791597e-07, + "loss": 0.96827803, + "memory(GiB)": 302.58, + "step": 296300, + "train_speed(iter/s)": 0.123387 + }, + { + "acc": 0.76083999, + "epoch": 1.657161791660783, + "grad_norm": 8.5, + "learning_rate": 7.820507515123671e-07, + "loss": 0.96712809, + "memory(GiB)": 302.58, + "step": 296320, + "train_speed(iter/s)": 0.12339 + }, + { + "acc": 0.74434037, + "epoch": 1.657273641133762, + "grad_norm": 7.875, + "learning_rate": 7.815542681322735e-07, + "loss": 1.00064287, + "memory(GiB)": 302.58, + "step": 296340, + "train_speed(iter/s)": 0.123394 + }, + { + "acc": 0.75455298, + "epoch": 1.6573854906067416, + "grad_norm": 5.6875, + "learning_rate": 7.810579290363995e-07, + "loss": 0.96906157, + "memory(GiB)": 302.58, + "step": 296360, + "train_speed(iter/s)": 0.123398 + }, + { + "acc": 0.7392477, + "epoch": 1.6574973400797206, + "grad_norm": 8.375, + "learning_rate": 7.805617342417232e-07, + "loss": 1.01632671, + "memory(GiB)": 302.58, + "step": 296380, + "train_speed(iter/s)": 0.123402 + }, + { + "acc": 0.75671897, + "epoch": 1.6576091895527, + "grad_norm": 6.125, + "learning_rate": 7.800656837652148e-07, + "loss": 0.97313499, + "memory(GiB)": 302.58, + "step": 296400, + "train_speed(iter/s)": 0.123406 + }, + { + "acc": 0.75480127, + "epoch": 1.6577210390256791, + "grad_norm": 7.9375, + "learning_rate": 7.795697776238414e-07, + "loss": 0.96411467, + "memory(GiB)": 302.58, + "step": 296420, + "train_speed(iter/s)": 0.12341 + }, + { + "acc": 0.75831757, + "epoch": 1.6578328884986586, + "grad_norm": 8.0625, + "learning_rate": 7.790740158345638e-07, + "loss": 0.93745117, + "memory(GiB)": 302.58, + "step": 296440, + "train_speed(iter/s)": 0.123414 + }, + { + "acc": 0.75719199, + "epoch": 1.6579447379716377, + "grad_norm": 7.34375, + "learning_rate": 7.785783984143397e-07, + "loss": 0.93926382, + "memory(GiB)": 302.58, + "step": 296460, + "train_speed(iter/s)": 0.123418 + }, + { + "acc": 0.76109295, + "epoch": 1.6580565874446171, + "grad_norm": 8.75, + "learning_rate": 7.780829253801197e-07, + "loss": 0.91386976, + "memory(GiB)": 302.58, + "step": 296480, + "train_speed(iter/s)": 0.123422 + }, + { + "acc": 0.73979101, + "epoch": 1.6581684369175962, + "grad_norm": 6.3125, + "learning_rate": 7.775875967488511e-07, + "loss": 1.03367043, + "memory(GiB)": 302.58, + "step": 296500, + "train_speed(iter/s)": 0.123426 + }, + { + "acc": 0.75348301, + "epoch": 1.6582802863905757, + "grad_norm": 11.8125, + "learning_rate": 7.770924125374757e-07, + "loss": 0.9743329, + "memory(GiB)": 302.58, + "step": 296520, + "train_speed(iter/s)": 0.12343 + }, + { + "acc": 0.75701442, + "epoch": 1.6583921358635547, + "grad_norm": 6.78125, + "learning_rate": 7.7659737276293e-07, + "loss": 0.94804039, + "memory(GiB)": 302.58, + "step": 296540, + "train_speed(iter/s)": 0.123434 + }, + { + "acc": 0.75534296, + "epoch": 1.6585039853365342, + "grad_norm": 9.125, + "learning_rate": 7.761024774421461e-07, + "loss": 0.92919521, + "memory(GiB)": 302.58, + "step": 296560, + "train_speed(iter/s)": 0.123438 + }, + { + "acc": 0.74883265, + "epoch": 1.6586158348095132, + "grad_norm": 5.28125, + "learning_rate": 7.756077265920514e-07, + "loss": 0.97884426, + "memory(GiB)": 302.58, + "step": 296580, + "train_speed(iter/s)": 0.123441 + }, + { + "acc": 0.74257755, + "epoch": 1.6587276842824927, + "grad_norm": 6.21875, + "learning_rate": 7.751131202295676e-07, + "loss": 1.02691898, + "memory(GiB)": 302.58, + "step": 296600, + "train_speed(iter/s)": 0.123445 + }, + { + "acc": 0.75052352, + "epoch": 1.6588395337554718, + "grad_norm": 4.875, + "learning_rate": 7.74618658371612e-07, + "loss": 0.97464409, + "memory(GiB)": 302.58, + "step": 296620, + "train_speed(iter/s)": 0.123449 + }, + { + "acc": 0.76675801, + "epoch": 1.6589513832284513, + "grad_norm": 7.375, + "learning_rate": 7.741243410350951e-07, + "loss": 0.92723427, + "memory(GiB)": 302.58, + "step": 296640, + "train_speed(iter/s)": 0.123453 + }, + { + "acc": 0.74018145, + "epoch": 1.6590632327014303, + "grad_norm": 8.1875, + "learning_rate": 7.736301682369268e-07, + "loss": 1.02269812, + "memory(GiB)": 302.58, + "step": 296660, + "train_speed(iter/s)": 0.123457 + }, + { + "acc": 0.74620943, + "epoch": 1.6591750821744098, + "grad_norm": 9.5, + "learning_rate": 7.731361399940085e-07, + "loss": 1.01064882, + "memory(GiB)": 302.58, + "step": 296680, + "train_speed(iter/s)": 0.123461 + }, + { + "acc": 0.75428772, + "epoch": 1.6592869316473888, + "grad_norm": 7.125, + "learning_rate": 7.726422563232372e-07, + "loss": 0.99707346, + "memory(GiB)": 302.58, + "step": 296700, + "train_speed(iter/s)": 0.123465 + }, + { + "acc": 0.74631839, + "epoch": 1.6593987811203683, + "grad_norm": 8.5, + "learning_rate": 7.72148517241505e-07, + "loss": 1.00732374, + "memory(GiB)": 302.58, + "step": 296720, + "train_speed(iter/s)": 0.123469 + }, + { + "acc": 0.76652465, + "epoch": 1.6595106305933474, + "grad_norm": 8.5625, + "learning_rate": 7.716549227657e-07, + "loss": 0.90467815, + "memory(GiB)": 302.58, + "step": 296740, + "train_speed(iter/s)": 0.123473 + }, + { + "acc": 0.75495009, + "epoch": 1.6596224800663268, + "grad_norm": 6.71875, + "learning_rate": 7.711614729127043e-07, + "loss": 0.96458082, + "memory(GiB)": 302.58, + "step": 296760, + "train_speed(iter/s)": 0.123477 + }, + { + "acc": 0.75453715, + "epoch": 1.6597343295393059, + "grad_norm": 5.40625, + "learning_rate": 7.706681676993949e-07, + "loss": 0.98090754, + "memory(GiB)": 302.58, + "step": 296780, + "train_speed(iter/s)": 0.12348 + }, + { + "acc": 0.75185838, + "epoch": 1.6598461790122854, + "grad_norm": 6.75, + "learning_rate": 7.701750071426456e-07, + "loss": 0.97690725, + "memory(GiB)": 302.58, + "step": 296800, + "train_speed(iter/s)": 0.123484 + }, + { + "acc": 0.7484724, + "epoch": 1.6599580284852644, + "grad_norm": 5.28125, + "learning_rate": 7.696819912593234e-07, + "loss": 0.98382263, + "memory(GiB)": 302.58, + "step": 296820, + "train_speed(iter/s)": 0.123488 + }, + { + "acc": 0.74796615, + "epoch": 1.660069877958244, + "grad_norm": 8.4375, + "learning_rate": 7.69189120066291e-07, + "loss": 0.98227568, + "memory(GiB)": 302.58, + "step": 296840, + "train_speed(iter/s)": 0.123492 + }, + { + "acc": 0.7389605, + "epoch": 1.660181727431223, + "grad_norm": 6.5625, + "learning_rate": 7.686963935804064e-07, + "loss": 1.02954721, + "memory(GiB)": 302.58, + "step": 296860, + "train_speed(iter/s)": 0.123496 + }, + { + "acc": 0.75399981, + "epoch": 1.6602935769042024, + "grad_norm": 7.84375, + "learning_rate": 7.682038118185225e-07, + "loss": 0.94051294, + "memory(GiB)": 302.58, + "step": 296880, + "train_speed(iter/s)": 0.1235 + }, + { + "acc": 0.76039534, + "epoch": 1.6604054263771815, + "grad_norm": 7.34375, + "learning_rate": 7.677113747974862e-07, + "loss": 0.93220005, + "memory(GiB)": 302.58, + "step": 296900, + "train_speed(iter/s)": 0.123504 + }, + { + "acc": 0.76229925, + "epoch": 1.660517275850161, + "grad_norm": 5.96875, + "learning_rate": 7.672190825341403e-07, + "loss": 0.91864195, + "memory(GiB)": 302.58, + "step": 296920, + "train_speed(iter/s)": 0.123508 + }, + { + "acc": 0.74665866, + "epoch": 1.66062912532314, + "grad_norm": 7.96875, + "learning_rate": 7.667269350453249e-07, + "loss": 0.99012289, + "memory(GiB)": 302.58, + "step": 296940, + "train_speed(iter/s)": 0.123511 + }, + { + "acc": 0.74334469, + "epoch": 1.6607409747961195, + "grad_norm": 6.34375, + "learning_rate": 7.662349323478713e-07, + "loss": 1.01540079, + "memory(GiB)": 302.58, + "step": 296960, + "train_speed(iter/s)": 0.123515 + }, + { + "acc": 0.75059404, + "epoch": 1.6608528242690985, + "grad_norm": 6.0, + "learning_rate": 7.657430744586081e-07, + "loss": 0.98745213, + "memory(GiB)": 302.58, + "step": 296980, + "train_speed(iter/s)": 0.123519 + }, + { + "acc": 0.74522667, + "epoch": 1.660964673742078, + "grad_norm": 9.0, + "learning_rate": 7.652513613943585e-07, + "loss": 0.99660139, + "memory(GiB)": 302.58, + "step": 297000, + "train_speed(iter/s)": 0.123523 + }, + { + "acc": 0.7509964, + "epoch": 1.661076523215057, + "grad_norm": 7.28125, + "learning_rate": 7.647597931719398e-07, + "loss": 0.98167305, + "memory(GiB)": 302.58, + "step": 297020, + "train_speed(iter/s)": 0.123527 + }, + { + "acc": 0.75252752, + "epoch": 1.6611883726880365, + "grad_norm": 7.15625, + "learning_rate": 7.64268369808166e-07, + "loss": 0.97757864, + "memory(GiB)": 302.58, + "step": 297040, + "train_speed(iter/s)": 0.123531 + }, + { + "acc": 0.77124257, + "epoch": 1.6613002221610156, + "grad_norm": 9.0, + "learning_rate": 7.63777091319845e-07, + "loss": 0.87053518, + "memory(GiB)": 302.58, + "step": 297060, + "train_speed(iter/s)": 0.123535 + }, + { + "acc": 0.75652819, + "epoch": 1.661412071633995, + "grad_norm": 8.25, + "learning_rate": 7.632859577237806e-07, + "loss": 0.96414099, + "memory(GiB)": 302.58, + "step": 297080, + "train_speed(iter/s)": 0.123539 + }, + { + "acc": 0.73569584, + "epoch": 1.661523921106974, + "grad_norm": 10.25, + "learning_rate": 7.627949690367703e-07, + "loss": 1.05312347, + "memory(GiB)": 302.58, + "step": 297100, + "train_speed(iter/s)": 0.123543 + }, + { + "acc": 0.75144029, + "epoch": 1.6616357705799536, + "grad_norm": 8.6875, + "learning_rate": 7.62304125275608e-07, + "loss": 0.97419863, + "memory(GiB)": 302.58, + "step": 297120, + "train_speed(iter/s)": 0.123547 + }, + { + "acc": 0.75288053, + "epoch": 1.6617476200529326, + "grad_norm": 7.0, + "learning_rate": 7.61813426457082e-07, + "loss": 0.96312666, + "memory(GiB)": 302.58, + "step": 297140, + "train_speed(iter/s)": 0.123551 + }, + { + "acc": 0.75545917, + "epoch": 1.6618594695259121, + "grad_norm": 9.0, + "learning_rate": 7.613228725979755e-07, + "loss": 0.9498229, + "memory(GiB)": 302.58, + "step": 297160, + "train_speed(iter/s)": 0.123555 + }, + { + "acc": 0.74315972, + "epoch": 1.6619713189988912, + "grad_norm": 7.15625, + "learning_rate": 7.608324637150666e-07, + "loss": 1.01743221, + "memory(GiB)": 302.58, + "step": 297180, + "train_speed(iter/s)": 0.123559 + }, + { + "acc": 0.74697809, + "epoch": 1.6620831684718707, + "grad_norm": 6.53125, + "learning_rate": 7.603421998251304e-07, + "loss": 0.98387442, + "memory(GiB)": 302.58, + "step": 297200, + "train_speed(iter/s)": 0.123563 + }, + { + "acc": 0.75433507, + "epoch": 1.6621950179448497, + "grad_norm": 4.96875, + "learning_rate": 7.598520809449344e-07, + "loss": 0.96257372, + "memory(GiB)": 302.58, + "step": 297220, + "train_speed(iter/s)": 0.123566 + }, + { + "acc": 0.76128583, + "epoch": 1.6623068674178292, + "grad_norm": 8.75, + "learning_rate": 7.593621070912411e-07, + "loss": 0.94067526, + "memory(GiB)": 302.58, + "step": 297240, + "train_speed(iter/s)": 0.12357 + }, + { + "acc": 0.75766835, + "epoch": 1.6624187168908082, + "grad_norm": 6.46875, + "learning_rate": 7.588722782808117e-07, + "loss": 0.9537262, + "memory(GiB)": 302.58, + "step": 297260, + "train_speed(iter/s)": 0.123574 + }, + { + "acc": 0.7434093, + "epoch": 1.6625305663637877, + "grad_norm": 7.84375, + "learning_rate": 7.583825945303991e-07, + "loss": 0.98261757, + "memory(GiB)": 302.58, + "step": 297280, + "train_speed(iter/s)": 0.123578 + }, + { + "acc": 0.75012107, + "epoch": 1.6626424158367668, + "grad_norm": 9.75, + "learning_rate": 7.578930558567509e-07, + "loss": 0.98376493, + "memory(GiB)": 302.58, + "step": 297300, + "train_speed(iter/s)": 0.123582 + }, + { + "acc": 0.76190414, + "epoch": 1.6627542653097462, + "grad_norm": 7.4375, + "learning_rate": 7.574036622766117e-07, + "loss": 0.93690872, + "memory(GiB)": 302.58, + "step": 297320, + "train_speed(iter/s)": 0.123586 + }, + { + "acc": 0.74278212, + "epoch": 1.6628661147827253, + "grad_norm": 7.1875, + "learning_rate": 7.569144138067203e-07, + "loss": 0.9830987, + "memory(GiB)": 302.58, + "step": 297340, + "train_speed(iter/s)": 0.123589 + }, + { + "acc": 0.75175238, + "epoch": 1.6629779642557048, + "grad_norm": 5.4375, + "learning_rate": 7.564253104638109e-07, + "loss": 0.96867466, + "memory(GiB)": 302.58, + "step": 297360, + "train_speed(iter/s)": 0.123593 + }, + { + "acc": 0.76884437, + "epoch": 1.6630898137286838, + "grad_norm": 5.625, + "learning_rate": 7.559363522646113e-07, + "loss": 0.90138988, + "memory(GiB)": 302.58, + "step": 297380, + "train_speed(iter/s)": 0.123597 + }, + { + "acc": 0.74112735, + "epoch": 1.6632016632016633, + "grad_norm": 7.09375, + "learning_rate": 7.55447539225846e-07, + "loss": 1.01555939, + "memory(GiB)": 302.58, + "step": 297400, + "train_speed(iter/s)": 0.123601 + }, + { + "acc": 0.7423398, + "epoch": 1.6633135126746423, + "grad_norm": 10.75, + "learning_rate": 7.549588713642342e-07, + "loss": 1.02321739, + "memory(GiB)": 302.58, + "step": 297420, + "train_speed(iter/s)": 0.123605 + }, + { + "acc": 0.76721745, + "epoch": 1.6634253621476218, + "grad_norm": 8.0625, + "learning_rate": 7.544703486964894e-07, + "loss": 0.90525331, + "memory(GiB)": 302.58, + "step": 297440, + "train_speed(iter/s)": 0.123609 + }, + { + "acc": 0.7487772, + "epoch": 1.6635372116206009, + "grad_norm": 8.0625, + "learning_rate": 7.539819712393209e-07, + "loss": 0.98756371, + "memory(GiB)": 302.58, + "step": 297460, + "train_speed(iter/s)": 0.123612 + }, + { + "acc": 0.74544506, + "epoch": 1.6636490610935804, + "grad_norm": 10.5, + "learning_rate": 7.534937390094327e-07, + "loss": 0.9824131, + "memory(GiB)": 302.58, + "step": 297480, + "train_speed(iter/s)": 0.123616 + }, + { + "acc": 0.76470971, + "epoch": 1.6637609105665594, + "grad_norm": 6.0, + "learning_rate": 7.53005652023524e-07, + "loss": 0.9012352, + "memory(GiB)": 302.58, + "step": 297500, + "train_speed(iter/s)": 0.123621 + }, + { + "acc": 0.7598134, + "epoch": 1.6638727600395389, + "grad_norm": 7.0625, + "learning_rate": 7.525177102982883e-07, + "loss": 0.93645496, + "memory(GiB)": 302.58, + "step": 297520, + "train_speed(iter/s)": 0.123625 + }, + { + "acc": 0.74005876, + "epoch": 1.663984609512518, + "grad_norm": 6.0, + "learning_rate": 7.520299138504156e-07, + "loss": 1.00768337, + "memory(GiB)": 302.58, + "step": 297540, + "train_speed(iter/s)": 0.123629 + }, + { + "acc": 0.74645076, + "epoch": 1.6640964589854974, + "grad_norm": 6.75, + "learning_rate": 7.515422626965902e-07, + "loss": 0.99700594, + "memory(GiB)": 302.58, + "step": 297560, + "train_speed(iter/s)": 0.123633 + }, + { + "acc": 0.74197621, + "epoch": 1.6642083084584764, + "grad_norm": 5.21875, + "learning_rate": 7.510547568534909e-07, + "loss": 1.02204866, + "memory(GiB)": 302.58, + "step": 297580, + "train_speed(iter/s)": 0.123636 + }, + { + "acc": 0.75831943, + "epoch": 1.664320157931456, + "grad_norm": 5.9375, + "learning_rate": 7.505673963377918e-07, + "loss": 0.93652124, + "memory(GiB)": 302.58, + "step": 297600, + "train_speed(iter/s)": 0.12364 + }, + { + "acc": 0.7384028, + "epoch": 1.664432007404435, + "grad_norm": 9.5625, + "learning_rate": 7.500801811661629e-07, + "loss": 1.02669792, + "memory(GiB)": 302.58, + "step": 297620, + "train_speed(iter/s)": 0.123644 + }, + { + "acc": 0.75062065, + "epoch": 1.6645438568774145, + "grad_norm": 5.59375, + "learning_rate": 7.495931113552673e-07, + "loss": 0.97670345, + "memory(GiB)": 302.58, + "step": 297640, + "train_speed(iter/s)": 0.123648 + }, + { + "acc": 0.76027098, + "epoch": 1.6646557063503935, + "grad_norm": 7.34375, + "learning_rate": 7.491061869217653e-07, + "loss": 0.93435011, + "memory(GiB)": 302.58, + "step": 297660, + "train_speed(iter/s)": 0.123652 + }, + { + "acc": 0.75754814, + "epoch": 1.664767555823373, + "grad_norm": 6.4375, + "learning_rate": 7.486194078823106e-07, + "loss": 0.95614939, + "memory(GiB)": 302.58, + "step": 297680, + "train_speed(iter/s)": 0.123656 + }, + { + "acc": 0.76659932, + "epoch": 1.664879405296352, + "grad_norm": 5.625, + "learning_rate": 7.481327742535532e-07, + "loss": 0.89987383, + "memory(GiB)": 302.58, + "step": 297700, + "train_speed(iter/s)": 0.12366 + }, + { + "acc": 0.7548388, + "epoch": 1.6649912547693315, + "grad_norm": 11.875, + "learning_rate": 7.476462860521371e-07, + "loss": 0.95018005, + "memory(GiB)": 302.58, + "step": 297720, + "train_speed(iter/s)": 0.123664 + }, + { + "acc": 0.749756, + "epoch": 1.6651031042423106, + "grad_norm": 7.78125, + "learning_rate": 7.471599432947019e-07, + "loss": 0.97680225, + "memory(GiB)": 302.58, + "step": 297740, + "train_speed(iter/s)": 0.123668 + }, + { + "acc": 0.73157248, + "epoch": 1.66521495371529, + "grad_norm": 6.0, + "learning_rate": 7.466737459978818e-07, + "loss": 1.06218815, + "memory(GiB)": 302.58, + "step": 297760, + "train_speed(iter/s)": 0.123672 + }, + { + "acc": 0.74265337, + "epoch": 1.665326803188269, + "grad_norm": 6.25, + "learning_rate": 7.46187694178307e-07, + "loss": 1.0291954, + "memory(GiB)": 302.58, + "step": 297780, + "train_speed(iter/s)": 0.123676 + }, + { + "acc": 0.73962741, + "epoch": 1.6654386526612486, + "grad_norm": 5.53125, + "learning_rate": 7.457017878526002e-07, + "loss": 1.02209454, + "memory(GiB)": 302.58, + "step": 297800, + "train_speed(iter/s)": 0.12368 + }, + { + "acc": 0.7519331, + "epoch": 1.6655505021342276, + "grad_norm": 8.0, + "learning_rate": 7.452160270373832e-07, + "loss": 0.96412449, + "memory(GiB)": 302.58, + "step": 297820, + "train_speed(iter/s)": 0.123684 + }, + { + "acc": 0.74387193, + "epoch": 1.665662351607207, + "grad_norm": 10.0625, + "learning_rate": 7.447304117492698e-07, + "loss": 1.01517, + "memory(GiB)": 302.58, + "step": 297840, + "train_speed(iter/s)": 0.123688 + }, + { + "acc": 0.75512295, + "epoch": 1.6657742010801861, + "grad_norm": 7.0625, + "learning_rate": 7.442449420048691e-07, + "loss": 0.96287155, + "memory(GiB)": 302.58, + "step": 297860, + "train_speed(iter/s)": 0.123692 + }, + { + "acc": 0.75707092, + "epoch": 1.6658860505531656, + "grad_norm": 6.96875, + "learning_rate": 7.437596178207862e-07, + "loss": 0.95240755, + "memory(GiB)": 302.58, + "step": 297880, + "train_speed(iter/s)": 0.123696 + }, + { + "acc": 0.72908549, + "epoch": 1.6659979000261447, + "grad_norm": 5.46875, + "learning_rate": 7.432744392136209e-07, + "loss": 1.06894741, + "memory(GiB)": 302.58, + "step": 297900, + "train_speed(iter/s)": 0.1237 + }, + { + "acc": 0.74283695, + "epoch": 1.6661097494991242, + "grad_norm": 6.96875, + "learning_rate": 7.427894061999674e-07, + "loss": 1.02013197, + "memory(GiB)": 302.58, + "step": 297920, + "train_speed(iter/s)": 0.123704 + }, + { + "acc": 0.74633927, + "epoch": 1.6662215989721032, + "grad_norm": 5.09375, + "learning_rate": 7.423045187964151e-07, + "loss": 1.00302887, + "memory(GiB)": 302.58, + "step": 297940, + "train_speed(iter/s)": 0.123707 + }, + { + "acc": 0.74536471, + "epoch": 1.6663334484450827, + "grad_norm": 6.9375, + "learning_rate": 7.41819777019549e-07, + "loss": 1.00479193, + "memory(GiB)": 302.58, + "step": 297960, + "train_speed(iter/s)": 0.123711 + }, + { + "acc": 0.73944077, + "epoch": 1.6664452979180617, + "grad_norm": 9.25, + "learning_rate": 7.413351808859493e-07, + "loss": 1.05141516, + "memory(GiB)": 302.58, + "step": 297980, + "train_speed(iter/s)": 0.123715 + }, + { + "acc": 0.74314194, + "epoch": 1.6665571473910412, + "grad_norm": 7.15625, + "learning_rate": 7.408507304121898e-07, + "loss": 1.02014732, + "memory(GiB)": 302.58, + "step": 298000, + "train_speed(iter/s)": 0.123719 + }, + { + "epoch": 1.6665571473910412, + "eval_acc": 0.7069178870332593, + "eval_loss": 1.011829137802124, + "eval_runtime": 7540.9019, + "eval_samples_per_second": 9.983, + "eval_steps_per_second": 9.983, + "step": 298000 + }, + { + "acc": 0.74415393, + "epoch": 1.6666689968640203, + "grad_norm": 6.84375, + "learning_rate": 7.403664256148413e-07, + "loss": 1.02972422, + "memory(GiB)": 302.58, + "step": 298020, + "train_speed(iter/s)": 0.12333 + }, + { + "acc": 0.751647, + "epoch": 1.6667808463369997, + "grad_norm": 5.53125, + "learning_rate": 7.398822665104677e-07, + "loss": 0.96714506, + "memory(GiB)": 302.58, + "step": 298040, + "train_speed(iter/s)": 0.123334 + }, + { + "acc": 0.74571991, + "epoch": 1.6668926958099788, + "grad_norm": 8.6875, + "learning_rate": 7.393982531156279e-07, + "loss": 1.02147188, + "memory(GiB)": 302.58, + "step": 298060, + "train_speed(iter/s)": 0.123337 + }, + { + "acc": 0.75952382, + "epoch": 1.6670045452829583, + "grad_norm": 5.5, + "learning_rate": 7.389143854468789e-07, + "loss": 0.94080095, + "memory(GiB)": 302.58, + "step": 298080, + "train_speed(iter/s)": 0.123341 + }, + { + "acc": 0.76377058, + "epoch": 1.6671163947559373, + "grad_norm": 8.875, + "learning_rate": 7.384306635207695e-07, + "loss": 0.9078146, + "memory(GiB)": 302.58, + "step": 298100, + "train_speed(iter/s)": 0.123345 + }, + { + "acc": 0.74150248, + "epoch": 1.6672282442289168, + "grad_norm": 5.53125, + "learning_rate": 7.379470873538441e-07, + "loss": 1.04428043, + "memory(GiB)": 302.58, + "step": 298120, + "train_speed(iter/s)": 0.123349 + }, + { + "acc": 0.75748906, + "epoch": 1.6673400937018958, + "grad_norm": 5.9375, + "learning_rate": 7.374636569626436e-07, + "loss": 0.97206631, + "memory(GiB)": 302.58, + "step": 298140, + "train_speed(iter/s)": 0.123353 + }, + { + "acc": 0.73675547, + "epoch": 1.6674519431748753, + "grad_norm": 6.53125, + "learning_rate": 7.36980372363702e-07, + "loss": 1.04018259, + "memory(GiB)": 302.58, + "step": 298160, + "train_speed(iter/s)": 0.123357 + }, + { + "acc": 0.76142216, + "epoch": 1.6675637926478544, + "grad_norm": 7.65625, + "learning_rate": 7.364972335735488e-07, + "loss": 0.94010324, + "memory(GiB)": 302.58, + "step": 298180, + "train_speed(iter/s)": 0.123361 + }, + { + "acc": 0.7519217, + "epoch": 1.6676756421208339, + "grad_norm": 8.4375, + "learning_rate": 7.360142406087101e-07, + "loss": 0.96949339, + "memory(GiB)": 302.58, + "step": 298200, + "train_speed(iter/s)": 0.123365 + }, + { + "acc": 0.76187053, + "epoch": 1.667787491593813, + "grad_norm": 9.3125, + "learning_rate": 7.355313934857044e-07, + "loss": 0.92257633, + "memory(GiB)": 302.58, + "step": 298220, + "train_speed(iter/s)": 0.123369 + }, + { + "acc": 0.74035521, + "epoch": 1.6678993410667924, + "grad_norm": 4.0, + "learning_rate": 7.350486922210482e-07, + "loss": 1.02947512, + "memory(GiB)": 302.58, + "step": 298240, + "train_speed(iter/s)": 0.123373 + }, + { + "acc": 0.77291117, + "epoch": 1.6680111905397714, + "grad_norm": 7.0, + "learning_rate": 7.345661368312501e-07, + "loss": 0.87403393, + "memory(GiB)": 302.58, + "step": 298260, + "train_speed(iter/s)": 0.123377 + }, + { + "acc": 0.73136282, + "epoch": 1.668123040012751, + "grad_norm": 10.0, + "learning_rate": 7.34083727332815e-07, + "loss": 1.0738472, + "memory(GiB)": 302.58, + "step": 298280, + "train_speed(iter/s)": 0.12338 + }, + { + "acc": 0.74789634, + "epoch": 1.66823488948573, + "grad_norm": 7.0625, + "learning_rate": 7.336014637422439e-07, + "loss": 0.98533211, + "memory(GiB)": 302.58, + "step": 298300, + "train_speed(iter/s)": 0.123384 + }, + { + "acc": 0.75101705, + "epoch": 1.6683467389587094, + "grad_norm": 6.84375, + "learning_rate": 7.331193460760305e-07, + "loss": 0.97686806, + "memory(GiB)": 302.58, + "step": 298320, + "train_speed(iter/s)": 0.123388 + }, + { + "acc": 0.76408415, + "epoch": 1.6684585884316885, + "grad_norm": 5.875, + "learning_rate": 7.326373743506649e-07, + "loss": 0.92000008, + "memory(GiB)": 302.58, + "step": 298340, + "train_speed(iter/s)": 0.123391 + }, + { + "acc": 0.74832039, + "epoch": 1.668570437904668, + "grad_norm": 7.28125, + "learning_rate": 7.321555485826332e-07, + "loss": 0.98534107, + "memory(GiB)": 302.58, + "step": 298360, + "train_speed(iter/s)": 0.123395 + }, + { + "acc": 0.75816236, + "epoch": 1.668682287377647, + "grad_norm": 8.9375, + "learning_rate": 7.316738687884151e-07, + "loss": 0.95070333, + "memory(GiB)": 302.58, + "step": 298380, + "train_speed(iter/s)": 0.123399 + }, + { + "acc": 0.75413156, + "epoch": 1.6687941368506265, + "grad_norm": 8.5625, + "learning_rate": 7.311923349844851e-07, + "loss": 0.9574398, + "memory(GiB)": 302.58, + "step": 298400, + "train_speed(iter/s)": 0.123403 + }, + { + "acc": 0.75640502, + "epoch": 1.6689059863236055, + "grad_norm": 5.84375, + "learning_rate": 7.30710947187313e-07, + "loss": 0.95321817, + "memory(GiB)": 302.58, + "step": 298420, + "train_speed(iter/s)": 0.123407 + }, + { + "acc": 0.74305348, + "epoch": 1.669017835796585, + "grad_norm": 13.625, + "learning_rate": 7.302297054133644e-07, + "loss": 1.00766182, + "memory(GiB)": 302.58, + "step": 298440, + "train_speed(iter/s)": 0.123411 + }, + { + "acc": 0.76293316, + "epoch": 1.669129685269564, + "grad_norm": 10.3125, + "learning_rate": 7.297486096790974e-07, + "loss": 0.90856876, + "memory(GiB)": 302.58, + "step": 298460, + "train_speed(iter/s)": 0.123415 + }, + { + "acc": 0.7414144, + "epoch": 1.6692415347425436, + "grad_norm": 6.40625, + "learning_rate": 7.292676600009701e-07, + "loss": 1.00313425, + "memory(GiB)": 302.58, + "step": 298480, + "train_speed(iter/s)": 0.123419 + }, + { + "acc": 0.76069083, + "epoch": 1.6693533842155226, + "grad_norm": 10.0, + "learning_rate": 7.287868563954303e-07, + "loss": 0.94897842, + "memory(GiB)": 302.58, + "step": 298500, + "train_speed(iter/s)": 0.123423 + }, + { + "acc": 0.75581856, + "epoch": 1.669465233688502, + "grad_norm": 10.9375, + "learning_rate": 7.283061988789242e-07, + "loss": 0.95155659, + "memory(GiB)": 302.58, + "step": 298520, + "train_speed(iter/s)": 0.123426 + }, + { + "acc": 0.75471683, + "epoch": 1.6695770831614811, + "grad_norm": 7.09375, + "learning_rate": 7.278256874678913e-07, + "loss": 0.95131054, + "memory(GiB)": 302.58, + "step": 298540, + "train_speed(iter/s)": 0.12343 + }, + { + "acc": 0.75661716, + "epoch": 1.6696889326344606, + "grad_norm": 5.65625, + "learning_rate": 7.273453221787668e-07, + "loss": 0.94117975, + "memory(GiB)": 302.58, + "step": 298560, + "train_speed(iter/s)": 0.123434 + }, + { + "acc": 0.76208906, + "epoch": 1.6698007821074397, + "grad_norm": 7.125, + "learning_rate": 7.2686510302798e-07, + "loss": 0.91676531, + "memory(GiB)": 302.58, + "step": 298580, + "train_speed(iter/s)": 0.123438 + }, + { + "acc": 0.75697656, + "epoch": 1.6699126315804191, + "grad_norm": 8.0625, + "learning_rate": 7.263850300319564e-07, + "loss": 0.94856472, + "memory(GiB)": 302.58, + "step": 298600, + "train_speed(iter/s)": 0.123442 + }, + { + "acc": 0.7635757, + "epoch": 1.6700244810533982, + "grad_norm": 6.53125, + "learning_rate": 7.259051032071163e-07, + "loss": 0.93382149, + "memory(GiB)": 302.58, + "step": 298620, + "train_speed(iter/s)": 0.123446 + }, + { + "acc": 0.76092825, + "epoch": 1.6701363305263777, + "grad_norm": 6.625, + "learning_rate": 7.254253225698743e-07, + "loss": 0.92928762, + "memory(GiB)": 302.58, + "step": 298640, + "train_speed(iter/s)": 0.12345 + }, + { + "acc": 0.75708966, + "epoch": 1.6702481799993567, + "grad_norm": 4.875, + "learning_rate": 7.249456881366407e-07, + "loss": 0.95054951, + "memory(GiB)": 302.58, + "step": 298660, + "train_speed(iter/s)": 0.123454 + }, + { + "acc": 0.76614356, + "epoch": 1.6703600294723362, + "grad_norm": 8.0625, + "learning_rate": 7.244661999238195e-07, + "loss": 0.91744537, + "memory(GiB)": 302.58, + "step": 298680, + "train_speed(iter/s)": 0.123458 + }, + { + "acc": 0.73545814, + "epoch": 1.6704718789453152, + "grad_norm": 7.4375, + "learning_rate": 7.239868579478127e-07, + "loss": 1.04594316, + "memory(GiB)": 302.58, + "step": 298700, + "train_speed(iter/s)": 0.123462 + }, + { + "acc": 0.74401278, + "epoch": 1.6705837284182947, + "grad_norm": 8.9375, + "learning_rate": 7.235076622250137e-07, + "loss": 1.0176753, + "memory(GiB)": 302.58, + "step": 298720, + "train_speed(iter/s)": 0.123465 + }, + { + "acc": 0.74115739, + "epoch": 1.6706955778912738, + "grad_norm": 6.5, + "learning_rate": 7.230286127718134e-07, + "loss": 1.02810478, + "memory(GiB)": 302.58, + "step": 298740, + "train_speed(iter/s)": 0.123469 + }, + { + "acc": 0.74732041, + "epoch": 1.6708074273642533, + "grad_norm": 7.375, + "learning_rate": 7.225497096045963e-07, + "loss": 0.98676586, + "memory(GiB)": 302.58, + "step": 298760, + "train_speed(iter/s)": 0.123473 + }, + { + "acc": 0.74743528, + "epoch": 1.6709192768372323, + "grad_norm": 6.59375, + "learning_rate": 7.220709527397429e-07, + "loss": 1.00417662, + "memory(GiB)": 302.58, + "step": 298780, + "train_speed(iter/s)": 0.123477 + }, + { + "acc": 0.76080418, + "epoch": 1.6710311263102118, + "grad_norm": 8.1875, + "learning_rate": 7.215923421936271e-07, + "loss": 0.93305016, + "memory(GiB)": 302.58, + "step": 298800, + "train_speed(iter/s)": 0.123481 + }, + { + "acc": 0.76070838, + "epoch": 1.6711429757831908, + "grad_norm": 7.5625, + "learning_rate": 7.211138779826205e-07, + "loss": 0.94554014, + "memory(GiB)": 302.58, + "step": 298820, + "train_speed(iter/s)": 0.123485 + }, + { + "acc": 0.74759007, + "epoch": 1.6712548252561703, + "grad_norm": 11.125, + "learning_rate": 7.206355601230869e-07, + "loss": 0.97218332, + "memory(GiB)": 302.58, + "step": 298840, + "train_speed(iter/s)": 0.123489 + }, + { + "acc": 0.769068, + "epoch": 1.6713666747291493, + "grad_norm": 7.21875, + "learning_rate": 7.201573886313867e-07, + "loss": 0.89842024, + "memory(GiB)": 302.58, + "step": 298860, + "train_speed(iter/s)": 0.123493 + }, + { + "acc": 0.75421066, + "epoch": 1.6714785242021288, + "grad_norm": 5.71875, + "learning_rate": 7.196793635238747e-07, + "loss": 0.96726818, + "memory(GiB)": 302.58, + "step": 298880, + "train_speed(iter/s)": 0.123497 + }, + { + "acc": 0.74072766, + "epoch": 1.6715903736751079, + "grad_norm": 5.875, + "learning_rate": 7.19201484816901e-07, + "loss": 1.0262969, + "memory(GiB)": 302.58, + "step": 298900, + "train_speed(iter/s)": 0.123501 + }, + { + "acc": 0.74053974, + "epoch": 1.6717022231480874, + "grad_norm": 8.0, + "learning_rate": 7.18723752526811e-07, + "loss": 1.01340828, + "memory(GiB)": 302.58, + "step": 298920, + "train_speed(iter/s)": 0.123505 + }, + { + "acc": 0.75154533, + "epoch": 1.6718140726210664, + "grad_norm": 7.25, + "learning_rate": 7.182461666699425e-07, + "loss": 0.97150793, + "memory(GiB)": 302.58, + "step": 298940, + "train_speed(iter/s)": 0.123509 + }, + { + "acc": 0.75436292, + "epoch": 1.671925922094046, + "grad_norm": 7.0, + "learning_rate": 7.177687272626338e-07, + "loss": 0.95385504, + "memory(GiB)": 302.58, + "step": 298960, + "train_speed(iter/s)": 0.123513 + }, + { + "acc": 0.75493326, + "epoch": 1.672037771567025, + "grad_norm": 5.9375, + "learning_rate": 7.172914343212128e-07, + "loss": 0.97259693, + "memory(GiB)": 302.58, + "step": 298980, + "train_speed(iter/s)": 0.123517 + }, + { + "acc": 0.76338873, + "epoch": 1.6721496210400044, + "grad_norm": 6.65625, + "learning_rate": 7.168142878620049e-07, + "loss": 0.91820936, + "memory(GiB)": 302.58, + "step": 299000, + "train_speed(iter/s)": 0.123521 + }, + { + "acc": 0.74737425, + "epoch": 1.6722614705129835, + "grad_norm": 7.5625, + "learning_rate": 7.163372879013303e-07, + "loss": 0.99933939, + "memory(GiB)": 302.58, + "step": 299020, + "train_speed(iter/s)": 0.123524 + }, + { + "acc": 0.73826742, + "epoch": 1.672373319985963, + "grad_norm": 7.78125, + "learning_rate": 7.158604344555031e-07, + "loss": 1.01640568, + "memory(GiB)": 302.58, + "step": 299040, + "train_speed(iter/s)": 0.123528 + }, + { + "acc": 0.75383663, + "epoch": 1.672485169458942, + "grad_norm": 5.59375, + "learning_rate": 7.153837275408343e-07, + "loss": 0.96894131, + "memory(GiB)": 302.58, + "step": 299060, + "train_speed(iter/s)": 0.123532 + }, + { + "acc": 0.74695415, + "epoch": 1.6725970189319215, + "grad_norm": 9.1875, + "learning_rate": 7.14907167173628e-07, + "loss": 0.99756536, + "memory(GiB)": 302.58, + "step": 299080, + "train_speed(iter/s)": 0.123536 + }, + { + "acc": 0.73153768, + "epoch": 1.6727088684049007, + "grad_norm": 6.90625, + "learning_rate": 7.144307533701838e-07, + "loss": 1.06813478, + "memory(GiB)": 302.58, + "step": 299100, + "train_speed(iter/s)": 0.12354 + }, + { + "acc": 0.76188607, + "epoch": 1.67282071787788, + "grad_norm": 8.4375, + "learning_rate": 7.139544861467973e-07, + "loss": 0.93011551, + "memory(GiB)": 302.58, + "step": 299120, + "train_speed(iter/s)": 0.123544 + }, + { + "acc": 0.75480318, + "epoch": 1.6729325673508593, + "grad_norm": 8.5625, + "learning_rate": 7.134783655197581e-07, + "loss": 0.98438473, + "memory(GiB)": 302.58, + "step": 299140, + "train_speed(iter/s)": 0.123548 + }, + { + "acc": 0.7448967, + "epoch": 1.6730444168238385, + "grad_norm": 6.375, + "learning_rate": 7.130023915053507e-07, + "loss": 0.99334354, + "memory(GiB)": 302.58, + "step": 299160, + "train_speed(iter/s)": 0.123551 + }, + { + "acc": 0.74214268, + "epoch": 1.6731562662968178, + "grad_norm": 4.96875, + "learning_rate": 7.125265641198553e-07, + "loss": 1.01900187, + "memory(GiB)": 302.58, + "step": 299180, + "train_speed(iter/s)": 0.123555 + }, + { + "acc": 0.7546433, + "epoch": 1.673268115769797, + "grad_norm": 4.96875, + "learning_rate": 7.120508833795469e-07, + "loss": 0.94639969, + "memory(GiB)": 302.58, + "step": 299200, + "train_speed(iter/s)": 0.123559 + }, + { + "acc": 0.73975806, + "epoch": 1.6733799652427763, + "grad_norm": 6.78125, + "learning_rate": 7.115753493006939e-07, + "loss": 1.01373634, + "memory(GiB)": 302.58, + "step": 299220, + "train_speed(iter/s)": 0.123563 + }, + { + "acc": 0.74771442, + "epoch": 1.6734918147157556, + "grad_norm": 7.84375, + "learning_rate": 7.110999618995629e-07, + "loss": 0.99828386, + "memory(GiB)": 302.58, + "step": 299240, + "train_speed(iter/s)": 0.123567 + }, + { + "acc": 0.75644536, + "epoch": 1.6736036641887349, + "grad_norm": 7.21875, + "learning_rate": 7.106247211924133e-07, + "loss": 0.95845461, + "memory(GiB)": 302.58, + "step": 299260, + "train_speed(iter/s)": 0.123571 + }, + { + "acc": 0.74658332, + "epoch": 1.6737155136617141, + "grad_norm": 6.34375, + "learning_rate": 7.101496271954995e-07, + "loss": 1.00668659, + "memory(GiB)": 302.58, + "step": 299280, + "train_speed(iter/s)": 0.123575 + }, + { + "acc": 0.75000677, + "epoch": 1.6738273631346934, + "grad_norm": 7.5625, + "learning_rate": 7.096746799250708e-07, + "loss": 0.98020372, + "memory(GiB)": 302.58, + "step": 299300, + "train_speed(iter/s)": 0.123578 + }, + { + "acc": 0.74440055, + "epoch": 1.6739392126076726, + "grad_norm": 6.65625, + "learning_rate": 7.091998793973726e-07, + "loss": 1.0042819, + "memory(GiB)": 302.58, + "step": 299320, + "train_speed(iter/s)": 0.123582 + }, + { + "acc": 0.76114526, + "epoch": 1.674051062080652, + "grad_norm": 8.0, + "learning_rate": 7.087252256286443e-07, + "loss": 0.9398262, + "memory(GiB)": 302.58, + "step": 299340, + "train_speed(iter/s)": 0.123586 + }, + { + "acc": 0.755441, + "epoch": 1.6741629115536312, + "grad_norm": 5.71875, + "learning_rate": 7.0825071863512e-07, + "loss": 0.95030317, + "memory(GiB)": 302.58, + "step": 299360, + "train_speed(iter/s)": 0.123591 + }, + { + "acc": 0.73060703, + "epoch": 1.6742747610266104, + "grad_norm": 7.1875, + "learning_rate": 7.077763584330305e-07, + "loss": 1.06131544, + "memory(GiB)": 302.58, + "step": 299380, + "train_speed(iter/s)": 0.123595 + }, + { + "acc": 0.75550351, + "epoch": 1.6743866104995897, + "grad_norm": 6.46875, + "learning_rate": 7.073021450385998e-07, + "loss": 0.93590603, + "memory(GiB)": 302.58, + "step": 299400, + "train_speed(iter/s)": 0.123598 + }, + { + "acc": 0.76691957, + "epoch": 1.674498459972569, + "grad_norm": 5.84375, + "learning_rate": 7.068280784680476e-07, + "loss": 0.90229235, + "memory(GiB)": 302.58, + "step": 299420, + "train_speed(iter/s)": 0.123602 + }, + { + "acc": 0.76445379, + "epoch": 1.6746103094455482, + "grad_norm": 6.875, + "learning_rate": 7.063541587375883e-07, + "loss": 0.91422949, + "memory(GiB)": 302.58, + "step": 299440, + "train_speed(iter/s)": 0.123606 + }, + { + "acc": 0.7326879, + "epoch": 1.6747221589185275, + "grad_norm": 8.0, + "learning_rate": 7.058803858634311e-07, + "loss": 1.03779612, + "memory(GiB)": 302.58, + "step": 299460, + "train_speed(iter/s)": 0.12361 + }, + { + "acc": 0.7482995, + "epoch": 1.6748340083915068, + "grad_norm": 7.5625, + "learning_rate": 7.054067598617803e-07, + "loss": 0.97712889, + "memory(GiB)": 302.58, + "step": 299480, + "train_speed(iter/s)": 0.123614 + }, + { + "acc": 0.7555912, + "epoch": 1.674945857864486, + "grad_norm": 9.75, + "learning_rate": 7.049332807488368e-07, + "loss": 0.95269613, + "memory(GiB)": 302.58, + "step": 299500, + "train_speed(iter/s)": 0.123618 + }, + { + "acc": 0.7609664, + "epoch": 1.6750577073374653, + "grad_norm": 6.59375, + "learning_rate": 7.044599485407949e-07, + "loss": 0.94063501, + "memory(GiB)": 302.58, + "step": 299520, + "train_speed(iter/s)": 0.123622 + }, + { + "acc": 0.74136701, + "epoch": 1.6751695568104445, + "grad_norm": 5.25, + "learning_rate": 7.039867632538433e-07, + "loss": 1.04545374, + "memory(GiB)": 302.58, + "step": 299540, + "train_speed(iter/s)": 0.123626 + }, + { + "acc": 0.73940806, + "epoch": 1.6752814062834238, + "grad_norm": 5.1875, + "learning_rate": 7.035137249041668e-07, + "loss": 1.03927803, + "memory(GiB)": 302.58, + "step": 299560, + "train_speed(iter/s)": 0.12363 + }, + { + "acc": 0.76247058, + "epoch": 1.675393255756403, + "grad_norm": 6.0625, + "learning_rate": 7.030408335079447e-07, + "loss": 0.93564463, + "memory(GiB)": 302.58, + "step": 299580, + "train_speed(iter/s)": 0.123634 + }, + { + "acc": 0.75643735, + "epoch": 1.6755051052293823, + "grad_norm": 7.75, + "learning_rate": 7.025680890813513e-07, + "loss": 0.95152512, + "memory(GiB)": 302.58, + "step": 299600, + "train_speed(iter/s)": 0.123638 + }, + { + "acc": 0.74797754, + "epoch": 1.6756169547023616, + "grad_norm": 4.75, + "learning_rate": 7.020954916405559e-07, + "loss": 1.0070858, + "memory(GiB)": 302.58, + "step": 299620, + "train_speed(iter/s)": 0.123641 + }, + { + "acc": 0.76686125, + "epoch": 1.6757288041753409, + "grad_norm": 5.3125, + "learning_rate": 7.016230412017234e-07, + "loss": 0.91610422, + "memory(GiB)": 302.58, + "step": 299640, + "train_speed(iter/s)": 0.123645 + }, + { + "acc": 0.74198217, + "epoch": 1.6758406536483201, + "grad_norm": 9.0, + "learning_rate": 7.011507377810123e-07, + "loss": 1.00701571, + "memory(GiB)": 302.58, + "step": 299660, + "train_speed(iter/s)": 0.123649 + }, + { + "acc": 0.76985655, + "epoch": 1.6759525031212994, + "grad_norm": 7.5, + "learning_rate": 7.006785813945776e-07, + "loss": 0.88925667, + "memory(GiB)": 302.58, + "step": 299680, + "train_speed(iter/s)": 0.123653 + }, + { + "acc": 0.7479691, + "epoch": 1.6760643525942787, + "grad_norm": 10.125, + "learning_rate": 7.002065720585677e-07, + "loss": 1.00295219, + "memory(GiB)": 302.58, + "step": 299700, + "train_speed(iter/s)": 0.123656 + }, + { + "acc": 0.76251955, + "epoch": 1.676176202067258, + "grad_norm": 7.625, + "learning_rate": 6.997347097891266e-07, + "loss": 0.92151117, + "memory(GiB)": 302.58, + "step": 299720, + "train_speed(iter/s)": 0.12366 + }, + { + "acc": 0.77143469, + "epoch": 1.6762880515402372, + "grad_norm": 10.625, + "learning_rate": 6.992629946023948e-07, + "loss": 0.89609747, + "memory(GiB)": 302.58, + "step": 299740, + "train_speed(iter/s)": 0.123664 + }, + { + "acc": 0.76086469, + "epoch": 1.6763999010132165, + "grad_norm": 10.6875, + "learning_rate": 6.987914265145062e-07, + "loss": 0.93485117, + "memory(GiB)": 302.58, + "step": 299760, + "train_speed(iter/s)": 0.123668 + }, + { + "acc": 0.75272508, + "epoch": 1.6765117504861957, + "grad_norm": 7.03125, + "learning_rate": 6.983200055415901e-07, + "loss": 0.96688108, + "memory(GiB)": 302.58, + "step": 299780, + "train_speed(iter/s)": 0.123672 + }, + { + "acc": 0.7544065, + "epoch": 1.676623599959175, + "grad_norm": 11.375, + "learning_rate": 6.978487316997696e-07, + "loss": 0.9454217, + "memory(GiB)": 302.58, + "step": 299800, + "train_speed(iter/s)": 0.123676 + }, + { + "acc": 0.75064182, + "epoch": 1.6767354494321542, + "grad_norm": 9.125, + "learning_rate": 6.973776050051628e-07, + "loss": 1.0036274, + "memory(GiB)": 302.58, + "step": 299820, + "train_speed(iter/s)": 0.12368 + }, + { + "acc": 0.74758162, + "epoch": 1.6768472989051335, + "grad_norm": 10.375, + "learning_rate": 6.969066254738865e-07, + "loss": 1.00748987, + "memory(GiB)": 302.58, + "step": 299840, + "train_speed(iter/s)": 0.123684 + }, + { + "acc": 0.75358038, + "epoch": 1.6769591483781128, + "grad_norm": 5.875, + "learning_rate": 6.964357931220484e-07, + "loss": 0.96051025, + "memory(GiB)": 302.58, + "step": 299860, + "train_speed(iter/s)": 0.123688 + }, + { + "acc": 0.72658706, + "epoch": 1.677070997851092, + "grad_norm": 8.3125, + "learning_rate": 6.959651079657526e-07, + "loss": 1.07842464, + "memory(GiB)": 302.58, + "step": 299880, + "train_speed(iter/s)": 0.123692 + }, + { + "acc": 0.75317855, + "epoch": 1.6771828473240713, + "grad_norm": 7.875, + "learning_rate": 6.954945700210974e-07, + "loss": 0.95099735, + "memory(GiB)": 302.58, + "step": 299900, + "train_speed(iter/s)": 0.123696 + }, + { + "acc": 0.76563497, + "epoch": 1.6772946967970506, + "grad_norm": 7.5, + "learning_rate": 6.950241793041773e-07, + "loss": 0.92690229, + "memory(GiB)": 302.58, + "step": 299920, + "train_speed(iter/s)": 0.123699 + }, + { + "acc": 0.75621643, + "epoch": 1.6774065462700298, + "grad_norm": 9.6875, + "learning_rate": 6.94553935831081e-07, + "loss": 0.96268253, + "memory(GiB)": 302.58, + "step": 299940, + "train_speed(iter/s)": 0.123703 + }, + { + "acc": 0.75392876, + "epoch": 1.677518395743009, + "grad_norm": 10.3125, + "learning_rate": 6.940838396178923e-07, + "loss": 0.9687479, + "memory(GiB)": 302.58, + "step": 299960, + "train_speed(iter/s)": 0.123707 + }, + { + "acc": 0.74487605, + "epoch": 1.6776302452159884, + "grad_norm": 7.0625, + "learning_rate": 6.9361389068069e-07, + "loss": 0.9824008, + "memory(GiB)": 302.58, + "step": 299980, + "train_speed(iter/s)": 0.123711 + }, + { + "acc": 0.76237726, + "epoch": 1.6777420946889676, + "grad_norm": 10.5625, + "learning_rate": 6.931440890355479e-07, + "loss": 0.93992004, + "memory(GiB)": 302.58, + "step": 300000, + "train_speed(iter/s)": 0.123715 + }, + { + "epoch": 1.6777420946889676, + "eval_acc": 0.7069075840049281, + "eval_loss": 1.0118262767791748, + "eval_runtime": 7545.3382, + "eval_samples_per_second": 9.977, + "eval_steps_per_second": 9.977, + "step": 300000 + }, + { + "acc": 0.74574981, + "epoch": 1.6778539441619469, + "grad_norm": 12.25, + "learning_rate": 6.926744346985348e-07, + "loss": 1.00822954, + "memory(GiB)": 302.58, + "step": 300020, + "train_speed(iter/s)": 0.123328 + }, + { + "acc": 0.75930085, + "epoch": 1.6779657936349262, + "grad_norm": 6.3125, + "learning_rate": 6.922049276857135e-07, + "loss": 0.92705641, + "memory(GiB)": 302.58, + "step": 300040, + "train_speed(iter/s)": 0.123332 + }, + { + "acc": 0.75700192, + "epoch": 1.6780776431079054, + "grad_norm": 6.84375, + "learning_rate": 6.917355680131438e-07, + "loss": 0.96547203, + "memory(GiB)": 302.58, + "step": 300060, + "train_speed(iter/s)": 0.123336 + }, + { + "acc": 0.74268322, + "epoch": 1.6781894925808847, + "grad_norm": 7.625, + "learning_rate": 6.912663556968791e-07, + "loss": 1.0180233, + "memory(GiB)": 302.58, + "step": 300080, + "train_speed(iter/s)": 0.12334 + }, + { + "acc": 0.74236364, + "epoch": 1.678301342053864, + "grad_norm": 7.90625, + "learning_rate": 6.907972907529658e-07, + "loss": 1.02688026, + "memory(GiB)": 302.58, + "step": 300100, + "train_speed(iter/s)": 0.123344 + }, + { + "acc": 0.77327452, + "epoch": 1.6784131915268432, + "grad_norm": 4.8125, + "learning_rate": 6.903283731974508e-07, + "loss": 0.88584175, + "memory(GiB)": 302.58, + "step": 300120, + "train_speed(iter/s)": 0.123348 + }, + { + "acc": 0.74674373, + "epoch": 1.6785250409998225, + "grad_norm": 5.4375, + "learning_rate": 6.898596030463711e-07, + "loss": 1.00724936, + "memory(GiB)": 302.58, + "step": 300140, + "train_speed(iter/s)": 0.123352 + }, + { + "acc": 0.74732075, + "epoch": 1.6786368904728017, + "grad_norm": 9.75, + "learning_rate": 6.8939098031576e-07, + "loss": 1.00421801, + "memory(GiB)": 302.58, + "step": 300160, + "train_speed(iter/s)": 0.123356 + }, + { + "acc": 0.77315598, + "epoch": 1.678748739945781, + "grad_norm": 6.46875, + "learning_rate": 6.889225050216458e-07, + "loss": 0.88973999, + "memory(GiB)": 302.58, + "step": 300180, + "train_speed(iter/s)": 0.12336 + }, + { + "acc": 0.75836983, + "epoch": 1.6788605894187603, + "grad_norm": 7.53125, + "learning_rate": 6.884541771800524e-07, + "loss": 0.94671936, + "memory(GiB)": 302.58, + "step": 300200, + "train_speed(iter/s)": 0.123363 + }, + { + "acc": 0.7374752, + "epoch": 1.6789724388917395, + "grad_norm": 5.84375, + "learning_rate": 6.879859968069974e-07, + "loss": 1.02914057, + "memory(GiB)": 302.58, + "step": 300220, + "train_speed(iter/s)": 0.123367 + }, + { + "acc": 0.77046857, + "epoch": 1.6790842883647188, + "grad_norm": 7.0625, + "learning_rate": 6.875179639184943e-07, + "loss": 0.89114866, + "memory(GiB)": 302.58, + "step": 300240, + "train_speed(iter/s)": 0.123371 + }, + { + "acc": 0.74532609, + "epoch": 1.679196137837698, + "grad_norm": 5.0, + "learning_rate": 6.870500785305512e-07, + "loss": 0.99248466, + "memory(GiB)": 302.58, + "step": 300260, + "train_speed(iter/s)": 0.123375 + }, + { + "acc": 0.7474586, + "epoch": 1.6793079873106773, + "grad_norm": 8.625, + "learning_rate": 6.865823406591715e-07, + "loss": 0.98937721, + "memory(GiB)": 302.58, + "step": 300280, + "train_speed(iter/s)": 0.123379 + }, + { + "acc": 0.73170257, + "epoch": 1.6794198367836566, + "grad_norm": 6.40625, + "learning_rate": 6.861147503203536e-07, + "loss": 1.05440035, + "memory(GiB)": 302.58, + "step": 300300, + "train_speed(iter/s)": 0.123383 + }, + { + "acc": 0.73213406, + "epoch": 1.6795316862566358, + "grad_norm": 7.28125, + "learning_rate": 6.856473075300901e-07, + "loss": 1.05633125, + "memory(GiB)": 302.58, + "step": 300320, + "train_speed(iter/s)": 0.123387 + }, + { + "acc": 0.75427709, + "epoch": 1.6796435357296151, + "grad_norm": 7.5625, + "learning_rate": 6.851800123043689e-07, + "loss": 0.96769352, + "memory(GiB)": 302.58, + "step": 300340, + "train_speed(iter/s)": 0.123391 + }, + { + "acc": 0.7644917, + "epoch": 1.6797553852025944, + "grad_norm": 7.875, + "learning_rate": 6.847128646591717e-07, + "loss": 0.91465311, + "memory(GiB)": 302.58, + "step": 300360, + "train_speed(iter/s)": 0.123395 + }, + { + "acc": 0.75548043, + "epoch": 1.6798672346755736, + "grad_norm": 6.9375, + "learning_rate": 6.842458646104799e-07, + "loss": 0.9664938, + "memory(GiB)": 302.58, + "step": 300380, + "train_speed(iter/s)": 0.123399 + }, + { + "acc": 0.74979858, + "epoch": 1.679979084148553, + "grad_norm": 9.3125, + "learning_rate": 6.837790121742638e-07, + "loss": 0.96919212, + "memory(GiB)": 302.58, + "step": 300400, + "train_speed(iter/s)": 0.123403 + }, + { + "acc": 0.77312107, + "epoch": 1.6800909336215322, + "grad_norm": 7.78125, + "learning_rate": 6.833123073664922e-07, + "loss": 0.8602253, + "memory(GiB)": 302.58, + "step": 300420, + "train_speed(iter/s)": 0.123407 + }, + { + "acc": 0.75330958, + "epoch": 1.6802027830945114, + "grad_norm": 8.1875, + "learning_rate": 6.828457502031272e-07, + "loss": 0.95664301, + "memory(GiB)": 302.58, + "step": 300440, + "train_speed(iter/s)": 0.12341 + }, + { + "acc": 0.72801862, + "epoch": 1.6803146325674907, + "grad_norm": 4.90625, + "learning_rate": 6.823793407001272e-07, + "loss": 1.07555799, + "memory(GiB)": 302.58, + "step": 300460, + "train_speed(iter/s)": 0.123414 + }, + { + "acc": 0.7492981, + "epoch": 1.68042648204047, + "grad_norm": 6.46875, + "learning_rate": 6.819130788734441e-07, + "loss": 0.9819253, + "memory(GiB)": 302.58, + "step": 300480, + "train_speed(iter/s)": 0.123419 + }, + { + "acc": 0.74795218, + "epoch": 1.6805383315134492, + "grad_norm": 4.53125, + "learning_rate": 6.814469647390265e-07, + "loss": 0.99613209, + "memory(GiB)": 302.58, + "step": 300500, + "train_speed(iter/s)": 0.123422 + }, + { + "acc": 0.74355912, + "epoch": 1.6806501809864285, + "grad_norm": 5.96875, + "learning_rate": 6.809809983128163e-07, + "loss": 1.00507536, + "memory(GiB)": 302.58, + "step": 300520, + "train_speed(iter/s)": 0.123426 + }, + { + "acc": 0.74216094, + "epoch": 1.6807620304594078, + "grad_norm": 9.625, + "learning_rate": 6.805151796107507e-07, + "loss": 1.01074667, + "memory(GiB)": 302.58, + "step": 300540, + "train_speed(iter/s)": 0.12343 + }, + { + "acc": 0.72829561, + "epoch": 1.680873879932387, + "grad_norm": 6.59375, + "learning_rate": 6.800495086487635e-07, + "loss": 1.08929729, + "memory(GiB)": 302.58, + "step": 300560, + "train_speed(iter/s)": 0.123434 + }, + { + "acc": 0.74911828, + "epoch": 1.6809857294053663, + "grad_norm": 6.375, + "learning_rate": 6.795839854427805e-07, + "loss": 0.97221136, + "memory(GiB)": 302.58, + "step": 300580, + "train_speed(iter/s)": 0.123438 + }, + { + "acc": 0.73701487, + "epoch": 1.6810975788783455, + "grad_norm": 6.8125, + "learning_rate": 6.791186100087254e-07, + "loss": 1.0213707, + "memory(GiB)": 302.58, + "step": 300600, + "train_speed(iter/s)": 0.123442 + }, + { + "acc": 0.7665194, + "epoch": 1.6812094283513248, + "grad_norm": 6.96875, + "learning_rate": 6.786533823625152e-07, + "loss": 0.9055109, + "memory(GiB)": 302.58, + "step": 300620, + "train_speed(iter/s)": 0.123445 + }, + { + "acc": 0.74939628, + "epoch": 1.681321277824304, + "grad_norm": 7.53125, + "learning_rate": 6.781883025200603e-07, + "loss": 0.98742161, + "memory(GiB)": 302.58, + "step": 300640, + "train_speed(iter/s)": 0.123449 + }, + { + "acc": 0.76219978, + "epoch": 1.6814331272972833, + "grad_norm": 7.1875, + "learning_rate": 6.777233704972708e-07, + "loss": 0.92241039, + "memory(GiB)": 302.58, + "step": 300660, + "train_speed(iter/s)": 0.123453 + }, + { + "acc": 0.75946145, + "epoch": 1.6815449767702626, + "grad_norm": 8.0625, + "learning_rate": 6.772585863100478e-07, + "loss": 0.94044447, + "memory(GiB)": 302.58, + "step": 300680, + "train_speed(iter/s)": 0.123457 + }, + { + "acc": 0.77072444, + "epoch": 1.6816568262432419, + "grad_norm": 6.65625, + "learning_rate": 6.767939499742876e-07, + "loss": 0.90624762, + "memory(GiB)": 302.58, + "step": 300700, + "train_speed(iter/s)": 0.12346 + }, + { + "acc": 0.73791142, + "epoch": 1.6817686757162211, + "grad_norm": 6.5625, + "learning_rate": 6.763294615058836e-07, + "loss": 1.05037022, + "memory(GiB)": 302.58, + "step": 300720, + "train_speed(iter/s)": 0.123464 + }, + { + "acc": 0.74244285, + "epoch": 1.6818805251892004, + "grad_norm": 9.3125, + "learning_rate": 6.758651209207218e-07, + "loss": 1.0075079, + "memory(GiB)": 302.58, + "step": 300740, + "train_speed(iter/s)": 0.123468 + }, + { + "acc": 0.74986205, + "epoch": 1.6819923746621797, + "grad_norm": 7.46875, + "learning_rate": 6.75400928234684e-07, + "loss": 0.97238512, + "memory(GiB)": 302.58, + "step": 300760, + "train_speed(iter/s)": 0.123472 + }, + { + "acc": 0.75190291, + "epoch": 1.682104224135159, + "grad_norm": 8.625, + "learning_rate": 6.749368834636477e-07, + "loss": 0.96730595, + "memory(GiB)": 302.58, + "step": 300780, + "train_speed(iter/s)": 0.123476 + }, + { + "acc": 0.75152001, + "epoch": 1.6822160736081382, + "grad_norm": 7.4375, + "learning_rate": 6.744729866234839e-07, + "loss": 0.97815514, + "memory(GiB)": 302.58, + "step": 300800, + "train_speed(iter/s)": 0.12348 + }, + { + "acc": 0.75431843, + "epoch": 1.6823279230811174, + "grad_norm": 6.96875, + "learning_rate": 6.740092377300605e-07, + "loss": 0.98261948, + "memory(GiB)": 302.58, + "step": 300820, + "train_speed(iter/s)": 0.123484 + }, + { + "acc": 0.75008731, + "epoch": 1.6824397725540967, + "grad_norm": 7.96875, + "learning_rate": 6.735456367992382e-07, + "loss": 0.98269644, + "memory(GiB)": 302.58, + "step": 300840, + "train_speed(iter/s)": 0.123488 + }, + { + "acc": 0.75369706, + "epoch": 1.682551622027076, + "grad_norm": 7.125, + "learning_rate": 6.730821838468737e-07, + "loss": 0.963486, + "memory(GiB)": 302.58, + "step": 300860, + "train_speed(iter/s)": 0.123491 + }, + { + "acc": 0.73138323, + "epoch": 1.6826634715000552, + "grad_norm": 7.0625, + "learning_rate": 6.726188788888188e-07, + "loss": 1.03560791, + "memory(GiB)": 302.58, + "step": 300880, + "train_speed(iter/s)": 0.123495 + }, + { + "acc": 0.73081999, + "epoch": 1.6827753209730345, + "grad_norm": 7.5, + "learning_rate": 6.721557219409203e-07, + "loss": 1.0527565, + "memory(GiB)": 302.58, + "step": 300900, + "train_speed(iter/s)": 0.123499 + }, + { + "acc": 0.75573835, + "epoch": 1.6828871704460138, + "grad_norm": 7.4375, + "learning_rate": 6.716927130190182e-07, + "loss": 0.95075188, + "memory(GiB)": 302.58, + "step": 300920, + "train_speed(iter/s)": 0.123503 + }, + { + "acc": 0.7648046, + "epoch": 1.682999019918993, + "grad_norm": 7.34375, + "learning_rate": 6.71229852138951e-07, + "loss": 0.9355628, + "memory(GiB)": 302.58, + "step": 300940, + "train_speed(iter/s)": 0.123507 + }, + { + "acc": 0.75373726, + "epoch": 1.6831108693919723, + "grad_norm": 8.25, + "learning_rate": 6.707671393165494e-07, + "loss": 0.9805932, + "memory(GiB)": 302.58, + "step": 300960, + "train_speed(iter/s)": 0.123511 + }, + { + "acc": 0.77394094, + "epoch": 1.6832227188649516, + "grad_norm": 8.9375, + "learning_rate": 6.703045745676379e-07, + "loss": 0.873944, + "memory(GiB)": 302.58, + "step": 300980, + "train_speed(iter/s)": 0.123515 + }, + { + "acc": 0.76313128, + "epoch": 1.6833345683379308, + "grad_norm": 9.75, + "learning_rate": 6.698421579080399e-07, + "loss": 0.93246794, + "memory(GiB)": 302.58, + "step": 301000, + "train_speed(iter/s)": 0.123519 + }, + { + "acc": 0.73816228, + "epoch": 1.68344641781091, + "grad_norm": 6.59375, + "learning_rate": 6.693798893535708e-07, + "loss": 1.03796711, + "memory(GiB)": 302.58, + "step": 301020, + "train_speed(iter/s)": 0.123523 + }, + { + "acc": 0.74817266, + "epoch": 1.6835582672838894, + "grad_norm": 6.34375, + "learning_rate": 6.689177689200416e-07, + "loss": 0.97960749, + "memory(GiB)": 302.58, + "step": 301040, + "train_speed(iter/s)": 0.123527 + }, + { + "acc": 0.75409231, + "epoch": 1.6836701167568686, + "grad_norm": 6.25, + "learning_rate": 6.68455796623258e-07, + "loss": 0.98195534, + "memory(GiB)": 302.58, + "step": 301060, + "train_speed(iter/s)": 0.123531 + }, + { + "acc": 0.74856181, + "epoch": 1.6837819662298479, + "grad_norm": 8.6875, + "learning_rate": 6.679939724790213e-07, + "loss": 1.02024641, + "memory(GiB)": 302.58, + "step": 301080, + "train_speed(iter/s)": 0.123535 + }, + { + "acc": 0.7546298, + "epoch": 1.6838938157028271, + "grad_norm": 7.53125, + "learning_rate": 6.675322965031272e-07, + "loss": 0.96288471, + "memory(GiB)": 302.58, + "step": 301100, + "train_speed(iter/s)": 0.123538 + }, + { + "acc": 0.74585705, + "epoch": 1.6840056651758064, + "grad_norm": 6.65625, + "learning_rate": 6.670707687113664e-07, + "loss": 1.01096735, + "memory(GiB)": 302.58, + "step": 301120, + "train_speed(iter/s)": 0.123542 + }, + { + "acc": 0.76931715, + "epoch": 1.6841175146487857, + "grad_norm": 9.125, + "learning_rate": 6.666093891195247e-07, + "loss": 0.90722904, + "memory(GiB)": 302.58, + "step": 301140, + "train_speed(iter/s)": 0.123546 + }, + { + "acc": 0.75570025, + "epoch": 1.684229364121765, + "grad_norm": 7.03125, + "learning_rate": 6.661481577433826e-07, + "loss": 0.95597649, + "memory(GiB)": 302.58, + "step": 301160, + "train_speed(iter/s)": 0.12355 + }, + { + "acc": 0.74236846, + "epoch": 1.6843412135947442, + "grad_norm": 6.21875, + "learning_rate": 6.65687074598716e-07, + "loss": 1.02409153, + "memory(GiB)": 302.58, + "step": 301180, + "train_speed(iter/s)": 0.123553 + }, + { + "acc": 0.76796179, + "epoch": 1.6844530630677235, + "grad_norm": 6.875, + "learning_rate": 6.65226139701295e-07, + "loss": 0.92274313, + "memory(GiB)": 302.58, + "step": 301200, + "train_speed(iter/s)": 0.123557 + }, + { + "acc": 0.75090995, + "epoch": 1.6845649125407027, + "grad_norm": 6.78125, + "learning_rate": 6.647653530668847e-07, + "loss": 0.97800121, + "memory(GiB)": 302.58, + "step": 301220, + "train_speed(iter/s)": 0.123561 + }, + { + "acc": 0.75310435, + "epoch": 1.684676762013682, + "grad_norm": 8.0, + "learning_rate": 6.643047147112458e-07, + "loss": 0.95715132, + "memory(GiB)": 302.58, + "step": 301240, + "train_speed(iter/s)": 0.123565 + }, + { + "acc": 0.76062188, + "epoch": 1.6847886114866613, + "grad_norm": 5.40625, + "learning_rate": 6.638442246501342e-07, + "loss": 0.95092421, + "memory(GiB)": 302.58, + "step": 301260, + "train_speed(iter/s)": 0.123569 + }, + { + "acc": 0.74214454, + "epoch": 1.6849004609596405, + "grad_norm": 6.8125, + "learning_rate": 6.633838828993e-07, + "loss": 1.0039896, + "memory(GiB)": 302.58, + "step": 301280, + "train_speed(iter/s)": 0.123573 + }, + { + "acc": 0.75757318, + "epoch": 1.6850123104326198, + "grad_norm": 9.3125, + "learning_rate": 6.629236894744878e-07, + "loss": 0.9540863, + "memory(GiB)": 302.58, + "step": 301300, + "train_speed(iter/s)": 0.123577 + }, + { + "acc": 0.75448046, + "epoch": 1.685124159905599, + "grad_norm": 8.0625, + "learning_rate": 6.624636443914384e-07, + "loss": 0.96072273, + "memory(GiB)": 302.58, + "step": 301320, + "train_speed(iter/s)": 0.12358 + }, + { + "acc": 0.74264345, + "epoch": 1.6852360093785783, + "grad_norm": 7.21875, + "learning_rate": 6.620037476658859e-07, + "loss": 1.00012646, + "memory(GiB)": 302.58, + "step": 301340, + "train_speed(iter/s)": 0.123584 + }, + { + "acc": 0.74100933, + "epoch": 1.6853478588515576, + "grad_norm": 8.375, + "learning_rate": 6.615439993135609e-07, + "loss": 1.01708622, + "memory(GiB)": 302.58, + "step": 301360, + "train_speed(iter/s)": 0.123588 + }, + { + "acc": 0.7581151, + "epoch": 1.6854597083245368, + "grad_norm": 8.4375, + "learning_rate": 6.610843993501875e-07, + "loss": 0.93725853, + "memory(GiB)": 302.58, + "step": 301380, + "train_speed(iter/s)": 0.123592 + }, + { + "acc": 0.74444275, + "epoch": 1.685571557797516, + "grad_norm": 6.84375, + "learning_rate": 6.606249477914867e-07, + "loss": 0.98069458, + "memory(GiB)": 302.58, + "step": 301400, + "train_speed(iter/s)": 0.123596 + }, + { + "acc": 0.75472574, + "epoch": 1.6856834072704954, + "grad_norm": 9.75, + "learning_rate": 6.601656446531718e-07, + "loss": 0.96101093, + "memory(GiB)": 302.58, + "step": 301420, + "train_speed(iter/s)": 0.1236 + }, + { + "acc": 0.73283114, + "epoch": 1.6857952567434746, + "grad_norm": 5.625, + "learning_rate": 6.597064899509537e-07, + "loss": 1.06665726, + "memory(GiB)": 302.58, + "step": 301440, + "train_speed(iter/s)": 0.123603 + }, + { + "acc": 0.76043954, + "epoch": 1.685907106216454, + "grad_norm": 5.28125, + "learning_rate": 6.592474837005358e-07, + "loss": 0.94246578, + "memory(GiB)": 302.58, + "step": 301460, + "train_speed(iter/s)": 0.123607 + }, + { + "acc": 0.74149265, + "epoch": 1.6860189556894332, + "grad_norm": 6.375, + "learning_rate": 6.587886259176185e-07, + "loss": 1.01700525, + "memory(GiB)": 302.58, + "step": 301480, + "train_speed(iter/s)": 0.123611 + }, + { + "acc": 0.75115447, + "epoch": 1.6861308051624124, + "grad_norm": 7.5625, + "learning_rate": 6.583299166178953e-07, + "loss": 0.98235846, + "memory(GiB)": 302.58, + "step": 301500, + "train_speed(iter/s)": 0.123615 + }, + { + "acc": 0.74573069, + "epoch": 1.6862426546353917, + "grad_norm": 6.28125, + "learning_rate": 6.578713558170552e-07, + "loss": 1.00359526, + "memory(GiB)": 302.58, + "step": 301520, + "train_speed(iter/s)": 0.123619 + }, + { + "acc": 0.74252934, + "epoch": 1.686354504108371, + "grad_norm": 9.9375, + "learning_rate": 6.574129435307847e-07, + "loss": 1.01598063, + "memory(GiB)": 302.58, + "step": 301540, + "train_speed(iter/s)": 0.123623 + }, + { + "acc": 0.7607861, + "epoch": 1.6864663535813502, + "grad_norm": 10.4375, + "learning_rate": 6.569546797747612e-07, + "loss": 0.9625967, + "memory(GiB)": 302.58, + "step": 301560, + "train_speed(iter/s)": 0.123626 + }, + { + "acc": 0.73915787, + "epoch": 1.6865782030543295, + "grad_norm": 9.3125, + "learning_rate": 6.564965645646593e-07, + "loss": 1.02403126, + "memory(GiB)": 302.58, + "step": 301580, + "train_speed(iter/s)": 0.12363 + }, + { + "acc": 0.75306797, + "epoch": 1.6866900525273087, + "grad_norm": 5.8125, + "learning_rate": 6.560385979161471e-07, + "loss": 0.97102594, + "memory(GiB)": 302.58, + "step": 301600, + "train_speed(iter/s)": 0.123634 + }, + { + "acc": 0.73768888, + "epoch": 1.686801902000288, + "grad_norm": 9.375, + "learning_rate": 6.555807798448899e-07, + "loss": 1.03034515, + "memory(GiB)": 302.58, + "step": 301620, + "train_speed(iter/s)": 0.123638 + }, + { + "acc": 0.77114096, + "epoch": 1.6869137514732673, + "grad_norm": 5.21875, + "learning_rate": 6.551231103665456e-07, + "loss": 0.90906906, + "memory(GiB)": 302.58, + "step": 301640, + "train_speed(iter/s)": 0.123642 + }, + { + "acc": 0.73607306, + "epoch": 1.6870256009462465, + "grad_norm": 6.4375, + "learning_rate": 6.54665589496768e-07, + "loss": 1.04825048, + "memory(GiB)": 302.58, + "step": 301660, + "train_speed(iter/s)": 0.123645 + }, + { + "acc": 0.74474063, + "epoch": 1.6871374504192258, + "grad_norm": 7.03125, + "learning_rate": 6.54208217251206e-07, + "loss": 0.99553061, + "memory(GiB)": 302.58, + "step": 301680, + "train_speed(iter/s)": 0.123649 + }, + { + "acc": 0.75004959, + "epoch": 1.687249299892205, + "grad_norm": 8.9375, + "learning_rate": 6.537509936455034e-07, + "loss": 0.98025208, + "memory(GiB)": 302.58, + "step": 301700, + "train_speed(iter/s)": 0.123653 + }, + { + "acc": 0.75318122, + "epoch": 1.6873611493651843, + "grad_norm": 7.59375, + "learning_rate": 6.53293918695298e-07, + "loss": 0.94549465, + "memory(GiB)": 302.58, + "step": 301720, + "train_speed(iter/s)": 0.123657 + }, + { + "acc": 0.75044065, + "epoch": 1.6874729988381636, + "grad_norm": 8.4375, + "learning_rate": 6.528369924162237e-07, + "loss": 0.99342165, + "memory(GiB)": 302.58, + "step": 301740, + "train_speed(iter/s)": 0.123661 + }, + { + "acc": 0.77848358, + "epoch": 1.6875848483111429, + "grad_norm": 6.03125, + "learning_rate": 6.523802148239083e-07, + "loss": 0.8534152, + "memory(GiB)": 302.58, + "step": 301760, + "train_speed(iter/s)": 0.123664 + }, + { + "acc": 0.73183026, + "epoch": 1.6876966977841221, + "grad_norm": 9.0, + "learning_rate": 6.519235859339762e-07, + "loss": 1.05512466, + "memory(GiB)": 302.58, + "step": 301780, + "train_speed(iter/s)": 0.123668 + }, + { + "acc": 0.74503217, + "epoch": 1.6878085472571014, + "grad_norm": 5.0, + "learning_rate": 6.514671057620436e-07, + "loss": 1.00172358, + "memory(GiB)": 302.58, + "step": 301800, + "train_speed(iter/s)": 0.123672 + }, + { + "acc": 0.72971773, + "epoch": 1.6879203967300807, + "grad_norm": 8.625, + "learning_rate": 6.510107743237259e-07, + "loss": 1.0707077, + "memory(GiB)": 302.58, + "step": 301820, + "train_speed(iter/s)": 0.123676 + }, + { + "acc": 0.74560838, + "epoch": 1.68803224620306, + "grad_norm": 7.9375, + "learning_rate": 6.505545916346296e-07, + "loss": 0.99839315, + "memory(GiB)": 302.58, + "step": 301840, + "train_speed(iter/s)": 0.12368 + }, + { + "acc": 0.75568056, + "epoch": 1.6881440956760392, + "grad_norm": 5.65625, + "learning_rate": 6.500985577103586e-07, + "loss": 0.93605461, + "memory(GiB)": 302.58, + "step": 301860, + "train_speed(iter/s)": 0.123683 + }, + { + "acc": 0.76356091, + "epoch": 1.6882559451490184, + "grad_norm": 7.59375, + "learning_rate": 6.496426725665095e-07, + "loss": 0.91637077, + "memory(GiB)": 302.58, + "step": 301880, + "train_speed(iter/s)": 0.123687 + }, + { + "acc": 0.75199156, + "epoch": 1.6883677946219977, + "grad_norm": 6.21875, + "learning_rate": 6.491869362186759e-07, + "loss": 0.98040466, + "memory(GiB)": 302.58, + "step": 301900, + "train_speed(iter/s)": 0.123691 + }, + { + "acc": 0.76553712, + "epoch": 1.688479644094977, + "grad_norm": 7.21875, + "learning_rate": 6.487313486824454e-07, + "loss": 0.92007608, + "memory(GiB)": 302.58, + "step": 301920, + "train_speed(iter/s)": 0.123695 + }, + { + "acc": 0.74512033, + "epoch": 1.6885914935679562, + "grad_norm": 9.125, + "learning_rate": 6.482759099733998e-07, + "loss": 0.99549532, + "memory(GiB)": 302.58, + "step": 301940, + "train_speed(iter/s)": 0.123698 + }, + { + "acc": 0.75439301, + "epoch": 1.6887033430409355, + "grad_norm": 6.53125, + "learning_rate": 6.478206201071169e-07, + "loss": 0.96964273, + "memory(GiB)": 302.58, + "step": 301960, + "train_speed(iter/s)": 0.123702 + }, + { + "acc": 0.75310683, + "epoch": 1.6888151925139148, + "grad_norm": 7.1875, + "learning_rate": 6.473654790991696e-07, + "loss": 0.97638874, + "memory(GiB)": 302.58, + "step": 301980, + "train_speed(iter/s)": 0.123706 + }, + { + "acc": 0.74250941, + "epoch": 1.688927041986894, + "grad_norm": 3.96875, + "learning_rate": 6.469104869651249e-07, + "loss": 1.02273722, + "memory(GiB)": 302.58, + "step": 302000, + "train_speed(iter/s)": 0.12371 + }, + { + "epoch": 1.688927041986894, + "eval_acc": 0.7069088164245849, + "eval_loss": 1.0118097066879272, + "eval_runtime": 7539.9363, + "eval_samples_per_second": 9.985, + "eval_steps_per_second": 9.985, + "step": 302000 + }, + { + "acc": 0.75492425, + "epoch": 1.6890388914598733, + "grad_norm": 8.125, + "learning_rate": 6.464556437205444e-07, + "loss": 0.95739918, + "memory(GiB)": 302.58, + "step": 302020, + "train_speed(iter/s)": 0.123327 + }, + { + "acc": 0.74820294, + "epoch": 1.6891507409328526, + "grad_norm": 5.6875, + "learning_rate": 6.460009493809855e-07, + "loss": 0.993365, + "memory(GiB)": 302.58, + "step": 302040, + "train_speed(iter/s)": 0.12333 + }, + { + "acc": 0.75696754, + "epoch": 1.6892625904058318, + "grad_norm": 7.5, + "learning_rate": 6.455464039619997e-07, + "loss": 0.96238546, + "memory(GiB)": 302.58, + "step": 302060, + "train_speed(iter/s)": 0.123334 + }, + { + "acc": 0.75262599, + "epoch": 1.689374439878811, + "grad_norm": 6.4375, + "learning_rate": 6.450920074791351e-07, + "loss": 0.96817331, + "memory(GiB)": 302.58, + "step": 302080, + "train_speed(iter/s)": 0.123338 + }, + { + "acc": 0.75237293, + "epoch": 1.6894862893517903, + "grad_norm": 6.0625, + "learning_rate": 6.446377599479331e-07, + "loss": 0.99600258, + "memory(GiB)": 302.58, + "step": 302100, + "train_speed(iter/s)": 0.123342 + }, + { + "acc": 0.7518539, + "epoch": 1.6895981388247696, + "grad_norm": 6.9375, + "learning_rate": 6.441836613839303e-07, + "loss": 0.98975897, + "memory(GiB)": 302.58, + "step": 302120, + "train_speed(iter/s)": 0.123346 + }, + { + "acc": 0.74182267, + "epoch": 1.6897099882977489, + "grad_norm": 5.0, + "learning_rate": 6.437297118026581e-07, + "loss": 1.04460106, + "memory(GiB)": 302.58, + "step": 302140, + "train_speed(iter/s)": 0.12335 + }, + { + "acc": 0.7402873, + "epoch": 1.6898218377707281, + "grad_norm": 7.71875, + "learning_rate": 6.43275911219643e-07, + "loss": 1.02781935, + "memory(GiB)": 302.58, + "step": 302160, + "train_speed(iter/s)": 0.123353 + }, + { + "acc": 0.74075232, + "epoch": 1.6899336872437074, + "grad_norm": 5.40625, + "learning_rate": 6.428222596504069e-07, + "loss": 1.02501764, + "memory(GiB)": 302.58, + "step": 302180, + "train_speed(iter/s)": 0.123357 + }, + { + "acc": 0.75052609, + "epoch": 1.6900455367166867, + "grad_norm": 4.875, + "learning_rate": 6.423687571104653e-07, + "loss": 0.98389931, + "memory(GiB)": 302.58, + "step": 302200, + "train_speed(iter/s)": 0.123361 + }, + { + "acc": 0.75501823, + "epoch": 1.690157386189666, + "grad_norm": 5.21875, + "learning_rate": 6.419154036153291e-07, + "loss": 0.92128611, + "memory(GiB)": 302.58, + "step": 302220, + "train_speed(iter/s)": 0.123365 + }, + { + "acc": 0.76159687, + "epoch": 1.6902692356626452, + "grad_norm": 8.0, + "learning_rate": 6.414621991805059e-07, + "loss": 0.94441223, + "memory(GiB)": 302.58, + "step": 302240, + "train_speed(iter/s)": 0.123369 + }, + { + "acc": 0.75393934, + "epoch": 1.6903810851356245, + "grad_norm": 9.4375, + "learning_rate": 6.410091438214966e-07, + "loss": 0.9879281, + "memory(GiB)": 302.58, + "step": 302260, + "train_speed(iter/s)": 0.123373 + }, + { + "acc": 0.74479127, + "epoch": 1.6904929346086037, + "grad_norm": 6.625, + "learning_rate": 6.40556237553796e-07, + "loss": 1.02146769, + "memory(GiB)": 302.58, + "step": 302280, + "train_speed(iter/s)": 0.123377 + }, + { + "acc": 0.74930682, + "epoch": 1.690604784081583, + "grad_norm": 7.3125, + "learning_rate": 6.401034803928963e-07, + "loss": 0.97756462, + "memory(GiB)": 302.58, + "step": 302300, + "train_speed(iter/s)": 0.123381 + }, + { + "acc": 0.76664453, + "epoch": 1.6907166335545623, + "grad_norm": 5.375, + "learning_rate": 6.396508723542816e-07, + "loss": 0.90432529, + "memory(GiB)": 302.58, + "step": 302320, + "train_speed(iter/s)": 0.123384 + }, + { + "acc": 0.74977255, + "epoch": 1.6908284830275415, + "grad_norm": 7.0, + "learning_rate": 6.391984134534335e-07, + "loss": 0.985884, + "memory(GiB)": 302.58, + "step": 302340, + "train_speed(iter/s)": 0.123388 + }, + { + "acc": 0.76868382, + "epoch": 1.6909403325005208, + "grad_norm": 5.5625, + "learning_rate": 6.387461037058279e-07, + "loss": 0.90701389, + "memory(GiB)": 302.58, + "step": 302360, + "train_speed(iter/s)": 0.123392 + }, + { + "acc": 0.74844804, + "epoch": 1.6910521819735, + "grad_norm": 8.5, + "learning_rate": 6.382939431269342e-07, + "loss": 1.00969219, + "memory(GiB)": 302.58, + "step": 302380, + "train_speed(iter/s)": 0.123396 + }, + { + "acc": 0.75608697, + "epoch": 1.6911640314464793, + "grad_norm": 7.625, + "learning_rate": 6.378419317322171e-07, + "loss": 0.97068939, + "memory(GiB)": 302.58, + "step": 302400, + "train_speed(iter/s)": 0.1234 + }, + { + "acc": 0.76722751, + "epoch": 1.6912758809194586, + "grad_norm": 6.71875, + "learning_rate": 6.373900695371394e-07, + "loss": 0.91383152, + "memory(GiB)": 302.58, + "step": 302420, + "train_speed(iter/s)": 0.123403 + }, + { + "acc": 0.75221872, + "epoch": 1.6913877303924378, + "grad_norm": 7.46875, + "learning_rate": 6.369383565571547e-07, + "loss": 0.97475977, + "memory(GiB)": 302.58, + "step": 302440, + "train_speed(iter/s)": 0.123407 + }, + { + "acc": 0.75177364, + "epoch": 1.691499579865417, + "grad_norm": 5.6875, + "learning_rate": 6.364867928077128e-07, + "loss": 0.98432016, + "memory(GiB)": 302.58, + "step": 302460, + "train_speed(iter/s)": 0.123411 + }, + { + "acc": 0.75781388, + "epoch": 1.6916114293383964, + "grad_norm": 4.84375, + "learning_rate": 6.360353783042595e-07, + "loss": 0.93906784, + "memory(GiB)": 302.58, + "step": 302480, + "train_speed(iter/s)": 0.123415 + }, + { + "acc": 0.76869268, + "epoch": 1.6917232788113756, + "grad_norm": 9.5625, + "learning_rate": 6.355841130622337e-07, + "loss": 0.90804253, + "memory(GiB)": 302.58, + "step": 302500, + "train_speed(iter/s)": 0.123419 + }, + { + "acc": 0.75046902, + "epoch": 1.691835128284355, + "grad_norm": 7.78125, + "learning_rate": 6.351329970970704e-07, + "loss": 0.99307871, + "memory(GiB)": 302.58, + "step": 302520, + "train_speed(iter/s)": 0.123423 + }, + { + "acc": 0.75783958, + "epoch": 1.6919469777573342, + "grad_norm": 9.125, + "learning_rate": 6.34682030424199e-07, + "loss": 0.97229357, + "memory(GiB)": 302.58, + "step": 302540, + "train_speed(iter/s)": 0.123427 + }, + { + "acc": 0.74467812, + "epoch": 1.6920588272303134, + "grad_norm": 6.59375, + "learning_rate": 6.342312130590445e-07, + "loss": 0.9900177, + "memory(GiB)": 302.58, + "step": 302560, + "train_speed(iter/s)": 0.123431 + }, + { + "acc": 0.75089273, + "epoch": 1.6921706767032927, + "grad_norm": 6.59375, + "learning_rate": 6.337805450170259e-07, + "loss": 0.98347492, + "memory(GiB)": 302.58, + "step": 302580, + "train_speed(iter/s)": 0.123434 + }, + { + "acc": 0.73950281, + "epoch": 1.692282526176272, + "grad_norm": 7.1875, + "learning_rate": 6.333300263135577e-07, + "loss": 1.04057636, + "memory(GiB)": 302.58, + "step": 302600, + "train_speed(iter/s)": 0.123438 + }, + { + "acc": 0.74753771, + "epoch": 1.6923943756492512, + "grad_norm": 7.71875, + "learning_rate": 6.328796569640489e-07, + "loss": 1.01505671, + "memory(GiB)": 302.58, + "step": 302620, + "train_speed(iter/s)": 0.123442 + }, + { + "acc": 0.75527468, + "epoch": 1.6925062251222305, + "grad_norm": 7.9375, + "learning_rate": 6.324294369839034e-07, + "loss": 0.96116457, + "memory(GiB)": 302.58, + "step": 302640, + "train_speed(iter/s)": 0.123445 + }, + { + "acc": 0.7418582, + "epoch": 1.6926180745952097, + "grad_norm": 6.90625, + "learning_rate": 6.319793663885194e-07, + "loss": 0.99328241, + "memory(GiB)": 302.58, + "step": 302660, + "train_speed(iter/s)": 0.123449 + }, + { + "acc": 0.75016465, + "epoch": 1.692729924068189, + "grad_norm": 8.6875, + "learning_rate": 6.315294451932929e-07, + "loss": 0.99386082, + "memory(GiB)": 302.58, + "step": 302680, + "train_speed(iter/s)": 0.123453 + }, + { + "acc": 0.73804855, + "epoch": 1.6928417735411683, + "grad_norm": 5.0625, + "learning_rate": 6.31079673413611e-07, + "loss": 1.0338088, + "memory(GiB)": 302.58, + "step": 302700, + "train_speed(iter/s)": 0.123457 + }, + { + "acc": 0.75853901, + "epoch": 1.6929536230141475, + "grad_norm": 5.9375, + "learning_rate": 6.306300510648583e-07, + "loss": 0.92069292, + "memory(GiB)": 302.58, + "step": 302720, + "train_speed(iter/s)": 0.123461 + }, + { + "acc": 0.74442315, + "epoch": 1.6930654724871268, + "grad_norm": 7.09375, + "learning_rate": 6.301805781624121e-07, + "loss": 0.99101839, + "memory(GiB)": 302.58, + "step": 302740, + "train_speed(iter/s)": 0.123465 + }, + { + "acc": 0.73725305, + "epoch": 1.693177321960106, + "grad_norm": 6.90625, + "learning_rate": 6.297312547216473e-07, + "loss": 1.02450724, + "memory(GiB)": 302.58, + "step": 302760, + "train_speed(iter/s)": 0.123469 + }, + { + "acc": 0.75816169, + "epoch": 1.6932891714330853, + "grad_norm": 7.09375, + "learning_rate": 6.292820807579309e-07, + "loss": 0.9688591, + "memory(GiB)": 302.58, + "step": 302780, + "train_speed(iter/s)": 0.123472 + }, + { + "acc": 0.75426397, + "epoch": 1.6934010209060646, + "grad_norm": 6.875, + "learning_rate": 6.288330562866263e-07, + "loss": 0.97443094, + "memory(GiB)": 302.58, + "step": 302800, + "train_speed(iter/s)": 0.123476 + }, + { + "acc": 0.75411067, + "epoch": 1.6935128703790439, + "grad_norm": 7.1875, + "learning_rate": 6.283841813230917e-07, + "loss": 0.96826324, + "memory(GiB)": 302.58, + "step": 302820, + "train_speed(iter/s)": 0.12348 + }, + { + "acc": 0.74892879, + "epoch": 1.6936247198520231, + "grad_norm": 6.8125, + "learning_rate": 6.279354558826806e-07, + "loss": 0.98066654, + "memory(GiB)": 302.58, + "step": 302840, + "train_speed(iter/s)": 0.123484 + }, + { + "acc": 0.75083489, + "epoch": 1.6937365693250024, + "grad_norm": 6.25, + "learning_rate": 6.274868799807398e-07, + "loss": 0.97895756, + "memory(GiB)": 302.58, + "step": 302860, + "train_speed(iter/s)": 0.123488 + }, + { + "acc": 0.7365025, + "epoch": 1.6938484187979816, + "grad_norm": 6.09375, + "learning_rate": 6.270384536326129e-07, + "loss": 1.05553989, + "memory(GiB)": 302.58, + "step": 302880, + "train_speed(iter/s)": 0.123492 + }, + { + "acc": 0.74725952, + "epoch": 1.693960268270961, + "grad_norm": 6.6875, + "learning_rate": 6.26590176853637e-07, + "loss": 1.00635643, + "memory(GiB)": 302.58, + "step": 302900, + "train_speed(iter/s)": 0.123496 + }, + { + "acc": 0.73425312, + "epoch": 1.6940721177439402, + "grad_norm": 7.625, + "learning_rate": 6.261420496591453e-07, + "loss": 1.04204359, + "memory(GiB)": 302.58, + "step": 302920, + "train_speed(iter/s)": 0.123499 + }, + { + "acc": 0.76101298, + "epoch": 1.6941839672169194, + "grad_norm": 9.9375, + "learning_rate": 6.256940720644628e-07, + "loss": 0.92934227, + "memory(GiB)": 302.58, + "step": 302940, + "train_speed(iter/s)": 0.123503 + }, + { + "acc": 0.75146303, + "epoch": 1.6942958166898987, + "grad_norm": 7.59375, + "learning_rate": 6.252462440849149e-07, + "loss": 0.98383179, + "memory(GiB)": 302.58, + "step": 302960, + "train_speed(iter/s)": 0.123507 + }, + { + "acc": 0.7487812, + "epoch": 1.694407666162878, + "grad_norm": 7.90625, + "learning_rate": 6.24798565735818e-07, + "loss": 0.96845093, + "memory(GiB)": 302.58, + "step": 302980, + "train_speed(iter/s)": 0.123511 + }, + { + "acc": 0.74005485, + "epoch": 1.6945195156358572, + "grad_norm": 9.1875, + "learning_rate": 6.243510370324829e-07, + "loss": 1.03370132, + "memory(GiB)": 302.58, + "step": 303000, + "train_speed(iter/s)": 0.123515 + }, + { + "acc": 0.75031586, + "epoch": 1.6946313651088365, + "grad_norm": 6.875, + "learning_rate": 6.239036579902174e-07, + "loss": 0.95532484, + "memory(GiB)": 302.58, + "step": 303020, + "train_speed(iter/s)": 0.123519 + }, + { + "acc": 0.75663834, + "epoch": 1.6947432145818158, + "grad_norm": 6.28125, + "learning_rate": 6.234564286243227e-07, + "loss": 0.94558449, + "memory(GiB)": 302.58, + "step": 303040, + "train_speed(iter/s)": 0.123522 + }, + { + "acc": 0.74621229, + "epoch": 1.694855064054795, + "grad_norm": 6.96875, + "learning_rate": 6.23009348950096e-07, + "loss": 1.0230588, + "memory(GiB)": 302.58, + "step": 303060, + "train_speed(iter/s)": 0.123526 + }, + { + "acc": 0.7385036, + "epoch": 1.6949669135277743, + "grad_norm": 7.09375, + "learning_rate": 6.225624189828289e-07, + "loss": 1.05879307, + "memory(GiB)": 302.58, + "step": 303080, + "train_speed(iter/s)": 0.12353 + }, + { + "acc": 0.7499424, + "epoch": 1.6950787630007536, + "grad_norm": 6.5, + "learning_rate": 6.221156387378069e-07, + "loss": 0.99293137, + "memory(GiB)": 302.58, + "step": 303100, + "train_speed(iter/s)": 0.123534 + }, + { + "acc": 0.75102077, + "epoch": 1.6951906124737328, + "grad_norm": 6.65625, + "learning_rate": 6.216690082303123e-07, + "loss": 0.97810917, + "memory(GiB)": 302.58, + "step": 303120, + "train_speed(iter/s)": 0.123537 + }, + { + "acc": 0.75820332, + "epoch": 1.695302461946712, + "grad_norm": 6.0625, + "learning_rate": 6.212225274756211e-07, + "loss": 0.94482212, + "memory(GiB)": 302.58, + "step": 303140, + "train_speed(iter/s)": 0.123541 + }, + { + "acc": 0.74297867, + "epoch": 1.6954143114196913, + "grad_norm": 9.625, + "learning_rate": 6.207761964890041e-07, + "loss": 1.0178051, + "memory(GiB)": 302.58, + "step": 303160, + "train_speed(iter/s)": 0.123545 + }, + { + "acc": 0.73620653, + "epoch": 1.6955261608926706, + "grad_norm": 11.25, + "learning_rate": 6.203300152857272e-07, + "loss": 1.04333601, + "memory(GiB)": 302.58, + "step": 303180, + "train_speed(iter/s)": 0.123549 + }, + { + "acc": 0.76914101, + "epoch": 1.6956380103656499, + "grad_norm": 7.78125, + "learning_rate": 6.198839838810516e-07, + "loss": 0.91791801, + "memory(GiB)": 302.58, + "step": 303200, + "train_speed(iter/s)": 0.123553 + }, + { + "acc": 0.76690836, + "epoch": 1.6957498598386291, + "grad_norm": 4.90625, + "learning_rate": 6.194381022902313e-07, + "loss": 0.89626904, + "memory(GiB)": 302.58, + "step": 303220, + "train_speed(iter/s)": 0.123556 + }, + { + "acc": 0.76624761, + "epoch": 1.6958617093116084, + "grad_norm": 7.34375, + "learning_rate": 6.189923705285194e-07, + "loss": 0.87402983, + "memory(GiB)": 302.58, + "step": 303240, + "train_speed(iter/s)": 0.12356 + }, + { + "acc": 0.76599941, + "epoch": 1.6959735587845877, + "grad_norm": 6.90625, + "learning_rate": 6.185467886111607e-07, + "loss": 0.89515934, + "memory(GiB)": 302.58, + "step": 303260, + "train_speed(iter/s)": 0.123564 + }, + { + "acc": 0.75210404, + "epoch": 1.696085408257567, + "grad_norm": 8.625, + "learning_rate": 6.181013565533945e-07, + "loss": 0.99546156, + "memory(GiB)": 302.58, + "step": 303280, + "train_speed(iter/s)": 0.123568 + }, + { + "acc": 0.75324063, + "epoch": 1.6961972577305462, + "grad_norm": 7.46875, + "learning_rate": 6.176560743704568e-07, + "loss": 0.97148104, + "memory(GiB)": 302.58, + "step": 303300, + "train_speed(iter/s)": 0.123572 + }, + { + "acc": 0.74200044, + "epoch": 1.6963091072035255, + "grad_norm": 7.71875, + "learning_rate": 6.172109420775774e-07, + "loss": 1.03278456, + "memory(GiB)": 302.58, + "step": 303320, + "train_speed(iter/s)": 0.123576 + }, + { + "acc": 0.76985497, + "epoch": 1.6964209566765047, + "grad_norm": 6.8125, + "learning_rate": 6.167659596899811e-07, + "loss": 0.88338833, + "memory(GiB)": 302.58, + "step": 303340, + "train_speed(iter/s)": 0.12358 + }, + { + "acc": 0.76196618, + "epoch": 1.696532806149484, + "grad_norm": 8.0625, + "learning_rate": 6.163211272228875e-07, + "loss": 0.9213933, + "memory(GiB)": 302.58, + "step": 303360, + "train_speed(iter/s)": 0.123584 + }, + { + "acc": 0.75067701, + "epoch": 1.6966446556224632, + "grad_norm": 5.59375, + "learning_rate": 6.158764446915122e-07, + "loss": 0.98309822, + "memory(GiB)": 302.58, + "step": 303380, + "train_speed(iter/s)": 0.123587 + }, + { + "acc": 0.76739459, + "epoch": 1.6967565050954425, + "grad_norm": 6.3125, + "learning_rate": 6.154319121110641e-07, + "loss": 0.87461176, + "memory(GiB)": 302.58, + "step": 303400, + "train_speed(iter/s)": 0.123591 + }, + { + "acc": 0.75225711, + "epoch": 1.6968683545684218, + "grad_norm": 6.21875, + "learning_rate": 6.149875294967472e-07, + "loss": 0.96642437, + "memory(GiB)": 302.58, + "step": 303420, + "train_speed(iter/s)": 0.123595 + }, + { + "acc": 0.7560823, + "epoch": 1.696980204041401, + "grad_norm": 8.0625, + "learning_rate": 6.145432968637621e-07, + "loss": 0.96460552, + "memory(GiB)": 302.58, + "step": 303440, + "train_speed(iter/s)": 0.123599 + }, + { + "acc": 0.76183515, + "epoch": 1.6970920535143803, + "grad_norm": 10.125, + "learning_rate": 6.140992142273017e-07, + "loss": 0.95168066, + "memory(GiB)": 302.58, + "step": 303460, + "train_speed(iter/s)": 0.123603 + }, + { + "acc": 0.75947838, + "epoch": 1.6972039029873596, + "grad_norm": 7.3125, + "learning_rate": 6.136552816025543e-07, + "loss": 0.93468761, + "memory(GiB)": 302.58, + "step": 303480, + "train_speed(iter/s)": 0.123606 + }, + { + "acc": 0.7704958, + "epoch": 1.6973157524603388, + "grad_norm": 5.84375, + "learning_rate": 6.132114990047061e-07, + "loss": 0.8861659, + "memory(GiB)": 302.58, + "step": 303500, + "train_speed(iter/s)": 0.12361 + }, + { + "acc": 0.7579803, + "epoch": 1.697427601933318, + "grad_norm": 5.09375, + "learning_rate": 6.127678664489351e-07, + "loss": 0.9473012, + "memory(GiB)": 302.58, + "step": 303520, + "train_speed(iter/s)": 0.123614 + }, + { + "acc": 0.75363731, + "epoch": 1.6975394514062974, + "grad_norm": 9.125, + "learning_rate": 6.123243839504129e-07, + "loss": 0.96233807, + "memory(GiB)": 302.58, + "step": 303540, + "train_speed(iter/s)": 0.123618 + }, + { + "acc": 0.74006333, + "epoch": 1.6976513008792766, + "grad_norm": 7.09375, + "learning_rate": 6.118810515243112e-07, + "loss": 1.01291952, + "memory(GiB)": 302.58, + "step": 303560, + "train_speed(iter/s)": 0.123621 + }, + { + "acc": 0.74269691, + "epoch": 1.697763150352256, + "grad_norm": 7.6875, + "learning_rate": 6.114378691857919e-07, + "loss": 1.03345947, + "memory(GiB)": 302.58, + "step": 303580, + "train_speed(iter/s)": 0.123625 + }, + { + "acc": 0.75865378, + "epoch": 1.6978749998252352, + "grad_norm": 8.3125, + "learning_rate": 6.10994836950013e-07, + "loss": 0.92499428, + "memory(GiB)": 302.58, + "step": 303600, + "train_speed(iter/s)": 0.123629 + }, + { + "acc": 0.7668498, + "epoch": 1.6979868492982144, + "grad_norm": 8.8125, + "learning_rate": 6.105519548321282e-07, + "loss": 0.92652464, + "memory(GiB)": 302.58, + "step": 303620, + "train_speed(iter/s)": 0.123633 + }, + { + "acc": 0.7666389, + "epoch": 1.6980986987711937, + "grad_norm": 4.5625, + "learning_rate": 6.101092228472849e-07, + "loss": 0.9273778, + "memory(GiB)": 302.58, + "step": 303640, + "train_speed(iter/s)": 0.123637 + }, + { + "acc": 0.75406561, + "epoch": 1.698210548244173, + "grad_norm": 7.1875, + "learning_rate": 6.096666410106261e-07, + "loss": 0.94394569, + "memory(GiB)": 302.58, + "step": 303660, + "train_speed(iter/s)": 0.123641 + }, + { + "acc": 0.76748381, + "epoch": 1.6983223977171522, + "grad_norm": 8.25, + "learning_rate": 6.092242093372897e-07, + "loss": 0.91103468, + "memory(GiB)": 302.58, + "step": 303680, + "train_speed(iter/s)": 0.123645 + }, + { + "acc": 0.73339396, + "epoch": 1.6984342471901315, + "grad_norm": 5.75, + "learning_rate": 6.087819278424078e-07, + "loss": 1.04425821, + "memory(GiB)": 302.58, + "step": 303700, + "train_speed(iter/s)": 0.123648 + }, + { + "acc": 0.76431923, + "epoch": 1.6985460966631107, + "grad_norm": 8.6875, + "learning_rate": 6.083397965411087e-07, + "loss": 0.9189991, + "memory(GiB)": 302.58, + "step": 303720, + "train_speed(iter/s)": 0.123653 + }, + { + "acc": 0.76065078, + "epoch": 1.69865794613609, + "grad_norm": 9.8125, + "learning_rate": 6.078978154485137e-07, + "loss": 0.91761646, + "memory(GiB)": 302.58, + "step": 303740, + "train_speed(iter/s)": 0.123656 + }, + { + "acc": 0.74519267, + "epoch": 1.6987697956090693, + "grad_norm": 9.8125, + "learning_rate": 6.074559845797407e-07, + "loss": 1.01122179, + "memory(GiB)": 302.58, + "step": 303760, + "train_speed(iter/s)": 0.12366 + }, + { + "acc": 0.71612897, + "epoch": 1.6988816450820485, + "grad_norm": 8.0, + "learning_rate": 6.07014303949901e-07, + "loss": 1.12477093, + "memory(GiB)": 302.58, + "step": 303780, + "train_speed(iter/s)": 0.123664 + }, + { + "acc": 0.7533874, + "epoch": 1.6989934945550278, + "grad_norm": 6.15625, + "learning_rate": 6.065727735741017e-07, + "loss": 0.9638402, + "memory(GiB)": 302.58, + "step": 303800, + "train_speed(iter/s)": 0.123668 + }, + { + "acc": 0.75217299, + "epoch": 1.699105344028007, + "grad_norm": 7.0, + "learning_rate": 6.061313934674445e-07, + "loss": 0.98654823, + "memory(GiB)": 302.58, + "step": 303820, + "train_speed(iter/s)": 0.123672 + }, + { + "acc": 0.7591208, + "epoch": 1.6992171935009863, + "grad_norm": 6.25, + "learning_rate": 6.056901636450269e-07, + "loss": 0.94754581, + "memory(GiB)": 302.58, + "step": 303840, + "train_speed(iter/s)": 0.123675 + }, + { + "acc": 0.75931602, + "epoch": 1.6993290429739656, + "grad_norm": 6.8125, + "learning_rate": 6.052490841219394e-07, + "loss": 0.94721899, + "memory(GiB)": 302.58, + "step": 303860, + "train_speed(iter/s)": 0.123679 + }, + { + "acc": 0.7529036, + "epoch": 1.6994408924469449, + "grad_norm": 8.875, + "learning_rate": 6.04808154913269e-07, + "loss": 0.98808966, + "memory(GiB)": 302.58, + "step": 303880, + "train_speed(iter/s)": 0.123683 + }, + { + "acc": 0.74986906, + "epoch": 1.6995527419199241, + "grad_norm": 4.625, + "learning_rate": 6.043673760340963e-07, + "loss": 1.00828428, + "memory(GiB)": 302.58, + "step": 303900, + "train_speed(iter/s)": 0.123687 + }, + { + "acc": 0.74811778, + "epoch": 1.6996645913929034, + "grad_norm": 7.15625, + "learning_rate": 6.03926747499497e-07, + "loss": 1.00506573, + "memory(GiB)": 302.58, + "step": 303920, + "train_speed(iter/s)": 0.12369 + }, + { + "acc": 0.75065484, + "epoch": 1.6997764408658826, + "grad_norm": 7.21875, + "learning_rate": 6.03486269324543e-07, + "loss": 0.9743413, + "memory(GiB)": 302.58, + "step": 303940, + "train_speed(iter/s)": 0.123694 + }, + { + "acc": 0.72951112, + "epoch": 1.699888290338862, + "grad_norm": 6.75, + "learning_rate": 6.030459415243e-07, + "loss": 1.07796679, + "memory(GiB)": 302.58, + "step": 303960, + "train_speed(iter/s)": 0.123698 + }, + { + "acc": 0.74390063, + "epoch": 1.7000001398118412, + "grad_norm": 9.25, + "learning_rate": 6.026057641138278e-07, + "loss": 1.00781584, + "memory(GiB)": 302.58, + "step": 303980, + "train_speed(iter/s)": 0.123702 + }, + { + "acc": 0.74064789, + "epoch": 1.7001119892848204, + "grad_norm": 7.71875, + "learning_rate": 6.021657371081818e-07, + "loss": 1.05789347, + "memory(GiB)": 302.58, + "step": 304000, + "train_speed(iter/s)": 0.123705 + }, + { + "epoch": 1.7001119892848204, + "eval_acc": 0.7069196617175652, + "eval_loss": 1.0118192434310913, + "eval_runtime": 7525.1135, + "eval_samples_per_second": 10.004, + "eval_steps_per_second": 10.004, + "step": 304000 + }, + { + "acc": 0.73913236, + "epoch": 1.7002238387577997, + "grad_norm": 7.46875, + "learning_rate": 6.01725860522413e-07, + "loss": 1.03534212, + "memory(GiB)": 302.58, + "step": 304020, + "train_speed(iter/s)": 0.123325 + }, + { + "acc": 0.78226056, + "epoch": 1.700335688230779, + "grad_norm": 6.1875, + "learning_rate": 6.012861343715664e-07, + "loss": 0.83848095, + "memory(GiB)": 302.58, + "step": 304040, + "train_speed(iter/s)": 0.123329 + }, + { + "acc": 0.76037774, + "epoch": 1.7004475377037582, + "grad_norm": 6.28125, + "learning_rate": 6.008465586706824e-07, + "loss": 0.95125418, + "memory(GiB)": 302.58, + "step": 304060, + "train_speed(iter/s)": 0.123333 + }, + { + "acc": 0.74702287, + "epoch": 1.7005593871767375, + "grad_norm": 7.0625, + "learning_rate": 6.00407133434795e-07, + "loss": 0.99578485, + "memory(GiB)": 302.58, + "step": 304080, + "train_speed(iter/s)": 0.123337 + }, + { + "acc": 0.75448565, + "epoch": 1.700671236649717, + "grad_norm": 4.46875, + "learning_rate": 5.999678586789332e-07, + "loss": 0.96218653, + "memory(GiB)": 302.58, + "step": 304100, + "train_speed(iter/s)": 0.12334 + }, + { + "acc": 0.74792471, + "epoch": 1.700783086122696, + "grad_norm": 7.15625, + "learning_rate": 5.995287344181238e-07, + "loss": 0.98156157, + "memory(GiB)": 302.58, + "step": 304120, + "train_speed(iter/s)": 0.123344 + }, + { + "acc": 0.74272652, + "epoch": 1.7008949355956755, + "grad_norm": 7.71875, + "learning_rate": 5.990897606673851e-07, + "loss": 1.034091, + "memory(GiB)": 302.58, + "step": 304140, + "train_speed(iter/s)": 0.123348 + }, + { + "acc": 0.75680966, + "epoch": 1.7010067850686545, + "grad_norm": 5.75, + "learning_rate": 5.986509374417321e-07, + "loss": 0.93930235, + "memory(GiB)": 302.58, + "step": 304160, + "train_speed(iter/s)": 0.123352 + }, + { + "acc": 0.74504681, + "epoch": 1.701118634541634, + "grad_norm": 7.875, + "learning_rate": 5.982122647561727e-07, + "loss": 0.99961405, + "memory(GiB)": 302.58, + "step": 304180, + "train_speed(iter/s)": 0.123356 + }, + { + "acc": 0.73708091, + "epoch": 1.701230484014613, + "grad_norm": 9.0, + "learning_rate": 5.977737426257119e-07, + "loss": 1.02804575, + "memory(GiB)": 302.58, + "step": 304200, + "train_speed(iter/s)": 0.12336 + }, + { + "acc": 0.75907111, + "epoch": 1.7013423334875926, + "grad_norm": 6.25, + "learning_rate": 5.973353710653479e-07, + "loss": 0.95203495, + "memory(GiB)": 302.58, + "step": 304220, + "train_speed(iter/s)": 0.123363 + }, + { + "acc": 0.75064278, + "epoch": 1.7014541829605716, + "grad_norm": 6.96875, + "learning_rate": 5.96897150090075e-07, + "loss": 0.98594961, + "memory(GiB)": 302.58, + "step": 304240, + "train_speed(iter/s)": 0.123367 + }, + { + "acc": 0.74517899, + "epoch": 1.701566032433551, + "grad_norm": 8.9375, + "learning_rate": 5.964590797148811e-07, + "loss": 1.0114748, + "memory(GiB)": 302.58, + "step": 304260, + "train_speed(iter/s)": 0.123371 + }, + { + "acc": 0.76991878, + "epoch": 1.7016778819065301, + "grad_norm": 6.96875, + "learning_rate": 5.960211599547505e-07, + "loss": 0.90452185, + "memory(GiB)": 302.58, + "step": 304280, + "train_speed(iter/s)": 0.123375 + }, + { + "acc": 0.74336443, + "epoch": 1.7017897313795096, + "grad_norm": 6.25, + "learning_rate": 5.9558339082466e-07, + "loss": 0.99487543, + "memory(GiB)": 302.58, + "step": 304300, + "train_speed(iter/s)": 0.123379 + }, + { + "acc": 0.74711852, + "epoch": 1.7019015808524887, + "grad_norm": 6.375, + "learning_rate": 5.951457723395842e-07, + "loss": 1.00366182, + "memory(GiB)": 302.58, + "step": 304320, + "train_speed(iter/s)": 0.123382 + }, + { + "acc": 0.7472518, + "epoch": 1.7020134303254681, + "grad_norm": 9.5625, + "learning_rate": 5.947083045144902e-07, + "loss": 0.97948704, + "memory(GiB)": 302.58, + "step": 304340, + "train_speed(iter/s)": 0.123386 + }, + { + "acc": 0.75648899, + "epoch": 1.7021252797984472, + "grad_norm": 6.46875, + "learning_rate": 5.942709873643399e-07, + "loss": 0.94254665, + "memory(GiB)": 302.58, + "step": 304360, + "train_speed(iter/s)": 0.12339 + }, + { + "acc": 0.73910909, + "epoch": 1.7022371292714267, + "grad_norm": 6.3125, + "learning_rate": 5.938338209040933e-07, + "loss": 1.03972797, + "memory(GiB)": 302.58, + "step": 304380, + "train_speed(iter/s)": 0.123393 + }, + { + "acc": 0.76120896, + "epoch": 1.7023489787444057, + "grad_norm": 7.84375, + "learning_rate": 5.933968051487016e-07, + "loss": 0.93972435, + "memory(GiB)": 302.58, + "step": 304400, + "train_speed(iter/s)": 0.123397 + }, + { + "acc": 0.74378738, + "epoch": 1.7024608282173852, + "grad_norm": 5.84375, + "learning_rate": 5.92959940113112e-07, + "loss": 1.01729965, + "memory(GiB)": 302.58, + "step": 304420, + "train_speed(iter/s)": 0.123401 + }, + { + "acc": 0.77377496, + "epoch": 1.7025726776903642, + "grad_norm": 8.0625, + "learning_rate": 5.925232258122665e-07, + "loss": 0.88301315, + "memory(GiB)": 302.58, + "step": 304440, + "train_speed(iter/s)": 0.123405 + }, + { + "acc": 0.74208231, + "epoch": 1.7026845271633437, + "grad_norm": 7.84375, + "learning_rate": 5.920866622611027e-07, + "loss": 1.03757658, + "memory(GiB)": 302.58, + "step": 304460, + "train_speed(iter/s)": 0.123409 + }, + { + "acc": 0.7480423, + "epoch": 1.7027963766363228, + "grad_norm": 8.625, + "learning_rate": 5.916502494745518e-07, + "loss": 1.00612001, + "memory(GiB)": 302.58, + "step": 304480, + "train_speed(iter/s)": 0.123412 + }, + { + "acc": 0.74544678, + "epoch": 1.7029082261093023, + "grad_norm": 6.84375, + "learning_rate": 5.912139874675415e-07, + "loss": 0.97946558, + "memory(GiB)": 302.58, + "step": 304500, + "train_speed(iter/s)": 0.123416 + }, + { + "acc": 0.75427589, + "epoch": 1.7030200755822813, + "grad_norm": 8.0625, + "learning_rate": 5.907778762549926e-07, + "loss": 0.94387646, + "memory(GiB)": 302.58, + "step": 304520, + "train_speed(iter/s)": 0.12342 + }, + { + "acc": 0.73872428, + "epoch": 1.7031319250552608, + "grad_norm": 6.125, + "learning_rate": 5.903419158518214e-07, + "loss": 1.03307848, + "memory(GiB)": 302.58, + "step": 304540, + "train_speed(iter/s)": 0.123424 + }, + { + "acc": 0.76872144, + "epoch": 1.7032437745282398, + "grad_norm": 7.125, + "learning_rate": 5.899061062729389e-07, + "loss": 0.90915012, + "memory(GiB)": 302.58, + "step": 304560, + "train_speed(iter/s)": 0.123428 + }, + { + "acc": 0.75281072, + "epoch": 1.7033556240012193, + "grad_norm": 6.21875, + "learning_rate": 5.894704475332524e-07, + "loss": 0.96307192, + "memory(GiB)": 302.58, + "step": 304580, + "train_speed(iter/s)": 0.123432 + }, + { + "acc": 0.76589241, + "epoch": 1.7034674734741984, + "grad_norm": 6.21875, + "learning_rate": 5.890349396476613e-07, + "loss": 0.93348789, + "memory(GiB)": 302.58, + "step": 304600, + "train_speed(iter/s)": 0.123435 + }, + { + "acc": 0.75574923, + "epoch": 1.7035793229471778, + "grad_norm": 9.5625, + "learning_rate": 5.885995826310626e-07, + "loss": 0.96762905, + "memory(GiB)": 302.58, + "step": 304620, + "train_speed(iter/s)": 0.123439 + }, + { + "acc": 0.75714116, + "epoch": 1.7036911724201569, + "grad_norm": 7.28125, + "learning_rate": 5.881643764983453e-07, + "loss": 0.94102612, + "memory(GiB)": 302.58, + "step": 304640, + "train_speed(iter/s)": 0.123443 + }, + { + "acc": 0.75224371, + "epoch": 1.7038030218931364, + "grad_norm": 6.78125, + "learning_rate": 5.877293212643964e-07, + "loss": 0.97595692, + "memory(GiB)": 302.58, + "step": 304660, + "train_speed(iter/s)": 0.123447 + }, + { + "acc": 0.75643458, + "epoch": 1.7039148713661154, + "grad_norm": 6.875, + "learning_rate": 5.872944169440959e-07, + "loss": 0.95591078, + "memory(GiB)": 302.58, + "step": 304680, + "train_speed(iter/s)": 0.12345 + }, + { + "acc": 0.75159187, + "epoch": 1.704026720839095, + "grad_norm": 6.5625, + "learning_rate": 5.868596635523188e-07, + "loss": 0.97844934, + "memory(GiB)": 302.58, + "step": 304700, + "train_speed(iter/s)": 0.123454 + }, + { + "acc": 0.74485412, + "epoch": 1.704138570312074, + "grad_norm": 9.1875, + "learning_rate": 5.864250611039335e-07, + "loss": 0.99429541, + "memory(GiB)": 302.58, + "step": 304720, + "train_speed(iter/s)": 0.123458 + }, + { + "acc": 0.75025196, + "epoch": 1.7042504197850534, + "grad_norm": 6.90625, + "learning_rate": 5.859906096138079e-07, + "loss": 0.96763201, + "memory(GiB)": 302.58, + "step": 304740, + "train_speed(iter/s)": 0.123462 + }, + { + "acc": 0.75691648, + "epoch": 1.7043622692580325, + "grad_norm": 7.3125, + "learning_rate": 5.855563090967992e-07, + "loss": 0.95643873, + "memory(GiB)": 302.58, + "step": 304760, + "train_speed(iter/s)": 0.123466 + }, + { + "acc": 0.74808574, + "epoch": 1.704474118731012, + "grad_norm": 5.40625, + "learning_rate": 5.85122159567763e-07, + "loss": 0.994841, + "memory(GiB)": 302.58, + "step": 304780, + "train_speed(iter/s)": 0.123469 + }, + { + "acc": 0.74055605, + "epoch": 1.704585968203991, + "grad_norm": 11.4375, + "learning_rate": 5.846881610415478e-07, + "loss": 1.04233646, + "memory(GiB)": 302.58, + "step": 304800, + "train_speed(iter/s)": 0.123473 + }, + { + "acc": 0.74355292, + "epoch": 1.7046978176769705, + "grad_norm": 7.125, + "learning_rate": 5.842543135329981e-07, + "loss": 1.01830215, + "memory(GiB)": 302.58, + "step": 304820, + "train_speed(iter/s)": 0.123477 + }, + { + "acc": 0.73797555, + "epoch": 1.7048096671499495, + "grad_norm": 5.21875, + "learning_rate": 5.838206170569533e-07, + "loss": 1.03767233, + "memory(GiB)": 302.58, + "step": 304840, + "train_speed(iter/s)": 0.123481 + }, + { + "acc": 0.74291477, + "epoch": 1.704921516622929, + "grad_norm": 8.9375, + "learning_rate": 5.833870716282463e-07, + "loss": 1.01188955, + "memory(GiB)": 302.58, + "step": 304860, + "train_speed(iter/s)": 0.123485 + }, + { + "acc": 0.7515964, + "epoch": 1.705033366095908, + "grad_norm": 5.28125, + "learning_rate": 5.829536772617061e-07, + "loss": 0.96170511, + "memory(GiB)": 302.58, + "step": 304880, + "train_speed(iter/s)": 0.123489 + }, + { + "acc": 0.7616982, + "epoch": 1.7051452155688875, + "grad_norm": 6.3125, + "learning_rate": 5.825204339721563e-07, + "loss": 0.93166733, + "memory(GiB)": 302.58, + "step": 304900, + "train_speed(iter/s)": 0.123493 + }, + { + "acc": 0.76635079, + "epoch": 1.7052570650418666, + "grad_norm": 6.5625, + "learning_rate": 5.820873417744149e-07, + "loss": 0.89964619, + "memory(GiB)": 302.58, + "step": 304920, + "train_speed(iter/s)": 0.123496 + }, + { + "acc": 0.75429573, + "epoch": 1.705368914514846, + "grad_norm": 8.0, + "learning_rate": 5.816544006832958e-07, + "loss": 0.96627483, + "memory(GiB)": 302.58, + "step": 304940, + "train_speed(iter/s)": 0.1235 + }, + { + "acc": 0.75466967, + "epoch": 1.705480763987825, + "grad_norm": 6.40625, + "learning_rate": 5.81221610713606e-07, + "loss": 0.96595211, + "memory(GiB)": 302.58, + "step": 304960, + "train_speed(iter/s)": 0.123504 + }, + { + "acc": 0.74916935, + "epoch": 1.7055926134608046, + "grad_norm": 5.5, + "learning_rate": 5.807889718801479e-07, + "loss": 0.97325211, + "memory(GiB)": 302.58, + "step": 304980, + "train_speed(iter/s)": 0.123508 + }, + { + "acc": 0.77656984, + "epoch": 1.7057044629337836, + "grad_norm": 5.4375, + "learning_rate": 5.803564841977205e-07, + "loss": 0.86689663, + "memory(GiB)": 302.58, + "step": 305000, + "train_speed(iter/s)": 0.123512 + }, + { + "acc": 0.74194226, + "epoch": 1.7058163124067631, + "grad_norm": 9.3125, + "learning_rate": 5.799241476811157e-07, + "loss": 1.03183508, + "memory(GiB)": 302.58, + "step": 305020, + "train_speed(iter/s)": 0.123516 + }, + { + "acc": 0.75304985, + "epoch": 1.7059281618797422, + "grad_norm": 8.75, + "learning_rate": 5.794919623451212e-07, + "loss": 0.96655607, + "memory(GiB)": 302.58, + "step": 305040, + "train_speed(iter/s)": 0.12352 + }, + { + "acc": 0.74188094, + "epoch": 1.7060400113527217, + "grad_norm": 8.375, + "learning_rate": 5.790599282045184e-07, + "loss": 1.02323723, + "memory(GiB)": 302.58, + "step": 305060, + "train_speed(iter/s)": 0.123523 + }, + { + "acc": 0.73919439, + "epoch": 1.7061518608257007, + "grad_norm": 7.09375, + "learning_rate": 5.786280452740844e-07, + "loss": 1.04830408, + "memory(GiB)": 302.58, + "step": 305080, + "train_speed(iter/s)": 0.123527 + }, + { + "acc": 0.77877774, + "epoch": 1.7062637102986802, + "grad_norm": 7.0, + "learning_rate": 5.781963135685909e-07, + "loss": 0.8790308, + "memory(GiB)": 302.58, + "step": 305100, + "train_speed(iter/s)": 0.123531 + }, + { + "acc": 0.75035009, + "epoch": 1.7063755597716592, + "grad_norm": 8.125, + "learning_rate": 5.777647331028047e-07, + "loss": 0.98520651, + "memory(GiB)": 302.58, + "step": 305120, + "train_speed(iter/s)": 0.123535 + }, + { + "acc": 0.76215091, + "epoch": 1.7064874092446387, + "grad_norm": 6.40625, + "learning_rate": 5.773333038914874e-07, + "loss": 0.90692682, + "memory(GiB)": 302.58, + "step": 305140, + "train_speed(iter/s)": 0.123538 + }, + { + "acc": 0.75263853, + "epoch": 1.7065992587176178, + "grad_norm": 5.5625, + "learning_rate": 5.769020259493952e-07, + "loss": 0.95140476, + "memory(GiB)": 302.58, + "step": 305160, + "train_speed(iter/s)": 0.123542 + }, + { + "acc": 0.73196688, + "epoch": 1.7067111081905972, + "grad_norm": 7.96875, + "learning_rate": 5.764708992912782e-07, + "loss": 1.06427536, + "memory(GiB)": 302.58, + "step": 305180, + "train_speed(iter/s)": 0.123546 + }, + { + "acc": 0.74509506, + "epoch": 1.7068229576635763, + "grad_norm": 7.28125, + "learning_rate": 5.760399239318837e-07, + "loss": 1.00232697, + "memory(GiB)": 302.58, + "step": 305200, + "train_speed(iter/s)": 0.12355 + }, + { + "acc": 0.75078363, + "epoch": 1.7069348071365558, + "grad_norm": 4.53125, + "learning_rate": 5.756090998859515e-07, + "loss": 0.99674339, + "memory(GiB)": 302.58, + "step": 305220, + "train_speed(iter/s)": 0.123554 + }, + { + "acc": 0.75887532, + "epoch": 1.7070466566095348, + "grad_norm": 9.25, + "learning_rate": 5.751784271682165e-07, + "loss": 0.95332947, + "memory(GiB)": 302.58, + "step": 305240, + "train_speed(iter/s)": 0.123557 + }, + { + "acc": 0.74845519, + "epoch": 1.7071585060825143, + "grad_norm": 6.53125, + "learning_rate": 5.747479057934113e-07, + "loss": 0.98271112, + "memory(GiB)": 302.58, + "step": 305260, + "train_speed(iter/s)": 0.123561 + }, + { + "acc": 0.75650949, + "epoch": 1.7072703555554933, + "grad_norm": 7.65625, + "learning_rate": 5.743175357762598e-07, + "loss": 0.97822437, + "memory(GiB)": 302.58, + "step": 305280, + "train_speed(iter/s)": 0.123565 + }, + { + "acc": 0.75792537, + "epoch": 1.7073822050284728, + "grad_norm": 7.0625, + "learning_rate": 5.738873171314818e-07, + "loss": 0.95270767, + "memory(GiB)": 302.58, + "step": 305300, + "train_speed(iter/s)": 0.123568 + }, + { + "acc": 0.74655938, + "epoch": 1.7074940545014519, + "grad_norm": 6.3125, + "learning_rate": 5.734572498737922e-07, + "loss": 1.00180349, + "memory(GiB)": 302.58, + "step": 305320, + "train_speed(iter/s)": 0.123572 + }, + { + "acc": 0.74805017, + "epoch": 1.7076059039744313, + "grad_norm": 6.28125, + "learning_rate": 5.73027334017901e-07, + "loss": 0.99712868, + "memory(GiB)": 302.58, + "step": 305340, + "train_speed(iter/s)": 0.123576 + }, + { + "acc": 0.74996638, + "epoch": 1.7077177534474104, + "grad_norm": 6.28125, + "learning_rate": 5.72597569578513e-07, + "loss": 0.98399725, + "memory(GiB)": 302.58, + "step": 305360, + "train_speed(iter/s)": 0.12358 + }, + { + "acc": 0.76021147, + "epoch": 1.7078296029203899, + "grad_norm": 6.65625, + "learning_rate": 5.721679565703265e-07, + "loss": 0.93428364, + "memory(GiB)": 302.58, + "step": 305380, + "train_speed(iter/s)": 0.123583 + }, + { + "acc": 0.75754914, + "epoch": 1.707941452393369, + "grad_norm": 8.25, + "learning_rate": 5.717384950080363e-07, + "loss": 0.94650869, + "memory(GiB)": 302.58, + "step": 305400, + "train_speed(iter/s)": 0.123587 + }, + { + "acc": 0.75255227, + "epoch": 1.7080533018663484, + "grad_norm": 6.3125, + "learning_rate": 5.713091849063313e-07, + "loss": 0.99173412, + "memory(GiB)": 302.58, + "step": 305420, + "train_speed(iter/s)": 0.123591 + }, + { + "acc": 0.77115474, + "epoch": 1.7081651513393274, + "grad_norm": 9.6875, + "learning_rate": 5.708800262798947e-07, + "loss": 0.89840221, + "memory(GiB)": 302.58, + "step": 305440, + "train_speed(iter/s)": 0.123594 + }, + { + "acc": 0.76336913, + "epoch": 1.708277000812307, + "grad_norm": 9.6875, + "learning_rate": 5.704510191434059e-07, + "loss": 0.93406715, + "memory(GiB)": 302.58, + "step": 305460, + "train_speed(iter/s)": 0.123598 + }, + { + "acc": 0.75616088, + "epoch": 1.708388850285286, + "grad_norm": 8.75, + "learning_rate": 5.700221635115383e-07, + "loss": 0.97503719, + "memory(GiB)": 302.58, + "step": 305480, + "train_speed(iter/s)": 0.123602 + }, + { + "acc": 0.75474863, + "epoch": 1.7085006997582655, + "grad_norm": 5.65625, + "learning_rate": 5.695934593989594e-07, + "loss": 0.95738716, + "memory(GiB)": 302.58, + "step": 305500, + "train_speed(iter/s)": 0.123606 + }, + { + "acc": 0.75525565, + "epoch": 1.7086125492312445, + "grad_norm": 8.8125, + "learning_rate": 5.691649068203315e-07, + "loss": 0.95119352, + "memory(GiB)": 302.58, + "step": 305520, + "train_speed(iter/s)": 0.12361 + }, + { + "acc": 0.74469333, + "epoch": 1.708724398704224, + "grad_norm": 9.25, + "learning_rate": 5.687365057903149e-07, + "loss": 1.00921898, + "memory(GiB)": 302.58, + "step": 305540, + "train_speed(iter/s)": 0.123614 + }, + { + "acc": 0.7469295, + "epoch": 1.708836248177203, + "grad_norm": 8.5625, + "learning_rate": 5.683082563235609e-07, + "loss": 0.99872723, + "memory(GiB)": 302.58, + "step": 305560, + "train_speed(iter/s)": 0.123618 + }, + { + "acc": 0.76180778, + "epoch": 1.7089480976501825, + "grad_norm": 8.5, + "learning_rate": 5.678801584347166e-07, + "loss": 0.93983479, + "memory(GiB)": 302.58, + "step": 305580, + "train_speed(iter/s)": 0.123621 + }, + { + "acc": 0.74047866, + "epoch": 1.7090599471231616, + "grad_norm": 8.875, + "learning_rate": 5.674522121384251e-07, + "loss": 1.03600416, + "memory(GiB)": 302.58, + "step": 305600, + "train_speed(iter/s)": 0.123625 + }, + { + "acc": 0.7318429, + "epoch": 1.709171796596141, + "grad_norm": 4.96875, + "learning_rate": 5.670244174493233e-07, + "loss": 1.04062138, + "memory(GiB)": 302.58, + "step": 305620, + "train_speed(iter/s)": 0.123629 + }, + { + "acc": 0.76636763, + "epoch": 1.70928364606912, + "grad_norm": 8.4375, + "learning_rate": 5.665967743820433e-07, + "loss": 0.912988, + "memory(GiB)": 302.58, + "step": 305640, + "train_speed(iter/s)": 0.123633 + }, + { + "acc": 0.74850097, + "epoch": 1.7093954955420996, + "grad_norm": 6.28125, + "learning_rate": 5.66169282951211e-07, + "loss": 0.99232407, + "memory(GiB)": 302.58, + "step": 305660, + "train_speed(iter/s)": 0.123637 + }, + { + "acc": 0.76069126, + "epoch": 1.7095073450150786, + "grad_norm": 7.0, + "learning_rate": 5.657419431714489e-07, + "loss": 0.94749947, + "memory(GiB)": 302.58, + "step": 305680, + "train_speed(iter/s)": 0.12364 + }, + { + "acc": 0.75337348, + "epoch": 1.709619194488058, + "grad_norm": 7.3125, + "learning_rate": 5.653147550573724e-07, + "loss": 0.96744881, + "memory(GiB)": 302.58, + "step": 305700, + "train_speed(iter/s)": 0.123644 + }, + { + "acc": 0.75199847, + "epoch": 1.7097310439610371, + "grad_norm": 7.1875, + "learning_rate": 5.64887718623594e-07, + "loss": 1.008671, + "memory(GiB)": 302.58, + "step": 305720, + "train_speed(iter/s)": 0.123648 + }, + { + "acc": 0.738977, + "epoch": 1.7098428934340166, + "grad_norm": 8.5625, + "learning_rate": 5.644608338847185e-07, + "loss": 1.01778641, + "memory(GiB)": 302.58, + "step": 305740, + "train_speed(iter/s)": 0.123651 + }, + { + "acc": 0.7495501, + "epoch": 1.7099547429069957, + "grad_norm": 6.03125, + "learning_rate": 5.640341008553474e-07, + "loss": 0.98619738, + "memory(GiB)": 302.58, + "step": 305760, + "train_speed(iter/s)": 0.123655 + }, + { + "acc": 0.75048771, + "epoch": 1.7100665923799752, + "grad_norm": 7.90625, + "learning_rate": 5.636075195500762e-07, + "loss": 0.98515129, + "memory(GiB)": 302.58, + "step": 305780, + "train_speed(iter/s)": 0.123659 + }, + { + "acc": 0.75443501, + "epoch": 1.7101784418529542, + "grad_norm": 6.09375, + "learning_rate": 5.631810899834938e-07, + "loss": 0.96923609, + "memory(GiB)": 302.58, + "step": 305800, + "train_speed(iter/s)": 0.123663 + }, + { + "acc": 0.75252104, + "epoch": 1.7102902913259337, + "grad_norm": 10.3125, + "learning_rate": 5.627548121701881e-07, + "loss": 1.00139265, + "memory(GiB)": 302.58, + "step": 305820, + "train_speed(iter/s)": 0.123667 + }, + { + "acc": 0.7419219, + "epoch": 1.7104021407989127, + "grad_norm": 5.5, + "learning_rate": 5.623286861247385e-07, + "loss": 1.03219461, + "memory(GiB)": 302.58, + "step": 305840, + "train_speed(iter/s)": 0.12367 + }, + { + "acc": 0.75962129, + "epoch": 1.7105139902718922, + "grad_norm": 9.375, + "learning_rate": 5.619027118617188e-07, + "loss": 0.95877676, + "memory(GiB)": 302.58, + "step": 305860, + "train_speed(iter/s)": 0.123674 + }, + { + "acc": 0.75855269, + "epoch": 1.7106258397448713, + "grad_norm": 7.65625, + "learning_rate": 5.61476889395699e-07, + "loss": 0.94531889, + "memory(GiB)": 302.58, + "step": 305880, + "train_speed(iter/s)": 0.123678 + }, + { + "acc": 0.76889701, + "epoch": 1.7107376892178507, + "grad_norm": 6.28125, + "learning_rate": 5.610512187412442e-07, + "loss": 0.88600998, + "memory(GiB)": 302.58, + "step": 305900, + "train_speed(iter/s)": 0.123681 + }, + { + "acc": 0.75091763, + "epoch": 1.7108495386908298, + "grad_norm": 6.4375, + "learning_rate": 5.606256999129129e-07, + "loss": 0.97474241, + "memory(GiB)": 302.58, + "step": 305920, + "train_speed(iter/s)": 0.123685 + }, + { + "acc": 0.75748124, + "epoch": 1.7109613881638093, + "grad_norm": 7.8125, + "learning_rate": 5.602003329252598e-07, + "loss": 0.93694115, + "memory(GiB)": 302.58, + "step": 305940, + "train_speed(iter/s)": 0.123689 + }, + { + "acc": 0.76711717, + "epoch": 1.7110732376367883, + "grad_norm": 8.0, + "learning_rate": 5.597751177928334e-07, + "loss": 0.91845093, + "memory(GiB)": 302.58, + "step": 305960, + "train_speed(iter/s)": 0.123693 + }, + { + "acc": 0.72965927, + "epoch": 1.7111850871097678, + "grad_norm": 7.59375, + "learning_rate": 5.593500545301766e-07, + "loss": 1.06922331, + "memory(GiB)": 302.58, + "step": 305980, + "train_speed(iter/s)": 0.123697 + }, + { + "acc": 0.74538207, + "epoch": 1.7112969365827468, + "grad_norm": 6.6875, + "learning_rate": 5.5892514315183e-07, + "loss": 1.01392765, + "memory(GiB)": 302.58, + "step": 306000, + "train_speed(iter/s)": 0.1237 + }, + { + "epoch": 1.7112969365827468, + "eval_acc": 0.7069041825266752, + "eval_loss": 1.0117932558059692, + "eval_runtime": 7602.1123, + "eval_samples_per_second": 9.903, + "eval_steps_per_second": 9.903, + "step": 306000 + }, + { + "acc": 0.74897013, + "epoch": 1.7114087860557263, + "grad_norm": 4.8125, + "learning_rate": 5.585003836723251e-07, + "loss": 0.98935738, + "memory(GiB)": 302.58, + "step": 306020, + "train_speed(iter/s)": 0.123318 + }, + { + "acc": 0.73998823, + "epoch": 1.7115206355287054, + "grad_norm": 4.375, + "learning_rate": 5.580757761061917e-07, + "loss": 1.04208488, + "memory(GiB)": 302.58, + "step": 306040, + "train_speed(iter/s)": 0.123322 + }, + { + "acc": 0.75719852, + "epoch": 1.7116324850016849, + "grad_norm": 6.3125, + "learning_rate": 5.576513204679512e-07, + "loss": 0.96339769, + "memory(GiB)": 302.58, + "step": 306060, + "train_speed(iter/s)": 0.123326 + }, + { + "acc": 0.73923326, + "epoch": 1.711744334474664, + "grad_norm": 7.6875, + "learning_rate": 5.572270167721217e-07, + "loss": 1.01044073, + "memory(GiB)": 302.58, + "step": 306080, + "train_speed(iter/s)": 0.12333 + }, + { + "acc": 0.73520374, + "epoch": 1.7118561839476434, + "grad_norm": 7.71875, + "learning_rate": 5.568028650332158e-07, + "loss": 1.06812429, + "memory(GiB)": 302.58, + "step": 306100, + "train_speed(iter/s)": 0.123333 + }, + { + "acc": 0.75049114, + "epoch": 1.7119680334206224, + "grad_norm": 5.21875, + "learning_rate": 5.563788652657404e-07, + "loss": 0.98423767, + "memory(GiB)": 302.58, + "step": 306120, + "train_speed(iter/s)": 0.123337 + }, + { + "acc": 0.7662807, + "epoch": 1.712079882893602, + "grad_norm": 6.34375, + "learning_rate": 5.559550174841988e-07, + "loss": 0.94375477, + "memory(GiB)": 302.58, + "step": 306140, + "train_speed(iter/s)": 0.123341 + }, + { + "acc": 0.76091709, + "epoch": 1.712191732366581, + "grad_norm": 8.5, + "learning_rate": 5.555313217030872e-07, + "loss": 0.93063478, + "memory(GiB)": 302.58, + "step": 306160, + "train_speed(iter/s)": 0.123345 + }, + { + "acc": 0.74994159, + "epoch": 1.7123035818395604, + "grad_norm": 9.875, + "learning_rate": 5.55107777936898e-07, + "loss": 0.99052372, + "memory(GiB)": 302.58, + "step": 306180, + "train_speed(iter/s)": 0.123349 + }, + { + "acc": 0.75899925, + "epoch": 1.7124154313125395, + "grad_norm": 7.875, + "learning_rate": 5.546843862001167e-07, + "loss": 0.97563801, + "memory(GiB)": 302.58, + "step": 306200, + "train_speed(iter/s)": 0.123353 + }, + { + "acc": 0.73825254, + "epoch": 1.712527280785519, + "grad_norm": 6.71875, + "learning_rate": 5.54261146507225e-07, + "loss": 1.04148588, + "memory(GiB)": 302.58, + "step": 306220, + "train_speed(iter/s)": 0.123357 + }, + { + "acc": 0.7620708, + "epoch": 1.712639130258498, + "grad_norm": 4.9375, + "learning_rate": 5.538380588726994e-07, + "loss": 0.89869299, + "memory(GiB)": 302.58, + "step": 306240, + "train_speed(iter/s)": 0.123361 + }, + { + "acc": 0.77222686, + "epoch": 1.7127509797314775, + "grad_norm": 9.375, + "learning_rate": 5.534151233110103e-07, + "loss": 0.87372055, + "memory(GiB)": 302.58, + "step": 306260, + "train_speed(iter/s)": 0.123365 + }, + { + "acc": 0.75752201, + "epoch": 1.7128628292044565, + "grad_norm": 4.96875, + "learning_rate": 5.529923398366239e-07, + "loss": 0.95616055, + "memory(GiB)": 302.58, + "step": 306280, + "train_speed(iter/s)": 0.123368 + }, + { + "acc": 0.75508733, + "epoch": 1.712974678677436, + "grad_norm": 6.03125, + "learning_rate": 5.525697084640009e-07, + "loss": 0.97098894, + "memory(GiB)": 302.58, + "step": 306300, + "train_speed(iter/s)": 0.123372 + }, + { + "acc": 0.7435575, + "epoch": 1.713086528150415, + "grad_norm": 11.5625, + "learning_rate": 5.521472292075958e-07, + "loss": 1.01296988, + "memory(GiB)": 302.58, + "step": 306320, + "train_speed(iter/s)": 0.123376 + }, + { + "acc": 0.73827891, + "epoch": 1.7131983776233946, + "grad_norm": 11.4375, + "learning_rate": 5.517249020818593e-07, + "loss": 1.02808771, + "memory(GiB)": 302.58, + "step": 306340, + "train_speed(iter/s)": 0.12338 + }, + { + "acc": 0.75169854, + "epoch": 1.7133102270963736, + "grad_norm": 9.375, + "learning_rate": 5.513027271012367e-07, + "loss": 0.96993399, + "memory(GiB)": 302.58, + "step": 306360, + "train_speed(iter/s)": 0.123384 + }, + { + "acc": 0.73394942, + "epoch": 1.713422076569353, + "grad_norm": 5.59375, + "learning_rate": 5.508807042801667e-07, + "loss": 1.06465788, + "memory(GiB)": 302.58, + "step": 306380, + "train_speed(iter/s)": 0.123388 + }, + { + "acc": 0.76857476, + "epoch": 1.7135339260423321, + "grad_norm": 5.21875, + "learning_rate": 5.504588336330835e-07, + "loss": 0.91152849, + "memory(GiB)": 302.58, + "step": 306400, + "train_speed(iter/s)": 0.123391 + }, + { + "acc": 0.74387927, + "epoch": 1.7136457755153116, + "grad_norm": 4.53125, + "learning_rate": 5.500371151744183e-07, + "loss": 0.99592066, + "memory(GiB)": 302.58, + "step": 306420, + "train_speed(iter/s)": 0.123395 + }, + { + "acc": 0.74619594, + "epoch": 1.7137576249882907, + "grad_norm": 6.78125, + "learning_rate": 5.496155489185945e-07, + "loss": 0.9809535, + "memory(GiB)": 302.58, + "step": 306440, + "train_speed(iter/s)": 0.123399 + }, + { + "acc": 0.77244105, + "epoch": 1.7138694744612701, + "grad_norm": 5.71875, + "learning_rate": 5.491941348800306e-07, + "loss": 0.88489962, + "memory(GiB)": 302.58, + "step": 306460, + "train_speed(iter/s)": 0.123403 + }, + { + "acc": 0.73450022, + "epoch": 1.7139813239342492, + "grad_norm": 16.5, + "learning_rate": 5.487728730731401e-07, + "loss": 1.04127884, + "memory(GiB)": 302.58, + "step": 306480, + "train_speed(iter/s)": 0.123407 + }, + { + "acc": 0.74876356, + "epoch": 1.7140931734072287, + "grad_norm": 6.53125, + "learning_rate": 5.48351763512332e-07, + "loss": 1.00858574, + "memory(GiB)": 302.58, + "step": 306500, + "train_speed(iter/s)": 0.123411 + }, + { + "acc": 0.75530887, + "epoch": 1.7142050228802077, + "grad_norm": 7.09375, + "learning_rate": 5.479308062120098e-07, + "loss": 0.96353569, + "memory(GiB)": 302.58, + "step": 306520, + "train_speed(iter/s)": 0.123415 + }, + { + "acc": 0.75603828, + "epoch": 1.7143168723531872, + "grad_norm": 9.8125, + "learning_rate": 5.475100011865702e-07, + "loss": 0.94042816, + "memory(GiB)": 302.58, + "step": 306540, + "train_speed(iter/s)": 0.123418 + }, + { + "acc": 0.7507937, + "epoch": 1.7144287218261662, + "grad_norm": 8.8125, + "learning_rate": 5.470893484504081e-07, + "loss": 0.99644213, + "memory(GiB)": 302.58, + "step": 306560, + "train_speed(iter/s)": 0.123422 + }, + { + "acc": 0.76198459, + "epoch": 1.7145405712991457, + "grad_norm": 6.46875, + "learning_rate": 5.466688480179094e-07, + "loss": 0.93373222, + "memory(GiB)": 302.58, + "step": 306580, + "train_speed(iter/s)": 0.123426 + }, + { + "acc": 0.75132136, + "epoch": 1.7146524207721248, + "grad_norm": 6.875, + "learning_rate": 5.462484999034573e-07, + "loss": 0.95732822, + "memory(GiB)": 302.58, + "step": 306600, + "train_speed(iter/s)": 0.12343 + }, + { + "acc": 0.72830224, + "epoch": 1.7147642702451042, + "grad_norm": 6.84375, + "learning_rate": 5.458283041214291e-07, + "loss": 1.07229176, + "memory(GiB)": 302.58, + "step": 306620, + "train_speed(iter/s)": 0.123433 + }, + { + "acc": 0.74481578, + "epoch": 1.7148761197180833, + "grad_norm": 5.84375, + "learning_rate": 5.454082606861965e-07, + "loss": 1.00975609, + "memory(GiB)": 302.58, + "step": 306640, + "train_speed(iter/s)": 0.123437 + }, + { + "acc": 0.75349302, + "epoch": 1.7149879691910628, + "grad_norm": 7.03125, + "learning_rate": 5.449883696121261e-07, + "loss": 0.96292286, + "memory(GiB)": 302.58, + "step": 306660, + "train_speed(iter/s)": 0.123441 + }, + { + "acc": 0.75414138, + "epoch": 1.7150998186640418, + "grad_norm": 6.5625, + "learning_rate": 5.445686309135801e-07, + "loss": 0.9852664, + "memory(GiB)": 302.58, + "step": 306680, + "train_speed(iter/s)": 0.123445 + }, + { + "acc": 0.74115877, + "epoch": 1.7152116681370213, + "grad_norm": 10.3125, + "learning_rate": 5.441490446049153e-07, + "loss": 1.03302879, + "memory(GiB)": 302.58, + "step": 306700, + "train_speed(iter/s)": 0.123449 + }, + { + "acc": 0.7499444, + "epoch": 1.7153235176100003, + "grad_norm": 10.875, + "learning_rate": 5.437296107004819e-07, + "loss": 0.9973731, + "memory(GiB)": 302.58, + "step": 306720, + "train_speed(iter/s)": 0.123452 + }, + { + "acc": 0.75618467, + "epoch": 1.7154353670829798, + "grad_norm": 7.875, + "learning_rate": 5.433103292146269e-07, + "loss": 0.95860643, + "memory(GiB)": 302.58, + "step": 306740, + "train_speed(iter/s)": 0.123456 + }, + { + "acc": 0.73894324, + "epoch": 1.7155472165559589, + "grad_norm": 7.3125, + "learning_rate": 5.428912001616899e-07, + "loss": 1.04811335, + "memory(GiB)": 302.58, + "step": 306760, + "train_speed(iter/s)": 0.12346 + }, + { + "acc": 0.76346016, + "epoch": 1.7156590660289384, + "grad_norm": 5.96875, + "learning_rate": 5.424722235560071e-07, + "loss": 0.92844372, + "memory(GiB)": 302.58, + "step": 306780, + "train_speed(iter/s)": 0.123464 + }, + { + "acc": 0.77290645, + "epoch": 1.7157709155019174, + "grad_norm": 7.5, + "learning_rate": 5.420533994119081e-07, + "loss": 0.89661236, + "memory(GiB)": 302.58, + "step": 306800, + "train_speed(iter/s)": 0.123468 + }, + { + "acc": 0.76766882, + "epoch": 1.715882764974897, + "grad_norm": 7.125, + "learning_rate": 5.416347277437195e-07, + "loss": 0.90806732, + "memory(GiB)": 302.58, + "step": 306820, + "train_speed(iter/s)": 0.123472 + }, + { + "acc": 0.73139153, + "epoch": 1.715994614447876, + "grad_norm": 11.375, + "learning_rate": 5.412162085657596e-07, + "loss": 1.05814915, + "memory(GiB)": 302.58, + "step": 306840, + "train_speed(iter/s)": 0.123475 + }, + { + "acc": 0.738024, + "epoch": 1.7161064639208554, + "grad_norm": 7.34375, + "learning_rate": 5.407978418923443e-07, + "loss": 1.04207897, + "memory(GiB)": 302.58, + "step": 306860, + "train_speed(iter/s)": 0.123479 + }, + { + "acc": 0.75044284, + "epoch": 1.7162183133938345, + "grad_norm": 9.3125, + "learning_rate": 5.403796277377826e-07, + "loss": 0.97844534, + "memory(GiB)": 302.58, + "step": 306880, + "train_speed(iter/s)": 0.123482 + }, + { + "acc": 0.74749994, + "epoch": 1.716330162866814, + "grad_norm": 7.28125, + "learning_rate": 5.399615661163781e-07, + "loss": 0.98927183, + "memory(GiB)": 302.58, + "step": 306900, + "train_speed(iter/s)": 0.123486 + }, + { + "acc": 0.7554893, + "epoch": 1.716442012339793, + "grad_norm": 8.3125, + "learning_rate": 5.395436570424312e-07, + "loss": 0.99296789, + "memory(GiB)": 302.58, + "step": 306920, + "train_speed(iter/s)": 0.12349 + }, + { + "acc": 0.76471791, + "epoch": 1.7165538618127725, + "grad_norm": 6.53125, + "learning_rate": 5.391259005302335e-07, + "loss": 0.90344133, + "memory(GiB)": 302.58, + "step": 306940, + "train_speed(iter/s)": 0.123494 + }, + { + "acc": 0.7369185, + "epoch": 1.7166657112857515, + "grad_norm": 5.875, + "learning_rate": 5.387082965940759e-07, + "loss": 1.03845949, + "memory(GiB)": 302.58, + "step": 306960, + "train_speed(iter/s)": 0.123498 + }, + { + "acc": 0.74159203, + "epoch": 1.716777560758731, + "grad_norm": 7.5625, + "learning_rate": 5.382908452482416e-07, + "loss": 1.00291986, + "memory(GiB)": 302.58, + "step": 306980, + "train_speed(iter/s)": 0.123501 + }, + { + "acc": 0.75848336, + "epoch": 1.71688941023171, + "grad_norm": 8.75, + "learning_rate": 5.378735465070073e-07, + "loss": 0.94172211, + "memory(GiB)": 302.58, + "step": 307000, + "train_speed(iter/s)": 0.123505 + }, + { + "acc": 0.74788861, + "epoch": 1.7170012597046895, + "grad_norm": 6.3125, + "learning_rate": 5.374564003846472e-07, + "loss": 1.02905197, + "memory(GiB)": 302.58, + "step": 307020, + "train_speed(iter/s)": 0.123509 + }, + { + "acc": 0.7386466, + "epoch": 1.7171131091776686, + "grad_norm": 9.4375, + "learning_rate": 5.370394068954288e-07, + "loss": 1.03958435, + "memory(GiB)": 302.58, + "step": 307040, + "train_speed(iter/s)": 0.123513 + }, + { + "acc": 0.75052099, + "epoch": 1.717224958650648, + "grad_norm": 7.5, + "learning_rate": 5.36622566053614e-07, + "loss": 0.96760302, + "memory(GiB)": 302.58, + "step": 307060, + "train_speed(iter/s)": 0.123517 + }, + { + "acc": 0.754848, + "epoch": 1.717336808123627, + "grad_norm": 5.625, + "learning_rate": 5.362058778734603e-07, + "loss": 0.97224817, + "memory(GiB)": 302.58, + "step": 307080, + "train_speed(iter/s)": 0.123521 + }, + { + "acc": 0.75067005, + "epoch": 1.7174486575966066, + "grad_norm": 8.625, + "learning_rate": 5.357893423692201e-07, + "loss": 0.98225584, + "memory(GiB)": 302.58, + "step": 307100, + "train_speed(iter/s)": 0.123525 + }, + { + "acc": 0.75623331, + "epoch": 1.7175605070695856, + "grad_norm": 5.6875, + "learning_rate": 5.353729595551399e-07, + "loss": 0.96545353, + "memory(GiB)": 302.58, + "step": 307120, + "train_speed(iter/s)": 0.123529 + }, + { + "acc": 0.75021029, + "epoch": 1.7176723565425651, + "grad_norm": 6.8125, + "learning_rate": 5.349567294454611e-07, + "loss": 0.99620323, + "memory(GiB)": 302.58, + "step": 307140, + "train_speed(iter/s)": 0.123532 + }, + { + "acc": 0.73785615, + "epoch": 1.7177842060155442, + "grad_norm": 7.28125, + "learning_rate": 5.34540652054421e-07, + "loss": 1.02642651, + "memory(GiB)": 302.58, + "step": 307160, + "train_speed(iter/s)": 0.123536 + }, + { + "acc": 0.74651051, + "epoch": 1.7178960554885236, + "grad_norm": 6.15625, + "learning_rate": 5.341247273962497e-07, + "loss": 0.99153481, + "memory(GiB)": 302.58, + "step": 307180, + "train_speed(iter/s)": 0.12354 + }, + { + "acc": 0.74812169, + "epoch": 1.7180079049615027, + "grad_norm": 5.0625, + "learning_rate": 5.337089554851737e-07, + "loss": 0.98606119, + "memory(GiB)": 302.58, + "step": 307200, + "train_speed(iter/s)": 0.123544 + }, + { + "acc": 0.74992185, + "epoch": 1.7181197544344822, + "grad_norm": 7.28125, + "learning_rate": 5.332933363354131e-07, + "loss": 0.97519388, + "memory(GiB)": 302.58, + "step": 307220, + "train_speed(iter/s)": 0.123548 + }, + { + "acc": 0.75083394, + "epoch": 1.7182316039074612, + "grad_norm": 6.3125, + "learning_rate": 5.328778699611847e-07, + "loss": 0.98657656, + "memory(GiB)": 302.58, + "step": 307240, + "train_speed(iter/s)": 0.123551 + }, + { + "acc": 0.74296703, + "epoch": 1.7183434533804407, + "grad_norm": 4.8125, + "learning_rate": 5.324625563766978e-07, + "loss": 1.0023736, + "memory(GiB)": 302.58, + "step": 307260, + "train_speed(iter/s)": 0.123555 + }, + { + "acc": 0.75586772, + "epoch": 1.7184553028534197, + "grad_norm": 6.9375, + "learning_rate": 5.32047395596157e-07, + "loss": 0.96869774, + "memory(GiB)": 302.58, + "step": 307280, + "train_speed(iter/s)": 0.123559 + }, + { + "acc": 0.73920169, + "epoch": 1.7185671523263992, + "grad_norm": 5.96875, + "learning_rate": 5.316323876337637e-07, + "loss": 1.04544859, + "memory(GiB)": 302.58, + "step": 307300, + "train_speed(iter/s)": 0.123562 + }, + { + "acc": 0.75057645, + "epoch": 1.7186790017993783, + "grad_norm": 10.125, + "learning_rate": 5.312175325037111e-07, + "loss": 0.9760354, + "memory(GiB)": 302.58, + "step": 307320, + "train_speed(iter/s)": 0.123566 + }, + { + "acc": 0.73773575, + "epoch": 1.7187908512723578, + "grad_norm": 9.9375, + "learning_rate": 5.308028302201896e-07, + "loss": 1.01723938, + "memory(GiB)": 302.58, + "step": 307340, + "train_speed(iter/s)": 0.12357 + }, + { + "acc": 0.74812078, + "epoch": 1.7189027007453368, + "grad_norm": 7.34375, + "learning_rate": 5.303882807973826e-07, + "loss": 0.9900775, + "memory(GiB)": 302.58, + "step": 307360, + "train_speed(iter/s)": 0.123574 + }, + { + "acc": 0.75202274, + "epoch": 1.7190145502183163, + "grad_norm": 8.3125, + "learning_rate": 5.299738842494689e-07, + "loss": 0.99461193, + "memory(GiB)": 302.58, + "step": 307380, + "train_speed(iter/s)": 0.123578 + }, + { + "acc": 0.75844746, + "epoch": 1.7191263996912953, + "grad_norm": 7.875, + "learning_rate": 5.295596405906228e-07, + "loss": 0.95260944, + "memory(GiB)": 302.58, + "step": 307400, + "train_speed(iter/s)": 0.123582 + }, + { + "acc": 0.76187062, + "epoch": 1.7192382491642748, + "grad_norm": 8.1875, + "learning_rate": 5.291455498350117e-07, + "loss": 0.91004601, + "memory(GiB)": 302.58, + "step": 307420, + "train_speed(iter/s)": 0.123585 + }, + { + "acc": 0.73685904, + "epoch": 1.7193500986372539, + "grad_norm": 6.3125, + "learning_rate": 5.287316119968e-07, + "loss": 1.04342451, + "memory(GiB)": 302.58, + "step": 307440, + "train_speed(iter/s)": 0.123589 + }, + { + "acc": 0.75621409, + "epoch": 1.7194619481102333, + "grad_norm": 6.59375, + "learning_rate": 5.283178270901451e-07, + "loss": 0.95177155, + "memory(GiB)": 302.58, + "step": 307460, + "train_speed(iter/s)": 0.123593 + }, + { + "acc": 0.7312099, + "epoch": 1.7195737975832124, + "grad_norm": 7.90625, + "learning_rate": 5.279041951292002e-07, + "loss": 1.05748186, + "memory(GiB)": 302.58, + "step": 307480, + "train_speed(iter/s)": 0.123597 + }, + { + "acc": 0.74410539, + "epoch": 1.7196856470561919, + "grad_norm": 8.875, + "learning_rate": 5.274907161281123e-07, + "loss": 1.00120087, + "memory(GiB)": 302.58, + "step": 307500, + "train_speed(iter/s)": 0.1236 + }, + { + "acc": 0.7554821, + "epoch": 1.719797496529171, + "grad_norm": 10.25, + "learning_rate": 5.270773901010234e-07, + "loss": 0.95781994, + "memory(GiB)": 302.58, + "step": 307520, + "train_speed(iter/s)": 0.123604 + }, + { + "acc": 0.76913896, + "epoch": 1.7199093460021504, + "grad_norm": 9.9375, + "learning_rate": 5.266642170620706e-07, + "loss": 0.8992342, + "memory(GiB)": 302.58, + "step": 307540, + "train_speed(iter/s)": 0.123608 + }, + { + "acc": 0.75311275, + "epoch": 1.7200211954751294, + "grad_norm": 10.0, + "learning_rate": 5.262511970253864e-07, + "loss": 0.97488832, + "memory(GiB)": 302.58, + "step": 307560, + "train_speed(iter/s)": 0.123611 + }, + { + "acc": 0.71933956, + "epoch": 1.720133044948109, + "grad_norm": 5.1875, + "learning_rate": 5.258383300050979e-07, + "loss": 1.10352459, + "memory(GiB)": 302.58, + "step": 307580, + "train_speed(iter/s)": 0.123615 + }, + { + "acc": 0.77619672, + "epoch": 1.720244894421088, + "grad_norm": 5.84375, + "learning_rate": 5.254256160153254e-07, + "loss": 0.86772051, + "memory(GiB)": 302.58, + "step": 307600, + "train_speed(iter/s)": 0.123619 + }, + { + "acc": 0.74641814, + "epoch": 1.7203567438940675, + "grad_norm": 8.5625, + "learning_rate": 5.250130550701848e-07, + "loss": 1.00074377, + "memory(GiB)": 302.58, + "step": 307620, + "train_speed(iter/s)": 0.123623 + }, + { + "acc": 0.75483284, + "epoch": 1.7204685933670465, + "grad_norm": 5.8125, + "learning_rate": 5.246006471837883e-07, + "loss": 0.98273745, + "memory(GiB)": 302.58, + "step": 307640, + "train_speed(iter/s)": 0.123627 + }, + { + "acc": 0.74332638, + "epoch": 1.720580442840026, + "grad_norm": 9.125, + "learning_rate": 5.241883923702407e-07, + "loss": 1.01221209, + "memory(GiB)": 302.58, + "step": 307660, + "train_speed(iter/s)": 0.12363 + }, + { + "acc": 0.74827776, + "epoch": 1.720692292313005, + "grad_norm": 9.3125, + "learning_rate": 5.237762906436422e-07, + "loss": 0.98114252, + "memory(GiB)": 302.58, + "step": 307680, + "train_speed(iter/s)": 0.123634 + }, + { + "acc": 0.76702142, + "epoch": 1.7208041417859845, + "grad_norm": 6.6875, + "learning_rate": 5.233643420180878e-07, + "loss": 0.90285997, + "memory(GiB)": 302.58, + "step": 307700, + "train_speed(iter/s)": 0.123638 + }, + { + "acc": 0.76718273, + "epoch": 1.7209159912589636, + "grad_norm": 5.71875, + "learning_rate": 5.22952546507669e-07, + "loss": 0.89882145, + "memory(GiB)": 302.58, + "step": 307720, + "train_speed(iter/s)": 0.123642 + }, + { + "acc": 0.74778061, + "epoch": 1.721027840731943, + "grad_norm": 4.90625, + "learning_rate": 5.225409041264684e-07, + "loss": 0.9977891, + "memory(GiB)": 302.58, + "step": 307740, + "train_speed(iter/s)": 0.123646 + }, + { + "acc": 0.7477036, + "epoch": 1.721139690204922, + "grad_norm": 5.96875, + "learning_rate": 5.221294148885675e-07, + "loss": 0.9996995, + "memory(GiB)": 302.58, + "step": 307760, + "train_speed(iter/s)": 0.12365 + }, + { + "acc": 0.7570775, + "epoch": 1.7212515396779016, + "grad_norm": 6.46875, + "learning_rate": 5.217180788080389e-07, + "loss": 0.96382732, + "memory(GiB)": 302.58, + "step": 307780, + "train_speed(iter/s)": 0.123653 + }, + { + "acc": 0.75972395, + "epoch": 1.7213633891508806, + "grad_norm": 8.125, + "learning_rate": 5.213068958989526e-07, + "loss": 0.94611645, + "memory(GiB)": 302.58, + "step": 307800, + "train_speed(iter/s)": 0.123656 + }, + { + "acc": 0.73709722, + "epoch": 1.72147523862386, + "grad_norm": 7.6875, + "learning_rate": 5.208958661753704e-07, + "loss": 1.06128531, + "memory(GiB)": 302.58, + "step": 307820, + "train_speed(iter/s)": 0.12366 + }, + { + "acc": 0.76193614, + "epoch": 1.7215870880968391, + "grad_norm": 10.125, + "learning_rate": 5.204849896513542e-07, + "loss": 0.91783171, + "memory(GiB)": 302.58, + "step": 307840, + "train_speed(iter/s)": 0.123664 + }, + { + "acc": 0.75843296, + "epoch": 1.7216989375698186, + "grad_norm": 7.09375, + "learning_rate": 5.200742663409553e-07, + "loss": 0.93613939, + "memory(GiB)": 302.58, + "step": 307860, + "train_speed(iter/s)": 0.123668 + }, + { + "acc": 0.75387969, + "epoch": 1.7218107870427977, + "grad_norm": 8.0, + "learning_rate": 5.196636962582219e-07, + "loss": 0.9508214, + "memory(GiB)": 302.58, + "step": 307880, + "train_speed(iter/s)": 0.123672 + }, + { + "acc": 0.76172109, + "epoch": 1.7219226365157771, + "grad_norm": 6.875, + "learning_rate": 5.192532794171967e-07, + "loss": 0.92146358, + "memory(GiB)": 302.58, + "step": 307900, + "train_speed(iter/s)": 0.123676 + }, + { + "acc": 0.74794064, + "epoch": 1.7220344859887562, + "grad_norm": 5.21875, + "learning_rate": 5.188430158319174e-07, + "loss": 0.97284327, + "memory(GiB)": 302.58, + "step": 307920, + "train_speed(iter/s)": 0.123679 + }, + { + "acc": 0.75739665, + "epoch": 1.7221463354617357, + "grad_norm": 8.625, + "learning_rate": 5.184329055164161e-07, + "loss": 0.97121925, + "memory(GiB)": 302.58, + "step": 307940, + "train_speed(iter/s)": 0.123683 + }, + { + "acc": 0.7479588, + "epoch": 1.7222581849347147, + "grad_norm": 7.40625, + "learning_rate": 5.180229484847204e-07, + "loss": 0.98427896, + "memory(GiB)": 302.58, + "step": 307960, + "train_speed(iter/s)": 0.123687 + }, + { + "acc": 0.7705689, + "epoch": 1.7223700344076942, + "grad_norm": 7.84375, + "learning_rate": 5.17613144750852e-07, + "loss": 0.87992764, + "memory(GiB)": 302.58, + "step": 307980, + "train_speed(iter/s)": 0.12369 + }, + { + "acc": 0.76726356, + "epoch": 1.7224818838806732, + "grad_norm": 6.5625, + "learning_rate": 5.172034943288274e-07, + "loss": 0.89032822, + "memory(GiB)": 302.58, + "step": 308000, + "train_speed(iter/s)": 0.123694 + }, + { + "epoch": 1.7224818838806732, + "eval_acc": 0.7069088657213712, + "eval_loss": 1.0117982625961304, + "eval_runtime": 7528.0927, + "eval_samples_per_second": 10.0, + "eval_steps_per_second": 10.0, + "step": 308000 + }, + { + "acc": 0.7445642, + "epoch": 1.7225937333536527, + "grad_norm": 6.71875, + "learning_rate": 5.167939972326574e-07, + "loss": 1.00378036, + "memory(GiB)": 302.58, + "step": 308020, + "train_speed(iter/s)": 0.123319 + }, + { + "acc": 0.76313829, + "epoch": 1.7227055828266318, + "grad_norm": 7.5, + "learning_rate": 5.16384653476349e-07, + "loss": 0.95213203, + "memory(GiB)": 302.58, + "step": 308040, + "train_speed(iter/s)": 0.123322 + }, + { + "acc": 0.75570579, + "epoch": 1.7228174322996113, + "grad_norm": 4.625, + "learning_rate": 5.159754630739022e-07, + "loss": 0.95382166, + "memory(GiB)": 302.58, + "step": 308060, + "train_speed(iter/s)": 0.123326 + }, + { + "acc": 0.76251826, + "epoch": 1.7229292817725903, + "grad_norm": 6.9375, + "learning_rate": 5.155664260393134e-07, + "loss": 0.94171467, + "memory(GiB)": 302.58, + "step": 308080, + "train_speed(iter/s)": 0.123329 + }, + { + "acc": 0.7558588, + "epoch": 1.7230411312455698, + "grad_norm": 6.0625, + "learning_rate": 5.151575423865712e-07, + "loss": 0.96114006, + "memory(GiB)": 302.58, + "step": 308100, + "train_speed(iter/s)": 0.123333 + }, + { + "acc": 0.73376503, + "epoch": 1.7231529807185488, + "grad_norm": 7.46875, + "learning_rate": 5.147488121296634e-07, + "loss": 1.06718225, + "memory(GiB)": 302.58, + "step": 308120, + "train_speed(iter/s)": 0.123337 + }, + { + "acc": 0.7660831, + "epoch": 1.7232648301915283, + "grad_norm": 5.96875, + "learning_rate": 5.143402352825682e-07, + "loss": 0.92803392, + "memory(GiB)": 302.58, + "step": 308140, + "train_speed(iter/s)": 0.12334 + }, + { + "acc": 0.75146956, + "epoch": 1.7233766796645074, + "grad_norm": 7.90625, + "learning_rate": 5.139318118592612e-07, + "loss": 0.96513443, + "memory(GiB)": 302.58, + "step": 308160, + "train_speed(iter/s)": 0.123344 + }, + { + "acc": 0.74815221, + "epoch": 1.7234885291374868, + "grad_norm": 5.9375, + "learning_rate": 5.135235418737106e-07, + "loss": 0.99994831, + "memory(GiB)": 302.58, + "step": 308180, + "train_speed(iter/s)": 0.123348 + }, + { + "acc": 0.75066252, + "epoch": 1.7236003786104659, + "grad_norm": 8.3125, + "learning_rate": 5.131154253398812e-07, + "loss": 0.98459482, + "memory(GiB)": 302.58, + "step": 308200, + "train_speed(iter/s)": 0.123352 + }, + { + "acc": 0.75652452, + "epoch": 1.7237122280834454, + "grad_norm": 6.5, + "learning_rate": 5.12707462271732e-07, + "loss": 0.95550165, + "memory(GiB)": 302.58, + "step": 308220, + "train_speed(iter/s)": 0.123356 + }, + { + "acc": 0.74169984, + "epoch": 1.7238240775564244, + "grad_norm": 6.34375, + "learning_rate": 5.122996526832163e-07, + "loss": 1.0148385, + "memory(GiB)": 302.58, + "step": 308240, + "train_speed(iter/s)": 0.12336 + }, + { + "acc": 0.75798669, + "epoch": 1.723935927029404, + "grad_norm": 9.25, + "learning_rate": 5.118919965882824e-07, + "loss": 0.94018135, + "memory(GiB)": 302.58, + "step": 308260, + "train_speed(iter/s)": 0.123363 + }, + { + "acc": 0.75444126, + "epoch": 1.724047776502383, + "grad_norm": 7.125, + "learning_rate": 5.11484494000874e-07, + "loss": 0.96891479, + "memory(GiB)": 302.58, + "step": 308280, + "train_speed(iter/s)": 0.123367 + }, + { + "acc": 0.76364765, + "epoch": 1.7241596259753624, + "grad_norm": 14.5, + "learning_rate": 5.110771449349283e-07, + "loss": 0.92600527, + "memory(GiB)": 302.58, + "step": 308300, + "train_speed(iter/s)": 0.12337 + }, + { + "acc": 0.76152153, + "epoch": 1.7242714754483415, + "grad_norm": 9.75, + "learning_rate": 5.106699494043782e-07, + "loss": 0.91378403, + "memory(GiB)": 302.58, + "step": 308320, + "train_speed(iter/s)": 0.123374 + }, + { + "acc": 0.75803599, + "epoch": 1.724383324921321, + "grad_norm": 8.9375, + "learning_rate": 5.102629074231513e-07, + "loss": 0.95132542, + "memory(GiB)": 302.58, + "step": 308340, + "train_speed(iter/s)": 0.123378 + }, + { + "acc": 0.74735742, + "epoch": 1.7244951743943, + "grad_norm": 8.5, + "learning_rate": 5.098560190051688e-07, + "loss": 1.01887112, + "memory(GiB)": 302.58, + "step": 308360, + "train_speed(iter/s)": 0.123382 + }, + { + "acc": 0.75190225, + "epoch": 1.7246070238672795, + "grad_norm": 8.6875, + "learning_rate": 5.09449284164349e-07, + "loss": 0.96864271, + "memory(GiB)": 302.58, + "step": 308380, + "train_speed(iter/s)": 0.123385 + }, + { + "acc": 0.7356277, + "epoch": 1.7247188733402585, + "grad_norm": 6.5625, + "learning_rate": 5.090427029146028e-07, + "loss": 1.06334162, + "memory(GiB)": 302.58, + "step": 308400, + "train_speed(iter/s)": 0.123389 + }, + { + "acc": 0.75120649, + "epoch": 1.724830722813238, + "grad_norm": 4.6875, + "learning_rate": 5.086362752698365e-07, + "loss": 0.96242285, + "memory(GiB)": 302.58, + "step": 308420, + "train_speed(iter/s)": 0.123393 + }, + { + "acc": 0.74984531, + "epoch": 1.724942572286217, + "grad_norm": 7.59375, + "learning_rate": 5.082300012439517e-07, + "loss": 0.9668293, + "memory(GiB)": 302.58, + "step": 308440, + "train_speed(iter/s)": 0.123396 + }, + { + "acc": 0.73570962, + "epoch": 1.7250544217591965, + "grad_norm": 4.375, + "learning_rate": 5.078238808508435e-07, + "loss": 1.03949337, + "memory(GiB)": 302.58, + "step": 308460, + "train_speed(iter/s)": 0.123399 + }, + { + "acc": 0.73936887, + "epoch": 1.7251662712321756, + "grad_norm": 6.90625, + "learning_rate": 5.074179141044022e-07, + "loss": 1.01255569, + "memory(GiB)": 302.58, + "step": 308480, + "train_speed(iter/s)": 0.123403 + }, + { + "acc": 0.74912496, + "epoch": 1.725278120705155, + "grad_norm": 7.59375, + "learning_rate": 5.070121010185147e-07, + "loss": 1.00043535, + "memory(GiB)": 302.58, + "step": 308500, + "train_speed(iter/s)": 0.123407 + }, + { + "acc": 0.74843383, + "epoch": 1.7253899701781341, + "grad_norm": 8.0, + "learning_rate": 5.066064416070598e-07, + "loss": 0.99705353, + "memory(GiB)": 302.58, + "step": 308520, + "train_speed(iter/s)": 0.123411 + }, + { + "acc": 0.74955435, + "epoch": 1.7255018196511136, + "grad_norm": 7.15625, + "learning_rate": 5.062009358839132e-07, + "loss": 0.98643007, + "memory(GiB)": 302.58, + "step": 308540, + "train_speed(iter/s)": 0.123415 + }, + { + "acc": 0.73899074, + "epoch": 1.7256136691240926, + "grad_norm": 7.71875, + "learning_rate": 5.057955838629441e-07, + "loss": 1.0378026, + "memory(GiB)": 302.58, + "step": 308560, + "train_speed(iter/s)": 0.123418 + }, + { + "acc": 0.75128284, + "epoch": 1.7257255185970721, + "grad_norm": 6.375, + "learning_rate": 5.053903855580166e-07, + "loss": 1.00592604, + "memory(GiB)": 302.58, + "step": 308580, + "train_speed(iter/s)": 0.123422 + }, + { + "acc": 0.7578464, + "epoch": 1.7258373680700512, + "grad_norm": 8.125, + "learning_rate": 5.049853409829897e-07, + "loss": 0.94014072, + "memory(GiB)": 302.58, + "step": 308600, + "train_speed(iter/s)": 0.123426 + }, + { + "acc": 0.74975638, + "epoch": 1.7259492175430307, + "grad_norm": 7.75, + "learning_rate": 5.045804501517176e-07, + "loss": 0.98350315, + "memory(GiB)": 302.58, + "step": 308620, + "train_speed(iter/s)": 0.12343 + }, + { + "acc": 0.74758558, + "epoch": 1.7260610670160097, + "grad_norm": 6.28125, + "learning_rate": 5.041757130780489e-07, + "loss": 0.98617592, + "memory(GiB)": 302.58, + "step": 308640, + "train_speed(iter/s)": 0.123433 + }, + { + "acc": 0.73756642, + "epoch": 1.7261729164889892, + "grad_norm": 9.0625, + "learning_rate": 5.037711297758268e-07, + "loss": 1.07161169, + "memory(GiB)": 302.58, + "step": 308660, + "train_speed(iter/s)": 0.123437 + }, + { + "acc": 0.74880705, + "epoch": 1.7262847659619682, + "grad_norm": 5.8125, + "learning_rate": 5.033667002588888e-07, + "loss": 0.97811327, + "memory(GiB)": 302.58, + "step": 308680, + "train_speed(iter/s)": 0.12344 + }, + { + "acc": 0.73976808, + "epoch": 1.7263966154349477, + "grad_norm": 6.65625, + "learning_rate": 5.029624245410674e-07, + "loss": 1.00632896, + "memory(GiB)": 302.58, + "step": 308700, + "train_speed(iter/s)": 0.123444 + }, + { + "acc": 0.75878382, + "epoch": 1.7265084649079268, + "grad_norm": 5.34375, + "learning_rate": 5.025583026361918e-07, + "loss": 0.97070704, + "memory(GiB)": 302.58, + "step": 308720, + "train_speed(iter/s)": 0.123448 + }, + { + "acc": 0.73233628, + "epoch": 1.7266203143809062, + "grad_norm": 6.40625, + "learning_rate": 5.021543345580832e-07, + "loss": 1.06532393, + "memory(GiB)": 302.58, + "step": 308740, + "train_speed(iter/s)": 0.123452 + }, + { + "acc": 0.75110874, + "epoch": 1.7267321638538853, + "grad_norm": 7.53125, + "learning_rate": 5.017505203205586e-07, + "loss": 0.97194519, + "memory(GiB)": 302.58, + "step": 308760, + "train_speed(iter/s)": 0.123456 + }, + { + "acc": 0.75467505, + "epoch": 1.7268440133268648, + "grad_norm": 5.71875, + "learning_rate": 5.013468599374299e-07, + "loss": 0.95203819, + "memory(GiB)": 302.58, + "step": 308780, + "train_speed(iter/s)": 0.12346 + }, + { + "acc": 0.74468708, + "epoch": 1.7269558627998438, + "grad_norm": 6.53125, + "learning_rate": 5.009433534225033e-07, + "loss": 1.02554054, + "memory(GiB)": 302.58, + "step": 308800, + "train_speed(iter/s)": 0.123463 + }, + { + "acc": 0.73771553, + "epoch": 1.7270677122728233, + "grad_norm": 4.78125, + "learning_rate": 5.0054000078958e-07, + "loss": 1.0274929, + "memory(GiB)": 302.58, + "step": 308820, + "train_speed(iter/s)": 0.123467 + }, + { + "acc": 0.7838222, + "epoch": 1.7271795617458023, + "grad_norm": 6.0625, + "learning_rate": 5.001368020524561e-07, + "loss": 0.83201971, + "memory(GiB)": 302.58, + "step": 308840, + "train_speed(iter/s)": 0.123471 + }, + { + "acc": 0.74994383, + "epoch": 1.7272914112187818, + "grad_norm": 7.53125, + "learning_rate": 4.997337572249222e-07, + "loss": 0.9905797, + "memory(GiB)": 302.58, + "step": 308860, + "train_speed(iter/s)": 0.123474 + }, + { + "acc": 0.75554934, + "epoch": 1.7274032606917609, + "grad_norm": 7.46875, + "learning_rate": 4.993308663207642e-07, + "loss": 0.94759359, + "memory(GiB)": 302.58, + "step": 308880, + "train_speed(iter/s)": 0.123478 + }, + { + "acc": 0.75168395, + "epoch": 1.7275151101647404, + "grad_norm": 5.625, + "learning_rate": 4.989281293537613e-07, + "loss": 0.97633791, + "memory(GiB)": 302.58, + "step": 308900, + "train_speed(iter/s)": 0.123482 + }, + { + "acc": 0.74989619, + "epoch": 1.7276269596377194, + "grad_norm": 7.71875, + "learning_rate": 4.985255463376893e-07, + "loss": 1.00138178, + "memory(GiB)": 302.58, + "step": 308920, + "train_speed(iter/s)": 0.123486 + }, + { + "acc": 0.72649336, + "epoch": 1.7277388091106989, + "grad_norm": 6.75, + "learning_rate": 4.981231172863171e-07, + "loss": 1.08780851, + "memory(GiB)": 302.58, + "step": 308940, + "train_speed(iter/s)": 0.12349 + }, + { + "acc": 0.75923824, + "epoch": 1.727850658583678, + "grad_norm": 7.3125, + "learning_rate": 4.977208422134083e-07, + "loss": 0.93267946, + "memory(GiB)": 302.58, + "step": 308960, + "train_speed(iter/s)": 0.123493 + }, + { + "acc": 0.72775507, + "epoch": 1.7279625080566574, + "grad_norm": 7.875, + "learning_rate": 4.973187211327241e-07, + "loss": 1.06755562, + "memory(GiB)": 302.58, + "step": 308980, + "train_speed(iter/s)": 0.123497 + }, + { + "acc": 0.75584383, + "epoch": 1.7280743575296365, + "grad_norm": 7.875, + "learning_rate": 4.969167540580178e-07, + "loss": 0.96789675, + "memory(GiB)": 302.58, + "step": 309000, + "train_speed(iter/s)": 0.123501 + }, + { + "acc": 0.74607644, + "epoch": 1.728186207002616, + "grad_norm": 4.65625, + "learning_rate": 4.965149410030367e-07, + "loss": 0.99308596, + "memory(GiB)": 302.58, + "step": 309020, + "train_speed(iter/s)": 0.123504 + }, + { + "acc": 0.74060559, + "epoch": 1.728298056475595, + "grad_norm": 5.8125, + "learning_rate": 4.961132819815251e-07, + "loss": 1.02753735, + "memory(GiB)": 302.58, + "step": 309040, + "train_speed(iter/s)": 0.123508 + }, + { + "acc": 0.76076193, + "epoch": 1.7284099059485745, + "grad_norm": 6.875, + "learning_rate": 4.957117770072207e-07, + "loss": 0.95023737, + "memory(GiB)": 302.58, + "step": 309060, + "train_speed(iter/s)": 0.123512 + }, + { + "acc": 0.75123844, + "epoch": 1.7285217554215535, + "grad_norm": 6.15625, + "learning_rate": 4.953104260938563e-07, + "loss": 0.97196178, + "memory(GiB)": 302.58, + "step": 309080, + "train_speed(iter/s)": 0.123516 + }, + { + "acc": 0.76986985, + "epoch": 1.728633604894533, + "grad_norm": 8.625, + "learning_rate": 4.949092292551589e-07, + "loss": 0.89837332, + "memory(GiB)": 302.58, + "step": 309100, + "train_speed(iter/s)": 0.12352 + }, + { + "acc": 0.76938658, + "epoch": 1.7287454543675123, + "grad_norm": 5.90625, + "learning_rate": 4.945081865048513e-07, + "loss": 0.90492573, + "memory(GiB)": 302.58, + "step": 309120, + "train_speed(iter/s)": 0.123523 + }, + { + "acc": 0.75193739, + "epoch": 1.7288573038404915, + "grad_norm": 7.28125, + "learning_rate": 4.941072978566502e-07, + "loss": 0.98840723, + "memory(GiB)": 302.58, + "step": 309140, + "train_speed(iter/s)": 0.123527 + }, + { + "acc": 0.75063996, + "epoch": 1.7289691533134708, + "grad_norm": 7.125, + "learning_rate": 4.937065633242677e-07, + "loss": 0.97600365, + "memory(GiB)": 302.58, + "step": 309160, + "train_speed(iter/s)": 0.123531 + }, + { + "acc": 0.75637684, + "epoch": 1.72908100278645, + "grad_norm": 5.75, + "learning_rate": 4.933059829214093e-07, + "loss": 0.96116829, + "memory(GiB)": 302.58, + "step": 309180, + "train_speed(iter/s)": 0.123535 + }, + { + "acc": 0.76615419, + "epoch": 1.7291928522594293, + "grad_norm": 6.4375, + "learning_rate": 4.929055566617769e-07, + "loss": 0.90531969, + "memory(GiB)": 302.58, + "step": 309200, + "train_speed(iter/s)": 0.123538 + }, + { + "acc": 0.75026784, + "epoch": 1.7293047017324086, + "grad_norm": 6.65625, + "learning_rate": 4.925052845590662e-07, + "loss": 0.96279583, + "memory(GiB)": 302.58, + "step": 309220, + "train_speed(iter/s)": 0.123542 + }, + { + "acc": 0.74754705, + "epoch": 1.7294165512053878, + "grad_norm": 8.375, + "learning_rate": 4.921051666269666e-07, + "loss": 0.98618603, + "memory(GiB)": 302.58, + "step": 309240, + "train_speed(iter/s)": 0.123546 + }, + { + "acc": 0.75210748, + "epoch": 1.729528400678367, + "grad_norm": 5.34375, + "learning_rate": 4.917052028791653e-07, + "loss": 0.99596834, + "memory(GiB)": 302.58, + "step": 309260, + "train_speed(iter/s)": 0.12355 + }, + { + "acc": 0.75241475, + "epoch": 1.7296402501513464, + "grad_norm": 6.8125, + "learning_rate": 4.913053933293416e-07, + "loss": 0.99522686, + "memory(GiB)": 302.58, + "step": 309280, + "train_speed(iter/s)": 0.123554 + }, + { + "acc": 0.7462409, + "epoch": 1.7297520996243256, + "grad_norm": 6.96875, + "learning_rate": 4.909057379911697e-07, + "loss": 0.99339724, + "memory(GiB)": 302.58, + "step": 309300, + "train_speed(iter/s)": 0.123557 + }, + { + "acc": 0.74575167, + "epoch": 1.729863949097305, + "grad_norm": 5.9375, + "learning_rate": 4.905062368783203e-07, + "loss": 1.00183783, + "memory(GiB)": 302.58, + "step": 309320, + "train_speed(iter/s)": 0.123561 + }, + { + "acc": 0.76917996, + "epoch": 1.7299757985702842, + "grad_norm": 7.5625, + "learning_rate": 4.901068900044564e-07, + "loss": 0.90413017, + "memory(GiB)": 302.58, + "step": 309340, + "train_speed(iter/s)": 0.123565 + }, + { + "acc": 0.74298553, + "epoch": 1.7300876480432634, + "grad_norm": 9.375, + "learning_rate": 4.897076973832371e-07, + "loss": 1.01669664, + "memory(GiB)": 302.58, + "step": 309360, + "train_speed(iter/s)": 0.123569 + }, + { + "acc": 0.7482368, + "epoch": 1.7301994975162427, + "grad_norm": 6.15625, + "learning_rate": 4.893086590283164e-07, + "loss": 0.975243, + "memory(GiB)": 302.58, + "step": 309380, + "train_speed(iter/s)": 0.123572 + }, + { + "acc": 0.7490582, + "epoch": 1.730311346989222, + "grad_norm": 4.6875, + "learning_rate": 4.889097749533428e-07, + "loss": 0.98224058, + "memory(GiB)": 302.58, + "step": 309400, + "train_speed(iter/s)": 0.123576 + }, + { + "acc": 0.75185113, + "epoch": 1.7304231964622012, + "grad_norm": 7.625, + "learning_rate": 4.885110451719593e-07, + "loss": 0.98883247, + "memory(GiB)": 302.58, + "step": 309420, + "train_speed(iter/s)": 0.12358 + }, + { + "acc": 0.75968332, + "epoch": 1.7305350459351805, + "grad_norm": 10.25, + "learning_rate": 4.88112469697803e-07, + "loss": 0.9619339, + "memory(GiB)": 302.58, + "step": 309440, + "train_speed(iter/s)": 0.123583 + }, + { + "acc": 0.75866942, + "epoch": 1.7306468954081597, + "grad_norm": 5.3125, + "learning_rate": 4.877140485445075e-07, + "loss": 0.95614347, + "memory(GiB)": 302.58, + "step": 309460, + "train_speed(iter/s)": 0.123587 + }, + { + "acc": 0.7331162, + "epoch": 1.730758744881139, + "grad_norm": 5.34375, + "learning_rate": 4.873157817256995e-07, + "loss": 1.04194403, + "memory(GiB)": 302.58, + "step": 309480, + "train_speed(iter/s)": 0.123591 + }, + { + "acc": 0.75221038, + "epoch": 1.7308705943541183, + "grad_norm": 7.34375, + "learning_rate": 4.869176692550015e-07, + "loss": 0.9801795, + "memory(GiB)": 302.58, + "step": 309500, + "train_speed(iter/s)": 0.123594 + }, + { + "acc": 0.7478621, + "epoch": 1.7309824438270975, + "grad_norm": 6.84375, + "learning_rate": 4.865197111460284e-07, + "loss": 1.01837473, + "memory(GiB)": 302.58, + "step": 309520, + "train_speed(iter/s)": 0.123598 + }, + { + "acc": 0.75583782, + "epoch": 1.7310942933000768, + "grad_norm": 13.4375, + "learning_rate": 4.86121907412394e-07, + "loss": 0.96744299, + "memory(GiB)": 302.58, + "step": 309540, + "train_speed(iter/s)": 0.123601 + }, + { + "acc": 0.7523068, + "epoch": 1.731206142773056, + "grad_norm": 9.375, + "learning_rate": 4.857242580677035e-07, + "loss": 0.97776794, + "memory(GiB)": 302.58, + "step": 309560, + "train_speed(iter/s)": 0.123605 + }, + { + "acc": 0.73510327, + "epoch": 1.7313179922460353, + "grad_norm": 6.5625, + "learning_rate": 4.853267631255576e-07, + "loss": 1.06611156, + "memory(GiB)": 302.58, + "step": 309580, + "train_speed(iter/s)": 0.123609 + }, + { + "acc": 0.7559803, + "epoch": 1.7314298417190146, + "grad_norm": 5.9375, + "learning_rate": 4.849294225995527e-07, + "loss": 0.96719484, + "memory(GiB)": 302.58, + "step": 309600, + "train_speed(iter/s)": 0.123613 + }, + { + "acc": 0.75879369, + "epoch": 1.7315416911919939, + "grad_norm": 6.4375, + "learning_rate": 4.845322365032779e-07, + "loss": 0.94228144, + "memory(GiB)": 302.58, + "step": 309620, + "train_speed(iter/s)": 0.123616 + }, + { + "acc": 0.75326052, + "epoch": 1.7316535406649731, + "grad_norm": 7.46875, + "learning_rate": 4.841352048503184e-07, + "loss": 0.96365938, + "memory(GiB)": 302.58, + "step": 309640, + "train_speed(iter/s)": 0.12362 + }, + { + "acc": 0.7419322, + "epoch": 1.7317653901379524, + "grad_norm": 6.0, + "learning_rate": 4.837383276542551e-07, + "loss": 1.00640068, + "memory(GiB)": 302.58, + "step": 309660, + "train_speed(iter/s)": 0.123624 + }, + { + "acc": 0.75024219, + "epoch": 1.7318772396109317, + "grad_norm": 8.875, + "learning_rate": 4.833416049286615e-07, + "loss": 0.98117151, + "memory(GiB)": 302.58, + "step": 309680, + "train_speed(iter/s)": 0.123627 + }, + { + "acc": 0.74409876, + "epoch": 1.731989089083911, + "grad_norm": 6.375, + "learning_rate": 4.829450366871063e-07, + "loss": 1.01921625, + "memory(GiB)": 302.58, + "step": 309700, + "train_speed(iter/s)": 0.123631 + }, + { + "acc": 0.76702714, + "epoch": 1.7321009385568902, + "grad_norm": 7.28125, + "learning_rate": 4.825486229431548e-07, + "loss": 0.91497421, + "memory(GiB)": 302.58, + "step": 309720, + "train_speed(iter/s)": 0.123634 + }, + { + "acc": 0.75676908, + "epoch": 1.7322127880298694, + "grad_norm": 7.6875, + "learning_rate": 4.821523637103637e-07, + "loss": 0.96085062, + "memory(GiB)": 302.58, + "step": 309740, + "train_speed(iter/s)": 0.123638 + }, + { + "acc": 0.74795432, + "epoch": 1.7323246375028487, + "grad_norm": 8.625, + "learning_rate": 4.817562590022884e-07, + "loss": 0.98197937, + "memory(GiB)": 302.58, + "step": 309760, + "train_speed(iter/s)": 0.123642 + }, + { + "acc": 0.7463768, + "epoch": 1.732436486975828, + "grad_norm": 4.96875, + "learning_rate": 4.813603088324759e-07, + "loss": 1.00834627, + "memory(GiB)": 302.58, + "step": 309780, + "train_speed(iter/s)": 0.123646 + }, + { + "acc": 0.7484549, + "epoch": 1.7325483364488072, + "grad_norm": 7.5, + "learning_rate": 4.809645132144691e-07, + "loss": 0.97042856, + "memory(GiB)": 302.58, + "step": 309800, + "train_speed(iter/s)": 0.12365 + }, + { + "acc": 0.73167195, + "epoch": 1.7326601859217865, + "grad_norm": 7.65625, + "learning_rate": 4.805688721618057e-07, + "loss": 1.07127657, + "memory(GiB)": 302.58, + "step": 309820, + "train_speed(iter/s)": 0.123654 + }, + { + "acc": 0.74494257, + "epoch": 1.7327720353947658, + "grad_norm": 4.40625, + "learning_rate": 4.801733856880158e-07, + "loss": 1.01683731, + "memory(GiB)": 302.58, + "step": 309840, + "train_speed(iter/s)": 0.123658 + }, + { + "acc": 0.74236121, + "epoch": 1.732883884867745, + "grad_norm": 7.96875, + "learning_rate": 4.797780538066298e-07, + "loss": 1.02094469, + "memory(GiB)": 302.58, + "step": 309860, + "train_speed(iter/s)": 0.123661 + }, + { + "acc": 0.7353734, + "epoch": 1.7329957343407243, + "grad_norm": 10.1875, + "learning_rate": 4.793828765311675e-07, + "loss": 1.03559532, + "memory(GiB)": 302.58, + "step": 309880, + "train_speed(iter/s)": 0.123665 + }, + { + "acc": 0.75997167, + "epoch": 1.7331075838137036, + "grad_norm": 5.75, + "learning_rate": 4.789878538751452e-07, + "loss": 0.92912397, + "memory(GiB)": 302.58, + "step": 309900, + "train_speed(iter/s)": 0.123668 + }, + { + "acc": 0.756216, + "epoch": 1.7332194332866828, + "grad_norm": 7.3125, + "learning_rate": 4.785929858520744e-07, + "loss": 0.96083879, + "memory(GiB)": 302.58, + "step": 309920, + "train_speed(iter/s)": 0.123671 + }, + { + "acc": 0.74935398, + "epoch": 1.733331282759662, + "grad_norm": 7.4375, + "learning_rate": 4.781982724754597e-07, + "loss": 0.97805166, + "memory(GiB)": 302.58, + "step": 309940, + "train_speed(iter/s)": 0.123675 + }, + { + "acc": 0.75038733, + "epoch": 1.7334431322326413, + "grad_norm": 9.75, + "learning_rate": 4.778037137588032e-07, + "loss": 0.97636003, + "memory(GiB)": 302.58, + "step": 309960, + "train_speed(iter/s)": 0.123679 + }, + { + "acc": 0.73378315, + "epoch": 1.7335549817056206, + "grad_norm": 5.90625, + "learning_rate": 4.774093097155985e-07, + "loss": 1.05894995, + "memory(GiB)": 302.58, + "step": 309980, + "train_speed(iter/s)": 0.123683 + }, + { + "acc": 0.76148672, + "epoch": 1.7336668311785999, + "grad_norm": 7.96875, + "learning_rate": 4.770150603593365e-07, + "loss": 0.9463562, + "memory(GiB)": 302.58, + "step": 310000, + "train_speed(iter/s)": 0.123687 + }, + { + "epoch": 1.7336668311785999, + "eval_acc": 0.7068832806892953, + "eval_loss": 1.011795163154602, + "eval_runtime": 7527.2057, + "eval_samples_per_second": 10.001, + "eval_steps_per_second": 10.001, + "step": 310000 + }, + { + "acc": 0.75178409, + "epoch": 1.7337786806515791, + "grad_norm": 10.375, + "learning_rate": 4.7662096570350146e-07, + "loss": 0.9830348, + "memory(GiB)": 302.58, + "step": 310020, + "train_speed(iter/s)": 0.123313 + }, + { + "acc": 0.76560884, + "epoch": 1.7338905301245584, + "grad_norm": 9.1875, + "learning_rate": 4.7622702576157254e-07, + "loss": 0.91157675, + "memory(GiB)": 302.58, + "step": 310040, + "train_speed(iter/s)": 0.123317 + }, + { + "acc": 0.75073156, + "epoch": 1.7340023795975377, + "grad_norm": 8.375, + "learning_rate": 4.7583324054702393e-07, + "loss": 0.95771456, + "memory(GiB)": 302.58, + "step": 310060, + "train_speed(iter/s)": 0.123321 + }, + { + "acc": 0.74870939, + "epoch": 1.734114229070517, + "grad_norm": 6.21875, + "learning_rate": 4.754396100733238e-07, + "loss": 0.98896427, + "memory(GiB)": 302.58, + "step": 310080, + "train_speed(iter/s)": 0.123325 + }, + { + "acc": 0.76833534, + "epoch": 1.7342260785434962, + "grad_norm": 9.4375, + "learning_rate": 4.7504613435393644e-07, + "loss": 0.89923162, + "memory(GiB)": 302.58, + "step": 310100, + "train_speed(iter/s)": 0.123328 + }, + { + "acc": 0.75409451, + "epoch": 1.7343379280164755, + "grad_norm": 10.4375, + "learning_rate": 4.7465281340231817e-07, + "loss": 0.97786646, + "memory(GiB)": 302.58, + "step": 310120, + "train_speed(iter/s)": 0.123332 + }, + { + "acc": 0.75308933, + "epoch": 1.7344497774894547, + "grad_norm": 4.71875, + "learning_rate": 4.742596472319244e-07, + "loss": 0.96481905, + "memory(GiB)": 302.58, + "step": 310140, + "train_speed(iter/s)": 0.123336 + }, + { + "acc": 0.75908995, + "epoch": 1.734561626962434, + "grad_norm": 4.40625, + "learning_rate": 4.738666358562016e-07, + "loss": 0.94691467, + "memory(GiB)": 302.58, + "step": 310160, + "train_speed(iter/s)": 0.12334 + }, + { + "acc": 0.7671967, + "epoch": 1.7346734764354133, + "grad_norm": 7.78125, + "learning_rate": 4.734737792885918e-07, + "loss": 0.90476456, + "memory(GiB)": 302.58, + "step": 310180, + "train_speed(iter/s)": 0.123344 + }, + { + "acc": 0.75005617, + "epoch": 1.7347853259083925, + "grad_norm": 6.0, + "learning_rate": 4.7308107754253205e-07, + "loss": 0.98150568, + "memory(GiB)": 302.58, + "step": 310200, + "train_speed(iter/s)": 0.123348 + }, + { + "acc": 0.75050368, + "epoch": 1.7348971753813718, + "grad_norm": 8.9375, + "learning_rate": 4.7268853063145326e-07, + "loss": 0.97100115, + "memory(GiB)": 302.58, + "step": 310220, + "train_speed(iter/s)": 0.123351 + }, + { + "acc": 0.74419031, + "epoch": 1.735009024854351, + "grad_norm": 7.9375, + "learning_rate": 4.72296138568783e-07, + "loss": 1.01334343, + "memory(GiB)": 302.58, + "step": 310240, + "train_speed(iter/s)": 0.123355 + }, + { + "acc": 0.74437814, + "epoch": 1.7351208743273303, + "grad_norm": 6.375, + "learning_rate": 4.7190390136794164e-07, + "loss": 0.9908926, + "memory(GiB)": 302.58, + "step": 310260, + "train_speed(iter/s)": 0.123359 + }, + { + "acc": 0.74951515, + "epoch": 1.7352327238003096, + "grad_norm": 7.40625, + "learning_rate": 4.7151181904234513e-07, + "loss": 0.98458729, + "memory(GiB)": 302.58, + "step": 310280, + "train_speed(iter/s)": 0.123363 + }, + { + "acc": 0.73939409, + "epoch": 1.7353445732732888, + "grad_norm": 9.6875, + "learning_rate": 4.7111989160540386e-07, + "loss": 1.0283906, + "memory(GiB)": 302.58, + "step": 310300, + "train_speed(iter/s)": 0.123366 + }, + { + "acc": 0.74821644, + "epoch": 1.735456422746268, + "grad_norm": 6.5625, + "learning_rate": 4.7072811907052316e-07, + "loss": 1.00473347, + "memory(GiB)": 302.58, + "step": 310320, + "train_speed(iter/s)": 0.12337 + }, + { + "acc": 0.75879736, + "epoch": 1.7355682722192474, + "grad_norm": 7.71875, + "learning_rate": 4.7033650145110246e-07, + "loss": 0.93246126, + "memory(GiB)": 302.58, + "step": 310340, + "train_speed(iter/s)": 0.123374 + }, + { + "acc": 0.74449487, + "epoch": 1.7356801216922266, + "grad_norm": 8.125, + "learning_rate": 4.6994503876053643e-07, + "loss": 1.01721077, + "memory(GiB)": 302.58, + "step": 310360, + "train_speed(iter/s)": 0.123378 + }, + { + "acc": 0.75516725, + "epoch": 1.735791971165206, + "grad_norm": 7.25, + "learning_rate": 4.695537310122145e-07, + "loss": 0.94821024, + "memory(GiB)": 302.58, + "step": 310380, + "train_speed(iter/s)": 0.123381 + }, + { + "acc": 0.74900064, + "epoch": 1.7359038206381852, + "grad_norm": 7.40625, + "learning_rate": 4.6916257821951974e-07, + "loss": 0.97213593, + "memory(GiB)": 302.58, + "step": 310400, + "train_speed(iter/s)": 0.123385 + }, + { + "acc": 0.75891757, + "epoch": 1.7360156701111644, + "grad_norm": 7.46875, + "learning_rate": 4.6877158039583316e-07, + "loss": 0.9524848, + "memory(GiB)": 302.58, + "step": 310420, + "train_speed(iter/s)": 0.123389 + }, + { + "acc": 0.73987579, + "epoch": 1.7361275195841437, + "grad_norm": 8.625, + "learning_rate": 4.683807375545263e-07, + "loss": 1.02109442, + "memory(GiB)": 302.58, + "step": 310440, + "train_speed(iter/s)": 0.123393 + }, + { + "acc": 0.74519458, + "epoch": 1.736239369057123, + "grad_norm": 6.53125, + "learning_rate": 4.6799004970896733e-07, + "loss": 0.99180183, + "memory(GiB)": 302.58, + "step": 310460, + "train_speed(iter/s)": 0.123397 + }, + { + "acc": 0.73546209, + "epoch": 1.7363512185301022, + "grad_norm": 6.3125, + "learning_rate": 4.675995168725195e-07, + "loss": 1.06447039, + "memory(GiB)": 302.58, + "step": 310480, + "train_speed(iter/s)": 0.123401 + }, + { + "acc": 0.75826979, + "epoch": 1.7364630680030815, + "grad_norm": 8.9375, + "learning_rate": 4.6720913905853984e-07, + "loss": 0.95801716, + "memory(GiB)": 302.58, + "step": 310500, + "train_speed(iter/s)": 0.123404 + }, + { + "acc": 0.73311796, + "epoch": 1.7365749174760607, + "grad_norm": 7.6875, + "learning_rate": 4.66818916280381e-07, + "loss": 1.05610886, + "memory(GiB)": 302.58, + "step": 310520, + "train_speed(iter/s)": 0.123408 + }, + { + "acc": 0.75116172, + "epoch": 1.73668676694904, + "grad_norm": 7.15625, + "learning_rate": 4.664288485513896e-07, + "loss": 0.97496471, + "memory(GiB)": 302.58, + "step": 310540, + "train_speed(iter/s)": 0.123412 + }, + { + "acc": 0.75859833, + "epoch": 1.7367986164220193, + "grad_norm": 7.71875, + "learning_rate": 4.6603893588490714e-07, + "loss": 0.94275408, + "memory(GiB)": 302.58, + "step": 310560, + "train_speed(iter/s)": 0.123416 + }, + { + "acc": 0.74640079, + "epoch": 1.7369104658949985, + "grad_norm": 8.9375, + "learning_rate": 4.656491782942696e-07, + "loss": 1.00976171, + "memory(GiB)": 302.58, + "step": 310580, + "train_speed(iter/s)": 0.123419 + }, + { + "acc": 0.74597359, + "epoch": 1.7370223153679778, + "grad_norm": 6.46875, + "learning_rate": 4.6525957579280856e-07, + "loss": 0.99784355, + "memory(GiB)": 302.58, + "step": 310600, + "train_speed(iter/s)": 0.123423 + }, + { + "acc": 0.75342064, + "epoch": 1.737134164840957, + "grad_norm": 7.4375, + "learning_rate": 4.6487012839384893e-07, + "loss": 0.96295538, + "memory(GiB)": 302.58, + "step": 310620, + "train_speed(iter/s)": 0.123426 + }, + { + "acc": 0.75936542, + "epoch": 1.7372460143139363, + "grad_norm": 9.6875, + "learning_rate": 4.644808361107117e-07, + "loss": 0.92572479, + "memory(GiB)": 302.58, + "step": 310640, + "train_speed(iter/s)": 0.12343 + }, + { + "acc": 0.7553257, + "epoch": 1.7373578637869156, + "grad_norm": 5.21875, + "learning_rate": 4.6409169895671015e-07, + "loss": 0.96161633, + "memory(GiB)": 302.58, + "step": 310660, + "train_speed(iter/s)": 0.123434 + }, + { + "acc": 0.7405911, + "epoch": 1.7374697132598949, + "grad_norm": 7.40625, + "learning_rate": 4.637027169451569e-07, + "loss": 1.02251263, + "memory(GiB)": 302.58, + "step": 310680, + "train_speed(iter/s)": 0.123438 + }, + { + "acc": 0.76163664, + "epoch": 1.7375815627328741, + "grad_norm": 6.09375, + "learning_rate": 4.633138900893547e-07, + "loss": 0.91841717, + "memory(GiB)": 302.58, + "step": 310700, + "train_speed(iter/s)": 0.123442 + }, + { + "acc": 0.74551406, + "epoch": 1.7376934122058534, + "grad_norm": 9.9375, + "learning_rate": 4.629252184026034e-07, + "loss": 1.00496111, + "memory(GiB)": 302.58, + "step": 310720, + "train_speed(iter/s)": 0.123445 + }, + { + "acc": 0.76626549, + "epoch": 1.7378052616788326, + "grad_norm": 7.1875, + "learning_rate": 4.6253670189819576e-07, + "loss": 0.89580717, + "memory(GiB)": 302.58, + "step": 310740, + "train_speed(iter/s)": 0.123449 + }, + { + "acc": 0.75401263, + "epoch": 1.737917111151812, + "grad_norm": 8.375, + "learning_rate": 4.6214834058942117e-07, + "loss": 0.94706984, + "memory(GiB)": 302.58, + "step": 310760, + "train_speed(iter/s)": 0.123452 + }, + { + "acc": 0.76701465, + "epoch": 1.7380289606247912, + "grad_norm": 8.3125, + "learning_rate": 4.6176013448956226e-07, + "loss": 0.89747906, + "memory(GiB)": 302.58, + "step": 310780, + "train_speed(iter/s)": 0.123456 + }, + { + "acc": 0.76685228, + "epoch": 1.7381408100977704, + "grad_norm": 12.4375, + "learning_rate": 4.6137208361189734e-07, + "loss": 0.91909981, + "memory(GiB)": 302.58, + "step": 310800, + "train_speed(iter/s)": 0.12346 + }, + { + "acc": 0.7505969, + "epoch": 1.7382526595707497, + "grad_norm": 6.40625, + "learning_rate": 4.6098418796969855e-07, + "loss": 0.97858496, + "memory(GiB)": 302.58, + "step": 310820, + "train_speed(iter/s)": 0.123464 + }, + { + "acc": 0.76227198, + "epoch": 1.738364509043729, + "grad_norm": 6.4375, + "learning_rate": 4.6059644757623314e-07, + "loss": 0.94184408, + "memory(GiB)": 302.58, + "step": 310840, + "train_speed(iter/s)": 0.123467 + }, + { + "acc": 0.7416223, + "epoch": 1.7384763585167082, + "grad_norm": 7.125, + "learning_rate": 4.6020886244476373e-07, + "loss": 1.03167543, + "memory(GiB)": 302.58, + "step": 310860, + "train_speed(iter/s)": 0.123471 + }, + { + "acc": 0.72649469, + "epoch": 1.7385882079896875, + "grad_norm": 7.6875, + "learning_rate": 4.598214325885464e-07, + "loss": 1.06710062, + "memory(GiB)": 302.58, + "step": 310880, + "train_speed(iter/s)": 0.123475 + }, + { + "acc": 0.75650253, + "epoch": 1.7387000574626668, + "grad_norm": 6.9375, + "learning_rate": 4.594341580208322e-07, + "loss": 0.95421476, + "memory(GiB)": 302.58, + "step": 310900, + "train_speed(iter/s)": 0.123479 + }, + { + "acc": 0.74205494, + "epoch": 1.738811906935646, + "grad_norm": 5.875, + "learning_rate": 4.590470387548679e-07, + "loss": 1.02626972, + "memory(GiB)": 302.58, + "step": 310920, + "train_speed(iter/s)": 0.123482 + }, + { + "acc": 0.77059426, + "epoch": 1.7389237564086253, + "grad_norm": 6.78125, + "learning_rate": 4.586600748038927e-07, + "loss": 0.90574436, + "memory(GiB)": 302.58, + "step": 310940, + "train_speed(iter/s)": 0.123486 + }, + { + "acc": 0.74928527, + "epoch": 1.7390356058816046, + "grad_norm": 8.1875, + "learning_rate": 4.5827326618114454e-07, + "loss": 0.9834939, + "memory(GiB)": 302.58, + "step": 310960, + "train_speed(iter/s)": 0.12349 + }, + { + "acc": 0.75800166, + "epoch": 1.7391474553545838, + "grad_norm": 6.78125, + "learning_rate": 4.5788661289985156e-07, + "loss": 0.94524717, + "memory(GiB)": 302.58, + "step": 310980, + "train_speed(iter/s)": 0.123494 + }, + { + "acc": 0.76314654, + "epoch": 1.739259304827563, + "grad_norm": 6.21875, + "learning_rate": 4.575001149732383e-07, + "loss": 0.91864681, + "memory(GiB)": 302.58, + "step": 311000, + "train_speed(iter/s)": 0.123497 + }, + { + "acc": 0.74807844, + "epoch": 1.7393711543005423, + "grad_norm": 9.5, + "learning_rate": 4.5711377241452635e-07, + "loss": 0.98368273, + "memory(GiB)": 302.58, + "step": 311020, + "train_speed(iter/s)": 0.123501 + }, + { + "acc": 0.7669126, + "epoch": 1.7394830037735216, + "grad_norm": 7.84375, + "learning_rate": 4.5672758523692794e-07, + "loss": 0.88864164, + "memory(GiB)": 302.58, + "step": 311040, + "train_speed(iter/s)": 0.123505 + }, + { + "acc": 0.74835939, + "epoch": 1.7395948532465009, + "grad_norm": 6.21875, + "learning_rate": 4.563415534536525e-07, + "loss": 0.99280109, + "memory(GiB)": 302.58, + "step": 311060, + "train_speed(iter/s)": 0.123509 + }, + { + "acc": 0.73564973, + "epoch": 1.7397067027194801, + "grad_norm": 7.65625, + "learning_rate": 4.559556770779039e-07, + "loss": 1.06757469, + "memory(GiB)": 302.58, + "step": 311080, + "train_speed(iter/s)": 0.123512 + }, + { + "acc": 0.75848756, + "epoch": 1.7398185521924594, + "grad_norm": 8.625, + "learning_rate": 4.5556995612287933e-07, + "loss": 0.96686983, + "memory(GiB)": 302.58, + "step": 311100, + "train_speed(iter/s)": 0.123516 + }, + { + "acc": 0.75286574, + "epoch": 1.7399304016654387, + "grad_norm": 5.78125, + "learning_rate": 4.551843906017728e-07, + "loss": 0.97207899, + "memory(GiB)": 302.58, + "step": 311120, + "train_speed(iter/s)": 0.12352 + }, + { + "acc": 0.74095807, + "epoch": 1.740042251138418, + "grad_norm": 3.96875, + "learning_rate": 4.5479898052777085e-07, + "loss": 1.04321785, + "memory(GiB)": 302.58, + "step": 311140, + "train_speed(iter/s)": 0.123523 + }, + { + "acc": 0.76406708, + "epoch": 1.7401541006113972, + "grad_norm": 5.21875, + "learning_rate": 4.544137259140569e-07, + "loss": 0.91968431, + "memory(GiB)": 302.58, + "step": 311160, + "train_speed(iter/s)": 0.123527 + }, + { + "acc": 0.77478743, + "epoch": 1.7402659500843765, + "grad_norm": 5.15625, + "learning_rate": 4.5402862677380646e-07, + "loss": 0.86617899, + "memory(GiB)": 302.58, + "step": 311180, + "train_speed(iter/s)": 0.123531 + }, + { + "acc": 0.73930955, + "epoch": 1.7403777995573557, + "grad_norm": 7.46875, + "learning_rate": 4.5364368312019233e-07, + "loss": 1.04189787, + "memory(GiB)": 302.58, + "step": 311200, + "train_speed(iter/s)": 0.123535 + }, + { + "acc": 0.75286031, + "epoch": 1.740489649030335, + "grad_norm": 8.1875, + "learning_rate": 4.532588949663802e-07, + "loss": 0.98878555, + "memory(GiB)": 302.58, + "step": 311220, + "train_speed(iter/s)": 0.123539 + }, + { + "acc": 0.74025989, + "epoch": 1.7406014985033142, + "grad_norm": 7.65625, + "learning_rate": 4.5287426232553113e-07, + "loss": 1.00353394, + "memory(GiB)": 302.58, + "step": 311240, + "train_speed(iter/s)": 0.123543 + }, + { + "acc": 0.7607306, + "epoch": 1.7407133479762935, + "grad_norm": 7.90625, + "learning_rate": 4.524897852108007e-07, + "loss": 0.93260775, + "memory(GiB)": 302.58, + "step": 311260, + "train_speed(iter/s)": 0.123546 + }, + { + "acc": 0.75064526, + "epoch": 1.7408251974492728, + "grad_norm": 9.3125, + "learning_rate": 4.5210546363533834e-07, + "loss": 0.9704977, + "memory(GiB)": 302.58, + "step": 311280, + "train_speed(iter/s)": 0.12355 + }, + { + "acc": 0.76490884, + "epoch": 1.740937046922252, + "grad_norm": 8.25, + "learning_rate": 4.5172129761229144e-07, + "loss": 0.92215347, + "memory(GiB)": 302.58, + "step": 311300, + "train_speed(iter/s)": 0.123553 + }, + { + "acc": 0.73999453, + "epoch": 1.7410488963952313, + "grad_norm": 5.4375, + "learning_rate": 4.513372871547983e-07, + "loss": 1.01918983, + "memory(GiB)": 302.58, + "step": 311320, + "train_speed(iter/s)": 0.123557 + }, + { + "acc": 0.77724586, + "epoch": 1.7411607458682106, + "grad_norm": 6.78125, + "learning_rate": 4.509534322759934e-07, + "loss": 0.86900816, + "memory(GiB)": 302.58, + "step": 311340, + "train_speed(iter/s)": 0.123561 + }, + { + "acc": 0.75435848, + "epoch": 1.7412725953411898, + "grad_norm": 7.28125, + "learning_rate": 4.5056973298900575e-07, + "loss": 0.96411057, + "memory(GiB)": 302.58, + "step": 311360, + "train_speed(iter/s)": 0.123565 + }, + { + "acc": 0.75505805, + "epoch": 1.741384444814169, + "grad_norm": 10.8125, + "learning_rate": 4.501861893069587e-07, + "loss": 0.95435047, + "memory(GiB)": 302.58, + "step": 311380, + "train_speed(iter/s)": 0.123569 + }, + { + "acc": 0.74606929, + "epoch": 1.7414962942871484, + "grad_norm": 7.25, + "learning_rate": 4.4980280124297115e-07, + "loss": 0.99834538, + "memory(GiB)": 302.58, + "step": 311400, + "train_speed(iter/s)": 0.123572 + }, + { + "acc": 0.76507578, + "epoch": 1.7416081437601276, + "grad_norm": 7.71875, + "learning_rate": 4.494195688101566e-07, + "loss": 0.93628883, + "memory(GiB)": 302.58, + "step": 311420, + "train_speed(iter/s)": 0.123576 + }, + { + "acc": 0.74541793, + "epoch": 1.741719993233107, + "grad_norm": 7.6875, + "learning_rate": 4.490364920216217e-07, + "loss": 0.9962244, + "memory(GiB)": 302.58, + "step": 311440, + "train_speed(iter/s)": 0.12358 + }, + { + "acc": 0.75221872, + "epoch": 1.7418318427060862, + "grad_norm": 6.34375, + "learning_rate": 4.486535708904699e-07, + "loss": 0.9814599, + "memory(GiB)": 302.58, + "step": 311460, + "train_speed(iter/s)": 0.123584 + }, + { + "acc": 0.74859972, + "epoch": 1.7419436921790654, + "grad_norm": 5.90625, + "learning_rate": 4.482708054297974e-07, + "loss": 1.004527, + "memory(GiB)": 302.58, + "step": 311480, + "train_speed(iter/s)": 0.123588 + }, + { + "acc": 0.75321698, + "epoch": 1.7420555416520447, + "grad_norm": 8.0625, + "learning_rate": 4.4788819565269705e-07, + "loss": 1.00435009, + "memory(GiB)": 302.58, + "step": 311500, + "train_speed(iter/s)": 0.123592 + }, + { + "acc": 0.76035838, + "epoch": 1.742167391125024, + "grad_norm": 7.15625, + "learning_rate": 4.475057415722545e-07, + "loss": 0.94082375, + "memory(GiB)": 302.58, + "step": 311520, + "train_speed(iter/s)": 0.123595 + }, + { + "acc": 0.75957274, + "epoch": 1.7422792405980032, + "grad_norm": 8.9375, + "learning_rate": 4.471234432015503e-07, + "loss": 0.9350893, + "memory(GiB)": 302.58, + "step": 311540, + "train_speed(iter/s)": 0.123599 + }, + { + "acc": 0.74907675, + "epoch": 1.7423910900709825, + "grad_norm": 8.25, + "learning_rate": 4.467413005536619e-07, + "loss": 0.97104616, + "memory(GiB)": 302.58, + "step": 311560, + "train_speed(iter/s)": 0.123602 + }, + { + "acc": 0.74804215, + "epoch": 1.7425029395439617, + "grad_norm": 4.71875, + "learning_rate": 4.463593136416594e-07, + "loss": 0.99943171, + "memory(GiB)": 302.58, + "step": 311580, + "train_speed(iter/s)": 0.123606 + }, + { + "acc": 0.73516769, + "epoch": 1.742614789016941, + "grad_norm": 8.5, + "learning_rate": 4.459774824786073e-07, + "loss": 1.06275511, + "memory(GiB)": 302.58, + "step": 311600, + "train_speed(iter/s)": 0.12361 + }, + { + "acc": 0.74403777, + "epoch": 1.7427266384899203, + "grad_norm": 6.28125, + "learning_rate": 4.4559580707756567e-07, + "loss": 1.01153421, + "memory(GiB)": 302.58, + "step": 311620, + "train_speed(iter/s)": 0.123613 + }, + { + "acc": 0.74081016, + "epoch": 1.7428384879628995, + "grad_norm": 9.3125, + "learning_rate": 4.4521428745158914e-07, + "loss": 1.03618431, + "memory(GiB)": 302.58, + "step": 311640, + "train_speed(iter/s)": 0.123617 + }, + { + "acc": 0.75584908, + "epoch": 1.7429503374358788, + "grad_norm": 9.25, + "learning_rate": 4.448329236137267e-07, + "loss": 0.94193668, + "memory(GiB)": 302.58, + "step": 311660, + "train_speed(iter/s)": 0.123621 + }, + { + "acc": 0.7602416, + "epoch": 1.743062186908858, + "grad_norm": 7.3125, + "learning_rate": 4.4445171557702226e-07, + "loss": 0.93012686, + "memory(GiB)": 302.58, + "step": 311680, + "train_speed(iter/s)": 0.123624 + }, + { + "acc": 0.74213018, + "epoch": 1.7431740363818373, + "grad_norm": 6.84375, + "learning_rate": 4.440706633545144e-07, + "loss": 1.02442036, + "memory(GiB)": 302.58, + "step": 311700, + "train_speed(iter/s)": 0.123628 + }, + { + "acc": 0.74528122, + "epoch": 1.7432858858548166, + "grad_norm": 8.375, + "learning_rate": 4.436897669592366e-07, + "loss": 1.01495686, + "memory(GiB)": 302.58, + "step": 311720, + "train_speed(iter/s)": 0.123632 + }, + { + "acc": 0.74471698, + "epoch": 1.7433977353277958, + "grad_norm": 8.5, + "learning_rate": 4.433090264042161e-07, + "loss": 1.04262524, + "memory(GiB)": 302.58, + "step": 311740, + "train_speed(iter/s)": 0.123635 + }, + { + "acc": 0.76284752, + "epoch": 1.7435095848007751, + "grad_norm": 12.4375, + "learning_rate": 4.429284417024765e-07, + "loss": 0.92555723, + "memory(GiB)": 302.58, + "step": 311760, + "train_speed(iter/s)": 0.123639 + }, + { + "acc": 0.75409131, + "epoch": 1.7436214342737544, + "grad_norm": 8.75, + "learning_rate": 4.425480128670334e-07, + "loss": 0.96688232, + "memory(GiB)": 302.58, + "step": 311780, + "train_speed(iter/s)": 0.123642 + }, + { + "acc": 0.73722539, + "epoch": 1.7437332837467336, + "grad_norm": 5.5625, + "learning_rate": 4.4216773991089976e-07, + "loss": 1.04112988, + "memory(GiB)": 302.58, + "step": 311800, + "train_speed(iter/s)": 0.123646 + }, + { + "acc": 0.74207773, + "epoch": 1.743845133219713, + "grad_norm": 6.65625, + "learning_rate": 4.417876228470813e-07, + "loss": 1.01733217, + "memory(GiB)": 302.58, + "step": 311820, + "train_speed(iter/s)": 0.12365 + }, + { + "acc": 0.74744797, + "epoch": 1.7439569826926922, + "grad_norm": 6.125, + "learning_rate": 4.414076616885804e-07, + "loss": 0.97078695, + "memory(GiB)": 302.58, + "step": 311840, + "train_speed(iter/s)": 0.123653 + }, + { + "acc": 0.76317472, + "epoch": 1.7440688321656714, + "grad_norm": 5.25, + "learning_rate": 4.4102785644839275e-07, + "loss": 0.943647, + "memory(GiB)": 302.58, + "step": 311860, + "train_speed(iter/s)": 0.123657 + }, + { + "acc": 0.75149684, + "epoch": 1.7441806816386507, + "grad_norm": 9.5, + "learning_rate": 4.406482071395085e-07, + "loss": 0.96927414, + "memory(GiB)": 302.58, + "step": 311880, + "train_speed(iter/s)": 0.123661 + }, + { + "acc": 0.75099282, + "epoch": 1.74429253111163, + "grad_norm": 7.46875, + "learning_rate": 4.4026871377491233e-07, + "loss": 0.97690601, + "memory(GiB)": 302.58, + "step": 311900, + "train_speed(iter/s)": 0.123665 + }, + { + "acc": 0.74558263, + "epoch": 1.7444043805846092, + "grad_norm": 5.53125, + "learning_rate": 4.398893763675854e-07, + "loss": 0.97566128, + "memory(GiB)": 302.58, + "step": 311920, + "train_speed(iter/s)": 0.123668 + }, + { + "acc": 0.76220055, + "epoch": 1.7445162300575885, + "grad_norm": 8.5625, + "learning_rate": 4.3951019493050086e-07, + "loss": 0.92603598, + "memory(GiB)": 302.58, + "step": 311940, + "train_speed(iter/s)": 0.123672 + }, + { + "acc": 0.75899434, + "epoch": 1.7446280795305678, + "grad_norm": 6.9375, + "learning_rate": 4.3913116947662925e-07, + "loss": 0.93480635, + "memory(GiB)": 302.58, + "step": 311960, + "train_speed(iter/s)": 0.123676 + }, + { + "acc": 0.76329794, + "epoch": 1.744739929003547, + "grad_norm": 9.0625, + "learning_rate": 4.387523000189331e-07, + "loss": 0.91572466, + "memory(GiB)": 302.58, + "step": 311980, + "train_speed(iter/s)": 0.12368 + }, + { + "acc": 0.74586554, + "epoch": 1.7448517784765263, + "grad_norm": 7.40625, + "learning_rate": 4.38373586570372e-07, + "loss": 1.00654984, + "memory(GiB)": 302.58, + "step": 312000, + "train_speed(iter/s)": 0.123684 + }, + { + "epoch": 1.7448517784765263, + "eval_acc": 0.7069216828858024, + "eval_loss": 1.0117864608764648, + "eval_runtime": 7545.3196, + "eval_samples_per_second": 9.977, + "eval_steps_per_second": 9.977, + "step": 312000 + }, + { + "acc": 0.7532866, + "epoch": 1.7449636279495055, + "grad_norm": 8.4375, + "learning_rate": 4.3799502914389835e-07, + "loss": 0.96233625, + "memory(GiB)": 302.58, + "step": 312020, + "train_speed(iter/s)": 0.123312 + }, + { + "acc": 0.75896463, + "epoch": 1.7450754774224848, + "grad_norm": 7.15625, + "learning_rate": 4.3761662775246073e-07, + "loss": 0.93698788, + "memory(GiB)": 302.58, + "step": 312040, + "train_speed(iter/s)": 0.123315 + }, + { + "acc": 0.74776282, + "epoch": 1.745187326895464, + "grad_norm": 8.625, + "learning_rate": 4.3723838240900153e-07, + "loss": 0.99928122, + "memory(GiB)": 302.58, + "step": 312060, + "train_speed(iter/s)": 0.123319 + }, + { + "acc": 0.74474268, + "epoch": 1.7452991763684433, + "grad_norm": 6.53125, + "learning_rate": 4.3686029312645764e-07, + "loss": 0.99017649, + "memory(GiB)": 302.58, + "step": 312080, + "train_speed(iter/s)": 0.123323 + }, + { + "acc": 0.75776792, + "epoch": 1.7454110258414226, + "grad_norm": 6.3125, + "learning_rate": 4.3648235991776034e-07, + "loss": 0.96076193, + "memory(GiB)": 302.58, + "step": 312100, + "train_speed(iter/s)": 0.123327 + }, + { + "acc": 0.76387181, + "epoch": 1.7455228753144019, + "grad_norm": 6.65625, + "learning_rate": 4.361045827958371e-07, + "loss": 0.8999855, + "memory(GiB)": 302.58, + "step": 312120, + "train_speed(iter/s)": 0.123331 + }, + { + "acc": 0.74903188, + "epoch": 1.7456347247873811, + "grad_norm": 7.1875, + "learning_rate": 4.3572696177360984e-07, + "loss": 0.98186407, + "memory(GiB)": 302.58, + "step": 312140, + "train_speed(iter/s)": 0.123334 + }, + { + "acc": 0.74768858, + "epoch": 1.7457465742603604, + "grad_norm": 11.875, + "learning_rate": 4.3534949686399263e-07, + "loss": 0.98654299, + "memory(GiB)": 302.58, + "step": 312160, + "train_speed(iter/s)": 0.123338 + }, + { + "acc": 0.75558677, + "epoch": 1.7458584237333397, + "grad_norm": 6.8125, + "learning_rate": 4.3497218807989684e-07, + "loss": 0.9644557, + "memory(GiB)": 302.58, + "step": 312180, + "train_speed(iter/s)": 0.123342 + }, + { + "acc": 0.73609319, + "epoch": 1.745970273206319, + "grad_norm": 8.0625, + "learning_rate": 4.3459503543422766e-07, + "loss": 1.05368185, + "memory(GiB)": 302.58, + "step": 312200, + "train_speed(iter/s)": 0.123345 + }, + { + "acc": 0.758954, + "epoch": 1.7460821226792982, + "grad_norm": 7.90625, + "learning_rate": 4.342180389398848e-07, + "loss": 0.94460192, + "memory(GiB)": 302.58, + "step": 312220, + "train_speed(iter/s)": 0.123349 + }, + { + "acc": 0.74781332, + "epoch": 1.7461939721522775, + "grad_norm": 7.34375, + "learning_rate": 4.338411986097618e-07, + "loss": 1.02296553, + "memory(GiB)": 302.58, + "step": 312240, + "train_speed(iter/s)": 0.123353 + }, + { + "acc": 0.74464111, + "epoch": 1.7463058216252567, + "grad_norm": 6.90625, + "learning_rate": 4.3346451445674953e-07, + "loss": 0.99098158, + "memory(GiB)": 302.58, + "step": 312260, + "train_speed(iter/s)": 0.123357 + }, + { + "acc": 0.75321808, + "epoch": 1.746417671098236, + "grad_norm": 10.5, + "learning_rate": 4.33087986493731e-07, + "loss": 0.97755642, + "memory(GiB)": 302.58, + "step": 312280, + "train_speed(iter/s)": 0.123361 + }, + { + "acc": 0.75350366, + "epoch": 1.7465295205712152, + "grad_norm": 6.03125, + "learning_rate": 4.327116147335847e-07, + "loss": 0.98256721, + "memory(GiB)": 302.58, + "step": 312300, + "train_speed(iter/s)": 0.123364 + }, + { + "acc": 0.7558744, + "epoch": 1.7466413700441945, + "grad_norm": 11.6875, + "learning_rate": 4.3233539918918375e-07, + "loss": 0.96668406, + "memory(GiB)": 302.58, + "step": 312320, + "train_speed(iter/s)": 0.123368 + }, + { + "acc": 0.75355682, + "epoch": 1.7467532195171738, + "grad_norm": 6.1875, + "learning_rate": 4.319593398733957e-07, + "loss": 0.97013884, + "memory(GiB)": 302.58, + "step": 312340, + "train_speed(iter/s)": 0.123371 + }, + { + "acc": 0.73213139, + "epoch": 1.746865068990153, + "grad_norm": 5.90625, + "learning_rate": 4.3158343679908286e-07, + "loss": 1.04594259, + "memory(GiB)": 302.58, + "step": 312360, + "train_speed(iter/s)": 0.123375 + }, + { + "acc": 0.73782005, + "epoch": 1.7469769184631323, + "grad_norm": 6.28125, + "learning_rate": 4.312076899791029e-07, + "loss": 1.04078836, + "memory(GiB)": 302.58, + "step": 312380, + "train_speed(iter/s)": 0.123379 + }, + { + "acc": 0.75669098, + "epoch": 1.7470887679361116, + "grad_norm": 5.28125, + "learning_rate": 4.308320994263071e-07, + "loss": 0.96239395, + "memory(GiB)": 302.58, + "step": 312400, + "train_speed(iter/s)": 0.123383 + }, + { + "acc": 0.75187206, + "epoch": 1.7472006174090908, + "grad_norm": 8.25, + "learning_rate": 4.304566651535408e-07, + "loss": 0.97369156, + "memory(GiB)": 302.58, + "step": 312420, + "train_speed(iter/s)": 0.123387 + }, + { + "acc": 0.74233332, + "epoch": 1.74731246688207, + "grad_norm": 6.78125, + "learning_rate": 4.30081387173647e-07, + "loss": 1.01733313, + "memory(GiB)": 302.58, + "step": 312440, + "train_speed(iter/s)": 0.123391 + }, + { + "acc": 0.76653929, + "epoch": 1.7474243163550494, + "grad_norm": 6.65625, + "learning_rate": 4.2970626549946103e-07, + "loss": 0.93795671, + "memory(GiB)": 302.58, + "step": 312460, + "train_speed(iter/s)": 0.123394 + }, + { + "acc": 0.74971108, + "epoch": 1.7475361658280286, + "grad_norm": 6.0, + "learning_rate": 4.2933130014381264e-07, + "loss": 0.96927567, + "memory(GiB)": 302.58, + "step": 312480, + "train_speed(iter/s)": 0.123398 + }, + { + "acc": 0.75490451, + "epoch": 1.7476480153010079, + "grad_norm": 5.875, + "learning_rate": 4.2895649111952654e-07, + "loss": 0.94739304, + "memory(GiB)": 302.58, + "step": 312500, + "train_speed(iter/s)": 0.123401 + }, + { + "acc": 0.74804072, + "epoch": 1.7477598647739871, + "grad_norm": 9.0, + "learning_rate": 4.285818384394236e-07, + "loss": 0.99709539, + "memory(GiB)": 302.58, + "step": 312520, + "train_speed(iter/s)": 0.123405 + }, + { + "acc": 0.76073189, + "epoch": 1.7478717142469664, + "grad_norm": 8.875, + "learning_rate": 4.282073421163169e-07, + "loss": 0.94053879, + "memory(GiB)": 302.58, + "step": 312540, + "train_speed(iter/s)": 0.123409 + }, + { + "acc": 0.74789438, + "epoch": 1.7479835637199457, + "grad_norm": 5.53125, + "learning_rate": 4.278330021630156e-07, + "loss": 0.99969168, + "memory(GiB)": 302.58, + "step": 312560, + "train_speed(iter/s)": 0.123412 + }, + { + "acc": 0.75898099, + "epoch": 1.748095413192925, + "grad_norm": 5.25, + "learning_rate": 4.27458818592324e-07, + "loss": 0.95358543, + "memory(GiB)": 302.58, + "step": 312580, + "train_speed(iter/s)": 0.123416 + }, + { + "acc": 0.75801654, + "epoch": 1.7482072626659042, + "grad_norm": 10.5, + "learning_rate": 4.2708479141704006e-07, + "loss": 0.9543478, + "memory(GiB)": 302.58, + "step": 312600, + "train_speed(iter/s)": 0.12342 + }, + { + "acc": 0.77016964, + "epoch": 1.7483191121388835, + "grad_norm": 7.0625, + "learning_rate": 4.2671092064995643e-07, + "loss": 0.91114798, + "memory(GiB)": 302.58, + "step": 312620, + "train_speed(iter/s)": 0.123423 + }, + { + "acc": 0.75428391, + "epoch": 1.7484309616118627, + "grad_norm": 7.75, + "learning_rate": 4.2633720630386065e-07, + "loss": 0.9565258, + "memory(GiB)": 302.58, + "step": 312640, + "train_speed(iter/s)": 0.123427 + }, + { + "acc": 0.76231661, + "epoch": 1.748542811084842, + "grad_norm": 6.9375, + "learning_rate": 4.2596364839153524e-07, + "loss": 0.93948526, + "memory(GiB)": 302.58, + "step": 312660, + "train_speed(iter/s)": 0.123431 + }, + { + "acc": 0.76547809, + "epoch": 1.7486546605578213, + "grad_norm": 7.96875, + "learning_rate": 4.255902469257567e-07, + "loss": 0.9271965, + "memory(GiB)": 302.58, + "step": 312680, + "train_speed(iter/s)": 0.123434 + }, + { + "acc": 0.75913105, + "epoch": 1.7487665100308005, + "grad_norm": 5.53125, + "learning_rate": 4.2521700191929636e-07, + "loss": 0.95257626, + "memory(GiB)": 302.58, + "step": 312700, + "train_speed(iter/s)": 0.123438 + }, + { + "acc": 0.76561074, + "epoch": 1.7488783595037798, + "grad_norm": 7.125, + "learning_rate": 4.2484391338492136e-07, + "loss": 0.90780554, + "memory(GiB)": 302.58, + "step": 312720, + "train_speed(iter/s)": 0.123442 + }, + { + "acc": 0.76269445, + "epoch": 1.748990208976759, + "grad_norm": 5.9375, + "learning_rate": 4.24470981335392e-07, + "loss": 0.92319469, + "memory(GiB)": 302.58, + "step": 312740, + "train_speed(iter/s)": 0.123446 + }, + { + "acc": 0.75159783, + "epoch": 1.7491020584497383, + "grad_norm": 9.1875, + "learning_rate": 4.240982057834636e-07, + "loss": 0.97312088, + "memory(GiB)": 302.58, + "step": 312760, + "train_speed(iter/s)": 0.12345 + }, + { + "acc": 0.77167358, + "epoch": 1.7492139079227176, + "grad_norm": 3.640625, + "learning_rate": 4.237255867418866e-07, + "loss": 0.90018835, + "memory(GiB)": 302.58, + "step": 312780, + "train_speed(iter/s)": 0.123453 + }, + { + "acc": 0.74164681, + "epoch": 1.7493257573956968, + "grad_norm": 5.59375, + "learning_rate": 4.2335312422340514e-07, + "loss": 1.03786097, + "memory(GiB)": 302.58, + "step": 312800, + "train_speed(iter/s)": 0.123457 + }, + { + "acc": 0.73800573, + "epoch": 1.749437606868676, + "grad_norm": 9.0625, + "learning_rate": 4.229808182407591e-07, + "loss": 1.03565187, + "memory(GiB)": 302.58, + "step": 312820, + "train_speed(iter/s)": 0.123461 + }, + { + "acc": 0.73584352, + "epoch": 1.7495494563416554, + "grad_norm": 7.6875, + "learning_rate": 4.226086688066827e-07, + "loss": 1.04252396, + "memory(GiB)": 302.58, + "step": 312840, + "train_speed(iter/s)": 0.123465 + }, + { + "acc": 0.75188689, + "epoch": 1.7496613058146346, + "grad_norm": 9.0, + "learning_rate": 4.222366759339042e-07, + "loss": 0.97790222, + "memory(GiB)": 302.58, + "step": 312860, + "train_speed(iter/s)": 0.123469 + }, + { + "acc": 0.74275589, + "epoch": 1.749773155287614, + "grad_norm": 6.0, + "learning_rate": 4.2186483963514657e-07, + "loss": 1.01195183, + "memory(GiB)": 302.58, + "step": 312880, + "train_speed(iter/s)": 0.123472 + }, + { + "acc": 0.74868197, + "epoch": 1.7498850047605932, + "grad_norm": 7.375, + "learning_rate": 4.2149315992312924e-07, + "loss": 0.98633213, + "memory(GiB)": 302.58, + "step": 312900, + "train_speed(iter/s)": 0.123476 + }, + { + "acc": 0.75143766, + "epoch": 1.7499968542335724, + "grad_norm": 10.25, + "learning_rate": 4.2112163681056305e-07, + "loss": 0.9792531, + "memory(GiB)": 302.58, + "step": 312920, + "train_speed(iter/s)": 0.12348 + }, + { + "acc": 0.73936963, + "epoch": 1.7501087037065517, + "grad_norm": 7.34375, + "learning_rate": 4.207502703101568e-07, + "loss": 1.02734356, + "memory(GiB)": 302.58, + "step": 312940, + "train_speed(iter/s)": 0.123483 + }, + { + "acc": 0.74840069, + "epoch": 1.750220553179531, + "grad_norm": 8.375, + "learning_rate": 4.203790604346103e-07, + "loss": 0.98783264, + "memory(GiB)": 302.58, + "step": 312960, + "train_speed(iter/s)": 0.123487 + }, + { + "acc": 0.75877967, + "epoch": 1.7503324026525102, + "grad_norm": 6.125, + "learning_rate": 4.2000800719662227e-07, + "loss": 0.9425025, + "memory(GiB)": 302.58, + "step": 312980, + "train_speed(iter/s)": 0.123491 + }, + { + "acc": 0.75883455, + "epoch": 1.7504442521254895, + "grad_norm": 6.25, + "learning_rate": 4.1963711060888367e-07, + "loss": 0.93816633, + "memory(GiB)": 302.58, + "step": 313000, + "train_speed(iter/s)": 0.123495 + }, + { + "acc": 0.73856673, + "epoch": 1.7505561015984688, + "grad_norm": 9.625, + "learning_rate": 4.1926637068407994e-07, + "loss": 1.03293648, + "memory(GiB)": 302.58, + "step": 313020, + "train_speed(iter/s)": 0.123498 + }, + { + "acc": 0.76346788, + "epoch": 1.750667951071448, + "grad_norm": 7.0, + "learning_rate": 4.188957874348909e-07, + "loss": 0.93174734, + "memory(GiB)": 302.58, + "step": 313040, + "train_speed(iter/s)": 0.123502 + }, + { + "acc": 0.76086802, + "epoch": 1.7507798005444273, + "grad_norm": 6.4375, + "learning_rate": 4.1852536087399256e-07, + "loss": 0.92483711, + "memory(GiB)": 302.58, + "step": 313060, + "train_speed(iter/s)": 0.123506 + }, + { + "acc": 0.76318774, + "epoch": 1.7508916500174065, + "grad_norm": 5.5, + "learning_rate": 4.1815509101405414e-07, + "loss": 0.94410076, + "memory(GiB)": 302.58, + "step": 313080, + "train_speed(iter/s)": 0.12351 + }, + { + "acc": 0.76240568, + "epoch": 1.7510034994903858, + "grad_norm": 8.875, + "learning_rate": 4.1778497786774007e-07, + "loss": 0.93198175, + "memory(GiB)": 302.58, + "step": 313100, + "train_speed(iter/s)": 0.123513 + }, + { + "acc": 0.74754848, + "epoch": 1.751115348963365, + "grad_norm": 8.625, + "learning_rate": 4.174150214477102e-07, + "loss": 0.98703222, + "memory(GiB)": 302.58, + "step": 313120, + "train_speed(iter/s)": 0.123517 + }, + { + "acc": 0.743226, + "epoch": 1.7512271984363443, + "grad_norm": 6.78125, + "learning_rate": 4.170452217666171e-07, + "loss": 0.9963974, + "memory(GiB)": 302.58, + "step": 313140, + "train_speed(iter/s)": 0.123521 + }, + { + "acc": 0.75202827, + "epoch": 1.7513390479093236, + "grad_norm": 9.125, + "learning_rate": 4.1667557883710963e-07, + "loss": 0.97733641, + "memory(GiB)": 302.58, + "step": 313160, + "train_speed(iter/s)": 0.123525 + }, + { + "acc": 0.73700924, + "epoch": 1.7514508973823029, + "grad_norm": 8.875, + "learning_rate": 4.1630609267183097e-07, + "loss": 1.03489094, + "memory(GiB)": 302.58, + "step": 313180, + "train_speed(iter/s)": 0.123529 + }, + { + "acc": 0.7293612, + "epoch": 1.7515627468552821, + "grad_norm": 6.4375, + "learning_rate": 4.159367632834177e-07, + "loss": 1.08476629, + "memory(GiB)": 302.58, + "step": 313200, + "train_speed(iter/s)": 0.123533 + }, + { + "acc": 0.74705677, + "epoch": 1.7516745963282614, + "grad_norm": 5.75, + "learning_rate": 4.155675906845036e-07, + "loss": 0.9742672, + "memory(GiB)": 302.58, + "step": 313220, + "train_speed(iter/s)": 0.123536 + }, + { + "acc": 0.75991468, + "epoch": 1.7517864458012407, + "grad_norm": 6.40625, + "learning_rate": 4.1519857488771353e-07, + "loss": 0.94565573, + "memory(GiB)": 302.58, + "step": 313240, + "train_speed(iter/s)": 0.12354 + }, + { + "acc": 0.75278139, + "epoch": 1.75189829527422, + "grad_norm": 8.8125, + "learning_rate": 4.1482971590567133e-07, + "loss": 0.9906415, + "memory(GiB)": 302.58, + "step": 313260, + "train_speed(iter/s)": 0.123544 + }, + { + "acc": 0.75558114, + "epoch": 1.7520101447471992, + "grad_norm": 7.28125, + "learning_rate": 4.144610137509919e-07, + "loss": 0.95357847, + "memory(GiB)": 302.58, + "step": 313280, + "train_speed(iter/s)": 0.123547 + }, + { + "acc": 0.76622806, + "epoch": 1.7521219942201784, + "grad_norm": 6.28125, + "learning_rate": 4.140924684362857e-07, + "loss": 0.89543209, + "memory(GiB)": 302.58, + "step": 313300, + "train_speed(iter/s)": 0.123551 + }, + { + "acc": 0.76667347, + "epoch": 1.7522338436931577, + "grad_norm": 7.40625, + "learning_rate": 4.137240799741593e-07, + "loss": 0.90525723, + "memory(GiB)": 302.58, + "step": 313320, + "train_speed(iter/s)": 0.123555 + }, + { + "acc": 0.74653492, + "epoch": 1.752345693166137, + "grad_norm": 6.09375, + "learning_rate": 4.1335584837721153e-07, + "loss": 0.98672628, + "memory(GiB)": 302.58, + "step": 313340, + "train_speed(iter/s)": 0.123559 + }, + { + "acc": 0.75872359, + "epoch": 1.7524575426391162, + "grad_norm": 6.40625, + "learning_rate": 4.129877736580379e-07, + "loss": 0.93291464, + "memory(GiB)": 302.58, + "step": 313360, + "train_speed(iter/s)": 0.123562 + }, + { + "acc": 0.73329234, + "epoch": 1.7525693921120955, + "grad_norm": 8.375, + "learning_rate": 4.1261985582922715e-07, + "loss": 1.07976646, + "memory(GiB)": 302.58, + "step": 313380, + "train_speed(iter/s)": 0.123566 + }, + { + "acc": 0.78151145, + "epoch": 1.7526812415850748, + "grad_norm": 9.0, + "learning_rate": 4.122520949033637e-07, + "loss": 0.83357944, + "memory(GiB)": 302.58, + "step": 313400, + "train_speed(iter/s)": 0.123569 + }, + { + "acc": 0.76023593, + "epoch": 1.752793091058054, + "grad_norm": 5.6875, + "learning_rate": 4.1188449089302587e-07, + "loss": 0.95731936, + "memory(GiB)": 302.58, + "step": 313420, + "train_speed(iter/s)": 0.123573 + }, + { + "acc": 0.75084062, + "epoch": 1.7529049405310333, + "grad_norm": 7.84375, + "learning_rate": 4.115170438107874e-07, + "loss": 0.98605862, + "memory(GiB)": 302.58, + "step": 313440, + "train_speed(iter/s)": 0.123577 + }, + { + "acc": 0.75117512, + "epoch": 1.7530167900040126, + "grad_norm": 7.71875, + "learning_rate": 4.1114975366921493e-07, + "loss": 0.99317741, + "memory(GiB)": 302.58, + "step": 313460, + "train_speed(iter/s)": 0.12358 + }, + { + "acc": 0.74255476, + "epoch": 1.7531286394769918, + "grad_norm": 7.0, + "learning_rate": 4.1078262048087236e-07, + "loss": 1.00947485, + "memory(GiB)": 302.58, + "step": 313480, + "train_speed(iter/s)": 0.123584 + }, + { + "acc": 0.75350432, + "epoch": 1.753240488949971, + "grad_norm": 9.6875, + "learning_rate": 4.1041564425831516e-07, + "loss": 0.97194681, + "memory(GiB)": 302.58, + "step": 313500, + "train_speed(iter/s)": 0.123588 + }, + { + "acc": 0.75648112, + "epoch": 1.7533523384229504, + "grad_norm": 9.375, + "learning_rate": 4.1004882501409714e-07, + "loss": 0.97139149, + "memory(GiB)": 302.58, + "step": 313520, + "train_speed(iter/s)": 0.123592 + }, + { + "acc": 0.7472086, + "epoch": 1.7534641878959296, + "grad_norm": 7.625, + "learning_rate": 4.0968216276076334e-07, + "loss": 0.99485636, + "memory(GiB)": 302.58, + "step": 313540, + "train_speed(iter/s)": 0.123595 + }, + { + "acc": 0.75665069, + "epoch": 1.7535760373689089, + "grad_norm": 9.4375, + "learning_rate": 4.0931565751085535e-07, + "loss": 0.96570883, + "memory(GiB)": 302.58, + "step": 313560, + "train_speed(iter/s)": 0.123599 + }, + { + "acc": 0.75086536, + "epoch": 1.7536878868418881, + "grad_norm": 8.4375, + "learning_rate": 4.0894930927690704e-07, + "loss": 0.97488117, + "memory(GiB)": 302.58, + "step": 313580, + "train_speed(iter/s)": 0.123603 + }, + { + "acc": 0.73259153, + "epoch": 1.7537997363148674, + "grad_norm": 6.34375, + "learning_rate": 4.085831180714517e-07, + "loss": 1.07261086, + "memory(GiB)": 302.58, + "step": 313600, + "train_speed(iter/s)": 0.123606 + }, + { + "acc": 0.76078134, + "epoch": 1.7539115857878467, + "grad_norm": 6.75, + "learning_rate": 4.0821708390701267e-07, + "loss": 0.92670803, + "memory(GiB)": 302.58, + "step": 313620, + "train_speed(iter/s)": 0.12361 + }, + { + "acc": 0.73590794, + "epoch": 1.754023435260826, + "grad_norm": 6.59375, + "learning_rate": 4.0785120679610935e-07, + "loss": 1.05228176, + "memory(GiB)": 302.58, + "step": 313640, + "train_speed(iter/s)": 0.123614 + }, + { + "acc": 0.75908203, + "epoch": 1.7541352847338052, + "grad_norm": 10.3125, + "learning_rate": 4.074854867512562e-07, + "loss": 0.9459837, + "memory(GiB)": 302.58, + "step": 313660, + "train_speed(iter/s)": 0.123618 + }, + { + "acc": 0.74589758, + "epoch": 1.7542471342067845, + "grad_norm": 5.9375, + "learning_rate": 4.0711992378496157e-07, + "loss": 1.00227575, + "memory(GiB)": 302.58, + "step": 313680, + "train_speed(iter/s)": 0.123621 + }, + { + "acc": 0.74746151, + "epoch": 1.7543589836797637, + "grad_norm": 13.5625, + "learning_rate": 4.067545179097293e-07, + "loss": 0.98927002, + "memory(GiB)": 302.58, + "step": 313700, + "train_speed(iter/s)": 0.123625 + }, + { + "acc": 0.75929823, + "epoch": 1.754470833152743, + "grad_norm": 6.4375, + "learning_rate": 4.063892691380572e-07, + "loss": 0.94637661, + "memory(GiB)": 302.58, + "step": 313720, + "train_speed(iter/s)": 0.123629 + }, + { + "acc": 0.75454221, + "epoch": 1.7545826826257223, + "grad_norm": 6.8125, + "learning_rate": 4.060241774824386e-07, + "loss": 0.96328564, + "memory(GiB)": 302.58, + "step": 313740, + "train_speed(iter/s)": 0.123632 + }, + { + "acc": 0.74383526, + "epoch": 1.7546945320987015, + "grad_norm": 5.9375, + "learning_rate": 4.056592429553596e-07, + "loss": 1.01605883, + "memory(GiB)": 302.58, + "step": 313760, + "train_speed(iter/s)": 0.123636 + }, + { + "acc": 0.76854672, + "epoch": 1.7548063815716808, + "grad_norm": 7.34375, + "learning_rate": 4.0529446556930307e-07, + "loss": 0.90754566, + "memory(GiB)": 302.58, + "step": 313780, + "train_speed(iter/s)": 0.12364 + }, + { + "acc": 0.74799485, + "epoch": 1.75491823104466, + "grad_norm": 8.125, + "learning_rate": 4.049298453367451e-07, + "loss": 0.99700127, + "memory(GiB)": 302.58, + "step": 313800, + "train_speed(iter/s)": 0.123644 + }, + { + "acc": 0.73938451, + "epoch": 1.7550300805176393, + "grad_norm": 7.875, + "learning_rate": 4.045653822701573e-07, + "loss": 1.03884468, + "memory(GiB)": 302.58, + "step": 313820, + "train_speed(iter/s)": 0.123648 + }, + { + "acc": 0.75548434, + "epoch": 1.7551419299906186, + "grad_norm": 4.5, + "learning_rate": 4.0420107638200367e-07, + "loss": 0.96509724, + "memory(GiB)": 302.58, + "step": 313840, + "train_speed(iter/s)": 0.123651 + }, + { + "acc": 0.76648507, + "epoch": 1.7552537794635978, + "grad_norm": 5.375, + "learning_rate": 4.038369276847476e-07, + "loss": 0.92606869, + "memory(GiB)": 302.58, + "step": 313860, + "train_speed(iter/s)": 0.123655 + }, + { + "acc": 0.76179848, + "epoch": 1.755365628936577, + "grad_norm": 8.125, + "learning_rate": 4.0347293619084235e-07, + "loss": 0.91627903, + "memory(GiB)": 302.58, + "step": 313880, + "train_speed(iter/s)": 0.123659 + }, + { + "acc": 0.7512383, + "epoch": 1.7554774784095564, + "grad_norm": 8.3125, + "learning_rate": 4.031091019127381e-07, + "loss": 0.99899168, + "memory(GiB)": 302.58, + "step": 313900, + "train_speed(iter/s)": 0.123662 + }, + { + "acc": 0.76685452, + "epoch": 1.7555893278825356, + "grad_norm": 8.1875, + "learning_rate": 4.027454248628787e-07, + "loss": 0.91680012, + "memory(GiB)": 302.58, + "step": 313920, + "train_speed(iter/s)": 0.123666 + }, + { + "acc": 0.7584177, + "epoch": 1.755701177355515, + "grad_norm": 5.71875, + "learning_rate": 4.023819050537031e-07, + "loss": 0.92428741, + "memory(GiB)": 302.58, + "step": 313940, + "train_speed(iter/s)": 0.12367 + }, + { + "acc": 0.73843451, + "epoch": 1.7558130268284942, + "grad_norm": 6.0625, + "learning_rate": 4.0201854249764525e-07, + "loss": 1.03159456, + "memory(GiB)": 302.58, + "step": 313960, + "train_speed(iter/s)": 0.123674 + }, + { + "acc": 0.7396801, + "epoch": 1.7559248763014734, + "grad_norm": 8.9375, + "learning_rate": 4.01655337207133e-07, + "loss": 1.04187088, + "memory(GiB)": 302.58, + "step": 313980, + "train_speed(iter/s)": 0.123678 + }, + { + "acc": 0.75308056, + "epoch": 1.7560367257744527, + "grad_norm": 8.0625, + "learning_rate": 4.012922891945892e-07, + "loss": 0.98082418, + "memory(GiB)": 302.58, + "step": 314000, + "train_speed(iter/s)": 0.123681 + }, + { + "epoch": 1.7560367257744527, + "eval_acc": 0.7069070910370654, + "eval_loss": 1.011788010597229, + "eval_runtime": 7596.4606, + "eval_samples_per_second": 9.91, + "eval_steps_per_second": 9.91, + "step": 314000 + }, + { + "acc": 0.74858532, + "epoch": 1.756148575247432, + "grad_norm": 6.75, + "learning_rate": 4.0092939847243106e-07, + "loss": 0.98660126, + "memory(GiB)": 302.58, + "step": 314020, + "train_speed(iter/s)": 0.12331 + }, + { + "acc": 0.74677768, + "epoch": 1.7562604247204112, + "grad_norm": 6.96875, + "learning_rate": 4.00566665053071e-07, + "loss": 1.00790911, + "memory(GiB)": 302.58, + "step": 314040, + "train_speed(iter/s)": 0.123313 + }, + { + "acc": 0.75710435, + "epoch": 1.7563722741933905, + "grad_norm": 9.8125, + "learning_rate": 4.00204088948915e-07, + "loss": 0.93953657, + "memory(GiB)": 302.58, + "step": 314060, + "train_speed(iter/s)": 0.123317 + }, + { + "acc": 0.75430689, + "epoch": 1.7564841236663697, + "grad_norm": 4.78125, + "learning_rate": 3.998416701723651e-07, + "loss": 0.96790781, + "memory(GiB)": 302.58, + "step": 314080, + "train_speed(iter/s)": 0.123321 + }, + { + "acc": 0.73843303, + "epoch": 1.756595973139349, + "grad_norm": 7.59375, + "learning_rate": 3.9947940873581667e-07, + "loss": 1.04444475, + "memory(GiB)": 302.58, + "step": 314100, + "train_speed(iter/s)": 0.123324 + }, + { + "acc": 0.74875045, + "epoch": 1.7567078226123283, + "grad_norm": 7.1875, + "learning_rate": 3.9911730465165946e-07, + "loss": 1.00759516, + "memory(GiB)": 302.58, + "step": 314120, + "train_speed(iter/s)": 0.123328 + }, + { + "acc": 0.75337734, + "epoch": 1.7568196720853075, + "grad_norm": 7.78125, + "learning_rate": 3.987553579322806e-07, + "loss": 0.94770727, + "memory(GiB)": 302.58, + "step": 314140, + "train_speed(iter/s)": 0.123332 + }, + { + "acc": 0.75519176, + "epoch": 1.756931521558287, + "grad_norm": 10.4375, + "learning_rate": 3.9839356859005806e-07, + "loss": 0.97124405, + "memory(GiB)": 302.58, + "step": 314160, + "train_speed(iter/s)": 0.123336 + }, + { + "acc": 0.73721843, + "epoch": 1.757043371031266, + "grad_norm": 4.03125, + "learning_rate": 3.9803193663736694e-07, + "loss": 1.03653421, + "memory(GiB)": 302.58, + "step": 314180, + "train_speed(iter/s)": 0.123339 + }, + { + "acc": 0.75079012, + "epoch": 1.7571552205042456, + "grad_norm": 7.6875, + "learning_rate": 3.976704620865762e-07, + "loss": 0.97589836, + "memory(GiB)": 302.58, + "step": 314200, + "train_speed(iter/s)": 0.123343 + }, + { + "acc": 0.75545487, + "epoch": 1.7572670699772246, + "grad_norm": 7.3125, + "learning_rate": 3.9730914495004933e-07, + "loss": 0.96516056, + "memory(GiB)": 302.58, + "step": 314220, + "train_speed(iter/s)": 0.123347 + }, + { + "acc": 0.74532256, + "epoch": 1.757378919450204, + "grad_norm": 9.5, + "learning_rate": 3.9694798524014423e-07, + "loss": 1.0061657, + "memory(GiB)": 302.58, + "step": 314240, + "train_speed(iter/s)": 0.12335 + }, + { + "acc": 0.75667205, + "epoch": 1.7574907689231831, + "grad_norm": 8.375, + "learning_rate": 3.9658698296921426e-07, + "loss": 0.94171267, + "memory(GiB)": 302.58, + "step": 314260, + "train_speed(iter/s)": 0.123354 + }, + { + "acc": 0.75313225, + "epoch": 1.7576026183961626, + "grad_norm": 5.90625, + "learning_rate": 3.962261381496063e-07, + "loss": 0.98006725, + "memory(GiB)": 302.58, + "step": 314280, + "train_speed(iter/s)": 0.123357 + }, + { + "acc": 0.73768959, + "epoch": 1.7577144678691417, + "grad_norm": 4.96875, + "learning_rate": 3.958654507936621e-07, + "loss": 1.03036356, + "memory(GiB)": 302.58, + "step": 314300, + "train_speed(iter/s)": 0.123361 + }, + { + "acc": 0.74241481, + "epoch": 1.7578263173421211, + "grad_norm": 8.875, + "learning_rate": 3.955049209137196e-07, + "loss": 1.01782618, + "memory(GiB)": 302.58, + "step": 314320, + "train_speed(iter/s)": 0.123365 + }, + { + "acc": 0.74634433, + "epoch": 1.7579381668151002, + "grad_norm": 8.5, + "learning_rate": 3.9514454852210895e-07, + "loss": 0.99292183, + "memory(GiB)": 302.58, + "step": 314340, + "train_speed(iter/s)": 0.123368 + }, + { + "acc": 0.75297422, + "epoch": 1.7580500162880797, + "grad_norm": 6.4375, + "learning_rate": 3.947843336311563e-07, + "loss": 0.9479084, + "memory(GiB)": 302.58, + "step": 314360, + "train_speed(iter/s)": 0.123372 + }, + { + "acc": 0.76327243, + "epoch": 1.7581618657610587, + "grad_norm": 8.1875, + "learning_rate": 3.944242762531819e-07, + "loss": 0.92249565, + "memory(GiB)": 302.58, + "step": 314380, + "train_speed(iter/s)": 0.123376 + }, + { + "acc": 0.75315385, + "epoch": 1.7582737152340382, + "grad_norm": 7.40625, + "learning_rate": 3.9406437640050086e-07, + "loss": 1.0107996, + "memory(GiB)": 302.58, + "step": 314400, + "train_speed(iter/s)": 0.123379 + }, + { + "acc": 0.7506846, + "epoch": 1.7583855647070172, + "grad_norm": 6.375, + "learning_rate": 3.9370463408542334e-07, + "loss": 0.99014225, + "memory(GiB)": 302.58, + "step": 314420, + "train_speed(iter/s)": 0.123383 + }, + { + "acc": 0.74202189, + "epoch": 1.7584974141799967, + "grad_norm": 7.78125, + "learning_rate": 3.9334504932025394e-07, + "loss": 1.00892038, + "memory(GiB)": 302.58, + "step": 314440, + "train_speed(iter/s)": 0.123387 + }, + { + "acc": 0.74086018, + "epoch": 1.7586092636529758, + "grad_norm": 7.5, + "learning_rate": 3.9298562211729117e-07, + "loss": 1.02919083, + "memory(GiB)": 302.58, + "step": 314460, + "train_speed(iter/s)": 0.12339 + }, + { + "acc": 0.72881899, + "epoch": 1.7587211131259552, + "grad_norm": 6.125, + "learning_rate": 3.9262635248882853e-07, + "loss": 1.08164644, + "memory(GiB)": 302.58, + "step": 314480, + "train_speed(iter/s)": 0.123394 + }, + { + "acc": 0.76234646, + "epoch": 1.7588329625989343, + "grad_norm": 7.34375, + "learning_rate": 3.9226724044715393e-07, + "loss": 0.95738573, + "memory(GiB)": 302.58, + "step": 314500, + "train_speed(iter/s)": 0.123398 + }, + { + "acc": 0.75939584, + "epoch": 1.7589448120719138, + "grad_norm": 7.25, + "learning_rate": 3.919082860045509e-07, + "loss": 0.91572542, + "memory(GiB)": 302.58, + "step": 314520, + "train_speed(iter/s)": 0.123402 + }, + { + "acc": 0.75201769, + "epoch": 1.7590566615448928, + "grad_norm": 7.28125, + "learning_rate": 3.9154948917329625e-07, + "loss": 0.98229275, + "memory(GiB)": 302.58, + "step": 314540, + "train_speed(iter/s)": 0.123406 + }, + { + "acc": 0.74731236, + "epoch": 1.7591685110178723, + "grad_norm": 7.875, + "learning_rate": 3.9119084996566193e-07, + "loss": 0.9892252, + "memory(GiB)": 302.58, + "step": 314560, + "train_speed(iter/s)": 0.123409 + }, + { + "acc": 0.75145316, + "epoch": 1.7592803604908513, + "grad_norm": 8.25, + "learning_rate": 3.908323683939147e-07, + "loss": 0.95601788, + "memory(GiB)": 302.58, + "step": 314580, + "train_speed(iter/s)": 0.123413 + }, + { + "acc": 0.75645509, + "epoch": 1.7593922099638308, + "grad_norm": 8.25, + "learning_rate": 3.9047404447031587e-07, + "loss": 0.96239414, + "memory(GiB)": 302.58, + "step": 314600, + "train_speed(iter/s)": 0.123416 + }, + { + "acc": 0.74803801, + "epoch": 1.7595040594368099, + "grad_norm": 9.375, + "learning_rate": 3.901158782071207e-07, + "loss": 0.99064713, + "memory(GiB)": 302.58, + "step": 314620, + "train_speed(iter/s)": 0.12342 + }, + { + "acc": 0.753932, + "epoch": 1.7596159089097894, + "grad_norm": 5.75, + "learning_rate": 3.8975786961658044e-07, + "loss": 0.9720212, + "memory(GiB)": 302.58, + "step": 314640, + "train_speed(iter/s)": 0.123424 + }, + { + "acc": 0.75067215, + "epoch": 1.7597277583827684, + "grad_norm": 7.8125, + "learning_rate": 3.8940001871093867e-07, + "loss": 0.97483835, + "memory(GiB)": 302.58, + "step": 314660, + "train_speed(iter/s)": 0.123428 + }, + { + "acc": 0.73662009, + "epoch": 1.759839607855748, + "grad_norm": 8.1875, + "learning_rate": 3.8904232550243724e-07, + "loss": 1.02358551, + "memory(GiB)": 302.58, + "step": 314680, + "train_speed(iter/s)": 0.123431 + }, + { + "acc": 0.75452561, + "epoch": 1.759951457328727, + "grad_norm": 6.9375, + "learning_rate": 3.8868479000330917e-07, + "loss": 0.98020649, + "memory(GiB)": 302.58, + "step": 314700, + "train_speed(iter/s)": 0.123435 + }, + { + "acc": 0.74674549, + "epoch": 1.7600633068017064, + "grad_norm": 7.09375, + "learning_rate": 3.8832741222578295e-07, + "loss": 0.99994125, + "memory(GiB)": 302.58, + "step": 314720, + "train_speed(iter/s)": 0.123438 + }, + { + "acc": 0.76005893, + "epoch": 1.7601751562746855, + "grad_norm": 8.3125, + "learning_rate": 3.8797019218208276e-07, + "loss": 0.95209103, + "memory(GiB)": 302.58, + "step": 314740, + "train_speed(iter/s)": 0.123442 + }, + { + "acc": 0.75917053, + "epoch": 1.760287005747665, + "grad_norm": 9.75, + "learning_rate": 3.8761312988442546e-07, + "loss": 0.94756079, + "memory(GiB)": 302.58, + "step": 314760, + "train_speed(iter/s)": 0.123446 + }, + { + "acc": 0.75473495, + "epoch": 1.760398855220644, + "grad_norm": 7.28125, + "learning_rate": 3.872562253450257e-07, + "loss": 0.98654766, + "memory(GiB)": 302.58, + "step": 314780, + "train_speed(iter/s)": 0.123449 + }, + { + "acc": 0.74085131, + "epoch": 1.7605107046936235, + "grad_norm": 7.03125, + "learning_rate": 3.8689947857608936e-07, + "loss": 1.02675982, + "memory(GiB)": 302.58, + "step": 314800, + "train_speed(iter/s)": 0.123453 + }, + { + "acc": 0.75105171, + "epoch": 1.7606225541666025, + "grad_norm": 8.5, + "learning_rate": 3.8654288958981823e-07, + "loss": 0.99590883, + "memory(GiB)": 302.58, + "step": 314820, + "train_speed(iter/s)": 0.123457 + }, + { + "acc": 0.75741825, + "epoch": 1.760734403639582, + "grad_norm": 6.625, + "learning_rate": 3.8618645839840984e-07, + "loss": 0.95544558, + "memory(GiB)": 302.58, + "step": 314840, + "train_speed(iter/s)": 0.12346 + }, + { + "acc": 0.74765992, + "epoch": 1.760846253112561, + "grad_norm": 7.1875, + "learning_rate": 3.8583018501405444e-07, + "loss": 0.99705906, + "memory(GiB)": 302.58, + "step": 314860, + "train_speed(iter/s)": 0.123464 + }, + { + "acc": 0.74977121, + "epoch": 1.7609581025855405, + "grad_norm": 6.28125, + "learning_rate": 3.8547406944893727e-07, + "loss": 0.99245119, + "memory(GiB)": 302.58, + "step": 314880, + "train_speed(iter/s)": 0.123468 + }, + { + "acc": 0.73517022, + "epoch": 1.7610699520585196, + "grad_norm": 11.0, + "learning_rate": 3.8511811171523973e-07, + "loss": 1.04604206, + "memory(GiB)": 302.58, + "step": 314900, + "train_speed(iter/s)": 0.123471 + }, + { + "acc": 0.74372659, + "epoch": 1.761181801531499, + "grad_norm": 4.96875, + "learning_rate": 3.8476231182513644e-07, + "loss": 1.00583582, + "memory(GiB)": 302.58, + "step": 314920, + "train_speed(iter/s)": 0.123475 + }, + { + "acc": 0.74081039, + "epoch": 1.761293651004478, + "grad_norm": 7.65625, + "learning_rate": 3.844066697907961e-07, + "loss": 1.01431503, + "memory(GiB)": 302.58, + "step": 314940, + "train_speed(iter/s)": 0.123478 + }, + { + "acc": 0.74268684, + "epoch": 1.7614055004774576, + "grad_norm": 9.0625, + "learning_rate": 3.8405118562438336e-07, + "loss": 1.03047533, + "memory(GiB)": 302.58, + "step": 314960, + "train_speed(iter/s)": 0.123482 + }, + { + "acc": 0.73947592, + "epoch": 1.7615173499504366, + "grad_norm": 7.71875, + "learning_rate": 3.8369585933805687e-07, + "loss": 1.03511343, + "memory(GiB)": 302.58, + "step": 314980, + "train_speed(iter/s)": 0.123486 + }, + { + "acc": 0.75091934, + "epoch": 1.7616291994234161, + "grad_norm": 6.4375, + "learning_rate": 3.833406909439691e-07, + "loss": 0.96965275, + "memory(GiB)": 302.58, + "step": 315000, + "train_speed(iter/s)": 0.123489 + }, + { + "acc": 0.73110485, + "epoch": 1.7617410488963952, + "grad_norm": 5.34375, + "learning_rate": 3.8298568045426977e-07, + "loss": 1.06513653, + "memory(GiB)": 302.58, + "step": 315020, + "train_speed(iter/s)": 0.123493 + }, + { + "acc": 0.76743493, + "epoch": 1.7618528983693746, + "grad_norm": 7.4375, + "learning_rate": 3.8263082788109973e-07, + "loss": 0.9113596, + "memory(GiB)": 302.58, + "step": 315040, + "train_speed(iter/s)": 0.123497 + }, + { + "acc": 0.7442451, + "epoch": 1.7619647478423537, + "grad_norm": 7.28125, + "learning_rate": 3.8227613323659706e-07, + "loss": 1.01869726, + "memory(GiB)": 302.58, + "step": 315060, + "train_speed(iter/s)": 0.1235 + }, + { + "acc": 0.73239017, + "epoch": 1.7620765973153332, + "grad_norm": 8.5, + "learning_rate": 3.819215965328932e-07, + "loss": 1.05393448, + "memory(GiB)": 302.58, + "step": 315080, + "train_speed(iter/s)": 0.123504 + }, + { + "acc": 0.74396472, + "epoch": 1.7621884467883122, + "grad_norm": 6.03125, + "learning_rate": 3.815672177821139e-07, + "loss": 1.01885109, + "memory(GiB)": 302.58, + "step": 315100, + "train_speed(iter/s)": 0.123508 + }, + { + "acc": 0.7452054, + "epoch": 1.7623002962612917, + "grad_norm": 7.65625, + "learning_rate": 3.812129969963807e-07, + "loss": 1.0267314, + "memory(GiB)": 302.58, + "step": 315120, + "train_speed(iter/s)": 0.123512 + }, + { + "acc": 0.74988604, + "epoch": 1.7624121457342707, + "grad_norm": 6.25, + "learning_rate": 3.8085893418780884e-07, + "loss": 0.96432571, + "memory(GiB)": 302.58, + "step": 315140, + "train_speed(iter/s)": 0.123516 + }, + { + "acc": 0.75375977, + "epoch": 1.7625239952072502, + "grad_norm": 6.90625, + "learning_rate": 3.8050502936850807e-07, + "loss": 0.96102486, + "memory(GiB)": 302.58, + "step": 315160, + "train_speed(iter/s)": 0.12352 + }, + { + "acc": 0.74196415, + "epoch": 1.7626358446802293, + "grad_norm": 7.21875, + "learning_rate": 3.8015128255058377e-07, + "loss": 1.00627813, + "memory(GiB)": 302.58, + "step": 315180, + "train_speed(iter/s)": 0.123523 + }, + { + "acc": 0.76221218, + "epoch": 1.7627476941532088, + "grad_norm": 4.9375, + "learning_rate": 3.7979769374613395e-07, + "loss": 0.94203358, + "memory(GiB)": 302.58, + "step": 315200, + "train_speed(iter/s)": 0.123527 + }, + { + "acc": 0.77439556, + "epoch": 1.7628595436261878, + "grad_norm": 6.875, + "learning_rate": 3.79444262967254e-07, + "loss": 0.88216047, + "memory(GiB)": 302.58, + "step": 315220, + "train_speed(iter/s)": 0.123531 + }, + { + "acc": 0.75656686, + "epoch": 1.7629713930991673, + "grad_norm": 7.46875, + "learning_rate": 3.7909099022603146e-07, + "loss": 0.98581724, + "memory(GiB)": 302.58, + "step": 315240, + "train_speed(iter/s)": 0.123534 + }, + { + "acc": 0.73789673, + "epoch": 1.7630832425721463, + "grad_norm": 8.6875, + "learning_rate": 3.7873787553454945e-07, + "loss": 1.02411776, + "memory(GiB)": 302.58, + "step": 315260, + "train_speed(iter/s)": 0.123538 + }, + { + "acc": 0.74044061, + "epoch": 1.7631950920451258, + "grad_norm": 7.28125, + "learning_rate": 3.7838491890488496e-07, + "loss": 1.03616314, + "memory(GiB)": 302.58, + "step": 315280, + "train_speed(iter/s)": 0.123541 + }, + { + "acc": 0.75492835, + "epoch": 1.7633069415181049, + "grad_norm": 9.9375, + "learning_rate": 3.7803212034911164e-07, + "loss": 0.95549622, + "memory(GiB)": 302.58, + "step": 315300, + "train_speed(iter/s)": 0.123545 + }, + { + "acc": 0.76271429, + "epoch": 1.7634187909910843, + "grad_norm": 8.9375, + "learning_rate": 3.77679479879296e-07, + "loss": 0.94058123, + "memory(GiB)": 302.58, + "step": 315320, + "train_speed(iter/s)": 0.123549 + }, + { + "acc": 0.75597029, + "epoch": 1.7635306404640634, + "grad_norm": 9.375, + "learning_rate": 3.773269975074989e-07, + "loss": 0.95254345, + "memory(GiB)": 302.58, + "step": 315340, + "train_speed(iter/s)": 0.123552 + }, + { + "acc": 0.7692287, + "epoch": 1.7636424899370429, + "grad_norm": 6.375, + "learning_rate": 3.769746732457763e-07, + "loss": 0.90106163, + "memory(GiB)": 302.58, + "step": 315360, + "train_speed(iter/s)": 0.123556 + }, + { + "acc": 0.76758828, + "epoch": 1.763754339410022, + "grad_norm": 7.46875, + "learning_rate": 3.7662250710617964e-07, + "loss": 0.89331474, + "memory(GiB)": 302.58, + "step": 315380, + "train_speed(iter/s)": 0.123559 + }, + { + "acc": 0.74496298, + "epoch": 1.7638661888830014, + "grad_norm": 9.0625, + "learning_rate": 3.7627049910075317e-07, + "loss": 1.02195749, + "memory(GiB)": 302.58, + "step": 315400, + "train_speed(iter/s)": 0.123563 + }, + { + "acc": 0.76229491, + "epoch": 1.7639780383559804, + "grad_norm": 7.03125, + "learning_rate": 3.7591864924153734e-07, + "loss": 0.93944044, + "memory(GiB)": 302.58, + "step": 315420, + "train_speed(iter/s)": 0.123567 + }, + { + "acc": 0.76017332, + "epoch": 1.76408988782896, + "grad_norm": 7.84375, + "learning_rate": 3.7556695754056626e-07, + "loss": 0.93518906, + "memory(GiB)": 302.58, + "step": 315440, + "train_speed(iter/s)": 0.12357 + }, + { + "acc": 0.77315183, + "epoch": 1.764201737301939, + "grad_norm": 7.84375, + "learning_rate": 3.7521542400986875e-07, + "loss": 0.90342588, + "memory(GiB)": 302.58, + "step": 315460, + "train_speed(iter/s)": 0.123574 + }, + { + "acc": 0.74580269, + "epoch": 1.7643135867749185, + "grad_norm": 8.8125, + "learning_rate": 3.7486404866146855e-07, + "loss": 1.0072958, + "memory(GiB)": 302.58, + "step": 315480, + "train_speed(iter/s)": 0.123577 + }, + { + "acc": 0.77302856, + "epoch": 1.7644254362478975, + "grad_norm": 6.90625, + "learning_rate": 3.7451283150738373e-07, + "loss": 0.88380871, + "memory(GiB)": 302.58, + "step": 315500, + "train_speed(iter/s)": 0.12358 + }, + { + "acc": 0.74744444, + "epoch": 1.764537285720877, + "grad_norm": 11.3125, + "learning_rate": 3.741617725596269e-07, + "loss": 0.9881464, + "memory(GiB)": 302.58, + "step": 315520, + "train_speed(iter/s)": 0.123584 + }, + { + "acc": 0.7495554, + "epoch": 1.764649135193856, + "grad_norm": 8.875, + "learning_rate": 3.738108718302053e-07, + "loss": 0.96895237, + "memory(GiB)": 302.58, + "step": 315540, + "train_speed(iter/s)": 0.123588 + }, + { + "acc": 0.74410539, + "epoch": 1.7647609846668355, + "grad_norm": 7.28125, + "learning_rate": 3.7346012933112186e-07, + "loss": 1.00640068, + "memory(GiB)": 302.58, + "step": 315560, + "train_speed(iter/s)": 0.123592 + }, + { + "acc": 0.74421825, + "epoch": 1.7648728341398146, + "grad_norm": 7.84375, + "learning_rate": 3.731095450743721e-07, + "loss": 1.00511007, + "memory(GiB)": 302.58, + "step": 315580, + "train_speed(iter/s)": 0.123595 + }, + { + "acc": 0.7528142, + "epoch": 1.764984683612794, + "grad_norm": 5.53125, + "learning_rate": 3.727591190719476e-07, + "loss": 0.96399364, + "memory(GiB)": 302.58, + "step": 315600, + "train_speed(iter/s)": 0.123599 + }, + { + "acc": 0.73996115, + "epoch": 1.765096533085773, + "grad_norm": 9.5, + "learning_rate": 3.724088513358337e-07, + "loss": 1.01136894, + "memory(GiB)": 302.58, + "step": 315620, + "train_speed(iter/s)": 0.123603 + }, + { + "acc": 0.75594974, + "epoch": 1.7652083825587526, + "grad_norm": 8.875, + "learning_rate": 3.7205874187801084e-07, + "loss": 0.96590872, + "memory(GiB)": 302.58, + "step": 315640, + "train_speed(iter/s)": 0.123606 + }, + { + "acc": 0.75769424, + "epoch": 1.7653202320317316, + "grad_norm": 6.9375, + "learning_rate": 3.7170879071045385e-07, + "loss": 0.94262362, + "memory(GiB)": 302.58, + "step": 315660, + "train_speed(iter/s)": 0.12361 + }, + { + "acc": 0.75393472, + "epoch": 1.765432081504711, + "grad_norm": 7.53125, + "learning_rate": 3.7135899784513207e-07, + "loss": 0.97843933, + "memory(GiB)": 302.58, + "step": 315680, + "train_speed(iter/s)": 0.123613 + }, + { + "acc": 0.75673242, + "epoch": 1.7655439309776901, + "grad_norm": 8.1875, + "learning_rate": 3.7100936329400927e-07, + "loss": 0.95346975, + "memory(GiB)": 302.58, + "step": 315700, + "train_speed(iter/s)": 0.123617 + }, + { + "acc": 0.75068593, + "epoch": 1.7656557804506696, + "grad_norm": 7.3125, + "learning_rate": 3.7065988706904475e-07, + "loss": 0.98007421, + "memory(GiB)": 302.58, + "step": 315720, + "train_speed(iter/s)": 0.123621 + }, + { + "acc": 0.75686078, + "epoch": 1.7657676299236487, + "grad_norm": 8.75, + "learning_rate": 3.703105691821912e-07, + "loss": 0.94045506, + "memory(GiB)": 302.58, + "step": 315740, + "train_speed(iter/s)": 0.123625 + }, + { + "acc": 0.74692011, + "epoch": 1.7658794793966281, + "grad_norm": 5.53125, + "learning_rate": 3.6996140964539627e-07, + "loss": 0.98773041, + "memory(GiB)": 302.58, + "step": 315760, + "train_speed(iter/s)": 0.123628 + }, + { + "acc": 0.74602571, + "epoch": 1.7659913288696072, + "grad_norm": 8.3125, + "learning_rate": 3.696124084706032e-07, + "loss": 0.99822445, + "memory(GiB)": 302.58, + "step": 315780, + "train_speed(iter/s)": 0.123632 + }, + { + "acc": 0.76481762, + "epoch": 1.7661031783425867, + "grad_norm": 6.65625, + "learning_rate": 3.6926356566974797e-07, + "loss": 0.90986233, + "memory(GiB)": 302.58, + "step": 315800, + "train_speed(iter/s)": 0.123636 + }, + { + "acc": 0.7540226, + "epoch": 1.7662150278155657, + "grad_norm": 10.9375, + "learning_rate": 3.689148812547616e-07, + "loss": 0.95951424, + "memory(GiB)": 302.58, + "step": 315820, + "train_speed(iter/s)": 0.123639 + }, + { + "acc": 0.75621204, + "epoch": 1.7663268772885452, + "grad_norm": 7.125, + "learning_rate": 3.6856635523757176e-07, + "loss": 0.93653984, + "memory(GiB)": 302.58, + "step": 315840, + "train_speed(iter/s)": 0.123643 + }, + { + "acc": 0.74375081, + "epoch": 1.7664387267615242, + "grad_norm": 6.34375, + "learning_rate": 3.6821798763009897e-07, + "loss": 1.02372065, + "memory(GiB)": 302.58, + "step": 315860, + "train_speed(iter/s)": 0.123647 + }, + { + "acc": 0.7604425, + "epoch": 1.7665505762345037, + "grad_norm": 7.59375, + "learning_rate": 3.678697784442575e-07, + "loss": 0.94859629, + "memory(GiB)": 302.58, + "step": 315880, + "train_speed(iter/s)": 0.12365 + }, + { + "acc": 0.7414536, + "epoch": 1.7666624257074828, + "grad_norm": 7.28125, + "learning_rate": 3.675217276919579e-07, + "loss": 1.02789707, + "memory(GiB)": 302.58, + "step": 315900, + "train_speed(iter/s)": 0.123654 + }, + { + "acc": 0.76210093, + "epoch": 1.7667742751804623, + "grad_norm": 6.53125, + "learning_rate": 3.6717383538510455e-07, + "loss": 0.92112131, + "memory(GiB)": 302.58, + "step": 315920, + "train_speed(iter/s)": 0.123657 + }, + { + "acc": 0.739468, + "epoch": 1.7668861246534413, + "grad_norm": 5.0625, + "learning_rate": 3.668261015355962e-07, + "loss": 1.01956377, + "memory(GiB)": 302.58, + "step": 315940, + "train_speed(iter/s)": 0.123661 + }, + { + "acc": 0.73097744, + "epoch": 1.7669979741264208, + "grad_norm": 10.875, + "learning_rate": 3.664785261553261e-07, + "loss": 1.0512125, + "memory(GiB)": 302.58, + "step": 315960, + "train_speed(iter/s)": 0.123664 + }, + { + "acc": 0.75977178, + "epoch": 1.7671098235993998, + "grad_norm": 7.6875, + "learning_rate": 3.661311092561837e-07, + "loss": 0.95976562, + "memory(GiB)": 302.58, + "step": 315980, + "train_speed(iter/s)": 0.123668 + }, + { + "acc": 0.7382875, + "epoch": 1.7672216730723793, + "grad_norm": 5.96875, + "learning_rate": 3.6578385085005053e-07, + "loss": 1.02553387, + "memory(GiB)": 302.58, + "step": 316000, + "train_speed(iter/s)": 0.123672 + }, + { + "epoch": 1.7672216730723793, + "eval_acc": 0.7069004359709185, + "eval_loss": 1.0118035078048706, + "eval_runtime": 7596.0503, + "eval_samples_per_second": 9.911, + "eval_steps_per_second": 9.911, + "step": 316000 + }, + { + "acc": 0.75746036, + "epoch": 1.7673335225453584, + "grad_norm": 6.40625, + "learning_rate": 3.6543675094880325e-07, + "loss": 0.95777235, + "memory(GiB)": 302.58, + "step": 316020, + "train_speed(iter/s)": 0.123303 + }, + { + "acc": 0.75413017, + "epoch": 1.7674453720183378, + "grad_norm": 6.96875, + "learning_rate": 3.6508980956431626e-07, + "loss": 0.99049416, + "memory(GiB)": 302.58, + "step": 316040, + "train_speed(iter/s)": 0.123306 + }, + { + "acc": 0.75104938, + "epoch": 1.7675572214913169, + "grad_norm": 5.9375, + "learning_rate": 3.6474302670845453e-07, + "loss": 0.98316031, + "memory(GiB)": 302.58, + "step": 316060, + "train_speed(iter/s)": 0.12331 + }, + { + "acc": 0.74031143, + "epoch": 1.7676690709642964, + "grad_norm": 9.9375, + "learning_rate": 3.6439640239307906e-07, + "loss": 1.02058249, + "memory(GiB)": 302.58, + "step": 316080, + "train_speed(iter/s)": 0.123314 + }, + { + "acc": 0.74880099, + "epoch": 1.7677809204372754, + "grad_norm": 5.53125, + "learning_rate": 3.6404993663004594e-07, + "loss": 0.99809141, + "memory(GiB)": 302.58, + "step": 316100, + "train_speed(iter/s)": 0.123317 + }, + { + "acc": 0.76477876, + "epoch": 1.767892769910255, + "grad_norm": 8.9375, + "learning_rate": 3.637036294312052e-07, + "loss": 0.92714643, + "memory(GiB)": 302.58, + "step": 316120, + "train_speed(iter/s)": 0.123321 + }, + { + "acc": 0.75147638, + "epoch": 1.768004619383234, + "grad_norm": 11.5, + "learning_rate": 3.6335748080840063e-07, + "loss": 0.97605658, + "memory(GiB)": 302.58, + "step": 316140, + "train_speed(iter/s)": 0.123325 + }, + { + "acc": 0.74740634, + "epoch": 1.7681164688562134, + "grad_norm": 7.125, + "learning_rate": 3.630114907734739e-07, + "loss": 1.00619555, + "memory(GiB)": 302.58, + "step": 316160, + "train_speed(iter/s)": 0.123328 + }, + { + "acc": 0.75892472, + "epoch": 1.7682283183291925, + "grad_norm": 6.1875, + "learning_rate": 3.6266565933825714e-07, + "loss": 0.92868214, + "memory(GiB)": 302.58, + "step": 316180, + "train_speed(iter/s)": 0.123332 + }, + { + "acc": 0.75330806, + "epoch": 1.768340167802172, + "grad_norm": 7.1875, + "learning_rate": 3.623199865145799e-07, + "loss": 0.96879578, + "memory(GiB)": 302.58, + "step": 316200, + "train_speed(iter/s)": 0.123336 + }, + { + "acc": 0.76101847, + "epoch": 1.768452017275151, + "grad_norm": 9.25, + "learning_rate": 3.619744723142643e-07, + "loss": 0.94369822, + "memory(GiB)": 302.58, + "step": 316220, + "train_speed(iter/s)": 0.123339 + }, + { + "acc": 0.75316501, + "epoch": 1.7685638667481305, + "grad_norm": 7.1875, + "learning_rate": 3.616291167491287e-07, + "loss": 0.95906677, + "memory(GiB)": 302.58, + "step": 316240, + "train_speed(iter/s)": 0.123343 + }, + { + "acc": 0.73780179, + "epoch": 1.7686757162211095, + "grad_norm": 6.125, + "learning_rate": 3.612839198309853e-07, + "loss": 1.01888351, + "memory(GiB)": 302.58, + "step": 316260, + "train_speed(iter/s)": 0.123346 + }, + { + "acc": 0.73623929, + "epoch": 1.768787565694089, + "grad_norm": 8.6875, + "learning_rate": 3.609388815716408e-07, + "loss": 1.02535172, + "memory(GiB)": 302.58, + "step": 316280, + "train_speed(iter/s)": 0.12335 + }, + { + "acc": 0.74357548, + "epoch": 1.768899415167068, + "grad_norm": 8.8125, + "learning_rate": 3.605940019828963e-07, + "loss": 1.01242008, + "memory(GiB)": 302.58, + "step": 316300, + "train_speed(iter/s)": 0.123354 + }, + { + "acc": 0.74563966, + "epoch": 1.7690112646400475, + "grad_norm": 4.8125, + "learning_rate": 3.6024928107654843e-07, + "loss": 0.98748446, + "memory(GiB)": 302.58, + "step": 316320, + "train_speed(iter/s)": 0.123357 + }, + { + "acc": 0.74498463, + "epoch": 1.7691231141130266, + "grad_norm": 7.25, + "learning_rate": 3.599047188643873e-07, + "loss": 0.99253368, + "memory(GiB)": 302.58, + "step": 316340, + "train_speed(iter/s)": 0.123361 + }, + { + "acc": 0.75564303, + "epoch": 1.769234963586006, + "grad_norm": 8.3125, + "learning_rate": 3.595603153581984e-07, + "loss": 0.9478817, + "memory(GiB)": 302.58, + "step": 316360, + "train_speed(iter/s)": 0.123365 + }, + { + "acc": 0.75645008, + "epoch": 1.7693468130589851, + "grad_norm": 7.53125, + "learning_rate": 3.592160705697606e-07, + "loss": 0.9714426, + "memory(GiB)": 302.58, + "step": 316380, + "train_speed(iter/s)": 0.123368 + }, + { + "acc": 0.75438495, + "epoch": 1.7694586625319646, + "grad_norm": 6.5625, + "learning_rate": 3.5887198451084905e-07, + "loss": 0.96197872, + "memory(GiB)": 302.58, + "step": 316400, + "train_speed(iter/s)": 0.123372 + }, + { + "acc": 0.75833144, + "epoch": 1.7695705120049436, + "grad_norm": 10.625, + "learning_rate": 3.5852805719323147e-07, + "loss": 0.95106926, + "memory(GiB)": 302.58, + "step": 316420, + "train_speed(iter/s)": 0.123376 + }, + { + "acc": 0.7415452, + "epoch": 1.7696823614779231, + "grad_norm": 7.1875, + "learning_rate": 3.581842886286729e-07, + "loss": 1.02738466, + "memory(GiB)": 302.58, + "step": 316440, + "train_speed(iter/s)": 0.12338 + }, + { + "acc": 0.7669302, + "epoch": 1.7697942109509022, + "grad_norm": 7.28125, + "learning_rate": 3.5784067882893016e-07, + "loss": 0.90372639, + "memory(GiB)": 302.58, + "step": 316460, + "train_speed(iter/s)": 0.123384 + }, + { + "acc": 0.74453974, + "epoch": 1.7699060604238817, + "grad_norm": 9.6875, + "learning_rate": 3.5749722780575647e-07, + "loss": 0.99538383, + "memory(GiB)": 302.58, + "step": 316480, + "train_speed(iter/s)": 0.123387 + }, + { + "acc": 0.7639482, + "epoch": 1.7700179098968607, + "grad_norm": 6.1875, + "learning_rate": 3.5715393557089806e-07, + "loss": 0.92278614, + "memory(GiB)": 302.58, + "step": 316500, + "train_speed(iter/s)": 0.123391 + }, + { + "acc": 0.73568668, + "epoch": 1.7701297593698402, + "grad_norm": 7.40625, + "learning_rate": 3.5681080213609773e-07, + "loss": 1.05875273, + "memory(GiB)": 302.58, + "step": 316520, + "train_speed(iter/s)": 0.123394 + }, + { + "acc": 0.75567374, + "epoch": 1.7702416088428192, + "grad_norm": 7.25, + "learning_rate": 3.564678275130906e-07, + "loss": 0.93677025, + "memory(GiB)": 302.58, + "step": 316540, + "train_speed(iter/s)": 0.123398 + }, + { + "acc": 0.72262111, + "epoch": 1.7703534583157987, + "grad_norm": 8.1875, + "learning_rate": 3.561250117136084e-07, + "loss": 1.10371103, + "memory(GiB)": 302.58, + "step": 316560, + "train_speed(iter/s)": 0.123402 + }, + { + "acc": 0.74344749, + "epoch": 1.7704653077887778, + "grad_norm": 8.0, + "learning_rate": 3.5578235474937607e-07, + "loss": 1.00958176, + "memory(GiB)": 302.58, + "step": 316580, + "train_speed(iter/s)": 0.123405 + }, + { + "acc": 0.73314729, + "epoch": 1.7705771572617572, + "grad_norm": 8.8125, + "learning_rate": 3.5543985663211323e-07, + "loss": 1.05086737, + "memory(GiB)": 302.58, + "step": 316600, + "train_speed(iter/s)": 0.123409 + }, + { + "acc": 0.75347066, + "epoch": 1.7706890067347363, + "grad_norm": 8.5625, + "learning_rate": 3.550975173735349e-07, + "loss": 0.97516785, + "memory(GiB)": 302.58, + "step": 316620, + "train_speed(iter/s)": 0.123413 + }, + { + "acc": 0.76931334, + "epoch": 1.7708008562077158, + "grad_norm": 8.4375, + "learning_rate": 3.547553369853507e-07, + "loss": 0.91078539, + "memory(GiB)": 302.58, + "step": 316640, + "train_speed(iter/s)": 0.123417 + }, + { + "acc": 0.76344247, + "epoch": 1.7709127056806948, + "grad_norm": 5.375, + "learning_rate": 3.5441331547926285e-07, + "loss": 0.93486204, + "memory(GiB)": 302.58, + "step": 316660, + "train_speed(iter/s)": 0.12342 + }, + { + "acc": 0.75580578, + "epoch": 1.7710245551536743, + "grad_norm": 6.1875, + "learning_rate": 3.5407145286697085e-07, + "loss": 0.95724506, + "memory(GiB)": 302.58, + "step": 316680, + "train_speed(iter/s)": 0.123424 + }, + { + "acc": 0.75463781, + "epoch": 1.7711364046266533, + "grad_norm": 9.4375, + "learning_rate": 3.53729749160166e-07, + "loss": 0.96251602, + "memory(GiB)": 302.58, + "step": 316700, + "train_speed(iter/s)": 0.123427 + }, + { + "acc": 0.74912038, + "epoch": 1.7712482540996328, + "grad_norm": 6.78125, + "learning_rate": 3.533882043705378e-07, + "loss": 0.98621855, + "memory(GiB)": 302.58, + "step": 316720, + "train_speed(iter/s)": 0.123431 + }, + { + "acc": 0.74669199, + "epoch": 1.7713601035726119, + "grad_norm": 7.03125, + "learning_rate": 3.530468185097663e-07, + "loss": 0.99505186, + "memory(GiB)": 302.58, + "step": 316740, + "train_speed(iter/s)": 0.123435 + }, + { + "acc": 0.73927178, + "epoch": 1.7714719530455914, + "grad_norm": 7.4375, + "learning_rate": 3.5270559158952946e-07, + "loss": 1.02463751, + "memory(GiB)": 302.58, + "step": 316760, + "train_speed(iter/s)": 0.123439 + }, + { + "acc": 0.77098331, + "epoch": 1.7715838025185704, + "grad_norm": 6.03125, + "learning_rate": 3.523645236214973e-07, + "loss": 0.89563408, + "memory(GiB)": 302.58, + "step": 316780, + "train_speed(iter/s)": 0.123443 + }, + { + "acc": 0.74643993, + "epoch": 1.7716956519915499, + "grad_norm": 10.5625, + "learning_rate": 3.520236146173356e-07, + "loss": 0.98608561, + "memory(GiB)": 302.58, + "step": 316800, + "train_speed(iter/s)": 0.123446 + }, + { + "acc": 0.74834356, + "epoch": 1.771807501464529, + "grad_norm": 5.53125, + "learning_rate": 3.5168286458870436e-07, + "loss": 0.99809065, + "memory(GiB)": 302.58, + "step": 316820, + "train_speed(iter/s)": 0.12345 + }, + { + "acc": 0.75663648, + "epoch": 1.7719193509375084, + "grad_norm": 5.0, + "learning_rate": 3.513422735472588e-07, + "loss": 0.96261978, + "memory(GiB)": 302.58, + "step": 316840, + "train_speed(iter/s)": 0.123454 + }, + { + "acc": 0.73956866, + "epoch": 1.7720312004104875, + "grad_norm": 5.0, + "learning_rate": 3.510018415046479e-07, + "loss": 1.03061724, + "memory(GiB)": 302.58, + "step": 316860, + "train_speed(iter/s)": 0.123458 + }, + { + "acc": 0.75752144, + "epoch": 1.772143049883467, + "grad_norm": 6.1875, + "learning_rate": 3.5066156847251565e-07, + "loss": 0.96226864, + "memory(GiB)": 302.58, + "step": 316880, + "train_speed(iter/s)": 0.123461 + }, + { + "acc": 0.76778097, + "epoch": 1.772254899356446, + "grad_norm": 10.3125, + "learning_rate": 3.5032145446250054e-07, + "loss": 0.92495871, + "memory(GiB)": 302.58, + "step": 316900, + "train_speed(iter/s)": 0.123465 + }, + { + "acc": 0.74227939, + "epoch": 1.7723667488294255, + "grad_norm": 7.1875, + "learning_rate": 3.499814994862355e-07, + "loss": 0.99630175, + "memory(GiB)": 302.58, + "step": 316920, + "train_speed(iter/s)": 0.123469 + }, + { + "acc": 0.74258223, + "epoch": 1.7724785983024045, + "grad_norm": 7.28125, + "learning_rate": 3.4964170355534787e-07, + "loss": 1.01124601, + "memory(GiB)": 302.58, + "step": 316940, + "train_speed(iter/s)": 0.123472 + }, + { + "acc": 0.74985418, + "epoch": 1.772590447775384, + "grad_norm": 5.28125, + "learning_rate": 3.4930206668145893e-07, + "loss": 0.99775362, + "memory(GiB)": 302.58, + "step": 316960, + "train_speed(iter/s)": 0.123476 + }, + { + "acc": 0.75718369, + "epoch": 1.772702297248363, + "grad_norm": 8.5625, + "learning_rate": 3.4896258887618717e-07, + "loss": 0.94888601, + "memory(GiB)": 302.58, + "step": 316980, + "train_speed(iter/s)": 0.12348 + }, + { + "acc": 0.74444208, + "epoch": 1.7728141467213425, + "grad_norm": 7.65625, + "learning_rate": 3.486232701511427e-07, + "loss": 1.00630064, + "memory(GiB)": 302.58, + "step": 317000, + "train_speed(iter/s)": 0.123484 + }, + { + "acc": 0.74090772, + "epoch": 1.7729259961943216, + "grad_norm": 7.65625, + "learning_rate": 3.482841105179319e-07, + "loss": 1.02358599, + "memory(GiB)": 302.58, + "step": 317020, + "train_speed(iter/s)": 0.123487 + }, + { + "acc": 0.74789929, + "epoch": 1.773037845667301, + "grad_norm": 6.09375, + "learning_rate": 3.4794510998815423e-07, + "loss": 0.98861284, + "memory(GiB)": 302.58, + "step": 317040, + "train_speed(iter/s)": 0.123491 + }, + { + "acc": 0.74696889, + "epoch": 1.77314969514028, + "grad_norm": 10.3125, + "learning_rate": 3.476062685734055e-07, + "loss": 1.01675282, + "memory(GiB)": 302.58, + "step": 317060, + "train_speed(iter/s)": 0.123495 + }, + { + "acc": 0.74926972, + "epoch": 1.7732615446132596, + "grad_norm": 7.25, + "learning_rate": 3.472675862852748e-07, + "loss": 0.98292999, + "memory(GiB)": 302.58, + "step": 317080, + "train_speed(iter/s)": 0.123498 + }, + { + "acc": 0.76488652, + "epoch": 1.7733733940862386, + "grad_norm": 11.9375, + "learning_rate": 3.469290631353456e-07, + "loss": 0.93179884, + "memory(GiB)": 302.58, + "step": 317100, + "train_speed(iter/s)": 0.123502 + }, + { + "acc": 0.75802188, + "epoch": 1.773485243559218, + "grad_norm": 8.5, + "learning_rate": 3.465906991351975e-07, + "loss": 0.94964256, + "memory(GiB)": 302.58, + "step": 317120, + "train_speed(iter/s)": 0.123506 + }, + { + "acc": 0.75460224, + "epoch": 1.7735970930321971, + "grad_norm": 5.71875, + "learning_rate": 3.462524942964024e-07, + "loss": 0.98859663, + "memory(GiB)": 302.58, + "step": 317140, + "train_speed(iter/s)": 0.123509 + }, + { + "acc": 0.74264188, + "epoch": 1.7737089425051766, + "grad_norm": 7.03125, + "learning_rate": 3.4591444863052883e-07, + "loss": 1.00074148, + "memory(GiB)": 302.58, + "step": 317160, + "train_speed(iter/s)": 0.123513 + }, + { + "acc": 0.74955721, + "epoch": 1.7738207919781557, + "grad_norm": 5.3125, + "learning_rate": 3.4557656214913913e-07, + "loss": 0.98865576, + "memory(GiB)": 302.58, + "step": 317180, + "train_speed(iter/s)": 0.123517 + }, + { + "acc": 0.75581827, + "epoch": 1.7739326414511352, + "grad_norm": 8.875, + "learning_rate": 3.4523883486378974e-07, + "loss": 0.97726202, + "memory(GiB)": 302.58, + "step": 317200, + "train_speed(iter/s)": 0.123521 + }, + { + "acc": 0.73197603, + "epoch": 1.7740444909241142, + "grad_norm": 5.4375, + "learning_rate": 3.4490126678603187e-07, + "loss": 1.0688179, + "memory(GiB)": 302.58, + "step": 317220, + "train_speed(iter/s)": 0.123524 + }, + { + "acc": 0.73705888, + "epoch": 1.7741563403970937, + "grad_norm": 4.84375, + "learning_rate": 3.445638579274108e-07, + "loss": 1.03005447, + "memory(GiB)": 302.58, + "step": 317240, + "train_speed(iter/s)": 0.123528 + }, + { + "acc": 0.76461473, + "epoch": 1.7742681898700727, + "grad_norm": 9.5, + "learning_rate": 3.442266082994683e-07, + "loss": 0.94365282, + "memory(GiB)": 302.58, + "step": 317260, + "train_speed(iter/s)": 0.123531 + }, + { + "acc": 0.75747328, + "epoch": 1.7743800393430522, + "grad_norm": 10.0625, + "learning_rate": 3.4388951791373916e-07, + "loss": 0.95298576, + "memory(GiB)": 302.58, + "step": 317280, + "train_speed(iter/s)": 0.123535 + }, + { + "acc": 0.77819338, + "epoch": 1.7744918888160313, + "grad_norm": 7.71875, + "learning_rate": 3.4355258678175184e-07, + "loss": 0.86482639, + "memory(GiB)": 302.58, + "step": 317300, + "train_speed(iter/s)": 0.123539 + }, + { + "acc": 0.75161114, + "epoch": 1.7746037382890107, + "grad_norm": 10.125, + "learning_rate": 3.432158149150322e-07, + "loss": 0.98226604, + "memory(GiB)": 302.58, + "step": 317320, + "train_speed(iter/s)": 0.123543 + }, + { + "acc": 0.74739704, + "epoch": 1.7747155877619898, + "grad_norm": 7.0, + "learning_rate": 3.428792023250976e-07, + "loss": 0.990658, + "memory(GiB)": 302.58, + "step": 317340, + "train_speed(iter/s)": 0.123546 + }, + { + "acc": 0.75671535, + "epoch": 1.7748274372349693, + "grad_norm": 6.21875, + "learning_rate": 3.425427490234617e-07, + "loss": 0.95021753, + "memory(GiB)": 302.58, + "step": 317360, + "train_speed(iter/s)": 0.12355 + }, + { + "acc": 0.73824501, + "epoch": 1.7749392867079483, + "grad_norm": 5.875, + "learning_rate": 3.4220645502163253e-07, + "loss": 1.02529564, + "memory(GiB)": 302.58, + "step": 317380, + "train_speed(iter/s)": 0.123554 + }, + { + "acc": 0.76464052, + "epoch": 1.7750511361809278, + "grad_norm": 10.5, + "learning_rate": 3.418703203311119e-07, + "loss": 0.92750406, + "memory(GiB)": 302.58, + "step": 317400, + "train_speed(iter/s)": 0.123557 + }, + { + "acc": 0.74517837, + "epoch": 1.7751629856539068, + "grad_norm": 5.875, + "learning_rate": 3.415343449633968e-07, + "loss": 1.00491295, + "memory(GiB)": 302.58, + "step": 317420, + "train_speed(iter/s)": 0.123561 + }, + { + "acc": 0.7326251, + "epoch": 1.7752748351268863, + "grad_norm": 7.71875, + "learning_rate": 3.411985289299785e-07, + "loss": 1.06149645, + "memory(GiB)": 302.58, + "step": 317440, + "train_speed(iter/s)": 0.123565 + }, + { + "acc": 0.7482058, + "epoch": 1.7753866845998654, + "grad_norm": 6.25, + "learning_rate": 3.4086287224234347e-07, + "loss": 0.98884525, + "memory(GiB)": 302.58, + "step": 317460, + "train_speed(iter/s)": 0.123568 + }, + { + "acc": 0.77724771, + "epoch": 1.7754985340728449, + "grad_norm": 9.25, + "learning_rate": 3.405273749119714e-07, + "loss": 0.88380766, + "memory(GiB)": 302.58, + "step": 317480, + "train_speed(iter/s)": 0.123572 + }, + { + "acc": 0.74947252, + "epoch": 1.775610383545824, + "grad_norm": 4.25, + "learning_rate": 3.401920369503386e-07, + "loss": 0.9857502, + "memory(GiB)": 302.58, + "step": 317500, + "train_speed(iter/s)": 0.123576 + }, + { + "acc": 0.73431797, + "epoch": 1.7757222330188034, + "grad_norm": 5.0625, + "learning_rate": 3.398568583689138e-07, + "loss": 1.0410491, + "memory(GiB)": 302.58, + "step": 317520, + "train_speed(iter/s)": 0.123579 + }, + { + "acc": 0.74945889, + "epoch": 1.7758340824917824, + "grad_norm": 7.84375, + "learning_rate": 3.3952183917916103e-07, + "loss": 0.99961014, + "memory(GiB)": 302.58, + "step": 317540, + "train_speed(iter/s)": 0.123583 + }, + { + "acc": 0.73455162, + "epoch": 1.775945931964762, + "grad_norm": 9.375, + "learning_rate": 3.3918697939253953e-07, + "loss": 1.04862671, + "memory(GiB)": 302.58, + "step": 317560, + "train_speed(iter/s)": 0.123587 + }, + { + "acc": 0.75549245, + "epoch": 1.776057781437741, + "grad_norm": 8.0, + "learning_rate": 3.388522790205012e-07, + "loss": 0.97035227, + "memory(GiB)": 302.58, + "step": 317580, + "train_speed(iter/s)": 0.12359 + }, + { + "acc": 0.76243911, + "epoch": 1.7761696309107204, + "grad_norm": 7.375, + "learning_rate": 3.385177380744964e-07, + "loss": 0.92591581, + "memory(GiB)": 302.58, + "step": 317600, + "train_speed(iter/s)": 0.123594 + }, + { + "acc": 0.74454203, + "epoch": 1.7762814803836995, + "grad_norm": 6.40625, + "learning_rate": 3.381833565659659e-07, + "loss": 0.98519402, + "memory(GiB)": 302.58, + "step": 317620, + "train_speed(iter/s)": 0.123598 + }, + { + "acc": 0.74559817, + "epoch": 1.776393329856679, + "grad_norm": 8.25, + "learning_rate": 3.378491345063467e-07, + "loss": 1.00777884, + "memory(GiB)": 302.58, + "step": 317640, + "train_speed(iter/s)": 0.123602 + }, + { + "acc": 0.75259542, + "epoch": 1.776505179329658, + "grad_norm": 7.125, + "learning_rate": 3.375150719070708e-07, + "loss": 0.97530413, + "memory(GiB)": 302.58, + "step": 317660, + "train_speed(iter/s)": 0.123605 + }, + { + "acc": 0.74808402, + "epoch": 1.7766170288026375, + "grad_norm": 6.21875, + "learning_rate": 3.371811687795634e-07, + "loss": 0.99811869, + "memory(GiB)": 302.58, + "step": 317680, + "train_speed(iter/s)": 0.123609 + }, + { + "acc": 0.75660806, + "epoch": 1.7767288782756165, + "grad_norm": 7.46875, + "learning_rate": 3.3684742513524546e-07, + "loss": 0.95000753, + "memory(GiB)": 302.58, + "step": 317700, + "train_speed(iter/s)": 0.123612 + }, + { + "acc": 0.74917426, + "epoch": 1.776840727748596, + "grad_norm": 9.75, + "learning_rate": 3.365138409855323e-07, + "loss": 0.99268847, + "memory(GiB)": 302.58, + "step": 317720, + "train_speed(iter/s)": 0.123616 + }, + { + "acc": 0.7458744, + "epoch": 1.776952577221575, + "grad_norm": 9.625, + "learning_rate": 3.3618041634183307e-07, + "loss": 0.9975728, + "memory(GiB)": 302.58, + "step": 317740, + "train_speed(iter/s)": 0.12362 + }, + { + "acc": 0.74798074, + "epoch": 1.7770644266945546, + "grad_norm": 5.34375, + "learning_rate": 3.35847151215552e-07, + "loss": 0.9783391, + "memory(GiB)": 302.58, + "step": 317760, + "train_speed(iter/s)": 0.123623 + }, + { + "acc": 0.7534255, + "epoch": 1.7771762761675336, + "grad_norm": 8.125, + "learning_rate": 3.3551404561808833e-07, + "loss": 0.97255173, + "memory(GiB)": 302.58, + "step": 317780, + "train_speed(iter/s)": 0.123627 + }, + { + "acc": 0.74601121, + "epoch": 1.777288125640513, + "grad_norm": 6.875, + "learning_rate": 3.351810995608346e-07, + "loss": 1.00047827, + "memory(GiB)": 302.58, + "step": 317800, + "train_speed(iter/s)": 0.12363 + }, + { + "acc": 0.75045395, + "epoch": 1.7773999751134921, + "grad_norm": 8.5, + "learning_rate": 3.348483130551794e-07, + "loss": 1.00259142, + "memory(GiB)": 302.58, + "step": 317820, + "train_speed(iter/s)": 0.123634 + }, + { + "acc": 0.74846201, + "epoch": 1.7775118245864716, + "grad_norm": 5.625, + "learning_rate": 3.3451568611250374e-07, + "loss": 0.99040155, + "memory(GiB)": 302.58, + "step": 317840, + "train_speed(iter/s)": 0.123638 + }, + { + "acc": 0.74568739, + "epoch": 1.7776236740594507, + "grad_norm": 9.0, + "learning_rate": 3.341832187441862e-07, + "loss": 0.99746428, + "memory(GiB)": 302.58, + "step": 317860, + "train_speed(iter/s)": 0.123641 + }, + { + "acc": 0.74761968, + "epoch": 1.7777355235324301, + "grad_norm": 6.9375, + "learning_rate": 3.338509109615973e-07, + "loss": 0.99144926, + "memory(GiB)": 302.58, + "step": 317880, + "train_speed(iter/s)": 0.123645 + }, + { + "acc": 0.74387922, + "epoch": 1.7778473730054092, + "grad_norm": 8.875, + "learning_rate": 3.3351876277610383e-07, + "loss": 1.02797451, + "memory(GiB)": 302.58, + "step": 317900, + "train_speed(iter/s)": 0.123648 + }, + { + "acc": 0.75566125, + "epoch": 1.7779592224783887, + "grad_norm": 6.75, + "learning_rate": 3.331867741990652e-07, + "loss": 0.93337479, + "memory(GiB)": 302.58, + "step": 317920, + "train_speed(iter/s)": 0.123652 + }, + { + "acc": 0.75529771, + "epoch": 1.7780710719513677, + "grad_norm": 6.96875, + "learning_rate": 3.328549452418367e-07, + "loss": 0.96315289, + "memory(GiB)": 302.58, + "step": 317940, + "train_speed(iter/s)": 0.123656 + }, + { + "acc": 0.73907518, + "epoch": 1.7781829214243472, + "grad_norm": 4.84375, + "learning_rate": 3.325232759157687e-07, + "loss": 1.00323372, + "memory(GiB)": 302.58, + "step": 317960, + "train_speed(iter/s)": 0.123659 + }, + { + "acc": 0.74333177, + "epoch": 1.7782947708973262, + "grad_norm": 5.96875, + "learning_rate": 3.3219176623220494e-07, + "loss": 1.02565804, + "memory(GiB)": 302.58, + "step": 317980, + "train_speed(iter/s)": 0.123663 + }, + { + "acc": 0.76153479, + "epoch": 1.7784066203703057, + "grad_norm": 9.0625, + "learning_rate": 3.318604162024835e-07, + "loss": 0.93720074, + "memory(GiB)": 302.58, + "step": 318000, + "train_speed(iter/s)": 0.123667 + }, + { + "epoch": 1.7784066203703057, + "eval_acc": 0.7069144855550064, + "eval_loss": 1.0118063688278198, + "eval_runtime": 7523.8539, + "eval_samples_per_second": 10.006, + "eval_steps_per_second": 10.006, + "step": 318000 + }, + { + "acc": 0.75845299, + "epoch": 1.7785184698432848, + "grad_norm": 8.875, + "learning_rate": 3.3152922583793825e-07, + "loss": 0.94237747, + "memory(GiB)": 302.58, + "step": 318020, + "train_speed(iter/s)": 0.123303 + }, + { + "acc": 0.75820646, + "epoch": 1.7786303193162643, + "grad_norm": 7.78125, + "learning_rate": 3.3119819514989716e-07, + "loss": 0.92876101, + "memory(GiB)": 302.58, + "step": 318040, + "train_speed(iter/s)": 0.123307 + }, + { + "acc": 0.75493555, + "epoch": 1.7787421687892433, + "grad_norm": 6.46875, + "learning_rate": 3.308673241496818e-07, + "loss": 0.96651087, + "memory(GiB)": 302.58, + "step": 318060, + "train_speed(iter/s)": 0.12331 + }, + { + "acc": 0.73175974, + "epoch": 1.7788540182622228, + "grad_norm": 6.15625, + "learning_rate": 3.3053661284860926e-07, + "loss": 1.06052704, + "memory(GiB)": 302.58, + "step": 318080, + "train_speed(iter/s)": 0.123314 + }, + { + "acc": 0.75621409, + "epoch": 1.7789658677352018, + "grad_norm": 7.125, + "learning_rate": 3.3020606125799104e-07, + "loss": 0.95440006, + "memory(GiB)": 302.58, + "step": 318100, + "train_speed(iter/s)": 0.123317 + }, + { + "acc": 0.77140174, + "epoch": 1.7790777172081813, + "grad_norm": 6.1875, + "learning_rate": 3.298756693891325e-07, + "loss": 0.87962265, + "memory(GiB)": 302.58, + "step": 318120, + "train_speed(iter/s)": 0.123321 + }, + { + "acc": 0.75419841, + "epoch": 1.7791895666811604, + "grad_norm": 7.34375, + "learning_rate": 3.295454372533352e-07, + "loss": 0.95652905, + "memory(GiB)": 302.58, + "step": 318140, + "train_speed(iter/s)": 0.123325 + }, + { + "acc": 0.75484557, + "epoch": 1.7793014161541398, + "grad_norm": 6.4375, + "learning_rate": 3.29215364861894e-07, + "loss": 0.96411991, + "memory(GiB)": 302.58, + "step": 318160, + "train_speed(iter/s)": 0.123329 + }, + { + "acc": 0.75882707, + "epoch": 1.7794132656271189, + "grad_norm": 8.75, + "learning_rate": 3.288854522260976e-07, + "loss": 0.9604723, + "memory(GiB)": 302.58, + "step": 318180, + "train_speed(iter/s)": 0.123332 + }, + { + "acc": 0.74642115, + "epoch": 1.7795251151000984, + "grad_norm": 8.375, + "learning_rate": 3.285556993572303e-07, + "loss": 0.99102125, + "memory(GiB)": 302.58, + "step": 318200, + "train_speed(iter/s)": 0.123336 + }, + { + "acc": 0.76339788, + "epoch": 1.7796369645730774, + "grad_norm": 10.125, + "learning_rate": 3.2822610626657146e-07, + "loss": 0.91651669, + "memory(GiB)": 302.58, + "step": 318220, + "train_speed(iter/s)": 0.12334 + }, + { + "acc": 0.7501678, + "epoch": 1.779748814046057, + "grad_norm": 4.46875, + "learning_rate": 3.2789667296539316e-07, + "loss": 0.99654713, + "memory(GiB)": 302.58, + "step": 318240, + "train_speed(iter/s)": 0.123344 + }, + { + "acc": 0.74946833, + "epoch": 1.779860663519036, + "grad_norm": 8.0, + "learning_rate": 3.2756739946496307e-07, + "loss": 0.98740759, + "memory(GiB)": 302.58, + "step": 318260, + "train_speed(iter/s)": 0.123347 + }, + { + "acc": 0.74269376, + "epoch": 1.7799725129920154, + "grad_norm": 10.0625, + "learning_rate": 3.272382857765444e-07, + "loss": 1.01972008, + "memory(GiB)": 302.58, + "step": 318280, + "train_speed(iter/s)": 0.123351 + }, + { + "acc": 0.75558915, + "epoch": 1.7800843624649945, + "grad_norm": 6.375, + "learning_rate": 3.2690933191139307e-07, + "loss": 0.95094709, + "memory(GiB)": 302.58, + "step": 318300, + "train_speed(iter/s)": 0.123355 + }, + { + "acc": 0.74898944, + "epoch": 1.780196211937974, + "grad_norm": 6.125, + "learning_rate": 3.2658053788076073e-07, + "loss": 0.98707666, + "memory(GiB)": 302.58, + "step": 318320, + "train_speed(iter/s)": 0.123358 + }, + { + "acc": 0.74721708, + "epoch": 1.780308061410953, + "grad_norm": 5.53125, + "learning_rate": 3.2625190369589275e-07, + "loss": 0.99910879, + "memory(GiB)": 302.58, + "step": 318340, + "train_speed(iter/s)": 0.123362 + }, + { + "acc": 0.75872073, + "epoch": 1.7804199108839325, + "grad_norm": 9.0625, + "learning_rate": 3.2592342936802967e-07, + "loss": 0.96822729, + "memory(GiB)": 302.58, + "step": 318360, + "train_speed(iter/s)": 0.123366 + }, + { + "acc": 0.74644818, + "epoch": 1.7805317603569115, + "grad_norm": 7.1875, + "learning_rate": 3.255951149084069e-07, + "loss": 1.00398169, + "memory(GiB)": 302.58, + "step": 318380, + "train_speed(iter/s)": 0.12337 + }, + { + "acc": 0.76384025, + "epoch": 1.780643609829891, + "grad_norm": 8.125, + "learning_rate": 3.252669603282521e-07, + "loss": 0.92327108, + "memory(GiB)": 302.58, + "step": 318400, + "train_speed(iter/s)": 0.123373 + }, + { + "acc": 0.756602, + "epoch": 1.78075545930287, + "grad_norm": 6.9375, + "learning_rate": 3.2493896563879133e-07, + "loss": 0.9332099, + "memory(GiB)": 302.58, + "step": 318420, + "train_speed(iter/s)": 0.123377 + }, + { + "acc": 0.74546189, + "epoch": 1.7808673087758495, + "grad_norm": 7.21875, + "learning_rate": 3.246111308512423e-07, + "loss": 1.01247377, + "memory(GiB)": 302.58, + "step": 318440, + "train_speed(iter/s)": 0.123381 + }, + { + "acc": 0.76186609, + "epoch": 1.7809791582488286, + "grad_norm": 7.59375, + "learning_rate": 3.2428345597681764e-07, + "loss": 0.91285448, + "memory(GiB)": 302.58, + "step": 318460, + "train_speed(iter/s)": 0.123385 + }, + { + "acc": 0.74031816, + "epoch": 1.781091007721808, + "grad_norm": 7.875, + "learning_rate": 3.239559410267251e-07, + "loss": 1.02364273, + "memory(GiB)": 302.58, + "step": 318480, + "train_speed(iter/s)": 0.123388 + }, + { + "acc": 0.71365647, + "epoch": 1.781202857194787, + "grad_norm": 7.0, + "learning_rate": 3.2362858601216683e-07, + "loss": 1.16277866, + "memory(GiB)": 302.58, + "step": 318500, + "train_speed(iter/s)": 0.123392 + }, + { + "acc": 0.76073341, + "epoch": 1.7813147066677666, + "grad_norm": 6.78125, + "learning_rate": 3.2330139094433835e-07, + "loss": 0.95649471, + "memory(GiB)": 302.58, + "step": 318520, + "train_speed(iter/s)": 0.123396 + }, + { + "acc": 0.74878478, + "epoch": 1.7814265561407456, + "grad_norm": 5.71875, + "learning_rate": 3.229743558344323e-07, + "loss": 0.99300117, + "memory(GiB)": 302.58, + "step": 318540, + "train_speed(iter/s)": 0.1234 + }, + { + "acc": 0.75719609, + "epoch": 1.7815384056137251, + "grad_norm": 7.46875, + "learning_rate": 3.226474806936342e-07, + "loss": 0.91326113, + "memory(GiB)": 302.58, + "step": 318560, + "train_speed(iter/s)": 0.123403 + }, + { + "acc": 0.77436395, + "epoch": 1.7816502550867042, + "grad_norm": 6.53125, + "learning_rate": 3.223207655331234e-07, + "loss": 0.87707224, + "memory(GiB)": 302.58, + "step": 318580, + "train_speed(iter/s)": 0.123407 + }, + { + "acc": 0.75253496, + "epoch": 1.7817621045596836, + "grad_norm": 6.8125, + "learning_rate": 3.2199421036407494e-07, + "loss": 0.9706521, + "memory(GiB)": 302.58, + "step": 318600, + "train_speed(iter/s)": 0.12341 + }, + { + "acc": 0.74695225, + "epoch": 1.7818739540326627, + "grad_norm": 8.0, + "learning_rate": 3.216678151976582e-07, + "loss": 1.01669474, + "memory(GiB)": 302.58, + "step": 318620, + "train_speed(iter/s)": 0.123414 + }, + { + "acc": 0.76079855, + "epoch": 1.7819858035056422, + "grad_norm": 8.3125, + "learning_rate": 3.21341580045037e-07, + "loss": 0.95022049, + "memory(GiB)": 302.58, + "step": 318640, + "train_speed(iter/s)": 0.123418 + }, + { + "acc": 0.7548317, + "epoch": 1.7820976529786212, + "grad_norm": 8.3125, + "learning_rate": 3.2101550491736913e-07, + "loss": 0.96117105, + "memory(GiB)": 302.58, + "step": 318660, + "train_speed(iter/s)": 0.123421 + }, + { + "acc": 0.74565077, + "epoch": 1.7822095024516007, + "grad_norm": 7.6875, + "learning_rate": 3.2068958982580777e-07, + "loss": 0.99990339, + "memory(GiB)": 302.58, + "step": 318680, + "train_speed(iter/s)": 0.123425 + }, + { + "acc": 0.75308795, + "epoch": 1.7823213519245797, + "grad_norm": 8.8125, + "learning_rate": 3.203638347814997e-07, + "loss": 0.95988865, + "memory(GiB)": 302.58, + "step": 318700, + "train_speed(iter/s)": 0.123428 + }, + { + "acc": 0.75400677, + "epoch": 1.7824332013975592, + "grad_norm": 7.40625, + "learning_rate": 3.20038239795587e-07, + "loss": 0.96650982, + "memory(GiB)": 302.58, + "step": 318720, + "train_speed(iter/s)": 0.123432 + }, + { + "acc": 0.76606746, + "epoch": 1.7825450508705383, + "grad_norm": 10.1875, + "learning_rate": 3.1971280487920644e-07, + "loss": 0.93400316, + "memory(GiB)": 302.58, + "step": 318740, + "train_speed(iter/s)": 0.123436 + }, + { + "acc": 0.75913901, + "epoch": 1.7826569003435178, + "grad_norm": 7.75, + "learning_rate": 3.19387530043489e-07, + "loss": 0.94385605, + "memory(GiB)": 302.58, + "step": 318760, + "train_speed(iter/s)": 0.123439 + }, + { + "acc": 0.76383681, + "epoch": 1.7827687498164968, + "grad_norm": 8.4375, + "learning_rate": 3.1906241529956027e-07, + "loss": 0.9233655, + "memory(GiB)": 302.58, + "step": 318780, + "train_speed(iter/s)": 0.123443 + }, + { + "acc": 0.75227013, + "epoch": 1.7828805992894763, + "grad_norm": 5.1875, + "learning_rate": 3.1873746065853916e-07, + "loss": 0.9755599, + "memory(GiB)": 302.58, + "step": 318800, + "train_speed(iter/s)": 0.123447 + }, + { + "acc": 0.76610975, + "epoch": 1.7829924487624553, + "grad_norm": 7.625, + "learning_rate": 3.184126661315412e-07, + "loss": 0.9047739, + "memory(GiB)": 302.58, + "step": 318820, + "train_speed(iter/s)": 0.12345 + }, + { + "acc": 0.73355913, + "epoch": 1.7831042982354348, + "grad_norm": 9.1875, + "learning_rate": 3.1808803172967475e-07, + "loss": 1.06117315, + "memory(GiB)": 302.58, + "step": 318840, + "train_speed(iter/s)": 0.123454 + }, + { + "acc": 0.74312782, + "epoch": 1.7832161477084139, + "grad_norm": 5.34375, + "learning_rate": 3.1776355746404366e-07, + "loss": 1.0043355, + "memory(GiB)": 302.58, + "step": 318860, + "train_speed(iter/s)": 0.123457 + }, + { + "acc": 0.75932074, + "epoch": 1.7833279971813933, + "grad_norm": 4.9375, + "learning_rate": 3.1743924334574574e-07, + "loss": 0.92756691, + "memory(GiB)": 302.58, + "step": 318880, + "train_speed(iter/s)": 0.123461 + }, + { + "acc": 0.76418958, + "epoch": 1.7834398466543724, + "grad_norm": 6.6875, + "learning_rate": 3.1711508938587374e-07, + "loss": 0.90801611, + "memory(GiB)": 302.58, + "step": 318900, + "train_speed(iter/s)": 0.123464 + }, + { + "acc": 0.72548347, + "epoch": 1.7835516961273519, + "grad_norm": 5.78125, + "learning_rate": 3.16791095595515e-07, + "loss": 1.07766113, + "memory(GiB)": 302.58, + "step": 318920, + "train_speed(iter/s)": 0.123468 + }, + { + "acc": 0.7532836, + "epoch": 1.783663545600331, + "grad_norm": 10.3125, + "learning_rate": 3.164672619857506e-07, + "loss": 0.96809368, + "memory(GiB)": 302.58, + "step": 318940, + "train_speed(iter/s)": 0.123472 + }, + { + "acc": 0.75222692, + "epoch": 1.7837753950733104, + "grad_norm": 7.90625, + "learning_rate": 3.161435885676567e-07, + "loss": 0.96460323, + "memory(GiB)": 302.58, + "step": 318960, + "train_speed(iter/s)": 0.123476 + }, + { + "acc": 0.74509144, + "epoch": 1.7838872445462894, + "grad_norm": 6.59375, + "learning_rate": 3.158200753523044e-07, + "loss": 0.99458179, + "memory(GiB)": 302.58, + "step": 318980, + "train_speed(iter/s)": 0.12348 + }, + { + "acc": 0.74122829, + "epoch": 1.783999094019269, + "grad_norm": 9.1875, + "learning_rate": 3.1549672235075826e-07, + "loss": 1.02515078, + "memory(GiB)": 302.58, + "step": 319000, + "train_speed(iter/s)": 0.123483 + }, + { + "acc": 0.75908923, + "epoch": 1.784110943492248, + "grad_norm": 8.4375, + "learning_rate": 3.151735295740788e-07, + "loss": 0.93160124, + "memory(GiB)": 302.58, + "step": 319020, + "train_speed(iter/s)": 0.123487 + }, + { + "acc": 0.73931975, + "epoch": 1.7842227929652275, + "grad_norm": 6.84375, + "learning_rate": 3.1485049703331936e-07, + "loss": 1.01111765, + "memory(GiB)": 302.58, + "step": 319040, + "train_speed(iter/s)": 0.123491 + }, + { + "acc": 0.74975281, + "epoch": 1.7843346424382065, + "grad_norm": 7.6875, + "learning_rate": 3.145276247395296e-07, + "loss": 0.97305861, + "memory(GiB)": 302.58, + "step": 319060, + "train_speed(iter/s)": 0.123495 + }, + { + "acc": 0.73853698, + "epoch": 1.784446491911186, + "grad_norm": 6.625, + "learning_rate": 3.142049127037522e-07, + "loss": 1.04415722, + "memory(GiB)": 302.58, + "step": 319080, + "train_speed(iter/s)": 0.123498 + }, + { + "acc": 0.76644478, + "epoch": 1.784558341384165, + "grad_norm": 6.65625, + "learning_rate": 3.1388236093702563e-07, + "loss": 0.916677, + "memory(GiB)": 302.58, + "step": 319100, + "train_speed(iter/s)": 0.123502 + }, + { + "acc": 0.7615778, + "epoch": 1.7846701908571445, + "grad_norm": 7.59375, + "learning_rate": 3.1355996945038103e-07, + "loss": 0.93840208, + "memory(GiB)": 302.58, + "step": 319120, + "train_speed(iter/s)": 0.123505 + }, + { + "acc": 0.75213308, + "epoch": 1.7847820403301236, + "grad_norm": 9.0625, + "learning_rate": 3.132377382548457e-07, + "loss": 0.96799107, + "memory(GiB)": 302.58, + "step": 319140, + "train_speed(iter/s)": 0.123509 + }, + { + "acc": 0.72682457, + "epoch": 1.784893889803103, + "grad_norm": 6.75, + "learning_rate": 3.1291566736144085e-07, + "loss": 1.07274723, + "memory(GiB)": 302.58, + "step": 319160, + "train_speed(iter/s)": 0.123513 + }, + { + "acc": 0.75929108, + "epoch": 1.7850057392760823, + "grad_norm": 7.84375, + "learning_rate": 3.125937567811832e-07, + "loss": 0.96026258, + "memory(GiB)": 302.58, + "step": 319180, + "train_speed(iter/s)": 0.123516 + }, + { + "acc": 0.76883278, + "epoch": 1.7851175887490616, + "grad_norm": 8.9375, + "learning_rate": 3.122720065250817e-07, + "loss": 0.91229744, + "memory(GiB)": 302.58, + "step": 319200, + "train_speed(iter/s)": 0.12352 + }, + { + "acc": 0.75253196, + "epoch": 1.7852294382220408, + "grad_norm": 5.375, + "learning_rate": 3.119504166041426e-07, + "loss": 0.97339697, + "memory(GiB)": 302.58, + "step": 319220, + "train_speed(iter/s)": 0.123523 + }, + { + "acc": 0.74407225, + "epoch": 1.78534128769502, + "grad_norm": 6.1875, + "learning_rate": 3.1162898702936427e-07, + "loss": 1.01381302, + "memory(GiB)": 302.58, + "step": 319240, + "train_speed(iter/s)": 0.123527 + }, + { + "acc": 0.76103158, + "epoch": 1.7854531371679994, + "grad_norm": 9.125, + "learning_rate": 3.113077178117402e-07, + "loss": 0.940487, + "memory(GiB)": 302.58, + "step": 319260, + "train_speed(iter/s)": 0.123531 + }, + { + "acc": 0.76442046, + "epoch": 1.7855649866409786, + "grad_norm": 7.625, + "learning_rate": 3.109866089622604e-07, + "loss": 0.9306078, + "memory(GiB)": 302.58, + "step": 319280, + "train_speed(iter/s)": 0.123534 + }, + { + "acc": 0.73572998, + "epoch": 1.7856768361139579, + "grad_norm": 7.125, + "learning_rate": 3.106656604919073e-07, + "loss": 1.03997231, + "memory(GiB)": 302.58, + "step": 319300, + "train_speed(iter/s)": 0.123538 + }, + { + "acc": 0.75199213, + "epoch": 1.7857886855869372, + "grad_norm": 7.375, + "learning_rate": 3.103448724116581e-07, + "loss": 0.99065161, + "memory(GiB)": 302.58, + "step": 319320, + "train_speed(iter/s)": 0.123542 + }, + { + "acc": 0.72899709, + "epoch": 1.7859005350599164, + "grad_norm": 5.125, + "learning_rate": 3.100242447324847e-07, + "loss": 1.05914106, + "memory(GiB)": 302.58, + "step": 319340, + "train_speed(iter/s)": 0.123545 + }, + { + "acc": 0.75310187, + "epoch": 1.7860123845328957, + "grad_norm": 7.4375, + "learning_rate": 3.097037774653533e-07, + "loss": 0.97375517, + "memory(GiB)": 302.58, + "step": 319360, + "train_speed(iter/s)": 0.123549 + }, + { + "acc": 0.7513649, + "epoch": 1.786124234005875, + "grad_norm": 10.0, + "learning_rate": 3.0938347062122563e-07, + "loss": 1.00197811, + "memory(GiB)": 302.58, + "step": 319380, + "train_speed(iter/s)": 0.123553 + }, + { + "acc": 0.74076958, + "epoch": 1.7862360834788542, + "grad_norm": 7.75, + "learning_rate": 3.0906332421105634e-07, + "loss": 1.01223984, + "memory(GiB)": 302.58, + "step": 319400, + "train_speed(iter/s)": 0.123556 + }, + { + "acc": 0.7636178, + "epoch": 1.7863479329518335, + "grad_norm": 6.71875, + "learning_rate": 3.087433382457966e-07, + "loss": 0.91972361, + "memory(GiB)": 302.58, + "step": 319420, + "train_speed(iter/s)": 0.12356 + }, + { + "acc": 0.75136662, + "epoch": 1.7864597824248127, + "grad_norm": 8.3125, + "learning_rate": 3.0842351273638936e-07, + "loss": 0.98782768, + "memory(GiB)": 302.58, + "step": 319440, + "train_speed(iter/s)": 0.123564 + }, + { + "acc": 0.74860687, + "epoch": 1.786571631897792, + "grad_norm": 5.0, + "learning_rate": 3.0810384769377535e-07, + "loss": 0.9837492, + "memory(GiB)": 302.58, + "step": 319460, + "train_speed(iter/s)": 0.123567 + }, + { + "acc": 0.75874319, + "epoch": 1.7866834813707713, + "grad_norm": 8.6875, + "learning_rate": 3.077843431288874e-07, + "loss": 0.924119, + "memory(GiB)": 302.58, + "step": 319480, + "train_speed(iter/s)": 0.123571 + }, + { + "acc": 0.7451014, + "epoch": 1.7867953308437505, + "grad_norm": 7.0, + "learning_rate": 3.0746499905265295e-07, + "loss": 1.00736217, + "memory(GiB)": 302.58, + "step": 319500, + "train_speed(iter/s)": 0.123575 + }, + { + "acc": 0.76232505, + "epoch": 1.7869071803167298, + "grad_norm": 8.875, + "learning_rate": 3.071458154759954e-07, + "loss": 0.93507318, + "memory(GiB)": 302.58, + "step": 319520, + "train_speed(iter/s)": 0.123578 + }, + { + "acc": 0.73136802, + "epoch": 1.787019029789709, + "grad_norm": 7.59375, + "learning_rate": 3.0682679240983113e-07, + "loss": 1.03852444, + "memory(GiB)": 302.58, + "step": 319540, + "train_speed(iter/s)": 0.123582 + }, + { + "acc": 0.76431932, + "epoch": 1.7871308792626883, + "grad_norm": 7.9375, + "learning_rate": 3.065079298650725e-07, + "loss": 0.94390068, + "memory(GiB)": 302.58, + "step": 319560, + "train_speed(iter/s)": 0.123585 + }, + { + "acc": 0.7575006, + "epoch": 1.7872427287356676, + "grad_norm": 7.9375, + "learning_rate": 3.061892278526257e-07, + "loss": 0.95890112, + "memory(GiB)": 302.58, + "step": 319580, + "train_speed(iter/s)": 0.123589 + }, + { + "acc": 0.75577183, + "epoch": 1.7873545782086468, + "grad_norm": 6.375, + "learning_rate": 3.058706863833905e-07, + "loss": 0.96754332, + "memory(GiB)": 302.58, + "step": 319600, + "train_speed(iter/s)": 0.123593 + }, + { + "acc": 0.74844537, + "epoch": 1.7874664276816261, + "grad_norm": 7.90625, + "learning_rate": 3.05552305468263e-07, + "loss": 0.98709717, + "memory(GiB)": 302.58, + "step": 319620, + "train_speed(iter/s)": 0.123596 + }, + { + "acc": 0.74442039, + "epoch": 1.7875782771546054, + "grad_norm": 4.875, + "learning_rate": 3.0523408511813233e-07, + "loss": 1.01932259, + "memory(GiB)": 302.58, + "step": 319640, + "train_speed(iter/s)": 0.1236 + }, + { + "acc": 0.75098796, + "epoch": 1.7876901266275846, + "grad_norm": 6.125, + "learning_rate": 3.049160253438821e-07, + "loss": 0.9972868, + "memory(GiB)": 302.58, + "step": 319660, + "train_speed(iter/s)": 0.123603 + }, + { + "acc": 0.7378386, + "epoch": 1.787801976100564, + "grad_norm": 7.75, + "learning_rate": 3.045981261563918e-07, + "loss": 1.03695698, + "memory(GiB)": 302.58, + "step": 319680, + "train_speed(iter/s)": 0.123607 + }, + { + "acc": 0.7563899, + "epoch": 1.7879138255735432, + "grad_norm": 9.1875, + "learning_rate": 3.0428038756653454e-07, + "loss": 0.96434774, + "memory(GiB)": 302.58, + "step": 319700, + "train_speed(iter/s)": 0.12361 + }, + { + "acc": 0.7630497, + "epoch": 1.7880256750465224, + "grad_norm": 5.625, + "learning_rate": 3.0396280958517764e-07, + "loss": 0.91814842, + "memory(GiB)": 302.58, + "step": 319720, + "train_speed(iter/s)": 0.123614 + }, + { + "acc": 0.75738707, + "epoch": 1.7881375245195017, + "grad_norm": 5.375, + "learning_rate": 3.0364539222318356e-07, + "loss": 0.94401951, + "memory(GiB)": 302.58, + "step": 319740, + "train_speed(iter/s)": 0.123617 + }, + { + "acc": 0.75599489, + "epoch": 1.788249373992481, + "grad_norm": 7.5625, + "learning_rate": 3.033281354914086e-07, + "loss": 0.96728716, + "memory(GiB)": 302.58, + "step": 319760, + "train_speed(iter/s)": 0.123621 + }, + { + "acc": 0.74641352, + "epoch": 1.7883612234654602, + "grad_norm": 9.25, + "learning_rate": 3.030110394007035e-07, + "loss": 1.01095304, + "memory(GiB)": 302.58, + "step": 319780, + "train_speed(iter/s)": 0.123625 + }, + { + "acc": 0.74190602, + "epoch": 1.7884730729384395, + "grad_norm": 9.3125, + "learning_rate": 3.026941039619152e-07, + "loss": 1.0421196, + "memory(GiB)": 302.58, + "step": 319800, + "train_speed(iter/s)": 0.123628 + }, + { + "acc": 0.76459179, + "epoch": 1.7885849224114188, + "grad_norm": 8.625, + "learning_rate": 3.023773291858834e-07, + "loss": 0.91925764, + "memory(GiB)": 302.58, + "step": 319820, + "train_speed(iter/s)": 0.123632 + }, + { + "acc": 0.75811586, + "epoch": 1.788696771884398, + "grad_norm": 8.75, + "learning_rate": 3.020607150834426e-07, + "loss": 0.96075401, + "memory(GiB)": 302.58, + "step": 319840, + "train_speed(iter/s)": 0.123636 + }, + { + "acc": 0.73204713, + "epoch": 1.7888086213573773, + "grad_norm": 9.625, + "learning_rate": 3.0174426166542204e-07, + "loss": 1.05271749, + "memory(GiB)": 302.58, + "step": 319860, + "train_speed(iter/s)": 0.12364 + }, + { + "acc": 0.74636092, + "epoch": 1.7889204708303565, + "grad_norm": 8.0, + "learning_rate": 3.014279689426447e-07, + "loss": 0.98301516, + "memory(GiB)": 302.58, + "step": 319880, + "train_speed(iter/s)": 0.123643 + }, + { + "acc": 0.76305118, + "epoch": 1.7890323203033358, + "grad_norm": 8.75, + "learning_rate": 3.0111183692593027e-07, + "loss": 0.91183939, + "memory(GiB)": 302.58, + "step": 319900, + "train_speed(iter/s)": 0.123647 + }, + { + "acc": 0.76515017, + "epoch": 1.789144169776315, + "grad_norm": 7.875, + "learning_rate": 3.007958656260912e-07, + "loss": 0.92944517, + "memory(GiB)": 302.58, + "step": 319920, + "train_speed(iter/s)": 0.12365 + }, + { + "acc": 0.74674239, + "epoch": 1.7892560192492943, + "grad_norm": 8.9375, + "learning_rate": 3.004800550539344e-07, + "loss": 0.96950178, + "memory(GiB)": 302.58, + "step": 319940, + "train_speed(iter/s)": 0.123654 + }, + { + "acc": 0.74911127, + "epoch": 1.7893678687222736, + "grad_norm": 9.0625, + "learning_rate": 3.001644052202607e-07, + "loss": 1.00325956, + "memory(GiB)": 302.58, + "step": 319960, + "train_speed(iter/s)": 0.123658 + }, + { + "acc": 0.74207845, + "epoch": 1.7894797181952529, + "grad_norm": 7.59375, + "learning_rate": 2.9984891613586807e-07, + "loss": 0.99875345, + "memory(GiB)": 302.58, + "step": 319980, + "train_speed(iter/s)": 0.123661 + }, + { + "acc": 0.74334249, + "epoch": 1.7895915676682321, + "grad_norm": 7.25, + "learning_rate": 2.9953358781154573e-07, + "loss": 1.01744385, + "memory(GiB)": 302.58, + "step": 320000, + "train_speed(iter/s)": 0.123665 + }, + { + "epoch": 1.7895915676682321, + "eval_acc": 0.7069247885833375, + "eval_loss": 1.0117712020874023, + "eval_runtime": 7573.9949, + "eval_samples_per_second": 9.94, + "eval_steps_per_second": 9.94, + "step": 320000 + }, + { + "acc": 0.74861784, + "epoch": 1.7897034171412114, + "grad_norm": 6.46875, + "learning_rate": 2.9921842025807944e-07, + "loss": 0.985958, + "memory(GiB)": 302.58, + "step": 320020, + "train_speed(iter/s)": 0.123301 + }, + { + "acc": 0.74963212, + "epoch": 1.7898152666141907, + "grad_norm": 6.96875, + "learning_rate": 2.9890341348624895e-07, + "loss": 0.96503277, + "memory(GiB)": 302.58, + "step": 320040, + "train_speed(iter/s)": 0.123305 + }, + { + "acc": 0.75083847, + "epoch": 1.78992711608717, + "grad_norm": 4.8125, + "learning_rate": 2.9858856750682896e-07, + "loss": 0.9959692, + "memory(GiB)": 302.58, + "step": 320060, + "train_speed(iter/s)": 0.123308 + }, + { + "acc": 0.72711358, + "epoch": 1.7900389655601492, + "grad_norm": 6.4375, + "learning_rate": 2.9827388233058694e-07, + "loss": 1.10389099, + "memory(GiB)": 302.58, + "step": 320080, + "train_speed(iter/s)": 0.123312 + }, + { + "acc": 0.72418718, + "epoch": 1.7901508150331285, + "grad_norm": 4.78125, + "learning_rate": 2.9795935796828767e-07, + "loss": 1.0955617, + "memory(GiB)": 302.58, + "step": 320100, + "train_speed(iter/s)": 0.123315 + }, + { + "acc": 0.75000677, + "epoch": 1.7902626645061077, + "grad_norm": 5.75, + "learning_rate": 2.9764499443068753e-07, + "loss": 0.99427862, + "memory(GiB)": 302.58, + "step": 320120, + "train_speed(iter/s)": 0.123319 + }, + { + "acc": 0.75121121, + "epoch": 1.790374513979087, + "grad_norm": 8.5, + "learning_rate": 2.9733079172853895e-07, + "loss": 1.00012312, + "memory(GiB)": 302.58, + "step": 320140, + "train_speed(iter/s)": 0.123323 + }, + { + "acc": 0.73295388, + "epoch": 1.7904863634520662, + "grad_norm": 5.375, + "learning_rate": 2.970167498725901e-07, + "loss": 1.04089794, + "memory(GiB)": 302.58, + "step": 320160, + "train_speed(iter/s)": 0.123326 + }, + { + "acc": 0.729245, + "epoch": 1.7905982129250455, + "grad_norm": 6.875, + "learning_rate": 2.967028688735807e-07, + "loss": 1.069911, + "memory(GiB)": 302.58, + "step": 320180, + "train_speed(iter/s)": 0.12333 + }, + { + "acc": 0.76714778, + "epoch": 1.7907100623980248, + "grad_norm": 7.84375, + "learning_rate": 2.9638914874224714e-07, + "loss": 0.91271915, + "memory(GiB)": 302.58, + "step": 320200, + "train_speed(iter/s)": 0.123333 + }, + { + "acc": 0.7353797, + "epoch": 1.790821911871004, + "grad_norm": 8.0, + "learning_rate": 2.9607558948931916e-07, + "loss": 1.07513971, + "memory(GiB)": 302.58, + "step": 320220, + "train_speed(iter/s)": 0.123337 + }, + { + "acc": 0.75411649, + "epoch": 1.7909337613439833, + "grad_norm": 6.25, + "learning_rate": 2.957621911255221e-07, + "loss": 0.97035599, + "memory(GiB)": 302.58, + "step": 320240, + "train_speed(iter/s)": 0.123341 + }, + { + "acc": 0.75777569, + "epoch": 1.7910456108169626, + "grad_norm": 7.25, + "learning_rate": 2.9544895366157453e-07, + "loss": 0.94799738, + "memory(GiB)": 302.58, + "step": 320260, + "train_speed(iter/s)": 0.123344 + }, + { + "acc": 0.72439804, + "epoch": 1.7911574602899418, + "grad_norm": 5.34375, + "learning_rate": 2.9513587710819025e-07, + "loss": 1.10157547, + "memory(GiB)": 302.58, + "step": 320280, + "train_speed(iter/s)": 0.123348 + }, + { + "acc": 0.7424036, + "epoch": 1.791269309762921, + "grad_norm": 7.15625, + "learning_rate": 2.9482296147607835e-07, + "loss": 1.00354118, + "memory(GiB)": 302.58, + "step": 320300, + "train_speed(iter/s)": 0.123351 + }, + { + "acc": 0.76092768, + "epoch": 1.7913811592359004, + "grad_norm": 9.0625, + "learning_rate": 2.945102067759403e-07, + "loss": 0.92877026, + "memory(GiB)": 302.58, + "step": 320320, + "train_speed(iter/s)": 0.123355 + }, + { + "acc": 0.7674005, + "epoch": 1.7914930087088796, + "grad_norm": 6.28125, + "learning_rate": 2.9419761301847426e-07, + "loss": 0.93516865, + "memory(GiB)": 302.58, + "step": 320340, + "train_speed(iter/s)": 0.123359 + }, + { + "acc": 0.76189785, + "epoch": 1.7916048581818589, + "grad_norm": 8.5625, + "learning_rate": 2.9388518021437163e-07, + "loss": 0.9506629, + "memory(GiB)": 302.58, + "step": 320360, + "train_speed(iter/s)": 0.123362 + }, + { + "acc": 0.76455722, + "epoch": 1.7917167076548381, + "grad_norm": 6.78125, + "learning_rate": 2.935729083743183e-07, + "loss": 0.92423344, + "memory(GiB)": 302.58, + "step": 320380, + "train_speed(iter/s)": 0.123366 + }, + { + "acc": 0.75824528, + "epoch": 1.7918285571278174, + "grad_norm": 8.0, + "learning_rate": 2.932607975089952e-07, + "loss": 0.94320478, + "memory(GiB)": 302.58, + "step": 320400, + "train_speed(iter/s)": 0.123369 + }, + { + "acc": 0.76161814, + "epoch": 1.7919404066007967, + "grad_norm": 5.84375, + "learning_rate": 2.929488476290765e-07, + "loss": 0.94958363, + "memory(GiB)": 302.58, + "step": 320420, + "train_speed(iter/s)": 0.123373 + }, + { + "acc": 0.73955278, + "epoch": 1.792052256073776, + "grad_norm": 8.375, + "learning_rate": 2.926370587452343e-07, + "loss": 1.05861235, + "memory(GiB)": 302.58, + "step": 320440, + "train_speed(iter/s)": 0.123377 + }, + { + "acc": 0.75637174, + "epoch": 1.7921641055467552, + "grad_norm": 8.125, + "learning_rate": 2.923254308681306e-07, + "loss": 0.94536419, + "memory(GiB)": 302.58, + "step": 320460, + "train_speed(iter/s)": 0.12338 + }, + { + "acc": 0.73974371, + "epoch": 1.7922759550197345, + "grad_norm": 5.15625, + "learning_rate": 2.920139640084252e-07, + "loss": 1.04272814, + "memory(GiB)": 302.58, + "step": 320480, + "train_speed(iter/s)": 0.123384 + }, + { + "acc": 0.76159873, + "epoch": 1.7923878044927137, + "grad_norm": 7.0, + "learning_rate": 2.917026581767707e-07, + "loss": 0.9697175, + "memory(GiB)": 302.58, + "step": 320500, + "train_speed(iter/s)": 0.123387 + }, + { + "acc": 0.73067117, + "epoch": 1.792499653965693, + "grad_norm": 8.875, + "learning_rate": 2.913915133838152e-07, + "loss": 1.06930876, + "memory(GiB)": 302.58, + "step": 320520, + "train_speed(iter/s)": 0.123391 + }, + { + "acc": 0.73413749, + "epoch": 1.7926115034386723, + "grad_norm": 11.0625, + "learning_rate": 2.9108052964020026e-07, + "loss": 1.06441622, + "memory(GiB)": 302.58, + "step": 320540, + "train_speed(iter/s)": 0.123395 + }, + { + "acc": 0.77535577, + "epoch": 1.7927233529116515, + "grad_norm": 8.3125, + "learning_rate": 2.9076970695656293e-07, + "loss": 0.88745365, + "memory(GiB)": 302.58, + "step": 320560, + "train_speed(iter/s)": 0.123399 + }, + { + "acc": 0.73266582, + "epoch": 1.7928352023846308, + "grad_norm": 6.625, + "learning_rate": 2.9045904534353353e-07, + "loss": 1.0537405, + "memory(GiB)": 302.58, + "step": 320580, + "train_speed(iter/s)": 0.123402 + }, + { + "acc": 0.7420083, + "epoch": 1.79294705185761, + "grad_norm": 5.9375, + "learning_rate": 2.9014854481173915e-07, + "loss": 1.01769972, + "memory(GiB)": 302.58, + "step": 320600, + "train_speed(iter/s)": 0.123406 + }, + { + "acc": 0.75366321, + "epoch": 1.7930589013305893, + "grad_norm": 12.5625, + "learning_rate": 2.898382053717985e-07, + "loss": 0.96795673, + "memory(GiB)": 302.58, + "step": 320620, + "train_speed(iter/s)": 0.123409 + }, + { + "acc": 0.76301851, + "epoch": 1.7931707508035686, + "grad_norm": 7.84375, + "learning_rate": 2.8952802703432756e-07, + "loss": 0.94096718, + "memory(GiB)": 302.58, + "step": 320640, + "train_speed(iter/s)": 0.123413 + }, + { + "acc": 0.74213138, + "epoch": 1.7932826002765478, + "grad_norm": 6.21875, + "learning_rate": 2.89218009809934e-07, + "loss": 1.01418438, + "memory(GiB)": 302.58, + "step": 320660, + "train_speed(iter/s)": 0.123417 + }, + { + "acc": 0.73723121, + "epoch": 1.793394449749527, + "grad_norm": 8.125, + "learning_rate": 2.88908153709222e-07, + "loss": 1.02621355, + "memory(GiB)": 302.58, + "step": 320680, + "train_speed(iter/s)": 0.12342 + }, + { + "acc": 0.75344858, + "epoch": 1.7935062992225064, + "grad_norm": 7.5, + "learning_rate": 2.885984587427887e-07, + "loss": 0.98019991, + "memory(GiB)": 302.58, + "step": 320700, + "train_speed(iter/s)": 0.123424 + }, + { + "acc": 0.73785872, + "epoch": 1.7936181486954856, + "grad_norm": 5.90625, + "learning_rate": 2.882889249212284e-07, + "loss": 1.02110567, + "memory(GiB)": 302.58, + "step": 320720, + "train_speed(iter/s)": 0.123427 + }, + { + "acc": 0.74140735, + "epoch": 1.793729998168465, + "grad_norm": 7.9375, + "learning_rate": 2.879795522551276e-07, + "loss": 1.02833185, + "memory(GiB)": 302.58, + "step": 320740, + "train_speed(iter/s)": 0.123431 + }, + { + "acc": 0.75103536, + "epoch": 1.7938418476414442, + "grad_norm": 6.03125, + "learning_rate": 2.876703407550674e-07, + "loss": 0.99731483, + "memory(GiB)": 302.58, + "step": 320760, + "train_speed(iter/s)": 0.123435 + }, + { + "acc": 0.76152859, + "epoch": 1.7939536971144234, + "grad_norm": 8.875, + "learning_rate": 2.873612904316242e-07, + "loss": 0.92912035, + "memory(GiB)": 302.58, + "step": 320780, + "train_speed(iter/s)": 0.123438 + }, + { + "acc": 0.76571093, + "epoch": 1.7940655465874027, + "grad_norm": 8.8125, + "learning_rate": 2.8705240129536795e-07, + "loss": 0.91013622, + "memory(GiB)": 302.58, + "step": 320800, + "train_speed(iter/s)": 0.123442 + }, + { + "acc": 0.7505754, + "epoch": 1.794177396060382, + "grad_norm": 7.96875, + "learning_rate": 2.8674367335686347e-07, + "loss": 0.9906353, + "memory(GiB)": 302.58, + "step": 320820, + "train_speed(iter/s)": 0.123445 + }, + { + "acc": 0.75224395, + "epoch": 1.7942892455333612, + "grad_norm": 8.25, + "learning_rate": 2.864351066266713e-07, + "loss": 0.97272053, + "memory(GiB)": 302.58, + "step": 320840, + "train_speed(iter/s)": 0.123449 + }, + { + "acc": 0.76063895, + "epoch": 1.7944010950063405, + "grad_norm": 7.40625, + "learning_rate": 2.861267011153446e-07, + "loss": 0.93903399, + "memory(GiB)": 302.58, + "step": 320860, + "train_speed(iter/s)": 0.123452 + }, + { + "acc": 0.73537731, + "epoch": 1.7945129444793197, + "grad_norm": 10.0625, + "learning_rate": 2.858184568334321e-07, + "loss": 1.06877747, + "memory(GiB)": 302.58, + "step": 320880, + "train_speed(iter/s)": 0.123456 + }, + { + "acc": 0.75005012, + "epoch": 1.794624793952299, + "grad_norm": 4.46875, + "learning_rate": 2.8551037379147664e-07, + "loss": 0.95474033, + "memory(GiB)": 302.58, + "step": 320900, + "train_speed(iter/s)": 0.123459 + }, + { + "acc": 0.77836528, + "epoch": 1.7947366434252783, + "grad_norm": 7.0625, + "learning_rate": 2.852024520000152e-07, + "loss": 0.85554628, + "memory(GiB)": 302.58, + "step": 320920, + "train_speed(iter/s)": 0.123463 + }, + { + "acc": 0.74449024, + "epoch": 1.7948484928982575, + "grad_norm": 8.8125, + "learning_rate": 2.848946914695805e-07, + "loss": 1.03591232, + "memory(GiB)": 302.58, + "step": 320940, + "train_speed(iter/s)": 0.123467 + }, + { + "acc": 0.751298, + "epoch": 1.7949603423712368, + "grad_norm": 7.90625, + "learning_rate": 2.8458709221069804e-07, + "loss": 0.97937355, + "memory(GiB)": 302.58, + "step": 320960, + "train_speed(iter/s)": 0.12347 + }, + { + "acc": 0.75420599, + "epoch": 1.795072191844216, + "grad_norm": 7.4375, + "learning_rate": 2.842796542338899e-07, + "loss": 0.96266108, + "memory(GiB)": 302.58, + "step": 320980, + "train_speed(iter/s)": 0.123474 + }, + { + "acc": 0.74438639, + "epoch": 1.7951840413171953, + "grad_norm": 7.5, + "learning_rate": 2.8397237754967056e-07, + "loss": 1.00326138, + "memory(GiB)": 302.58, + "step": 321000, + "train_speed(iter/s)": 0.123477 + }, + { + "acc": 0.75858455, + "epoch": 1.7952958907901746, + "grad_norm": 6.125, + "learning_rate": 2.836652621685493e-07, + "loss": 0.95527267, + "memory(GiB)": 302.58, + "step": 321020, + "train_speed(iter/s)": 0.123481 + }, + { + "acc": 0.76887312, + "epoch": 1.7954077402631539, + "grad_norm": 6.15625, + "learning_rate": 2.833583081010316e-07, + "loss": 0.88741856, + "memory(GiB)": 302.58, + "step": 321040, + "train_speed(iter/s)": 0.123485 + }, + { + "acc": 0.75625935, + "epoch": 1.7955195897361331, + "grad_norm": 8.5625, + "learning_rate": 2.8305151535761633e-07, + "loss": 0.97257881, + "memory(GiB)": 302.58, + "step": 321060, + "train_speed(iter/s)": 0.123488 + }, + { + "acc": 0.74501748, + "epoch": 1.7956314392091124, + "grad_norm": 6.375, + "learning_rate": 2.827448839487956e-07, + "loss": 0.98511448, + "memory(GiB)": 302.58, + "step": 321080, + "train_speed(iter/s)": 0.123492 + }, + { + "acc": 0.74462094, + "epoch": 1.7957432886820917, + "grad_norm": 8.4375, + "learning_rate": 2.824384138850583e-07, + "loss": 1.0072566, + "memory(GiB)": 302.58, + "step": 321100, + "train_speed(iter/s)": 0.123495 + }, + { + "acc": 0.76003246, + "epoch": 1.795855138155071, + "grad_norm": 5.9375, + "learning_rate": 2.821321051768866e-07, + "loss": 0.93713646, + "memory(GiB)": 302.58, + "step": 321120, + "train_speed(iter/s)": 0.123499 + }, + { + "acc": 0.75030475, + "epoch": 1.7959669876280502, + "grad_norm": 6.09375, + "learning_rate": 2.81825957834756e-07, + "loss": 0.97242746, + "memory(GiB)": 302.58, + "step": 321140, + "train_speed(iter/s)": 0.123503 + }, + { + "acc": 0.75248675, + "epoch": 1.7960788371010294, + "grad_norm": 5.25, + "learning_rate": 2.8151997186913913e-07, + "loss": 0.96073694, + "memory(GiB)": 302.58, + "step": 321160, + "train_speed(iter/s)": 0.123507 + }, + { + "acc": 0.74689584, + "epoch": 1.7961906865740087, + "grad_norm": 6.53125, + "learning_rate": 2.812141472905011e-07, + "loss": 1.00640383, + "memory(GiB)": 302.58, + "step": 321180, + "train_speed(iter/s)": 0.12351 + }, + { + "acc": 0.74855628, + "epoch": 1.796302536046988, + "grad_norm": 7.40625, + "learning_rate": 2.809084841093018e-07, + "loss": 0.96609554, + "memory(GiB)": 302.58, + "step": 321200, + "train_speed(iter/s)": 0.123514 + }, + { + "acc": 0.75181766, + "epoch": 1.7964143855199672, + "grad_norm": 7.5625, + "learning_rate": 2.806029823359968e-07, + "loss": 0.97641277, + "memory(GiB)": 302.58, + "step": 321220, + "train_speed(iter/s)": 0.123518 + }, + { + "acc": 0.74843926, + "epoch": 1.7965262349929465, + "grad_norm": 5.75, + "learning_rate": 2.8029764198103383e-07, + "loss": 0.99102306, + "memory(GiB)": 302.58, + "step": 321240, + "train_speed(iter/s)": 0.123521 + }, + { + "acc": 0.75621247, + "epoch": 1.7966380844659258, + "grad_norm": 6.25, + "learning_rate": 2.799924630548578e-07, + "loss": 0.95587225, + "memory(GiB)": 302.58, + "step": 321260, + "train_speed(iter/s)": 0.123525 + }, + { + "acc": 0.73852429, + "epoch": 1.796749933938905, + "grad_norm": 7.84375, + "learning_rate": 2.7968744556790603e-07, + "loss": 1.04579391, + "memory(GiB)": 302.58, + "step": 321280, + "train_speed(iter/s)": 0.123529 + }, + { + "acc": 0.75215058, + "epoch": 1.7968617834118843, + "grad_norm": 8.0625, + "learning_rate": 2.793825895306107e-07, + "loss": 0.96729116, + "memory(GiB)": 302.58, + "step": 321300, + "train_speed(iter/s)": 0.123533 + }, + { + "acc": 0.73545089, + "epoch": 1.7969736328848636, + "grad_norm": 7.96875, + "learning_rate": 2.790778949534001e-07, + "loss": 1.04284382, + "memory(GiB)": 302.58, + "step": 321320, + "train_speed(iter/s)": 0.123536 + }, + { + "acc": 0.73781681, + "epoch": 1.7970854823578428, + "grad_norm": 6.8125, + "learning_rate": 2.7877336184669537e-07, + "loss": 1.0418561, + "memory(GiB)": 302.58, + "step": 321340, + "train_speed(iter/s)": 0.12354 + }, + { + "acc": 0.75115113, + "epoch": 1.797197331830822, + "grad_norm": 9.1875, + "learning_rate": 2.78468990220912e-07, + "loss": 0.98963337, + "memory(GiB)": 302.58, + "step": 321360, + "train_speed(iter/s)": 0.123543 + }, + { + "acc": 0.75129762, + "epoch": 1.7973091813038014, + "grad_norm": 5.46875, + "learning_rate": 2.781647800864612e-07, + "loss": 0.9619956, + "memory(GiB)": 302.58, + "step": 321380, + "train_speed(iter/s)": 0.123547 + }, + { + "acc": 0.73877544, + "epoch": 1.7974210307767806, + "grad_norm": 6.03125, + "learning_rate": 2.778607314537468e-07, + "loss": 1.04332743, + "memory(GiB)": 302.58, + "step": 321400, + "train_speed(iter/s)": 0.12355 + }, + { + "acc": 0.74939175, + "epoch": 1.7975328802497599, + "grad_norm": 6.84375, + "learning_rate": 2.7755684433316944e-07, + "loss": 0.98617592, + "memory(GiB)": 302.58, + "step": 321420, + "train_speed(iter/s)": 0.123554 + }, + { + "acc": 0.75384049, + "epoch": 1.7976447297227391, + "grad_norm": 9.0, + "learning_rate": 2.7725311873512185e-07, + "loss": 0.99739838, + "memory(GiB)": 302.58, + "step": 321440, + "train_speed(iter/s)": 0.123558 + }, + { + "acc": 0.75083418, + "epoch": 1.7977565791957184, + "grad_norm": 7.0625, + "learning_rate": 2.769495546699935e-07, + "loss": 0.98839931, + "memory(GiB)": 302.58, + "step": 321460, + "train_speed(iter/s)": 0.123561 + }, + { + "acc": 0.75282946, + "epoch": 1.7978684286686977, + "grad_norm": 7.125, + "learning_rate": 2.766461521481667e-07, + "loss": 0.97284956, + "memory(GiB)": 302.58, + "step": 321480, + "train_speed(iter/s)": 0.123565 + }, + { + "acc": 0.74844627, + "epoch": 1.797980278141677, + "grad_norm": 6.0, + "learning_rate": 2.7634291118001866e-07, + "loss": 0.99651756, + "memory(GiB)": 302.58, + "step": 321500, + "train_speed(iter/s)": 0.123569 + }, + { + "acc": 0.76056361, + "epoch": 1.7980921276146562, + "grad_norm": 9.1875, + "learning_rate": 2.760398317759216e-07, + "loss": 0.94579697, + "memory(GiB)": 302.58, + "step": 321520, + "train_speed(iter/s)": 0.123572 + }, + { + "acc": 0.77324352, + "epoch": 1.7982039770876355, + "grad_norm": 8.375, + "learning_rate": 2.757369139462418e-07, + "loss": 0.89089718, + "memory(GiB)": 302.58, + "step": 321540, + "train_speed(iter/s)": 0.123576 + }, + { + "acc": 0.75993357, + "epoch": 1.7983158265606147, + "grad_norm": 9.6875, + "learning_rate": 2.7543415770133917e-07, + "loss": 0.95423136, + "memory(GiB)": 302.58, + "step": 321560, + "train_speed(iter/s)": 0.123579 + }, + { + "acc": 0.73854818, + "epoch": 1.798427676033594, + "grad_norm": 8.3125, + "learning_rate": 2.7513156305156886e-07, + "loss": 1.03164864, + "memory(GiB)": 302.58, + "step": 321580, + "train_speed(iter/s)": 0.123583 + }, + { + "acc": 0.76711473, + "epoch": 1.7985395255065733, + "grad_norm": 5.96875, + "learning_rate": 2.748291300072825e-07, + "loss": 0.89083023, + "memory(GiB)": 302.58, + "step": 321600, + "train_speed(iter/s)": 0.123586 + }, + { + "acc": 0.74431467, + "epoch": 1.7986513749795525, + "grad_norm": 5.375, + "learning_rate": 2.7452685857882245e-07, + "loss": 0.9929636, + "memory(GiB)": 302.58, + "step": 321620, + "train_speed(iter/s)": 0.12359 + }, + { + "acc": 0.71239052, + "epoch": 1.7987632244525318, + "grad_norm": 8.25, + "learning_rate": 2.7422474877652814e-07, + "loss": 1.160783, + "memory(GiB)": 302.58, + "step": 321640, + "train_speed(iter/s)": 0.123594 + }, + { + "acc": 0.75543261, + "epoch": 1.798875073925511, + "grad_norm": 5.53125, + "learning_rate": 2.739228006107325e-07, + "loss": 0.95517826, + "memory(GiB)": 302.58, + "step": 321660, + "train_speed(iter/s)": 0.123597 + }, + { + "acc": 0.74560652, + "epoch": 1.7989869233984903, + "grad_norm": 8.1875, + "learning_rate": 2.736210140917633e-07, + "loss": 0.99913988, + "memory(GiB)": 302.58, + "step": 321680, + "train_speed(iter/s)": 0.123601 + }, + { + "acc": 0.74710264, + "epoch": 1.7990987728714696, + "grad_norm": 6.8125, + "learning_rate": 2.7331938922994183e-07, + "loss": 1.00412016, + "memory(GiB)": 302.58, + "step": 321700, + "train_speed(iter/s)": 0.123605 + }, + { + "acc": 0.76203017, + "epoch": 1.7992106223444488, + "grad_norm": 10.875, + "learning_rate": 2.7301792603558585e-07, + "loss": 0.90359783, + "memory(GiB)": 302.58, + "step": 321720, + "train_speed(iter/s)": 0.123608 + }, + { + "acc": 0.75840001, + "epoch": 1.799322471817428, + "grad_norm": 10.3125, + "learning_rate": 2.7271662451900494e-07, + "loss": 0.9665082, + "memory(GiB)": 302.58, + "step": 321740, + "train_speed(iter/s)": 0.123612 + }, + { + "acc": 0.73653951, + "epoch": 1.7994343212904074, + "grad_norm": 8.25, + "learning_rate": 2.7241548469050584e-07, + "loss": 1.02241945, + "memory(GiB)": 302.58, + "step": 321760, + "train_speed(iter/s)": 0.123616 + }, + { + "acc": 0.77005439, + "epoch": 1.7995461707633866, + "grad_norm": 7.03125, + "learning_rate": 2.7211450656038753e-07, + "loss": 0.87892799, + "memory(GiB)": 302.58, + "step": 321780, + "train_speed(iter/s)": 0.123619 + }, + { + "acc": 0.74958663, + "epoch": 1.799658020236366, + "grad_norm": 8.1875, + "learning_rate": 2.718136901389445e-07, + "loss": 0.99208117, + "memory(GiB)": 302.58, + "step": 321800, + "train_speed(iter/s)": 0.123623 + }, + { + "acc": 0.74642448, + "epoch": 1.7997698697093452, + "grad_norm": 6.09375, + "learning_rate": 2.7151303543646647e-07, + "loss": 1.02958345, + "memory(GiB)": 302.58, + "step": 321820, + "train_speed(iter/s)": 0.123626 + }, + { + "acc": 0.74662194, + "epoch": 1.7998817191823244, + "grad_norm": 6.34375, + "learning_rate": 2.712125424632356e-07, + "loss": 1.00487509, + "memory(GiB)": 302.58, + "step": 321840, + "train_speed(iter/s)": 0.12363 + }, + { + "acc": 0.76296101, + "epoch": 1.7999935686553037, + "grad_norm": 6.71875, + "learning_rate": 2.7091221122953096e-07, + "loss": 0.91159191, + "memory(GiB)": 302.58, + "step": 321860, + "train_speed(iter/s)": 0.123633 + }, + { + "acc": 0.77596111, + "epoch": 1.800105418128283, + "grad_norm": 9.5625, + "learning_rate": 2.7061204174562426e-07, + "loss": 0.88413963, + "memory(GiB)": 302.58, + "step": 321880, + "train_speed(iter/s)": 0.123637 + }, + { + "acc": 0.73863149, + "epoch": 1.8002172676012622, + "grad_norm": 9.875, + "learning_rate": 2.7031203402178183e-07, + "loss": 1.01261387, + "memory(GiB)": 302.58, + "step": 321900, + "train_speed(iter/s)": 0.12364 + }, + { + "acc": 0.75415545, + "epoch": 1.8003291170742415, + "grad_norm": 9.4375, + "learning_rate": 2.70012188068266e-07, + "loss": 0.95621443, + "memory(GiB)": 302.58, + "step": 321920, + "train_speed(iter/s)": 0.123644 + }, + { + "acc": 0.7451498, + "epoch": 1.8004409665472207, + "grad_norm": 7.25, + "learning_rate": 2.697125038953313e-07, + "loss": 1.01634674, + "memory(GiB)": 302.58, + "step": 321940, + "train_speed(iter/s)": 0.123648 + }, + { + "acc": 0.73375645, + "epoch": 1.8005528160202, + "grad_norm": 6.53125, + "learning_rate": 2.6941298151322795e-07, + "loss": 1.04226055, + "memory(GiB)": 302.58, + "step": 321960, + "train_speed(iter/s)": 0.123652 + }, + { + "acc": 0.75648756, + "epoch": 1.8006646654931793, + "grad_norm": 7.28125, + "learning_rate": 2.691136209322015e-07, + "loss": 0.94851093, + "memory(GiB)": 302.58, + "step": 321980, + "train_speed(iter/s)": 0.123656 + }, + { + "acc": 0.74044828, + "epoch": 1.8007765149661585, + "grad_norm": 10.6875, + "learning_rate": 2.6881442216248997e-07, + "loss": 1.04269972, + "memory(GiB)": 302.58, + "step": 322000, + "train_speed(iter/s)": 0.123659 + }, + { + "epoch": 1.8007765149661585, + "eval_acc": 0.7068932386401225, + "eval_loss": 1.0117928981781006, + "eval_runtime": 7567.5002, + "eval_samples_per_second": 9.948, + "eval_steps_per_second": 9.948, + "step": 322000 + }, + { + "acc": 0.74306517, + "epoch": 1.8008883644391378, + "grad_norm": 8.875, + "learning_rate": 2.6851538521432794e-07, + "loss": 0.99790411, + "memory(GiB)": 302.58, + "step": 322020, + "train_speed(iter/s)": 0.123298 + }, + { + "acc": 0.77339897, + "epoch": 1.801000213912117, + "grad_norm": 5.59375, + "learning_rate": 2.6821651009794226e-07, + "loss": 0.87761269, + "memory(GiB)": 302.58, + "step": 322040, + "train_speed(iter/s)": 0.123302 + }, + { + "acc": 0.75697813, + "epoch": 1.8011120633850963, + "grad_norm": 4.3125, + "learning_rate": 2.6791779682355635e-07, + "loss": 0.95956545, + "memory(GiB)": 302.58, + "step": 322060, + "train_speed(iter/s)": 0.123305 + }, + { + "acc": 0.74767861, + "epoch": 1.8012239128580756, + "grad_norm": 9.25, + "learning_rate": 2.676192454013871e-07, + "loss": 0.99372396, + "memory(GiB)": 302.58, + "step": 322080, + "train_speed(iter/s)": 0.123309 + }, + { + "acc": 0.74039869, + "epoch": 1.8013357623310549, + "grad_norm": 5.53125, + "learning_rate": 2.6732085584164515e-07, + "loss": 1.01141253, + "memory(GiB)": 302.58, + "step": 322100, + "train_speed(iter/s)": 0.123312 + }, + { + "acc": 0.7470531, + "epoch": 1.8014476118040341, + "grad_norm": 7.0625, + "learning_rate": 2.6702262815453626e-07, + "loss": 0.99137325, + "memory(GiB)": 302.58, + "step": 322120, + "train_speed(iter/s)": 0.123316 + }, + { + "acc": 0.75807014, + "epoch": 1.8015594612770134, + "grad_norm": 8.375, + "learning_rate": 2.667245623502618e-07, + "loss": 0.97129574, + "memory(GiB)": 302.58, + "step": 322140, + "train_speed(iter/s)": 0.12332 + }, + { + "acc": 0.75649972, + "epoch": 1.8016713107499926, + "grad_norm": 8.4375, + "learning_rate": 2.6642665843901684e-07, + "loss": 0.95516672, + "memory(GiB)": 302.58, + "step": 322160, + "train_speed(iter/s)": 0.123323 + }, + { + "acc": 0.75920115, + "epoch": 1.801783160222972, + "grad_norm": 6.0625, + "learning_rate": 2.6612891643098937e-07, + "loss": 0.93406954, + "memory(GiB)": 302.58, + "step": 322180, + "train_speed(iter/s)": 0.123327 + }, + { + "acc": 0.74729834, + "epoch": 1.8018950096959512, + "grad_norm": 7.375, + "learning_rate": 2.658313363363635e-07, + "loss": 1.00111713, + "memory(GiB)": 302.58, + "step": 322200, + "train_speed(iter/s)": 0.12333 + }, + { + "acc": 0.74874859, + "epoch": 1.8020068591689304, + "grad_norm": 6.84375, + "learning_rate": 2.6553391816531827e-07, + "loss": 0.98348951, + "memory(GiB)": 302.58, + "step": 322220, + "train_speed(iter/s)": 0.123334 + }, + { + "acc": 0.74953394, + "epoch": 1.8021187086419097, + "grad_norm": 10.6875, + "learning_rate": 2.65236661928025e-07, + "loss": 0.99601221, + "memory(GiB)": 302.58, + "step": 322240, + "train_speed(iter/s)": 0.123337 + }, + { + "acc": 0.7390501, + "epoch": 1.802230558114889, + "grad_norm": 7.59375, + "learning_rate": 2.6493956763465167e-07, + "loss": 1.01968164, + "memory(GiB)": 302.58, + "step": 322260, + "train_speed(iter/s)": 0.123341 + }, + { + "acc": 0.74031072, + "epoch": 1.8023424075878682, + "grad_norm": 8.9375, + "learning_rate": 2.646426352953585e-07, + "loss": 1.02845688, + "memory(GiB)": 302.58, + "step": 322280, + "train_speed(iter/s)": 0.123345 + }, + { + "acc": 0.75205541, + "epoch": 1.8024542570608475, + "grad_norm": 10.25, + "learning_rate": 2.64345864920304e-07, + "loss": 0.96923571, + "memory(GiB)": 302.58, + "step": 322300, + "train_speed(iter/s)": 0.123348 + }, + { + "acc": 0.75285773, + "epoch": 1.8025661065338268, + "grad_norm": 6.125, + "learning_rate": 2.6404925651963675e-07, + "loss": 0.97654724, + "memory(GiB)": 302.58, + "step": 322320, + "train_speed(iter/s)": 0.123352 + }, + { + "acc": 0.73963099, + "epoch": 1.802677956006806, + "grad_norm": 8.0625, + "learning_rate": 2.637528101035025e-07, + "loss": 1.0200881, + "memory(GiB)": 302.58, + "step": 322340, + "train_speed(iter/s)": 0.123356 + }, + { + "acc": 0.75119734, + "epoch": 1.8027898054797853, + "grad_norm": 10.8125, + "learning_rate": 2.634565256820404e-07, + "loss": 0.98355188, + "memory(GiB)": 302.58, + "step": 322360, + "train_speed(iter/s)": 0.123359 + }, + { + "acc": 0.74800472, + "epoch": 1.8029016549527646, + "grad_norm": 9.3125, + "learning_rate": 2.6316040326538396e-07, + "loss": 0.99577532, + "memory(GiB)": 302.58, + "step": 322380, + "train_speed(iter/s)": 0.123363 + }, + { + "acc": 0.74190769, + "epoch": 1.8030135044257438, + "grad_norm": 6.625, + "learning_rate": 2.628644428636623e-07, + "loss": 1.02110195, + "memory(GiB)": 302.58, + "step": 322400, + "train_speed(iter/s)": 0.123366 + }, + { + "acc": 0.74756651, + "epoch": 1.803125353898723, + "grad_norm": 8.6875, + "learning_rate": 2.6256864448699736e-07, + "loss": 1.00308743, + "memory(GiB)": 302.58, + "step": 322420, + "train_speed(iter/s)": 0.12337 + }, + { + "acc": 0.74827614, + "epoch": 1.8032372033717023, + "grad_norm": 8.5625, + "learning_rate": 2.6227300814550606e-07, + "loss": 1.01202736, + "memory(GiB)": 302.58, + "step": 322440, + "train_speed(iter/s)": 0.123374 + }, + { + "acc": 0.7448935, + "epoch": 1.8033490528446816, + "grad_norm": 7.0625, + "learning_rate": 2.619775338493019e-07, + "loss": 1.0023963, + "memory(GiB)": 302.58, + "step": 322460, + "train_speed(iter/s)": 0.123377 + }, + { + "acc": 0.74666204, + "epoch": 1.8034609023176609, + "grad_norm": 7.46875, + "learning_rate": 2.616822216084897e-07, + "loss": 0.98485088, + "memory(GiB)": 302.58, + "step": 322480, + "train_speed(iter/s)": 0.123381 + }, + { + "acc": 0.76478271, + "epoch": 1.8035727517906401, + "grad_norm": 6.34375, + "learning_rate": 2.613870714331701e-07, + "loss": 0.91446991, + "memory(GiB)": 302.58, + "step": 322500, + "train_speed(iter/s)": 0.123384 + }, + { + "acc": 0.76697845, + "epoch": 1.8036846012636194, + "grad_norm": 9.375, + "learning_rate": 2.610920833334385e-07, + "loss": 0.90703211, + "memory(GiB)": 302.58, + "step": 322520, + "train_speed(iter/s)": 0.123388 + }, + { + "acc": 0.74110222, + "epoch": 1.8037964507365987, + "grad_norm": 9.25, + "learning_rate": 2.6079725731938453e-07, + "loss": 1.03056631, + "memory(GiB)": 302.58, + "step": 322540, + "train_speed(iter/s)": 0.123392 + }, + { + "acc": 0.7563036, + "epoch": 1.803908300209578, + "grad_norm": 6.875, + "learning_rate": 2.605025934010919e-07, + "loss": 0.94647112, + "memory(GiB)": 302.58, + "step": 322560, + "train_speed(iter/s)": 0.123396 + }, + { + "acc": 0.75846887, + "epoch": 1.8040201496825572, + "grad_norm": 7.71875, + "learning_rate": 2.60208091588639e-07, + "loss": 0.96144896, + "memory(GiB)": 302.58, + "step": 322580, + "train_speed(iter/s)": 0.123399 + }, + { + "acc": 0.75003567, + "epoch": 1.8041319991555365, + "grad_norm": 5.9375, + "learning_rate": 2.5991375189209913e-07, + "loss": 0.97893696, + "memory(GiB)": 302.58, + "step": 322600, + "train_speed(iter/s)": 0.123403 + }, + { + "acc": 0.74799442, + "epoch": 1.8042438486285157, + "grad_norm": 9.625, + "learning_rate": 2.5961957432153904e-07, + "loss": 1.02108164, + "memory(GiB)": 302.58, + "step": 322620, + "train_speed(iter/s)": 0.123406 + }, + { + "acc": 0.76449037, + "epoch": 1.804355698101495, + "grad_norm": 5.25, + "learning_rate": 2.593255588870208e-07, + "loss": 0.90359068, + "memory(GiB)": 302.58, + "step": 322640, + "train_speed(iter/s)": 0.12341 + }, + { + "acc": 0.74437575, + "epoch": 1.8044675475744743, + "grad_norm": 6.8125, + "learning_rate": 2.590317055986008e-07, + "loss": 1.01681986, + "memory(GiB)": 302.58, + "step": 322660, + "train_speed(iter/s)": 0.123414 + }, + { + "acc": 0.73702164, + "epoch": 1.8045793970474535, + "grad_norm": 4.78125, + "learning_rate": 2.5873801446632985e-07, + "loss": 1.04282274, + "memory(GiB)": 302.58, + "step": 322680, + "train_speed(iter/s)": 0.123418 + }, + { + "acc": 0.75602589, + "epoch": 1.8046912465204328, + "grad_norm": 7.40625, + "learning_rate": 2.584444855002527e-07, + "loss": 0.96930933, + "memory(GiB)": 302.58, + "step": 322700, + "train_speed(iter/s)": 0.123421 + }, + { + "acc": 0.73654451, + "epoch": 1.804803095993412, + "grad_norm": 4.5625, + "learning_rate": 2.5815111871040854e-07, + "loss": 1.05544472, + "memory(GiB)": 302.58, + "step": 322720, + "train_speed(iter/s)": 0.123425 + }, + { + "acc": 0.75917473, + "epoch": 1.8049149454663913, + "grad_norm": 7.28125, + "learning_rate": 2.5785791410683326e-07, + "loss": 0.93968315, + "memory(GiB)": 302.58, + "step": 322740, + "train_speed(iter/s)": 0.123428 + }, + { + "acc": 0.77469711, + "epoch": 1.8050267949393706, + "grad_norm": 11.375, + "learning_rate": 2.575648716995538e-07, + "loss": 0.85767717, + "memory(GiB)": 302.58, + "step": 322760, + "train_speed(iter/s)": 0.123432 + }, + { + "acc": 0.76087227, + "epoch": 1.8051386444123498, + "grad_norm": 6.1875, + "learning_rate": 2.572719914985933e-07, + "loss": 0.95161505, + "memory(GiB)": 302.58, + "step": 322780, + "train_speed(iter/s)": 0.123435 + }, + { + "acc": 0.74712071, + "epoch": 1.805250493885329, + "grad_norm": 5.40625, + "learning_rate": 2.5697927351396977e-07, + "loss": 1.0049469, + "memory(GiB)": 302.58, + "step": 322800, + "train_speed(iter/s)": 0.123439 + }, + { + "acc": 0.75077596, + "epoch": 1.8053623433583084, + "grad_norm": 8.875, + "learning_rate": 2.5668671775569533e-07, + "loss": 0.95830965, + "memory(GiB)": 302.58, + "step": 322820, + "train_speed(iter/s)": 0.123442 + }, + { + "acc": 0.75525732, + "epoch": 1.8054741928312876, + "grad_norm": 9.875, + "learning_rate": 2.563943242337752e-07, + "loss": 0.95796814, + "memory(GiB)": 302.58, + "step": 322840, + "train_speed(iter/s)": 0.123446 + }, + { + "acc": 0.7477798, + "epoch": 1.805586042304267, + "grad_norm": 7.25, + "learning_rate": 2.561020929582109e-07, + "loss": 0.99499989, + "memory(GiB)": 302.58, + "step": 322860, + "train_speed(iter/s)": 0.123449 + }, + { + "acc": 0.75444031, + "epoch": 1.8056978917772462, + "grad_norm": 4.6875, + "learning_rate": 2.558100239389971e-07, + "loss": 0.94203386, + "memory(GiB)": 302.58, + "step": 322880, + "train_speed(iter/s)": 0.123453 + }, + { + "acc": 0.74121571, + "epoch": 1.8058097412502254, + "grad_norm": 7.15625, + "learning_rate": 2.5551811718612427e-07, + "loss": 1.01809978, + "memory(GiB)": 302.58, + "step": 322900, + "train_speed(iter/s)": 0.123456 + }, + { + "acc": 0.74516635, + "epoch": 1.8059215907232047, + "grad_norm": 10.625, + "learning_rate": 2.552263727095766e-07, + "loss": 0.97383118, + "memory(GiB)": 302.58, + "step": 322920, + "train_speed(iter/s)": 0.12346 + }, + { + "acc": 0.75328574, + "epoch": 1.806033440196184, + "grad_norm": 9.4375, + "learning_rate": 2.5493479051933165e-07, + "loss": 0.95370226, + "memory(GiB)": 302.58, + "step": 322940, + "train_speed(iter/s)": 0.123464 + }, + { + "acc": 0.74611707, + "epoch": 1.8061452896691632, + "grad_norm": 7.6875, + "learning_rate": 2.5464337062536316e-07, + "loss": 0.99655733, + "memory(GiB)": 302.58, + "step": 322960, + "train_speed(iter/s)": 0.123467 + }, + { + "acc": 0.73263226, + "epoch": 1.8062571391421425, + "grad_norm": 9.0, + "learning_rate": 2.5435211303763864e-07, + "loss": 1.03220205, + "memory(GiB)": 302.58, + "step": 322980, + "train_speed(iter/s)": 0.123471 + }, + { + "acc": 0.75619392, + "epoch": 1.8063689886151217, + "grad_norm": 4.625, + "learning_rate": 2.5406101776611904e-07, + "loss": 0.96798506, + "memory(GiB)": 302.58, + "step": 323000, + "train_speed(iter/s)": 0.123475 + }, + { + "acc": 0.75217147, + "epoch": 1.806480838088101, + "grad_norm": 7.6875, + "learning_rate": 2.537700848207625e-07, + "loss": 0.96892738, + "memory(GiB)": 302.58, + "step": 323020, + "train_speed(iter/s)": 0.123478 + }, + { + "acc": 0.74162912, + "epoch": 1.8065926875610803, + "grad_norm": 5.75, + "learning_rate": 2.534793142115188e-07, + "loss": 1.0207839, + "memory(GiB)": 302.58, + "step": 323040, + "train_speed(iter/s)": 0.123482 + }, + { + "acc": 0.76235938, + "epoch": 1.8067045370340595, + "grad_norm": 7.1875, + "learning_rate": 2.5318870594833345e-07, + "loss": 0.94885597, + "memory(GiB)": 302.58, + "step": 323060, + "train_speed(iter/s)": 0.123485 + }, + { + "acc": 0.76252947, + "epoch": 1.8068163865070388, + "grad_norm": 6.59375, + "learning_rate": 2.5289826004114614e-07, + "loss": 0.94799871, + "memory(GiB)": 302.58, + "step": 323080, + "train_speed(iter/s)": 0.123489 + }, + { + "acc": 0.75200415, + "epoch": 1.806928235980018, + "grad_norm": 10.0, + "learning_rate": 2.526079764998912e-07, + "loss": 0.97412262, + "memory(GiB)": 302.58, + "step": 323100, + "train_speed(iter/s)": 0.123492 + }, + { + "acc": 0.73444743, + "epoch": 1.8070400854529973, + "grad_norm": 8.4375, + "learning_rate": 2.5231785533449683e-07, + "loss": 1.04567509, + "memory(GiB)": 302.58, + "step": 323120, + "train_speed(iter/s)": 0.123496 + }, + { + "acc": 0.75509186, + "epoch": 1.8071519349259766, + "grad_norm": 7.84375, + "learning_rate": 2.5202789655488615e-07, + "loss": 0.97261171, + "memory(GiB)": 302.58, + "step": 323140, + "train_speed(iter/s)": 0.1235 + }, + { + "acc": 0.75222259, + "epoch": 1.8072637843989559, + "grad_norm": 7.84375, + "learning_rate": 2.5173810017097677e-07, + "loss": 0.96228046, + "memory(GiB)": 302.58, + "step": 323160, + "train_speed(iter/s)": 0.123503 + }, + { + "acc": 0.74803967, + "epoch": 1.8073756338719351, + "grad_norm": 6.90625, + "learning_rate": 2.514484661926808e-07, + "loss": 1.002806, + "memory(GiB)": 302.58, + "step": 323180, + "train_speed(iter/s)": 0.123507 + }, + { + "acc": 0.74123278, + "epoch": 1.8074874833449144, + "grad_norm": 8.0625, + "learning_rate": 2.5115899462990424e-07, + "loss": 1.01094761, + "memory(GiB)": 302.58, + "step": 323200, + "train_speed(iter/s)": 0.12351 + }, + { + "acc": 0.73029532, + "epoch": 1.8075993328178936, + "grad_norm": 7.65625, + "learning_rate": 2.5086968549254854e-07, + "loss": 1.06024609, + "memory(GiB)": 302.58, + "step": 323220, + "train_speed(iter/s)": 0.123514 + }, + { + "acc": 0.751087, + "epoch": 1.807711182290873, + "grad_norm": 7.15625, + "learning_rate": 2.505805387905091e-07, + "loss": 0.97959652, + "memory(GiB)": 302.58, + "step": 323240, + "train_speed(iter/s)": 0.123518 + }, + { + "acc": 0.74604239, + "epoch": 1.8078230317638522, + "grad_norm": 6.625, + "learning_rate": 2.502915545336737e-07, + "loss": 1.00590048, + "memory(GiB)": 302.58, + "step": 323260, + "train_speed(iter/s)": 0.123521 + }, + { + "acc": 0.76287146, + "epoch": 1.8079348812368314, + "grad_norm": 8.25, + "learning_rate": 2.500027327319293e-07, + "loss": 0.91917839, + "memory(GiB)": 302.58, + "step": 323280, + "train_speed(iter/s)": 0.123525 + }, + { + "acc": 0.74891729, + "epoch": 1.8080467307098107, + "grad_norm": 7.625, + "learning_rate": 2.497140733951531e-07, + "loss": 0.97646008, + "memory(GiB)": 302.58, + "step": 323300, + "train_speed(iter/s)": 0.123528 + }, + { + "acc": 0.75059519, + "epoch": 1.80815858018279, + "grad_norm": 7.96875, + "learning_rate": 2.4942557653321817e-07, + "loss": 0.96084042, + "memory(GiB)": 302.58, + "step": 323320, + "train_speed(iter/s)": 0.123532 + }, + { + "acc": 0.74576249, + "epoch": 1.8082704296557692, + "grad_norm": 7.875, + "learning_rate": 2.491372421559929e-07, + "loss": 1.02153311, + "memory(GiB)": 302.58, + "step": 323340, + "train_speed(iter/s)": 0.123536 + }, + { + "acc": 0.75366874, + "epoch": 1.8083822791287485, + "grad_norm": 8.5625, + "learning_rate": 2.488490702733376e-07, + "loss": 0.94596348, + "memory(GiB)": 302.58, + "step": 323360, + "train_speed(iter/s)": 0.123539 + }, + { + "acc": 0.76423726, + "epoch": 1.8084941286017278, + "grad_norm": 7.0, + "learning_rate": 2.4856106089511e-07, + "loss": 0.93889236, + "memory(GiB)": 302.58, + "step": 323380, + "train_speed(iter/s)": 0.123543 + }, + { + "acc": 0.75148501, + "epoch": 1.808605978074707, + "grad_norm": 8.5625, + "learning_rate": 2.4827321403116054e-07, + "loss": 0.9518527, + "memory(GiB)": 302.58, + "step": 323400, + "train_speed(iter/s)": 0.123546 + }, + { + "acc": 0.74781146, + "epoch": 1.8087178275476863, + "grad_norm": 6.9375, + "learning_rate": 2.479855296913347e-07, + "loss": 0.99998035, + "memory(GiB)": 302.58, + "step": 323420, + "train_speed(iter/s)": 0.12355 + }, + { + "acc": 0.76320024, + "epoch": 1.8088296770206655, + "grad_norm": 7.375, + "learning_rate": 2.4769800788547183e-07, + "loss": 0.94468708, + "memory(GiB)": 302.58, + "step": 323440, + "train_speed(iter/s)": 0.123554 + }, + { + "acc": 0.75367351, + "epoch": 1.8089415264936448, + "grad_norm": 8.625, + "learning_rate": 2.474106486234057e-07, + "loss": 0.97750711, + "memory(GiB)": 302.58, + "step": 323460, + "train_speed(iter/s)": 0.123557 + }, + { + "acc": 0.7512938, + "epoch": 1.809053375966624, + "grad_norm": 7.65625, + "learning_rate": 2.4712345191496624e-07, + "loss": 0.97736731, + "memory(GiB)": 302.58, + "step": 323480, + "train_speed(iter/s)": 0.12356 + }, + { + "acc": 0.75050869, + "epoch": 1.8091652254396033, + "grad_norm": 6.09375, + "learning_rate": 2.46836417769975e-07, + "loss": 0.99810057, + "memory(GiB)": 302.58, + "step": 323500, + "train_speed(iter/s)": 0.123564 + }, + { + "acc": 0.75419197, + "epoch": 1.8092770749125826, + "grad_norm": 7.65625, + "learning_rate": 2.465495461982509e-07, + "loss": 0.95204067, + "memory(GiB)": 302.58, + "step": 323520, + "train_speed(iter/s)": 0.123568 + }, + { + "acc": 0.75248237, + "epoch": 1.8093889243855619, + "grad_norm": 11.0625, + "learning_rate": 2.4626283720960376e-07, + "loss": 0.96456423, + "memory(GiB)": 302.58, + "step": 323540, + "train_speed(iter/s)": 0.123571 + }, + { + "acc": 0.75005922, + "epoch": 1.8095007738585411, + "grad_norm": 5.34375, + "learning_rate": 2.4597629081384246e-07, + "loss": 0.96164875, + "memory(GiB)": 302.58, + "step": 323560, + "train_speed(iter/s)": 0.123575 + }, + { + "acc": 0.75235906, + "epoch": 1.8096126233315204, + "grad_norm": 6.84375, + "learning_rate": 2.4568990702076634e-07, + "loss": 0.97852564, + "memory(GiB)": 302.58, + "step": 323580, + "train_speed(iter/s)": 0.123579 + }, + { + "acc": 0.75721898, + "epoch": 1.8097244728044997, + "grad_norm": 9.0, + "learning_rate": 2.4540368584016985e-07, + "loss": 0.95856342, + "memory(GiB)": 302.58, + "step": 323600, + "train_speed(iter/s)": 0.123582 + }, + { + "acc": 0.75358319, + "epoch": 1.809836322277479, + "grad_norm": 6.90625, + "learning_rate": 2.4511762728184506e-07, + "loss": 0.97486696, + "memory(GiB)": 302.58, + "step": 323620, + "train_speed(iter/s)": 0.123585 + }, + { + "acc": 0.74946747, + "epoch": 1.8099481717504582, + "grad_norm": 6.03125, + "learning_rate": 2.4483173135557425e-07, + "loss": 0.99314919, + "memory(GiB)": 302.58, + "step": 323640, + "train_speed(iter/s)": 0.123589 + }, + { + "acc": 0.77094517, + "epoch": 1.8100600212234375, + "grad_norm": 5.78125, + "learning_rate": 2.445459980711368e-07, + "loss": 0.88864126, + "memory(GiB)": 302.58, + "step": 323660, + "train_speed(iter/s)": 0.123593 + }, + { + "acc": 0.75844083, + "epoch": 1.8101718706964167, + "grad_norm": 6.59375, + "learning_rate": 2.4426042743830536e-07, + "loss": 0.9475296, + "memory(GiB)": 302.58, + "step": 323680, + "train_speed(iter/s)": 0.123596 + }, + { + "acc": 0.76208682, + "epoch": 1.810283720169396, + "grad_norm": 5.1875, + "learning_rate": 2.439750194668472e-07, + "loss": 0.91579561, + "memory(GiB)": 302.58, + "step": 323700, + "train_speed(iter/s)": 0.1236 + }, + { + "acc": 0.74482784, + "epoch": 1.8103955696423752, + "grad_norm": 7.1875, + "learning_rate": 2.4368977416652393e-07, + "loss": 0.99516258, + "memory(GiB)": 302.58, + "step": 323720, + "train_speed(iter/s)": 0.123603 + }, + { + "acc": 0.75337882, + "epoch": 1.8105074191153545, + "grad_norm": 9.125, + "learning_rate": 2.434046915470928e-07, + "loss": 0.97686901, + "memory(GiB)": 302.58, + "step": 323740, + "train_speed(iter/s)": 0.123607 + }, + { + "acc": 0.74078989, + "epoch": 1.8106192685883338, + "grad_norm": 7.0, + "learning_rate": 2.431197716183037e-07, + "loss": 1.01508512, + "memory(GiB)": 302.58, + "step": 323760, + "train_speed(iter/s)": 0.12361 + }, + { + "acc": 0.76686654, + "epoch": 1.810731118061313, + "grad_norm": 8.8125, + "learning_rate": 2.428350143899022e-07, + "loss": 0.92047024, + "memory(GiB)": 302.58, + "step": 323780, + "train_speed(iter/s)": 0.123614 + }, + { + "acc": 0.7472259, + "epoch": 1.8108429675342923, + "grad_norm": 7.65625, + "learning_rate": 2.4255041987162776e-07, + "loss": 1.00934753, + "memory(GiB)": 302.58, + "step": 323800, + "train_speed(iter/s)": 0.123618 + }, + { + "acc": 0.77236338, + "epoch": 1.8109548170072716, + "grad_norm": 9.1875, + "learning_rate": 2.422659880732148e-07, + "loss": 0.86071148, + "memory(GiB)": 302.58, + "step": 323820, + "train_speed(iter/s)": 0.123621 + }, + { + "acc": 0.76445746, + "epoch": 1.8110666664802508, + "grad_norm": 6.75, + "learning_rate": 2.419817190043905e-07, + "loss": 0.91911011, + "memory(GiB)": 302.58, + "step": 323840, + "train_speed(iter/s)": 0.123625 + }, + { + "acc": 0.74499321, + "epoch": 1.81117851595323, + "grad_norm": 7.15625, + "learning_rate": 2.4169761267487934e-07, + "loss": 1.01226768, + "memory(GiB)": 302.58, + "step": 323860, + "train_speed(iter/s)": 0.123629 + }, + { + "acc": 0.75162306, + "epoch": 1.8112903654262094, + "grad_norm": 7.28125, + "learning_rate": 2.4141366909439744e-07, + "loss": 0.97826328, + "memory(GiB)": 302.58, + "step": 323880, + "train_speed(iter/s)": 0.123632 + }, + { + "acc": 0.74595518, + "epoch": 1.8114022148991886, + "grad_norm": 8.0, + "learning_rate": 2.411298882726576e-07, + "loss": 0.99375458, + "memory(GiB)": 302.58, + "step": 323900, + "train_speed(iter/s)": 0.123635 + }, + { + "acc": 0.75189166, + "epoch": 1.8115140643721679, + "grad_norm": 8.375, + "learning_rate": 2.4084627021936535e-07, + "loss": 0.98282833, + "memory(GiB)": 302.58, + "step": 323920, + "train_speed(iter/s)": 0.123639 + }, + { + "acc": 0.74755483, + "epoch": 1.8116259138451472, + "grad_norm": 6.15625, + "learning_rate": 2.4056281494422185e-07, + "loss": 0.99328089, + "memory(GiB)": 302.58, + "step": 323940, + "train_speed(iter/s)": 0.123643 + }, + { + "acc": 0.75377655, + "epoch": 1.8117377633181264, + "grad_norm": 6.9375, + "learning_rate": 2.4027952245692154e-07, + "loss": 0.95775394, + "memory(GiB)": 302.58, + "step": 323960, + "train_speed(iter/s)": 0.123646 + }, + { + "acc": 0.74703956, + "epoch": 1.8118496127911057, + "grad_norm": 7.40625, + "learning_rate": 2.399963927671545e-07, + "loss": 0.99895182, + "memory(GiB)": 302.58, + "step": 323980, + "train_speed(iter/s)": 0.12365 + }, + { + "acc": 0.74230652, + "epoch": 1.811961462264085, + "grad_norm": 6.96875, + "learning_rate": 2.39713425884604e-07, + "loss": 1.0216918, + "memory(GiB)": 302.58, + "step": 324000, + "train_speed(iter/s)": 0.123653 + }, + { + "epoch": 1.811961462264085, + "eval_acc": 0.7069015697970027, + "eval_loss": 1.0118026733398438, + "eval_runtime": 7530.6404, + "eval_samples_per_second": 9.997, + "eval_steps_per_second": 9.997, + "step": 324000 + }, + { + "acc": 0.76128063, + "epoch": 1.8120733117370642, + "grad_norm": 7.84375, + "learning_rate": 2.3943062181894904e-07, + "loss": 0.92314777, + "memory(GiB)": 302.58, + "step": 324020, + "train_speed(iter/s)": 0.123296 + }, + { + "acc": 0.74104438, + "epoch": 1.8121851612100435, + "grad_norm": 6.8125, + "learning_rate": 2.3914798057986187e-07, + "loss": 0.99446592, + "memory(GiB)": 302.58, + "step": 324040, + "train_speed(iter/s)": 0.1233 + }, + { + "acc": 0.74243164, + "epoch": 1.8122970106830227, + "grad_norm": 5.0, + "learning_rate": 2.388655021770103e-07, + "loss": 1.02619085, + "memory(GiB)": 302.58, + "step": 324060, + "train_speed(iter/s)": 0.123304 + }, + { + "acc": 0.7543221, + "epoch": 1.812408860156002, + "grad_norm": 8.5625, + "learning_rate": 2.385831866200561e-07, + "loss": 0.94405651, + "memory(GiB)": 302.58, + "step": 324080, + "train_speed(iter/s)": 0.123308 + }, + { + "acc": 0.75501437, + "epoch": 1.8125207096289813, + "grad_norm": 9.375, + "learning_rate": 2.3830103391865421e-07, + "loss": 0.95390482, + "memory(GiB)": 302.58, + "step": 324100, + "train_speed(iter/s)": 0.123311 + }, + { + "acc": 0.74310479, + "epoch": 1.8126325591019605, + "grad_norm": 8.1875, + "learning_rate": 2.3801904408245647e-07, + "loss": 1.00705118, + "memory(GiB)": 302.58, + "step": 324120, + "train_speed(iter/s)": 0.123315 + }, + { + "acc": 0.76770554, + "epoch": 1.8127444085749398, + "grad_norm": 8.6875, + "learning_rate": 2.377372171211062e-07, + "loss": 0.89651785, + "memory(GiB)": 302.58, + "step": 324140, + "train_speed(iter/s)": 0.123319 + }, + { + "acc": 0.75823736, + "epoch": 1.812856258047919, + "grad_norm": 5.0625, + "learning_rate": 2.374555530442446e-07, + "loss": 0.95195732, + "memory(GiB)": 302.58, + "step": 324160, + "train_speed(iter/s)": 0.123322 + }, + { + "acc": 0.74563341, + "epoch": 1.8129681075208983, + "grad_norm": 5.78125, + "learning_rate": 2.3717405186150512e-07, + "loss": 1.03319941, + "memory(GiB)": 302.58, + "step": 324180, + "train_speed(iter/s)": 0.123326 + }, + { + "acc": 0.77429838, + "epoch": 1.8130799569938776, + "grad_norm": 7.09375, + "learning_rate": 2.3689271358251498e-07, + "loss": 0.87627344, + "memory(GiB)": 302.58, + "step": 324200, + "train_speed(iter/s)": 0.12333 + }, + { + "acc": 0.75583706, + "epoch": 1.813191806466857, + "grad_norm": 6.9375, + "learning_rate": 2.3661153821689763e-07, + "loss": 0.95340204, + "memory(GiB)": 302.58, + "step": 324220, + "train_speed(iter/s)": 0.123333 + }, + { + "acc": 0.73134084, + "epoch": 1.8133036559398361, + "grad_norm": 8.9375, + "learning_rate": 2.363305257742704e-07, + "loss": 1.05787821, + "memory(GiB)": 302.58, + "step": 324240, + "train_speed(iter/s)": 0.123337 + }, + { + "acc": 0.76278996, + "epoch": 1.8134155054128156, + "grad_norm": 7.0, + "learning_rate": 2.360496762642439e-07, + "loss": 0.92718248, + "memory(GiB)": 302.58, + "step": 324260, + "train_speed(iter/s)": 0.12334 + }, + { + "acc": 0.7714035, + "epoch": 1.8135273548857946, + "grad_norm": 5.78125, + "learning_rate": 2.3576898969642492e-07, + "loss": 0.88389282, + "memory(GiB)": 302.58, + "step": 324280, + "train_speed(iter/s)": 0.123344 + }, + { + "acc": 0.76619387, + "epoch": 1.8136392043587741, + "grad_norm": 8.75, + "learning_rate": 2.35488466080413e-07, + "loss": 0.91579704, + "memory(GiB)": 302.58, + "step": 324300, + "train_speed(iter/s)": 0.123347 + }, + { + "acc": 0.74673905, + "epoch": 1.8137510538317532, + "grad_norm": 8.625, + "learning_rate": 2.3520810542580374e-07, + "loss": 0.99840126, + "memory(GiB)": 302.58, + "step": 324320, + "train_speed(iter/s)": 0.123351 + }, + { + "acc": 0.75436115, + "epoch": 1.8138629033047327, + "grad_norm": 9.125, + "learning_rate": 2.3492790774218565e-07, + "loss": 0.95713587, + "memory(GiB)": 302.58, + "step": 324340, + "train_speed(iter/s)": 0.123354 + }, + { + "acc": 0.74427395, + "epoch": 1.8139747527777117, + "grad_norm": 7.875, + "learning_rate": 2.3464787303914326e-07, + "loss": 1.00136251, + "memory(GiB)": 302.58, + "step": 324360, + "train_speed(iter/s)": 0.123358 + }, + { + "acc": 0.7491642, + "epoch": 1.8140866022506912, + "grad_norm": 8.3125, + "learning_rate": 2.3436800132625338e-07, + "loss": 0.98714924, + "memory(GiB)": 302.58, + "step": 324380, + "train_speed(iter/s)": 0.123362 + }, + { + "acc": 0.76126146, + "epoch": 1.8141984517236702, + "grad_norm": 5.65625, + "learning_rate": 2.3408829261308996e-07, + "loss": 0.92332458, + "memory(GiB)": 302.58, + "step": 324400, + "train_speed(iter/s)": 0.123365 + }, + { + "acc": 0.74010348, + "epoch": 1.8143103011966497, + "grad_norm": 9.0, + "learning_rate": 2.3380874690921817e-07, + "loss": 1.02087812, + "memory(GiB)": 302.58, + "step": 324420, + "train_speed(iter/s)": 0.123369 + }, + { + "acc": 0.76367712, + "epoch": 1.8144221506696288, + "grad_norm": 6.5, + "learning_rate": 2.3352936422420146e-07, + "loss": 0.93531132, + "memory(GiB)": 302.58, + "step": 324440, + "train_speed(iter/s)": 0.123372 + }, + { + "acc": 0.74484558, + "epoch": 1.8145340001426082, + "grad_norm": 8.375, + "learning_rate": 2.3325014456759442e-07, + "loss": 1.02365875, + "memory(GiB)": 302.58, + "step": 324460, + "train_speed(iter/s)": 0.123376 + }, + { + "acc": 0.75998549, + "epoch": 1.8146458496155873, + "grad_norm": 7.8125, + "learning_rate": 2.3297108794894718e-07, + "loss": 0.91485453, + "memory(GiB)": 302.58, + "step": 324480, + "train_speed(iter/s)": 0.123379 + }, + { + "acc": 0.74685178, + "epoch": 1.8147576990885668, + "grad_norm": 5.71875, + "learning_rate": 2.3269219437780488e-07, + "loss": 0.98157082, + "memory(GiB)": 302.58, + "step": 324500, + "train_speed(iter/s)": 0.123383 + }, + { + "acc": 0.75395088, + "epoch": 1.8148695485615458, + "grad_norm": 7.1875, + "learning_rate": 2.3241346386370601e-07, + "loss": 0.97688904, + "memory(GiB)": 302.58, + "step": 324520, + "train_speed(iter/s)": 0.123387 + }, + { + "acc": 0.74041901, + "epoch": 1.8149813980345253, + "grad_norm": 6.71875, + "learning_rate": 2.3213489641618458e-07, + "loss": 1.01626825, + "memory(GiB)": 302.58, + "step": 324540, + "train_speed(iter/s)": 0.12339 + }, + { + "acc": 0.73193035, + "epoch": 1.8150932475075043, + "grad_norm": 11.875, + "learning_rate": 2.3185649204476746e-07, + "loss": 1.07570276, + "memory(GiB)": 302.58, + "step": 324560, + "train_speed(iter/s)": 0.123394 + }, + { + "acc": 0.75319448, + "epoch": 1.8152050969804838, + "grad_norm": 5.84375, + "learning_rate": 2.3157825075897865e-07, + "loss": 0.94802513, + "memory(GiB)": 302.58, + "step": 324580, + "train_speed(iter/s)": 0.123398 + }, + { + "acc": 0.76422215, + "epoch": 1.8153169464534629, + "grad_norm": 7.21875, + "learning_rate": 2.3130017256833338e-07, + "loss": 0.91589766, + "memory(GiB)": 302.58, + "step": 324600, + "train_speed(iter/s)": 0.123401 + }, + { + "acc": 0.74149117, + "epoch": 1.8154287959264424, + "grad_norm": 6.78125, + "learning_rate": 2.310222574823434e-07, + "loss": 1.01167622, + "memory(GiB)": 302.58, + "step": 324620, + "train_speed(iter/s)": 0.123405 + }, + { + "acc": 0.76152506, + "epoch": 1.8155406453994214, + "grad_norm": 7.8125, + "learning_rate": 2.3074450551051398e-07, + "loss": 0.92560101, + "memory(GiB)": 302.58, + "step": 324640, + "train_speed(iter/s)": 0.123408 + }, + { + "acc": 0.74823833, + "epoch": 1.8156524948724009, + "grad_norm": 6.09375, + "learning_rate": 2.3046691666234579e-07, + "loss": 1.0083704, + "memory(GiB)": 302.58, + "step": 324660, + "train_speed(iter/s)": 0.123412 + }, + { + "acc": 0.73889728, + "epoch": 1.81576434434538, + "grad_norm": 9.3125, + "learning_rate": 2.301894909473329e-07, + "loss": 1.03598661, + "memory(GiB)": 302.58, + "step": 324680, + "train_speed(iter/s)": 0.123415 + }, + { + "acc": 0.75388174, + "epoch": 1.8158761938183594, + "grad_norm": 7.71875, + "learning_rate": 2.299122283749633e-07, + "loss": 0.9661869, + "memory(GiB)": 302.58, + "step": 324700, + "train_speed(iter/s)": 0.123419 + }, + { + "acc": 0.74530292, + "epoch": 1.8159880432913384, + "grad_norm": 5.53125, + "learning_rate": 2.296351289547216e-07, + "loss": 1.00069456, + "memory(GiB)": 302.58, + "step": 324720, + "train_speed(iter/s)": 0.123423 + }, + { + "acc": 0.75400658, + "epoch": 1.816099892764318, + "grad_norm": 6.96875, + "learning_rate": 2.2935819269608473e-07, + "loss": 0.9615406, + "memory(GiB)": 302.58, + "step": 324740, + "train_speed(iter/s)": 0.123426 + }, + { + "acc": 0.75199003, + "epoch": 1.816211742237297, + "grad_norm": 6.59375, + "learning_rate": 2.2908141960852504e-07, + "loss": 0.97894335, + "memory(GiB)": 302.58, + "step": 324760, + "train_speed(iter/s)": 0.123429 + }, + { + "acc": 0.75188231, + "epoch": 1.8163235917102765, + "grad_norm": 9.25, + "learning_rate": 2.288048097015083e-07, + "loss": 0.95526934, + "memory(GiB)": 302.58, + "step": 324780, + "train_speed(iter/s)": 0.123433 + }, + { + "acc": 0.7483058, + "epoch": 1.8164354411832555, + "grad_norm": 8.0, + "learning_rate": 2.2852836298449698e-07, + "loss": 0.98102121, + "memory(GiB)": 302.58, + "step": 324800, + "train_speed(iter/s)": 0.123437 + }, + { + "acc": 0.76812229, + "epoch": 1.816547290656235, + "grad_norm": 5.3125, + "learning_rate": 2.2825207946694573e-07, + "loss": 0.91393967, + "memory(GiB)": 302.58, + "step": 324820, + "train_speed(iter/s)": 0.12344 + }, + { + "acc": 0.74815555, + "epoch": 1.816659140129214, + "grad_norm": 5.46875, + "learning_rate": 2.2797595915830361e-07, + "loss": 0.99404955, + "memory(GiB)": 302.58, + "step": 324840, + "train_speed(iter/s)": 0.123444 + }, + { + "acc": 0.73369055, + "epoch": 1.8167709896021935, + "grad_norm": 7.59375, + "learning_rate": 2.2770000206801591e-07, + "loss": 1.03329029, + "memory(GiB)": 302.58, + "step": 324860, + "train_speed(iter/s)": 0.123448 + }, + { + "acc": 0.75497651, + "epoch": 1.8168828390751726, + "grad_norm": 6.8125, + "learning_rate": 2.2742420820552058e-07, + "loss": 0.96753712, + "memory(GiB)": 302.58, + "step": 324880, + "train_speed(iter/s)": 0.123451 + }, + { + "acc": 0.74299545, + "epoch": 1.816994688548152, + "grad_norm": 4.375, + "learning_rate": 2.2714857758025067e-07, + "loss": 1.00820141, + "memory(GiB)": 302.58, + "step": 324900, + "train_speed(iter/s)": 0.123454 + }, + { + "acc": 0.75287037, + "epoch": 1.817106538021131, + "grad_norm": 8.0625, + "learning_rate": 2.268731102016336e-07, + "loss": 0.96862917, + "memory(GiB)": 302.58, + "step": 324920, + "train_speed(iter/s)": 0.123458 + }, + { + "acc": 0.74401922, + "epoch": 1.8172183874941106, + "grad_norm": 9.5, + "learning_rate": 2.2659780607909132e-07, + "loss": 1.01765938, + "memory(GiB)": 302.58, + "step": 324940, + "train_speed(iter/s)": 0.123462 + }, + { + "acc": 0.76962695, + "epoch": 1.8173302369670896, + "grad_norm": 7.28125, + "learning_rate": 2.2632266522204017e-07, + "loss": 0.89095917, + "memory(GiB)": 302.58, + "step": 324960, + "train_speed(iter/s)": 0.123465 + }, + { + "acc": 0.76143732, + "epoch": 1.817442086440069, + "grad_norm": 6.78125, + "learning_rate": 2.2604768763989092e-07, + "loss": 0.92733669, + "memory(GiB)": 302.58, + "step": 324980, + "train_speed(iter/s)": 0.123469 + }, + { + "acc": 0.73786726, + "epoch": 1.8175539359130481, + "grad_norm": 7.8125, + "learning_rate": 2.2577287334204835e-07, + "loss": 1.03865509, + "memory(GiB)": 302.58, + "step": 325000, + "train_speed(iter/s)": 0.123472 + }, + { + "acc": 0.75024433, + "epoch": 1.8176657853860276, + "grad_norm": 5.5, + "learning_rate": 2.2549822233791152e-07, + "loss": 0.97433023, + "memory(GiB)": 302.58, + "step": 325020, + "train_speed(iter/s)": 0.123476 + }, + { + "acc": 0.7334363, + "epoch": 1.8177776348590067, + "grad_norm": 8.4375, + "learning_rate": 2.2522373463687576e-07, + "loss": 1.04903231, + "memory(GiB)": 302.58, + "step": 325040, + "train_speed(iter/s)": 0.12348 + }, + { + "acc": 0.76233974, + "epoch": 1.8178894843319862, + "grad_norm": 6.375, + "learning_rate": 2.249494102483285e-07, + "loss": 0.93386669, + "memory(GiB)": 302.58, + "step": 325060, + "train_speed(iter/s)": 0.123483 + }, + { + "acc": 0.74898009, + "epoch": 1.8180013338049652, + "grad_norm": 8.5625, + "learning_rate": 2.246752491816534e-07, + "loss": 0.9958312, + "memory(GiB)": 302.58, + "step": 325080, + "train_speed(iter/s)": 0.123487 + }, + { + "acc": 0.74618092, + "epoch": 1.8181131832779447, + "grad_norm": 6.78125, + "learning_rate": 2.2440125144622626e-07, + "loss": 0.99564552, + "memory(GiB)": 302.58, + "step": 325100, + "train_speed(iter/s)": 0.12349 + }, + { + "acc": 0.74112749, + "epoch": 1.8182250327509237, + "grad_norm": 8.75, + "learning_rate": 2.241274170514196e-07, + "loss": 1.01784868, + "memory(GiB)": 302.58, + "step": 325120, + "train_speed(iter/s)": 0.123494 + }, + { + "acc": 0.76566749, + "epoch": 1.8183368822239032, + "grad_norm": 8.5625, + "learning_rate": 2.238537460065987e-07, + "loss": 0.90719309, + "memory(GiB)": 302.58, + "step": 325140, + "train_speed(iter/s)": 0.123497 + }, + { + "acc": 0.75215964, + "epoch": 1.8184487316968823, + "grad_norm": 6.28125, + "learning_rate": 2.2358023832112497e-07, + "loss": 0.96565943, + "memory(GiB)": 302.58, + "step": 325160, + "train_speed(iter/s)": 0.123501 + }, + { + "acc": 0.74558358, + "epoch": 1.8185605811698617, + "grad_norm": 5.09375, + "learning_rate": 2.2330689400435257e-07, + "loss": 0.99578009, + "memory(GiB)": 302.58, + "step": 325180, + "train_speed(iter/s)": 0.123505 + }, + { + "acc": 0.74085846, + "epoch": 1.8186724306428408, + "grad_norm": 6.75, + "learning_rate": 2.230337130656307e-07, + "loss": 0.99851837, + "memory(GiB)": 302.58, + "step": 325200, + "train_speed(iter/s)": 0.123508 + }, + { + "acc": 0.75366907, + "epoch": 1.8187842801158203, + "grad_norm": 11.625, + "learning_rate": 2.2276069551430302e-07, + "loss": 0.96718569, + "memory(GiB)": 302.58, + "step": 325220, + "train_speed(iter/s)": 0.123512 + }, + { + "acc": 0.740802, + "epoch": 1.8188961295887993, + "grad_norm": 7.5, + "learning_rate": 2.2248784135970814e-07, + "loss": 1.01583195, + "memory(GiB)": 302.58, + "step": 325240, + "train_speed(iter/s)": 0.123516 + }, + { + "acc": 0.73091626, + "epoch": 1.8190079790617788, + "grad_norm": 7.5, + "learning_rate": 2.2221515061117805e-07, + "loss": 1.06191883, + "memory(GiB)": 302.58, + "step": 325260, + "train_speed(iter/s)": 0.123519 + }, + { + "acc": 0.76555376, + "epoch": 1.8191198285347578, + "grad_norm": 6.46875, + "learning_rate": 2.2194262327804029e-07, + "loss": 0.93308506, + "memory(GiB)": 302.58, + "step": 325280, + "train_speed(iter/s)": 0.123523 + }, + { + "acc": 0.76331239, + "epoch": 1.8192316780077373, + "grad_norm": 6.28125, + "learning_rate": 2.2167025936961406e-07, + "loss": 0.9214448, + "memory(GiB)": 302.58, + "step": 325300, + "train_speed(iter/s)": 0.123526 + }, + { + "acc": 0.74699039, + "epoch": 1.8193435274807164, + "grad_norm": 6.90625, + "learning_rate": 2.2139805889521804e-07, + "loss": 0.98669291, + "memory(GiB)": 302.58, + "step": 325320, + "train_speed(iter/s)": 0.12353 + }, + { + "acc": 0.74775853, + "epoch": 1.8194553769536959, + "grad_norm": 9.125, + "learning_rate": 2.211260218641603e-07, + "loss": 0.99211159, + "memory(GiB)": 302.58, + "step": 325340, + "train_speed(iter/s)": 0.123533 + }, + { + "acc": 0.75658922, + "epoch": 1.819567226426675, + "grad_norm": 6.53125, + "learning_rate": 2.2085414828574625e-07, + "loss": 0.94573164, + "memory(GiB)": 302.58, + "step": 325360, + "train_speed(iter/s)": 0.123537 + }, + { + "acc": 0.74386425, + "epoch": 1.8196790758996544, + "grad_norm": 6.8125, + "learning_rate": 2.2058243816927504e-07, + "loss": 0.9982316, + "memory(GiB)": 302.58, + "step": 325380, + "train_speed(iter/s)": 0.12354 + }, + { + "acc": 0.74909806, + "epoch": 1.8197909253726334, + "grad_norm": 5.0625, + "learning_rate": 2.2031089152403874e-07, + "loss": 0.98387308, + "memory(GiB)": 302.58, + "step": 325400, + "train_speed(iter/s)": 0.123544 + }, + { + "acc": 0.74563184, + "epoch": 1.819902774845613, + "grad_norm": 7.78125, + "learning_rate": 2.2003950835932654e-07, + "loss": 0.99374323, + "memory(GiB)": 302.58, + "step": 325420, + "train_speed(iter/s)": 0.123548 + }, + { + "acc": 0.75273771, + "epoch": 1.820014624318592, + "grad_norm": 5.71875, + "learning_rate": 2.197682886844199e-07, + "loss": 0.969559, + "memory(GiB)": 302.58, + "step": 325440, + "train_speed(iter/s)": 0.123551 + }, + { + "acc": 0.7415297, + "epoch": 1.8201264737915714, + "grad_norm": 5.21875, + "learning_rate": 2.1949723250859534e-07, + "loss": 1.05016756, + "memory(GiB)": 302.58, + "step": 325460, + "train_speed(iter/s)": 0.123555 + }, + { + "acc": 0.75233903, + "epoch": 1.8202383232645505, + "grad_norm": 6.46875, + "learning_rate": 2.192263398411243e-07, + "loss": 0.96885433, + "memory(GiB)": 302.58, + "step": 325480, + "train_speed(iter/s)": 0.123558 + }, + { + "acc": 0.76873188, + "epoch": 1.82035017273753, + "grad_norm": 8.125, + "learning_rate": 2.189556106912716e-07, + "loss": 0.90680809, + "memory(GiB)": 302.58, + "step": 325500, + "train_speed(iter/s)": 0.123562 + }, + { + "acc": 0.73712139, + "epoch": 1.820462022210509, + "grad_norm": 7.40625, + "learning_rate": 2.1868504506829701e-07, + "loss": 1.04011545, + "memory(GiB)": 302.58, + "step": 325520, + "train_speed(iter/s)": 0.123565 + }, + { + "acc": 0.73448539, + "epoch": 1.8205738716834885, + "grad_norm": 6.90625, + "learning_rate": 2.1841464298145542e-07, + "loss": 1.05744934, + "memory(GiB)": 302.58, + "step": 325540, + "train_speed(iter/s)": 0.123568 + }, + { + "acc": 0.77104802, + "epoch": 1.8206857211564675, + "grad_norm": 8.5, + "learning_rate": 2.181444044399944e-07, + "loss": 0.90646496, + "memory(GiB)": 302.58, + "step": 325560, + "train_speed(iter/s)": 0.123572 + }, + { + "acc": 0.76093445, + "epoch": 1.820797570629447, + "grad_norm": 6.0, + "learning_rate": 2.1787432945315822e-07, + "loss": 0.9472537, + "memory(GiB)": 302.58, + "step": 325580, + "train_speed(iter/s)": 0.123576 + }, + { + "acc": 0.75286798, + "epoch": 1.820909420102426, + "grad_norm": 7.9375, + "learning_rate": 2.1760441803018339e-07, + "loss": 0.97057648, + "memory(GiB)": 302.58, + "step": 325600, + "train_speed(iter/s)": 0.123579 + }, + { + "acc": 0.73081007, + "epoch": 1.8210212695754056, + "grad_norm": 8.625, + "learning_rate": 2.1733467018030253e-07, + "loss": 1.06714859, + "memory(GiB)": 302.58, + "step": 325620, + "train_speed(iter/s)": 0.123583 + }, + { + "acc": 0.75157614, + "epoch": 1.8211331190483846, + "grad_norm": 5.15625, + "learning_rate": 2.17065085912741e-07, + "loss": 0.98986063, + "memory(GiB)": 302.58, + "step": 325640, + "train_speed(iter/s)": 0.123586 + }, + { + "acc": 0.74792337, + "epoch": 1.821244968521364, + "grad_norm": 7.03125, + "learning_rate": 2.167956652367198e-07, + "loss": 1.00739727, + "memory(GiB)": 302.58, + "step": 325660, + "train_speed(iter/s)": 0.12359 + }, + { + "acc": 0.74386296, + "epoch": 1.8213568179943431, + "grad_norm": 6.625, + "learning_rate": 2.1652640816145431e-07, + "loss": 1.02066498, + "memory(GiB)": 302.58, + "step": 325680, + "train_speed(iter/s)": 0.123593 + }, + { + "acc": 0.74594383, + "epoch": 1.8214686674673226, + "grad_norm": 7.875, + "learning_rate": 2.162573146961533e-07, + "loss": 0.98952007, + "memory(GiB)": 302.58, + "step": 325700, + "train_speed(iter/s)": 0.123597 + }, + { + "acc": 0.74824986, + "epoch": 1.8215805169403017, + "grad_norm": 7.5, + "learning_rate": 2.1598838485002105e-07, + "loss": 0.98958406, + "memory(GiB)": 302.58, + "step": 325720, + "train_speed(iter/s)": 0.1236 + }, + { + "acc": 0.77103372, + "epoch": 1.8216923664132811, + "grad_norm": 5.65625, + "learning_rate": 2.1571961863225577e-07, + "loss": 0.90409918, + "memory(GiB)": 302.58, + "step": 325740, + "train_speed(iter/s)": 0.123604 + }, + { + "acc": 0.7435586, + "epoch": 1.8218042158862602, + "grad_norm": 6.53125, + "learning_rate": 2.154510160520501e-07, + "loss": 1.00403137, + "memory(GiB)": 302.58, + "step": 325760, + "train_speed(iter/s)": 0.123607 + }, + { + "acc": 0.74433217, + "epoch": 1.8219160653592397, + "grad_norm": 7.03125, + "learning_rate": 2.1518257711859115e-07, + "loss": 1.01215086, + "memory(GiB)": 302.58, + "step": 325780, + "train_speed(iter/s)": 0.123611 + }, + { + "acc": 0.77154603, + "epoch": 1.8220279148322187, + "grad_norm": 7.15625, + "learning_rate": 2.1491430184105988e-07, + "loss": 0.89171171, + "memory(GiB)": 302.58, + "step": 325800, + "train_speed(iter/s)": 0.123614 + }, + { + "acc": 0.74211364, + "epoch": 1.8221397643051982, + "grad_norm": 8.5625, + "learning_rate": 2.1464619022863286e-07, + "loss": 1.04362345, + "memory(GiB)": 302.58, + "step": 325820, + "train_speed(iter/s)": 0.123618 + }, + { + "acc": 0.75372896, + "epoch": 1.8222516137781772, + "grad_norm": 6.0625, + "learning_rate": 2.1437824229047943e-07, + "loss": 0.96852131, + "memory(GiB)": 302.58, + "step": 325840, + "train_speed(iter/s)": 0.123622 + }, + { + "acc": 0.75491138, + "epoch": 1.8223634632511567, + "grad_norm": 6.59375, + "learning_rate": 2.1411045803576557e-07, + "loss": 0.94966106, + "memory(GiB)": 302.58, + "step": 325860, + "train_speed(iter/s)": 0.123625 + }, + { + "acc": 0.74109116, + "epoch": 1.8224753127241358, + "grad_norm": 9.0, + "learning_rate": 2.1384283747365009e-07, + "loss": 1.02749367, + "memory(GiB)": 302.58, + "step": 325880, + "train_speed(iter/s)": 0.123629 + }, + { + "acc": 0.74928637, + "epoch": 1.8225871621971153, + "grad_norm": 8.5625, + "learning_rate": 2.1357538061328564e-07, + "loss": 0.97250862, + "memory(GiB)": 302.58, + "step": 325900, + "train_speed(iter/s)": 0.123632 + }, + { + "acc": 0.76109567, + "epoch": 1.8226990116700943, + "grad_norm": 6.40625, + "learning_rate": 2.133080874638205e-07, + "loss": 0.92177277, + "memory(GiB)": 302.58, + "step": 325920, + "train_speed(iter/s)": 0.123636 + }, + { + "acc": 0.75549545, + "epoch": 1.8228108611430738, + "grad_norm": 6.90625, + "learning_rate": 2.1304095803439673e-07, + "loss": 0.9753809, + "memory(GiB)": 302.58, + "step": 325940, + "train_speed(iter/s)": 0.12364 + }, + { + "acc": 0.74633651, + "epoch": 1.8229227106160528, + "grad_norm": 6.125, + "learning_rate": 2.1277399233415153e-07, + "loss": 0.99074936, + "memory(GiB)": 302.58, + "step": 325960, + "train_speed(iter/s)": 0.123643 + }, + { + "acc": 0.75345883, + "epoch": 1.8230345600890323, + "grad_norm": 5.46875, + "learning_rate": 2.125071903722159e-07, + "loss": 0.95863485, + "memory(GiB)": 302.58, + "step": 325980, + "train_speed(iter/s)": 0.123647 + }, + { + "acc": 0.76401019, + "epoch": 1.8231464095620113, + "grad_norm": 7.4375, + "learning_rate": 2.1224055215771477e-07, + "loss": 0.93305044, + "memory(GiB)": 302.58, + "step": 326000, + "train_speed(iter/s)": 0.123651 + }, + { + "epoch": 1.8231464095620113, + "eval_acc": 0.706920351872573, + "eval_loss": 1.0117764472961426, + "eval_runtime": 7544.9985, + "eval_samples_per_second": 9.978, + "eval_steps_per_second": 9.978, + "step": 326000 + }, + { + "acc": 0.73367286, + "epoch": 1.8232582590349908, + "grad_norm": 5.59375, + "learning_rate": 2.1197407769976862e-07, + "loss": 1.03981428, + "memory(GiB)": 302.58, + "step": 326020, + "train_speed(iter/s)": 0.123295 + }, + { + "acc": 0.76668987, + "epoch": 1.8233701085079699, + "grad_norm": 6.5, + "learning_rate": 2.117077670074902e-07, + "loss": 0.91817665, + "memory(GiB)": 302.58, + "step": 326040, + "train_speed(iter/s)": 0.123299 + }, + { + "acc": 0.77105331, + "epoch": 1.8234819579809494, + "grad_norm": 9.625, + "learning_rate": 2.1144162008999047e-07, + "loss": 0.89086828, + "memory(GiB)": 302.58, + "step": 326060, + "train_speed(iter/s)": 0.123302 + }, + { + "acc": 0.75328126, + "epoch": 1.8235938074539284, + "grad_norm": 8.1875, + "learning_rate": 2.1117563695637165e-07, + "loss": 0.98496237, + "memory(GiB)": 302.58, + "step": 326080, + "train_speed(iter/s)": 0.123306 + }, + { + "acc": 0.73902974, + "epoch": 1.823705656926908, + "grad_norm": 7.125, + "learning_rate": 2.1090981761573093e-07, + "loss": 1.01241589, + "memory(GiB)": 302.58, + "step": 326100, + "train_speed(iter/s)": 0.12331 + }, + { + "acc": 0.75421133, + "epoch": 1.823817506399887, + "grad_norm": 5.3125, + "learning_rate": 2.1064416207715988e-07, + "loss": 0.95957642, + "memory(GiB)": 302.58, + "step": 326120, + "train_speed(iter/s)": 0.123313 + }, + { + "acc": 0.76214547, + "epoch": 1.8239293558728664, + "grad_norm": 6.78125, + "learning_rate": 2.1037867034974512e-07, + "loss": 0.91891613, + "memory(GiB)": 302.58, + "step": 326140, + "train_speed(iter/s)": 0.123317 + }, + { + "acc": 0.75235944, + "epoch": 1.8240412053458455, + "grad_norm": 7.84375, + "learning_rate": 2.1011334244256775e-07, + "loss": 0.96887827, + "memory(GiB)": 302.58, + "step": 326160, + "train_speed(iter/s)": 0.123321 + }, + { + "acc": 0.73653865, + "epoch": 1.824153054818825, + "grad_norm": 11.0, + "learning_rate": 2.0984817836470107e-07, + "loss": 1.04498243, + "memory(GiB)": 302.58, + "step": 326180, + "train_speed(iter/s)": 0.123324 + }, + { + "acc": 0.75625362, + "epoch": 1.824264904291804, + "grad_norm": 7.375, + "learning_rate": 2.0958317812521721e-07, + "loss": 0.95650864, + "memory(GiB)": 302.58, + "step": 326200, + "train_speed(iter/s)": 0.123328 + }, + { + "acc": 0.74845552, + "epoch": 1.8243767537647835, + "grad_norm": 6.1875, + "learning_rate": 2.0931834173317845e-07, + "loss": 0.99734335, + "memory(GiB)": 302.58, + "step": 326220, + "train_speed(iter/s)": 0.123331 + }, + { + "acc": 0.76551461, + "epoch": 1.8244886032377625, + "grad_norm": 5.21875, + "learning_rate": 2.0905366919764302e-07, + "loss": 0.93648672, + "memory(GiB)": 302.58, + "step": 326240, + "train_speed(iter/s)": 0.123334 + }, + { + "acc": 0.75750685, + "epoch": 1.824600452710742, + "grad_norm": 4.96875, + "learning_rate": 2.0878916052766373e-07, + "loss": 0.95557842, + "memory(GiB)": 302.58, + "step": 326260, + "train_speed(iter/s)": 0.123338 + }, + { + "acc": 0.73699017, + "epoch": 1.824712302183721, + "grad_norm": 5.0, + "learning_rate": 2.085248157322878e-07, + "loss": 1.03566513, + "memory(GiB)": 302.58, + "step": 326280, + "train_speed(iter/s)": 0.123342 + }, + { + "acc": 0.75054483, + "epoch": 1.8248241516567005, + "grad_norm": 7.53125, + "learning_rate": 2.082606348205568e-07, + "loss": 0.99086046, + "memory(GiB)": 302.58, + "step": 326300, + "train_speed(iter/s)": 0.123345 + }, + { + "acc": 0.77043171, + "epoch": 1.8249360011296796, + "grad_norm": 7.40625, + "learning_rate": 2.0799661780150582e-07, + "loss": 0.88549509, + "memory(GiB)": 302.58, + "step": 326320, + "train_speed(iter/s)": 0.123349 + }, + { + "acc": 0.75114131, + "epoch": 1.825047850602659, + "grad_norm": 9.5625, + "learning_rate": 2.0773276468416592e-07, + "loss": 0.97186384, + "memory(GiB)": 302.58, + "step": 326340, + "train_speed(iter/s)": 0.123352 + }, + { + "acc": 0.74516101, + "epoch": 1.825159700075638, + "grad_norm": 6.78125, + "learning_rate": 2.0746907547756157e-07, + "loss": 1.02358294, + "memory(GiB)": 302.58, + "step": 326360, + "train_speed(iter/s)": 0.123356 + }, + { + "acc": 0.75908203, + "epoch": 1.8252715495486176, + "grad_norm": 7.84375, + "learning_rate": 2.072055501907111e-07, + "loss": 0.95207319, + "memory(GiB)": 302.58, + "step": 326380, + "train_speed(iter/s)": 0.123359 + }, + { + "acc": 0.74038215, + "epoch": 1.8253833990215966, + "grad_norm": 8.5, + "learning_rate": 2.069421888326284e-07, + "loss": 1.0188632, + "memory(GiB)": 302.58, + "step": 326400, + "train_speed(iter/s)": 0.123363 + }, + { + "acc": 0.75430264, + "epoch": 1.8254952484945761, + "grad_norm": 9.125, + "learning_rate": 2.0667899141232128e-07, + "loss": 0.95505686, + "memory(GiB)": 302.58, + "step": 326420, + "train_speed(iter/s)": 0.123366 + }, + { + "acc": 0.75727386, + "epoch": 1.8256070979675552, + "grad_norm": 5.6875, + "learning_rate": 2.0641595793879088e-07, + "loss": 0.95200052, + "memory(GiB)": 302.58, + "step": 326440, + "train_speed(iter/s)": 0.12337 + }, + { + "acc": 0.75576768, + "epoch": 1.8257189474405346, + "grad_norm": 9.4375, + "learning_rate": 2.061530884210361e-07, + "loss": 0.94407892, + "memory(GiB)": 302.58, + "step": 326460, + "train_speed(iter/s)": 0.123373 + }, + { + "acc": 0.75091171, + "epoch": 1.8258307969135137, + "grad_norm": 7.125, + "learning_rate": 2.0589038286804586e-07, + "loss": 0.99508057, + "memory(GiB)": 302.58, + "step": 326480, + "train_speed(iter/s)": 0.123377 + }, + { + "acc": 0.75631161, + "epoch": 1.8259426463864932, + "grad_norm": 5.40625, + "learning_rate": 2.0562784128880687e-07, + "loss": 0.97093868, + "memory(GiB)": 302.58, + "step": 326500, + "train_speed(iter/s)": 0.123381 + }, + { + "acc": 0.75245986, + "epoch": 1.8260544958594722, + "grad_norm": 5.625, + "learning_rate": 2.053654636922975e-07, + "loss": 0.98310432, + "memory(GiB)": 302.58, + "step": 326520, + "train_speed(iter/s)": 0.123384 + }, + { + "acc": 0.75511994, + "epoch": 1.8261663453324517, + "grad_norm": 8.5625, + "learning_rate": 2.0510325008749333e-07, + "loss": 0.97319002, + "memory(GiB)": 302.58, + "step": 326540, + "train_speed(iter/s)": 0.123388 + }, + { + "acc": 0.73755398, + "epoch": 1.8262781948054307, + "grad_norm": 6.53125, + "learning_rate": 2.0484120048336164e-07, + "loss": 1.0313014, + "memory(GiB)": 302.58, + "step": 326560, + "train_speed(iter/s)": 0.123391 + }, + { + "acc": 0.74944758, + "epoch": 1.8263900442784102, + "grad_norm": 6.8125, + "learning_rate": 2.045793148888664e-07, + "loss": 0.99646826, + "memory(GiB)": 302.58, + "step": 326580, + "train_speed(iter/s)": 0.123395 + }, + { + "acc": 0.75449305, + "epoch": 1.8265018937513893, + "grad_norm": 7.125, + "learning_rate": 2.043175933129643e-07, + "loss": 0.96525774, + "memory(GiB)": 302.58, + "step": 326600, + "train_speed(iter/s)": 0.123399 + }, + { + "acc": 0.76043072, + "epoch": 1.8266137432243688, + "grad_norm": 8.1875, + "learning_rate": 2.040560357646071e-07, + "loss": 0.92245321, + "memory(GiB)": 302.58, + "step": 326620, + "train_speed(iter/s)": 0.123402 + }, + { + "acc": 0.76673527, + "epoch": 1.8267255926973478, + "grad_norm": 5.71875, + "learning_rate": 2.0379464225274148e-07, + "loss": 0.91839418, + "memory(GiB)": 302.58, + "step": 326640, + "train_speed(iter/s)": 0.123406 + }, + { + "acc": 0.76618404, + "epoch": 1.8268374421703273, + "grad_norm": 7.46875, + "learning_rate": 2.03533412786307e-07, + "loss": 0.90942135, + "memory(GiB)": 302.58, + "step": 326660, + "train_speed(iter/s)": 0.12341 + }, + { + "acc": 0.76450343, + "epoch": 1.8269492916433063, + "grad_norm": 8.5, + "learning_rate": 2.0327234737423928e-07, + "loss": 0.92842016, + "memory(GiB)": 302.58, + "step": 326680, + "train_speed(iter/s)": 0.123413 + }, + { + "acc": 0.75038671, + "epoch": 1.8270611411162858, + "grad_norm": 12.125, + "learning_rate": 2.0301144602546673e-07, + "loss": 0.98776655, + "memory(GiB)": 302.58, + "step": 326700, + "train_speed(iter/s)": 0.123417 + }, + { + "acc": 0.75208364, + "epoch": 1.8271729905892649, + "grad_norm": 7.0625, + "learning_rate": 2.0275070874891333e-07, + "loss": 0.98359861, + "memory(GiB)": 302.58, + "step": 326720, + "train_speed(iter/s)": 0.123421 + }, + { + "acc": 0.77156458, + "epoch": 1.8272848400622443, + "grad_norm": 5.375, + "learning_rate": 2.0249013555349807e-07, + "loss": 0.87885218, + "memory(GiB)": 302.58, + "step": 326740, + "train_speed(iter/s)": 0.123424 + }, + { + "acc": 0.73960881, + "epoch": 1.8273966895352234, + "grad_norm": 8.25, + "learning_rate": 2.0222972644813264e-07, + "loss": 1.03860655, + "memory(GiB)": 302.58, + "step": 326760, + "train_speed(iter/s)": 0.123428 + }, + { + "acc": 0.75023403, + "epoch": 1.8275085390082029, + "grad_norm": 7.46875, + "learning_rate": 2.019694814417239e-07, + "loss": 0.97553673, + "memory(GiB)": 302.58, + "step": 326780, + "train_speed(iter/s)": 0.123431 + }, + { + "acc": 0.74547205, + "epoch": 1.827620388481182, + "grad_norm": 7.375, + "learning_rate": 2.0170940054317244e-07, + "loss": 1.01725855, + "memory(GiB)": 302.58, + "step": 326800, + "train_speed(iter/s)": 0.123434 + }, + { + "acc": 0.74761119, + "epoch": 1.8277322379541614, + "grad_norm": 7.15625, + "learning_rate": 2.0144948376137507e-07, + "loss": 0.97604408, + "memory(GiB)": 302.58, + "step": 326820, + "train_speed(iter/s)": 0.123438 + }, + { + "acc": 0.75680923, + "epoch": 1.8278440874271404, + "grad_norm": 6.21875, + "learning_rate": 2.0118973110522134e-07, + "loss": 0.95267591, + "memory(GiB)": 302.58, + "step": 326840, + "train_speed(iter/s)": 0.123441 + }, + { + "acc": 0.77872615, + "epoch": 1.82795593690012, + "grad_norm": 5.59375, + "learning_rate": 2.0093014258359466e-07, + "loss": 0.85987215, + "memory(GiB)": 302.58, + "step": 326860, + "train_speed(iter/s)": 0.123445 + }, + { + "acc": 0.74623079, + "epoch": 1.828067786373099, + "grad_norm": 6.875, + "learning_rate": 2.006707182053752e-07, + "loss": 0.98533602, + "memory(GiB)": 302.58, + "step": 326880, + "train_speed(iter/s)": 0.123449 + }, + { + "acc": 0.75454655, + "epoch": 1.8281796358460785, + "grad_norm": 5.5625, + "learning_rate": 2.0041145797943528e-07, + "loss": 0.94318619, + "memory(GiB)": 302.58, + "step": 326900, + "train_speed(iter/s)": 0.123452 + }, + { + "acc": 0.76163516, + "epoch": 1.8282914853190575, + "grad_norm": 9.4375, + "learning_rate": 2.0015236191464226e-07, + "loss": 0.93952913, + "memory(GiB)": 302.58, + "step": 326920, + "train_speed(iter/s)": 0.123456 + }, + { + "acc": 0.73335376, + "epoch": 1.828403334792037, + "grad_norm": 5.28125, + "learning_rate": 1.998934300198585e-07, + "loss": 1.06260653, + "memory(GiB)": 302.58, + "step": 326940, + "train_speed(iter/s)": 0.123459 + }, + { + "acc": 0.75600972, + "epoch": 1.828515184265016, + "grad_norm": 8.4375, + "learning_rate": 1.996346623039408e-07, + "loss": 0.95355453, + "memory(GiB)": 302.58, + "step": 326960, + "train_speed(iter/s)": 0.123463 + }, + { + "acc": 0.75204744, + "epoch": 1.8286270337379955, + "grad_norm": 7.25, + "learning_rate": 1.9937605877573818e-07, + "loss": 0.97099533, + "memory(GiB)": 302.58, + "step": 326980, + "train_speed(iter/s)": 0.123466 + }, + { + "acc": 0.74900146, + "epoch": 1.8287388832109746, + "grad_norm": 8.25, + "learning_rate": 1.991176194440969e-07, + "loss": 0.9856842, + "memory(GiB)": 302.58, + "step": 327000, + "train_speed(iter/s)": 0.12347 + }, + { + "acc": 0.75657454, + "epoch": 1.828850732683954, + "grad_norm": 5.78125, + "learning_rate": 1.9885934431785658e-07, + "loss": 0.94253416, + "memory(GiB)": 302.58, + "step": 327020, + "train_speed(iter/s)": 0.123473 + }, + { + "acc": 0.75167761, + "epoch": 1.828962582156933, + "grad_norm": 7.40625, + "learning_rate": 1.986012334058507e-07, + "loss": 0.9832058, + "memory(GiB)": 302.58, + "step": 327040, + "train_speed(iter/s)": 0.123477 + }, + { + "acc": 0.75602403, + "epoch": 1.8290744316299126, + "grad_norm": 5.0625, + "learning_rate": 1.9834328671690716e-07, + "loss": 0.95695019, + "memory(GiB)": 302.58, + "step": 327060, + "train_speed(iter/s)": 0.12348 + }, + { + "acc": 0.75713449, + "epoch": 1.8291862811028916, + "grad_norm": 7.5, + "learning_rate": 1.9808550425984895e-07, + "loss": 0.94380131, + "memory(GiB)": 302.58, + "step": 327080, + "train_speed(iter/s)": 0.123484 + }, + { + "acc": 0.74431586, + "epoch": 1.829298130575871, + "grad_norm": 7.6875, + "learning_rate": 1.9782788604349345e-07, + "loss": 1.02916174, + "memory(GiB)": 302.58, + "step": 327100, + "train_speed(iter/s)": 0.123487 + }, + { + "acc": 0.75851445, + "epoch": 1.8294099800488501, + "grad_norm": 9.3125, + "learning_rate": 1.9757043207665084e-07, + "loss": 0.93844452, + "memory(GiB)": 302.58, + "step": 327120, + "train_speed(iter/s)": 0.123491 + }, + { + "acc": 0.75422316, + "epoch": 1.8295218295218296, + "grad_norm": 6.40625, + "learning_rate": 1.9731314236812793e-07, + "loss": 0.96049089, + "memory(GiB)": 302.58, + "step": 327140, + "train_speed(iter/s)": 0.123495 + }, + { + "acc": 0.76850619, + "epoch": 1.8296336789948087, + "grad_norm": 5.6875, + "learning_rate": 1.9705601692672437e-07, + "loss": 0.91800346, + "memory(GiB)": 302.58, + "step": 327160, + "train_speed(iter/s)": 0.123498 + }, + { + "acc": 0.73619399, + "epoch": 1.8297455284677882, + "grad_norm": 9.125, + "learning_rate": 1.967990557612348e-07, + "loss": 1.03984537, + "memory(GiB)": 302.58, + "step": 327180, + "train_speed(iter/s)": 0.123502 + }, + { + "acc": 0.73574634, + "epoch": 1.8298573779407672, + "grad_norm": 4.96875, + "learning_rate": 1.9654225888044831e-07, + "loss": 1.03795872, + "memory(GiB)": 302.58, + "step": 327200, + "train_speed(iter/s)": 0.123505 + }, + { + "acc": 0.74890695, + "epoch": 1.8299692274137467, + "grad_norm": 4.71875, + "learning_rate": 1.9628562629314728e-07, + "loss": 0.995961, + "memory(GiB)": 302.58, + "step": 327220, + "train_speed(iter/s)": 0.123508 + }, + { + "acc": 0.73773432, + "epoch": 1.8300810768867257, + "grad_norm": 8.3125, + "learning_rate": 1.960291580081103e-07, + "loss": 1.03877907, + "memory(GiB)": 302.58, + "step": 327240, + "train_speed(iter/s)": 0.123512 + }, + { + "acc": 0.74422016, + "epoch": 1.8301929263597052, + "grad_norm": 8.9375, + "learning_rate": 1.9577285403410863e-07, + "loss": 1.01200523, + "memory(GiB)": 302.58, + "step": 327260, + "train_speed(iter/s)": 0.123515 + }, + { + "acc": 0.74858727, + "epoch": 1.8303047758326842, + "grad_norm": 6.84375, + "learning_rate": 1.9551671437990972e-07, + "loss": 0.99027338, + "memory(GiB)": 302.58, + "step": 327280, + "train_speed(iter/s)": 0.123519 + }, + { + "acc": 0.73758583, + "epoch": 1.8304166253056637, + "grad_norm": 8.125, + "learning_rate": 1.952607390542738e-07, + "loss": 1.0264576, + "memory(GiB)": 302.58, + "step": 327300, + "train_speed(iter/s)": 0.123523 + }, + { + "acc": 0.73732886, + "epoch": 1.8305284747786428, + "grad_norm": 8.0, + "learning_rate": 1.950049280659555e-07, + "loss": 1.03200464, + "memory(GiB)": 302.58, + "step": 327320, + "train_speed(iter/s)": 0.123526 + }, + { + "acc": 0.73461857, + "epoch": 1.8306403242516223, + "grad_norm": 6.53125, + "learning_rate": 1.9474928142370508e-07, + "loss": 1.05813303, + "memory(GiB)": 302.58, + "step": 327340, + "train_speed(iter/s)": 0.12353 + }, + { + "acc": 0.74055033, + "epoch": 1.8307521737246013, + "grad_norm": 9.9375, + "learning_rate": 1.9449379913626664e-07, + "loss": 1.0364954, + "memory(GiB)": 302.58, + "step": 327360, + "train_speed(iter/s)": 0.123534 + }, + { + "acc": 0.75363402, + "epoch": 1.8308640231975808, + "grad_norm": 7.90625, + "learning_rate": 1.942384812123782e-07, + "loss": 0.97755461, + "memory(GiB)": 302.58, + "step": 327380, + "train_speed(iter/s)": 0.123537 + }, + { + "acc": 0.75368481, + "epoch": 1.8309758726705598, + "grad_norm": 8.8125, + "learning_rate": 1.9398332766077222e-07, + "loss": 0.94620676, + "memory(GiB)": 302.58, + "step": 327400, + "train_speed(iter/s)": 0.123541 + }, + { + "acc": 0.75730453, + "epoch": 1.8310877221435393, + "grad_norm": 6.0, + "learning_rate": 1.937283384901756e-07, + "loss": 0.95634098, + "memory(GiB)": 302.58, + "step": 327420, + "train_speed(iter/s)": 0.123545 + }, + { + "acc": 0.74996262, + "epoch": 1.8311995716165184, + "grad_norm": 7.0, + "learning_rate": 1.9347351370931022e-07, + "loss": 0.98534584, + "memory(GiB)": 302.58, + "step": 327440, + "train_speed(iter/s)": 0.123548 + }, + { + "acc": 0.7522893, + "epoch": 1.8313114210894978, + "grad_norm": 10.375, + "learning_rate": 1.9321885332689193e-07, + "loss": 0.97437782, + "memory(GiB)": 302.58, + "step": 327460, + "train_speed(iter/s)": 0.123552 + }, + { + "acc": 0.7455246, + "epoch": 1.831423270562477, + "grad_norm": 6.15625, + "learning_rate": 1.92964357351631e-07, + "loss": 1.00297089, + "memory(GiB)": 302.58, + "step": 327480, + "train_speed(iter/s)": 0.123556 + }, + { + "acc": 0.74526482, + "epoch": 1.8315351200354564, + "grad_norm": 7.71875, + "learning_rate": 1.9271002579223097e-07, + "loss": 0.99902773, + "memory(GiB)": 302.58, + "step": 327500, + "train_speed(iter/s)": 0.123559 + }, + { + "acc": 0.75130496, + "epoch": 1.8316469695084354, + "grad_norm": 8.25, + "learning_rate": 1.9245585865739214e-07, + "loss": 0.96418972, + "memory(GiB)": 302.58, + "step": 327520, + "train_speed(iter/s)": 0.123563 + }, + { + "acc": 0.75095787, + "epoch": 1.831758818981415, + "grad_norm": 6.875, + "learning_rate": 1.9220185595580698e-07, + "loss": 0.98389292, + "memory(GiB)": 302.58, + "step": 327540, + "train_speed(iter/s)": 0.123566 + }, + { + "acc": 0.75543098, + "epoch": 1.831870668454394, + "grad_norm": 4.75, + "learning_rate": 1.9194801769616413e-07, + "loss": 0.94833078, + "memory(GiB)": 302.58, + "step": 327560, + "train_speed(iter/s)": 0.12357 + }, + { + "acc": 0.73560929, + "epoch": 1.8319825179273734, + "grad_norm": 5.84375, + "learning_rate": 1.9169434388714437e-07, + "loss": 1.04761534, + "memory(GiB)": 302.58, + "step": 327580, + "train_speed(iter/s)": 0.123574 + }, + { + "acc": 0.7448534, + "epoch": 1.8320943674003525, + "grad_norm": 6.75, + "learning_rate": 1.9144083453742412e-07, + "loss": 1.01050329, + "memory(GiB)": 302.58, + "step": 327600, + "train_speed(iter/s)": 0.123577 + }, + { + "acc": 0.74218054, + "epoch": 1.832206216873332, + "grad_norm": 6.59375, + "learning_rate": 1.9118748965567534e-07, + "loss": 1.02305555, + "memory(GiB)": 302.58, + "step": 327620, + "train_speed(iter/s)": 0.123581 + }, + { + "acc": 0.75012708, + "epoch": 1.832318066346311, + "grad_norm": 8.875, + "learning_rate": 1.9093430925056332e-07, + "loss": 0.99120646, + "memory(GiB)": 302.58, + "step": 327640, + "train_speed(iter/s)": 0.123584 + }, + { + "acc": 0.75771122, + "epoch": 1.8324299158192905, + "grad_norm": 6.75, + "learning_rate": 1.906812933307467e-07, + "loss": 0.93060055, + "memory(GiB)": 302.58, + "step": 327660, + "train_speed(iter/s)": 0.123588 + }, + { + "acc": 0.75385704, + "epoch": 1.8325417652922695, + "grad_norm": 7.9375, + "learning_rate": 1.9042844190487963e-07, + "loss": 0.99085426, + "memory(GiB)": 302.58, + "step": 327680, + "train_speed(iter/s)": 0.123591 + }, + { + "acc": 0.74826145, + "epoch": 1.832653614765249, + "grad_norm": 9.9375, + "learning_rate": 1.901757549816108e-07, + "loss": 0.99541111, + "memory(GiB)": 302.58, + "step": 327700, + "train_speed(iter/s)": 0.123595 + }, + { + "acc": 0.7509419, + "epoch": 1.832765464238228, + "grad_norm": 8.6875, + "learning_rate": 1.8992323256958267e-07, + "loss": 0.97801857, + "memory(GiB)": 302.58, + "step": 327720, + "train_speed(iter/s)": 0.123598 + }, + { + "acc": 0.75831356, + "epoch": 1.8328773137112075, + "grad_norm": 6.5625, + "learning_rate": 1.8967087467743283e-07, + "loss": 0.95405521, + "memory(GiB)": 302.58, + "step": 327740, + "train_speed(iter/s)": 0.123602 + }, + { + "acc": 0.74631014, + "epoch": 1.8329891631841866, + "grad_norm": 5.09375, + "learning_rate": 1.894186813137916e-07, + "loss": 0.98502064, + "memory(GiB)": 302.58, + "step": 327760, + "train_speed(iter/s)": 0.123605 + }, + { + "acc": 0.75240512, + "epoch": 1.833101012657166, + "grad_norm": 6.5, + "learning_rate": 1.8916665248728537e-07, + "loss": 0.96540022, + "memory(GiB)": 302.58, + "step": 327780, + "train_speed(iter/s)": 0.123609 + }, + { + "acc": 0.73160448, + "epoch": 1.8332128621301451, + "grad_norm": 8.125, + "learning_rate": 1.8891478820653452e-07, + "loss": 1.04853859, + "memory(GiB)": 302.58, + "step": 327800, + "train_speed(iter/s)": 0.123612 + }, + { + "acc": 0.73787022, + "epoch": 1.8333247116031246, + "grad_norm": 7.28125, + "learning_rate": 1.8866308848015325e-07, + "loss": 1.03663282, + "memory(GiB)": 302.58, + "step": 327820, + "train_speed(iter/s)": 0.123616 + }, + { + "acc": 0.75650253, + "epoch": 1.8334365610761036, + "grad_norm": 6.78125, + "learning_rate": 1.884115533167513e-07, + "loss": 0.94623575, + "memory(GiB)": 302.58, + "step": 327840, + "train_speed(iter/s)": 0.12362 + }, + { + "acc": 0.75937014, + "epoch": 1.8335484105490831, + "grad_norm": 4.4375, + "learning_rate": 1.8816018272493075e-07, + "loss": 0.93110657, + "memory(GiB)": 302.58, + "step": 327860, + "train_speed(iter/s)": 0.123623 + }, + { + "acc": 0.75194435, + "epoch": 1.8336602600220622, + "grad_norm": 8.3125, + "learning_rate": 1.8790897671328966e-07, + "loss": 0.9927907, + "memory(GiB)": 302.58, + "step": 327880, + "train_speed(iter/s)": 0.123626 + }, + { + "acc": 0.76610026, + "epoch": 1.8337721094950417, + "grad_norm": 5.875, + "learning_rate": 1.8765793529042065e-07, + "loss": 0.93353615, + "memory(GiB)": 302.58, + "step": 327900, + "train_speed(iter/s)": 0.123629 + }, + { + "acc": 0.75111942, + "epoch": 1.8338839589680207, + "grad_norm": 7.8125, + "learning_rate": 1.8740705846490957e-07, + "loss": 0.98663692, + "memory(GiB)": 302.58, + "step": 327920, + "train_speed(iter/s)": 0.123633 + }, + { + "acc": 0.75763431, + "epoch": 1.8339958084410002, + "grad_norm": 8.6875, + "learning_rate": 1.8715634624533795e-07, + "loss": 0.95335503, + "memory(GiB)": 302.58, + "step": 327940, + "train_speed(iter/s)": 0.123637 + }, + { + "acc": 0.75364146, + "epoch": 1.8341076579139792, + "grad_norm": 8.5625, + "learning_rate": 1.8690579864028001e-07, + "loss": 0.97114429, + "memory(GiB)": 302.58, + "step": 327960, + "train_speed(iter/s)": 0.12364 + }, + { + "acc": 0.75725574, + "epoch": 1.8342195073869587, + "grad_norm": 9.0, + "learning_rate": 1.8665541565830614e-07, + "loss": 0.95396299, + "memory(GiB)": 302.58, + "step": 327980, + "train_speed(iter/s)": 0.123644 + }, + { + "acc": 0.7642602, + "epoch": 1.8343313568599378, + "grad_norm": 6.0625, + "learning_rate": 1.8640519730797947e-07, + "loss": 0.91526089, + "memory(GiB)": 302.58, + "step": 328000, + "train_speed(iter/s)": 0.123648 + }, + { + "epoch": 1.8343313568599378, + "eval_acc": 0.7068884075550678, + "eval_loss": 1.0117746591567993, + "eval_runtime": 7608.2842, + "eval_samples_per_second": 9.895, + "eval_steps_per_second": 9.895, + "step": 328000 + }, + { + "acc": 0.76070781, + "epoch": 1.8344432063329172, + "grad_norm": 6.25, + "learning_rate": 1.8615514359785815e-07, + "loss": 0.94639435, + "memory(GiB)": 302.58, + "step": 328020, + "train_speed(iter/s)": 0.123291 + }, + { + "acc": 0.74056177, + "epoch": 1.8345550558058963, + "grad_norm": 7.0, + "learning_rate": 1.859052545364959e-07, + "loss": 1.02725964, + "memory(GiB)": 302.58, + "step": 328040, + "train_speed(iter/s)": 0.123295 + }, + { + "acc": 0.73190589, + "epoch": 1.8346669052788758, + "grad_norm": 8.75, + "learning_rate": 1.8565553013243865e-07, + "loss": 1.05866413, + "memory(GiB)": 302.58, + "step": 328060, + "train_speed(iter/s)": 0.123298 + }, + { + "acc": 0.75113554, + "epoch": 1.8347787547518548, + "grad_norm": 6.15625, + "learning_rate": 1.854059703942279e-07, + "loss": 0.98829031, + "memory(GiB)": 302.58, + "step": 328080, + "train_speed(iter/s)": 0.123302 + }, + { + "acc": 0.74432578, + "epoch": 1.8348906042248343, + "grad_norm": 6.5, + "learning_rate": 1.8515657533039967e-07, + "loss": 1.0119339, + "memory(GiB)": 302.58, + "step": 328100, + "train_speed(iter/s)": 0.123306 + }, + { + "acc": 0.74226637, + "epoch": 1.8350024536978133, + "grad_norm": 6.125, + "learning_rate": 1.8490734494948426e-07, + "loss": 1.03325672, + "memory(GiB)": 302.58, + "step": 328120, + "train_speed(iter/s)": 0.123309 + }, + { + "acc": 0.7597919, + "epoch": 1.8351143031707928, + "grad_norm": 6.3125, + "learning_rate": 1.8465827926000546e-07, + "loss": 0.9419445, + "memory(GiB)": 302.58, + "step": 328140, + "train_speed(iter/s)": 0.123313 + }, + { + "acc": 0.74949532, + "epoch": 1.8352261526437719, + "grad_norm": 6.3125, + "learning_rate": 1.844093782704831e-07, + "loss": 0.99535561, + "memory(GiB)": 302.58, + "step": 328160, + "train_speed(iter/s)": 0.123316 + }, + { + "acc": 0.73349977, + "epoch": 1.8353380021167514, + "grad_norm": 5.8125, + "learning_rate": 1.8416064198942985e-07, + "loss": 1.07757463, + "memory(GiB)": 302.58, + "step": 328180, + "train_speed(iter/s)": 0.12332 + }, + { + "acc": 0.72877789, + "epoch": 1.8354498515897304, + "grad_norm": 7.625, + "learning_rate": 1.8391207042535276e-07, + "loss": 1.08871212, + "memory(GiB)": 302.58, + "step": 328200, + "train_speed(iter/s)": 0.123323 + }, + { + "acc": 0.76438379, + "epoch": 1.8355617010627099, + "grad_norm": 6.46875, + "learning_rate": 1.8366366358675446e-07, + "loss": 0.92734327, + "memory(GiB)": 302.58, + "step": 328220, + "train_speed(iter/s)": 0.123327 + }, + { + "acc": 0.74380198, + "epoch": 1.835673550535689, + "grad_norm": 6.0625, + "learning_rate": 1.8341542148213155e-07, + "loss": 1.00268726, + "memory(GiB)": 302.58, + "step": 328240, + "train_speed(iter/s)": 0.123331 + }, + { + "acc": 0.734797, + "epoch": 1.8357854000086684, + "grad_norm": 5.21875, + "learning_rate": 1.8316734411997384e-07, + "loss": 1.05737228, + "memory(GiB)": 302.58, + "step": 328260, + "train_speed(iter/s)": 0.123334 + }, + { + "acc": 0.73456588, + "epoch": 1.8358972494816475, + "grad_norm": 6.3125, + "learning_rate": 1.829194315087668e-07, + "loss": 1.05273685, + "memory(GiB)": 302.58, + "step": 328280, + "train_speed(iter/s)": 0.123338 + }, + { + "acc": 0.74531384, + "epoch": 1.836009098954627, + "grad_norm": 8.5625, + "learning_rate": 1.8267168365698972e-07, + "loss": 1.02466526, + "memory(GiB)": 302.58, + "step": 328300, + "train_speed(iter/s)": 0.123341 + }, + { + "acc": 0.7560329, + "epoch": 1.836120948427606, + "grad_norm": 8.5, + "learning_rate": 1.8242410057311587e-07, + "loss": 0.94924459, + "memory(GiB)": 302.58, + "step": 328320, + "train_speed(iter/s)": 0.123345 + }, + { + "acc": 0.75229673, + "epoch": 1.8362327979005855, + "grad_norm": 7.46875, + "learning_rate": 1.821766822656146e-07, + "loss": 0.95948467, + "memory(GiB)": 302.58, + "step": 328340, + "train_speed(iter/s)": 0.123348 + }, + { + "acc": 0.74391465, + "epoch": 1.8363446473735645, + "grad_norm": 4.96875, + "learning_rate": 1.819294287429474e-07, + "loss": 1.00402422, + "memory(GiB)": 302.58, + "step": 328360, + "train_speed(iter/s)": 0.123351 + }, + { + "acc": 0.75297279, + "epoch": 1.836456496846544, + "grad_norm": 6.09375, + "learning_rate": 1.8168234001357144e-07, + "loss": 0.96099243, + "memory(GiB)": 302.58, + "step": 328380, + "train_speed(iter/s)": 0.123355 + }, + { + "acc": 0.75469422, + "epoch": 1.836568346319523, + "grad_norm": 8.4375, + "learning_rate": 1.8143541608593828e-07, + "loss": 0.96554937, + "memory(GiB)": 302.58, + "step": 328400, + "train_speed(iter/s)": 0.123358 + }, + { + "acc": 0.74449492, + "epoch": 1.8366801957925025, + "grad_norm": 7.25, + "learning_rate": 1.811886569684923e-07, + "loss": 1.01001225, + "memory(GiB)": 302.58, + "step": 328420, + "train_speed(iter/s)": 0.123362 + }, + { + "acc": 0.72457318, + "epoch": 1.8367920452654816, + "grad_norm": 9.0625, + "learning_rate": 1.809420626696745e-07, + "loss": 1.08160486, + "memory(GiB)": 302.58, + "step": 328440, + "train_speed(iter/s)": 0.123365 + }, + { + "acc": 0.75496006, + "epoch": 1.836903894738461, + "grad_norm": 6.65625, + "learning_rate": 1.8069563319791926e-07, + "loss": 0.94964676, + "memory(GiB)": 302.58, + "step": 328460, + "train_speed(iter/s)": 0.123369 + }, + { + "acc": 0.76428194, + "epoch": 1.83701574421144, + "grad_norm": 6.0625, + "learning_rate": 1.804493685616554e-07, + "loss": 0.9246067, + "memory(GiB)": 302.58, + "step": 328480, + "train_speed(iter/s)": 0.123373 + }, + { + "acc": 0.74115524, + "epoch": 1.8371275936844196, + "grad_norm": 7.3125, + "learning_rate": 1.8020326876930506e-07, + "loss": 1.02245016, + "memory(GiB)": 302.58, + "step": 328500, + "train_speed(iter/s)": 0.123376 + }, + { + "acc": 0.75339093, + "epoch": 1.8372394431573986, + "grad_norm": 8.8125, + "learning_rate": 1.7995733382928593e-07, + "loss": 0.96562815, + "memory(GiB)": 302.58, + "step": 328520, + "train_speed(iter/s)": 0.12338 + }, + { + "acc": 0.74854193, + "epoch": 1.837351292630378, + "grad_norm": 8.625, + "learning_rate": 1.7971156375000966e-07, + "loss": 0.99034176, + "memory(GiB)": 302.58, + "step": 328540, + "train_speed(iter/s)": 0.123383 + }, + { + "acc": 0.73200493, + "epoch": 1.8374631421033571, + "grad_norm": 5.875, + "learning_rate": 1.7946595853988335e-07, + "loss": 1.05742359, + "memory(GiB)": 302.58, + "step": 328560, + "train_speed(iter/s)": 0.123387 + }, + { + "acc": 0.75299129, + "epoch": 1.8375749915763366, + "grad_norm": 11.375, + "learning_rate": 1.7922051820730646e-07, + "loss": 0.99428272, + "memory(GiB)": 302.58, + "step": 328580, + "train_speed(iter/s)": 0.12339 + }, + { + "acc": 0.74642773, + "epoch": 1.8376868410493157, + "grad_norm": 7.21875, + "learning_rate": 1.7897524276067392e-07, + "loss": 0.97865782, + "memory(GiB)": 302.58, + "step": 328600, + "train_speed(iter/s)": 0.123394 + }, + { + "acc": 0.74378166, + "epoch": 1.8377986905222952, + "grad_norm": 9.125, + "learning_rate": 1.7873013220837565e-07, + "loss": 1.00756521, + "memory(GiB)": 302.58, + "step": 328620, + "train_speed(iter/s)": 0.123397 + }, + { + "acc": 0.76412859, + "epoch": 1.8379105399952742, + "grad_norm": 4.9375, + "learning_rate": 1.7848518655879442e-07, + "loss": 0.91276741, + "memory(GiB)": 302.58, + "step": 328640, + "train_speed(iter/s)": 0.123401 + }, + { + "acc": 0.75824847, + "epoch": 1.8380223894682537, + "grad_norm": 7.15625, + "learning_rate": 1.7824040582030855e-07, + "loss": 0.91558695, + "memory(GiB)": 302.58, + "step": 328660, + "train_speed(iter/s)": 0.123405 + }, + { + "acc": 0.74763474, + "epoch": 1.8381342389412327, + "grad_norm": 10.3125, + "learning_rate": 1.779957900012902e-07, + "loss": 1.02103014, + "memory(GiB)": 302.58, + "step": 328680, + "train_speed(iter/s)": 0.123408 + }, + { + "acc": 0.72922964, + "epoch": 1.8382460884142122, + "grad_norm": 6.75, + "learning_rate": 1.77751339110106e-07, + "loss": 1.0740633, + "memory(GiB)": 302.58, + "step": 328700, + "train_speed(iter/s)": 0.123412 + }, + { + "acc": 0.74812002, + "epoch": 1.8383579378871913, + "grad_norm": 8.5625, + "learning_rate": 1.7750705315511652e-07, + "loss": 0.98911486, + "memory(GiB)": 302.58, + "step": 328720, + "train_speed(iter/s)": 0.123415 + }, + { + "acc": 0.7705709, + "epoch": 1.8384697873601707, + "grad_norm": 6.71875, + "learning_rate": 1.7726293214467726e-07, + "loss": 0.89745464, + "memory(GiB)": 302.58, + "step": 328740, + "train_speed(iter/s)": 0.123419 + }, + { + "acc": 0.74473157, + "epoch": 1.8385816368331498, + "grad_norm": 7.90625, + "learning_rate": 1.770189760871388e-07, + "loss": 0.98758936, + "memory(GiB)": 302.58, + "step": 328760, + "train_speed(iter/s)": 0.123422 + }, + { + "acc": 0.75034771, + "epoch": 1.8386934863061293, + "grad_norm": 7.21875, + "learning_rate": 1.7677518499084445e-07, + "loss": 0.98557291, + "memory(GiB)": 302.58, + "step": 328780, + "train_speed(iter/s)": 0.123426 + }, + { + "acc": 0.75457373, + "epoch": 1.8388053357791083, + "grad_norm": 8.75, + "learning_rate": 1.7653155886413308e-07, + "loss": 0.9527091, + "memory(GiB)": 302.58, + "step": 328800, + "train_speed(iter/s)": 0.123429 + }, + { + "acc": 0.75192237, + "epoch": 1.8389171852520878, + "grad_norm": 6.75, + "learning_rate": 1.7628809771533696e-07, + "loss": 0.9809659, + "memory(GiB)": 302.58, + "step": 328820, + "train_speed(iter/s)": 0.123433 + }, + { + "acc": 0.74415355, + "epoch": 1.8390290347250668, + "grad_norm": 6.6875, + "learning_rate": 1.7604480155278382e-07, + "loss": 1.02541409, + "memory(GiB)": 302.58, + "step": 328840, + "train_speed(iter/s)": 0.123436 + }, + { + "acc": 0.75178571, + "epoch": 1.8391408841980463, + "grad_norm": 8.75, + "learning_rate": 1.7580167038479423e-07, + "loss": 0.95643206, + "memory(GiB)": 302.58, + "step": 328860, + "train_speed(iter/s)": 0.123439 + }, + { + "acc": 0.7484221, + "epoch": 1.8392527336710254, + "grad_norm": 6.40625, + "learning_rate": 1.7555870421968546e-07, + "loss": 0.99918814, + "memory(GiB)": 302.58, + "step": 328880, + "train_speed(iter/s)": 0.123443 + }, + { + "acc": 0.73266706, + "epoch": 1.8393645831440049, + "grad_norm": 4.875, + "learning_rate": 1.753159030657664e-07, + "loss": 1.06620741, + "memory(GiB)": 302.58, + "step": 328900, + "train_speed(iter/s)": 0.123447 + }, + { + "acc": 0.73620572, + "epoch": 1.839476432616984, + "grad_norm": 6.625, + "learning_rate": 1.750732669313421e-07, + "loss": 1.03178177, + "memory(GiB)": 302.58, + "step": 328920, + "train_speed(iter/s)": 0.12345 + }, + { + "acc": 0.76820049, + "epoch": 1.8395882820899634, + "grad_norm": 7.78125, + "learning_rate": 1.7483079582471142e-07, + "loss": 0.91835556, + "memory(GiB)": 302.58, + "step": 328940, + "train_speed(iter/s)": 0.123454 + }, + { + "acc": 0.73509464, + "epoch": 1.8397001315629424, + "grad_norm": 9.125, + "learning_rate": 1.7458848975416775e-07, + "loss": 1.03165894, + "memory(GiB)": 302.58, + "step": 328960, + "train_speed(iter/s)": 0.123457 + }, + { + "acc": 0.74575877, + "epoch": 1.839811981035922, + "grad_norm": 7.09375, + "learning_rate": 1.7434634872799894e-07, + "loss": 1.01279449, + "memory(GiB)": 302.58, + "step": 328980, + "train_speed(iter/s)": 0.123461 + }, + { + "acc": 0.75265942, + "epoch": 1.839923830508901, + "grad_norm": 7.34375, + "learning_rate": 1.7410437275448666e-07, + "loss": 0.98624783, + "memory(GiB)": 302.58, + "step": 329000, + "train_speed(iter/s)": 0.123464 + }, + { + "acc": 0.7423255, + "epoch": 1.8400356799818804, + "grad_norm": 6.6875, + "learning_rate": 1.7386256184190654e-07, + "loss": 1.01708803, + "memory(GiB)": 302.58, + "step": 329020, + "train_speed(iter/s)": 0.123468 + }, + { + "acc": 0.76296358, + "epoch": 1.8401475294548595, + "grad_norm": 8.875, + "learning_rate": 1.7362091599853082e-07, + "loss": 0.94996414, + "memory(GiB)": 302.58, + "step": 329040, + "train_speed(iter/s)": 0.123471 + }, + { + "acc": 0.76845956, + "epoch": 1.840259378927839, + "grad_norm": 6.59375, + "learning_rate": 1.7337943523262346e-07, + "loss": 0.87587442, + "memory(GiB)": 302.58, + "step": 329060, + "train_speed(iter/s)": 0.123475 + }, + { + "acc": 0.74331131, + "epoch": 1.840371228400818, + "grad_norm": 7.90625, + "learning_rate": 1.7313811955244453e-07, + "loss": 1.00389433, + "memory(GiB)": 302.58, + "step": 329080, + "train_speed(iter/s)": 0.123478 + }, + { + "acc": 0.75136838, + "epoch": 1.8404830778737975, + "grad_norm": 7.8125, + "learning_rate": 1.7289696896624741e-07, + "loss": 0.98221855, + "memory(GiB)": 302.58, + "step": 329100, + "train_speed(iter/s)": 0.123482 + }, + { + "acc": 0.76970725, + "epoch": 1.8405949273467765, + "grad_norm": 6.4375, + "learning_rate": 1.7265598348228052e-07, + "loss": 0.89739151, + "memory(GiB)": 302.58, + "step": 329120, + "train_speed(iter/s)": 0.123486 + }, + { + "acc": 0.74932036, + "epoch": 1.840706776819756, + "grad_norm": 7.1875, + "learning_rate": 1.7241516310878614e-07, + "loss": 0.99302778, + "memory(GiB)": 302.58, + "step": 329140, + "train_speed(iter/s)": 0.123489 + }, + { + "acc": 0.73284392, + "epoch": 1.840818626292735, + "grad_norm": 7.03125, + "learning_rate": 1.7217450785400048e-07, + "loss": 1.07394361, + "memory(GiB)": 302.58, + "step": 329160, + "train_speed(iter/s)": 0.123492 + }, + { + "acc": 0.74084878, + "epoch": 1.8409304757657146, + "grad_norm": 10.0, + "learning_rate": 1.719340177261558e-07, + "loss": 1.01699905, + "memory(GiB)": 302.58, + "step": 329180, + "train_speed(iter/s)": 0.123496 + }, + { + "acc": 0.753263, + "epoch": 1.8410423252386936, + "grad_norm": 8.5, + "learning_rate": 1.716936927334767e-07, + "loss": 0.95709667, + "memory(GiB)": 302.58, + "step": 329200, + "train_speed(iter/s)": 0.123499 + }, + { + "acc": 0.77064228, + "epoch": 1.841154174711673, + "grad_norm": 5.71875, + "learning_rate": 1.7145353288418375e-07, + "loss": 0.88352518, + "memory(GiB)": 302.58, + "step": 329220, + "train_speed(iter/s)": 0.123503 + }, + { + "acc": 0.7553472, + "epoch": 1.8412660241846524, + "grad_norm": 4.3125, + "learning_rate": 1.71213538186491e-07, + "loss": 0.94404984, + "memory(GiB)": 302.58, + "step": 329240, + "train_speed(iter/s)": 0.123507 + }, + { + "acc": 0.76448426, + "epoch": 1.8413778736576316, + "grad_norm": 6.90625, + "learning_rate": 1.709737086486063e-07, + "loss": 0.92455473, + "memory(GiB)": 302.58, + "step": 329260, + "train_speed(iter/s)": 0.12351 + }, + { + "acc": 0.78074613, + "epoch": 1.8414897231306109, + "grad_norm": 8.1875, + "learning_rate": 1.7073404427873363e-07, + "loss": 0.85685177, + "memory(GiB)": 302.58, + "step": 329280, + "train_speed(iter/s)": 0.123514 + }, + { + "acc": 0.75133348, + "epoch": 1.8416015726035901, + "grad_norm": 8.9375, + "learning_rate": 1.7049454508506925e-07, + "loss": 0.98484659, + "memory(GiB)": 302.58, + "step": 329300, + "train_speed(iter/s)": 0.123517 + }, + { + "acc": 0.76490149, + "epoch": 1.8417134220765694, + "grad_norm": 8.5, + "learning_rate": 1.7025521107580602e-07, + "loss": 0.91121273, + "memory(GiB)": 302.58, + "step": 329320, + "train_speed(iter/s)": 0.123521 + }, + { + "acc": 0.74292231, + "epoch": 1.8418252715495487, + "grad_norm": 7.71875, + "learning_rate": 1.7001604225912906e-07, + "loss": 0.99552364, + "memory(GiB)": 302.58, + "step": 329340, + "train_speed(iter/s)": 0.123524 + }, + { + "acc": 0.73784294, + "epoch": 1.841937121022528, + "grad_norm": 7.5, + "learning_rate": 1.6977703864321904e-07, + "loss": 1.0462306, + "memory(GiB)": 302.58, + "step": 329360, + "train_speed(iter/s)": 0.123528 + }, + { + "acc": 0.74421945, + "epoch": 1.8420489704955072, + "grad_norm": 7.0625, + "learning_rate": 1.6953820023625057e-07, + "loss": 1.01323309, + "memory(GiB)": 302.58, + "step": 329380, + "train_speed(iter/s)": 0.123532 + }, + { + "acc": 0.74716692, + "epoch": 1.8421608199684865, + "grad_norm": 7.0625, + "learning_rate": 1.6929952704639262e-07, + "loss": 0.98303671, + "memory(GiB)": 302.58, + "step": 329400, + "train_speed(iter/s)": 0.123535 + }, + { + "acc": 0.73158336, + "epoch": 1.8422726694414657, + "grad_norm": 9.0625, + "learning_rate": 1.6906101908180816e-07, + "loss": 1.06790762, + "memory(GiB)": 302.58, + "step": 329420, + "train_speed(iter/s)": 0.123539 + }, + { + "acc": 0.76151667, + "epoch": 1.842384518914445, + "grad_norm": 5.9375, + "learning_rate": 1.6882267635065564e-07, + "loss": 0.93606844, + "memory(GiB)": 302.58, + "step": 329440, + "train_speed(iter/s)": 0.123542 + }, + { + "acc": 0.75731025, + "epoch": 1.8424963683874243, + "grad_norm": 6.0625, + "learning_rate": 1.685844988610874e-07, + "loss": 0.95171432, + "memory(GiB)": 302.58, + "step": 329460, + "train_speed(iter/s)": 0.123546 + }, + { + "acc": 0.73554649, + "epoch": 1.8426082178604035, + "grad_norm": 7.53125, + "learning_rate": 1.6834648662124865e-07, + "loss": 1.05581617, + "memory(GiB)": 302.58, + "step": 329480, + "train_speed(iter/s)": 0.123549 + }, + { + "acc": 0.75955281, + "epoch": 1.8427200673333828, + "grad_norm": 6.40625, + "learning_rate": 1.681086396392806e-07, + "loss": 0.93909321, + "memory(GiB)": 302.58, + "step": 329500, + "train_speed(iter/s)": 0.123553 + }, + { + "acc": 0.75689621, + "epoch": 1.842831916806362, + "grad_norm": 8.8125, + "learning_rate": 1.6787095792331897e-07, + "loss": 0.96886587, + "memory(GiB)": 302.58, + "step": 329520, + "train_speed(iter/s)": 0.123556 + }, + { + "acc": 0.75744667, + "epoch": 1.8429437662793413, + "grad_norm": 7.9375, + "learning_rate": 1.6763344148149285e-07, + "loss": 0.95430183, + "memory(GiB)": 302.58, + "step": 329540, + "train_speed(iter/s)": 0.12356 + }, + { + "acc": 0.76340809, + "epoch": 1.8430556157523206, + "grad_norm": 10.375, + "learning_rate": 1.6739609032192573e-07, + "loss": 0.92684364, + "memory(GiB)": 302.58, + "step": 329560, + "train_speed(iter/s)": 0.123563 + }, + { + "acc": 0.76069317, + "epoch": 1.8431674652252998, + "grad_norm": 5.75, + "learning_rate": 1.6715890445273608e-07, + "loss": 0.92214489, + "memory(GiB)": 302.58, + "step": 329580, + "train_speed(iter/s)": 0.123567 + }, + { + "acc": 0.76070156, + "epoch": 1.843279314698279, + "grad_norm": 8.875, + "learning_rate": 1.669218838820369e-07, + "loss": 0.93901615, + "memory(GiB)": 302.58, + "step": 329600, + "train_speed(iter/s)": 0.123571 + }, + { + "acc": 0.75146666, + "epoch": 1.8433911641712584, + "grad_norm": 7.09375, + "learning_rate": 1.6668502861793445e-07, + "loss": 0.98210173, + "memory(GiB)": 302.58, + "step": 329620, + "train_speed(iter/s)": 0.123574 + }, + { + "acc": 0.73800907, + "epoch": 1.8435030136442376, + "grad_norm": 6.4375, + "learning_rate": 1.6644833866853006e-07, + "loss": 1.04332266, + "memory(GiB)": 302.58, + "step": 329640, + "train_speed(iter/s)": 0.123578 + }, + { + "acc": 0.73822622, + "epoch": 1.843614863117217, + "grad_norm": 5.875, + "learning_rate": 1.6621181404191889e-07, + "loss": 1.02473412, + "memory(GiB)": 302.58, + "step": 329660, + "train_speed(iter/s)": 0.123581 + }, + { + "acc": 0.75157576, + "epoch": 1.8437267125901962, + "grad_norm": 5.65625, + "learning_rate": 1.659754547461917e-07, + "loss": 0.97556324, + "memory(GiB)": 302.58, + "step": 329680, + "train_speed(iter/s)": 0.123585 + }, + { + "acc": 0.76205244, + "epoch": 1.8438385620631754, + "grad_norm": 6.78125, + "learning_rate": 1.6573926078943202e-07, + "loss": 0.91036539, + "memory(GiB)": 302.58, + "step": 329700, + "train_speed(iter/s)": 0.123588 + }, + { + "acc": 0.7424222, + "epoch": 1.8439504115361547, + "grad_norm": 6.59375, + "learning_rate": 1.6550323217971897e-07, + "loss": 1.00359249, + "memory(GiB)": 302.58, + "step": 329720, + "train_speed(iter/s)": 0.123592 + }, + { + "acc": 0.74647841, + "epoch": 1.844062261009134, + "grad_norm": 6.59375, + "learning_rate": 1.6526736892512441e-07, + "loss": 0.99428654, + "memory(GiB)": 302.58, + "step": 329740, + "train_speed(iter/s)": 0.123595 + }, + { + "acc": 0.7386549, + "epoch": 1.8441741104821132, + "grad_norm": 5.15625, + "learning_rate": 1.6503167103371688e-07, + "loss": 1.05009727, + "memory(GiB)": 302.58, + "step": 329760, + "train_speed(iter/s)": 0.123599 + }, + { + "acc": 0.75089211, + "epoch": 1.8442859599550925, + "grad_norm": 7.65625, + "learning_rate": 1.647961385135577e-07, + "loss": 0.9855402, + "memory(GiB)": 302.58, + "step": 329780, + "train_speed(iter/s)": 0.123603 + }, + { + "acc": 0.73937035, + "epoch": 1.8443978094280717, + "grad_norm": 8.75, + "learning_rate": 1.6456077137270154e-07, + "loss": 1.0022953, + "memory(GiB)": 302.58, + "step": 329800, + "train_speed(iter/s)": 0.123606 + }, + { + "acc": 0.73777461, + "epoch": 1.844509658901051, + "grad_norm": 7.34375, + "learning_rate": 1.643255696192003e-07, + "loss": 1.04957333, + "memory(GiB)": 302.58, + "step": 329820, + "train_speed(iter/s)": 0.123609 + }, + { + "acc": 0.76161985, + "epoch": 1.8446215083740303, + "grad_norm": 5.9375, + "learning_rate": 1.6409053326109868e-07, + "loss": 0.93255663, + "memory(GiB)": 302.58, + "step": 329840, + "train_speed(iter/s)": 0.123613 + }, + { + "acc": 0.75184646, + "epoch": 1.8447333578470095, + "grad_norm": 4.375, + "learning_rate": 1.6385566230643468e-07, + "loss": 0.98308649, + "memory(GiB)": 302.58, + "step": 329860, + "train_speed(iter/s)": 0.123616 + }, + { + "acc": 0.75652304, + "epoch": 1.8448452073199888, + "grad_norm": 6.875, + "learning_rate": 1.6362095676324186e-07, + "loss": 0.97680407, + "memory(GiB)": 302.58, + "step": 329880, + "train_speed(iter/s)": 0.12362 + }, + { + "acc": 0.7265964, + "epoch": 1.844957056792968, + "grad_norm": 5.84375, + "learning_rate": 1.6338641663954768e-07, + "loss": 1.08724375, + "memory(GiB)": 302.58, + "step": 329900, + "train_speed(iter/s)": 0.123623 + }, + { + "acc": 0.73893566, + "epoch": 1.8450689062659473, + "grad_norm": 9.0625, + "learning_rate": 1.6315204194337518e-07, + "loss": 1.04897442, + "memory(GiB)": 302.58, + "step": 329920, + "train_speed(iter/s)": 0.123627 + }, + { + "acc": 0.74790893, + "epoch": 1.8451807557389266, + "grad_norm": 4.96875, + "learning_rate": 1.629178326827402e-07, + "loss": 0.99954929, + "memory(GiB)": 302.58, + "step": 329940, + "train_speed(iter/s)": 0.12363 + }, + { + "acc": 0.75039101, + "epoch": 1.8452926052119059, + "grad_norm": 9.6875, + "learning_rate": 1.6268378886565296e-07, + "loss": 0.97722778, + "memory(GiB)": 302.58, + "step": 329960, + "train_speed(iter/s)": 0.123634 + }, + { + "acc": 0.76840305, + "epoch": 1.8454044546848851, + "grad_norm": 7.40625, + "learning_rate": 1.624499105001187e-07, + "loss": 0.88508434, + "memory(GiB)": 302.58, + "step": 329980, + "train_speed(iter/s)": 0.123637 + }, + { + "acc": 0.77136106, + "epoch": 1.8455163041578644, + "grad_norm": 8.875, + "learning_rate": 1.6221619759413776e-07, + "loss": 0.8949996, + "memory(GiB)": 302.58, + "step": 330000, + "train_speed(iter/s)": 0.123641 + }, + { + "epoch": 1.8455163041578644, + "eval_acc": 0.7068916118461754, + "eval_loss": 1.0117977857589722, + "eval_runtime": 7608.4905, + "eval_samples_per_second": 9.895, + "eval_steps_per_second": 9.895, + "step": 330000 + }, + { + "acc": 0.76317983, + "epoch": 1.8456281536308436, + "grad_norm": 10.0, + "learning_rate": 1.6198265015570258e-07, + "loss": 0.92034531, + "memory(GiB)": 302.58, + "step": 330020, + "train_speed(iter/s)": 0.123287 + }, + { + "acc": 0.75387287, + "epoch": 1.845740003103823, + "grad_norm": 8.0625, + "learning_rate": 1.6174926819280124e-07, + "loss": 0.9860178, + "memory(GiB)": 302.58, + "step": 330040, + "train_speed(iter/s)": 0.123291 + }, + { + "acc": 0.74883657, + "epoch": 1.8458518525768022, + "grad_norm": 7.6875, + "learning_rate": 1.6151605171341734e-07, + "loss": 1.02400923, + "memory(GiB)": 302.58, + "step": 330060, + "train_speed(iter/s)": 0.123294 + }, + { + "acc": 0.74639173, + "epoch": 1.8459637020497814, + "grad_norm": 7.25, + "learning_rate": 1.612830007255267e-07, + "loss": 0.99793882, + "memory(GiB)": 302.58, + "step": 330080, + "train_speed(iter/s)": 0.123297 + }, + { + "acc": 0.75198121, + "epoch": 1.8460755515227607, + "grad_norm": 7.34375, + "learning_rate": 1.6105011523710024e-07, + "loss": 0.99083052, + "memory(GiB)": 302.58, + "step": 330100, + "train_speed(iter/s)": 0.1233 + }, + { + "acc": 0.74194007, + "epoch": 1.84618740099574, + "grad_norm": 6.5625, + "learning_rate": 1.6081739525610428e-07, + "loss": 1.03145666, + "memory(GiB)": 302.58, + "step": 330120, + "train_speed(iter/s)": 0.123304 + }, + { + "acc": 0.76947303, + "epoch": 1.8462992504687192, + "grad_norm": 7.78125, + "learning_rate": 1.6058484079049753e-07, + "loss": 0.89986362, + "memory(GiB)": 302.58, + "step": 330140, + "train_speed(iter/s)": 0.123307 + }, + { + "acc": 0.73906918, + "epoch": 1.8464110999416985, + "grad_norm": 4.1875, + "learning_rate": 1.6035245184823468e-07, + "loss": 1.03862381, + "memory(GiB)": 302.58, + "step": 330160, + "train_speed(iter/s)": 0.123311 + }, + { + "acc": 0.74291205, + "epoch": 1.8465229494146778, + "grad_norm": 7.21875, + "learning_rate": 1.6012022843726383e-07, + "loss": 1.01451521, + "memory(GiB)": 302.58, + "step": 330180, + "train_speed(iter/s)": 0.123314 + }, + { + "acc": 0.75830765, + "epoch": 1.846634798887657, + "grad_norm": 9.375, + "learning_rate": 1.5988817056552808e-07, + "loss": 0.95548868, + "memory(GiB)": 302.58, + "step": 330200, + "train_speed(iter/s)": 0.123318 + }, + { + "acc": 0.74517241, + "epoch": 1.8467466483606363, + "grad_norm": 7.53125, + "learning_rate": 1.5965627824096498e-07, + "loss": 1.02675724, + "memory(GiB)": 302.58, + "step": 330220, + "train_speed(iter/s)": 0.123321 + }, + { + "acc": 0.76176853, + "epoch": 1.8468584978336156, + "grad_norm": 9.6875, + "learning_rate": 1.5942455147150536e-07, + "loss": 0.93230009, + "memory(GiB)": 302.58, + "step": 330240, + "train_speed(iter/s)": 0.123325 + }, + { + "acc": 0.74255624, + "epoch": 1.8469703473065948, + "grad_norm": 9.375, + "learning_rate": 1.5919299026507518e-07, + "loss": 1.0058362, + "memory(GiB)": 302.58, + "step": 330260, + "train_speed(iter/s)": 0.123328 + }, + { + "acc": 0.76554828, + "epoch": 1.847082196779574, + "grad_norm": 8.5, + "learning_rate": 1.5896159462959416e-07, + "loss": 0.91458302, + "memory(GiB)": 302.58, + "step": 330280, + "train_speed(iter/s)": 0.123332 + }, + { + "acc": 0.75092969, + "epoch": 1.8471940462525533, + "grad_norm": 7.625, + "learning_rate": 1.5873036457297707e-07, + "loss": 1.02553291, + "memory(GiB)": 302.58, + "step": 330300, + "train_speed(iter/s)": 0.123335 + }, + { + "acc": 0.74703331, + "epoch": 1.8473058957255326, + "grad_norm": 8.125, + "learning_rate": 1.584993001031332e-07, + "loss": 1.00201178, + "memory(GiB)": 302.58, + "step": 330320, + "train_speed(iter/s)": 0.123339 + }, + { + "acc": 0.75862193, + "epoch": 1.8474177451985119, + "grad_norm": 6.0625, + "learning_rate": 1.5826840122796504e-07, + "loss": 0.9683548, + "memory(GiB)": 302.58, + "step": 330340, + "train_speed(iter/s)": 0.123342 + }, + { + "acc": 0.7513042, + "epoch": 1.8475295946714911, + "grad_norm": 7.09375, + "learning_rate": 1.5803766795537022e-07, + "loss": 0.97327957, + "memory(GiB)": 302.58, + "step": 330360, + "train_speed(iter/s)": 0.123346 + }, + { + "acc": 0.75176206, + "epoch": 1.8476414441444704, + "grad_norm": 6.1875, + "learning_rate": 1.5780710029324077e-07, + "loss": 0.98311319, + "memory(GiB)": 302.58, + "step": 330380, + "train_speed(iter/s)": 0.123349 + }, + { + "acc": 0.75457096, + "epoch": 1.8477532936174497, + "grad_norm": 8.1875, + "learning_rate": 1.5757669824946253e-07, + "loss": 0.96079464, + "memory(GiB)": 302.58, + "step": 330400, + "train_speed(iter/s)": 0.123353 + }, + { + "acc": 0.72958598, + "epoch": 1.847865143090429, + "grad_norm": 7.75, + "learning_rate": 1.573464618319165e-07, + "loss": 1.08299913, + "memory(GiB)": 302.58, + "step": 330420, + "train_speed(iter/s)": 0.123356 + }, + { + "acc": 0.7594409, + "epoch": 1.8479769925634082, + "grad_norm": 7.375, + "learning_rate": 1.571163910484763e-07, + "loss": 0.93389177, + "memory(GiB)": 302.58, + "step": 330440, + "train_speed(iter/s)": 0.12336 + }, + { + "acc": 0.77405653, + "epoch": 1.8480888420363875, + "grad_norm": 7.28125, + "learning_rate": 1.568864859070124e-07, + "loss": 0.88456974, + "memory(GiB)": 302.58, + "step": 330460, + "train_speed(iter/s)": 0.123364 + }, + { + "acc": 0.73122225, + "epoch": 1.8482006915093667, + "grad_norm": 13.75, + "learning_rate": 1.5665674641538787e-07, + "loss": 1.08825054, + "memory(GiB)": 302.58, + "step": 330480, + "train_speed(iter/s)": 0.123367 + }, + { + "acc": 0.73885102, + "epoch": 1.848312540982346, + "grad_norm": 9.9375, + "learning_rate": 1.5642717258145979e-07, + "loss": 1.01685705, + "memory(GiB)": 302.58, + "step": 330500, + "train_speed(iter/s)": 0.123371 + }, + { + "acc": 0.74795771, + "epoch": 1.8484243904553253, + "grad_norm": 7.375, + "learning_rate": 1.5619776441308187e-07, + "loss": 0.97961235, + "memory(GiB)": 302.58, + "step": 330520, + "train_speed(iter/s)": 0.123374 + }, + { + "acc": 0.75198159, + "epoch": 1.8485362399283045, + "grad_norm": 11.0, + "learning_rate": 1.5596852191809898e-07, + "loss": 0.96896124, + "memory(GiB)": 302.58, + "step": 330540, + "train_speed(iter/s)": 0.123377 + }, + { + "acc": 0.75685263, + "epoch": 1.8486480894012838, + "grad_norm": 6.5, + "learning_rate": 1.5573944510435312e-07, + "loss": 0.97688828, + "memory(GiB)": 302.58, + "step": 330560, + "train_speed(iter/s)": 0.123381 + }, + { + "acc": 0.75272274, + "epoch": 1.848759938874263, + "grad_norm": 5.09375, + "learning_rate": 1.555105339796792e-07, + "loss": 0.96664333, + "memory(GiB)": 302.58, + "step": 330580, + "train_speed(iter/s)": 0.123384 + }, + { + "acc": 0.78276386, + "epoch": 1.8488717883472423, + "grad_norm": 6.15625, + "learning_rate": 1.552817885519059e-07, + "loss": 0.84769239, + "memory(GiB)": 302.58, + "step": 330600, + "train_speed(iter/s)": 0.123388 + }, + { + "acc": 0.76244569, + "epoch": 1.8489836378202216, + "grad_norm": 7.90625, + "learning_rate": 1.550532088288581e-07, + "loss": 0.93872929, + "memory(GiB)": 302.58, + "step": 330620, + "train_speed(iter/s)": 0.123391 + }, + { + "acc": 0.73526983, + "epoch": 1.8490954872932008, + "grad_norm": 4.40625, + "learning_rate": 1.548247948183529e-07, + "loss": 1.05064678, + "memory(GiB)": 302.58, + "step": 330640, + "train_speed(iter/s)": 0.123395 + }, + { + "acc": 0.75782251, + "epoch": 1.84920733676618, + "grad_norm": 9.125, + "learning_rate": 1.5459654652820343e-07, + "loss": 0.96535301, + "memory(GiB)": 302.58, + "step": 330660, + "train_speed(iter/s)": 0.123398 + }, + { + "acc": 0.74468174, + "epoch": 1.8493191862391594, + "grad_norm": 4.1875, + "learning_rate": 1.5436846396621684e-07, + "loss": 1.02395382, + "memory(GiB)": 302.58, + "step": 330680, + "train_speed(iter/s)": 0.123402 + }, + { + "acc": 0.77429223, + "epoch": 1.8494310357121386, + "grad_norm": 6.71875, + "learning_rate": 1.5414054714019356e-07, + "loss": 0.88223066, + "memory(GiB)": 302.58, + "step": 330700, + "train_speed(iter/s)": 0.123405 + }, + { + "acc": 0.75192084, + "epoch": 1.849542885185118, + "grad_norm": 12.75, + "learning_rate": 1.5391279605792896e-07, + "loss": 0.99108944, + "memory(GiB)": 302.58, + "step": 330720, + "train_speed(iter/s)": 0.123409 + }, + { + "acc": 0.75309935, + "epoch": 1.8496547346580972, + "grad_norm": 5.9375, + "learning_rate": 1.5368521072721409e-07, + "loss": 0.96047544, + "memory(GiB)": 302.58, + "step": 330740, + "train_speed(iter/s)": 0.123412 + }, + { + "acc": 0.74319711, + "epoch": 1.8497665841310764, + "grad_norm": 9.8125, + "learning_rate": 1.534577911558316e-07, + "loss": 1.02449446, + "memory(GiB)": 302.58, + "step": 330760, + "train_speed(iter/s)": 0.123415 + }, + { + "acc": 0.76757669, + "epoch": 1.8498784336040557, + "grad_norm": 5.40625, + "learning_rate": 1.5323053735156136e-07, + "loss": 0.9138319, + "memory(GiB)": 302.58, + "step": 330780, + "train_speed(iter/s)": 0.123419 + }, + { + "acc": 0.74287009, + "epoch": 1.849990283077035, + "grad_norm": 6.34375, + "learning_rate": 1.5300344932217494e-07, + "loss": 1.03663378, + "memory(GiB)": 302.58, + "step": 330800, + "train_speed(iter/s)": 0.123422 + }, + { + "acc": 0.76329699, + "epoch": 1.8501021325500142, + "grad_norm": 6.53125, + "learning_rate": 1.5277652707544e-07, + "loss": 0.92495213, + "memory(GiB)": 302.58, + "step": 330820, + "train_speed(iter/s)": 0.123426 + }, + { + "acc": 0.74791436, + "epoch": 1.8502139820229935, + "grad_norm": 4.71875, + "learning_rate": 1.5254977061911813e-07, + "loss": 0.99685421, + "memory(GiB)": 302.58, + "step": 330840, + "train_speed(iter/s)": 0.12343 + }, + { + "acc": 0.74476299, + "epoch": 1.8503258314959727, + "grad_norm": 9.0625, + "learning_rate": 1.523231799609648e-07, + "loss": 1.00235758, + "memory(GiB)": 302.58, + "step": 330860, + "train_speed(iter/s)": 0.123433 + }, + { + "acc": 0.75410657, + "epoch": 1.850437680968952, + "grad_norm": 5.4375, + "learning_rate": 1.5209675510873044e-07, + "loss": 0.97257395, + "memory(GiB)": 302.58, + "step": 330880, + "train_speed(iter/s)": 0.123436 + }, + { + "acc": 0.74377646, + "epoch": 1.8505495304419313, + "grad_norm": 6.59375, + "learning_rate": 1.5187049607015891e-07, + "loss": 1.01474524, + "memory(GiB)": 302.58, + "step": 330900, + "train_speed(iter/s)": 0.12344 + }, + { + "acc": 0.74733663, + "epoch": 1.8506613799149105, + "grad_norm": 5.375, + "learning_rate": 1.5164440285299008e-07, + "loss": 1.00716133, + "memory(GiB)": 302.58, + "step": 330920, + "train_speed(iter/s)": 0.123443 + }, + { + "acc": 0.75785623, + "epoch": 1.8507732293878898, + "grad_norm": 8.0625, + "learning_rate": 1.5141847546495615e-07, + "loss": 0.95806971, + "memory(GiB)": 302.58, + "step": 330940, + "train_speed(iter/s)": 0.123447 + }, + { + "acc": 0.75339746, + "epoch": 1.850885078860869, + "grad_norm": 8.125, + "learning_rate": 1.5119271391378477e-07, + "loss": 0.97710209, + "memory(GiB)": 302.58, + "step": 330960, + "train_speed(iter/s)": 0.12345 + }, + { + "acc": 0.74324579, + "epoch": 1.8509969283338483, + "grad_norm": 5.5, + "learning_rate": 1.5096711820719756e-07, + "loss": 1.00320206, + "memory(GiB)": 302.58, + "step": 330980, + "train_speed(iter/s)": 0.123454 + }, + { + "acc": 0.75934424, + "epoch": 1.8511087778068276, + "grad_norm": 8.8125, + "learning_rate": 1.5074168835291002e-07, + "loss": 0.94159384, + "memory(GiB)": 302.58, + "step": 331000, + "train_speed(iter/s)": 0.123457 + }, + { + "acc": 0.75268731, + "epoch": 1.8512206272798069, + "grad_norm": 6.71875, + "learning_rate": 1.5051642435863433e-07, + "loss": 0.98180847, + "memory(GiB)": 302.58, + "step": 331020, + "train_speed(iter/s)": 0.123461 + }, + { + "acc": 0.74649553, + "epoch": 1.8513324767527861, + "grad_norm": 7.125, + "learning_rate": 1.5029132623207432e-07, + "loss": 1.00591669, + "memory(GiB)": 302.58, + "step": 331040, + "train_speed(iter/s)": 0.123464 + }, + { + "acc": 0.76580639, + "epoch": 1.8514443262257654, + "grad_norm": 7.3125, + "learning_rate": 1.500663939809277e-07, + "loss": 0.9263546, + "memory(GiB)": 302.58, + "step": 331060, + "train_speed(iter/s)": 0.123468 + }, + { + "acc": 0.74352331, + "epoch": 1.8515561756987446, + "grad_norm": 8.5, + "learning_rate": 1.4984162761289056e-07, + "loss": 1.01357927, + "memory(GiB)": 302.58, + "step": 331080, + "train_speed(iter/s)": 0.123472 + }, + { + "acc": 0.76882787, + "epoch": 1.851668025171724, + "grad_norm": 8.0, + "learning_rate": 1.496170271356484e-07, + "loss": 0.90284414, + "memory(GiB)": 302.58, + "step": 331100, + "train_speed(iter/s)": 0.123475 + }, + { + "acc": 0.7564302, + "epoch": 1.8517798746447032, + "grad_norm": 7.125, + "learning_rate": 1.4939259255688453e-07, + "loss": 0.96531477, + "memory(GiB)": 302.58, + "step": 331120, + "train_speed(iter/s)": 0.123479 + }, + { + "acc": 0.7609169, + "epoch": 1.8518917241176824, + "grad_norm": 6.28125, + "learning_rate": 1.4916832388427504e-07, + "loss": 0.93497391, + "memory(GiB)": 302.58, + "step": 331140, + "train_speed(iter/s)": 0.123482 + }, + { + "acc": 0.76658731, + "epoch": 1.8520035735906617, + "grad_norm": 6.875, + "learning_rate": 1.4894422112548988e-07, + "loss": 0.91791906, + "memory(GiB)": 302.58, + "step": 331160, + "train_speed(iter/s)": 0.123486 + }, + { + "acc": 0.74054351, + "epoch": 1.852115423063641, + "grad_norm": 7.71875, + "learning_rate": 1.4872028428819517e-07, + "loss": 1.03660412, + "memory(GiB)": 302.58, + "step": 331180, + "train_speed(iter/s)": 0.123489 + }, + { + "acc": 0.75337477, + "epoch": 1.8522272725366202, + "grad_norm": 7.9375, + "learning_rate": 1.4849651338004922e-07, + "loss": 0.97190723, + "memory(GiB)": 302.58, + "step": 331200, + "train_speed(iter/s)": 0.123493 + }, + { + "acc": 0.75324397, + "epoch": 1.8523391220095995, + "grad_norm": 8.625, + "learning_rate": 1.4827290840870645e-07, + "loss": 0.96947737, + "memory(GiB)": 302.58, + "step": 331220, + "train_speed(iter/s)": 0.123496 + }, + { + "acc": 0.7623374, + "epoch": 1.8524509714825788, + "grad_norm": 5.84375, + "learning_rate": 1.4804946938181464e-07, + "loss": 0.92593222, + "memory(GiB)": 302.58, + "step": 331240, + "train_speed(iter/s)": 0.123499 + }, + { + "acc": 0.7344945, + "epoch": 1.852562820955558, + "grad_norm": 8.9375, + "learning_rate": 1.47826196307016e-07, + "loss": 1.0650526, + "memory(GiB)": 302.58, + "step": 331260, + "train_speed(iter/s)": 0.123503 + }, + { + "acc": 0.73566995, + "epoch": 1.8526746704285373, + "grad_norm": 6.3125, + "learning_rate": 1.4760308919194776e-07, + "loss": 1.04074907, + "memory(GiB)": 302.58, + "step": 331280, + "train_speed(iter/s)": 0.123506 + }, + { + "acc": 0.7537302, + "epoch": 1.8527865199015165, + "grad_norm": 5.875, + "learning_rate": 1.4738014804423994e-07, + "loss": 0.95037899, + "memory(GiB)": 302.58, + "step": 331300, + "train_speed(iter/s)": 0.123509 + }, + { + "acc": 0.7532043, + "epoch": 1.8528983693744958, + "grad_norm": 6.65625, + "learning_rate": 1.4715737287151754e-07, + "loss": 0.96413755, + "memory(GiB)": 302.58, + "step": 331320, + "train_speed(iter/s)": 0.123513 + }, + { + "acc": 0.74033537, + "epoch": 1.853010218847475, + "grad_norm": 6.3125, + "learning_rate": 1.4693476368140224e-07, + "loss": 1.03309669, + "memory(GiB)": 302.58, + "step": 331340, + "train_speed(iter/s)": 0.123516 + }, + { + "acc": 0.75727062, + "epoch": 1.8531220683204543, + "grad_norm": 7.53125, + "learning_rate": 1.4671232048150575e-07, + "loss": 0.92611475, + "memory(GiB)": 302.58, + "step": 331360, + "train_speed(iter/s)": 0.12352 + }, + { + "acc": 0.74364915, + "epoch": 1.8532339177934336, + "grad_norm": 9.75, + "learning_rate": 1.4649004327943806e-07, + "loss": 1.01961021, + "memory(GiB)": 302.58, + "step": 331380, + "train_speed(iter/s)": 0.123523 + }, + { + "acc": 0.74714704, + "epoch": 1.8533457672664129, + "grad_norm": 10.0625, + "learning_rate": 1.4626793208280032e-07, + "loss": 0.99815941, + "memory(GiB)": 302.58, + "step": 331400, + "train_speed(iter/s)": 0.123527 + }, + { + "acc": 0.73539286, + "epoch": 1.8534576167393921, + "grad_norm": 5.9375, + "learning_rate": 1.4604598689919036e-07, + "loss": 1.06863575, + "memory(GiB)": 302.58, + "step": 331420, + "train_speed(iter/s)": 0.12353 + }, + { + "acc": 0.75783854, + "epoch": 1.8535694662123714, + "grad_norm": 5.0625, + "learning_rate": 1.4582420773619933e-07, + "loss": 0.94471655, + "memory(GiB)": 302.58, + "step": 331440, + "train_speed(iter/s)": 0.123534 + }, + { + "acc": 0.75029573, + "epoch": 1.8536813156853507, + "grad_norm": 7.9375, + "learning_rate": 1.4560259460141223e-07, + "loss": 0.98482122, + "memory(GiB)": 302.58, + "step": 331460, + "train_speed(iter/s)": 0.123537 + }, + { + "acc": 0.75532932, + "epoch": 1.85379316515833, + "grad_norm": 6.5625, + "learning_rate": 1.4538114750240918e-07, + "loss": 0.9485281, + "memory(GiB)": 302.58, + "step": 331480, + "train_speed(iter/s)": 0.12354 + }, + { + "acc": 0.7515305, + "epoch": 1.8539050146313092, + "grad_norm": 8.5, + "learning_rate": 1.4515986644676462e-07, + "loss": 0.95051622, + "memory(GiB)": 302.58, + "step": 331500, + "train_speed(iter/s)": 0.123544 + }, + { + "acc": 0.76064558, + "epoch": 1.8540168641042885, + "grad_norm": 4.8125, + "learning_rate": 1.4493875144204696e-07, + "loss": 0.93504572, + "memory(GiB)": 302.58, + "step": 331520, + "train_speed(iter/s)": 0.123548 + }, + { + "acc": 0.75790982, + "epoch": 1.8541287135772677, + "grad_norm": 10.8125, + "learning_rate": 1.44717802495819e-07, + "loss": 0.97071915, + "memory(GiB)": 302.58, + "step": 331540, + "train_speed(iter/s)": 0.123551 + }, + { + "acc": 0.75312309, + "epoch": 1.854240563050247, + "grad_norm": 6.90625, + "learning_rate": 1.4449701961563812e-07, + "loss": 0.959058, + "memory(GiB)": 302.58, + "step": 331560, + "train_speed(iter/s)": 0.123555 + }, + { + "acc": 0.76679549, + "epoch": 1.8543524125232262, + "grad_norm": 10.75, + "learning_rate": 1.4427640280905542e-07, + "loss": 0.90643759, + "memory(GiB)": 302.58, + "step": 331580, + "train_speed(iter/s)": 0.123558 + }, + { + "acc": 0.75442386, + "epoch": 1.8544642619962055, + "grad_norm": 7.125, + "learning_rate": 1.44055952083616e-07, + "loss": 0.95947065, + "memory(GiB)": 302.58, + "step": 331600, + "train_speed(iter/s)": 0.123562 + }, + { + "acc": 0.75333848, + "epoch": 1.8545761114691848, + "grad_norm": 8.3125, + "learning_rate": 1.438356674468616e-07, + "loss": 0.97753162, + "memory(GiB)": 302.58, + "step": 331620, + "train_speed(iter/s)": 0.123565 + }, + { + "acc": 0.73337259, + "epoch": 1.854687960942164, + "grad_norm": 8.1875, + "learning_rate": 1.4361554890632569e-07, + "loss": 1.05548639, + "memory(GiB)": 302.58, + "step": 331640, + "train_speed(iter/s)": 0.123569 + }, + { + "acc": 0.74723201, + "epoch": 1.8547998104151433, + "grad_norm": 7.75, + "learning_rate": 1.4339559646953716e-07, + "loss": 1.00913239, + "memory(GiB)": 302.58, + "step": 331660, + "train_speed(iter/s)": 0.123572 + }, + { + "acc": 0.74383407, + "epoch": 1.8549116598881226, + "grad_norm": 7.84375, + "learning_rate": 1.4317581014401949e-07, + "loss": 0.99174595, + "memory(GiB)": 302.58, + "step": 331680, + "train_speed(iter/s)": 0.123576 + }, + { + "acc": 0.75894279, + "epoch": 1.8550235093611018, + "grad_norm": 9.1875, + "learning_rate": 1.4295618993728944e-07, + "loss": 0.94354153, + "memory(GiB)": 302.58, + "step": 331700, + "train_speed(iter/s)": 0.123579 + }, + { + "acc": 0.74133415, + "epoch": 1.855135358834081, + "grad_norm": 6.375, + "learning_rate": 1.4273673585685878e-07, + "loss": 1.01629343, + "memory(GiB)": 302.58, + "step": 331720, + "train_speed(iter/s)": 0.123583 + }, + { + "acc": 0.74694562, + "epoch": 1.8552472083070604, + "grad_norm": 7.75, + "learning_rate": 1.425174479102337e-07, + "loss": 0.98858833, + "memory(GiB)": 302.58, + "step": 331740, + "train_speed(iter/s)": 0.123586 + }, + { + "acc": 0.74825506, + "epoch": 1.8553590577800396, + "grad_norm": 6.5625, + "learning_rate": 1.422983261049149e-07, + "loss": 0.98165159, + "memory(GiB)": 302.58, + "step": 331760, + "train_speed(iter/s)": 0.12359 + }, + { + "acc": 0.75984926, + "epoch": 1.8554709072530189, + "grad_norm": 9.25, + "learning_rate": 1.4207937044839638e-07, + "loss": 0.94443016, + "memory(GiB)": 302.58, + "step": 331780, + "train_speed(iter/s)": 0.123594 + }, + { + "acc": 0.73298969, + "epoch": 1.8555827567259982, + "grad_norm": 5.78125, + "learning_rate": 1.418605809481677e-07, + "loss": 1.05503368, + "memory(GiB)": 302.58, + "step": 331800, + "train_speed(iter/s)": 0.123597 + }, + { + "acc": 0.75911698, + "epoch": 1.8556946061989774, + "grad_norm": 8.5625, + "learning_rate": 1.4164195761171173e-07, + "loss": 0.95666208, + "memory(GiB)": 302.58, + "step": 331820, + "train_speed(iter/s)": 0.123601 + }, + { + "acc": 0.73752389, + "epoch": 1.8558064556719567, + "grad_norm": 5.03125, + "learning_rate": 1.4142350044650587e-07, + "loss": 1.04918242, + "memory(GiB)": 302.58, + "step": 331840, + "train_speed(iter/s)": 0.123604 + }, + { + "acc": 0.75044599, + "epoch": 1.855918305144936, + "grad_norm": 6.28125, + "learning_rate": 1.4120520946002303e-07, + "loss": 0.97714872, + "memory(GiB)": 302.58, + "step": 331860, + "train_speed(iter/s)": 0.123607 + }, + { + "acc": 0.76953907, + "epoch": 1.8560301546179152, + "grad_norm": 7.46875, + "learning_rate": 1.4098708465972832e-07, + "loss": 0.87157316, + "memory(GiB)": 302.58, + "step": 331880, + "train_speed(iter/s)": 0.123611 + }, + { + "acc": 0.75308981, + "epoch": 1.8561420040908945, + "grad_norm": 6.5625, + "learning_rate": 1.4076912605308301e-07, + "loss": 0.96511116, + "memory(GiB)": 302.58, + "step": 331900, + "train_speed(iter/s)": 0.123614 + }, + { + "acc": 0.75158772, + "epoch": 1.8562538535638737, + "grad_norm": 7.59375, + "learning_rate": 1.4055133364754225e-07, + "loss": 1.00715933, + "memory(GiB)": 302.58, + "step": 331920, + "train_speed(iter/s)": 0.123618 + }, + { + "acc": 0.7559124, + "epoch": 1.856365703036853, + "grad_norm": 6.5, + "learning_rate": 1.4033370745055396e-07, + "loss": 0.95143089, + "memory(GiB)": 302.58, + "step": 331940, + "train_speed(iter/s)": 0.123621 + }, + { + "acc": 0.75311446, + "epoch": 1.8564775525098323, + "grad_norm": 6.28125, + "learning_rate": 1.401162474695633e-07, + "loss": 0.96098795, + "memory(GiB)": 302.58, + "step": 331960, + "train_speed(iter/s)": 0.123625 + }, + { + "acc": 0.75319352, + "epoch": 1.8565894019828115, + "grad_norm": 7.0, + "learning_rate": 1.3989895371200657e-07, + "loss": 0.97111063, + "memory(GiB)": 302.58, + "step": 331980, + "train_speed(iter/s)": 0.123628 + }, + { + "acc": 0.74465504, + "epoch": 1.8567012514557908, + "grad_norm": 6.6875, + "learning_rate": 1.3968182618531723e-07, + "loss": 1.00457525, + "memory(GiB)": 302.58, + "step": 332000, + "train_speed(iter/s)": 0.123632 + }, + { + "epoch": 1.8567012514557908, + "eval_acc": 0.7068968373055204, + "eval_loss": 1.0117839574813843, + "eval_runtime": 7542.6469, + "eval_samples_per_second": 9.981, + "eval_steps_per_second": 9.981, + "step": 332000 + }, + { + "acc": 0.75854263, + "epoch": 1.85681310092877, + "grad_norm": 6.15625, + "learning_rate": 1.39464864896921e-07, + "loss": 0.93081512, + "memory(GiB)": 302.58, + "step": 332020, + "train_speed(iter/s)": 0.123283 + }, + { + "acc": 0.74725504, + "epoch": 1.8569249504017493, + "grad_norm": 7.75, + "learning_rate": 1.3924806985423812e-07, + "loss": 0.98205729, + "memory(GiB)": 302.58, + "step": 332040, + "train_speed(iter/s)": 0.123286 + }, + { + "acc": 0.75636749, + "epoch": 1.8570367998747286, + "grad_norm": 7.21875, + "learning_rate": 1.390314410646848e-07, + "loss": 0.97233076, + "memory(GiB)": 302.58, + "step": 332060, + "train_speed(iter/s)": 0.123289 + }, + { + "acc": 0.75888505, + "epoch": 1.8571486493477078, + "grad_norm": 7.75, + "learning_rate": 1.388149785356696e-07, + "loss": 0.94243498, + "memory(GiB)": 302.58, + "step": 332080, + "train_speed(iter/s)": 0.123293 + }, + { + "acc": 0.7554018, + "epoch": 1.8572604988206871, + "grad_norm": 7.46875, + "learning_rate": 1.3859868227459717e-07, + "loss": 0.98168049, + "memory(GiB)": 302.58, + "step": 332100, + "train_speed(iter/s)": 0.123297 + }, + { + "acc": 0.75470452, + "epoch": 1.8573723482936664, + "grad_norm": 7.71875, + "learning_rate": 1.3838255228886433e-07, + "loss": 0.9684247, + "memory(GiB)": 302.58, + "step": 332120, + "train_speed(iter/s)": 0.1233 + }, + { + "acc": 0.7498157, + "epoch": 1.8574841977666456, + "grad_norm": 7.65625, + "learning_rate": 1.381665885858635e-07, + "loss": 0.98211231, + "memory(GiB)": 302.58, + "step": 332140, + "train_speed(iter/s)": 0.123303 + }, + { + "acc": 0.74492097, + "epoch": 1.857596047239625, + "grad_norm": 8.4375, + "learning_rate": 1.3795079117298215e-07, + "loss": 1.00406256, + "memory(GiB)": 302.58, + "step": 332160, + "train_speed(iter/s)": 0.123307 + }, + { + "acc": 0.75389051, + "epoch": 1.8577078967126042, + "grad_norm": 10.3125, + "learning_rate": 1.3773516005760103e-07, + "loss": 0.99078903, + "memory(GiB)": 302.58, + "step": 332180, + "train_speed(iter/s)": 0.123311 + }, + { + "acc": 0.76036983, + "epoch": 1.8578197461855834, + "grad_norm": 6.5625, + "learning_rate": 1.375196952470953e-07, + "loss": 0.92019558, + "memory(GiB)": 302.58, + "step": 332200, + "train_speed(iter/s)": 0.123314 + }, + { + "acc": 0.75528383, + "epoch": 1.8579315956585627, + "grad_norm": 6.4375, + "learning_rate": 1.3730439674883467e-07, + "loss": 0.96700535, + "memory(GiB)": 302.58, + "step": 332220, + "train_speed(iter/s)": 0.123318 + }, + { + "acc": 0.74631033, + "epoch": 1.858043445131542, + "grad_norm": 7.84375, + "learning_rate": 1.3708926457018269e-07, + "loss": 0.99722595, + "memory(GiB)": 302.58, + "step": 332240, + "train_speed(iter/s)": 0.123321 + }, + { + "acc": 0.74350023, + "epoch": 1.8581552946045212, + "grad_norm": 6.40625, + "learning_rate": 1.3687429871849788e-07, + "loss": 1.01649723, + "memory(GiB)": 302.58, + "step": 332260, + "train_speed(iter/s)": 0.123325 + }, + { + "acc": 0.76292052, + "epoch": 1.8582671440775005, + "grad_norm": 8.25, + "learning_rate": 1.3665949920113275e-07, + "loss": 0.94049101, + "memory(GiB)": 302.58, + "step": 332280, + "train_speed(iter/s)": 0.123328 + }, + { + "acc": 0.76417522, + "epoch": 1.8583789935504798, + "grad_norm": 10.125, + "learning_rate": 1.3644486602543306e-07, + "loss": 0.92785969, + "memory(GiB)": 302.58, + "step": 332300, + "train_speed(iter/s)": 0.123332 + }, + { + "acc": 0.74742556, + "epoch": 1.858490843023459, + "grad_norm": 6.5625, + "learning_rate": 1.3623039919874126e-07, + "loss": 0.99891148, + "memory(GiB)": 302.58, + "step": 332320, + "train_speed(iter/s)": 0.123335 + }, + { + "acc": 0.74866862, + "epoch": 1.8586026924964383, + "grad_norm": 5.34375, + "learning_rate": 1.3601609872839317e-07, + "loss": 1.01561098, + "memory(GiB)": 302.58, + "step": 332340, + "train_speed(iter/s)": 0.123338 + }, + { + "acc": 0.76097026, + "epoch": 1.8587145419694175, + "grad_norm": 5.71875, + "learning_rate": 1.358019646217168e-07, + "loss": 0.93676004, + "memory(GiB)": 302.58, + "step": 332360, + "train_speed(iter/s)": 0.123342 + }, + { + "acc": 0.76368842, + "epoch": 1.8588263914423968, + "grad_norm": 5.25, + "learning_rate": 1.3558799688603796e-07, + "loss": 0.91930094, + "memory(GiB)": 302.58, + "step": 332380, + "train_speed(iter/s)": 0.123345 + }, + { + "acc": 0.75641704, + "epoch": 1.858938240915376, + "grad_norm": 9.3125, + "learning_rate": 1.3537419552867416e-07, + "loss": 0.94864578, + "memory(GiB)": 302.58, + "step": 332400, + "train_speed(iter/s)": 0.123349 + }, + { + "acc": 0.75835304, + "epoch": 1.8590500903883553, + "grad_norm": 8.5625, + "learning_rate": 1.3516056055693782e-07, + "loss": 0.92771759, + "memory(GiB)": 302.58, + "step": 332420, + "train_speed(iter/s)": 0.123352 + }, + { + "acc": 0.73617268, + "epoch": 1.8591619398613346, + "grad_norm": 6.96875, + "learning_rate": 1.3494709197813648e-07, + "loss": 1.04863586, + "memory(GiB)": 302.58, + "step": 332440, + "train_speed(iter/s)": 0.123356 + }, + { + "acc": 0.75263071, + "epoch": 1.8592737893343139, + "grad_norm": 5.03125, + "learning_rate": 1.3473378979957152e-07, + "loss": 0.98225193, + "memory(GiB)": 302.58, + "step": 332460, + "train_speed(iter/s)": 0.123359 + }, + { + "acc": 0.75821309, + "epoch": 1.8593856388072931, + "grad_norm": 7.71875, + "learning_rate": 1.345206540285371e-07, + "loss": 0.93685274, + "memory(GiB)": 302.58, + "step": 332480, + "train_speed(iter/s)": 0.123363 + }, + { + "acc": 0.74419885, + "epoch": 1.8594974882802724, + "grad_norm": 5.28125, + "learning_rate": 1.3430768467232568e-07, + "loss": 1.01235523, + "memory(GiB)": 302.58, + "step": 332500, + "train_speed(iter/s)": 0.123366 + }, + { + "acc": 0.74777803, + "epoch": 1.8596093377532517, + "grad_norm": 9.125, + "learning_rate": 1.3409488173821929e-07, + "loss": 1.0082346, + "memory(GiB)": 302.58, + "step": 332520, + "train_speed(iter/s)": 0.12337 + }, + { + "acc": 0.76068225, + "epoch": 1.859721187226231, + "grad_norm": 6.59375, + "learning_rate": 1.3388224523349813e-07, + "loss": 0.94382801, + "memory(GiB)": 302.58, + "step": 332540, + "train_speed(iter/s)": 0.123373 + }, + { + "acc": 0.73237095, + "epoch": 1.8598330366992102, + "grad_norm": 11.9375, + "learning_rate": 1.3366977516543367e-07, + "loss": 1.04709167, + "memory(GiB)": 302.58, + "step": 332560, + "train_speed(iter/s)": 0.123377 + }, + { + "acc": 0.75378928, + "epoch": 1.8599448861721894, + "grad_norm": 7.1875, + "learning_rate": 1.3345747154129396e-07, + "loss": 1.00335732, + "memory(GiB)": 302.58, + "step": 332580, + "train_speed(iter/s)": 0.12338 + }, + { + "acc": 0.74331965, + "epoch": 1.8600567356451687, + "grad_norm": 8.125, + "learning_rate": 1.3324533436833987e-07, + "loss": 0.98639965, + "memory(GiB)": 302.58, + "step": 332600, + "train_speed(iter/s)": 0.123384 + }, + { + "acc": 0.74274788, + "epoch": 1.860168585118148, + "grad_norm": 6.78125, + "learning_rate": 1.3303336365382724e-07, + "loss": 1.02159033, + "memory(GiB)": 302.58, + "step": 332620, + "train_speed(iter/s)": 0.123387 + }, + { + "acc": 0.74911261, + "epoch": 1.8602804345911272, + "grad_norm": 7.03125, + "learning_rate": 1.328215594050064e-07, + "loss": 0.959972, + "memory(GiB)": 302.58, + "step": 332640, + "train_speed(iter/s)": 0.123391 + }, + { + "acc": 0.74142399, + "epoch": 1.8603922840641065, + "grad_norm": 6.15625, + "learning_rate": 1.3260992162912213e-07, + "loss": 0.99850302, + "memory(GiB)": 302.58, + "step": 332660, + "train_speed(iter/s)": 0.123394 + }, + { + "acc": 0.74852571, + "epoch": 1.8605041335370858, + "grad_norm": 7.1875, + "learning_rate": 1.3239845033341192e-07, + "loss": 0.97252102, + "memory(GiB)": 302.58, + "step": 332680, + "train_speed(iter/s)": 0.123397 + }, + { + "acc": 0.74557018, + "epoch": 1.860615983010065, + "grad_norm": 10.5625, + "learning_rate": 1.3218714552511002e-07, + "loss": 1.00751972, + "memory(GiB)": 302.58, + "step": 332700, + "train_speed(iter/s)": 0.123401 + }, + { + "acc": 0.76084123, + "epoch": 1.8607278324830443, + "grad_norm": 10.0625, + "learning_rate": 1.319760072114429e-07, + "loss": 0.95000076, + "memory(GiB)": 302.58, + "step": 332720, + "train_speed(iter/s)": 0.123404 + }, + { + "acc": 0.75408511, + "epoch": 1.8608396819560236, + "grad_norm": 6.40625, + "learning_rate": 1.3176503539963193e-07, + "loss": 0.95028143, + "memory(GiB)": 302.58, + "step": 332740, + "train_speed(iter/s)": 0.123408 + }, + { + "acc": 0.75002384, + "epoch": 1.8609515314290028, + "grad_norm": 7.65625, + "learning_rate": 1.3155423009689417e-07, + "loss": 0.99360113, + "memory(GiB)": 302.58, + "step": 332760, + "train_speed(iter/s)": 0.123412 + }, + { + "acc": 0.75066471, + "epoch": 1.861063380901982, + "grad_norm": 6.84375, + "learning_rate": 1.3134359131043883e-07, + "loss": 0.97600126, + "memory(GiB)": 302.58, + "step": 332780, + "train_speed(iter/s)": 0.123415 + }, + { + "acc": 0.7276463, + "epoch": 1.8611752303749614, + "grad_norm": 6.125, + "learning_rate": 1.311331190474707e-07, + "loss": 1.080826, + "memory(GiB)": 302.58, + "step": 332800, + "train_speed(iter/s)": 0.123419 + }, + { + "acc": 0.75852747, + "epoch": 1.8612870798479406, + "grad_norm": 7.65625, + "learning_rate": 1.3092281331518908e-07, + "loss": 0.9414649, + "memory(GiB)": 302.58, + "step": 332820, + "train_speed(iter/s)": 0.123422 + }, + { + "acc": 0.74629741, + "epoch": 1.8613989293209199, + "grad_norm": 4.75, + "learning_rate": 1.3071267412078647e-07, + "loss": 1.00057707, + "memory(GiB)": 302.58, + "step": 332840, + "train_speed(iter/s)": 0.123426 + }, + { + "acc": 0.75007033, + "epoch": 1.8615107787938991, + "grad_norm": 10.3125, + "learning_rate": 1.305027014714505e-07, + "loss": 0.99574518, + "memory(GiB)": 302.58, + "step": 332860, + "train_speed(iter/s)": 0.123429 + }, + { + "acc": 0.7468452, + "epoch": 1.8616226282668784, + "grad_norm": 7.0625, + "learning_rate": 1.3029289537436318e-07, + "loss": 1.00254774, + "memory(GiB)": 302.58, + "step": 332880, + "train_speed(iter/s)": 0.123433 + }, + { + "acc": 0.75689216, + "epoch": 1.8617344777398577, + "grad_norm": 8.25, + "learning_rate": 1.300832558366999e-07, + "loss": 0.9403944, + "memory(GiB)": 302.58, + "step": 332900, + "train_speed(iter/s)": 0.123436 + }, + { + "acc": 0.74603724, + "epoch": 1.861846327212837, + "grad_norm": 7.90625, + "learning_rate": 1.2987378286563156e-07, + "loss": 1.02050591, + "memory(GiB)": 302.58, + "step": 332920, + "train_speed(iter/s)": 0.12344 + }, + { + "acc": 0.72618327, + "epoch": 1.8619581766858162, + "grad_norm": 10.0625, + "learning_rate": 1.2966447646832247e-07, + "loss": 1.09887705, + "memory(GiB)": 302.58, + "step": 332940, + "train_speed(iter/s)": 0.123443 + }, + { + "acc": 0.76590381, + "epoch": 1.8620700261587955, + "grad_norm": 8.1875, + "learning_rate": 1.2945533665193134e-07, + "loss": 0.91065102, + "memory(GiB)": 302.58, + "step": 332960, + "train_speed(iter/s)": 0.123447 + }, + { + "acc": 0.74697862, + "epoch": 1.8621818756317747, + "grad_norm": 6.53125, + "learning_rate": 1.292463634236124e-07, + "loss": 1.00854559, + "memory(GiB)": 302.58, + "step": 332980, + "train_speed(iter/s)": 0.12345 + }, + { + "acc": 0.7437047, + "epoch": 1.862293725104754, + "grad_norm": 6.03125, + "learning_rate": 1.290375567905122e-07, + "loss": 1.0186058, + "memory(GiB)": 302.58, + "step": 333000, + "train_speed(iter/s)": 0.123454 + }, + { + "acc": 0.73383131, + "epoch": 1.8624055745777333, + "grad_norm": 8.1875, + "learning_rate": 1.2882891675977284e-07, + "loss": 1.0410759, + "memory(GiB)": 302.58, + "step": 333020, + "train_speed(iter/s)": 0.123458 + }, + { + "acc": 0.77525334, + "epoch": 1.8625174240507125, + "grad_norm": 6.9375, + "learning_rate": 1.2862044333853073e-07, + "loss": 0.87034941, + "memory(GiB)": 302.58, + "step": 333040, + "train_speed(iter/s)": 0.123461 + }, + { + "acc": 0.74710827, + "epoch": 1.8626292735236918, + "grad_norm": 5.59375, + "learning_rate": 1.2841213653391638e-07, + "loss": 1.00090485, + "memory(GiB)": 302.58, + "step": 333060, + "train_speed(iter/s)": 0.123465 + }, + { + "acc": 0.74899154, + "epoch": 1.862741122996671, + "grad_norm": 7.21875, + "learning_rate": 1.2820399635305404e-07, + "loss": 0.98999939, + "memory(GiB)": 302.58, + "step": 333080, + "train_speed(iter/s)": 0.123468 + }, + { + "acc": 0.73541427, + "epoch": 1.8628529724696503, + "grad_norm": 7.5625, + "learning_rate": 1.2799602280306355e-07, + "loss": 1.03829441, + "memory(GiB)": 302.58, + "step": 333100, + "train_speed(iter/s)": 0.123472 + }, + { + "acc": 0.74774709, + "epoch": 1.8629648219426296, + "grad_norm": 5.75, + "learning_rate": 1.277882158910576e-07, + "loss": 1.00774288, + "memory(GiB)": 302.58, + "step": 333120, + "train_speed(iter/s)": 0.123475 + }, + { + "acc": 0.74188104, + "epoch": 1.8630766714156088, + "grad_norm": 5.40625, + "learning_rate": 1.2758057562414383e-07, + "loss": 1.00113754, + "memory(GiB)": 302.58, + "step": 333140, + "train_speed(iter/s)": 0.123479 + }, + { + "acc": 0.74195204, + "epoch": 1.863188520888588, + "grad_norm": 6.90625, + "learning_rate": 1.273731020094243e-07, + "loss": 0.99463577, + "memory(GiB)": 302.58, + "step": 333160, + "train_speed(iter/s)": 0.123482 + }, + { + "acc": 0.74919152, + "epoch": 1.8633003703615674, + "grad_norm": 8.875, + "learning_rate": 1.271657950539956e-07, + "loss": 0.99188938, + "memory(GiB)": 302.58, + "step": 333180, + "train_speed(iter/s)": 0.123486 + }, + { + "acc": 0.75295577, + "epoch": 1.8634122198345466, + "grad_norm": 7.84375, + "learning_rate": 1.2695865476494818e-07, + "loss": 0.96511173, + "memory(GiB)": 302.58, + "step": 333200, + "train_speed(iter/s)": 0.123489 + }, + { + "acc": 0.74179301, + "epoch": 1.863524069307526, + "grad_norm": 8.125, + "learning_rate": 1.2675168114936688e-07, + "loss": 1.01475649, + "memory(GiB)": 302.58, + "step": 333220, + "train_speed(iter/s)": 0.123493 + }, + { + "acc": 0.77167482, + "epoch": 1.8636359187805052, + "grad_norm": 7.3125, + "learning_rate": 1.2654487421432993e-07, + "loss": 0.87004824, + "memory(GiB)": 302.58, + "step": 333240, + "train_speed(iter/s)": 0.123496 + }, + { + "acc": 0.75257802, + "epoch": 1.8637477682534844, + "grad_norm": 10.1875, + "learning_rate": 1.2633823396691225e-07, + "loss": 0.97123871, + "memory(GiB)": 302.58, + "step": 333260, + "train_speed(iter/s)": 0.1235 + }, + { + "acc": 0.76499205, + "epoch": 1.8638596177264637, + "grad_norm": 5.1875, + "learning_rate": 1.2613176041418096e-07, + "loss": 0.92745218, + "memory(GiB)": 302.58, + "step": 333280, + "train_speed(iter/s)": 0.123503 + }, + { + "acc": 0.73265009, + "epoch": 1.863971467199443, + "grad_norm": 6.1875, + "learning_rate": 1.2592545356319762e-07, + "loss": 1.07676353, + "memory(GiB)": 302.58, + "step": 333300, + "train_speed(iter/s)": 0.123506 + }, + { + "acc": 0.7473547, + "epoch": 1.8640833166724222, + "grad_norm": 8.375, + "learning_rate": 1.257193134210194e-07, + "loss": 1.00443068, + "memory(GiB)": 302.58, + "step": 333320, + "train_speed(iter/s)": 0.12351 + }, + { + "acc": 0.75677357, + "epoch": 1.8641951661454015, + "grad_norm": 8.0, + "learning_rate": 1.2551333999469618e-07, + "loss": 0.94397688, + "memory(GiB)": 302.58, + "step": 333340, + "train_speed(iter/s)": 0.123513 + }, + { + "acc": 0.76527805, + "epoch": 1.8643070156183807, + "grad_norm": 7.125, + "learning_rate": 1.2530753329127398e-07, + "loss": 0.92210951, + "memory(GiB)": 302.58, + "step": 333360, + "train_speed(iter/s)": 0.123517 + }, + { + "acc": 0.76192088, + "epoch": 1.86441886509136, + "grad_norm": 7.21875, + "learning_rate": 1.251018933177911e-07, + "loss": 0.93874388, + "memory(GiB)": 302.58, + "step": 333380, + "train_speed(iter/s)": 0.123521 + }, + { + "acc": 0.7679739, + "epoch": 1.8645307145643393, + "grad_norm": 8.9375, + "learning_rate": 1.2489642008128133e-07, + "loss": 0.90529947, + "memory(GiB)": 302.58, + "step": 333400, + "train_speed(iter/s)": 0.123524 + }, + { + "acc": 0.76421604, + "epoch": 1.8646425640373185, + "grad_norm": 4.34375, + "learning_rate": 1.246911135887724e-07, + "loss": 0.91806011, + "memory(GiB)": 302.58, + "step": 333420, + "train_speed(iter/s)": 0.123528 + }, + { + "acc": 0.74478374, + "epoch": 1.8647544135102978, + "grad_norm": 5.5, + "learning_rate": 1.2448597384728645e-07, + "loss": 1.00809326, + "memory(GiB)": 302.58, + "step": 333440, + "train_speed(iter/s)": 0.123531 + }, + { + "acc": 0.77266636, + "epoch": 1.864866262983277, + "grad_norm": 7.75, + "learning_rate": 1.2428100086384065e-07, + "loss": 0.88906546, + "memory(GiB)": 302.58, + "step": 333460, + "train_speed(iter/s)": 0.123535 + }, + { + "acc": 0.74326448, + "epoch": 1.8649781124562563, + "grad_norm": 5.96875, + "learning_rate": 1.240761946454444e-07, + "loss": 1.01489353, + "memory(GiB)": 302.58, + "step": 333480, + "train_speed(iter/s)": 0.123538 + }, + { + "acc": 0.77928362, + "epoch": 1.8650899619292356, + "grad_norm": 7.5625, + "learning_rate": 1.2387155519910376e-07, + "loss": 0.86347952, + "memory(GiB)": 302.58, + "step": 333500, + "train_speed(iter/s)": 0.123542 + }, + { + "acc": 0.73238163, + "epoch": 1.8652018114022149, + "grad_norm": 8.875, + "learning_rate": 1.2366708253181757e-07, + "loss": 1.05250053, + "memory(GiB)": 302.58, + "step": 333520, + "train_speed(iter/s)": 0.123546 + }, + { + "acc": 0.75328288, + "epoch": 1.8653136608751941, + "grad_norm": 9.8125, + "learning_rate": 1.234627766505797e-07, + "loss": 0.9801384, + "memory(GiB)": 302.58, + "step": 333540, + "train_speed(iter/s)": 0.123549 + }, + { + "acc": 0.75449967, + "epoch": 1.8654255103481734, + "grad_norm": 7.53125, + "learning_rate": 1.2325863756237788e-07, + "loss": 0.93404684, + "memory(GiB)": 302.58, + "step": 333560, + "train_speed(iter/s)": 0.123553 + }, + { + "acc": 0.74410806, + "epoch": 1.8655373598211527, + "grad_norm": 6.5625, + "learning_rate": 1.230546652741943e-07, + "loss": 0.98941031, + "memory(GiB)": 302.58, + "step": 333580, + "train_speed(iter/s)": 0.123556 + }, + { + "acc": 0.75740829, + "epoch": 1.865649209294132, + "grad_norm": 5.8125, + "learning_rate": 1.228508597930056e-07, + "loss": 0.97168312, + "memory(GiB)": 302.58, + "step": 333600, + "train_speed(iter/s)": 0.12356 + }, + { + "acc": 0.75035048, + "epoch": 1.8657610587671112, + "grad_norm": 5.75, + "learning_rate": 1.226472211257823e-07, + "loss": 0.98192482, + "memory(GiB)": 302.58, + "step": 333620, + "train_speed(iter/s)": 0.123563 + }, + { + "acc": 0.7606595, + "epoch": 1.8658729082400904, + "grad_norm": 9.0, + "learning_rate": 1.2244374927948998e-07, + "loss": 0.94764929, + "memory(GiB)": 302.58, + "step": 333640, + "train_speed(iter/s)": 0.123567 + }, + { + "acc": 0.74464202, + "epoch": 1.8659847577130697, + "grad_norm": 11.625, + "learning_rate": 1.2224044426108804e-07, + "loss": 1.00406046, + "memory(GiB)": 302.58, + "step": 333660, + "train_speed(iter/s)": 0.12357 + }, + { + "acc": 0.75547967, + "epoch": 1.866096607186049, + "grad_norm": 8.4375, + "learning_rate": 1.220373060775293e-07, + "loss": 0.94504356, + "memory(GiB)": 302.58, + "step": 333680, + "train_speed(iter/s)": 0.123573 + }, + { + "acc": 0.76537313, + "epoch": 1.8662084566590282, + "grad_norm": 5.46875, + "learning_rate": 1.2183433473576257e-07, + "loss": 0.9397418, + "memory(GiB)": 302.58, + "step": 333700, + "train_speed(iter/s)": 0.123577 + }, + { + "acc": 0.75037284, + "epoch": 1.8663203061320075, + "grad_norm": 9.125, + "learning_rate": 1.216315302427301e-07, + "loss": 0.99176178, + "memory(GiB)": 302.58, + "step": 333720, + "train_speed(iter/s)": 0.12358 + }, + { + "acc": 0.75160623, + "epoch": 1.8664321556049868, + "grad_norm": 6.375, + "learning_rate": 1.2142889260536804e-07, + "loss": 0.98198442, + "memory(GiB)": 302.58, + "step": 333740, + "train_speed(iter/s)": 0.123584 + }, + { + "acc": 0.74897985, + "epoch": 1.866544005077966, + "grad_norm": 6.3125, + "learning_rate": 1.2122642183060696e-07, + "loss": 0.98516874, + "memory(GiB)": 302.58, + "step": 333760, + "train_speed(iter/s)": 0.123587 + }, + { + "acc": 0.73592701, + "epoch": 1.8666558545509453, + "grad_norm": 5.21875, + "learning_rate": 1.2102411792537295e-07, + "loss": 1.05281076, + "memory(GiB)": 302.58, + "step": 333780, + "train_speed(iter/s)": 0.123591 + }, + { + "acc": 0.74254651, + "epoch": 1.8667677040239246, + "grad_norm": 8.25, + "learning_rate": 1.2082198089658493e-07, + "loss": 1.01293001, + "memory(GiB)": 302.58, + "step": 333800, + "train_speed(iter/s)": 0.123595 + }, + { + "acc": 0.72923803, + "epoch": 1.8668795534969038, + "grad_norm": 6.625, + "learning_rate": 1.2062001075115627e-07, + "loss": 1.06576509, + "memory(GiB)": 302.58, + "step": 333820, + "train_speed(iter/s)": 0.123598 + }, + { + "acc": 0.76677732, + "epoch": 1.866991402969883, + "grad_norm": 5.59375, + "learning_rate": 1.2041820749599586e-07, + "loss": 0.90062551, + "memory(GiB)": 302.58, + "step": 333840, + "train_speed(iter/s)": 0.123602 + }, + { + "acc": 0.73071637, + "epoch": 1.8671032524428623, + "grad_norm": 7.15625, + "learning_rate": 1.2021657113800488e-07, + "loss": 1.08252306, + "memory(GiB)": 302.58, + "step": 333860, + "train_speed(iter/s)": 0.123605 + }, + { + "acc": 0.75939531, + "epoch": 1.8672151019158416, + "grad_norm": 7.8125, + "learning_rate": 1.200151016840806e-07, + "loss": 0.93472996, + "memory(GiB)": 302.58, + "step": 333880, + "train_speed(iter/s)": 0.123609 + }, + { + "acc": 0.76384273, + "epoch": 1.8673269513888209, + "grad_norm": 6.6875, + "learning_rate": 1.1981379914111357e-07, + "loss": 0.91406593, + "memory(GiB)": 302.58, + "step": 333900, + "train_speed(iter/s)": 0.123613 + }, + { + "acc": 0.76683078, + "epoch": 1.8674388008618001, + "grad_norm": 6.40625, + "learning_rate": 1.1961266351599e-07, + "loss": 0.921171, + "memory(GiB)": 302.58, + "step": 333920, + "train_speed(iter/s)": 0.123616 + }, + { + "acc": 0.75038033, + "epoch": 1.8675506503347794, + "grad_norm": 7.3125, + "learning_rate": 1.1941169481558824e-07, + "loss": 0.99497776, + "memory(GiB)": 302.58, + "step": 333940, + "train_speed(iter/s)": 0.123619 + }, + { + "acc": 0.74679103, + "epoch": 1.8676624998077587, + "grad_norm": 7.8125, + "learning_rate": 1.192108930467828e-07, + "loss": 0.98990231, + "memory(GiB)": 302.58, + "step": 333960, + "train_speed(iter/s)": 0.123623 + }, + { + "acc": 0.75291061, + "epoch": 1.867774349280738, + "grad_norm": 8.5, + "learning_rate": 1.1901025821644097e-07, + "loss": 0.9734025, + "memory(GiB)": 302.58, + "step": 333980, + "train_speed(iter/s)": 0.123626 + }, + { + "acc": 0.77208209, + "epoch": 1.8678861987537172, + "grad_norm": 8.375, + "learning_rate": 1.1880979033142559e-07, + "loss": 0.90212584, + "memory(GiB)": 302.58, + "step": 334000, + "train_speed(iter/s)": 0.12363 + }, + { + "epoch": 1.8678861987537172, + "eval_acc": 0.7068963443376578, + "eval_loss": 1.011797547340393, + "eval_runtime": 7539.0911, + "eval_samples_per_second": 9.986, + "eval_steps_per_second": 9.986, + "step": 334000 + }, + { + "acc": 0.73789225, + "epoch": 1.8679980482266965, + "grad_norm": 6.09375, + "learning_rate": 1.186094893985934e-07, + "loss": 1.0098546, + "memory(GiB)": 302.58, + "step": 334020, + "train_speed(iter/s)": 0.123282 + }, + { + "acc": 0.73256302, + "epoch": 1.8681098976996757, + "grad_norm": 10.125, + "learning_rate": 1.1840935542479503e-07, + "loss": 1.05998583, + "memory(GiB)": 302.58, + "step": 334040, + "train_speed(iter/s)": 0.123286 + }, + { + "acc": 0.75219688, + "epoch": 1.868221747172655, + "grad_norm": 8.6875, + "learning_rate": 1.1820938841687612e-07, + "loss": 0.96173258, + "memory(GiB)": 302.58, + "step": 334060, + "train_speed(iter/s)": 0.123289 + }, + { + "acc": 0.76581259, + "epoch": 1.8683335966456343, + "grad_norm": 4.5, + "learning_rate": 1.180095883816751e-07, + "loss": 0.9309454, + "memory(GiB)": 302.58, + "step": 334080, + "train_speed(iter/s)": 0.123293 + }, + { + "acc": 0.75963964, + "epoch": 1.8684454461186135, + "grad_norm": 7.78125, + "learning_rate": 1.1780995532602702e-07, + "loss": 0.94321413, + "memory(GiB)": 302.58, + "step": 334100, + "train_speed(iter/s)": 0.123296 + }, + { + "acc": 0.76375532, + "epoch": 1.8685572955915928, + "grad_norm": 8.25, + "learning_rate": 1.1761048925675978e-07, + "loss": 0.93043251, + "memory(GiB)": 302.58, + "step": 334120, + "train_speed(iter/s)": 0.1233 + }, + { + "acc": 0.76739259, + "epoch": 1.868669145064572, + "grad_norm": 6.59375, + "learning_rate": 1.1741119018069514e-07, + "loss": 0.89613466, + "memory(GiB)": 302.58, + "step": 334140, + "train_speed(iter/s)": 0.123303 + }, + { + "acc": 0.75487623, + "epoch": 1.8687809945375513, + "grad_norm": 7.9375, + "learning_rate": 1.1721205810464986e-07, + "loss": 0.97479334, + "memory(GiB)": 302.58, + "step": 334160, + "train_speed(iter/s)": 0.123307 + }, + { + "acc": 0.7429419, + "epoch": 1.8688928440105306, + "grad_norm": 7.21875, + "learning_rate": 1.1701309303543463e-07, + "loss": 1.03537006, + "memory(GiB)": 302.58, + "step": 334180, + "train_speed(iter/s)": 0.12331 + }, + { + "acc": 0.75121288, + "epoch": 1.8690046934835098, + "grad_norm": 7.03125, + "learning_rate": 1.168142949798562e-07, + "loss": 0.98160038, + "memory(GiB)": 302.58, + "step": 334200, + "train_speed(iter/s)": 0.123314 + }, + { + "acc": 0.74639392, + "epoch": 1.869116542956489, + "grad_norm": 6.9375, + "learning_rate": 1.1661566394471247e-07, + "loss": 1.03123865, + "memory(GiB)": 302.58, + "step": 334220, + "train_speed(iter/s)": 0.123317 + }, + { + "acc": 0.75607486, + "epoch": 1.8692283924294684, + "grad_norm": 5.15625, + "learning_rate": 1.1641719993679801e-07, + "loss": 0.96233053, + "memory(GiB)": 302.58, + "step": 334240, + "train_speed(iter/s)": 0.123321 + }, + { + "acc": 0.73832908, + "epoch": 1.8693402419024476, + "grad_norm": 8.5, + "learning_rate": 1.1621890296290072e-07, + "loss": 1.04577255, + "memory(GiB)": 302.58, + "step": 334260, + "train_speed(iter/s)": 0.123325 + }, + { + "acc": 0.73933835, + "epoch": 1.8694520913754271, + "grad_norm": 12.1875, + "learning_rate": 1.1602077302980297e-07, + "loss": 1.03326645, + "memory(GiB)": 302.58, + "step": 334280, + "train_speed(iter/s)": 0.123328 + }, + { + "acc": 0.75203929, + "epoch": 1.8695639408484062, + "grad_norm": 9.4375, + "learning_rate": 1.1582281014428154e-07, + "loss": 0.99125938, + "memory(GiB)": 302.58, + "step": 334300, + "train_speed(iter/s)": 0.123331 + }, + { + "acc": 0.74883347, + "epoch": 1.8696757903213856, + "grad_norm": 10.6875, + "learning_rate": 1.1562501431310713e-07, + "loss": 0.97293272, + "memory(GiB)": 302.58, + "step": 334320, + "train_speed(iter/s)": 0.123335 + }, + { + "acc": 0.727668, + "epoch": 1.8697876397943647, + "grad_norm": 8.0, + "learning_rate": 1.1542738554304545e-07, + "loss": 1.09451523, + "memory(GiB)": 302.58, + "step": 334340, + "train_speed(iter/s)": 0.123339 + }, + { + "acc": 0.75280881, + "epoch": 1.8698994892673442, + "grad_norm": 5.5625, + "learning_rate": 1.1522992384085551e-07, + "loss": 0.98041325, + "memory(GiB)": 302.58, + "step": 334360, + "train_speed(iter/s)": 0.123342 + }, + { + "acc": 0.74214287, + "epoch": 1.8700113387403232, + "grad_norm": 7.875, + "learning_rate": 1.1503262921329139e-07, + "loss": 1.04051752, + "memory(GiB)": 302.58, + "step": 334380, + "train_speed(iter/s)": 0.123346 + }, + { + "acc": 0.74721045, + "epoch": 1.8701231882133027, + "grad_norm": 8.3125, + "learning_rate": 1.1483550166710156e-07, + "loss": 1.0061842, + "memory(GiB)": 302.58, + "step": 334400, + "train_speed(iter/s)": 0.123349 + }, + { + "acc": 0.75071144, + "epoch": 1.8702350376862817, + "grad_norm": 10.25, + "learning_rate": 1.1463854120902728e-07, + "loss": 0.96872826, + "memory(GiB)": 302.58, + "step": 334420, + "train_speed(iter/s)": 0.123352 + }, + { + "acc": 0.7758328, + "epoch": 1.8703468871592612, + "grad_norm": 9.1875, + "learning_rate": 1.1444174784580598e-07, + "loss": 0.87224216, + "memory(GiB)": 302.58, + "step": 334440, + "train_speed(iter/s)": 0.123356 + }, + { + "acc": 0.76523385, + "epoch": 1.8704587366322403, + "grad_norm": 6.125, + "learning_rate": 1.1424512158416889e-07, + "loss": 0.92530375, + "memory(GiB)": 302.58, + "step": 334460, + "train_speed(iter/s)": 0.12336 + }, + { + "acc": 0.74578247, + "epoch": 1.8705705861052198, + "grad_norm": 7.5, + "learning_rate": 1.1404866243084123e-07, + "loss": 1.00866337, + "memory(GiB)": 302.58, + "step": 334480, + "train_speed(iter/s)": 0.123363 + }, + { + "acc": 0.73078833, + "epoch": 1.8706824355781988, + "grad_norm": 9.4375, + "learning_rate": 1.1385237039254204e-07, + "loss": 1.06832123, + "memory(GiB)": 302.58, + "step": 334500, + "train_speed(iter/s)": 0.123367 + }, + { + "acc": 0.75807548, + "epoch": 1.8707942850511783, + "grad_norm": 5.15625, + "learning_rate": 1.1365624547598541e-07, + "loss": 0.96041059, + "memory(GiB)": 302.58, + "step": 334520, + "train_speed(iter/s)": 0.12337 + }, + { + "acc": 0.7419385, + "epoch": 1.8709061345241573, + "grad_norm": 6.5625, + "learning_rate": 1.1346028768787931e-07, + "loss": 0.99720001, + "memory(GiB)": 302.58, + "step": 334540, + "train_speed(iter/s)": 0.123374 + }, + { + "acc": 0.7550231, + "epoch": 1.8710179839971368, + "grad_norm": 7.15625, + "learning_rate": 1.1326449703492615e-07, + "loss": 0.96859493, + "memory(GiB)": 302.58, + "step": 334560, + "train_speed(iter/s)": 0.123377 + }, + { + "acc": 0.77659831, + "epoch": 1.8711298334701159, + "grad_norm": 5.75, + "learning_rate": 1.1306887352382223e-07, + "loss": 0.87555733, + "memory(GiB)": 302.58, + "step": 334580, + "train_speed(iter/s)": 0.123381 + }, + { + "acc": 0.76658773, + "epoch": 1.8712416829430953, + "grad_norm": 6.125, + "learning_rate": 1.1287341716125944e-07, + "loss": 0.90301428, + "memory(GiB)": 302.58, + "step": 334600, + "train_speed(iter/s)": 0.123384 + }, + { + "acc": 0.76278272, + "epoch": 1.8713535324160744, + "grad_norm": 5.5625, + "learning_rate": 1.1267812795392241e-07, + "loss": 0.94454336, + "memory(GiB)": 302.58, + "step": 334620, + "train_speed(iter/s)": 0.123388 + }, + { + "acc": 0.75000563, + "epoch": 1.8714653818890539, + "grad_norm": 5.46875, + "learning_rate": 1.1248300590849026e-07, + "loss": 0.97828732, + "memory(GiB)": 302.58, + "step": 334640, + "train_speed(iter/s)": 0.123391 + }, + { + "acc": 0.74533, + "epoch": 1.871577231362033, + "grad_norm": 8.5, + "learning_rate": 1.1228805103163765e-07, + "loss": 1.00780621, + "memory(GiB)": 302.58, + "step": 334660, + "train_speed(iter/s)": 0.123395 + }, + { + "acc": 0.73927374, + "epoch": 1.8716890808350124, + "grad_norm": 6.625, + "learning_rate": 1.1209326333003201e-07, + "loss": 1.03158293, + "memory(GiB)": 302.58, + "step": 334680, + "train_speed(iter/s)": 0.123398 + }, + { + "acc": 0.73900576, + "epoch": 1.8718009303079914, + "grad_norm": 9.125, + "learning_rate": 1.118986428103358e-07, + "loss": 1.02058172, + "memory(GiB)": 302.58, + "step": 334700, + "train_speed(iter/s)": 0.123401 + }, + { + "acc": 0.77198877, + "epoch": 1.871912779780971, + "grad_norm": 7.625, + "learning_rate": 1.1170418947920536e-07, + "loss": 0.89856415, + "memory(GiB)": 302.58, + "step": 334720, + "train_speed(iter/s)": 0.123405 + }, + { + "acc": 0.74615307, + "epoch": 1.87202462925395, + "grad_norm": 8.9375, + "learning_rate": 1.115099033432926e-07, + "loss": 1.01134672, + "memory(GiB)": 302.58, + "step": 334740, + "train_speed(iter/s)": 0.123408 + }, + { + "acc": 0.7623034, + "epoch": 1.8721364787269295, + "grad_norm": 5.90625, + "learning_rate": 1.1131578440924163e-07, + "loss": 0.92757664, + "memory(GiB)": 302.58, + "step": 334760, + "train_speed(iter/s)": 0.123412 + }, + { + "acc": 0.75559735, + "epoch": 1.8722483281999085, + "grad_norm": 7.5625, + "learning_rate": 1.1112183268369325e-07, + "loss": 0.97701302, + "memory(GiB)": 302.58, + "step": 334780, + "train_speed(iter/s)": 0.123415 + }, + { + "acc": 0.74271765, + "epoch": 1.872360177672888, + "grad_norm": 5.75, + "learning_rate": 1.1092804817327995e-07, + "loss": 1.01292858, + "memory(GiB)": 302.58, + "step": 334800, + "train_speed(iter/s)": 0.123419 + }, + { + "acc": 0.74866982, + "epoch": 1.872472027145867, + "grad_norm": 7.46875, + "learning_rate": 1.1073443088462976e-07, + "loss": 1.00159254, + "memory(GiB)": 302.58, + "step": 334820, + "train_speed(iter/s)": 0.123422 + }, + { + "acc": 0.7436492, + "epoch": 1.8725838766188465, + "grad_norm": 7.53125, + "learning_rate": 1.105409808243657e-07, + "loss": 1.00660286, + "memory(GiB)": 302.58, + "step": 334840, + "train_speed(iter/s)": 0.123425 + }, + { + "acc": 0.76187644, + "epoch": 1.8726957260918256, + "grad_norm": 5.84375, + "learning_rate": 1.1034769799910472e-07, + "loss": 0.9366519, + "memory(GiB)": 302.58, + "step": 334860, + "train_speed(iter/s)": 0.123429 + }, + { + "acc": 0.757234, + "epoch": 1.872807575564805, + "grad_norm": 6.875, + "learning_rate": 1.1015458241545651e-07, + "loss": 0.95289688, + "memory(GiB)": 302.58, + "step": 334880, + "train_speed(iter/s)": 0.123432 + }, + { + "acc": 0.75915837, + "epoch": 1.872919425037784, + "grad_norm": 5.40625, + "learning_rate": 1.0996163408002747e-07, + "loss": 0.93331013, + "memory(GiB)": 302.58, + "step": 334900, + "train_speed(iter/s)": 0.123436 + }, + { + "acc": 0.76141219, + "epoch": 1.8730312745107636, + "grad_norm": 7.8125, + "learning_rate": 1.097688529994162e-07, + "loss": 0.92920485, + "memory(GiB)": 302.58, + "step": 334920, + "train_speed(iter/s)": 0.123439 + }, + { + "acc": 0.73063393, + "epoch": 1.8731431239837426, + "grad_norm": 6.59375, + "learning_rate": 1.0957623918021631e-07, + "loss": 1.02532921, + "memory(GiB)": 302.58, + "step": 334940, + "train_speed(iter/s)": 0.123443 + }, + { + "acc": 0.76196222, + "epoch": 1.873254973456722, + "grad_norm": 7.125, + "learning_rate": 1.0938379262901644e-07, + "loss": 0.91769104, + "memory(GiB)": 302.58, + "step": 334960, + "train_speed(iter/s)": 0.123446 + }, + { + "acc": 0.75603471, + "epoch": 1.8733668229297011, + "grad_norm": 7.90625, + "learning_rate": 1.0919151335239908e-07, + "loss": 0.9537446, + "memory(GiB)": 302.58, + "step": 334980, + "train_speed(iter/s)": 0.12345 + }, + { + "acc": 0.7480329, + "epoch": 1.8734786724026806, + "grad_norm": 5.9375, + "learning_rate": 1.0899940135693953e-07, + "loss": 0.98073616, + "memory(GiB)": 302.58, + "step": 335000, + "train_speed(iter/s)": 0.123453 + }, + { + "acc": 0.75384874, + "epoch": 1.8735905218756597, + "grad_norm": 8.3125, + "learning_rate": 1.0880745664920978e-07, + "loss": 0.9572999, + "memory(GiB)": 302.58, + "step": 335020, + "train_speed(iter/s)": 0.123457 + }, + { + "acc": 0.74886303, + "epoch": 1.8737023713486392, + "grad_norm": 8.6875, + "learning_rate": 1.0861567923577399e-07, + "loss": 1.00142298, + "memory(GiB)": 302.58, + "step": 335040, + "train_speed(iter/s)": 0.12346 + }, + { + "acc": 0.75930676, + "epoch": 1.8738142208216182, + "grad_norm": 6.4375, + "learning_rate": 1.0842406912319248e-07, + "loss": 0.95452528, + "memory(GiB)": 302.58, + "step": 335060, + "train_speed(iter/s)": 0.123464 + }, + { + "acc": 0.76661887, + "epoch": 1.8739260702945977, + "grad_norm": 9.125, + "learning_rate": 1.082326263180189e-07, + "loss": 0.91061621, + "memory(GiB)": 302.58, + "step": 335080, + "train_speed(iter/s)": 0.123467 + }, + { + "acc": 0.76800737, + "epoch": 1.8740379197675767, + "grad_norm": 6.96875, + "learning_rate": 1.0804135082680078e-07, + "loss": 0.91240759, + "memory(GiB)": 302.58, + "step": 335100, + "train_speed(iter/s)": 0.123471 + }, + { + "acc": 0.7383069, + "epoch": 1.8741497692405562, + "grad_norm": 6.46875, + "learning_rate": 1.0785024265608012e-07, + "loss": 1.03818216, + "memory(GiB)": 302.58, + "step": 335120, + "train_speed(iter/s)": 0.123474 + }, + { + "acc": 0.75473022, + "epoch": 1.8742616187135352, + "grad_norm": 7.09375, + "learning_rate": 1.0765930181239392e-07, + "loss": 0.97604952, + "memory(GiB)": 302.58, + "step": 335140, + "train_speed(iter/s)": 0.123478 + }, + { + "acc": 0.74876137, + "epoch": 1.8743734681865147, + "grad_norm": 6.28125, + "learning_rate": 1.0746852830227305e-07, + "loss": 0.99663343, + "memory(GiB)": 302.58, + "step": 335160, + "train_speed(iter/s)": 0.123481 + }, + { + "acc": 0.7622036, + "epoch": 1.8744853176594938, + "grad_norm": 10.5625, + "learning_rate": 1.0727792213224175e-07, + "loss": 0.92121849, + "memory(GiB)": 302.58, + "step": 335180, + "train_speed(iter/s)": 0.123485 + }, + { + "acc": 0.73239713, + "epoch": 1.8745971671324733, + "grad_norm": 7.0, + "learning_rate": 1.0708748330882035e-07, + "loss": 1.0649004, + "memory(GiB)": 302.58, + "step": 335200, + "train_speed(iter/s)": 0.123488 + }, + { + "acc": 0.75811505, + "epoch": 1.8747090166054523, + "grad_norm": 7.21875, + "learning_rate": 1.0689721183852198e-07, + "loss": 0.93105412, + "memory(GiB)": 302.58, + "step": 335220, + "train_speed(iter/s)": 0.123492 + }, + { + "acc": 0.75633702, + "epoch": 1.8748208660784318, + "grad_norm": 8.4375, + "learning_rate": 1.0670710772785476e-07, + "loss": 0.97358494, + "memory(GiB)": 302.58, + "step": 335240, + "train_speed(iter/s)": 0.123495 + }, + { + "acc": 0.75853329, + "epoch": 1.8749327155514108, + "grad_norm": 6.625, + "learning_rate": 1.0651717098332015e-07, + "loss": 0.93983383, + "memory(GiB)": 302.58, + "step": 335260, + "train_speed(iter/s)": 0.123499 + }, + { + "acc": 0.75824032, + "epoch": 1.8750445650243903, + "grad_norm": 7.25, + "learning_rate": 1.0632740161141575e-07, + "loss": 0.93030205, + "memory(GiB)": 302.58, + "step": 335280, + "train_speed(iter/s)": 0.123502 + }, + { + "acc": 0.76222892, + "epoch": 1.8751564144973694, + "grad_norm": 7.0625, + "learning_rate": 1.0613779961863135e-07, + "loss": 0.95318365, + "memory(GiB)": 302.58, + "step": 335300, + "train_speed(iter/s)": 0.123505 + }, + { + "acc": 0.73357892, + "epoch": 1.8752682639703488, + "grad_norm": 5.375, + "learning_rate": 1.0594836501145123e-07, + "loss": 1.06021166, + "memory(GiB)": 302.58, + "step": 335320, + "train_speed(iter/s)": 0.123509 + }, + { + "acc": 0.7589385, + "epoch": 1.875380113443328, + "grad_norm": 8.25, + "learning_rate": 1.0575909779635684e-07, + "loss": 0.97250872, + "memory(GiB)": 302.58, + "step": 335340, + "train_speed(iter/s)": 0.123512 + }, + { + "acc": 0.7594677, + "epoch": 1.8754919629163074, + "grad_norm": 9.6875, + "learning_rate": 1.0556999797982026e-07, + "loss": 0.95576391, + "memory(GiB)": 302.58, + "step": 335360, + "train_speed(iter/s)": 0.123516 + }, + { + "acc": 0.74879007, + "epoch": 1.8756038123892864, + "grad_norm": 7.9375, + "learning_rate": 1.0538106556830963e-07, + "loss": 0.97724457, + "memory(GiB)": 302.58, + "step": 335380, + "train_speed(iter/s)": 0.123519 + }, + { + "acc": 0.76856065, + "epoch": 1.875715661862266, + "grad_norm": 6.25, + "learning_rate": 1.05192300568287e-07, + "loss": 0.88943806, + "memory(GiB)": 302.58, + "step": 335400, + "train_speed(iter/s)": 0.123523 + }, + { + "acc": 0.76111679, + "epoch": 1.875827511335245, + "grad_norm": 6.09375, + "learning_rate": 1.0500370298620832e-07, + "loss": 0.94201431, + "memory(GiB)": 302.58, + "step": 335420, + "train_speed(iter/s)": 0.123527 + }, + { + "acc": 0.75580711, + "epoch": 1.8759393608082244, + "grad_norm": 7.59375, + "learning_rate": 1.048152728285251e-07, + "loss": 0.96287031, + "memory(GiB)": 302.58, + "step": 335440, + "train_speed(iter/s)": 0.12353 + }, + { + "acc": 0.73930244, + "epoch": 1.8760512102812035, + "grad_norm": 5.375, + "learning_rate": 1.0462701010168163e-07, + "loss": 1.02911873, + "memory(GiB)": 302.58, + "step": 335460, + "train_speed(iter/s)": 0.123533 + }, + { + "acc": 0.74888482, + "epoch": 1.876163059754183, + "grad_norm": 6.1875, + "learning_rate": 1.0443891481211665e-07, + "loss": 0.99269123, + "memory(GiB)": 302.58, + "step": 335480, + "train_speed(iter/s)": 0.123537 + }, + { + "acc": 0.76356001, + "epoch": 1.876274909227162, + "grad_norm": 8.4375, + "learning_rate": 1.0425098696626501e-07, + "loss": 0.92707748, + "memory(GiB)": 302.58, + "step": 335500, + "train_speed(iter/s)": 0.123541 + }, + { + "acc": 0.74954877, + "epoch": 1.8763867587001415, + "grad_norm": 6.1875, + "learning_rate": 1.0406322657055324e-07, + "loss": 0.99688978, + "memory(GiB)": 302.58, + "step": 335520, + "train_speed(iter/s)": 0.123544 + }, + { + "acc": 0.73936529, + "epoch": 1.8764986081731205, + "grad_norm": 6.28125, + "learning_rate": 1.0387563363140396e-07, + "loss": 1.03161182, + "memory(GiB)": 302.58, + "step": 335540, + "train_speed(iter/s)": 0.123548 + }, + { + "acc": 0.74809895, + "epoch": 1.8766104576461, + "grad_norm": 8.9375, + "learning_rate": 1.0368820815523261e-07, + "loss": 0.97245083, + "memory(GiB)": 302.58, + "step": 335560, + "train_speed(iter/s)": 0.123551 + }, + { + "acc": 0.73139501, + "epoch": 1.876722307119079, + "grad_norm": 8.5, + "learning_rate": 1.0350095014845074e-07, + "loss": 1.06542463, + "memory(GiB)": 302.58, + "step": 335580, + "train_speed(iter/s)": 0.123555 + }, + { + "acc": 0.75612125, + "epoch": 1.8768341565920585, + "grad_norm": 4.9375, + "learning_rate": 1.0331385961746265e-07, + "loss": 0.96627846, + "memory(GiB)": 302.58, + "step": 335600, + "train_speed(iter/s)": 0.123558 + }, + { + "acc": 0.76081614, + "epoch": 1.8769460060650376, + "grad_norm": 7.875, + "learning_rate": 1.0312693656866712e-07, + "loss": 0.9316803, + "memory(GiB)": 302.58, + "step": 335620, + "train_speed(iter/s)": 0.123562 + }, + { + "acc": 0.73733435, + "epoch": 1.877057855538017, + "grad_norm": 7.65625, + "learning_rate": 1.0294018100845849e-07, + "loss": 1.03866367, + "memory(GiB)": 302.58, + "step": 335640, + "train_speed(iter/s)": 0.123565 + }, + { + "acc": 0.75141716, + "epoch": 1.8771697050109961, + "grad_norm": 7.3125, + "learning_rate": 1.0275359294322385e-07, + "loss": 0.99600611, + "memory(GiB)": 302.58, + "step": 335660, + "train_speed(iter/s)": 0.123568 + }, + { + "acc": 0.73266287, + "epoch": 1.8772815544839756, + "grad_norm": 5.625, + "learning_rate": 1.0256717237934477e-07, + "loss": 1.06442461, + "memory(GiB)": 302.58, + "step": 335680, + "train_speed(iter/s)": 0.123571 + }, + { + "acc": 0.74572692, + "epoch": 1.8773934039569546, + "grad_norm": 6.78125, + "learning_rate": 1.0238091932319838e-07, + "loss": 1.00852566, + "memory(GiB)": 302.58, + "step": 335700, + "train_speed(iter/s)": 0.123575 + }, + { + "acc": 0.75452991, + "epoch": 1.8775052534299341, + "grad_norm": 6.53125, + "learning_rate": 1.0219483378115402e-07, + "loss": 0.97320223, + "memory(GiB)": 302.58, + "step": 335720, + "train_speed(iter/s)": 0.123578 + }, + { + "acc": 0.7571147, + "epoch": 1.8776171029029132, + "grad_norm": 8.5625, + "learning_rate": 1.0200891575957662e-07, + "loss": 0.94648342, + "memory(GiB)": 302.58, + "step": 335740, + "train_speed(iter/s)": 0.123581 + }, + { + "acc": 0.73804655, + "epoch": 1.8777289523758927, + "grad_norm": 6.625, + "learning_rate": 1.018231652648255e-07, + "loss": 1.02335491, + "memory(GiB)": 302.58, + "step": 335760, + "train_speed(iter/s)": 0.123585 + }, + { + "acc": 0.74548249, + "epoch": 1.8778408018488717, + "grad_norm": 7.59375, + "learning_rate": 1.0163758230325449e-07, + "loss": 1.00877724, + "memory(GiB)": 302.58, + "step": 335780, + "train_speed(iter/s)": 0.123589 + }, + { + "acc": 0.7580884, + "epoch": 1.8779526513218512, + "grad_norm": 7.4375, + "learning_rate": 1.0145216688120962e-07, + "loss": 0.93847523, + "memory(GiB)": 302.58, + "step": 335800, + "train_speed(iter/s)": 0.123592 + }, + { + "acc": 0.76379972, + "epoch": 1.8780645007948302, + "grad_norm": 6.375, + "learning_rate": 1.0126691900503416e-07, + "loss": 0.91882229, + "memory(GiB)": 302.58, + "step": 335820, + "train_speed(iter/s)": 0.123596 + }, + { + "acc": 0.74460588, + "epoch": 1.8781763502678097, + "grad_norm": 7.40625, + "learning_rate": 1.0108183868106302e-07, + "loss": 1.00440369, + "memory(GiB)": 302.58, + "step": 335840, + "train_speed(iter/s)": 0.123599 + }, + { + "acc": 0.76310987, + "epoch": 1.8782881997407888, + "grad_norm": 7.3125, + "learning_rate": 1.0089692591562672e-07, + "loss": 0.92049894, + "memory(GiB)": 302.58, + "step": 335860, + "train_speed(iter/s)": 0.123603 + }, + { + "acc": 0.75642986, + "epoch": 1.8784000492137682, + "grad_norm": 7.96875, + "learning_rate": 1.0071218071505074e-07, + "loss": 0.9480896, + "memory(GiB)": 302.58, + "step": 335880, + "train_speed(iter/s)": 0.123606 + }, + { + "acc": 0.73822107, + "epoch": 1.8785118986867473, + "grad_norm": 5.8125, + "learning_rate": 1.0052760308565335e-07, + "loss": 1.03398418, + "memory(GiB)": 302.58, + "step": 335900, + "train_speed(iter/s)": 0.12361 + }, + { + "acc": 0.74909229, + "epoch": 1.8786237481597268, + "grad_norm": 6.4375, + "learning_rate": 1.0034319303374784e-07, + "loss": 0.97652712, + "memory(GiB)": 302.58, + "step": 335920, + "train_speed(iter/s)": 0.123613 + }, + { + "acc": 0.75704684, + "epoch": 1.8787355976327058, + "grad_norm": 9.5625, + "learning_rate": 1.001589505656414e-07, + "loss": 0.95611115, + "memory(GiB)": 302.58, + "step": 335940, + "train_speed(iter/s)": 0.123617 + }, + { + "acc": 0.74363194, + "epoch": 1.8788474471056853, + "grad_norm": 10.6875, + "learning_rate": 9.99748756876362e-08, + "loss": 1.01325226, + "memory(GiB)": 302.58, + "step": 335960, + "train_speed(iter/s)": 0.12362 + }, + { + "acc": 0.77007675, + "epoch": 1.8789592965786643, + "grad_norm": 7.9375, + "learning_rate": 9.97909684060272e-08, + "loss": 0.9080183, + "memory(GiB)": 302.58, + "step": 335980, + "train_speed(iter/s)": 0.123624 + }, + { + "acc": 0.75502138, + "epoch": 1.8790711460516438, + "grad_norm": 9.4375, + "learning_rate": 9.960722872710549e-08, + "loss": 0.93226576, + "memory(GiB)": 302.58, + "step": 336000, + "train_speed(iter/s)": 0.123627 + }, + { + "epoch": 1.8790711460516438, + "eval_acc": 0.7069138939935711, + "eval_loss": 1.0117892026901245, + "eval_runtime": 7541.7035, + "eval_samples_per_second": 9.982, + "eval_steps_per_second": 9.982, + "step": 336000 + }, + { + "acc": 0.74819183, + "epoch": 1.8791829955246229, + "grad_norm": 9.75, + "learning_rate": 9.942365665715548e-08, + "loss": 0.9998621, + "memory(GiB)": 302.58, + "step": 336020, + "train_speed(iter/s)": 0.123282 + }, + { + "acc": 0.75642467, + "epoch": 1.8792948449976024, + "grad_norm": 5.34375, + "learning_rate": 9.924025220245548e-08, + "loss": 0.93835487, + "memory(GiB)": 302.58, + "step": 336040, + "train_speed(iter/s)": 0.123286 + }, + { + "acc": 0.7395936, + "epoch": 1.8794066944705814, + "grad_norm": 5.8125, + "learning_rate": 9.905701536927825e-08, + "loss": 1.01281986, + "memory(GiB)": 302.58, + "step": 336060, + "train_speed(iter/s)": 0.12329 + }, + { + "acc": 0.74998837, + "epoch": 1.8795185439435609, + "grad_norm": 9.4375, + "learning_rate": 9.887394616389212e-08, + "loss": 0.9758893, + "memory(GiB)": 302.58, + "step": 336080, + "train_speed(iter/s)": 0.123293 + }, + { + "acc": 0.74471326, + "epoch": 1.87963039341654, + "grad_norm": 6.4375, + "learning_rate": 9.869104459255818e-08, + "loss": 1.00890932, + "memory(GiB)": 302.58, + "step": 336100, + "train_speed(iter/s)": 0.123296 + }, + { + "acc": 0.75258179, + "epoch": 1.8797422428895194, + "grad_norm": 8.0, + "learning_rate": 9.850831066153199e-08, + "loss": 0.97338505, + "memory(GiB)": 302.58, + "step": 336120, + "train_speed(iter/s)": 0.1233 + }, + { + "acc": 0.75685763, + "epoch": 1.8798540923624985, + "grad_norm": 8.5625, + "learning_rate": 9.83257443770641e-08, + "loss": 0.95742893, + "memory(GiB)": 302.58, + "step": 336140, + "train_speed(iter/s)": 0.123303 + }, + { + "acc": 0.75586209, + "epoch": 1.879965941835478, + "grad_norm": 5.78125, + "learning_rate": 9.814334574539785e-08, + "loss": 0.95878038, + "memory(GiB)": 302.58, + "step": 336160, + "train_speed(iter/s)": 0.123306 + }, + { + "acc": 0.76838555, + "epoch": 1.880077791308457, + "grad_norm": 7.25, + "learning_rate": 9.796111477277325e-08, + "loss": 0.8852602, + "memory(GiB)": 302.58, + "step": 336180, + "train_speed(iter/s)": 0.12331 + }, + { + "acc": 0.75335727, + "epoch": 1.8801896407814365, + "grad_norm": 6.1875, + "learning_rate": 9.777905146542143e-08, + "loss": 0.973067, + "memory(GiB)": 302.58, + "step": 336200, + "train_speed(iter/s)": 0.123313 + }, + { + "acc": 0.73966012, + "epoch": 1.8803014902544155, + "grad_norm": 5.84375, + "learning_rate": 9.75971558295713e-08, + "loss": 1.04175043, + "memory(GiB)": 302.58, + "step": 336220, + "train_speed(iter/s)": 0.123317 + }, + { + "acc": 0.75071998, + "epoch": 1.880413339727395, + "grad_norm": 6.625, + "learning_rate": 9.741542787144398e-08, + "loss": 0.97322054, + "memory(GiB)": 302.58, + "step": 336240, + "train_speed(iter/s)": 0.12332 + }, + { + "acc": 0.74310389, + "epoch": 1.880525189200374, + "grad_norm": 6.09375, + "learning_rate": 9.723386759725395e-08, + "loss": 1.02247877, + "memory(GiB)": 302.58, + "step": 336260, + "train_speed(iter/s)": 0.123324 + }, + { + "acc": 0.75415516, + "epoch": 1.8806370386733535, + "grad_norm": 8.5, + "learning_rate": 9.705247501321235e-08, + "loss": 0.9658061, + "memory(GiB)": 302.58, + "step": 336280, + "train_speed(iter/s)": 0.123327 + }, + { + "acc": 0.75262356, + "epoch": 1.8807488881463326, + "grad_norm": 5.21875, + "learning_rate": 9.687125012552257e-08, + "loss": 0.9649581, + "memory(GiB)": 302.58, + "step": 336300, + "train_speed(iter/s)": 0.123331 + }, + { + "acc": 0.76202116, + "epoch": 1.880860737619312, + "grad_norm": 10.8125, + "learning_rate": 9.669019294038406e-08, + "loss": 0.93335314, + "memory(GiB)": 302.58, + "step": 336320, + "train_speed(iter/s)": 0.123334 + }, + { + "acc": 0.7530324, + "epoch": 1.880972587092291, + "grad_norm": 8.1875, + "learning_rate": 9.650930346398856e-08, + "loss": 0.97509308, + "memory(GiB)": 302.58, + "step": 336340, + "train_speed(iter/s)": 0.123338 + }, + { + "acc": 0.7580019, + "epoch": 1.8810844365652706, + "grad_norm": 8.5625, + "learning_rate": 9.632858170252335e-08, + "loss": 0.94949675, + "memory(GiB)": 302.58, + "step": 336360, + "train_speed(iter/s)": 0.123341 + }, + { + "acc": 0.76110339, + "epoch": 1.8811962860382496, + "grad_norm": 7.0625, + "learning_rate": 9.614802766216957e-08, + "loss": 0.91865826, + "memory(GiB)": 302.58, + "step": 336380, + "train_speed(iter/s)": 0.123344 + }, + { + "acc": 0.74567447, + "epoch": 1.881308135511229, + "grad_norm": 6.78125, + "learning_rate": 9.596764134910341e-08, + "loss": 1.00707254, + "memory(GiB)": 302.58, + "step": 336400, + "train_speed(iter/s)": 0.123348 + }, + { + "acc": 0.76245046, + "epoch": 1.8814199849842081, + "grad_norm": 4.75, + "learning_rate": 9.578742276949382e-08, + "loss": 0.92928762, + "memory(GiB)": 302.58, + "step": 336420, + "train_speed(iter/s)": 0.123351 + }, + { + "acc": 0.73373537, + "epoch": 1.8815318344571876, + "grad_norm": 5.34375, + "learning_rate": 9.560737192950532e-08, + "loss": 1.06482973, + "memory(GiB)": 302.58, + "step": 336440, + "train_speed(iter/s)": 0.123355 + }, + { + "acc": 0.75316677, + "epoch": 1.8816436839301667, + "grad_norm": 7.78125, + "learning_rate": 9.54274888352963e-08, + "loss": 0.97960672, + "memory(GiB)": 302.58, + "step": 336460, + "train_speed(iter/s)": 0.123358 + }, + { + "acc": 0.77139573, + "epoch": 1.8817555334031462, + "grad_norm": 7.78125, + "learning_rate": 9.524777349301851e-08, + "loss": 0.88882256, + "memory(GiB)": 302.58, + "step": 336480, + "train_speed(iter/s)": 0.123362 + }, + { + "acc": 0.76614575, + "epoch": 1.8818673828761252, + "grad_norm": 8.125, + "learning_rate": 9.506822590881982e-08, + "loss": 0.92584171, + "memory(GiB)": 302.58, + "step": 336500, + "train_speed(iter/s)": 0.123365 + }, + { + "acc": 0.76525054, + "epoch": 1.8819792323491047, + "grad_norm": 5.25, + "learning_rate": 9.488884608884086e-08, + "loss": 0.94267311, + "memory(GiB)": 302.58, + "step": 336520, + "train_speed(iter/s)": 0.123369 + }, + { + "acc": 0.7456212, + "epoch": 1.8820910818220837, + "grad_norm": 9.375, + "learning_rate": 9.47096340392173e-08, + "loss": 1.00053177, + "memory(GiB)": 302.58, + "step": 336540, + "train_speed(iter/s)": 0.123373 + }, + { + "acc": 0.74480581, + "epoch": 1.8822029312950632, + "grad_norm": 7.53125, + "learning_rate": 9.453058976607865e-08, + "loss": 0.99664164, + "memory(GiB)": 302.58, + "step": 336560, + "train_speed(iter/s)": 0.123376 + }, + { + "acc": 0.74047961, + "epoch": 1.8823147807680423, + "grad_norm": 6.65625, + "learning_rate": 9.435171327554837e-08, + "loss": 1.00583515, + "memory(GiB)": 302.58, + "step": 336580, + "train_speed(iter/s)": 0.12338 + }, + { + "acc": 0.75273881, + "epoch": 1.8824266302410217, + "grad_norm": 7.71875, + "learning_rate": 9.41730045737449e-08, + "loss": 0.96256351, + "memory(GiB)": 302.58, + "step": 336600, + "train_speed(iter/s)": 0.123383 + }, + { + "acc": 0.74940076, + "epoch": 1.8825384797140008, + "grad_norm": 6.03125, + "learning_rate": 9.399446366678055e-08, + "loss": 0.98667345, + "memory(GiB)": 302.58, + "step": 336620, + "train_speed(iter/s)": 0.123387 + }, + { + "acc": 0.75486379, + "epoch": 1.8826503291869803, + "grad_norm": 8.1875, + "learning_rate": 9.381609056076213e-08, + "loss": 0.96120758, + "memory(GiB)": 302.58, + "step": 336640, + "train_speed(iter/s)": 0.12339 + }, + { + "acc": 0.76023946, + "epoch": 1.8827621786599593, + "grad_norm": 6.84375, + "learning_rate": 9.363788526179029e-08, + "loss": 0.93485012, + "memory(GiB)": 302.58, + "step": 336660, + "train_speed(iter/s)": 0.123394 + }, + { + "acc": 0.74217453, + "epoch": 1.8828740281329388, + "grad_norm": 5.75, + "learning_rate": 9.345984777596073e-08, + "loss": 1.01252298, + "memory(GiB)": 302.58, + "step": 336680, + "train_speed(iter/s)": 0.123397 + }, + { + "acc": 0.74866867, + "epoch": 1.8829858776059178, + "grad_norm": 8.6875, + "learning_rate": 9.328197810936246e-08, + "loss": 0.99764509, + "memory(GiB)": 302.58, + "step": 336700, + "train_speed(iter/s)": 0.1234 + }, + { + "acc": 0.75469904, + "epoch": 1.8830977270788973, + "grad_norm": 6.59375, + "learning_rate": 9.31042762680795e-08, + "loss": 0.96767921, + "memory(GiB)": 302.58, + "step": 336720, + "train_speed(iter/s)": 0.123404 + }, + { + "acc": 0.75254078, + "epoch": 1.8832095765518764, + "grad_norm": 8.375, + "learning_rate": 9.292674225818921e-08, + "loss": 0.97072573, + "memory(GiB)": 302.58, + "step": 336740, + "train_speed(iter/s)": 0.123407 + }, + { + "acc": 0.73236961, + "epoch": 1.8833214260248559, + "grad_norm": 8.6875, + "learning_rate": 9.274937608576451e-08, + "loss": 1.05835867, + "memory(GiB)": 302.58, + "step": 336760, + "train_speed(iter/s)": 0.123411 + }, + { + "acc": 0.74719701, + "epoch": 1.883433275497835, + "grad_norm": 11.75, + "learning_rate": 9.257217775687166e-08, + "loss": 0.98776588, + "memory(GiB)": 302.58, + "step": 336780, + "train_speed(iter/s)": 0.123414 + }, + { + "acc": 0.73454399, + "epoch": 1.8835451249708144, + "grad_norm": 6.125, + "learning_rate": 9.239514727757138e-08, + "loss": 1.05752544, + "memory(GiB)": 302.58, + "step": 336800, + "train_speed(iter/s)": 0.123418 + }, + { + "acc": 0.73130765, + "epoch": 1.8836569744437934, + "grad_norm": 9.125, + "learning_rate": 9.22182846539188e-08, + "loss": 1.05173206, + "memory(GiB)": 302.58, + "step": 336820, + "train_speed(iter/s)": 0.123421 + }, + { + "acc": 0.7424603, + "epoch": 1.883768823916773, + "grad_norm": 10.5625, + "learning_rate": 9.204158989196299e-08, + "loss": 1.01665668, + "memory(GiB)": 302.58, + "step": 336840, + "train_speed(iter/s)": 0.123424 + }, + { + "acc": 0.73389688, + "epoch": 1.883880673389752, + "grad_norm": 7.0, + "learning_rate": 9.186506299774689e-08, + "loss": 1.05577936, + "memory(GiB)": 302.58, + "step": 336860, + "train_speed(iter/s)": 0.123428 + }, + { + "acc": 0.74156065, + "epoch": 1.8839925228627314, + "grad_norm": 7.71875, + "learning_rate": 9.168870397730956e-08, + "loss": 0.9984766, + "memory(GiB)": 302.58, + "step": 336880, + "train_speed(iter/s)": 0.123431 + }, + { + "acc": 0.74527049, + "epoch": 1.8841043723357105, + "grad_norm": 6.40625, + "learning_rate": 9.151251283668228e-08, + "loss": 0.98722725, + "memory(GiB)": 302.58, + "step": 336900, + "train_speed(iter/s)": 0.123435 + }, + { + "acc": 0.76220169, + "epoch": 1.88421622180869, + "grad_norm": 8.8125, + "learning_rate": 9.133648958189134e-08, + "loss": 0.92030382, + "memory(GiB)": 302.58, + "step": 336920, + "train_speed(iter/s)": 0.123438 + }, + { + "acc": 0.74247069, + "epoch": 1.884328071281669, + "grad_norm": 6.90625, + "learning_rate": 9.116063421895749e-08, + "loss": 1.00594711, + "memory(GiB)": 302.58, + "step": 336940, + "train_speed(iter/s)": 0.123442 + }, + { + "acc": 0.76391873, + "epoch": 1.8844399207546485, + "grad_norm": 10.125, + "learning_rate": 9.098494675389535e-08, + "loss": 0.91291761, + "memory(GiB)": 302.58, + "step": 336960, + "train_speed(iter/s)": 0.123445 + }, + { + "acc": 0.75785909, + "epoch": 1.8845517702276275, + "grad_norm": 6.6875, + "learning_rate": 9.080942719271457e-08, + "loss": 0.94579391, + "memory(GiB)": 302.58, + "step": 336980, + "train_speed(iter/s)": 0.123449 + }, + { + "acc": 0.74982185, + "epoch": 1.884663619700607, + "grad_norm": 7.53125, + "learning_rate": 9.063407554141757e-08, + "loss": 0.9857769, + "memory(GiB)": 302.58, + "step": 337000, + "train_speed(iter/s)": 0.123452 + }, + { + "acc": 0.74557843, + "epoch": 1.884775469173586, + "grad_norm": 7.75, + "learning_rate": 9.045889180600176e-08, + "loss": 0.98562765, + "memory(GiB)": 302.58, + "step": 337020, + "train_speed(iter/s)": 0.123456 + }, + { + "acc": 0.76242909, + "epoch": 1.8848873186465656, + "grad_norm": 8.4375, + "learning_rate": 9.02838759924607e-08, + "loss": 0.94016676, + "memory(GiB)": 302.58, + "step": 337040, + "train_speed(iter/s)": 0.123459 + }, + { + "acc": 0.74234018, + "epoch": 1.8849991681195446, + "grad_norm": 10.0625, + "learning_rate": 9.010902810677847e-08, + "loss": 1.00240631, + "memory(GiB)": 302.58, + "step": 337060, + "train_speed(iter/s)": 0.123462 + }, + { + "acc": 0.74989915, + "epoch": 1.885111017592524, + "grad_norm": 7.09375, + "learning_rate": 8.993434815493696e-08, + "loss": 0.99638929, + "memory(GiB)": 302.58, + "step": 337080, + "train_speed(iter/s)": 0.123466 + }, + { + "acc": 0.74732895, + "epoch": 1.8852228670655031, + "grad_norm": 6.34375, + "learning_rate": 8.975983614290972e-08, + "loss": 1.00307398, + "memory(GiB)": 302.58, + "step": 337100, + "train_speed(iter/s)": 0.123469 + }, + { + "acc": 0.75621686, + "epoch": 1.8853347165384826, + "grad_norm": 8.625, + "learning_rate": 8.958549207666645e-08, + "loss": 0.96779919, + "memory(GiB)": 302.58, + "step": 337120, + "train_speed(iter/s)": 0.123472 + }, + { + "acc": 0.7606514, + "epoch": 1.8854465660114617, + "grad_norm": 5.375, + "learning_rate": 8.941131596216956e-08, + "loss": 0.92906132, + "memory(GiB)": 302.58, + "step": 337140, + "train_speed(iter/s)": 0.123476 + }, + { + "acc": 0.75312924, + "epoch": 1.8855584154844411, + "grad_norm": 6.28125, + "learning_rate": 8.923730780537709e-08, + "loss": 0.96738825, + "memory(GiB)": 302.58, + "step": 337160, + "train_speed(iter/s)": 0.123479 + }, + { + "acc": 0.74053979, + "epoch": 1.8856702649574202, + "grad_norm": 7.78125, + "learning_rate": 8.906346761223983e-08, + "loss": 1.01787891, + "memory(GiB)": 302.58, + "step": 337180, + "train_speed(iter/s)": 0.123483 + }, + { + "acc": 0.76226969, + "epoch": 1.8857821144303997, + "grad_norm": 7.71875, + "learning_rate": 8.888979538870468e-08, + "loss": 0.91011629, + "memory(GiB)": 302.58, + "step": 337200, + "train_speed(iter/s)": 0.123486 + }, + { + "acc": 0.75184946, + "epoch": 1.8858939639033787, + "grad_norm": 8.125, + "learning_rate": 8.871629114071079e-08, + "loss": 0.98124437, + "memory(GiB)": 302.58, + "step": 337220, + "train_speed(iter/s)": 0.12349 + }, + { + "acc": 0.74272685, + "epoch": 1.8860058133763582, + "grad_norm": 9.6875, + "learning_rate": 8.854295487419284e-08, + "loss": 1.00398893, + "memory(GiB)": 302.58, + "step": 337240, + "train_speed(iter/s)": 0.123493 + }, + { + "acc": 0.73749766, + "epoch": 1.8861176628493372, + "grad_norm": 8.1875, + "learning_rate": 8.836978659508e-08, + "loss": 1.04285536, + "memory(GiB)": 302.58, + "step": 337260, + "train_speed(iter/s)": 0.123497 + }, + { + "acc": 0.74642406, + "epoch": 1.8862295123223167, + "grad_norm": 6.78125, + "learning_rate": 8.819678630929474e-08, + "loss": 0.99528484, + "memory(GiB)": 302.58, + "step": 337280, + "train_speed(iter/s)": 0.123501 + }, + { + "acc": 0.74714017, + "epoch": 1.8863413617952958, + "grad_norm": 7.5625, + "learning_rate": 8.802395402275399e-08, + "loss": 1.01034451, + "memory(GiB)": 302.58, + "step": 337300, + "train_speed(iter/s)": 0.123504 + }, + { + "acc": 0.75060487, + "epoch": 1.8864532112682753, + "grad_norm": 8.5, + "learning_rate": 8.785128974136969e-08, + "loss": 0.98920517, + "memory(GiB)": 302.58, + "step": 337320, + "train_speed(iter/s)": 0.123507 + }, + { + "acc": 0.74435091, + "epoch": 1.8865650607412543, + "grad_norm": 9.0625, + "learning_rate": 8.767879347104714e-08, + "loss": 1.00995808, + "memory(GiB)": 302.58, + "step": 337340, + "train_speed(iter/s)": 0.123511 + }, + { + "acc": 0.74819226, + "epoch": 1.8866769102142338, + "grad_norm": 9.125, + "learning_rate": 8.750646521768658e-08, + "loss": 0.98019562, + "memory(GiB)": 302.58, + "step": 337360, + "train_speed(iter/s)": 0.123514 + }, + { + "acc": 0.75152102, + "epoch": 1.8867887596872128, + "grad_norm": 10.25, + "learning_rate": 8.733430498718165e-08, + "loss": 0.97272549, + "memory(GiB)": 302.58, + "step": 337380, + "train_speed(iter/s)": 0.123518 + }, + { + "acc": 0.76236677, + "epoch": 1.8869006091601923, + "grad_norm": 6.75, + "learning_rate": 8.716231278542153e-08, + "loss": 0.93544216, + "memory(GiB)": 302.58, + "step": 337400, + "train_speed(iter/s)": 0.123521 + }, + { + "acc": 0.73764491, + "epoch": 1.8870124586331714, + "grad_norm": 11.5625, + "learning_rate": 8.699048861828873e-08, + "loss": 1.03716364, + "memory(GiB)": 302.58, + "step": 337420, + "train_speed(iter/s)": 0.123524 + }, + { + "acc": 0.73846674, + "epoch": 1.8871243081061508, + "grad_norm": 6.9375, + "learning_rate": 8.681883249165968e-08, + "loss": 1.02159719, + "memory(GiB)": 302.58, + "step": 337440, + "train_speed(iter/s)": 0.123528 + }, + { + "acc": 0.7236227, + "epoch": 1.8872361575791299, + "grad_norm": 8.4375, + "learning_rate": 8.664734441140577e-08, + "loss": 1.07140093, + "memory(GiB)": 302.58, + "step": 337460, + "train_speed(iter/s)": 0.123531 + }, + { + "acc": 0.7483686, + "epoch": 1.8873480070521094, + "grad_norm": 6.875, + "learning_rate": 8.647602438339286e-08, + "loss": 1.01122389, + "memory(GiB)": 302.58, + "step": 337480, + "train_speed(iter/s)": 0.123535 + }, + { + "acc": 0.74069047, + "epoch": 1.8874598565250884, + "grad_norm": 8.9375, + "learning_rate": 8.630487241347963e-08, + "loss": 1.04235439, + "memory(GiB)": 302.58, + "step": 337500, + "train_speed(iter/s)": 0.123538 + }, + { + "acc": 0.75669079, + "epoch": 1.887571705998068, + "grad_norm": 5.84375, + "learning_rate": 8.613388850752136e-08, + "loss": 0.94944897, + "memory(GiB)": 302.58, + "step": 337520, + "train_speed(iter/s)": 0.123542 + }, + { + "acc": 0.73896942, + "epoch": 1.887683555471047, + "grad_norm": 5.21875, + "learning_rate": 8.596307267136506e-08, + "loss": 1.02807102, + "memory(GiB)": 302.58, + "step": 337540, + "train_speed(iter/s)": 0.123544 + }, + { + "acc": 0.74579277, + "epoch": 1.8877954049440264, + "grad_norm": 8.25, + "learning_rate": 8.579242491085382e-08, + "loss": 0.97775888, + "memory(GiB)": 302.58, + "step": 337560, + "train_speed(iter/s)": 0.123548 + }, + { + "acc": 0.75851574, + "epoch": 1.8879072544170055, + "grad_norm": 5.53125, + "learning_rate": 8.56219452318241e-08, + "loss": 0.95469265, + "memory(GiB)": 302.58, + "step": 337580, + "train_speed(iter/s)": 0.123551 + }, + { + "acc": 0.74286861, + "epoch": 1.888019103889985, + "grad_norm": 7.90625, + "learning_rate": 8.545163364010733e-08, + "loss": 1.02737989, + "memory(GiB)": 302.58, + "step": 337600, + "train_speed(iter/s)": 0.123555 + }, + { + "acc": 0.74364195, + "epoch": 1.888130953362964, + "grad_norm": 8.8125, + "learning_rate": 8.528149014152776e-08, + "loss": 1.01578169, + "memory(GiB)": 302.58, + "step": 337620, + "train_speed(iter/s)": 0.123558 + }, + { + "acc": 0.77156525, + "epoch": 1.8882428028359435, + "grad_norm": 5.875, + "learning_rate": 8.511151474190516e-08, + "loss": 0.87185907, + "memory(GiB)": 302.58, + "step": 337640, + "train_speed(iter/s)": 0.123562 + }, + { + "acc": 0.74892368, + "epoch": 1.8883546523089225, + "grad_norm": 9.3125, + "learning_rate": 8.494170744705432e-08, + "loss": 0.98482485, + "memory(GiB)": 302.58, + "step": 337660, + "train_speed(iter/s)": 0.123565 + }, + { + "acc": 0.74453025, + "epoch": 1.888466501781902, + "grad_norm": 8.1875, + "learning_rate": 8.477206826278172e-08, + "loss": 1.01167393, + "memory(GiB)": 302.58, + "step": 337680, + "train_speed(iter/s)": 0.123569 + }, + { + "acc": 0.75995331, + "epoch": 1.888578351254881, + "grad_norm": 5.65625, + "learning_rate": 8.46025971948905e-08, + "loss": 0.91415491, + "memory(GiB)": 302.58, + "step": 337700, + "train_speed(iter/s)": 0.123572 + }, + { + "acc": 0.75118012, + "epoch": 1.8886902007278605, + "grad_norm": 5.75, + "learning_rate": 8.443329424917657e-08, + "loss": 0.9816123, + "memory(GiB)": 302.58, + "step": 337720, + "train_speed(iter/s)": 0.123576 + }, + { + "acc": 0.74695005, + "epoch": 1.8888020502008396, + "grad_norm": 8.375, + "learning_rate": 8.426415943143029e-08, + "loss": 1.002178, + "memory(GiB)": 302.58, + "step": 337740, + "train_speed(iter/s)": 0.123579 + }, + { + "acc": 0.75395288, + "epoch": 1.888913899673819, + "grad_norm": 6.09375, + "learning_rate": 8.409519274743816e-08, + "loss": 0.98175364, + "memory(GiB)": 302.58, + "step": 337760, + "train_speed(iter/s)": 0.123582 + }, + { + "acc": 0.75481677, + "epoch": 1.889025749146798, + "grad_norm": 5.71875, + "learning_rate": 8.392639420297777e-08, + "loss": 0.96975212, + "memory(GiB)": 302.58, + "step": 337780, + "train_speed(iter/s)": 0.123586 + }, + { + "acc": 0.74398847, + "epoch": 1.8891375986197776, + "grad_norm": 5.84375, + "learning_rate": 8.375776380382283e-08, + "loss": 1.00154982, + "memory(GiB)": 302.58, + "step": 337800, + "train_speed(iter/s)": 0.123589 + }, + { + "acc": 0.73271031, + "epoch": 1.8892494480927566, + "grad_norm": 6.0625, + "learning_rate": 8.358930155574208e-08, + "loss": 1.0541254, + "memory(GiB)": 302.58, + "step": 337820, + "train_speed(iter/s)": 0.123592 + }, + { + "acc": 0.76520619, + "epoch": 1.8893612975657361, + "grad_norm": 7.78125, + "learning_rate": 8.342100746449645e-08, + "loss": 0.91445818, + "memory(GiB)": 302.58, + "step": 337840, + "train_speed(iter/s)": 0.123596 + }, + { + "acc": 0.73960266, + "epoch": 1.8894731470387152, + "grad_norm": 10.4375, + "learning_rate": 8.325288153584243e-08, + "loss": 1.01729412, + "memory(GiB)": 302.58, + "step": 337860, + "train_speed(iter/s)": 0.123599 + }, + { + "acc": 0.72968283, + "epoch": 1.8895849965116946, + "grad_norm": 10.625, + "learning_rate": 8.308492377553045e-08, + "loss": 1.07232246, + "memory(GiB)": 302.58, + "step": 337880, + "train_speed(iter/s)": 0.123603 + }, + { + "acc": 0.73522663, + "epoch": 1.8896968459846737, + "grad_norm": 7.46875, + "learning_rate": 8.291713418930425e-08, + "loss": 1.01972733, + "memory(GiB)": 302.58, + "step": 337900, + "train_speed(iter/s)": 0.123606 + }, + { + "acc": 0.74121938, + "epoch": 1.8898086954576532, + "grad_norm": 5.1875, + "learning_rate": 8.274951278290477e-08, + "loss": 1.02193642, + "memory(GiB)": 302.58, + "step": 337920, + "train_speed(iter/s)": 0.12361 + }, + { + "acc": 0.77717161, + "epoch": 1.8899205449306322, + "grad_norm": 7.40625, + "learning_rate": 8.258205956206411e-08, + "loss": 0.87392693, + "memory(GiB)": 302.58, + "step": 337940, + "train_speed(iter/s)": 0.123613 + }, + { + "acc": 0.73603201, + "epoch": 1.8900323944036117, + "grad_norm": 7.1875, + "learning_rate": 8.241477453250934e-08, + "loss": 1.05039597, + "memory(GiB)": 302.58, + "step": 337960, + "train_speed(iter/s)": 0.123617 + }, + { + "acc": 0.74113498, + "epoch": 1.8901442438765907, + "grad_norm": 8.1875, + "learning_rate": 8.224765769996257e-08, + "loss": 1.00360451, + "memory(GiB)": 302.58, + "step": 337980, + "train_speed(iter/s)": 0.12362 + }, + { + "acc": 0.74954891, + "epoch": 1.8902560933495702, + "grad_norm": 6.3125, + "learning_rate": 8.208070907014032e-08, + "loss": 0.97098665, + "memory(GiB)": 302.58, + "step": 338000, + "train_speed(iter/s)": 0.123624 + }, + { + "epoch": 1.8902560933495702, + "eval_acc": 0.7069173940653966, + "eval_loss": 1.0117850303649902, + "eval_runtime": 7543.7159, + "eval_samples_per_second": 9.98, + "eval_steps_per_second": 9.98, + "step": 338000 + }, + { + "acc": 0.7520596, + "epoch": 1.8903679428225493, + "grad_norm": 6.28125, + "learning_rate": 8.191392864875137e-08, + "loss": 0.96669092, + "memory(GiB)": 302.58, + "step": 338020, + "train_speed(iter/s)": 0.123281 + }, + { + "acc": 0.75409846, + "epoch": 1.8904797922955288, + "grad_norm": 7.875, + "learning_rate": 8.174731644150113e-08, + "loss": 0.95618639, + "memory(GiB)": 302.58, + "step": 338040, + "train_speed(iter/s)": 0.123284 + }, + { + "acc": 0.74749908, + "epoch": 1.8905916417685078, + "grad_norm": 9.0, + "learning_rate": 8.158087245408841e-08, + "loss": 0.99932356, + "memory(GiB)": 302.58, + "step": 338060, + "train_speed(iter/s)": 0.123288 + }, + { + "acc": 0.74068289, + "epoch": 1.8907034912414873, + "grad_norm": 7.53125, + "learning_rate": 8.141459669220531e-08, + "loss": 1.04099255, + "memory(GiB)": 302.58, + "step": 338080, + "train_speed(iter/s)": 0.123291 + }, + { + "acc": 0.74465942, + "epoch": 1.8908153407144663, + "grad_norm": 10.1875, + "learning_rate": 8.12484891615395e-08, + "loss": 0.9840888, + "memory(GiB)": 302.58, + "step": 338100, + "train_speed(iter/s)": 0.123295 + }, + { + "acc": 0.75058088, + "epoch": 1.8909271901874458, + "grad_norm": 7.25, + "learning_rate": 8.108254986777197e-08, + "loss": 0.96901226, + "memory(GiB)": 302.58, + "step": 338120, + "train_speed(iter/s)": 0.123298 + }, + { + "acc": 0.73840666, + "epoch": 1.8910390396604249, + "grad_norm": 10.125, + "learning_rate": 8.091677881657933e-08, + "loss": 1.02313948, + "memory(GiB)": 302.58, + "step": 338140, + "train_speed(iter/s)": 0.123302 + }, + { + "acc": 0.76300311, + "epoch": 1.8911508891334043, + "grad_norm": 6.40625, + "learning_rate": 8.075117601363036e-08, + "loss": 0.91575031, + "memory(GiB)": 302.58, + "step": 338160, + "train_speed(iter/s)": 0.123305 + }, + { + "acc": 0.75040269, + "epoch": 1.8912627386063834, + "grad_norm": 7.53125, + "learning_rate": 8.058574146458942e-08, + "loss": 0.96656036, + "memory(GiB)": 302.58, + "step": 338180, + "train_speed(iter/s)": 0.123309 + }, + { + "acc": 0.72749057, + "epoch": 1.8913745880793629, + "grad_norm": 5.0, + "learning_rate": 8.042047517511586e-08, + "loss": 1.07144012, + "memory(GiB)": 302.58, + "step": 338200, + "train_speed(iter/s)": 0.123312 + }, + { + "acc": 0.75906982, + "epoch": 1.891486437552342, + "grad_norm": 9.75, + "learning_rate": 8.02553771508613e-08, + "loss": 0.97713709, + "memory(GiB)": 302.58, + "step": 338220, + "train_speed(iter/s)": 0.123315 + }, + { + "acc": 0.75490694, + "epoch": 1.8915982870253214, + "grad_norm": 7.03125, + "learning_rate": 8.009044739747285e-08, + "loss": 0.9726018, + "memory(GiB)": 302.58, + "step": 338240, + "train_speed(iter/s)": 0.123319 + }, + { + "acc": 0.75418663, + "epoch": 1.8917101364983004, + "grad_norm": 8.875, + "learning_rate": 7.992568592059158e-08, + "loss": 0.94903851, + "memory(GiB)": 302.58, + "step": 338260, + "train_speed(iter/s)": 0.123322 + }, + { + "acc": 0.76269283, + "epoch": 1.89182198597128, + "grad_norm": 7.90625, + "learning_rate": 7.976109272585353e-08, + "loss": 0.92258968, + "memory(GiB)": 302.58, + "step": 338280, + "train_speed(iter/s)": 0.123326 + }, + { + "acc": 0.74828286, + "epoch": 1.891933835444259, + "grad_norm": 5.53125, + "learning_rate": 7.959666781888753e-08, + "loss": 0.97607021, + "memory(GiB)": 302.58, + "step": 338300, + "train_speed(iter/s)": 0.123329 + }, + { + "acc": 0.75825291, + "epoch": 1.8920456849172385, + "grad_norm": 6.90625, + "learning_rate": 7.943241120531741e-08, + "loss": 0.95071697, + "memory(GiB)": 302.58, + "step": 338320, + "train_speed(iter/s)": 0.123332 + }, + { + "acc": 0.77928538, + "epoch": 1.8921575343902175, + "grad_norm": 5.75, + "learning_rate": 7.926832289076202e-08, + "loss": 0.86406145, + "memory(GiB)": 302.58, + "step": 338340, + "train_speed(iter/s)": 0.123336 + }, + { + "acc": 0.74196415, + "epoch": 1.892269383863197, + "grad_norm": 6.125, + "learning_rate": 7.910440288083243e-08, + "loss": 1.03295288, + "memory(GiB)": 302.58, + "step": 338360, + "train_speed(iter/s)": 0.123339 + }, + { + "acc": 0.76536632, + "epoch": 1.892381233336176, + "grad_norm": 8.125, + "learning_rate": 7.894065118113692e-08, + "loss": 0.91747532, + "memory(GiB)": 302.58, + "step": 338380, + "train_speed(iter/s)": 0.123343 + }, + { + "acc": 0.75166306, + "epoch": 1.8924930828091555, + "grad_norm": 6.46875, + "learning_rate": 7.877706779727489e-08, + "loss": 0.96626005, + "memory(GiB)": 302.58, + "step": 338400, + "train_speed(iter/s)": 0.123346 + }, + { + "acc": 0.76219149, + "epoch": 1.8926049322821346, + "grad_norm": 9.25, + "learning_rate": 7.86136527348419e-08, + "loss": 0.92760296, + "memory(GiB)": 302.58, + "step": 338420, + "train_speed(iter/s)": 0.12335 + }, + { + "acc": 0.74747763, + "epoch": 1.892716781755114, + "grad_norm": 8.0625, + "learning_rate": 7.845040599942677e-08, + "loss": 0.98650103, + "memory(GiB)": 302.58, + "step": 338440, + "train_speed(iter/s)": 0.123353 + }, + { + "acc": 0.76344247, + "epoch": 1.892828631228093, + "grad_norm": 12.875, + "learning_rate": 7.82873275966145e-08, + "loss": 0.92164974, + "memory(GiB)": 302.58, + "step": 338460, + "train_speed(iter/s)": 0.123357 + }, + { + "acc": 0.75279384, + "epoch": 1.8929404807010726, + "grad_norm": 5.03125, + "learning_rate": 7.81244175319812e-08, + "loss": 0.97934856, + "memory(GiB)": 302.58, + "step": 338480, + "train_speed(iter/s)": 0.12336 + }, + { + "acc": 0.74135146, + "epoch": 1.8930523301740516, + "grad_norm": 7.15625, + "learning_rate": 7.796167581110015e-08, + "loss": 1.02364397, + "memory(GiB)": 302.58, + "step": 338500, + "train_speed(iter/s)": 0.123363 + }, + { + "acc": 0.75492625, + "epoch": 1.893164179647031, + "grad_norm": 6.625, + "learning_rate": 7.779910243953747e-08, + "loss": 0.97200584, + "memory(GiB)": 302.58, + "step": 338520, + "train_speed(iter/s)": 0.123367 + }, + { + "acc": 0.76052327, + "epoch": 1.8932760291200101, + "grad_norm": 5.21875, + "learning_rate": 7.763669742285262e-08, + "loss": 0.930585, + "memory(GiB)": 302.58, + "step": 338540, + "train_speed(iter/s)": 0.12337 + }, + { + "acc": 0.75726395, + "epoch": 1.8933878785929896, + "grad_norm": 10.375, + "learning_rate": 7.74744607666017e-08, + "loss": 0.94165449, + "memory(GiB)": 302.58, + "step": 338560, + "train_speed(iter/s)": 0.123373 + }, + { + "acc": 0.7698576, + "epoch": 1.8934997280659687, + "grad_norm": 9.375, + "learning_rate": 7.731239247633249e-08, + "loss": 0.8854764, + "memory(GiB)": 302.58, + "step": 338580, + "train_speed(iter/s)": 0.123377 + }, + { + "acc": 0.76592884, + "epoch": 1.8936115775389482, + "grad_norm": 7.15625, + "learning_rate": 7.715049255758888e-08, + "loss": 0.90932732, + "memory(GiB)": 302.58, + "step": 338600, + "train_speed(iter/s)": 0.12338 + }, + { + "acc": 0.75540004, + "epoch": 1.8937234270119272, + "grad_norm": 7.875, + "learning_rate": 7.698876101590925e-08, + "loss": 0.96245699, + "memory(GiB)": 302.58, + "step": 338620, + "train_speed(iter/s)": 0.123384 + }, + { + "acc": 0.7642334, + "epoch": 1.8938352764849067, + "grad_norm": 9.5, + "learning_rate": 7.682719785682358e-08, + "loss": 0.90325632, + "memory(GiB)": 302.58, + "step": 338640, + "train_speed(iter/s)": 0.123387 + }, + { + "acc": 0.76030788, + "epoch": 1.8939471259578857, + "grad_norm": 6.875, + "learning_rate": 7.666580308585913e-08, + "loss": 0.93889303, + "memory(GiB)": 302.58, + "step": 338660, + "train_speed(iter/s)": 0.123391 + }, + { + "acc": 0.7325387, + "epoch": 1.8940589754308652, + "grad_norm": 5.6875, + "learning_rate": 7.650457670853539e-08, + "loss": 1.06176863, + "memory(GiB)": 302.58, + "step": 338680, + "train_speed(iter/s)": 0.123394 + }, + { + "acc": 0.7506999, + "epoch": 1.8941708249038443, + "grad_norm": 7.96875, + "learning_rate": 7.634351873036738e-08, + "loss": 0.9970026, + "memory(GiB)": 302.58, + "step": 338700, + "train_speed(iter/s)": 0.123397 + }, + { + "acc": 0.75888443, + "epoch": 1.8942826743768237, + "grad_norm": 6.71875, + "learning_rate": 7.618262915686348e-08, + "loss": 0.94596205, + "memory(GiB)": 302.58, + "step": 338720, + "train_speed(iter/s)": 0.123401 + }, + { + "acc": 0.74581156, + "epoch": 1.8943945238498028, + "grad_norm": 11.5, + "learning_rate": 7.602190799352704e-08, + "loss": 1.01579418, + "memory(GiB)": 302.58, + "step": 338740, + "train_speed(iter/s)": 0.123404 + }, + { + "acc": 0.7528367, + "epoch": 1.8945063733227823, + "grad_norm": 4.8125, + "learning_rate": 7.586135524585425e-08, + "loss": 0.99354105, + "memory(GiB)": 302.58, + "step": 338760, + "train_speed(iter/s)": 0.123407 + }, + { + "acc": 0.75497723, + "epoch": 1.8946182227957613, + "grad_norm": 10.8125, + "learning_rate": 7.570097091933681e-08, + "loss": 0.96148853, + "memory(GiB)": 302.58, + "step": 338780, + "train_speed(iter/s)": 0.123411 + }, + { + "acc": 0.75156116, + "epoch": 1.8947300722687408, + "grad_norm": 5.90625, + "learning_rate": 7.554075501946145e-08, + "loss": 0.98359385, + "memory(GiB)": 302.58, + "step": 338800, + "train_speed(iter/s)": 0.123414 + }, + { + "acc": 0.74916058, + "epoch": 1.8948419217417198, + "grad_norm": 8.3125, + "learning_rate": 7.538070755170712e-08, + "loss": 1.00622435, + "memory(GiB)": 302.58, + "step": 338820, + "train_speed(iter/s)": 0.123417 + }, + { + "acc": 0.74595408, + "epoch": 1.8949537712146993, + "grad_norm": 5.65625, + "learning_rate": 7.522082852154777e-08, + "loss": 1.02536373, + "memory(GiB)": 302.58, + "step": 338840, + "train_speed(iter/s)": 0.123421 + }, + { + "acc": 0.77126231, + "epoch": 1.8950656206876784, + "grad_norm": 8.5625, + "learning_rate": 7.506111793445292e-08, + "loss": 0.88918495, + "memory(GiB)": 302.58, + "step": 338860, + "train_speed(iter/s)": 0.123424 + }, + { + "acc": 0.75559444, + "epoch": 1.8951774701606579, + "grad_norm": 4.3125, + "learning_rate": 7.490157579588375e-08, + "loss": 0.93864412, + "memory(GiB)": 302.58, + "step": 338880, + "train_speed(iter/s)": 0.123428 + }, + { + "acc": 0.7407423, + "epoch": 1.895289319633637, + "grad_norm": 7.4375, + "learning_rate": 7.474220211129812e-08, + "loss": 1.02665071, + "memory(GiB)": 302.58, + "step": 338900, + "train_speed(iter/s)": 0.123431 + }, + { + "acc": 0.74266624, + "epoch": 1.8954011691066164, + "grad_norm": 13.6875, + "learning_rate": 7.458299688614668e-08, + "loss": 1.03909731, + "memory(GiB)": 302.58, + "step": 338920, + "train_speed(iter/s)": 0.123435 + }, + { + "acc": 0.75424905, + "epoch": 1.8955130185795954, + "grad_norm": 6.875, + "learning_rate": 7.442396012587449e-08, + "loss": 0.9558526, + "memory(GiB)": 302.58, + "step": 338940, + "train_speed(iter/s)": 0.123438 + }, + { + "acc": 0.7536418, + "epoch": 1.895624868052575, + "grad_norm": 6.46875, + "learning_rate": 7.426509183592167e-08, + "loss": 0.97733078, + "memory(GiB)": 302.58, + "step": 338960, + "train_speed(iter/s)": 0.123442 + }, + { + "acc": 0.7639492, + "epoch": 1.895736717525554, + "grad_norm": 6.4375, + "learning_rate": 7.410639202172165e-08, + "loss": 0.91615162, + "memory(GiB)": 302.58, + "step": 338980, + "train_speed(iter/s)": 0.123445 + }, + { + "acc": 0.75493178, + "epoch": 1.8958485669985334, + "grad_norm": 6.0, + "learning_rate": 7.394786068870285e-08, + "loss": 0.96108036, + "memory(GiB)": 302.58, + "step": 339000, + "train_speed(iter/s)": 0.123448 + }, + { + "acc": 0.7377285, + "epoch": 1.8959604164715125, + "grad_norm": 4.4375, + "learning_rate": 7.378949784228706e-08, + "loss": 1.03463726, + "memory(GiB)": 302.58, + "step": 339020, + "train_speed(iter/s)": 0.123451 + }, + { + "acc": 0.73997593, + "epoch": 1.896072265944492, + "grad_norm": 7.15625, + "learning_rate": 7.363130348789105e-08, + "loss": 1.02133331, + "memory(GiB)": 302.58, + "step": 339040, + "train_speed(iter/s)": 0.123455 + }, + { + "acc": 0.75424781, + "epoch": 1.896184115417471, + "grad_norm": 7.25, + "learning_rate": 7.347327763092549e-08, + "loss": 0.96882572, + "memory(GiB)": 302.58, + "step": 339060, + "train_speed(iter/s)": 0.123458 + }, + { + "acc": 0.75440326, + "epoch": 1.8962959648904505, + "grad_norm": 7.09375, + "learning_rate": 7.33154202767955e-08, + "loss": 0.9600173, + "memory(GiB)": 302.58, + "step": 339080, + "train_speed(iter/s)": 0.123462 + }, + { + "acc": 0.74162507, + "epoch": 1.8964078143634295, + "grad_norm": 5.09375, + "learning_rate": 7.315773143090066e-08, + "loss": 1.0290801, + "memory(GiB)": 302.58, + "step": 339100, + "train_speed(iter/s)": 0.123465 + }, + { + "acc": 0.75487657, + "epoch": 1.896519663836409, + "grad_norm": 8.0625, + "learning_rate": 7.300021109863387e-08, + "loss": 0.95994711, + "memory(GiB)": 302.58, + "step": 339120, + "train_speed(iter/s)": 0.123469 + }, + { + "acc": 0.75273442, + "epoch": 1.896631513309388, + "grad_norm": 10.4375, + "learning_rate": 7.284285928538248e-08, + "loss": 0.97486858, + "memory(GiB)": 302.58, + "step": 339140, + "train_speed(iter/s)": 0.123472 + }, + { + "acc": 0.75988064, + "epoch": 1.8967433627823675, + "grad_norm": 9.25, + "learning_rate": 7.268567599652942e-08, + "loss": 0.93706264, + "memory(GiB)": 302.58, + "step": 339160, + "train_speed(iter/s)": 0.123476 + }, + { + "acc": 0.74246454, + "epoch": 1.8968552122553466, + "grad_norm": 7.125, + "learning_rate": 7.252866123744983e-08, + "loss": 1.01864262, + "memory(GiB)": 302.58, + "step": 339180, + "train_speed(iter/s)": 0.123479 + }, + { + "acc": 0.7634553, + "epoch": 1.896967061728326, + "grad_norm": 8.6875, + "learning_rate": 7.237181501351498e-08, + "loss": 0.93963509, + "memory(GiB)": 302.58, + "step": 339200, + "train_speed(iter/s)": 0.123483 + }, + { + "acc": 0.74770932, + "epoch": 1.8970789112013051, + "grad_norm": 8.0, + "learning_rate": 7.22151373300889e-08, + "loss": 1.03147917, + "memory(GiB)": 302.58, + "step": 339220, + "train_speed(iter/s)": 0.123486 + }, + { + "acc": 0.74142737, + "epoch": 1.8971907606742846, + "grad_norm": 8.6875, + "learning_rate": 7.205862819253118e-08, + "loss": 1.03033438, + "memory(GiB)": 302.58, + "step": 339240, + "train_speed(iter/s)": 0.12349 + }, + { + "acc": 0.74989147, + "epoch": 1.8973026101472636, + "grad_norm": 6.625, + "learning_rate": 7.190228760619366e-08, + "loss": 0.99082336, + "memory(GiB)": 302.58, + "step": 339260, + "train_speed(iter/s)": 0.123493 + }, + { + "acc": 0.76373596, + "epoch": 1.8974144596202431, + "grad_norm": 9.25, + "learning_rate": 7.174611557642541e-08, + "loss": 0.94516449, + "memory(GiB)": 302.58, + "step": 339280, + "train_speed(iter/s)": 0.123497 + }, + { + "acc": 0.7538506, + "epoch": 1.8975263090932224, + "grad_norm": 6.8125, + "learning_rate": 7.159011210856659e-08, + "loss": 0.93130102, + "memory(GiB)": 302.58, + "step": 339300, + "train_speed(iter/s)": 0.1235 + }, + { + "acc": 0.74548445, + "epoch": 1.8976381585662017, + "grad_norm": 5.46875, + "learning_rate": 7.143427720795293e-08, + "loss": 1.00124846, + "memory(GiB)": 302.58, + "step": 339320, + "train_speed(iter/s)": 0.123504 + }, + { + "acc": 0.73592978, + "epoch": 1.897750008039181, + "grad_norm": 7.0625, + "learning_rate": 7.127861087991572e-08, + "loss": 1.03821621, + "memory(GiB)": 302.58, + "step": 339340, + "train_speed(iter/s)": 0.123507 + }, + { + "acc": 0.74883261, + "epoch": 1.8978618575121602, + "grad_norm": 7.21875, + "learning_rate": 7.112311312977849e-08, + "loss": 1.00904665, + "memory(GiB)": 302.58, + "step": 339360, + "train_speed(iter/s)": 0.123511 + }, + { + "acc": 0.74490032, + "epoch": 1.8979737069851395, + "grad_norm": 6.8125, + "learning_rate": 7.096778396285975e-08, + "loss": 1.02390795, + "memory(GiB)": 302.58, + "step": 339380, + "train_speed(iter/s)": 0.123514 + }, + { + "acc": 0.7430655, + "epoch": 1.8980855564581187, + "grad_norm": 7.59375, + "learning_rate": 7.081262338447247e-08, + "loss": 1.01455135, + "memory(GiB)": 302.58, + "step": 339400, + "train_speed(iter/s)": 0.123518 + }, + { + "acc": 0.75878196, + "epoch": 1.898197405931098, + "grad_norm": 4.84375, + "learning_rate": 7.065763139992354e-08, + "loss": 0.967204, + "memory(GiB)": 302.58, + "step": 339420, + "train_speed(iter/s)": 0.123521 + }, + { + "acc": 0.75290384, + "epoch": 1.8983092554040772, + "grad_norm": 5.3125, + "learning_rate": 7.05028080145137e-08, + "loss": 0.98645983, + "memory(GiB)": 302.58, + "step": 339440, + "train_speed(iter/s)": 0.123525 + }, + { + "acc": 0.75522814, + "epoch": 1.8984211048770565, + "grad_norm": 8.1875, + "learning_rate": 7.034815323353927e-08, + "loss": 0.96631441, + "memory(GiB)": 302.58, + "step": 339460, + "train_speed(iter/s)": 0.123528 + }, + { + "acc": 0.74417334, + "epoch": 1.8985329543500358, + "grad_norm": 9.0625, + "learning_rate": 7.019366706228936e-08, + "loss": 1.03536282, + "memory(GiB)": 302.58, + "step": 339480, + "train_speed(iter/s)": 0.123532 + }, + { + "acc": 0.75355272, + "epoch": 1.898644803823015, + "grad_norm": 8.125, + "learning_rate": 7.003934950604807e-08, + "loss": 0.96197252, + "memory(GiB)": 302.58, + "step": 339500, + "train_speed(iter/s)": 0.123535 + }, + { + "acc": 0.7555048, + "epoch": 1.8987566532959943, + "grad_norm": 5.1875, + "learning_rate": 6.98852005700934e-08, + "loss": 0.95074759, + "memory(GiB)": 302.58, + "step": 339520, + "train_speed(iter/s)": 0.123539 + }, + { + "acc": 0.74954991, + "epoch": 1.8988685027689736, + "grad_norm": 8.375, + "learning_rate": 6.973122025969836e-08, + "loss": 0.96895876, + "memory(GiB)": 302.58, + "step": 339540, + "train_speed(iter/s)": 0.123542 + }, + { + "acc": 0.76157565, + "epoch": 1.8989803522419528, + "grad_norm": 8.5625, + "learning_rate": 6.957740858012874e-08, + "loss": 0.93707323, + "memory(GiB)": 302.58, + "step": 339560, + "train_speed(iter/s)": 0.123545 + }, + { + "acc": 0.78002996, + "epoch": 1.899092201714932, + "grad_norm": 8.5, + "learning_rate": 6.942376553664532e-08, + "loss": 0.8555563, + "memory(GiB)": 302.58, + "step": 339580, + "train_speed(iter/s)": 0.123549 + }, + { + "acc": 0.75022664, + "epoch": 1.8992040511879114, + "grad_norm": 8.5625, + "learning_rate": 6.927029113450335e-08, + "loss": 0.97331161, + "memory(GiB)": 302.58, + "step": 339600, + "train_speed(iter/s)": 0.123552 + }, + { + "acc": 0.72469196, + "epoch": 1.8993159006608906, + "grad_norm": 6.0, + "learning_rate": 6.911698537895306e-08, + "loss": 1.08535738, + "memory(GiB)": 302.58, + "step": 339620, + "train_speed(iter/s)": 0.123556 + }, + { + "acc": 0.742202, + "epoch": 1.8994277501338699, + "grad_norm": 6.78125, + "learning_rate": 6.896384827523695e-08, + "loss": 1.01876392, + "memory(GiB)": 302.58, + "step": 339640, + "train_speed(iter/s)": 0.123559 + }, + { + "acc": 0.74384379, + "epoch": 1.8995395996068491, + "grad_norm": 5.8125, + "learning_rate": 6.881087982859303e-08, + "loss": 1.02497902, + "memory(GiB)": 302.58, + "step": 339660, + "train_speed(iter/s)": 0.123562 + }, + { + "acc": 0.76381483, + "epoch": 1.8996514490798284, + "grad_norm": 6.3125, + "learning_rate": 6.865808004425378e-08, + "loss": 0.93361483, + "memory(GiB)": 302.58, + "step": 339680, + "train_speed(iter/s)": 0.123566 + }, + { + "acc": 0.74317389, + "epoch": 1.8997632985528077, + "grad_norm": 6.9375, + "learning_rate": 6.850544892744504e-08, + "loss": 1.01349716, + "memory(GiB)": 302.58, + "step": 339700, + "train_speed(iter/s)": 0.123569 + }, + { + "acc": 0.74564729, + "epoch": 1.899875148025787, + "grad_norm": 5.1875, + "learning_rate": 6.835298648338706e-08, + "loss": 0.99566593, + "memory(GiB)": 302.58, + "step": 339720, + "train_speed(iter/s)": 0.123573 + }, + { + "acc": 0.75855374, + "epoch": 1.8999869974987662, + "grad_norm": 6.15625, + "learning_rate": 6.820069271729458e-08, + "loss": 0.95894051, + "memory(GiB)": 302.58, + "step": 339740, + "train_speed(iter/s)": 0.123576 + }, + { + "acc": 0.74646173, + "epoch": 1.9000988469717455, + "grad_norm": 9.0, + "learning_rate": 6.804856763437673e-08, + "loss": 0.99525623, + "memory(GiB)": 302.58, + "step": 339760, + "train_speed(iter/s)": 0.123579 + }, + { + "acc": 0.75142345, + "epoch": 1.9002106964447247, + "grad_norm": 4.28125, + "learning_rate": 6.789661123983715e-08, + "loss": 0.96430826, + "memory(GiB)": 302.58, + "step": 339780, + "train_speed(iter/s)": 0.123583 + }, + { + "acc": 0.75463042, + "epoch": 1.900322545917704, + "grad_norm": 7.21875, + "learning_rate": 6.774482353887224e-08, + "loss": 0.95534639, + "memory(GiB)": 302.58, + "step": 339800, + "train_speed(iter/s)": 0.123586 + }, + { + "acc": 0.74432435, + "epoch": 1.9004343953906833, + "grad_norm": 8.875, + "learning_rate": 6.75932045366745e-08, + "loss": 1.01599293, + "memory(GiB)": 302.58, + "step": 339820, + "train_speed(iter/s)": 0.12359 + }, + { + "acc": 0.7561831, + "epoch": 1.9005462448636625, + "grad_norm": 4.90625, + "learning_rate": 6.744175423842925e-08, + "loss": 0.93734684, + "memory(GiB)": 302.58, + "step": 339840, + "train_speed(iter/s)": 0.123593 + }, + { + "acc": 0.74143925, + "epoch": 1.9006580943366418, + "grad_norm": 5.65625, + "learning_rate": 6.729047264931676e-08, + "loss": 1.03352661, + "memory(GiB)": 302.58, + "step": 339860, + "train_speed(iter/s)": 0.123597 + }, + { + "acc": 0.75111098, + "epoch": 1.900769943809621, + "grad_norm": 7.3125, + "learning_rate": 6.713935977451125e-08, + "loss": 0.97334518, + "memory(GiB)": 302.58, + "step": 339880, + "train_speed(iter/s)": 0.1236 + }, + { + "acc": 0.75834093, + "epoch": 1.9008817932826003, + "grad_norm": 6.21875, + "learning_rate": 6.698841561918135e-08, + "loss": 0.93339319, + "memory(GiB)": 302.58, + "step": 339900, + "train_speed(iter/s)": 0.123603 + }, + { + "acc": 0.74862671, + "epoch": 1.9009936427555796, + "grad_norm": 9.1875, + "learning_rate": 6.683764018848903e-08, + "loss": 0.99921198, + "memory(GiB)": 302.58, + "step": 339920, + "train_speed(iter/s)": 0.123607 + }, + { + "acc": 0.75210438, + "epoch": 1.9011054922285588, + "grad_norm": 8.5, + "learning_rate": 6.668703348759298e-08, + "loss": 0.94574347, + "memory(GiB)": 302.58, + "step": 339940, + "train_speed(iter/s)": 0.12361 + }, + { + "acc": 0.76024656, + "epoch": 1.901217341701538, + "grad_norm": 6.5625, + "learning_rate": 6.653659552164349e-08, + "loss": 0.93526134, + "memory(GiB)": 302.58, + "step": 339960, + "train_speed(iter/s)": 0.123614 + }, + { + "acc": 0.75242267, + "epoch": 1.9013291911745174, + "grad_norm": 5.65625, + "learning_rate": 6.638632629578589e-08, + "loss": 0.98027, + "memory(GiB)": 302.58, + "step": 339980, + "train_speed(iter/s)": 0.123618 + }, + { + "acc": 0.74989419, + "epoch": 1.9014410406474966, + "grad_norm": 5.34375, + "learning_rate": 6.623622581515998e-08, + "loss": 0.96881065, + "memory(GiB)": 302.58, + "step": 340000, + "train_speed(iter/s)": 0.123621 + }, + { + "epoch": 1.9014410406474966, + "eval_acc": 0.7069193166400612, + "eval_loss": 1.0117732286453247, + "eval_runtime": 7536.6903, + "eval_samples_per_second": 9.989, + "eval_steps_per_second": 9.989, + "step": 340000 + }, + { + "acc": 0.75162153, + "epoch": 1.901552890120476, + "grad_norm": 8.6875, + "learning_rate": 6.608629408489942e-08, + "loss": 0.95871029, + "memory(GiB)": 302.58, + "step": 340020, + "train_speed(iter/s)": 0.123281 + }, + { + "acc": 0.74566479, + "epoch": 1.9016647395934552, + "grad_norm": 6.28125, + "learning_rate": 6.593653111013287e-08, + "loss": 1.00094452, + "memory(GiB)": 302.58, + "step": 340040, + "train_speed(iter/s)": 0.123284 + }, + { + "acc": 0.74026995, + "epoch": 1.9017765890664344, + "grad_norm": 6.53125, + "learning_rate": 6.578693689598237e-08, + "loss": 1.02391644, + "memory(GiB)": 302.58, + "step": 340060, + "train_speed(iter/s)": 0.123287 + }, + { + "acc": 0.7513207, + "epoch": 1.9018884385394137, + "grad_norm": 10.625, + "learning_rate": 6.563751144756491e-08, + "loss": 0.98569059, + "memory(GiB)": 302.58, + "step": 340080, + "train_speed(iter/s)": 0.123291 + }, + { + "acc": 0.72403026, + "epoch": 1.902000288012393, + "grad_norm": 7.40625, + "learning_rate": 6.548825476999032e-08, + "loss": 1.11635303, + "memory(GiB)": 302.58, + "step": 340100, + "train_speed(iter/s)": 0.123294 + }, + { + "acc": 0.73320427, + "epoch": 1.9021121374853722, + "grad_norm": 5.78125, + "learning_rate": 6.533916686836506e-08, + "loss": 1.06202269, + "memory(GiB)": 302.58, + "step": 340120, + "train_speed(iter/s)": 0.123298 + }, + { + "acc": 0.74926291, + "epoch": 1.9022239869583515, + "grad_norm": 5.1875, + "learning_rate": 6.519024774778726e-08, + "loss": 1.00467396, + "memory(GiB)": 302.58, + "step": 340140, + "train_speed(iter/s)": 0.123301 + }, + { + "acc": 0.75339689, + "epoch": 1.9023358364313308, + "grad_norm": 7.40625, + "learning_rate": 6.50414974133512e-08, + "loss": 0.97496338, + "memory(GiB)": 302.58, + "step": 340160, + "train_speed(iter/s)": 0.123305 + }, + { + "acc": 0.73632121, + "epoch": 1.90244768590431, + "grad_norm": 7.5625, + "learning_rate": 6.48929158701439e-08, + "loss": 1.06317444, + "memory(GiB)": 302.58, + "step": 340180, + "train_speed(iter/s)": 0.123308 + }, + { + "acc": 0.74235783, + "epoch": 1.9025595353772893, + "grad_norm": 7.40625, + "learning_rate": 6.474450312324742e-08, + "loss": 1.0155467, + "memory(GiB)": 302.58, + "step": 340200, + "train_speed(iter/s)": 0.123312 + }, + { + "acc": 0.75298042, + "epoch": 1.9026713848502685, + "grad_norm": 8.0625, + "learning_rate": 6.459625917773882e-08, + "loss": 0.96217117, + "memory(GiB)": 302.58, + "step": 340220, + "train_speed(iter/s)": 0.123315 + }, + { + "acc": 0.75290194, + "epoch": 1.9027832343232478, + "grad_norm": 7.0625, + "learning_rate": 6.444818403868736e-08, + "loss": 0.97212782, + "memory(GiB)": 302.58, + "step": 340240, + "train_speed(iter/s)": 0.123319 + }, + { + "acc": 0.77129526, + "epoch": 1.902895083796227, + "grad_norm": 6.78125, + "learning_rate": 6.430027771115843e-08, + "loss": 0.88599968, + "memory(GiB)": 302.58, + "step": 340260, + "train_speed(iter/s)": 0.123322 + }, + { + "acc": 0.7663228, + "epoch": 1.9030069332692063, + "grad_norm": 4.3125, + "learning_rate": 6.415254020021077e-08, + "loss": 0.91978674, + "memory(GiB)": 302.58, + "step": 340280, + "train_speed(iter/s)": 0.123326 + }, + { + "acc": 0.7434433, + "epoch": 1.9031187827421856, + "grad_norm": 5.8125, + "learning_rate": 6.400497151089757e-08, + "loss": 1.01084394, + "memory(GiB)": 302.58, + "step": 340300, + "train_speed(iter/s)": 0.123329 + }, + { + "acc": 0.73960428, + "epoch": 1.9032306322151649, + "grad_norm": 6.6875, + "learning_rate": 6.385757164826589e-08, + "loss": 1.02406998, + "memory(GiB)": 302.58, + "step": 340320, + "train_speed(iter/s)": 0.123332 + }, + { + "acc": 0.75486383, + "epoch": 1.9033424816881441, + "grad_norm": 7.03125, + "learning_rate": 6.371034061735726e-08, + "loss": 0.94762516, + "memory(GiB)": 302.58, + "step": 340340, + "train_speed(iter/s)": 0.123336 + }, + { + "acc": 0.75725965, + "epoch": 1.9034543311611234, + "grad_norm": 5.84375, + "learning_rate": 6.356327842320708e-08, + "loss": 0.95216427, + "memory(GiB)": 302.58, + "step": 340360, + "train_speed(iter/s)": 0.123339 + }, + { + "acc": 0.74566655, + "epoch": 1.9035661806341027, + "grad_norm": 12.6875, + "learning_rate": 6.341638507084635e-08, + "loss": 1.00276775, + "memory(GiB)": 302.58, + "step": 340380, + "train_speed(iter/s)": 0.123342 + }, + { + "acc": 0.72891574, + "epoch": 1.903678030107082, + "grad_norm": 6.75, + "learning_rate": 6.326966056529882e-08, + "loss": 1.06842451, + "memory(GiB)": 302.58, + "step": 340400, + "train_speed(iter/s)": 0.123346 + }, + { + "acc": 0.7464479, + "epoch": 1.9037898795800612, + "grad_norm": 8.25, + "learning_rate": 6.312310491158269e-08, + "loss": 1.00048361, + "memory(GiB)": 302.58, + "step": 340420, + "train_speed(iter/s)": 0.123349 + }, + { + "acc": 0.74120808, + "epoch": 1.9039017290530404, + "grad_norm": 9.8125, + "learning_rate": 6.297671811471062e-08, + "loss": 1.03042059, + "memory(GiB)": 302.58, + "step": 340440, + "train_speed(iter/s)": 0.123353 + }, + { + "acc": 0.76311955, + "epoch": 1.9040135785260197, + "grad_norm": 9.3125, + "learning_rate": 6.283050017968973e-08, + "loss": 0.92322636, + "memory(GiB)": 302.58, + "step": 340460, + "train_speed(iter/s)": 0.123356 + }, + { + "acc": 0.75384278, + "epoch": 1.904125427998999, + "grad_norm": 13.875, + "learning_rate": 6.268445111152044e-08, + "loss": 0.96325531, + "memory(GiB)": 302.58, + "step": 340480, + "train_speed(iter/s)": 0.12336 + }, + { + "acc": 0.74394879, + "epoch": 1.9042372774719782, + "grad_norm": 6.09375, + "learning_rate": 6.253857091519989e-08, + "loss": 0.99116678, + "memory(GiB)": 302.58, + "step": 340500, + "train_speed(iter/s)": 0.123363 + }, + { + "acc": 0.76139307, + "epoch": 1.9043491269449575, + "grad_norm": 7.40625, + "learning_rate": 6.239285959571573e-08, + "loss": 0.96191044, + "memory(GiB)": 302.58, + "step": 340520, + "train_speed(iter/s)": 0.123366 + }, + { + "acc": 0.75469217, + "epoch": 1.9044609764179368, + "grad_norm": 6.71875, + "learning_rate": 6.224731715805288e-08, + "loss": 0.96253233, + "memory(GiB)": 302.58, + "step": 340540, + "train_speed(iter/s)": 0.123369 + }, + { + "acc": 0.75676122, + "epoch": 1.904572825890916, + "grad_norm": 8.0, + "learning_rate": 6.210194360718902e-08, + "loss": 0.95992899, + "memory(GiB)": 302.58, + "step": 340560, + "train_speed(iter/s)": 0.123373 + }, + { + "acc": 0.76932316, + "epoch": 1.9046846753638953, + "grad_norm": 10.3125, + "learning_rate": 6.195673894809572e-08, + "loss": 0.89676065, + "memory(GiB)": 302.58, + "step": 340580, + "train_speed(iter/s)": 0.123376 + }, + { + "acc": 0.75219879, + "epoch": 1.9047965248368746, + "grad_norm": 8.0625, + "learning_rate": 6.18117031857407e-08, + "loss": 0.97823143, + "memory(GiB)": 302.58, + "step": 340600, + "train_speed(iter/s)": 0.12338 + }, + { + "acc": 0.73606224, + "epoch": 1.9049083743098538, + "grad_norm": 5.75, + "learning_rate": 6.166683632508385e-08, + "loss": 1.04815617, + "memory(GiB)": 302.58, + "step": 340620, + "train_speed(iter/s)": 0.123383 + }, + { + "acc": 0.75046568, + "epoch": 1.905020223782833, + "grad_norm": 6.625, + "learning_rate": 6.152213837108012e-08, + "loss": 0.9810482, + "memory(GiB)": 302.58, + "step": 340640, + "train_speed(iter/s)": 0.123386 + }, + { + "acc": 0.75375428, + "epoch": 1.9051320732558124, + "grad_norm": 10.8125, + "learning_rate": 6.137760932867887e-08, + "loss": 0.94489241, + "memory(GiB)": 302.58, + "step": 340660, + "train_speed(iter/s)": 0.123389 + }, + { + "acc": 0.74408994, + "epoch": 1.9052439227287916, + "grad_norm": 6.03125, + "learning_rate": 6.123324920282337e-08, + "loss": 1.01813211, + "memory(GiB)": 302.58, + "step": 340680, + "train_speed(iter/s)": 0.123393 + }, + { + "acc": 0.73850222, + "epoch": 1.9053557722017709, + "grad_norm": 9.25, + "learning_rate": 6.108905799845077e-08, + "loss": 1.02595758, + "memory(GiB)": 302.58, + "step": 340700, + "train_speed(iter/s)": 0.123397 + }, + { + "acc": 0.74731255, + "epoch": 1.9054676216747501, + "grad_norm": 5.875, + "learning_rate": 6.094503572049381e-08, + "loss": 1.01094894, + "memory(GiB)": 302.58, + "step": 340720, + "train_speed(iter/s)": 0.1234 + }, + { + "acc": 0.72770872, + "epoch": 1.9055794711477294, + "grad_norm": 5.875, + "learning_rate": 6.080118237387745e-08, + "loss": 1.1095314, + "memory(GiB)": 302.58, + "step": 340740, + "train_speed(iter/s)": 0.123404 + }, + { + "acc": 0.76112709, + "epoch": 1.9056913206207087, + "grad_norm": 8.8125, + "learning_rate": 6.065749796352216e-08, + "loss": 0.91905422, + "memory(GiB)": 302.58, + "step": 340760, + "train_speed(iter/s)": 0.123407 + }, + { + "acc": 0.76335454, + "epoch": 1.905803170093688, + "grad_norm": 6.4375, + "learning_rate": 6.051398249434349e-08, + "loss": 0.9096427, + "memory(GiB)": 302.58, + "step": 340780, + "train_speed(iter/s)": 0.123411 + }, + { + "acc": 0.75553217, + "epoch": 1.9059150195666672, + "grad_norm": 7.25, + "learning_rate": 6.037063597124859e-08, + "loss": 0.97444181, + "memory(GiB)": 302.58, + "step": 340800, + "train_speed(iter/s)": 0.123414 + }, + { + "acc": 0.76673441, + "epoch": 1.9060268690396465, + "grad_norm": 4.9375, + "learning_rate": 6.02274583991419e-08, + "loss": 0.92386322, + "memory(GiB)": 302.58, + "step": 340820, + "train_speed(iter/s)": 0.123417 + }, + { + "acc": 0.77006855, + "epoch": 1.9061387185126257, + "grad_norm": 7.28125, + "learning_rate": 6.008444978291894e-08, + "loss": 0.90741854, + "memory(GiB)": 302.58, + "step": 340840, + "train_speed(iter/s)": 0.123421 + }, + { + "acc": 0.74451094, + "epoch": 1.906250567985605, + "grad_norm": 7.5, + "learning_rate": 5.994161012747246e-08, + "loss": 1.00497313, + "memory(GiB)": 302.58, + "step": 340860, + "train_speed(iter/s)": 0.123424 + }, + { + "acc": 0.75534534, + "epoch": 1.9063624174585843, + "grad_norm": 7.90625, + "learning_rate": 5.97989394376869e-08, + "loss": 0.96454401, + "memory(GiB)": 302.58, + "step": 340880, + "train_speed(iter/s)": 0.123428 + }, + { + "acc": 0.75583682, + "epoch": 1.9064742669315635, + "grad_norm": 7.34375, + "learning_rate": 5.965643771844276e-08, + "loss": 0.95060539, + "memory(GiB)": 302.58, + "step": 340900, + "train_speed(iter/s)": 0.123431 + }, + { + "acc": 0.77098517, + "epoch": 1.9065861164045428, + "grad_norm": 5.46875, + "learning_rate": 5.951410497461396e-08, + "loss": 0.88022366, + "memory(GiB)": 302.58, + "step": 340920, + "train_speed(iter/s)": 0.123435 + }, + { + "acc": 0.75011187, + "epoch": 1.906697965877522, + "grad_norm": 6.65625, + "learning_rate": 5.937194121106882e-08, + "loss": 0.96783266, + "memory(GiB)": 302.58, + "step": 340940, + "train_speed(iter/s)": 0.123438 + }, + { + "acc": 0.75684695, + "epoch": 1.9068098153505013, + "grad_norm": 9.25, + "learning_rate": 5.922994643266899e-08, + "loss": 0.95518103, + "memory(GiB)": 302.58, + "step": 340960, + "train_speed(iter/s)": 0.123442 + }, + { + "acc": 0.76735358, + "epoch": 1.9069216648234806, + "grad_norm": 9.25, + "learning_rate": 5.9088120644272276e-08, + "loss": 0.90193958, + "memory(GiB)": 302.58, + "step": 340980, + "train_speed(iter/s)": 0.123445 + }, + { + "acc": 0.74322495, + "epoch": 1.9070335142964598, + "grad_norm": 6.125, + "learning_rate": 5.8946463850729216e-08, + "loss": 1.02520361, + "memory(GiB)": 302.58, + "step": 341000, + "train_speed(iter/s)": 0.123448 + }, + { + "acc": 0.75967536, + "epoch": 1.907145363769439, + "grad_norm": 8.5625, + "learning_rate": 5.880497605688374e-08, + "loss": 0.94563026, + "memory(GiB)": 302.58, + "step": 341020, + "train_speed(iter/s)": 0.123452 + }, + { + "acc": 0.74552689, + "epoch": 1.9072572132424184, + "grad_norm": 8.0, + "learning_rate": 5.866365726757695e-08, + "loss": 0.97837563, + "memory(GiB)": 302.58, + "step": 341040, + "train_speed(iter/s)": 0.123455 + }, + { + "acc": 0.74868817, + "epoch": 1.9073690627153976, + "grad_norm": 5.625, + "learning_rate": 5.852250748764166e-08, + "loss": 0.98342113, + "memory(GiB)": 302.58, + "step": 341060, + "train_speed(iter/s)": 0.123459 + }, + { + "acc": 0.74219985, + "epoch": 1.907480912188377, + "grad_norm": 8.1875, + "learning_rate": 5.838152672190511e-08, + "loss": 1.01006136, + "memory(GiB)": 302.58, + "step": 341080, + "train_speed(iter/s)": 0.123462 + }, + { + "acc": 0.72851653, + "epoch": 1.9075927616613562, + "grad_norm": 7.84375, + "learning_rate": 5.8240714975190124e-08, + "loss": 1.10826435, + "memory(GiB)": 302.58, + "step": 341100, + "train_speed(iter/s)": 0.123465 + }, + { + "acc": 0.75228024, + "epoch": 1.9077046111343354, + "grad_norm": 5.6875, + "learning_rate": 5.8100072252312845e-08, + "loss": 0.96485195, + "memory(GiB)": 302.58, + "step": 341120, + "train_speed(iter/s)": 0.123469 + }, + { + "acc": 0.75200429, + "epoch": 1.9078164606073147, + "grad_norm": 6.75, + "learning_rate": 5.7959598558082753e-08, + "loss": 0.96919689, + "memory(GiB)": 302.58, + "step": 341140, + "train_speed(iter/s)": 0.123472 + }, + { + "acc": 0.76166077, + "epoch": 1.907928310080294, + "grad_norm": 4.8125, + "learning_rate": 5.781929389730601e-08, + "loss": 0.91196966, + "memory(GiB)": 302.58, + "step": 341160, + "train_speed(iter/s)": 0.123476 + }, + { + "acc": 0.75590272, + "epoch": 1.9080401595532732, + "grad_norm": 8.375, + "learning_rate": 5.767915827477988e-08, + "loss": 0.96098995, + "memory(GiB)": 302.58, + "step": 341180, + "train_speed(iter/s)": 0.123479 + }, + { + "acc": 0.76449728, + "epoch": 1.9081520090262525, + "grad_norm": 9.75, + "learning_rate": 5.753919169529887e-08, + "loss": 0.89619741, + "memory(GiB)": 302.58, + "step": 341200, + "train_speed(iter/s)": 0.123483 + }, + { + "acc": 0.77438736, + "epoch": 1.9082638584992317, + "grad_norm": 5.53125, + "learning_rate": 5.7399394163649144e-08, + "loss": 0.88047743, + "memory(GiB)": 302.58, + "step": 341220, + "train_speed(iter/s)": 0.123486 + }, + { + "acc": 0.75083432, + "epoch": 1.908375707972211, + "grad_norm": 7.5, + "learning_rate": 5.7259765684613e-08, + "loss": 0.98875904, + "memory(GiB)": 302.58, + "step": 341240, + "train_speed(iter/s)": 0.123489 + }, + { + "acc": 0.74460759, + "epoch": 1.9084875574451903, + "grad_norm": 6.5, + "learning_rate": 5.712030626296605e-08, + "loss": 0.995924, + "memory(GiB)": 302.58, + "step": 341260, + "train_speed(iter/s)": 0.123492 + }, + { + "acc": 0.76695719, + "epoch": 1.9085994069181695, + "grad_norm": 7.0625, + "learning_rate": 5.698101590347782e-08, + "loss": 0.92347507, + "memory(GiB)": 302.58, + "step": 341280, + "train_speed(iter/s)": 0.123496 + }, + { + "acc": 0.77132959, + "epoch": 1.9087112563911488, + "grad_norm": 6.15625, + "learning_rate": 5.684189461091283e-08, + "loss": 0.90187197, + "memory(GiB)": 302.58, + "step": 341300, + "train_speed(iter/s)": 0.123499 + }, + { + "acc": 0.75088806, + "epoch": 1.908823105864128, + "grad_norm": 7.65625, + "learning_rate": 5.6702942390028936e-08, + "loss": 0.96905642, + "memory(GiB)": 302.58, + "step": 341320, + "train_speed(iter/s)": 0.123502 + }, + { + "acc": 0.74901361, + "epoch": 1.9089349553371073, + "grad_norm": 6.625, + "learning_rate": 5.6564159245579565e-08, + "loss": 1.00932999, + "memory(GiB)": 302.58, + "step": 341340, + "train_speed(iter/s)": 0.123506 + }, + { + "acc": 0.76300387, + "epoch": 1.9090468048100866, + "grad_norm": 6.71875, + "learning_rate": 5.6425545182310914e-08, + "loss": 0.92142267, + "memory(GiB)": 302.58, + "step": 341360, + "train_speed(iter/s)": 0.123509 + }, + { + "acc": 0.74813261, + "epoch": 1.9091586542830659, + "grad_norm": 7.0625, + "learning_rate": 5.62871002049642e-08, + "loss": 0.99369478, + "memory(GiB)": 302.58, + "step": 341380, + "train_speed(iter/s)": 0.123512 + }, + { + "acc": 0.74748507, + "epoch": 1.9092705037560451, + "grad_norm": 6.34375, + "learning_rate": 5.6148824318275064e-08, + "loss": 0.99373426, + "memory(GiB)": 302.58, + "step": 341400, + "train_speed(iter/s)": 0.123516 + }, + { + "acc": 0.75388083, + "epoch": 1.9093823532290244, + "grad_norm": 6.78125, + "learning_rate": 5.601071752697307e-08, + "loss": 0.95675182, + "memory(GiB)": 302.58, + "step": 341420, + "train_speed(iter/s)": 0.123519 + }, + { + "acc": 0.75318332, + "epoch": 1.9094942027020037, + "grad_norm": 6.5, + "learning_rate": 5.5872779835780544e-08, + "loss": 0.98538179, + "memory(GiB)": 302.58, + "step": 341440, + "train_speed(iter/s)": 0.123522 + }, + { + "acc": 0.75158205, + "epoch": 1.909606052174983, + "grad_norm": 8.375, + "learning_rate": 5.5735011249417046e-08, + "loss": 0.9886055, + "memory(GiB)": 302.58, + "step": 341460, + "train_speed(iter/s)": 0.123526 + }, + { + "acc": 0.74721346, + "epoch": 1.9097179016479622, + "grad_norm": 6.34375, + "learning_rate": 5.559741177259381e-08, + "loss": 0.98507576, + "memory(GiB)": 302.58, + "step": 341480, + "train_speed(iter/s)": 0.123529 + }, + { + "acc": 0.76144381, + "epoch": 1.9098297511209414, + "grad_norm": 7.9375, + "learning_rate": 5.5459981410017626e-08, + "loss": 0.95375891, + "memory(GiB)": 302.58, + "step": 341500, + "train_speed(iter/s)": 0.123533 + }, + { + "acc": 0.74965382, + "epoch": 1.9099416005939207, + "grad_norm": 8.5625, + "learning_rate": 5.5322720166388064e-08, + "loss": 0.97696609, + "memory(GiB)": 302.58, + "step": 341520, + "train_speed(iter/s)": 0.123536 + }, + { + "acc": 0.73113813, + "epoch": 1.9100534500669, + "grad_norm": 6.96875, + "learning_rate": 5.5185628046401374e-08, + "loss": 1.07957916, + "memory(GiB)": 302.58, + "step": 341540, + "train_speed(iter/s)": 0.123539 + }, + { + "acc": 0.73942561, + "epoch": 1.9101652995398792, + "grad_norm": 7.25, + "learning_rate": 5.504870505474547e-08, + "loss": 1.02289782, + "memory(GiB)": 302.58, + "step": 341560, + "train_speed(iter/s)": 0.123543 + }, + { + "acc": 0.75608225, + "epoch": 1.9102771490128585, + "grad_norm": 6.40625, + "learning_rate": 5.491195119610382e-08, + "loss": 0.95033751, + "memory(GiB)": 302.58, + "step": 341580, + "train_speed(iter/s)": 0.123546 + }, + { + "acc": 0.7434835, + "epoch": 1.9103889984858378, + "grad_norm": 9.1875, + "learning_rate": 5.47753664751538e-08, + "loss": 1.03127146, + "memory(GiB)": 302.58, + "step": 341600, + "train_speed(iter/s)": 0.12355 + }, + { + "acc": 0.76106386, + "epoch": 1.910500847958817, + "grad_norm": 7.875, + "learning_rate": 5.4638950896567237e-08, + "loss": 0.91525812, + "memory(GiB)": 302.58, + "step": 341620, + "train_speed(iter/s)": 0.123553 + }, + { + "acc": 0.75936961, + "epoch": 1.9106126974317963, + "grad_norm": 8.1875, + "learning_rate": 5.4502704465009825e-08, + "loss": 0.93037634, + "memory(GiB)": 302.58, + "step": 341640, + "train_speed(iter/s)": 0.123556 + }, + { + "acc": 0.73121548, + "epoch": 1.9107245469047756, + "grad_norm": 6.28125, + "learning_rate": 5.436662718514174e-08, + "loss": 1.06915035, + "memory(GiB)": 302.58, + "step": 341660, + "train_speed(iter/s)": 0.12356 + }, + { + "acc": 0.75269165, + "epoch": 1.9108363963777548, + "grad_norm": 11.375, + "learning_rate": 5.423071906161759e-08, + "loss": 0.97409449, + "memory(GiB)": 302.58, + "step": 341680, + "train_speed(iter/s)": 0.123563 + }, + { + "acc": 0.74319668, + "epoch": 1.910948245850734, + "grad_norm": 9.0, + "learning_rate": 5.409498009908476e-08, + "loss": 1.01273375, + "memory(GiB)": 302.58, + "step": 341700, + "train_speed(iter/s)": 0.123567 + }, + { + "acc": 0.76473002, + "epoch": 1.9110600953237133, + "grad_norm": 7.28125, + "learning_rate": 5.3959410302186765e-08, + "loss": 0.91572304, + "memory(GiB)": 302.58, + "step": 341720, + "train_speed(iter/s)": 0.12357 + }, + { + "acc": 0.74223876, + "epoch": 1.9111719447966926, + "grad_norm": 5.125, + "learning_rate": 5.3824009675561006e-08, + "loss": 1.01928082, + "memory(GiB)": 302.58, + "step": 341740, + "train_speed(iter/s)": 0.123573 + }, + { + "acc": 0.73949685, + "epoch": 1.9112837942696719, + "grad_norm": 7.5625, + "learning_rate": 5.368877822383767e-08, + "loss": 1.00568762, + "memory(GiB)": 302.58, + "step": 341760, + "train_speed(iter/s)": 0.123577 + }, + { + "acc": 0.76218638, + "epoch": 1.9113956437426511, + "grad_norm": 10.6875, + "learning_rate": 5.355371595164249e-08, + "loss": 0.93090773, + "memory(GiB)": 302.58, + "step": 341780, + "train_speed(iter/s)": 0.12358 + }, + { + "acc": 0.775313, + "epoch": 1.9115074932156304, + "grad_norm": 10.0625, + "learning_rate": 5.3418822863594565e-08, + "loss": 0.88236275, + "memory(GiB)": 302.58, + "step": 341800, + "train_speed(iter/s)": 0.123584 + }, + { + "acc": 0.7453125, + "epoch": 1.9116193426886097, + "grad_norm": 8.75, + "learning_rate": 5.328409896430853e-08, + "loss": 0.99649725, + "memory(GiB)": 302.58, + "step": 341820, + "train_speed(iter/s)": 0.123587 + }, + { + "acc": 0.75709381, + "epoch": 1.911731192161589, + "grad_norm": 6.96875, + "learning_rate": 5.3149544258391804e-08, + "loss": 0.93202944, + "memory(GiB)": 302.58, + "step": 341840, + "train_speed(iter/s)": 0.123591 + }, + { + "acc": 0.74549251, + "epoch": 1.9118430416345682, + "grad_norm": 8.0625, + "learning_rate": 5.3015158750446825e-08, + "loss": 1.02169218, + "memory(GiB)": 302.58, + "step": 341860, + "train_speed(iter/s)": 0.123594 + }, + { + "acc": 0.76432467, + "epoch": 1.9119548911075475, + "grad_norm": 5.71875, + "learning_rate": 5.288094244506936e-08, + "loss": 0.92928753, + "memory(GiB)": 302.58, + "step": 341880, + "train_speed(iter/s)": 0.123597 + }, + { + "acc": 0.7570806, + "epoch": 1.9120667405805267, + "grad_norm": 4.6875, + "learning_rate": 5.274689534685073e-08, + "loss": 0.95409632, + "memory(GiB)": 302.58, + "step": 341900, + "train_speed(iter/s)": 0.123601 + }, + { + "acc": 0.75449305, + "epoch": 1.912178590053506, + "grad_norm": 10.9375, + "learning_rate": 5.2613017460375613e-08, + "loss": 0.95966434, + "memory(GiB)": 302.58, + "step": 341920, + "train_speed(iter/s)": 0.123604 + }, + { + "acc": 0.77078996, + "epoch": 1.9122904395264853, + "grad_norm": 9.8125, + "learning_rate": 5.2479308790223674e-08, + "loss": 0.8804039, + "memory(GiB)": 302.58, + "step": 341940, + "train_speed(iter/s)": 0.123608 + }, + { + "acc": 0.74750643, + "epoch": 1.9124022889994645, + "grad_norm": 10.25, + "learning_rate": 5.23457693409668e-08, + "loss": 0.98386078, + "memory(GiB)": 302.58, + "step": 341960, + "train_speed(iter/s)": 0.123611 + }, + { + "acc": 0.76118598, + "epoch": 1.9125141384724438, + "grad_norm": 6.8125, + "learning_rate": 5.221239911717357e-08, + "loss": 0.93166685, + "memory(GiB)": 302.58, + "step": 341980, + "train_speed(iter/s)": 0.123614 + }, + { + "acc": 0.74221873, + "epoch": 1.912625987945423, + "grad_norm": 9.875, + "learning_rate": 5.207919812340534e-08, + "loss": 1.01212854, + "memory(GiB)": 302.58, + "step": 342000, + "train_speed(iter/s)": 0.123618 + }, + { + "epoch": 1.912625987945423, + "eval_acc": 0.7068905273168775, + "eval_loss": 1.0117872953414917, + "eval_runtime": 7563.3993, + "eval_samples_per_second": 9.954, + "eval_steps_per_second": 9.954, + "step": 342000 + }, + { + "acc": 0.74081755, + "epoch": 1.9127378374184023, + "grad_norm": 7.375, + "learning_rate": 5.19461663642179e-08, + "loss": 1.01375551, + "memory(GiB)": 302.58, + "step": 342020, + "train_speed(iter/s)": 0.123278 + }, + { + "acc": 0.72851315, + "epoch": 1.9128496868913816, + "grad_norm": 6.3125, + "learning_rate": 5.18133038441615e-08, + "loss": 1.06522007, + "memory(GiB)": 302.58, + "step": 342040, + "train_speed(iter/s)": 0.123282 + }, + { + "acc": 0.76172876, + "epoch": 1.9129615363643608, + "grad_norm": 7.75, + "learning_rate": 5.1680610567780286e-08, + "loss": 0.94273748, + "memory(GiB)": 302.58, + "step": 342060, + "train_speed(iter/s)": 0.123285 + }, + { + "acc": 0.7537765, + "epoch": 1.91307338583734, + "grad_norm": 6.0, + "learning_rate": 5.1548086539612855e-08, + "loss": 0.9710516, + "memory(GiB)": 302.58, + "step": 342080, + "train_speed(iter/s)": 0.123289 + }, + { + "acc": 0.7434885, + "epoch": 1.9131852353103194, + "grad_norm": 8.5625, + "learning_rate": 5.141573176419168e-08, + "loss": 1.01690378, + "memory(GiB)": 302.58, + "step": 342100, + "train_speed(iter/s)": 0.123292 + }, + { + "acc": 0.75569167, + "epoch": 1.9132970847832986, + "grad_norm": 7.53125, + "learning_rate": 5.1283546246044256e-08, + "loss": 0.93418922, + "memory(GiB)": 302.58, + "step": 342120, + "train_speed(iter/s)": 0.123295 + }, + { + "acc": 0.75168886, + "epoch": 1.913408934256278, + "grad_norm": 9.375, + "learning_rate": 5.115152998969142e-08, + "loss": 0.97741442, + "memory(GiB)": 302.58, + "step": 342140, + "train_speed(iter/s)": 0.123299 + }, + { + "acc": 0.74462018, + "epoch": 1.9135207837292572, + "grad_norm": 8.375, + "learning_rate": 5.101968299964899e-08, + "loss": 1.01961737, + "memory(GiB)": 302.58, + "step": 342160, + "train_speed(iter/s)": 0.123302 + }, + { + "acc": 0.75510612, + "epoch": 1.9136326332022364, + "grad_norm": 7.40625, + "learning_rate": 5.088800528042559e-08, + "loss": 0.95645218, + "memory(GiB)": 302.58, + "step": 342180, + "train_speed(iter/s)": 0.123305 + }, + { + "acc": 0.74720173, + "epoch": 1.9137444826752157, + "grad_norm": 9.4375, + "learning_rate": 5.075649683652595e-08, + "loss": 0.97928867, + "memory(GiB)": 302.58, + "step": 342200, + "train_speed(iter/s)": 0.123309 + }, + { + "acc": 0.72969317, + "epoch": 1.913856332148195, + "grad_norm": 6.90625, + "learning_rate": 5.062515767244813e-08, + "loss": 1.03962526, + "memory(GiB)": 302.58, + "step": 342220, + "train_speed(iter/s)": 0.123312 + }, + { + "acc": 0.75440326, + "epoch": 1.9139681816211742, + "grad_norm": 6.71875, + "learning_rate": 5.049398779268355e-08, + "loss": 0.96508055, + "memory(GiB)": 302.58, + "step": 342240, + "train_speed(iter/s)": 0.123316 + }, + { + "acc": 0.75688844, + "epoch": 1.9140800310941535, + "grad_norm": 9.0625, + "learning_rate": 5.0362987201719174e-08, + "loss": 0.95199108, + "memory(GiB)": 302.58, + "step": 342260, + "train_speed(iter/s)": 0.123319 + }, + { + "acc": 0.75190153, + "epoch": 1.9141918805671327, + "grad_norm": 8.3125, + "learning_rate": 5.0232155904035296e-08, + "loss": 0.96969585, + "memory(GiB)": 302.58, + "step": 342280, + "train_speed(iter/s)": 0.123323 + }, + { + "acc": 0.746526, + "epoch": 1.914303730040112, + "grad_norm": 5.84375, + "learning_rate": 5.010149390410723e-08, + "loss": 1.01960926, + "memory(GiB)": 302.58, + "step": 342300, + "train_speed(iter/s)": 0.123326 + }, + { + "acc": 0.7398828, + "epoch": 1.9144155795130913, + "grad_norm": 7.84375, + "learning_rate": 4.9971001206403634e-08, + "loss": 1.03310547, + "memory(GiB)": 302.58, + "step": 342320, + "train_speed(iter/s)": 0.12333 + }, + { + "acc": 0.76055255, + "epoch": 1.9145274289860705, + "grad_norm": 8.5625, + "learning_rate": 4.9840677815387594e-08, + "loss": 0.92241249, + "memory(GiB)": 302.58, + "step": 342340, + "train_speed(iter/s)": 0.123333 + }, + { + "acc": 0.75297136, + "epoch": 1.9146392784590498, + "grad_norm": 7.03125, + "learning_rate": 4.971052373551721e-08, + "loss": 0.98031263, + "memory(GiB)": 302.58, + "step": 342360, + "train_speed(iter/s)": 0.123336 + }, + { + "acc": 0.72716484, + "epoch": 1.914751127932029, + "grad_norm": 4.84375, + "learning_rate": 4.958053897124393e-08, + "loss": 1.06881342, + "memory(GiB)": 302.58, + "step": 342380, + "train_speed(iter/s)": 0.12334 + }, + { + "acc": 0.7487268, + "epoch": 1.9148629774050083, + "grad_norm": 6.125, + "learning_rate": 4.9450723527013634e-08, + "loss": 0.96816235, + "memory(GiB)": 302.58, + "step": 342400, + "train_speed(iter/s)": 0.123343 + }, + { + "acc": 0.7666872, + "epoch": 1.9149748268779876, + "grad_norm": 5.84375, + "learning_rate": 4.9321077407266104e-08, + "loss": 0.92000875, + "memory(GiB)": 302.58, + "step": 342420, + "train_speed(iter/s)": 0.123347 + }, + { + "acc": 0.7542017, + "epoch": 1.9150866763509669, + "grad_norm": 7.3125, + "learning_rate": 4.919160061643613e-08, + "loss": 0.97575302, + "memory(GiB)": 302.58, + "step": 342440, + "train_speed(iter/s)": 0.12335 + }, + { + "acc": 0.75871177, + "epoch": 1.9151985258239461, + "grad_norm": 6.65625, + "learning_rate": 4.9062293158952387e-08, + "loss": 0.94723072, + "memory(GiB)": 302.58, + "step": 342460, + "train_speed(iter/s)": 0.123353 + }, + { + "acc": 0.77001004, + "epoch": 1.9153103752969254, + "grad_norm": 7.125, + "learning_rate": 4.8933155039236345e-08, + "loss": 0.90034466, + "memory(GiB)": 302.58, + "step": 342480, + "train_speed(iter/s)": 0.123357 + }, + { + "acc": 0.73222179, + "epoch": 1.9154222247699046, + "grad_norm": 6.28125, + "learning_rate": 4.880418626170613e-08, + "loss": 1.06567564, + "memory(GiB)": 302.58, + "step": 342500, + "train_speed(iter/s)": 0.12336 + }, + { + "acc": 0.73584399, + "epoch": 1.915534074242884, + "grad_norm": 7.0625, + "learning_rate": 4.867538683077266e-08, + "loss": 1.04514732, + "memory(GiB)": 302.58, + "step": 342520, + "train_speed(iter/s)": 0.123364 + }, + { + "acc": 0.74113402, + "epoch": 1.9156459237158632, + "grad_norm": 5.46875, + "learning_rate": 4.8546756750841304e-08, + "loss": 1.01568727, + "memory(GiB)": 302.58, + "step": 342540, + "train_speed(iter/s)": 0.123367 + }, + { + "acc": 0.7533926, + "epoch": 1.9157577731888424, + "grad_norm": 6.5, + "learning_rate": 4.841829602631132e-08, + "loss": 0.97568655, + "memory(GiB)": 302.58, + "step": 342560, + "train_speed(iter/s)": 0.12337 + }, + { + "acc": 0.77170272, + "epoch": 1.9158696226618217, + "grad_norm": 4.96875, + "learning_rate": 4.829000466157696e-08, + "loss": 0.90645962, + "memory(GiB)": 302.58, + "step": 342580, + "train_speed(iter/s)": 0.123374 + }, + { + "acc": 0.74875627, + "epoch": 1.915981472134801, + "grad_norm": 8.25, + "learning_rate": 4.8161882661025286e-08, + "loss": 0.99812431, + "memory(GiB)": 302.58, + "step": 342600, + "train_speed(iter/s)": 0.123377 + }, + { + "acc": 0.73192663, + "epoch": 1.9160933216077802, + "grad_norm": 5.78125, + "learning_rate": 4.803393002903944e-08, + "loss": 1.0598279, + "memory(GiB)": 302.58, + "step": 342620, + "train_speed(iter/s)": 0.123381 + }, + { + "acc": 0.74060073, + "epoch": 1.9162051710807595, + "grad_norm": 10.625, + "learning_rate": 4.790614676999594e-08, + "loss": 1.03657427, + "memory(GiB)": 302.58, + "step": 342640, + "train_speed(iter/s)": 0.123384 + }, + { + "acc": 0.74852619, + "epoch": 1.9163170205537388, + "grad_norm": 7.3125, + "learning_rate": 4.777853288826406e-08, + "loss": 1.00330639, + "memory(GiB)": 302.58, + "step": 342660, + "train_speed(iter/s)": 0.123387 + }, + { + "acc": 0.75622282, + "epoch": 1.916428870026718, + "grad_norm": 8.5625, + "learning_rate": 4.7651088388209754e-08, + "loss": 0.95179138, + "memory(GiB)": 302.58, + "step": 342680, + "train_speed(iter/s)": 0.123391 + }, + { + "acc": 0.75737691, + "epoch": 1.9165407194996973, + "grad_norm": 6.0625, + "learning_rate": 4.752381327419176e-08, + "loss": 0.93706799, + "memory(GiB)": 302.58, + "step": 342700, + "train_speed(iter/s)": 0.123394 + }, + { + "acc": 0.75759592, + "epoch": 1.9166525689726766, + "grad_norm": 8.375, + "learning_rate": 4.739670755056325e-08, + "loss": 0.96336184, + "memory(GiB)": 302.58, + "step": 342720, + "train_speed(iter/s)": 0.123398 + }, + { + "acc": 0.72946572, + "epoch": 1.9167644184456558, + "grad_norm": 7.34375, + "learning_rate": 4.7269771221671866e-08, + "loss": 1.06571054, + "memory(GiB)": 302.58, + "step": 342740, + "train_speed(iter/s)": 0.123401 + }, + { + "acc": 0.74618073, + "epoch": 1.916876267918635, + "grad_norm": 9.3125, + "learning_rate": 4.7143004291858584e-08, + "loss": 1.01522169, + "memory(GiB)": 302.58, + "step": 342760, + "train_speed(iter/s)": 0.123404 + }, + { + "acc": 0.74162889, + "epoch": 1.9169881173916143, + "grad_norm": 9.125, + "learning_rate": 4.701640676545938e-08, + "loss": 1.04275789, + "memory(GiB)": 302.58, + "step": 342780, + "train_speed(iter/s)": 0.123408 + }, + { + "acc": 0.74474106, + "epoch": 1.9170999668645936, + "grad_norm": 8.375, + "learning_rate": 4.6889978646804666e-08, + "loss": 1.01012793, + "memory(GiB)": 302.58, + "step": 342800, + "train_speed(iter/s)": 0.123411 + }, + { + "acc": 0.75705032, + "epoch": 1.9172118163375729, + "grad_norm": 9.0, + "learning_rate": 4.6763719940218775e-08, + "loss": 0.98129253, + "memory(GiB)": 302.58, + "step": 342820, + "train_speed(iter/s)": 0.123414 + }, + { + "acc": 0.73739538, + "epoch": 1.9173236658105521, + "grad_norm": 7.40625, + "learning_rate": 4.663763065001992e-08, + "loss": 1.01703033, + "memory(GiB)": 302.58, + "step": 342840, + "train_speed(iter/s)": 0.123418 + }, + { + "acc": 0.75448918, + "epoch": 1.9174355152835314, + "grad_norm": 8.25, + "learning_rate": 4.65117107805213e-08, + "loss": 0.96984835, + "memory(GiB)": 302.58, + "step": 342860, + "train_speed(iter/s)": 0.123421 + }, + { + "acc": 0.75310287, + "epoch": 1.9175473647565107, + "grad_norm": 8.0, + "learning_rate": 4.638596033602838e-08, + "loss": 0.99192524, + "memory(GiB)": 302.58, + "step": 342880, + "train_speed(iter/s)": 0.123425 + }, + { + "acc": 0.7556541, + "epoch": 1.91765921422949, + "grad_norm": 8.9375, + "learning_rate": 4.6260379320843815e-08, + "loss": 0.96494026, + "memory(GiB)": 302.58, + "step": 342900, + "train_speed(iter/s)": 0.123428 + }, + { + "acc": 0.74853601, + "epoch": 1.9177710637024692, + "grad_norm": 4.625, + "learning_rate": 4.613496773926196e-08, + "loss": 1.00084639, + "memory(GiB)": 302.58, + "step": 342920, + "train_speed(iter/s)": 0.123432 + }, + { + "acc": 0.75756097, + "epoch": 1.9178829131754485, + "grad_norm": 8.9375, + "learning_rate": 4.6009725595572153e-08, + "loss": 0.95674906, + "memory(GiB)": 302.58, + "step": 342940, + "train_speed(iter/s)": 0.123435 + }, + { + "acc": 0.74089065, + "epoch": 1.9179947626484277, + "grad_norm": 6.1875, + "learning_rate": 4.5884652894058746e-08, + "loss": 1.03435173, + "memory(GiB)": 302.58, + "step": 342960, + "train_speed(iter/s)": 0.123438 + }, + { + "acc": 0.7505837, + "epoch": 1.918106612121407, + "grad_norm": 5.40625, + "learning_rate": 4.575974963899887e-08, + "loss": 0.96375942, + "memory(GiB)": 302.58, + "step": 342980, + "train_speed(iter/s)": 0.123442 + }, + { + "acc": 0.74436307, + "epoch": 1.9182184615943862, + "grad_norm": 9.875, + "learning_rate": 4.563501583466578e-08, + "loss": 1.00644693, + "memory(GiB)": 302.58, + "step": 343000, + "train_speed(iter/s)": 0.123445 + }, + { + "acc": 0.7537271, + "epoch": 1.9183303110673655, + "grad_norm": 7.75, + "learning_rate": 4.5510451485324405e-08, + "loss": 0.9588975, + "memory(GiB)": 302.58, + "step": 343020, + "train_speed(iter/s)": 0.123449 + }, + { + "acc": 0.74706082, + "epoch": 1.9184421605403448, + "grad_norm": 6.34375, + "learning_rate": 4.538605659523576e-08, + "loss": 1.01184855, + "memory(GiB)": 302.58, + "step": 343040, + "train_speed(iter/s)": 0.123452 + }, + { + "acc": 0.73502169, + "epoch": 1.918554010013324, + "grad_norm": 6.0625, + "learning_rate": 4.526183116865479e-08, + "loss": 1.04231634, + "memory(GiB)": 302.58, + "step": 343060, + "train_speed(iter/s)": 0.123456 + }, + { + "acc": 0.76159511, + "epoch": 1.9186658594863033, + "grad_norm": 5.53125, + "learning_rate": 4.513777520983031e-08, + "loss": 0.94811268, + "memory(GiB)": 302.58, + "step": 343080, + "train_speed(iter/s)": 0.123459 + }, + { + "acc": 0.75089731, + "epoch": 1.9187777089592826, + "grad_norm": 7.46875, + "learning_rate": 4.5013888723005605e-08, + "loss": 0.99210024, + "memory(GiB)": 302.58, + "step": 343100, + "train_speed(iter/s)": 0.123462 + }, + { + "acc": 0.73954873, + "epoch": 1.9188895584322618, + "grad_norm": 10.25, + "learning_rate": 4.4890171712417275e-08, + "loss": 1.01788187, + "memory(GiB)": 302.58, + "step": 343120, + "train_speed(iter/s)": 0.123465 + }, + { + "acc": 0.76455936, + "epoch": 1.919001407905241, + "grad_norm": 7.1875, + "learning_rate": 4.476662418229749e-08, + "loss": 0.9394083, + "memory(GiB)": 302.58, + "step": 343140, + "train_speed(iter/s)": 0.123469 + }, + { + "acc": 0.74906244, + "epoch": 1.9191132573782204, + "grad_norm": 7.78125, + "learning_rate": 4.464324613687232e-08, + "loss": 0.9868679, + "memory(GiB)": 302.58, + "step": 343160, + "train_speed(iter/s)": 0.123472 + }, + { + "acc": 0.74530168, + "epoch": 1.9192251068511996, + "grad_norm": 8.125, + "learning_rate": 4.452003758036061e-08, + "loss": 1.02109175, + "memory(GiB)": 302.58, + "step": 343180, + "train_speed(iter/s)": 0.123475 + }, + { + "acc": 0.75274549, + "epoch": 1.9193369563241789, + "grad_norm": 5.125, + "learning_rate": 4.439699851697676e-08, + "loss": 0.95948324, + "memory(GiB)": 302.58, + "step": 343200, + "train_speed(iter/s)": 0.123479 + }, + { + "acc": 0.76860447, + "epoch": 1.9194488057971582, + "grad_norm": 10.4375, + "learning_rate": 4.427412895092964e-08, + "loss": 0.88806963, + "memory(GiB)": 302.58, + "step": 343220, + "train_speed(iter/s)": 0.123482 + }, + { + "acc": 0.74857917, + "epoch": 1.9195606552701374, + "grad_norm": 6.90625, + "learning_rate": 4.4151428886421985e-08, + "loss": 0.96639538, + "memory(GiB)": 302.58, + "step": 343240, + "train_speed(iter/s)": 0.123486 + }, + { + "acc": 0.74114952, + "epoch": 1.9196725047431167, + "grad_norm": 7.5, + "learning_rate": 4.402889832764934e-08, + "loss": 1.03414068, + "memory(GiB)": 302.58, + "step": 343260, + "train_speed(iter/s)": 0.123489 + }, + { + "acc": 0.7572679, + "epoch": 1.919784354216096, + "grad_norm": 4.78125, + "learning_rate": 4.390653727880334e-08, + "loss": 0.96742582, + "memory(GiB)": 302.58, + "step": 343280, + "train_speed(iter/s)": 0.123493 + }, + { + "acc": 0.74542742, + "epoch": 1.9198962036890752, + "grad_norm": 6.84375, + "learning_rate": 4.378434574406953e-08, + "loss": 0.99806356, + "memory(GiB)": 302.58, + "step": 343300, + "train_speed(iter/s)": 0.123496 + }, + { + "acc": 0.74248281, + "epoch": 1.9200080531620545, + "grad_norm": 6.65625, + "learning_rate": 4.366232372762624e-08, + "loss": 1.02541695, + "memory(GiB)": 302.58, + "step": 343320, + "train_speed(iter/s)": 0.123499 + }, + { + "acc": 0.76906743, + "epoch": 1.9201199026350337, + "grad_norm": 8.3125, + "learning_rate": 4.354047123364791e-08, + "loss": 0.8949131, + "memory(GiB)": 302.58, + "step": 343340, + "train_speed(iter/s)": 0.123503 + }, + { + "acc": 0.74681411, + "epoch": 1.920231752108013, + "grad_norm": 5.5, + "learning_rate": 4.34187882663023e-08, + "loss": 0.97857141, + "memory(GiB)": 302.58, + "step": 343360, + "train_speed(iter/s)": 0.123506 + }, + { + "acc": 0.77848878, + "epoch": 1.9203436015809923, + "grad_norm": 9.8125, + "learning_rate": 4.3297274829750545e-08, + "loss": 0.8559413, + "memory(GiB)": 302.58, + "step": 343380, + "train_speed(iter/s)": 0.123509 + }, + { + "acc": 0.74779906, + "epoch": 1.9204554510539715, + "grad_norm": 4.65625, + "learning_rate": 4.3175930928149865e-08, + "loss": 0.9862134, + "memory(GiB)": 302.58, + "step": 343400, + "train_speed(iter/s)": 0.123513 + }, + { + "acc": 0.75699091, + "epoch": 1.9205673005269508, + "grad_norm": 7.84375, + "learning_rate": 4.305475656564917e-08, + "loss": 0.96281233, + "memory(GiB)": 302.58, + "step": 343420, + "train_speed(iter/s)": 0.123516 + }, + { + "acc": 0.74358015, + "epoch": 1.92067914999993, + "grad_norm": 9.625, + "learning_rate": 4.293375174639458e-08, + "loss": 1.00956984, + "memory(GiB)": 302.58, + "step": 343440, + "train_speed(iter/s)": 0.12352 + }, + { + "acc": 0.75131512, + "epoch": 1.9207909994729093, + "grad_norm": 8.3125, + "learning_rate": 4.2812916474523905e-08, + "loss": 0.99664745, + "memory(GiB)": 302.58, + "step": 343460, + "train_speed(iter/s)": 0.123523 + }, + { + "acc": 0.74675651, + "epoch": 1.9209028489458886, + "grad_norm": 7.9375, + "learning_rate": 4.269225075416994e-08, + "loss": 1.01150017, + "memory(GiB)": 302.58, + "step": 343480, + "train_speed(iter/s)": 0.123527 + }, + { + "acc": 0.74307251, + "epoch": 1.9210146984188678, + "grad_norm": 7.375, + "learning_rate": 4.25717545894605e-08, + "loss": 1.01070137, + "memory(GiB)": 302.58, + "step": 343500, + "train_speed(iter/s)": 0.12353 + }, + { + "acc": 0.75435734, + "epoch": 1.9211265478918471, + "grad_norm": 9.0625, + "learning_rate": 4.245142798451673e-08, + "loss": 0.95468321, + "memory(GiB)": 302.58, + "step": 343520, + "train_speed(iter/s)": 0.123533 + }, + { + "acc": 0.74820437, + "epoch": 1.9212383973648264, + "grad_norm": 7.34375, + "learning_rate": 4.233127094345424e-08, + "loss": 1.00163116, + "memory(GiB)": 302.58, + "step": 343540, + "train_speed(iter/s)": 0.123537 + }, + { + "acc": 0.76401925, + "epoch": 1.9213502468378056, + "grad_norm": 9.5, + "learning_rate": 4.22112834703825e-08, + "loss": 0.90068274, + "memory(GiB)": 302.58, + "step": 343560, + "train_speed(iter/s)": 0.12354 + }, + { + "acc": 0.76108518, + "epoch": 1.921462096310785, + "grad_norm": 8.125, + "learning_rate": 4.209146556940547e-08, + "loss": 0.94557085, + "memory(GiB)": 302.58, + "step": 343580, + "train_speed(iter/s)": 0.123543 + }, + { + "acc": 0.74026351, + "epoch": 1.9215739457837642, + "grad_norm": 7.90625, + "learning_rate": 4.197181724462096e-08, + "loss": 1.02057161, + "memory(GiB)": 302.58, + "step": 343600, + "train_speed(iter/s)": 0.123547 + }, + { + "acc": 0.74671149, + "epoch": 1.9216857952567434, + "grad_norm": 6.46875, + "learning_rate": 4.185233850012238e-08, + "loss": 1.00081682, + "memory(GiB)": 302.58, + "step": 343620, + "train_speed(iter/s)": 0.12355 + }, + { + "acc": 0.7475903, + "epoch": 1.9217976447297227, + "grad_norm": 5.90625, + "learning_rate": 4.1733029339995346e-08, + "loss": 1.00364103, + "memory(GiB)": 302.58, + "step": 343640, + "train_speed(iter/s)": 0.123554 + }, + { + "acc": 0.77819557, + "epoch": 1.921909494202702, + "grad_norm": 6.53125, + "learning_rate": 4.161388976832103e-08, + "loss": 0.85243444, + "memory(GiB)": 302.58, + "step": 343660, + "train_speed(iter/s)": 0.123557 + }, + { + "acc": 0.75867391, + "epoch": 1.9220213436756812, + "grad_norm": 7.125, + "learning_rate": 4.149491978917397e-08, + "loss": 0.94781876, + "memory(GiB)": 302.58, + "step": 343680, + "train_speed(iter/s)": 0.12356 + }, + { + "acc": 0.76211038, + "epoch": 1.9221331931486605, + "grad_norm": 8.3125, + "learning_rate": 4.137611940662423e-08, + "loss": 0.93906717, + "memory(GiB)": 302.58, + "step": 343700, + "train_speed(iter/s)": 0.123564 + }, + { + "acc": 0.73763328, + "epoch": 1.9222450426216398, + "grad_norm": 7.53125, + "learning_rate": 4.125748862473466e-08, + "loss": 1.04721947, + "memory(GiB)": 302.58, + "step": 343720, + "train_speed(iter/s)": 0.123567 + }, + { + "acc": 0.74039025, + "epoch": 1.922356892094619, + "grad_norm": 7.03125, + "learning_rate": 4.1139027447562595e-08, + "loss": 1.02488222, + "memory(GiB)": 302.58, + "step": 343740, + "train_speed(iter/s)": 0.123571 + }, + { + "acc": 0.73101468, + "epoch": 1.9224687415675983, + "grad_norm": 7.84375, + "learning_rate": 4.102073587915978e-08, + "loss": 1.07517166, + "memory(GiB)": 302.58, + "step": 343760, + "train_speed(iter/s)": 0.123574 + }, + { + "acc": 0.74283643, + "epoch": 1.9225805910405775, + "grad_norm": 6.84375, + "learning_rate": 4.090261392357187e-08, + "loss": 1.02273273, + "memory(GiB)": 302.58, + "step": 343780, + "train_speed(iter/s)": 0.123577 + }, + { + "acc": 0.73425412, + "epoch": 1.9226924405135568, + "grad_norm": 8.4375, + "learning_rate": 4.078466158484007e-08, + "loss": 1.04250679, + "memory(GiB)": 302.58, + "step": 343800, + "train_speed(iter/s)": 0.123581 + }, + { + "acc": 0.74294825, + "epoch": 1.922804289986536, + "grad_norm": 4.75, + "learning_rate": 4.066687886699783e-08, + "loss": 1.02292557, + "memory(GiB)": 302.58, + "step": 343820, + "train_speed(iter/s)": 0.123584 + }, + { + "acc": 0.76154027, + "epoch": 1.9229161394595153, + "grad_norm": 10.25, + "learning_rate": 4.0549265774074145e-08, + "loss": 0.94694233, + "memory(GiB)": 302.58, + "step": 343840, + "train_speed(iter/s)": 0.123588 + }, + { + "acc": 0.74187708, + "epoch": 1.9230279889324946, + "grad_norm": 6.3125, + "learning_rate": 4.043182231009135e-08, + "loss": 1.03239174, + "memory(GiB)": 302.58, + "step": 343860, + "train_speed(iter/s)": 0.123592 + }, + { + "acc": 0.73281031, + "epoch": 1.9231398384054739, + "grad_norm": 8.5625, + "learning_rate": 4.0314548479066796e-08, + "loss": 1.05107708, + "memory(GiB)": 302.58, + "step": 343880, + "train_speed(iter/s)": 0.123595 + }, + { + "acc": 0.77385182, + "epoch": 1.9232516878784531, + "grad_norm": 6.5, + "learning_rate": 4.0197444285011157e-08, + "loss": 0.89752846, + "memory(GiB)": 302.58, + "step": 343900, + "train_speed(iter/s)": 0.123599 + }, + { + "acc": 0.75203462, + "epoch": 1.9233635373514324, + "grad_norm": 6.65625, + "learning_rate": 4.0080509731930116e-08, + "loss": 0.97784472, + "memory(GiB)": 302.58, + "step": 343920, + "train_speed(iter/s)": 0.123602 + }, + { + "acc": 0.74275079, + "epoch": 1.9234753868244117, + "grad_norm": 5.46875, + "learning_rate": 3.9963744823823256e-08, + "loss": 1.04053946, + "memory(GiB)": 302.58, + "step": 343940, + "train_speed(iter/s)": 0.123605 + }, + { + "acc": 0.75293703, + "epoch": 1.923587236297391, + "grad_norm": 6.78125, + "learning_rate": 3.9847149564684054e-08, + "loss": 0.96062212, + "memory(GiB)": 302.58, + "step": 343960, + "train_speed(iter/s)": 0.123608 + }, + { + "acc": 0.75416441, + "epoch": 1.9236990857703702, + "grad_norm": 6.28125, + "learning_rate": 3.973072395850042e-08, + "loss": 0.95136824, + "memory(GiB)": 302.58, + "step": 343980, + "train_speed(iter/s)": 0.123612 + }, + { + "acc": 0.74369893, + "epoch": 1.9238109352433495, + "grad_norm": 6.46875, + "learning_rate": 3.961446800925528e-08, + "loss": 0.99732513, + "memory(GiB)": 302.58, + "step": 344000, + "train_speed(iter/s)": 0.123615 + }, + { + "epoch": 1.9238109352433495, + "eval_acc": 0.7068971330862381, + "eval_loss": 1.0117912292480469, + "eval_runtime": 7599.6417, + "eval_samples_per_second": 9.906, + "eval_steps_per_second": 9.906, + "step": 344000 + }, + { + "acc": 0.75735712, + "epoch": 1.9239227847163287, + "grad_norm": 9.75, + "learning_rate": 3.9498381720923797e-08, + "loss": 0.95583315, + "memory(GiB)": 302.58, + "step": 344020, + "train_speed(iter/s)": 0.123276 + }, + { + "acc": 0.74954085, + "epoch": 1.924034634189308, + "grad_norm": 6.9375, + "learning_rate": 3.938246509747667e-08, + "loss": 1.01034412, + "memory(GiB)": 302.58, + "step": 344040, + "train_speed(iter/s)": 0.123279 + }, + { + "acc": 0.74968724, + "epoch": 1.9241464836622872, + "grad_norm": 9.75, + "learning_rate": 3.9266718142879635e-08, + "loss": 0.98966312, + "memory(GiB)": 302.58, + "step": 344060, + "train_speed(iter/s)": 0.123283 + }, + { + "acc": 0.74063835, + "epoch": 1.9242583331352665, + "grad_norm": 5.46875, + "learning_rate": 3.9151140861090064e-08, + "loss": 1.02278214, + "memory(GiB)": 302.58, + "step": 344080, + "train_speed(iter/s)": 0.123286 + }, + { + "acc": 0.75041499, + "epoch": 1.9243701826082458, + "grad_norm": 8.6875, + "learning_rate": 3.903573325606258e-08, + "loss": 1.0049469, + "memory(GiB)": 302.58, + "step": 344100, + "train_speed(iter/s)": 0.123289 + }, + { + "acc": 0.76076832, + "epoch": 1.924482032081225, + "grad_norm": 5.84375, + "learning_rate": 3.892049533174347e-08, + "loss": 0.93317327, + "memory(GiB)": 302.58, + "step": 344120, + "train_speed(iter/s)": 0.123293 + }, + { + "acc": 0.74167371, + "epoch": 1.9245938815542043, + "grad_norm": 7.25, + "learning_rate": 3.880542709207513e-08, + "loss": 1.02828407, + "memory(GiB)": 302.58, + "step": 344140, + "train_speed(iter/s)": 0.123296 + }, + { + "acc": 0.7551331, + "epoch": 1.9247057310271836, + "grad_norm": 7.59375, + "learning_rate": 3.86905285409922e-08, + "loss": 0.96083536, + "memory(GiB)": 302.58, + "step": 344160, + "train_speed(iter/s)": 0.123299 + }, + { + "acc": 0.74558377, + "epoch": 1.9248175805001628, + "grad_norm": 7.4375, + "learning_rate": 3.8575799682424865e-08, + "loss": 0.99166603, + "memory(GiB)": 302.58, + "step": 344180, + "train_speed(iter/s)": 0.123303 + }, + { + "acc": 0.76044898, + "epoch": 1.924929429973142, + "grad_norm": 5.75, + "learning_rate": 3.846124052029776e-08, + "loss": 0.92852001, + "memory(GiB)": 302.58, + "step": 344200, + "train_speed(iter/s)": 0.123306 + }, + { + "acc": 0.74331183, + "epoch": 1.9250412794461214, + "grad_norm": 6.3125, + "learning_rate": 3.8346851058528866e-08, + "loss": 0.9918787, + "memory(GiB)": 302.58, + "step": 344220, + "train_speed(iter/s)": 0.123309 + }, + { + "acc": 0.74622316, + "epoch": 1.9251531289191006, + "grad_norm": 9.1875, + "learning_rate": 3.8232631301030606e-08, + "loss": 1.01346483, + "memory(GiB)": 302.58, + "step": 344240, + "train_speed(iter/s)": 0.123313 + }, + { + "acc": 0.73914218, + "epoch": 1.9252649783920799, + "grad_norm": 6.03125, + "learning_rate": 3.811858125170931e-08, + "loss": 1.04256315, + "memory(GiB)": 302.58, + "step": 344260, + "train_speed(iter/s)": 0.123317 + }, + { + "acc": 0.77358904, + "epoch": 1.9253768278650591, + "grad_norm": 8.3125, + "learning_rate": 3.800470091446684e-08, + "loss": 0.88858652, + "memory(GiB)": 302.58, + "step": 344280, + "train_speed(iter/s)": 0.12332 + }, + { + "acc": 0.76564794, + "epoch": 1.9254886773380384, + "grad_norm": 6.8125, + "learning_rate": 3.789099029319731e-08, + "loss": 0.91191692, + "memory(GiB)": 302.58, + "step": 344300, + "train_speed(iter/s)": 0.123323 + }, + { + "acc": 0.76559353, + "epoch": 1.9256005268110177, + "grad_norm": 5.3125, + "learning_rate": 3.77774493917904e-08, + "loss": 0.93100719, + "memory(GiB)": 302.58, + "step": 344320, + "train_speed(iter/s)": 0.123327 + }, + { + "acc": 0.75025787, + "epoch": 1.9257123762839972, + "grad_norm": 5.90625, + "learning_rate": 3.766407821412965e-08, + "loss": 0.97593546, + "memory(GiB)": 302.58, + "step": 344340, + "train_speed(iter/s)": 0.12333 + }, + { + "acc": 0.76318693, + "epoch": 1.9258242257569762, + "grad_norm": 6.46875, + "learning_rate": 3.7550876764092523e-08, + "loss": 0.93172255, + "memory(GiB)": 302.58, + "step": 344360, + "train_speed(iter/s)": 0.123333 + }, + { + "acc": 0.74224353, + "epoch": 1.9259360752299557, + "grad_norm": 5.75, + "learning_rate": 3.7437845045550925e-08, + "loss": 1.02300997, + "memory(GiB)": 302.58, + "step": 344380, + "train_speed(iter/s)": 0.123337 + }, + { + "acc": 0.74236212, + "epoch": 1.9260479247029347, + "grad_norm": 6.5625, + "learning_rate": 3.7324983062370645e-08, + "loss": 0.98712034, + "memory(GiB)": 302.58, + "step": 344400, + "train_speed(iter/s)": 0.12334 + }, + { + "acc": 0.76711798, + "epoch": 1.9261597741759142, + "grad_norm": 10.5625, + "learning_rate": 3.721229081841249e-08, + "loss": 0.91047487, + "memory(GiB)": 302.58, + "step": 344420, + "train_speed(iter/s)": 0.123344 + }, + { + "acc": 0.74236145, + "epoch": 1.9262716236488933, + "grad_norm": 8.875, + "learning_rate": 3.709976831753004e-08, + "loss": 0.98923817, + "memory(GiB)": 302.58, + "step": 344440, + "train_speed(iter/s)": 0.123347 + }, + { + "acc": 0.75177274, + "epoch": 1.9263834731218727, + "grad_norm": 7.96875, + "learning_rate": 3.698741556357299e-08, + "loss": 0.98242702, + "memory(GiB)": 302.58, + "step": 344460, + "train_speed(iter/s)": 0.12335 + }, + { + "acc": 0.75374198, + "epoch": 1.9264953225948518, + "grad_norm": 6.59375, + "learning_rate": 3.687523256038328e-08, + "loss": 0.98090096, + "memory(GiB)": 302.58, + "step": 344480, + "train_speed(iter/s)": 0.123354 + }, + { + "acc": 0.74658828, + "epoch": 1.9266071720678313, + "grad_norm": 9.8125, + "learning_rate": 3.676321931179838e-08, + "loss": 1.00676584, + "memory(GiB)": 302.58, + "step": 344500, + "train_speed(iter/s)": 0.123357 + }, + { + "acc": 0.73723202, + "epoch": 1.9267190215408103, + "grad_norm": 8.75, + "learning_rate": 3.665137582164913e-08, + "loss": 1.02949867, + "memory(GiB)": 302.58, + "step": 344520, + "train_speed(iter/s)": 0.12336 + }, + { + "acc": 0.75099382, + "epoch": 1.9268308710137898, + "grad_norm": 8.375, + "learning_rate": 3.6539702093761345e-08, + "loss": 0.98003368, + "memory(GiB)": 302.58, + "step": 344540, + "train_speed(iter/s)": 0.123363 + }, + { + "acc": 0.76835647, + "epoch": 1.9269427204867688, + "grad_norm": 6.0625, + "learning_rate": 3.6428198131954754e-08, + "loss": 0.90450258, + "memory(GiB)": 302.58, + "step": 344560, + "train_speed(iter/s)": 0.123367 + }, + { + "acc": 0.76488032, + "epoch": 1.9270545699597483, + "grad_norm": 7.5, + "learning_rate": 3.631686394004241e-08, + "loss": 0.91820116, + "memory(GiB)": 302.58, + "step": 344580, + "train_speed(iter/s)": 0.12337 + }, + { + "acc": 0.74535346, + "epoch": 1.9271664194327274, + "grad_norm": 5.65625, + "learning_rate": 3.620569952183295e-08, + "loss": 1.0084549, + "memory(GiB)": 302.58, + "step": 344600, + "train_speed(iter/s)": 0.123374 + }, + { + "acc": 0.74054298, + "epoch": 1.9272782689057069, + "grad_norm": 8.0625, + "learning_rate": 3.6094704881128874e-08, + "loss": 1.03093653, + "memory(GiB)": 302.58, + "step": 344620, + "train_speed(iter/s)": 0.123377 + }, + { + "acc": 0.76915817, + "epoch": 1.927390118378686, + "grad_norm": 6.71875, + "learning_rate": 3.598388002172548e-08, + "loss": 0.88364792, + "memory(GiB)": 302.58, + "step": 344640, + "train_speed(iter/s)": 0.12338 + }, + { + "acc": 0.74617462, + "epoch": 1.9275019678516654, + "grad_norm": 8.6875, + "learning_rate": 3.587322494741363e-08, + "loss": 0.98286753, + "memory(GiB)": 302.58, + "step": 344660, + "train_speed(iter/s)": 0.123384 + }, + { + "acc": 0.76253819, + "epoch": 1.9276138173246444, + "grad_norm": 7.375, + "learning_rate": 3.576273966197863e-08, + "loss": 0.94074297, + "memory(GiB)": 302.58, + "step": 344680, + "train_speed(iter/s)": 0.123387 + }, + { + "acc": 0.75045452, + "epoch": 1.927725666797624, + "grad_norm": 10.5, + "learning_rate": 3.5652424169199116e-08, + "loss": 0.998874, + "memory(GiB)": 302.58, + "step": 344700, + "train_speed(iter/s)": 0.123391 + }, + { + "acc": 0.74780798, + "epoch": 1.927837516270603, + "grad_norm": 7.25, + "learning_rate": 3.554227847284764e-08, + "loss": 0.99115572, + "memory(GiB)": 302.58, + "step": 344720, + "train_speed(iter/s)": 0.123394 + }, + { + "acc": 0.74498782, + "epoch": 1.9279493657435824, + "grad_norm": 9.375, + "learning_rate": 3.543230257669283e-08, + "loss": 1.0057272, + "memory(GiB)": 302.58, + "step": 344740, + "train_speed(iter/s)": 0.123397 + }, + { + "acc": 0.73843937, + "epoch": 1.9280612152165615, + "grad_norm": 6.0625, + "learning_rate": 3.532249648449504e-08, + "loss": 1.01191196, + "memory(GiB)": 302.58, + "step": 344760, + "train_speed(iter/s)": 0.123401 + }, + { + "acc": 0.76061554, + "epoch": 1.928173064689541, + "grad_norm": 7.59375, + "learning_rate": 3.521286020001069e-08, + "loss": 0.93819771, + "memory(GiB)": 302.58, + "step": 344780, + "train_speed(iter/s)": 0.123404 + }, + { + "acc": 0.75273991, + "epoch": 1.92828491416252, + "grad_norm": 5.21875, + "learning_rate": 3.510339372698901e-08, + "loss": 0.97135105, + "memory(GiB)": 302.58, + "step": 344800, + "train_speed(iter/s)": 0.123408 + }, + { + "acc": 0.74962935, + "epoch": 1.9283967636354995, + "grad_norm": 7.375, + "learning_rate": 3.499409706917534e-08, + "loss": 0.97226934, + "memory(GiB)": 302.58, + "step": 344820, + "train_speed(iter/s)": 0.123411 + }, + { + "acc": 0.76533065, + "epoch": 1.9285086131084785, + "grad_norm": 5.40625, + "learning_rate": 3.488497023030612e-08, + "loss": 0.9145175, + "memory(GiB)": 302.58, + "step": 344840, + "train_speed(iter/s)": 0.123415 + }, + { + "acc": 0.75482564, + "epoch": 1.928620462581458, + "grad_norm": 10.625, + "learning_rate": 3.477601321411561e-08, + "loss": 0.9538619, + "memory(GiB)": 302.58, + "step": 344860, + "train_speed(iter/s)": 0.123418 + }, + { + "acc": 0.75177784, + "epoch": 1.928732312054437, + "grad_norm": 7.875, + "learning_rate": 3.4667226024329706e-08, + "loss": 0.97233906, + "memory(GiB)": 302.58, + "step": 344880, + "train_speed(iter/s)": 0.123422 + }, + { + "acc": 0.75829606, + "epoch": 1.9288441615274166, + "grad_norm": 9.375, + "learning_rate": 3.4558608664669314e-08, + "loss": 0.96010342, + "memory(GiB)": 302.58, + "step": 344900, + "train_speed(iter/s)": 0.123425 + }, + { + "acc": 0.75701942, + "epoch": 1.9289560110003956, + "grad_norm": 8.25, + "learning_rate": 3.445016113884925e-08, + "loss": 0.94504576, + "memory(GiB)": 302.58, + "step": 344920, + "train_speed(iter/s)": 0.123428 + }, + { + "acc": 0.75938592, + "epoch": 1.929067860473375, + "grad_norm": 7.78125, + "learning_rate": 3.434188345057932e-08, + "loss": 0.96253767, + "memory(GiB)": 302.58, + "step": 344940, + "train_speed(iter/s)": 0.123432 + }, + { + "acc": 0.74574313, + "epoch": 1.9291797099463541, + "grad_norm": 5.96875, + "learning_rate": 3.423377560356211e-08, + "loss": 0.9967598, + "memory(GiB)": 302.58, + "step": 344960, + "train_speed(iter/s)": 0.123435 + }, + { + "acc": 0.757901, + "epoch": 1.9292915594193336, + "grad_norm": 10.4375, + "learning_rate": 3.412583760149635e-08, + "loss": 0.9368433, + "memory(GiB)": 302.58, + "step": 344980, + "train_speed(iter/s)": 0.123438 + }, + { + "acc": 0.74572711, + "epoch": 1.9294034088923127, + "grad_norm": 6.84375, + "learning_rate": 3.401806944807296e-08, + "loss": 0.98240662, + "memory(GiB)": 302.58, + "step": 345000, + "train_speed(iter/s)": 0.123442 + }, + { + "acc": 0.74866152, + "epoch": 1.9295152583652921, + "grad_norm": 7.15625, + "learning_rate": 3.391047114697843e-08, + "loss": 0.98586817, + "memory(GiB)": 302.58, + "step": 345020, + "train_speed(iter/s)": 0.123445 + }, + { + "acc": 0.7496397, + "epoch": 1.9296271078382712, + "grad_norm": 6.4375, + "learning_rate": 3.3803042701893165e-08, + "loss": 0.98874044, + "memory(GiB)": 302.58, + "step": 345040, + "train_speed(iter/s)": 0.123449 + }, + { + "acc": 0.76324849, + "epoch": 1.9297389573112507, + "grad_norm": 8.0, + "learning_rate": 3.369578411649088e-08, + "loss": 0.95124025, + "memory(GiB)": 302.58, + "step": 345060, + "train_speed(iter/s)": 0.123452 + }, + { + "acc": 0.72855263, + "epoch": 1.9298508067842297, + "grad_norm": 5.8125, + "learning_rate": 3.3588695394440317e-08, + "loss": 1.07569113, + "memory(GiB)": 302.58, + "step": 345080, + "train_speed(iter/s)": 0.123456 + }, + { + "acc": 0.76526542, + "epoch": 1.9299626562572092, + "grad_norm": 5.90625, + "learning_rate": 3.348177653940465e-08, + "loss": 0.91519032, + "memory(GiB)": 302.58, + "step": 345100, + "train_speed(iter/s)": 0.123459 + }, + { + "acc": 0.73917451, + "epoch": 1.9300745057301882, + "grad_norm": 10.625, + "learning_rate": 3.3375027555040406e-08, + "loss": 1.02636995, + "memory(GiB)": 302.58, + "step": 345120, + "train_speed(iter/s)": 0.123462 + }, + { + "acc": 0.77100887, + "epoch": 1.9301863552031677, + "grad_norm": 7.1875, + "learning_rate": 3.32684484449991e-08, + "loss": 0.91394224, + "memory(GiB)": 302.58, + "step": 345140, + "train_speed(iter/s)": 0.123466 + }, + { + "acc": 0.7573833, + "epoch": 1.9302982046761468, + "grad_norm": 5.1875, + "learning_rate": 3.3162039212926155e-08, + "loss": 0.96353207, + "memory(GiB)": 302.58, + "step": 345160, + "train_speed(iter/s)": 0.123469 + }, + { + "acc": 0.76641297, + "epoch": 1.9304100541491263, + "grad_norm": 8.875, + "learning_rate": 3.305579986246088e-08, + "loss": 0.90964622, + "memory(GiB)": 302.58, + "step": 345180, + "train_speed(iter/s)": 0.123472 + }, + { + "acc": 0.76277332, + "epoch": 1.9305219036221053, + "grad_norm": 7.21875, + "learning_rate": 3.2949730397236476e-08, + "loss": 0.91273661, + "memory(GiB)": 302.58, + "step": 345200, + "train_speed(iter/s)": 0.123476 + }, + { + "acc": 0.73769841, + "epoch": 1.9306337530950848, + "grad_norm": 7.5, + "learning_rate": 3.2843830820881716e-08, + "loss": 1.04229288, + "memory(GiB)": 302.58, + "step": 345220, + "train_speed(iter/s)": 0.123479 + }, + { + "acc": 0.7340415, + "epoch": 1.9307456025680638, + "grad_norm": 6.9375, + "learning_rate": 3.27381011370187e-08, + "loss": 1.04304476, + "memory(GiB)": 302.58, + "step": 345240, + "train_speed(iter/s)": 0.123483 + }, + { + "acc": 0.76841235, + "epoch": 1.9308574520410433, + "grad_norm": 8.25, + "learning_rate": 3.263254134926286e-08, + "loss": 0.90049353, + "memory(GiB)": 302.58, + "step": 345260, + "train_speed(iter/s)": 0.123486 + }, + { + "acc": 0.76570482, + "epoch": 1.9309693015140224, + "grad_norm": 8.0, + "learning_rate": 3.252715146122576e-08, + "loss": 0.91107426, + "memory(GiB)": 302.58, + "step": 345280, + "train_speed(iter/s)": 0.12349 + }, + { + "acc": 0.76494555, + "epoch": 1.9310811509870018, + "grad_norm": 7.875, + "learning_rate": 3.242193147651118e-08, + "loss": 0.93271017, + "memory(GiB)": 302.58, + "step": 345300, + "train_speed(iter/s)": 0.123493 + }, + { + "acc": 0.76345491, + "epoch": 1.9311930004599809, + "grad_norm": 8.9375, + "learning_rate": 3.231688139871847e-08, + "loss": 0.91848793, + "memory(GiB)": 302.58, + "step": 345320, + "train_speed(iter/s)": 0.123496 + }, + { + "acc": 0.75415626, + "epoch": 1.9313048499329604, + "grad_norm": 8.875, + "learning_rate": 3.221200123143975e-08, + "loss": 0.973981, + "memory(GiB)": 302.58, + "step": 345340, + "train_speed(iter/s)": 0.1235 + }, + { + "acc": 0.73880744, + "epoch": 1.9314166994059394, + "grad_norm": 9.0, + "learning_rate": 3.210729097826382e-08, + "loss": 1.02891645, + "memory(GiB)": 302.58, + "step": 345360, + "train_speed(iter/s)": 0.123503 + }, + { + "acc": 0.73517303, + "epoch": 1.931528548878919, + "grad_norm": 6.8125, + "learning_rate": 3.200275064277114e-08, + "loss": 1.03511696, + "memory(GiB)": 302.58, + "step": 345380, + "train_speed(iter/s)": 0.123507 + }, + { + "acc": 0.75772696, + "epoch": 1.931640398351898, + "grad_norm": 7.78125, + "learning_rate": 3.189838022853719e-08, + "loss": 0.92734518, + "memory(GiB)": 302.58, + "step": 345400, + "train_speed(iter/s)": 0.12351 + }, + { + "acc": 0.74813914, + "epoch": 1.9317522478248774, + "grad_norm": 8.25, + "learning_rate": 3.179417973913246e-08, + "loss": 0.98711367, + "memory(GiB)": 302.58, + "step": 345420, + "train_speed(iter/s)": 0.123513 + }, + { + "acc": 0.75732722, + "epoch": 1.9318640972978565, + "grad_norm": 7.21875, + "learning_rate": 3.1690149178120744e-08, + "loss": 0.96739273, + "memory(GiB)": 302.58, + "step": 345440, + "train_speed(iter/s)": 0.123517 + }, + { + "acc": 0.73801465, + "epoch": 1.931975946770836, + "grad_norm": 15.4375, + "learning_rate": 3.1586288549059205e-08, + "loss": 1.04467373, + "memory(GiB)": 302.58, + "step": 345460, + "train_speed(iter/s)": 0.12352 + }, + { + "acc": 0.76981931, + "epoch": 1.932087796243815, + "grad_norm": 6.90625, + "learning_rate": 3.1482597855501674e-08, + "loss": 0.90712118, + "memory(GiB)": 302.58, + "step": 345480, + "train_speed(iter/s)": 0.123524 + }, + { + "acc": 0.76097317, + "epoch": 1.9321996457167945, + "grad_norm": 8.8125, + "learning_rate": 3.137907710099364e-08, + "loss": 0.93842134, + "memory(GiB)": 302.58, + "step": 345500, + "train_speed(iter/s)": 0.123527 + }, + { + "acc": 0.73998876, + "epoch": 1.9323114951897735, + "grad_norm": 7.0, + "learning_rate": 3.1275726289076715e-08, + "loss": 1.04321203, + "memory(GiB)": 302.58, + "step": 345520, + "train_speed(iter/s)": 0.123531 + }, + { + "acc": 0.74781818, + "epoch": 1.932423344662753, + "grad_norm": 7.84375, + "learning_rate": 3.1172545423285294e-08, + "loss": 1.01280651, + "memory(GiB)": 302.58, + "step": 345540, + "train_speed(iter/s)": 0.123534 + }, + { + "acc": 0.74516702, + "epoch": 1.932535194135732, + "grad_norm": 7.09375, + "learning_rate": 3.106953450714822e-08, + "loss": 0.98483477, + "memory(GiB)": 302.58, + "step": 345560, + "train_speed(iter/s)": 0.123537 + }, + { + "acc": 0.75643659, + "epoch": 1.9326470436087115, + "grad_norm": 8.625, + "learning_rate": 3.0966693544189353e-08, + "loss": 0.93445568, + "memory(GiB)": 302.58, + "step": 345580, + "train_speed(iter/s)": 0.123541 + }, + { + "acc": 0.75349588, + "epoch": 1.9327588930816906, + "grad_norm": 7.75, + "learning_rate": 3.086402253792531e-08, + "loss": 0.95617571, + "memory(GiB)": 302.58, + "step": 345600, + "train_speed(iter/s)": 0.123544 + }, + { + "acc": 0.75211606, + "epoch": 1.93287074255467, + "grad_norm": 8.75, + "learning_rate": 3.076152149186884e-08, + "loss": 0.97455225, + "memory(GiB)": 302.58, + "step": 345620, + "train_speed(iter/s)": 0.123548 + }, + { + "acc": 0.77064638, + "epoch": 1.932982592027649, + "grad_norm": 8.5625, + "learning_rate": 3.065919040952547e-08, + "loss": 0.8937355, + "memory(GiB)": 302.58, + "step": 345640, + "train_speed(iter/s)": 0.123551 + }, + { + "acc": 0.76313744, + "epoch": 1.9330944415006286, + "grad_norm": 8.625, + "learning_rate": 3.055702929439519e-08, + "loss": 0.9293149, + "memory(GiB)": 302.58, + "step": 345660, + "train_speed(iter/s)": 0.123554 + }, + { + "acc": 0.74699974, + "epoch": 1.9332062909736076, + "grad_norm": 6.71875, + "learning_rate": 3.045503814997186e-08, + "loss": 0.99827986, + "memory(GiB)": 302.58, + "step": 345680, + "train_speed(iter/s)": 0.123558 + }, + { + "acc": 0.75272932, + "epoch": 1.9333181404465871, + "grad_norm": 7.25, + "learning_rate": 3.035321697974436e-08, + "loss": 0.96741009, + "memory(GiB)": 302.58, + "step": 345700, + "train_speed(iter/s)": 0.123561 + }, + { + "acc": 0.74194407, + "epoch": 1.9334299899195662, + "grad_norm": 8.25, + "learning_rate": 3.025156578719546e-08, + "loss": 1.00911932, + "memory(GiB)": 302.58, + "step": 345720, + "train_speed(iter/s)": 0.123565 + }, + { + "acc": 0.75716166, + "epoch": 1.9335418393925456, + "grad_norm": 7.5625, + "learning_rate": 3.015008457580127e-08, + "loss": 0.9500823, + "memory(GiB)": 302.58, + "step": 345740, + "train_speed(iter/s)": 0.123568 + }, + { + "acc": 0.73936634, + "epoch": 1.9336536888655247, + "grad_norm": 5.8125, + "learning_rate": 3.004877334903344e-08, + "loss": 1.00546103, + "memory(GiB)": 302.58, + "step": 345760, + "train_speed(iter/s)": 0.123572 + }, + { + "acc": 0.76521969, + "epoch": 1.9337655383385042, + "grad_norm": 5.125, + "learning_rate": 2.9947632110357004e-08, + "loss": 0.90812635, + "memory(GiB)": 302.58, + "step": 345780, + "train_speed(iter/s)": 0.123575 + }, + { + "acc": 0.73703995, + "epoch": 1.9338773878114832, + "grad_norm": 7.84375, + "learning_rate": 2.9846660863230844e-08, + "loss": 1.03270569, + "memory(GiB)": 302.58, + "step": 345800, + "train_speed(iter/s)": 0.123579 + }, + { + "acc": 0.75325251, + "epoch": 1.9339892372844627, + "grad_norm": 6.65625, + "learning_rate": 2.9745859611108872e-08, + "loss": 0.97711906, + "memory(GiB)": 302.58, + "step": 345820, + "train_speed(iter/s)": 0.123582 + }, + { + "acc": 0.74216919, + "epoch": 1.9341010867574417, + "grad_norm": 8.0, + "learning_rate": 2.9645228357438882e-08, + "loss": 1.0054122, + "memory(GiB)": 302.58, + "step": 345840, + "train_speed(iter/s)": 0.123585 + }, + { + "acc": 0.74710436, + "epoch": 1.9342129362304212, + "grad_norm": 14.1875, + "learning_rate": 2.9544767105662565e-08, + "loss": 1.00052719, + "memory(GiB)": 302.58, + "step": 345860, + "train_speed(iter/s)": 0.123589 + }, + { + "acc": 0.76261806, + "epoch": 1.9343247857034003, + "grad_norm": 6.75, + "learning_rate": 2.944447585921606e-08, + "loss": 0.93394251, + "memory(GiB)": 302.58, + "step": 345880, + "train_speed(iter/s)": 0.123592 + }, + { + "acc": 0.74976249, + "epoch": 1.9344366351763798, + "grad_norm": 5.28125, + "learning_rate": 2.9344354621529402e-08, + "loss": 0.99450569, + "memory(GiB)": 302.58, + "step": 345900, + "train_speed(iter/s)": 0.123595 + }, + { + "acc": 0.74498591, + "epoch": 1.9345484846493588, + "grad_norm": 7.71875, + "learning_rate": 2.924440339602763e-08, + "loss": 1.02504969, + "memory(GiB)": 302.58, + "step": 345920, + "train_speed(iter/s)": 0.123599 + }, + { + "acc": 0.7449759, + "epoch": 1.9346603341223383, + "grad_norm": 5.84375, + "learning_rate": 2.9144622186129123e-08, + "loss": 1.02958317, + "memory(GiB)": 302.58, + "step": 345940, + "train_speed(iter/s)": 0.123602 + }, + { + "acc": 0.76701746, + "epoch": 1.9347721835953173, + "grad_norm": 7.21875, + "learning_rate": 2.9045010995246147e-08, + "loss": 0.90411758, + "memory(GiB)": 302.58, + "step": 345960, + "train_speed(iter/s)": 0.123605 + }, + { + "acc": 0.75481124, + "epoch": 1.9348840330682968, + "grad_norm": 7.875, + "learning_rate": 2.894556982678709e-08, + "loss": 0.97868176, + "memory(GiB)": 302.58, + "step": 345980, + "train_speed(iter/s)": 0.123609 + }, + { + "acc": 0.75329728, + "epoch": 1.9349958825412759, + "grad_norm": 6.875, + "learning_rate": 2.8846298684152007e-08, + "loss": 0.98181009, + "memory(GiB)": 302.58, + "step": 346000, + "train_speed(iter/s)": 0.123612 + }, + { + "epoch": 1.9349958825412759, + "eval_acc": 0.7069052670559732, + "eval_loss": 1.011773943901062, + "eval_runtime": 7533.6377, + "eval_samples_per_second": 9.993, + "eval_steps_per_second": 9.993, + "step": 346000 + }, + { + "acc": 0.74065838, + "epoch": 1.9351077320142553, + "grad_norm": 5.59375, + "learning_rate": 2.8747197570736517e-08, + "loss": 0.99424314, + "memory(GiB)": 302.58, + "step": 346020, + "train_speed(iter/s)": 0.123278 + }, + { + "acc": 0.73944349, + "epoch": 1.9352195814872344, + "grad_norm": 9.6875, + "learning_rate": 2.864826648993013e-08, + "loss": 1.0443511, + "memory(GiB)": 302.58, + "step": 346040, + "train_speed(iter/s)": 0.123281 + }, + { + "acc": 0.75396914, + "epoch": 1.9353314309602139, + "grad_norm": 6.40625, + "learning_rate": 2.8549505445117364e-08, + "loss": 0.96951056, + "memory(GiB)": 302.58, + "step": 346060, + "train_speed(iter/s)": 0.123285 + }, + { + "acc": 0.73957963, + "epoch": 1.935443280433193, + "grad_norm": 6.0, + "learning_rate": 2.845091443967496e-08, + "loss": 1.02570705, + "memory(GiB)": 302.58, + "step": 346080, + "train_speed(iter/s)": 0.123288 + }, + { + "acc": 0.76986246, + "epoch": 1.9355551299061724, + "grad_norm": 7.21875, + "learning_rate": 2.8352493476975773e-08, + "loss": 0.8688117, + "memory(GiB)": 302.58, + "step": 346100, + "train_speed(iter/s)": 0.123291 + }, + { + "acc": 0.75626884, + "epoch": 1.9356669793791514, + "grad_norm": 11.5625, + "learning_rate": 2.8254242560386003e-08, + "loss": 0.95701437, + "memory(GiB)": 302.58, + "step": 346120, + "train_speed(iter/s)": 0.123295 + }, + { + "acc": 0.75638056, + "epoch": 1.935778828852131, + "grad_norm": 7.03125, + "learning_rate": 2.8156161693266294e-08, + "loss": 0.95673876, + "memory(GiB)": 302.58, + "step": 346140, + "train_speed(iter/s)": 0.123298 + }, + { + "acc": 0.74573956, + "epoch": 1.93589067832511, + "grad_norm": 8.75, + "learning_rate": 2.8058250878971184e-08, + "loss": 1.00489101, + "memory(GiB)": 302.58, + "step": 346160, + "train_speed(iter/s)": 0.123302 + }, + { + "acc": 0.74716387, + "epoch": 1.9360025277980895, + "grad_norm": 4.9375, + "learning_rate": 2.7960510120849106e-08, + "loss": 0.99444485, + "memory(GiB)": 302.58, + "step": 346180, + "train_speed(iter/s)": 0.123305 + }, + { + "acc": 0.74156194, + "epoch": 1.9361143772710685, + "grad_norm": 9.3125, + "learning_rate": 2.786293942224405e-08, + "loss": 1.02259855, + "memory(GiB)": 302.58, + "step": 346200, + "train_speed(iter/s)": 0.123308 + }, + { + "acc": 0.757407, + "epoch": 1.936226226744048, + "grad_norm": 4.75, + "learning_rate": 2.7765538786491684e-08, + "loss": 0.95012808, + "memory(GiB)": 302.58, + "step": 346220, + "train_speed(iter/s)": 0.123312 + }, + { + "acc": 0.74735746, + "epoch": 1.936338076217027, + "grad_norm": 9.75, + "learning_rate": 2.7668308216925454e-08, + "loss": 1.0027009, + "memory(GiB)": 302.58, + "step": 346240, + "train_speed(iter/s)": 0.123315 + }, + { + "acc": 0.74729676, + "epoch": 1.9364499256900065, + "grad_norm": 5.65625, + "learning_rate": 2.7571247716869366e-08, + "loss": 0.99025736, + "memory(GiB)": 302.58, + "step": 346260, + "train_speed(iter/s)": 0.123319 + }, + { + "acc": 0.74260488, + "epoch": 1.9365617751629856, + "grad_norm": 9.8125, + "learning_rate": 2.7474357289644094e-08, + "loss": 1.00090847, + "memory(GiB)": 302.58, + "step": 346280, + "train_speed(iter/s)": 0.123322 + }, + { + "acc": 0.75327272, + "epoch": 1.936673624635965, + "grad_norm": 7.46875, + "learning_rate": 2.73776369385631e-08, + "loss": 0.95190783, + "memory(GiB)": 302.58, + "step": 346300, + "train_speed(iter/s)": 0.123325 + }, + { + "acc": 0.73291888, + "epoch": 1.936785474108944, + "grad_norm": 6.3125, + "learning_rate": 2.7281086666934852e-08, + "loss": 1.06354036, + "memory(GiB)": 302.58, + "step": 346320, + "train_speed(iter/s)": 0.123328 + }, + { + "acc": 0.75781188, + "epoch": 1.9368973235819236, + "grad_norm": 7.25, + "learning_rate": 2.718470647806115e-08, + "loss": 0.95150642, + "memory(GiB)": 302.58, + "step": 346340, + "train_speed(iter/s)": 0.123332 + }, + { + "acc": 0.7444634, + "epoch": 1.9370091730549026, + "grad_norm": 7.84375, + "learning_rate": 2.7088496375239358e-08, + "loss": 1.01077375, + "memory(GiB)": 302.58, + "step": 346360, + "train_speed(iter/s)": 0.123335 + }, + { + "acc": 0.75609188, + "epoch": 1.937121022527882, + "grad_norm": 9.0625, + "learning_rate": 2.6992456361759067e-08, + "loss": 0.94455099, + "memory(GiB)": 302.58, + "step": 346380, + "train_speed(iter/s)": 0.123339 + }, + { + "acc": 0.74432464, + "epoch": 1.9372328720008611, + "grad_norm": 6.46875, + "learning_rate": 2.689658644090598e-08, + "loss": 1.01332102, + "memory(GiB)": 302.58, + "step": 346400, + "train_speed(iter/s)": 0.123342 + }, + { + "acc": 0.74861617, + "epoch": 1.9373447214738406, + "grad_norm": 8.375, + "learning_rate": 2.6800886615959143e-08, + "loss": 0.9968709, + "memory(GiB)": 302.58, + "step": 346420, + "train_speed(iter/s)": 0.123346 + }, + { + "acc": 0.75610824, + "epoch": 1.9374565709468197, + "grad_norm": 6.8125, + "learning_rate": 2.670535689019149e-08, + "loss": 0.95438356, + "memory(GiB)": 302.58, + "step": 346440, + "train_speed(iter/s)": 0.123349 + }, + { + "acc": 0.75726409, + "epoch": 1.9375684204197992, + "grad_norm": 6.15625, + "learning_rate": 2.6609997266870413e-08, + "loss": 0.95425129, + "memory(GiB)": 302.58, + "step": 346460, + "train_speed(iter/s)": 0.123352 + }, + { + "acc": 0.73065758, + "epoch": 1.9376802698927782, + "grad_norm": 6.46875, + "learning_rate": 2.651480774925774e-08, + "loss": 1.07185535, + "memory(GiB)": 302.58, + "step": 346480, + "train_speed(iter/s)": 0.123356 + }, + { + "acc": 0.75846367, + "epoch": 1.9377921193657577, + "grad_norm": 7.25, + "learning_rate": 2.6419788340608653e-08, + "loss": 0.94776983, + "memory(GiB)": 302.58, + "step": 346500, + "train_speed(iter/s)": 0.123359 + }, + { + "acc": 0.7597064, + "epoch": 1.9379039688387367, + "grad_norm": 7.25, + "learning_rate": 2.632493904417388e-08, + "loss": 0.92659569, + "memory(GiB)": 302.58, + "step": 346520, + "train_speed(iter/s)": 0.123362 + }, + { + "acc": 0.76323061, + "epoch": 1.9380158183117162, + "grad_norm": 5.09375, + "learning_rate": 2.6230259863197493e-08, + "loss": 0.93885622, + "memory(GiB)": 302.58, + "step": 346540, + "train_speed(iter/s)": 0.123366 + }, + { + "acc": 0.75775361, + "epoch": 1.9381276677846953, + "grad_norm": 7.75, + "learning_rate": 2.613575080091746e-08, + "loss": 0.94312067, + "memory(GiB)": 302.58, + "step": 346560, + "train_speed(iter/s)": 0.123369 + }, + { + "acc": 0.73504877, + "epoch": 1.9382395172576747, + "grad_norm": 4.3125, + "learning_rate": 2.6041411860566744e-08, + "loss": 1.02660999, + "memory(GiB)": 302.58, + "step": 346580, + "train_speed(iter/s)": 0.123372 + }, + { + "acc": 0.73919034, + "epoch": 1.9383513667306538, + "grad_norm": 6.375, + "learning_rate": 2.5947243045371663e-08, + "loss": 1.04101725, + "memory(GiB)": 302.58, + "step": 346600, + "train_speed(iter/s)": 0.123376 + }, + { + "acc": 0.74746451, + "epoch": 1.9384632162036333, + "grad_norm": 9.3125, + "learning_rate": 2.5853244358552966e-08, + "loss": 0.99089537, + "memory(GiB)": 302.58, + "step": 346620, + "train_speed(iter/s)": 0.123379 + }, + { + "acc": 0.74702845, + "epoch": 1.9385750656766123, + "grad_norm": 8.875, + "learning_rate": 2.575941580332586e-08, + "loss": 1.01338806, + "memory(GiB)": 302.58, + "step": 346640, + "train_speed(iter/s)": 0.123383 + }, + { + "acc": 0.76174011, + "epoch": 1.9386869151495918, + "grad_norm": 6.75, + "learning_rate": 2.566575738289945e-08, + "loss": 0.94439964, + "memory(GiB)": 302.58, + "step": 346660, + "train_speed(iter/s)": 0.123386 + }, + { + "acc": 0.77095985, + "epoch": 1.9387987646225708, + "grad_norm": 7.59375, + "learning_rate": 2.557226910047783e-08, + "loss": 0.90835075, + "memory(GiB)": 302.58, + "step": 346680, + "train_speed(iter/s)": 0.12339 + }, + { + "acc": 0.74765983, + "epoch": 1.9389106140955503, + "grad_norm": 7.9375, + "learning_rate": 2.5478950959257898e-08, + "loss": 1.00307121, + "memory(GiB)": 302.58, + "step": 346700, + "train_speed(iter/s)": 0.123393 + }, + { + "acc": 0.76041298, + "epoch": 1.9390224635685294, + "grad_norm": 5.53125, + "learning_rate": 2.5385802962431538e-08, + "loss": 0.94283724, + "memory(GiB)": 302.58, + "step": 346720, + "train_speed(iter/s)": 0.123396 + }, + { + "acc": 0.75345278, + "epoch": 1.9391343130415089, + "grad_norm": 7.125, + "learning_rate": 2.5292825113184537e-08, + "loss": 0.96559181, + "memory(GiB)": 302.58, + "step": 346740, + "train_speed(iter/s)": 0.1234 + }, + { + "acc": 0.72922893, + "epoch": 1.939246162514488, + "grad_norm": 6.40625, + "learning_rate": 2.5200017414697686e-08, + "loss": 1.0644021, + "memory(GiB)": 302.58, + "step": 346760, + "train_speed(iter/s)": 0.123403 + }, + { + "acc": 0.75736508, + "epoch": 1.9393580119874674, + "grad_norm": 4.75, + "learning_rate": 2.5107379870144555e-08, + "loss": 0.93815861, + "memory(GiB)": 302.58, + "step": 346780, + "train_speed(iter/s)": 0.123406 + }, + { + "acc": 0.76488199, + "epoch": 1.9394698614604464, + "grad_norm": 6.5, + "learning_rate": 2.5014912482694276e-08, + "loss": 0.91749992, + "memory(GiB)": 302.58, + "step": 346800, + "train_speed(iter/s)": 0.123409 + }, + { + "acc": 0.74647217, + "epoch": 1.939581710933426, + "grad_norm": 8.375, + "learning_rate": 2.492261525550932e-08, + "loss": 0.99259977, + "memory(GiB)": 302.58, + "step": 346820, + "train_speed(iter/s)": 0.123413 + }, + { + "acc": 0.76461287, + "epoch": 1.939693560406405, + "grad_norm": 7.34375, + "learning_rate": 2.483048819174605e-08, + "loss": 0.93756971, + "memory(GiB)": 302.58, + "step": 346840, + "train_speed(iter/s)": 0.123416 + }, + { + "acc": 0.74192009, + "epoch": 1.9398054098793844, + "grad_norm": 6.28125, + "learning_rate": 2.473853129455639e-08, + "loss": 1.023909, + "memory(GiB)": 302.58, + "step": 346860, + "train_speed(iter/s)": 0.123419 + }, + { + "acc": 0.76743865, + "epoch": 1.9399172593523635, + "grad_norm": 7.84375, + "learning_rate": 2.4646744567085044e-08, + "loss": 0.91925278, + "memory(GiB)": 302.58, + "step": 346880, + "train_speed(iter/s)": 0.123423 + }, + { + "acc": 0.73962383, + "epoch": 1.940029108825343, + "grad_norm": 8.8125, + "learning_rate": 2.4555128012471175e-08, + "loss": 1.04101753, + "memory(GiB)": 302.58, + "step": 346900, + "train_speed(iter/s)": 0.123426 + }, + { + "acc": 0.75064802, + "epoch": 1.940140958298322, + "grad_norm": 8.3125, + "learning_rate": 2.4463681633848933e-08, + "loss": 0.98356543, + "memory(GiB)": 302.58, + "step": 346920, + "train_speed(iter/s)": 0.123429 + }, + { + "acc": 0.75786314, + "epoch": 1.9402528077713015, + "grad_norm": 9.0, + "learning_rate": 2.437240543434527e-08, + "loss": 0.95679007, + "memory(GiB)": 302.58, + "step": 346940, + "train_speed(iter/s)": 0.123433 + }, + { + "acc": 0.75865374, + "epoch": 1.9403646572442805, + "grad_norm": 7.28125, + "learning_rate": 2.428129941708324e-08, + "loss": 0.96661205, + "memory(GiB)": 302.58, + "step": 346960, + "train_speed(iter/s)": 0.123436 + }, + { + "acc": 0.74465766, + "epoch": 1.94047650671726, + "grad_norm": 5.21875, + "learning_rate": 2.419036358517812e-08, + "loss": 1.01881332, + "memory(GiB)": 302.58, + "step": 346980, + "train_speed(iter/s)": 0.12344 + }, + { + "acc": 0.75130763, + "epoch": 1.940588356190239, + "grad_norm": 7.8125, + "learning_rate": 2.4099597941740215e-08, + "loss": 0.97869158, + "memory(GiB)": 302.58, + "step": 347000, + "train_speed(iter/s)": 0.123443 + }, + { + "acc": 0.75058613, + "epoch": 1.9407002056632185, + "grad_norm": 9.625, + "learning_rate": 2.4009002489874256e-08, + "loss": 0.99652548, + "memory(GiB)": 302.58, + "step": 347020, + "train_speed(iter/s)": 0.123446 + }, + { + "acc": 0.75701356, + "epoch": 1.9408120551361976, + "grad_norm": 6.34375, + "learning_rate": 2.3918577232678873e-08, + "loss": 0.93931227, + "memory(GiB)": 302.58, + "step": 347040, + "train_speed(iter/s)": 0.12345 + }, + { + "acc": 0.76257505, + "epoch": 1.940923904609177, + "grad_norm": 5.84375, + "learning_rate": 2.3828322173247153e-08, + "loss": 0.92404175, + "memory(GiB)": 302.58, + "step": 347060, + "train_speed(iter/s)": 0.123453 + }, + { + "acc": 0.73567672, + "epoch": 1.9410357540821561, + "grad_norm": 7.65625, + "learning_rate": 2.3738237314665514e-08, + "loss": 1.04800091, + "memory(GiB)": 302.58, + "step": 347080, + "train_speed(iter/s)": 0.123457 + }, + { + "acc": 0.74147005, + "epoch": 1.9411476035551356, + "grad_norm": 9.0625, + "learning_rate": 2.3648322660015376e-08, + "loss": 1.00471067, + "memory(GiB)": 302.58, + "step": 347100, + "train_speed(iter/s)": 0.12346 + }, + { + "acc": 0.77025576, + "epoch": 1.9412594530281146, + "grad_norm": 7.875, + "learning_rate": 2.3558578212372617e-08, + "loss": 0.89688988, + "memory(GiB)": 302.58, + "step": 347120, + "train_speed(iter/s)": 0.123463 + }, + { + "acc": 0.75657392, + "epoch": 1.9413713025010941, + "grad_norm": 8.25, + "learning_rate": 2.346900397480645e-08, + "loss": 0.94167395, + "memory(GiB)": 302.58, + "step": 347140, + "train_speed(iter/s)": 0.123467 + }, + { + "acc": 0.75255933, + "epoch": 1.9414831519740732, + "grad_norm": 5.96875, + "learning_rate": 2.3379599950379973e-08, + "loss": 0.97339058, + "memory(GiB)": 302.58, + "step": 347160, + "train_speed(iter/s)": 0.12347 + }, + { + "acc": 0.73424959, + "epoch": 1.9415950014470527, + "grad_norm": 5.6875, + "learning_rate": 2.3290366142151855e-08, + "loss": 1.06895504, + "memory(GiB)": 302.58, + "step": 347180, + "train_speed(iter/s)": 0.123473 + }, + { + "acc": 0.7641151, + "epoch": 1.9417068509200317, + "grad_norm": 7.90625, + "learning_rate": 2.3201302553174097e-08, + "loss": 0.91018286, + "memory(GiB)": 302.58, + "step": 347200, + "train_speed(iter/s)": 0.123477 + }, + { + "acc": 0.74900212, + "epoch": 1.9418187003930112, + "grad_norm": 7.1875, + "learning_rate": 2.31124091864926e-08, + "loss": 1.0044857, + "memory(GiB)": 302.58, + "step": 347220, + "train_speed(iter/s)": 0.12348 + }, + { + "acc": 0.75439405, + "epoch": 1.9419305498659902, + "grad_norm": 5.1875, + "learning_rate": 2.3023686045147707e-08, + "loss": 0.98589935, + "memory(GiB)": 302.58, + "step": 347240, + "train_speed(iter/s)": 0.123483 + }, + { + "acc": 0.73756657, + "epoch": 1.9420423993389697, + "grad_norm": 7.5625, + "learning_rate": 2.293513313217477e-08, + "loss": 1.04090443, + "memory(GiB)": 302.58, + "step": 347260, + "train_speed(iter/s)": 0.123486 + }, + { + "acc": 0.73884091, + "epoch": 1.9421542488119488, + "grad_norm": 7.03125, + "learning_rate": 2.2846750450601916e-08, + "loss": 1.04669971, + "memory(GiB)": 302.58, + "step": 347280, + "train_speed(iter/s)": 0.12349 + }, + { + "acc": 0.73269272, + "epoch": 1.9422660982849282, + "grad_norm": 9.3125, + "learning_rate": 2.2758538003452292e-08, + "loss": 1.0363307, + "memory(GiB)": 302.58, + "step": 347300, + "train_speed(iter/s)": 0.123493 + }, + { + "acc": 0.75886431, + "epoch": 1.9423779477579073, + "grad_norm": 6.625, + "learning_rate": 2.2670495793742918e-08, + "loss": 0.96150694, + "memory(GiB)": 302.58, + "step": 347320, + "train_speed(iter/s)": 0.123496 + }, + { + "acc": 0.73848553, + "epoch": 1.9424897972308868, + "grad_norm": 7.75, + "learning_rate": 2.2582623824485284e-08, + "loss": 1.0366868, + "memory(GiB)": 302.58, + "step": 347340, + "train_speed(iter/s)": 0.1235 + }, + { + "acc": 0.75662551, + "epoch": 1.9426016467038658, + "grad_norm": 5.15625, + "learning_rate": 2.2494922098685313e-08, + "loss": 0.96145372, + "memory(GiB)": 302.58, + "step": 347360, + "train_speed(iter/s)": 0.123503 + }, + { + "acc": 0.74528608, + "epoch": 1.9427134961768453, + "grad_norm": 6.25, + "learning_rate": 2.2407390619341718e-08, + "loss": 0.99542418, + "memory(GiB)": 302.58, + "step": 347380, + "train_speed(iter/s)": 0.123506 + }, + { + "acc": 0.75022693, + "epoch": 1.9428253456498243, + "grad_norm": 9.125, + "learning_rate": 2.2320029389449328e-08, + "loss": 0.98110275, + "memory(GiB)": 302.58, + "step": 347400, + "train_speed(iter/s)": 0.12351 + }, + { + "acc": 0.73774891, + "epoch": 1.9429371951228038, + "grad_norm": 7.71875, + "learning_rate": 2.223283841199575e-08, + "loss": 1.03120337, + "memory(GiB)": 302.58, + "step": 347420, + "train_speed(iter/s)": 0.123513 + }, + { + "acc": 0.75091219, + "epoch": 1.9430490445957829, + "grad_norm": 7.1875, + "learning_rate": 2.2145817689963045e-08, + "loss": 0.99467306, + "memory(GiB)": 302.58, + "step": 347440, + "train_speed(iter/s)": 0.123517 + }, + { + "acc": 0.77238798, + "epoch": 1.9431608940687624, + "grad_norm": 7.96875, + "learning_rate": 2.205896722632772e-08, + "loss": 0.88982267, + "memory(GiB)": 302.58, + "step": 347460, + "train_speed(iter/s)": 0.12352 + }, + { + "acc": 0.74893398, + "epoch": 1.9432727435417414, + "grad_norm": 7.9375, + "learning_rate": 2.1972287024060178e-08, + "loss": 0.99624424, + "memory(GiB)": 302.58, + "step": 347480, + "train_speed(iter/s)": 0.123524 + }, + { + "acc": 0.7691731, + "epoch": 1.9433845930147209, + "grad_norm": 6.59375, + "learning_rate": 2.188577708612527e-08, + "loss": 0.90025396, + "memory(GiB)": 302.58, + "step": 347500, + "train_speed(iter/s)": 0.123527 + }, + { + "acc": 0.73602271, + "epoch": 1.9434964424877, + "grad_norm": 6.6875, + "learning_rate": 2.1799437415482293e-08, + "loss": 1.05585966, + "memory(GiB)": 302.58, + "step": 347520, + "train_speed(iter/s)": 0.12353 + }, + { + "acc": 0.74456863, + "epoch": 1.9436082919606794, + "grad_norm": 5.0625, + "learning_rate": 2.1713268015083887e-08, + "loss": 1.02410135, + "memory(GiB)": 302.58, + "step": 347540, + "train_speed(iter/s)": 0.123534 + }, + { + "acc": 0.75075421, + "epoch": 1.9437201414336585, + "grad_norm": 4.5625, + "learning_rate": 2.162726888787714e-08, + "loss": 0.96875525, + "memory(GiB)": 302.58, + "step": 347560, + "train_speed(iter/s)": 0.123537 + }, + { + "acc": 0.76922989, + "epoch": 1.943831990906638, + "grad_norm": 5.34375, + "learning_rate": 2.154144003680414e-08, + "loss": 0.89400949, + "memory(GiB)": 302.58, + "step": 347580, + "train_speed(iter/s)": 0.12354 + }, + { + "acc": 0.74524841, + "epoch": 1.943943840379617, + "grad_norm": 6.875, + "learning_rate": 2.1455781464800317e-08, + "loss": 1.01398888, + "memory(GiB)": 302.58, + "step": 347600, + "train_speed(iter/s)": 0.123544 + }, + { + "acc": 0.74953642, + "epoch": 1.9440556898525965, + "grad_norm": 7.03125, + "learning_rate": 2.1370293174795e-08, + "loss": 0.97953691, + "memory(GiB)": 302.58, + "step": 347620, + "train_speed(iter/s)": 0.123547 + }, + { + "acc": 0.74434886, + "epoch": 1.9441675393255755, + "grad_norm": 7.625, + "learning_rate": 2.1284975169712508e-08, + "loss": 1.00019989, + "memory(GiB)": 302.58, + "step": 347640, + "train_speed(iter/s)": 0.123551 + }, + { + "acc": 0.73651409, + "epoch": 1.944279388798555, + "grad_norm": 6.84375, + "learning_rate": 2.119982745247051e-08, + "loss": 1.04165268, + "memory(GiB)": 302.58, + "step": 347660, + "train_speed(iter/s)": 0.123554 + }, + { + "acc": 0.75091085, + "epoch": 1.944391238271534, + "grad_norm": 5.90625, + "learning_rate": 2.1114850025982235e-08, + "loss": 0.97737799, + "memory(GiB)": 302.58, + "step": 347680, + "train_speed(iter/s)": 0.123557 + }, + { + "acc": 0.76589131, + "epoch": 1.9445030877445135, + "grad_norm": 5.15625, + "learning_rate": 2.103004289315369e-08, + "loss": 0.92344551, + "memory(GiB)": 302.58, + "step": 347700, + "train_speed(iter/s)": 0.12356 + }, + { + "acc": 0.75149174, + "epoch": 1.9446149372174926, + "grad_norm": 6.53125, + "learning_rate": 2.0945406056885886e-08, + "loss": 0.98071012, + "memory(GiB)": 302.58, + "step": 347720, + "train_speed(iter/s)": 0.123564 + }, + { + "acc": 0.73937354, + "epoch": 1.944726786690472, + "grad_norm": 7.84375, + "learning_rate": 2.086093952007262e-08, + "loss": 1.03152456, + "memory(GiB)": 302.58, + "step": 347740, + "train_speed(iter/s)": 0.123567 + }, + { + "acc": 0.74879136, + "epoch": 1.944838636163451, + "grad_norm": 5.78125, + "learning_rate": 2.077664328560436e-08, + "loss": 0.99783354, + "memory(GiB)": 302.58, + "step": 347760, + "train_speed(iter/s)": 0.123571 + }, + { + "acc": 0.74693804, + "epoch": 1.9449504856364306, + "grad_norm": 8.875, + "learning_rate": 2.069251735636324e-08, + "loss": 0.9921092, + "memory(GiB)": 302.58, + "step": 347780, + "train_speed(iter/s)": 0.123574 + }, + { + "acc": 0.74326301, + "epoch": 1.9450623351094096, + "grad_norm": 7.53125, + "learning_rate": 2.0608561735226962e-08, + "loss": 1.00719719, + "memory(GiB)": 302.58, + "step": 347800, + "train_speed(iter/s)": 0.123577 + }, + { + "acc": 0.76322913, + "epoch": 1.945174184582389, + "grad_norm": 7.53125, + "learning_rate": 2.0524776425067118e-08, + "loss": 0.91983824, + "memory(GiB)": 302.58, + "step": 347820, + "train_speed(iter/s)": 0.123581 + }, + { + "acc": 0.75310349, + "epoch": 1.9452860340553682, + "grad_norm": 6.125, + "learning_rate": 2.044116142874919e-08, + "loss": 0.9724802, + "memory(GiB)": 302.58, + "step": 347840, + "train_speed(iter/s)": 0.123584 + }, + { + "acc": 0.74635367, + "epoch": 1.9453978835283476, + "grad_norm": 6.78125, + "learning_rate": 2.035771674913367e-08, + "loss": 1.00581522, + "memory(GiB)": 302.58, + "step": 347860, + "train_speed(iter/s)": 0.123587 + }, + { + "acc": 0.7313849, + "epoch": 1.9455097330013267, + "grad_norm": 6.09375, + "learning_rate": 2.0274442389073833e-08, + "loss": 1.06865845, + "memory(GiB)": 302.58, + "step": 347880, + "train_speed(iter/s)": 0.12359 + }, + { + "acc": 0.75537262, + "epoch": 1.9456215824743062, + "grad_norm": 7.9375, + "learning_rate": 2.0191338351419065e-08, + "loss": 0.95035639, + "memory(GiB)": 302.58, + "step": 347900, + "train_speed(iter/s)": 0.123594 + }, + { + "acc": 0.75697346, + "epoch": 1.9457334319472852, + "grad_norm": 6.6875, + "learning_rate": 2.0108404639010427e-08, + "loss": 0.95407486, + "memory(GiB)": 302.58, + "step": 347920, + "train_speed(iter/s)": 0.123597 + }, + { + "acc": 0.757232, + "epoch": 1.9458452814202647, + "grad_norm": 8.0, + "learning_rate": 2.002564125468509e-08, + "loss": 0.95053234, + "memory(GiB)": 302.58, + "step": 347940, + "train_speed(iter/s)": 0.123601 + }, + { + "acc": 0.75780711, + "epoch": 1.9459571308932437, + "grad_norm": 9.3125, + "learning_rate": 1.994304820127413e-08, + "loss": 0.94704571, + "memory(GiB)": 302.58, + "step": 347960, + "train_speed(iter/s)": 0.123604 + }, + { + "acc": 0.75767884, + "epoch": 1.9460689803662232, + "grad_norm": 6.125, + "learning_rate": 1.9860625481602504e-08, + "loss": 0.94739714, + "memory(GiB)": 302.58, + "step": 347980, + "train_speed(iter/s)": 0.123608 + }, + { + "acc": 0.74195118, + "epoch": 1.9461808298392023, + "grad_norm": 5.8125, + "learning_rate": 1.977837309848851e-08, + "loss": 1.02496166, + "memory(GiB)": 302.58, + "step": 348000, + "train_speed(iter/s)": 0.123611 + }, + { + "epoch": 1.9461808298392023, + "eval_acc": 0.7068898371618696, + "eval_loss": 1.0117835998535156, + "eval_runtime": 7739.668, + "eval_samples_per_second": 9.727, + "eval_steps_per_second": 9.727, + "step": 348000 + }, + { + "acc": 0.7479073, + "epoch": 1.9462926793121818, + "grad_norm": 7.28125, + "learning_rate": 1.9696291054746575e-08, + "loss": 0.99228172, + "memory(GiB)": 302.58, + "step": 348020, + "train_speed(iter/s)": 0.123269 + }, + { + "acc": 0.73534055, + "epoch": 1.9464045287851608, + "grad_norm": 7.75, + "learning_rate": 1.9614379353183332e-08, + "loss": 1.033955, + "memory(GiB)": 302.58, + "step": 348040, + "train_speed(iter/s)": 0.123272 + }, + { + "acc": 0.74268508, + "epoch": 1.9465163782581403, + "grad_norm": 8.0625, + "learning_rate": 1.953263799660099e-08, + "loss": 0.98642216, + "memory(GiB)": 302.58, + "step": 348060, + "train_speed(iter/s)": 0.123276 + }, + { + "acc": 0.75597043, + "epoch": 1.9466282277311193, + "grad_norm": 6.5625, + "learning_rate": 1.945106698779453e-08, + "loss": 0.96220112, + "memory(GiB)": 302.58, + "step": 348080, + "train_speed(iter/s)": 0.123279 + }, + { + "acc": 0.74433427, + "epoch": 1.9467400772040988, + "grad_norm": 6.28125, + "learning_rate": 1.936966632955506e-08, + "loss": 0.98891287, + "memory(GiB)": 302.58, + "step": 348100, + "train_speed(iter/s)": 0.123283 + }, + { + "acc": 0.73877602, + "epoch": 1.9468519266770778, + "grad_norm": 6.9375, + "learning_rate": 1.92884360246659e-08, + "loss": 1.05418501, + "memory(GiB)": 302.58, + "step": 348120, + "train_speed(iter/s)": 0.123286 + }, + { + "acc": 0.73599176, + "epoch": 1.9469637761500573, + "grad_norm": 7.5625, + "learning_rate": 1.9207376075905394e-08, + "loss": 1.04182501, + "memory(GiB)": 302.58, + "step": 348140, + "train_speed(iter/s)": 0.123289 + }, + { + "acc": 0.76317663, + "epoch": 1.9470756256230364, + "grad_norm": 5.84375, + "learning_rate": 1.9126486486046313e-08, + "loss": 0.93141937, + "memory(GiB)": 302.58, + "step": 348160, + "train_speed(iter/s)": 0.123293 + }, + { + "acc": 0.74399624, + "epoch": 1.9471874750960159, + "grad_norm": 6.59375, + "learning_rate": 1.9045767257855896e-08, + "loss": 1.01843615, + "memory(GiB)": 302.58, + "step": 348180, + "train_speed(iter/s)": 0.123296 + }, + { + "acc": 0.73812628, + "epoch": 1.947299324568995, + "grad_norm": 6.3125, + "learning_rate": 1.8965218394093598e-08, + "loss": 1.02302303, + "memory(GiB)": 302.58, + "step": 348200, + "train_speed(iter/s)": 0.1233 + }, + { + "acc": 0.75156999, + "epoch": 1.9474111740419744, + "grad_norm": 8.75, + "learning_rate": 1.888483989751555e-08, + "loss": 0.9593401, + "memory(GiB)": 302.58, + "step": 348220, + "train_speed(iter/s)": 0.123303 + }, + { + "acc": 0.72786012, + "epoch": 1.9475230235149534, + "grad_norm": 8.0, + "learning_rate": 1.880463177087122e-08, + "loss": 1.09141159, + "memory(GiB)": 302.58, + "step": 348240, + "train_speed(iter/s)": 0.123307 + }, + { + "acc": 0.77024302, + "epoch": 1.947634872987933, + "grad_norm": 5.875, + "learning_rate": 1.8724594016902854e-08, + "loss": 0.89368505, + "memory(GiB)": 302.58, + "step": 348260, + "train_speed(iter/s)": 0.12331 + }, + { + "acc": 0.76827445, + "epoch": 1.947746722460912, + "grad_norm": 8.625, + "learning_rate": 1.864472663834882e-08, + "loss": 0.91416435, + "memory(GiB)": 302.58, + "step": 348280, + "train_speed(iter/s)": 0.123313 + }, + { + "acc": 0.75974894, + "epoch": 1.9478585719338914, + "grad_norm": 7.1875, + "learning_rate": 1.8565029637940267e-08, + "loss": 0.9269043, + "memory(GiB)": 302.58, + "step": 348300, + "train_speed(iter/s)": 0.123317 + }, + { + "acc": 0.74530177, + "epoch": 1.9479704214068705, + "grad_norm": 6.8125, + "learning_rate": 1.8485503018403906e-08, + "loss": 1.00186338, + "memory(GiB)": 302.58, + "step": 348320, + "train_speed(iter/s)": 0.12332 + }, + { + "acc": 0.76646495, + "epoch": 1.94808227087985, + "grad_norm": 9.8125, + "learning_rate": 1.8406146782458666e-08, + "loss": 0.9182991, + "memory(GiB)": 302.58, + "step": 348340, + "train_speed(iter/s)": 0.123324 + }, + { + "acc": 0.75521312, + "epoch": 1.948194120352829, + "grad_norm": 5.65625, + "learning_rate": 1.8326960932819603e-08, + "loss": 0.99390984, + "memory(GiB)": 302.58, + "step": 348360, + "train_speed(iter/s)": 0.123327 + }, + { + "acc": 0.73758178, + "epoch": 1.9483059698258085, + "grad_norm": 8.1875, + "learning_rate": 1.8247945472195104e-08, + "loss": 1.03835268, + "memory(GiB)": 302.58, + "step": 348380, + "train_speed(iter/s)": 0.12333 + }, + { + "acc": 0.74321008, + "epoch": 1.9484178192987875, + "grad_norm": 5.71875, + "learning_rate": 1.8169100403288008e-08, + "loss": 1.0080512, + "memory(GiB)": 302.58, + "step": 348400, + "train_speed(iter/s)": 0.123334 + }, + { + "acc": 0.74941545, + "epoch": 1.948529668771767, + "grad_norm": 6.1875, + "learning_rate": 1.8090425728793936e-08, + "loss": 0.99054966, + "memory(GiB)": 302.58, + "step": 348420, + "train_speed(iter/s)": 0.123337 + }, + { + "acc": 0.75270443, + "epoch": 1.948641518244746, + "grad_norm": 9.25, + "learning_rate": 1.8011921451404624e-08, + "loss": 0.96132212, + "memory(GiB)": 302.58, + "step": 348440, + "train_speed(iter/s)": 0.12334 + }, + { + "acc": 0.7275857, + "epoch": 1.9487533677177256, + "grad_norm": 5.4375, + "learning_rate": 1.7933587573805143e-08, + "loss": 1.09086609, + "memory(GiB)": 302.58, + "step": 348460, + "train_speed(iter/s)": 0.123344 + }, + { + "acc": 0.77353334, + "epoch": 1.9488652171907046, + "grad_norm": 6.9375, + "learning_rate": 1.7855424098675023e-08, + "loss": 0.88740473, + "memory(GiB)": 302.58, + "step": 348480, + "train_speed(iter/s)": 0.123347 + }, + { + "acc": 0.74524231, + "epoch": 1.948977066663684, + "grad_norm": 8.8125, + "learning_rate": 1.7777431028686563e-08, + "loss": 0.9970952, + "memory(GiB)": 302.58, + "step": 348500, + "train_speed(iter/s)": 0.123351 + }, + { + "acc": 0.74762468, + "epoch": 1.9490889161366631, + "grad_norm": 6.09375, + "learning_rate": 1.7699608366508748e-08, + "loss": 0.99726305, + "memory(GiB)": 302.58, + "step": 348520, + "train_speed(iter/s)": 0.123354 + }, + { + "acc": 0.7424963, + "epoch": 1.9492007656096426, + "grad_norm": 6.4375, + "learning_rate": 1.7621956114802772e-08, + "loss": 1.02410212, + "memory(GiB)": 302.58, + "step": 348540, + "train_speed(iter/s)": 0.123358 + }, + { + "acc": 0.75555167, + "epoch": 1.9493126150826217, + "grad_norm": 5.90625, + "learning_rate": 1.75444742762243e-08, + "loss": 0.96381731, + "memory(GiB)": 302.58, + "step": 348560, + "train_speed(iter/s)": 0.123361 + }, + { + "acc": 0.76174645, + "epoch": 1.9494244645556011, + "grad_norm": 7.65625, + "learning_rate": 1.746716285342398e-08, + "loss": 0.92760658, + "memory(GiB)": 302.58, + "step": 348580, + "train_speed(iter/s)": 0.123364 + }, + { + "acc": 0.74234424, + "epoch": 1.9495363140285802, + "grad_norm": 6.59375, + "learning_rate": 1.7390021849045814e-08, + "loss": 1.03011274, + "memory(GiB)": 302.58, + "step": 348600, + "train_speed(iter/s)": 0.123367 + }, + { + "acc": 0.75051837, + "epoch": 1.9496481635015597, + "grad_norm": 8.5, + "learning_rate": 1.7313051265728242e-08, + "loss": 0.95924721, + "memory(GiB)": 302.58, + "step": 348620, + "train_speed(iter/s)": 0.12337 + }, + { + "acc": 0.76583967, + "epoch": 1.9497600129745387, + "grad_norm": 8.0625, + "learning_rate": 1.723625110610416e-08, + "loss": 0.90996714, + "memory(GiB)": 302.58, + "step": 348640, + "train_speed(iter/s)": 0.123374 + }, + { + "acc": 0.75088511, + "epoch": 1.9498718624475182, + "grad_norm": 6.5625, + "learning_rate": 1.715962137280036e-08, + "loss": 0.99612904, + "memory(GiB)": 302.58, + "step": 348660, + "train_speed(iter/s)": 0.123377 + }, + { + "acc": 0.76012063, + "epoch": 1.9499837119204972, + "grad_norm": 8.3125, + "learning_rate": 1.708316206843752e-08, + "loss": 0.95762539, + "memory(GiB)": 302.58, + "step": 348680, + "train_speed(iter/s)": 0.12338 + }, + { + "acc": 0.753894, + "epoch": 1.9500955613934767, + "grad_norm": 4.9375, + "learning_rate": 1.700687319563077e-08, + "loss": 0.98647747, + "memory(GiB)": 302.58, + "step": 348700, + "train_speed(iter/s)": 0.123383 + }, + { + "acc": 0.75851703, + "epoch": 1.9502074108664558, + "grad_norm": 7.03125, + "learning_rate": 1.6930754756989688e-08, + "loss": 0.92121553, + "memory(GiB)": 302.58, + "step": 348720, + "train_speed(iter/s)": 0.123387 + }, + { + "acc": 0.74255853, + "epoch": 1.9503192603394353, + "grad_norm": 8.25, + "learning_rate": 1.6854806755117748e-08, + "loss": 1.02773752, + "memory(GiB)": 302.58, + "step": 348740, + "train_speed(iter/s)": 0.12339 + }, + { + "acc": 0.75156941, + "epoch": 1.9504311098124143, + "grad_norm": 10.3125, + "learning_rate": 1.6779029192612318e-08, + "loss": 0.96968927, + "memory(GiB)": 302.58, + "step": 348760, + "train_speed(iter/s)": 0.123394 + }, + { + "acc": 0.7479733, + "epoch": 1.9505429592853938, + "grad_norm": 8.0, + "learning_rate": 1.6703422072065768e-08, + "loss": 0.99576292, + "memory(GiB)": 302.58, + "step": 348780, + "train_speed(iter/s)": 0.123397 + }, + { + "acc": 0.74836912, + "epoch": 1.9506548087583728, + "grad_norm": 5.625, + "learning_rate": 1.662798539606325e-08, + "loss": 1.01030703, + "memory(GiB)": 302.58, + "step": 348800, + "train_speed(iter/s)": 0.123401 + }, + { + "acc": 0.76031089, + "epoch": 1.9507666582313523, + "grad_norm": 8.0625, + "learning_rate": 1.655271916718604e-08, + "loss": 0.92522926, + "memory(GiB)": 302.58, + "step": 348820, + "train_speed(iter/s)": 0.123404 + }, + { + "acc": 0.74979949, + "epoch": 1.9508785077043314, + "grad_norm": 6.5625, + "learning_rate": 1.6477623388007624e-08, + "loss": 0.9698946, + "memory(GiB)": 302.58, + "step": 348840, + "train_speed(iter/s)": 0.123407 + }, + { + "acc": 0.75846577, + "epoch": 1.9509903571773108, + "grad_norm": 9.6875, + "learning_rate": 1.6402698061097067e-08, + "loss": 0.93619442, + "memory(GiB)": 302.58, + "step": 348860, + "train_speed(iter/s)": 0.12341 + }, + { + "acc": 0.74851565, + "epoch": 1.9511022066502899, + "grad_norm": 7.8125, + "learning_rate": 1.632794318901676e-08, + "loss": 1.00065708, + "memory(GiB)": 302.58, + "step": 348880, + "train_speed(iter/s)": 0.123414 + }, + { + "acc": 0.74722486, + "epoch": 1.9512140561232694, + "grad_norm": 9.875, + "learning_rate": 1.6253358774322992e-08, + "loss": 1.00190334, + "memory(GiB)": 302.58, + "step": 348900, + "train_speed(iter/s)": 0.123417 + }, + { + "acc": 0.76393561, + "epoch": 1.9513259055962484, + "grad_norm": 6.1875, + "learning_rate": 1.6178944819568166e-08, + "loss": 0.91681223, + "memory(GiB)": 302.58, + "step": 348920, + "train_speed(iter/s)": 0.12342 + }, + { + "acc": 0.76354074, + "epoch": 1.951437755069228, + "grad_norm": 6.84375, + "learning_rate": 1.6104701327296358e-08, + "loss": 0.9219388, + "memory(GiB)": 302.58, + "step": 348940, + "train_speed(iter/s)": 0.123424 + }, + { + "acc": 0.75264544, + "epoch": 1.951549604542207, + "grad_norm": 7.625, + "learning_rate": 1.60306283000472e-08, + "loss": 0.98584547, + "memory(GiB)": 302.58, + "step": 348960, + "train_speed(iter/s)": 0.123427 + }, + { + "acc": 0.725453, + "epoch": 1.9516614540151864, + "grad_norm": 5.71875, + "learning_rate": 1.5956725740354783e-08, + "loss": 1.10878582, + "memory(GiB)": 302.58, + "step": 348980, + "train_speed(iter/s)": 0.123431 + }, + { + "acc": 0.763587, + "epoch": 1.9517733034881655, + "grad_norm": 6.625, + "learning_rate": 1.588299365074597e-08, + "loss": 0.92752934, + "memory(GiB)": 302.58, + "step": 349000, + "train_speed(iter/s)": 0.123434 + }, + { + "acc": 0.7375104, + "epoch": 1.951885152961145, + "grad_norm": 7.53125, + "learning_rate": 1.5809432033742634e-08, + "loss": 1.02214022, + "memory(GiB)": 302.58, + "step": 349020, + "train_speed(iter/s)": 0.123437 + }, + { + "acc": 0.75508351, + "epoch": 1.951997002434124, + "grad_norm": 9.25, + "learning_rate": 1.573604089186165e-08, + "loss": 0.97217026, + "memory(GiB)": 302.58, + "step": 349040, + "train_speed(iter/s)": 0.123441 + }, + { + "acc": 0.75954213, + "epoch": 1.9521088519071035, + "grad_norm": 5.5625, + "learning_rate": 1.5662820227612674e-08, + "loss": 0.94868317, + "memory(GiB)": 302.58, + "step": 349060, + "train_speed(iter/s)": 0.123444 + }, + { + "acc": 0.73218589, + "epoch": 1.9522207013800825, + "grad_norm": 5.6875, + "learning_rate": 1.558977004349982e-08, + "loss": 1.07471294, + "memory(GiB)": 302.58, + "step": 349080, + "train_speed(iter/s)": 0.123447 + }, + { + "acc": 0.74866624, + "epoch": 1.952332550853062, + "grad_norm": 10.125, + "learning_rate": 1.5516890342022194e-08, + "loss": 0.99226723, + "memory(GiB)": 302.58, + "step": 349100, + "train_speed(iter/s)": 0.123451 + }, + { + "acc": 0.76090884, + "epoch": 1.952444400326041, + "grad_norm": 9.25, + "learning_rate": 1.5444181125672254e-08, + "loss": 0.95310879, + "memory(GiB)": 302.58, + "step": 349120, + "train_speed(iter/s)": 0.123454 + }, + { + "acc": 0.75228672, + "epoch": 1.9525562497990205, + "grad_norm": 7.75, + "learning_rate": 1.5371642396936893e-08, + "loss": 0.95034285, + "memory(GiB)": 302.58, + "step": 349140, + "train_speed(iter/s)": 0.123458 + }, + { + "acc": 0.75171509, + "epoch": 1.9526680992719996, + "grad_norm": 7.96875, + "learning_rate": 1.5299274158297463e-08, + "loss": 0.97982063, + "memory(GiB)": 302.58, + "step": 349160, + "train_speed(iter/s)": 0.123462 + }, + { + "acc": 0.75183935, + "epoch": 1.952779948744979, + "grad_norm": 6.125, + "learning_rate": 1.5227076412228647e-08, + "loss": 0.95994234, + "memory(GiB)": 302.58, + "step": 349180, + "train_speed(iter/s)": 0.123465 + }, + { + "acc": 0.74847779, + "epoch": 1.952891798217958, + "grad_norm": 5.21875, + "learning_rate": 1.5155049161200142e-08, + "loss": 0.98692036, + "memory(GiB)": 302.58, + "step": 349200, + "train_speed(iter/s)": 0.123468 + }, + { + "acc": 0.744169, + "epoch": 1.9530036476909376, + "grad_norm": 8.625, + "learning_rate": 1.508319240767553e-08, + "loss": 1.00377798, + "memory(GiB)": 302.58, + "step": 349220, + "train_speed(iter/s)": 0.123472 + }, + { + "acc": 0.75012646, + "epoch": 1.9531154971639166, + "grad_norm": 15.75, + "learning_rate": 1.5011506154112288e-08, + "loss": 1.0180604, + "memory(GiB)": 302.58, + "step": 349240, + "train_speed(iter/s)": 0.123475 + }, + { + "acc": 0.76092024, + "epoch": 1.9532273466368961, + "grad_norm": 7.59375, + "learning_rate": 1.4939990402962346e-08, + "loss": 0.94530182, + "memory(GiB)": 302.58, + "step": 349260, + "train_speed(iter/s)": 0.123479 + }, + { + "acc": 0.77221327, + "epoch": 1.9533391961098752, + "grad_norm": 6.71875, + "learning_rate": 1.4868645156672634e-08, + "loss": 0.87662392, + "memory(GiB)": 302.58, + "step": 349280, + "train_speed(iter/s)": 0.123482 + }, + { + "acc": 0.76031413, + "epoch": 1.9534510455828547, + "grad_norm": 5.09375, + "learning_rate": 1.4797470417682314e-08, + "loss": 0.9432416, + "memory(GiB)": 302.58, + "step": 349300, + "train_speed(iter/s)": 0.123485 + }, + { + "acc": 0.73580141, + "epoch": 1.9535628950558337, + "grad_norm": 7.65625, + "learning_rate": 1.4726466188426102e-08, + "loss": 1.04021378, + "memory(GiB)": 302.58, + "step": 349320, + "train_speed(iter/s)": 0.123489 + }, + { + "acc": 0.772859, + "epoch": 1.9536747445288132, + "grad_norm": 7.46875, + "learning_rate": 1.4655632471332614e-08, + "loss": 0.87406836, + "memory(GiB)": 302.58, + "step": 349340, + "train_speed(iter/s)": 0.123492 + }, + { + "acc": 0.76199245, + "epoch": 1.9537865940017924, + "grad_norm": 6.96875, + "learning_rate": 1.4584969268824357e-08, + "loss": 0.92341194, + "memory(GiB)": 302.58, + "step": 349360, + "train_speed(iter/s)": 0.123496 + }, + { + "acc": 0.74408851, + "epoch": 1.9538984434747717, + "grad_norm": 5.8125, + "learning_rate": 1.4514476583318837e-08, + "loss": 1.01227894, + "memory(GiB)": 302.58, + "step": 349380, + "train_speed(iter/s)": 0.123499 + }, + { + "acc": 0.75446005, + "epoch": 1.954010292947751, + "grad_norm": 7.09375, + "learning_rate": 1.4444154417226352e-08, + "loss": 0.98395748, + "memory(GiB)": 302.58, + "step": 349400, + "train_speed(iter/s)": 0.123503 + }, + { + "acc": 0.74405675, + "epoch": 1.9541221424207302, + "grad_norm": 5.71875, + "learning_rate": 1.4374002772953311e-08, + "loss": 1.00232639, + "memory(GiB)": 302.58, + "step": 349420, + "train_speed(iter/s)": 0.123506 + }, + { + "acc": 0.75028687, + "epoch": 1.9542339918937095, + "grad_norm": 6.625, + "learning_rate": 1.4304021652897792e-08, + "loss": 0.97762423, + "memory(GiB)": 302.58, + "step": 349440, + "train_speed(iter/s)": 0.123509 + }, + { + "acc": 0.76031752, + "epoch": 1.9543458413666888, + "grad_norm": 7.34375, + "learning_rate": 1.4234211059453995e-08, + "loss": 0.91224575, + "memory(GiB)": 302.58, + "step": 349460, + "train_speed(iter/s)": 0.123513 + }, + { + "acc": 0.74641185, + "epoch": 1.954457690839668, + "grad_norm": 8.8125, + "learning_rate": 1.4164570995010007e-08, + "loss": 1.01953964, + "memory(GiB)": 302.58, + "step": 349480, + "train_speed(iter/s)": 0.123516 + }, + { + "acc": 0.75695381, + "epoch": 1.9545695403126473, + "grad_norm": 7.4375, + "learning_rate": 1.4095101461946703e-08, + "loss": 0.96327286, + "memory(GiB)": 302.58, + "step": 349500, + "train_speed(iter/s)": 0.123519 + }, + { + "acc": 0.73879595, + "epoch": 1.9546813897856266, + "grad_norm": 5.90625, + "learning_rate": 1.402580246264107e-08, + "loss": 1.03595638, + "memory(GiB)": 302.58, + "step": 349520, + "train_speed(iter/s)": 0.123522 + }, + { + "acc": 0.7547442, + "epoch": 1.9547932392586058, + "grad_norm": 7.34375, + "learning_rate": 1.3956673999463432e-08, + "loss": 0.97084751, + "memory(GiB)": 302.58, + "step": 349540, + "train_speed(iter/s)": 0.123525 + }, + { + "acc": 0.74632807, + "epoch": 1.954905088731585, + "grad_norm": 9.0, + "learning_rate": 1.3887716074777457e-08, + "loss": 0.99415035, + "memory(GiB)": 302.58, + "step": 349560, + "train_speed(iter/s)": 0.123529 + }, + { + "acc": 0.75722175, + "epoch": 1.9550169382045643, + "grad_norm": 7.90625, + "learning_rate": 1.3818928690942367e-08, + "loss": 0.92557344, + "memory(GiB)": 302.58, + "step": 349580, + "train_speed(iter/s)": 0.123532 + }, + { + "acc": 0.73709049, + "epoch": 1.9551287876775436, + "grad_norm": 8.25, + "learning_rate": 1.3750311850310727e-08, + "loss": 1.05271139, + "memory(GiB)": 302.58, + "step": 349600, + "train_speed(iter/s)": 0.123536 + }, + { + "acc": 0.75256181, + "epoch": 1.9552406371505229, + "grad_norm": 7.8125, + "learning_rate": 1.368186555522899e-08, + "loss": 0.96711264, + "memory(GiB)": 302.58, + "step": 349620, + "train_speed(iter/s)": 0.123539 + }, + { + "acc": 0.77368021, + "epoch": 1.9553524866235021, + "grad_norm": 7.46875, + "learning_rate": 1.361358980803862e-08, + "loss": 0.89088764, + "memory(GiB)": 302.58, + "step": 349640, + "train_speed(iter/s)": 0.123542 + }, + { + "acc": 0.75512414, + "epoch": 1.9554643360964814, + "grad_norm": 5.3125, + "learning_rate": 1.3545484611074966e-08, + "loss": 0.96239452, + "memory(GiB)": 302.58, + "step": 349660, + "train_speed(iter/s)": 0.123545 + }, + { + "acc": 0.74661374, + "epoch": 1.9555761855694607, + "grad_norm": 5.46875, + "learning_rate": 1.3477549966667836e-08, + "loss": 0.98697577, + "memory(GiB)": 302.58, + "step": 349680, + "train_speed(iter/s)": 0.123549 + }, + { + "acc": 0.74541373, + "epoch": 1.95568803504244, + "grad_norm": 8.4375, + "learning_rate": 1.3409785877139813e-08, + "loss": 1.00344009, + "memory(GiB)": 302.58, + "step": 349700, + "train_speed(iter/s)": 0.123552 + }, + { + "acc": 0.73761902, + "epoch": 1.9557998845154192, + "grad_norm": 7.5625, + "learning_rate": 1.3342192344809601e-08, + "loss": 1.02188997, + "memory(GiB)": 302.58, + "step": 349720, + "train_speed(iter/s)": 0.123556 + }, + { + "acc": 0.75137792, + "epoch": 1.9559117339883985, + "grad_norm": 6.09375, + "learning_rate": 1.3274769371988128e-08, + "loss": 0.95982256, + "memory(GiB)": 302.58, + "step": 349740, + "train_speed(iter/s)": 0.123559 + }, + { + "acc": 0.75106773, + "epoch": 1.9560235834613777, + "grad_norm": 9.5, + "learning_rate": 1.3207516960982436e-08, + "loss": 0.98362885, + "memory(GiB)": 302.58, + "step": 349760, + "train_speed(iter/s)": 0.123562 + }, + { + "acc": 0.7583765, + "epoch": 1.956135432934357, + "grad_norm": 5.375, + "learning_rate": 1.3140435114092354e-08, + "loss": 0.95090618, + "memory(GiB)": 302.58, + "step": 349780, + "train_speed(iter/s)": 0.123566 + }, + { + "acc": 0.73951516, + "epoch": 1.9562472824073363, + "grad_norm": 8.1875, + "learning_rate": 1.3073523833612156e-08, + "loss": 1.02568769, + "memory(GiB)": 302.58, + "step": 349800, + "train_speed(iter/s)": 0.123569 + }, + { + "acc": 0.738027, + "epoch": 1.9563591318803155, + "grad_norm": 8.125, + "learning_rate": 1.300678312183057e-08, + "loss": 1.0288311, + "memory(GiB)": 302.58, + "step": 349820, + "train_speed(iter/s)": 0.123572 + }, + { + "acc": 0.75710173, + "epoch": 1.9564709813532948, + "grad_norm": 5.46875, + "learning_rate": 1.294021298103021e-08, + "loss": 0.96557751, + "memory(GiB)": 302.58, + "step": 349840, + "train_speed(iter/s)": 0.123576 + }, + { + "acc": 0.74739423, + "epoch": 1.956582830826274, + "grad_norm": 7.03125, + "learning_rate": 1.2873813413488146e-08, + "loss": 0.9589448, + "memory(GiB)": 302.58, + "step": 349860, + "train_speed(iter/s)": 0.123579 + }, + { + "acc": 0.73579555, + "epoch": 1.9566946802992533, + "grad_norm": 6.125, + "learning_rate": 1.2807584421475895e-08, + "loss": 1.05089331, + "memory(GiB)": 302.58, + "step": 349880, + "train_speed(iter/s)": 0.123582 + }, + { + "acc": 0.75678129, + "epoch": 1.9568065297722326, + "grad_norm": 7.9375, + "learning_rate": 1.2741526007257754e-08, + "loss": 0.94140224, + "memory(GiB)": 302.58, + "step": 349900, + "train_speed(iter/s)": 0.123585 + }, + { + "acc": 0.74506292, + "epoch": 1.9569183792452118, + "grad_norm": 10.3125, + "learning_rate": 1.2675638173093586e-08, + "loss": 1.00040245, + "memory(GiB)": 302.58, + "step": 349920, + "train_speed(iter/s)": 0.123589 + }, + { + "acc": 0.76769156, + "epoch": 1.957030228718191, + "grad_norm": 7.53125, + "learning_rate": 1.260992092123714e-08, + "loss": 0.9108674, + "memory(GiB)": 302.58, + "step": 349940, + "train_speed(iter/s)": 0.123592 + }, + { + "acc": 0.73075948, + "epoch": 1.9571420781911704, + "grad_norm": 5.625, + "learning_rate": 1.254437425393551e-08, + "loss": 1.06901274, + "memory(GiB)": 302.58, + "step": 349960, + "train_speed(iter/s)": 0.123595 + }, + { + "acc": 0.75471425, + "epoch": 1.9572539276641496, + "grad_norm": 7.90625, + "learning_rate": 1.2478998173431344e-08, + "loss": 0.94937115, + "memory(GiB)": 302.58, + "step": 349980, + "train_speed(iter/s)": 0.123598 + }, + { + "acc": 0.7421618, + "epoch": 1.957365777137129, + "grad_norm": 7.59375, + "learning_rate": 1.2413792681960635e-08, + "loss": 1.00605202, + "memory(GiB)": 302.58, + "step": 350000, + "train_speed(iter/s)": 0.123602 + }, + { + "epoch": 1.957365777137129, + "eval_acc": 0.7069086685342261, + "eval_loss": 1.0117883682250977, + "eval_runtime": 7583.6694, + "eval_samples_per_second": 9.927, + "eval_steps_per_second": 9.927, + "step": 350000 + }, + { + "acc": 0.75350628, + "epoch": 1.9574776266101082, + "grad_norm": 6.25, + "learning_rate": 1.2348757781753262e-08, + "loss": 0.97027779, + "memory(GiB)": 302.58, + "step": 350020, + "train_speed(iter/s)": 0.123269 + }, + { + "acc": 0.74698882, + "epoch": 1.9575894760830874, + "grad_norm": 7.71875, + "learning_rate": 1.2283893475034114e-08, + "loss": 1.0017992, + "memory(GiB)": 302.58, + "step": 350040, + "train_speed(iter/s)": 0.123272 + }, + { + "acc": 0.74584732, + "epoch": 1.9577013255560667, + "grad_norm": 7.125, + "learning_rate": 1.2219199764020862e-08, + "loss": 0.99368343, + "memory(GiB)": 302.58, + "step": 350060, + "train_speed(iter/s)": 0.123276 + }, + { + "acc": 0.75873866, + "epoch": 1.957813175029046, + "grad_norm": 7.90625, + "learning_rate": 1.2154676650927288e-08, + "loss": 0.957199, + "memory(GiB)": 302.58, + "step": 350080, + "train_speed(iter/s)": 0.123279 + }, + { + "acc": 0.75538144, + "epoch": 1.9579250245020252, + "grad_norm": 6.59375, + "learning_rate": 1.2090324137959964e-08, + "loss": 0.97528, + "memory(GiB)": 302.58, + "step": 350100, + "train_speed(iter/s)": 0.123282 + }, + { + "acc": 0.75117736, + "epoch": 1.9580368739750045, + "grad_norm": 5.71875, + "learning_rate": 1.2026142227319349e-08, + "loss": 0.97058878, + "memory(GiB)": 302.58, + "step": 350120, + "train_speed(iter/s)": 0.123285 + }, + { + "acc": 0.75750189, + "epoch": 1.9581487234479837, + "grad_norm": 10.375, + "learning_rate": 1.1962130921200909e-08, + "loss": 0.94568472, + "memory(GiB)": 302.58, + "step": 350140, + "train_speed(iter/s)": 0.123289 + }, + { + "acc": 0.76279116, + "epoch": 1.958260572920963, + "grad_norm": 9.0, + "learning_rate": 1.1898290221794561e-08, + "loss": 0.94146852, + "memory(GiB)": 302.58, + "step": 350160, + "train_speed(iter/s)": 0.123292 + }, + { + "acc": 0.74555445, + "epoch": 1.9583724223939423, + "grad_norm": 4.53125, + "learning_rate": 1.1834620131283558e-08, + "loss": 1.02500963, + "memory(GiB)": 302.58, + "step": 350180, + "train_speed(iter/s)": 0.123296 + }, + { + "acc": 0.75135717, + "epoch": 1.9584842718669215, + "grad_norm": 8.25, + "learning_rate": 1.1771120651845602e-08, + "loss": 0.95816126, + "memory(GiB)": 302.58, + "step": 350200, + "train_speed(iter/s)": 0.123299 + }, + { + "acc": 0.75917139, + "epoch": 1.9585961213399008, + "grad_norm": 8.3125, + "learning_rate": 1.1707791785652289e-08, + "loss": 0.93984938, + "memory(GiB)": 302.58, + "step": 350220, + "train_speed(iter/s)": 0.123303 + }, + { + "acc": 0.74978318, + "epoch": 1.95870797081288, + "grad_norm": 7.46875, + "learning_rate": 1.164463353487022e-08, + "loss": 0.96446848, + "memory(GiB)": 302.58, + "step": 350240, + "train_speed(iter/s)": 0.123306 + }, + { + "acc": 0.75774522, + "epoch": 1.9588198202858593, + "grad_norm": 10.25, + "learning_rate": 1.1581645901659333e-08, + "loss": 0.92138252, + "memory(GiB)": 302.58, + "step": 350260, + "train_speed(iter/s)": 0.12331 + }, + { + "acc": 0.74226003, + "epoch": 1.9589316697588386, + "grad_norm": 6.3125, + "learning_rate": 1.1518828888174016e-08, + "loss": 1.01792402, + "memory(GiB)": 302.58, + "step": 350280, + "train_speed(iter/s)": 0.123313 + }, + { + "acc": 0.75597401, + "epoch": 1.9590435192318179, + "grad_norm": 5.59375, + "learning_rate": 1.145618249656255e-08, + "loss": 0.95292864, + "memory(GiB)": 302.58, + "step": 350300, + "train_speed(iter/s)": 0.123317 + }, + { + "acc": 0.74210858, + "epoch": 1.9591553687047971, + "grad_norm": 8.0, + "learning_rate": 1.1393706728968224e-08, + "loss": 0.99482222, + "memory(GiB)": 302.58, + "step": 350320, + "train_speed(iter/s)": 0.12332 + }, + { + "acc": 0.74984307, + "epoch": 1.9592672181777764, + "grad_norm": 4.9375, + "learning_rate": 1.1331401587527102e-08, + "loss": 0.98133907, + "memory(GiB)": 302.58, + "step": 350340, + "train_speed(iter/s)": 0.123324 + }, + { + "acc": 0.75274096, + "epoch": 1.9593790676507556, + "grad_norm": 6.15625, + "learning_rate": 1.1269267074371371e-08, + "loss": 0.98071327, + "memory(GiB)": 302.58, + "step": 350360, + "train_speed(iter/s)": 0.123327 + }, + { + "acc": 0.75013647, + "epoch": 1.959490917123735, + "grad_norm": 6.84375, + "learning_rate": 1.1207303191624885e-08, + "loss": 0.99000597, + "memory(GiB)": 302.58, + "step": 350380, + "train_speed(iter/s)": 0.123331 + }, + { + "acc": 0.75242424, + "epoch": 1.9596027665967142, + "grad_norm": 8.8125, + "learning_rate": 1.1145509941408172e-08, + "loss": 0.98487558, + "memory(GiB)": 302.58, + "step": 350400, + "train_speed(iter/s)": 0.123334 + }, + { + "acc": 0.74739833, + "epoch": 1.9597146160696934, + "grad_norm": 9.0625, + "learning_rate": 1.1083887325833986e-08, + "loss": 1.00293989, + "memory(GiB)": 302.58, + "step": 350420, + "train_speed(iter/s)": 0.123338 + }, + { + "acc": 0.73494577, + "epoch": 1.9598264655426727, + "grad_norm": 7.71875, + "learning_rate": 1.1022435347010085e-08, + "loss": 1.06575842, + "memory(GiB)": 302.58, + "step": 350440, + "train_speed(iter/s)": 0.123341 + }, + { + "acc": 0.74803457, + "epoch": 1.959938315015652, + "grad_norm": 7.84375, + "learning_rate": 1.0961154007038677e-08, + "loss": 0.99809732, + "memory(GiB)": 302.58, + "step": 350460, + "train_speed(iter/s)": 0.123345 + }, + { + "acc": 0.76496496, + "epoch": 1.9600501644886312, + "grad_norm": 9.25, + "learning_rate": 1.0900043308015862e-08, + "loss": 0.91859589, + "memory(GiB)": 302.58, + "step": 350480, + "train_speed(iter/s)": 0.123348 + }, + { + "acc": 0.74226942, + "epoch": 1.9601620139616105, + "grad_norm": 12.3125, + "learning_rate": 1.083910325203108e-08, + "loss": 1.03523779, + "memory(GiB)": 302.58, + "step": 350500, + "train_speed(iter/s)": 0.123351 + }, + { + "acc": 0.74570174, + "epoch": 1.9602738634345898, + "grad_norm": 8.625, + "learning_rate": 1.0778333841169331e-08, + "loss": 0.99385805, + "memory(GiB)": 302.58, + "step": 350520, + "train_speed(iter/s)": 0.123355 + }, + { + "acc": 0.77163572, + "epoch": 1.960385712907569, + "grad_norm": 8.1875, + "learning_rate": 1.0717735077509505e-08, + "loss": 0.8919982, + "memory(GiB)": 302.58, + "step": 350540, + "train_speed(iter/s)": 0.123358 + }, + { + "acc": 0.75596843, + "epoch": 1.9604975623805483, + "grad_norm": 6.75, + "learning_rate": 1.0657306963123282e-08, + "loss": 0.94325829, + "memory(GiB)": 302.58, + "step": 350560, + "train_speed(iter/s)": 0.123361 + }, + { + "acc": 0.76200647, + "epoch": 1.9606094118535276, + "grad_norm": 7.90625, + "learning_rate": 1.0597049500077894e-08, + "loss": 0.95859661, + "memory(GiB)": 302.58, + "step": 350580, + "train_speed(iter/s)": 0.123365 + }, + { + "acc": 0.73889856, + "epoch": 1.9607212613265068, + "grad_norm": 7.625, + "learning_rate": 1.0536962690434472e-08, + "loss": 1.03433886, + "memory(GiB)": 302.58, + "step": 350600, + "train_speed(iter/s)": 0.123368 + }, + { + "acc": 0.76170473, + "epoch": 1.960833110799486, + "grad_norm": 9.125, + "learning_rate": 1.0477046536248037e-08, + "loss": 0.93290243, + "memory(GiB)": 302.58, + "step": 350620, + "train_speed(iter/s)": 0.123372 + }, + { + "acc": 0.74814591, + "epoch": 1.9609449602724653, + "grad_norm": 6.28125, + "learning_rate": 1.0417301039568062e-08, + "loss": 1.00582285, + "memory(GiB)": 302.58, + "step": 350640, + "train_speed(iter/s)": 0.123376 + }, + { + "acc": 0.75085797, + "epoch": 1.9610568097454446, + "grad_norm": 5.90625, + "learning_rate": 1.0357726202437913e-08, + "loss": 0.97897253, + "memory(GiB)": 302.58, + "step": 350660, + "train_speed(iter/s)": 0.123379 + }, + { + "acc": 0.76339445, + "epoch": 1.9611686592184239, + "grad_norm": 7.9375, + "learning_rate": 1.0298322026894847e-08, + "loss": 0.94435654, + "memory(GiB)": 302.58, + "step": 350680, + "train_speed(iter/s)": 0.123382 + }, + { + "acc": 0.75122752, + "epoch": 1.9612805086914031, + "grad_norm": 6.84375, + "learning_rate": 1.0239088514971685e-08, + "loss": 0.98203936, + "memory(GiB)": 302.58, + "step": 350700, + "train_speed(iter/s)": 0.123386 + }, + { + "acc": 0.74552941, + "epoch": 1.9613923581643824, + "grad_norm": 7.875, + "learning_rate": 1.0180025668693471e-08, + "loss": 1.0174716, + "memory(GiB)": 302.58, + "step": 350720, + "train_speed(iter/s)": 0.123389 + }, + { + "acc": 0.72774777, + "epoch": 1.9615042076373617, + "grad_norm": 8.375, + "learning_rate": 1.0121133490080814e-08, + "loss": 1.09978542, + "memory(GiB)": 302.58, + "step": 350740, + "train_speed(iter/s)": 0.123393 + }, + { + "acc": 0.7722651, + "epoch": 1.961616057110341, + "grad_norm": 7.6875, + "learning_rate": 1.0062411981147657e-08, + "loss": 0.8825448, + "memory(GiB)": 302.58, + "step": 350760, + "train_speed(iter/s)": 0.123396 + }, + { + "acc": 0.76828523, + "epoch": 1.9617279065833202, + "grad_norm": 7.90625, + "learning_rate": 1.0003861143902949e-08, + "loss": 0.88717041, + "memory(GiB)": 302.58, + "step": 350780, + "train_speed(iter/s)": 0.1234 + }, + { + "acc": 0.7325892, + "epoch": 1.9618397560562995, + "grad_norm": 4.53125, + "learning_rate": 9.94548098034842e-09, + "loss": 1.07569962, + "memory(GiB)": 302.58, + "step": 350800, + "train_speed(iter/s)": 0.123403 + }, + { + "acc": 0.73613243, + "epoch": 1.9619516055292787, + "grad_norm": 9.0, + "learning_rate": 9.887271492482475e-09, + "loss": 1.06810341, + "memory(GiB)": 302.58, + "step": 350820, + "train_speed(iter/s)": 0.123406 + }, + { + "acc": 0.76127496, + "epoch": 1.962063455002258, + "grad_norm": 7.90625, + "learning_rate": 9.829232682294076e-09, + "loss": 0.93027658, + "memory(GiB)": 302.58, + "step": 350840, + "train_speed(iter/s)": 0.12341 + }, + { + "acc": 0.75136857, + "epoch": 1.9621753044752372, + "grad_norm": 6.75, + "learning_rate": 9.771364551769968e-09, + "loss": 0.97190418, + "memory(GiB)": 302.58, + "step": 350860, + "train_speed(iter/s)": 0.123413 + }, + { + "acc": 0.73825674, + "epoch": 1.9622871539482165, + "grad_norm": 5.34375, + "learning_rate": 9.71366710288857e-09, + "loss": 1.02659588, + "memory(GiB)": 302.58, + "step": 350880, + "train_speed(iter/s)": 0.123417 + }, + { + "acc": 0.75331659, + "epoch": 1.9623990034211958, + "grad_norm": 9.9375, + "learning_rate": 9.656140337623854e-09, + "loss": 0.95851917, + "memory(GiB)": 302.58, + "step": 350900, + "train_speed(iter/s)": 0.12342 + }, + { + "acc": 0.7515789, + "epoch": 1.962510852894175, + "grad_norm": 6.9375, + "learning_rate": 9.598784257942584e-09, + "loss": 0.95800467, + "memory(GiB)": 302.58, + "step": 350920, + "train_speed(iter/s)": 0.123424 + }, + { + "acc": 0.7465632, + "epoch": 1.9626227023671543, + "grad_norm": 6.34375, + "learning_rate": 9.541598865807078e-09, + "loss": 1.00182657, + "memory(GiB)": 302.58, + "step": 350940, + "train_speed(iter/s)": 0.123427 + }, + { + "acc": 0.74505978, + "epoch": 1.9627345518401336, + "grad_norm": 7.0, + "learning_rate": 9.484584163173549e-09, + "loss": 1.02736044, + "memory(GiB)": 302.58, + "step": 350960, + "train_speed(iter/s)": 0.12343 + }, + { + "acc": 0.74911876, + "epoch": 1.9628464013131128, + "grad_norm": 8.1875, + "learning_rate": 9.427740151991549e-09, + "loss": 1.00381231, + "memory(GiB)": 302.58, + "step": 350980, + "train_speed(iter/s)": 0.123434 + }, + { + "acc": 0.75217233, + "epoch": 1.962958250786092, + "grad_norm": 5.5, + "learning_rate": 9.371066834205634e-09, + "loss": 0.96145277, + "memory(GiB)": 302.58, + "step": 351000, + "train_speed(iter/s)": 0.123437 + }, + { + "acc": 0.74160757, + "epoch": 1.9630701002590714, + "grad_norm": 7.03125, + "learning_rate": 9.314564211753697e-09, + "loss": 1.03777857, + "memory(GiB)": 302.58, + "step": 351020, + "train_speed(iter/s)": 0.12344 + }, + { + "acc": 0.73625436, + "epoch": 1.9631819497320506, + "grad_norm": 6.4375, + "learning_rate": 9.258232286569191e-09, + "loss": 1.04745541, + "memory(GiB)": 302.58, + "step": 351040, + "train_speed(iter/s)": 0.123444 + }, + { + "acc": 0.75760207, + "epoch": 1.9632937992050299, + "grad_norm": 8.4375, + "learning_rate": 9.2020710605778e-09, + "loss": 0.95169792, + "memory(GiB)": 302.58, + "step": 351060, + "train_speed(iter/s)": 0.123447 + }, + { + "acc": 0.75561943, + "epoch": 1.9634056486780092, + "grad_norm": 5.75, + "learning_rate": 9.146080535701318e-09, + "loss": 0.9700696, + "memory(GiB)": 302.58, + "step": 351080, + "train_speed(iter/s)": 0.123451 + }, + { + "acc": 0.74108472, + "epoch": 1.9635174981509884, + "grad_norm": 8.25, + "learning_rate": 9.090260713854881e-09, + "loss": 1.00146532, + "memory(GiB)": 302.58, + "step": 351100, + "train_speed(iter/s)": 0.123454 + }, + { + "acc": 0.75204992, + "epoch": 1.9636293476239677, + "grad_norm": 6.5625, + "learning_rate": 9.034611596946408e-09, + "loss": 0.96957111, + "memory(GiB)": 302.58, + "step": 351120, + "train_speed(iter/s)": 0.123457 + }, + { + "acc": 0.74559722, + "epoch": 1.963741197096947, + "grad_norm": 6.5625, + "learning_rate": 8.979133186880484e-09, + "loss": 0.9939065, + "memory(GiB)": 302.58, + "step": 351140, + "train_speed(iter/s)": 0.123461 + }, + { + "acc": 0.75866714, + "epoch": 1.9638530465699262, + "grad_norm": 8.625, + "learning_rate": 8.923825485554483e-09, + "loss": 0.91885033, + "memory(GiB)": 302.58, + "step": 351160, + "train_speed(iter/s)": 0.123465 + }, + { + "acc": 0.75548959, + "epoch": 1.9639648960429055, + "grad_norm": 10.375, + "learning_rate": 8.868688494859668e-09, + "loss": 0.94059305, + "memory(GiB)": 302.58, + "step": 351180, + "train_speed(iter/s)": 0.123468 + }, + { + "acc": 0.74528036, + "epoch": 1.9640767455158847, + "grad_norm": 7.125, + "learning_rate": 8.813722216682308e-09, + "loss": 0.98682861, + "memory(GiB)": 302.58, + "step": 351200, + "train_speed(iter/s)": 0.123472 + }, + { + "acc": 0.74434009, + "epoch": 1.964188594988864, + "grad_norm": 9.875, + "learning_rate": 8.758926652902567e-09, + "loss": 0.98353806, + "memory(GiB)": 302.58, + "step": 351220, + "train_speed(iter/s)": 0.123475 + }, + { + "acc": 0.74941115, + "epoch": 1.9643004444618433, + "grad_norm": 7.90625, + "learning_rate": 8.704301805393944e-09, + "loss": 0.99751635, + "memory(GiB)": 302.58, + "step": 351240, + "train_speed(iter/s)": 0.123478 + }, + { + "acc": 0.75513463, + "epoch": 1.9644122939348225, + "grad_norm": 9.875, + "learning_rate": 8.649847676025502e-09, + "loss": 0.96427917, + "memory(GiB)": 302.58, + "step": 351260, + "train_speed(iter/s)": 0.123482 + }, + { + "acc": 0.75356779, + "epoch": 1.9645241434078018, + "grad_norm": 6.28125, + "learning_rate": 8.595564266659084e-09, + "loss": 0.97778864, + "memory(GiB)": 302.58, + "step": 351280, + "train_speed(iter/s)": 0.123485 + }, + { + "acc": 0.73882766, + "epoch": 1.964635992880781, + "grad_norm": 8.25, + "learning_rate": 8.541451579151538e-09, + "loss": 1.02783966, + "memory(GiB)": 302.58, + "step": 351300, + "train_speed(iter/s)": 0.123489 + }, + { + "acc": 0.75965915, + "epoch": 1.9647478423537603, + "grad_norm": 8.9375, + "learning_rate": 8.487509615354162e-09, + "loss": 0.96925497, + "memory(GiB)": 302.58, + "step": 351320, + "train_speed(iter/s)": 0.123492 + }, + { + "acc": 0.75478687, + "epoch": 1.9648596918267396, + "grad_norm": 11.375, + "learning_rate": 8.433738377111589e-09, + "loss": 0.95356302, + "memory(GiB)": 302.58, + "step": 351340, + "train_speed(iter/s)": 0.123496 + }, + { + "acc": 0.75464673, + "epoch": 1.9649715412997188, + "grad_norm": 7.625, + "learning_rate": 8.380137866262905e-09, + "loss": 0.97037067, + "memory(GiB)": 302.58, + "step": 351360, + "train_speed(iter/s)": 0.123499 + }, + { + "acc": 0.76211557, + "epoch": 1.9650833907726981, + "grad_norm": 7.65625, + "learning_rate": 8.32670808464109e-09, + "loss": 0.92428904, + "memory(GiB)": 302.58, + "step": 351380, + "train_speed(iter/s)": 0.123502 + }, + { + "acc": 0.73745809, + "epoch": 1.9651952402456774, + "grad_norm": 9.5, + "learning_rate": 8.273449034074676e-09, + "loss": 1.01047306, + "memory(GiB)": 302.58, + "step": 351400, + "train_speed(iter/s)": 0.123506 + }, + { + "acc": 0.73762522, + "epoch": 1.9653070897186566, + "grad_norm": 6.15625, + "learning_rate": 8.220360716384434e-09, + "loss": 1.04374142, + "memory(GiB)": 302.58, + "step": 351420, + "train_speed(iter/s)": 0.123509 + }, + { + "acc": 0.74762077, + "epoch": 1.965418939191636, + "grad_norm": 7.125, + "learning_rate": 8.167443133386133e-09, + "loss": 0.99135256, + "memory(GiB)": 302.58, + "step": 351440, + "train_speed(iter/s)": 0.123512 + }, + { + "acc": 0.74926171, + "epoch": 1.9655307886646152, + "grad_norm": 6.8125, + "learning_rate": 8.114696286890545e-09, + "loss": 0.9880434, + "memory(GiB)": 302.58, + "step": 351460, + "train_speed(iter/s)": 0.123516 + }, + { + "acc": 0.74030652, + "epoch": 1.9656426381375944, + "grad_norm": 4.3125, + "learning_rate": 8.062120178700672e-09, + "loss": 1.03420315, + "memory(GiB)": 302.58, + "step": 351480, + "train_speed(iter/s)": 0.123519 + }, + { + "acc": 0.74700851, + "epoch": 1.9657544876105737, + "grad_norm": 5.90625, + "learning_rate": 8.009714810615633e-09, + "loss": 1.00827703, + "memory(GiB)": 302.58, + "step": 351500, + "train_speed(iter/s)": 0.123522 + }, + { + "acc": 0.76395512, + "epoch": 1.965866337083553, + "grad_norm": 10.375, + "learning_rate": 7.957480184427325e-09, + "loss": 0.94243288, + "memory(GiB)": 302.58, + "step": 351520, + "train_speed(iter/s)": 0.123526 + }, + { + "acc": 0.75528522, + "epoch": 1.9659781865565322, + "grad_norm": 7.5, + "learning_rate": 7.905416301922097e-09, + "loss": 0.96671076, + "memory(GiB)": 302.58, + "step": 351540, + "train_speed(iter/s)": 0.123529 + }, + { + "acc": 0.74656014, + "epoch": 1.9660900360295115, + "grad_norm": 7.375, + "learning_rate": 7.85352316488186e-09, + "loss": 0.99708118, + "memory(GiB)": 302.58, + "step": 351560, + "train_speed(iter/s)": 0.123533 + }, + { + "acc": 0.74773698, + "epoch": 1.9662018855024908, + "grad_norm": 6.0, + "learning_rate": 7.801800775080193e-09, + "loss": 1.01989737, + "memory(GiB)": 302.58, + "step": 351580, + "train_speed(iter/s)": 0.123537 + }, + { + "acc": 0.7523241, + "epoch": 1.96631373497547, + "grad_norm": 7.34375, + "learning_rate": 7.750249134287346e-09, + "loss": 0.97384958, + "memory(GiB)": 302.58, + "step": 351600, + "train_speed(iter/s)": 0.12354 + }, + { + "acc": 0.7593617, + "epoch": 1.9664255844484493, + "grad_norm": 6.65625, + "learning_rate": 7.6988682442658e-09, + "loss": 0.92954941, + "memory(GiB)": 302.58, + "step": 351620, + "train_speed(iter/s)": 0.123544 + }, + { + "acc": 0.7554852, + "epoch": 1.9665374339214285, + "grad_norm": 9.125, + "learning_rate": 7.647658106772482e-09, + "loss": 0.94213295, + "memory(GiB)": 302.58, + "step": 351640, + "train_speed(iter/s)": 0.123547 + }, + { + "acc": 0.75396686, + "epoch": 1.9666492833944078, + "grad_norm": 5.6875, + "learning_rate": 7.596618723560434e-09, + "loss": 0.97448683, + "memory(GiB)": 302.58, + "step": 351660, + "train_speed(iter/s)": 0.123551 + }, + { + "acc": 0.73361135, + "epoch": 1.966761132867387, + "grad_norm": 6.96875, + "learning_rate": 7.545750096374371e-09, + "loss": 1.07018318, + "memory(GiB)": 302.58, + "step": 351680, + "train_speed(iter/s)": 0.123554 + }, + { + "acc": 0.75246987, + "epoch": 1.9668729823403663, + "grad_norm": 7.1875, + "learning_rate": 7.495052226954013e-09, + "loss": 0.95486164, + "memory(GiB)": 302.58, + "step": 351700, + "train_speed(iter/s)": 0.123557 + }, + { + "acc": 0.7478724, + "epoch": 1.9669848318133456, + "grad_norm": 5.15625, + "learning_rate": 7.44452511703353e-09, + "loss": 0.99177208, + "memory(GiB)": 302.58, + "step": 351720, + "train_speed(iter/s)": 0.123561 + }, + { + "acc": 0.76560025, + "epoch": 1.9670966812863249, + "grad_norm": 9.4375, + "learning_rate": 7.394168768341536e-09, + "loss": 0.93267717, + "memory(GiB)": 302.58, + "step": 351740, + "train_speed(iter/s)": 0.123564 + }, + { + "acc": 0.76852636, + "epoch": 1.9672085307593041, + "grad_norm": 8.1875, + "learning_rate": 7.343983182599989e-09, + "loss": 0.90983849, + "memory(GiB)": 302.58, + "step": 351760, + "train_speed(iter/s)": 0.123568 + }, + { + "acc": 0.7414309, + "epoch": 1.9673203802322834, + "grad_norm": 7.4375, + "learning_rate": 7.293968361525849e-09, + "loss": 1.03581667, + "memory(GiB)": 302.58, + "step": 351780, + "train_speed(iter/s)": 0.123571 + }, + { + "acc": 0.76422644, + "epoch": 1.9674322297052627, + "grad_norm": 8.0, + "learning_rate": 7.244124306828859e-09, + "loss": 0.91048107, + "memory(GiB)": 302.58, + "step": 351800, + "train_speed(iter/s)": 0.123575 + }, + { + "acc": 0.74906101, + "epoch": 1.967544079178242, + "grad_norm": 6.0625, + "learning_rate": 7.194451020214877e-09, + "loss": 0.9997879, + "memory(GiB)": 302.58, + "step": 351820, + "train_speed(iter/s)": 0.123579 + }, + { + "acc": 0.75836835, + "epoch": 1.9676559286512212, + "grad_norm": 6.375, + "learning_rate": 7.144948503382543e-09, + "loss": 0.95467606, + "memory(GiB)": 302.58, + "step": 351840, + "train_speed(iter/s)": 0.123582 + }, + { + "acc": 0.75606251, + "epoch": 1.9677677781242005, + "grad_norm": 6.9375, + "learning_rate": 7.095616758024393e-09, + "loss": 0.95712042, + "memory(GiB)": 302.58, + "step": 351860, + "train_speed(iter/s)": 0.123586 + }, + { + "acc": 0.74073725, + "epoch": 1.9678796275971797, + "grad_norm": 5.75, + "learning_rate": 7.046455785828521e-09, + "loss": 1.03831949, + "memory(GiB)": 302.58, + "step": 351880, + "train_speed(iter/s)": 0.123589 + }, + { + "acc": 0.74970245, + "epoch": 1.967991477070159, + "grad_norm": 9.0, + "learning_rate": 6.997465588475805e-09, + "loss": 0.98122044, + "memory(GiB)": 302.58, + "step": 351900, + "train_speed(iter/s)": 0.123592 + }, + { + "acc": 0.74685774, + "epoch": 1.9681033265431382, + "grad_norm": 4.46875, + "learning_rate": 6.948646167642126e-09, + "loss": 0.99533243, + "memory(GiB)": 302.58, + "step": 351920, + "train_speed(iter/s)": 0.123596 + }, + { + "acc": 0.74643574, + "epoch": 1.9682151760161175, + "grad_norm": 5.625, + "learning_rate": 6.899997524997815e-09, + "loss": 1.01050444, + "memory(GiB)": 302.58, + "step": 351940, + "train_speed(iter/s)": 0.123599 + }, + { + "acc": 0.73539562, + "epoch": 1.9683270254890968, + "grad_norm": 9.1875, + "learning_rate": 6.851519662205985e-09, + "loss": 1.03215179, + "memory(GiB)": 302.58, + "step": 351960, + "train_speed(iter/s)": 0.123603 + }, + { + "acc": 0.76012311, + "epoch": 1.968438874962076, + "grad_norm": 6.6875, + "learning_rate": 6.8032125809247565e-09, + "loss": 0.93683453, + "memory(GiB)": 302.58, + "step": 351980, + "train_speed(iter/s)": 0.123606 + }, + { + "acc": 0.76479735, + "epoch": 1.9685507244350553, + "grad_norm": 7.71875, + "learning_rate": 6.755076282806694e-09, + "loss": 0.91073694, + "memory(GiB)": 302.58, + "step": 352000, + "train_speed(iter/s)": 0.12361 + }, + { + "epoch": 1.9685507244350553, + "eval_acc": 0.7069125629803418, + "eval_loss": 1.0117965936660767, + "eval_runtime": 7239.3066, + "eval_samples_per_second": 10.399, + "eval_steps_per_second": 10.399, + "step": 352000 + }, + { + "acc": 0.75355964, + "epoch": 1.9686625739080346, + "grad_norm": 8.5, + "learning_rate": 6.707110769498815e-09, + "loss": 0.95374069, + "memory(GiB)": 302.58, + "step": 352020, + "train_speed(iter/s)": 0.123294 + }, + { + "acc": 0.75365429, + "epoch": 1.9687744233810138, + "grad_norm": 5.21875, + "learning_rate": 6.6593160426403624e-09, + "loss": 0.96034813, + "memory(GiB)": 302.58, + "step": 352040, + "train_speed(iter/s)": 0.123298 + }, + { + "acc": 0.75366974, + "epoch": 1.968886272853993, + "grad_norm": 8.125, + "learning_rate": 6.611692103867251e-09, + "loss": 0.96589775, + "memory(GiB)": 302.58, + "step": 352060, + "train_speed(iter/s)": 0.123301 + }, + { + "acc": 0.75508661, + "epoch": 1.9689981223269724, + "grad_norm": 9.0, + "learning_rate": 6.564238954807623e-09, + "loss": 0.95468931, + "memory(GiB)": 302.58, + "step": 352080, + "train_speed(iter/s)": 0.123305 + }, + { + "acc": 0.74873786, + "epoch": 1.9691099717999516, + "grad_norm": 7.625, + "learning_rate": 6.5169565970846226e-09, + "loss": 0.96847649, + "memory(GiB)": 302.58, + "step": 352100, + "train_speed(iter/s)": 0.123308 + }, + { + "acc": 0.75401254, + "epoch": 1.9692218212729309, + "grad_norm": 7.46875, + "learning_rate": 6.469845032315847e-09, + "loss": 0.95216427, + "memory(GiB)": 302.58, + "step": 352120, + "train_speed(iter/s)": 0.123312 + }, + { + "acc": 0.76665192, + "epoch": 1.9693336707459101, + "grad_norm": 6.28125, + "learning_rate": 6.422904262112229e-09, + "loss": 0.91389074, + "memory(GiB)": 302.58, + "step": 352140, + "train_speed(iter/s)": 0.123315 + }, + { + "acc": 0.75696979, + "epoch": 1.9694455202188894, + "grad_norm": 6.5625, + "learning_rate": 6.376134288079705e-09, + "loss": 0.95901213, + "memory(GiB)": 302.58, + "step": 352160, + "train_speed(iter/s)": 0.123318 + }, + { + "acc": 0.75139246, + "epoch": 1.9695573696918687, + "grad_norm": 9.0625, + "learning_rate": 6.329535111817553e-09, + "loss": 0.98740063, + "memory(GiB)": 302.58, + "step": 352180, + "train_speed(iter/s)": 0.123322 + }, + { + "acc": 0.75006547, + "epoch": 1.969669219164848, + "grad_norm": 7.0625, + "learning_rate": 6.2831067349200524e-09, + "loss": 0.996241, + "memory(GiB)": 302.58, + "step": 352200, + "train_speed(iter/s)": 0.123325 + }, + { + "acc": 0.75122328, + "epoch": 1.9697810686378272, + "grad_norm": 9.5625, + "learning_rate": 6.2368491589748225e-09, + "loss": 0.97846508, + "memory(GiB)": 302.58, + "step": 352220, + "train_speed(iter/s)": 0.123328 + }, + { + "acc": 0.73955078, + "epoch": 1.9698929181108065, + "grad_norm": 4.53125, + "learning_rate": 6.190762385563931e-09, + "loss": 1.01685944, + "memory(GiB)": 302.58, + "step": 352240, + "train_speed(iter/s)": 0.123332 + }, + { + "acc": 0.7355207, + "epoch": 1.9700047675837857, + "grad_norm": 8.4375, + "learning_rate": 6.144846416263894e-09, + "loss": 1.04922333, + "memory(GiB)": 302.58, + "step": 352260, + "train_speed(iter/s)": 0.123335 + }, + { + "acc": 0.75914307, + "epoch": 1.970116617056765, + "grad_norm": 4.875, + "learning_rate": 6.099101252645678e-09, + "loss": 0.95786572, + "memory(GiB)": 302.58, + "step": 352280, + "train_speed(iter/s)": 0.123339 + }, + { + "acc": 0.75675583, + "epoch": 1.9702284665297443, + "grad_norm": 8.0625, + "learning_rate": 6.0535268962730324e-09, + "loss": 0.95702744, + "memory(GiB)": 302.58, + "step": 352300, + "train_speed(iter/s)": 0.123342 + }, + { + "acc": 0.7493536, + "epoch": 1.9703403160027235, + "grad_norm": 6.6875, + "learning_rate": 6.008123348705263e-09, + "loss": 0.98586178, + "memory(GiB)": 302.58, + "step": 352320, + "train_speed(iter/s)": 0.123345 + }, + { + "acc": 0.75892334, + "epoch": 1.9704521654757028, + "grad_norm": 5.78125, + "learning_rate": 5.9628906114950205e-09, + "loss": 0.95092926, + "memory(GiB)": 302.58, + "step": 352340, + "train_speed(iter/s)": 0.123349 + }, + { + "acc": 0.75169511, + "epoch": 1.970564014948682, + "grad_norm": 6.75, + "learning_rate": 5.917828686189953e-09, + "loss": 0.98494244, + "memory(GiB)": 302.58, + "step": 352360, + "train_speed(iter/s)": 0.123352 + }, + { + "acc": 0.74647355, + "epoch": 1.9706758644216613, + "grad_norm": 7.1875, + "learning_rate": 5.872937574330495e-09, + "loss": 0.99881659, + "memory(GiB)": 302.58, + "step": 352380, + "train_speed(iter/s)": 0.123355 + }, + { + "acc": 0.74702229, + "epoch": 1.9707877138946406, + "grad_norm": 7.78125, + "learning_rate": 5.828217277453196e-09, + "loss": 0.99063253, + "memory(GiB)": 302.58, + "step": 352400, + "train_speed(iter/s)": 0.123359 + }, + { + "acc": 0.74968958, + "epoch": 1.9708995633676198, + "grad_norm": 7.6875, + "learning_rate": 5.7836677970862785e-09, + "loss": 0.98607712, + "memory(GiB)": 302.58, + "step": 352420, + "train_speed(iter/s)": 0.123362 + }, + { + "acc": 0.74907627, + "epoch": 1.971011412840599, + "grad_norm": 9.0625, + "learning_rate": 5.739289134754633e-09, + "loss": 1.02707577, + "memory(GiB)": 302.58, + "step": 352440, + "train_speed(iter/s)": 0.123366 + }, + { + "acc": 0.74218588, + "epoch": 1.9711232623135784, + "grad_norm": 8.1875, + "learning_rate": 5.6950812919753796e-09, + "loss": 1.01509562, + "memory(GiB)": 302.58, + "step": 352460, + "train_speed(iter/s)": 0.123369 + }, + { + "acc": 0.77051253, + "epoch": 1.9712351117865576, + "grad_norm": 9.375, + "learning_rate": 5.6510442702606415e-09, + "loss": 0.88175497, + "memory(GiB)": 302.58, + "step": 352480, + "train_speed(iter/s)": 0.123373 + }, + { + "acc": 0.73396735, + "epoch": 1.971346961259537, + "grad_norm": 8.25, + "learning_rate": 5.607178071116992e-09, + "loss": 1.03911219, + "memory(GiB)": 302.58, + "step": 352500, + "train_speed(iter/s)": 0.123376 + }, + { + "acc": 0.75825734, + "epoch": 1.9714588107325162, + "grad_norm": 7.125, + "learning_rate": 5.563482696044897e-09, + "loss": 0.92798462, + "memory(GiB)": 302.58, + "step": 352520, + "train_speed(iter/s)": 0.12338 + }, + { + "acc": 0.75522151, + "epoch": 1.9715706602054954, + "grad_norm": 6.1875, + "learning_rate": 5.519958146538717e-09, + "loss": 0.96560535, + "memory(GiB)": 302.58, + "step": 352540, + "train_speed(iter/s)": 0.123383 + }, + { + "acc": 0.74391737, + "epoch": 1.9716825096784747, + "grad_norm": 7.84375, + "learning_rate": 5.476604424086707e-09, + "loss": 0.99694376, + "memory(GiB)": 302.58, + "step": 352560, + "train_speed(iter/s)": 0.123387 + }, + { + "acc": 0.75567951, + "epoch": 1.971794359151454, + "grad_norm": 5.8125, + "learning_rate": 5.433421530172123e-09, + "loss": 0.97017698, + "memory(GiB)": 302.58, + "step": 352580, + "train_speed(iter/s)": 0.12339 + }, + { + "acc": 0.74716902, + "epoch": 1.9719062086244332, + "grad_norm": 6.8125, + "learning_rate": 5.390409466271562e-09, + "loss": 0.97746906, + "memory(GiB)": 302.58, + "step": 352600, + "train_speed(iter/s)": 0.123394 + }, + { + "acc": 0.75640903, + "epoch": 1.9720180580974125, + "grad_norm": 7.03125, + "learning_rate": 5.347568233857181e-09, + "loss": 0.97244196, + "memory(GiB)": 302.58, + "step": 352620, + "train_speed(iter/s)": 0.123397 + }, + { + "acc": 0.75163021, + "epoch": 1.9721299075703917, + "grad_norm": 5.6875, + "learning_rate": 5.304897834392808e-09, + "loss": 0.9706358, + "memory(GiB)": 302.58, + "step": 352640, + "train_speed(iter/s)": 0.123401 + }, + { + "acc": 0.75450754, + "epoch": 1.972241757043371, + "grad_norm": 6.125, + "learning_rate": 5.262398269339497e-09, + "loss": 0.95433674, + "memory(GiB)": 302.58, + "step": 352660, + "train_speed(iter/s)": 0.123404 + }, + { + "acc": 0.75206389, + "epoch": 1.9723536065163503, + "grad_norm": 6.46875, + "learning_rate": 5.220069540149419e-09, + "loss": 0.99550953, + "memory(GiB)": 302.58, + "step": 352680, + "train_speed(iter/s)": 0.123407 + }, + { + "acc": 0.76159844, + "epoch": 1.9724654559893295, + "grad_norm": 5.78125, + "learning_rate": 5.177911648271417e-09, + "loss": 0.94386358, + "memory(GiB)": 302.58, + "step": 352700, + "train_speed(iter/s)": 0.123411 + }, + { + "acc": 0.74393744, + "epoch": 1.9725773054623088, + "grad_norm": 6.25, + "learning_rate": 5.135924595146558e-09, + "loss": 1.04246264, + "memory(GiB)": 302.58, + "step": 352720, + "train_speed(iter/s)": 0.123414 + }, + { + "acc": 0.75703506, + "epoch": 1.972689154935288, + "grad_norm": 5.28125, + "learning_rate": 5.094108382211471e-09, + "loss": 0.95733995, + "memory(GiB)": 302.58, + "step": 352740, + "train_speed(iter/s)": 0.123417 + }, + { + "acc": 0.74563947, + "epoch": 1.9728010044082673, + "grad_norm": 5.28125, + "learning_rate": 5.05246301089668e-09, + "loss": 0.98555365, + "memory(GiB)": 302.58, + "step": 352760, + "train_speed(iter/s)": 0.123421 + }, + { + "acc": 0.7547575, + "epoch": 1.9729128538812466, + "grad_norm": 8.75, + "learning_rate": 5.01098848262549e-09, + "loss": 0.96159716, + "memory(GiB)": 302.58, + "step": 352780, + "train_speed(iter/s)": 0.123425 + }, + { + "acc": 0.74750223, + "epoch": 1.9730247033542259, + "grad_norm": 6.875, + "learning_rate": 4.969684798817875e-09, + "loss": 0.99388628, + "memory(GiB)": 302.58, + "step": 352800, + "train_speed(iter/s)": 0.123428 + }, + { + "acc": 0.76205721, + "epoch": 1.9731365528272051, + "grad_norm": 8.0, + "learning_rate": 4.928551960885486e-09, + "loss": 0.92094822, + "memory(GiB)": 302.58, + "step": 352820, + "train_speed(iter/s)": 0.123431 + }, + { + "acc": 0.76137986, + "epoch": 1.9732484023001844, + "grad_norm": 7.28125, + "learning_rate": 4.887589970235529e-09, + "loss": 0.9374918, + "memory(GiB)": 302.58, + "step": 352840, + "train_speed(iter/s)": 0.123435 + }, + { + "acc": 0.75190234, + "epoch": 1.9733602517731637, + "grad_norm": 6.03125, + "learning_rate": 4.846798828269106e-09, + "loss": 0.97416534, + "memory(GiB)": 302.58, + "step": 352860, + "train_speed(iter/s)": 0.123438 + }, + { + "acc": 0.75259609, + "epoch": 1.973472101246143, + "grad_norm": 7.03125, + "learning_rate": 4.8061785363812115e-09, + "loss": 0.95929766, + "memory(GiB)": 302.58, + "step": 352880, + "train_speed(iter/s)": 0.123442 + }, + { + "acc": 0.74578743, + "epoch": 1.9735839507191222, + "grad_norm": 6.625, + "learning_rate": 4.7657290959618465e-09, + "loss": 0.99603653, + "memory(GiB)": 302.58, + "step": 352900, + "train_speed(iter/s)": 0.123445 + }, + { + "acc": 0.73914852, + "epoch": 1.9736958001921014, + "grad_norm": 6.5, + "learning_rate": 4.725450508393792e-09, + "loss": 1.0229681, + "memory(GiB)": 302.58, + "step": 352920, + "train_speed(iter/s)": 0.123449 + }, + { + "acc": 0.73995795, + "epoch": 1.9738076496650807, + "grad_norm": 6.28125, + "learning_rate": 4.685342775054835e-09, + "loss": 1.04837074, + "memory(GiB)": 302.58, + "step": 352940, + "train_speed(iter/s)": 0.123452 + }, + { + "acc": 0.7493042, + "epoch": 1.97391949913806, + "grad_norm": 11.25, + "learning_rate": 4.6454058973166574e-09, + "loss": 0.98564386, + "memory(GiB)": 302.58, + "step": 352960, + "train_speed(iter/s)": 0.123456 + }, + { + "acc": 0.73340888, + "epoch": 1.9740313486110392, + "grad_norm": 6.5, + "learning_rate": 4.605639876545942e-09, + "loss": 1.05154095, + "memory(GiB)": 302.58, + "step": 352980, + "train_speed(iter/s)": 0.123459 + }, + { + "acc": 0.76318097, + "epoch": 1.9741431980840185, + "grad_norm": 8.625, + "learning_rate": 4.5660447141021576e-09, + "loss": 0.92669182, + "memory(GiB)": 302.58, + "step": 353000, + "train_speed(iter/s)": 0.123463 + }, + { + "acc": 0.76030817, + "epoch": 1.9742550475569978, + "grad_norm": 6.75, + "learning_rate": 4.526620411339222e-09, + "loss": 0.94874449, + "memory(GiB)": 302.58, + "step": 353020, + "train_speed(iter/s)": 0.123466 + }, + { + "acc": 0.75154023, + "epoch": 1.974366897029977, + "grad_norm": 6.96875, + "learning_rate": 4.48736696960661e-09, + "loss": 0.9842906, + "memory(GiB)": 302.58, + "step": 353040, + "train_speed(iter/s)": 0.12347 + }, + { + "acc": 0.74672832, + "epoch": 1.9744787465029563, + "grad_norm": 6.625, + "learning_rate": 4.448284390246027e-09, + "loss": 0.99055834, + "memory(GiB)": 302.58, + "step": 353060, + "train_speed(iter/s)": 0.123473 + }, + { + "acc": 0.75230899, + "epoch": 1.9745905959759356, + "grad_norm": 8.625, + "learning_rate": 4.409372674594736e-09, + "loss": 0.96462421, + "memory(GiB)": 302.58, + "step": 353080, + "train_speed(iter/s)": 0.123477 + }, + { + "acc": 0.7615551, + "epoch": 1.9747024454489148, + "grad_norm": 7.84375, + "learning_rate": 4.370631823983895e-09, + "loss": 0.93998013, + "memory(GiB)": 302.58, + "step": 353100, + "train_speed(iter/s)": 0.12348 + }, + { + "acc": 0.74669752, + "epoch": 1.974814294921894, + "grad_norm": 6.0, + "learning_rate": 4.332061839737445e-09, + "loss": 1.00603828, + "memory(GiB)": 302.58, + "step": 353120, + "train_speed(iter/s)": 0.123483 + }, + { + "acc": 0.76580319, + "epoch": 1.9749261443948734, + "grad_norm": 6.3125, + "learning_rate": 4.2936627231759955e-09, + "loss": 0.91401472, + "memory(GiB)": 302.58, + "step": 353140, + "train_speed(iter/s)": 0.123486 + }, + { + "acc": 0.77018104, + "epoch": 1.9750379938678526, + "grad_norm": 6.65625, + "learning_rate": 4.255434475611831e-09, + "loss": 0.89844398, + "memory(GiB)": 302.58, + "step": 353160, + "train_speed(iter/s)": 0.12349 + }, + { + "acc": 0.75201492, + "epoch": 1.9751498433408319, + "grad_norm": 12.125, + "learning_rate": 4.217377098352793e-09, + "loss": 0.9677495, + "memory(GiB)": 302.58, + "step": 353180, + "train_speed(iter/s)": 0.123493 + }, + { + "acc": 0.75814257, + "epoch": 1.9752616928138111, + "grad_norm": 5.09375, + "learning_rate": 4.179490592701174e-09, + "loss": 0.95532446, + "memory(GiB)": 302.58, + "step": 353200, + "train_speed(iter/s)": 0.123496 + }, + { + "acc": 0.75458941, + "epoch": 1.9753735422867904, + "grad_norm": 5.71875, + "learning_rate": 4.141774959952049e-09, + "loss": 0.95402784, + "memory(GiB)": 302.58, + "step": 353220, + "train_speed(iter/s)": 0.1235 + }, + { + "acc": 0.76155977, + "epoch": 1.9754853917597697, + "grad_norm": 5.59375, + "learning_rate": 4.104230201395498e-09, + "loss": 0.92028418, + "memory(GiB)": 302.58, + "step": 353240, + "train_speed(iter/s)": 0.123503 + }, + { + "acc": 0.75091743, + "epoch": 1.975597241232749, + "grad_norm": 9.375, + "learning_rate": 4.066856318316048e-09, + "loss": 0.96804838, + "memory(GiB)": 302.58, + "step": 353260, + "train_speed(iter/s)": 0.123507 + }, + { + "acc": 0.7507741, + "epoch": 1.9757090907057282, + "grad_norm": 6.5625, + "learning_rate": 4.029653311991566e-09, + "loss": 0.99255028, + "memory(GiB)": 302.58, + "step": 353280, + "train_speed(iter/s)": 0.12351 + }, + { + "acc": 0.73031216, + "epoch": 1.9758209401787075, + "grad_norm": 6.125, + "learning_rate": 3.992621183694367e-09, + "loss": 1.05126362, + "memory(GiB)": 302.58, + "step": 353300, + "train_speed(iter/s)": 0.123514 + }, + { + "acc": 0.75479274, + "epoch": 1.9759327896516867, + "grad_norm": 6.71875, + "learning_rate": 3.955759934692327e-09, + "loss": 0.96427288, + "memory(GiB)": 302.58, + "step": 353320, + "train_speed(iter/s)": 0.123517 + }, + { + "acc": 0.75834384, + "epoch": 1.976044639124666, + "grad_norm": 7.71875, + "learning_rate": 3.919069566244438e-09, + "loss": 0.94694891, + "memory(GiB)": 302.58, + "step": 353340, + "train_speed(iter/s)": 0.123521 + }, + { + "acc": 0.76088133, + "epoch": 1.9761564885976453, + "grad_norm": 9.0, + "learning_rate": 3.882550079606917e-09, + "loss": 0.93930397, + "memory(GiB)": 302.58, + "step": 353360, + "train_speed(iter/s)": 0.123524 + }, + { + "acc": 0.7641098, + "epoch": 1.9762683380706245, + "grad_norm": 7.78125, + "learning_rate": 3.846201476028766e-09, + "loss": 0.94388685, + "memory(GiB)": 302.58, + "step": 353380, + "train_speed(iter/s)": 0.123527 + }, + { + "acc": 0.75417976, + "epoch": 1.9763801875436038, + "grad_norm": 7.1875, + "learning_rate": 3.810023756752323e-09, + "loss": 0.96175165, + "memory(GiB)": 302.58, + "step": 353400, + "train_speed(iter/s)": 0.123531 + }, + { + "acc": 0.73608346, + "epoch": 1.976492037016583, + "grad_norm": 7.3125, + "learning_rate": 3.774016923016044e-09, + "loss": 1.04052172, + "memory(GiB)": 302.58, + "step": 353420, + "train_speed(iter/s)": 0.123534 + }, + { + "acc": 0.75121193, + "epoch": 1.9766038864895623, + "grad_norm": 5.6875, + "learning_rate": 3.738180976050609e-09, + "loss": 0.9690794, + "memory(GiB)": 302.58, + "step": 353440, + "train_speed(iter/s)": 0.123538 + }, + { + "acc": 0.75464482, + "epoch": 1.9767157359625416, + "grad_norm": 4.5625, + "learning_rate": 3.702515917082261e-09, + "loss": 0.97355328, + "memory(GiB)": 302.58, + "step": 353460, + "train_speed(iter/s)": 0.123541 + }, + { + "acc": 0.76286936, + "epoch": 1.9768275854355208, + "grad_norm": 8.875, + "learning_rate": 3.667021747331134e-09, + "loss": 0.92840958, + "memory(GiB)": 302.58, + "step": 353480, + "train_speed(iter/s)": 0.123544 + }, + { + "acc": 0.75970192, + "epoch": 1.9769394349085, + "grad_norm": 6.65625, + "learning_rate": 3.6316984680101473e-09, + "loss": 0.95265369, + "memory(GiB)": 302.58, + "step": 353500, + "train_speed(iter/s)": 0.123548 + }, + { + "acc": 0.75457182, + "epoch": 1.9770512843814794, + "grad_norm": 8.3125, + "learning_rate": 3.5965460803283335e-09, + "loss": 0.96988211, + "memory(GiB)": 302.58, + "step": 353520, + "train_speed(iter/s)": 0.123551 + }, + { + "acc": 0.72919855, + "epoch": 1.9771631338544586, + "grad_norm": 7.53125, + "learning_rate": 3.561564585488064e-09, + "loss": 1.07773666, + "memory(GiB)": 302.58, + "step": 353540, + "train_speed(iter/s)": 0.123555 + }, + { + "acc": 0.76369824, + "epoch": 1.977274983327438, + "grad_norm": 6.65625, + "learning_rate": 3.5267539846850497e-09, + "loss": 0.91823082, + "memory(GiB)": 302.58, + "step": 353560, + "train_speed(iter/s)": 0.123558 + }, + { + "acc": 0.75009418, + "epoch": 1.9773868328004172, + "grad_norm": 6.65625, + "learning_rate": 3.492114279111114e-09, + "loss": 0.98561764, + "memory(GiB)": 302.58, + "step": 353580, + "train_speed(iter/s)": 0.123562 + }, + { + "acc": 0.73200631, + "epoch": 1.9774986822733964, + "grad_norm": 6.71875, + "learning_rate": 3.4576454699497552e-09, + "loss": 1.07173414, + "memory(GiB)": 302.58, + "step": 353600, + "train_speed(iter/s)": 0.123565 + }, + { + "acc": 0.73853002, + "epoch": 1.9776105317463757, + "grad_norm": 10.5, + "learning_rate": 3.423347558380585e-09, + "loss": 1.04019222, + "memory(GiB)": 302.58, + "step": 353620, + "train_speed(iter/s)": 0.123568 + }, + { + "acc": 0.74782515, + "epoch": 1.977722381219355, + "grad_norm": 5.15625, + "learning_rate": 3.3892205455771097e-09, + "loss": 0.98843737, + "memory(GiB)": 302.58, + "step": 353640, + "train_speed(iter/s)": 0.123572 + }, + { + "acc": 0.74534802, + "epoch": 1.9778342306923342, + "grad_norm": 6.46875, + "learning_rate": 3.3552644327056184e-09, + "loss": 0.99381752, + "memory(GiB)": 302.58, + "step": 353660, + "train_speed(iter/s)": 0.123575 + }, + { + "acc": 0.76313682, + "epoch": 1.9779460801653135, + "grad_norm": 6.78125, + "learning_rate": 3.3214792209279587e-09, + "loss": 0.95315809, + "memory(GiB)": 302.58, + "step": 353680, + "train_speed(iter/s)": 0.123579 + }, + { + "acc": 0.73624964, + "epoch": 1.9780579296382927, + "grad_norm": 6.875, + "learning_rate": 3.287864911400429e-09, + "loss": 1.02180052, + "memory(GiB)": 302.58, + "step": 353700, + "train_speed(iter/s)": 0.123582 + }, + { + "acc": 0.73830099, + "epoch": 1.978169779111272, + "grad_norm": 6.9375, + "learning_rate": 3.2544215052715543e-09, + "loss": 1.05194511, + "memory(GiB)": 302.58, + "step": 353720, + "train_speed(iter/s)": 0.123586 + }, + { + "acc": 0.75376167, + "epoch": 1.9782816285842513, + "grad_norm": 9.9375, + "learning_rate": 3.22114900368542e-09, + "loss": 0.96876421, + "memory(GiB)": 302.58, + "step": 353740, + "train_speed(iter/s)": 0.123589 + }, + { + "acc": 0.73547559, + "epoch": 1.9783934780572305, + "grad_norm": 5.375, + "learning_rate": 3.1880474077811143e-09, + "loss": 1.07190189, + "memory(GiB)": 302.58, + "step": 353760, + "train_speed(iter/s)": 0.123592 + }, + { + "acc": 0.74513993, + "epoch": 1.9785053275302098, + "grad_norm": 7.125, + "learning_rate": 3.1551167186893993e-09, + "loss": 1.00968227, + "memory(GiB)": 302.58, + "step": 353780, + "train_speed(iter/s)": 0.123596 + }, + { + "acc": 0.75271297, + "epoch": 1.978617177003189, + "grad_norm": 7.65625, + "learning_rate": 3.122356937537707e-09, + "loss": 0.97250547, + "memory(GiB)": 302.58, + "step": 353800, + "train_speed(iter/s)": 0.123599 + }, + { + "acc": 0.77236824, + "epoch": 1.9787290264761683, + "grad_norm": 6.28125, + "learning_rate": 3.0897680654456975e-09, + "loss": 0.8895236, + "memory(GiB)": 302.58, + "step": 353820, + "train_speed(iter/s)": 0.123602 + }, + { + "acc": 0.76483569, + "epoch": 1.9788408759491476, + "grad_norm": 10.25, + "learning_rate": 3.057350103528589e-09, + "loss": 0.94048138, + "memory(GiB)": 302.58, + "step": 353840, + "train_speed(iter/s)": 0.123606 + }, + { + "acc": 0.75414124, + "epoch": 1.9789527254221269, + "grad_norm": 13.375, + "learning_rate": 3.0251030528949398e-09, + "loss": 0.96259451, + "memory(GiB)": 302.58, + "step": 353860, + "train_speed(iter/s)": 0.123609 + }, + { + "acc": 0.75180259, + "epoch": 1.9790645748951061, + "grad_norm": 10.0, + "learning_rate": 2.9930269146477563e-09, + "loss": 0.95499029, + "memory(GiB)": 302.58, + "step": 353880, + "train_speed(iter/s)": 0.123613 + }, + { + "acc": 0.75990891, + "epoch": 1.9791764243680854, + "grad_norm": 8.875, + "learning_rate": 2.9611216898844942e-09, + "loss": 0.93805304, + "memory(GiB)": 302.58, + "step": 353900, + "train_speed(iter/s)": 0.123616 + }, + { + "acc": 0.7528625, + "epoch": 1.9792882738410646, + "grad_norm": 7.15625, + "learning_rate": 2.929387379695392e-09, + "loss": 0.97707529, + "memory(GiB)": 302.58, + "step": 353920, + "train_speed(iter/s)": 0.12362 + }, + { + "acc": 0.75002084, + "epoch": 1.979400123314044, + "grad_norm": 6.6875, + "learning_rate": 2.897823985166803e-09, + "loss": 0.97147064, + "memory(GiB)": 302.58, + "step": 353940, + "train_speed(iter/s)": 0.123623 + }, + { + "acc": 0.75344839, + "epoch": 1.9795119727870232, + "grad_norm": 6.59375, + "learning_rate": 2.86643150737842e-09, + "loss": 0.96875801, + "memory(GiB)": 302.58, + "step": 353960, + "train_speed(iter/s)": 0.123626 + }, + { + "acc": 0.7553051, + "epoch": 1.9796238222600024, + "grad_norm": 10.9375, + "learning_rate": 2.835209947402717e-09, + "loss": 0.94538717, + "memory(GiB)": 302.58, + "step": 353980, + "train_speed(iter/s)": 0.12363 + }, + { + "acc": 0.74018674, + "epoch": 1.9797356717329817, + "grad_norm": 4.8125, + "learning_rate": 2.804159306309395e-09, + "loss": 1.06176395, + "memory(GiB)": 302.58, + "step": 354000, + "train_speed(iter/s)": 0.123633 + }, + { + "epoch": 1.9797356717329817, + "eval_acc": 0.7068924991883284, + "eval_loss": 1.0117732286453247, + "eval_runtime": 7271.9024, + "eval_samples_per_second": 10.353, + "eval_steps_per_second": 10.353, + "step": 354000 + }, + { + "acc": 0.76207767, + "epoch": 1.979847521205961, + "grad_norm": 8.125, + "learning_rate": 2.773279585158717e-09, + "loss": 0.94211445, + "memory(GiB)": 302.58, + "step": 354020, + "train_speed(iter/s)": 0.123318 + }, + { + "acc": 0.7579195, + "epoch": 1.9799593706789402, + "grad_norm": 8.75, + "learning_rate": 2.7425707850081697e-09, + "loss": 0.97929363, + "memory(GiB)": 302.58, + "step": 354040, + "train_speed(iter/s)": 0.123321 + }, + { + "acc": 0.78071265, + "epoch": 1.9800712201519195, + "grad_norm": 9.8125, + "learning_rate": 2.712032906906914e-09, + "loss": 0.85961151, + "memory(GiB)": 302.58, + "step": 354060, + "train_speed(iter/s)": 0.123324 + }, + { + "acc": 0.75114985, + "epoch": 1.9801830696248988, + "grad_norm": 6.90625, + "learning_rate": 2.6816659519007802e-09, + "loss": 0.96968279, + "memory(GiB)": 302.58, + "step": 354080, + "train_speed(iter/s)": 0.123328 + }, + { + "acc": 0.7487061, + "epoch": 1.980294919097878, + "grad_norm": 8.1875, + "learning_rate": 2.651469921027272e-09, + "loss": 0.97673264, + "memory(GiB)": 302.58, + "step": 354100, + "train_speed(iter/s)": 0.123331 + }, + { + "acc": 0.74563174, + "epoch": 1.9804067685708573, + "grad_norm": 7.75, + "learning_rate": 2.621444815320007e-09, + "loss": 1.01472225, + "memory(GiB)": 302.58, + "step": 354120, + "train_speed(iter/s)": 0.123335 + }, + { + "acc": 0.74844141, + "epoch": 1.9805186180438366, + "grad_norm": 8.625, + "learning_rate": 2.5915906358053854e-09, + "loss": 0.99513693, + "memory(GiB)": 302.58, + "step": 354140, + "train_speed(iter/s)": 0.123338 + }, + { + "acc": 0.76429219, + "epoch": 1.9806304675168158, + "grad_norm": 8.75, + "learning_rate": 2.5619073835048137e-09, + "loss": 0.90909052, + "memory(GiB)": 302.58, + "step": 354160, + "train_speed(iter/s)": 0.123342 + }, + { + "acc": 0.74247255, + "epoch": 1.980742316989795, + "grad_norm": 4.90625, + "learning_rate": 2.53239505943359e-09, + "loss": 0.99974308, + "memory(GiB)": 302.58, + "step": 354180, + "train_speed(iter/s)": 0.123345 + }, + { + "acc": 0.74693227, + "epoch": 1.9808541664627743, + "grad_norm": 7.90625, + "learning_rate": 2.5030536646009073e-09, + "loss": 0.99972954, + "memory(GiB)": 302.58, + "step": 354200, + "train_speed(iter/s)": 0.123348 + }, + { + "acc": 0.75592842, + "epoch": 1.9809660159357536, + "grad_norm": 4.8125, + "learning_rate": 2.4738832000104075e-09, + "loss": 0.97597208, + "memory(GiB)": 302.58, + "step": 354220, + "train_speed(iter/s)": 0.123352 + }, + { + "acc": 0.76409149, + "epoch": 1.9810778654087329, + "grad_norm": 7.6875, + "learning_rate": 2.4448836666596254e-09, + "loss": 0.92995806, + "memory(GiB)": 302.58, + "step": 354240, + "train_speed(iter/s)": 0.123355 + }, + { + "acc": 0.76067505, + "epoch": 1.9811897148817121, + "grad_norm": 7.46875, + "learning_rate": 2.416055065540546e-09, + "loss": 0.95609808, + "memory(GiB)": 302.58, + "step": 354260, + "train_speed(iter/s)": 0.123358 + }, + { + "acc": 0.73586955, + "epoch": 1.9813015643546914, + "grad_norm": 8.3125, + "learning_rate": 2.3873973976396015e-09, + "loss": 1.03637638, + "memory(GiB)": 302.58, + "step": 354280, + "train_speed(iter/s)": 0.123362 + }, + { + "acc": 0.74846449, + "epoch": 1.9814134138276707, + "grad_norm": 8.0625, + "learning_rate": 2.3589106639365642e-09, + "loss": 0.96552448, + "memory(GiB)": 302.58, + "step": 354300, + "train_speed(iter/s)": 0.123365 + }, + { + "acc": 0.75134296, + "epoch": 1.98152526330065, + "grad_norm": 5.46875, + "learning_rate": 2.3305948654062106e-09, + "loss": 0.97506132, + "memory(GiB)": 302.58, + "step": 354320, + "train_speed(iter/s)": 0.123368 + }, + { + "acc": 0.76338301, + "epoch": 1.9816371127736292, + "grad_norm": 7.09375, + "learning_rate": 2.302450003016099e-09, + "loss": 0.93303795, + "memory(GiB)": 302.58, + "step": 354340, + "train_speed(iter/s)": 0.123372 + }, + { + "acc": 0.74877815, + "epoch": 1.9817489622466087, + "grad_norm": 6.90625, + "learning_rate": 2.2744760777299034e-09, + "loss": 0.96756544, + "memory(GiB)": 302.58, + "step": 354360, + "train_speed(iter/s)": 0.123375 + }, + { + "acc": 0.74354477, + "epoch": 1.9818608117195877, + "grad_norm": 5.5625, + "learning_rate": 2.2466730905040814e-09, + "loss": 1.00682859, + "memory(GiB)": 302.58, + "step": 354380, + "train_speed(iter/s)": 0.123379 + }, + { + "acc": 0.75237556, + "epoch": 1.9819726611925672, + "grad_norm": 6.21875, + "learning_rate": 2.219041042288983e-09, + "loss": 0.95931311, + "memory(GiB)": 302.58, + "step": 354400, + "train_speed(iter/s)": 0.123382 + }, + { + "acc": 0.75625429, + "epoch": 1.9820845106655463, + "grad_norm": 8.375, + "learning_rate": 2.191579934030519e-09, + "loss": 0.95280256, + "memory(GiB)": 302.58, + "step": 354420, + "train_speed(iter/s)": 0.123385 + }, + { + "acc": 0.74595876, + "epoch": 1.9821963601385257, + "grad_norm": 8.9375, + "learning_rate": 2.164289766667382e-09, + "loss": 0.98674479, + "memory(GiB)": 302.58, + "step": 354440, + "train_speed(iter/s)": 0.123389 + }, + { + "acc": 0.74050546, + "epoch": 1.9823082096115048, + "grad_norm": 6.59375, + "learning_rate": 2.1371705411338263e-09, + "loss": 1.0380744, + "memory(GiB)": 302.58, + "step": 354460, + "train_speed(iter/s)": 0.123392 + }, + { + "acc": 0.76384039, + "epoch": 1.9824200590844843, + "grad_norm": 7.71875, + "learning_rate": 2.1102222583563312e-09, + "loss": 0.93456087, + "memory(GiB)": 302.58, + "step": 354480, + "train_speed(iter/s)": 0.123396 + }, + { + "acc": 0.76064153, + "epoch": 1.9825319085574633, + "grad_norm": 9.1875, + "learning_rate": 2.0834449192569383e-09, + "loss": 0.95192556, + "memory(GiB)": 302.58, + "step": 354500, + "train_speed(iter/s)": 0.123399 + }, + { + "acc": 0.75470209, + "epoch": 1.9826437580304428, + "grad_norm": 6.53125, + "learning_rate": 2.056838524752136e-09, + "loss": 0.96110268, + "memory(GiB)": 302.58, + "step": 354520, + "train_speed(iter/s)": 0.123403 + }, + { + "acc": 0.74588346, + "epoch": 1.9827556075034218, + "grad_norm": 7.5, + "learning_rate": 2.030403075751752e-09, + "loss": 0.98839464, + "memory(GiB)": 302.58, + "step": 354540, + "train_speed(iter/s)": 0.123406 + }, + { + "acc": 0.73228412, + "epoch": 1.9828674569764013, + "grad_norm": 7.5, + "learning_rate": 2.0041385731589537e-09, + "loss": 1.06221781, + "memory(GiB)": 302.58, + "step": 354560, + "train_speed(iter/s)": 0.123409 + }, + { + "acc": 0.74804568, + "epoch": 1.9829793064493804, + "grad_norm": 6.125, + "learning_rate": 1.9780450178735754e-09, + "loss": 0.99858675, + "memory(GiB)": 302.58, + "step": 354580, + "train_speed(iter/s)": 0.123413 + }, + { + "acc": 0.74613247, + "epoch": 1.9830911559223598, + "grad_norm": 9.0, + "learning_rate": 1.952122410786572e-09, + "loss": 0.98768988, + "memory(GiB)": 302.58, + "step": 354600, + "train_speed(iter/s)": 0.123416 + }, + { + "acc": 0.75657377, + "epoch": 1.983203005395339, + "grad_norm": 8.3125, + "learning_rate": 1.926370752786122e-09, + "loss": 0.95799522, + "memory(GiB)": 302.58, + "step": 354620, + "train_speed(iter/s)": 0.123419 + }, + { + "acc": 0.77044992, + "epoch": 1.9833148548683184, + "grad_norm": 6.8125, + "learning_rate": 1.9007900447515217e-09, + "loss": 0.91392679, + "memory(GiB)": 302.58, + "step": 354640, + "train_speed(iter/s)": 0.123423 + }, + { + "acc": 0.76086998, + "epoch": 1.9834267043412974, + "grad_norm": 7.71875, + "learning_rate": 1.875380287559292e-09, + "loss": 0.93047848, + "memory(GiB)": 302.58, + "step": 354660, + "train_speed(iter/s)": 0.123426 + }, + { + "acc": 0.75481963, + "epoch": 1.983538553814277, + "grad_norm": 5.9375, + "learning_rate": 1.8501414820770724e-09, + "loss": 0.97096167, + "memory(GiB)": 302.58, + "step": 354680, + "train_speed(iter/s)": 0.123429 + }, + { + "acc": 0.77566342, + "epoch": 1.983650403287256, + "grad_norm": 10.875, + "learning_rate": 1.8250736291686167e-09, + "loss": 0.8930891, + "memory(GiB)": 302.58, + "step": 354700, + "train_speed(iter/s)": 0.123432 + }, + { + "acc": 0.74925966, + "epoch": 1.9837622527602354, + "grad_norm": 6.34375, + "learning_rate": 1.8001767296921269e-09, + "loss": 0.99372931, + "memory(GiB)": 302.58, + "step": 354720, + "train_speed(iter/s)": 0.123436 + }, + { + "acc": 0.75710278, + "epoch": 1.9838741022332145, + "grad_norm": 4.34375, + "learning_rate": 1.775450784498034e-09, + "loss": 0.94437475, + "memory(GiB)": 302.58, + "step": 354740, + "train_speed(iter/s)": 0.123439 + }, + { + "acc": 0.74562378, + "epoch": 1.983985951706194, + "grad_norm": 11.5, + "learning_rate": 1.7508957944323279e-09, + "loss": 1.01743364, + "memory(GiB)": 302.58, + "step": 354760, + "train_speed(iter/s)": 0.123442 + }, + { + "acc": 0.74058743, + "epoch": 1.984097801179173, + "grad_norm": 7.03125, + "learning_rate": 1.7265117603348924e-09, + "loss": 1.02329674, + "memory(GiB)": 302.58, + "step": 354780, + "train_speed(iter/s)": 0.123446 + }, + { + "acc": 0.74568205, + "epoch": 1.9842096506521525, + "grad_norm": 8.5, + "learning_rate": 1.70229868304006e-09, + "loss": 1.00222387, + "memory(GiB)": 302.58, + "step": 354800, + "train_speed(iter/s)": 0.123449 + }, + { + "acc": 0.75640831, + "epoch": 1.9843215001251315, + "grad_norm": 6.3125, + "learning_rate": 1.6782565633760572e-09, + "loss": 0.92132444, + "memory(GiB)": 302.58, + "step": 354820, + "train_speed(iter/s)": 0.123452 + }, + { + "acc": 0.74807959, + "epoch": 1.984433349598111, + "grad_norm": 8.5625, + "learning_rate": 1.654385402164449e-09, + "loss": 0.98973942, + "memory(GiB)": 302.58, + "step": 354840, + "train_speed(iter/s)": 0.123455 + }, + { + "acc": 0.75870724, + "epoch": 1.98454519907109, + "grad_norm": 5.8125, + "learning_rate": 1.6306852002223594e-09, + "loss": 0.93273373, + "memory(GiB)": 302.58, + "step": 354860, + "train_speed(iter/s)": 0.123459 + }, + { + "acc": 0.75169392, + "epoch": 1.9846570485440695, + "grad_norm": 7.6875, + "learning_rate": 1.6071559583602514e-09, + "loss": 0.98142052, + "memory(GiB)": 302.58, + "step": 354880, + "train_speed(iter/s)": 0.123462 + }, + { + "acc": 0.75202694, + "epoch": 1.9847688980170486, + "grad_norm": 7.78125, + "learning_rate": 1.5837976773830366e-09, + "loss": 0.96438227, + "memory(GiB)": 302.58, + "step": 354900, + "train_speed(iter/s)": 0.123465 + }, + { + "acc": 0.74467654, + "epoch": 1.984880747490028, + "grad_norm": 5.5, + "learning_rate": 1.5606103580895205e-09, + "loss": 1.00673437, + "memory(GiB)": 302.58, + "step": 354920, + "train_speed(iter/s)": 0.123469 + }, + { + "acc": 0.73920307, + "epoch": 1.9849925969630071, + "grad_norm": 8.125, + "learning_rate": 1.5375940012724022e-09, + "loss": 1.04601231, + "memory(GiB)": 302.58, + "step": 354940, + "train_speed(iter/s)": 0.123472 + }, + { + "acc": 0.73925881, + "epoch": 1.9851044464359866, + "grad_norm": 9.375, + "learning_rate": 1.514748607719385e-09, + "loss": 1.03458338, + "memory(GiB)": 302.58, + "step": 354960, + "train_speed(iter/s)": 0.123475 + }, + { + "acc": 0.74906139, + "epoch": 1.9852162959089656, + "grad_norm": 9.375, + "learning_rate": 1.4920741782120663e-09, + "loss": 0.98928137, + "memory(GiB)": 302.58, + "step": 354980, + "train_speed(iter/s)": 0.123479 + }, + { + "acc": 0.75404735, + "epoch": 1.9853281453819451, + "grad_norm": 5.84375, + "learning_rate": 1.469570713525381e-09, + "loss": 0.96108236, + "memory(GiB)": 302.58, + "step": 355000, + "train_speed(iter/s)": 0.123482 + }, + { + "acc": 0.76093392, + "epoch": 1.9854399948549242, + "grad_norm": 5.96875, + "learning_rate": 1.4472382144292696e-09, + "loss": 0.92727013, + "memory(GiB)": 302.58, + "step": 355020, + "train_speed(iter/s)": 0.123485 + }, + { + "acc": 0.77634096, + "epoch": 1.9855518443279037, + "grad_norm": 10.4375, + "learning_rate": 1.425076681687565e-09, + "loss": 0.87033262, + "memory(GiB)": 302.58, + "step": 355040, + "train_speed(iter/s)": 0.123489 + }, + { + "acc": 0.75191784, + "epoch": 1.9856636938008827, + "grad_norm": 8.6875, + "learning_rate": 1.4030861160579945e-09, + "loss": 0.98160114, + "memory(GiB)": 302.58, + "step": 355060, + "train_speed(iter/s)": 0.123492 + }, + { + "acc": 0.74519091, + "epoch": 1.9857755432738622, + "grad_norm": 7.0625, + "learning_rate": 1.3812665182932893e-09, + "loss": 1.00674, + "memory(GiB)": 302.58, + "step": 355080, + "train_speed(iter/s)": 0.123495 + }, + { + "acc": 0.75754037, + "epoch": 1.9858873927468412, + "grad_norm": 8.3125, + "learning_rate": 1.3596178891395196e-09, + "loss": 0.94806414, + "memory(GiB)": 302.58, + "step": 355100, + "train_speed(iter/s)": 0.123498 + }, + { + "acc": 0.76920757, + "epoch": 1.9859992422198207, + "grad_norm": 5.90625, + "learning_rate": 1.3381402293372036e-09, + "loss": 0.885711, + "memory(GiB)": 302.58, + "step": 355120, + "train_speed(iter/s)": 0.123502 + }, + { + "acc": 0.73928013, + "epoch": 1.9861110916927998, + "grad_norm": 5.5625, + "learning_rate": 1.3168335396207544e-09, + "loss": 0.99489794, + "memory(GiB)": 302.58, + "step": 355140, + "train_speed(iter/s)": 0.123505 + }, + { + "acc": 0.76747813, + "epoch": 1.9862229411657792, + "grad_norm": 7.84375, + "learning_rate": 1.295697820719033e-09, + "loss": 0.90522633, + "memory(GiB)": 302.58, + "step": 355160, + "train_speed(iter/s)": 0.123508 + }, + { + "acc": 0.74165587, + "epoch": 1.9863347906387583, + "grad_norm": 5.84375, + "learning_rate": 1.2747330733547946e-09, + "loss": 1.02159004, + "memory(GiB)": 302.58, + "step": 355180, + "train_speed(iter/s)": 0.123511 + }, + { + "acc": 0.74253402, + "epoch": 1.9864466401117378, + "grad_norm": 7.375, + "learning_rate": 1.2539392982452436e-09, + "loss": 1.00978117, + "memory(GiB)": 302.58, + "step": 355200, + "train_speed(iter/s)": 0.123515 + }, + { + "acc": 0.73806639, + "epoch": 1.9865584895847168, + "grad_norm": 10.6875, + "learning_rate": 1.2333164961014777e-09, + "loss": 1.03844395, + "memory(GiB)": 302.58, + "step": 355220, + "train_speed(iter/s)": 0.123518 + }, + { + "acc": 0.75112896, + "epoch": 1.9866703390576963, + "grad_norm": 9.25, + "learning_rate": 1.2128646676290433e-09, + "loss": 0.98548479, + "memory(GiB)": 302.58, + "step": 355240, + "train_speed(iter/s)": 0.123521 + }, + { + "acc": 0.75121169, + "epoch": 1.9867821885306753, + "grad_norm": 7.59375, + "learning_rate": 1.1925838135273815e-09, + "loss": 0.99584188, + "memory(GiB)": 302.58, + "step": 355260, + "train_speed(iter/s)": 0.123525 + }, + { + "acc": 0.7551435, + "epoch": 1.9868940380036548, + "grad_norm": 7.125, + "learning_rate": 1.172473934489826e-09, + "loss": 0.96734858, + "memory(GiB)": 302.58, + "step": 355280, + "train_speed(iter/s)": 0.123528 + }, + { + "acc": 0.75703187, + "epoch": 1.9870058874766339, + "grad_norm": 7.0625, + "learning_rate": 1.1525350312047156e-09, + "loss": 0.95703382, + "memory(GiB)": 302.58, + "step": 355300, + "train_speed(iter/s)": 0.123531 + }, + { + "acc": 0.74161315, + "epoch": 1.9871177369496134, + "grad_norm": 7.09375, + "learning_rate": 1.132767104354282e-09, + "loss": 1.0067709, + "memory(GiB)": 302.58, + "step": 355320, + "train_speed(iter/s)": 0.123535 + }, + { + "acc": 0.74626613, + "epoch": 1.9872295864225924, + "grad_norm": 5.6875, + "learning_rate": 1.113170154613541e-09, + "loss": 0.99935246, + "memory(GiB)": 302.58, + "step": 355340, + "train_speed(iter/s)": 0.123538 + }, + { + "acc": 0.76169934, + "epoch": 1.9873414358955719, + "grad_norm": 9.25, + "learning_rate": 1.093744182653622e-09, + "loss": 0.93945284, + "memory(GiB)": 302.58, + "step": 355360, + "train_speed(iter/s)": 0.123541 + }, + { + "acc": 0.74638276, + "epoch": 1.987453285368551, + "grad_norm": 9.625, + "learning_rate": 1.074489189138994e-09, + "loss": 1.01977901, + "memory(GiB)": 302.58, + "step": 355380, + "train_speed(iter/s)": 0.123544 + }, + { + "acc": 0.74050541, + "epoch": 1.9875651348415304, + "grad_norm": 7.46875, + "learning_rate": 1.0554051747280192e-09, + "loss": 1.02969837, + "memory(GiB)": 302.58, + "step": 355400, + "train_speed(iter/s)": 0.123548 + }, + { + "acc": 0.76248679, + "epoch": 1.9876769843145095, + "grad_norm": 9.75, + "learning_rate": 1.0364921400729532e-09, + "loss": 0.92879171, + "memory(GiB)": 302.58, + "step": 355420, + "train_speed(iter/s)": 0.123551 + }, + { + "acc": 0.75665541, + "epoch": 1.987788833787489, + "grad_norm": 7.21875, + "learning_rate": 1.0177500858216116e-09, + "loss": 0.93011017, + "memory(GiB)": 302.58, + "step": 355440, + "train_speed(iter/s)": 0.123554 + }, + { + "acc": 0.74283257, + "epoch": 1.987900683260468, + "grad_norm": 7.96875, + "learning_rate": 9.991790126140378e-10, + "loss": 1.01272392, + "memory(GiB)": 302.58, + "step": 355460, + "train_speed(iter/s)": 0.123557 + }, + { + "acc": 0.75400453, + "epoch": 1.9880125327334475, + "grad_norm": 6.5, + "learning_rate": 9.807789210863894e-10, + "loss": 0.95089855, + "memory(GiB)": 302.58, + "step": 355480, + "train_speed(iter/s)": 0.123561 + }, + { + "acc": 0.76941819, + "epoch": 1.9881243822064265, + "grad_norm": 6.28125, + "learning_rate": 9.62549811867608e-10, + "loss": 0.90015259, + "memory(GiB)": 302.58, + "step": 355500, + "train_speed(iter/s)": 0.123564 + }, + { + "acc": 0.75109186, + "epoch": 1.988236231679406, + "grad_norm": 5.53125, + "learning_rate": 9.444916855805287e-10, + "loss": 0.9762394, + "memory(GiB)": 302.58, + "step": 355520, + "train_speed(iter/s)": 0.123567 + }, + { + "acc": 0.76252861, + "epoch": 1.988348081152385, + "grad_norm": 7.75, + "learning_rate": 9.266045428441006e-10, + "loss": 0.92300901, + "memory(GiB)": 302.58, + "step": 355540, + "train_speed(iter/s)": 0.123571 + }, + { + "acc": 0.74872713, + "epoch": 1.9884599306253645, + "grad_norm": 7.125, + "learning_rate": 9.088883842689467e-10, + "loss": 0.99890633, + "memory(GiB)": 302.58, + "step": 355560, + "train_speed(iter/s)": 0.123574 + }, + { + "acc": 0.74578815, + "epoch": 1.9885717800983436, + "grad_norm": 6.15625, + "learning_rate": 8.913432104618036e-10, + "loss": 1.00380859, + "memory(GiB)": 302.58, + "step": 355580, + "train_speed(iter/s)": 0.123577 + }, + { + "acc": 0.7575429, + "epoch": 1.988683629571323, + "grad_norm": 7.34375, + "learning_rate": 8.739690220221919e-10, + "loss": 0.95093384, + "memory(GiB)": 302.58, + "step": 355600, + "train_speed(iter/s)": 0.123581 + }, + { + "acc": 0.74587388, + "epoch": 1.988795479044302, + "grad_norm": 9.125, + "learning_rate": 8.567658195451911e-10, + "loss": 1.00277882, + "memory(GiB)": 302.58, + "step": 355620, + "train_speed(iter/s)": 0.123583 + }, + { + "acc": 0.75595179, + "epoch": 1.9889073285172816, + "grad_norm": 10.8125, + "learning_rate": 8.397336036181092e-10, + "loss": 0.93251762, + "memory(GiB)": 302.58, + "step": 355640, + "train_speed(iter/s)": 0.123586 + }, + { + "acc": 0.76813059, + "epoch": 1.9890191779902606, + "grad_norm": 10.0625, + "learning_rate": 8.228723748243684e-10, + "loss": 0.92162485, + "memory(GiB)": 302.58, + "step": 355660, + "train_speed(iter/s)": 0.12359 + }, + { + "acc": 0.73524847, + "epoch": 1.98913102746324, + "grad_norm": 7.0625, + "learning_rate": 8.061821337401743e-10, + "loss": 1.05751152, + "memory(GiB)": 302.58, + "step": 355680, + "train_speed(iter/s)": 0.123593 + }, + { + "acc": 0.73508086, + "epoch": 1.9892428769362192, + "grad_norm": 5.75, + "learning_rate": 7.896628809367368e-10, + "loss": 1.07013836, + "memory(GiB)": 302.58, + "step": 355700, + "train_speed(iter/s)": 0.123596 + }, + { + "acc": 0.7231328, + "epoch": 1.9893547264091986, + "grad_norm": 6.8125, + "learning_rate": 7.733146169786043e-10, + "loss": 1.11723003, + "memory(GiB)": 302.58, + "step": 355720, + "train_speed(iter/s)": 0.123599 + }, + { + "acc": 0.74739828, + "epoch": 1.9894665758821777, + "grad_norm": 6.03125, + "learning_rate": 7.571373424253292e-10, + "loss": 0.99006929, + "memory(GiB)": 302.58, + "step": 355740, + "train_speed(iter/s)": 0.123603 + }, + { + "acc": 0.74215026, + "epoch": 1.9895784253551572, + "grad_norm": 6.5, + "learning_rate": 7.411310578303576e-10, + "loss": 1.03881369, + "memory(GiB)": 302.58, + "step": 355760, + "train_speed(iter/s)": 0.123606 + }, + { + "acc": 0.74323001, + "epoch": 1.9896902748281362, + "grad_norm": 5.59375, + "learning_rate": 7.252957637404745e-10, + "loss": 1.02148809, + "memory(GiB)": 302.58, + "step": 355780, + "train_speed(iter/s)": 0.123609 + }, + { + "acc": 0.73394866, + "epoch": 1.9898021243011157, + "grad_norm": 5.96875, + "learning_rate": 7.096314606980236e-10, + "loss": 1.06406174, + "memory(GiB)": 302.58, + "step": 355800, + "train_speed(iter/s)": 0.123613 + }, + { + "acc": 0.74679952, + "epoch": 1.9899139737740947, + "grad_norm": 7.4375, + "learning_rate": 6.941381492386879e-10, + "loss": 1.00833836, + "memory(GiB)": 302.58, + "step": 355820, + "train_speed(iter/s)": 0.123616 + }, + { + "acc": 0.73853083, + "epoch": 1.9900258232470742, + "grad_norm": 7.53125, + "learning_rate": 6.788158298920433e-10, + "loss": 1.03157587, + "memory(GiB)": 302.58, + "step": 355840, + "train_speed(iter/s)": 0.123619 + }, + { + "acc": 0.75366063, + "epoch": 1.9901376727200533, + "grad_norm": 6.71875, + "learning_rate": 6.636645031821154e-10, + "loss": 0.95948229, + "memory(GiB)": 302.58, + "step": 355860, + "train_speed(iter/s)": 0.123623 + }, + { + "acc": 0.74685392, + "epoch": 1.9902495221930327, + "grad_norm": 9.625, + "learning_rate": 6.486841696273782e-10, + "loss": 0.99889669, + "memory(GiB)": 302.58, + "step": 355880, + "train_speed(iter/s)": 0.123626 + }, + { + "acc": 0.7558917, + "epoch": 1.9903613716660118, + "grad_norm": 7.625, + "learning_rate": 6.338748297401997e-10, + "loss": 0.94498863, + "memory(GiB)": 302.58, + "step": 355900, + "train_speed(iter/s)": 0.123629 + }, + { + "acc": 0.74785438, + "epoch": 1.9904732211389913, + "grad_norm": 8.6875, + "learning_rate": 6.192364840273968e-10, + "loss": 0.99167013, + "memory(GiB)": 302.58, + "step": 355920, + "train_speed(iter/s)": 0.123633 + }, + { + "acc": 0.75124474, + "epoch": 1.9905850706119703, + "grad_norm": 7.53125, + "learning_rate": 6.047691329891248e-10, + "loss": 1.00349712, + "memory(GiB)": 302.58, + "step": 355940, + "train_speed(iter/s)": 0.123636 + }, + { + "acc": 0.7511405, + "epoch": 1.9906969200849498, + "grad_norm": 5.125, + "learning_rate": 5.904727771199881e-10, + "loss": 0.97902384, + "memory(GiB)": 302.58, + "step": 355960, + "train_speed(iter/s)": 0.123639 + }, + { + "acc": 0.77052541, + "epoch": 1.9908087695579288, + "grad_norm": 6.65625, + "learning_rate": 5.763474169095951e-10, + "loss": 0.8814887, + "memory(GiB)": 302.58, + "step": 355980, + "train_speed(iter/s)": 0.123643 + }, + { + "acc": 0.74610538, + "epoch": 1.9909206190309083, + "grad_norm": 6.53125, + "learning_rate": 5.623930528414478e-10, + "loss": 1.01242609, + "memory(GiB)": 302.58, + "step": 356000, + "train_speed(iter/s)": 0.123646 + }, + { + "epoch": 1.9909206190309083, + "eval_acc": 0.7069170982846789, + "eval_loss": 1.0117700099945068, + "eval_runtime": 7537.628, + "eval_samples_per_second": 9.988, + "eval_steps_per_second": 9.988, + "step": 356000 + }, + { + "acc": 0.75436587, + "epoch": 1.9910324685038874, + "grad_norm": 5.875, + "learning_rate": 5.486096853918321e-10, + "loss": 0.96560898, + "memory(GiB)": 302.58, + "step": 356020, + "train_speed(iter/s)": 0.12332 + }, + { + "acc": 0.76086731, + "epoch": 1.9911443179768669, + "grad_norm": 8.1875, + "learning_rate": 5.349973150320376e-10, + "loss": 0.95392017, + "memory(GiB)": 302.58, + "step": 356040, + "train_speed(iter/s)": 0.123323 + }, + { + "acc": 0.7564888, + "epoch": 1.991256167449846, + "grad_norm": 6.15625, + "learning_rate": 5.215559422289129e-10, + "loss": 0.92541018, + "memory(GiB)": 302.58, + "step": 356060, + "train_speed(iter/s)": 0.123326 + }, + { + "acc": 0.73893328, + "epoch": 1.9913680169228254, + "grad_norm": 8.375, + "learning_rate": 5.082855674415355e-10, + "loss": 1.0418602, + "memory(GiB)": 302.58, + "step": 356080, + "train_speed(iter/s)": 0.12333 + }, + { + "acc": 0.74140196, + "epoch": 1.9914798663958044, + "grad_norm": 6.875, + "learning_rate": 4.951861911234312e-10, + "loss": 1.03428612, + "memory(GiB)": 302.58, + "step": 356100, + "train_speed(iter/s)": 0.123333 + }, + { + "acc": 0.74131026, + "epoch": 1.991591715868784, + "grad_norm": 7.375, + "learning_rate": 4.822578137231304e-10, + "loss": 1.01397572, + "memory(GiB)": 302.58, + "step": 356120, + "train_speed(iter/s)": 0.123336 + }, + { + "acc": 0.76639977, + "epoch": 1.991703565341763, + "grad_norm": 11.9375, + "learning_rate": 4.695004356825017e-10, + "loss": 0.90843744, + "memory(GiB)": 302.58, + "step": 356140, + "train_speed(iter/s)": 0.123339 + }, + { + "acc": 0.75124731, + "epoch": 1.9918154148147424, + "grad_norm": 6.4375, + "learning_rate": 4.5691405743786277e-10, + "loss": 0.96030712, + "memory(GiB)": 302.58, + "step": 356160, + "train_speed(iter/s)": 0.123343 + }, + { + "acc": 0.75595541, + "epoch": 1.9919272642877215, + "grad_norm": 7.65625, + "learning_rate": 4.4449867941998016e-10, + "loss": 0.96485863, + "memory(GiB)": 302.58, + "step": 356180, + "train_speed(iter/s)": 0.123346 + }, + { + "acc": 0.75086846, + "epoch": 1.992039113760701, + "grad_norm": 8.4375, + "learning_rate": 4.3225430205351414e-10, + "loss": 0.96631184, + "memory(GiB)": 302.58, + "step": 356200, + "train_speed(iter/s)": 0.123349 + }, + { + "acc": 0.76799059, + "epoch": 1.99215096323368, + "grad_norm": 10.3125, + "learning_rate": 4.2018092575757396e-10, + "loss": 0.91172495, + "memory(GiB)": 302.58, + "step": 356220, + "train_speed(iter/s)": 0.123353 + }, + { + "acc": 0.75859799, + "epoch": 1.9922628127066595, + "grad_norm": 7.125, + "learning_rate": 4.0827855094405235e-10, + "loss": 0.93221245, + "memory(GiB)": 302.58, + "step": 356240, + "train_speed(iter/s)": 0.123356 + }, + { + "acc": 0.74425015, + "epoch": 1.9923746621796385, + "grad_norm": 7.21875, + "learning_rate": 3.965471780209562e-10, + "loss": 1.02529926, + "memory(GiB)": 302.58, + "step": 356260, + "train_speed(iter/s)": 0.123359 + }, + { + "acc": 0.76329155, + "epoch": 1.992486511652618, + "grad_norm": 8.3125, + "learning_rate": 3.8498680738907614e-10, + "loss": 0.92177696, + "memory(GiB)": 302.58, + "step": 356280, + "train_speed(iter/s)": 0.123362 + }, + { + "acc": 0.76260529, + "epoch": 1.992598361125597, + "grad_norm": 7.3125, + "learning_rate": 3.7359743944420654e-10, + "loss": 0.92823067, + "memory(GiB)": 302.58, + "step": 356300, + "train_speed(iter/s)": 0.123365 + }, + { + "acc": 0.74999709, + "epoch": 1.9927102105985766, + "grad_norm": 9.625, + "learning_rate": 3.6237907457548073e-10, + "loss": 0.98223524, + "memory(GiB)": 302.58, + "step": 356320, + "train_speed(iter/s)": 0.123368 + }, + { + "acc": 0.76255016, + "epoch": 1.9928220600715556, + "grad_norm": 6.8125, + "learning_rate": 3.5133171316703575e-10, + "loss": 0.92918243, + "memory(GiB)": 302.58, + "step": 356340, + "train_speed(iter/s)": 0.123372 + }, + { + "acc": 0.75663342, + "epoch": 1.992933909544535, + "grad_norm": 8.125, + "learning_rate": 3.4045535559634746e-10, + "loss": 0.95101395, + "memory(GiB)": 302.58, + "step": 356360, + "train_speed(iter/s)": 0.123375 + }, + { + "acc": 0.7622643, + "epoch": 1.9930457590175141, + "grad_norm": 7.1875, + "learning_rate": 3.297500022358957e-10, + "loss": 0.9224823, + "memory(GiB)": 302.58, + "step": 356380, + "train_speed(iter/s)": 0.123378 + }, + { + "acc": 0.74325542, + "epoch": 1.9931576084904936, + "grad_norm": 7.34375, + "learning_rate": 3.1921565345094387e-10, + "loss": 1.00821514, + "memory(GiB)": 302.58, + "step": 356400, + "train_speed(iter/s)": 0.123382 + }, + { + "acc": 0.742417, + "epoch": 1.9932694579634727, + "grad_norm": 9.9375, + "learning_rate": 3.088523096028695e-10, + "loss": 1.00118761, + "memory(GiB)": 302.58, + "step": 356420, + "train_speed(iter/s)": 0.123385 + }, + { + "acc": 0.76191726, + "epoch": 1.9933813074364521, + "grad_norm": 8.5625, + "learning_rate": 2.9865997104583376e-10, + "loss": 0.93990908, + "memory(GiB)": 302.58, + "step": 356440, + "train_speed(iter/s)": 0.123389 + }, + { + "acc": 0.76707754, + "epoch": 1.9934931569094312, + "grad_norm": 7.25, + "learning_rate": 2.8863863812789164e-10, + "loss": 0.89284534, + "memory(GiB)": 302.58, + "step": 356460, + "train_speed(iter/s)": 0.123392 + }, + { + "acc": 0.75638466, + "epoch": 1.9936050063824107, + "grad_norm": 8.3125, + "learning_rate": 2.7878831119210193e-10, + "loss": 0.9432045, + "memory(GiB)": 302.58, + "step": 356480, + "train_speed(iter/s)": 0.123396 + }, + { + "acc": 0.75981908, + "epoch": 1.9937168558553897, + "grad_norm": 7.90625, + "learning_rate": 2.691089905754174e-10, + "loss": 0.93039284, + "memory(GiB)": 302.58, + "step": 356500, + "train_speed(iter/s)": 0.123399 + }, + { + "acc": 0.76050367, + "epoch": 1.9938287053283692, + "grad_norm": 6.84375, + "learning_rate": 2.596006766086845e-10, + "loss": 0.94639435, + "memory(GiB)": 302.58, + "step": 356520, + "train_speed(iter/s)": 0.123402 + }, + { + "acc": 0.749719, + "epoch": 1.9939405548013482, + "grad_norm": 7.1875, + "learning_rate": 2.502633696177537e-10, + "loss": 0.96968842, + "memory(GiB)": 302.58, + "step": 356540, + "train_speed(iter/s)": 0.123405 + }, + { + "acc": 0.75915289, + "epoch": 1.9940524042743277, + "grad_norm": 6.03125, + "learning_rate": 2.41097069921814e-10, + "loss": 0.95633001, + "memory(GiB)": 302.58, + "step": 356560, + "train_speed(iter/s)": 0.123408 + }, + { + "acc": 0.74691939, + "epoch": 1.9941642537473068, + "grad_norm": 7.96875, + "learning_rate": 2.3210177783339338e-10, + "loss": 0.99267702, + "memory(GiB)": 302.58, + "step": 356580, + "train_speed(iter/s)": 0.123412 + }, + { + "acc": 0.73108239, + "epoch": 1.9942761032202863, + "grad_norm": 5.84375, + "learning_rate": 2.2327749366168883e-10, + "loss": 1.06354771, + "memory(GiB)": 302.58, + "step": 356600, + "train_speed(iter/s)": 0.123415 + }, + { + "acc": 0.75651155, + "epoch": 1.9943879526932653, + "grad_norm": 9.6875, + "learning_rate": 2.1462421770757082e-10, + "loss": 0.95884104, + "memory(GiB)": 302.58, + "step": 356620, + "train_speed(iter/s)": 0.123418 + }, + { + "acc": 0.75609107, + "epoch": 1.9944998021662448, + "grad_norm": 8.5625, + "learning_rate": 2.0614195026691376e-10, + "loss": 0.94037085, + "memory(GiB)": 302.58, + "step": 356640, + "train_speed(iter/s)": 0.123422 + }, + { + "acc": 0.7420125, + "epoch": 1.9946116516392238, + "grad_norm": 8.5625, + "learning_rate": 1.9783069163004098e-10, + "loss": 1.02755136, + "memory(GiB)": 302.58, + "step": 356660, + "train_speed(iter/s)": 0.123425 + }, + { + "acc": 0.75804911, + "epoch": 1.9947235011122033, + "grad_norm": 8.5625, + "learning_rate": 1.8969044208172472e-10, + "loss": 0.93530893, + "memory(GiB)": 302.58, + "step": 356680, + "train_speed(iter/s)": 0.123428 + }, + { + "acc": 0.74477859, + "epoch": 1.9948353505851824, + "grad_norm": 7.5, + "learning_rate": 1.817212019000758e-10, + "loss": 1.00148039, + "memory(GiB)": 302.58, + "step": 356700, + "train_speed(iter/s)": 0.123432 + }, + { + "acc": 0.75174217, + "epoch": 1.9949472000581618, + "grad_norm": 6.90625, + "learning_rate": 1.739229713570989e-10, + "loss": 0.96859894, + "memory(GiB)": 302.58, + "step": 356720, + "train_speed(iter/s)": 0.123435 + }, + { + "acc": 0.73930993, + "epoch": 1.9950590495311409, + "grad_norm": 6.71875, + "learning_rate": 1.6629575072035776e-10, + "loss": 1.03362951, + "memory(GiB)": 302.58, + "step": 356740, + "train_speed(iter/s)": 0.123438 + }, + { + "acc": 0.74791727, + "epoch": 1.9951708990041204, + "grad_norm": 5.84375, + "learning_rate": 1.5883954024964454e-10, + "loss": 0.99956331, + "memory(GiB)": 302.58, + "step": 356760, + "train_speed(iter/s)": 0.123442 + }, + { + "acc": 0.77197371, + "epoch": 1.9952827484770994, + "grad_norm": 8.875, + "learning_rate": 1.5155434020142079e-10, + "loss": 0.85948534, + "memory(GiB)": 302.58, + "step": 356780, + "train_speed(iter/s)": 0.123445 + }, + { + "acc": 0.75080099, + "epoch": 1.995394597950079, + "grad_norm": 9.6875, + "learning_rate": 1.4444015082382135e-10, + "loss": 0.99228058, + "memory(GiB)": 302.58, + "step": 356800, + "train_speed(iter/s)": 0.123448 + }, + { + "acc": 0.762082, + "epoch": 1.995506447423058, + "grad_norm": 5.78125, + "learning_rate": 1.3749697236054016e-10, + "loss": 0.92963781, + "memory(GiB)": 302.58, + "step": 356820, + "train_speed(iter/s)": 0.123452 + }, + { + "acc": 0.74662542, + "epoch": 1.9956182968960374, + "grad_norm": 6.15625, + "learning_rate": 1.3072480504916497e-10, + "loss": 0.9963726, + "memory(GiB)": 302.58, + "step": 356840, + "train_speed(iter/s)": 0.123455 + }, + { + "acc": 0.73816161, + "epoch": 1.9957301463690165, + "grad_norm": 7.4375, + "learning_rate": 1.2412364912062213e-10, + "loss": 1.04232702, + "memory(GiB)": 302.58, + "step": 356860, + "train_speed(iter/s)": 0.123458 + }, + { + "acc": 0.74293013, + "epoch": 1.995841995841996, + "grad_norm": 5.5625, + "learning_rate": 1.176935048019523e-10, + "loss": 1.01110783, + "memory(GiB)": 302.58, + "step": 356880, + "train_speed(iter/s)": 0.123462 + }, + { + "acc": 0.75478234, + "epoch": 1.995953845314975, + "grad_norm": 6.34375, + "learning_rate": 1.1143437231186937e-10, + "loss": 0.96445761, + "memory(GiB)": 302.58, + "step": 356900, + "train_speed(iter/s)": 0.123465 + }, + { + "acc": 0.74030056, + "epoch": 1.9960656947879545, + "grad_norm": 10.25, + "learning_rate": 1.0534625186520153e-10, + "loss": 1.01170673, + "memory(GiB)": 302.58, + "step": 356920, + "train_speed(iter/s)": 0.123468 + }, + { + "acc": 0.74468322, + "epoch": 1.9961775442609335, + "grad_norm": 9.25, + "learning_rate": 9.942914366956047e-11, + "loss": 0.99166908, + "memory(GiB)": 302.58, + "step": 356940, + "train_speed(iter/s)": 0.123471 + }, + { + "acc": 0.75142369, + "epoch": 1.996289393733913, + "grad_norm": 8.0625, + "learning_rate": 9.3683047928117e-11, + "loss": 0.97808304, + "memory(GiB)": 302.58, + "step": 356960, + "train_speed(iter/s)": 0.123475 + }, + { + "acc": 0.74566598, + "epoch": 1.996401243206892, + "grad_norm": 7.40625, + "learning_rate": 8.81079648368255e-11, + "loss": 1.01043758, + "memory(GiB)": 302.58, + "step": 356980, + "train_speed(iter/s)": 0.123478 + }, + { + "acc": 0.76503129, + "epoch": 1.9965130926798715, + "grad_norm": 6.90625, + "learning_rate": 8.270389458664429e-11, + "loss": 0.92011976, + "memory(GiB)": 302.58, + "step": 357000, + "train_speed(iter/s)": 0.123481 + }, + { + "acc": 0.76125245, + "epoch": 1.9966249421528506, + "grad_norm": 5.625, + "learning_rate": 7.747083736187044e-11, + "loss": 0.9494832, + "memory(GiB)": 302.58, + "step": 357020, + "train_speed(iter/s)": 0.123484 + }, + { + "acc": 0.74659963, + "epoch": 1.99673679162583, + "grad_norm": 8.5625, + "learning_rate": 7.240879334236006e-11, + "loss": 0.98614426, + "memory(GiB)": 302.58, + "step": 357040, + "train_speed(iter/s)": 0.123487 + }, + { + "acc": 0.74409838, + "epoch": 1.996848641098809, + "grad_norm": 8.4375, + "learning_rate": 6.751776270075283e-11, + "loss": 0.98774691, + "memory(GiB)": 302.58, + "step": 357060, + "train_speed(iter/s)": 0.123491 + }, + { + "acc": 0.76528459, + "epoch": 1.9969604905717886, + "grad_norm": 6.875, + "learning_rate": 6.279774560413732e-11, + "loss": 0.90334282, + "memory(GiB)": 302.58, + "step": 357080, + "train_speed(iter/s)": 0.123494 + }, + { + "acc": 0.7448379, + "epoch": 1.9970723400447676, + "grad_norm": 8.1875, + "learning_rate": 5.824874221460608e-11, + "loss": 0.99563751, + "memory(GiB)": 302.58, + "step": 357100, + "train_speed(iter/s)": 0.123498 + }, + { + "acc": 0.74948468, + "epoch": 1.9971841895177471, + "grad_norm": 5.71875, + "learning_rate": 5.387075268703523e-11, + "loss": 0.98543568, + "memory(GiB)": 302.58, + "step": 357120, + "train_speed(iter/s)": 0.123501 + }, + { + "acc": 0.75417681, + "epoch": 1.9972960389907262, + "grad_norm": 10.5, + "learning_rate": 4.966377717130488e-11, + "loss": 0.97015066, + "memory(GiB)": 302.58, + "step": 357140, + "train_speed(iter/s)": 0.123504 + }, + { + "acc": 0.74349294, + "epoch": 1.9974078884637056, + "grad_norm": 8.875, + "learning_rate": 4.562781581174403e-11, + "loss": 1.0034235, + "memory(GiB)": 302.58, + "step": 357160, + "train_speed(iter/s)": 0.123507 + }, + { + "acc": 0.76318073, + "epoch": 1.9975197379366847, + "grad_norm": 7.46875, + "learning_rate": 4.1762868746575424e-11, + "loss": 0.9215642, + "memory(GiB)": 302.58, + "step": 357180, + "train_speed(iter/s)": 0.123511 + }, + { + "acc": 0.75662189, + "epoch": 1.9976315874096642, + "grad_norm": 7.40625, + "learning_rate": 3.8068936106805396e-11, + "loss": 0.95213346, + "memory(GiB)": 302.58, + "step": 357200, + "train_speed(iter/s)": 0.123514 + }, + { + "acc": 0.76982932, + "epoch": 1.9977434368826432, + "grad_norm": 4.96875, + "learning_rate": 3.4546018020109596e-11, + "loss": 0.89427366, + "memory(GiB)": 302.58, + "step": 357220, + "train_speed(iter/s)": 0.123517 + }, + { + "acc": 0.76385489, + "epoch": 1.9978552863556227, + "grad_norm": 6.125, + "learning_rate": 3.119411460583699e-11, + "loss": 0.94001694, + "memory(GiB)": 302.58, + "step": 357240, + "train_speed(iter/s)": 0.12352 + }, + { + "acc": 0.76920662, + "epoch": 1.9979671358286017, + "grad_norm": 10.3125, + "learning_rate": 2.8013225979450775e-11, + "loss": 0.89932728, + "memory(GiB)": 302.58, + "step": 357260, + "train_speed(iter/s)": 0.123524 + }, + { + "acc": 0.74092526, + "epoch": 1.9980789853015812, + "grad_norm": 5.6875, + "learning_rate": 2.500335224975281e-11, + "loss": 1.01297235, + "memory(GiB)": 302.58, + "step": 357280, + "train_speed(iter/s)": 0.123527 + }, + { + "acc": 0.74263844, + "epoch": 1.9981908347745603, + "grad_norm": 5.3125, + "learning_rate": 2.2164493519438722e-11, + "loss": 1.00274124, + "memory(GiB)": 302.58, + "step": 357300, + "train_speed(iter/s)": 0.123531 + }, + { + "acc": 0.76304936, + "epoch": 1.9983026842475398, + "grad_norm": 6.21875, + "learning_rate": 1.9496649885097917e-11, + "loss": 0.91183996, + "memory(GiB)": 302.58, + "step": 357320, + "train_speed(iter/s)": 0.123534 + }, + { + "acc": 0.74660807, + "epoch": 1.9984145337205188, + "grad_norm": 7.9375, + "learning_rate": 1.6999821438878904e-11, + "loss": 1.00293846, + "memory(GiB)": 302.58, + "step": 357340, + "train_speed(iter/s)": 0.123537 + }, + { + "acc": 0.76576138, + "epoch": 1.9985263831934983, + "grad_norm": 5.90625, + "learning_rate": 1.4674008265713745e-11, + "loss": 0.91540995, + "memory(GiB)": 302.58, + "step": 357360, + "train_speed(iter/s)": 0.123541 + }, + { + "acc": 0.75446715, + "epoch": 1.9986382326664773, + "grad_norm": 7.5, + "learning_rate": 1.2519210444983388e-11, + "loss": 0.95108986, + "memory(GiB)": 302.58, + "step": 357380, + "train_speed(iter/s)": 0.123544 + }, + { + "acc": 0.76731987, + "epoch": 1.9987500821394568, + "grad_norm": 5.46875, + "learning_rate": 1.0535428051072771e-11, + "loss": 0.90177279, + "memory(GiB)": 302.58, + "step": 357400, + "train_speed(iter/s)": 0.123547 + }, + { + "acc": 0.75741577, + "epoch": 1.9988619316124359, + "grad_norm": 10.125, + "learning_rate": 8.722661151150392e-12, + "loss": 0.92651596, + "memory(GiB)": 302.58, + "step": 357420, + "train_speed(iter/s)": 0.12355 + }, + { + "acc": 0.74023275, + "epoch": 1.9989737810854153, + "grad_norm": 5.9375, + "learning_rate": 7.080909807388736e-12, + "loss": 1.03493547, + "memory(GiB)": 302.58, + "step": 357440, + "train_speed(iter/s)": 0.123554 + }, + { + "acc": 0.741996, + "epoch": 1.9990856305583944, + "grad_norm": 7.3125, + "learning_rate": 5.610174075854069e-12, + "loss": 1.02625742, + "memory(GiB)": 302.58, + "step": 357460, + "train_speed(iter/s)": 0.123557 + }, + { + "acc": 0.73920317, + "epoch": 1.9991974800313739, + "grad_norm": 8.6875, + "learning_rate": 4.310454007616649e-12, + "loss": 1.03225813, + "memory(GiB)": 302.58, + "step": 357480, + "train_speed(iter/s)": 0.12356 + }, + { + "acc": 0.74691796, + "epoch": 1.999309329504353, + "grad_norm": 3.859375, + "learning_rate": 3.181749645975174e-12, + "loss": 0.99571695, + "memory(GiB)": 302.58, + "step": 357500, + "train_speed(iter/s)": 0.123563 + }, + { + "acc": 0.74769607, + "epoch": 1.9994211789773324, + "grad_norm": 8.6875, + "learning_rate": 2.2240610303425616e-12, + "loss": 0.99232388, + "memory(GiB)": 302.58, + "step": 357520, + "train_speed(iter/s)": 0.123567 + }, + { + "acc": 0.7518239, + "epoch": 1.9995330284503114, + "grad_norm": 7.65625, + "learning_rate": 1.4373881929152788e-12, + "loss": 0.99144316, + "memory(GiB)": 302.58, + "step": 357540, + "train_speed(iter/s)": 0.12357 + }, + { + "acc": 0.74739747, + "epoch": 1.999644877923291, + "grad_norm": 6.875, + "learning_rate": 8.217311608937906e-13, + "loss": 0.99637089, + "memory(GiB)": 302.58, + "step": 357560, + "train_speed(iter/s)": 0.123573 + }, + { + "acc": 0.7568871, + "epoch": 1.99975672739627, + "grad_norm": 5.78125, + "learning_rate": 3.7708995592744544e-13, + "loss": 0.95811987, + "memory(GiB)": 302.58, + "step": 357580, + "train_speed(iter/s)": 0.123576 + }, + { + "acc": 0.76023417, + "epoch": 1.9998685768692495, + "grad_norm": 11.0625, + "learning_rate": 1.0346459189403136e-13, + "loss": 0.96870403, + "memory(GiB)": 302.58, + "step": 357600, + "train_speed(iter/s)": 0.12358 + }, + { + "acc": 0.75685439, + "epoch": 1.9999804263422285, + "grad_norm": 5.09375, + "learning_rate": 8.550793406669755e-16, + "loss": 0.95032158, + "memory(GiB)": 302.58, + "step": 357620, + "train_speed(iter/s)": 0.123583 + }, + { + "epoch": 1.9999916112895266, + "eval_acc": 0.7069055628366909, + "eval_loss": 1.0117747783660889, + "eval_runtime": 7551.8934, + "eval_samples_per_second": 9.969, + "eval_steps_per_second": 9.969, + "step": 357622 + } + ], + "logging_steps": 20, + "max_steps": 357622, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 2000, + "total_flos": 6.495530577257143e+19, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}