{
  "best_metric": 1.3922204971313477,
  "best_model_checkpoint": "miner_id_24/checkpoint-400",
  "epoch": 0.2705444707473791,
  "eval_steps": 50,
  "global_step": 400,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.0006763611768684477,
      "grad_norm": 0.11527051031589508,
      "learning_rate": 1e-05,
      "loss": 1.3722,
      "step": 1
    },
    {
      "epoch": 0.0006763611768684477,
      "eval_loss": 1.6020574569702148,
      "eval_runtime": 45.762,
      "eval_samples_per_second": 54.412,
      "eval_steps_per_second": 13.614,
      "step": 1
    },
    {
      "epoch": 0.0013527223537368955,
      "grad_norm": 0.12793777883052826,
      "learning_rate": 2e-05,
      "loss": 1.5535,
      "step": 2
    },
    {
      "epoch": 0.002029083530605343,
      "grad_norm": 0.13140501081943512,
      "learning_rate": 3e-05,
      "loss": 1.6156,
      "step": 3
    },
    {
      "epoch": 0.002705444707473791,
      "grad_norm": 0.11015880107879639,
      "learning_rate": 4e-05,
      "loss": 1.4666,
      "step": 4
    },
    {
      "epoch": 0.0033818058843422386,
      "grad_norm": 0.10875169187784195,
      "learning_rate": 5e-05,
      "loss": 1.4681,
      "step": 5
    },
    {
      "epoch": 0.004058167061210686,
      "grad_norm": 0.11826880276203156,
      "learning_rate": 6e-05,
      "loss": 1.5281,
      "step": 6
    },
    {
      "epoch": 0.0047345282380791345,
      "grad_norm": 0.11252899467945099,
      "learning_rate": 7e-05,
      "loss": 1.4841,
      "step": 7
    },
    {
      "epoch": 0.005410889414947582,
      "grad_norm": 0.10688446462154388,
      "learning_rate": 8e-05,
      "loss": 1.4466,
      "step": 8
    },
    {
      "epoch": 0.00608725059181603,
      "grad_norm": 0.10228835046291351,
      "learning_rate": 9e-05,
      "loss": 1.4563,
      "step": 9
    },
    {
      "epoch": 0.006763611768684477,
      "grad_norm": 0.09173312038183212,
      "learning_rate": 0.0001,
      "loss": 1.5006,
      "step": 10
    },
    {
      "epoch": 0.0074399729455529254,
      "grad_norm": 0.09866360574960709,
      "learning_rate": 9.99983777858264e-05,
      "loss": 1.6285,
      "step": 11
    },
    {
      "epoch": 0.008116334122421373,
      "grad_norm": 0.08047564327716827,
      "learning_rate": 9.999351124856874e-05,
      "loss": 1.5584,
      "step": 12
    },
    {
      "epoch": 0.00879269529928982,
      "grad_norm": 0.0752352774143219,
      "learning_rate": 9.998540070400966e-05,
      "loss": 1.5559,
      "step": 13
    },
    {
      "epoch": 0.009469056476158269,
      "grad_norm": 0.07531195878982544,
      "learning_rate": 9.997404667843075e-05,
      "loss": 1.5742,
      "step": 14
    },
    {
      "epoch": 0.010145417653026716,
      "grad_norm": 0.07908021658658981,
      "learning_rate": 9.995944990857849e-05,
      "loss": 1.4653,
      "step": 15
    },
    {
      "epoch": 0.010821778829895164,
      "grad_norm": 0.08029956370592117,
      "learning_rate": 9.994161134161634e-05,
      "loss": 1.5325,
      "step": 16
    },
    {
      "epoch": 0.011498140006763611,
      "grad_norm": 0.0774945616722107,
      "learning_rate": 9.992053213506334e-05,
      "loss": 1.5462,
      "step": 17
    },
    {
      "epoch": 0.01217450118363206,
      "grad_norm": 0.08803839236497879,
      "learning_rate": 9.989621365671902e-05,
      "loss": 1.584,
      "step": 18
    },
    {
      "epoch": 0.012850862360500507,
      "grad_norm": 0.10591822117567062,
      "learning_rate": 9.986865748457457e-05,
      "loss": 1.523,
      "step": 19
    },
    {
      "epoch": 0.013527223537368955,
      "grad_norm": 0.08525391668081284,
      "learning_rate": 9.983786540671051e-05,
      "loss": 1.5191,
      "step": 20
    },
    {
      "epoch": 0.014203584714237404,
      "grad_norm": 0.0858788788318634,
      "learning_rate": 9.980383942118066e-05,
      "loss": 1.5742,
      "step": 21
    },
    {
      "epoch": 0.014879945891105851,
      "grad_norm": 0.07967844605445862,
      "learning_rate": 9.976658173588244e-05,
      "loss": 1.5609,
      "step": 22
    },
    {
      "epoch": 0.015556307067974298,
      "grad_norm": 0.07027371227741241,
      "learning_rate": 9.972609476841367e-05,
      "loss": 1.4291,
      "step": 23
    },
    {
      "epoch": 0.016232668244842745,
      "grad_norm": 0.06715142726898193,
      "learning_rate": 9.968238114591566e-05,
      "loss": 1.564,
      "step": 24
    },
    {
      "epoch": 0.016909029421711193,
      "grad_norm": 0.06723647564649582,
      "learning_rate": 9.96354437049027e-05,
      "loss": 1.5361,
      "step": 25
    },
    {
      "epoch": 0.01758539059857964,
      "grad_norm": 0.07239236682653427,
      "learning_rate": 9.95852854910781e-05,
      "loss": 1.5553,
      "step": 26
    },
    {
      "epoch": 0.01826175177544809,
      "grad_norm": 0.07131913304328918,
      "learning_rate": 9.953190975913647e-05,
      "loss": 1.4897,
      "step": 27
    },
    {
      "epoch": 0.018938112952316538,
      "grad_norm": 0.06529667973518372,
      "learning_rate": 9.947531997255256e-05,
      "loss": 1.5582,
      "step": 28
    },
    {
      "epoch": 0.019614474129184985,
      "grad_norm": 0.06842414289712906,
      "learning_rate": 9.941551980335652e-05,
      "loss": 1.5437,
      "step": 29
    },
    {
      "epoch": 0.020290835306053433,
      "grad_norm": 0.06674303114414215,
      "learning_rate": 9.935251313189564e-05,
      "loss": 1.5053,
      "step": 30
    },
    {
      "epoch": 0.02096719648292188,
      "grad_norm": 0.07287029922008514,
      "learning_rate": 9.928630404658255e-05,
      "loss": 1.5596,
      "step": 31
    },
    {
      "epoch": 0.021643557659790327,
      "grad_norm": 0.06220496445894241,
      "learning_rate": 9.921689684362989e-05,
      "loss": 1.4754,
      "step": 32
    },
    {
      "epoch": 0.022319918836658775,
      "grad_norm": 0.0621667206287384,
      "learning_rate": 9.914429602677162e-05,
      "loss": 1.4362,
      "step": 33
    },
    {
      "epoch": 0.022996280013527222,
      "grad_norm": 0.06346531957387924,
      "learning_rate": 9.906850630697068e-05,
      "loss": 1.5012,
      "step": 34
    },
    {
      "epoch": 0.023672641190395673,
      "grad_norm": 0.06350383907556534,
      "learning_rate": 9.898953260211338e-05,
      "loss": 1.5494,
      "step": 35
    },
    {
      "epoch": 0.02434900236726412,
      "grad_norm": 0.06582257896661758,
      "learning_rate": 9.890738003669029e-05,
      "loss": 1.5761,
      "step": 36
    },
    {
      "epoch": 0.025025363544132567,
      "grad_norm": 0.06340694427490234,
      "learning_rate": 9.882205394146361e-05,
      "loss": 1.4405,
      "step": 37
    },
    {
      "epoch": 0.025701724721001015,
      "grad_norm": 0.06537513434886932,
      "learning_rate": 9.87335598531214e-05,
      "loss": 1.5213,
      "step": 38
    },
    {
      "epoch": 0.026378085897869462,
      "grad_norm": 0.06288953870534897,
      "learning_rate": 9.864190351391822e-05,
      "loss": 1.4631,
      "step": 39
    },
    {
      "epoch": 0.02705444707473791,
      "grad_norm": 0.06635712087154388,
      "learning_rate": 9.85470908713026e-05,
      "loss": 1.5389,
      "step": 40
    },
    {
      "epoch": 0.027730808251606356,
      "grad_norm": 0.06636327505111694,
      "learning_rate": 9.844912807753104e-05,
      "loss": 1.4432,
      "step": 41
    },
    {
      "epoch": 0.028407169428474807,
      "grad_norm": 0.06624823063611984,
      "learning_rate": 9.834802148926882e-05,
      "loss": 1.5093,
      "step": 42
    },
    {
      "epoch": 0.029083530605343254,
      "grad_norm": 0.06883368641138077,
      "learning_rate": 9.824377766717759e-05,
      "loss": 1.4792,
      "step": 43
    },
    {
      "epoch": 0.029759891782211702,
      "grad_norm": 0.06978511810302734,
      "learning_rate": 9.813640337548954e-05,
      "loss": 1.3603,
      "step": 44
    },
    {
      "epoch": 0.03043625295908015,
      "grad_norm": 0.07289346307516098,
      "learning_rate": 9.802590558156862e-05,
      "loss": 1.51,
      "step": 45
    },
    {
      "epoch": 0.031112614135948596,
      "grad_norm": 0.07324250042438507,
      "learning_rate": 9.791229145545831e-05,
      "loss": 1.4035,
      "step": 46
    },
    {
      "epoch": 0.03178897531281705,
      "grad_norm": 0.07471778243780136,
      "learning_rate": 9.779556836941645e-05,
      "loss": 1.4359,
      "step": 47
    },
    {
      "epoch": 0.03246533648968549,
      "grad_norm": 0.08484581112861633,
      "learning_rate": 9.767574389743682e-05,
      "loss": 1.4309,
      "step": 48
    },
    {
      "epoch": 0.03314169766655394,
      "grad_norm": 0.10304436087608337,
      "learning_rate": 9.755282581475769e-05,
      "loss": 1.4602,
      "step": 49
    },
    {
      "epoch": 0.033818058843422386,
      "grad_norm": 0.1328336000442505,
      "learning_rate": 9.742682209735727e-05,
      "loss": 1.4454,
      "step": 50
    },
    {
      "epoch": 0.033818058843422386,
      "eval_loss": 1.4632972478866577,
      "eval_runtime": 45.874,
      "eval_samples_per_second": 54.279,
      "eval_steps_per_second": 13.581,
      "step": 50
    },
    {
      "epoch": 0.034494420020290836,
      "grad_norm": 0.08472179621458054,
      "learning_rate": 9.729774092143627e-05,
      "loss": 1.2426,
      "step": 51
    },
    {
      "epoch": 0.03517078119715928,
      "grad_norm": 0.09228364378213882,
      "learning_rate": 9.716559066288715e-05,
      "loss": 1.3988,
      "step": 52
    },
    {
      "epoch": 0.03584714237402773,
      "grad_norm": 0.08056582510471344,
      "learning_rate": 9.703037989675087e-05,
      "loss": 1.4519,
      "step": 53
    },
    {
      "epoch": 0.03652350355089618,
      "grad_norm": 0.07555064558982849,
      "learning_rate": 9.689211739666023e-05,
      "loss": 1.3203,
      "step": 54
    },
    {
      "epoch": 0.037199864727764625,
      "grad_norm": 0.07038799673318863,
      "learning_rate": 9.675081213427076e-05,
      "loss": 1.248,
      "step": 55
    },
    {
      "epoch": 0.037876225904633076,
      "grad_norm": 0.0710492879152298,
      "learning_rate": 9.66064732786784e-05,
      "loss": 1.3863,
      "step": 56
    },
    {
      "epoch": 0.03855258708150152,
      "grad_norm": 0.06492199748754501,
      "learning_rate": 9.645911019582467e-05,
      "loss": 1.3043,
      "step": 57
    },
    {
      "epoch": 0.03922894825836997,
      "grad_norm": 0.06760136038064957,
      "learning_rate": 9.630873244788883e-05,
      "loss": 1.3661,
      "step": 58
    },
    {
      "epoch": 0.039905309435238415,
      "grad_norm": 0.06265412271022797,
      "learning_rate": 9.615534979266745e-05,
      "loss": 1.3609,
      "step": 59
    },
    {
      "epoch": 0.040581670612106865,
      "grad_norm": 0.061597876250743866,
      "learning_rate": 9.599897218294122e-05,
      "loss": 1.4391,
      "step": 60
    },
    {
      "epoch": 0.041258031788975316,
      "grad_norm": 0.05606893450021744,
      "learning_rate": 9.583960976582913e-05,
      "loss": 1.4275,
      "step": 61
    },
    {
      "epoch": 0.04193439296584376,
      "grad_norm": 0.05780341103672981,
      "learning_rate": 9.567727288213005e-05,
      "loss": 1.4847,
      "step": 62
    },
    {
      "epoch": 0.04261075414271221,
      "grad_norm": 0.056065794080495834,
      "learning_rate": 9.551197206565173e-05,
      "loss": 1.4721,
      "step": 63
    },
    {
      "epoch": 0.043287115319580655,
      "grad_norm": 0.056345950812101364,
      "learning_rate": 9.534371804252728e-05,
      "loss": 1.4328,
      "step": 64
    },
    {
      "epoch": 0.043963476496449105,
      "grad_norm": 0.058517809957265854,
      "learning_rate": 9.517252173051911e-05,
      "loss": 1.4927,
      "step": 65
    },
    {
      "epoch": 0.04463983767331755,
      "grad_norm": 0.0576399601995945,
      "learning_rate": 9.49983942383106e-05,
      "loss": 1.3739,
      "step": 66
    },
    {
      "epoch": 0.045316198850186,
      "grad_norm": 0.05825427174568176,
      "learning_rate": 9.482134686478519e-05,
      "loss": 1.4341,
      "step": 67
    },
    {
      "epoch": 0.045992560027054444,
      "grad_norm": 0.06041615083813667,
      "learning_rate": 9.464139109829321e-05,
      "loss": 1.3953,
      "step": 68
    },
    {
      "epoch": 0.046668921203922895,
      "grad_norm": 0.057867467403411865,
      "learning_rate": 9.445853861590647e-05,
      "loss": 1.3927,
      "step": 69
    },
    {
      "epoch": 0.047345282380791345,
      "grad_norm": 0.05943746492266655,
      "learning_rate": 9.42728012826605e-05,
      "loss": 1.4106,
      "step": 70
    },
    {
      "epoch": 0.04802164355765979,
      "grad_norm": 0.05969482287764549,
      "learning_rate": 9.408419115078471e-05,
      "loss": 1.4894,
      "step": 71
    },
    {
      "epoch": 0.04869800473452824,
      "grad_norm": 0.06147676706314087,
      "learning_rate": 9.389272045892024e-05,
      "loss": 1.4909,
      "step": 72
    },
    {
      "epoch": 0.049374365911396684,
      "grad_norm": 0.05909387767314911,
      "learning_rate": 9.36984016313259e-05,
      "loss": 1.4984,
      "step": 73
    },
    {
      "epoch": 0.050050727088265135,
      "grad_norm": 0.05941369757056236,
      "learning_rate": 9.350124727707197e-05,
      "loss": 1.4996,
      "step": 74
    },
    {
      "epoch": 0.05072708826513358,
      "grad_norm": 0.06290363520383835,
      "learning_rate": 9.330127018922194e-05,
      "loss": 1.3869,
      "step": 75
    },
    {
      "epoch": 0.05140344944200203,
      "grad_norm": 0.05921513959765434,
      "learning_rate": 9.309848334400246e-05,
      "loss": 1.398,
      "step": 76
    },
    {
      "epoch": 0.05207981061887048,
      "grad_norm": 0.061745863407850266,
      "learning_rate": 9.289289989996133e-05,
      "loss": 1.4086,
      "step": 77
    },
    {
      "epoch": 0.052756171795738924,
      "grad_norm": 0.06437841057777405,
      "learning_rate": 9.268453319711363e-05,
      "loss": 1.4707,
      "step": 78
    },
    {
      "epoch": 0.053432532972607374,
      "grad_norm": 0.06059794872999191,
      "learning_rate": 9.247339675607605e-05,
      "loss": 1.5068,
      "step": 79
    },
    {
      "epoch": 0.05410889414947582,
      "grad_norm": 0.06443328410387039,
      "learning_rate": 9.225950427718975e-05,
      "loss": 1.3733,
      "step": 80
    },
    {
      "epoch": 0.05478525532634427,
      "grad_norm": 0.06261321902275085,
      "learning_rate": 9.204286963963111e-05,
      "loss": 1.4561,
      "step": 81
    },
    {
      "epoch": 0.05546161650321271,
      "grad_norm": 0.061781320720911026,
      "learning_rate": 9.182350690051133e-05,
      "loss": 1.4226,
      "step": 82
    },
    {
      "epoch": 0.056137977680081164,
      "grad_norm": 0.06218276172876358,
      "learning_rate": 9.160143029396422e-05,
      "loss": 1.4586,
      "step": 83
    },
    {
      "epoch": 0.056814338856949614,
      "grad_norm": 0.06496214866638184,
      "learning_rate": 9.13766542302225e-05,
      "loss": 1.3872,
      "step": 84
    },
    {
      "epoch": 0.05749070003381806,
      "grad_norm": 0.0679158866405487,
      "learning_rate": 9.114919329468282e-05,
      "loss": 1.4496,
      "step": 85
    },
    {
      "epoch": 0.05816706121068651,
      "grad_norm": 0.06544684618711472,
      "learning_rate": 9.091906224695935e-05,
      "loss": 1.4761,
      "step": 86
    },
    {
      "epoch": 0.05884342238755495,
      "grad_norm": 0.06715362519025803,
      "learning_rate": 9.068627601992598e-05,
      "loss": 1.4541,
      "step": 87
    },
    {
      "epoch": 0.059519783564423404,
      "grad_norm": 0.06633464992046356,
      "learning_rate": 9.045084971874738e-05,
      "loss": 1.4444,
      "step": 88
    },
    {
      "epoch": 0.06019614474129185,
      "grad_norm": 0.07181775569915771,
      "learning_rate": 9.021279861989885e-05,
      "loss": 1.5305,
      "step": 89
    },
    {
      "epoch": 0.0608725059181603,
      "grad_norm": 0.07183065265417099,
      "learning_rate": 8.997213817017507e-05,
      "loss": 1.4656,
      "step": 90
    },
    {
      "epoch": 0.06154886709502874,
      "grad_norm": 0.07328730821609497,
      "learning_rate": 8.972888398568772e-05,
      "loss": 1.4432,
      "step": 91
    },
    {
      "epoch": 0.06222522827189719,
      "grad_norm": 0.07428156584501266,
      "learning_rate": 8.948305185085225e-05,
      "loss": 1.3638,
      "step": 92
    },
    {
      "epoch": 0.06290158944876564,
      "grad_norm": 0.07306837290525436,
      "learning_rate": 8.92346577173636e-05,
      "loss": 1.4581,
      "step": 93
    },
    {
      "epoch": 0.0635779506256341,
      "grad_norm": 0.07504208385944366,
      "learning_rate": 8.898371770316111e-05,
      "loss": 1.5102,
      "step": 94
    },
    {
      "epoch": 0.06425431180250253,
      "grad_norm": 0.0772990882396698,
      "learning_rate": 8.873024809138272e-05,
      "loss": 1.4124,
      "step": 95
    },
    {
      "epoch": 0.06493067297937098,
      "grad_norm": 0.07291746884584427,
      "learning_rate": 8.847426532930831e-05,
      "loss": 1.4007,
      "step": 96
    },
    {
      "epoch": 0.06560703415623943,
      "grad_norm": 0.08162276446819305,
      "learning_rate": 8.821578602729242e-05,
      "loss": 1.4192,
      "step": 97
    },
    {
      "epoch": 0.06628339533310788,
      "grad_norm": 0.08907538652420044,
      "learning_rate": 8.795482695768658e-05,
      "loss": 1.3384,
      "step": 98
    },
    {
      "epoch": 0.06695975650997633,
      "grad_norm": 0.0932670459151268,
      "learning_rate": 8.769140505375085e-05,
      "loss": 1.4058,
      "step": 99
    },
    {
      "epoch": 0.06763611768684477,
      "grad_norm": 0.1595381498336792,
      "learning_rate": 8.742553740855506e-05,
      "loss": 1.5911,
      "step": 100
    },
    {
      "epoch": 0.06763611768684477,
      "eval_loss": 1.4289613962173462,
      "eval_runtime": 45.8536,
      "eval_samples_per_second": 54.303,
      "eval_steps_per_second": 13.587,
      "step": 100
    },
    {
      "epoch": 0.06831247886371322,
      "grad_norm": 0.09340012818574905,
      "learning_rate": 8.715724127386972e-05,
      "loss": 1.4526,
      "step": 101
    },
    {
      "epoch": 0.06898884004058167,
      "grad_norm": 0.08029083907604218,
      "learning_rate": 8.688653405904652e-05,
      "loss": 1.4576,
      "step": 102
    },
    {
      "epoch": 0.06966520121745012,
      "grad_norm": 0.08003030717372894,
      "learning_rate": 8.661343332988869e-05,
      "loss": 1.4589,
      "step": 103
    },
    {
      "epoch": 0.07034156239431856,
      "grad_norm": 0.06887253373861313,
      "learning_rate": 8.633795680751116e-05,
      "loss": 1.3456,
      "step": 104
    },
    {
      "epoch": 0.07101792357118701,
      "grad_norm": 0.07519850134849548,
      "learning_rate": 8.606012236719073e-05,
      "loss": 1.4775,
      "step": 105
    },
    {
      "epoch": 0.07169428474805546,
      "grad_norm": 0.0648961216211319,
      "learning_rate": 8.577994803720606e-05,
      "loss": 1.3131,
      "step": 106
    },
    {
      "epoch": 0.07237064592492391,
      "grad_norm": 0.059890005737543106,
      "learning_rate": 8.549745199766792e-05,
      "loss": 1.248,
      "step": 107
    },
    {
      "epoch": 0.07304700710179236,
      "grad_norm": 0.05943324416875839,
      "learning_rate": 8.521265257933948e-05,
      "loss": 1.2764,
      "step": 108
    },
    {
      "epoch": 0.0737233682786608,
      "grad_norm": 0.06385409832000732,
      "learning_rate": 8.492556826244687e-05,
      "loss": 1.3833,
      "step": 109
    },
    {
      "epoch": 0.07439972945552925,
      "grad_norm": 0.06080995127558708,
      "learning_rate": 8.463621767547998e-05,
      "loss": 1.42,
      "step": 110
    },
    {
      "epoch": 0.0750760906323977,
      "grad_norm": 0.05953231826424599,
      "learning_rate": 8.434461959398376e-05,
      "loss": 1.4552,
      "step": 111
    },
    {
      "epoch": 0.07575245180926615,
      "grad_norm": 0.059315070509910583,
      "learning_rate": 8.405079293933986e-05,
      "loss": 1.3466,
      "step": 112
    },
    {
      "epoch": 0.07642881298613459,
      "grad_norm": 0.06055072322487831,
      "learning_rate": 8.375475677753881e-05,
      "loss": 1.4209,
      "step": 113
    },
    {
      "epoch": 0.07710517416300304,
      "grad_norm": 0.0636223629117012,
      "learning_rate": 8.345653031794292e-05,
      "loss": 1.443,
      "step": 114
    },
    {
      "epoch": 0.07778153533987149,
      "grad_norm": 0.06455370783805847,
      "learning_rate": 8.315613291203976e-05,
      "loss": 1.4417,
      "step": 115
    },
    {
      "epoch": 0.07845789651673994,
      "grad_norm": 0.06425165385007858,
      "learning_rate": 8.285358405218655e-05,
      "loss": 1.4707,
      "step": 116
    },
    {
      "epoch": 0.07913425769360839,
      "grad_norm": 0.06385717540979385,
      "learning_rate": 8.25489033703452e-05,
      "loss": 1.4646,
      "step": 117
    },
    {
      "epoch": 0.07981061887047683,
      "grad_norm": 0.06884630769491196,
      "learning_rate": 8.224211063680853e-05,
      "loss": 1.4347,
      "step": 118
    },
    {
      "epoch": 0.08048698004734528,
      "grad_norm": 0.06414289772510529,
      "learning_rate": 8.19332257589174e-05,
      "loss": 1.4166,
      "step": 119
    },
    {
      "epoch": 0.08116334122421373,
      "grad_norm": 0.0632779598236084,
      "learning_rate": 8.162226877976887e-05,
      "loss": 1.4469,
      "step": 120
    },
    {
      "epoch": 0.08183970240108218,
      "grad_norm": 0.06453357636928558,
      "learning_rate": 8.130925987691569e-05,
      "loss": 1.4517,
      "step": 121
    },
    {
      "epoch": 0.08251606357795063,
      "grad_norm": 0.06426431983709335,
      "learning_rate": 8.099421936105702e-05,
      "loss": 1.4219,
      "step": 122
    },
    {
      "epoch": 0.08319242475481907,
      "grad_norm": 0.06670214980840683,
      "learning_rate": 8.067716767472045e-05,
      "loss": 1.4,
      "step": 123
    },
    {
      "epoch": 0.08386878593168752,
      "grad_norm": 0.06815646588802338,
      "learning_rate": 8.035812539093557e-05,
      "loss": 1.4972,
      "step": 124
    },
    {
      "epoch": 0.08454514710855597,
      "grad_norm": 0.06987130641937256,
      "learning_rate": 8.003711321189895e-05,
      "loss": 1.5506,
      "step": 125
    },
    {
      "epoch": 0.08522150828542442,
      "grad_norm": 0.06460455805063248,
      "learning_rate": 7.971415196763088e-05,
      "loss": 1.39,
      "step": 126
    },
    {
      "epoch": 0.08589786946229286,
      "grad_norm": 0.0693214163184166,
      "learning_rate": 7.938926261462366e-05,
      "loss": 1.4748,
      "step": 127
    },
    {
      "epoch": 0.08657423063916131,
      "grad_norm": 0.06622923165559769,
      "learning_rate": 7.906246623448183e-05,
      "loss": 1.4451,
      "step": 128
    },
    {
      "epoch": 0.08725059181602976,
      "grad_norm": 0.06661586463451385,
      "learning_rate": 7.873378403255419e-05,
      "loss": 1.3786,
      "step": 129
    },
    {
      "epoch": 0.08792695299289821,
      "grad_norm": 0.06834559142589569,
      "learning_rate": 7.840323733655778e-05,
      "loss": 1.4451,
      "step": 130
    },
    {
      "epoch": 0.08860331416976666,
      "grad_norm": 0.06984566152095795,
      "learning_rate": 7.807084759519405e-05,
      "loss": 1.4485,
      "step": 131
    },
    {
      "epoch": 0.0892796753466351,
      "grad_norm": 0.06916147470474243,
      "learning_rate": 7.773663637675694e-05,
      "loss": 1.4026,
      "step": 132
    },
    {
      "epoch": 0.08995603652350355,
      "grad_norm": 0.07113322615623474,
      "learning_rate": 7.740062536773352e-05,
      "loss": 1.5727,
      "step": 133
    },
    {
      "epoch": 0.090632397700372,
      "grad_norm": 0.07131800055503845,
      "learning_rate": 7.706283637139658e-05,
      "loss": 1.454,
      "step": 134
    },
    {
      "epoch": 0.09130875887724045,
      "grad_norm": 0.06780333071947098,
      "learning_rate": 7.672329130639005e-05,
      "loss": 1.4221,
      "step": 135
    },
    {
      "epoch": 0.09198512005410889,
      "grad_norm": 0.07369336485862732,
      "learning_rate": 7.638201220530665e-05,
      "loss": 1.4888,
      "step": 136
    },
    {
      "epoch": 0.09266148123097734,
      "grad_norm": 0.07125518471002579,
      "learning_rate": 7.603902121325813e-05,
      "loss": 1.3931,
      "step": 137
    },
    {
      "epoch": 0.09333784240784579,
      "grad_norm": 0.07436627149581909,
      "learning_rate": 7.569434058643844e-05,
      "loss": 1.4815,
      "step": 138
    },
    {
      "epoch": 0.09401420358471424,
      "grad_norm": 0.07242149114608765,
      "learning_rate": 7.534799269067953e-05,
      "loss": 1.3624,
      "step": 139
    },
    {
      "epoch": 0.09469056476158269,
      "grad_norm": 0.07634232938289642,
      "learning_rate": 7.500000000000001e-05,
      "loss": 1.3689,
      "step": 140
    },
    {
      "epoch": 0.09536692593845113,
      "grad_norm": 0.0717257410287857,
      "learning_rate": 7.465038509514688e-05,
      "loss": 1.3373,
      "step": 141
    },
    {
      "epoch": 0.09604328711531958,
      "grad_norm": 0.0763496682047844,
      "learning_rate": 7.42991706621303e-05,
      "loss": 1.3357,
      "step": 142
    },
    {
      "epoch": 0.09671964829218803,
      "grad_norm": 0.08058321475982666,
      "learning_rate": 7.394637949075154e-05,
      "loss": 1.4484,
      "step": 143
    },
    {
      "epoch": 0.09739600946905648,
      "grad_norm": 0.0799909457564354,
      "learning_rate": 7.35920344731241e-05,
      "loss": 1.4597,
      "step": 144
    },
    {
      "epoch": 0.09807237064592493,
      "grad_norm": 0.08274323493242264,
      "learning_rate": 7.323615860218843e-05,
      "loss": 1.348,
      "step": 145
    },
    {
      "epoch": 0.09874873182279337,
      "grad_norm": 0.08631595224142075,
      "learning_rate": 7.287877497021978e-05,
      "loss": 1.4747,
      "step": 146
    },
    {
      "epoch": 0.09942509299966182,
      "grad_norm": 0.09415697306394577,
      "learning_rate": 7.251990676732984e-05,
      "loss": 1.4111,
      "step": 147
    },
    {
      "epoch": 0.10010145417653027,
      "grad_norm": 0.09759113937616348,
      "learning_rate": 7.215957727996207e-05,
      "loss": 1.3532,
      "step": 148
    },
    {
      "epoch": 0.10077781535339872,
      "grad_norm": 0.10980842262506485,
      "learning_rate": 7.179780988938051e-05,
      "loss": 1.3642,
      "step": 149
    },
    {
      "epoch": 0.10145417653026716,
      "grad_norm": 0.17088009417057037,
      "learning_rate": 7.143462807015271e-05,
      "loss": 1.3813,
      "step": 150
    },
    {
      "epoch": 0.10145417653026716,
      "eval_loss": 1.4142309427261353,
      "eval_runtime": 45.7509,
      "eval_samples_per_second": 54.425,
      "eval_steps_per_second": 13.617,
      "step": 150
    },
    {
      "epoch": 0.10213053770713561,
      "grad_norm": 0.0729823112487793,
      "learning_rate": 7.107005538862646e-05,
      "loss": 1.2632,
      "step": 151
    },
    {
      "epoch": 0.10280689888400406,
      "grad_norm": 0.08052191883325577,
      "learning_rate": 7.07041155014006e-05,
      "loss": 1.4305,
      "step": 152
    },
    {
      "epoch": 0.10348326006087251,
      "grad_norm": 0.07198279350996017,
      "learning_rate": 7.033683215379002e-05,
      "loss": 1.3343,
      "step": 153
    },
    {
      "epoch": 0.10415962123774096,
      "grad_norm": 0.071586973965168,
      "learning_rate": 6.996822917828477e-05,
      "loss": 1.3404,
      "step": 154
    },
    {
      "epoch": 0.1048359824146094,
      "grad_norm": 0.06972288340330124,
      "learning_rate": 6.959833049300377e-05,
      "loss": 1.2954,
      "step": 155
    },
    {
      "epoch": 0.10551234359147785,
      "grad_norm": 0.06969049572944641,
      "learning_rate": 6.922716010014255e-05,
      "loss": 1.3755,
      "step": 156
    },
    {
      "epoch": 0.1061887047683463,
      "grad_norm": 0.06844943761825562,
      "learning_rate": 6.885474208441603e-05,
      "loss": 1.3698,
      "step": 157
    },
    {
      "epoch": 0.10686506594521475,
      "grad_norm": 0.06935857236385345,
      "learning_rate": 6.848110061149556e-05,
      "loss": 1.384,
      "step": 158
    },
    {
      "epoch": 0.10754142712208319,
      "grad_norm": 0.06879723072052002,
      "learning_rate": 6.810625992644085e-05,
      "loss": 1.34,
      "step": 159
    },
    {
      "epoch": 0.10821778829895164,
      "grad_norm": 0.06257957220077515,
      "learning_rate": 6.773024435212678e-05,
      "loss": 1.3494,
      "step": 160
    },
    {
      "epoch": 0.10889414947582009,
      "grad_norm": 0.06383413076400757,
      "learning_rate": 6.735307828766515e-05,
      "loss": 1.3507,
      "step": 161
    },
    {
      "epoch": 0.10957051065268854,
      "grad_norm": 0.06752607226371765,
      "learning_rate": 6.697478620682137e-05,
      "loss": 1.4388,
      "step": 162
    },
    {
      "epoch": 0.11024687182955699,
      "grad_norm": 0.06457509845495224,
      "learning_rate": 6.659539265642643e-05,
      "loss": 1.3931,
      "step": 163
    },
    {
      "epoch": 0.11092323300642543,
      "grad_norm": 0.06549496948719025,
      "learning_rate": 6.621492225478414e-05,
      "loss": 1.4351,
      "step": 164
    },
    {
      "epoch": 0.11159959418329388,
      "grad_norm": 0.06523440778255463,
      "learning_rate": 6.583339969007363e-05,
      "loss": 1.4126,
      "step": 165
    },
    {
      "epoch": 0.11227595536016233,
      "grad_norm": 0.06583328545093536,
      "learning_rate": 6.545084971874738e-05,
      "loss": 1.4372,
      "step": 166
    },
    {
      "epoch": 0.11295231653703078,
      "grad_norm": 0.06682123988866806,
      "learning_rate": 6.506729716392481e-05,
      "loss": 1.5266,
      "step": 167
    },
    {
      "epoch": 0.11362867771389923,
      "grad_norm": 0.06995988637208939,
      "learning_rate": 6.468276691378155e-05,
      "loss": 1.4669,
      "step": 168
    },
    {
      "epoch": 0.11430503889076767,
      "grad_norm": 0.07168641686439514,
      "learning_rate": 6.429728391993446e-05,
      "loss": 1.4679,
      "step": 169
    },
    {
      "epoch": 0.11498140006763612,
      "grad_norm": 0.06742502003908157,
      "learning_rate": 6.391087319582264e-05,
      "loss": 1.4673,
      "step": 170
    },
    {
      "epoch": 0.11565776124450457,
      "grad_norm": 0.06958625465631485,
      "learning_rate": 6.35235598150842e-05,
      "loss": 1.4848,
      "step": 171
    },
    {
      "epoch": 0.11633412242137302,
      "grad_norm": 0.06822968274354935,
      "learning_rate": 6.313536890992935e-05,
      "loss": 1.4539,
      "step": 172
    },
    {
      "epoch": 0.11701048359824145,
      "grad_norm": 0.06616730242967606,
      "learning_rate": 6.274632566950967e-05,
      "loss": 1.4109,
      "step": 173
    },
    {
      "epoch": 0.1176868447751099,
      "grad_norm": 0.07107029855251312,
      "learning_rate": 6.235645533828349e-05,
      "loss": 1.3708,
      "step": 174
    },
    {
      "epoch": 0.11836320595197836,
      "grad_norm": 0.07082799077033997,
      "learning_rate": 6.19657832143779e-05,
      "loss": 1.3988,
      "step": 175
    },
    {
      "epoch": 0.11903956712884681,
      "grad_norm": 0.07087160646915436,
      "learning_rate": 6.157433464794716e-05,
      "loss": 1.4447,
      "step": 176
    },
    {
      "epoch": 0.11971592830571526,
      "grad_norm": 0.0733053982257843,
      "learning_rate": 6.118213503952779e-05,
      "loss": 1.4614,
      "step": 177
    },
    {
      "epoch": 0.1203922894825837,
      "grad_norm": 0.07366427779197693,
      "learning_rate": 6.078920983839031e-05,
      "loss": 1.4916,
      "step": 178
    },
    {
      "epoch": 0.12106865065945215,
      "grad_norm": 0.07121474295854568,
      "learning_rate": 6.0395584540887963e-05,
      "loss": 1.4848,
      "step": 179
    },
    {
      "epoch": 0.1217450118363206,
      "grad_norm": 0.07643849402666092,
      "learning_rate": 6.0001284688802226e-05,
      "loss": 1.4999,
      "step": 180
    },
    {
      "epoch": 0.12242137301318905,
      "grad_norm": 0.07152778655290604,
      "learning_rate": 5.960633586768543e-05,
      "loss": 1.3907,
      "step": 181
    },
    {
      "epoch": 0.12309773419005748,
      "grad_norm": 0.07466912269592285,
      "learning_rate": 5.921076370520058e-05,
      "loss": 1.4363,
      "step": 182
    },
    {
      "epoch": 0.12377409536692593,
      "grad_norm": 0.07023868709802628,
      "learning_rate": 5.8814593869458455e-05,
      "loss": 1.3702,
      "step": 183
    },
    {
      "epoch": 0.12445045654379439,
      "grad_norm": 0.07278748601675034,
      "learning_rate": 5.841785206735192e-05,
      "loss": 1.4415,
      "step": 184
    },
    {
      "epoch": 0.12512681772066284,
      "grad_norm": 0.07512617111206055,
      "learning_rate": 5.8020564042888015e-05,
      "loss": 1.43,
      "step": 185
    },
    {
      "epoch": 0.1258031788975313,
      "grad_norm": 0.07797680795192719,
      "learning_rate": 5.762275557551727e-05,
      "loss": 1.4119,
      "step": 186
    },
    {
      "epoch": 0.12647954007439974,
      "grad_norm": 0.07866205275058746,
      "learning_rate": 5.7224452478461064e-05,
      "loss": 1.4005,
      "step": 187
    },
    {
      "epoch": 0.1271559012512682,
      "grad_norm": 0.07712559401988983,
      "learning_rate": 5.682568059703659e-05,
      "loss": 1.4383,
      "step": 188
    },
    {
      "epoch": 0.1278322624281366,
      "grad_norm": 0.07656645774841309,
      "learning_rate": 5.642646580697973e-05,
      "loss": 1.327,
      "step": 189
    },
    {
      "epoch": 0.12850862360500506,
      "grad_norm": 0.08458492159843445,
      "learning_rate": 5.602683401276615e-05,
      "loss": 1.4881,
      "step": 190
    },
    {
      "epoch": 0.1291849847818735,
      "grad_norm": 0.08732939511537552,
      "learning_rate": 5.562681114593028e-05,
      "loss": 1.4576,
      "step": 191
    },
    {
      "epoch": 0.12986134595874196,
      "grad_norm": 0.08154824376106262,
      "learning_rate": 5.522642316338268e-05,
      "loss": 1.375,
      "step": 192
    },
    {
      "epoch": 0.13053770713561041,
      "grad_norm": 0.08457835018634796,
      "learning_rate": 5.482569604572576e-05,
      "loss": 1.445,
      "step": 193
    },
    {
      "epoch": 0.13121406831247887,
      "grad_norm": 0.08304689079523087,
      "learning_rate": 5.442465579556793e-05,
      "loss": 1.3442,
      "step": 194
    },
    {
      "epoch": 0.13189042948934732,
      "grad_norm": 0.0892767384648323,
      "learning_rate": 5.402332843583631e-05,
      "loss": 1.4186,
      "step": 195
    },
    {
      "epoch": 0.13256679066621577,
      "grad_norm": 0.0887024998664856,
      "learning_rate": 5.3621740008088126e-05,
      "loss": 1.4056,
      "step": 196
    },
    {
      "epoch": 0.13324315184308422,
      "grad_norm": 0.09405411779880524,
      "learning_rate": 5.321991657082097e-05,
      "loss": 1.3538,
      "step": 197
    },
    {
      "epoch": 0.13391951301995267,
      "grad_norm": 0.09736300259828568,
      "learning_rate": 5.281788419778187e-05,
      "loss": 1.3224,
      "step": 198
    },
    {
      "epoch": 0.1345958741968211,
      "grad_norm": 0.10766453295946121,
      "learning_rate": 5.2415668976275355e-05,
      "loss": 1.302,
      "step": 199
    },
    {
      "epoch": 0.13527223537368954,
      "grad_norm": 0.13096509873867035,
      "learning_rate": 5.201329700547076e-05,
      "loss": 1.3488,
      "step": 200
    },
    {
      "epoch": 0.13527223537368954,
      "eval_loss": 1.4043376445770264,
      "eval_runtime": 45.9119,
      "eval_samples_per_second": 54.234,
      "eval_steps_per_second": 13.569,
      "step": 200
    },
    {
      "epoch": 0.135948596550558,
      "grad_norm": 0.07206866890192032,
      "learning_rate": 5.161079439470866e-05,
      "loss": 1.2779,
      "step": 201
    },
    {
      "epoch": 0.13662495772742644,
      "grad_norm": 0.07780182361602783,
      "learning_rate": 5.1208187261806615e-05,
      "loss": 1.3795,
      "step": 202
    },
    {
      "epoch": 0.1373013189042949,
      "grad_norm": 0.07541805505752563,
      "learning_rate": 5.080550173136457e-05,
      "loss": 1.3189,
      "step": 203
    },
    {
      "epoch": 0.13797768008116335,
      "grad_norm": 0.07348611205816269,
      "learning_rate": 5.0402763933069496e-05,
      "loss": 1.3119,
      "step": 204
    },
    {
      "epoch": 0.1386540412580318,
      "grad_norm": 0.07221783697605133,
      "learning_rate": 5e-05,
      "loss": 1.3501,
      "step": 205
    },
    {
      "epoch": 0.13933040243490025,
      "grad_norm": 0.0704021155834198,
      "learning_rate": 4.9597236066930516e-05,
      "loss": 1.3384,
      "step": 206
    },
    {
      "epoch": 0.1400067636117687,
      "grad_norm": 0.06795866042375565,
      "learning_rate": 4.919449826863544e-05,
      "loss": 1.3522,
      "step": 207
    },
    {
      "epoch": 0.14068312478863712,
      "grad_norm": 0.06520047038793564,
      "learning_rate": 4.87918127381934e-05,
      "loss": 1.218,
      "step": 208
    },
    {
      "epoch": 0.14135948596550557,
      "grad_norm": 0.07124362885951996,
      "learning_rate": 4.8389205605291365e-05,
      "loss": 1.3148,
      "step": 209
    },
    {
      "epoch": 0.14203584714237402,
      "grad_norm": 0.06937072426080704,
      "learning_rate": 4.798670299452926e-05,
      "loss": 1.3524,
      "step": 210
    },
    {
      "epoch": 0.14271220831924247,
      "grad_norm": 0.06877253204584122,
      "learning_rate": 4.758433102372466e-05,
      "loss": 1.3803,
      "step": 211
    },
    {
      "epoch": 0.14338856949611092,
      "grad_norm": 0.07117702066898346,
      "learning_rate": 4.7182115802218126e-05,
      "loss": 1.3561,
      "step": 212
    },
    {
      "epoch": 0.14406493067297937,
      "grad_norm": 0.07073140144348145,
      "learning_rate": 4.678008342917903e-05,
      "loss": 1.4709,
      "step": 213
    },
    {
      "epoch": 0.14474129184984783,
      "grad_norm": 0.07057556509971619,
      "learning_rate": 4.6378259991911886e-05,
      "loss": 1.3807,
      "step": 214
    },
    {
      "epoch": 0.14541765302671628,
      "grad_norm": 0.06970713287591934,
      "learning_rate": 4.597667156416371e-05,
      "loss": 1.4542,
      "step": 215
    },
    {
      "epoch": 0.14609401420358473,
      "grad_norm": 0.06909048557281494,
      "learning_rate": 4.5575344204432084e-05,
      "loss": 1.4294,
      "step": 216
    },
    {
      "epoch": 0.14677037538045315,
      "grad_norm": 0.07124718278646469,
      "learning_rate": 4.5174303954274244e-05,
      "loss": 1.5282,
      "step": 217
    },
    {
      "epoch": 0.1474467365573216,
      "grad_norm": 0.06672398000955582,
      "learning_rate": 4.477357683661734e-05,
      "loss": 1.3559,
      "step": 218
    },
    {
      "epoch": 0.14812309773419005,
      "grad_norm": 0.07380548864603043,
      "learning_rate": 4.437318885406973e-05,
      "loss": 1.4161,
      "step": 219
    },
    {
      "epoch": 0.1487994589110585,
      "grad_norm": 0.07196389138698578,
      "learning_rate": 4.397316598723385e-05,
      "loss": 1.368,
      "step": 220
    },
    {
      "epoch": 0.14947582008792695,
      "grad_norm": 0.07408401370048523,
      "learning_rate": 4.3573534193020274e-05,
      "loss": 1.4736,
      "step": 221
    },
    {
      "epoch": 0.1501521812647954,
      "grad_norm": 0.07140503078699112,
      "learning_rate": 4.317431940296343e-05,
      "loss": 1.3147,
      "step": 222
    },
    {
      "epoch": 0.15082854244166385,
      "grad_norm": 0.07496851682662964,
      "learning_rate": 4.277554752153895e-05,
      "loss": 1.4287,
      "step": 223
    },
    {
      "epoch": 0.1515049036185323,
      "grad_norm": 0.07181335985660553,
      "learning_rate": 4.237724442448273e-05,
      "loss": 1.39,
      "step": 224
    },
    {
      "epoch": 0.15218126479540076,
      "grad_norm": 0.07524239271879196,
      "learning_rate": 4.197943595711198e-05,
      "loss": 1.4701,
      "step": 225
    },
    {
      "epoch": 0.15285762597226918,
      "grad_norm": 0.07574082165956497,
      "learning_rate": 4.1582147932648074e-05,
      "loss": 1.4445,
      "step": 226
    },
    {
      "epoch": 0.15353398714913763,
      "grad_norm": 0.07705846428871155,
      "learning_rate": 4.118540613054156e-05,
      "loss": 1.4178,
      "step": 227
    },
    {
      "epoch": 0.15421034832600608,
      "grad_norm": 0.07315905392169952,
      "learning_rate": 4.078923629479943e-05,
      "loss": 1.4865,
      "step": 228
    },
    {
      "epoch": 0.15488670950287453,
      "grad_norm": 0.07923450320959091,
      "learning_rate": 4.039366413231458e-05,
      "loss": 1.4065,
      "step": 229
    },
    {
      "epoch": 0.15556307067974298,
      "grad_norm": 0.07415217906236649,
      "learning_rate": 3.9998715311197785e-05,
      "loss": 1.3639,
      "step": 230
    },
    {
      "epoch": 0.15623943185661143,
      "grad_norm": 0.08203402161598206,
      "learning_rate": 3.960441545911204e-05,
      "loss": 1.4945,
      "step": 231
    },
    {
      "epoch": 0.15691579303347988,
      "grad_norm": 0.07680166512727737,
      "learning_rate": 3.92107901616097e-05,
      "loss": 1.4052,
      "step": 232
    },
    {
      "epoch": 0.15759215421034833,
      "grad_norm": 0.07834422588348389,
      "learning_rate": 3.8817864960472236e-05,
      "loss": 1.4893,
      "step": 233
    },
    {
      "epoch": 0.15826851538721678,
      "grad_norm": 0.08108842372894287,
      "learning_rate": 3.842566535205286e-05,
      "loss": 1.4315,
      "step": 234
    },
    {
      "epoch": 0.1589448765640852,
      "grad_norm": 0.08216694742441177,
      "learning_rate": 3.803421678562213e-05,
      "loss": 1.4253,
      "step": 235
    },
    {
      "epoch": 0.15962123774095366,
      "grad_norm": 0.07897795736789703,
      "learning_rate": 3.764354466171652e-05,
      "loss": 1.3707,
      "step": 236
    },
    {
      "epoch": 0.1602975989178221,
      "grad_norm": 0.07699574530124664,
      "learning_rate": 3.725367433049033e-05,
      "loss": 1.4609,
      "step": 237
    },
    {
      "epoch": 0.16097396009469056,
      "grad_norm": 0.08165720850229263,
      "learning_rate": 3.6864631090070655e-05,
      "loss": 1.3859,
      "step": 238
    },
    {
      "epoch": 0.161650321271559,
      "grad_norm": 0.08102511614561081,
      "learning_rate": 3.6476440184915815e-05,
      "loss": 1.355,
      "step": 239
    },
    {
      "epoch": 0.16232668244842746,
      "grad_norm": 0.08690423518419266,
      "learning_rate": 3.608912680417737e-05,
      "loss": 1.3506,
      "step": 240
    },
    {
      "epoch": 0.1630030436252959,
      "grad_norm": 0.08367474377155304,
      "learning_rate": 3.570271608006555e-05,
      "loss": 1.3122,
      "step": 241
    },
    {
      "epoch": 0.16367940480216436,
      "grad_norm": 0.08739592134952545,
      "learning_rate": 3.531723308621847e-05,
      "loss": 1.4115,
      "step": 242
    },
    {
      "epoch": 0.16435576597903281,
      "grad_norm": 0.08051220327615738,
      "learning_rate": 3.493270283607522e-05,
      "loss": 1.2894,
      "step": 243
    },
    {
      "epoch": 0.16503212715590126,
      "grad_norm": 0.09064695984125137,
      "learning_rate": 3.4549150281252636e-05,
      "loss": 1.3753,
      "step": 244
    },
    {
      "epoch": 0.1657084883327697,
      "grad_norm": 0.0889323428273201,
      "learning_rate": 3.4166600309926387e-05,
      "loss": 1.3243,
      "step": 245
    },
    {
      "epoch": 0.16638484950963814,
      "grad_norm": 0.1018919125199318,
      "learning_rate": 3.3785077745215873e-05,
      "loss": 1.4442,
      "step": 246
    },
    {
      "epoch": 0.1670612106865066,
      "grad_norm": 0.09463594108819962,
      "learning_rate": 3.340460734357359e-05,
      "loss": 1.3204,
      "step": 247
    },
    {
      "epoch": 0.16773757186337504,
      "grad_norm": 0.10294807702302933,
      "learning_rate": 3.3025213793178646e-05,
      "loss": 1.3608,
      "step": 248
    },
    {
      "epoch": 0.1684139330402435,
      "grad_norm": 0.11613894999027252,
      "learning_rate": 3.264692171233485e-05,
      "loss": 1.3338,
      "step": 249
    },
    {
      "epoch": 0.16909029421711194,
      "grad_norm": 0.17183607816696167,
      "learning_rate": 3.226975564787322e-05,
      "loss": 1.4796,
      "step": 250
    },
    {
      "epoch": 0.16909029421711194,
      "eval_loss": 1.3976595401763916,
      "eval_runtime": 45.8538,
      "eval_samples_per_second": 54.303,
      "eval_steps_per_second": 13.587,
      "step": 250
    },
    {
      "epoch": 0.1697666553939804,
      "grad_norm": 0.078953318297863,
      "learning_rate": 3.189374007355917e-05,
      "loss": 1.3758,
      "step": 251
    },
    {
      "epoch": 0.17044301657084884,
      "grad_norm": 0.0744386836886406,
      "learning_rate": 3.151889938850445e-05,
      "loss": 1.3349,
      "step": 252
    },
    {
      "epoch": 0.1711193777477173,
      "grad_norm": 0.0737113356590271,
      "learning_rate": 3.114525791558398e-05,
      "loss": 1.3502,
      "step": 253
    },
    {
      "epoch": 0.17179573892458572,
      "grad_norm": 0.07107130438089371,
      "learning_rate": 3.0772839899857464e-05,
      "loss": 1.3401,
      "step": 254
    },
    {
      "epoch": 0.17247210010145417,
      "grad_norm": 0.07271695137023926,
      "learning_rate": 3.0401669506996256e-05,
      "loss": 1.3141,
      "step": 255
    },
    {
      "epoch": 0.17314846127832262,
      "grad_norm": 0.07352391630411148,
      "learning_rate": 3.003177082171523e-05,
      "loss": 1.3572,
      "step": 256
    },
    {
      "epoch": 0.17382482245519107,
      "grad_norm": 0.06897805631160736,
      "learning_rate": 2.9663167846209998e-05,
      "loss": 1.3375,
      "step": 257
    },
    {
      "epoch": 0.17450118363205952,
      "grad_norm": 0.06769099086523056,
      "learning_rate": 2.9295884498599414e-05,
      "loss": 1.3414,
      "step": 258
    },
    {
      "epoch": 0.17517754480892797,
      "grad_norm": 0.07317459583282471,
      "learning_rate": 2.8929944611373554e-05,
      "loss": 1.3435,
      "step": 259
    },
    {
      "epoch": 0.17585390598579642,
      "grad_norm": 0.07190477102994919,
      "learning_rate": 2.8565371929847284e-05,
      "loss": 1.3547,
      "step": 260
    },
    {
      "epoch": 0.17653026716266487,
      "grad_norm": 0.07559996843338013,
      "learning_rate": 2.8202190110619493e-05,
      "loss": 1.453,
      "step": 261
    },
    {
      "epoch": 0.17720662833953332,
      "grad_norm": 0.06975331157445908,
      "learning_rate": 2.784042272003794e-05,
      "loss": 1.3556,
      "step": 262
    },
    {
      "epoch": 0.17788298951640175,
      "grad_norm": 0.07106088846921921,
      "learning_rate": 2.7480093232670158e-05,
      "loss": 1.4059,
      "step": 263
    },
    {
      "epoch": 0.1785593506932702,
      "grad_norm": 0.06821250170469284,
      "learning_rate": 2.712122502978024e-05,
      "loss": 1.3723,
      "step": 264
    },
    {
      "epoch": 0.17923571187013865,
      "grad_norm": 0.0697351023554802,
      "learning_rate": 2.6763841397811573e-05,
      "loss": 1.3633,
      "step": 265
    },
    {
      "epoch": 0.1799120730470071,
      "grad_norm": 0.07180540263652802,
      "learning_rate": 2.64079655268759e-05,
      "loss": 1.4461,
      "step": 266
    },
    {
      "epoch": 0.18058843422387555,
      "grad_norm": 0.07183071225881577,
      "learning_rate": 2.605362050924848e-05,
      "loss": 1.4576,
      "step": 267
    },
    {
      "epoch": 0.181264795400744,
      "grad_norm": 0.07112300395965576,
      "learning_rate": 2.57008293378697e-05,
      "loss": 1.401,
      "step": 268
    },
    {
      "epoch": 0.18194115657761245,
      "grad_norm": 0.07420746237039566,
      "learning_rate": 2.534961490485313e-05,
      "loss": 1.5515,
      "step": 269
    },
    {
      "epoch": 0.1826175177544809,
      "grad_norm": 0.0707039013504982,
      "learning_rate": 2.500000000000001e-05,
      "loss": 1.4783,
      "step": 270
    },
    {
      "epoch": 0.18329387893134935,
      "grad_norm": 0.07006899267435074,
      "learning_rate": 2.4652007309320498e-05,
      "loss": 1.3614,
      "step": 271
    },
    {
      "epoch": 0.18397024010821778,
      "grad_norm": 0.07219888269901276,
      "learning_rate": 2.430565941356157e-05,
      "loss": 1.4088,
      "step": 272
    },
    {
      "epoch": 0.18464660128508623,
      "grad_norm": 0.07091671228408813,
      "learning_rate": 2.3960978786741877e-05,
      "loss": 1.4517,
      "step": 273
    },
    {
      "epoch": 0.18532296246195468,
      "grad_norm": 0.0724414512515068,
      "learning_rate": 2.361798779469336e-05,
      "loss": 1.4833,
      "step": 274
    },
    {
      "epoch": 0.18599932363882313,
      "grad_norm": 0.07302770018577576,
      "learning_rate": 2.3276708693609943e-05,
      "loss": 1.4728,
      "step": 275
    },
    {
      "epoch": 0.18667568481569158,
      "grad_norm": 0.07354607433080673,
      "learning_rate": 2.2937163628603435e-05,
      "loss": 1.4262,
      "step": 276
    },
    {
      "epoch": 0.18735204599256003,
      "grad_norm": 0.07791664451360703,
      "learning_rate": 2.259937463226651e-05,
      "loss": 1.3972,
      "step": 277
    },
    {
      "epoch": 0.18802840716942848,
      "grad_norm": 0.07679464668035507,
      "learning_rate": 2.2263363623243054e-05,
      "loss": 1.4607,
      "step": 278
    },
    {
      "epoch": 0.18870476834629693,
      "grad_norm": 0.07656057178974152,
      "learning_rate": 2.192915240480596e-05,
      "loss": 1.3248,
      "step": 279
    },
    {
      "epoch": 0.18938112952316538,
      "grad_norm": 0.07713098078966141,
      "learning_rate": 2.1596762663442218e-05,
      "loss": 1.4162,
      "step": 280
    },
    {
      "epoch": 0.1900574907000338,
      "grad_norm": 0.08018391579389572,
      "learning_rate": 2.1266215967445824e-05,
      "loss": 1.4747,
      "step": 281
    },
    {
      "epoch": 0.19073385187690226,
      "grad_norm": 0.07631267607212067,
      "learning_rate": 2.0937533765518187e-05,
      "loss": 1.4274,
      "step": 282
    },
    {
      "epoch": 0.1914102130537707,
      "grad_norm": 0.07951124012470245,
      "learning_rate": 2.061073738537635e-05,
      "loss": 1.4151,
      "step": 283
    },
    {
      "epoch": 0.19208657423063916,
      "grad_norm": 0.07667551189661026,
      "learning_rate": 2.0285848032369137e-05,
      "loss": 1.3486,
      "step": 284
    },
    {
      "epoch": 0.1927629354075076,
      "grad_norm": 0.08102501928806305,
      "learning_rate": 1.996288678810105e-05,
      "loss": 1.4048,
      "step": 285
    },
    {
      "epoch": 0.19343929658437606,
      "grad_norm": 0.07826242595911026,
      "learning_rate": 1.9641874609064443e-05,
      "loss": 1.4236,
      "step": 286
    },
    {
      "epoch": 0.1941156577612445,
      "grad_norm": 0.08553146570920944,
      "learning_rate": 1.932283232527956e-05,
      "loss": 1.429,
      "step": 287
    },
    {
      "epoch": 0.19479201893811296,
      "grad_norm": 0.08594300597906113,
      "learning_rate": 1.9005780638942982e-05,
      "loss": 1.4627,
      "step": 288
    },
    {
      "epoch": 0.1954683801149814,
      "grad_norm": 0.08326655626296997,
      "learning_rate": 1.8690740123084316e-05,
      "loss": 1.3625,
      "step": 289
    },
    {
      "epoch": 0.19614474129184986,
      "grad_norm": 0.08529611676931381,
      "learning_rate": 1.837773122023114e-05,
      "loss": 1.3801,
      "step": 290
    },
    {
      "epoch": 0.19682110246871828,
      "grad_norm": 0.09172829985618591,
      "learning_rate": 1.8066774241082612e-05,
      "loss": 1.4218,
      "step": 291
    },
    {
      "epoch": 0.19749746364558673,
      "grad_norm": 0.08743683993816376,
      "learning_rate": 1.7757889363191483e-05,
      "loss": 1.3857,
      "step": 292
    },
    {
      "epoch": 0.19817382482245519,
      "grad_norm": 0.09676212817430496,
      "learning_rate": 1.745109662965481e-05,
      "loss": 1.4006,
      "step": 293
    },
    {
      "epoch": 0.19885018599932364,
      "grad_norm": 0.093772754073143,
      "learning_rate": 1.714641594781347e-05,
      "loss": 1.442,
      "step": 294
    },
    {
      "epoch": 0.1995265471761921,
      "grad_norm": 0.09733811020851135,
      "learning_rate": 1.684386708796025e-05,
      "loss": 1.5033,
      "step": 295
    },
    {
      "epoch": 0.20020290835306054,
      "grad_norm": 0.09554203599691391,
      "learning_rate": 1.6543469682057106e-05,
      "loss": 1.4162,
      "step": 296
    },
    {
      "epoch": 0.200879269529929,
      "grad_norm": 0.0979439914226532,
      "learning_rate": 1.62452432224612e-05,
      "loss": 1.42,
      "step": 297
    },
    {
      "epoch": 0.20155563070679744,
      "grad_norm": 0.11299502104520798,
      "learning_rate": 1.5949207060660138e-05,
      "loss": 1.481,
      "step": 298
    },
    {
      "epoch": 0.2022319918836659,
      "grad_norm": 0.1257336437702179,
      "learning_rate": 1.5655380406016235e-05,
      "loss": 1.3572,
      "step": 299
    },
    {
      "epoch": 0.2029083530605343,
      "grad_norm": 0.19890309870243073,
      "learning_rate": 1.536378232452003e-05,
      "loss": 1.3492,
      "step": 300
    },
    {
      "epoch": 0.2029083530605343,
      "eval_loss": 1.3946185111999512,
      "eval_runtime": 45.9309,
      "eval_samples_per_second": 54.212,
      "eval_steps_per_second": 13.564,
      "step": 300
    },
    {
      "epoch": 0.20358471423740276,
      "grad_norm": 0.06888294219970703,
      "learning_rate": 1.5074431737553157e-05,
      "loss": 1.2703,
      "step": 301
    },
    {
      "epoch": 0.20426107541427121,
      "grad_norm": 0.07071013748645782,
      "learning_rate": 1.4787347420660541e-05,
      "loss": 1.2927,
      "step": 302
    },
    {
      "epoch": 0.20493743659113967,
      "grad_norm": 0.07139529287815094,
      "learning_rate": 1.4502548002332088e-05,
      "loss": 1.3657,
      "step": 303
    },
    {
      "epoch": 0.20561379776800812,
      "grad_norm": 0.06573140621185303,
      "learning_rate": 1.422005196279395e-05,
      "loss": 1.2871,
      "step": 304
    },
    {
      "epoch": 0.20629015894487657,
      "grad_norm": 0.06961559504270554,
      "learning_rate": 1.3939877632809278e-05,
      "loss": 1.3394,
      "step": 305
    },
    {
      "epoch": 0.20696652012174502,
      "grad_norm": 0.07154294103384018,
      "learning_rate": 1.3662043192488849e-05,
      "loss": 1.4211,
      "step": 306
    },
    {
      "epoch": 0.20764288129861347,
      "grad_norm": 0.0693695992231369,
      "learning_rate": 1.338656667011134e-05,
      "loss": 1.3076,
      "step": 307
    },
    {
      "epoch": 0.20831924247548192,
      "grad_norm": 0.0711139366030693,
      "learning_rate": 1.3113465940953495e-05,
      "loss": 1.3016,
      "step": 308
    },
    {
      "epoch": 0.20899560365235034,
      "grad_norm": 0.06786596775054932,
      "learning_rate": 1.2842758726130283e-05,
      "loss": 1.2759,
      "step": 309
    },
    {
      "epoch": 0.2096719648292188,
      "grad_norm": 0.07203105837106705,
      "learning_rate": 1.257446259144494e-05,
      "loss": 1.2681,
      "step": 310
    },
    {
      "epoch": 0.21034832600608724,
      "grad_norm": 0.06898628920316696,
      "learning_rate": 1.2308594946249163e-05,
      "loss": 1.3097,
      "step": 311
    },
    {
      "epoch": 0.2110246871829557,
      "grad_norm": 0.07093969732522964,
      "learning_rate": 1.204517304231343e-05,
      "loss": 1.3423,
      "step": 312
    },
    {
      "epoch": 0.21170104835982415,
      "grad_norm": 0.07039978355169296,
      "learning_rate": 1.178421397270758e-05,
      "loss": 1.3544,
      "step": 313
    },
    {
      "epoch": 0.2123774095366926,
      "grad_norm": 0.07245506346225739,
      "learning_rate": 1.1525734670691701e-05,
      "loss": 1.3775,
      "step": 314
    },
    {
      "epoch": 0.21305377071356105,
      "grad_norm": 0.0731048583984375,
      "learning_rate": 1.1269751908617277e-05,
      "loss": 1.4099,
      "step": 315
    },
    {
      "epoch": 0.2137301318904295,
      "grad_norm": 0.07116963714361191,
      "learning_rate": 1.1016282296838887e-05,
      "loss": 1.3899,
      "step": 316
    },
    {
      "epoch": 0.21440649306729795,
      "grad_norm": 0.07567148655653,
      "learning_rate": 1.0765342282636416e-05,
      "loss": 1.5033,
      "step": 317
    },
    {
      "epoch": 0.21508285424416637,
      "grad_norm": 0.07363573461771011,
      "learning_rate": 1.0516948149147754e-05,
      "loss": 1.442,
      "step": 318
    },
    {
      "epoch": 0.21575921542103482,
      "grad_norm": 0.07205386459827423,
      "learning_rate": 1.0271116014312293e-05,
      "loss": 1.3578,
      "step": 319
    },
    {
      "epoch": 0.21643557659790327,
      "grad_norm": 0.07321757078170776,
      "learning_rate": 1.0027861829824952e-05,
      "loss": 1.4057,
      "step": 320
    },
    {
      "epoch": 0.21711193777477172,
      "grad_norm": 0.07573118805885315,
      "learning_rate": 9.787201380101157e-06,
      "loss": 1.3645,
      "step": 321
    },
    {
      "epoch": 0.21778829895164017,
      "grad_norm": 0.07296445220708847,
      "learning_rate": 9.549150281252633e-06,
      "loss": 1.4686,
      "step": 322
    },
    {
      "epoch": 0.21846466012850863,
      "grad_norm": 0.0765371024608612,
      "learning_rate": 9.313723980074018e-06,
      "loss": 1.509,
      "step": 323
    },
    {
      "epoch": 0.21914102130537708,
      "grad_norm": 0.0745658352971077,
      "learning_rate": 9.080937753040646e-06,
      "loss": 1.4014,
      "step": 324
    },
    {
      "epoch": 0.21981738248224553,
      "grad_norm": 0.0729154720902443,
      "learning_rate": 8.850806705317183e-06,
      "loss": 1.3839,
      "step": 325
    },
    {
      "epoch": 0.22049374365911398,
      "grad_norm": 0.07217688858509064,
      "learning_rate": 8.623345769777514e-06,
      "loss": 1.4157,
      "step": 326
    },
    {
      "epoch": 0.22117010483598243,
      "grad_norm": 0.07794667035341263,
      "learning_rate": 8.398569706035792e-06,
      "loss": 1.4788,
      "step": 327
    },
    {
      "epoch": 0.22184646601285085,
      "grad_norm": 0.07797662168741226,
      "learning_rate": 8.176493099488663e-06,
      "loss": 1.3679,
      "step": 328
    },
    {
      "epoch": 0.2225228271897193,
      "grad_norm": 0.0776330828666687,
      "learning_rate": 7.957130360368898e-06,
      "loss": 1.4592,
      "step": 329
    },
    {
      "epoch": 0.22319918836658775,
| "grad_norm": 0.07840722054243088, | |
| "learning_rate": 7.740495722810271e-06, | |
| "loss": 1.4205, | |
| "step": 330 | |
| }, | |
| { | |
| "epoch": 0.2238755495434562, | |
| "grad_norm": 0.07938168942928314, | |
| "learning_rate": 7.526603243923957e-06, | |
| "loss": 1.4727, | |
| "step": 331 | |
| }, | |
| { | |
| "epoch": 0.22455191072032465, | |
| "grad_norm": 0.07692472636699677, | |
| "learning_rate": 7.315466802886401e-06, | |
| "loss": 1.3764, | |
| "step": 332 | |
| }, | |
| { | |
| "epoch": 0.2252282718971931, | |
| "grad_norm": 0.07381051033735275, | |
| "learning_rate": 7.107100100038671e-06, | |
| "loss": 1.3103, | |
| "step": 333 | |
| }, | |
| { | |
| "epoch": 0.22590463307406156, | |
| "grad_norm": 0.08517421782016754, | |
| "learning_rate": 6.901516655997536e-06, | |
| "loss": 1.4538, | |
| "step": 334 | |
| }, | |
| { | |
| "epoch": 0.22658099425093, | |
| "grad_norm": 0.07926001399755478, | |
| "learning_rate": 6.698729810778065e-06, | |
| "loss": 1.4098, | |
| "step": 335 | |
| }, | |
| { | |
| "epoch": 0.22725735542779846, | |
| "grad_norm": 0.08531263470649719, | |
| "learning_rate": 6.498752722928042e-06, | |
| "loss": 1.42, | |
| "step": 336 | |
| }, | |
| { | |
| "epoch": 0.22793371660466688, | |
| "grad_norm": 0.08142856508493423, | |
| "learning_rate": 6.301598368674105e-06, | |
| "loss": 1.4362, | |
| "step": 337 | |
| }, | |
| { | |
| "epoch": 0.22861007778153533, | |
| "grad_norm": 0.08574479818344116, | |
| "learning_rate": 6.107279541079769e-06, | |
| "loss": 1.4158, | |
| "step": 338 | |
| }, | |
| { | |
| "epoch": 0.22928643895840378, | |
| "grad_norm": 0.08034282922744751, | |
| "learning_rate": 5.915808849215304e-06, | |
| "loss": 1.3813, | |
| "step": 339 | |
| }, | |
| { | |
| "epoch": 0.22996280013527223, | |
| "grad_norm": 0.08398088812828064, | |
| "learning_rate": 5.727198717339511e-06, | |
| "loss": 1.3915, | |
| "step": 340 | |
| }, | |
| { | |
| "epoch": 0.23063916131214068, | |
| "grad_norm": 0.0855628028512001, | |
| "learning_rate": 5.54146138409355e-06, | |
| "loss": 1.4603, | |
| "step": 341 | |
| }, | |
| { | |
| "epoch": 0.23131552248900913, | |
| "grad_norm": 0.09168089181184769, | |
| "learning_rate": 5.358608901706802e-06, | |
| "loss": 1.4113, | |
| "step": 342 | |
| }, | |
| { | |
| "epoch": 0.23199188366587759, | |
| "grad_norm": 0.09065324068069458, | |
| "learning_rate": 5.178653135214812e-06, | |
| "loss": 1.3356, | |
| "step": 343 | |
| }, | |
| { | |
| "epoch": 0.23266824484274604, | |
| "grad_norm": 0.09464474767446518, | |
| "learning_rate": 5.001605761689398e-06, | |
| "loss": 1.4075, | |
| "step": 344 | |
| }, | |
| { | |
| "epoch": 0.2333446060196145, | |
| "grad_norm": 0.09780465066432953, | |
| "learning_rate": 4.827478269480895e-06, | |
| "loss": 1.4186, | |
| "step": 345 | |
| }, | |
| { | |
| "epoch": 0.2340209671964829, | |
| "grad_norm": 0.09540226310491562, | |
| "learning_rate": 4.65628195747273e-06, | |
| "loss": 1.4061, | |
| "step": 346 | |
| }, | |
| { | |
| "epoch": 0.23469732837335136, | |
| "grad_norm": 0.10365621000528336, | |
| "learning_rate": 4.488027934348271e-06, | |
| "loss": 1.3495, | |
| "step": 347 | |
| }, | |
| { | |
| "epoch": 0.2353736895502198, | |
| "grad_norm": 0.1070394292473793, | |
| "learning_rate": 4.322727117869951e-06, | |
| "loss": 1.4006, | |
| "step": 348 | |
| }, | |
| { | |
| "epoch": 0.23605005072708826, | |
| "grad_norm": 0.12799158692359924, | |
| "learning_rate": 4.16039023417088e-06, | |
| "loss": 1.322, | |
| "step": 349 | |
| }, | |
| { | |
| "epoch": 0.2367264119039567, | |
| "grad_norm": 0.16463126242160797, | |
| "learning_rate": 4.001027817058789e-06, | |
| "loss": 1.1977, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 0.2367264119039567, | |
| "eval_loss": 1.392707109451294, | |
| "eval_runtime": 45.9044, | |
| "eval_samples_per_second": 54.243, | |
| "eval_steps_per_second": 13.572, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 0.23740277308082516, | |
| "grad_norm": 0.07486344873905182, | |
| "learning_rate": 3.844650207332562e-06, | |
| "loss": 1.2625, | |
| "step": 351 | |
| }, | |
| { | |
| "epoch": 0.23807913425769361, | |
| "grad_norm": 0.07041779160499573, | |
| "learning_rate": 3.691267552111183e-06, | |
| "loss": 1.4214, | |
| "step": 352 | |
| }, | |
| { | |
| "epoch": 0.23875549543456207, | |
| "grad_norm": 0.06538067013025284, | |
| "learning_rate": 3.54088980417534e-06, | |
| "loss": 1.2736, | |
| "step": 353 | |
| }, | |
| { | |
| "epoch": 0.23943185661143052, | |
| "grad_norm": 0.06986545771360397, | |
| "learning_rate": 3.393526721321616e-06, | |
| "loss": 1.344, | |
| "step": 354 | |
| }, | |
| { | |
| "epoch": 0.24010821778829894, | |
| "grad_norm": 0.06664931774139404, | |
| "learning_rate": 3.249187865729264e-06, | |
| "loss": 1.3045, | |
| "step": 355 | |
| }, | |
| { | |
| "epoch": 0.2407845789651674, | |
| "grad_norm": 0.06488357484340668, | |
| "learning_rate": 3.1078826033397843e-06, | |
| "loss": 1.3117, | |
| "step": 356 | |
| }, | |
| { | |
| "epoch": 0.24146094014203584, | |
| "grad_norm": 0.06983731687068939, | |
| "learning_rate": 2.9696201032491434e-06, | |
| "loss": 1.3953, | |
| "step": 357 | |
| }, | |
| { | |
| "epoch": 0.2421373013189043, | |
| "grad_norm": 0.06828371435403824, | |
| "learning_rate": 2.8344093371128424e-06, | |
| "loss": 1.2914, | |
| "step": 358 | |
| }, | |
| { | |
| "epoch": 0.24281366249577274, | |
| "grad_norm": 0.06504950672388077, | |
| "learning_rate": 2.70225907856374e-06, | |
| "loss": 1.2957, | |
| "step": 359 | |
| }, | |
| { | |
| "epoch": 0.2434900236726412, | |
| "grad_norm": 0.06765610724687576, | |
| "learning_rate": 2.573177902642726e-06, | |
| "loss": 1.3295, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 0.24416638484950964, | |
| "grad_norm": 0.0687704086303711, | |
| "learning_rate": 2.4471741852423237e-06, | |
| "loss": 1.376, | |
| "step": 361 | |
| }, | |
| { | |
| "epoch": 0.2448427460263781, | |
| "grad_norm": 0.07070591300725937, | |
| "learning_rate": 2.324256102563188e-06, | |
| "loss": 1.3248, | |
| "step": 362 | |
| }, | |
| { | |
| "epoch": 0.24551910720324654, | |
| "grad_norm": 0.07138653844594955, | |
| "learning_rate": 2.204431630583548e-06, | |
| "loss": 1.4185, | |
| "step": 363 | |
| }, | |
| { | |
| "epoch": 0.24619546838011497, | |
| "grad_norm": 0.06933178007602692, | |
| "learning_rate": 2.087708544541689e-06, | |
| "loss": 1.3003, | |
| "step": 364 | |
| }, | |
| { | |
| "epoch": 0.24687182955698342, | |
| "grad_norm": 0.07162750512361526, | |
| "learning_rate": 1.974094418431388e-06, | |
| "loss": 1.4175, | |
| "step": 365 | |
| }, | |
| { | |
| "epoch": 0.24754819073385187, | |
| "grad_norm": 0.06946908682584763, | |
| "learning_rate": 1.8635966245104664e-06, | |
| "loss": 1.3431, | |
| "step": 366 | |
| }, | |
| { | |
| "epoch": 0.24822455191072032, | |
| "grad_norm": 0.07021202892065048, | |
| "learning_rate": 1.7562223328224325e-06, | |
| "loss": 1.3501, | |
| "step": 367 | |
| }, | |
| { | |
| "epoch": 0.24890091308758877, | |
| "grad_norm": 0.07219018787145615, | |
| "learning_rate": 1.6519785107311891e-06, | |
| "loss": 1.4302, | |
| "step": 368 | |
| }, | |
| { | |
| "epoch": 0.24957727426445722, | |
| "grad_norm": 0.07118932902812958, | |
| "learning_rate": 1.5508719224689717e-06, | |
| "loss": 1.3583, | |
| "step": 369 | |
| }, | |
| { | |
| "epoch": 0.2502536354413257, | |
| "grad_norm": 0.07472287863492966, | |
| "learning_rate": 1.4529091286973995e-06, | |
| "loss": 1.4124, | |
| "step": 370 | |
| }, | |
| { | |
| "epoch": 0.2509299966181941, | |
| "grad_norm": 0.07825792580842972, | |
| "learning_rate": 1.358096486081778e-06, | |
| "loss": 1.4895, | |
| "step": 371 | |
| }, | |
| { | |
| "epoch": 0.2516063577950626, | |
| "grad_norm": 0.07341321557760239, | |
| "learning_rate": 1.2664401468786114e-06, | |
| "loss": 1.4379, | |
| "step": 372 | |
| }, | |
| { | |
| "epoch": 0.252282718971931, | |
| "grad_norm": 0.07650943100452423, | |
| "learning_rate": 1.1779460585363944e-06, | |
| "loss": 1.4964, | |
| "step": 373 | |
| }, | |
| { | |
| "epoch": 0.2529590801487995, | |
| "grad_norm": 0.07561939209699631, | |
| "learning_rate": 1.0926199633097157e-06, | |
| "loss": 1.4085, | |
| "step": 374 | |
| }, | |
| { | |
| "epoch": 0.2536354413256679, | |
| "grad_norm": 0.07571756094694138, | |
| "learning_rate": 1.0104673978866164e-06, | |
| "loss": 1.3657, | |
| "step": 375 | |
| }, | |
| { | |
| "epoch": 0.2543118025025364, | |
| "grad_norm": 0.07461043447256088, | |
| "learning_rate": 9.314936930293283e-07, | |
| "loss": 1.3648, | |
| "step": 376 | |
| }, | |
| { | |
| "epoch": 0.2549881636794048, | |
| "grad_norm": 0.0762941762804985, | |
| "learning_rate": 8.557039732283944e-07, | |
| "loss": 1.3433, | |
| "step": 377 | |
| }, | |
| { | |
| "epoch": 0.2556645248562732, | |
| "grad_norm": 0.0783124789595604, | |
| "learning_rate": 7.83103156370113e-07, | |
| "loss": 1.4268, | |
| "step": 378 | |
| }, | |
| { | |
| "epoch": 0.2563408860331417, | |
| "grad_norm": 0.07456471771001816, | |
| "learning_rate": 7.136959534174592e-07, | |
| "loss": 1.3878, | |
| "step": 379 | |
| }, | |
| { | |
| "epoch": 0.2570172472100101, | |
| "grad_norm": 0.07892836630344391, | |
| "learning_rate": 6.474868681043578e-07, | |
| "loss": 1.4588, | |
| "step": 380 | |
| }, | |
| { | |
| "epoch": 0.2576936083868786, | |
| "grad_norm": 0.0801616907119751, | |
| "learning_rate": 5.844801966434832e-07, | |
| "loss": 1.5042, | |
| "step": 381 | |
| }, | |
| { | |
| "epoch": 0.258369969563747, | |
| "grad_norm": 0.07792920619249344, | |
| "learning_rate": 5.246800274474439e-07, | |
| "loss": 1.421, | |
| "step": 382 | |
| }, | |
| { | |
| "epoch": 0.2590463307406155, | |
| "grad_norm": 0.07863734662532806, | |
| "learning_rate": 4.680902408635335e-07, | |
| "loss": 1.4013, | |
| "step": 383 | |
| }, | |
| { | |
| "epoch": 0.2597226919174839, | |
| "grad_norm": 0.07995697110891342, | |
| "learning_rate": 4.1471450892189846e-07, | |
| "loss": 1.4333, | |
| "step": 384 | |
| }, | |
| { | |
| "epoch": 0.2603990530943524, | |
| "grad_norm": 0.08075670152902603, | |
| "learning_rate": 3.6455629509730136e-07, | |
| "loss": 1.4242, | |
| "step": 385 | |
| }, | |
| { | |
| "epoch": 0.26107541427122083, | |
| "grad_norm": 0.08324610441923141, | |
| "learning_rate": 3.1761885408435054e-07, | |
| "loss": 1.4737, | |
| "step": 386 | |
| }, | |
| { | |
| "epoch": 0.26175177544808925, | |
| "grad_norm": 0.08701196312904358, | |
| "learning_rate": 2.7390523158633554e-07, | |
| "loss": 1.4813, | |
| "step": 387 | |
| }, | |
| { | |
| "epoch": 0.26242813662495773, | |
| "grad_norm": 0.0778915211558342, | |
| "learning_rate": 2.334182641175686e-07, | |
| "loss": 1.3242, | |
| "step": 388 | |
| }, | |
| { | |
| "epoch": 0.26310449780182615, | |
| "grad_norm": 0.08328765630722046, | |
| "learning_rate": 1.9616057881935436e-07, | |
| "loss": 1.5292, | |
| "step": 389 | |
| }, | |
| { | |
| "epoch": 0.26378085897869463, | |
| "grad_norm": 0.0862572193145752, | |
| "learning_rate": 1.6213459328950352e-07, | |
| "loss": 1.4288, | |
| "step": 390 | |
| }, | |
| { | |
| "epoch": 0.26445722015556306, | |
| "grad_norm": 0.0847046822309494, | |
| "learning_rate": 1.3134251542544774e-07, | |
| "loss": 1.3865, | |
| "step": 391 | |
| }, | |
| { | |
| "epoch": 0.26513358133243153, | |
| "grad_norm": 0.0888221487402916, | |
| "learning_rate": 1.0378634328099269e-07, | |
| "loss": 1.4391, | |
| "step": 392 | |
| }, | |
| { | |
| "epoch": 0.26580994250929996, | |
| "grad_norm": 0.09016448259353638, | |
| "learning_rate": 7.946786493666647e-08, | |
| "loss": 1.424, | |
| "step": 393 | |
| }, | |
| { | |
| "epoch": 0.26648630368616844, | |
| "grad_norm": 0.09526152163743973, | |
| "learning_rate": 5.838865838366792e-08, | |
| "loss": 1.3966, | |
| "step": 394 | |
| }, | |
| { | |
| "epoch": 0.26716266486303686, | |
| "grad_norm": 0.09396642446517944, | |
| "learning_rate": 4.055009142152067e-08, | |
| "loss": 1.3453, | |
| "step": 395 | |
| }, | |
| { | |
| "epoch": 0.26783902603990534, | |
| "grad_norm": 0.09826450049877167, | |
| "learning_rate": 2.595332156925534e-08, | |
| "loss": 1.4621, | |
| "step": 396 | |
| }, | |
| { | |
| "epoch": 0.26851538721677376, | |
| "grad_norm": 0.10213644802570343, | |
| "learning_rate": 1.4599295990352924e-08, | |
| "loss": 1.4176, | |
| "step": 397 | |
| }, | |
| { | |
| "epoch": 0.2691917483936422, | |
| "grad_norm": 0.11088913679122925, | |
| "learning_rate": 6.488751431266149e-09, | |
| "loss": 1.4419, | |
| "step": 398 | |
| }, | |
| { | |
| "epoch": 0.26986810957051066, | |
| "grad_norm": 0.12139194458723068, | |
| "learning_rate": 1.622214173602199e-09, | |
| "loss": 1.3843, | |
| "step": 399 | |
| }, | |
| { | |
| "epoch": 0.2705444707473791, | |
| "grad_norm": 0.1520748883485794, | |
| "learning_rate": 0.0, | |
| "loss": 1.3685, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.2705444707473791, | |
| "eval_loss": 1.3922204971313477, | |
| "eval_runtime": 45.9365, | |
| "eval_samples_per_second": 54.205, | |
| "eval_steps_per_second": 13.562, | |
| "step": 400 | |
| } | |
| ], | |
| "logging_steps": 1, | |
| "max_steps": 400, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 1, | |
| "save_steps": 50, | |
| "stateful_callbacks": { | |
| "EarlyStoppingCallback": { | |
| "args": { | |
| "early_stopping_patience": 5, | |
| "early_stopping_threshold": 0.0 | |
| }, | |
| "attributes": { | |
| "early_stopping_patience_counter": 0 | |
| } | |
| }, | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 3.285041640112128e+16, | |
| "train_batch_size": 8, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |