| { | |
| "best_global_step": null, | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 1.0, | |
| "eval_steps": 500, | |
| "global_step": 532, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "entropy": 1.174624726176262, | |
| "epoch": 0.0018827959519887032, | |
| "grad_norm": 0.3589564859867096, | |
| "learning_rate": 0.0005, | |
| "loss": 1.7667, | |
| "mean_token_accuracy": 0.6097231954336166, | |
| "num_tokens": 26212.0, | |
| "step": 1 | |
| }, | |
| { | |
| "entropy": 1.3834485709667206, | |
| "epoch": 0.0037655919039774064, | |
| "grad_norm": 0.273681104183197, | |
| "learning_rate": 0.000499812030075188, | |
| "loss": 1.6137, | |
| "mean_token_accuracy": 0.6240904033184052, | |
| "num_tokens": 53331.0, | |
| "step": 2 | |
| }, | |
| { | |
| "entropy": 2.3064600229263306, | |
| "epoch": 0.00564838785596611, | |
| "grad_norm": 0.8047769665718079, | |
| "learning_rate": 0.0004996240601503759, | |
| "loss": 1.6899, | |
| "mean_token_accuracy": 0.6088793724775314, | |
| "num_tokens": 80291.0, | |
| "step": 3 | |
| }, | |
| { | |
| "entropy": 1.630955085158348, | |
| "epoch": 0.007531183807954813, | |
| "grad_norm": 0.30714720487594604, | |
| "learning_rate": 0.0004994360902255639, | |
| "loss": 1.5608, | |
| "mean_token_accuracy": 0.6291212365031242, | |
| "num_tokens": 106966.0, | |
| "step": 4 | |
| }, | |
| { | |
| "entropy": 1.3567735850811005, | |
| "epoch": 0.009413979759943516, | |
| "grad_norm": 0.2066618800163269, | |
| "learning_rate": 0.0004992481203007519, | |
| "loss": 1.4887, | |
| "mean_token_accuracy": 0.6415289863944054, | |
| "num_tokens": 132786.0, | |
| "step": 5 | |
| }, | |
| { | |
| "entropy": 1.361013576388359, | |
| "epoch": 0.01129677571193222, | |
| "grad_norm": 0.24627672135829926, | |
| "learning_rate": 0.0004990601503759398, | |
| "loss": 1.4956, | |
| "mean_token_accuracy": 0.6329040080308914, | |
| "num_tokens": 157854.0, | |
| "step": 6 | |
| }, | |
| { | |
| "entropy": 1.4551365226507187, | |
| "epoch": 0.013179571663920923, | |
| "grad_norm": 0.24504677951335907, | |
| "learning_rate": 0.0004988721804511278, | |
| "loss": 1.4555, | |
| "mean_token_accuracy": 0.6410629153251648, | |
| "num_tokens": 183628.0, | |
| "step": 7 | |
| }, | |
| { | |
| "entropy": 1.558847650885582, | |
| "epoch": 0.015062367615909626, | |
| "grad_norm": 0.24714401364326477, | |
| "learning_rate": 0.0004986842105263158, | |
| "loss": 1.4574, | |
| "mean_token_accuracy": 0.6385244429111481, | |
| "num_tokens": 212024.0, | |
| "step": 8 | |
| }, | |
| { | |
| "entropy": 1.4725914895534515, | |
| "epoch": 0.016945163567898328, | |
| "grad_norm": 0.14686766266822815, | |
| "learning_rate": 0.0004984962406015037, | |
| "loss": 1.4077, | |
| "mean_token_accuracy": 0.6496255323290825, | |
| "num_tokens": 239247.0, | |
| "step": 9 | |
| }, | |
| { | |
| "entropy": 1.399958148598671, | |
| "epoch": 0.01882795951988703, | |
| "grad_norm": 0.2573543190956116, | |
| "learning_rate": 0.0004983082706766917, | |
| "loss": 1.4648, | |
| "mean_token_accuracy": 0.6321976333856583, | |
| "num_tokens": 265365.0, | |
| "step": 10 | |
| }, | |
| { | |
| "entropy": 1.3477602005004883, | |
| "epoch": 0.020710755471875734, | |
| "grad_norm": 0.19095759093761444, | |
| "learning_rate": 0.0004981203007518797, | |
| "loss": 1.3914, | |
| "mean_token_accuracy": 0.6472064480185509, | |
| "num_tokens": 292664.0, | |
| "step": 11 | |
| }, | |
| { | |
| "entropy": 1.3985529839992523, | |
| "epoch": 0.02259355142386444, | |
| "grad_norm": 0.12443722784519196, | |
| "learning_rate": 0.0004979323308270676, | |
| "loss": 1.3841, | |
| "mean_token_accuracy": 0.6470160931348801, | |
| "num_tokens": 318823.0, | |
| "step": 12 | |
| }, | |
| { | |
| "entropy": 1.4100047498941422, | |
| "epoch": 0.024476347375853143, | |
| "grad_norm": 0.18163365125656128, | |
| "learning_rate": 0.0004977443609022556, | |
| "loss": 1.3475, | |
| "mean_token_accuracy": 0.6554316207766533, | |
| "num_tokens": 345276.0, | |
| "step": 13 | |
| }, | |
| { | |
| "entropy": 1.3673983961343765, | |
| "epoch": 0.026359143327841845, | |
| "grad_norm": 0.21292470395565033, | |
| "learning_rate": 0.0004975563909774436, | |
| "loss": 1.3423, | |
| "mean_token_accuracy": 0.6571086272597313, | |
| "num_tokens": 372290.0, | |
| "step": 14 | |
| }, | |
| { | |
| "entropy": 1.3170630186796188, | |
| "epoch": 0.028241939279830548, | |
| "grad_norm": 0.14680063724517822, | |
| "learning_rate": 0.0004973684210526315, | |
| "loss": 1.3433, | |
| "mean_token_accuracy": 0.6587843522429466, | |
| "num_tokens": 398806.0, | |
| "step": 15 | |
| }, | |
| { | |
| "entropy": 1.4363876283168793, | |
| "epoch": 0.03012473523181925, | |
| "grad_norm": 0.1492491513490677, | |
| "learning_rate": 0.0004971804511278195, | |
| "loss": 1.3881, | |
| "mean_token_accuracy": 0.6493127718567848, | |
| "num_tokens": 427973.0, | |
| "step": 16 | |
| }, | |
| { | |
| "entropy": 1.3928384333848953, | |
| "epoch": 0.032007531183807954, | |
| "grad_norm": 0.21353831887245178, | |
| "learning_rate": 0.0004969924812030076, | |
| "loss": 1.3303, | |
| "mean_token_accuracy": 0.6532666012644768, | |
| "num_tokens": 455705.0, | |
| "step": 17 | |
| }, | |
| { | |
| "entropy": 1.3039959222078323, | |
| "epoch": 0.033890327135796657, | |
| "grad_norm": 0.12421785295009613, | |
| "learning_rate": 0.0004968045112781954, | |
| "loss": 1.3078, | |
| "mean_token_accuracy": 0.6589679047465324, | |
| "num_tokens": 481697.0, | |
| "step": 18 | |
| }, | |
| { | |
| "entropy": 1.323414146900177, | |
| "epoch": 0.03577312308778536, | |
| "grad_norm": 0.13252823054790497, | |
| "learning_rate": 0.0004966165413533834, | |
| "loss": 1.3682, | |
| "mean_token_accuracy": 0.6478805840015411, | |
| "num_tokens": 508637.0, | |
| "step": 19 | |
| }, | |
| { | |
| "entropy": 1.320784792304039, | |
| "epoch": 0.03765591903977406, | |
| "grad_norm": 0.13821907341480255, | |
| "learning_rate": 0.0004964285714285715, | |
| "loss": 1.3087, | |
| "mean_token_accuracy": 0.6556096524000168, | |
| "num_tokens": 533762.0, | |
| "step": 20 | |
| }, | |
| { | |
| "entropy": 1.435991793870926, | |
| "epoch": 0.039538714991762765, | |
| "grad_norm": 0.13946449756622314, | |
| "learning_rate": 0.0004962406015037594, | |
| "loss": 1.4031, | |
| "mean_token_accuracy": 0.6474809646606445, | |
| "num_tokens": 558068.0, | |
| "step": 21 | |
| }, | |
| { | |
| "entropy": 1.3843661397695541, | |
| "epoch": 0.04142151094375147, | |
| "grad_norm": 0.14075031876564026, | |
| "learning_rate": 0.0004960526315789473, | |
| "loss": 1.3313, | |
| "mean_token_accuracy": 0.6577248424291611, | |
| "num_tokens": 585582.0, | |
| "step": 22 | |
| }, | |
| { | |
| "entropy": 1.3438803404569626, | |
| "epoch": 0.04330430689574018, | |
| "grad_norm": 0.12071845680475235, | |
| "learning_rate": 0.0004958646616541354, | |
| "loss": 1.3205, | |
| "mean_token_accuracy": 0.6598646715283394, | |
| "num_tokens": 614078.0, | |
| "step": 23 | |
| }, | |
| { | |
| "entropy": 1.2872049808502197, | |
| "epoch": 0.04518710284772888, | |
| "grad_norm": 0.13585081696510315, | |
| "learning_rate": 0.0004956766917293234, | |
| "loss": 1.2847, | |
| "mean_token_accuracy": 0.6646199747920036, | |
| "num_tokens": 641604.0, | |
| "step": 24 | |
| }, | |
| { | |
| "entropy": 1.4031487703323364, | |
| "epoch": 0.04706989879971758, | |
| "grad_norm": 0.16168682277202606, | |
| "learning_rate": 0.0004954887218045112, | |
| "loss": 1.3906, | |
| "mean_token_accuracy": 0.6470670253038406, | |
| "num_tokens": 668099.0, | |
| "step": 25 | |
| }, | |
| { | |
| "entropy": 1.3954781144857407, | |
| "epoch": 0.048952694751706285, | |
| "grad_norm": 0.1519748568534851, | |
| "learning_rate": 0.0004953007518796993, | |
| "loss": 1.3143, | |
| "mean_token_accuracy": 0.6569681242108345, | |
| "num_tokens": 693467.0, | |
| "step": 26 | |
| }, | |
| { | |
| "entropy": 1.4201241582632065, | |
| "epoch": 0.05083549070369499, | |
| "grad_norm": 0.12228523939847946, | |
| "learning_rate": 0.0004951127819548873, | |
| "loss": 1.3585, | |
| "mean_token_accuracy": 0.6522250324487686, | |
| "num_tokens": 719428.0, | |
| "step": 27 | |
| }, | |
| { | |
| "entropy": 1.3096809834241867, | |
| "epoch": 0.05271828665568369, | |
| "grad_norm": 0.12990325689315796, | |
| "learning_rate": 0.0004949248120300752, | |
| "loss": 1.3363, | |
| "mean_token_accuracy": 0.6576437503099442, | |
| "num_tokens": 743498.0, | |
| "step": 28 | |
| }, | |
| { | |
| "entropy": 1.2695416510105133, | |
| "epoch": 0.054601082607672394, | |
| "grad_norm": 0.12629908323287964, | |
| "learning_rate": 0.0004947368421052632, | |
| "loss": 1.256, | |
| "mean_token_accuracy": 0.6671914085745811, | |
| "num_tokens": 771083.0, | |
| "step": 29 | |
| }, | |
| { | |
| "entropy": 1.3144675344228745, | |
| "epoch": 0.056483878559661096, | |
| "grad_norm": 0.13920928537845612, | |
| "learning_rate": 0.0004945488721804512, | |
| "loss": 1.2797, | |
| "mean_token_accuracy": 0.6726761981844902, | |
| "num_tokens": 798194.0, | |
| "step": 30 | |
| }, | |
| { | |
| "entropy": 1.3235575556755066, | |
| "epoch": 0.0583666745116498, | |
| "grad_norm": 0.1421487033367157, | |
| "learning_rate": 0.0004943609022556391, | |
| "loss": 1.3095, | |
| "mean_token_accuracy": 0.6596867814660072, | |
| "num_tokens": 823348.0, | |
| "step": 31 | |
| }, | |
| { | |
| "entropy": 1.2517389357089996, | |
| "epoch": 0.0602494704636385, | |
| "grad_norm": 0.11075025051832199, | |
| "learning_rate": 0.0004941729323308271, | |
| "loss": 1.2458, | |
| "mean_token_accuracy": 0.6723818778991699, | |
| "num_tokens": 849713.0, | |
| "step": 32 | |
| }, | |
| { | |
| "entropy": 1.2159670144319534, | |
| "epoch": 0.062132266415627205, | |
| "grad_norm": 0.11285679787397385, | |
| "learning_rate": 0.0004939849624060151, | |
| "loss": 1.2158, | |
| "mean_token_accuracy": 0.6808358430862427, | |
| "num_tokens": 876659.0, | |
| "step": 33 | |
| }, | |
| { | |
| "entropy": 1.2742353826761246, | |
| "epoch": 0.06401506236761591, | |
| "grad_norm": 0.1200110912322998, | |
| "learning_rate": 0.000493796992481203, | |
| "loss": 1.2414, | |
| "mean_token_accuracy": 0.6697632297873497, | |
| "num_tokens": 904196.0, | |
| "step": 34 | |
| }, | |
| { | |
| "entropy": 1.3724654912948608, | |
| "epoch": 0.06589785831960461, | |
| "grad_norm": 0.11141709238290787, | |
| "learning_rate": 0.000493609022556391, | |
| "loss": 1.3037, | |
| "mean_token_accuracy": 0.6641954258084297, | |
| "num_tokens": 930650.0, | |
| "step": 35 | |
| }, | |
| { | |
| "entropy": 1.332644298672676, | |
| "epoch": 0.06778065427159331, | |
| "grad_norm": 0.11270242929458618, | |
| "learning_rate": 0.000493421052631579, | |
| "loss": 1.2723, | |
| "mean_token_accuracy": 0.6652832478284836, | |
| "num_tokens": 958361.0, | |
| "step": 36 | |
| }, | |
| { | |
| "entropy": 1.2781042605638504, | |
| "epoch": 0.06966345022358202, | |
| "grad_norm": 0.12608197331428528, | |
| "learning_rate": 0.0004932330827067669, | |
| "loss": 1.2664, | |
| "mean_token_accuracy": 0.6701500117778778, | |
| "num_tokens": 982981.0, | |
| "step": 37 | |
| }, | |
| { | |
| "entropy": 1.2652703523635864, | |
| "epoch": 0.07154624617557072, | |
| "grad_norm": 0.11680380254983902, | |
| "learning_rate": 0.0004930451127819549, | |
| "loss": 1.2363, | |
| "mean_token_accuracy": 0.6758281961083412, | |
| "num_tokens": 1010214.0, | |
| "step": 38 | |
| }, | |
| { | |
| "entropy": 1.2895056456327438, | |
| "epoch": 0.07342904212755942, | |
| "grad_norm": 0.13060909509658813, | |
| "learning_rate": 0.0004928571428571429, | |
| "loss": 1.2921, | |
| "mean_token_accuracy": 0.6617036908864975, | |
| "num_tokens": 1036007.0, | |
| "step": 39 | |
| }, | |
| { | |
| "entropy": 1.2508063912391663, | |
| "epoch": 0.07531183807954812, | |
| "grad_norm": 0.11048955470323563, | |
| "learning_rate": 0.0004926691729323308, | |
| "loss": 1.2388, | |
| "mean_token_accuracy": 0.6743078008294106, | |
| "num_tokens": 1064839.0, | |
| "step": 40 | |
| }, | |
| { | |
| "entropy": 1.2910813689231873, | |
| "epoch": 0.07719463403153683, | |
| "grad_norm": 0.12634366750717163, | |
| "learning_rate": 0.0004924812030075188, | |
| "loss": 1.2923, | |
| "mean_token_accuracy": 0.6658936813473701, | |
| "num_tokens": 1089267.0, | |
| "step": 41 | |
| }, | |
| { | |
| "entropy": 1.314329817891121, | |
| "epoch": 0.07907742998352553, | |
| "grad_norm": 0.11990135908126831, | |
| "learning_rate": 0.0004922932330827068, | |
| "loss": 1.2823, | |
| "mean_token_accuracy": 0.6621334031224251, | |
| "num_tokens": 1114747.0, | |
| "step": 42 | |
| }, | |
| { | |
| "entropy": 1.372491493821144, | |
| "epoch": 0.08096022593551423, | |
| "grad_norm": 0.14962127804756165, | |
| "learning_rate": 0.0004921052631578947, | |
| "loss": 1.3012, | |
| "mean_token_accuracy": 0.6624018624424934, | |
| "num_tokens": 1140568.0, | |
| "step": 43 | |
| }, | |
| { | |
| "entropy": 1.3109306246042252, | |
| "epoch": 0.08284302188750294, | |
| "grad_norm": 0.1251574158668518, | |
| "learning_rate": 0.0004919172932330827, | |
| "loss": 1.2753, | |
| "mean_token_accuracy": 0.6643748208880424, | |
| "num_tokens": 1166132.0, | |
| "step": 44 | |
| }, | |
| { | |
| "entropy": 1.2547127306461334, | |
| "epoch": 0.08472581783949165, | |
| "grad_norm": 0.14988984167575836, | |
| "learning_rate": 0.0004917293233082707, | |
| "loss": 1.2591, | |
| "mean_token_accuracy": 0.6667659133672714, | |
| "num_tokens": 1191773.0, | |
| "step": 45 | |
| }, | |
| { | |
| "entropy": 1.2385195791721344, | |
| "epoch": 0.08660861379148035, | |
| "grad_norm": 0.14218594133853912, | |
| "learning_rate": 0.0004915413533834586, | |
| "loss": 1.2551, | |
| "mean_token_accuracy": 0.67237289249897, | |
| "num_tokens": 1217928.0, | |
| "step": 46 | |
| }, | |
| { | |
| "entropy": 1.286237582564354, | |
| "epoch": 0.08849140974346906, | |
| "grad_norm": 0.1285715401172638, | |
| "learning_rate": 0.0004913533834586466, | |
| "loss": 1.228, | |
| "mean_token_accuracy": 0.6695188358426094, | |
| "num_tokens": 1243853.0, | |
| "step": 47 | |
| }, | |
| { | |
| "entropy": 1.2577073574066162, | |
| "epoch": 0.09037420569545776, | |
| "grad_norm": 0.1297583132982254, | |
| "learning_rate": 0.0004911654135338346, | |
| "loss": 1.1889, | |
| "mean_token_accuracy": 0.6802271753549576, | |
| "num_tokens": 1270883.0, | |
| "step": 48 | |
| }, | |
| { | |
| "entropy": 1.2520407736301422, | |
| "epoch": 0.09225700164744646, | |
| "grad_norm": 0.10652397572994232, | |
| "learning_rate": 0.0004909774436090225, | |
| "loss": 1.2295, | |
| "mean_token_accuracy": 0.675907552242279, | |
| "num_tokens": 1296937.0, | |
| "step": 49 | |
| }, | |
| { | |
| "entropy": 1.2889134734869003, | |
| "epoch": 0.09413979759943517, | |
| "grad_norm": 0.15478400886058807, | |
| "learning_rate": 0.0004907894736842106, | |
| "loss": 1.325, | |
| "mean_token_accuracy": 0.656628705561161, | |
| "num_tokens": 1323691.0, | |
| "step": 50 | |
| }, | |
| { | |
| "entropy": 1.319000005722046, | |
| "epoch": 0.09602259355142387, | |
| "grad_norm": 0.14395709335803986, | |
| "learning_rate": 0.0004906015037593985, | |
| "loss": 1.2879, | |
| "mean_token_accuracy": 0.6644657775759697, | |
| "num_tokens": 1347574.0, | |
| "step": 51 | |
| }, | |
| { | |
| "entropy": 1.265960842370987, | |
| "epoch": 0.09790538950341257, | |
| "grad_norm": 0.1301705241203308, | |
| "learning_rate": 0.0004904135338345864, | |
| "loss": 1.1913, | |
| "mean_token_accuracy": 0.6857202649116516, | |
| "num_tokens": 1376965.0, | |
| "step": 52 | |
| }, | |
| { | |
| "entropy": 1.2671979069709778, | |
| "epoch": 0.09978818545540127, | |
| "grad_norm": 0.12502525746822357, | |
| "learning_rate": 0.0004902255639097745, | |
| "loss": 1.2473, | |
| "mean_token_accuracy": 0.666202001273632, | |
| "num_tokens": 1402456.0, | |
| "step": 53 | |
| }, | |
| { | |
| "entropy": 1.2768708020448685, | |
| "epoch": 0.10167098140738998, | |
| "grad_norm": 0.1106332466006279, | |
| "learning_rate": 0.0004900375939849624, | |
| "loss": 1.2406, | |
| "mean_token_accuracy": 0.6731417253613472, | |
| "num_tokens": 1430744.0, | |
| "step": 54 | |
| }, | |
| { | |
| "entropy": 1.2286315560340881, | |
| "epoch": 0.10355377735937868, | |
| "grad_norm": 0.12362819164991379, | |
| "learning_rate": 0.0004898496240601503, | |
| "loss": 1.2452, | |
| "mean_token_accuracy": 0.6803877055644989, | |
| "num_tokens": 1459596.0, | |
| "step": 55 | |
| }, | |
| { | |
| "entropy": 1.2663686275482178, | |
| "epoch": 0.10543657331136738, | |
| "grad_norm": 0.11787568777799606, | |
| "learning_rate": 0.0004896616541353384, | |
| "loss": 1.2594, | |
| "mean_token_accuracy": 0.6688775643706322, | |
| "num_tokens": 1487663.0, | |
| "step": 56 | |
| }, | |
| { | |
| "entropy": 1.2797971814870834, | |
| "epoch": 0.10731936926335608, | |
| "grad_norm": 0.11497815698385239, | |
| "learning_rate": 0.0004894736842105264, | |
| "loss": 1.2556, | |
| "mean_token_accuracy": 0.6690255850553513, | |
| "num_tokens": 1514365.0, | |
| "step": 57 | |
| }, | |
| { | |
| "entropy": 1.2839107066392899, | |
| "epoch": 0.10920216521534479, | |
| "grad_norm": 0.11505855619907379, | |
| "learning_rate": 0.0004892857142857142, | |
| "loss": 1.2213, | |
| "mean_token_accuracy": 0.6812370792031288, | |
| "num_tokens": 1542885.0, | |
| "step": 58 | |
| }, | |
| { | |
| "entropy": 1.290139302611351, | |
| "epoch": 0.11108496116733349, | |
| "grad_norm": 0.11844398826360703, | |
| "learning_rate": 0.0004890977443609023, | |
| "loss": 1.2462, | |
| "mean_token_accuracy": 0.6695830523967743, | |
| "num_tokens": 1567898.0, | |
| "step": 59 | |
| }, | |
| { | |
| "entropy": 1.2590511292219162, | |
| "epoch": 0.11296775711932219, | |
| "grad_norm": 0.12767820060253143, | |
| "learning_rate": 0.0004889097744360903, | |
| "loss": 1.2515, | |
| "mean_token_accuracy": 0.6738757342100143, | |
| "num_tokens": 1594742.0, | |
| "step": 60 | |
| }, | |
| { | |
| "entropy": 1.2260379791259766, | |
| "epoch": 0.1148505530713109, | |
| "grad_norm": 0.11811124533414841, | |
| "learning_rate": 0.0004887218045112781, | |
| "loss": 1.1979, | |
| "mean_token_accuracy": 0.6808087155222893, | |
| "num_tokens": 1620685.0, | |
| "step": 61 | |
| }, | |
| { | |
| "entropy": 1.301318883895874, | |
| "epoch": 0.1167333490232996, | |
| "grad_norm": 0.13785120844841003, | |
| "learning_rate": 0.0004885338345864662, | |
| "loss": 1.3155, | |
| "mean_token_accuracy": 0.6592775583267212, | |
| "num_tokens": 1646541.0, | |
| "step": 62 | |
| }, | |
| { | |
| "entropy": 1.2704945504665375, | |
| "epoch": 0.1186161449752883, | |
| "grad_norm": 0.11612152308225632, | |
| "learning_rate": 0.0004883458646616542, | |
| "loss": 1.2429, | |
| "mean_token_accuracy": 0.6690341830253601, | |
| "num_tokens": 1674445.0, | |
| "step": 63 | |
| }, | |
| { | |
| "entropy": 1.2772111147642136, | |
| "epoch": 0.120498940927277, | |
| "grad_norm": 0.12045788764953613, | |
| "learning_rate": 0.00048815789473684215, | |
| "loss": 1.2114, | |
| "mean_token_accuracy": 0.6808006837964058, | |
| "num_tokens": 1701277.0, | |
| "step": 64 | |
| }, | |
| { | |
| "entropy": 1.2712904959917068, | |
| "epoch": 0.1223817368792657, | |
| "grad_norm": 0.11429794877767563, | |
| "learning_rate": 0.00048796992481203006, | |
| "loss": 1.216, | |
| "mean_token_accuracy": 0.6720417365431786, | |
| "num_tokens": 1728984.0, | |
| "step": 65 | |
| }, | |
| { | |
| "entropy": 1.3161986768245697, | |
| "epoch": 0.12426453283125441, | |
| "grad_norm": 0.1338111013174057, | |
| "learning_rate": 0.00048778195488721803, | |
| "loss": 1.3229, | |
| "mean_token_accuracy": 0.6602049320936203, | |
| "num_tokens": 1755598.0, | |
| "step": 66 | |
| }, | |
| { | |
| "entropy": 1.2473317682743073, | |
| "epoch": 0.1261473287832431, | |
| "grad_norm": 0.10488025099039078, | |
| "learning_rate": 0.00048759398496240605, | |
| "loss": 1.2263, | |
| "mean_token_accuracy": 0.6753234788775444, | |
| "num_tokens": 1783417.0, | |
| "step": 67 | |
| }, | |
| { | |
| "entropy": 1.2551011592149734, | |
| "epoch": 0.12803012473523182, | |
| "grad_norm": 0.11638512462377548, | |
| "learning_rate": 0.000487406015037594, | |
| "loss": 1.224, | |
| "mean_token_accuracy": 0.6783930733799934, | |
| "num_tokens": 1809462.0, | |
| "step": 68 | |
| }, | |
| { | |
| "entropy": 1.2382186502218246, | |
| "epoch": 0.12991292068722052, | |
| "grad_norm": 0.14887025952339172, | |
| "learning_rate": 0.00048721804511278193, | |
| "loss": 1.2175, | |
| "mean_token_accuracy": 0.6787804737687111, | |
| "num_tokens": 1835642.0, | |
| "step": 69 | |
| }, | |
| { | |
| "entropy": 1.274851605296135, | |
| "epoch": 0.13179571663920922, | |
| "grad_norm": 0.13403619825839996, | |
| "learning_rate": 0.00048703007518796995, | |
| "loss": 1.2662, | |
| "mean_token_accuracy": 0.6663196384906769, | |
| "num_tokens": 1859904.0, | |
| "step": 70 | |
| }, | |
| { | |
| "entropy": 1.303640365600586, | |
| "epoch": 0.13367851259119792, | |
| "grad_norm": 0.11801115423440933, | |
| "learning_rate": 0.0004868421052631579, | |
| "loss": 1.3138, | |
| "mean_token_accuracy": 0.6627907082438469, | |
| "num_tokens": 1886915.0, | |
| "step": 71 | |
| }, | |
| { | |
| "entropy": 1.2814981341362, | |
| "epoch": 0.13556130854318663, | |
| "grad_norm": 0.12543627619743347, | |
| "learning_rate": 0.00048665413533834583, | |
| "loss": 1.2599, | |
| "mean_token_accuracy": 0.6737553998827934, | |
| "num_tokens": 1912683.0, | |
| "step": 72 | |
| }, | |
| { | |
| "entropy": 1.2715606987476349, | |
| "epoch": 0.13744410449517533, | |
| "grad_norm": 0.11963653564453125, | |
| "learning_rate": 0.00048646616541353385, | |
| "loss": 1.2075, | |
| "mean_token_accuracy": 0.6787137389183044, | |
| "num_tokens": 1940455.0, | |
| "step": 73 | |
| }, | |
| { | |
| "entropy": 1.2765703648328781, | |
| "epoch": 0.13932690044716403, | |
| "grad_norm": 0.13952264189720154, | |
| "learning_rate": 0.0004862781954887218, | |
| "loss": 1.2043, | |
| "mean_token_accuracy": 0.6798917651176453, | |
| "num_tokens": 1965949.0, | |
| "step": 74 | |
| }, | |
| { | |
| "entropy": 1.229781836271286, | |
| "epoch": 0.14120969639915273, | |
| "grad_norm": 0.11769476532936096, | |
| "learning_rate": 0.0004860902255639098, | |
| "loss": 1.2063, | |
| "mean_token_accuracy": 0.6715990677475929, | |
| "num_tokens": 1992293.0, | |
| "step": 75 | |
| }, | |
| { | |
| "entropy": 1.1944819241762161, | |
| "epoch": 0.14309249235114144, | |
| "grad_norm": 0.12095087021589279, | |
| "learning_rate": 0.00048590225563909775, | |
| "loss": 1.217, | |
| "mean_token_accuracy": 0.6814620569348335, | |
| "num_tokens": 2019182.0, | |
| "step": 76 | |
| }, | |
| { | |
| "entropy": 1.2649260014295578, | |
| "epoch": 0.14497528830313014, | |
| "grad_norm": 0.12220579385757446, | |
| "learning_rate": 0.0004857142857142857, | |
| "loss": 1.2827, | |
| "mean_token_accuracy": 0.6689692661166191, | |
| "num_tokens": 2045357.0, | |
| "step": 77 | |
| }, | |
| { | |
| "entropy": 1.2532286047935486, | |
| "epoch": 0.14685808425511884, | |
| "grad_norm": 0.12137361615896225, | |
| "learning_rate": 0.0004855263157894737, | |
| "loss": 1.202, | |
| "mean_token_accuracy": 0.6808355078101158, | |
| "num_tokens": 2071015.0, | |
| "step": 78 | |
| }, | |
| { | |
| "entropy": 1.334955409169197, | |
| "epoch": 0.14874088020710755, | |
| "grad_norm": 0.12754660844802856, | |
| "learning_rate": 0.0004853383458646617, | |
| "loss": 1.2514, | |
| "mean_token_accuracy": 0.6797578409314156, | |
| "num_tokens": 2096831.0, | |
| "step": 79 | |
| }, | |
| { | |
| "entropy": 1.2261384725570679, | |
| "epoch": 0.15062367615909625, | |
| "grad_norm": 0.11096950620412827, | |
| "learning_rate": 0.0004851503759398496, | |
| "loss": 1.1933, | |
| "mean_token_accuracy": 0.6880421414971352, | |
| "num_tokens": 2126421.0, | |
| "step": 80 | |
| }, | |
| { | |
| "entropy": 1.2615373581647873, | |
| "epoch": 0.15250647211108495, | |
| "grad_norm": 0.13106736540794373, | |
| "learning_rate": 0.0004849624060150376, | |
| "loss": 1.2198, | |
| "mean_token_accuracy": 0.6821138635277748, | |
| "num_tokens": 2153303.0, | |
| "step": 81 | |
| }, | |
| { | |
| "entropy": 1.2859619706869125, | |
| "epoch": 0.15438926806307365, | |
| "grad_norm": 0.13115623593330383, | |
| "learning_rate": 0.0004847744360902256, | |
| "loss": 1.2783, | |
| "mean_token_accuracy": 0.6689222902059555, | |
| "num_tokens": 2180250.0, | |
| "step": 82 | |
| }, | |
| { | |
| "entropy": 1.248913735151291, | |
| "epoch": 0.15627206401506236, | |
| "grad_norm": 0.11291101574897766, | |
| "learning_rate": 0.0004845864661654135, | |
| "loss": 1.2351, | |
| "mean_token_accuracy": 0.6730126142501831, | |
| "num_tokens": 2207001.0, | |
| "step": 83 | |
| }, | |
| { | |
| "entropy": 1.2413169145584106, | |
| "epoch": 0.15815485996705106, | |
| "grad_norm": 0.1277051717042923, | |
| "learning_rate": 0.0004843984962406015, | |
| "loss": 1.2159, | |
| "mean_token_accuracy": 0.681744784116745, | |
| "num_tokens": 2232587.0, | |
| "step": 84 | |
| }, | |
| { | |
| "entropy": 1.2155817747116089, | |
| "epoch": 0.16003765591903976, | |
| "grad_norm": 0.15200501680374146, | |
| "learning_rate": 0.0004842105263157895, | |
| "loss": 1.1881, | |
| "mean_token_accuracy": 0.6845081895589828, | |
| "num_tokens": 2260040.0, | |
| "step": 85 | |
| }, | |
| { | |
| "entropy": 1.1750262528657913, | |
| "epoch": 0.16192045187102846, | |
| "grad_norm": 0.13496170938014984, | |
| "learning_rate": 0.0004840225563909775, | |
| "loss": 1.1566, | |
| "mean_token_accuracy": 0.6882026270031929, | |
| "num_tokens": 2286811.0, | |
| "step": 86 | |
| }, | |
| { | |
| "entropy": 1.2582080215215683, | |
| "epoch": 0.16380324782301717, | |
| "grad_norm": 0.12751278281211853, | |
| "learning_rate": 0.0004838345864661654, | |
| "loss": 1.2334, | |
| "mean_token_accuracy": 0.6756840199232101, | |
| "num_tokens": 2312376.0, | |
| "step": 87 | |
| }, | |
| { | |
| "entropy": 1.2530706375837326, | |
| "epoch": 0.16568604377500587, | |
| "grad_norm": 0.12347429990768433, | |
| "learning_rate": 0.0004836466165413534, | |
| "loss": 1.2358, | |
| "mean_token_accuracy": 0.6713104099035263, | |
| "num_tokens": 2338959.0, | |
| "step": 88 | |
| }, | |
| { | |
| "entropy": 1.2693426012992859, | |
| "epoch": 0.1675688397269946, | |
| "grad_norm": 0.16009417176246643, | |
| "learning_rate": 0.0004834586466165414, | |
| "loss": 1.2511, | |
| "mean_token_accuracy": 0.6736921593546867, | |
| "num_tokens": 2366183.0, | |
| "step": 89 | |
| }, | |
| { | |
| "entropy": 1.255973756313324, | |
| "epoch": 0.1694516356789833, | |
| "grad_norm": 0.12181756645441055, | |
| "learning_rate": 0.00048327067669172934, | |
| "loss": 1.2052, | |
| "mean_token_accuracy": 0.6734501421451569, | |
| "num_tokens": 2392856.0, | |
| "step": 90 | |
| }, | |
| { | |
| "entropy": 1.2562214732170105, | |
| "epoch": 0.171334431630972, | |
| "grad_norm": 0.12082800269126892, | |
| "learning_rate": 0.0004830827067669173, | |
| "loss": 1.2519, | |
| "mean_token_accuracy": 0.6692837849259377, | |
| "num_tokens": 2419897.0, | |
| "step": 91 | |
| }, | |
| { | |
| "entropy": 1.1730956435203552, | |
| "epoch": 0.1732172275829607, | |
| "grad_norm": 0.11969847977161407, | |
| "learning_rate": 0.0004828947368421053, | |
| "loss": 1.1305, | |
| "mean_token_accuracy": 0.6944040432572365, | |
| "num_tokens": 2449131.0, | |
| "step": 92 | |
| }, | |
| { | |
| "entropy": 1.2573560923337936, | |
| "epoch": 0.1751000235349494, | |
| "grad_norm": 0.1183922290802002, | |
| "learning_rate": 0.00048270676691729324, | |
| "loss": 1.224, | |
| "mean_token_accuracy": 0.6771978959441185, | |
| "num_tokens": 2474107.0, | |
| "step": 93 | |
| }, | |
| { | |
| "entropy": 1.2122257351875305, | |
| "epoch": 0.17698281948693811, | |
| "grad_norm": 0.1325969696044922, | |
| "learning_rate": 0.0004825187969924812, | |
| "loss": 1.1754, | |
| "mean_token_accuracy": 0.6865298077464104, | |
| "num_tokens": 2501837.0, | |
| "step": 94 | |
| }, | |
| { | |
| "entropy": 1.2060312926769257, | |
| "epoch": 0.17886561543892682, | |
| "grad_norm": 0.12340355664491653, | |
| "learning_rate": 0.0004823308270676692, | |
| "loss": 1.2042, | |
| "mean_token_accuracy": 0.6752656251192093, | |
| "num_tokens": 2528769.0, | |
| "step": 95 | |
| }, | |
| { | |
| "entropy": 1.268461525440216, | |
| "epoch": 0.18074841139091552, | |
| "grad_norm": 0.1260639727115631, | |
| "learning_rate": 0.00048214285714285715, | |
| "loss": 1.2781, | |
| "mean_token_accuracy": 0.6681492626667023, | |
| "num_tokens": 2555451.0, | |
| "step": 96 | |
| }, | |
| { | |
| "entropy": 1.2650732845067978, | |
| "epoch": 0.18263120734290422, | |
| "grad_norm": 0.12851010262966156, | |
| "learning_rate": 0.00048195488721804517, | |
| "loss": 1.2458, | |
| "mean_token_accuracy": 0.671695739030838, | |
| "num_tokens": 2582196.0, | |
| "step": 97 | |
| }, | |
| { | |
| "entropy": 1.2784437835216522, | |
| "epoch": 0.18451400329489293, | |
| "grad_norm": 0.1278950273990631, | |
| "learning_rate": 0.0004817669172932331, | |
| "loss": 1.2319, | |
| "mean_token_accuracy": 0.6702851504087448, | |
| "num_tokens": 2608444.0, | |
| "step": 98 | |
| }, | |
| { | |
| "entropy": 1.2551447749137878, | |
| "epoch": 0.18639679924688163, | |
| "grad_norm": 0.1206209808588028, | |
| "learning_rate": 0.00048157894736842105, | |
| "loss": 1.2044, | |
| "mean_token_accuracy": 0.677789680659771, | |
| "num_tokens": 2634109.0, | |
| "step": 99 | |
| }, | |
| { | |
| "entropy": 1.2039145231246948, | |
| "epoch": 0.18827959519887033, | |
| "grad_norm": 0.12305069714784622, | |
| "learning_rate": 0.00048139097744360907, | |
| "loss": 1.1637, | |
| "mean_token_accuracy": 0.6861624270677567, | |
| "num_tokens": 2659548.0, | |
| "step": 100 | |
| }, | |
| { | |
| "entropy": 1.2327278852462769, | |
| "epoch": 0.19016239115085903, | |
| "grad_norm": 0.13643652200698853, | |
| "learning_rate": 0.000481203007518797, | |
| "loss": 1.212, | |
| "mean_token_accuracy": 0.6804677918553352, | |
| "num_tokens": 2684638.0, | |
| "step": 101 | |
| }, | |
| { | |
| "entropy": 1.194289356470108, | |
| "epoch": 0.19204518710284774, | |
| "grad_norm": 0.15666837990283966, | |
| "learning_rate": 0.00048101503759398495, | |
| "loss": 1.1797, | |
| "mean_token_accuracy": 0.683199092745781, | |
| "num_tokens": 2711970.0, | |
| "step": 102 | |
| }, | |
| { | |
| "entropy": 1.2052866965532303, | |
| "epoch": 0.19392798305483644, | |
| "grad_norm": 0.12934386730194092, | |
| "learning_rate": 0.00048082706766917297, | |
| "loss": 1.1954, | |
| "mean_token_accuracy": 0.6831924915313721, | |
| "num_tokens": 2738028.0, | |
| "step": 103 | |
| }, | |
| { | |
| "entropy": 1.2316648960113525, | |
| "epoch": 0.19581077900682514, | |
| "grad_norm": 0.12603920698165894, | |
| "learning_rate": 0.00048063909774436094, | |
| "loss": 1.2112, | |
| "mean_token_accuracy": 0.6792290285229683, | |
| "num_tokens": 2765091.0, | |
| "step": 104 | |
| }, | |
| { | |
| "entropy": 1.2624593675136566, | |
| "epoch": 0.19769357495881384, | |
| "grad_norm": 0.1318008452653885, | |
| "learning_rate": 0.00048045112781954885, | |
| "loss": 1.2389, | |
| "mean_token_accuracy": 0.6782659739255905, | |
| "num_tokens": 2792661.0, | |
| "step": 105 | |
| }, | |
| { | |
| "entropy": 1.2824029475450516, | |
| "epoch": 0.19957637091080255, | |
| "grad_norm": 0.13028129935264587, | |
| "learning_rate": 0.00048026315789473687, | |
| "loss": 1.2581, | |
| "mean_token_accuracy": 0.6727664992213249, | |
| "num_tokens": 2819535.0, | |
| "step": 106 | |
| }, | |
| { | |
| "entropy": 1.1964116394519806, | |
| "epoch": 0.20145916686279125, | |
| "grad_norm": 0.16565856337547302, | |
| "learning_rate": 0.00048007518796992484, | |
| "loss": 1.1427, | |
| "mean_token_accuracy": 0.6922469958662987, | |
| "num_tokens": 2848429.0, | |
| "step": 107 | |
| }, | |
| { | |
| "entropy": 1.2726367861032486, | |
| "epoch": 0.20334196281477995, | |
| "grad_norm": 0.1416698843240738, | |
| "learning_rate": 0.0004798872180451128, | |
| "loss": 1.225, | |
| "mean_token_accuracy": 0.6754879876971245, | |
| "num_tokens": 2874776.0, | |
| "step": 108 | |
| }, | |
| { | |
| "entropy": 1.2357124537229538, | |
| "epoch": 0.20522475876676866, | |
| "grad_norm": 0.12491658329963684, | |
| "learning_rate": 0.00047969924812030077, | |
| "loss": 1.204, | |
| "mean_token_accuracy": 0.6739878728985786, | |
| "num_tokens": 2902602.0, | |
| "step": 109 | |
| }, | |
| { | |
| "entropy": 1.2650941908359528, | |
| "epoch": 0.20710755471875736, | |
| "grad_norm": 0.13329921662807465, | |
| "learning_rate": 0.00047951127819548874, | |
| "loss": 1.2432, | |
| "mean_token_accuracy": 0.6738255694508553, | |
| "num_tokens": 2929536.0, | |
| "step": 110 | |
| }, | |
| { | |
| "entropy": 1.2259162962436676, | |
| "epoch": 0.20899035067074606, | |
| "grad_norm": 0.14152902364730835, | |
| "learning_rate": 0.0004793233082706767, | |
| "loss": 1.1886, | |
| "mean_token_accuracy": 0.6813376769423485, | |
| "num_tokens": 2955236.0, | |
| "step": 111 | |
| }, | |
| { | |
| "entropy": 1.1335331127047539, | |
| "epoch": 0.21087314662273476, | |
| "grad_norm": 0.13298991322517395, | |
| "learning_rate": 0.00047913533834586467, | |
| "loss": 1.1339, | |
| "mean_token_accuracy": 0.6916593015193939, | |
| "num_tokens": 2979921.0, | |
| "step": 112 | |
| }, | |
| { | |
| "entropy": 1.2154437899589539, | |
| "epoch": 0.21275594257472347, | |
| "grad_norm": 0.15994608402252197, | |
| "learning_rate": 0.00047894736842105264, | |
| "loss": 1.2115, | |
| "mean_token_accuracy": 0.679818794131279, | |
| "num_tokens": 3005638.0, | |
| "step": 113 | |
| }, | |
| { | |
| "entropy": 1.215769276022911, | |
| "epoch": 0.21463873852671217, | |
| "grad_norm": 0.11282095313072205, | |
| "learning_rate": 0.0004787593984962406, | |
| "loss": 1.1821, | |
| "mean_token_accuracy": 0.6841456890106201, | |
| "num_tokens": 3033979.0, | |
| "step": 114 | |
| }, | |
| { | |
| "entropy": 1.2786222100257874, | |
| "epoch": 0.21652153447870087, | |
| "grad_norm": 0.13811451196670532, | |
| "learning_rate": 0.0004785714285714286, | |
| "loss": 1.2177, | |
| "mean_token_accuracy": 0.6760591194033623, | |
| "num_tokens": 3060581.0, | |
| "step": 115 | |
| }, | |
| { | |
| "entropy": 1.1782392710447311, | |
| "epoch": 0.21840433043068957, | |
| "grad_norm": 0.12641046941280365, | |
| "learning_rate": 0.00047838345864661654, | |
| "loss": 1.1449, | |
| "mean_token_accuracy": 0.6954788789153099, | |
| "num_tokens": 3086594.0, | |
| "step": 116 | |
| }, | |
| { | |
| "entropy": 1.2415330708026886, | |
| "epoch": 0.22028712638267828, | |
| "grad_norm": 0.1396101415157318, | |
| "learning_rate": 0.0004781954887218045, | |
| "loss": 1.2245, | |
| "mean_token_accuracy": 0.6794020012021065, | |
| "num_tokens": 3114117.0, | |
| "step": 117 | |
| }, | |
| { | |
| "entropy": 1.2689218074083328, | |
| "epoch": 0.22216992233466698, | |
| "grad_norm": 0.13006678223609924, | |
| "learning_rate": 0.00047800751879699253, | |
| "loss": 1.2523, | |
| "mean_token_accuracy": 0.6741964370012283, | |
| "num_tokens": 3140643.0, | |
| "step": 118 | |
| }, | |
| { | |
| "entropy": 1.2656696736812592, | |
| "epoch": 0.22405271828665568, | |
| "grad_norm": 0.15107867121696472, | |
| "learning_rate": 0.0004778195488721805, | |
| "loss": 1.2539, | |
| "mean_token_accuracy": 0.6682558432221413, | |
| "num_tokens": 3166141.0, | |
| "step": 119 | |
| }, | |
| { | |
| "entropy": 1.1993789225816727, | |
| "epoch": 0.22593551423864439, | |
| "grad_norm": 0.11653780192136765, | |
| "learning_rate": 0.0004776315789473684, | |
| "loss": 1.1753, | |
| "mean_token_accuracy": 0.6902748569846153, | |
| "num_tokens": 3193339.0, | |
| "step": 120 | |
| }, | |
| { | |
| "entropy": 1.231392353773117, | |
| "epoch": 0.2278183101906331, | |
| "grad_norm": 0.1314115673303604, | |
| "learning_rate": 0.00047744360902255643, | |
| "loss": 1.2005, | |
| "mean_token_accuracy": 0.6799951046705246, | |
| "num_tokens": 3219993.0, | |
| "step": 121 | |
| }, | |
| { | |
| "entropy": 1.2121622115373611, | |
| "epoch": 0.2297011061426218, | |
| "grad_norm": 0.12394538521766663, | |
| "learning_rate": 0.0004772556390977444, | |
| "loss": 1.1715, | |
| "mean_token_accuracy": 0.6903199851512909, | |
| "num_tokens": 3247444.0, | |
| "step": 122 | |
| }, | |
| { | |
| "entropy": 1.2413930743932724, | |
| "epoch": 0.2315839020946105, | |
| "grad_norm": 0.1266545057296753, | |
| "learning_rate": 0.0004770676691729323, | |
| "loss": 1.1899, | |
| "mean_token_accuracy": 0.682403139770031, | |
| "num_tokens": 3272627.0, | |
| "step": 123 | |
| }, | |
| { | |
| "entropy": 1.1818571537733078, | |
| "epoch": 0.2334666980465992, | |
| "grad_norm": 0.15664935111999512, | |
| "learning_rate": 0.00047687969924812033, | |
| "loss": 1.1479, | |
| "mean_token_accuracy": 0.6944203674793243, | |
| "num_tokens": 3296898.0, | |
| "step": 124 | |
| }, | |
| { | |
| "entropy": 1.2419498413801193, | |
| "epoch": 0.2353494939985879, | |
| "grad_norm": 0.15578152239322662, | |
| "learning_rate": 0.0004766917293233083, | |
| "loss": 1.2335, | |
| "mean_token_accuracy": 0.6732713803648949, | |
| "num_tokens": 3322692.0, | |
| "step": 125 | |
| }, | |
| { | |
| "entropy": 1.2249382436275482, | |
| "epoch": 0.2372322899505766, | |
| "grad_norm": 0.14584508538246155, | |
| "learning_rate": 0.00047650375939849626, | |
| "loss": 1.2124, | |
| "mean_token_accuracy": 0.6797131448984146, | |
| "num_tokens": 3348376.0, | |
| "step": 126 | |
| }, | |
| { | |
| "entropy": 1.2090249583125114, | |
| "epoch": 0.2391150859025653, | |
| "grad_norm": 0.15335120260715485, | |
| "learning_rate": 0.0004763157894736842, | |
| "loss": 1.1861, | |
| "mean_token_accuracy": 0.6816836297512054, | |
| "num_tokens": 3375056.0, | |
| "step": 127 | |
| }, | |
| { | |
| "entropy": 1.2331191301345825, | |
| "epoch": 0.240997881854554, | |
| "grad_norm": 0.13854444026947021, | |
| "learning_rate": 0.0004761278195488722, | |
| "loss": 1.1867, | |
| "mean_token_accuracy": 0.6822093352675438, | |
| "num_tokens": 3401338.0, | |
| "step": 128 | |
| }, | |
| { | |
| "entropy": 1.2083263993263245, | |
| "epoch": 0.2428806778065427, | |
| "grad_norm": 0.1330289989709854, | |
| "learning_rate": 0.00047593984962406016, | |
| "loss": 1.1774, | |
| "mean_token_accuracy": 0.6801193058490753, | |
| "num_tokens": 3426232.0, | |
| "step": 129 | |
| }, | |
| { | |
| "entropy": 1.2008604258298874, | |
| "epoch": 0.2447634737585314, | |
| "grad_norm": 0.14914868772029877, | |
| "learning_rate": 0.00047575187969924813, | |
| "loss": 1.1679, | |
| "mean_token_accuracy": 0.6855365261435509, | |
| "num_tokens": 3454080.0, | |
| "step": 130 | |
| }, | |
| { | |
| "entropy": 1.2279947251081467, | |
| "epoch": 0.24664626971052012, | |
| "grad_norm": 0.18307369947433472, | |
| "learning_rate": 0.0004755639097744361, | |
| "loss": 1.2333, | |
| "mean_token_accuracy": 0.672551229596138, | |
| "num_tokens": 3478258.0, | |
| "step": 131 | |
| }, | |
| { | |
| "entropy": 1.1894963383674622, | |
| "epoch": 0.24852906566250882, | |
| "grad_norm": 0.13398650288581848, | |
| "learning_rate": 0.00047537593984962407, | |
| "loss": 1.1953, | |
| "mean_token_accuracy": 0.6832383349537849, | |
| "num_tokens": 3504254.0, | |
| "step": 132 | |
| }, | |
| { | |
| "entropy": 1.2269657999277115, | |
| "epoch": 0.2504118616144975, | |
| "grad_norm": 0.13811668753623962, | |
| "learning_rate": 0.00047518796992481203, | |
| "loss": 1.1741, | |
| "mean_token_accuracy": 0.6880706697702408, | |
| "num_tokens": 3531225.0, | |
| "step": 133 | |
| }, | |
| { | |
| "entropy": 1.198286533355713, | |
| "epoch": 0.2522946575664862, | |
| "grad_norm": 0.17705924808979034, | |
| "learning_rate": 0.000475, | |
| "loss": 1.1395, | |
| "mean_token_accuracy": 0.691774420440197, | |
| "num_tokens": 3556428.0, | |
| "step": 134 | |
| }, | |
| { | |
| "entropy": 1.2244715094566345, | |
| "epoch": 0.2541774535184749, | |
| "grad_norm": 0.17644067108631134, | |
| "learning_rate": 0.00047481203007518797, | |
| "loss": 1.2204, | |
| "mean_token_accuracy": 0.6757577136158943, | |
| "num_tokens": 3583373.0, | |
| "step": 135 | |
| }, | |
| { | |
| "entropy": 1.208250641822815, | |
| "epoch": 0.25606024947046363, | |
| "grad_norm": 0.12975312769412994, | |
| "learning_rate": 0.00047462406015037593, | |
| "loss": 1.2032, | |
| "mean_token_accuracy": 0.68288903683424, | |
| "num_tokens": 3610878.0, | |
| "step": 136 | |
| }, | |
| { | |
| "entropy": 1.1764077246189117, | |
| "epoch": 0.25794304542245233, | |
| "grad_norm": 0.13420140743255615, | |
| "learning_rate": 0.00047443609022556395, | |
| "loss": 1.1343, | |
| "mean_token_accuracy": 0.6927010640501976, | |
| "num_tokens": 3636794.0, | |
| "step": 137 | |
| }, | |
| { | |
| "entropy": 1.2354558259248734, | |
| "epoch": 0.25982584137444104, | |
| "grad_norm": 0.12880398333072662, | |
| "learning_rate": 0.00047424812030075187, | |
| "loss": 1.1809, | |
| "mean_token_accuracy": 0.682947002351284, | |
| "num_tokens": 3665578.0, | |
| "step": 138 | |
| }, | |
| { | |
| "entropy": 1.175147533416748, | |
| "epoch": 0.26170863732642974, | |
| "grad_norm": 0.15634110569953918, | |
| "learning_rate": 0.00047406015037593983, | |
| "loss": 1.1483, | |
| "mean_token_accuracy": 0.6907549053430557, | |
| "num_tokens": 3691407.0, | |
| "step": 139 | |
| }, | |
| { | |
| "entropy": 1.1331272423267365, | |
| "epoch": 0.26359143327841844, | |
| "grad_norm": 0.13562822341918945, | |
| "learning_rate": 0.00047387218045112786, | |
| "loss": 1.119, | |
| "mean_token_accuracy": 0.6953889951109886, | |
| "num_tokens": 3718468.0, | |
| "step": 140 | |
| }, | |
| { | |
| "entropy": 1.2285344004631042, | |
| "epoch": 0.26547422923040714, | |
| "grad_norm": 0.1443127691745758, | |
| "learning_rate": 0.00047368421052631577, | |
| "loss": 1.2352, | |
| "mean_token_accuracy": 0.6712902784347534, | |
| "num_tokens": 3744121.0, | |
| "step": 141 | |
| }, | |
| { | |
| "entropy": 1.2572973817586899, | |
| "epoch": 0.26735702518239585, | |
| "grad_norm": 0.14697600901126862, | |
| "learning_rate": 0.00047349624060150373, | |
| "loss": 1.2545, | |
| "mean_token_accuracy": 0.6712752804160118, | |
| "num_tokens": 3768665.0, | |
| "step": 142 | |
| }, | |
| { | |
| "entropy": 1.2219904512166977, | |
| "epoch": 0.26923982113438455, | |
| "grad_norm": 0.1259946972131729, | |
| "learning_rate": 0.00047330827067669176, | |
| "loss": 1.1953, | |
| "mean_token_accuracy": 0.6853306293487549, | |
| "num_tokens": 3798421.0, | |
| "step": 143 | |
| }, | |
| { | |
| "entropy": 1.2031358480453491, | |
| "epoch": 0.27112261708637325, | |
| "grad_norm": 0.1336822658777237, | |
| "learning_rate": 0.0004731203007518797, | |
| "loss": 1.1158, | |
| "mean_token_accuracy": 0.7008628249168396, | |
| "num_tokens": 3826569.0, | |
| "step": 144 | |
| }, | |
| { | |
| "entropy": 1.2654242366552353, | |
| "epoch": 0.27300541303836195, | |
| "grad_norm": 0.12933260202407837, | |
| "learning_rate": 0.00047293233082706764, | |
| "loss": 1.2125, | |
| "mean_token_accuracy": 0.6849671006202698, | |
| "num_tokens": 3853128.0, | |
| "step": 145 | |
| }, | |
| { | |
| "entropy": 1.1577993482351303, | |
| "epoch": 0.27488820899035066, | |
| "grad_norm": 0.13406828045845032, | |
| "learning_rate": 0.00047274436090225566, | |
| "loss": 1.1624, | |
| "mean_token_accuracy": 0.6865072473883629, | |
| "num_tokens": 3880569.0, | |
| "step": 146 | |
| }, | |
| { | |
| "entropy": 1.1901942938566208, | |
| "epoch": 0.27677100494233936, | |
| "grad_norm": 0.14410416781902313, | |
| "learning_rate": 0.0004725563909774436, | |
| "loss": 1.2313, | |
| "mean_token_accuracy": 0.6749508231878281, | |
| "num_tokens": 3907559.0, | |
| "step": 147 | |
| }, | |
| { | |
| "entropy": 1.1600831672549248, | |
| "epoch": 0.27865380089432806, | |
| "grad_norm": 0.1339792162179947, | |
| "learning_rate": 0.0004723684210526316, | |
| "loss": 1.1987, | |
| "mean_token_accuracy": 0.6836483106017113, | |
| "num_tokens": 3934255.0, | |
| "step": 148 | |
| }, | |
| { | |
| "entropy": 1.2559089958667755, | |
| "epoch": 0.28053659684631677, | |
| "grad_norm": 0.12650057673454285, | |
| "learning_rate": 0.00047218045112781956, | |
| "loss": 1.2294, | |
| "mean_token_accuracy": 0.6761154308915138, | |
| "num_tokens": 3959809.0, | |
| "step": 149 | |
| }, | |
| { | |
| "entropy": 1.2887302935123444, | |
| "epoch": 0.28241939279830547, | |
| "grad_norm": 0.14123603701591492, | |
| "learning_rate": 0.0004719924812030075, | |
| "loss": 1.1892, | |
| "mean_token_accuracy": 0.6841337457299232, | |
| "num_tokens": 3984834.0, | |
| "step": 150 | |
| }, | |
| { | |
| "entropy": 1.2641656994819641, | |
| "epoch": 0.28430218875029417, | |
| "grad_norm": 0.13069137930870056, | |
| "learning_rate": 0.0004718045112781955, | |
| "loss": 1.178, | |
| "mean_token_accuracy": 0.6903347223997116, | |
| "num_tokens": 4011854.0, | |
| "step": 151 | |
| }, | |
| { | |
| "entropy": 1.2745257169008255, | |
| "epoch": 0.2861849847022829, | |
| "grad_norm": 0.12974441051483154, | |
| "learning_rate": 0.00047161654135338346, | |
| "loss": 1.2299, | |
| "mean_token_accuracy": 0.6787015795707703, | |
| "num_tokens": 4038272.0, | |
| "step": 152 | |
| }, | |
| { | |
| "entropy": 1.2451976537704468, | |
| "epoch": 0.2880677806542716, | |
| "grad_norm": 0.15594416856765747, | |
| "learning_rate": 0.0004714285714285714, | |
| "loss": 1.2506, | |
| "mean_token_accuracy": 0.6727647334337234, | |
| "num_tokens": 4066761.0, | |
| "step": 153 | |
| }, | |
| { | |
| "entropy": 1.1639655232429504, | |
| "epoch": 0.2899505766062603, | |
| "grad_norm": 0.12053865194320679, | |
| "learning_rate": 0.0004712406015037594, | |
| "loss": 1.167, | |
| "mean_token_accuracy": 0.6889369264245033, | |
| "num_tokens": 4094208.0, | |
| "step": 154 | |
| }, | |
| { | |
| "entropy": 1.1459853649139404, | |
| "epoch": 0.291833372558249, | |
| "grad_norm": 0.15322330594062805, | |
| "learning_rate": 0.0004710526315789474, | |
| "loss": 1.1297, | |
| "mean_token_accuracy": 0.691886380314827, | |
| "num_tokens": 4121959.0, | |
| "step": 155 | |
| }, | |
| { | |
| "entropy": 1.2293187081813812, | |
| "epoch": 0.2937161685102377, | |
| "grad_norm": 0.135823056101799, | |
| "learning_rate": 0.0004708646616541353, | |
| "loss": 1.2266, | |
| "mean_token_accuracy": 0.6803058981895447, | |
| "num_tokens": 4147782.0, | |
| "step": 156 | |
| }, | |
| { | |
| "entropy": 1.192505158483982, | |
| "epoch": 0.2955989644622264, | |
| "grad_norm": 0.13535255193710327, | |
| "learning_rate": 0.0004706766917293233, | |
| "loss": 1.1608, | |
| "mean_token_accuracy": 0.6955654844641685, | |
| "num_tokens": 4176277.0, | |
| "step": 157 | |
| }, | |
| { | |
| "entropy": 1.2871312350034714, | |
| "epoch": 0.2974817604142151, | |
| "grad_norm": 0.12719225883483887, | |
| "learning_rate": 0.0004704887218045113, | |
| "loss": 1.2311, | |
| "mean_token_accuracy": 0.6765939891338348, | |
| "num_tokens": 4202697.0, | |
| "step": 158 | |
| }, | |
| { | |
| "entropy": 1.2744830250740051, | |
| "epoch": 0.2993645563662038, | |
| "grad_norm": 0.15343067049980164, | |
| "learning_rate": 0.0004703007518796993, | |
| "loss": 1.2229, | |
| "mean_token_accuracy": 0.671116054058075, | |
| "num_tokens": 4229068.0, | |
| "step": 159 | |
| }, | |
| { | |
| "entropy": 1.2606779783964157, | |
| "epoch": 0.3012473523181925, | |
| "grad_norm": 0.12448015809059143, | |
| "learning_rate": 0.0004701127819548872, | |
| "loss": 1.2061, | |
| "mean_token_accuracy": 0.6829146966338158, | |
| "num_tokens": 4256896.0, | |
| "step": 160 | |
| }, | |
| { | |
| "entropy": 1.150521382689476, | |
| "epoch": 0.3031301482701812, | |
| "grad_norm": 0.1213938444852829, | |
| "learning_rate": 0.0004699248120300752, | |
| "loss": 1.128, | |
| "mean_token_accuracy": 0.6945177465677261, | |
| "num_tokens": 4283765.0, | |
| "step": 161 | |
| }, | |
| { | |
| "entropy": 1.1809571981430054, | |
| "epoch": 0.3050129442221699, | |
| "grad_norm": 0.13989101350307465, | |
| "learning_rate": 0.0004697368421052632, | |
| "loss": 1.1549, | |
| "mean_token_accuracy": 0.6888199374079704, | |
| "num_tokens": 4308970.0, | |
| "step": 162 | |
| }, | |
| { | |
| "entropy": 1.151911549270153, | |
| "epoch": 0.3068957401741586, | |
| "grad_norm": 0.2074657380580902, | |
| "learning_rate": 0.0004695488721804511, | |
| "loss": 1.1309, | |
| "mean_token_accuracy": 0.6942140832543373, | |
| "num_tokens": 4333158.0, | |
| "step": 163 | |
| }, | |
| { | |
| "entropy": 1.1968079656362534, | |
| "epoch": 0.3087785361261473, | |
| "grad_norm": 0.13570360839366913, | |
| "learning_rate": 0.0004693609022556391, | |
| "loss": 1.1814, | |
| "mean_token_accuracy": 0.6869696602225304, | |
| "num_tokens": 4360040.0, | |
| "step": 164 | |
| }, | |
| { | |
| "entropy": 1.1787877827882767, | |
| "epoch": 0.310661332078136, | |
| "grad_norm": 0.13379861414432526, | |
| "learning_rate": 0.0004691729323308271, | |
| "loss": 1.1791, | |
| "mean_token_accuracy": 0.6811994835734367, | |
| "num_tokens": 4386186.0, | |
| "step": 165 | |
| }, | |
| { | |
| "entropy": 1.2168269157409668, | |
| "epoch": 0.3125441280301247, | |
| "grad_norm": 0.1466514617204666, | |
| "learning_rate": 0.00046898496240601505, | |
| "loss": 1.2131, | |
| "mean_token_accuracy": 0.6801121830940247, | |
| "num_tokens": 4412572.0, | |
| "step": 166 | |
| }, | |
| { | |
| "entropy": 1.191074714064598, | |
| "epoch": 0.3144269239821134, | |
| "grad_norm": 0.13052161037921906, | |
| "learning_rate": 0.000468796992481203, | |
| "loss": 1.1818, | |
| "mean_token_accuracy": 0.6877126544713974, | |
| "num_tokens": 4439798.0, | |
| "step": 167 | |
| }, | |
| { | |
| "entropy": 1.310966208577156, | |
| "epoch": 0.3163097199341021, | |
| "grad_norm": 0.14339525997638702, | |
| "learning_rate": 0.000468609022556391, | |
| "loss": 1.2826, | |
| "mean_token_accuracy": 0.6668709591031075, | |
| "num_tokens": 4465182.0, | |
| "step": 168 | |
| }, | |
| { | |
| "entropy": 1.249758929014206, | |
| "epoch": 0.3181925158860908, | |
| "grad_norm": 0.14204370975494385, | |
| "learning_rate": 0.00046842105263157895, | |
| "loss": 1.1944, | |
| "mean_token_accuracy": 0.6822869181632996, | |
| "num_tokens": 4491690.0, | |
| "step": 169 | |
| }, | |
| { | |
| "entropy": 1.2281111925840378, | |
| "epoch": 0.3200753118380795, | |
| "grad_norm": 0.13778182864189148, | |
| "learning_rate": 0.0004682330827067669, | |
| "loss": 1.1821, | |
| "mean_token_accuracy": 0.6827872395515442, | |
| "num_tokens": 4518668.0, | |
| "step": 170 | |
| }, | |
| { | |
| "entropy": 1.1907898932695389, | |
| "epoch": 0.3219581077900682, | |
| "grad_norm": 0.13682714104652405, | |
| "learning_rate": 0.0004680451127819549, | |
| "loss": 1.1654, | |
| "mean_token_accuracy": 0.6878219619393349, | |
| "num_tokens": 4544500.0, | |
| "step": 171 | |
| }, | |
| { | |
| "entropy": 1.2053745537996292, | |
| "epoch": 0.32384090374205693, | |
| "grad_norm": 0.1406177431344986, | |
| "learning_rate": 0.00046785714285714285, | |
| "loss": 1.2351, | |
| "mean_token_accuracy": 0.6759226024150848, | |
| "num_tokens": 4570672.0, | |
| "step": 172 | |
| }, | |
| { | |
| "entropy": 1.1686365455389023, | |
| "epoch": 0.32572369969404563, | |
| "grad_norm": 0.1390364021062851, | |
| "learning_rate": 0.0004676691729323309, | |
| "loss": 1.1563, | |
| "mean_token_accuracy": 0.6870525777339935, | |
| "num_tokens": 4597157.0, | |
| "step": 173 | |
| }, | |
| { | |
| "entropy": 1.1847928017377853, | |
| "epoch": 0.32760649564603433, | |
| "grad_norm": 0.12553362548351288, | |
| "learning_rate": 0.0004674812030075188, | |
| "loss": 1.1464, | |
| "mean_token_accuracy": 0.6896436884999275, | |
| "num_tokens": 4622963.0, | |
| "step": 174 | |
| }, | |
| { | |
| "entropy": 1.2175119668245316, | |
| "epoch": 0.32948929159802304, | |
| "grad_norm": 0.12723615765571594, | |
| "learning_rate": 0.00046729323308270675, | |
| "loss": 1.1887, | |
| "mean_token_accuracy": 0.6839049756526947, | |
| "num_tokens": 4650796.0, | |
| "step": 175 | |
| }, | |
| { | |
| "entropy": 1.2538534700870514, | |
| "epoch": 0.33137208755001174, | |
| "grad_norm": 0.1439773291349411, | |
| "learning_rate": 0.0004671052631578948, | |
| "loss": 1.1796, | |
| "mean_token_accuracy": 0.6849694699048996, | |
| "num_tokens": 4675067.0, | |
| "step": 176 | |
| }, | |
| { | |
| "entropy": 1.2113288342952728, | |
| "epoch": 0.33325488350200044, | |
| "grad_norm": 0.20407459139823914, | |
| "learning_rate": 0.00046691729323308274, | |
| "loss": 1.1616, | |
| "mean_token_accuracy": 0.6856766641139984, | |
| "num_tokens": 4700943.0, | |
| "step": 177 | |
| }, | |
| { | |
| "entropy": 1.1914596557617188, | |
| "epoch": 0.3351376794539892, | |
| "grad_norm": 0.13831955194473267, | |
| "learning_rate": 0.00046672932330827065, | |
| "loss": 1.1938, | |
| "mean_token_accuracy": 0.6882949769496918, | |
| "num_tokens": 4728608.0, | |
| "step": 178 | |
| }, | |
| { | |
| "entropy": 1.1632477790117264, | |
| "epoch": 0.3370204754059779, | |
| "grad_norm": 0.1430656909942627, | |
| "learning_rate": 0.0004665413533834587, | |
| "loss": 1.1745, | |
| "mean_token_accuracy": 0.6857840716838837, | |
| "num_tokens": 4754323.0, | |
| "step": 179 | |
| }, | |
| { | |
| "entropy": 1.1661407798528671, | |
| "epoch": 0.3389032713579666, | |
| "grad_norm": 0.13480572402477264, | |
| "learning_rate": 0.00046635338345864664, | |
| "loss": 1.1677, | |
| "mean_token_accuracy": 0.6842626482248306, | |
| "num_tokens": 4777734.0, | |
| "step": 180 | |
| }, | |
| { | |
| "entropy": 1.2307626903057098, | |
| "epoch": 0.3407860673099553, | |
| "grad_norm": 0.14171424508094788, | |
| "learning_rate": 0.00046616541353383456, | |
| "loss": 1.2112, | |
| "mean_token_accuracy": 0.6779276877641678, | |
| "num_tokens": 4803062.0, | |
| "step": 181 | |
| }, | |
| { | |
| "entropy": 1.2344750761985779, | |
| "epoch": 0.342668863261944, | |
| "grad_norm": 0.1366141438484192, | |
| "learning_rate": 0.0004659774436090226, | |
| "loss": 1.1521, | |
| "mean_token_accuracy": 0.6871028989553452, | |
| "num_tokens": 4828406.0, | |
| "step": 182 | |
| }, | |
| { | |
| "entropy": 1.2267533838748932, | |
| "epoch": 0.3445516592139327, | |
| "grad_norm": 0.12364047765731812, | |
| "learning_rate": 0.00046578947368421054, | |
| "loss": 1.157, | |
| "mean_token_accuracy": 0.6939859166741371, | |
| "num_tokens": 4855048.0, | |
| "step": 183 | |
| }, | |
| { | |
| "entropy": 1.25662961602211, | |
| "epoch": 0.3464344551659214, | |
| "grad_norm": 0.14521241188049316, | |
| "learning_rate": 0.0004656015037593985, | |
| "loss": 1.2005, | |
| "mean_token_accuracy": 0.6837843209505081, | |
| "num_tokens": 4879838.0, | |
| "step": 184 | |
| }, | |
| { | |
| "entropy": 1.1265386119484901, | |
| "epoch": 0.3483172511179101, | |
| "grad_norm": 0.13281729817390442, | |
| "learning_rate": 0.0004654135338345865, | |
| "loss": 1.1245, | |
| "mean_token_accuracy": 0.7005239203572273, | |
| "num_tokens": 4906673.0, | |
| "step": 185 | |
| }, | |
| { | |
| "entropy": 1.1675947606563568, | |
| "epoch": 0.3502000470698988, | |
| "grad_norm": 0.13612613081932068, | |
| "learning_rate": 0.00046522556390977444, | |
| "loss": 1.1783, | |
| "mean_token_accuracy": 0.6867906153202057, | |
| "num_tokens": 4932081.0, | |
| "step": 186 | |
| }, | |
| { | |
| "entropy": 1.1747846454381943, | |
| "epoch": 0.3520828430218875, | |
| "grad_norm": 0.14062775671482086, | |
| "learning_rate": 0.0004650375939849624, | |
| "loss": 1.1849, | |
| "mean_token_accuracy": 0.6804407685995102, | |
| "num_tokens": 4957805.0, | |
| "step": 187 | |
| }, | |
| { | |
| "entropy": 1.3040417283773422, | |
| "epoch": 0.35396563897387623, | |
| "grad_norm": 0.13647155463695526, | |
| "learning_rate": 0.00046484962406015043, | |
| "loss": 1.2723, | |
| "mean_token_accuracy": 0.6708482652902603, | |
| "num_tokens": 4982727.0, | |
| "step": 188 | |
| }, | |
| { | |
| "entropy": 1.273634523153305, | |
| "epoch": 0.35584843492586493, | |
| "grad_norm": 0.2908094823360443, | |
| "learning_rate": 0.00046466165413533835, | |
| "loss": 1.2188, | |
| "mean_token_accuracy": 0.6769787892699242, | |
| "num_tokens": 5008167.0, | |
| "step": 189 | |
| }, | |
| { | |
| "entropy": 1.294351875782013, | |
| "epoch": 0.35773123087785363, | |
| "grad_norm": 0.14780114591121674, | |
| "learning_rate": 0.0004644736842105263, | |
| "loss": 1.2497, | |
| "mean_token_accuracy": 0.6740161553025246, | |
| "num_tokens": 5031994.0, | |
| "step": 190 | |
| }, | |
| { | |
| "entropy": 1.164976328611374, | |
| "epoch": 0.35961402682984234, | |
| "grad_norm": 0.1321694701910019, | |
| "learning_rate": 0.00046428571428571433, | |
| "loss": 1.1297, | |
| "mean_token_accuracy": 0.6937556862831116, | |
| "num_tokens": 5058242.0, | |
| "step": 191 | |
| }, | |
| { | |
| "entropy": 1.1738992556929588, | |
| "epoch": 0.36149682278183104, | |
| "grad_norm": 0.13215236365795135, | |
| "learning_rate": 0.00046409774436090225, | |
| "loss": 1.1639, | |
| "mean_token_accuracy": 0.688830278813839, | |
| "num_tokens": 5086002.0, | |
| "step": 192 | |
| }, | |
| { | |
| "entropy": 1.2423847168684006, | |
| "epoch": 0.36337961873381974, | |
| "grad_norm": 0.13844619691371918, | |
| "learning_rate": 0.0004639097744360902, | |
| "loss": 1.2462, | |
| "mean_token_accuracy": 0.6728790327906609, | |
| "num_tokens": 5115116.0, | |
| "step": 193 | |
| }, | |
| { | |
| "entropy": 1.188772901892662, | |
| "epoch": 0.36526241468580845, | |
| "grad_norm": 0.1350889950990677, | |
| "learning_rate": 0.00046372180451127824, | |
| "loss": 1.162, | |
| "mean_token_accuracy": 0.6961116194725037, | |
| "num_tokens": 5141316.0, | |
| "step": 194 | |
| }, | |
| { | |
| "entropy": 1.2510673254728317, | |
| "epoch": 0.36714521063779715, | |
| "grad_norm": 0.13393868505954742, | |
| "learning_rate": 0.0004635338345864662, | |
| "loss": 1.2165, | |
| "mean_token_accuracy": 0.675739549100399, | |
| "num_tokens": 5168389.0, | |
| "step": 195 | |
| }, | |
| { | |
| "entropy": 1.2140327990055084, | |
| "epoch": 0.36902800658978585, | |
| "grad_norm": 0.15341585874557495, | |
| "learning_rate": 0.0004633458646616541, | |
| "loss": 1.1891, | |
| "mean_token_accuracy": 0.6846036836504936, | |
| "num_tokens": 5196797.0, | |
| "step": 196 | |
| }, | |
| { | |
| "entropy": 1.140480324625969, | |
| "epoch": 0.37091080254177455, | |
| "grad_norm": 0.14681561291217804, | |
| "learning_rate": 0.00046315789473684214, | |
| "loss": 1.1129, | |
| "mean_token_accuracy": 0.7001371458172798, | |
| "num_tokens": 5221689.0, | |
| "step": 197 | |
| }, | |
| { | |
| "entropy": 1.149554505944252, | |
| "epoch": 0.37279359849376326, | |
| "grad_norm": 0.12448862940073013, | |
| "learning_rate": 0.0004629699248120301, | |
| "loss": 1.0918, | |
| "mean_token_accuracy": 0.7011524215340614, | |
| "num_tokens": 5248151.0, | |
| "step": 198 | |
| }, | |
| { | |
| "entropy": 1.1877187192440033, | |
| "epoch": 0.37467639444575196, | |
| "grad_norm": 0.12904192507266998, | |
| "learning_rate": 0.00046278195488721807, | |
| "loss": 1.1381, | |
| "mean_token_accuracy": 0.6980564966797829, | |
| "num_tokens": 5276462.0, | |
| "step": 199 | |
| }, | |
| { | |
| "entropy": 1.1336260885000229, | |
| "epoch": 0.37655919039774066, | |
| "grad_norm": 0.14019370079040527, | |
| "learning_rate": 0.00046259398496240604, | |
| "loss": 1.1408, | |
| "mean_token_accuracy": 0.6882188692688942, | |
| "num_tokens": 5303965.0, | |
| "step": 200 | |
| }, | |
| { | |
| "entropy": 1.142029918730259, | |
| "epoch": 0.37844198634972936, | |
| "grad_norm": 0.12954500317573547, | |
| "learning_rate": 0.000462406015037594, | |
| "loss": 1.1225, | |
| "mean_token_accuracy": 0.7019821628928185, | |
| "num_tokens": 5333147.0, | |
| "step": 201 | |
| }, | |
| { | |
| "entropy": 1.1055554077029228, | |
| "epoch": 0.38032478230171807, | |
| "grad_norm": 0.14525440335273743, | |
| "learning_rate": 0.00046221804511278197, | |
| "loss": 1.0873, | |
| "mean_token_accuracy": 0.6984671205282211, | |
| "num_tokens": 5360603.0, | |
| "step": 202 | |
| }, | |
| { | |
| "entropy": 1.1669521182775497, | |
| "epoch": 0.38220757825370677, | |
| "grad_norm": 0.12719959020614624, | |
| "learning_rate": 0.00046203007518796994, | |
| "loss": 1.1408, | |
| "mean_token_accuracy": 0.6958698183298111, | |
| "num_tokens": 5386882.0, | |
| "step": 203 | |
| }, | |
| { | |
| "entropy": 1.2504252791404724, | |
| "epoch": 0.3840903742056955, | |
| "grad_norm": 0.14054498076438904, | |
| "learning_rate": 0.0004618421052631579, | |
| "loss": 1.2147, | |
| "mean_token_accuracy": 0.6776561290025711, | |
| "num_tokens": 5413184.0, | |
| "step": 204 | |
| }, | |
| { | |
| "entropy": 1.226726457476616, | |
| "epoch": 0.3859731701576842, | |
| "grad_norm": 0.13887910544872284, | |
| "learning_rate": 0.00046165413533834587, | |
| "loss": 1.193, | |
| "mean_token_accuracy": 0.6823991388082504, | |
| "num_tokens": 5438606.0, | |
| "step": 205 | |
| }, | |
| { | |
| "entropy": 1.1875706166028976, | |
| "epoch": 0.3878559661096729, | |
| "grad_norm": 0.14024114608764648, | |
| "learning_rate": 0.0004614661654135339, | |
| "loss": 1.1676, | |
| "mean_token_accuracy": 0.684231162071228, | |
| "num_tokens": 5464123.0, | |
| "step": 206 | |
| }, | |
| { | |
| "entropy": 1.2047923803329468, | |
| "epoch": 0.3897387620616616, | |
| "grad_norm": 0.1310993880033493, | |
| "learning_rate": 0.0004612781954887218, | |
| "loss": 1.1851, | |
| "mean_token_accuracy": 0.6833815798163414, | |
| "num_tokens": 5491426.0, | |
| "step": 207 | |
| }, | |
| { | |
| "entropy": 1.2198069095611572, | |
| "epoch": 0.3916215580136503, | |
| "grad_norm": 0.13591070473194122, | |
| "learning_rate": 0.00046109022556390977, | |
| "loss": 1.2115, | |
| "mean_token_accuracy": 0.6876263841986656, | |
| "num_tokens": 5517873.0, | |
| "step": 208 | |
| }, | |
| { | |
| "entropy": 1.2492990344762802, | |
| "epoch": 0.393504353965639, | |
| "grad_norm": 0.1313110738992691, | |
| "learning_rate": 0.0004609022556390978, | |
| "loss": 1.2303, | |
| "mean_token_accuracy": 0.6741604581475258, | |
| "num_tokens": 5545541.0, | |
| "step": 209 | |
| }, | |
| { | |
| "entropy": 1.2249716967344284, | |
| "epoch": 0.3953871499176277, | |
| "grad_norm": 0.13691024482250214, | |
| "learning_rate": 0.0004607142857142857, | |
| "loss": 1.1994, | |
| "mean_token_accuracy": 0.6825065985321999, | |
| "num_tokens": 5571818.0, | |
| "step": 210 | |
| }, | |
| { | |
| "entropy": 1.2132453471422195, | |
| "epoch": 0.3972699458696164, | |
| "grad_norm": 0.13897888362407684, | |
| "learning_rate": 0.0004605263157894737, | |
| "loss": 1.2105, | |
| "mean_token_accuracy": 0.6761833131313324, | |
| "num_tokens": 5598744.0, | |
| "step": 211 | |
| }, | |
| { | |
| "entropy": 1.1871661990880966, | |
| "epoch": 0.3991527418216051, | |
| "grad_norm": 0.13007131218910217, | |
| "learning_rate": 0.00046033834586466164, | |
| "loss": 1.1726, | |
| "mean_token_accuracy": 0.6834597215056419, | |
| "num_tokens": 5625839.0, | |
| "step": 212 | |
| }, | |
| { | |
| "entropy": 1.1333737969398499, | |
| "epoch": 0.4010355377735938, | |
| "grad_norm": 0.12430460005998611, | |
| "learning_rate": 0.00046015037593984966, | |
| "loss": 1.1019, | |
| "mean_token_accuracy": 0.7014463916420937, | |
| "num_tokens": 5654141.0, | |
| "step": 213 | |
| }, | |
| { | |
| "entropy": 1.2297871708869934, | |
| "epoch": 0.4029183337255825, | |
| "grad_norm": 0.13888096809387207, | |
| "learning_rate": 0.0004599624060150376, | |
| "loss": 1.1764, | |
| "mean_token_accuracy": 0.6898130550980568, | |
| "num_tokens": 5678609.0, | |
| "step": 214 | |
| }, | |
| { | |
| "entropy": 1.2013902068138123, | |
| "epoch": 0.4048011296775712, | |
| "grad_norm": 0.12778723239898682, | |
| "learning_rate": 0.00045977443609022554, | |
| "loss": 1.1552, | |
| "mean_token_accuracy": 0.6898351311683655, | |
| "num_tokens": 5705310.0, | |
| "step": 215 | |
| }, | |
| { | |
| "entropy": 1.2131111025810242, | |
| "epoch": 0.4066839256295599, | |
| "grad_norm": 0.1250849962234497, | |
| "learning_rate": 0.00045958646616541356, | |
| "loss": 1.1997, | |
| "mean_token_accuracy": 0.6817116960883141, | |
| "num_tokens": 5733075.0, | |
| "step": 216 | |
| }, | |
| { | |
| "entropy": 1.195549488067627, | |
| "epoch": 0.4085667215815486, | |
| "grad_norm": 0.14742979407310486, | |
| "learning_rate": 0.00045939849624060153, | |
| "loss": 1.1542, | |
| "mean_token_accuracy": 0.6895313560962677, | |
| "num_tokens": 5758265.0, | |
| "step": 217 | |
| }, | |
| { | |
| "entropy": 1.169806808233261, | |
| "epoch": 0.4104495175335373, | |
| "grad_norm": 0.13026666641235352, | |
| "learning_rate": 0.00045921052631578944, | |
| "loss": 1.1244, | |
| "mean_token_accuracy": 0.6982120722532272, | |
| "num_tokens": 5784948.0, | |
| "step": 218 | |
| }, | |
| { | |
| "entropy": 1.182911455631256, | |
| "epoch": 0.412332313485526, | |
| "grad_norm": 0.13583756983280182, | |
| "learning_rate": 0.00045902255639097746, | |
| "loss": 1.168, | |
| "mean_token_accuracy": 0.6856559291481972, | |
| "num_tokens": 5811165.0, | |
| "step": 219 | |
| }, | |
| { | |
| "entropy": 1.0761431455612183, | |
| "epoch": 0.4142151094375147, | |
| "grad_norm": 0.13843543827533722, | |
| "learning_rate": 0.00045883458646616543, | |
| "loss": 1.0857, | |
| "mean_token_accuracy": 0.7090724036097527, | |
| "num_tokens": 5839268.0, | |
| "step": 220 | |
| }, | |
| { | |
| "entropy": 1.1751226484775543, | |
| "epoch": 0.4160979053895034, | |
| "grad_norm": 0.13362666964530945, | |
| "learning_rate": 0.00045864661654135334, | |
| "loss": 1.1766, | |
| "mean_token_accuracy": 0.6880608201026917, | |
| "num_tokens": 5866181.0, | |
| "step": 221 | |
| }, | |
| { | |
| "entropy": 1.1817846149206161, | |
| "epoch": 0.4179807013414921, | |
| "grad_norm": 0.1283264309167862, | |
| "learning_rate": 0.00045845864661654136, | |
| "loss": 1.1698, | |
| "mean_token_accuracy": 0.6846595779061317, | |
| "num_tokens": 5894863.0, | |
| "step": 222 | |
| }, | |
| { | |
| "entropy": 1.2609765976667404, | |
| "epoch": 0.4198634972934808, | |
| "grad_norm": 0.1493021547794342, | |
| "learning_rate": 0.00045827067669172933, | |
| "loss": 1.2032, | |
| "mean_token_accuracy": 0.6831384673714638, | |
| "num_tokens": 5919134.0, | |
| "step": 223 | |
| }, | |
| { | |
| "entropy": 1.239750549197197, | |
| "epoch": 0.42174629324546953, | |
| "grad_norm": 0.14113545417785645, | |
| "learning_rate": 0.0004580827067669173, | |
| "loss": 1.186, | |
| "mean_token_accuracy": 0.6857739984989166, | |
| "num_tokens": 5944399.0, | |
| "step": 224 | |
| }, | |
| { | |
| "entropy": 1.2144103646278381, | |
| "epoch": 0.42362908919745823, | |
| "grad_norm": 0.13381649553775787, | |
| "learning_rate": 0.00045789473684210527, | |
| "loss": 1.1787, | |
| "mean_token_accuracy": 0.6889763921499252, | |
| "num_tokens": 5969936.0, | |
| "step": 225 | |
| }, | |
| { | |
| "entropy": 1.157375693321228, | |
| "epoch": 0.42551188514944693, | |
| "grad_norm": 0.13331881165504456, | |
| "learning_rate": 0.00045770676691729323, | |
| "loss": 1.1613, | |
| "mean_token_accuracy": 0.6869198232889175, | |
| "num_tokens": 5998086.0, | |
| "step": 226 | |
| }, | |
| { | |
| "entropy": 1.16208166629076, | |
| "epoch": 0.42739468110143564, | |
| "grad_norm": 0.1284441202878952, | |
| "learning_rate": 0.0004575187969924812, | |
| "loss": 1.1593, | |
| "mean_token_accuracy": 0.6875879392027855, | |
| "num_tokens": 6027253.0, | |
| "step": 227 | |
| }, | |
| { | |
| "entropy": 1.1543057709932327, | |
| "epoch": 0.42927747705342434, | |
| "grad_norm": 0.13240714371204376, | |
| "learning_rate": 0.0004573308270676692, | |
| "loss": 1.1397, | |
| "mean_token_accuracy": 0.6932123303413391, | |
| "num_tokens": 6053458.0, | |
| "step": 228 | |
| }, | |
| { | |
| "entropy": 1.2234352231025696, | |
| "epoch": 0.43116027300541304, | |
| "grad_norm": 0.13276036083698273, | |
| "learning_rate": 0.00045714285714285713, | |
| "loss": 1.1783, | |
| "mean_token_accuracy": 0.6839658245444298, | |
| "num_tokens": 6077746.0, | |
| "step": 229 | |
| }, | |
| { | |
| "entropy": 1.2401353865861893, | |
| "epoch": 0.43304306895740174, | |
| "grad_norm": 0.13763296604156494, | |
| "learning_rate": 0.0004569548872180451, | |
| "loss": 1.2126, | |
| "mean_token_accuracy": 0.6801036223769188, | |
| "num_tokens": 6104277.0, | |
| "step": 230 | |
| }, | |
| { | |
| "entropy": 1.1862784177064896, | |
| "epoch": 0.43492586490939045, | |
| "grad_norm": 0.14408177137374878, | |
| "learning_rate": 0.0004567669172932331, | |
| "loss": 1.1804, | |
| "mean_token_accuracy": 0.6879640221595764, | |
| "num_tokens": 6131048.0, | |
| "step": 231 | |
| }, | |
| { | |
| "entropy": 1.2236796170473099, | |
| "epoch": 0.43680866086137915, | |
| "grad_norm": 0.1351345330476761, | |
| "learning_rate": 0.00045657894736842103, | |
| "loss": 1.1814, | |
| "mean_token_accuracy": 0.6808154359459877, | |
| "num_tokens": 6157407.0, | |
| "step": 232 | |
| }, | |
| { | |
| "entropy": 1.2412819564342499, | |
| "epoch": 0.43869145681336785, | |
| "grad_norm": 0.1346222460269928, | |
| "learning_rate": 0.000456390977443609, | |
| "loss": 1.2092, | |
| "mean_token_accuracy": 0.676831878721714, | |
| "num_tokens": 6183884.0, | |
| "step": 233 | |
| }, | |
| { | |
| "entropy": 1.2513677477836609, | |
| "epoch": 0.44057425276535656, | |
| "grad_norm": 0.14077451825141907, | |
| "learning_rate": 0.000456203007518797, | |
| "loss": 1.2274, | |
| "mean_token_accuracy": 0.6783920973539352, | |
| "num_tokens": 6210214.0, | |
| "step": 234 | |
| }, | |
| { | |
| "entropy": 1.1642959266901016, | |
| "epoch": 0.44245704871734526, | |
| "grad_norm": 0.1407959908246994, | |
| "learning_rate": 0.000456015037593985, | |
| "loss": 1.1149, | |
| "mean_token_accuracy": 0.6936823204159737, | |
| "num_tokens": 6237636.0, | |
| "step": 235 | |
| }, | |
| { | |
| "entropy": 1.1751240193843842, | |
| "epoch": 0.44433984466933396, | |
| "grad_norm": 0.1335555762052536, | |
| "learning_rate": 0.0004558270676691729, | |
| "loss": 1.1695, | |
| "mean_token_accuracy": 0.6895338296890259, | |
| "num_tokens": 6263952.0, | |
| "step": 236 | |
| }, | |
| { | |
| "entropy": 1.1486622989177704, | |
| "epoch": 0.44622264062132266, | |
| "grad_norm": 0.17950989305973053, | |
| "learning_rate": 0.0004556390977443609, | |
| "loss": 1.155, | |
| "mean_token_accuracy": 0.6848675832152367, | |
| "num_tokens": 6292031.0, | |
| "step": 237 | |
| }, | |
| { | |
| "entropy": 1.185767188668251, | |
| "epoch": 0.44810543657331137, | |
| "grad_norm": 0.1306653767824173, | |
| "learning_rate": 0.0004554511278195489, | |
| "loss": 1.1606, | |
| "mean_token_accuracy": 0.6900418549776077, | |
| "num_tokens": 6321764.0, | |
| "step": 238 | |
| }, | |
| { | |
| "entropy": 1.2462199479341507, | |
| "epoch": 0.44998823252530007, | |
| "grad_norm": 0.1400284469127655, | |
| "learning_rate": 0.00045526315789473686, | |
| "loss": 1.2094, | |
| "mean_token_accuracy": 0.6798161789774895, | |
| "num_tokens": 6347788.0, | |
| "step": 239 | |
| }, | |
| { | |
| "entropy": 1.2244273871183395, | |
| "epoch": 0.45187102847728877, | |
| "grad_norm": 0.1347157508134842, | |
| "learning_rate": 0.0004550751879699248, | |
| "loss": 1.1674, | |
| "mean_token_accuracy": 0.6886308640241623, | |
| "num_tokens": 6374007.0, | |
| "step": 240 | |
| }, | |
| { | |
| "entropy": 1.2273097336292267, | |
| "epoch": 0.4537538244292775, | |
| "grad_norm": 0.1288744956254959, | |
| "learning_rate": 0.0004548872180451128, | |
| "loss": 1.1775, | |
| "mean_token_accuracy": 0.6868400648236275, | |
| "num_tokens": 6400589.0, | |
| "step": 241 | |
| }, | |
| { | |
| "entropy": 1.2171413898468018, | |
| "epoch": 0.4556366203812662, | |
| "grad_norm": 0.14212685823440552, | |
| "learning_rate": 0.00045469924812030076, | |
| "loss": 1.2173, | |
| "mean_token_accuracy": 0.680756650865078, | |
| "num_tokens": 6428529.0, | |
| "step": 242 | |
| }, | |
| { | |
| "entropy": 1.1739053502678871, | |
| "epoch": 0.4575194163332549, | |
| "grad_norm": 0.13274581730365753, | |
| "learning_rate": 0.0004545112781954887, | |
| "loss": 1.1491, | |
| "mean_token_accuracy": 0.6945304796099663, | |
| "num_tokens": 6456003.0, | |
| "step": 243 | |
| }, | |
| { | |
| "entropy": 1.1879045367240906, | |
| "epoch": 0.4594022122852436, | |
| "grad_norm": 0.14754825830459595, | |
| "learning_rate": 0.0004543233082706767, | |
| "loss": 1.153, | |
| "mean_token_accuracy": 0.6907599717378616, | |
| "num_tokens": 6481488.0, | |
| "step": 244 | |
| }, | |
| { | |
| "entropy": 1.1874423921108246, | |
| "epoch": 0.4612850082372323, | |
| "grad_norm": 0.14292332530021667, | |
| "learning_rate": 0.00045413533834586466, | |
| "loss": 1.1531, | |
| "mean_token_accuracy": 0.6900304704904556, | |
| "num_tokens": 6509304.0, | |
| "step": 245 | |
| }, | |
| { | |
| "entropy": 1.1584448963403702, | |
| "epoch": 0.463167804189221, | |
| "grad_norm": 0.13040532171726227, | |
| "learning_rate": 0.0004539473684210527, | |
| "loss": 1.1492, | |
| "mean_token_accuracy": 0.6877822354435921, | |
| "num_tokens": 6536066.0, | |
| "step": 246 | |
| }, | |
| { | |
| "entropy": 1.1855371445417404, | |
| "epoch": 0.4650506001412097, | |
| "grad_norm": 0.13368549942970276, | |
| "learning_rate": 0.0004537593984962406, | |
| "loss": 1.1777, | |
| "mean_token_accuracy": 0.6852287128567696, | |
| "num_tokens": 6565018.0, | |
| "step": 247 | |
| }, | |
| { | |
| "entropy": 1.1443724185228348, | |
| "epoch": 0.4669333960931984, | |
| "grad_norm": 0.14028339087963104, | |
| "learning_rate": 0.00045357142857142856, | |
| "loss": 1.1356, | |
| "mean_token_accuracy": 0.6946588978171349, | |
| "num_tokens": 6592536.0, | |
| "step": 248 | |
| }, | |
| { | |
| "entropy": 1.1854888200759888, | |
| "epoch": 0.4688161920451871, | |
| "grad_norm": 0.13055366277694702, | |
| "learning_rate": 0.0004533834586466166, | |
| "loss": 1.1731, | |
| "mean_token_accuracy": 0.6873556599020958, | |
| "num_tokens": 6620329.0, | |
| "step": 249 | |
| }, | |
| { | |
| "entropy": 1.1635265052318573, | |
| "epoch": 0.4706989879971758, | |
| "grad_norm": 0.12299590557813644, | |
| "learning_rate": 0.0004531954887218045, | |
| "loss": 1.1174, | |
| "mean_token_accuracy": 0.6956649720668793, | |
| "num_tokens": 6647929.0, | |
| "step": 250 | |
| }, | |
| { | |
| "entropy": 1.1612417101860046, | |
| "epoch": 0.4725817839491645, | |
| "grad_norm": 0.14049823582172394, | |
| "learning_rate": 0.00045300751879699246, | |
| "loss": 1.1348, | |
| "mean_token_accuracy": 0.694083645939827, | |
| "num_tokens": 6674419.0, | |
| "step": 251 | |
| }, | |
| { | |
| "entropy": 1.2213299870491028, | |
| "epoch": 0.4744645799011532, | |
| "grad_norm": 0.13414214551448822, | |
| "learning_rate": 0.0004528195488721805, | |
| "loss": 1.2013, | |
| "mean_token_accuracy": 0.6825797632336617, | |
| "num_tokens": 6701851.0, | |
| "step": 252 | |
| }, | |
| { | |
| "entropy": 1.183507114648819, | |
| "epoch": 0.4763473758531419, | |
| "grad_norm": 0.15232087671756744, | |
| "learning_rate": 0.00045263157894736845, | |
| "loss": 1.162, | |
| "mean_token_accuracy": 0.6850753352046013, | |
| "num_tokens": 6729161.0, | |
| "step": 253 | |
| }, | |
| { | |
| "entropy": 1.0959549844264984, | |
| "epoch": 0.4782301718051306, | |
| "grad_norm": 0.12658758461475372, | |
| "learning_rate": 0.00045244360902255636, | |
| "loss": 1.0808, | |
| "mean_token_accuracy": 0.7000140845775604, | |
| "num_tokens": 6756047.0, | |
| "step": 254 | |
| }, | |
| { | |
| "entropy": 1.193654179573059, | |
| "epoch": 0.4801129677571193, | |
| "grad_norm": 0.14304682612419128, | |
| "learning_rate": 0.0004522556390977444, | |
| "loss": 1.1611, | |
| "mean_token_accuracy": 0.6860647276043892, | |
| "num_tokens": 6782155.0, | |
| "step": 255 | |
| }, | |
| { | |
| "entropy": 1.189740851521492, | |
| "epoch": 0.481995763709108, | |
| "grad_norm": 0.1279287487268448, | |
| "learning_rate": 0.00045206766917293235, | |
| "loss": 1.1533, | |
| "mean_token_accuracy": 0.6969729140400887, | |
| "num_tokens": 6809906.0, | |
| "step": 256 | |
| }, | |
| { | |
| "entropy": 1.1370235309004784, | |
| "epoch": 0.4838785596610967, | |
| "grad_norm": 0.12549139559268951, | |
| "learning_rate": 0.0004518796992481203, | |
| "loss": 1.1005, | |
| "mean_token_accuracy": 0.6986983045935631, | |
| "num_tokens": 6837978.0, | |
| "step": 257 | |
| }, | |
| { | |
| "entropy": 1.1274943947792053, | |
| "epoch": 0.4857613556130854, | |
| "grad_norm": 0.13078007102012634, | |
| "learning_rate": 0.0004516917293233083, | |
| "loss": 1.116, | |
| "mean_token_accuracy": 0.6968672722578049, | |
| "num_tokens": 6863894.0, | |
| "step": 258 | |
| }, | |
| { | |
| "entropy": 1.1707115471363068, | |
| "epoch": 0.4876441515650741, | |
| "grad_norm": 0.13655990362167358, | |
| "learning_rate": 0.00045150375939849625, | |
| "loss": 1.1502, | |
| "mean_token_accuracy": 0.6891424879431725, | |
| "num_tokens": 6889219.0, | |
| "step": 259 | |
| }, | |
| { | |
| "entropy": 1.1765428930521011, | |
| "epoch": 0.4895269475170628, | |
| "grad_norm": 0.13517631590366364, | |
| "learning_rate": 0.0004513157894736842, | |
| "loss": 1.1736, | |
| "mean_token_accuracy": 0.6828250586986542, | |
| "num_tokens": 6915957.0, | |
| "step": 260 | |
| }, | |
| { | |
| "entropy": 1.1622217297554016, | |
| "epoch": 0.49140974346905153, | |
| "grad_norm": 0.1339031159877777, | |
| "learning_rate": 0.0004511278195488722, | |
| "loss": 1.1602, | |
| "mean_token_accuracy": 0.6858406886458397, | |
| "num_tokens": 6942729.0, | |
| "step": 261 | |
| }, | |
| { | |
| "entropy": 1.188800647854805, | |
| "epoch": 0.49329253942104023, | |
| "grad_norm": 0.1516953706741333, | |
| "learning_rate": 0.00045093984962406015, | |
| "loss": 1.1541, | |
| "mean_token_accuracy": 0.6871596127748489, | |
| "num_tokens": 6966884.0, | |
| "step": 262 | |
| }, | |
| { | |
| "entropy": 1.1681264340877533, | |
| "epoch": 0.49517533537302894, | |
| "grad_norm": 0.14556634426116943, | |
| "learning_rate": 0.0004507518796992481, | |
| "loss": 1.1307, | |
| "mean_token_accuracy": 0.6948810294270515, | |
| "num_tokens": 6992842.0, | |
| "step": 263 | |
| }, | |
| { | |
| "entropy": 1.1910002678632736, | |
| "epoch": 0.49705813132501764, | |
| "grad_norm": 0.1371603161096573, | |
| "learning_rate": 0.00045056390977443614, | |
| "loss": 1.1469, | |
| "mean_token_accuracy": 0.6974197626113892, | |
| "num_tokens": 7018704.0, | |
| "step": 264 | |
| }, | |
| { | |
| "entropy": 1.2533641755580902, | |
| "epoch": 0.49894092727700634, | |
| "grad_norm": 0.15122705698013306, | |
| "learning_rate": 0.00045037593984962405, | |
| "loss": 1.1964, | |
| "mean_token_accuracy": 0.6835278943181038, | |
| "num_tokens": 7045985.0, | |
| "step": 265 | |
| }, | |
| { | |
| "entropy": 1.18770419806242, | |
| "epoch": 0.500823723228995, | |
| "grad_norm": 0.1283893585205078, | |
| "learning_rate": 0.000450187969924812, | |
| "loss": 1.1613, | |
| "mean_token_accuracy": 0.6964623779058456, | |
| "num_tokens": 7073668.0, | |
| "step": 266 | |
| }, | |
| { | |
| "entropy": 1.1760464161634445, | |
| "epoch": 0.5027065191809837, | |
| "grad_norm": 0.13645370304584503, | |
| "learning_rate": 0.00045000000000000004, | |
| "loss": 1.181, | |
| "mean_token_accuracy": 0.6851188093423843, | |
| "num_tokens": 7100612.0, | |
| "step": 267 | |
| }, | |
| { | |
| "entropy": 1.1559069901704788, | |
| "epoch": 0.5045893151329724, | |
| "grad_norm": 0.14222431182861328, | |
| "learning_rate": 0.000449812030075188, | |
| "loss": 1.1661, | |
| "mean_token_accuracy": 0.6858489215373993, | |
| "num_tokens": 7127648.0, | |
| "step": 268 | |
| }, | |
| { | |
| "entropy": 1.155109003186226, | |
| "epoch": 0.5064721110849612, | |
| "grad_norm": 0.14752890169620514, | |
| "learning_rate": 0.0004496240601503759, | |
| "loss": 1.1549, | |
| "mean_token_accuracy": 0.6923946589231491, | |
| "num_tokens": 7153048.0, | |
| "step": 269 | |
| }, | |
| { | |
| "entropy": 1.2506433129310608, | |
| "epoch": 0.5083549070369499, | |
| "grad_norm": 0.14298772811889648, | |
| "learning_rate": 0.00044943609022556394, | |
| "loss": 1.193, | |
| "mean_token_accuracy": 0.684316597878933, | |
| "num_tokens": 7177628.0, | |
| "step": 270 | |
| }, | |
| { | |
| "entropy": 1.2653572857379913, | |
| "epoch": 0.5102377029889386, | |
| "grad_norm": 0.167319193482399, | |
| "learning_rate": 0.0004492481203007519, | |
| "loss": 1.1959, | |
| "mean_token_accuracy": 0.6871765851974487, | |
| "num_tokens": 7201577.0, | |
| "step": 271 | |
| }, | |
| { | |
| "entropy": 1.2064370959997177, | |
| "epoch": 0.5121204989409273, | |
| "grad_norm": 0.15246403217315674, | |
| "learning_rate": 0.0004490601503759398, | |
| "loss": 1.1574, | |
| "mean_token_accuracy": 0.6841192170977592, | |
| "num_tokens": 7226259.0, | |
| "step": 272 | |
| }, | |
| { | |
| "entropy": 1.1363181620836258, | |
| "epoch": 0.514003294892916, | |
| "grad_norm": 0.13937003910541534, | |
| "learning_rate": 0.00044887218045112784, | |
| "loss": 1.1257, | |
| "mean_token_accuracy": 0.6941032037138939, | |
| "num_tokens": 7253373.0, | |
| "step": 273 | |
| }, | |
| { | |
| "entropy": 1.1732933074235916, | |
| "epoch": 0.5158860908449047, | |
| "grad_norm": 0.14371132850646973, | |
| "learning_rate": 0.0004486842105263158, | |
| "loss": 1.1715, | |
| "mean_token_accuracy": 0.6919308379292488, | |
| "num_tokens": 7278945.0, | |
| "step": 274 | |
| }, | |
| { | |
| "entropy": 1.175576038658619, | |
| "epoch": 0.5177688867968934, | |
| "grad_norm": 0.1441759318113327, | |
| "learning_rate": 0.0004484962406015038, | |
| "loss": 1.1515, | |
| "mean_token_accuracy": 0.694126233458519, | |
| "num_tokens": 7305391.0, | |
| "step": 275 | |
| }, | |
| { | |
| "entropy": 1.2058104127645493, | |
| "epoch": 0.5196516827488821, | |
| "grad_norm": 0.13355745375156403, | |
| "learning_rate": 0.00044830827067669174, | |
| "loss": 1.1916, | |
| "mean_token_accuracy": 0.687326617538929, | |
| "num_tokens": 7332607.0, | |
| "step": 276 | |
| }, | |
| { | |
| "entropy": 1.2485528588294983, | |
| "epoch": 0.5215344787008708, | |
| "grad_norm": 0.14986877143383026, | |
| "learning_rate": 0.0004481203007518797, | |
| "loss": 1.2103, | |
| "mean_token_accuracy": 0.6793005913496017, | |
| "num_tokens": 7358139.0, | |
| "step": 277 | |
| }, | |
| { | |
| "entropy": 1.187769129872322, | |
| "epoch": 0.5234172746528595, | |
| "grad_norm": 0.14205658435821533, | |
| "learning_rate": 0.0004479323308270677, | |
| "loss": 1.1564, | |
| "mean_token_accuracy": 0.6925127878785133, | |
| "num_tokens": 7384537.0, | |
| "step": 278 | |
| }, | |
| { | |
| "entropy": 1.1303328722715378, | |
| "epoch": 0.5253000706048482, | |
| "grad_norm": 0.14045588672161102, | |
| "learning_rate": 0.00044774436090225565, | |
| "loss": 1.1287, | |
| "mean_token_accuracy": 0.6949460133910179, | |
| "num_tokens": 7411036.0, | |
| "step": 279 | |
| }, | |
| { | |
| "entropy": 1.2028415352106094, | |
| "epoch": 0.5271828665568369, | |
| "grad_norm": 0.1550549864768982, | |
| "learning_rate": 0.0004475563909774436, | |
| "loss": 1.2004, | |
| "mean_token_accuracy": 0.6846116036176682, | |
| "num_tokens": 7437443.0, | |
| "step": 280 | |
| }, | |
| { | |
| "entropy": 1.182666465640068, | |
| "epoch": 0.5290656625088256, | |
| "grad_norm": 0.2469193935394287, | |
| "learning_rate": 0.0004473684210526316, | |
| "loss": 1.1759, | |
| "mean_token_accuracy": 0.6844401434063911, | |
| "num_tokens": 7462227.0, | |
| "step": 281 | |
| }, | |
| { | |
| "entropy": 1.202811524271965, | |
| "epoch": 0.5309484584608143, | |
| "grad_norm": 0.14160913228988647, | |
| "learning_rate": 0.0004471804511278196, | |
| "loss": 1.1957, | |
| "mean_token_accuracy": 0.6817988455295563, | |
| "num_tokens": 7487080.0, | |
| "step": 282 | |
| }, | |
| { | |
| "entropy": 1.1812713742256165, | |
| "epoch": 0.532831254412803, | |
| "grad_norm": 0.15075385570526123, | |
| "learning_rate": 0.0004469924812030075, | |
| "loss": 1.1481, | |
| "mean_token_accuracy": 0.6930856108665466, | |
| "num_tokens": 7511921.0, | |
| "step": 283 | |
| }, | |
| { | |
| "entropy": 1.2214877009391785, | |
| "epoch": 0.5347140503647917, | |
| "grad_norm": 0.1399138867855072, | |
| "learning_rate": 0.0004468045112781955, | |
| "loss": 1.1678, | |
| "mean_token_accuracy": 0.6885346695780754, | |
| "num_tokens": 7538663.0, | |
| "step": 284 | |
| }, | |
| { | |
| "entropy": 1.2207457572221756, | |
| "epoch": 0.5365968463167804, | |
| "grad_norm": 0.16030077636241913, | |
| "learning_rate": 0.0004466165413533835, | |
| "loss": 1.1498, | |
| "mean_token_accuracy": 0.6934774816036224, | |
| "num_tokens": 7563898.0, | |
| "step": 285 | |
| }, | |
| { | |
| "entropy": 1.1787783950567245, | |
| "epoch": 0.5384796422687691, | |
| "grad_norm": 0.13601085543632507, | |
| "learning_rate": 0.00044642857142857147, | |
| "loss": 1.145, | |
| "mean_token_accuracy": 0.6905470564961433, | |
| "num_tokens": 7590702.0, | |
| "step": 286 | |
| }, | |
| { | |
| "entropy": 1.081341713666916, | |
| "epoch": 0.5403624382207578, | |
| "grad_norm": 0.13594649732112885, | |
| "learning_rate": 0.0004462406015037594, | |
| "loss": 1.0881, | |
| "mean_token_accuracy": 0.7003285214304924, | |
| "num_tokens": 7618002.0, | |
| "step": 287 | |
| }, | |
| { | |
| "entropy": 1.1418119072914124, | |
| "epoch": 0.5422452341727465, | |
| "grad_norm": 0.15701550245285034, | |
| "learning_rate": 0.0004460526315789474, | |
| "loss": 1.1544, | |
| "mean_token_accuracy": 0.6906085088849068, | |
| "num_tokens": 7644482.0, | |
| "step": 288 | |
| }, | |
| { | |
| "entropy": 1.1627637073397636, | |
| "epoch": 0.5441280301247352, | |
| "grad_norm": 0.13722968101501465, | |
| "learning_rate": 0.00044586466165413537, | |
| "loss": 1.1586, | |
| "mean_token_accuracy": 0.6932996585965157, | |
| "num_tokens": 7671479.0, | |
| "step": 289 | |
| }, | |
| { | |
| "entropy": 1.1320042312145233, | |
| "epoch": 0.5460108260767239, | |
| "grad_norm": 0.15330596268177032, | |
| "learning_rate": 0.0004456766917293233, | |
| "loss": 1.108, | |
| "mean_token_accuracy": 0.6965923383831978, | |
| "num_tokens": 7697013.0, | |
| "step": 290 | |
| }, | |
| { | |
| "entropy": 1.2310521453619003, | |
| "epoch": 0.5478936220287126, | |
| "grad_norm": 0.14045506715774536, | |
| "learning_rate": 0.00044548872180451125, | |
| "loss": 1.1978, | |
| "mean_token_accuracy": 0.6855576112866402, | |
| "num_tokens": 7722551.0, | |
| "step": 291 | |
| }, | |
| { | |
| "entropy": 1.1880534440279007, | |
| "epoch": 0.5497764179807013, | |
| "grad_norm": 0.14293448626995087, | |
| "learning_rate": 0.00044530075187969927, | |
| "loss": 1.1251, | |
| "mean_token_accuracy": 0.701711505651474, | |
| "num_tokens": 7748016.0, | |
| "step": 292 | |
| }, | |
| { | |
| "entropy": 1.141702115535736, | |
| "epoch": 0.55165921393269, | |
| "grad_norm": 0.1439259648323059, | |
| "learning_rate": 0.00044511278195488724, | |
| "loss": 1.1361, | |
| "mean_token_accuracy": 0.6944170445203781, | |
| "num_tokens": 7774858.0, | |
| "step": 293 | |
| }, | |
| { | |
| "entropy": 1.1963759511709213, | |
| "epoch": 0.5535420098846787, | |
| "grad_norm": 0.15148387849330902, | |
| "learning_rate": 0.00044492481203007515, | |
| "loss": 1.1768, | |
| "mean_token_accuracy": 0.6924594268202782, | |
| "num_tokens": 7800802.0, | |
| "step": 294 | |
| }, | |
| { | |
| "entropy": 1.2073182165622711, | |
| "epoch": 0.5554248058366674, | |
| "grad_norm": 0.14503706991672516, | |
| "learning_rate": 0.00044473684210526317, | |
| "loss": 1.2075, | |
| "mean_token_accuracy": 0.6802205815911293, | |
| "num_tokens": 7825288.0, | |
| "step": 295 | |
| }, | |
| { | |
| "entropy": 1.1897266507148743, | |
| "epoch": 0.5573076017886561, | |
| "grad_norm": 0.13914930820465088, | |
| "learning_rate": 0.00044454887218045114, | |
| "loss": 1.1668, | |
| "mean_token_accuracy": 0.6842218562960625, | |
| "num_tokens": 7853255.0, | |
| "step": 296 | |
| }, | |
| { | |
| "entropy": 1.138252004981041, | |
| "epoch": 0.5591903977406448, | |
| "grad_norm": 0.1277482956647873, | |
| "learning_rate": 0.0004443609022556391, | |
| "loss": 1.095, | |
| "mean_token_accuracy": 0.6993494555354118, | |
| "num_tokens": 7880497.0, | |
| "step": 297 | |
| }, | |
| { | |
| "entropy": 1.1767967641353607, | |
| "epoch": 0.5610731936926335, | |
| "grad_norm": 0.14053884148597717, | |
| "learning_rate": 0.00044417293233082707, | |
| "loss": 1.1443, | |
| "mean_token_accuracy": 0.6948733255267143, | |
| "num_tokens": 7906730.0, | |
| "step": 298 | |
| }, | |
| { | |
| "entropy": 1.2134106159210205, | |
| "epoch": 0.5629559896446222, | |
| "grad_norm": 0.14005884528160095, | |
| "learning_rate": 0.00044398496240601504, | |
| "loss": 1.1822, | |
| "mean_token_accuracy": 0.6892389133572578, | |
| "num_tokens": 7933216.0, | |
| "step": 299 | |
| }, | |
| { | |
| "entropy": 1.1945680975914001, | |
| "epoch": 0.5648387855966109, | |
| "grad_norm": 0.1356893926858902, | |
| "learning_rate": 0.000443796992481203, | |
| "loss": 1.1689, | |
| "mean_token_accuracy": 0.6882117986679077, | |
| "num_tokens": 7960270.0, | |
| "step": 300 | |
| }, | |
| { | |
| "entropy": 1.1890588849782944, | |
| "epoch": 0.5667215815485996, | |
| "grad_norm": 0.14139321446418762, | |
| "learning_rate": 0.000443609022556391, | |
| "loss": 1.1757, | |
| "mean_token_accuracy": 0.6851599663496017, | |
| "num_tokens": 7987900.0, | |
| "step": 301 | |
| }, | |
| { | |
| "entropy": 1.1338028833270073, | |
| "epoch": 0.5686043775005883, | |
| "grad_norm": 0.14264994859695435, | |
| "learning_rate": 0.00044342105263157894, | |
| "loss": 1.1502, | |
| "mean_token_accuracy": 0.6855240687727928, | |
| "num_tokens": 8013351.0, | |
| "step": 302 | |
| }, | |
| { | |
| "entropy": 1.1318519860506058, | |
| "epoch": 0.570487173452577, | |
| "grad_norm": 0.13565586507320404, | |
| "learning_rate": 0.0004432330827067669, | |
| "loss": 1.1165, | |
| "mean_token_accuracy": 0.6999509632587433, | |
| "num_tokens": 8038918.0, | |
| "step": 303 | |
| }, | |
| { | |
| "entropy": 1.2122758030891418, | |
| "epoch": 0.5723699694045657, | |
| "grad_norm": 0.13487568497657776, | |
| "learning_rate": 0.00044304511278195493, | |
| "loss": 1.1738, | |
| "mean_token_accuracy": 0.681725949048996, | |
| "num_tokens": 8066501.0, | |
| "step": 304 | |
| }, | |
| { | |
| "entropy": 1.1797229945659637, | |
| "epoch": 0.5742527653565545, | |
| "grad_norm": 0.13627903163433075, | |
| "learning_rate": 0.00044285714285714284, | |
| "loss": 1.1376, | |
| "mean_token_accuracy": 0.689607098698616, | |
| "num_tokens": 8093242.0, | |
| "step": 305 | |
| }, | |
| { | |
| "entropy": 1.1857865750789642, | |
| "epoch": 0.5761355613085432, | |
| "grad_norm": 0.13779953122138977, | |
| "learning_rate": 0.0004426691729323308, | |
| "loss": 1.1367, | |
| "mean_token_accuracy": 0.6948609203100204, | |
| "num_tokens": 8121053.0, | |
| "step": 306 | |
| }, | |
| { | |
| "entropy": 1.1960344910621643, | |
| "epoch": 0.5780183572605319, | |
| "grad_norm": 0.13792765140533447, | |
| "learning_rate": 0.00044248120300751883, | |
| "loss": 1.1472, | |
| "mean_token_accuracy": 0.6897515431046486, | |
| "num_tokens": 8147832.0, | |
| "step": 307 | |
| }, | |
| { | |
| "entropy": 1.19243024289608, | |
| "epoch": 0.5799011532125206, | |
| "grad_norm": 0.1438818722963333, | |
| "learning_rate": 0.0004422932330827068, | |
| "loss": 1.1905, | |
| "mean_token_accuracy": 0.6858177557587624, | |
| "num_tokens": 8173841.0, | |
| "step": 308 | |
| }, | |
| { | |
| "entropy": 1.211151197552681, | |
| "epoch": 0.5817839491645093, | |
| "grad_norm": 0.1361284852027893, | |
| "learning_rate": 0.0004421052631578947, | |
| "loss": 1.214, | |
| "mean_token_accuracy": 0.67852383852005, | |
| "num_tokens": 8202120.0, | |
| "step": 309 | |
| }, | |
| { | |
| "entropy": 1.1578274965286255, | |
| "epoch": 0.583666745116498, | |
| "grad_norm": 0.14872749149799347, | |
| "learning_rate": 0.00044191729323308273, | |
| "loss": 1.1497, | |
| "mean_token_accuracy": 0.6920148581266403, | |
| "num_tokens": 8229217.0, | |
| "step": 310 | |
| }, | |
| { | |
| "entropy": 1.1631289571523666, | |
| "epoch": 0.5855495410684867, | |
| "grad_norm": 0.15371911227703094, | |
| "learning_rate": 0.0004417293233082707, | |
| "loss": 1.1437, | |
| "mean_token_accuracy": 0.6945102214813232, | |
| "num_tokens": 8254581.0, | |
| "step": 311 | |
| }, | |
| { | |
| "entropy": 1.1813505440950394, | |
| "epoch": 0.5874323370204754, | |
| "grad_norm": 0.14172406494617462, | |
| "learning_rate": 0.0004415413533834586, | |
| "loss": 1.1445, | |
| "mean_token_accuracy": 0.7006291374564171, | |
| "num_tokens": 8280615.0, | |
| "step": 312 | |
| }, | |
| { | |
| "entropy": 1.1823447942733765, | |
| "epoch": 0.5893151329724641, | |
| "grad_norm": 0.14375410974025726, | |
| "learning_rate": 0.00044135338345864663, | |
| "loss": 1.1497, | |
| "mean_token_accuracy": 0.6918843537569046, | |
| "num_tokens": 8307395.0, | |
| "step": 313 | |
| }, | |
| { | |
| "entropy": 1.1527684777975082, | |
| "epoch": 0.5911979289244528, | |
| "grad_norm": 0.1389397829771042, | |
| "learning_rate": 0.0004411654135338346, | |
| "loss": 1.1189, | |
| "mean_token_accuracy": 0.6944358944892883, | |
| "num_tokens": 8332107.0, | |
| "step": 314 | |
| }, | |
| { | |
| "entropy": 1.165027841925621, | |
| "epoch": 0.5930807248764415, | |
| "grad_norm": 0.14531069993972778, | |
| "learning_rate": 0.00044097744360902257, | |
| "loss": 1.161, | |
| "mean_token_accuracy": 0.6896175295114517, | |
| "num_tokens": 8358194.0, | |
| "step": 315 | |
| }, | |
| { | |
| "entropy": 1.2045851200819016, | |
| "epoch": 0.5949635208284302, | |
| "grad_norm": 0.1540374457836151, | |
| "learning_rate": 0.00044078947368421053, | |
| "loss": 1.1797, | |
| "mean_token_accuracy": 0.6859044209122658, | |
| "num_tokens": 8386180.0, | |
| "step": 316 | |
| }, | |
| { | |
| "entropy": 1.194406397640705, | |
| "epoch": 0.5968463167804189, | |
| "grad_norm": 0.14392457902431488, | |
| "learning_rate": 0.0004406015037593985, | |
| "loss": 1.1483, | |
| "mean_token_accuracy": 0.6856495141983032, | |
| "num_tokens": 8412257.0, | |
| "step": 317 | |
| }, | |
| { | |
| "entropy": 1.1843983232975006, | |
| "epoch": 0.5987291127324076, | |
| "grad_norm": 0.12984612584114075, | |
| "learning_rate": 0.00044041353383458647, | |
| "loss": 1.159, | |
| "mean_token_accuracy": 0.6899672672152519, | |
| "num_tokens": 8440139.0, | |
| "step": 318 | |
| }, | |
| { | |
| "entropy": 1.159614846110344, | |
| "epoch": 0.6006119086843963, | |
| "grad_norm": 0.13649439811706543, | |
| "learning_rate": 0.00044022556390977443, | |
| "loss": 1.1277, | |
| "mean_token_accuracy": 0.6980894953012466, | |
| "num_tokens": 8466297.0, | |
| "step": 319 | |
| }, | |
| { | |
| "entropy": 1.1729088872671127, | |
| "epoch": 0.602494704636385, | |
| "grad_norm": 0.14619147777557373, | |
| "learning_rate": 0.0004400375939849624, | |
| "loss": 1.1511, | |
| "mean_token_accuracy": 0.6904428154230118, | |
| "num_tokens": 8492672.0, | |
| "step": 320 | |
| }, | |
| { | |
| "entropy": 1.1907424926757812, | |
| "epoch": 0.6043775005883737, | |
| "grad_norm": 0.14279942214488983, | |
| "learning_rate": 0.00043984962406015037, | |
| "loss": 1.1775, | |
| "mean_token_accuracy": 0.6842730417847633, | |
| "num_tokens": 8521582.0, | |
| "step": 321 | |
| }, | |
| { | |
| "entropy": 1.1668616235256195, | |
| "epoch": 0.6062602965403624, | |
| "grad_norm": 0.1608172506093979, | |
| "learning_rate": 0.0004396616541353384, | |
| "loss": 1.1169, | |
| "mean_token_accuracy": 0.6961806491017342, | |
| "num_tokens": 8549037.0, | |
| "step": 322 | |
| }, | |
| { | |
| "entropy": 1.172086626291275, | |
| "epoch": 0.6081430924923511, | |
| "grad_norm": 0.13843871653079987, | |
| "learning_rate": 0.0004394736842105263, | |
| "loss": 1.1337, | |
| "mean_token_accuracy": 0.6961240246891975, | |
| "num_tokens": 8577320.0, | |
| "step": 323 | |
| }, | |
| { | |
| "entropy": 1.1471307575702667, | |
| "epoch": 0.6100258884443398, | |
| "grad_norm": 0.17384615540504456, | |
| "learning_rate": 0.00043928571428571427, | |
| "loss": 1.132, | |
| "mean_token_accuracy": 0.6966283246874809, | |
| "num_tokens": 8604513.0, | |
| "step": 324 | |
| }, | |
| { | |
| "entropy": 1.1775583177804947, | |
| "epoch": 0.6119086843963285, | |
| "grad_norm": 0.1405702829360962, | |
| "learning_rate": 0.0004390977443609023, | |
| "loss": 1.1713, | |
| "mean_token_accuracy": 0.6833978369832039, | |
| "num_tokens": 8631088.0, | |
| "step": 325 | |
| }, | |
| { | |
| "entropy": 1.1986607536673546, | |
| "epoch": 0.6137914803483172, | |
| "grad_norm": 0.17384964227676392, | |
| "learning_rate": 0.00043890977443609026, | |
| "loss": 1.1903, | |
| "mean_token_accuracy": 0.6892447099089622, | |
| "num_tokens": 8658317.0, | |
| "step": 326 | |
| }, | |
| { | |
| "entropy": 1.1727805137634277, | |
| "epoch": 0.6156742763003059, | |
| "grad_norm": 0.14653940498828888, | |
| "learning_rate": 0.00043872180451127817, | |
| "loss": 1.1706, | |
| "mean_token_accuracy": 0.6892889738082886, | |
| "num_tokens": 8685883.0, | |
| "step": 327 | |
| }, | |
| { | |
| "entropy": 1.1792996972799301, | |
| "epoch": 0.6175570722522946, | |
| "grad_norm": 0.14093339443206787, | |
| "learning_rate": 0.0004385338345864662, | |
| "loss": 1.1659, | |
| "mean_token_accuracy": 0.6881109997630119, | |
| "num_tokens": 8710584.0, | |
| "step": 328 | |
| }, | |
| { | |
| "entropy": 1.1784557923674583, | |
| "epoch": 0.6194398682042833, | |
| "grad_norm": 0.14964358508586884, | |
| "learning_rate": 0.00043834586466165416, | |
| "loss": 1.1098, | |
| "mean_token_accuracy": 0.6995358616113663, | |
| "num_tokens": 8737455.0, | |
| "step": 329 | |
| }, | |
| { | |
| "entropy": 1.2075697928667068, | |
| "epoch": 0.621322664156272, | |
| "grad_norm": 0.14746899902820587, | |
| "learning_rate": 0.00043815789473684207, | |
| "loss": 1.1564, | |
| "mean_token_accuracy": 0.6904364302754402, | |
| "num_tokens": 8764718.0, | |
| "step": 330 | |
| }, | |
| { | |
| "entropy": 1.259048119187355, | |
| "epoch": 0.6232054601082607, | |
| "grad_norm": 0.13727432489395142, | |
| "learning_rate": 0.0004379699248120301, | |
| "loss": 1.2152, | |
| "mean_token_accuracy": 0.6816830709576607, | |
| "num_tokens": 8792699.0, | |
| "step": 331 | |
| }, | |
| { | |
| "entropy": 1.176329106092453, | |
| "epoch": 0.6250882560602494, | |
| "grad_norm": 0.13555607199668884, | |
| "learning_rate": 0.00043778195488721806, | |
| "loss": 1.1337, | |
| "mean_token_accuracy": 0.6938095465302467, | |
| "num_tokens": 8818252.0, | |
| "step": 332 | |
| }, | |
| { | |
| "entropy": 1.1746894717216492, | |
| "epoch": 0.6269710520122381, | |
| "grad_norm": 0.14540338516235352, | |
| "learning_rate": 0.000437593984962406, | |
| "loss": 1.1678, | |
| "mean_token_accuracy": 0.6856407299637794, | |
| "num_tokens": 8843904.0, | |
| "step": 333 | |
| }, | |
| { | |
| "entropy": 1.143667384982109, | |
| "epoch": 0.6288538479642268, | |
| "grad_norm": 0.17852836847305298, | |
| "learning_rate": 0.000437406015037594, | |
| "loss": 1.1471, | |
| "mean_token_accuracy": 0.6907041072845459, | |
| "num_tokens": 8868115.0, | |
| "step": 334 | |
| }, | |
| { | |
| "entropy": 1.1293998435139656, | |
| "epoch": 0.6307366439162155, | |
| "grad_norm": 0.13162344694137573, | |
| "learning_rate": 0.00043721804511278196, | |
| "loss": 1.123, | |
| "mean_token_accuracy": 0.7001049220561981, | |
| "num_tokens": 8894871.0, | |
| "step": 335 | |
| }, | |
| { | |
| "entropy": 1.1313979178667068, | |
| "epoch": 0.6326194398682042, | |
| "grad_norm": 0.1321536898612976, | |
| "learning_rate": 0.0004370300751879699, | |
| "loss": 1.0987, | |
| "mean_token_accuracy": 0.7042840495705605, | |
| "num_tokens": 8921413.0, | |
| "step": 336 | |
| }, | |
| { | |
| "entropy": 1.22024667263031, | |
| "epoch": 0.6345022358201929, | |
| "grad_norm": 0.14904777705669403, | |
| "learning_rate": 0.00043684210526315795, | |
| "loss": 1.1685, | |
| "mean_token_accuracy": 0.6839649677276611, | |
| "num_tokens": 8948016.0, | |
| "step": 337 | |
| }, | |
| { | |
| "entropy": 1.200153261423111, | |
| "epoch": 0.6363850317721816, | |
| "grad_norm": 0.15332205593585968, | |
| "learning_rate": 0.00043665413533834586, | |
| "loss": 1.1599, | |
| "mean_token_accuracy": 0.6898418813943863, | |
| "num_tokens": 8974626.0, | |
| "step": 338 | |
| }, | |
| { | |
| "entropy": 1.148691438138485, | |
| "epoch": 0.6382678277241703, | |
| "grad_norm": 0.1428363174200058, | |
| "learning_rate": 0.00043646616541353383, | |
| "loss": 1.1403, | |
| "mean_token_accuracy": 0.6996031925082207, | |
| "num_tokens": 9001421.0, | |
| "step": 339 | |
| }, | |
| { | |
| "entropy": 1.1665330827236176, | |
| "epoch": 0.640150623676159, | |
| "grad_norm": 0.1439882218837738, | |
| "learning_rate": 0.00043627819548872185, | |
| "loss": 1.1849, | |
| "mean_token_accuracy": 0.6867435649037361, | |
| "num_tokens": 9028615.0, | |
| "step": 340 | |
| }, | |
| { | |
| "entropy": 1.1208850890398026, | |
| "epoch": 0.6420334196281478, | |
| "grad_norm": 0.14697298407554626, | |
| "learning_rate": 0.00043609022556390976, | |
| "loss": 1.1336, | |
| "mean_token_accuracy": 0.6952601596713066, | |
| "num_tokens": 9056227.0, | |
| "step": 341 | |
| }, | |
| { | |
| "entropy": 1.1804025322198868, | |
| "epoch": 0.6439162155801365, | |
| "grad_norm": 0.13762733340263367, | |
| "learning_rate": 0.00043590225563909773, | |
| "loss": 1.1556, | |
| "mean_token_accuracy": 0.6842042878270149, | |
| "num_tokens": 9081334.0, | |
| "step": 342 | |
| }, | |
| { | |
| "entropy": 1.225020870566368, | |
| "epoch": 0.6457990115321252, | |
| "grad_norm": 0.15140774846076965, | |
| "learning_rate": 0.00043571428571428575, | |
| "loss": 1.1576, | |
| "mean_token_accuracy": 0.6892690062522888, | |
| "num_tokens": 9107740.0, | |
| "step": 343 | |
| }, | |
| { | |
| "entropy": 1.178776428103447, | |
| "epoch": 0.6476818074841139, | |
| "grad_norm": 0.14922155439853668, | |
| "learning_rate": 0.0004355263157894737, | |
| "loss": 1.119, | |
| "mean_token_accuracy": 0.6988128572702408, | |
| "num_tokens": 9134004.0, | |
| "step": 344 | |
| }, | |
| { | |
| "entropy": 1.1870884746313095, | |
| "epoch": 0.6495646034361026, | |
| "grad_norm": 0.13645216822624207, | |
| "learning_rate": 0.00043533834586466163, | |
| "loss": 1.1258, | |
| "mean_token_accuracy": 0.7014844194054604, | |
| "num_tokens": 9161858.0, | |
| "step": 345 | |
| }, | |
| { | |
| "entropy": 1.1208381354808807, | |
| "epoch": 0.6514473993880913, | |
| "grad_norm": 0.15188747644424438, | |
| "learning_rate": 0.00043515037593984965, | |
| "loss": 1.126, | |
| "mean_token_accuracy": 0.6888753995299339, | |
| "num_tokens": 9187924.0, | |
| "step": 346 | |
| }, | |
| { | |
| "entropy": 1.1246383488178253, | |
| "epoch": 0.65333019534008, | |
| "grad_norm": 0.18039844930171967, | |
| "learning_rate": 0.0004349624060150376, | |
| "loss": 1.1297, | |
| "mean_token_accuracy": 0.6954269483685493, | |
| "num_tokens": 9213952.0, | |
| "step": 347 | |
| }, | |
| { | |
| "entropy": 1.181724175810814, | |
| "epoch": 0.6552129912920687, | |
| "grad_norm": 0.13552230596542358, | |
| "learning_rate": 0.0004347744360902256, | |
| "loss": 1.185, | |
| "mean_token_accuracy": 0.682334654033184, | |
| "num_tokens": 9240003.0, | |
| "step": 348 | |
| }, | |
| { | |
| "entropy": 1.161278709769249, | |
| "epoch": 0.6570957872440574, | |
| "grad_norm": 0.13721586763858795, | |
| "learning_rate": 0.00043458646616541355, | |
| "loss": 1.1323, | |
| "mean_token_accuracy": 0.6919213533401489, | |
| "num_tokens": 9265180.0, | |
| "step": 349 | |
| }, | |
| { | |
| "entropy": 1.167539969086647, | |
| "epoch": 0.6589785831960461, | |
| "grad_norm": 0.145475372672081, | |
| "learning_rate": 0.0004343984962406015, | |
| "loss": 1.1342, | |
| "mean_token_accuracy": 0.6932244300842285, | |
| "num_tokens": 9291467.0, | |
| "step": 350 | |
| }, | |
| { | |
| "entropy": 1.2319505363702774, | |
| "epoch": 0.6608613791480348, | |
| "grad_norm": 0.13839372992515564, | |
| "learning_rate": 0.0004342105263157895, | |
| "loss": 1.2132, | |
| "mean_token_accuracy": 0.6786127388477325, | |
| "num_tokens": 9317382.0, | |
| "step": 351 | |
| }, | |
| { | |
| "entropy": 1.2023252993822098, | |
| "epoch": 0.6627441751000235, | |
| "grad_norm": 0.1364511102437973, | |
| "learning_rate": 0.00043402255639097745, | |
| "loss": 1.19, | |
| "mean_token_accuracy": 0.6843428909778595, | |
| "num_tokens": 9343464.0, | |
| "step": 352 | |
| }, | |
| { | |
| "entropy": 1.173360899090767, | |
| "epoch": 0.6646269710520122, | |
| "grad_norm": 0.1326543539762497, | |
| "learning_rate": 0.0004338345864661654, | |
| "loss": 1.1469, | |
| "mean_token_accuracy": 0.6877379715442657, | |
| "num_tokens": 9371170.0, | |
| "step": 353 | |
| }, | |
| { | |
| "entropy": 1.1177352517843246, | |
| "epoch": 0.6665097670040009, | |
| "grad_norm": 0.1422666758298874, | |
| "learning_rate": 0.0004336466165413534, | |
| "loss": 1.0994, | |
| "mean_token_accuracy": 0.700407862663269, | |
| "num_tokens": 9397147.0, | |
| "step": 354 | |
| }, | |
| { | |
| "entropy": 1.248588040471077, | |
| "epoch": 0.6683925629559897, | |
| "grad_norm": 0.13168664276599884, | |
| "learning_rate": 0.0004334586466165414, | |
| "loss": 1.2098, | |
| "mean_token_accuracy": 0.6834209859371185, | |
| "num_tokens": 9424363.0, | |
| "step": 355 | |
| }, | |
| { | |
| "entropy": 1.1617062538862228, | |
| "epoch": 0.6702753589079784, | |
| "grad_norm": 0.15483741462230682, | |
| "learning_rate": 0.0004332706766917293, | |
| "loss": 1.114, | |
| "mean_token_accuracy": 0.7020522281527519, | |
| "num_tokens": 9450742.0, | |
| "step": 356 | |
| }, | |
| { | |
| "entropy": 1.1978859603404999, | |
| "epoch": 0.6721581548599671, | |
| "grad_norm": 0.14632469415664673, | |
| "learning_rate": 0.0004330827067669173, | |
| "loss": 1.1847, | |
| "mean_token_accuracy": 0.6837000176310539, | |
| "num_tokens": 9475697.0, | |
| "step": 357 | |
| }, | |
| { | |
| "entropy": 1.1161824762821198, | |
| "epoch": 0.6740409508119558, | |
| "grad_norm": 0.14072488248348236, | |
| "learning_rate": 0.0004328947368421053, | |
| "loss": 1.1272, | |
| "mean_token_accuracy": 0.6974566504359245, | |
| "num_tokens": 9502237.0, | |
| "step": 358 | |
| }, | |
| { | |
| "entropy": 1.1397125273942947, | |
| "epoch": 0.6759237467639445, | |
| "grad_norm": 0.148344486951828, | |
| "learning_rate": 0.0004327067669172932, | |
| "loss": 1.1453, | |
| "mean_token_accuracy": 0.6873810589313507, | |
| "num_tokens": 9528201.0, | |
| "step": 359 | |
| }, | |
| { | |
| "entropy": 1.2197502925992012, | |
| "epoch": 0.6778065427159332, | |
| "grad_norm": 0.14831538498401642, | |
| "learning_rate": 0.0004325187969924812, | |
| "loss": 1.1981, | |
| "mean_token_accuracy": 0.6797335669398308, | |
| "num_tokens": 9553887.0, | |
| "step": 360 | |
| }, | |
| { | |
| "entropy": 1.2503347992897034, | |
| "epoch": 0.6796893386679219, | |
| "grad_norm": 0.14289598166942596, | |
| "learning_rate": 0.0004323308270676692, | |
| "loss": 1.1754, | |
| "mean_token_accuracy": 0.682529591023922, | |
| "num_tokens": 9578439.0, | |
| "step": 361 | |
| }, | |
| { | |
| "entropy": 1.2314954698085785, | |
| "epoch": 0.6815721346199106, | |
| "grad_norm": 0.14386345446109772, | |
| "learning_rate": 0.0004321428571428572, | |
| "loss": 1.1499, | |
| "mean_token_accuracy": 0.6907836198806763, | |
| "num_tokens": 9603444.0, | |
| "step": 362 | |
| }, | |
| { | |
| "entropy": 1.2456393241882324, | |
| "epoch": 0.6834549305718993, | |
| "grad_norm": 0.14364264905452728, | |
| "learning_rate": 0.0004319548872180451, | |
| "loss": 1.1933, | |
| "mean_token_accuracy": 0.6874497607350349, | |
| "num_tokens": 9629030.0, | |
| "step": 363 | |
| }, | |
| { | |
| "entropy": 1.1722253412008286, | |
| "epoch": 0.685337726523888, | |
| "grad_norm": 0.1491105556488037, | |
| "learning_rate": 0.0004317669172932331, | |
| "loss": 1.152, | |
| "mean_token_accuracy": 0.6939368024468422, | |
| "num_tokens": 9656342.0, | |
| "step": 364 | |
| }, | |
| { | |
| "entropy": 1.0892303064465523, | |
| "epoch": 0.6872205224758767, | |
| "grad_norm": 0.14881175756454468, | |
| "learning_rate": 0.0004315789473684211, | |
| "loss": 1.0922, | |
| "mean_token_accuracy": 0.7064904496073723, | |
| "num_tokens": 9680706.0, | |
| "step": 365 | |
| }, | |
| { | |
| "entropy": 1.090978980064392, | |
| "epoch": 0.6891033184278654, | |
| "grad_norm": 0.14446662366390228, | |
| "learning_rate": 0.00043139097744360904, | |
| "loss": 1.1148, | |
| "mean_token_accuracy": 0.696795642375946, | |
| "num_tokens": 9705331.0, | |
| "step": 366 | |
| }, | |
| { | |
| "entropy": 1.1398785412311554, | |
| "epoch": 0.6909861143798541, | |
| "grad_norm": 0.13684354722499847, | |
| "learning_rate": 0.000431203007518797, | |
| "loss": 1.1497, | |
| "mean_token_accuracy": 0.6912109777331352, | |
| "num_tokens": 9732400.0, | |
| "step": 367 | |
| }, | |
| { | |
| "entropy": 1.17644502222538, | |
| "epoch": 0.6928689103318428, | |
| "grad_norm": 0.14162884652614594, | |
| "learning_rate": 0.000431015037593985, | |
| "loss": 1.1495, | |
| "mean_token_accuracy": 0.6945677846670151, | |
| "num_tokens": 9758948.0, | |
| "step": 368 | |
| }, | |
| { | |
| "entropy": 1.1725402027368546, | |
| "epoch": 0.6947517062838315, | |
| "grad_norm": 0.13373105227947235, | |
| "learning_rate": 0.00043082706766917295, | |
| "loss": 1.1186, | |
| "mean_token_accuracy": 0.7017792239785194, | |
| "num_tokens": 9786609.0, | |
| "step": 369 | |
| }, | |
| { | |
| "entropy": 1.1570321172475815, | |
| "epoch": 0.6966345022358202, | |
| "grad_norm": 0.13376620411872864, | |
| "learning_rate": 0.0004306390977443609, | |
| "loss": 1.1169, | |
| "mean_token_accuracy": 0.7013789564371109, | |
| "num_tokens": 9815091.0, | |
| "step": 370 | |
| }, | |
| { | |
| "entropy": 1.2269478738307953, | |
| "epoch": 0.6985172981878089, | |
| "grad_norm": 0.15718406438827515, | |
| "learning_rate": 0.0004304511278195489, | |
| "loss": 1.1795, | |
| "mean_token_accuracy": 0.6809123381972313, | |
| "num_tokens": 9838924.0, | |
| "step": 371 | |
| }, | |
| { | |
| "entropy": 1.2373632341623306, | |
| "epoch": 0.7004000941397976, | |
| "grad_norm": 0.13601046800613403, | |
| "learning_rate": 0.00043026315789473685, | |
| "loss": 1.1897, | |
| "mean_token_accuracy": 0.6842946112155914, | |
| "num_tokens": 9865745.0, | |
| "step": 372 | |
| }, | |
| { | |
| "entropy": 1.2175681740045547, | |
| "epoch": 0.7022828900917863, | |
| "grad_norm": 0.14760908484458923, | |
| "learning_rate": 0.00043007518796992487, | |
| "loss": 1.2027, | |
| "mean_token_accuracy": 0.680089496076107, | |
| "num_tokens": 9891103.0, | |
| "step": 373 | |
| }, | |
| { | |
| "entropy": 1.187382310628891, | |
| "epoch": 0.704165686043775, | |
| "grad_norm": 0.15881404280662537, | |
| "learning_rate": 0.0004298872180451128, | |
| "loss": 1.183, | |
| "mean_token_accuracy": 0.6840859726071358, | |
| "num_tokens": 9916491.0, | |
| "step": 374 | |
| }, | |
| { | |
| "entropy": 1.1363441050052643, | |
| "epoch": 0.7060484819957638, | |
| "grad_norm": 0.14100411534309387, | |
| "learning_rate": 0.00042969924812030075, | |
| "loss": 1.1268, | |
| "mean_token_accuracy": 0.6940664201974869, | |
| "num_tokens": 9943115.0, | |
| "step": 375 | |
| }, | |
| { | |
| "entropy": 1.1373258829116821, | |
| "epoch": 0.7079312779477525, | |
| "grad_norm": 0.14058925211429596, | |
| "learning_rate": 0.00042951127819548877, | |
| "loss": 1.1312, | |
| "mean_token_accuracy": 0.6918314695358276, | |
| "num_tokens": 9971012.0, | |
| "step": 376 | |
| }, | |
| { | |
| "entropy": 1.1753637194633484, | |
| "epoch": 0.7098140738997412, | |
| "grad_norm": 0.15900634229183197, | |
| "learning_rate": 0.00042932330827067674, | |
| "loss": 1.1532, | |
| "mean_token_accuracy": 0.688523419201374, | |
| "num_tokens": 9997158.0, | |
| "step": 377 | |
| }, | |
| { | |
| "entropy": 1.2038870453834534, | |
| "epoch": 0.7116968698517299, | |
| "grad_norm": 0.15579019486904144, | |
| "learning_rate": 0.00042913533834586465, | |
| "loss": 1.1634, | |
| "mean_token_accuracy": 0.6910874620079994, | |
| "num_tokens": 10023904.0, | |
| "step": 378 | |
| }, | |
| { | |
| "entropy": 1.2042047381401062, | |
| "epoch": 0.7135796658037186, | |
| "grad_norm": 0.1458210051059723, | |
| "learning_rate": 0.0004289473684210526, | |
| "loss": 1.1303, | |
| "mean_token_accuracy": 0.6955228298902512, | |
| "num_tokens": 10050044.0, | |
| "step": 379 | |
| }, | |
| { | |
| "entropy": 1.199434906244278, | |
| "epoch": 0.7154624617557073, | |
| "grad_norm": 0.13873904943466187, | |
| "learning_rate": 0.00042875939849624064, | |
| "loss": 1.143, | |
| "mean_token_accuracy": 0.6911288425326347, | |
| "num_tokens": 10077533.0, | |
| "step": 380 | |
| }, | |
| { | |
| "entropy": 1.179319679737091, | |
| "epoch": 0.717345257707696, | |
| "grad_norm": 0.15580423176288605, | |
| "learning_rate": 0.00042857142857142855, | |
| "loss": 1.1516, | |
| "mean_token_accuracy": 0.6900925859808922, | |
| "num_tokens": 10102103.0, | |
| "step": 381 | |
| }, | |
| { | |
| "entropy": 1.1498710662126541, | |
| "epoch": 0.7192280536596847, | |
| "grad_norm": 0.1526648849248886, | |
| "learning_rate": 0.0004283834586466165, | |
| "loss": 1.1463, | |
| "mean_token_accuracy": 0.6923620998859406, | |
| "num_tokens": 10127966.0, | |
| "step": 382 | |
| }, | |
| { | |
| "entropy": 1.2051638066768646, | |
| "epoch": 0.7211108496116734, | |
| "grad_norm": 0.14739763736724854, | |
| "learning_rate": 0.00042819548872180454, | |
| "loss": 1.2125, | |
| "mean_token_accuracy": 0.6824790090322495, | |
| "num_tokens": 10153724.0, | |
| "step": 383 | |
| }, | |
| { | |
| "entropy": 1.148889034986496, | |
| "epoch": 0.7229936455636621, | |
| "grad_norm": 0.13951475918293, | |
| "learning_rate": 0.0004280075187969925, | |
| "loss": 1.1431, | |
| "mean_token_accuracy": 0.6938719674944878, | |
| "num_tokens": 10178827.0, | |
| "step": 384 | |
| }, | |
| { | |
| "entropy": 1.1680803298950195, | |
| "epoch": 0.7248764415156508, | |
| "grad_norm": 0.14505353569984436, | |
| "learning_rate": 0.0004278195488721804, | |
| "loss": 1.1278, | |
| "mean_token_accuracy": 0.6925608888268471, | |
| "num_tokens": 10204362.0, | |
| "step": 385 | |
| }, | |
| { | |
| "entropy": 1.1652754694223404, | |
| "epoch": 0.7267592374676395, | |
| "grad_norm": 0.15343666076660156, | |
| "learning_rate": 0.00042763157894736844, | |
| "loss": 1.1347, | |
| "mean_token_accuracy": 0.6980648785829544, | |
| "num_tokens": 10232975.0, | |
| "step": 386 | |
| }, | |
| { | |
| "entropy": 1.1660331934690475, | |
| "epoch": 0.7286420334196282, | |
| "grad_norm": 0.6029819250106812, | |
| "learning_rate": 0.0004274436090225564, | |
| "loss": 1.1252, | |
| "mean_token_accuracy": 0.6913493424654007, | |
| "num_tokens": 10258684.0, | |
| "step": 387 | |
| }, | |
| { | |
| "entropy": 1.2207347601652145, | |
| "epoch": 0.7305248293716169, | |
| "grad_norm": 0.1639021635055542, | |
| "learning_rate": 0.00042725563909774437, | |
| "loss": 1.2, | |
| "mean_token_accuracy": 0.680275171995163, | |
| "num_tokens": 10284896.0, | |
| "step": 388 | |
| }, | |
| { | |
| "entropy": 1.1547054946422577, | |
| "epoch": 0.7324076253236056, | |
| "grad_norm": 0.13551250100135803, | |
| "learning_rate": 0.00042706766917293234, | |
| "loss": 1.153, | |
| "mean_token_accuracy": 0.6940227970480919, | |
| "num_tokens": 10312039.0, | |
| "step": 389 | |
| }, | |
| { | |
| "entropy": 1.173499509692192, | |
| "epoch": 0.7342904212755943, | |
| "grad_norm": 0.14394164085388184, | |
| "learning_rate": 0.0004268796992481203, | |
| "loss": 1.1401, | |
| "mean_token_accuracy": 0.6948181614279747, | |
| "num_tokens": 10338001.0, | |
| "step": 390 | |
| }, | |
| { | |
| "entropy": 1.108071744441986, | |
| "epoch": 0.736173217227583, | |
| "grad_norm": 0.15528494119644165, | |
| "learning_rate": 0.0004266917293233083, | |
| "loss": 1.0993, | |
| "mean_token_accuracy": 0.7045417055487633, | |
| "num_tokens": 10364257.0, | |
| "step": 391 | |
| }, | |
| { | |
| "entropy": 1.1832116544246674, | |
| "epoch": 0.7380560131795717, | |
| "grad_norm": 0.14551259577274323, | |
| "learning_rate": 0.00042650375939849624, | |
| "loss": 1.1514, | |
| "mean_token_accuracy": 0.6929153054952621, | |
| "num_tokens": 10389671.0, | |
| "step": 392 | |
| }, | |
| { | |
| "entropy": 1.1930436193943024, | |
| "epoch": 0.7399388091315604, | |
| "grad_norm": 0.15499240159988403, | |
| "learning_rate": 0.0004263157894736842, | |
| "loss": 1.1429, | |
| "mean_token_accuracy": 0.688226006925106, | |
| "num_tokens": 10415575.0, | |
| "step": 393 | |
| }, | |
| { | |
| "entropy": 1.2092433124780655, | |
| "epoch": 0.7418216050835491, | |
| "grad_norm": 0.15129360556602478, | |
| "learning_rate": 0.0004261278195488722, | |
| "loss": 1.1844, | |
| "mean_token_accuracy": 0.6808707118034363, | |
| "num_tokens": 10442443.0, | |
| "step": 394 | |
| }, | |
| { | |
| "entropy": 1.293672189116478, | |
| "epoch": 0.7437044010355378, | |
| "grad_norm": 0.1603565663099289, | |
| "learning_rate": 0.0004259398496240602, | |
| "loss": 1.2682, | |
| "mean_token_accuracy": 0.6722560822963715, | |
| "num_tokens": 10466233.0, | |
| "step": 395 | |
| }, | |
| { | |
| "entropy": 1.1358380764722824, | |
| "epoch": 0.7455871969875265, | |
| "grad_norm": 0.1485726684331894, | |
| "learning_rate": 0.0004257518796992481, | |
| "loss": 1.1388, | |
| "mean_token_accuracy": 0.6920513585209846, | |
| "num_tokens": 10491851.0, | |
| "step": 396 | |
| }, | |
| { | |
| "entropy": 1.13677416741848, | |
| "epoch": 0.7474699929395152, | |
| "grad_norm": 0.1432713270187378, | |
| "learning_rate": 0.0004255639097744361, | |
| "loss": 1.1244, | |
| "mean_token_accuracy": 0.6951583921909332, | |
| "num_tokens": 10518737.0, | |
| "step": 397 | |
| }, | |
| { | |
| "entropy": 1.2034449130296707, | |
| "epoch": 0.7493527888915039, | |
| "grad_norm": 0.16076122224330902, | |
| "learning_rate": 0.0004253759398496241, | |
| "loss": 1.2062, | |
| "mean_token_accuracy": 0.6785011366009712, | |
| "num_tokens": 10545857.0, | |
| "step": 398 | |
| }, | |
| { | |
| "entropy": 1.1623305827379227, | |
| "epoch": 0.7512355848434926, | |
| "grad_norm": 0.15050064027309418, | |
| "learning_rate": 0.000425187969924812, | |
| "loss": 1.1163, | |
| "mean_token_accuracy": 0.6948087736964226, | |
| "num_tokens": 10571770.0, | |
| "step": 399 | |
| }, | |
| { | |
| "entropy": 1.1117802858352661, | |
| "epoch": 0.7531183807954813, | |
| "grad_norm": 0.21685755252838135, | |
| "learning_rate": 0.000425, | |
| "loss": 1.0837, | |
| "mean_token_accuracy": 0.7059917375445366, | |
| "num_tokens": 10599528.0, | |
| "step": 400 | |
| }, | |
| { | |
| "entropy": 1.1872282922267914, | |
| "epoch": 0.75500117674747, | |
| "grad_norm": 0.1475781798362732, | |
| "learning_rate": 0.000424812030075188, | |
| "loss": 1.1617, | |
| "mean_token_accuracy": 0.6920499876141548, | |
| "num_tokens": 10625575.0, | |
| "step": 401 | |
| }, | |
| { | |
| "entropy": 1.1875766217708588, | |
| "epoch": 0.7568839726994587, | |
| "grad_norm": 0.15453127026557922, | |
| "learning_rate": 0.00042462406015037596, | |
| "loss": 1.1608, | |
| "mean_token_accuracy": 0.6888900995254517, | |
| "num_tokens": 10650929.0, | |
| "step": 402 | |
| }, | |
| { | |
| "entropy": 1.120169810950756, | |
| "epoch": 0.7587667686514474, | |
| "grad_norm": 0.14685072004795074, | |
| "learning_rate": 0.0004244360902255639, | |
| "loss": 1.0894, | |
| "mean_token_accuracy": 0.700760155916214, | |
| "num_tokens": 10677930.0, | |
| "step": 403 | |
| }, | |
| { | |
| "entropy": 1.178112044930458, | |
| "epoch": 0.7606495646034361, | |
| "grad_norm": 0.15392844378948212, | |
| "learning_rate": 0.0004242481203007519, | |
| "loss": 1.1488, | |
| "mean_token_accuracy": 0.6943765133619308, | |
| "num_tokens": 10701759.0, | |
| "step": 404 | |
| }, | |
| { | |
| "entropy": 1.139440432190895, | |
| "epoch": 0.7625323605554248, | |
| "grad_norm": 0.14876064658164978, | |
| "learning_rate": 0.00042406015037593987, | |
| "loss": 1.1175, | |
| "mean_token_accuracy": 0.6995274350047112, | |
| "num_tokens": 10727920.0, | |
| "step": 405 | |
| }, | |
| { | |
| "entropy": 1.1383692100644112, | |
| "epoch": 0.7644151565074135, | |
| "grad_norm": 0.16769041121006012, | |
| "learning_rate": 0.00042387218045112783, | |
| "loss": 1.1056, | |
| "mean_token_accuracy": 0.6987453699111938, | |
| "num_tokens": 10752826.0, | |
| "step": 406 | |
| }, | |
| { | |
| "entropy": 1.219818040728569, | |
| "epoch": 0.7662979524594022, | |
| "grad_norm": 0.16228246688842773, | |
| "learning_rate": 0.0004236842105263158, | |
| "loss": 1.1982, | |
| "mean_token_accuracy": 0.6772318556904793, | |
| "num_tokens": 10777756.0, | |
| "step": 407 | |
| }, | |
| { | |
| "entropy": 1.1474368646740913, | |
| "epoch": 0.768180748411391, | |
| "grad_norm": 0.14922939240932465, | |
| "learning_rate": 0.00042349624060150377, | |
| "loss": 1.1385, | |
| "mean_token_accuracy": 0.6920562386512756, | |
| "num_tokens": 10804768.0, | |
| "step": 408 | |
| }, | |
| { | |
| "entropy": 1.1331078857183456, | |
| "epoch": 0.7700635443633796, | |
| "grad_norm": 0.1535317599773407, | |
| "learning_rate": 0.00042330827067669173, | |
| "loss": 1.1359, | |
| "mean_token_accuracy": 0.6879219114780426, | |
| "num_tokens": 10830286.0, | |
| "step": 409 | |
| }, | |
| { | |
| "entropy": 1.146752119064331, | |
| "epoch": 0.7719463403153684, | |
| "grad_norm": 0.1524975448846817, | |
| "learning_rate": 0.0004231203007518797, | |
| "loss": 1.1448, | |
| "mean_token_accuracy": 0.6925338879227638, | |
| "num_tokens": 10855720.0, | |
| "step": 410 | |
| }, | |
| { | |
| "entropy": 1.13744555413723, | |
| "epoch": 0.773829136267357, | |
| "grad_norm": 0.16938121616840363, | |
| "learning_rate": 0.00042293233082706767, | |
| "loss": 1.1189, | |
| "mean_token_accuracy": 0.7019513100385666, | |
| "num_tokens": 10881312.0, | |
| "step": 411 | |
| }, | |
| { | |
| "entropy": 1.1643693000078201, | |
| "epoch": 0.7757119322193458, | |
| "grad_norm": 0.134382426738739, | |
| "learning_rate": 0.00042274436090225563, | |
| "loss": 1.1205, | |
| "mean_token_accuracy": 0.7012400701642036, | |
| "num_tokens": 10909609.0, | |
| "step": 412 | |
| }, | |
| { | |
| "entropy": 1.1546955406665802, | |
| "epoch": 0.7775947281713345, | |
| "grad_norm": 0.15923891961574554, | |
| "learning_rate": 0.00042255639097744366, | |
| "loss": 1.1025, | |
| "mean_token_accuracy": 0.7031391486525536, | |
| "num_tokens": 10937878.0, | |
| "step": 413 | |
| }, | |
| { | |
| "entropy": 1.1441723331809044, | |
| "epoch": 0.7794775241233232, | |
| "grad_norm": 0.16663163900375366, | |
| "learning_rate": 0.00042236842105263157, | |
| "loss": 1.1092, | |
| "mean_token_accuracy": 0.6957027688622475, | |
| "num_tokens": 10963268.0, | |
| "step": 414 | |
| }, | |
| { | |
| "entropy": 1.168132722377777, | |
| "epoch": 0.7813603200753119, | |
| "grad_norm": 0.13848932087421417, | |
| "learning_rate": 0.00042218045112781954, | |
| "loss": 1.132, | |
| "mean_token_accuracy": 0.6938114240765572, | |
| "num_tokens": 10990727.0, | |
| "step": 415 | |
| }, | |
| { | |
| "entropy": 1.1057742238044739, | |
| "epoch": 0.7832431160273006, | |
| "grad_norm": 0.13826268911361694, | |
| "learning_rate": 0.00042199248120300756, | |
| "loss": 1.0977, | |
| "mean_token_accuracy": 0.6982015743851662, | |
| "num_tokens": 11017384.0, | |
| "step": 416 | |
| }, | |
| { | |
| "entropy": 1.1963546127080917, | |
| "epoch": 0.7851259119792893, | |
| "grad_norm": 0.1429852694272995, | |
| "learning_rate": 0.0004218045112781955, | |
| "loss": 1.1883, | |
| "mean_token_accuracy": 0.6860344484448433, | |
| "num_tokens": 11045688.0, | |
| "step": 417 | |
| }, | |
| { | |
| "entropy": 1.1521967574954033, | |
| "epoch": 0.787008707931278, | |
| "grad_norm": 0.16643297672271729, | |
| "learning_rate": 0.00042161654135338344, | |
| "loss": 1.1547, | |
| "mean_token_accuracy": 0.6908131241798401, | |
| "num_tokens": 11070352.0, | |
| "step": 418 | |
| }, | |
| { | |
| "entropy": 1.1493701189756393, | |
| "epoch": 0.7888915038832667, | |
| "grad_norm": 0.15780487656593323, | |
| "learning_rate": 0.00042142857142857146, | |
| "loss": 1.1631, | |
| "mean_token_accuracy": 0.6898321136832237, | |
| "num_tokens": 11097217.0, | |
| "step": 419 | |
| }, | |
| { | |
| "entropy": 1.2399737238883972, | |
| "epoch": 0.7907742998352554, | |
| "grad_norm": 0.15339267253875732, | |
| "learning_rate": 0.0004212406015037594, | |
| "loss": 1.206, | |
| "mean_token_accuracy": 0.6820631548762321, | |
| "num_tokens": 11123692.0, | |
| "step": 420 | |
| }, | |
| { | |
| "entropy": 1.1258632093667984, | |
| "epoch": 0.7926570957872441, | |
| "grad_norm": 0.1442951112985611, | |
| "learning_rate": 0.00042105263157894734, | |
| "loss": 1.0869, | |
| "mean_token_accuracy": 0.7083057761192322, | |
| "num_tokens": 11149050.0, | |
| "step": 421 | |
| }, | |
| { | |
| "entropy": 1.2205425053834915, | |
| "epoch": 0.7945398917392328, | |
| "grad_norm": 0.1388903707265854, | |
| "learning_rate": 0.00042086466165413536, | |
| "loss": 1.1843, | |
| "mean_token_accuracy": 0.6856774613261223, | |
| "num_tokens": 11175990.0, | |
| "step": 422 | |
| }, | |
| { | |
| "entropy": 1.1613269746303558, | |
| "epoch": 0.7964226876912215, | |
| "grad_norm": 0.15723979473114014, | |
| "learning_rate": 0.0004206766917293233, | |
| "loss": 1.1238, | |
| "mean_token_accuracy": 0.6957441344857216, | |
| "num_tokens": 11203684.0, | |
| "step": 423 | |
| }, | |
| { | |
| "entropy": 1.15619857609272, | |
| "epoch": 0.7983054836432102, | |
| "grad_norm": 0.16091464459896088, | |
| "learning_rate": 0.0004204887218045113, | |
| "loss": 1.1275, | |
| "mean_token_accuracy": 0.6946544200181961, | |
| "num_tokens": 11230179.0, | |
| "step": 424 | |
| }, | |
| { | |
| "entropy": 1.2017978131771088, | |
| "epoch": 0.8001882795951989, | |
| "grad_norm": 0.15011471509933472, | |
| "learning_rate": 0.00042030075187969926, | |
| "loss": 1.1685, | |
| "mean_token_accuracy": 0.6920702531933784, | |
| "num_tokens": 11256384.0, | |
| "step": 425 | |
| }, | |
| { | |
| "entropy": 1.2229324877262115, | |
| "epoch": 0.8020710755471876, | |
| "grad_norm": 0.14569929242134094, | |
| "learning_rate": 0.0004201127819548872, | |
| "loss": 1.2065, | |
| "mean_token_accuracy": 0.6834921091794968, | |
| "num_tokens": 11284359.0, | |
| "step": 426 | |
| }, | |
| { | |
| "entropy": 1.1204483732581139, | |
| "epoch": 0.8039538714991763, | |
| "grad_norm": 0.14004987478256226, | |
| "learning_rate": 0.0004199248120300752, | |
| "loss": 1.1147, | |
| "mean_token_accuracy": 0.7033949047327042, | |
| "num_tokens": 11313184.0, | |
| "step": 427 | |
| }, | |
| { | |
| "entropy": 1.1141091734170914, | |
| "epoch": 0.805836667451165, | |
| "grad_norm": 0.14807014167308807, | |
| "learning_rate": 0.00041973684210526316, | |
| "loss": 1.1074, | |
| "mean_token_accuracy": 0.6922068670392036, | |
| "num_tokens": 11340757.0, | |
| "step": 428 | |
| }, | |
| { | |
| "entropy": 1.2002304196357727, | |
| "epoch": 0.8077194634031537, | |
| "grad_norm": 0.17711348831653595, | |
| "learning_rate": 0.00041954887218045113, | |
| "loss": 1.1973, | |
| "mean_token_accuracy": 0.6831801310181618, | |
| "num_tokens": 11366871.0, | |
| "step": 429 | |
| }, | |
| { | |
| "entropy": 1.2234468758106232, | |
| "epoch": 0.8096022593551424, | |
| "grad_norm": 0.16027556359767914, | |
| "learning_rate": 0.0004193609022556391, | |
| "loss": 1.1958, | |
| "mean_token_accuracy": 0.6806567907333374, | |
| "num_tokens": 11390392.0, | |
| "step": 430 | |
| }, | |
| { | |
| "entropy": 1.1892322599887848, | |
| "epoch": 0.8114850553071311, | |
| "grad_norm": 0.14892058074474335, | |
| "learning_rate": 0.0004191729323308271, | |
| "loss": 1.124, | |
| "mean_token_accuracy": 0.6932070925831795, | |
| "num_tokens": 11415883.0, | |
| "step": 431 | |
| }, | |
| { | |
| "entropy": 1.1975643932819366, | |
| "epoch": 0.8133678512591198, | |
| "grad_norm": 0.13819143176078796, | |
| "learning_rate": 0.00041898496240601503, | |
| "loss": 1.1446, | |
| "mean_token_accuracy": 0.6961016952991486, | |
| "num_tokens": 11445261.0, | |
| "step": 432 | |
| }, | |
| { | |
| "entropy": 1.231493815779686, | |
| "epoch": 0.8152506472111085, | |
| "grad_norm": 0.14783842861652374, | |
| "learning_rate": 0.000418796992481203, | |
| "loss": 1.1956, | |
| "mean_token_accuracy": 0.6879047080874443, | |
| "num_tokens": 11471660.0, | |
| "step": 433 | |
| }, | |
| { | |
| "entropy": 1.1187082305550575, | |
| "epoch": 0.8171334431630972, | |
| "grad_norm": 0.1379650980234146, | |
| "learning_rate": 0.000418609022556391, | |
| "loss": 1.1226, | |
| "mean_token_accuracy": 0.6993625611066818, | |
| "num_tokens": 11498274.0, | |
| "step": 434 | |
| }, | |
| { | |
| "entropy": 1.272495910525322, | |
| "epoch": 0.8190162391150859, | |
| "grad_norm": 0.1640465259552002, | |
| "learning_rate": 0.000418421052631579, | |
| "loss": 1.2792, | |
| "mean_token_accuracy": 0.6701348200440407, | |
| "num_tokens": 11525102.0, | |
| "step": 435 | |
| }, | |
| { | |
| "entropy": 1.1658570766448975, | |
| "epoch": 0.8208990350670746, | |
| "grad_norm": 0.14112910628318787, | |
| "learning_rate": 0.0004182330827067669, | |
| "loss": 1.171, | |
| "mean_token_accuracy": 0.6936748847365379, | |
| "num_tokens": 11555100.0, | |
| "step": 436 | |
| }, | |
| { | |
| "entropy": 1.2729250341653824, | |
| "epoch": 0.8227818310190633, | |
| "grad_norm": 0.15435785055160522, | |
| "learning_rate": 0.0004180451127819549, | |
| "loss": 1.2133, | |
| "mean_token_accuracy": 0.6812319383025169, | |
| "num_tokens": 11580101.0, | |
| "step": 437 | |
| }, | |
| { | |
| "entropy": 1.13491952419281, | |
| "epoch": 0.824664626971052, | |
| "grad_norm": 0.1388065367937088, | |
| "learning_rate": 0.0004178571428571429, | |
| "loss": 1.091, | |
| "mean_token_accuracy": 0.7023670971393585, | |
| "num_tokens": 11607990.0, | |
| "step": 438 | |
| }, | |
| { | |
| "entropy": 1.1109650805592537, | |
| "epoch": 0.8265474229230407, | |
| "grad_norm": 0.13361488282680511, | |
| "learning_rate": 0.0004176691729323308, | |
| "loss": 1.0797, | |
| "mean_token_accuracy": 0.7052409499883652, | |
| "num_tokens": 11635249.0, | |
| "step": 439 | |
| }, | |
| { | |
| "entropy": 1.128780521452427, | |
| "epoch": 0.8284302188750294, | |
| "grad_norm": 0.14179299771785736, | |
| "learning_rate": 0.0004174812030075188, | |
| "loss": 1.0756, | |
| "mean_token_accuracy": 0.6986876875162125, | |
| "num_tokens": 11661132.0, | |
| "step": 440 | |
| }, | |
| { | |
| "entropy": 1.1229918599128723, | |
| "epoch": 0.8303130148270181, | |
| "grad_norm": 0.13364551961421967, | |
| "learning_rate": 0.0004172932330827068, | |
| "loss": 1.1159, | |
| "mean_token_accuracy": 0.7024848908185959, | |
| "num_tokens": 11688969.0, | |
| "step": 441 | |
| }, | |
| { | |
| "entropy": 1.1451409384608269, | |
| "epoch": 0.8321958107790068, | |
| "grad_norm": 0.15363940596580505, | |
| "learning_rate": 0.00041710526315789475, | |
| "loss": 1.1742, | |
| "mean_token_accuracy": 0.6850685179233551, | |
| "num_tokens": 11714108.0, | |
| "step": 442 | |
| }, | |
| { | |
| "entropy": 1.1217172518372536, | |
| "epoch": 0.8340786067309955, | |
| "grad_norm": 0.1592985838651657, | |
| "learning_rate": 0.0004169172932330827, | |
| "loss": 1.1189, | |
| "mean_token_accuracy": 0.698178730905056, | |
| "num_tokens": 11737727.0, | |
| "step": 443 | |
| }, | |
| { | |
| "entropy": 1.1448046416044235, | |
| "epoch": 0.8359614026829842, | |
| "grad_norm": 0.15717987716197968, | |
| "learning_rate": 0.0004167293233082707, | |
| "loss": 1.1271, | |
| "mean_token_accuracy": 0.696114294230938, | |
| "num_tokens": 11763503.0, | |
| "step": 444 | |
| }, | |
| { | |
| "entropy": 1.1910344362258911, | |
| "epoch": 0.837844198634973, | |
| "grad_norm": 0.1563824862241745, | |
| "learning_rate": 0.00041654135338345865, | |
| "loss": 1.1685, | |
| "mean_token_accuracy": 0.6853935644030571, | |
| "num_tokens": 11788216.0, | |
| "step": 445 | |
| }, | |
| { | |
| "entropy": 1.1520782858133316, | |
| "epoch": 0.8397269945869617, | |
| "grad_norm": 0.15299555659294128, | |
| "learning_rate": 0.0004163533834586467, | |
| "loss": 1.1235, | |
| "mean_token_accuracy": 0.6957945972681046, | |
| "num_tokens": 11813250.0, | |
| "step": 446 | |
| }, | |
| { | |
| "entropy": 1.157516971230507, | |
| "epoch": 0.8416097905389504, | |
| "grad_norm": 0.15409286320209503, | |
| "learning_rate": 0.0004161654135338346, | |
| "loss": 1.1292, | |
| "mean_token_accuracy": 0.6986691579222679, | |
| "num_tokens": 11840547.0, | |
| "step": 447 | |
| }, | |
| { | |
| "entropy": 1.1751955449581146, | |
| "epoch": 0.8434925864909391, | |
| "grad_norm": 0.1436087191104889, | |
| "learning_rate": 0.00041597744360902255, | |
| "loss": 1.1498, | |
| "mean_token_accuracy": 0.692206360399723, | |
| "num_tokens": 11868040.0, | |
| "step": 448 | |
| }, | |
| { | |
| "entropy": 1.1962674707174301, | |
| "epoch": 0.8453753824429278, | |
| "grad_norm": 0.14213787019252777, | |
| "learning_rate": 0.0004157894736842106, | |
| "loss": 1.1349, | |
| "mean_token_accuracy": 0.6944708526134491, | |
| "num_tokens": 11894177.0, | |
| "step": 449 | |
| }, | |
| { | |
| "entropy": 1.201774999499321, | |
| "epoch": 0.8472581783949165, | |
| "grad_norm": 0.15118546783924103, | |
| "learning_rate": 0.0004156015037593985, | |
| "loss": 1.1868, | |
| "mean_token_accuracy": 0.6906943470239639, | |
| "num_tokens": 11920755.0, | |
| "step": 450 | |
| }, | |
| { | |
| "entropy": 1.1439872980117798, | |
| "epoch": 0.8491409743469052, | |
| "grad_norm": 0.1536472737789154, | |
| "learning_rate": 0.00041541353383458646, | |
| "loss": 1.1091, | |
| "mean_token_accuracy": 0.6987525522708893, | |
| "num_tokens": 11946199.0, | |
| "step": 451 | |
| }, | |
| { | |
| "entropy": 1.1865400224924088, | |
| "epoch": 0.8510237702988939, | |
| "grad_norm": 0.16255781054496765, | |
| "learning_rate": 0.0004152255639097745, | |
| "loss": 1.1606, | |
| "mean_token_accuracy": 0.6941612362861633, | |
| "num_tokens": 11970559.0, | |
| "step": 452 | |
| }, | |
| { | |
| "entropy": 1.1555950492620468, | |
| "epoch": 0.8529065662508826, | |
| "grad_norm": 0.15296806395053864, | |
| "learning_rate": 0.00041503759398496244, | |
| "loss": 1.1647, | |
| "mean_token_accuracy": 0.6893363445997238, | |
| "num_tokens": 11998113.0, | |
| "step": 453 | |
| }, | |
| { | |
| "entropy": 1.1035746112465858, | |
| "epoch": 0.8547893622028713, | |
| "grad_norm": 0.13151533901691437, | |
| "learning_rate": 0.00041484962406015036, | |
| "loss": 1.0917, | |
| "mean_token_accuracy": 0.7064924463629723, | |
| "num_tokens": 12025595.0, | |
| "step": 454 | |
| }, | |
| { | |
| "entropy": 1.148128904402256, | |
| "epoch": 0.85667215815486, | |
| "grad_norm": 0.15572930872440338, | |
| "learning_rate": 0.0004146616541353384, | |
| "loss": 1.1516, | |
| "mean_token_accuracy": 0.6970530971884727, | |
| "num_tokens": 12051025.0, | |
| "step": 455 | |
| }, | |
| { | |
| "entropy": 1.1640497595071793, | |
| "epoch": 0.8585549541068487, | |
| "grad_norm": 0.14575503766536713, | |
| "learning_rate": 0.00041447368421052634, | |
| "loss": 1.124, | |
| "mean_token_accuracy": 0.6972140222787857, | |
| "num_tokens": 12080372.0, | |
| "step": 456 | |
| }, | |
| { | |
| "entropy": 1.1797401309013367, | |
| "epoch": 0.8604377500588374, | |
| "grad_norm": 0.1724129319190979, | |
| "learning_rate": 0.0004142857142857143, | |
| "loss": 1.1266, | |
| "mean_token_accuracy": 0.6963677033782005, | |
| "num_tokens": 12107881.0, | |
| "step": 457 | |
| }, | |
| { | |
| "entropy": 1.1369287073612213, | |
| "epoch": 0.8623205460108261, | |
| "grad_norm": 0.1409987360239029, | |
| "learning_rate": 0.0004140977443609022, | |
| "loss": 1.1021, | |
| "mean_token_accuracy": 0.6983814239501953, | |
| "num_tokens": 12136975.0, | |
| "step": 458 | |
| }, | |
| { | |
| "entropy": 1.203329399228096, | |
| "epoch": 0.8642033419628148, | |
| "grad_norm": 0.171426460146904, | |
| "learning_rate": 0.00041390977443609025, | |
| "loss": 1.1796, | |
| "mean_token_accuracy": 0.6895611882209778, | |
| "num_tokens": 12164452.0, | |
| "step": 459 | |
| }, | |
| { | |
| "entropy": 1.1388862580060959, | |
| "epoch": 0.8660861379148035, | |
| "grad_norm": 0.1465880423784256, | |
| "learning_rate": 0.0004137218045112782, | |
| "loss": 1.1449, | |
| "mean_token_accuracy": 0.6952017247676849, | |
| "num_tokens": 12190700.0, | |
| "step": 460 | |
| }, | |
| { | |
| "entropy": 1.165066435933113, | |
| "epoch": 0.8679689338667922, | |
| "grad_norm": 0.1510019600391388, | |
| "learning_rate": 0.0004135338345864661, | |
| "loss": 1.1519, | |
| "mean_token_accuracy": 0.6902508214116096, | |
| "num_tokens": 12216248.0, | |
| "step": 461 | |
| }, | |
| { | |
| "entropy": 1.1071253940463066, | |
| "epoch": 0.8698517298187809, | |
| "grad_norm": 0.1569354087114334, | |
| "learning_rate": 0.00041334586466165415, | |
| "loss": 1.1008, | |
| "mean_token_accuracy": 0.7029130309820175, | |
| "num_tokens": 12242702.0, | |
| "step": 462 | |
| }, | |
| { | |
| "entropy": 1.162157580256462, | |
| "epoch": 0.8717345257707696, | |
| "grad_norm": 0.15269963443279266, | |
| "learning_rate": 0.0004131578947368421, | |
| "loss": 1.1408, | |
| "mean_token_accuracy": 0.6991895586252213, | |
| "num_tokens": 12267065.0, | |
| "step": 463 | |
| }, | |
| { | |
| "entropy": 1.164448007941246, | |
| "epoch": 0.8736173217227583, | |
| "grad_norm": 0.15020480751991272, | |
| "learning_rate": 0.0004129699248120301, | |
| "loss": 1.1331, | |
| "mean_token_accuracy": 0.6945090070366859, | |
| "num_tokens": 12294273.0, | |
| "step": 464 | |
| }, | |
| { | |
| "entropy": 1.194659799337387, | |
| "epoch": 0.875500117674747, | |
| "grad_norm": 0.16067473590373993, | |
| "learning_rate": 0.00041278195488721805, | |
| "loss": 1.1384, | |
| "mean_token_accuracy": 0.692974790930748, | |
| "num_tokens": 12319075.0, | |
| "step": 465 | |
| }, | |
| { | |
| "entropy": 1.1572427451610565, | |
| "epoch": 0.8773829136267357, | |
| "grad_norm": 0.14344556629657745, | |
| "learning_rate": 0.000412593984962406, | |
| "loss": 1.1239, | |
| "mean_token_accuracy": 0.6996137872338295, | |
| "num_tokens": 12345047.0, | |
| "step": 466 | |
| }, | |
| { | |
| "entropy": 1.1310506239533424, | |
| "epoch": 0.8792657095787244, | |
| "grad_norm": 0.1469915211200714, | |
| "learning_rate": 0.000412406015037594, | |
| "loss": 1.1117, | |
| "mean_token_accuracy": 0.6948174610733986, | |
| "num_tokens": 12371084.0, | |
| "step": 467 | |
| }, | |
| { | |
| "entropy": 1.1873999759554863, | |
| "epoch": 0.8811485055307131, | |
| "grad_norm": 0.14283262193202972, | |
| "learning_rate": 0.00041221804511278195, | |
| "loss": 1.1725, | |
| "mean_token_accuracy": 0.6882406696677208, | |
| "num_tokens": 12397086.0, | |
| "step": 468 | |
| }, | |
| { | |
| "entropy": 1.1660784780979156, | |
| "epoch": 0.8830313014827018, | |
| "grad_norm": 0.1400137096643448, | |
| "learning_rate": 0.0004120300751879699, | |
| "loss": 1.1305, | |
| "mean_token_accuracy": 0.6928488984704018, | |
| "num_tokens": 12424840.0, | |
| "step": 469 | |
| }, | |
| { | |
| "entropy": 1.1689551174640656, | |
| "epoch": 0.8849140974346905, | |
| "grad_norm": 0.17401744425296783, | |
| "learning_rate": 0.0004118421052631579, | |
| "loss": 1.1356, | |
| "mean_token_accuracy": 0.6973849907517433, | |
| "num_tokens": 12453038.0, | |
| "step": 470 | |
| }, | |
| { | |
| "entropy": 1.16590516269207, | |
| "epoch": 0.8867968933866792, | |
| "grad_norm": 0.15749803185462952, | |
| "learning_rate": 0.0004116541353383459, | |
| "loss": 1.1388, | |
| "mean_token_accuracy": 0.690193310379982, | |
| "num_tokens": 12479755.0, | |
| "step": 471 | |
| }, | |
| { | |
| "entropy": 1.1534086763858795, | |
| "epoch": 0.8886796893386679, | |
| "grad_norm": 0.13575902581214905, | |
| "learning_rate": 0.0004114661654135338, | |
| "loss": 1.1333, | |
| "mean_token_accuracy": 0.6930194050073624, | |
| "num_tokens": 12507911.0, | |
| "step": 472 | |
| }, | |
| { | |
| "entropy": 1.166767194867134, | |
| "epoch": 0.8905624852906566, | |
| "grad_norm": 0.14083941280841827, | |
| "learning_rate": 0.0004112781954887218, | |
| "loss": 1.1433, | |
| "mean_token_accuracy": 0.6883162334561348, | |
| "num_tokens": 12534740.0, | |
| "step": 473 | |
| }, | |
| { | |
| "entropy": 1.116583712399006, | |
| "epoch": 0.8924452812426453, | |
| "grad_norm": 0.18177185952663422, | |
| "learning_rate": 0.0004110902255639098, | |
| "loss": 1.1013, | |
| "mean_token_accuracy": 0.6984972059726715, | |
| "num_tokens": 12560495.0, | |
| "step": 474 | |
| }, | |
| { | |
| "entropy": 1.0962852016091347, | |
| "epoch": 0.894328077194634, | |
| "grad_norm": 0.15513888001441956, | |
| "learning_rate": 0.00041090225563909777, | |
| "loss": 1.0659, | |
| "mean_token_accuracy": 0.7114295363426208, | |
| "num_tokens": 12586806.0, | |
| "step": 475 | |
| }, | |
| { | |
| "entropy": 1.1862118691205978, | |
| "epoch": 0.8962108731466227, | |
| "grad_norm": 0.1506270319223404, | |
| "learning_rate": 0.0004107142857142857, | |
| "loss": 1.1887, | |
| "mean_token_accuracy": 0.6871896237134933, | |
| "num_tokens": 12612493.0, | |
| "step": 476 | |
| }, | |
| { | |
| "entropy": 1.1081865057349205, | |
| "epoch": 0.8980936690986114, | |
| "grad_norm": 0.14710566401481628, | |
| "learning_rate": 0.0004105263157894737, | |
| "loss": 1.1012, | |
| "mean_token_accuracy": 0.6983359083533287, | |
| "num_tokens": 12639626.0, | |
| "step": 477 | |
| }, | |
| { | |
| "entropy": 1.128834992647171, | |
| "epoch": 0.8999764650506001, | |
| "grad_norm": 0.14161938428878784, | |
| "learning_rate": 0.00041033834586466167, | |
| "loss": 1.0982, | |
| "mean_token_accuracy": 0.7014680877327919, | |
| "num_tokens": 12664733.0, | |
| "step": 478 | |
| }, | |
| { | |
| "entropy": 1.1446367651224136, | |
| "epoch": 0.9018592610025888, | |
| "grad_norm": 0.14254848659038544, | |
| "learning_rate": 0.0004101503759398496, | |
| "loss": 1.082, | |
| "mean_token_accuracy": 0.7081187888979912, | |
| "num_tokens": 12690384.0, | |
| "step": 479 | |
| }, | |
| { | |
| "entropy": 1.2071665897965431, | |
| "epoch": 0.9037420569545775, | |
| "grad_norm": 0.1451028734445572, | |
| "learning_rate": 0.0004099624060150376, | |
| "loss": 1.1573, | |
| "mean_token_accuracy": 0.6878824383020401, | |
| "num_tokens": 12717190.0, | |
| "step": 480 | |
| }, | |
| { | |
| "entropy": 1.1643542423844337, | |
| "epoch": 0.9056248529065662, | |
| "grad_norm": 0.16808035969734192, | |
| "learning_rate": 0.0004097744360902256, | |
| "loss": 1.1289, | |
| "mean_token_accuracy": 0.6955900862812996, | |
| "num_tokens": 12744287.0, | |
| "step": 481 | |
| }, | |
| { | |
| "entropy": 1.1430502980947495, | |
| "epoch": 0.907507648858555, | |
| "grad_norm": 0.14388366043567657, | |
| "learning_rate": 0.00040958646616541354, | |
| "loss": 1.1377, | |
| "mean_token_accuracy": 0.6985258162021637, | |
| "num_tokens": 12769478.0, | |
| "step": 482 | |
| }, | |
| { | |
| "entropy": 1.171137735247612, | |
| "epoch": 0.9093904448105437, | |
| "grad_norm": 0.14661596715450287, | |
| "learning_rate": 0.0004093984962406015, | |
| "loss": 1.1764, | |
| "mean_token_accuracy": 0.6929311379790306, | |
| "num_tokens": 12795715.0, | |
| "step": 483 | |
| }, | |
| { | |
| "entropy": 1.159026637673378, | |
| "epoch": 0.9112732407625324, | |
| "grad_norm": 0.14750456809997559, | |
| "learning_rate": 0.0004092105263157895, | |
| "loss": 1.1578, | |
| "mean_token_accuracy": 0.6937888264656067, | |
| "num_tokens": 12821869.0, | |
| "step": 484 | |
| }, | |
| { | |
| "entropy": 1.1478636413812637, | |
| "epoch": 0.9131560367145211, | |
| "grad_norm": 0.14371232688426971, | |
| "learning_rate": 0.00040902255639097744, | |
| "loss": 1.1218, | |
| "mean_token_accuracy": 0.7008863463997841, | |
| "num_tokens": 12848215.0, | |
| "step": 485 | |
| }, | |
| { | |
| "entropy": 1.120044082403183, | |
| "epoch": 0.9150388326665098, | |
| "grad_norm": 0.1404104232788086, | |
| "learning_rate": 0.00040883458646616546, | |
| "loss": 1.0728, | |
| "mean_token_accuracy": 0.7091679647564888, | |
| "num_tokens": 12876182.0, | |
| "step": 486 | |
| }, | |
| { | |
| "entropy": 1.1101247519254684, | |
| "epoch": 0.9169216286184985, | |
| "grad_norm": 0.1421038955450058, | |
| "learning_rate": 0.0004086466165413534, | |
| "loss": 1.0967, | |
| "mean_token_accuracy": 0.7037186399102211, | |
| "num_tokens": 12902501.0, | |
| "step": 487 | |
| }, | |
| { | |
| "entropy": 1.1512123197317123, | |
| "epoch": 0.9188044245704872, | |
| "grad_norm": 0.14930035173892975, | |
| "learning_rate": 0.00040845864661654134, | |
| "loss": 1.1259, | |
| "mean_token_accuracy": 0.6954185292124748, | |
| "num_tokens": 12928275.0, | |
| "step": 488 | |
| }, | |
| { | |
| "entropy": 1.136143758893013, | |
| "epoch": 0.9206872205224759, | |
| "grad_norm": 0.1431557983160019, | |
| "learning_rate": 0.00040827067669172936, | |
| "loss": 1.1053, | |
| "mean_token_accuracy": 0.7004474848508835, | |
| "num_tokens": 12954596.0, | |
| "step": 489 | |
| }, | |
| { | |
| "entropy": 1.1639841794967651, | |
| "epoch": 0.9225700164744646, | |
| "grad_norm": 0.1477883905172348, | |
| "learning_rate": 0.0004080827067669173, | |
| "loss": 1.129, | |
| "mean_token_accuracy": 0.6972065195441246, | |
| "num_tokens": 12980318.0, | |
| "step": 490 | |
| }, | |
| { | |
| "entropy": 1.162917599081993, | |
| "epoch": 0.9244528124264533, | |
| "grad_norm": 0.14567728340625763, | |
| "learning_rate": 0.00040789473684210524, | |
| "loss": 1.1503, | |
| "mean_token_accuracy": 0.6907480135560036, | |
| "num_tokens": 13006238.0, | |
| "step": 491 | |
| }, | |
| { | |
| "entropy": 1.1558719277381897, | |
| "epoch": 0.926335608378442, | |
| "grad_norm": 0.1421021670103073, | |
| "learning_rate": 0.00040770676691729326, | |
| "loss": 1.1429, | |
| "mean_token_accuracy": 0.6948621720075607, | |
| "num_tokens": 13034071.0, | |
| "step": 492 | |
| }, | |
| { | |
| "entropy": 1.175887256860733, | |
| "epoch": 0.9282184043304307, | |
| "grad_norm": 0.14368657767772675, | |
| "learning_rate": 0.00040751879699248123, | |
| "loss": 1.1752, | |
| "mean_token_accuracy": 0.6898396164178848, | |
| "num_tokens": 13059425.0, | |
| "step": 493 | |
| }, | |
| { | |
| "entropy": 1.1281049996614456, | |
| "epoch": 0.9301012002824194, | |
| "grad_norm": 0.13681703805923462, | |
| "learning_rate": 0.00040733082706766914, | |
| "loss": 1.1437, | |
| "mean_token_accuracy": 0.6920712366700172, | |
| "num_tokens": 13087803.0, | |
| "step": 494 | |
| }, | |
| { | |
| "entropy": 1.1919779032468796, | |
| "epoch": 0.9319839962344081, | |
| "grad_norm": 0.14613422751426697, | |
| "learning_rate": 0.00040714285714285717, | |
| "loss": 1.1647, | |
| "mean_token_accuracy": 0.6862485483288765, | |
| "num_tokens": 13114083.0, | |
| "step": 495 | |
| }, | |
| { | |
| "entropy": 1.1703974455595016, | |
| "epoch": 0.9338667921863968, | |
| "grad_norm": 0.13816098868846893, | |
| "learning_rate": 0.00040695488721804513, | |
| "loss": 1.1191, | |
| "mean_token_accuracy": 0.6944621205329895, | |
| "num_tokens": 13140806.0, | |
| "step": 496 | |
| }, | |
| { | |
| "entropy": 1.1625728458166122, | |
| "epoch": 0.9357495881383855, | |
| "grad_norm": 0.1374853253364563, | |
| "learning_rate": 0.0004067669172932331, | |
| "loss": 1.1311, | |
| "mean_token_accuracy": 0.693043515086174, | |
| "num_tokens": 13167072.0, | |
| "step": 497 | |
| }, | |
| { | |
| "entropy": 1.1611916273832321, | |
| "epoch": 0.9376323840903742, | |
| "grad_norm": 0.14068859815597534, | |
| "learning_rate": 0.00040657894736842107, | |
| "loss": 1.0958, | |
| "mean_token_accuracy": 0.7017333880066872, | |
| "num_tokens": 13193952.0, | |
| "step": 498 | |
| }, | |
| { | |
| "entropy": 1.2519186586141586, | |
| "epoch": 0.9395151800423629, | |
| "grad_norm": 0.14739161729812622, | |
| "learning_rate": 0.00040639097744360903, | |
| "loss": 1.2033, | |
| "mean_token_accuracy": 0.6803731620311737, | |
| "num_tokens": 13219334.0, | |
| "step": 499 | |
| }, | |
| { | |
| "entropy": 1.060287207365036, | |
| "epoch": 0.9413979759943516, | |
| "grad_norm": 0.13330809772014618, | |
| "learning_rate": 0.000406203007518797, | |
| "loss": 1.0607, | |
| "mean_token_accuracy": 0.7074964344501495, | |
| "num_tokens": 13247762.0, | |
| "step": 500 | |
| }, | |
| { | |
| "entropy": 1.1315688639879227, | |
| "epoch": 0.9432807719463403, | |
| "grad_norm": 0.14858287572860718, | |
| "learning_rate": 0.00040601503759398497, | |
| "loss": 1.1534, | |
| "mean_token_accuracy": 0.6925570517778397, | |
| "num_tokens": 13274542.0, | |
| "step": 501 | |
| }, | |
| { | |
| "entropy": 1.1256567761301994, | |
| "epoch": 0.945163567898329, | |
| "grad_norm": 0.13854491710662842, | |
| "learning_rate": 0.00040582706766917293, | |
| "loss": 1.1164, | |
| "mean_token_accuracy": 0.697671189904213, | |
| "num_tokens": 13301954.0, | |
| "step": 502 | |
| }, | |
| { | |
| "entropy": 1.1095138639211655, | |
| "epoch": 0.9470463638503177, | |
| "grad_norm": 0.14951969683170319, | |
| "learning_rate": 0.0004056390977443609, | |
| "loss": 1.0913, | |
| "mean_token_accuracy": 0.7060349136590958, | |
| "num_tokens": 13325368.0, | |
| "step": 503 | |
| }, | |
| { | |
| "entropy": 1.2117299437522888, | |
| "epoch": 0.9489291598023064, | |
| "grad_norm": 0.14555485546588898, | |
| "learning_rate": 0.0004054511278195489, | |
| "loss": 1.1771, | |
| "mean_token_accuracy": 0.6878413483500481, | |
| "num_tokens": 13350621.0, | |
| "step": 504 | |
| }, | |
| { | |
| "entropy": 1.1814142614603043, | |
| "epoch": 0.9508119557542951, | |
| "grad_norm": 0.13946305215358734, | |
| "learning_rate": 0.00040526315789473684, | |
| "loss": 1.1187, | |
| "mean_token_accuracy": 0.6975477784872055, | |
| "num_tokens": 13378436.0, | |
| "step": 505 | |
| }, | |
| { | |
| "entropy": 1.1588895320892334, | |
| "epoch": 0.9526947517062838, | |
| "grad_norm": 0.14052411913871765, | |
| "learning_rate": 0.0004050751879699248, | |
| "loss": 1.1139, | |
| "mean_token_accuracy": 0.6970377415418625, | |
| "num_tokens": 13405779.0, | |
| "step": 506 | |
| }, | |
| { | |
| "entropy": 1.1744963377714157, | |
| "epoch": 0.9545775476582725, | |
| "grad_norm": 0.14011354744434357, | |
| "learning_rate": 0.0004048872180451128, | |
| "loss": 1.1443, | |
| "mean_token_accuracy": 0.6915831044316292, | |
| "num_tokens": 13431768.0, | |
| "step": 507 | |
| }, | |
| { | |
| "entropy": 1.1022943705320358, | |
| "epoch": 0.9564603436102612, | |
| "grad_norm": 0.16085639595985413, | |
| "learning_rate": 0.00040469924812030074, | |
| "loss": 1.0872, | |
| "mean_token_accuracy": 0.7034497335553169, | |
| "num_tokens": 13458430.0, | |
| "step": 508 | |
| }, | |
| { | |
| "entropy": 1.1168298870325089, | |
| "epoch": 0.9583431395622499, | |
| "grad_norm": 0.14645646512508392, | |
| "learning_rate": 0.0004045112781954887, | |
| "loss": 1.1366, | |
| "mean_token_accuracy": 0.6974723115563393, | |
| "num_tokens": 13483989.0, | |
| "step": 509 | |
| }, | |
| { | |
| "entropy": 1.1111514419317245, | |
| "epoch": 0.9602259355142386, | |
| "grad_norm": 0.15530261397361755, | |
| "learning_rate": 0.0004043233082706767, | |
| "loss": 1.1068, | |
| "mean_token_accuracy": 0.7063265517354012, | |
| "num_tokens": 13510734.0, | |
| "step": 510 | |
| }, | |
| { | |
| "entropy": 1.1187052130699158, | |
| "epoch": 0.9621087314662273, | |
| "grad_norm": 0.1410273313522339, | |
| "learning_rate": 0.0004041353383458647, | |
| "loss": 1.1007, | |
| "mean_token_accuracy": 0.6978159174323082, | |
| "num_tokens": 13536200.0, | |
| "step": 511 | |
| }, | |
| { | |
| "entropy": 1.2634307444095612, | |
| "epoch": 0.963991527418216, | |
| "grad_norm": 0.14832766354084015, | |
| "learning_rate": 0.0004039473684210526, | |
| "loss": 1.2454, | |
| "mean_token_accuracy": 0.674240916967392, | |
| "num_tokens": 13562180.0, | |
| "step": 512 | |
| }, | |
| { | |
| "entropy": 1.209633857011795, | |
| "epoch": 0.9658743233702047, | |
| "grad_norm": 0.14852747321128845, | |
| "learning_rate": 0.0004037593984962406, | |
| "loss": 1.151, | |
| "mean_token_accuracy": 0.6942615807056427, | |
| "num_tokens": 13587252.0, | |
| "step": 513 | |
| }, | |
| { | |
| "entropy": 1.1802778542041779, | |
| "epoch": 0.9677571193221934, | |
| "grad_norm": 0.14167462289333344, | |
| "learning_rate": 0.0004035714285714286, | |
| "loss": 1.1268, | |
| "mean_token_accuracy": 0.6984767615795135, | |
| "num_tokens": 13614161.0, | |
| "step": 514 | |
| }, | |
| { | |
| "entropy": 1.1260388046503067, | |
| "epoch": 0.9696399152741821, | |
| "grad_norm": 0.1389787793159485, | |
| "learning_rate": 0.00040338345864661656, | |
| "loss": 1.1044, | |
| "mean_token_accuracy": 0.698441170156002, | |
| "num_tokens": 13640906.0, | |
| "step": 515 | |
| }, | |
| { | |
| "entropy": 1.1336752623319626, | |
| "epoch": 0.9715227112261708, | |
| "grad_norm": 0.13808688521385193, | |
| "learning_rate": 0.0004031954887218045, | |
| "loss": 1.1185, | |
| "mean_token_accuracy": 0.7005246728658676, | |
| "num_tokens": 13666938.0, | |
| "step": 516 | |
| }, | |
| { | |
| "entropy": 1.1089581847190857, | |
| "epoch": 0.9734055071781595, | |
| "grad_norm": 0.1490076631307602, | |
| "learning_rate": 0.0004030075187969925, | |
| "loss": 1.1037, | |
| "mean_token_accuracy": 0.699261337518692, | |
| "num_tokens": 13692343.0, | |
| "step": 517 | |
| }, | |
| { | |
| "entropy": 1.1778569370508194, | |
| "epoch": 0.9752883031301482, | |
| "grad_norm": 0.1503973752260208, | |
| "learning_rate": 0.00040281954887218046, | |
| "loss": 1.1704, | |
| "mean_token_accuracy": 0.6850240305066109, | |
| "num_tokens": 13717884.0, | |
| "step": 518 | |
| }, | |
| { | |
| "entropy": 1.1599782705307007, | |
| "epoch": 0.977171099082137, | |
| "grad_norm": 0.14560772478580475, | |
| "learning_rate": 0.00040263157894736843, | |
| "loss": 1.1481, | |
| "mean_token_accuracy": 0.6967450231313705, | |
| "num_tokens": 13744454.0, | |
| "step": 519 | |
| }, | |
| { | |
| "entropy": 1.2482303828001022, | |
| "epoch": 0.9790538950341257, | |
| "grad_norm": 0.1557229459285736, | |
| "learning_rate": 0.0004024436090225564, | |
| "loss": 1.2016, | |
| "mean_token_accuracy": 0.679645448923111, | |
| "num_tokens": 13771382.0, | |
| "step": 520 | |
| }, | |
| { | |
| "entropy": 1.154101237654686, | |
| "epoch": 0.9809366909861144, | |
| "grad_norm": 0.1511804610490799, | |
| "learning_rate": 0.00040225563909774436, | |
| "loss": 1.1211, | |
| "mean_token_accuracy": 0.692274309694767, | |
| "num_tokens": 13797315.0, | |
| "step": 521 | |
| }, | |
| { | |
| "entropy": 1.1659268885850906, | |
| "epoch": 0.9828194869381031, | |
| "grad_norm": 0.14492999017238617, | |
| "learning_rate": 0.0004020676691729324, | |
| "loss": 1.1276, | |
| "mean_token_accuracy": 0.6957960724830627, | |
| "num_tokens": 13823504.0, | |
| "step": 522 | |
| }, | |
| { | |
| "entropy": 1.2255947291851044, | |
| "epoch": 0.9847022828900918, | |
| "grad_norm": 0.16592226922512054, | |
| "learning_rate": 0.0004018796992481203, | |
| "loss": 1.2034, | |
| "mean_token_accuracy": 0.6800813153386116, | |
| "num_tokens": 13849682.0, | |
| "step": 523 | |
| }, | |
| { | |
| "entropy": 1.181060180068016, | |
| "epoch": 0.9865850788420805, | |
| "grad_norm": 0.14438042044639587, | |
| "learning_rate": 0.00040169172932330826, | |
| "loss": 1.1422, | |
| "mean_token_accuracy": 0.6908884271979332, | |
| "num_tokens": 13877151.0, | |
| "step": 524 | |
| }, | |
| { | |
| "entropy": 1.195601612329483, | |
| "epoch": 0.9884678747940692, | |
| "grad_norm": 0.1490834802389145, | |
| "learning_rate": 0.0004015037593984963, | |
| "loss": 1.1609, | |
| "mean_token_accuracy": 0.687875397503376, | |
| "num_tokens": 13902812.0, | |
| "step": 525 | |
| }, | |
| { | |
| "entropy": 1.1874232441186905, | |
| "epoch": 0.9903506707460579, | |
| "grad_norm": 0.15240395069122314, | |
| "learning_rate": 0.00040131578947368425, | |
| "loss": 1.171, | |
| "mean_token_accuracy": 0.6891705989837646, | |
| "num_tokens": 13926800.0, | |
| "step": 526 | |
| }, | |
| { | |
| "entropy": 1.0726541802287102, | |
| "epoch": 0.9922334666980466, | |
| "grad_norm": 0.1472628116607666, | |
| "learning_rate": 0.00040112781954887216, | |
| "loss": 1.0744, | |
| "mean_token_accuracy": 0.7090674415230751, | |
| "num_tokens": 13952161.0, | |
| "step": 527 | |
| }, | |
| { | |
| "entropy": 1.1295729503035545, | |
| "epoch": 0.9941162626500353, | |
| "grad_norm": 0.1415957808494568, | |
| "learning_rate": 0.0004009398496240602, | |
| "loss": 1.1086, | |
| "mean_token_accuracy": 0.7027467861771584, | |
| "num_tokens": 13978937.0, | |
| "step": 528 | |
| }, | |
| { | |
| "entropy": 1.1615847125649452, | |
| "epoch": 0.995999058602024, | |
| "grad_norm": 0.14748550951480865, | |
| "learning_rate": 0.00040075187969924815, | |
| "loss": 1.1474, | |
| "mean_token_accuracy": 0.6950105875730515, | |
| "num_tokens": 14005138.0, | |
| "step": 529 | |
| }, | |
| { | |
| "entropy": 1.176683247089386, | |
| "epoch": 0.9978818545540127, | |
| "grad_norm": 0.1543041467666626, | |
| "learning_rate": 0.00040056390977443606, | |
| "loss": 1.1807, | |
| "mean_token_accuracy": 0.684785395860672, | |
| "num_tokens": 14028706.0, | |
| "step": 530 | |
| }, | |
| { | |
| "entropy": 1.1374549865722656, | |
| "epoch": 0.9997646505060014, | |
| "grad_norm": 0.13411332666873932, | |
| "learning_rate": 0.0004003759398496241, | |
| "loss": 1.0976, | |
| "mean_token_accuracy": 0.7099665105342865, | |
| "num_tokens": 14056095.0, | |
| "step": 531 | |
| }, | |
| { | |
| "entropy": 1.4449238777160645, | |
| "epoch": 1.0, | |
| "grad_norm": 0.5150332450866699, | |
| "learning_rate": 0.00040018796992481205, | |
| "loss": 1.4328, | |
| "mean_token_accuracy": 0.6301905512809753, | |
| "num_tokens": 14058143.0, | |
| "step": 532 | |
| }, | |
| { | |
| "epoch": 1.0, | |
| "eval_entropy": 1.273110066141401, | |
| "eval_loss": 1.215613603591919, | |
| "eval_mean_token_accuracy": 0.6747710279056004, | |
| "eval_num_tokens": 14058143.0, | |
| "eval_runtime": 8.5294, | |
| "eval_samples_per_second": 5.745, | |
| "eval_steps_per_second": 0.821, | |
| "step": 532 | |
| } | |
| ], | |
| "logging_steps": 1.0, | |
| "max_steps": 2660, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 5, | |
| "save_steps": 500, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": false | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 9.860994210304512e+17, | |
| "train_batch_size": 2, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |