{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 532, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "entropy": 1.174624726176262, "epoch": 0.0018827959519887032, "grad_norm": 0.3589564859867096, "learning_rate": 0.0005, "loss": 1.7667, "mean_token_accuracy": 0.6097231954336166, "num_tokens": 26212.0, "step": 1 }, { "entropy": 1.3834485709667206, "epoch": 0.0037655919039774064, "grad_norm": 0.273681104183197, "learning_rate": 0.000499812030075188, "loss": 1.6137, "mean_token_accuracy": 0.6240904033184052, "num_tokens": 53331.0, "step": 2 }, { "entropy": 2.3064600229263306, "epoch": 0.00564838785596611, "grad_norm": 0.8047769665718079, "learning_rate": 0.0004996240601503759, "loss": 1.6899, "mean_token_accuracy": 0.6088793724775314, "num_tokens": 80291.0, "step": 3 }, { "entropy": 1.630955085158348, "epoch": 0.007531183807954813, "grad_norm": 0.30714720487594604, "learning_rate": 0.0004994360902255639, "loss": 1.5608, "mean_token_accuracy": 0.6291212365031242, "num_tokens": 106966.0, "step": 4 }, { "entropy": 1.3567735850811005, "epoch": 0.009413979759943516, "grad_norm": 0.2066618800163269, "learning_rate": 0.0004992481203007519, "loss": 1.4887, "mean_token_accuracy": 0.6415289863944054, "num_tokens": 132786.0, "step": 5 }, { "entropy": 1.361013576388359, "epoch": 0.01129677571193222, "grad_norm": 0.24627672135829926, "learning_rate": 0.0004990601503759398, "loss": 1.4956, "mean_token_accuracy": 0.6329040080308914, "num_tokens": 157854.0, "step": 6 }, { "entropy": 1.4551365226507187, "epoch": 0.013179571663920923, "grad_norm": 0.24504677951335907, "learning_rate": 0.0004988721804511278, "loss": 1.4555, "mean_token_accuracy": 0.6410629153251648, "num_tokens": 183628.0, "step": 7 }, { "entropy": 1.558847650885582, "epoch": 0.015062367615909626, "grad_norm": 0.24714401364326477, "learning_rate": 0.0004986842105263158, "loss": 1.4574, "mean_token_accuracy": 0.6385244429111481, "num_tokens": 212024.0, "step": 8 }, { "entropy": 1.4725914895534515, "epoch": 0.016945163567898328, "grad_norm": 0.14686766266822815, "learning_rate": 0.0004984962406015037, "loss": 1.4077, "mean_token_accuracy": 0.6496255323290825, "num_tokens": 239247.0, "step": 9 }, { "entropy": 1.399958148598671, "epoch": 0.01882795951988703, "grad_norm": 0.2573543190956116, "learning_rate": 0.0004983082706766917, "loss": 1.4648, "mean_token_accuracy": 0.6321976333856583, "num_tokens": 265365.0, "step": 10 }, { "entropy": 1.3477602005004883, "epoch": 0.020710755471875734, "grad_norm": 0.19095759093761444, "learning_rate": 0.0004981203007518797, "loss": 1.3914, "mean_token_accuracy": 0.6472064480185509, "num_tokens": 292664.0, "step": 11 }, { "entropy": 1.3985529839992523, "epoch": 0.02259355142386444, "grad_norm": 0.12443722784519196, "learning_rate": 0.0004979323308270676, "loss": 1.3841, "mean_token_accuracy": 0.6470160931348801, "num_tokens": 318823.0, "step": 12 }, { "entropy": 1.4100047498941422, "epoch": 0.024476347375853143, "grad_norm": 0.18163365125656128, "learning_rate": 0.0004977443609022556, "loss": 1.3475, "mean_token_accuracy": 0.6554316207766533, "num_tokens": 345276.0, "step": 13 }, { "entropy": 1.3673983961343765, "epoch": 0.026359143327841845, "grad_norm": 0.21292470395565033, "learning_rate": 0.0004975563909774436, "loss": 1.3423, "mean_token_accuracy": 0.6571086272597313, "num_tokens": 372290.0, "step": 14 }, { "entropy": 1.3170630186796188, "epoch": 0.028241939279830548, "grad_norm": 0.14680063724517822, "learning_rate": 0.0004973684210526315, "loss": 1.3433, "mean_token_accuracy": 0.6587843522429466, "num_tokens": 398806.0, "step": 15 }, { "entropy": 1.4363876283168793, "epoch": 0.03012473523181925, "grad_norm": 0.1492491513490677, "learning_rate": 0.0004971804511278195, "loss": 1.3881, "mean_token_accuracy": 0.6493127718567848, "num_tokens": 427973.0, "step": 16 }, { "entropy": 1.3928384333848953, "epoch": 0.032007531183807954, "grad_norm": 0.21353831887245178, "learning_rate": 0.0004969924812030076, "loss": 1.3303, "mean_token_accuracy": 0.6532666012644768, "num_tokens": 455705.0, "step": 17 }, { "entropy": 1.3039959222078323, "epoch": 0.033890327135796657, "grad_norm": 0.12421785295009613, "learning_rate": 0.0004968045112781954, "loss": 1.3078, "mean_token_accuracy": 0.6589679047465324, "num_tokens": 481697.0, "step": 18 }, { "entropy": 1.323414146900177, "epoch": 0.03577312308778536, "grad_norm": 0.13252823054790497, "learning_rate": 0.0004966165413533834, "loss": 1.3682, "mean_token_accuracy": 0.6478805840015411, "num_tokens": 508637.0, "step": 19 }, { "entropy": 1.320784792304039, "epoch": 0.03765591903977406, "grad_norm": 0.13821907341480255, "learning_rate": 0.0004964285714285715, "loss": 1.3087, "mean_token_accuracy": 0.6556096524000168, "num_tokens": 533762.0, "step": 20 }, { "entropy": 1.435991793870926, "epoch": 0.039538714991762765, "grad_norm": 0.13946449756622314, "learning_rate": 0.0004962406015037594, "loss": 1.4031, "mean_token_accuracy": 0.6474809646606445, "num_tokens": 558068.0, "step": 21 }, { "entropy": 1.3843661397695541, "epoch": 0.04142151094375147, "grad_norm": 0.14075031876564026, "learning_rate": 0.0004960526315789473, "loss": 1.3313, "mean_token_accuracy": 0.6577248424291611, "num_tokens": 585582.0, "step": 22 }, { "entropy": 1.3438803404569626, "epoch": 0.04330430689574018, "grad_norm": 0.12071845680475235, "learning_rate": 0.0004958646616541354, "loss": 1.3205, "mean_token_accuracy": 0.6598646715283394, "num_tokens": 614078.0, "step": 23 }, { "entropy": 1.2872049808502197, "epoch": 0.04518710284772888, "grad_norm": 0.13585081696510315, "learning_rate": 0.0004956766917293234, "loss": 1.2847, "mean_token_accuracy": 0.6646199747920036, "num_tokens": 641604.0, "step": 24 }, { "entropy": 1.4031487703323364, "epoch": 0.04706989879971758, "grad_norm": 0.16168682277202606, "learning_rate": 0.0004954887218045112, "loss": 1.3906, "mean_token_accuracy": 0.6470670253038406, "num_tokens": 668099.0, "step": 25 }, { "entropy": 1.3954781144857407, "epoch": 0.048952694751706285, "grad_norm": 0.1519748568534851, "learning_rate": 0.0004953007518796993, "loss": 1.3143, "mean_token_accuracy": 0.6569681242108345, "num_tokens": 693467.0, "step": 26 }, { "entropy": 1.4201241582632065, "epoch": 0.05083549070369499, "grad_norm": 0.12228523939847946, "learning_rate": 0.0004951127819548873, "loss": 1.3585, "mean_token_accuracy": 0.6522250324487686, "num_tokens": 719428.0, "step": 27 }, { "entropy": 1.3096809834241867, "epoch": 0.05271828665568369, "grad_norm": 0.12990325689315796, "learning_rate": 0.0004949248120300752, "loss": 1.3363, "mean_token_accuracy": 0.6576437503099442, "num_tokens": 743498.0, "step": 28 }, { "entropy": 1.2695416510105133, "epoch": 0.054601082607672394, "grad_norm": 0.12629908323287964, "learning_rate": 0.0004947368421052632, "loss": 1.256, "mean_token_accuracy": 0.6671914085745811, "num_tokens": 771083.0, "step": 29 }, { "entropy": 1.3144675344228745, "epoch": 0.056483878559661096, "grad_norm": 0.13920928537845612, "learning_rate": 0.0004945488721804512, "loss": 1.2797, "mean_token_accuracy": 0.6726761981844902, "num_tokens": 798194.0, "step": 30 }, { "entropy": 1.3235575556755066, "epoch": 0.0583666745116498, "grad_norm": 0.1421487033367157, "learning_rate": 0.0004943609022556391, "loss": 1.3095, "mean_token_accuracy": 0.6596867814660072, "num_tokens": 823348.0, "step": 31 }, { "entropy": 1.2517389357089996, "epoch": 0.0602494704636385, "grad_norm": 0.11075025051832199, "learning_rate": 0.0004941729323308271, "loss": 1.2458, "mean_token_accuracy": 0.6723818778991699, "num_tokens": 849713.0, "step": 32 }, { "entropy": 1.2159670144319534, "epoch": 0.062132266415627205, "grad_norm": 0.11285679787397385, "learning_rate": 0.0004939849624060151, "loss": 1.2158, "mean_token_accuracy": 0.6808358430862427, "num_tokens": 876659.0, "step": 33 }, { "entropy": 1.2742353826761246, "epoch": 0.06401506236761591, "grad_norm": 0.1200110912322998, "learning_rate": 0.000493796992481203, "loss": 1.2414, "mean_token_accuracy": 0.6697632297873497, "num_tokens": 904196.0, "step": 34 }, { "entropy": 1.3724654912948608, "epoch": 0.06589785831960461, "grad_norm": 0.11141709238290787, "learning_rate": 0.000493609022556391, "loss": 1.3037, "mean_token_accuracy": 0.6641954258084297, "num_tokens": 930650.0, "step": 35 }, { "entropy": 1.332644298672676, "epoch": 0.06778065427159331, "grad_norm": 0.11270242929458618, "learning_rate": 0.000493421052631579, "loss": 1.2723, "mean_token_accuracy": 0.6652832478284836, "num_tokens": 958361.0, "step": 36 }, { "entropy": 1.2781042605638504, "epoch": 0.06966345022358202, "grad_norm": 0.12608197331428528, "learning_rate": 0.0004932330827067669, "loss": 1.2664, "mean_token_accuracy": 0.6701500117778778, "num_tokens": 982981.0, "step": 37 }, { "entropy": 1.2652703523635864, "epoch": 0.07154624617557072, "grad_norm": 0.11680380254983902, "learning_rate": 0.0004930451127819549, "loss": 1.2363, "mean_token_accuracy": 0.6758281961083412, "num_tokens": 1010214.0, "step": 38 }, { "entropy": 1.2895056456327438, "epoch": 0.07342904212755942, "grad_norm": 0.13060909509658813, "learning_rate": 0.0004928571428571429, "loss": 1.2921, "mean_token_accuracy": 0.6617036908864975, "num_tokens": 1036007.0, "step": 39 }, { "entropy": 1.2508063912391663, "epoch": 0.07531183807954812, "grad_norm": 0.11048955470323563, "learning_rate": 0.0004926691729323308, "loss": 1.2388, "mean_token_accuracy": 0.6743078008294106, "num_tokens": 1064839.0, "step": 40 }, { "entropy": 1.2910813689231873, "epoch": 0.07719463403153683, "grad_norm": 0.12634366750717163, "learning_rate": 0.0004924812030075188, "loss": 1.2923, "mean_token_accuracy": 0.6658936813473701, "num_tokens": 1089267.0, "step": 41 }, { "entropy": 1.314329817891121, "epoch": 0.07907742998352553, "grad_norm": 0.11990135908126831, "learning_rate": 0.0004922932330827068, "loss": 1.2823, "mean_token_accuracy": 0.6621334031224251, "num_tokens": 1114747.0, "step": 42 }, { "entropy": 1.372491493821144, "epoch": 0.08096022593551423, "grad_norm": 0.14962127804756165, "learning_rate": 0.0004921052631578947, "loss": 1.3012, "mean_token_accuracy": 0.6624018624424934, "num_tokens": 1140568.0, "step": 43 }, { "entropy": 1.3109306246042252, "epoch": 0.08284302188750294, "grad_norm": 0.1251574158668518, "learning_rate": 0.0004919172932330827, "loss": 1.2753, "mean_token_accuracy": 0.6643748208880424, "num_tokens": 1166132.0, "step": 44 }, { "entropy": 1.2547127306461334, "epoch": 0.08472581783949165, "grad_norm": 0.14988984167575836, "learning_rate": 0.0004917293233082707, "loss": 1.2591, "mean_token_accuracy": 0.6667659133672714, "num_tokens": 1191773.0, "step": 45 }, { "entropy": 1.2385195791721344, "epoch": 0.08660861379148035, "grad_norm": 0.14218594133853912, "learning_rate": 0.0004915413533834586, "loss": 1.2551, "mean_token_accuracy": 0.67237289249897, "num_tokens": 1217928.0, "step": 46 }, { "entropy": 1.286237582564354, "epoch": 0.08849140974346906, "grad_norm": 0.1285715401172638, "learning_rate": 0.0004913533834586466, "loss": 1.228, "mean_token_accuracy": 0.6695188358426094, "num_tokens": 1243853.0, "step": 47 }, { "entropy": 1.2577073574066162, "epoch": 0.09037420569545776, "grad_norm": 0.1297583132982254, "learning_rate": 0.0004911654135338346, "loss": 1.1889, "mean_token_accuracy": 0.6802271753549576, "num_tokens": 1270883.0, "step": 48 }, { "entropy": 1.2520407736301422, "epoch": 0.09225700164744646, "grad_norm": 0.10652397572994232, "learning_rate": 0.0004909774436090225, "loss": 1.2295, "mean_token_accuracy": 0.675907552242279, "num_tokens": 1296937.0, "step": 49 }, { "entropy": 1.2889134734869003, "epoch": 0.09413979759943517, "grad_norm": 0.15478400886058807, "learning_rate": 0.0004907894736842106, "loss": 1.325, "mean_token_accuracy": 0.656628705561161, "num_tokens": 1323691.0, "step": 50 }, { "entropy": 1.319000005722046, "epoch": 0.09602259355142387, "grad_norm": 0.14395709335803986, "learning_rate": 0.0004906015037593985, "loss": 1.2879, "mean_token_accuracy": 0.6644657775759697, "num_tokens": 1347574.0, "step": 51 }, { "entropy": 1.265960842370987, "epoch": 0.09790538950341257, "grad_norm": 0.1301705241203308, "learning_rate": 0.0004904135338345864, "loss": 1.1913, "mean_token_accuracy": 0.6857202649116516, "num_tokens": 1376965.0, "step": 52 }, { "entropy": 1.2671979069709778, "epoch": 0.09978818545540127, "grad_norm": 0.12502525746822357, "learning_rate": 0.0004902255639097745, "loss": 1.2473, "mean_token_accuracy": 0.666202001273632, "num_tokens": 1402456.0, "step": 53 }, { "entropy": 1.2768708020448685, "epoch": 0.10167098140738998, "grad_norm": 0.1106332466006279, "learning_rate": 0.0004900375939849624, "loss": 1.2406, "mean_token_accuracy": 0.6731417253613472, "num_tokens": 1430744.0, "step": 54 }, { "entropy": 1.2286315560340881, "epoch": 0.10355377735937868, "grad_norm": 0.12362819164991379, "learning_rate": 0.0004898496240601503, "loss": 1.2452, "mean_token_accuracy": 0.6803877055644989, "num_tokens": 1459596.0, "step": 55 }, { "entropy": 1.2663686275482178, "epoch": 0.10543657331136738, "grad_norm": 0.11787568777799606, "learning_rate": 0.0004896616541353384, "loss": 1.2594, "mean_token_accuracy": 0.6688775643706322, "num_tokens": 1487663.0, "step": 56 }, { "entropy": 1.2797971814870834, "epoch": 0.10731936926335608, "grad_norm": 0.11497815698385239, "learning_rate": 0.0004894736842105264, "loss": 1.2556, "mean_token_accuracy": 0.6690255850553513, "num_tokens": 1514365.0, "step": 57 }, { "entropy": 1.2839107066392899, "epoch": 0.10920216521534479, "grad_norm": 0.11505855619907379, "learning_rate": 0.0004892857142857142, "loss": 1.2213, "mean_token_accuracy": 0.6812370792031288, "num_tokens": 1542885.0, "step": 58 }, { "entropy": 1.290139302611351, "epoch": 0.11108496116733349, "grad_norm": 0.11844398826360703, "learning_rate": 0.0004890977443609023, "loss": 1.2462, "mean_token_accuracy": 0.6695830523967743, "num_tokens": 1567898.0, "step": 59 }, { "entropy": 1.2590511292219162, "epoch": 0.11296775711932219, "grad_norm": 0.12767820060253143, "learning_rate": 0.0004889097744360903, "loss": 1.2515, "mean_token_accuracy": 0.6738757342100143, "num_tokens": 1594742.0, "step": 60 }, { "entropy": 1.2260379791259766, "epoch": 0.1148505530713109, "grad_norm": 0.11811124533414841, "learning_rate": 0.0004887218045112781, "loss": 1.1979, "mean_token_accuracy": 0.6808087155222893, "num_tokens": 1620685.0, "step": 61 }, { "entropy": 1.301318883895874, "epoch": 0.1167333490232996, "grad_norm": 0.13785120844841003, "learning_rate": 0.0004885338345864662, "loss": 1.3155, "mean_token_accuracy": 0.6592775583267212, "num_tokens": 1646541.0, "step": 62 }, { "entropy": 1.2704945504665375, "epoch": 0.1186161449752883, "grad_norm": 0.11612152308225632, "learning_rate": 0.0004883458646616542, "loss": 1.2429, "mean_token_accuracy": 0.6690341830253601, "num_tokens": 1674445.0, "step": 63 }, { "entropy": 1.2772111147642136, "epoch": 0.120498940927277, "grad_norm": 0.12045788764953613, "learning_rate": 0.00048815789473684215, "loss": 1.2114, "mean_token_accuracy": 0.6808006837964058, "num_tokens": 1701277.0, "step": 64 }, { "entropy": 1.2712904959917068, "epoch": 0.1223817368792657, "grad_norm": 0.11429794877767563, "learning_rate": 0.00048796992481203006, "loss": 1.216, "mean_token_accuracy": 0.6720417365431786, "num_tokens": 1728984.0, "step": 65 }, { "entropy": 1.3161986768245697, "epoch": 0.12426453283125441, "grad_norm": 0.1338111013174057, "learning_rate": 0.00048778195488721803, "loss": 1.3229, "mean_token_accuracy": 0.6602049320936203, "num_tokens": 1755598.0, "step": 66 }, { "entropy": 1.2473317682743073, "epoch": 0.1261473287832431, "grad_norm": 0.10488025099039078, "learning_rate": 0.00048759398496240605, "loss": 1.2263, "mean_token_accuracy": 0.6753234788775444, "num_tokens": 1783417.0, "step": 67 }, { "entropy": 1.2551011592149734, "epoch": 0.12803012473523182, "grad_norm": 0.11638512462377548, "learning_rate": 0.000487406015037594, "loss": 1.224, "mean_token_accuracy": 0.6783930733799934, "num_tokens": 1809462.0, "step": 68 }, { "entropy": 1.2382186502218246, "epoch": 0.12991292068722052, "grad_norm": 0.14887025952339172, "learning_rate": 0.00048721804511278193, "loss": 1.2175, "mean_token_accuracy": 0.6787804737687111, "num_tokens": 1835642.0, "step": 69 }, { "entropy": 1.274851605296135, "epoch": 0.13179571663920922, "grad_norm": 0.13403619825839996, "learning_rate": 0.00048703007518796995, "loss": 1.2662, "mean_token_accuracy": 0.6663196384906769, "num_tokens": 1859904.0, "step": 70 }, { "entropy": 1.303640365600586, "epoch": 0.13367851259119792, "grad_norm": 0.11801115423440933, "learning_rate": 0.0004868421052631579, "loss": 1.3138, "mean_token_accuracy": 0.6627907082438469, "num_tokens": 1886915.0, "step": 71 }, { "entropy": 1.2814981341362, "epoch": 0.13556130854318663, "grad_norm": 0.12543627619743347, "learning_rate": 0.00048665413533834583, "loss": 1.2599, "mean_token_accuracy": 0.6737553998827934, "num_tokens": 1912683.0, "step": 72 }, { "entropy": 1.2715606987476349, "epoch": 0.13744410449517533, "grad_norm": 0.11963653564453125, "learning_rate": 0.00048646616541353385, "loss": 1.2075, "mean_token_accuracy": 0.6787137389183044, "num_tokens": 1940455.0, "step": 73 }, { "entropy": 1.2765703648328781, "epoch": 0.13932690044716403, "grad_norm": 0.13952264189720154, "learning_rate": 0.0004862781954887218, "loss": 1.2043, "mean_token_accuracy": 0.6798917651176453, "num_tokens": 1965949.0, "step": 74 }, { "entropy": 1.229781836271286, "epoch": 0.14120969639915273, "grad_norm": 0.11769476532936096, "learning_rate": 0.0004860902255639098, "loss": 1.2063, "mean_token_accuracy": 0.6715990677475929, "num_tokens": 1992293.0, "step": 75 }, { "entropy": 1.1944819241762161, "epoch": 0.14309249235114144, "grad_norm": 0.12095087021589279, "learning_rate": 0.00048590225563909775, "loss": 1.217, "mean_token_accuracy": 0.6814620569348335, "num_tokens": 2019182.0, "step": 76 }, { "entropy": 1.2649260014295578, "epoch": 0.14497528830313014, "grad_norm": 0.12220579385757446, "learning_rate": 0.0004857142857142857, "loss": 1.2827, "mean_token_accuracy": 0.6689692661166191, "num_tokens": 2045357.0, "step": 77 }, { "entropy": 1.2532286047935486, "epoch": 0.14685808425511884, "grad_norm": 0.12137361615896225, "learning_rate": 0.0004855263157894737, "loss": 1.202, "mean_token_accuracy": 0.6808355078101158, "num_tokens": 2071015.0, "step": 78 }, { "entropy": 1.334955409169197, "epoch": 0.14874088020710755, "grad_norm": 0.12754660844802856, "learning_rate": 0.0004853383458646617, "loss": 1.2514, "mean_token_accuracy": 0.6797578409314156, "num_tokens": 2096831.0, "step": 79 }, { "entropy": 1.2261384725570679, "epoch": 0.15062367615909625, "grad_norm": 0.11096950620412827, "learning_rate": 0.0004851503759398496, "loss": 1.1933, "mean_token_accuracy": 0.6880421414971352, "num_tokens": 2126421.0, "step": 80 }, { "entropy": 1.2615373581647873, "epoch": 0.15250647211108495, "grad_norm": 0.13106736540794373, "learning_rate": 0.0004849624060150376, "loss": 1.2198, "mean_token_accuracy": 0.6821138635277748, "num_tokens": 2153303.0, "step": 81 }, { "entropy": 1.2859619706869125, "epoch": 0.15438926806307365, "grad_norm": 0.13115623593330383, "learning_rate": 0.0004847744360902256, "loss": 1.2783, "mean_token_accuracy": 0.6689222902059555, "num_tokens": 2180250.0, "step": 82 }, { "entropy": 1.248913735151291, "epoch": 0.15627206401506236, "grad_norm": 0.11291101574897766, "learning_rate": 0.0004845864661654135, "loss": 1.2351, "mean_token_accuracy": 0.6730126142501831, "num_tokens": 2207001.0, "step": 83 }, { "entropy": 1.2413169145584106, "epoch": 0.15815485996705106, "grad_norm": 0.1277051717042923, "learning_rate": 0.0004843984962406015, "loss": 1.2159, "mean_token_accuracy": 0.681744784116745, "num_tokens": 2232587.0, "step": 84 }, { "entropy": 1.2155817747116089, "epoch": 0.16003765591903976, "grad_norm": 0.15200501680374146, "learning_rate": 0.0004842105263157895, "loss": 1.1881, "mean_token_accuracy": 0.6845081895589828, "num_tokens": 2260040.0, "step": 85 }, { "entropy": 1.1750262528657913, "epoch": 0.16192045187102846, "grad_norm": 0.13496170938014984, "learning_rate": 0.0004840225563909775, "loss": 1.1566, "mean_token_accuracy": 0.6882026270031929, "num_tokens": 2286811.0, "step": 86 }, { "entropy": 1.2582080215215683, "epoch": 0.16380324782301717, "grad_norm": 0.12751278281211853, "learning_rate": 0.0004838345864661654, "loss": 1.2334, "mean_token_accuracy": 0.6756840199232101, "num_tokens": 2312376.0, "step": 87 }, { "entropy": 1.2530706375837326, "epoch": 0.16568604377500587, "grad_norm": 0.12347429990768433, "learning_rate": 0.0004836466165413534, "loss": 1.2358, "mean_token_accuracy": 0.6713104099035263, "num_tokens": 2338959.0, "step": 88 }, { "entropy": 1.2693426012992859, "epoch": 0.1675688397269946, "grad_norm": 0.16009417176246643, "learning_rate": 0.0004834586466165414, "loss": 1.2511, "mean_token_accuracy": 0.6736921593546867, "num_tokens": 2366183.0, "step": 89 }, { "entropy": 1.255973756313324, "epoch": 0.1694516356789833, "grad_norm": 0.12181756645441055, "learning_rate": 0.00048327067669172934, "loss": 1.2052, "mean_token_accuracy": 0.6734501421451569, "num_tokens": 2392856.0, "step": 90 }, { "entropy": 1.2562214732170105, "epoch": 0.171334431630972, "grad_norm": 0.12082800269126892, "learning_rate": 0.0004830827067669173, "loss": 1.2519, "mean_token_accuracy": 0.6692837849259377, "num_tokens": 2419897.0, "step": 91 }, { "entropy": 1.1730956435203552, "epoch": 0.1732172275829607, "grad_norm": 0.11969847977161407, "learning_rate": 0.0004828947368421053, "loss": 1.1305, "mean_token_accuracy": 0.6944040432572365, "num_tokens": 2449131.0, "step": 92 }, { "entropy": 1.2573560923337936, "epoch": 0.1751000235349494, "grad_norm": 0.1183922290802002, "learning_rate": 0.00048270676691729324, "loss": 1.224, "mean_token_accuracy": 0.6771978959441185, "num_tokens": 2474107.0, "step": 93 }, { "entropy": 1.2122257351875305, "epoch": 0.17698281948693811, "grad_norm": 0.1325969696044922, "learning_rate": 0.0004825187969924812, "loss": 1.1754, "mean_token_accuracy": 0.6865298077464104, "num_tokens": 2501837.0, "step": 94 }, { "entropy": 1.2060312926769257, "epoch": 0.17886561543892682, "grad_norm": 0.12340355664491653, "learning_rate": 0.0004823308270676692, "loss": 1.2042, "mean_token_accuracy": 0.6752656251192093, "num_tokens": 2528769.0, "step": 95 }, { "entropy": 1.268461525440216, "epoch": 0.18074841139091552, "grad_norm": 0.1260639727115631, "learning_rate": 0.00048214285714285715, "loss": 1.2781, "mean_token_accuracy": 0.6681492626667023, "num_tokens": 2555451.0, "step": 96 }, { "entropy": 1.2650732845067978, "epoch": 0.18263120734290422, "grad_norm": 0.12851010262966156, "learning_rate": 0.00048195488721804517, "loss": 1.2458, "mean_token_accuracy": 0.671695739030838, "num_tokens": 2582196.0, "step": 97 }, { "entropy": 1.2784437835216522, "epoch": 0.18451400329489293, "grad_norm": 0.1278950273990631, "learning_rate": 0.0004817669172932331, "loss": 1.2319, "mean_token_accuracy": 0.6702851504087448, "num_tokens": 2608444.0, "step": 98 }, { "entropy": 1.2551447749137878, "epoch": 0.18639679924688163, "grad_norm": 0.1206209808588028, "learning_rate": 0.00048157894736842105, "loss": 1.2044, "mean_token_accuracy": 0.677789680659771, "num_tokens": 2634109.0, "step": 99 }, { "entropy": 1.2039145231246948, "epoch": 0.18827959519887033, "grad_norm": 0.12305069714784622, "learning_rate": 0.00048139097744360907, "loss": 1.1637, "mean_token_accuracy": 0.6861624270677567, "num_tokens": 2659548.0, "step": 100 }, { "entropy": 1.2327278852462769, "epoch": 0.19016239115085903, "grad_norm": 0.13643652200698853, "learning_rate": 0.000481203007518797, "loss": 1.212, "mean_token_accuracy": 0.6804677918553352, "num_tokens": 2684638.0, "step": 101 }, { "entropy": 1.194289356470108, "epoch": 0.19204518710284774, "grad_norm": 0.15666837990283966, "learning_rate": 0.00048101503759398495, "loss": 1.1797, "mean_token_accuracy": 0.683199092745781, "num_tokens": 2711970.0, "step": 102 }, { "entropy": 1.2052866965532303, "epoch": 0.19392798305483644, "grad_norm": 0.12934386730194092, "learning_rate": 0.00048082706766917297, "loss": 1.1954, "mean_token_accuracy": 0.6831924915313721, "num_tokens": 2738028.0, "step": 103 }, { "entropy": 1.2316648960113525, "epoch": 0.19581077900682514, "grad_norm": 0.12603920698165894, "learning_rate": 0.00048063909774436094, "loss": 1.2112, "mean_token_accuracy": 0.6792290285229683, "num_tokens": 2765091.0, "step": 104 }, { "entropy": 1.2624593675136566, "epoch": 0.19769357495881384, "grad_norm": 0.1318008452653885, "learning_rate": 0.00048045112781954885, "loss": 1.2389, "mean_token_accuracy": 0.6782659739255905, "num_tokens": 2792661.0, "step": 105 }, { "entropy": 1.2824029475450516, "epoch": 0.19957637091080255, "grad_norm": 0.13028129935264587, "learning_rate": 0.00048026315789473687, "loss": 1.2581, "mean_token_accuracy": 0.6727664992213249, "num_tokens": 2819535.0, "step": 106 }, { "entropy": 1.1964116394519806, "epoch": 0.20145916686279125, "grad_norm": 0.16565856337547302, "learning_rate": 0.00048007518796992484, "loss": 1.1427, "mean_token_accuracy": 0.6922469958662987, "num_tokens": 2848429.0, "step": 107 }, { "entropy": 1.2726367861032486, "epoch": 0.20334196281477995, "grad_norm": 0.1416698843240738, "learning_rate": 0.0004798872180451128, "loss": 1.225, "mean_token_accuracy": 0.6754879876971245, "num_tokens": 2874776.0, "step": 108 }, { "entropy": 1.2357124537229538, "epoch": 0.20522475876676866, "grad_norm": 0.12491658329963684, "learning_rate": 0.00047969924812030077, "loss": 1.204, "mean_token_accuracy": 0.6739878728985786, "num_tokens": 2902602.0, "step": 109 }, { "entropy": 1.2650941908359528, "epoch": 0.20710755471875736, "grad_norm": 0.13329921662807465, "learning_rate": 0.00047951127819548874, "loss": 1.2432, "mean_token_accuracy": 0.6738255694508553, "num_tokens": 2929536.0, "step": 110 }, { "entropy": 1.2259162962436676, "epoch": 0.20899035067074606, "grad_norm": 0.14152902364730835, "learning_rate": 0.0004793233082706767, "loss": 1.1886, "mean_token_accuracy": 0.6813376769423485, "num_tokens": 2955236.0, "step": 111 }, { "entropy": 1.1335331127047539, "epoch": 0.21087314662273476, "grad_norm": 0.13298991322517395, "learning_rate": 0.00047913533834586467, "loss": 1.1339, "mean_token_accuracy": 0.6916593015193939, "num_tokens": 2979921.0, "step": 112 }, { "entropy": 1.2154437899589539, "epoch": 0.21275594257472347, "grad_norm": 0.15994608402252197, "learning_rate": 0.00047894736842105264, "loss": 1.2115, "mean_token_accuracy": 0.679818794131279, "num_tokens": 3005638.0, "step": 113 }, { "entropy": 1.215769276022911, "epoch": 0.21463873852671217, "grad_norm": 0.11282095313072205, "learning_rate": 0.0004787593984962406, "loss": 1.1821, "mean_token_accuracy": 0.6841456890106201, "num_tokens": 3033979.0, "step": 114 }, { "entropy": 1.2786222100257874, "epoch": 0.21652153447870087, "grad_norm": 0.13811451196670532, "learning_rate": 0.0004785714285714286, "loss": 1.2177, "mean_token_accuracy": 0.6760591194033623, "num_tokens": 3060581.0, "step": 115 }, { "entropy": 1.1782392710447311, "epoch": 0.21840433043068957, "grad_norm": 0.12641046941280365, "learning_rate": 0.00047838345864661654, "loss": 1.1449, "mean_token_accuracy": 0.6954788789153099, "num_tokens": 3086594.0, "step": 116 }, { "entropy": 1.2415330708026886, "epoch": 0.22028712638267828, "grad_norm": 0.1396101415157318, "learning_rate": 0.0004781954887218045, "loss": 1.2245, "mean_token_accuracy": 0.6794020012021065, "num_tokens": 3114117.0, "step": 117 }, { "entropy": 1.2689218074083328, "epoch": 0.22216992233466698, "grad_norm": 0.13006678223609924, "learning_rate": 0.00047800751879699253, "loss": 1.2523, "mean_token_accuracy": 0.6741964370012283, "num_tokens": 3140643.0, "step": 118 }, { "entropy": 1.2656696736812592, "epoch": 0.22405271828665568, "grad_norm": 0.15107867121696472, "learning_rate": 0.0004778195488721805, "loss": 1.2539, "mean_token_accuracy": 0.6682558432221413, "num_tokens": 3166141.0, "step": 119 }, { "entropy": 1.1993789225816727, "epoch": 0.22593551423864439, "grad_norm": 0.11653780192136765, "learning_rate": 0.0004776315789473684, "loss": 1.1753, "mean_token_accuracy": 0.6902748569846153, "num_tokens": 3193339.0, "step": 120 }, { "entropy": 1.231392353773117, "epoch": 0.2278183101906331, "grad_norm": 0.1314115673303604, "learning_rate": 0.00047744360902255643, "loss": 1.2005, "mean_token_accuracy": 0.6799951046705246, "num_tokens": 3219993.0, "step": 121 }, { "entropy": 1.2121622115373611, "epoch": 0.2297011061426218, "grad_norm": 0.12394538521766663, "learning_rate": 0.0004772556390977444, "loss": 1.1715, "mean_token_accuracy": 0.6903199851512909, "num_tokens": 3247444.0, "step": 122 }, { "entropy": 1.2413930743932724, "epoch": 0.2315839020946105, "grad_norm": 0.1266545057296753, "learning_rate": 0.0004770676691729323, "loss": 1.1899, "mean_token_accuracy": 0.682403139770031, "num_tokens": 3272627.0, "step": 123 }, { "entropy": 1.1818571537733078, "epoch": 0.2334666980465992, "grad_norm": 0.15664935111999512, "learning_rate": 0.00047687969924812033, "loss": 1.1479, "mean_token_accuracy": 0.6944203674793243, "num_tokens": 3296898.0, "step": 124 }, { "entropy": 1.2419498413801193, "epoch": 0.2353494939985879, "grad_norm": 0.15578152239322662, "learning_rate": 0.0004766917293233083, "loss": 1.2335, "mean_token_accuracy": 0.6732713803648949, "num_tokens": 3322692.0, "step": 125 }, { "entropy": 1.2249382436275482, "epoch": 0.2372322899505766, "grad_norm": 0.14584508538246155, "learning_rate": 0.00047650375939849626, "loss": 1.2124, "mean_token_accuracy": 0.6797131448984146, "num_tokens": 3348376.0, "step": 126 }, { "entropy": 1.2090249583125114, "epoch": 0.2391150859025653, "grad_norm": 0.15335120260715485, "learning_rate": 0.0004763157894736842, "loss": 1.1861, "mean_token_accuracy": 0.6816836297512054, "num_tokens": 3375056.0, "step": 127 }, { "entropy": 1.2331191301345825, "epoch": 0.240997881854554, "grad_norm": 0.13854444026947021, "learning_rate": 0.0004761278195488722, "loss": 1.1867, "mean_token_accuracy": 0.6822093352675438, "num_tokens": 3401338.0, "step": 128 }, { "entropy": 1.2083263993263245, "epoch": 0.2428806778065427, "grad_norm": 0.1330289989709854, "learning_rate": 0.00047593984962406016, "loss": 1.1774, "mean_token_accuracy": 0.6801193058490753, "num_tokens": 3426232.0, "step": 129 }, { "entropy": 1.2008604258298874, "epoch": 0.2447634737585314, "grad_norm": 0.14914868772029877, "learning_rate": 0.00047575187969924813, "loss": 1.1679, "mean_token_accuracy": 0.6855365261435509, "num_tokens": 3454080.0, "step": 130 }, { "entropy": 1.2279947251081467, "epoch": 0.24664626971052012, "grad_norm": 0.18307369947433472, "learning_rate": 0.0004755639097744361, "loss": 1.2333, "mean_token_accuracy": 0.672551229596138, "num_tokens": 3478258.0, "step": 131 }, { "entropy": 1.1894963383674622, "epoch": 0.24852906566250882, "grad_norm": 0.13398650288581848, "learning_rate": 0.00047537593984962407, "loss": 1.1953, "mean_token_accuracy": 0.6832383349537849, "num_tokens": 3504254.0, "step": 132 }, { "entropy": 1.2269657999277115, "epoch": 0.2504118616144975, "grad_norm": 0.13811668753623962, "learning_rate": 0.00047518796992481203, "loss": 1.1741, "mean_token_accuracy": 0.6880706697702408, "num_tokens": 3531225.0, "step": 133 }, { "entropy": 1.198286533355713, "epoch": 0.2522946575664862, "grad_norm": 0.17705924808979034, "learning_rate": 0.000475, "loss": 1.1395, "mean_token_accuracy": 0.691774420440197, "num_tokens": 3556428.0, "step": 134 }, { "entropy": 1.2244715094566345, "epoch": 0.2541774535184749, "grad_norm": 0.17644067108631134, "learning_rate": 0.00047481203007518797, "loss": 1.2204, "mean_token_accuracy": 0.6757577136158943, "num_tokens": 3583373.0, "step": 135 }, { "entropy": 1.208250641822815, "epoch": 0.25606024947046363, "grad_norm": 0.12975312769412994, "learning_rate": 0.00047462406015037593, "loss": 1.2032, "mean_token_accuracy": 0.68288903683424, "num_tokens": 3610878.0, "step": 136 }, { "entropy": 1.1764077246189117, "epoch": 0.25794304542245233, "grad_norm": 0.13420140743255615, "learning_rate": 0.00047443609022556395, "loss": 1.1343, "mean_token_accuracy": 0.6927010640501976, "num_tokens": 3636794.0, "step": 137 }, { "entropy": 1.2354558259248734, "epoch": 0.25982584137444104, "grad_norm": 0.12880398333072662, "learning_rate": 0.00047424812030075187, "loss": 1.1809, "mean_token_accuracy": 0.682947002351284, "num_tokens": 3665578.0, "step": 138 }, { "entropy": 1.175147533416748, "epoch": 0.26170863732642974, "grad_norm": 0.15634110569953918, "learning_rate": 0.00047406015037593983, "loss": 1.1483, "mean_token_accuracy": 0.6907549053430557, "num_tokens": 3691407.0, "step": 139 }, { "entropy": 1.1331272423267365, "epoch": 0.26359143327841844, "grad_norm": 0.13562822341918945, "learning_rate": 0.00047387218045112786, "loss": 1.119, "mean_token_accuracy": 0.6953889951109886, "num_tokens": 3718468.0, "step": 140 }, { "entropy": 1.2285344004631042, "epoch": 0.26547422923040714, "grad_norm": 0.1443127691745758, "learning_rate": 0.00047368421052631577, "loss": 1.2352, "mean_token_accuracy": 0.6712902784347534, "num_tokens": 3744121.0, "step": 141 }, { "entropy": 1.2572973817586899, "epoch": 0.26735702518239585, "grad_norm": 0.14697600901126862, "learning_rate": 0.00047349624060150373, "loss": 1.2545, "mean_token_accuracy": 0.6712752804160118, "num_tokens": 3768665.0, "step": 142 }, { "entropy": 1.2219904512166977, "epoch": 0.26923982113438455, "grad_norm": 0.1259946972131729, "learning_rate": 0.00047330827067669176, "loss": 1.1953, "mean_token_accuracy": 0.6853306293487549, "num_tokens": 3798421.0, "step": 143 }, { "entropy": 1.2031358480453491, "epoch": 0.27112261708637325, "grad_norm": 0.1336822658777237, "learning_rate": 0.0004731203007518797, "loss": 1.1158, "mean_token_accuracy": 0.7008628249168396, "num_tokens": 3826569.0, "step": 144 }, { "entropy": 1.2654242366552353, "epoch": 0.27300541303836195, "grad_norm": 0.12933260202407837, "learning_rate": 0.00047293233082706764, "loss": 1.2125, "mean_token_accuracy": 0.6849671006202698, "num_tokens": 3853128.0, "step": 145 }, { "entropy": 1.1577993482351303, "epoch": 0.27488820899035066, "grad_norm": 0.13406828045845032, "learning_rate": 0.00047274436090225566, "loss": 1.1624, "mean_token_accuracy": 0.6865072473883629, "num_tokens": 3880569.0, "step": 146 }, { "entropy": 1.1901942938566208, "epoch": 0.27677100494233936, "grad_norm": 0.14410416781902313, "learning_rate": 0.0004725563909774436, "loss": 1.2313, "mean_token_accuracy": 0.6749508231878281, "num_tokens": 3907559.0, "step": 147 }, { "entropy": 1.1600831672549248, "epoch": 0.27865380089432806, "grad_norm": 0.1339792162179947, "learning_rate": 0.0004723684210526316, "loss": 1.1987, "mean_token_accuracy": 0.6836483106017113, "num_tokens": 3934255.0, "step": 148 }, { "entropy": 1.2559089958667755, "epoch": 0.28053659684631677, "grad_norm": 0.12650057673454285, "learning_rate": 0.00047218045112781956, "loss": 1.2294, "mean_token_accuracy": 0.6761154308915138, "num_tokens": 3959809.0, "step": 149 }, { "entropy": 1.2887302935123444, "epoch": 0.28241939279830547, "grad_norm": 0.14123603701591492, "learning_rate": 0.0004719924812030075, "loss": 1.1892, "mean_token_accuracy": 0.6841337457299232, "num_tokens": 3984834.0, "step": 150 }, { "entropy": 1.2641656994819641, "epoch": 0.28430218875029417, "grad_norm": 0.13069137930870056, "learning_rate": 0.0004718045112781955, "loss": 1.178, "mean_token_accuracy": 0.6903347223997116, "num_tokens": 4011854.0, "step": 151 }, { "entropy": 1.2745257169008255, "epoch": 0.2861849847022829, "grad_norm": 0.12974441051483154, "learning_rate": 0.00047161654135338346, "loss": 1.2299, "mean_token_accuracy": 0.6787015795707703, "num_tokens": 4038272.0, "step": 152 }, { "entropy": 1.2451976537704468, "epoch": 0.2880677806542716, "grad_norm": 0.15594416856765747, "learning_rate": 0.0004714285714285714, "loss": 1.2506, "mean_token_accuracy": 0.6727647334337234, "num_tokens": 4066761.0, "step": 153 }, { "entropy": 1.1639655232429504, "epoch": 0.2899505766062603, "grad_norm": 0.12053865194320679, "learning_rate": 0.0004712406015037594, "loss": 1.167, "mean_token_accuracy": 0.6889369264245033, "num_tokens": 4094208.0, "step": 154 }, { "entropy": 1.1459853649139404, "epoch": 0.291833372558249, "grad_norm": 0.15322330594062805, "learning_rate": 0.0004710526315789474, "loss": 1.1297, "mean_token_accuracy": 0.691886380314827, "num_tokens": 4121959.0, "step": 155 }, { "entropy": 1.2293187081813812, "epoch": 0.2937161685102377, "grad_norm": 0.135823056101799, "learning_rate": 0.0004708646616541353, "loss": 1.2266, "mean_token_accuracy": 0.6803058981895447, "num_tokens": 4147782.0, "step": 156 }, { "entropy": 1.192505158483982, "epoch": 0.2955989644622264, "grad_norm": 0.13535255193710327, "learning_rate": 0.0004706766917293233, "loss": 1.1608, "mean_token_accuracy": 0.6955654844641685, "num_tokens": 4176277.0, "step": 157 }, { "entropy": 1.2871312350034714, "epoch": 0.2974817604142151, "grad_norm": 0.12719225883483887, "learning_rate": 0.0004704887218045113, "loss": 1.2311, "mean_token_accuracy": 0.6765939891338348, "num_tokens": 4202697.0, "step": 158 }, { "entropy": 1.2744830250740051, "epoch": 0.2993645563662038, "grad_norm": 0.15343067049980164, "learning_rate": 0.0004703007518796993, "loss": 1.2229, "mean_token_accuracy": 0.671116054058075, "num_tokens": 4229068.0, "step": 159 }, { "entropy": 1.2606779783964157, "epoch": 0.3012473523181925, "grad_norm": 0.12448015809059143, "learning_rate": 0.0004701127819548872, "loss": 1.2061, "mean_token_accuracy": 0.6829146966338158, "num_tokens": 4256896.0, "step": 160 }, { "entropy": 1.150521382689476, "epoch": 0.3031301482701812, "grad_norm": 0.1213938444852829, "learning_rate": 0.0004699248120300752, "loss": 1.128, "mean_token_accuracy": 0.6945177465677261, "num_tokens": 4283765.0, "step": 161 }, { "entropy": 1.1809571981430054, "epoch": 0.3050129442221699, "grad_norm": 0.13989101350307465, "learning_rate": 0.0004697368421052632, "loss": 1.1549, "mean_token_accuracy": 0.6888199374079704, "num_tokens": 4308970.0, "step": 162 }, { "entropy": 1.151911549270153, "epoch": 0.3068957401741586, "grad_norm": 0.2074657380580902, "learning_rate": 0.0004695488721804511, "loss": 1.1309, "mean_token_accuracy": 0.6942140832543373, "num_tokens": 4333158.0, "step": 163 }, { "entropy": 1.1968079656362534, "epoch": 0.3087785361261473, "grad_norm": 0.13570360839366913, "learning_rate": 0.0004693609022556391, "loss": 1.1814, "mean_token_accuracy": 0.6869696602225304, "num_tokens": 4360040.0, "step": 164 }, { "entropy": 1.1787877827882767, "epoch": 0.310661332078136, "grad_norm": 0.13379861414432526, "learning_rate": 0.0004691729323308271, "loss": 1.1791, "mean_token_accuracy": 0.6811994835734367, "num_tokens": 4386186.0, "step": 165 }, { "entropy": 1.2168269157409668, "epoch": 0.3125441280301247, "grad_norm": 0.1466514617204666, "learning_rate": 0.00046898496240601505, "loss": 1.2131, "mean_token_accuracy": 0.6801121830940247, "num_tokens": 4412572.0, "step": 166 }, { "entropy": 1.191074714064598, "epoch": 0.3144269239821134, "grad_norm": 0.13052161037921906, "learning_rate": 0.000468796992481203, "loss": 1.1818, "mean_token_accuracy": 0.6877126544713974, "num_tokens": 4439798.0, "step": 167 }, { "entropy": 1.310966208577156, "epoch": 0.3163097199341021, "grad_norm": 0.14339525997638702, "learning_rate": 0.000468609022556391, "loss": 1.2826, "mean_token_accuracy": 0.6668709591031075, "num_tokens": 4465182.0, "step": 168 }, { "entropy": 1.249758929014206, "epoch": 0.3181925158860908, "grad_norm": 0.14204370975494385, "learning_rate": 0.00046842105263157895, "loss": 1.1944, "mean_token_accuracy": 0.6822869181632996, "num_tokens": 4491690.0, "step": 169 }, { "entropy": 1.2281111925840378, "epoch": 0.3200753118380795, "grad_norm": 0.13778182864189148, "learning_rate": 0.0004682330827067669, "loss": 1.1821, "mean_token_accuracy": 0.6827872395515442, "num_tokens": 4518668.0, "step": 170 }, { "entropy": 1.1907898932695389, "epoch": 0.3219581077900682, "grad_norm": 0.13682714104652405, "learning_rate": 0.0004680451127819549, "loss": 1.1654, "mean_token_accuracy": 0.6878219619393349, "num_tokens": 4544500.0, "step": 171 }, { "entropy": 1.2053745537996292, "epoch": 0.32384090374205693, "grad_norm": 0.1406177431344986, "learning_rate": 0.00046785714285714285, "loss": 1.2351, "mean_token_accuracy": 0.6759226024150848, "num_tokens": 4570672.0, "step": 172 }, { "entropy": 1.1686365455389023, "epoch": 0.32572369969404563, "grad_norm": 0.1390364021062851, "learning_rate": 0.0004676691729323309, "loss": 1.1563, "mean_token_accuracy": 0.6870525777339935, "num_tokens": 4597157.0, "step": 173 }, { "entropy": 1.1847928017377853, "epoch": 0.32760649564603433, "grad_norm": 0.12553362548351288, "learning_rate": 0.0004674812030075188, "loss": 1.1464, "mean_token_accuracy": 0.6896436884999275, "num_tokens": 4622963.0, "step": 174 }, { "entropy": 1.2175119668245316, "epoch": 0.32948929159802304, "grad_norm": 0.12723615765571594, "learning_rate": 0.00046729323308270675, "loss": 1.1887, "mean_token_accuracy": 0.6839049756526947, "num_tokens": 4650796.0, "step": 175 }, { "entropy": 1.2538534700870514, "epoch": 0.33137208755001174, "grad_norm": 0.1439773291349411, "learning_rate": 0.0004671052631578948, "loss": 1.1796, "mean_token_accuracy": 0.6849694699048996, "num_tokens": 4675067.0, "step": 176 }, { "entropy": 1.2113288342952728, "epoch": 0.33325488350200044, "grad_norm": 0.20407459139823914, "learning_rate": 0.00046691729323308274, "loss": 1.1616, "mean_token_accuracy": 0.6856766641139984, "num_tokens": 4700943.0, "step": 177 }, { "entropy": 1.1914596557617188, "epoch": 0.3351376794539892, "grad_norm": 0.13831955194473267, "learning_rate": 0.00046672932330827065, "loss": 1.1938, "mean_token_accuracy": 0.6882949769496918, "num_tokens": 4728608.0, "step": 178 }, { "entropy": 1.1632477790117264, "epoch": 0.3370204754059779, "grad_norm": 0.1430656909942627, "learning_rate": 0.0004665413533834587, "loss": 1.1745, "mean_token_accuracy": 0.6857840716838837, "num_tokens": 4754323.0, "step": 179 }, { "entropy": 1.1661407798528671, "epoch": 0.3389032713579666, "grad_norm": 0.13480572402477264, "learning_rate": 0.00046635338345864664, "loss": 1.1677, "mean_token_accuracy": 0.6842626482248306, "num_tokens": 4777734.0, "step": 180 }, { "entropy": 1.2307626903057098, "epoch": 0.3407860673099553, "grad_norm": 0.14171424508094788, "learning_rate": 0.00046616541353383456, "loss": 1.2112, "mean_token_accuracy": 0.6779276877641678, "num_tokens": 4803062.0, "step": 181 }, { "entropy": 1.2344750761985779, "epoch": 0.342668863261944, "grad_norm": 0.1366141438484192, "learning_rate": 0.0004659774436090226, "loss": 1.1521, "mean_token_accuracy": 0.6871028989553452, "num_tokens": 4828406.0, "step": 182 }, { "entropy": 1.2267533838748932, "epoch": 0.3445516592139327, "grad_norm": 0.12364047765731812, "learning_rate": 0.00046578947368421054, "loss": 1.157, "mean_token_accuracy": 0.6939859166741371, "num_tokens": 4855048.0, "step": 183 }, { "entropy": 1.25662961602211, "epoch": 0.3464344551659214, "grad_norm": 0.14521241188049316, "learning_rate": 0.0004656015037593985, "loss": 1.2005, "mean_token_accuracy": 0.6837843209505081, "num_tokens": 4879838.0, "step": 184 }, { "entropy": 1.1265386119484901, "epoch": 0.3483172511179101, "grad_norm": 0.13281729817390442, "learning_rate": 0.0004654135338345865, "loss": 1.1245, "mean_token_accuracy": 0.7005239203572273, "num_tokens": 4906673.0, "step": 185 }, { "entropy": 1.1675947606563568, "epoch": 0.3502000470698988, "grad_norm": 0.13612613081932068, "learning_rate": 0.00046522556390977444, "loss": 1.1783, "mean_token_accuracy": 0.6867906153202057, "num_tokens": 4932081.0, "step": 186 }, { "entropy": 1.1747846454381943, "epoch": 0.3520828430218875, "grad_norm": 0.14062775671482086, "learning_rate": 0.0004650375939849624, "loss": 1.1849, "mean_token_accuracy": 0.6804407685995102, "num_tokens": 4957805.0, "step": 187 }, { "entropy": 1.3040417283773422, "epoch": 0.35396563897387623, "grad_norm": 0.13647155463695526, "learning_rate": 0.00046484962406015043, "loss": 1.2723, "mean_token_accuracy": 0.6708482652902603, "num_tokens": 4982727.0, "step": 188 }, { "entropy": 1.273634523153305, "epoch": 0.35584843492586493, "grad_norm": 0.2908094823360443, "learning_rate": 0.00046466165413533835, "loss": 1.2188, "mean_token_accuracy": 0.6769787892699242, "num_tokens": 5008167.0, "step": 189 }, { "entropy": 1.294351875782013, "epoch": 0.35773123087785363, "grad_norm": 0.14780114591121674, "learning_rate": 0.0004644736842105263, "loss": 1.2497, "mean_token_accuracy": 0.6740161553025246, "num_tokens": 5031994.0, "step": 190 }, { "entropy": 1.164976328611374, "epoch": 0.35961402682984234, "grad_norm": 0.1321694701910019, "learning_rate": 0.00046428571428571433, "loss": 1.1297, "mean_token_accuracy": 0.6937556862831116, "num_tokens": 5058242.0, "step": 191 }, { "entropy": 1.1738992556929588, "epoch": 0.36149682278183104, "grad_norm": 0.13215236365795135, "learning_rate": 0.00046409774436090225, "loss": 1.1639, "mean_token_accuracy": 0.688830278813839, "num_tokens": 5086002.0, "step": 192 }, { "entropy": 1.2423847168684006, "epoch": 0.36337961873381974, "grad_norm": 0.13844619691371918, "learning_rate": 0.0004639097744360902, "loss": 1.2462, "mean_token_accuracy": 0.6728790327906609, "num_tokens": 5115116.0, "step": 193 }, { "entropy": 1.188772901892662, "epoch": 0.36526241468580845, "grad_norm": 0.1350889950990677, "learning_rate": 0.00046372180451127824, "loss": 1.162, "mean_token_accuracy": 0.6961116194725037, "num_tokens": 5141316.0, "step": 194 }, { "entropy": 1.2510673254728317, "epoch": 0.36714521063779715, "grad_norm": 0.13393868505954742, "learning_rate": 0.0004635338345864662, "loss": 1.2165, "mean_token_accuracy": 0.675739549100399, "num_tokens": 5168389.0, "step": 195 }, { "entropy": 1.2140327990055084, "epoch": 0.36902800658978585, "grad_norm": 0.15341585874557495, "learning_rate": 0.0004633458646616541, "loss": 1.1891, "mean_token_accuracy": 0.6846036836504936, "num_tokens": 5196797.0, "step": 196 }, { "entropy": 1.140480324625969, "epoch": 0.37091080254177455, "grad_norm": 0.14681561291217804, "learning_rate": 0.00046315789473684214, "loss": 1.1129, "mean_token_accuracy": 0.7001371458172798, "num_tokens": 5221689.0, "step": 197 }, { "entropy": 1.149554505944252, "epoch": 0.37279359849376326, "grad_norm": 0.12448862940073013, "learning_rate": 0.0004629699248120301, "loss": 1.0918, "mean_token_accuracy": 0.7011524215340614, "num_tokens": 5248151.0, "step": 198 }, { "entropy": 1.1877187192440033, "epoch": 0.37467639444575196, "grad_norm": 0.12904192507266998, "learning_rate": 0.00046278195488721807, "loss": 1.1381, "mean_token_accuracy": 0.6980564966797829, "num_tokens": 5276462.0, "step": 199 }, { "entropy": 1.1336260885000229, "epoch": 0.37655919039774066, "grad_norm": 0.14019370079040527, "learning_rate": 0.00046259398496240604, "loss": 1.1408, "mean_token_accuracy": 0.6882188692688942, "num_tokens": 5303965.0, "step": 200 }, { "entropy": 1.142029918730259, "epoch": 0.37844198634972936, "grad_norm": 0.12954500317573547, "learning_rate": 0.000462406015037594, "loss": 1.1225, "mean_token_accuracy": 0.7019821628928185, "num_tokens": 5333147.0, "step": 201 }, { "entropy": 1.1055554077029228, "epoch": 0.38032478230171807, "grad_norm": 0.14525440335273743, "learning_rate": 0.00046221804511278197, "loss": 1.0873, "mean_token_accuracy": 0.6984671205282211, "num_tokens": 5360603.0, "step": 202 }, { "entropy": 1.1669521182775497, "epoch": 0.38220757825370677, "grad_norm": 0.12719959020614624, "learning_rate": 0.00046203007518796994, "loss": 1.1408, "mean_token_accuracy": 0.6958698183298111, "num_tokens": 5386882.0, "step": 203 }, { "entropy": 1.2504252791404724, "epoch": 0.3840903742056955, "grad_norm": 0.14054498076438904, "learning_rate": 0.0004618421052631579, "loss": 1.2147, "mean_token_accuracy": 0.6776561290025711, "num_tokens": 5413184.0, "step": 204 }, { "entropy": 1.226726457476616, "epoch": 0.3859731701576842, "grad_norm": 0.13887910544872284, "learning_rate": 0.00046165413533834587, "loss": 1.193, "mean_token_accuracy": 0.6823991388082504, "num_tokens": 5438606.0, "step": 205 }, { "entropy": 1.1875706166028976, "epoch": 0.3878559661096729, "grad_norm": 0.14024114608764648, "learning_rate": 0.0004614661654135339, "loss": 1.1676, "mean_token_accuracy": 0.684231162071228, "num_tokens": 5464123.0, "step": 206 }, { "entropy": 1.2047923803329468, "epoch": 0.3897387620616616, "grad_norm": 0.1310993880033493, "learning_rate": 0.0004612781954887218, "loss": 1.1851, "mean_token_accuracy": 0.6833815798163414, "num_tokens": 5491426.0, "step": 207 }, { "entropy": 1.2198069095611572, "epoch": 0.3916215580136503, "grad_norm": 0.13591070473194122, "learning_rate": 0.00046109022556390977, "loss": 1.2115, "mean_token_accuracy": 0.6876263841986656, "num_tokens": 5517873.0, "step": 208 }, { "entropy": 1.2492990344762802, "epoch": 0.393504353965639, "grad_norm": 0.1313110738992691, "learning_rate": 0.0004609022556390978, "loss": 1.2303, "mean_token_accuracy": 0.6741604581475258, "num_tokens": 5545541.0, "step": 209 }, { "entropy": 1.2249716967344284, "epoch": 0.3953871499176277, "grad_norm": 0.13691024482250214, "learning_rate": 0.0004607142857142857, "loss": 1.1994, "mean_token_accuracy": 0.6825065985321999, "num_tokens": 5571818.0, "step": 210 }, { "entropy": 1.2132453471422195, "epoch": 0.3972699458696164, "grad_norm": 0.13897888362407684, "learning_rate": 0.0004605263157894737, "loss": 1.2105, "mean_token_accuracy": 0.6761833131313324, "num_tokens": 5598744.0, "step": 211 }, { "entropy": 1.1871661990880966, "epoch": 0.3991527418216051, "grad_norm": 0.13007131218910217, "learning_rate": 0.00046033834586466164, "loss": 1.1726, "mean_token_accuracy": 0.6834597215056419, "num_tokens": 5625839.0, "step": 212 }, { "entropy": 1.1333737969398499, "epoch": 0.4010355377735938, "grad_norm": 0.12430460005998611, "learning_rate": 0.00046015037593984966, "loss": 1.1019, "mean_token_accuracy": 0.7014463916420937, "num_tokens": 5654141.0, "step": 213 }, { "entropy": 1.2297871708869934, "epoch": 0.4029183337255825, "grad_norm": 0.13888096809387207, "learning_rate": 0.0004599624060150376, "loss": 1.1764, "mean_token_accuracy": 0.6898130550980568, "num_tokens": 5678609.0, "step": 214 }, { "entropy": 1.2013902068138123, "epoch": 0.4048011296775712, "grad_norm": 0.12778723239898682, "learning_rate": 0.00045977443609022554, "loss": 1.1552, "mean_token_accuracy": 0.6898351311683655, "num_tokens": 5705310.0, "step": 215 }, { "entropy": 1.2131111025810242, "epoch": 0.4066839256295599, "grad_norm": 0.1250849962234497, "learning_rate": 0.00045958646616541356, "loss": 1.1997, "mean_token_accuracy": 0.6817116960883141, "num_tokens": 5733075.0, "step": 216 }, { "entropy": 1.195549488067627, "epoch": 0.4085667215815486, "grad_norm": 0.14742979407310486, "learning_rate": 0.00045939849624060153, "loss": 1.1542, "mean_token_accuracy": 0.6895313560962677, "num_tokens": 5758265.0, "step": 217 }, { "entropy": 1.169806808233261, "epoch": 0.4104495175335373, "grad_norm": 0.13026666641235352, "learning_rate": 0.00045921052631578944, "loss": 1.1244, "mean_token_accuracy": 0.6982120722532272, "num_tokens": 5784948.0, "step": 218 }, { "entropy": 1.182911455631256, "epoch": 0.412332313485526, "grad_norm": 0.13583756983280182, "learning_rate": 0.00045902255639097746, "loss": 1.168, "mean_token_accuracy": 0.6856559291481972, "num_tokens": 5811165.0, "step": 219 }, { "entropy": 1.0761431455612183, "epoch": 0.4142151094375147, "grad_norm": 0.13843543827533722, "learning_rate": 0.00045883458646616543, "loss": 1.0857, "mean_token_accuracy": 0.7090724036097527, "num_tokens": 5839268.0, "step": 220 }, { "entropy": 1.1751226484775543, "epoch": 0.4160979053895034, "grad_norm": 0.13362666964530945, "learning_rate": 0.00045864661654135334, "loss": 1.1766, "mean_token_accuracy": 0.6880608201026917, "num_tokens": 5866181.0, "step": 221 }, { "entropy": 1.1817846149206161, "epoch": 0.4179807013414921, "grad_norm": 0.1283264309167862, "learning_rate": 0.00045845864661654136, "loss": 1.1698, "mean_token_accuracy": 0.6846595779061317, "num_tokens": 5894863.0, "step": 222 }, { "entropy": 1.2609765976667404, "epoch": 0.4198634972934808, "grad_norm": 0.1493021547794342, "learning_rate": 0.00045827067669172933, "loss": 1.2032, "mean_token_accuracy": 0.6831384673714638, "num_tokens": 5919134.0, "step": 223 }, { "entropy": 1.239750549197197, "epoch": 0.42174629324546953, "grad_norm": 0.14113545417785645, "learning_rate": 0.0004580827067669173, "loss": 1.186, "mean_token_accuracy": 0.6857739984989166, "num_tokens": 5944399.0, "step": 224 }, { "entropy": 1.2144103646278381, "epoch": 0.42362908919745823, "grad_norm": 0.13381649553775787, "learning_rate": 0.00045789473684210527, "loss": 1.1787, "mean_token_accuracy": 0.6889763921499252, "num_tokens": 5969936.0, "step": 225 }, { "entropy": 1.157375693321228, "epoch": 0.42551188514944693, "grad_norm": 0.13331881165504456, "learning_rate": 0.00045770676691729323, "loss": 1.1613, "mean_token_accuracy": 0.6869198232889175, "num_tokens": 5998086.0, "step": 226 }, { "entropy": 1.16208166629076, "epoch": 0.42739468110143564, "grad_norm": 0.1284441202878952, "learning_rate": 0.0004575187969924812, "loss": 1.1593, "mean_token_accuracy": 0.6875879392027855, "num_tokens": 6027253.0, "step": 227 }, { "entropy": 1.1543057709932327, "epoch": 0.42927747705342434, "grad_norm": 0.13240714371204376, "learning_rate": 0.0004573308270676692, "loss": 1.1397, "mean_token_accuracy": 0.6932123303413391, "num_tokens": 6053458.0, "step": 228 }, { "entropy": 1.2234352231025696, "epoch": 0.43116027300541304, "grad_norm": 0.13276036083698273, "learning_rate": 0.00045714285714285713, "loss": 1.1783, "mean_token_accuracy": 0.6839658245444298, "num_tokens": 6077746.0, "step": 229 }, { "entropy": 1.2401353865861893, "epoch": 0.43304306895740174, "grad_norm": 0.13763296604156494, "learning_rate": 0.0004569548872180451, "loss": 1.2126, "mean_token_accuracy": 0.6801036223769188, "num_tokens": 6104277.0, "step": 230 }, { "entropy": 1.1862784177064896, "epoch": 0.43492586490939045, "grad_norm": 0.14408177137374878, "learning_rate": 0.0004567669172932331, "loss": 1.1804, "mean_token_accuracy": 0.6879640221595764, "num_tokens": 6131048.0, "step": 231 }, { "entropy": 1.2236796170473099, "epoch": 0.43680866086137915, "grad_norm": 0.1351345330476761, "learning_rate": 0.00045657894736842103, "loss": 1.1814, "mean_token_accuracy": 0.6808154359459877, "num_tokens": 6157407.0, "step": 232 }, { "entropy": 1.2412819564342499, "epoch": 0.43869145681336785, "grad_norm": 0.1346222460269928, "learning_rate": 0.000456390977443609, "loss": 1.2092, "mean_token_accuracy": 0.676831878721714, "num_tokens": 6183884.0, "step": 233 }, { "entropy": 1.2513677477836609, "epoch": 0.44057425276535656, "grad_norm": 0.14077451825141907, "learning_rate": 0.000456203007518797, "loss": 1.2274, "mean_token_accuracy": 0.6783920973539352, "num_tokens": 6210214.0, "step": 234 }, { "entropy": 1.1642959266901016, "epoch": 0.44245704871734526, "grad_norm": 0.1407959908246994, "learning_rate": 0.000456015037593985, "loss": 1.1149, "mean_token_accuracy": 0.6936823204159737, "num_tokens": 6237636.0, "step": 235 }, { "entropy": 1.1751240193843842, "epoch": 0.44433984466933396, "grad_norm": 0.1335555762052536, "learning_rate": 0.0004558270676691729, "loss": 1.1695, "mean_token_accuracy": 0.6895338296890259, "num_tokens": 6263952.0, "step": 236 }, { "entropy": 1.1486622989177704, "epoch": 0.44622264062132266, "grad_norm": 0.17950989305973053, "learning_rate": 0.0004556390977443609, "loss": 1.155, "mean_token_accuracy": 0.6848675832152367, "num_tokens": 6292031.0, "step": 237 }, { "entropy": 1.185767188668251, "epoch": 0.44810543657331137, "grad_norm": 0.1306653767824173, "learning_rate": 0.0004554511278195489, "loss": 1.1606, "mean_token_accuracy": 0.6900418549776077, "num_tokens": 6321764.0, "step": 238 }, { "entropy": 1.2462199479341507, "epoch": 0.44998823252530007, "grad_norm": 0.1400284469127655, "learning_rate": 0.00045526315789473686, "loss": 1.2094, "mean_token_accuracy": 0.6798161789774895, "num_tokens": 6347788.0, "step": 239 }, { "entropy": 1.2244273871183395, "epoch": 0.45187102847728877, "grad_norm": 0.1347157508134842, "learning_rate": 0.0004550751879699248, "loss": 1.1674, "mean_token_accuracy": 0.6886308640241623, "num_tokens": 6374007.0, "step": 240 }, { "entropy": 1.2273097336292267, "epoch": 0.4537538244292775, "grad_norm": 0.1288744956254959, "learning_rate": 0.0004548872180451128, "loss": 1.1775, "mean_token_accuracy": 0.6868400648236275, "num_tokens": 6400589.0, "step": 241 }, { "entropy": 1.2171413898468018, "epoch": 0.4556366203812662, "grad_norm": 0.14212685823440552, "learning_rate": 0.00045469924812030076, "loss": 1.2173, "mean_token_accuracy": 0.680756650865078, "num_tokens": 6428529.0, "step": 242 }, { "entropy": 1.1739053502678871, "epoch": 0.4575194163332549, "grad_norm": 0.13274581730365753, "learning_rate": 0.0004545112781954887, "loss": 1.1491, "mean_token_accuracy": 0.6945304796099663, "num_tokens": 6456003.0, "step": 243 }, { "entropy": 1.1879045367240906, "epoch": 0.4594022122852436, "grad_norm": 0.14754825830459595, "learning_rate": 0.0004543233082706767, "loss": 1.153, "mean_token_accuracy": 0.6907599717378616, "num_tokens": 6481488.0, "step": 244 }, { "entropy": 1.1874423921108246, "epoch": 0.4612850082372323, "grad_norm": 0.14292332530021667, "learning_rate": 0.00045413533834586466, "loss": 1.1531, "mean_token_accuracy": 0.6900304704904556, "num_tokens": 6509304.0, "step": 245 }, { "entropy": 1.1584448963403702, "epoch": 0.463167804189221, "grad_norm": 0.13040532171726227, "learning_rate": 0.0004539473684210527, "loss": 1.1492, "mean_token_accuracy": 0.6877822354435921, "num_tokens": 6536066.0, "step": 246 }, { "entropy": 1.1855371445417404, "epoch": 0.4650506001412097, "grad_norm": 0.13368549942970276, "learning_rate": 0.0004537593984962406, "loss": 1.1777, "mean_token_accuracy": 0.6852287128567696, "num_tokens": 6565018.0, "step": 247 }, { "entropy": 1.1443724185228348, "epoch": 0.4669333960931984, "grad_norm": 0.14028339087963104, "learning_rate": 0.00045357142857142856, "loss": 1.1356, "mean_token_accuracy": 0.6946588978171349, "num_tokens": 6592536.0, "step": 248 }, { "entropy": 1.1854888200759888, "epoch": 0.4688161920451871, "grad_norm": 0.13055366277694702, "learning_rate": 0.0004533834586466166, "loss": 1.1731, "mean_token_accuracy": 0.6873556599020958, "num_tokens": 6620329.0, "step": 249 }, { "entropy": 1.1635265052318573, "epoch": 0.4706989879971758, "grad_norm": 0.12299590557813644, "learning_rate": 0.0004531954887218045, "loss": 1.1174, "mean_token_accuracy": 0.6956649720668793, "num_tokens": 6647929.0, "step": 250 }, { "entropy": 1.1612417101860046, "epoch": 0.4725817839491645, "grad_norm": 0.14049823582172394, "learning_rate": 0.00045300751879699246, "loss": 1.1348, "mean_token_accuracy": 0.694083645939827, "num_tokens": 6674419.0, "step": 251 }, { "entropy": 1.2213299870491028, "epoch": 0.4744645799011532, "grad_norm": 0.13414214551448822, "learning_rate": 0.0004528195488721805, "loss": 1.2013, "mean_token_accuracy": 0.6825797632336617, "num_tokens": 6701851.0, "step": 252 }, { "entropy": 1.183507114648819, "epoch": 0.4763473758531419, "grad_norm": 0.15232087671756744, "learning_rate": 0.00045263157894736845, "loss": 1.162, "mean_token_accuracy": 0.6850753352046013, "num_tokens": 6729161.0, "step": 253 }, { "entropy": 1.0959549844264984, "epoch": 0.4782301718051306, "grad_norm": 0.12658758461475372, "learning_rate": 0.00045244360902255636, "loss": 1.0808, "mean_token_accuracy": 0.7000140845775604, "num_tokens": 6756047.0, "step": 254 }, { "entropy": 1.193654179573059, "epoch": 0.4801129677571193, "grad_norm": 0.14304682612419128, "learning_rate": 0.0004522556390977444, "loss": 1.1611, "mean_token_accuracy": 0.6860647276043892, "num_tokens": 6782155.0, "step": 255 }, { "entropy": 1.189740851521492, "epoch": 0.481995763709108, "grad_norm": 0.1279287487268448, "learning_rate": 0.00045206766917293235, "loss": 1.1533, "mean_token_accuracy": 0.6969729140400887, "num_tokens": 6809906.0, "step": 256 }, { "entropy": 1.1370235309004784, "epoch": 0.4838785596610967, "grad_norm": 0.12549139559268951, "learning_rate": 0.0004518796992481203, "loss": 1.1005, "mean_token_accuracy": 0.6986983045935631, "num_tokens": 6837978.0, "step": 257 }, { "entropy": 1.1274943947792053, "epoch": 0.4857613556130854, "grad_norm": 0.13078007102012634, "learning_rate": 0.0004516917293233083, "loss": 1.116, "mean_token_accuracy": 0.6968672722578049, "num_tokens": 6863894.0, "step": 258 }, { "entropy": 1.1707115471363068, "epoch": 0.4876441515650741, "grad_norm": 0.13655990362167358, "learning_rate": 0.00045150375939849625, "loss": 1.1502, "mean_token_accuracy": 0.6891424879431725, "num_tokens": 6889219.0, "step": 259 }, { "entropy": 1.1765428930521011, "epoch": 0.4895269475170628, "grad_norm": 0.13517631590366364, "learning_rate": 0.0004513157894736842, "loss": 1.1736, "mean_token_accuracy": 0.6828250586986542, "num_tokens": 6915957.0, "step": 260 }, { "entropy": 1.1622217297554016, "epoch": 0.49140974346905153, "grad_norm": 0.1339031159877777, "learning_rate": 0.0004511278195488722, "loss": 1.1602, "mean_token_accuracy": 0.6858406886458397, "num_tokens": 6942729.0, "step": 261 }, { "entropy": 1.188800647854805, "epoch": 0.49329253942104023, "grad_norm": 0.1516953706741333, "learning_rate": 0.00045093984962406015, "loss": 1.1541, "mean_token_accuracy": 0.6871596127748489, "num_tokens": 6966884.0, "step": 262 }, { "entropy": 1.1681264340877533, "epoch": 0.49517533537302894, "grad_norm": 0.14556634426116943, "learning_rate": 0.0004507518796992481, "loss": 1.1307, "mean_token_accuracy": 0.6948810294270515, "num_tokens": 6992842.0, "step": 263 }, { "entropy": 1.1910002678632736, "epoch": 0.49705813132501764, "grad_norm": 0.1371603161096573, "learning_rate": 0.00045056390977443614, "loss": 1.1469, "mean_token_accuracy": 0.6974197626113892, "num_tokens": 7018704.0, "step": 264 }, { "entropy": 1.2533641755580902, "epoch": 0.49894092727700634, "grad_norm": 0.15122705698013306, "learning_rate": 0.00045037593984962405, "loss": 1.1964, "mean_token_accuracy": 0.6835278943181038, "num_tokens": 7045985.0, "step": 265 }, { "entropy": 1.18770419806242, "epoch": 0.500823723228995, "grad_norm": 0.1283893585205078, "learning_rate": 0.000450187969924812, "loss": 1.1613, "mean_token_accuracy": 0.6964623779058456, "num_tokens": 7073668.0, "step": 266 }, { "entropy": 1.1760464161634445, "epoch": 0.5027065191809837, "grad_norm": 0.13645370304584503, "learning_rate": 0.00045000000000000004, "loss": 1.181, "mean_token_accuracy": 0.6851188093423843, "num_tokens": 7100612.0, "step": 267 }, { "entropy": 1.1559069901704788, "epoch": 0.5045893151329724, "grad_norm": 0.14222431182861328, "learning_rate": 0.000449812030075188, "loss": 1.1661, "mean_token_accuracy": 0.6858489215373993, "num_tokens": 7127648.0, "step": 268 }, { "entropy": 1.155109003186226, "epoch": 0.5064721110849612, "grad_norm": 0.14752890169620514, "learning_rate": 0.0004496240601503759, "loss": 1.1549, "mean_token_accuracy": 0.6923946589231491, "num_tokens": 7153048.0, "step": 269 }, { "entropy": 1.2506433129310608, "epoch": 0.5083549070369499, "grad_norm": 0.14298772811889648, "learning_rate": 0.00044943609022556394, "loss": 1.193, "mean_token_accuracy": 0.684316597878933, "num_tokens": 7177628.0, "step": 270 }, { "entropy": 1.2653572857379913, "epoch": 0.5102377029889386, "grad_norm": 0.167319193482399, "learning_rate": 0.0004492481203007519, "loss": 1.1959, "mean_token_accuracy": 0.6871765851974487, "num_tokens": 7201577.0, "step": 271 }, { "entropy": 1.2064370959997177, "epoch": 0.5121204989409273, "grad_norm": 0.15246403217315674, "learning_rate": 0.0004490601503759398, "loss": 1.1574, "mean_token_accuracy": 0.6841192170977592, "num_tokens": 7226259.0, "step": 272 }, { "entropy": 1.1363181620836258, "epoch": 0.514003294892916, "grad_norm": 0.13937003910541534, "learning_rate": 0.00044887218045112784, "loss": 1.1257, "mean_token_accuracy": 0.6941032037138939, "num_tokens": 7253373.0, "step": 273 }, { "entropy": 1.1732933074235916, "epoch": 0.5158860908449047, "grad_norm": 0.14371132850646973, "learning_rate": 0.0004486842105263158, "loss": 1.1715, "mean_token_accuracy": 0.6919308379292488, "num_tokens": 7278945.0, "step": 274 }, { "entropy": 1.175576038658619, "epoch": 0.5177688867968934, "grad_norm": 0.1441759318113327, "learning_rate": 0.0004484962406015038, "loss": 1.1515, "mean_token_accuracy": 0.694126233458519, "num_tokens": 7305391.0, "step": 275 }, { "entropy": 1.2058104127645493, "epoch": 0.5196516827488821, "grad_norm": 0.13355745375156403, "learning_rate": 0.00044830827067669174, "loss": 1.1916, "mean_token_accuracy": 0.687326617538929, "num_tokens": 7332607.0, "step": 276 }, { "entropy": 1.2485528588294983, "epoch": 0.5215344787008708, "grad_norm": 0.14986877143383026, "learning_rate": 0.0004481203007518797, "loss": 1.2103, "mean_token_accuracy": 0.6793005913496017, "num_tokens": 7358139.0, "step": 277 }, { "entropy": 1.187769129872322, "epoch": 0.5234172746528595, "grad_norm": 0.14205658435821533, "learning_rate": 0.0004479323308270677, "loss": 1.1564, "mean_token_accuracy": 0.6925127878785133, "num_tokens": 7384537.0, "step": 278 }, { "entropy": 1.1303328722715378, "epoch": 0.5253000706048482, "grad_norm": 0.14045588672161102, "learning_rate": 0.00044774436090225565, "loss": 1.1287, "mean_token_accuracy": 0.6949460133910179, "num_tokens": 7411036.0, "step": 279 }, { "entropy": 1.2028415352106094, "epoch": 0.5271828665568369, "grad_norm": 0.1550549864768982, "learning_rate": 0.0004475563909774436, "loss": 1.2004, "mean_token_accuracy": 0.6846116036176682, "num_tokens": 7437443.0, "step": 280 }, { "entropy": 1.182666465640068, "epoch": 0.5290656625088256, "grad_norm": 0.2469193935394287, "learning_rate": 0.0004473684210526316, "loss": 1.1759, "mean_token_accuracy": 0.6844401434063911, "num_tokens": 7462227.0, "step": 281 }, { "entropy": 1.202811524271965, "epoch": 0.5309484584608143, "grad_norm": 0.14160913228988647, "learning_rate": 0.0004471804511278196, "loss": 1.1957, "mean_token_accuracy": 0.6817988455295563, "num_tokens": 7487080.0, "step": 282 }, { "entropy": 1.1812713742256165, "epoch": 0.532831254412803, "grad_norm": 0.15075385570526123, "learning_rate": 0.0004469924812030075, "loss": 1.1481, "mean_token_accuracy": 0.6930856108665466, "num_tokens": 7511921.0, "step": 283 }, { "entropy": 1.2214877009391785, "epoch": 0.5347140503647917, "grad_norm": 0.1399138867855072, "learning_rate": 0.0004468045112781955, "loss": 1.1678, "mean_token_accuracy": 0.6885346695780754, "num_tokens": 7538663.0, "step": 284 }, { "entropy": 1.2207457572221756, "epoch": 0.5365968463167804, "grad_norm": 0.16030077636241913, "learning_rate": 0.0004466165413533835, "loss": 1.1498, "mean_token_accuracy": 0.6934774816036224, "num_tokens": 7563898.0, "step": 285 }, { "entropy": 1.1787783950567245, "epoch": 0.5384796422687691, "grad_norm": 0.13601085543632507, "learning_rate": 0.00044642857142857147, "loss": 1.145, "mean_token_accuracy": 0.6905470564961433, "num_tokens": 7590702.0, "step": 286 }, { "entropy": 1.081341713666916, "epoch": 0.5403624382207578, "grad_norm": 0.13594649732112885, "learning_rate": 0.0004462406015037594, "loss": 1.0881, "mean_token_accuracy": 0.7003285214304924, "num_tokens": 7618002.0, "step": 287 }, { "entropy": 1.1418119072914124, "epoch": 0.5422452341727465, "grad_norm": 0.15701550245285034, "learning_rate": 0.0004460526315789474, "loss": 1.1544, "mean_token_accuracy": 0.6906085088849068, "num_tokens": 7644482.0, "step": 288 }, { "entropy": 1.1627637073397636, "epoch": 0.5441280301247352, "grad_norm": 0.13722968101501465, "learning_rate": 0.00044586466165413537, "loss": 1.1586, "mean_token_accuracy": 0.6932996585965157, "num_tokens": 7671479.0, "step": 289 }, { "entropy": 1.1320042312145233, "epoch": 0.5460108260767239, "grad_norm": 0.15330596268177032, "learning_rate": 0.0004456766917293233, "loss": 1.108, "mean_token_accuracy": 0.6965923383831978, "num_tokens": 7697013.0, "step": 290 }, { "entropy": 1.2310521453619003, "epoch": 0.5478936220287126, "grad_norm": 0.14045506715774536, "learning_rate": 0.00044548872180451125, "loss": 1.1978, "mean_token_accuracy": 0.6855576112866402, "num_tokens": 7722551.0, "step": 291 }, { "entropy": 1.1880534440279007, "epoch": 0.5497764179807013, "grad_norm": 0.14293448626995087, "learning_rate": 0.00044530075187969927, "loss": 1.1251, "mean_token_accuracy": 0.701711505651474, "num_tokens": 7748016.0, "step": 292 }, { "entropy": 1.141702115535736, "epoch": 0.55165921393269, "grad_norm": 0.1439259648323059, "learning_rate": 0.00044511278195488724, "loss": 1.1361, "mean_token_accuracy": 0.6944170445203781, "num_tokens": 7774858.0, "step": 293 }, { "entropy": 1.1963759511709213, "epoch": 0.5535420098846787, "grad_norm": 0.15148387849330902, "learning_rate": 0.00044492481203007515, "loss": 1.1768, "mean_token_accuracy": 0.6924594268202782, "num_tokens": 7800802.0, "step": 294 }, { "entropy": 1.2073182165622711, "epoch": 0.5554248058366674, "grad_norm": 0.14503706991672516, "learning_rate": 0.00044473684210526317, "loss": 1.2075, "mean_token_accuracy": 0.6802205815911293, "num_tokens": 7825288.0, "step": 295 }, { "entropy": 1.1897266507148743, "epoch": 0.5573076017886561, "grad_norm": 0.13914930820465088, "learning_rate": 0.00044454887218045114, "loss": 1.1668, "mean_token_accuracy": 0.6842218562960625, "num_tokens": 7853255.0, "step": 296 }, { "entropy": 1.138252004981041, "epoch": 0.5591903977406448, "grad_norm": 0.1277482956647873, "learning_rate": 0.0004443609022556391, "loss": 1.095, "mean_token_accuracy": 0.6993494555354118, "num_tokens": 7880497.0, "step": 297 }, { "entropy": 1.1767967641353607, "epoch": 0.5610731936926335, "grad_norm": 0.14053884148597717, "learning_rate": 0.00044417293233082707, "loss": 1.1443, "mean_token_accuracy": 0.6948733255267143, "num_tokens": 7906730.0, "step": 298 }, { "entropy": 1.2134106159210205, "epoch": 0.5629559896446222, "grad_norm": 0.14005884528160095, "learning_rate": 0.00044398496240601504, "loss": 1.1822, "mean_token_accuracy": 0.6892389133572578, "num_tokens": 7933216.0, "step": 299 }, { "entropy": 1.1945680975914001, "epoch": 0.5648387855966109, "grad_norm": 0.1356893926858902, "learning_rate": 0.000443796992481203, "loss": 1.1689, "mean_token_accuracy": 0.6882117986679077, "num_tokens": 7960270.0, "step": 300 }, { "entropy": 1.1890588849782944, "epoch": 0.5667215815485996, "grad_norm": 0.14139321446418762, "learning_rate": 0.000443609022556391, "loss": 1.1757, "mean_token_accuracy": 0.6851599663496017, "num_tokens": 7987900.0, "step": 301 }, { "entropy": 1.1338028833270073, "epoch": 0.5686043775005883, "grad_norm": 0.14264994859695435, "learning_rate": 0.00044342105263157894, "loss": 1.1502, "mean_token_accuracy": 0.6855240687727928, "num_tokens": 8013351.0, "step": 302 }, { "entropy": 1.1318519860506058, "epoch": 0.570487173452577, "grad_norm": 0.13565586507320404, "learning_rate": 0.0004432330827067669, "loss": 1.1165, "mean_token_accuracy": 0.6999509632587433, "num_tokens": 8038918.0, "step": 303 }, { "entropy": 1.2122758030891418, "epoch": 0.5723699694045657, "grad_norm": 0.13487568497657776, "learning_rate": 0.00044304511278195493, "loss": 1.1738, "mean_token_accuracy": 0.681725949048996, "num_tokens": 8066501.0, "step": 304 }, { "entropy": 1.1797229945659637, "epoch": 0.5742527653565545, "grad_norm": 0.13627903163433075, "learning_rate": 0.00044285714285714284, "loss": 1.1376, "mean_token_accuracy": 0.689607098698616, "num_tokens": 8093242.0, "step": 305 }, { "entropy": 1.1857865750789642, "epoch": 0.5761355613085432, "grad_norm": 0.13779953122138977, "learning_rate": 0.0004426691729323308, "loss": 1.1367, "mean_token_accuracy": 0.6948609203100204, "num_tokens": 8121053.0, "step": 306 }, { "entropy": 1.1960344910621643, "epoch": 0.5780183572605319, "grad_norm": 0.13792765140533447, "learning_rate": 0.00044248120300751883, "loss": 1.1472, "mean_token_accuracy": 0.6897515431046486, "num_tokens": 8147832.0, "step": 307 }, { "entropy": 1.19243024289608, "epoch": 0.5799011532125206, "grad_norm": 0.1438818722963333, "learning_rate": 0.0004422932330827068, "loss": 1.1905, "mean_token_accuracy": 0.6858177557587624, "num_tokens": 8173841.0, "step": 308 }, { "entropy": 1.211151197552681, "epoch": 0.5817839491645093, "grad_norm": 0.1361284852027893, "learning_rate": 0.0004421052631578947, "loss": 1.214, "mean_token_accuracy": 0.67852383852005, "num_tokens": 8202120.0, "step": 309 }, { "entropy": 1.1578274965286255, "epoch": 0.583666745116498, "grad_norm": 0.14872749149799347, "learning_rate": 0.00044191729323308273, "loss": 1.1497, "mean_token_accuracy": 0.6920148581266403, "num_tokens": 8229217.0, "step": 310 }, { "entropy": 1.1631289571523666, "epoch": 0.5855495410684867, "grad_norm": 0.15371911227703094, "learning_rate": 0.0004417293233082707, "loss": 1.1437, "mean_token_accuracy": 0.6945102214813232, "num_tokens": 8254581.0, "step": 311 }, { "entropy": 1.1813505440950394, "epoch": 0.5874323370204754, "grad_norm": 0.14172406494617462, "learning_rate": 0.0004415413533834586, "loss": 1.1445, "mean_token_accuracy": 0.7006291374564171, "num_tokens": 8280615.0, "step": 312 }, { "entropy": 1.1823447942733765, "epoch": 0.5893151329724641, "grad_norm": 0.14375410974025726, "learning_rate": 0.00044135338345864663, "loss": 1.1497, "mean_token_accuracy": 0.6918843537569046, "num_tokens": 8307395.0, "step": 313 }, { "entropy": 1.1527684777975082, "epoch": 0.5911979289244528, "grad_norm": 0.1389397829771042, "learning_rate": 0.0004411654135338346, "loss": 1.1189, "mean_token_accuracy": 0.6944358944892883, "num_tokens": 8332107.0, "step": 314 }, { "entropy": 1.165027841925621, "epoch": 0.5930807248764415, "grad_norm": 0.14531069993972778, "learning_rate": 0.00044097744360902257, "loss": 1.161, "mean_token_accuracy": 0.6896175295114517, "num_tokens": 8358194.0, "step": 315 }, { "entropy": 1.2045851200819016, "epoch": 0.5949635208284302, "grad_norm": 0.1540374457836151, "learning_rate": 0.00044078947368421053, "loss": 1.1797, "mean_token_accuracy": 0.6859044209122658, "num_tokens": 8386180.0, "step": 316 }, { "entropy": 1.194406397640705, "epoch": 0.5968463167804189, "grad_norm": 0.14392457902431488, "learning_rate": 0.0004406015037593985, "loss": 1.1483, "mean_token_accuracy": 0.6856495141983032, "num_tokens": 8412257.0, "step": 317 }, { "entropy": 1.1843983232975006, "epoch": 0.5987291127324076, "grad_norm": 0.12984612584114075, "learning_rate": 0.00044041353383458647, "loss": 1.159, "mean_token_accuracy": 0.6899672672152519, "num_tokens": 8440139.0, "step": 318 }, { "entropy": 1.159614846110344, "epoch": 0.6006119086843963, "grad_norm": 0.13649439811706543, "learning_rate": 0.00044022556390977443, "loss": 1.1277, "mean_token_accuracy": 0.6980894953012466, "num_tokens": 8466297.0, "step": 319 }, { "entropy": 1.1729088872671127, "epoch": 0.602494704636385, "grad_norm": 0.14619147777557373, "learning_rate": 0.0004400375939849624, "loss": 1.1511, "mean_token_accuracy": 0.6904428154230118, "num_tokens": 8492672.0, "step": 320 }, { "entropy": 1.1907424926757812, "epoch": 0.6043775005883737, "grad_norm": 0.14279942214488983, "learning_rate": 0.00043984962406015037, "loss": 1.1775, "mean_token_accuracy": 0.6842730417847633, "num_tokens": 8521582.0, "step": 321 }, { "entropy": 1.1668616235256195, "epoch": 0.6062602965403624, "grad_norm": 0.1608172506093979, "learning_rate": 0.0004396616541353384, "loss": 1.1169, "mean_token_accuracy": 0.6961806491017342, "num_tokens": 8549037.0, "step": 322 }, { "entropy": 1.172086626291275, "epoch": 0.6081430924923511, "grad_norm": 0.13843871653079987, "learning_rate": 0.0004394736842105263, "loss": 1.1337, "mean_token_accuracy": 0.6961240246891975, "num_tokens": 8577320.0, "step": 323 }, { "entropy": 1.1471307575702667, "epoch": 0.6100258884443398, "grad_norm": 0.17384615540504456, "learning_rate": 0.00043928571428571427, "loss": 1.132, "mean_token_accuracy": 0.6966283246874809, "num_tokens": 8604513.0, "step": 324 }, { "entropy": 1.1775583177804947, "epoch": 0.6119086843963285, "grad_norm": 0.1405702829360962, "learning_rate": 0.0004390977443609023, "loss": 1.1713, "mean_token_accuracy": 0.6833978369832039, "num_tokens": 8631088.0, "step": 325 }, { "entropy": 1.1986607536673546, "epoch": 0.6137914803483172, "grad_norm": 0.17384964227676392, "learning_rate": 0.00043890977443609026, "loss": 1.1903, "mean_token_accuracy": 0.6892447099089622, "num_tokens": 8658317.0, "step": 326 }, { "entropy": 1.1727805137634277, "epoch": 0.6156742763003059, "grad_norm": 0.14653940498828888, "learning_rate": 0.00043872180451127817, "loss": 1.1706, "mean_token_accuracy": 0.6892889738082886, "num_tokens": 8685883.0, "step": 327 }, { "entropy": 1.1792996972799301, "epoch": 0.6175570722522946, "grad_norm": 0.14093339443206787, "learning_rate": 0.0004385338345864662, "loss": 1.1659, "mean_token_accuracy": 0.6881109997630119, "num_tokens": 8710584.0, "step": 328 }, { "entropy": 1.1784557923674583, "epoch": 0.6194398682042833, "grad_norm": 0.14964358508586884, "learning_rate": 0.00043834586466165416, "loss": 1.1098, "mean_token_accuracy": 0.6995358616113663, "num_tokens": 8737455.0, "step": 329 }, { "entropy": 1.2075697928667068, "epoch": 0.621322664156272, "grad_norm": 0.14746899902820587, "learning_rate": 0.00043815789473684207, "loss": 1.1564, "mean_token_accuracy": 0.6904364302754402, "num_tokens": 8764718.0, "step": 330 }, { "entropy": 1.259048119187355, "epoch": 0.6232054601082607, "grad_norm": 0.13727432489395142, "learning_rate": 0.0004379699248120301, "loss": 1.2152, "mean_token_accuracy": 0.6816830709576607, "num_tokens": 8792699.0, "step": 331 }, { "entropy": 1.176329106092453, "epoch": 0.6250882560602494, "grad_norm": 0.13555607199668884, "learning_rate": 0.00043778195488721806, "loss": 1.1337, "mean_token_accuracy": 0.6938095465302467, "num_tokens": 8818252.0, "step": 332 }, { "entropy": 1.1746894717216492, "epoch": 0.6269710520122381, "grad_norm": 0.14540338516235352, "learning_rate": 0.000437593984962406, "loss": 1.1678, "mean_token_accuracy": 0.6856407299637794, "num_tokens": 8843904.0, "step": 333 }, { "entropy": 1.143667384982109, "epoch": 0.6288538479642268, "grad_norm": 0.17852836847305298, "learning_rate": 0.000437406015037594, "loss": 1.1471, "mean_token_accuracy": 0.6907041072845459, "num_tokens": 8868115.0, "step": 334 }, { "entropy": 1.1293998435139656, "epoch": 0.6307366439162155, "grad_norm": 0.13162344694137573, "learning_rate": 0.00043721804511278196, "loss": 1.123, "mean_token_accuracy": 0.7001049220561981, "num_tokens": 8894871.0, "step": 335 }, { "entropy": 1.1313979178667068, "epoch": 0.6326194398682042, "grad_norm": 0.1321536898612976, "learning_rate": 0.0004370300751879699, "loss": 1.0987, "mean_token_accuracy": 0.7042840495705605, "num_tokens": 8921413.0, "step": 336 }, { "entropy": 1.22024667263031, "epoch": 0.6345022358201929, "grad_norm": 0.14904777705669403, "learning_rate": 0.00043684210526315795, "loss": 1.1685, "mean_token_accuracy": 0.6839649677276611, "num_tokens": 8948016.0, "step": 337 }, { "entropy": 1.200153261423111, "epoch": 0.6363850317721816, "grad_norm": 0.15332205593585968, "learning_rate": 0.00043665413533834586, "loss": 1.1599, "mean_token_accuracy": 0.6898418813943863, "num_tokens": 8974626.0, "step": 338 }, { "entropy": 1.148691438138485, "epoch": 0.6382678277241703, "grad_norm": 0.1428363174200058, "learning_rate": 0.00043646616541353383, "loss": 1.1403, "mean_token_accuracy": 0.6996031925082207, "num_tokens": 9001421.0, "step": 339 }, { "entropy": 1.1665330827236176, "epoch": 0.640150623676159, "grad_norm": 0.1439882218837738, "learning_rate": 0.00043627819548872185, "loss": 1.1849, "mean_token_accuracy": 0.6867435649037361, "num_tokens": 9028615.0, "step": 340 }, { "entropy": 1.1208850890398026, "epoch": 0.6420334196281478, "grad_norm": 0.14697298407554626, "learning_rate": 0.00043609022556390976, "loss": 1.1336, "mean_token_accuracy": 0.6952601596713066, "num_tokens": 9056227.0, "step": 341 }, { "entropy": 1.1804025322198868, "epoch": 0.6439162155801365, "grad_norm": 0.13762733340263367, "learning_rate": 0.00043590225563909773, "loss": 1.1556, "mean_token_accuracy": 0.6842042878270149, "num_tokens": 9081334.0, "step": 342 }, { "entropy": 1.225020870566368, "epoch": 0.6457990115321252, "grad_norm": 0.15140774846076965, "learning_rate": 0.00043571428571428575, "loss": 1.1576, "mean_token_accuracy": 0.6892690062522888, "num_tokens": 9107740.0, "step": 343 }, { "entropy": 1.178776428103447, "epoch": 0.6476818074841139, "grad_norm": 0.14922155439853668, "learning_rate": 0.0004355263157894737, "loss": 1.119, "mean_token_accuracy": 0.6988128572702408, "num_tokens": 9134004.0, "step": 344 }, { "entropy": 1.1870884746313095, "epoch": 0.6495646034361026, "grad_norm": 0.13645216822624207, "learning_rate": 0.00043533834586466163, "loss": 1.1258, "mean_token_accuracy": 0.7014844194054604, "num_tokens": 9161858.0, "step": 345 }, { "entropy": 1.1208381354808807, "epoch": 0.6514473993880913, "grad_norm": 0.15188747644424438, "learning_rate": 0.00043515037593984965, "loss": 1.126, "mean_token_accuracy": 0.6888753995299339, "num_tokens": 9187924.0, "step": 346 }, { "entropy": 1.1246383488178253, "epoch": 0.65333019534008, "grad_norm": 0.18039844930171967, "learning_rate": 0.0004349624060150376, "loss": 1.1297, "mean_token_accuracy": 0.6954269483685493, "num_tokens": 9213952.0, "step": 347 }, { "entropy": 1.181724175810814, "epoch": 0.6552129912920687, "grad_norm": 0.13552230596542358, "learning_rate": 0.0004347744360902256, "loss": 1.185, "mean_token_accuracy": 0.682334654033184, "num_tokens": 9240003.0, "step": 348 }, { "entropy": 1.161278709769249, "epoch": 0.6570957872440574, "grad_norm": 0.13721586763858795, "learning_rate": 0.00043458646616541355, "loss": 1.1323, "mean_token_accuracy": 0.6919213533401489, "num_tokens": 9265180.0, "step": 349 }, { "entropy": 1.167539969086647, "epoch": 0.6589785831960461, "grad_norm": 0.145475372672081, "learning_rate": 0.0004343984962406015, "loss": 1.1342, "mean_token_accuracy": 0.6932244300842285, "num_tokens": 9291467.0, "step": 350 }, { "entropy": 1.2319505363702774, "epoch": 0.6608613791480348, "grad_norm": 0.13839372992515564, "learning_rate": 0.0004342105263157895, "loss": 1.2132, "mean_token_accuracy": 0.6786127388477325, "num_tokens": 9317382.0, "step": 351 }, { "entropy": 1.2023252993822098, "epoch": 0.6627441751000235, "grad_norm": 0.1364511102437973, "learning_rate": 0.00043402255639097745, "loss": 1.19, "mean_token_accuracy": 0.6843428909778595, "num_tokens": 9343464.0, "step": 352 }, { "entropy": 1.173360899090767, "epoch": 0.6646269710520122, "grad_norm": 0.1326543539762497, "learning_rate": 0.0004338345864661654, "loss": 1.1469, "mean_token_accuracy": 0.6877379715442657, "num_tokens": 9371170.0, "step": 353 }, { "entropy": 1.1177352517843246, "epoch": 0.6665097670040009, "grad_norm": 0.1422666758298874, "learning_rate": 0.0004336466165413534, "loss": 1.0994, "mean_token_accuracy": 0.700407862663269, "num_tokens": 9397147.0, "step": 354 }, { "entropy": 1.248588040471077, "epoch": 0.6683925629559897, "grad_norm": 0.13168664276599884, "learning_rate": 0.0004334586466165414, "loss": 1.2098, "mean_token_accuracy": 0.6834209859371185, "num_tokens": 9424363.0, "step": 355 }, { "entropy": 1.1617062538862228, "epoch": 0.6702753589079784, "grad_norm": 0.15483741462230682, "learning_rate": 0.0004332706766917293, "loss": 1.114, "mean_token_accuracy": 0.7020522281527519, "num_tokens": 9450742.0, "step": 356 }, { "entropy": 1.1978859603404999, "epoch": 0.6721581548599671, "grad_norm": 0.14632469415664673, "learning_rate": 0.0004330827067669173, "loss": 1.1847, "mean_token_accuracy": 0.6837000176310539, "num_tokens": 9475697.0, "step": 357 }, { "entropy": 1.1161824762821198, "epoch": 0.6740409508119558, "grad_norm": 0.14072488248348236, "learning_rate": 0.0004328947368421053, "loss": 1.1272, "mean_token_accuracy": 0.6974566504359245, "num_tokens": 9502237.0, "step": 358 }, { "entropy": 1.1397125273942947, "epoch": 0.6759237467639445, "grad_norm": 0.148344486951828, "learning_rate": 0.0004327067669172932, "loss": 1.1453, "mean_token_accuracy": 0.6873810589313507, "num_tokens": 9528201.0, "step": 359 }, { "entropy": 1.2197502925992012, "epoch": 0.6778065427159332, "grad_norm": 0.14831538498401642, "learning_rate": 0.0004325187969924812, "loss": 1.1981, "mean_token_accuracy": 0.6797335669398308, "num_tokens": 9553887.0, "step": 360 }, { "entropy": 1.2503347992897034, "epoch": 0.6796893386679219, "grad_norm": 0.14289598166942596, "learning_rate": 0.0004323308270676692, "loss": 1.1754, "mean_token_accuracy": 0.682529591023922, "num_tokens": 9578439.0, "step": 361 }, { "entropy": 1.2314954698085785, "epoch": 0.6815721346199106, "grad_norm": 0.14386345446109772, "learning_rate": 0.0004321428571428572, "loss": 1.1499, "mean_token_accuracy": 0.6907836198806763, "num_tokens": 9603444.0, "step": 362 }, { "entropy": 1.2456393241882324, "epoch": 0.6834549305718993, "grad_norm": 0.14364264905452728, "learning_rate": 0.0004319548872180451, "loss": 1.1933, "mean_token_accuracy": 0.6874497607350349, "num_tokens": 9629030.0, "step": 363 }, { "entropy": 1.1722253412008286, "epoch": 0.685337726523888, "grad_norm": 0.1491105556488037, "learning_rate": 0.0004317669172932331, "loss": 1.152, "mean_token_accuracy": 0.6939368024468422, "num_tokens": 9656342.0, "step": 364 }, { "entropy": 1.0892303064465523, "epoch": 0.6872205224758767, "grad_norm": 0.14881175756454468, "learning_rate": 0.0004315789473684211, "loss": 1.0922, "mean_token_accuracy": 0.7064904496073723, "num_tokens": 9680706.0, "step": 365 }, { "entropy": 1.090978980064392, "epoch": 0.6891033184278654, "grad_norm": 0.14446662366390228, "learning_rate": 0.00043139097744360904, "loss": 1.1148, "mean_token_accuracy": 0.696795642375946, "num_tokens": 9705331.0, "step": 366 }, { "entropy": 1.1398785412311554, "epoch": 0.6909861143798541, "grad_norm": 0.13684354722499847, "learning_rate": 0.000431203007518797, "loss": 1.1497, "mean_token_accuracy": 0.6912109777331352, "num_tokens": 9732400.0, "step": 367 }, { "entropy": 1.17644502222538, "epoch": 0.6928689103318428, "grad_norm": 0.14162884652614594, "learning_rate": 0.000431015037593985, "loss": 1.1495, "mean_token_accuracy": 0.6945677846670151, "num_tokens": 9758948.0, "step": 368 }, { "entropy": 1.1725402027368546, "epoch": 0.6947517062838315, "grad_norm": 0.13373105227947235, "learning_rate": 0.00043082706766917295, "loss": 1.1186, "mean_token_accuracy": 0.7017792239785194, "num_tokens": 9786609.0, "step": 369 }, { "entropy": 1.1570321172475815, "epoch": 0.6966345022358202, "grad_norm": 0.13376620411872864, "learning_rate": 0.0004306390977443609, "loss": 1.1169, "mean_token_accuracy": 0.7013789564371109, "num_tokens": 9815091.0, "step": 370 }, { "entropy": 1.2269478738307953, "epoch": 0.6985172981878089, "grad_norm": 0.15718406438827515, "learning_rate": 0.0004304511278195489, "loss": 1.1795, "mean_token_accuracy": 0.6809123381972313, "num_tokens": 9838924.0, "step": 371 }, { "entropy": 1.2373632341623306, "epoch": 0.7004000941397976, "grad_norm": 0.13601046800613403, "learning_rate": 0.00043026315789473685, "loss": 1.1897, "mean_token_accuracy": 0.6842946112155914, "num_tokens": 9865745.0, "step": 372 }, { "entropy": 1.2175681740045547, "epoch": 0.7022828900917863, "grad_norm": 0.14760908484458923, "learning_rate": 0.00043007518796992487, "loss": 1.2027, "mean_token_accuracy": 0.680089496076107, "num_tokens": 9891103.0, "step": 373 }, { "entropy": 1.187382310628891, "epoch": 0.704165686043775, "grad_norm": 0.15881404280662537, "learning_rate": 0.0004298872180451128, "loss": 1.183, "mean_token_accuracy": 0.6840859726071358, "num_tokens": 9916491.0, "step": 374 }, { "entropy": 1.1363441050052643, "epoch": 0.7060484819957638, "grad_norm": 0.14100411534309387, "learning_rate": 0.00042969924812030075, "loss": 1.1268, "mean_token_accuracy": 0.6940664201974869, "num_tokens": 9943115.0, "step": 375 }, { "entropy": 1.1373258829116821, "epoch": 0.7079312779477525, "grad_norm": 0.14058925211429596, "learning_rate": 0.00042951127819548877, "loss": 1.1312, "mean_token_accuracy": 0.6918314695358276, "num_tokens": 9971012.0, "step": 376 }, { "entropy": 1.1753637194633484, "epoch": 0.7098140738997412, "grad_norm": 0.15900634229183197, "learning_rate": 0.00042932330827067674, "loss": 1.1532, "mean_token_accuracy": 0.688523419201374, "num_tokens": 9997158.0, "step": 377 }, { "entropy": 1.2038870453834534, "epoch": 0.7116968698517299, "grad_norm": 0.15579019486904144, "learning_rate": 0.00042913533834586465, "loss": 1.1634, "mean_token_accuracy": 0.6910874620079994, "num_tokens": 10023904.0, "step": 378 }, { "entropy": 1.2042047381401062, "epoch": 0.7135796658037186, "grad_norm": 0.1458210051059723, "learning_rate": 0.0004289473684210526, "loss": 1.1303, "mean_token_accuracy": 0.6955228298902512, "num_tokens": 10050044.0, "step": 379 }, { "entropy": 1.199434906244278, "epoch": 0.7154624617557073, "grad_norm": 0.13873904943466187, "learning_rate": 0.00042875939849624064, "loss": 1.143, "mean_token_accuracy": 0.6911288425326347, "num_tokens": 10077533.0, "step": 380 }, { "entropy": 1.179319679737091, "epoch": 0.717345257707696, "grad_norm": 0.15580423176288605, "learning_rate": 0.00042857142857142855, "loss": 1.1516, "mean_token_accuracy": 0.6900925859808922, "num_tokens": 10102103.0, "step": 381 }, { "entropy": 1.1498710662126541, "epoch": 0.7192280536596847, "grad_norm": 0.1526648849248886, "learning_rate": 0.0004283834586466165, "loss": 1.1463, "mean_token_accuracy": 0.6923620998859406, "num_tokens": 10127966.0, "step": 382 }, { "entropy": 1.2051638066768646, "epoch": 0.7211108496116734, "grad_norm": 0.14739763736724854, "learning_rate": 0.00042819548872180454, "loss": 1.2125, "mean_token_accuracy": 0.6824790090322495, "num_tokens": 10153724.0, "step": 383 }, { "entropy": 1.148889034986496, "epoch": 0.7229936455636621, "grad_norm": 0.13951475918293, "learning_rate": 0.0004280075187969925, "loss": 1.1431, "mean_token_accuracy": 0.6938719674944878, "num_tokens": 10178827.0, "step": 384 }, { "entropy": 1.1680803298950195, "epoch": 0.7248764415156508, "grad_norm": 0.14505353569984436, "learning_rate": 0.0004278195488721804, "loss": 1.1278, "mean_token_accuracy": 0.6925608888268471, "num_tokens": 10204362.0, "step": 385 }, { "entropy": 1.1652754694223404, "epoch": 0.7267592374676395, "grad_norm": 0.15343666076660156, "learning_rate": 0.00042763157894736844, "loss": 1.1347, "mean_token_accuracy": 0.6980648785829544, "num_tokens": 10232975.0, "step": 386 }, { "entropy": 1.1660331934690475, "epoch": 0.7286420334196282, "grad_norm": 0.6029819250106812, "learning_rate": 0.0004274436090225564, "loss": 1.1252, "mean_token_accuracy": 0.6913493424654007, "num_tokens": 10258684.0, "step": 387 }, { "entropy": 1.2207347601652145, "epoch": 0.7305248293716169, "grad_norm": 0.1639021635055542, "learning_rate": 0.00042725563909774437, "loss": 1.2, "mean_token_accuracy": 0.680275171995163, "num_tokens": 10284896.0, "step": 388 }, { "entropy": 1.1547054946422577, "epoch": 0.7324076253236056, "grad_norm": 0.13551250100135803, "learning_rate": 0.00042706766917293234, "loss": 1.153, "mean_token_accuracy": 0.6940227970480919, "num_tokens": 10312039.0, "step": 389 }, { "entropy": 1.173499509692192, "epoch": 0.7342904212755943, "grad_norm": 0.14394164085388184, "learning_rate": 0.0004268796992481203, "loss": 1.1401, "mean_token_accuracy": 0.6948181614279747, "num_tokens": 10338001.0, "step": 390 }, { "entropy": 1.108071744441986, "epoch": 0.736173217227583, "grad_norm": 0.15528494119644165, "learning_rate": 0.0004266917293233083, "loss": 1.0993, "mean_token_accuracy": 0.7045417055487633, "num_tokens": 10364257.0, "step": 391 }, { "entropy": 1.1832116544246674, "epoch": 0.7380560131795717, "grad_norm": 0.14551259577274323, "learning_rate": 0.00042650375939849624, "loss": 1.1514, "mean_token_accuracy": 0.6929153054952621, "num_tokens": 10389671.0, "step": 392 }, { "entropy": 1.1930436193943024, "epoch": 0.7399388091315604, "grad_norm": 0.15499240159988403, "learning_rate": 0.0004263157894736842, "loss": 1.1429, "mean_token_accuracy": 0.688226006925106, "num_tokens": 10415575.0, "step": 393 }, { "entropy": 1.2092433124780655, "epoch": 0.7418216050835491, "grad_norm": 0.15129360556602478, "learning_rate": 0.0004261278195488722, "loss": 1.1844, "mean_token_accuracy": 0.6808707118034363, "num_tokens": 10442443.0, "step": 394 }, { "entropy": 1.293672189116478, "epoch": 0.7437044010355378, "grad_norm": 0.1603565663099289, "learning_rate": 0.0004259398496240602, "loss": 1.2682, "mean_token_accuracy": 0.6722560822963715, "num_tokens": 10466233.0, "step": 395 }, { "entropy": 1.1358380764722824, "epoch": 0.7455871969875265, "grad_norm": 0.1485726684331894, "learning_rate": 0.0004257518796992481, "loss": 1.1388, "mean_token_accuracy": 0.6920513585209846, "num_tokens": 10491851.0, "step": 396 }, { "entropy": 1.13677416741848, "epoch": 0.7474699929395152, "grad_norm": 0.1432713270187378, "learning_rate": 0.0004255639097744361, "loss": 1.1244, "mean_token_accuracy": 0.6951583921909332, "num_tokens": 10518737.0, "step": 397 }, { "entropy": 1.2034449130296707, "epoch": 0.7493527888915039, "grad_norm": 0.16076122224330902, "learning_rate": 0.0004253759398496241, "loss": 1.2062, "mean_token_accuracy": 0.6785011366009712, "num_tokens": 10545857.0, "step": 398 }, { "entropy": 1.1623305827379227, "epoch": 0.7512355848434926, "grad_norm": 0.15050064027309418, "learning_rate": 0.000425187969924812, "loss": 1.1163, "mean_token_accuracy": 0.6948087736964226, "num_tokens": 10571770.0, "step": 399 }, { "entropy": 1.1117802858352661, "epoch": 0.7531183807954813, "grad_norm": 0.21685755252838135, "learning_rate": 0.000425, "loss": 1.0837, "mean_token_accuracy": 0.7059917375445366, "num_tokens": 10599528.0, "step": 400 }, { "entropy": 1.1872282922267914, "epoch": 0.75500117674747, "grad_norm": 0.1475781798362732, "learning_rate": 0.000424812030075188, "loss": 1.1617, "mean_token_accuracy": 0.6920499876141548, "num_tokens": 10625575.0, "step": 401 }, { "entropy": 1.1875766217708588, "epoch": 0.7568839726994587, "grad_norm": 0.15453127026557922, "learning_rate": 0.00042462406015037596, "loss": 1.1608, "mean_token_accuracy": 0.6888900995254517, "num_tokens": 10650929.0, "step": 402 }, { "entropy": 1.120169810950756, "epoch": 0.7587667686514474, "grad_norm": 0.14685072004795074, "learning_rate": 0.0004244360902255639, "loss": 1.0894, "mean_token_accuracy": 0.700760155916214, "num_tokens": 10677930.0, "step": 403 }, { "entropy": 1.178112044930458, "epoch": 0.7606495646034361, "grad_norm": 0.15392844378948212, "learning_rate": 0.0004242481203007519, "loss": 1.1488, "mean_token_accuracy": 0.6943765133619308, "num_tokens": 10701759.0, "step": 404 }, { "entropy": 1.139440432190895, "epoch": 0.7625323605554248, "grad_norm": 0.14876064658164978, "learning_rate": 0.00042406015037593987, "loss": 1.1175, "mean_token_accuracy": 0.6995274350047112, "num_tokens": 10727920.0, "step": 405 }, { "entropy": 1.1383692100644112, "epoch": 0.7644151565074135, "grad_norm": 0.16769041121006012, "learning_rate": 0.00042387218045112783, "loss": 1.1056, "mean_token_accuracy": 0.6987453699111938, "num_tokens": 10752826.0, "step": 406 }, { "entropy": 1.219818040728569, "epoch": 0.7662979524594022, "grad_norm": 0.16228246688842773, "learning_rate": 0.0004236842105263158, "loss": 1.1982, "mean_token_accuracy": 0.6772318556904793, "num_tokens": 10777756.0, "step": 407 }, { "entropy": 1.1474368646740913, "epoch": 0.768180748411391, "grad_norm": 0.14922939240932465, "learning_rate": 0.00042349624060150377, "loss": 1.1385, "mean_token_accuracy": 0.6920562386512756, "num_tokens": 10804768.0, "step": 408 }, { "entropy": 1.1331078857183456, "epoch": 0.7700635443633796, "grad_norm": 0.1535317599773407, "learning_rate": 0.00042330827067669173, "loss": 1.1359, "mean_token_accuracy": 0.6879219114780426, "num_tokens": 10830286.0, "step": 409 }, { "entropy": 1.146752119064331, "epoch": 0.7719463403153684, "grad_norm": 0.1524975448846817, "learning_rate": 0.0004231203007518797, "loss": 1.1448, "mean_token_accuracy": 0.6925338879227638, "num_tokens": 10855720.0, "step": 410 }, { "entropy": 1.13744555413723, "epoch": 0.773829136267357, "grad_norm": 0.16938121616840363, "learning_rate": 0.00042293233082706767, "loss": 1.1189, "mean_token_accuracy": 0.7019513100385666, "num_tokens": 10881312.0, "step": 411 }, { "entropy": 1.1643693000078201, "epoch": 0.7757119322193458, "grad_norm": 0.134382426738739, "learning_rate": 0.00042274436090225563, "loss": 1.1205, "mean_token_accuracy": 0.7012400701642036, "num_tokens": 10909609.0, "step": 412 }, { "entropy": 1.1546955406665802, "epoch": 0.7775947281713345, "grad_norm": 0.15923891961574554, "learning_rate": 0.00042255639097744366, "loss": 1.1025, "mean_token_accuracy": 0.7031391486525536, "num_tokens": 10937878.0, "step": 413 }, { "entropy": 1.1441723331809044, "epoch": 0.7794775241233232, "grad_norm": 0.16663163900375366, "learning_rate": 0.00042236842105263157, "loss": 1.1092, "mean_token_accuracy": 0.6957027688622475, "num_tokens": 10963268.0, "step": 414 }, { "entropy": 1.168132722377777, "epoch": 0.7813603200753119, "grad_norm": 0.13848932087421417, "learning_rate": 0.00042218045112781954, "loss": 1.132, "mean_token_accuracy": 0.6938114240765572, "num_tokens": 10990727.0, "step": 415 }, { "entropy": 1.1057742238044739, "epoch": 0.7832431160273006, "grad_norm": 0.13826268911361694, "learning_rate": 0.00042199248120300756, "loss": 1.0977, "mean_token_accuracy": 0.6982015743851662, "num_tokens": 11017384.0, "step": 416 }, { "entropy": 1.1963546127080917, "epoch": 0.7851259119792893, "grad_norm": 0.1429852694272995, "learning_rate": 0.0004218045112781955, "loss": 1.1883, "mean_token_accuracy": 0.6860344484448433, "num_tokens": 11045688.0, "step": 417 }, { "entropy": 1.1521967574954033, "epoch": 0.787008707931278, "grad_norm": 0.16643297672271729, "learning_rate": 0.00042161654135338344, "loss": 1.1547, "mean_token_accuracy": 0.6908131241798401, "num_tokens": 11070352.0, "step": 418 }, { "entropy": 1.1493701189756393, "epoch": 0.7888915038832667, "grad_norm": 0.15780487656593323, "learning_rate": 0.00042142857142857146, "loss": 1.1631, "mean_token_accuracy": 0.6898321136832237, "num_tokens": 11097217.0, "step": 419 }, { "entropy": 1.2399737238883972, "epoch": 0.7907742998352554, "grad_norm": 0.15339267253875732, "learning_rate": 0.0004212406015037594, "loss": 1.206, "mean_token_accuracy": 0.6820631548762321, "num_tokens": 11123692.0, "step": 420 }, { "entropy": 1.1258632093667984, "epoch": 0.7926570957872441, "grad_norm": 0.1442951112985611, "learning_rate": 0.00042105263157894734, "loss": 1.0869, "mean_token_accuracy": 0.7083057761192322, "num_tokens": 11149050.0, "step": 421 }, { "entropy": 1.2205425053834915, "epoch": 0.7945398917392328, "grad_norm": 0.1388903707265854, "learning_rate": 0.00042086466165413536, "loss": 1.1843, "mean_token_accuracy": 0.6856774613261223, "num_tokens": 11175990.0, "step": 422 }, { "entropy": 1.1613269746303558, "epoch": 0.7964226876912215, "grad_norm": 0.15723979473114014, "learning_rate": 0.0004206766917293233, "loss": 1.1238, "mean_token_accuracy": 0.6957441344857216, "num_tokens": 11203684.0, "step": 423 }, { "entropy": 1.15619857609272, "epoch": 0.7983054836432102, "grad_norm": 0.16091464459896088, "learning_rate": 0.0004204887218045113, "loss": 1.1275, "mean_token_accuracy": 0.6946544200181961, "num_tokens": 11230179.0, "step": 424 }, { "entropy": 1.2017978131771088, "epoch": 0.8001882795951989, "grad_norm": 0.15011471509933472, "learning_rate": 0.00042030075187969926, "loss": 1.1685, "mean_token_accuracy": 0.6920702531933784, "num_tokens": 11256384.0, "step": 425 }, { "entropy": 1.2229324877262115, "epoch": 0.8020710755471876, "grad_norm": 0.14569929242134094, "learning_rate": 0.0004201127819548872, "loss": 1.2065, "mean_token_accuracy": 0.6834921091794968, "num_tokens": 11284359.0, "step": 426 }, { "entropy": 1.1204483732581139, "epoch": 0.8039538714991763, "grad_norm": 0.14004987478256226, "learning_rate": 0.0004199248120300752, "loss": 1.1147, "mean_token_accuracy": 0.7033949047327042, "num_tokens": 11313184.0, "step": 427 }, { "entropy": 1.1141091734170914, "epoch": 0.805836667451165, "grad_norm": 0.14807014167308807, "learning_rate": 0.00041973684210526316, "loss": 1.1074, "mean_token_accuracy": 0.6922068670392036, "num_tokens": 11340757.0, "step": 428 }, { "entropy": 1.2002304196357727, "epoch": 0.8077194634031537, "grad_norm": 0.17711348831653595, "learning_rate": 0.00041954887218045113, "loss": 1.1973, "mean_token_accuracy": 0.6831801310181618, "num_tokens": 11366871.0, "step": 429 }, { "entropy": 1.2234468758106232, "epoch": 0.8096022593551424, "grad_norm": 0.16027556359767914, "learning_rate": 0.0004193609022556391, "loss": 1.1958, "mean_token_accuracy": 0.6806567907333374, "num_tokens": 11390392.0, "step": 430 }, { "entropy": 1.1892322599887848, "epoch": 0.8114850553071311, "grad_norm": 0.14892058074474335, "learning_rate": 0.0004191729323308271, "loss": 1.124, "mean_token_accuracy": 0.6932070925831795, "num_tokens": 11415883.0, "step": 431 }, { "entropy": 1.1975643932819366, "epoch": 0.8133678512591198, "grad_norm": 0.13819143176078796, "learning_rate": 0.00041898496240601503, "loss": 1.1446, "mean_token_accuracy": 0.6961016952991486, "num_tokens": 11445261.0, "step": 432 }, { "entropy": 1.231493815779686, "epoch": 0.8152506472111085, "grad_norm": 0.14783842861652374, "learning_rate": 0.000418796992481203, "loss": 1.1956, "mean_token_accuracy": 0.6879047080874443, "num_tokens": 11471660.0, "step": 433 }, { "entropy": 1.1187082305550575, "epoch": 0.8171334431630972, "grad_norm": 0.1379650980234146, "learning_rate": 0.000418609022556391, "loss": 1.1226, "mean_token_accuracy": 0.6993625611066818, "num_tokens": 11498274.0, "step": 434 }, { "entropy": 1.272495910525322, "epoch": 0.8190162391150859, "grad_norm": 0.1640465259552002, "learning_rate": 0.000418421052631579, "loss": 1.2792, "mean_token_accuracy": 0.6701348200440407, "num_tokens": 11525102.0, "step": 435 }, { "entropy": 1.1658570766448975, "epoch": 0.8208990350670746, "grad_norm": 0.14112910628318787, "learning_rate": 0.0004182330827067669, "loss": 1.171, "mean_token_accuracy": 0.6936748847365379, "num_tokens": 11555100.0, "step": 436 }, { "entropy": 1.2729250341653824, "epoch": 0.8227818310190633, "grad_norm": 0.15435785055160522, "learning_rate": 0.0004180451127819549, "loss": 1.2133, "mean_token_accuracy": 0.6812319383025169, "num_tokens": 11580101.0, "step": 437 }, { "entropy": 1.13491952419281, "epoch": 0.824664626971052, "grad_norm": 0.1388065367937088, "learning_rate": 0.0004178571428571429, "loss": 1.091, "mean_token_accuracy": 0.7023670971393585, "num_tokens": 11607990.0, "step": 438 }, { "entropy": 1.1109650805592537, "epoch": 0.8265474229230407, "grad_norm": 0.13361488282680511, "learning_rate": 0.0004176691729323308, "loss": 1.0797, "mean_token_accuracy": 0.7052409499883652, "num_tokens": 11635249.0, "step": 439 }, { "entropy": 1.128780521452427, "epoch": 0.8284302188750294, "grad_norm": 0.14179299771785736, "learning_rate": 0.0004174812030075188, "loss": 1.0756, "mean_token_accuracy": 0.6986876875162125, "num_tokens": 11661132.0, "step": 440 }, { "entropy": 1.1229918599128723, "epoch": 0.8303130148270181, "grad_norm": 0.13364551961421967, "learning_rate": 0.0004172932330827068, "loss": 1.1159, "mean_token_accuracy": 0.7024848908185959, "num_tokens": 11688969.0, "step": 441 }, { "entropy": 1.1451409384608269, "epoch": 0.8321958107790068, "grad_norm": 0.15363940596580505, "learning_rate": 0.00041710526315789475, "loss": 1.1742, "mean_token_accuracy": 0.6850685179233551, "num_tokens": 11714108.0, "step": 442 }, { "entropy": 1.1217172518372536, "epoch": 0.8340786067309955, "grad_norm": 0.1592985838651657, "learning_rate": 0.0004169172932330827, "loss": 1.1189, "mean_token_accuracy": 0.698178730905056, "num_tokens": 11737727.0, "step": 443 }, { "entropy": 1.1448046416044235, "epoch": 0.8359614026829842, "grad_norm": 0.15717987716197968, "learning_rate": 0.0004167293233082707, "loss": 1.1271, "mean_token_accuracy": 0.696114294230938, "num_tokens": 11763503.0, "step": 444 }, { "entropy": 1.1910344362258911, "epoch": 0.837844198634973, "grad_norm": 0.1563824862241745, "learning_rate": 0.00041654135338345865, "loss": 1.1685, "mean_token_accuracy": 0.6853935644030571, "num_tokens": 11788216.0, "step": 445 }, { "entropy": 1.1520782858133316, "epoch": 0.8397269945869617, "grad_norm": 0.15299555659294128, "learning_rate": 0.0004163533834586467, "loss": 1.1235, "mean_token_accuracy": 0.6957945972681046, "num_tokens": 11813250.0, "step": 446 }, { "entropy": 1.157516971230507, "epoch": 0.8416097905389504, "grad_norm": 0.15409286320209503, "learning_rate": 0.0004161654135338346, "loss": 1.1292, "mean_token_accuracy": 0.6986691579222679, "num_tokens": 11840547.0, "step": 447 }, { "entropy": 1.1751955449581146, "epoch": 0.8434925864909391, "grad_norm": 0.1436087191104889, "learning_rate": 0.00041597744360902255, "loss": 1.1498, "mean_token_accuracy": 0.692206360399723, "num_tokens": 11868040.0, "step": 448 }, { "entropy": 1.1962674707174301, "epoch": 0.8453753824429278, "grad_norm": 0.14213787019252777, "learning_rate": 0.0004157894736842106, "loss": 1.1349, "mean_token_accuracy": 0.6944708526134491, "num_tokens": 11894177.0, "step": 449 }, { "entropy": 1.201774999499321, "epoch": 0.8472581783949165, "grad_norm": 0.15118546783924103, "learning_rate": 0.0004156015037593985, "loss": 1.1868, "mean_token_accuracy": 0.6906943470239639, "num_tokens": 11920755.0, "step": 450 }, { "entropy": 1.1439872980117798, "epoch": 0.8491409743469052, "grad_norm": 0.1536472737789154, "learning_rate": 0.00041541353383458646, "loss": 1.1091, "mean_token_accuracy": 0.6987525522708893, "num_tokens": 11946199.0, "step": 451 }, { "entropy": 1.1865400224924088, "epoch": 0.8510237702988939, "grad_norm": 0.16255781054496765, "learning_rate": 0.0004152255639097745, "loss": 1.1606, "mean_token_accuracy": 0.6941612362861633, "num_tokens": 11970559.0, "step": 452 }, { "entropy": 1.1555950492620468, "epoch": 0.8529065662508826, "grad_norm": 0.15296806395053864, "learning_rate": 0.00041503759398496244, "loss": 1.1647, "mean_token_accuracy": 0.6893363445997238, "num_tokens": 11998113.0, "step": 453 }, { "entropy": 1.1035746112465858, "epoch": 0.8547893622028713, "grad_norm": 0.13151533901691437, "learning_rate": 0.00041484962406015036, "loss": 1.0917, "mean_token_accuracy": 0.7064924463629723, "num_tokens": 12025595.0, "step": 454 }, { "entropy": 1.148128904402256, "epoch": 0.85667215815486, "grad_norm": 0.15572930872440338, "learning_rate": 0.0004146616541353384, "loss": 1.1516, "mean_token_accuracy": 0.6970530971884727, "num_tokens": 12051025.0, "step": 455 }, { "entropy": 1.1640497595071793, "epoch": 0.8585549541068487, "grad_norm": 0.14575503766536713, "learning_rate": 0.00041447368421052634, "loss": 1.124, "mean_token_accuracy": 0.6972140222787857, "num_tokens": 12080372.0, "step": 456 }, { "entropy": 1.1797401309013367, "epoch": 0.8604377500588374, "grad_norm": 0.1724129319190979, "learning_rate": 0.0004142857142857143, "loss": 1.1266, "mean_token_accuracy": 0.6963677033782005, "num_tokens": 12107881.0, "step": 457 }, { "entropy": 1.1369287073612213, "epoch": 0.8623205460108261, "grad_norm": 0.1409987360239029, "learning_rate": 0.0004140977443609022, "loss": 1.1021, "mean_token_accuracy": 0.6983814239501953, "num_tokens": 12136975.0, "step": 458 }, { "entropy": 1.203329399228096, "epoch": 0.8642033419628148, "grad_norm": 0.171426460146904, "learning_rate": 0.00041390977443609025, "loss": 1.1796, "mean_token_accuracy": 0.6895611882209778, "num_tokens": 12164452.0, "step": 459 }, { "entropy": 1.1388862580060959, "epoch": 0.8660861379148035, "grad_norm": 0.1465880423784256, "learning_rate": 0.0004137218045112782, "loss": 1.1449, "mean_token_accuracy": 0.6952017247676849, "num_tokens": 12190700.0, "step": 460 }, { "entropy": 1.165066435933113, "epoch": 0.8679689338667922, "grad_norm": 0.1510019600391388, "learning_rate": 0.0004135338345864661, "loss": 1.1519, "mean_token_accuracy": 0.6902508214116096, "num_tokens": 12216248.0, "step": 461 }, { "entropy": 1.1071253940463066, "epoch": 0.8698517298187809, "grad_norm": 0.1569354087114334, "learning_rate": 0.00041334586466165415, "loss": 1.1008, "mean_token_accuracy": 0.7029130309820175, "num_tokens": 12242702.0, "step": 462 }, { "entropy": 1.162157580256462, "epoch": 0.8717345257707696, "grad_norm": 0.15269963443279266, "learning_rate": 0.0004131578947368421, "loss": 1.1408, "mean_token_accuracy": 0.6991895586252213, "num_tokens": 12267065.0, "step": 463 }, { "entropy": 1.164448007941246, "epoch": 0.8736173217227583, "grad_norm": 0.15020480751991272, "learning_rate": 0.0004129699248120301, "loss": 1.1331, "mean_token_accuracy": 0.6945090070366859, "num_tokens": 12294273.0, "step": 464 }, { "entropy": 1.194659799337387, "epoch": 0.875500117674747, "grad_norm": 0.16067473590373993, "learning_rate": 0.00041278195488721805, "loss": 1.1384, "mean_token_accuracy": 0.692974790930748, "num_tokens": 12319075.0, "step": 465 }, { "entropy": 1.1572427451610565, "epoch": 0.8773829136267357, "grad_norm": 0.14344556629657745, "learning_rate": 0.000412593984962406, "loss": 1.1239, "mean_token_accuracy": 0.6996137872338295, "num_tokens": 12345047.0, "step": 466 }, { "entropy": 1.1310506239533424, "epoch": 0.8792657095787244, "grad_norm": 0.1469915211200714, "learning_rate": 0.000412406015037594, "loss": 1.1117, "mean_token_accuracy": 0.6948174610733986, "num_tokens": 12371084.0, "step": 467 }, { "entropy": 1.1873999759554863, "epoch": 0.8811485055307131, "grad_norm": 0.14283262193202972, "learning_rate": 0.00041221804511278195, "loss": 1.1725, "mean_token_accuracy": 0.6882406696677208, "num_tokens": 12397086.0, "step": 468 }, { "entropy": 1.1660784780979156, "epoch": 0.8830313014827018, "grad_norm": 0.1400137096643448, "learning_rate": 0.0004120300751879699, "loss": 1.1305, "mean_token_accuracy": 0.6928488984704018, "num_tokens": 12424840.0, "step": 469 }, { "entropy": 1.1689551174640656, "epoch": 0.8849140974346905, "grad_norm": 0.17401744425296783, "learning_rate": 0.0004118421052631579, "loss": 1.1356, "mean_token_accuracy": 0.6973849907517433, "num_tokens": 12453038.0, "step": 470 }, { "entropy": 1.16590516269207, "epoch": 0.8867968933866792, "grad_norm": 0.15749803185462952, "learning_rate": 0.0004116541353383459, "loss": 1.1388, "mean_token_accuracy": 0.690193310379982, "num_tokens": 12479755.0, "step": 471 }, { "entropy": 1.1534086763858795, "epoch": 0.8886796893386679, "grad_norm": 0.13575902581214905, "learning_rate": 0.0004114661654135338, "loss": 1.1333, "mean_token_accuracy": 0.6930194050073624, "num_tokens": 12507911.0, "step": 472 }, { "entropy": 1.166767194867134, "epoch": 0.8905624852906566, "grad_norm": 0.14083941280841827, "learning_rate": 0.0004112781954887218, "loss": 1.1433, "mean_token_accuracy": 0.6883162334561348, "num_tokens": 12534740.0, "step": 473 }, { "entropy": 1.116583712399006, "epoch": 0.8924452812426453, "grad_norm": 0.18177185952663422, "learning_rate": 0.0004110902255639098, "loss": 1.1013, "mean_token_accuracy": 0.6984972059726715, "num_tokens": 12560495.0, "step": 474 }, { "entropy": 1.0962852016091347, "epoch": 0.894328077194634, "grad_norm": 0.15513888001441956, "learning_rate": 0.00041090225563909777, "loss": 1.0659, "mean_token_accuracy": 0.7114295363426208, "num_tokens": 12586806.0, "step": 475 }, { "entropy": 1.1862118691205978, "epoch": 0.8962108731466227, "grad_norm": 0.1506270319223404, "learning_rate": 0.0004107142857142857, "loss": 1.1887, "mean_token_accuracy": 0.6871896237134933, "num_tokens": 12612493.0, "step": 476 }, { "entropy": 1.1081865057349205, "epoch": 0.8980936690986114, "grad_norm": 0.14710566401481628, "learning_rate": 0.0004105263157894737, "loss": 1.1012, "mean_token_accuracy": 0.6983359083533287, "num_tokens": 12639626.0, "step": 477 }, { "entropy": 1.128834992647171, "epoch": 0.8999764650506001, "grad_norm": 0.14161938428878784, "learning_rate": 0.00041033834586466167, "loss": 1.0982, "mean_token_accuracy": 0.7014680877327919, "num_tokens": 12664733.0, "step": 478 }, { "entropy": 1.1446367651224136, "epoch": 0.9018592610025888, "grad_norm": 0.14254848659038544, "learning_rate": 0.0004101503759398496, "loss": 1.082, "mean_token_accuracy": 0.7081187888979912, "num_tokens": 12690384.0, "step": 479 }, { "entropy": 1.2071665897965431, "epoch": 0.9037420569545775, "grad_norm": 0.1451028734445572, "learning_rate": 0.0004099624060150376, "loss": 1.1573, "mean_token_accuracy": 0.6878824383020401, "num_tokens": 12717190.0, "step": 480 }, { "entropy": 1.1643542423844337, "epoch": 0.9056248529065662, "grad_norm": 0.16808035969734192, "learning_rate": 0.0004097744360902256, "loss": 1.1289, "mean_token_accuracy": 0.6955900862812996, "num_tokens": 12744287.0, "step": 481 }, { "entropy": 1.1430502980947495, "epoch": 0.907507648858555, "grad_norm": 0.14388366043567657, "learning_rate": 0.00040958646616541354, "loss": 1.1377, "mean_token_accuracy": 0.6985258162021637, "num_tokens": 12769478.0, "step": 482 }, { "entropy": 1.171137735247612, "epoch": 0.9093904448105437, "grad_norm": 0.14661596715450287, "learning_rate": 0.0004093984962406015, "loss": 1.1764, "mean_token_accuracy": 0.6929311379790306, "num_tokens": 12795715.0, "step": 483 }, { "entropy": 1.159026637673378, "epoch": 0.9112732407625324, "grad_norm": 0.14750456809997559, "learning_rate": 0.0004092105263157895, "loss": 1.1578, "mean_token_accuracy": 0.6937888264656067, "num_tokens": 12821869.0, "step": 484 }, { "entropy": 1.1478636413812637, "epoch": 0.9131560367145211, "grad_norm": 0.14371232688426971, "learning_rate": 0.00040902255639097744, "loss": 1.1218, "mean_token_accuracy": 0.7008863463997841, "num_tokens": 12848215.0, "step": 485 }, { "entropy": 1.120044082403183, "epoch": 0.9150388326665098, "grad_norm": 0.1404104232788086, "learning_rate": 0.00040883458646616546, "loss": 1.0728, "mean_token_accuracy": 0.7091679647564888, "num_tokens": 12876182.0, "step": 486 }, { "entropy": 1.1101247519254684, "epoch": 0.9169216286184985, "grad_norm": 0.1421038955450058, "learning_rate": 0.0004086466165413534, "loss": 1.0967, "mean_token_accuracy": 0.7037186399102211, "num_tokens": 12902501.0, "step": 487 }, { "entropy": 1.1512123197317123, "epoch": 0.9188044245704872, "grad_norm": 0.14930035173892975, "learning_rate": 0.00040845864661654134, "loss": 1.1259, "mean_token_accuracy": 0.6954185292124748, "num_tokens": 12928275.0, "step": 488 }, { "entropy": 1.136143758893013, "epoch": 0.9206872205224759, "grad_norm": 0.1431557983160019, "learning_rate": 0.00040827067669172936, "loss": 1.1053, "mean_token_accuracy": 0.7004474848508835, "num_tokens": 12954596.0, "step": 489 }, { "entropy": 1.1639841794967651, "epoch": 0.9225700164744646, "grad_norm": 0.1477883905172348, "learning_rate": 0.0004080827067669173, "loss": 1.129, "mean_token_accuracy": 0.6972065195441246, "num_tokens": 12980318.0, "step": 490 }, { "entropy": 1.162917599081993, "epoch": 0.9244528124264533, "grad_norm": 0.14567728340625763, "learning_rate": 0.00040789473684210524, "loss": 1.1503, "mean_token_accuracy": 0.6907480135560036, "num_tokens": 13006238.0, "step": 491 }, { "entropy": 1.1558719277381897, "epoch": 0.926335608378442, "grad_norm": 0.1421021670103073, "learning_rate": 0.00040770676691729326, "loss": 1.1429, "mean_token_accuracy": 0.6948621720075607, "num_tokens": 13034071.0, "step": 492 }, { "entropy": 1.175887256860733, "epoch": 0.9282184043304307, "grad_norm": 0.14368657767772675, "learning_rate": 0.00040751879699248123, "loss": 1.1752, "mean_token_accuracy": 0.6898396164178848, "num_tokens": 13059425.0, "step": 493 }, { "entropy": 1.1281049996614456, "epoch": 0.9301012002824194, "grad_norm": 0.13681703805923462, "learning_rate": 0.00040733082706766914, "loss": 1.1437, "mean_token_accuracy": 0.6920712366700172, "num_tokens": 13087803.0, "step": 494 }, { "entropy": 1.1919779032468796, "epoch": 0.9319839962344081, "grad_norm": 0.14613422751426697, "learning_rate": 0.00040714285714285717, "loss": 1.1647, "mean_token_accuracy": 0.6862485483288765, "num_tokens": 13114083.0, "step": 495 }, { "entropy": 1.1703974455595016, "epoch": 0.9338667921863968, "grad_norm": 0.13816098868846893, "learning_rate": 0.00040695488721804513, "loss": 1.1191, "mean_token_accuracy": 0.6944621205329895, "num_tokens": 13140806.0, "step": 496 }, { "entropy": 1.1625728458166122, "epoch": 0.9357495881383855, "grad_norm": 0.1374853253364563, "learning_rate": 0.0004067669172932331, "loss": 1.1311, "mean_token_accuracy": 0.693043515086174, "num_tokens": 13167072.0, "step": 497 }, { "entropy": 1.1611916273832321, "epoch": 0.9376323840903742, "grad_norm": 0.14068859815597534, "learning_rate": 0.00040657894736842107, "loss": 1.0958, "mean_token_accuracy": 0.7017333880066872, "num_tokens": 13193952.0, "step": 498 }, { "entropy": 1.2519186586141586, "epoch": 0.9395151800423629, "grad_norm": 0.14739161729812622, "learning_rate": 0.00040639097744360903, "loss": 1.2033, "mean_token_accuracy": 0.6803731620311737, "num_tokens": 13219334.0, "step": 499 }, { "entropy": 1.060287207365036, "epoch": 0.9413979759943516, "grad_norm": 0.13330809772014618, "learning_rate": 0.000406203007518797, "loss": 1.0607, "mean_token_accuracy": 0.7074964344501495, "num_tokens": 13247762.0, "step": 500 }, { "entropy": 1.1315688639879227, "epoch": 0.9432807719463403, "grad_norm": 0.14858287572860718, "learning_rate": 0.00040601503759398497, "loss": 1.1534, "mean_token_accuracy": 0.6925570517778397, "num_tokens": 13274542.0, "step": 501 }, { "entropy": 1.1256567761301994, "epoch": 0.945163567898329, "grad_norm": 0.13854491710662842, "learning_rate": 0.00040582706766917293, "loss": 1.1164, "mean_token_accuracy": 0.697671189904213, "num_tokens": 13301954.0, "step": 502 }, { "entropy": 1.1095138639211655, "epoch": 0.9470463638503177, "grad_norm": 0.14951969683170319, "learning_rate": 0.0004056390977443609, "loss": 1.0913, "mean_token_accuracy": 0.7060349136590958, "num_tokens": 13325368.0, "step": 503 }, { "entropy": 1.2117299437522888, "epoch": 0.9489291598023064, "grad_norm": 0.14555485546588898, "learning_rate": 0.0004054511278195489, "loss": 1.1771, "mean_token_accuracy": 0.6878413483500481, "num_tokens": 13350621.0, "step": 504 }, { "entropy": 1.1814142614603043, "epoch": 0.9508119557542951, "grad_norm": 0.13946305215358734, "learning_rate": 0.00040526315789473684, "loss": 1.1187, "mean_token_accuracy": 0.6975477784872055, "num_tokens": 13378436.0, "step": 505 }, { "entropy": 1.1588895320892334, "epoch": 0.9526947517062838, "grad_norm": 0.14052411913871765, "learning_rate": 0.0004050751879699248, "loss": 1.1139, "mean_token_accuracy": 0.6970377415418625, "num_tokens": 13405779.0, "step": 506 }, { "entropy": 1.1744963377714157, "epoch": 0.9545775476582725, "grad_norm": 0.14011354744434357, "learning_rate": 0.0004048872180451128, "loss": 1.1443, "mean_token_accuracy": 0.6915831044316292, "num_tokens": 13431768.0, "step": 507 }, { "entropy": 1.1022943705320358, "epoch": 0.9564603436102612, "grad_norm": 0.16085639595985413, "learning_rate": 0.00040469924812030074, "loss": 1.0872, "mean_token_accuracy": 0.7034497335553169, "num_tokens": 13458430.0, "step": 508 }, { "entropy": 1.1168298870325089, "epoch": 0.9583431395622499, "grad_norm": 0.14645646512508392, "learning_rate": 0.0004045112781954887, "loss": 1.1366, "mean_token_accuracy": 0.6974723115563393, "num_tokens": 13483989.0, "step": 509 }, { "entropy": 1.1111514419317245, "epoch": 0.9602259355142386, "grad_norm": 0.15530261397361755, "learning_rate": 0.0004043233082706767, "loss": 1.1068, "mean_token_accuracy": 0.7063265517354012, "num_tokens": 13510734.0, "step": 510 }, { "entropy": 1.1187052130699158, "epoch": 0.9621087314662273, "grad_norm": 0.1410273313522339, "learning_rate": 0.0004041353383458647, "loss": 1.1007, "mean_token_accuracy": 0.6978159174323082, "num_tokens": 13536200.0, "step": 511 }, { "entropy": 1.2634307444095612, "epoch": 0.963991527418216, "grad_norm": 0.14832766354084015, "learning_rate": 0.0004039473684210526, "loss": 1.2454, "mean_token_accuracy": 0.674240916967392, "num_tokens": 13562180.0, "step": 512 }, { "entropy": 1.209633857011795, "epoch": 0.9658743233702047, "grad_norm": 0.14852747321128845, "learning_rate": 0.0004037593984962406, "loss": 1.151, "mean_token_accuracy": 0.6942615807056427, "num_tokens": 13587252.0, "step": 513 }, { "entropy": 1.1802778542041779, "epoch": 0.9677571193221934, "grad_norm": 0.14167462289333344, "learning_rate": 0.0004035714285714286, "loss": 1.1268, "mean_token_accuracy": 0.6984767615795135, "num_tokens": 13614161.0, "step": 514 }, { "entropy": 1.1260388046503067, "epoch": 0.9696399152741821, "grad_norm": 0.1389787793159485, "learning_rate": 0.00040338345864661656, "loss": 1.1044, "mean_token_accuracy": 0.698441170156002, "num_tokens": 13640906.0, "step": 515 }, { "entropy": 1.1336752623319626, "epoch": 0.9715227112261708, "grad_norm": 0.13808688521385193, "learning_rate": 0.0004031954887218045, "loss": 1.1185, "mean_token_accuracy": 0.7005246728658676, "num_tokens": 13666938.0, "step": 516 }, { "entropy": 1.1089581847190857, "epoch": 0.9734055071781595, "grad_norm": 0.1490076631307602, "learning_rate": 0.0004030075187969925, "loss": 1.1037, "mean_token_accuracy": 0.699261337518692, "num_tokens": 13692343.0, "step": 517 }, { "entropy": 1.1778569370508194, "epoch": 0.9752883031301482, "grad_norm": 0.1503973752260208, "learning_rate": 0.00040281954887218046, "loss": 1.1704, "mean_token_accuracy": 0.6850240305066109, "num_tokens": 13717884.0, "step": 518 }, { "entropy": 1.1599782705307007, "epoch": 0.977171099082137, "grad_norm": 0.14560772478580475, "learning_rate": 0.00040263157894736843, "loss": 1.1481, "mean_token_accuracy": 0.6967450231313705, "num_tokens": 13744454.0, "step": 519 }, { "entropy": 1.2482303828001022, "epoch": 0.9790538950341257, "grad_norm": 0.1557229459285736, "learning_rate": 0.0004024436090225564, "loss": 1.2016, "mean_token_accuracy": 0.679645448923111, "num_tokens": 13771382.0, "step": 520 }, { "entropy": 1.154101237654686, "epoch": 0.9809366909861144, "grad_norm": 0.1511804610490799, "learning_rate": 0.00040225563909774436, "loss": 1.1211, "mean_token_accuracy": 0.692274309694767, "num_tokens": 13797315.0, "step": 521 }, { "entropy": 1.1659268885850906, "epoch": 0.9828194869381031, "grad_norm": 0.14492999017238617, "learning_rate": 0.0004020676691729324, "loss": 1.1276, "mean_token_accuracy": 0.6957960724830627, "num_tokens": 13823504.0, "step": 522 }, { "entropy": 1.2255947291851044, "epoch": 0.9847022828900918, "grad_norm": 0.16592226922512054, "learning_rate": 0.0004018796992481203, "loss": 1.2034, "mean_token_accuracy": 0.6800813153386116, "num_tokens": 13849682.0, "step": 523 }, { "entropy": 1.181060180068016, "epoch": 0.9865850788420805, "grad_norm": 0.14438042044639587, "learning_rate": 0.00040169172932330826, "loss": 1.1422, "mean_token_accuracy": 0.6908884271979332, "num_tokens": 13877151.0, "step": 524 }, { "entropy": 1.195601612329483, "epoch": 0.9884678747940692, "grad_norm": 0.1490834802389145, "learning_rate": 0.0004015037593984963, "loss": 1.1609, "mean_token_accuracy": 0.687875397503376, "num_tokens": 13902812.0, "step": 525 }, { "entropy": 1.1874232441186905, "epoch": 0.9903506707460579, "grad_norm": 0.15240395069122314, "learning_rate": 0.00040131578947368425, "loss": 1.171, "mean_token_accuracy": 0.6891705989837646, "num_tokens": 13926800.0, "step": 526 }, { "entropy": 1.0726541802287102, "epoch": 0.9922334666980466, "grad_norm": 0.1472628116607666, "learning_rate": 0.00040112781954887216, "loss": 1.0744, "mean_token_accuracy": 0.7090674415230751, "num_tokens": 13952161.0, "step": 527 }, { "entropy": 1.1295729503035545, "epoch": 0.9941162626500353, "grad_norm": 0.1415957808494568, "learning_rate": 0.0004009398496240602, "loss": 1.1086, "mean_token_accuracy": 0.7027467861771584, "num_tokens": 13978937.0, "step": 528 }, { "entropy": 1.1615847125649452, "epoch": 0.995999058602024, "grad_norm": 0.14748550951480865, "learning_rate": 0.00040075187969924815, "loss": 1.1474, "mean_token_accuracy": 0.6950105875730515, "num_tokens": 14005138.0, "step": 529 }, { "entropy": 1.176683247089386, "epoch": 0.9978818545540127, "grad_norm": 0.1543041467666626, "learning_rate": 0.00040056390977443606, "loss": 1.1807, "mean_token_accuracy": 0.684785395860672, "num_tokens": 14028706.0, "step": 530 }, { "entropy": 1.1374549865722656, "epoch": 0.9997646505060014, "grad_norm": 0.13411332666873932, "learning_rate": 0.0004003759398496241, "loss": 1.0976, "mean_token_accuracy": 0.7099665105342865, "num_tokens": 14056095.0, "step": 531 }, { "entropy": 1.4449238777160645, "epoch": 1.0, "grad_norm": 0.5150332450866699, "learning_rate": 0.00040018796992481205, "loss": 1.4328, "mean_token_accuracy": 0.6301905512809753, "num_tokens": 14058143.0, "step": 532 }, { "epoch": 1.0, "eval_entropy": 1.273110066141401, "eval_loss": 1.215613603591919, "eval_mean_token_accuracy": 0.6747710279056004, "eval_num_tokens": 14058143.0, "eval_runtime": 8.5294, "eval_samples_per_second": 5.745, "eval_steps_per_second": 0.821, "step": 532 } ], "logging_steps": 1.0, "max_steps": 2660, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 9.860994210304512e+17, "train_batch_size": 2, "trial_name": null, "trial_params": null }