roleplay-no-actions / trainer_state.json
thanhhau097's picture
Upload folder using huggingface_hub
09f20ff verified
raw
history blame
156 kB
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 500,
"global_step": 532,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"entropy": 1.174624726176262,
"epoch": 0.0018827959519887032,
"grad_norm": 0.3589564859867096,
"learning_rate": 0.0005,
"loss": 1.7667,
"mean_token_accuracy": 0.6097231954336166,
"num_tokens": 26212.0,
"step": 1
},
{
"entropy": 1.3834485709667206,
"epoch": 0.0037655919039774064,
"grad_norm": 0.273681104183197,
"learning_rate": 0.000499812030075188,
"loss": 1.6137,
"mean_token_accuracy": 0.6240904033184052,
"num_tokens": 53331.0,
"step": 2
},
{
"entropy": 2.3064600229263306,
"epoch": 0.00564838785596611,
"grad_norm": 0.8047769665718079,
"learning_rate": 0.0004996240601503759,
"loss": 1.6899,
"mean_token_accuracy": 0.6088793724775314,
"num_tokens": 80291.0,
"step": 3
},
{
"entropy": 1.630955085158348,
"epoch": 0.007531183807954813,
"grad_norm": 0.30714720487594604,
"learning_rate": 0.0004994360902255639,
"loss": 1.5608,
"mean_token_accuracy": 0.6291212365031242,
"num_tokens": 106966.0,
"step": 4
},
{
"entropy": 1.3567735850811005,
"epoch": 0.009413979759943516,
"grad_norm": 0.2066618800163269,
"learning_rate": 0.0004992481203007519,
"loss": 1.4887,
"mean_token_accuracy": 0.6415289863944054,
"num_tokens": 132786.0,
"step": 5
},
{
"entropy": 1.361013576388359,
"epoch": 0.01129677571193222,
"grad_norm": 0.24627672135829926,
"learning_rate": 0.0004990601503759398,
"loss": 1.4956,
"mean_token_accuracy": 0.6329040080308914,
"num_tokens": 157854.0,
"step": 6
},
{
"entropy": 1.4551365226507187,
"epoch": 0.013179571663920923,
"grad_norm": 0.24504677951335907,
"learning_rate": 0.0004988721804511278,
"loss": 1.4555,
"mean_token_accuracy": 0.6410629153251648,
"num_tokens": 183628.0,
"step": 7
},
{
"entropy": 1.558847650885582,
"epoch": 0.015062367615909626,
"grad_norm": 0.24714401364326477,
"learning_rate": 0.0004986842105263158,
"loss": 1.4574,
"mean_token_accuracy": 0.6385244429111481,
"num_tokens": 212024.0,
"step": 8
},
{
"entropy": 1.4725914895534515,
"epoch": 0.016945163567898328,
"grad_norm": 0.14686766266822815,
"learning_rate": 0.0004984962406015037,
"loss": 1.4077,
"mean_token_accuracy": 0.6496255323290825,
"num_tokens": 239247.0,
"step": 9
},
{
"entropy": 1.399958148598671,
"epoch": 0.01882795951988703,
"grad_norm": 0.2573543190956116,
"learning_rate": 0.0004983082706766917,
"loss": 1.4648,
"mean_token_accuracy": 0.6321976333856583,
"num_tokens": 265365.0,
"step": 10
},
{
"entropy": 1.3477602005004883,
"epoch": 0.020710755471875734,
"grad_norm": 0.19095759093761444,
"learning_rate": 0.0004981203007518797,
"loss": 1.3914,
"mean_token_accuracy": 0.6472064480185509,
"num_tokens": 292664.0,
"step": 11
},
{
"entropy": 1.3985529839992523,
"epoch": 0.02259355142386444,
"grad_norm": 0.12443722784519196,
"learning_rate": 0.0004979323308270676,
"loss": 1.3841,
"mean_token_accuracy": 0.6470160931348801,
"num_tokens": 318823.0,
"step": 12
},
{
"entropy": 1.4100047498941422,
"epoch": 0.024476347375853143,
"grad_norm": 0.18163365125656128,
"learning_rate": 0.0004977443609022556,
"loss": 1.3475,
"mean_token_accuracy": 0.6554316207766533,
"num_tokens": 345276.0,
"step": 13
},
{
"entropy": 1.3673983961343765,
"epoch": 0.026359143327841845,
"grad_norm": 0.21292470395565033,
"learning_rate": 0.0004975563909774436,
"loss": 1.3423,
"mean_token_accuracy": 0.6571086272597313,
"num_tokens": 372290.0,
"step": 14
},
{
"entropy": 1.3170630186796188,
"epoch": 0.028241939279830548,
"grad_norm": 0.14680063724517822,
"learning_rate": 0.0004973684210526315,
"loss": 1.3433,
"mean_token_accuracy": 0.6587843522429466,
"num_tokens": 398806.0,
"step": 15
},
{
"entropy": 1.4363876283168793,
"epoch": 0.03012473523181925,
"grad_norm": 0.1492491513490677,
"learning_rate": 0.0004971804511278195,
"loss": 1.3881,
"mean_token_accuracy": 0.6493127718567848,
"num_tokens": 427973.0,
"step": 16
},
{
"entropy": 1.3928384333848953,
"epoch": 0.032007531183807954,
"grad_norm": 0.21353831887245178,
"learning_rate": 0.0004969924812030076,
"loss": 1.3303,
"mean_token_accuracy": 0.6532666012644768,
"num_tokens": 455705.0,
"step": 17
},
{
"entropy": 1.3039959222078323,
"epoch": 0.033890327135796657,
"grad_norm": 0.12421785295009613,
"learning_rate": 0.0004968045112781954,
"loss": 1.3078,
"mean_token_accuracy": 0.6589679047465324,
"num_tokens": 481697.0,
"step": 18
},
{
"entropy": 1.323414146900177,
"epoch": 0.03577312308778536,
"grad_norm": 0.13252823054790497,
"learning_rate": 0.0004966165413533834,
"loss": 1.3682,
"mean_token_accuracy": 0.6478805840015411,
"num_tokens": 508637.0,
"step": 19
},
{
"entropy": 1.320784792304039,
"epoch": 0.03765591903977406,
"grad_norm": 0.13821907341480255,
"learning_rate": 0.0004964285714285715,
"loss": 1.3087,
"mean_token_accuracy": 0.6556096524000168,
"num_tokens": 533762.0,
"step": 20
},
{
"entropy": 1.435991793870926,
"epoch": 0.039538714991762765,
"grad_norm": 0.13946449756622314,
"learning_rate": 0.0004962406015037594,
"loss": 1.4031,
"mean_token_accuracy": 0.6474809646606445,
"num_tokens": 558068.0,
"step": 21
},
{
"entropy": 1.3843661397695541,
"epoch": 0.04142151094375147,
"grad_norm": 0.14075031876564026,
"learning_rate": 0.0004960526315789473,
"loss": 1.3313,
"mean_token_accuracy": 0.6577248424291611,
"num_tokens": 585582.0,
"step": 22
},
{
"entropy": 1.3438803404569626,
"epoch": 0.04330430689574018,
"grad_norm": 0.12071845680475235,
"learning_rate": 0.0004958646616541354,
"loss": 1.3205,
"mean_token_accuracy": 0.6598646715283394,
"num_tokens": 614078.0,
"step": 23
},
{
"entropy": 1.2872049808502197,
"epoch": 0.04518710284772888,
"grad_norm": 0.13585081696510315,
"learning_rate": 0.0004956766917293234,
"loss": 1.2847,
"mean_token_accuracy": 0.6646199747920036,
"num_tokens": 641604.0,
"step": 24
},
{
"entropy": 1.4031487703323364,
"epoch": 0.04706989879971758,
"grad_norm": 0.16168682277202606,
"learning_rate": 0.0004954887218045112,
"loss": 1.3906,
"mean_token_accuracy": 0.6470670253038406,
"num_tokens": 668099.0,
"step": 25
},
{
"entropy": 1.3954781144857407,
"epoch": 0.048952694751706285,
"grad_norm": 0.1519748568534851,
"learning_rate": 0.0004953007518796993,
"loss": 1.3143,
"mean_token_accuracy": 0.6569681242108345,
"num_tokens": 693467.0,
"step": 26
},
{
"entropy": 1.4201241582632065,
"epoch": 0.05083549070369499,
"grad_norm": 0.12228523939847946,
"learning_rate": 0.0004951127819548873,
"loss": 1.3585,
"mean_token_accuracy": 0.6522250324487686,
"num_tokens": 719428.0,
"step": 27
},
{
"entropy": 1.3096809834241867,
"epoch": 0.05271828665568369,
"grad_norm": 0.12990325689315796,
"learning_rate": 0.0004949248120300752,
"loss": 1.3363,
"mean_token_accuracy": 0.6576437503099442,
"num_tokens": 743498.0,
"step": 28
},
{
"entropy": 1.2695416510105133,
"epoch": 0.054601082607672394,
"grad_norm": 0.12629908323287964,
"learning_rate": 0.0004947368421052632,
"loss": 1.256,
"mean_token_accuracy": 0.6671914085745811,
"num_tokens": 771083.0,
"step": 29
},
{
"entropy": 1.3144675344228745,
"epoch": 0.056483878559661096,
"grad_norm": 0.13920928537845612,
"learning_rate": 0.0004945488721804512,
"loss": 1.2797,
"mean_token_accuracy": 0.6726761981844902,
"num_tokens": 798194.0,
"step": 30
},
{
"entropy": 1.3235575556755066,
"epoch": 0.0583666745116498,
"grad_norm": 0.1421487033367157,
"learning_rate": 0.0004943609022556391,
"loss": 1.3095,
"mean_token_accuracy": 0.6596867814660072,
"num_tokens": 823348.0,
"step": 31
},
{
"entropy": 1.2517389357089996,
"epoch": 0.0602494704636385,
"grad_norm": 0.11075025051832199,
"learning_rate": 0.0004941729323308271,
"loss": 1.2458,
"mean_token_accuracy": 0.6723818778991699,
"num_tokens": 849713.0,
"step": 32
},
{
"entropy": 1.2159670144319534,
"epoch": 0.062132266415627205,
"grad_norm": 0.11285679787397385,
"learning_rate": 0.0004939849624060151,
"loss": 1.2158,
"mean_token_accuracy": 0.6808358430862427,
"num_tokens": 876659.0,
"step": 33
},
{
"entropy": 1.2742353826761246,
"epoch": 0.06401506236761591,
"grad_norm": 0.1200110912322998,
"learning_rate": 0.000493796992481203,
"loss": 1.2414,
"mean_token_accuracy": 0.6697632297873497,
"num_tokens": 904196.0,
"step": 34
},
{
"entropy": 1.3724654912948608,
"epoch": 0.06589785831960461,
"grad_norm": 0.11141709238290787,
"learning_rate": 0.000493609022556391,
"loss": 1.3037,
"mean_token_accuracy": 0.6641954258084297,
"num_tokens": 930650.0,
"step": 35
},
{
"entropy": 1.332644298672676,
"epoch": 0.06778065427159331,
"grad_norm": 0.11270242929458618,
"learning_rate": 0.000493421052631579,
"loss": 1.2723,
"mean_token_accuracy": 0.6652832478284836,
"num_tokens": 958361.0,
"step": 36
},
{
"entropy": 1.2781042605638504,
"epoch": 0.06966345022358202,
"grad_norm": 0.12608197331428528,
"learning_rate": 0.0004932330827067669,
"loss": 1.2664,
"mean_token_accuracy": 0.6701500117778778,
"num_tokens": 982981.0,
"step": 37
},
{
"entropy": 1.2652703523635864,
"epoch": 0.07154624617557072,
"grad_norm": 0.11680380254983902,
"learning_rate": 0.0004930451127819549,
"loss": 1.2363,
"mean_token_accuracy": 0.6758281961083412,
"num_tokens": 1010214.0,
"step": 38
},
{
"entropy": 1.2895056456327438,
"epoch": 0.07342904212755942,
"grad_norm": 0.13060909509658813,
"learning_rate": 0.0004928571428571429,
"loss": 1.2921,
"mean_token_accuracy": 0.6617036908864975,
"num_tokens": 1036007.0,
"step": 39
},
{
"entropy": 1.2508063912391663,
"epoch": 0.07531183807954812,
"grad_norm": 0.11048955470323563,
"learning_rate": 0.0004926691729323308,
"loss": 1.2388,
"mean_token_accuracy": 0.6743078008294106,
"num_tokens": 1064839.0,
"step": 40
},
{
"entropy": 1.2910813689231873,
"epoch": 0.07719463403153683,
"grad_norm": 0.12634366750717163,
"learning_rate": 0.0004924812030075188,
"loss": 1.2923,
"mean_token_accuracy": 0.6658936813473701,
"num_tokens": 1089267.0,
"step": 41
},
{
"entropy": 1.314329817891121,
"epoch": 0.07907742998352553,
"grad_norm": 0.11990135908126831,
"learning_rate": 0.0004922932330827068,
"loss": 1.2823,
"mean_token_accuracy": 0.6621334031224251,
"num_tokens": 1114747.0,
"step": 42
},
{
"entropy": 1.372491493821144,
"epoch": 0.08096022593551423,
"grad_norm": 0.14962127804756165,
"learning_rate": 0.0004921052631578947,
"loss": 1.3012,
"mean_token_accuracy": 0.6624018624424934,
"num_tokens": 1140568.0,
"step": 43
},
{
"entropy": 1.3109306246042252,
"epoch": 0.08284302188750294,
"grad_norm": 0.1251574158668518,
"learning_rate": 0.0004919172932330827,
"loss": 1.2753,
"mean_token_accuracy": 0.6643748208880424,
"num_tokens": 1166132.0,
"step": 44
},
{
"entropy": 1.2547127306461334,
"epoch": 0.08472581783949165,
"grad_norm": 0.14988984167575836,
"learning_rate": 0.0004917293233082707,
"loss": 1.2591,
"mean_token_accuracy": 0.6667659133672714,
"num_tokens": 1191773.0,
"step": 45
},
{
"entropy": 1.2385195791721344,
"epoch": 0.08660861379148035,
"grad_norm": 0.14218594133853912,
"learning_rate": 0.0004915413533834586,
"loss": 1.2551,
"mean_token_accuracy": 0.67237289249897,
"num_tokens": 1217928.0,
"step": 46
},
{
"entropy": 1.286237582564354,
"epoch": 0.08849140974346906,
"grad_norm": 0.1285715401172638,
"learning_rate": 0.0004913533834586466,
"loss": 1.228,
"mean_token_accuracy": 0.6695188358426094,
"num_tokens": 1243853.0,
"step": 47
},
{
"entropy": 1.2577073574066162,
"epoch": 0.09037420569545776,
"grad_norm": 0.1297583132982254,
"learning_rate": 0.0004911654135338346,
"loss": 1.1889,
"mean_token_accuracy": 0.6802271753549576,
"num_tokens": 1270883.0,
"step": 48
},
{
"entropy": 1.2520407736301422,
"epoch": 0.09225700164744646,
"grad_norm": 0.10652397572994232,
"learning_rate": 0.0004909774436090225,
"loss": 1.2295,
"mean_token_accuracy": 0.675907552242279,
"num_tokens": 1296937.0,
"step": 49
},
{
"entropy": 1.2889134734869003,
"epoch": 0.09413979759943517,
"grad_norm": 0.15478400886058807,
"learning_rate": 0.0004907894736842106,
"loss": 1.325,
"mean_token_accuracy": 0.656628705561161,
"num_tokens": 1323691.0,
"step": 50
},
{
"entropy": 1.319000005722046,
"epoch": 0.09602259355142387,
"grad_norm": 0.14395709335803986,
"learning_rate": 0.0004906015037593985,
"loss": 1.2879,
"mean_token_accuracy": 0.6644657775759697,
"num_tokens": 1347574.0,
"step": 51
},
{
"entropy": 1.265960842370987,
"epoch": 0.09790538950341257,
"grad_norm": 0.1301705241203308,
"learning_rate": 0.0004904135338345864,
"loss": 1.1913,
"mean_token_accuracy": 0.6857202649116516,
"num_tokens": 1376965.0,
"step": 52
},
{
"entropy": 1.2671979069709778,
"epoch": 0.09978818545540127,
"grad_norm": 0.12502525746822357,
"learning_rate": 0.0004902255639097745,
"loss": 1.2473,
"mean_token_accuracy": 0.666202001273632,
"num_tokens": 1402456.0,
"step": 53
},
{
"entropy": 1.2768708020448685,
"epoch": 0.10167098140738998,
"grad_norm": 0.1106332466006279,
"learning_rate": 0.0004900375939849624,
"loss": 1.2406,
"mean_token_accuracy": 0.6731417253613472,
"num_tokens": 1430744.0,
"step": 54
},
{
"entropy": 1.2286315560340881,
"epoch": 0.10355377735937868,
"grad_norm": 0.12362819164991379,
"learning_rate": 0.0004898496240601503,
"loss": 1.2452,
"mean_token_accuracy": 0.6803877055644989,
"num_tokens": 1459596.0,
"step": 55
},
{
"entropy": 1.2663686275482178,
"epoch": 0.10543657331136738,
"grad_norm": 0.11787568777799606,
"learning_rate": 0.0004896616541353384,
"loss": 1.2594,
"mean_token_accuracy": 0.6688775643706322,
"num_tokens": 1487663.0,
"step": 56
},
{
"entropy": 1.2797971814870834,
"epoch": 0.10731936926335608,
"grad_norm": 0.11497815698385239,
"learning_rate": 0.0004894736842105264,
"loss": 1.2556,
"mean_token_accuracy": 0.6690255850553513,
"num_tokens": 1514365.0,
"step": 57
},
{
"entropy": 1.2839107066392899,
"epoch": 0.10920216521534479,
"grad_norm": 0.11505855619907379,
"learning_rate": 0.0004892857142857142,
"loss": 1.2213,
"mean_token_accuracy": 0.6812370792031288,
"num_tokens": 1542885.0,
"step": 58
},
{
"entropy": 1.290139302611351,
"epoch": 0.11108496116733349,
"grad_norm": 0.11844398826360703,
"learning_rate": 0.0004890977443609023,
"loss": 1.2462,
"mean_token_accuracy": 0.6695830523967743,
"num_tokens": 1567898.0,
"step": 59
},
{
"entropy": 1.2590511292219162,
"epoch": 0.11296775711932219,
"grad_norm": 0.12767820060253143,
"learning_rate": 0.0004889097744360903,
"loss": 1.2515,
"mean_token_accuracy": 0.6738757342100143,
"num_tokens": 1594742.0,
"step": 60
},
{
"entropy": 1.2260379791259766,
"epoch": 0.1148505530713109,
"grad_norm": 0.11811124533414841,
"learning_rate": 0.0004887218045112781,
"loss": 1.1979,
"mean_token_accuracy": 0.6808087155222893,
"num_tokens": 1620685.0,
"step": 61
},
{
"entropy": 1.301318883895874,
"epoch": 0.1167333490232996,
"grad_norm": 0.13785120844841003,
"learning_rate": 0.0004885338345864662,
"loss": 1.3155,
"mean_token_accuracy": 0.6592775583267212,
"num_tokens": 1646541.0,
"step": 62
},
{
"entropy": 1.2704945504665375,
"epoch": 0.1186161449752883,
"grad_norm": 0.11612152308225632,
"learning_rate": 0.0004883458646616542,
"loss": 1.2429,
"mean_token_accuracy": 0.6690341830253601,
"num_tokens": 1674445.0,
"step": 63
},
{
"entropy": 1.2772111147642136,
"epoch": 0.120498940927277,
"grad_norm": 0.12045788764953613,
"learning_rate": 0.00048815789473684215,
"loss": 1.2114,
"mean_token_accuracy": 0.6808006837964058,
"num_tokens": 1701277.0,
"step": 64
},
{
"entropy": 1.2712904959917068,
"epoch": 0.1223817368792657,
"grad_norm": 0.11429794877767563,
"learning_rate": 0.00048796992481203006,
"loss": 1.216,
"mean_token_accuracy": 0.6720417365431786,
"num_tokens": 1728984.0,
"step": 65
},
{
"entropy": 1.3161986768245697,
"epoch": 0.12426453283125441,
"grad_norm": 0.1338111013174057,
"learning_rate": 0.00048778195488721803,
"loss": 1.3229,
"mean_token_accuracy": 0.6602049320936203,
"num_tokens": 1755598.0,
"step": 66
},
{
"entropy": 1.2473317682743073,
"epoch": 0.1261473287832431,
"grad_norm": 0.10488025099039078,
"learning_rate": 0.00048759398496240605,
"loss": 1.2263,
"mean_token_accuracy": 0.6753234788775444,
"num_tokens": 1783417.0,
"step": 67
},
{
"entropy": 1.2551011592149734,
"epoch": 0.12803012473523182,
"grad_norm": 0.11638512462377548,
"learning_rate": 0.000487406015037594,
"loss": 1.224,
"mean_token_accuracy": 0.6783930733799934,
"num_tokens": 1809462.0,
"step": 68
},
{
"entropy": 1.2382186502218246,
"epoch": 0.12991292068722052,
"grad_norm": 0.14887025952339172,
"learning_rate": 0.00048721804511278193,
"loss": 1.2175,
"mean_token_accuracy": 0.6787804737687111,
"num_tokens": 1835642.0,
"step": 69
},
{
"entropy": 1.274851605296135,
"epoch": 0.13179571663920922,
"grad_norm": 0.13403619825839996,
"learning_rate": 0.00048703007518796995,
"loss": 1.2662,
"mean_token_accuracy": 0.6663196384906769,
"num_tokens": 1859904.0,
"step": 70
},
{
"entropy": 1.303640365600586,
"epoch": 0.13367851259119792,
"grad_norm": 0.11801115423440933,
"learning_rate": 0.0004868421052631579,
"loss": 1.3138,
"mean_token_accuracy": 0.6627907082438469,
"num_tokens": 1886915.0,
"step": 71
},
{
"entropy": 1.2814981341362,
"epoch": 0.13556130854318663,
"grad_norm": 0.12543627619743347,
"learning_rate": 0.00048665413533834583,
"loss": 1.2599,
"mean_token_accuracy": 0.6737553998827934,
"num_tokens": 1912683.0,
"step": 72
},
{
"entropy": 1.2715606987476349,
"epoch": 0.13744410449517533,
"grad_norm": 0.11963653564453125,
"learning_rate": 0.00048646616541353385,
"loss": 1.2075,
"mean_token_accuracy": 0.6787137389183044,
"num_tokens": 1940455.0,
"step": 73
},
{
"entropy": 1.2765703648328781,
"epoch": 0.13932690044716403,
"grad_norm": 0.13952264189720154,
"learning_rate": 0.0004862781954887218,
"loss": 1.2043,
"mean_token_accuracy": 0.6798917651176453,
"num_tokens": 1965949.0,
"step": 74
},
{
"entropy": 1.229781836271286,
"epoch": 0.14120969639915273,
"grad_norm": 0.11769476532936096,
"learning_rate": 0.0004860902255639098,
"loss": 1.2063,
"mean_token_accuracy": 0.6715990677475929,
"num_tokens": 1992293.0,
"step": 75
},
{
"entropy": 1.1944819241762161,
"epoch": 0.14309249235114144,
"grad_norm": 0.12095087021589279,
"learning_rate": 0.00048590225563909775,
"loss": 1.217,
"mean_token_accuracy": 0.6814620569348335,
"num_tokens": 2019182.0,
"step": 76
},
{
"entropy": 1.2649260014295578,
"epoch": 0.14497528830313014,
"grad_norm": 0.12220579385757446,
"learning_rate": 0.0004857142857142857,
"loss": 1.2827,
"mean_token_accuracy": 0.6689692661166191,
"num_tokens": 2045357.0,
"step": 77
},
{
"entropy": 1.2532286047935486,
"epoch": 0.14685808425511884,
"grad_norm": 0.12137361615896225,
"learning_rate": 0.0004855263157894737,
"loss": 1.202,
"mean_token_accuracy": 0.6808355078101158,
"num_tokens": 2071015.0,
"step": 78
},
{
"entropy": 1.334955409169197,
"epoch": 0.14874088020710755,
"grad_norm": 0.12754660844802856,
"learning_rate": 0.0004853383458646617,
"loss": 1.2514,
"mean_token_accuracy": 0.6797578409314156,
"num_tokens": 2096831.0,
"step": 79
},
{
"entropy": 1.2261384725570679,
"epoch": 0.15062367615909625,
"grad_norm": 0.11096950620412827,
"learning_rate": 0.0004851503759398496,
"loss": 1.1933,
"mean_token_accuracy": 0.6880421414971352,
"num_tokens": 2126421.0,
"step": 80
},
{
"entropy": 1.2615373581647873,
"epoch": 0.15250647211108495,
"grad_norm": 0.13106736540794373,
"learning_rate": 0.0004849624060150376,
"loss": 1.2198,
"mean_token_accuracy": 0.6821138635277748,
"num_tokens": 2153303.0,
"step": 81
},
{
"entropy": 1.2859619706869125,
"epoch": 0.15438926806307365,
"grad_norm": 0.13115623593330383,
"learning_rate": 0.0004847744360902256,
"loss": 1.2783,
"mean_token_accuracy": 0.6689222902059555,
"num_tokens": 2180250.0,
"step": 82
},
{
"entropy": 1.248913735151291,
"epoch": 0.15627206401506236,
"grad_norm": 0.11291101574897766,
"learning_rate": 0.0004845864661654135,
"loss": 1.2351,
"mean_token_accuracy": 0.6730126142501831,
"num_tokens": 2207001.0,
"step": 83
},
{
"entropy": 1.2413169145584106,
"epoch": 0.15815485996705106,
"grad_norm": 0.1277051717042923,
"learning_rate": 0.0004843984962406015,
"loss": 1.2159,
"mean_token_accuracy": 0.681744784116745,
"num_tokens": 2232587.0,
"step": 84
},
{
"entropy": 1.2155817747116089,
"epoch": 0.16003765591903976,
"grad_norm": 0.15200501680374146,
"learning_rate": 0.0004842105263157895,
"loss": 1.1881,
"mean_token_accuracy": 0.6845081895589828,
"num_tokens": 2260040.0,
"step": 85
},
{
"entropy": 1.1750262528657913,
"epoch": 0.16192045187102846,
"grad_norm": 0.13496170938014984,
"learning_rate": 0.0004840225563909775,
"loss": 1.1566,
"mean_token_accuracy": 0.6882026270031929,
"num_tokens": 2286811.0,
"step": 86
},
{
"entropy": 1.2582080215215683,
"epoch": 0.16380324782301717,
"grad_norm": 0.12751278281211853,
"learning_rate": 0.0004838345864661654,
"loss": 1.2334,
"mean_token_accuracy": 0.6756840199232101,
"num_tokens": 2312376.0,
"step": 87
},
{
"entropy": 1.2530706375837326,
"epoch": 0.16568604377500587,
"grad_norm": 0.12347429990768433,
"learning_rate": 0.0004836466165413534,
"loss": 1.2358,
"mean_token_accuracy": 0.6713104099035263,
"num_tokens": 2338959.0,
"step": 88
},
{
"entropy": 1.2693426012992859,
"epoch": 0.1675688397269946,
"grad_norm": 0.16009417176246643,
"learning_rate": 0.0004834586466165414,
"loss": 1.2511,
"mean_token_accuracy": 0.6736921593546867,
"num_tokens": 2366183.0,
"step": 89
},
{
"entropy": 1.255973756313324,
"epoch": 0.1694516356789833,
"grad_norm": 0.12181756645441055,
"learning_rate": 0.00048327067669172934,
"loss": 1.2052,
"mean_token_accuracy": 0.6734501421451569,
"num_tokens": 2392856.0,
"step": 90
},
{
"entropy": 1.2562214732170105,
"epoch": 0.171334431630972,
"grad_norm": 0.12082800269126892,
"learning_rate": 0.0004830827067669173,
"loss": 1.2519,
"mean_token_accuracy": 0.6692837849259377,
"num_tokens": 2419897.0,
"step": 91
},
{
"entropy": 1.1730956435203552,
"epoch": 0.1732172275829607,
"grad_norm": 0.11969847977161407,
"learning_rate": 0.0004828947368421053,
"loss": 1.1305,
"mean_token_accuracy": 0.6944040432572365,
"num_tokens": 2449131.0,
"step": 92
},
{
"entropy": 1.2573560923337936,
"epoch": 0.1751000235349494,
"grad_norm": 0.1183922290802002,
"learning_rate": 0.00048270676691729324,
"loss": 1.224,
"mean_token_accuracy": 0.6771978959441185,
"num_tokens": 2474107.0,
"step": 93
},
{
"entropy": 1.2122257351875305,
"epoch": 0.17698281948693811,
"grad_norm": 0.1325969696044922,
"learning_rate": 0.0004825187969924812,
"loss": 1.1754,
"mean_token_accuracy": 0.6865298077464104,
"num_tokens": 2501837.0,
"step": 94
},
{
"entropy": 1.2060312926769257,
"epoch": 0.17886561543892682,
"grad_norm": 0.12340355664491653,
"learning_rate": 0.0004823308270676692,
"loss": 1.2042,
"mean_token_accuracy": 0.6752656251192093,
"num_tokens": 2528769.0,
"step": 95
},
{
"entropy": 1.268461525440216,
"epoch": 0.18074841139091552,
"grad_norm": 0.1260639727115631,
"learning_rate": 0.00048214285714285715,
"loss": 1.2781,
"mean_token_accuracy": 0.6681492626667023,
"num_tokens": 2555451.0,
"step": 96
},
{
"entropy": 1.2650732845067978,
"epoch": 0.18263120734290422,
"grad_norm": 0.12851010262966156,
"learning_rate": 0.00048195488721804517,
"loss": 1.2458,
"mean_token_accuracy": 0.671695739030838,
"num_tokens": 2582196.0,
"step": 97
},
{
"entropy": 1.2784437835216522,
"epoch": 0.18451400329489293,
"grad_norm": 0.1278950273990631,
"learning_rate": 0.0004817669172932331,
"loss": 1.2319,
"mean_token_accuracy": 0.6702851504087448,
"num_tokens": 2608444.0,
"step": 98
},
{
"entropy": 1.2551447749137878,
"epoch": 0.18639679924688163,
"grad_norm": 0.1206209808588028,
"learning_rate": 0.00048157894736842105,
"loss": 1.2044,
"mean_token_accuracy": 0.677789680659771,
"num_tokens": 2634109.0,
"step": 99
},
{
"entropy": 1.2039145231246948,
"epoch": 0.18827959519887033,
"grad_norm": 0.12305069714784622,
"learning_rate": 0.00048139097744360907,
"loss": 1.1637,
"mean_token_accuracy": 0.6861624270677567,
"num_tokens": 2659548.0,
"step": 100
},
{
"entropy": 1.2327278852462769,
"epoch": 0.19016239115085903,
"grad_norm": 0.13643652200698853,
"learning_rate": 0.000481203007518797,
"loss": 1.212,
"mean_token_accuracy": 0.6804677918553352,
"num_tokens": 2684638.0,
"step": 101
},
{
"entropy": 1.194289356470108,
"epoch": 0.19204518710284774,
"grad_norm": 0.15666837990283966,
"learning_rate": 0.00048101503759398495,
"loss": 1.1797,
"mean_token_accuracy": 0.683199092745781,
"num_tokens": 2711970.0,
"step": 102
},
{
"entropy": 1.2052866965532303,
"epoch": 0.19392798305483644,
"grad_norm": 0.12934386730194092,
"learning_rate": 0.00048082706766917297,
"loss": 1.1954,
"mean_token_accuracy": 0.6831924915313721,
"num_tokens": 2738028.0,
"step": 103
},
{
"entropy": 1.2316648960113525,
"epoch": 0.19581077900682514,
"grad_norm": 0.12603920698165894,
"learning_rate": 0.00048063909774436094,
"loss": 1.2112,
"mean_token_accuracy": 0.6792290285229683,
"num_tokens": 2765091.0,
"step": 104
},
{
"entropy": 1.2624593675136566,
"epoch": 0.19769357495881384,
"grad_norm": 0.1318008452653885,
"learning_rate": 0.00048045112781954885,
"loss": 1.2389,
"mean_token_accuracy": 0.6782659739255905,
"num_tokens": 2792661.0,
"step": 105
},
{
"entropy": 1.2824029475450516,
"epoch": 0.19957637091080255,
"grad_norm": 0.13028129935264587,
"learning_rate": 0.00048026315789473687,
"loss": 1.2581,
"mean_token_accuracy": 0.6727664992213249,
"num_tokens": 2819535.0,
"step": 106
},
{
"entropy": 1.1964116394519806,
"epoch": 0.20145916686279125,
"grad_norm": 0.16565856337547302,
"learning_rate": 0.00048007518796992484,
"loss": 1.1427,
"mean_token_accuracy": 0.6922469958662987,
"num_tokens": 2848429.0,
"step": 107
},
{
"entropy": 1.2726367861032486,
"epoch": 0.20334196281477995,
"grad_norm": 0.1416698843240738,
"learning_rate": 0.0004798872180451128,
"loss": 1.225,
"mean_token_accuracy": 0.6754879876971245,
"num_tokens": 2874776.0,
"step": 108
},
{
"entropy": 1.2357124537229538,
"epoch": 0.20522475876676866,
"grad_norm": 0.12491658329963684,
"learning_rate": 0.00047969924812030077,
"loss": 1.204,
"mean_token_accuracy": 0.6739878728985786,
"num_tokens": 2902602.0,
"step": 109
},
{
"entropy": 1.2650941908359528,
"epoch": 0.20710755471875736,
"grad_norm": 0.13329921662807465,
"learning_rate": 0.00047951127819548874,
"loss": 1.2432,
"mean_token_accuracy": 0.6738255694508553,
"num_tokens": 2929536.0,
"step": 110
},
{
"entropy": 1.2259162962436676,
"epoch": 0.20899035067074606,
"grad_norm": 0.14152902364730835,
"learning_rate": 0.0004793233082706767,
"loss": 1.1886,
"mean_token_accuracy": 0.6813376769423485,
"num_tokens": 2955236.0,
"step": 111
},
{
"entropy": 1.1335331127047539,
"epoch": 0.21087314662273476,
"grad_norm": 0.13298991322517395,
"learning_rate": 0.00047913533834586467,
"loss": 1.1339,
"mean_token_accuracy": 0.6916593015193939,
"num_tokens": 2979921.0,
"step": 112
},
{
"entropy": 1.2154437899589539,
"epoch": 0.21275594257472347,
"grad_norm": 0.15994608402252197,
"learning_rate": 0.00047894736842105264,
"loss": 1.2115,
"mean_token_accuracy": 0.679818794131279,
"num_tokens": 3005638.0,
"step": 113
},
{
"entropy": 1.215769276022911,
"epoch": 0.21463873852671217,
"grad_norm": 0.11282095313072205,
"learning_rate": 0.0004787593984962406,
"loss": 1.1821,
"mean_token_accuracy": 0.6841456890106201,
"num_tokens": 3033979.0,
"step": 114
},
{
"entropy": 1.2786222100257874,
"epoch": 0.21652153447870087,
"grad_norm": 0.13811451196670532,
"learning_rate": 0.0004785714285714286,
"loss": 1.2177,
"mean_token_accuracy": 0.6760591194033623,
"num_tokens": 3060581.0,
"step": 115
},
{
"entropy": 1.1782392710447311,
"epoch": 0.21840433043068957,
"grad_norm": 0.12641046941280365,
"learning_rate": 0.00047838345864661654,
"loss": 1.1449,
"mean_token_accuracy": 0.6954788789153099,
"num_tokens": 3086594.0,
"step": 116
},
{
"entropy": 1.2415330708026886,
"epoch": 0.22028712638267828,
"grad_norm": 0.1396101415157318,
"learning_rate": 0.0004781954887218045,
"loss": 1.2245,
"mean_token_accuracy": 0.6794020012021065,
"num_tokens": 3114117.0,
"step": 117
},
{
"entropy": 1.2689218074083328,
"epoch": 0.22216992233466698,
"grad_norm": 0.13006678223609924,
"learning_rate": 0.00047800751879699253,
"loss": 1.2523,
"mean_token_accuracy": 0.6741964370012283,
"num_tokens": 3140643.0,
"step": 118
},
{
"entropy": 1.2656696736812592,
"epoch": 0.22405271828665568,
"grad_norm": 0.15107867121696472,
"learning_rate": 0.0004778195488721805,
"loss": 1.2539,
"mean_token_accuracy": 0.6682558432221413,
"num_tokens": 3166141.0,
"step": 119
},
{
"entropy": 1.1993789225816727,
"epoch": 0.22593551423864439,
"grad_norm": 0.11653780192136765,
"learning_rate": 0.0004776315789473684,
"loss": 1.1753,
"mean_token_accuracy": 0.6902748569846153,
"num_tokens": 3193339.0,
"step": 120
},
{
"entropy": 1.231392353773117,
"epoch": 0.2278183101906331,
"grad_norm": 0.1314115673303604,
"learning_rate": 0.00047744360902255643,
"loss": 1.2005,
"mean_token_accuracy": 0.6799951046705246,
"num_tokens": 3219993.0,
"step": 121
},
{
"entropy": 1.2121622115373611,
"epoch": 0.2297011061426218,
"grad_norm": 0.12394538521766663,
"learning_rate": 0.0004772556390977444,
"loss": 1.1715,
"mean_token_accuracy": 0.6903199851512909,
"num_tokens": 3247444.0,
"step": 122
},
{
"entropy": 1.2413930743932724,
"epoch": 0.2315839020946105,
"grad_norm": 0.1266545057296753,
"learning_rate": 0.0004770676691729323,
"loss": 1.1899,
"mean_token_accuracy": 0.682403139770031,
"num_tokens": 3272627.0,
"step": 123
},
{
"entropy": 1.1818571537733078,
"epoch": 0.2334666980465992,
"grad_norm": 0.15664935111999512,
"learning_rate": 0.00047687969924812033,
"loss": 1.1479,
"mean_token_accuracy": 0.6944203674793243,
"num_tokens": 3296898.0,
"step": 124
},
{
"entropy": 1.2419498413801193,
"epoch": 0.2353494939985879,
"grad_norm": 0.15578152239322662,
"learning_rate": 0.0004766917293233083,
"loss": 1.2335,
"mean_token_accuracy": 0.6732713803648949,
"num_tokens": 3322692.0,
"step": 125
},
{
"entropy": 1.2249382436275482,
"epoch": 0.2372322899505766,
"grad_norm": 0.14584508538246155,
"learning_rate": 0.00047650375939849626,
"loss": 1.2124,
"mean_token_accuracy": 0.6797131448984146,
"num_tokens": 3348376.0,
"step": 126
},
{
"entropy": 1.2090249583125114,
"epoch": 0.2391150859025653,
"grad_norm": 0.15335120260715485,
"learning_rate": 0.0004763157894736842,
"loss": 1.1861,
"mean_token_accuracy": 0.6816836297512054,
"num_tokens": 3375056.0,
"step": 127
},
{
"entropy": 1.2331191301345825,
"epoch": 0.240997881854554,
"grad_norm": 0.13854444026947021,
"learning_rate": 0.0004761278195488722,
"loss": 1.1867,
"mean_token_accuracy": 0.6822093352675438,
"num_tokens": 3401338.0,
"step": 128
},
{
"entropy": 1.2083263993263245,
"epoch": 0.2428806778065427,
"grad_norm": 0.1330289989709854,
"learning_rate": 0.00047593984962406016,
"loss": 1.1774,
"mean_token_accuracy": 0.6801193058490753,
"num_tokens": 3426232.0,
"step": 129
},
{
"entropy": 1.2008604258298874,
"epoch": 0.2447634737585314,
"grad_norm": 0.14914868772029877,
"learning_rate": 0.00047575187969924813,
"loss": 1.1679,
"mean_token_accuracy": 0.6855365261435509,
"num_tokens": 3454080.0,
"step": 130
},
{
"entropy": 1.2279947251081467,
"epoch": 0.24664626971052012,
"grad_norm": 0.18307369947433472,
"learning_rate": 0.0004755639097744361,
"loss": 1.2333,
"mean_token_accuracy": 0.672551229596138,
"num_tokens": 3478258.0,
"step": 131
},
{
"entropy": 1.1894963383674622,
"epoch": 0.24852906566250882,
"grad_norm": 0.13398650288581848,
"learning_rate": 0.00047537593984962407,
"loss": 1.1953,
"mean_token_accuracy": 0.6832383349537849,
"num_tokens": 3504254.0,
"step": 132
},
{
"entropy": 1.2269657999277115,
"epoch": 0.2504118616144975,
"grad_norm": 0.13811668753623962,
"learning_rate": 0.00047518796992481203,
"loss": 1.1741,
"mean_token_accuracy": 0.6880706697702408,
"num_tokens": 3531225.0,
"step": 133
},
{
"entropy": 1.198286533355713,
"epoch": 0.2522946575664862,
"grad_norm": 0.17705924808979034,
"learning_rate": 0.000475,
"loss": 1.1395,
"mean_token_accuracy": 0.691774420440197,
"num_tokens": 3556428.0,
"step": 134
},
{
"entropy": 1.2244715094566345,
"epoch": 0.2541774535184749,
"grad_norm": 0.17644067108631134,
"learning_rate": 0.00047481203007518797,
"loss": 1.2204,
"mean_token_accuracy": 0.6757577136158943,
"num_tokens": 3583373.0,
"step": 135
},
{
"entropy": 1.208250641822815,
"epoch": 0.25606024947046363,
"grad_norm": 0.12975312769412994,
"learning_rate": 0.00047462406015037593,
"loss": 1.2032,
"mean_token_accuracy": 0.68288903683424,
"num_tokens": 3610878.0,
"step": 136
},
{
"entropy": 1.1764077246189117,
"epoch": 0.25794304542245233,
"grad_norm": 0.13420140743255615,
"learning_rate": 0.00047443609022556395,
"loss": 1.1343,
"mean_token_accuracy": 0.6927010640501976,
"num_tokens": 3636794.0,
"step": 137
},
{
"entropy": 1.2354558259248734,
"epoch": 0.25982584137444104,
"grad_norm": 0.12880398333072662,
"learning_rate": 0.00047424812030075187,
"loss": 1.1809,
"mean_token_accuracy": 0.682947002351284,
"num_tokens": 3665578.0,
"step": 138
},
{
"entropy": 1.175147533416748,
"epoch": 0.26170863732642974,
"grad_norm": 0.15634110569953918,
"learning_rate": 0.00047406015037593983,
"loss": 1.1483,
"mean_token_accuracy": 0.6907549053430557,
"num_tokens": 3691407.0,
"step": 139
},
{
"entropy": 1.1331272423267365,
"epoch": 0.26359143327841844,
"grad_norm": 0.13562822341918945,
"learning_rate": 0.00047387218045112786,
"loss": 1.119,
"mean_token_accuracy": 0.6953889951109886,
"num_tokens": 3718468.0,
"step": 140
},
{
"entropy": 1.2285344004631042,
"epoch": 0.26547422923040714,
"grad_norm": 0.1443127691745758,
"learning_rate": 0.00047368421052631577,
"loss": 1.2352,
"mean_token_accuracy": 0.6712902784347534,
"num_tokens": 3744121.0,
"step": 141
},
{
"entropy": 1.2572973817586899,
"epoch": 0.26735702518239585,
"grad_norm": 0.14697600901126862,
"learning_rate": 0.00047349624060150373,
"loss": 1.2545,
"mean_token_accuracy": 0.6712752804160118,
"num_tokens": 3768665.0,
"step": 142
},
{
"entropy": 1.2219904512166977,
"epoch": 0.26923982113438455,
"grad_norm": 0.1259946972131729,
"learning_rate": 0.00047330827067669176,
"loss": 1.1953,
"mean_token_accuracy": 0.6853306293487549,
"num_tokens": 3798421.0,
"step": 143
},
{
"entropy": 1.2031358480453491,
"epoch": 0.27112261708637325,
"grad_norm": 0.1336822658777237,
"learning_rate": 0.0004731203007518797,
"loss": 1.1158,
"mean_token_accuracy": 0.7008628249168396,
"num_tokens": 3826569.0,
"step": 144
},
{
"entropy": 1.2654242366552353,
"epoch": 0.27300541303836195,
"grad_norm": 0.12933260202407837,
"learning_rate": 0.00047293233082706764,
"loss": 1.2125,
"mean_token_accuracy": 0.6849671006202698,
"num_tokens": 3853128.0,
"step": 145
},
{
"entropy": 1.1577993482351303,
"epoch": 0.27488820899035066,
"grad_norm": 0.13406828045845032,
"learning_rate": 0.00047274436090225566,
"loss": 1.1624,
"mean_token_accuracy": 0.6865072473883629,
"num_tokens": 3880569.0,
"step": 146
},
{
"entropy": 1.1901942938566208,
"epoch": 0.27677100494233936,
"grad_norm": 0.14410416781902313,
"learning_rate": 0.0004725563909774436,
"loss": 1.2313,
"mean_token_accuracy": 0.6749508231878281,
"num_tokens": 3907559.0,
"step": 147
},
{
"entropy": 1.1600831672549248,
"epoch": 0.27865380089432806,
"grad_norm": 0.1339792162179947,
"learning_rate": 0.0004723684210526316,
"loss": 1.1987,
"mean_token_accuracy": 0.6836483106017113,
"num_tokens": 3934255.0,
"step": 148
},
{
"entropy": 1.2559089958667755,
"epoch": 0.28053659684631677,
"grad_norm": 0.12650057673454285,
"learning_rate": 0.00047218045112781956,
"loss": 1.2294,
"mean_token_accuracy": 0.6761154308915138,
"num_tokens": 3959809.0,
"step": 149
},
{
"entropy": 1.2887302935123444,
"epoch": 0.28241939279830547,
"grad_norm": 0.14123603701591492,
"learning_rate": 0.0004719924812030075,
"loss": 1.1892,
"mean_token_accuracy": 0.6841337457299232,
"num_tokens": 3984834.0,
"step": 150
},
{
"entropy": 1.2641656994819641,
"epoch": 0.28430218875029417,
"grad_norm": 0.13069137930870056,
"learning_rate": 0.0004718045112781955,
"loss": 1.178,
"mean_token_accuracy": 0.6903347223997116,
"num_tokens": 4011854.0,
"step": 151
},
{
"entropy": 1.2745257169008255,
"epoch": 0.2861849847022829,
"grad_norm": 0.12974441051483154,
"learning_rate": 0.00047161654135338346,
"loss": 1.2299,
"mean_token_accuracy": 0.6787015795707703,
"num_tokens": 4038272.0,
"step": 152
},
{
"entropy": 1.2451976537704468,
"epoch": 0.2880677806542716,
"grad_norm": 0.15594416856765747,
"learning_rate": 0.0004714285714285714,
"loss": 1.2506,
"mean_token_accuracy": 0.6727647334337234,
"num_tokens": 4066761.0,
"step": 153
},
{
"entropy": 1.1639655232429504,
"epoch": 0.2899505766062603,
"grad_norm": 0.12053865194320679,
"learning_rate": 0.0004712406015037594,
"loss": 1.167,
"mean_token_accuracy": 0.6889369264245033,
"num_tokens": 4094208.0,
"step": 154
},
{
"entropy": 1.1459853649139404,
"epoch": 0.291833372558249,
"grad_norm": 0.15322330594062805,
"learning_rate": 0.0004710526315789474,
"loss": 1.1297,
"mean_token_accuracy": 0.691886380314827,
"num_tokens": 4121959.0,
"step": 155
},
{
"entropy": 1.2293187081813812,
"epoch": 0.2937161685102377,
"grad_norm": 0.135823056101799,
"learning_rate": 0.0004708646616541353,
"loss": 1.2266,
"mean_token_accuracy": 0.6803058981895447,
"num_tokens": 4147782.0,
"step": 156
},
{
"entropy": 1.192505158483982,
"epoch": 0.2955989644622264,
"grad_norm": 0.13535255193710327,
"learning_rate": 0.0004706766917293233,
"loss": 1.1608,
"mean_token_accuracy": 0.6955654844641685,
"num_tokens": 4176277.0,
"step": 157
},
{
"entropy": 1.2871312350034714,
"epoch": 0.2974817604142151,
"grad_norm": 0.12719225883483887,
"learning_rate": 0.0004704887218045113,
"loss": 1.2311,
"mean_token_accuracy": 0.6765939891338348,
"num_tokens": 4202697.0,
"step": 158
},
{
"entropy": 1.2744830250740051,
"epoch": 0.2993645563662038,
"grad_norm": 0.15343067049980164,
"learning_rate": 0.0004703007518796993,
"loss": 1.2229,
"mean_token_accuracy": 0.671116054058075,
"num_tokens": 4229068.0,
"step": 159
},
{
"entropy": 1.2606779783964157,
"epoch": 0.3012473523181925,
"grad_norm": 0.12448015809059143,
"learning_rate": 0.0004701127819548872,
"loss": 1.2061,
"mean_token_accuracy": 0.6829146966338158,
"num_tokens": 4256896.0,
"step": 160
},
{
"entropy": 1.150521382689476,
"epoch": 0.3031301482701812,
"grad_norm": 0.1213938444852829,
"learning_rate": 0.0004699248120300752,
"loss": 1.128,
"mean_token_accuracy": 0.6945177465677261,
"num_tokens": 4283765.0,
"step": 161
},
{
"entropy": 1.1809571981430054,
"epoch": 0.3050129442221699,
"grad_norm": 0.13989101350307465,
"learning_rate": 0.0004697368421052632,
"loss": 1.1549,
"mean_token_accuracy": 0.6888199374079704,
"num_tokens": 4308970.0,
"step": 162
},
{
"entropy": 1.151911549270153,
"epoch": 0.3068957401741586,
"grad_norm": 0.2074657380580902,
"learning_rate": 0.0004695488721804511,
"loss": 1.1309,
"mean_token_accuracy": 0.6942140832543373,
"num_tokens": 4333158.0,
"step": 163
},
{
"entropy": 1.1968079656362534,
"epoch": 0.3087785361261473,
"grad_norm": 0.13570360839366913,
"learning_rate": 0.0004693609022556391,
"loss": 1.1814,
"mean_token_accuracy": 0.6869696602225304,
"num_tokens": 4360040.0,
"step": 164
},
{
"entropy": 1.1787877827882767,
"epoch": 0.310661332078136,
"grad_norm": 0.13379861414432526,
"learning_rate": 0.0004691729323308271,
"loss": 1.1791,
"mean_token_accuracy": 0.6811994835734367,
"num_tokens": 4386186.0,
"step": 165
},
{
"entropy": 1.2168269157409668,
"epoch": 0.3125441280301247,
"grad_norm": 0.1466514617204666,
"learning_rate": 0.00046898496240601505,
"loss": 1.2131,
"mean_token_accuracy": 0.6801121830940247,
"num_tokens": 4412572.0,
"step": 166
},
{
"entropy": 1.191074714064598,
"epoch": 0.3144269239821134,
"grad_norm": 0.13052161037921906,
"learning_rate": 0.000468796992481203,
"loss": 1.1818,
"mean_token_accuracy": 0.6877126544713974,
"num_tokens": 4439798.0,
"step": 167
},
{
"entropy": 1.310966208577156,
"epoch": 0.3163097199341021,
"grad_norm": 0.14339525997638702,
"learning_rate": 0.000468609022556391,
"loss": 1.2826,
"mean_token_accuracy": 0.6668709591031075,
"num_tokens": 4465182.0,
"step": 168
},
{
"entropy": 1.249758929014206,
"epoch": 0.3181925158860908,
"grad_norm": 0.14204370975494385,
"learning_rate": 0.00046842105263157895,
"loss": 1.1944,
"mean_token_accuracy": 0.6822869181632996,
"num_tokens": 4491690.0,
"step": 169
},
{
"entropy": 1.2281111925840378,
"epoch": 0.3200753118380795,
"grad_norm": 0.13778182864189148,
"learning_rate": 0.0004682330827067669,
"loss": 1.1821,
"mean_token_accuracy": 0.6827872395515442,
"num_tokens": 4518668.0,
"step": 170
},
{
"entropy": 1.1907898932695389,
"epoch": 0.3219581077900682,
"grad_norm": 0.13682714104652405,
"learning_rate": 0.0004680451127819549,
"loss": 1.1654,
"mean_token_accuracy": 0.6878219619393349,
"num_tokens": 4544500.0,
"step": 171
},
{
"entropy": 1.2053745537996292,
"epoch": 0.32384090374205693,
"grad_norm": 0.1406177431344986,
"learning_rate": 0.00046785714285714285,
"loss": 1.2351,
"mean_token_accuracy": 0.6759226024150848,
"num_tokens": 4570672.0,
"step": 172
},
{
"entropy": 1.1686365455389023,
"epoch": 0.32572369969404563,
"grad_norm": 0.1390364021062851,
"learning_rate": 0.0004676691729323309,
"loss": 1.1563,
"mean_token_accuracy": 0.6870525777339935,
"num_tokens": 4597157.0,
"step": 173
},
{
"entropy": 1.1847928017377853,
"epoch": 0.32760649564603433,
"grad_norm": 0.12553362548351288,
"learning_rate": 0.0004674812030075188,
"loss": 1.1464,
"mean_token_accuracy": 0.6896436884999275,
"num_tokens": 4622963.0,
"step": 174
},
{
"entropy": 1.2175119668245316,
"epoch": 0.32948929159802304,
"grad_norm": 0.12723615765571594,
"learning_rate": 0.00046729323308270675,
"loss": 1.1887,
"mean_token_accuracy": 0.6839049756526947,
"num_tokens": 4650796.0,
"step": 175
},
{
"entropy": 1.2538534700870514,
"epoch": 0.33137208755001174,
"grad_norm": 0.1439773291349411,
"learning_rate": 0.0004671052631578948,
"loss": 1.1796,
"mean_token_accuracy": 0.6849694699048996,
"num_tokens": 4675067.0,
"step": 176
},
{
"entropy": 1.2113288342952728,
"epoch": 0.33325488350200044,
"grad_norm": 0.20407459139823914,
"learning_rate": 0.00046691729323308274,
"loss": 1.1616,
"mean_token_accuracy": 0.6856766641139984,
"num_tokens": 4700943.0,
"step": 177
},
{
"entropy": 1.1914596557617188,
"epoch": 0.3351376794539892,
"grad_norm": 0.13831955194473267,
"learning_rate": 0.00046672932330827065,
"loss": 1.1938,
"mean_token_accuracy": 0.6882949769496918,
"num_tokens": 4728608.0,
"step": 178
},
{
"entropy": 1.1632477790117264,
"epoch": 0.3370204754059779,
"grad_norm": 0.1430656909942627,
"learning_rate": 0.0004665413533834587,
"loss": 1.1745,
"mean_token_accuracy": 0.6857840716838837,
"num_tokens": 4754323.0,
"step": 179
},
{
"entropy": 1.1661407798528671,
"epoch": 0.3389032713579666,
"grad_norm": 0.13480572402477264,
"learning_rate": 0.00046635338345864664,
"loss": 1.1677,
"mean_token_accuracy": 0.6842626482248306,
"num_tokens": 4777734.0,
"step": 180
},
{
"entropy": 1.2307626903057098,
"epoch": 0.3407860673099553,
"grad_norm": 0.14171424508094788,
"learning_rate": 0.00046616541353383456,
"loss": 1.2112,
"mean_token_accuracy": 0.6779276877641678,
"num_tokens": 4803062.0,
"step": 181
},
{
"entropy": 1.2344750761985779,
"epoch": 0.342668863261944,
"grad_norm": 0.1366141438484192,
"learning_rate": 0.0004659774436090226,
"loss": 1.1521,
"mean_token_accuracy": 0.6871028989553452,
"num_tokens": 4828406.0,
"step": 182
},
{
"entropy": 1.2267533838748932,
"epoch": 0.3445516592139327,
"grad_norm": 0.12364047765731812,
"learning_rate": 0.00046578947368421054,
"loss": 1.157,
"mean_token_accuracy": 0.6939859166741371,
"num_tokens": 4855048.0,
"step": 183
},
{
"entropy": 1.25662961602211,
"epoch": 0.3464344551659214,
"grad_norm": 0.14521241188049316,
"learning_rate": 0.0004656015037593985,
"loss": 1.2005,
"mean_token_accuracy": 0.6837843209505081,
"num_tokens": 4879838.0,
"step": 184
},
{
"entropy": 1.1265386119484901,
"epoch": 0.3483172511179101,
"grad_norm": 0.13281729817390442,
"learning_rate": 0.0004654135338345865,
"loss": 1.1245,
"mean_token_accuracy": 0.7005239203572273,
"num_tokens": 4906673.0,
"step": 185
},
{
"entropy": 1.1675947606563568,
"epoch": 0.3502000470698988,
"grad_norm": 0.13612613081932068,
"learning_rate": 0.00046522556390977444,
"loss": 1.1783,
"mean_token_accuracy": 0.6867906153202057,
"num_tokens": 4932081.0,
"step": 186
},
{
"entropy": 1.1747846454381943,
"epoch": 0.3520828430218875,
"grad_norm": 0.14062775671482086,
"learning_rate": 0.0004650375939849624,
"loss": 1.1849,
"mean_token_accuracy": 0.6804407685995102,
"num_tokens": 4957805.0,
"step": 187
},
{
"entropy": 1.3040417283773422,
"epoch": 0.35396563897387623,
"grad_norm": 0.13647155463695526,
"learning_rate": 0.00046484962406015043,
"loss": 1.2723,
"mean_token_accuracy": 0.6708482652902603,
"num_tokens": 4982727.0,
"step": 188
},
{
"entropy": 1.273634523153305,
"epoch": 0.35584843492586493,
"grad_norm": 0.2908094823360443,
"learning_rate": 0.00046466165413533835,
"loss": 1.2188,
"mean_token_accuracy": 0.6769787892699242,
"num_tokens": 5008167.0,
"step": 189
},
{
"entropy": 1.294351875782013,
"epoch": 0.35773123087785363,
"grad_norm": 0.14780114591121674,
"learning_rate": 0.0004644736842105263,
"loss": 1.2497,
"mean_token_accuracy": 0.6740161553025246,
"num_tokens": 5031994.0,
"step": 190
},
{
"entropy": 1.164976328611374,
"epoch": 0.35961402682984234,
"grad_norm": 0.1321694701910019,
"learning_rate": 0.00046428571428571433,
"loss": 1.1297,
"mean_token_accuracy": 0.6937556862831116,
"num_tokens": 5058242.0,
"step": 191
},
{
"entropy": 1.1738992556929588,
"epoch": 0.36149682278183104,
"grad_norm": 0.13215236365795135,
"learning_rate": 0.00046409774436090225,
"loss": 1.1639,
"mean_token_accuracy": 0.688830278813839,
"num_tokens": 5086002.0,
"step": 192
},
{
"entropy": 1.2423847168684006,
"epoch": 0.36337961873381974,
"grad_norm": 0.13844619691371918,
"learning_rate": 0.0004639097744360902,
"loss": 1.2462,
"mean_token_accuracy": 0.6728790327906609,
"num_tokens": 5115116.0,
"step": 193
},
{
"entropy": 1.188772901892662,
"epoch": 0.36526241468580845,
"grad_norm": 0.1350889950990677,
"learning_rate": 0.00046372180451127824,
"loss": 1.162,
"mean_token_accuracy": 0.6961116194725037,
"num_tokens": 5141316.0,
"step": 194
},
{
"entropy": 1.2510673254728317,
"epoch": 0.36714521063779715,
"grad_norm": 0.13393868505954742,
"learning_rate": 0.0004635338345864662,
"loss": 1.2165,
"mean_token_accuracy": 0.675739549100399,
"num_tokens": 5168389.0,
"step": 195
},
{
"entropy": 1.2140327990055084,
"epoch": 0.36902800658978585,
"grad_norm": 0.15341585874557495,
"learning_rate": 0.0004633458646616541,
"loss": 1.1891,
"mean_token_accuracy": 0.6846036836504936,
"num_tokens": 5196797.0,
"step": 196
},
{
"entropy": 1.140480324625969,
"epoch": 0.37091080254177455,
"grad_norm": 0.14681561291217804,
"learning_rate": 0.00046315789473684214,
"loss": 1.1129,
"mean_token_accuracy": 0.7001371458172798,
"num_tokens": 5221689.0,
"step": 197
},
{
"entropy": 1.149554505944252,
"epoch": 0.37279359849376326,
"grad_norm": 0.12448862940073013,
"learning_rate": 0.0004629699248120301,
"loss": 1.0918,
"mean_token_accuracy": 0.7011524215340614,
"num_tokens": 5248151.0,
"step": 198
},
{
"entropy": 1.1877187192440033,
"epoch": 0.37467639444575196,
"grad_norm": 0.12904192507266998,
"learning_rate": 0.00046278195488721807,
"loss": 1.1381,
"mean_token_accuracy": 0.6980564966797829,
"num_tokens": 5276462.0,
"step": 199
},
{
"entropy": 1.1336260885000229,
"epoch": 0.37655919039774066,
"grad_norm": 0.14019370079040527,
"learning_rate": 0.00046259398496240604,
"loss": 1.1408,
"mean_token_accuracy": 0.6882188692688942,
"num_tokens": 5303965.0,
"step": 200
},
{
"entropy": 1.142029918730259,
"epoch": 0.37844198634972936,
"grad_norm": 0.12954500317573547,
"learning_rate": 0.000462406015037594,
"loss": 1.1225,
"mean_token_accuracy": 0.7019821628928185,
"num_tokens": 5333147.0,
"step": 201
},
{
"entropy": 1.1055554077029228,
"epoch": 0.38032478230171807,
"grad_norm": 0.14525440335273743,
"learning_rate": 0.00046221804511278197,
"loss": 1.0873,
"mean_token_accuracy": 0.6984671205282211,
"num_tokens": 5360603.0,
"step": 202
},
{
"entropy": 1.1669521182775497,
"epoch": 0.38220757825370677,
"grad_norm": 0.12719959020614624,
"learning_rate": 0.00046203007518796994,
"loss": 1.1408,
"mean_token_accuracy": 0.6958698183298111,
"num_tokens": 5386882.0,
"step": 203
},
{
"entropy": 1.2504252791404724,
"epoch": 0.3840903742056955,
"grad_norm": 0.14054498076438904,
"learning_rate": 0.0004618421052631579,
"loss": 1.2147,
"mean_token_accuracy": 0.6776561290025711,
"num_tokens": 5413184.0,
"step": 204
},
{
"entropy": 1.226726457476616,
"epoch": 0.3859731701576842,
"grad_norm": 0.13887910544872284,
"learning_rate": 0.00046165413533834587,
"loss": 1.193,
"mean_token_accuracy": 0.6823991388082504,
"num_tokens": 5438606.0,
"step": 205
},
{
"entropy": 1.1875706166028976,
"epoch": 0.3878559661096729,
"grad_norm": 0.14024114608764648,
"learning_rate": 0.0004614661654135339,
"loss": 1.1676,
"mean_token_accuracy": 0.684231162071228,
"num_tokens": 5464123.0,
"step": 206
},
{
"entropy": 1.2047923803329468,
"epoch": 0.3897387620616616,
"grad_norm": 0.1310993880033493,
"learning_rate": 0.0004612781954887218,
"loss": 1.1851,
"mean_token_accuracy": 0.6833815798163414,
"num_tokens": 5491426.0,
"step": 207
},
{
"entropy": 1.2198069095611572,
"epoch": 0.3916215580136503,
"grad_norm": 0.13591070473194122,
"learning_rate": 0.00046109022556390977,
"loss": 1.2115,
"mean_token_accuracy": 0.6876263841986656,
"num_tokens": 5517873.0,
"step": 208
},
{
"entropy": 1.2492990344762802,
"epoch": 0.393504353965639,
"grad_norm": 0.1313110738992691,
"learning_rate": 0.0004609022556390978,
"loss": 1.2303,
"mean_token_accuracy": 0.6741604581475258,
"num_tokens": 5545541.0,
"step": 209
},
{
"entropy": 1.2249716967344284,
"epoch": 0.3953871499176277,
"grad_norm": 0.13691024482250214,
"learning_rate": 0.0004607142857142857,
"loss": 1.1994,
"mean_token_accuracy": 0.6825065985321999,
"num_tokens": 5571818.0,
"step": 210
},
{
"entropy": 1.2132453471422195,
"epoch": 0.3972699458696164,
"grad_norm": 0.13897888362407684,
"learning_rate": 0.0004605263157894737,
"loss": 1.2105,
"mean_token_accuracy": 0.6761833131313324,
"num_tokens": 5598744.0,
"step": 211
},
{
"entropy": 1.1871661990880966,
"epoch": 0.3991527418216051,
"grad_norm": 0.13007131218910217,
"learning_rate": 0.00046033834586466164,
"loss": 1.1726,
"mean_token_accuracy": 0.6834597215056419,
"num_tokens": 5625839.0,
"step": 212
},
{
"entropy": 1.1333737969398499,
"epoch": 0.4010355377735938,
"grad_norm": 0.12430460005998611,
"learning_rate": 0.00046015037593984966,
"loss": 1.1019,
"mean_token_accuracy": 0.7014463916420937,
"num_tokens": 5654141.0,
"step": 213
},
{
"entropy": 1.2297871708869934,
"epoch": 0.4029183337255825,
"grad_norm": 0.13888096809387207,
"learning_rate": 0.0004599624060150376,
"loss": 1.1764,
"mean_token_accuracy": 0.6898130550980568,
"num_tokens": 5678609.0,
"step": 214
},
{
"entropy": 1.2013902068138123,
"epoch": 0.4048011296775712,
"grad_norm": 0.12778723239898682,
"learning_rate": 0.00045977443609022554,
"loss": 1.1552,
"mean_token_accuracy": 0.6898351311683655,
"num_tokens": 5705310.0,
"step": 215
},
{
"entropy": 1.2131111025810242,
"epoch": 0.4066839256295599,
"grad_norm": 0.1250849962234497,
"learning_rate": 0.00045958646616541356,
"loss": 1.1997,
"mean_token_accuracy": 0.6817116960883141,
"num_tokens": 5733075.0,
"step": 216
},
{
"entropy": 1.195549488067627,
"epoch": 0.4085667215815486,
"grad_norm": 0.14742979407310486,
"learning_rate": 0.00045939849624060153,
"loss": 1.1542,
"mean_token_accuracy": 0.6895313560962677,
"num_tokens": 5758265.0,
"step": 217
},
{
"entropy": 1.169806808233261,
"epoch": 0.4104495175335373,
"grad_norm": 0.13026666641235352,
"learning_rate": 0.00045921052631578944,
"loss": 1.1244,
"mean_token_accuracy": 0.6982120722532272,
"num_tokens": 5784948.0,
"step": 218
},
{
"entropy": 1.182911455631256,
"epoch": 0.412332313485526,
"grad_norm": 0.13583756983280182,
"learning_rate": 0.00045902255639097746,
"loss": 1.168,
"mean_token_accuracy": 0.6856559291481972,
"num_tokens": 5811165.0,
"step": 219
},
{
"entropy": 1.0761431455612183,
"epoch": 0.4142151094375147,
"grad_norm": 0.13843543827533722,
"learning_rate": 0.00045883458646616543,
"loss": 1.0857,
"mean_token_accuracy": 0.7090724036097527,
"num_tokens": 5839268.0,
"step": 220
},
{
"entropy": 1.1751226484775543,
"epoch": 0.4160979053895034,
"grad_norm": 0.13362666964530945,
"learning_rate": 0.00045864661654135334,
"loss": 1.1766,
"mean_token_accuracy": 0.6880608201026917,
"num_tokens": 5866181.0,
"step": 221
},
{
"entropy": 1.1817846149206161,
"epoch": 0.4179807013414921,
"grad_norm": 0.1283264309167862,
"learning_rate": 0.00045845864661654136,
"loss": 1.1698,
"mean_token_accuracy": 0.6846595779061317,
"num_tokens": 5894863.0,
"step": 222
},
{
"entropy": 1.2609765976667404,
"epoch": 0.4198634972934808,
"grad_norm": 0.1493021547794342,
"learning_rate": 0.00045827067669172933,
"loss": 1.2032,
"mean_token_accuracy": 0.6831384673714638,
"num_tokens": 5919134.0,
"step": 223
},
{
"entropy": 1.239750549197197,
"epoch": 0.42174629324546953,
"grad_norm": 0.14113545417785645,
"learning_rate": 0.0004580827067669173,
"loss": 1.186,
"mean_token_accuracy": 0.6857739984989166,
"num_tokens": 5944399.0,
"step": 224
},
{
"entropy": 1.2144103646278381,
"epoch": 0.42362908919745823,
"grad_norm": 0.13381649553775787,
"learning_rate": 0.00045789473684210527,
"loss": 1.1787,
"mean_token_accuracy": 0.6889763921499252,
"num_tokens": 5969936.0,
"step": 225
},
{
"entropy": 1.157375693321228,
"epoch": 0.42551188514944693,
"grad_norm": 0.13331881165504456,
"learning_rate": 0.00045770676691729323,
"loss": 1.1613,
"mean_token_accuracy": 0.6869198232889175,
"num_tokens": 5998086.0,
"step": 226
},
{
"entropy": 1.16208166629076,
"epoch": 0.42739468110143564,
"grad_norm": 0.1284441202878952,
"learning_rate": 0.0004575187969924812,
"loss": 1.1593,
"mean_token_accuracy": 0.6875879392027855,
"num_tokens": 6027253.0,
"step": 227
},
{
"entropy": 1.1543057709932327,
"epoch": 0.42927747705342434,
"grad_norm": 0.13240714371204376,
"learning_rate": 0.0004573308270676692,
"loss": 1.1397,
"mean_token_accuracy": 0.6932123303413391,
"num_tokens": 6053458.0,
"step": 228
},
{
"entropy": 1.2234352231025696,
"epoch": 0.43116027300541304,
"grad_norm": 0.13276036083698273,
"learning_rate": 0.00045714285714285713,
"loss": 1.1783,
"mean_token_accuracy": 0.6839658245444298,
"num_tokens": 6077746.0,
"step": 229
},
{
"entropy": 1.2401353865861893,
"epoch": 0.43304306895740174,
"grad_norm": 0.13763296604156494,
"learning_rate": 0.0004569548872180451,
"loss": 1.2126,
"mean_token_accuracy": 0.6801036223769188,
"num_tokens": 6104277.0,
"step": 230
},
{
"entropy": 1.1862784177064896,
"epoch": 0.43492586490939045,
"grad_norm": 0.14408177137374878,
"learning_rate": 0.0004567669172932331,
"loss": 1.1804,
"mean_token_accuracy": 0.6879640221595764,
"num_tokens": 6131048.0,
"step": 231
},
{
"entropy": 1.2236796170473099,
"epoch": 0.43680866086137915,
"grad_norm": 0.1351345330476761,
"learning_rate": 0.00045657894736842103,
"loss": 1.1814,
"mean_token_accuracy": 0.6808154359459877,
"num_tokens": 6157407.0,
"step": 232
},
{
"entropy": 1.2412819564342499,
"epoch": 0.43869145681336785,
"grad_norm": 0.1346222460269928,
"learning_rate": 0.000456390977443609,
"loss": 1.2092,
"mean_token_accuracy": 0.676831878721714,
"num_tokens": 6183884.0,
"step": 233
},
{
"entropy": 1.2513677477836609,
"epoch": 0.44057425276535656,
"grad_norm": 0.14077451825141907,
"learning_rate": 0.000456203007518797,
"loss": 1.2274,
"mean_token_accuracy": 0.6783920973539352,
"num_tokens": 6210214.0,
"step": 234
},
{
"entropy": 1.1642959266901016,
"epoch": 0.44245704871734526,
"grad_norm": 0.1407959908246994,
"learning_rate": 0.000456015037593985,
"loss": 1.1149,
"mean_token_accuracy": 0.6936823204159737,
"num_tokens": 6237636.0,
"step": 235
},
{
"entropy": 1.1751240193843842,
"epoch": 0.44433984466933396,
"grad_norm": 0.1335555762052536,
"learning_rate": 0.0004558270676691729,
"loss": 1.1695,
"mean_token_accuracy": 0.6895338296890259,
"num_tokens": 6263952.0,
"step": 236
},
{
"entropy": 1.1486622989177704,
"epoch": 0.44622264062132266,
"grad_norm": 0.17950989305973053,
"learning_rate": 0.0004556390977443609,
"loss": 1.155,
"mean_token_accuracy": 0.6848675832152367,
"num_tokens": 6292031.0,
"step": 237
},
{
"entropy": 1.185767188668251,
"epoch": 0.44810543657331137,
"grad_norm": 0.1306653767824173,
"learning_rate": 0.0004554511278195489,
"loss": 1.1606,
"mean_token_accuracy": 0.6900418549776077,
"num_tokens": 6321764.0,
"step": 238
},
{
"entropy": 1.2462199479341507,
"epoch": 0.44998823252530007,
"grad_norm": 0.1400284469127655,
"learning_rate": 0.00045526315789473686,
"loss": 1.2094,
"mean_token_accuracy": 0.6798161789774895,
"num_tokens": 6347788.0,
"step": 239
},
{
"entropy": 1.2244273871183395,
"epoch": 0.45187102847728877,
"grad_norm": 0.1347157508134842,
"learning_rate": 0.0004550751879699248,
"loss": 1.1674,
"mean_token_accuracy": 0.6886308640241623,
"num_tokens": 6374007.0,
"step": 240
},
{
"entropy": 1.2273097336292267,
"epoch": 0.4537538244292775,
"grad_norm": 0.1288744956254959,
"learning_rate": 0.0004548872180451128,
"loss": 1.1775,
"mean_token_accuracy": 0.6868400648236275,
"num_tokens": 6400589.0,
"step": 241
},
{
"entropy": 1.2171413898468018,
"epoch": 0.4556366203812662,
"grad_norm": 0.14212685823440552,
"learning_rate": 0.00045469924812030076,
"loss": 1.2173,
"mean_token_accuracy": 0.680756650865078,
"num_tokens": 6428529.0,
"step": 242
},
{
"entropy": 1.1739053502678871,
"epoch": 0.4575194163332549,
"grad_norm": 0.13274581730365753,
"learning_rate": 0.0004545112781954887,
"loss": 1.1491,
"mean_token_accuracy": 0.6945304796099663,
"num_tokens": 6456003.0,
"step": 243
},
{
"entropy": 1.1879045367240906,
"epoch": 0.4594022122852436,
"grad_norm": 0.14754825830459595,
"learning_rate": 0.0004543233082706767,
"loss": 1.153,
"mean_token_accuracy": 0.6907599717378616,
"num_tokens": 6481488.0,
"step": 244
},
{
"entropy": 1.1874423921108246,
"epoch": 0.4612850082372323,
"grad_norm": 0.14292332530021667,
"learning_rate": 0.00045413533834586466,
"loss": 1.1531,
"mean_token_accuracy": 0.6900304704904556,
"num_tokens": 6509304.0,
"step": 245
},
{
"entropy": 1.1584448963403702,
"epoch": 0.463167804189221,
"grad_norm": 0.13040532171726227,
"learning_rate": 0.0004539473684210527,
"loss": 1.1492,
"mean_token_accuracy": 0.6877822354435921,
"num_tokens": 6536066.0,
"step": 246
},
{
"entropy": 1.1855371445417404,
"epoch": 0.4650506001412097,
"grad_norm": 0.13368549942970276,
"learning_rate": 0.0004537593984962406,
"loss": 1.1777,
"mean_token_accuracy": 0.6852287128567696,
"num_tokens": 6565018.0,
"step": 247
},
{
"entropy": 1.1443724185228348,
"epoch": 0.4669333960931984,
"grad_norm": 0.14028339087963104,
"learning_rate": 0.00045357142857142856,
"loss": 1.1356,
"mean_token_accuracy": 0.6946588978171349,
"num_tokens": 6592536.0,
"step": 248
},
{
"entropy": 1.1854888200759888,
"epoch": 0.4688161920451871,
"grad_norm": 0.13055366277694702,
"learning_rate": 0.0004533834586466166,
"loss": 1.1731,
"mean_token_accuracy": 0.6873556599020958,
"num_tokens": 6620329.0,
"step": 249
},
{
"entropy": 1.1635265052318573,
"epoch": 0.4706989879971758,
"grad_norm": 0.12299590557813644,
"learning_rate": 0.0004531954887218045,
"loss": 1.1174,
"mean_token_accuracy": 0.6956649720668793,
"num_tokens": 6647929.0,
"step": 250
},
{
"entropy": 1.1612417101860046,
"epoch": 0.4725817839491645,
"grad_norm": 0.14049823582172394,
"learning_rate": 0.00045300751879699246,
"loss": 1.1348,
"mean_token_accuracy": 0.694083645939827,
"num_tokens": 6674419.0,
"step": 251
},
{
"entropy": 1.2213299870491028,
"epoch": 0.4744645799011532,
"grad_norm": 0.13414214551448822,
"learning_rate": 0.0004528195488721805,
"loss": 1.2013,
"mean_token_accuracy": 0.6825797632336617,
"num_tokens": 6701851.0,
"step": 252
},
{
"entropy": 1.183507114648819,
"epoch": 0.4763473758531419,
"grad_norm": 0.15232087671756744,
"learning_rate": 0.00045263157894736845,
"loss": 1.162,
"mean_token_accuracy": 0.6850753352046013,
"num_tokens": 6729161.0,
"step": 253
},
{
"entropy": 1.0959549844264984,
"epoch": 0.4782301718051306,
"grad_norm": 0.12658758461475372,
"learning_rate": 0.00045244360902255636,
"loss": 1.0808,
"mean_token_accuracy": 0.7000140845775604,
"num_tokens": 6756047.0,
"step": 254
},
{
"entropy": 1.193654179573059,
"epoch": 0.4801129677571193,
"grad_norm": 0.14304682612419128,
"learning_rate": 0.0004522556390977444,
"loss": 1.1611,
"mean_token_accuracy": 0.6860647276043892,
"num_tokens": 6782155.0,
"step": 255
},
{
"entropy": 1.189740851521492,
"epoch": 0.481995763709108,
"grad_norm": 0.1279287487268448,
"learning_rate": 0.00045206766917293235,
"loss": 1.1533,
"mean_token_accuracy": 0.6969729140400887,
"num_tokens": 6809906.0,
"step": 256
},
{
"entropy": 1.1370235309004784,
"epoch": 0.4838785596610967,
"grad_norm": 0.12549139559268951,
"learning_rate": 0.0004518796992481203,
"loss": 1.1005,
"mean_token_accuracy": 0.6986983045935631,
"num_tokens": 6837978.0,
"step": 257
},
{
"entropy": 1.1274943947792053,
"epoch": 0.4857613556130854,
"grad_norm": 0.13078007102012634,
"learning_rate": 0.0004516917293233083,
"loss": 1.116,
"mean_token_accuracy": 0.6968672722578049,
"num_tokens": 6863894.0,
"step": 258
},
{
"entropy": 1.1707115471363068,
"epoch": 0.4876441515650741,
"grad_norm": 0.13655990362167358,
"learning_rate": 0.00045150375939849625,
"loss": 1.1502,
"mean_token_accuracy": 0.6891424879431725,
"num_tokens": 6889219.0,
"step": 259
},
{
"entropy": 1.1765428930521011,
"epoch": 0.4895269475170628,
"grad_norm": 0.13517631590366364,
"learning_rate": 0.0004513157894736842,
"loss": 1.1736,
"mean_token_accuracy": 0.6828250586986542,
"num_tokens": 6915957.0,
"step": 260
},
{
"entropy": 1.1622217297554016,
"epoch": 0.49140974346905153,
"grad_norm": 0.1339031159877777,
"learning_rate": 0.0004511278195488722,
"loss": 1.1602,
"mean_token_accuracy": 0.6858406886458397,
"num_tokens": 6942729.0,
"step": 261
},
{
"entropy": 1.188800647854805,
"epoch": 0.49329253942104023,
"grad_norm": 0.1516953706741333,
"learning_rate": 0.00045093984962406015,
"loss": 1.1541,
"mean_token_accuracy": 0.6871596127748489,
"num_tokens": 6966884.0,
"step": 262
},
{
"entropy": 1.1681264340877533,
"epoch": 0.49517533537302894,
"grad_norm": 0.14556634426116943,
"learning_rate": 0.0004507518796992481,
"loss": 1.1307,
"mean_token_accuracy": 0.6948810294270515,
"num_tokens": 6992842.0,
"step": 263
},
{
"entropy": 1.1910002678632736,
"epoch": 0.49705813132501764,
"grad_norm": 0.1371603161096573,
"learning_rate": 0.00045056390977443614,
"loss": 1.1469,
"mean_token_accuracy": 0.6974197626113892,
"num_tokens": 7018704.0,
"step": 264
},
{
"entropy": 1.2533641755580902,
"epoch": 0.49894092727700634,
"grad_norm": 0.15122705698013306,
"learning_rate": 0.00045037593984962405,
"loss": 1.1964,
"mean_token_accuracy": 0.6835278943181038,
"num_tokens": 7045985.0,
"step": 265
},
{
"entropy": 1.18770419806242,
"epoch": 0.500823723228995,
"grad_norm": 0.1283893585205078,
"learning_rate": 0.000450187969924812,
"loss": 1.1613,
"mean_token_accuracy": 0.6964623779058456,
"num_tokens": 7073668.0,
"step": 266
},
{
"entropy": 1.1760464161634445,
"epoch": 0.5027065191809837,
"grad_norm": 0.13645370304584503,
"learning_rate": 0.00045000000000000004,
"loss": 1.181,
"mean_token_accuracy": 0.6851188093423843,
"num_tokens": 7100612.0,
"step": 267
},
{
"entropy": 1.1559069901704788,
"epoch": 0.5045893151329724,
"grad_norm": 0.14222431182861328,
"learning_rate": 0.000449812030075188,
"loss": 1.1661,
"mean_token_accuracy": 0.6858489215373993,
"num_tokens": 7127648.0,
"step": 268
},
{
"entropy": 1.155109003186226,
"epoch": 0.5064721110849612,
"grad_norm": 0.14752890169620514,
"learning_rate": 0.0004496240601503759,
"loss": 1.1549,
"mean_token_accuracy": 0.6923946589231491,
"num_tokens": 7153048.0,
"step": 269
},
{
"entropy": 1.2506433129310608,
"epoch": 0.5083549070369499,
"grad_norm": 0.14298772811889648,
"learning_rate": 0.00044943609022556394,
"loss": 1.193,
"mean_token_accuracy": 0.684316597878933,
"num_tokens": 7177628.0,
"step": 270
},
{
"entropy": 1.2653572857379913,
"epoch": 0.5102377029889386,
"grad_norm": 0.167319193482399,
"learning_rate": 0.0004492481203007519,
"loss": 1.1959,
"mean_token_accuracy": 0.6871765851974487,
"num_tokens": 7201577.0,
"step": 271
},
{
"entropy": 1.2064370959997177,
"epoch": 0.5121204989409273,
"grad_norm": 0.15246403217315674,
"learning_rate": 0.0004490601503759398,
"loss": 1.1574,
"mean_token_accuracy": 0.6841192170977592,
"num_tokens": 7226259.0,
"step": 272
},
{
"entropy": 1.1363181620836258,
"epoch": 0.514003294892916,
"grad_norm": 0.13937003910541534,
"learning_rate": 0.00044887218045112784,
"loss": 1.1257,
"mean_token_accuracy": 0.6941032037138939,
"num_tokens": 7253373.0,
"step": 273
},
{
"entropy": 1.1732933074235916,
"epoch": 0.5158860908449047,
"grad_norm": 0.14371132850646973,
"learning_rate": 0.0004486842105263158,
"loss": 1.1715,
"mean_token_accuracy": 0.6919308379292488,
"num_tokens": 7278945.0,
"step": 274
},
{
"entropy": 1.175576038658619,
"epoch": 0.5177688867968934,
"grad_norm": 0.1441759318113327,
"learning_rate": 0.0004484962406015038,
"loss": 1.1515,
"mean_token_accuracy": 0.694126233458519,
"num_tokens": 7305391.0,
"step": 275
},
{
"entropy": 1.2058104127645493,
"epoch": 0.5196516827488821,
"grad_norm": 0.13355745375156403,
"learning_rate": 0.00044830827067669174,
"loss": 1.1916,
"mean_token_accuracy": 0.687326617538929,
"num_tokens": 7332607.0,
"step": 276
},
{
"entropy": 1.2485528588294983,
"epoch": 0.5215344787008708,
"grad_norm": 0.14986877143383026,
"learning_rate": 0.0004481203007518797,
"loss": 1.2103,
"mean_token_accuracy": 0.6793005913496017,
"num_tokens": 7358139.0,
"step": 277
},
{
"entropy": 1.187769129872322,
"epoch": 0.5234172746528595,
"grad_norm": 0.14205658435821533,
"learning_rate": 0.0004479323308270677,
"loss": 1.1564,
"mean_token_accuracy": 0.6925127878785133,
"num_tokens": 7384537.0,
"step": 278
},
{
"entropy": 1.1303328722715378,
"epoch": 0.5253000706048482,
"grad_norm": 0.14045588672161102,
"learning_rate": 0.00044774436090225565,
"loss": 1.1287,
"mean_token_accuracy": 0.6949460133910179,
"num_tokens": 7411036.0,
"step": 279
},
{
"entropy": 1.2028415352106094,
"epoch": 0.5271828665568369,
"grad_norm": 0.1550549864768982,
"learning_rate": 0.0004475563909774436,
"loss": 1.2004,
"mean_token_accuracy": 0.6846116036176682,
"num_tokens": 7437443.0,
"step": 280
},
{
"entropy": 1.182666465640068,
"epoch": 0.5290656625088256,
"grad_norm": 0.2469193935394287,
"learning_rate": 0.0004473684210526316,
"loss": 1.1759,
"mean_token_accuracy": 0.6844401434063911,
"num_tokens": 7462227.0,
"step": 281
},
{
"entropy": 1.202811524271965,
"epoch": 0.5309484584608143,
"grad_norm": 0.14160913228988647,
"learning_rate": 0.0004471804511278196,
"loss": 1.1957,
"mean_token_accuracy": 0.6817988455295563,
"num_tokens": 7487080.0,
"step": 282
},
{
"entropy": 1.1812713742256165,
"epoch": 0.532831254412803,
"grad_norm": 0.15075385570526123,
"learning_rate": 0.0004469924812030075,
"loss": 1.1481,
"mean_token_accuracy": 0.6930856108665466,
"num_tokens": 7511921.0,
"step": 283
},
{
"entropy": 1.2214877009391785,
"epoch": 0.5347140503647917,
"grad_norm": 0.1399138867855072,
"learning_rate": 0.0004468045112781955,
"loss": 1.1678,
"mean_token_accuracy": 0.6885346695780754,
"num_tokens": 7538663.0,
"step": 284
},
{
"entropy": 1.2207457572221756,
"epoch": 0.5365968463167804,
"grad_norm": 0.16030077636241913,
"learning_rate": 0.0004466165413533835,
"loss": 1.1498,
"mean_token_accuracy": 0.6934774816036224,
"num_tokens": 7563898.0,
"step": 285
},
{
"entropy": 1.1787783950567245,
"epoch": 0.5384796422687691,
"grad_norm": 0.13601085543632507,
"learning_rate": 0.00044642857142857147,
"loss": 1.145,
"mean_token_accuracy": 0.6905470564961433,
"num_tokens": 7590702.0,
"step": 286
},
{
"entropy": 1.081341713666916,
"epoch": 0.5403624382207578,
"grad_norm": 0.13594649732112885,
"learning_rate": 0.0004462406015037594,
"loss": 1.0881,
"mean_token_accuracy": 0.7003285214304924,
"num_tokens": 7618002.0,
"step": 287
},
{
"entropy": 1.1418119072914124,
"epoch": 0.5422452341727465,
"grad_norm": 0.15701550245285034,
"learning_rate": 0.0004460526315789474,
"loss": 1.1544,
"mean_token_accuracy": 0.6906085088849068,
"num_tokens": 7644482.0,
"step": 288
},
{
"entropy": 1.1627637073397636,
"epoch": 0.5441280301247352,
"grad_norm": 0.13722968101501465,
"learning_rate": 0.00044586466165413537,
"loss": 1.1586,
"mean_token_accuracy": 0.6932996585965157,
"num_tokens": 7671479.0,
"step": 289
},
{
"entropy": 1.1320042312145233,
"epoch": 0.5460108260767239,
"grad_norm": 0.15330596268177032,
"learning_rate": 0.0004456766917293233,
"loss": 1.108,
"mean_token_accuracy": 0.6965923383831978,
"num_tokens": 7697013.0,
"step": 290
},
{
"entropy": 1.2310521453619003,
"epoch": 0.5478936220287126,
"grad_norm": 0.14045506715774536,
"learning_rate": 0.00044548872180451125,
"loss": 1.1978,
"mean_token_accuracy": 0.6855576112866402,
"num_tokens": 7722551.0,
"step": 291
},
{
"entropy": 1.1880534440279007,
"epoch": 0.5497764179807013,
"grad_norm": 0.14293448626995087,
"learning_rate": 0.00044530075187969927,
"loss": 1.1251,
"mean_token_accuracy": 0.701711505651474,
"num_tokens": 7748016.0,
"step": 292
},
{
"entropy": 1.141702115535736,
"epoch": 0.55165921393269,
"grad_norm": 0.1439259648323059,
"learning_rate": 0.00044511278195488724,
"loss": 1.1361,
"mean_token_accuracy": 0.6944170445203781,
"num_tokens": 7774858.0,
"step": 293
},
{
"entropy": 1.1963759511709213,
"epoch": 0.5535420098846787,
"grad_norm": 0.15148387849330902,
"learning_rate": 0.00044492481203007515,
"loss": 1.1768,
"mean_token_accuracy": 0.6924594268202782,
"num_tokens": 7800802.0,
"step": 294
},
{
"entropy": 1.2073182165622711,
"epoch": 0.5554248058366674,
"grad_norm": 0.14503706991672516,
"learning_rate": 0.00044473684210526317,
"loss": 1.2075,
"mean_token_accuracy": 0.6802205815911293,
"num_tokens": 7825288.0,
"step": 295
},
{
"entropy": 1.1897266507148743,
"epoch": 0.5573076017886561,
"grad_norm": 0.13914930820465088,
"learning_rate": 0.00044454887218045114,
"loss": 1.1668,
"mean_token_accuracy": 0.6842218562960625,
"num_tokens": 7853255.0,
"step": 296
},
{
"entropy": 1.138252004981041,
"epoch": 0.5591903977406448,
"grad_norm": 0.1277482956647873,
"learning_rate": 0.0004443609022556391,
"loss": 1.095,
"mean_token_accuracy": 0.6993494555354118,
"num_tokens": 7880497.0,
"step": 297
},
{
"entropy": 1.1767967641353607,
"epoch": 0.5610731936926335,
"grad_norm": 0.14053884148597717,
"learning_rate": 0.00044417293233082707,
"loss": 1.1443,
"mean_token_accuracy": 0.6948733255267143,
"num_tokens": 7906730.0,
"step": 298
},
{
"entropy": 1.2134106159210205,
"epoch": 0.5629559896446222,
"grad_norm": 0.14005884528160095,
"learning_rate": 0.00044398496240601504,
"loss": 1.1822,
"mean_token_accuracy": 0.6892389133572578,
"num_tokens": 7933216.0,
"step": 299
},
{
"entropy": 1.1945680975914001,
"epoch": 0.5648387855966109,
"grad_norm": 0.1356893926858902,
"learning_rate": 0.000443796992481203,
"loss": 1.1689,
"mean_token_accuracy": 0.6882117986679077,
"num_tokens": 7960270.0,
"step": 300
},
{
"entropy": 1.1890588849782944,
"epoch": 0.5667215815485996,
"grad_norm": 0.14139321446418762,
"learning_rate": 0.000443609022556391,
"loss": 1.1757,
"mean_token_accuracy": 0.6851599663496017,
"num_tokens": 7987900.0,
"step": 301
},
{
"entropy": 1.1338028833270073,
"epoch": 0.5686043775005883,
"grad_norm": 0.14264994859695435,
"learning_rate": 0.00044342105263157894,
"loss": 1.1502,
"mean_token_accuracy": 0.6855240687727928,
"num_tokens": 8013351.0,
"step": 302
},
{
"entropy": 1.1318519860506058,
"epoch": 0.570487173452577,
"grad_norm": 0.13565586507320404,
"learning_rate": 0.0004432330827067669,
"loss": 1.1165,
"mean_token_accuracy": 0.6999509632587433,
"num_tokens": 8038918.0,
"step": 303
},
{
"entropy": 1.2122758030891418,
"epoch": 0.5723699694045657,
"grad_norm": 0.13487568497657776,
"learning_rate": 0.00044304511278195493,
"loss": 1.1738,
"mean_token_accuracy": 0.681725949048996,
"num_tokens": 8066501.0,
"step": 304
},
{
"entropy": 1.1797229945659637,
"epoch": 0.5742527653565545,
"grad_norm": 0.13627903163433075,
"learning_rate": 0.00044285714285714284,
"loss": 1.1376,
"mean_token_accuracy": 0.689607098698616,
"num_tokens": 8093242.0,
"step": 305
},
{
"entropy": 1.1857865750789642,
"epoch": 0.5761355613085432,
"grad_norm": 0.13779953122138977,
"learning_rate": 0.0004426691729323308,
"loss": 1.1367,
"mean_token_accuracy": 0.6948609203100204,
"num_tokens": 8121053.0,
"step": 306
},
{
"entropy": 1.1960344910621643,
"epoch": 0.5780183572605319,
"grad_norm": 0.13792765140533447,
"learning_rate": 0.00044248120300751883,
"loss": 1.1472,
"mean_token_accuracy": 0.6897515431046486,
"num_tokens": 8147832.0,
"step": 307
},
{
"entropy": 1.19243024289608,
"epoch": 0.5799011532125206,
"grad_norm": 0.1438818722963333,
"learning_rate": 0.0004422932330827068,
"loss": 1.1905,
"mean_token_accuracy": 0.6858177557587624,
"num_tokens": 8173841.0,
"step": 308
},
{
"entropy": 1.211151197552681,
"epoch": 0.5817839491645093,
"grad_norm": 0.1361284852027893,
"learning_rate": 0.0004421052631578947,
"loss": 1.214,
"mean_token_accuracy": 0.67852383852005,
"num_tokens": 8202120.0,
"step": 309
},
{
"entropy": 1.1578274965286255,
"epoch": 0.583666745116498,
"grad_norm": 0.14872749149799347,
"learning_rate": 0.00044191729323308273,
"loss": 1.1497,
"mean_token_accuracy": 0.6920148581266403,
"num_tokens": 8229217.0,
"step": 310
},
{
"entropy": 1.1631289571523666,
"epoch": 0.5855495410684867,
"grad_norm": 0.15371911227703094,
"learning_rate": 0.0004417293233082707,
"loss": 1.1437,
"mean_token_accuracy": 0.6945102214813232,
"num_tokens": 8254581.0,
"step": 311
},
{
"entropy": 1.1813505440950394,
"epoch": 0.5874323370204754,
"grad_norm": 0.14172406494617462,
"learning_rate": 0.0004415413533834586,
"loss": 1.1445,
"mean_token_accuracy": 0.7006291374564171,
"num_tokens": 8280615.0,
"step": 312
},
{
"entropy": 1.1823447942733765,
"epoch": 0.5893151329724641,
"grad_norm": 0.14375410974025726,
"learning_rate": 0.00044135338345864663,
"loss": 1.1497,
"mean_token_accuracy": 0.6918843537569046,
"num_tokens": 8307395.0,
"step": 313
},
{
"entropy": 1.1527684777975082,
"epoch": 0.5911979289244528,
"grad_norm": 0.1389397829771042,
"learning_rate": 0.0004411654135338346,
"loss": 1.1189,
"mean_token_accuracy": 0.6944358944892883,
"num_tokens": 8332107.0,
"step": 314
},
{
"entropy": 1.165027841925621,
"epoch": 0.5930807248764415,
"grad_norm": 0.14531069993972778,
"learning_rate": 0.00044097744360902257,
"loss": 1.161,
"mean_token_accuracy": 0.6896175295114517,
"num_tokens": 8358194.0,
"step": 315
},
{
"entropy": 1.2045851200819016,
"epoch": 0.5949635208284302,
"grad_norm": 0.1540374457836151,
"learning_rate": 0.00044078947368421053,
"loss": 1.1797,
"mean_token_accuracy": 0.6859044209122658,
"num_tokens": 8386180.0,
"step": 316
},
{
"entropy": 1.194406397640705,
"epoch": 0.5968463167804189,
"grad_norm": 0.14392457902431488,
"learning_rate": 0.0004406015037593985,
"loss": 1.1483,
"mean_token_accuracy": 0.6856495141983032,
"num_tokens": 8412257.0,
"step": 317
},
{
"entropy": 1.1843983232975006,
"epoch": 0.5987291127324076,
"grad_norm": 0.12984612584114075,
"learning_rate": 0.00044041353383458647,
"loss": 1.159,
"mean_token_accuracy": 0.6899672672152519,
"num_tokens": 8440139.0,
"step": 318
},
{
"entropy": 1.159614846110344,
"epoch": 0.6006119086843963,
"grad_norm": 0.13649439811706543,
"learning_rate": 0.00044022556390977443,
"loss": 1.1277,
"mean_token_accuracy": 0.6980894953012466,
"num_tokens": 8466297.0,
"step": 319
},
{
"entropy": 1.1729088872671127,
"epoch": 0.602494704636385,
"grad_norm": 0.14619147777557373,
"learning_rate": 0.0004400375939849624,
"loss": 1.1511,
"mean_token_accuracy": 0.6904428154230118,
"num_tokens": 8492672.0,
"step": 320
},
{
"entropy": 1.1907424926757812,
"epoch": 0.6043775005883737,
"grad_norm": 0.14279942214488983,
"learning_rate": 0.00043984962406015037,
"loss": 1.1775,
"mean_token_accuracy": 0.6842730417847633,
"num_tokens": 8521582.0,
"step": 321
},
{
"entropy": 1.1668616235256195,
"epoch": 0.6062602965403624,
"grad_norm": 0.1608172506093979,
"learning_rate": 0.0004396616541353384,
"loss": 1.1169,
"mean_token_accuracy": 0.6961806491017342,
"num_tokens": 8549037.0,
"step": 322
},
{
"entropy": 1.172086626291275,
"epoch": 0.6081430924923511,
"grad_norm": 0.13843871653079987,
"learning_rate": 0.0004394736842105263,
"loss": 1.1337,
"mean_token_accuracy": 0.6961240246891975,
"num_tokens": 8577320.0,
"step": 323
},
{
"entropy": 1.1471307575702667,
"epoch": 0.6100258884443398,
"grad_norm": 0.17384615540504456,
"learning_rate": 0.00043928571428571427,
"loss": 1.132,
"mean_token_accuracy": 0.6966283246874809,
"num_tokens": 8604513.0,
"step": 324
},
{
"entropy": 1.1775583177804947,
"epoch": 0.6119086843963285,
"grad_norm": 0.1405702829360962,
"learning_rate": 0.0004390977443609023,
"loss": 1.1713,
"mean_token_accuracy": 0.6833978369832039,
"num_tokens": 8631088.0,
"step": 325
},
{
"entropy": 1.1986607536673546,
"epoch": 0.6137914803483172,
"grad_norm": 0.17384964227676392,
"learning_rate": 0.00043890977443609026,
"loss": 1.1903,
"mean_token_accuracy": 0.6892447099089622,
"num_tokens": 8658317.0,
"step": 326
},
{
"entropy": 1.1727805137634277,
"epoch": 0.6156742763003059,
"grad_norm": 0.14653940498828888,
"learning_rate": 0.00043872180451127817,
"loss": 1.1706,
"mean_token_accuracy": 0.6892889738082886,
"num_tokens": 8685883.0,
"step": 327
},
{
"entropy": 1.1792996972799301,
"epoch": 0.6175570722522946,
"grad_norm": 0.14093339443206787,
"learning_rate": 0.0004385338345864662,
"loss": 1.1659,
"mean_token_accuracy": 0.6881109997630119,
"num_tokens": 8710584.0,
"step": 328
},
{
"entropy": 1.1784557923674583,
"epoch": 0.6194398682042833,
"grad_norm": 0.14964358508586884,
"learning_rate": 0.00043834586466165416,
"loss": 1.1098,
"mean_token_accuracy": 0.6995358616113663,
"num_tokens": 8737455.0,
"step": 329
},
{
"entropy": 1.2075697928667068,
"epoch": 0.621322664156272,
"grad_norm": 0.14746899902820587,
"learning_rate": 0.00043815789473684207,
"loss": 1.1564,
"mean_token_accuracy": 0.6904364302754402,
"num_tokens": 8764718.0,
"step": 330
},
{
"entropy": 1.259048119187355,
"epoch": 0.6232054601082607,
"grad_norm": 0.13727432489395142,
"learning_rate": 0.0004379699248120301,
"loss": 1.2152,
"mean_token_accuracy": 0.6816830709576607,
"num_tokens": 8792699.0,
"step": 331
},
{
"entropy": 1.176329106092453,
"epoch": 0.6250882560602494,
"grad_norm": 0.13555607199668884,
"learning_rate": 0.00043778195488721806,
"loss": 1.1337,
"mean_token_accuracy": 0.6938095465302467,
"num_tokens": 8818252.0,
"step": 332
},
{
"entropy": 1.1746894717216492,
"epoch": 0.6269710520122381,
"grad_norm": 0.14540338516235352,
"learning_rate": 0.000437593984962406,
"loss": 1.1678,
"mean_token_accuracy": 0.6856407299637794,
"num_tokens": 8843904.0,
"step": 333
},
{
"entropy": 1.143667384982109,
"epoch": 0.6288538479642268,
"grad_norm": 0.17852836847305298,
"learning_rate": 0.000437406015037594,
"loss": 1.1471,
"mean_token_accuracy": 0.6907041072845459,
"num_tokens": 8868115.0,
"step": 334
},
{
"entropy": 1.1293998435139656,
"epoch": 0.6307366439162155,
"grad_norm": 0.13162344694137573,
"learning_rate": 0.00043721804511278196,
"loss": 1.123,
"mean_token_accuracy": 0.7001049220561981,
"num_tokens": 8894871.0,
"step": 335
},
{
"entropy": 1.1313979178667068,
"epoch": 0.6326194398682042,
"grad_norm": 0.1321536898612976,
"learning_rate": 0.0004370300751879699,
"loss": 1.0987,
"mean_token_accuracy": 0.7042840495705605,
"num_tokens": 8921413.0,
"step": 336
},
{
"entropy": 1.22024667263031,
"epoch": 0.6345022358201929,
"grad_norm": 0.14904777705669403,
"learning_rate": 0.00043684210526315795,
"loss": 1.1685,
"mean_token_accuracy": 0.6839649677276611,
"num_tokens": 8948016.0,
"step": 337
},
{
"entropy": 1.200153261423111,
"epoch": 0.6363850317721816,
"grad_norm": 0.15332205593585968,
"learning_rate": 0.00043665413533834586,
"loss": 1.1599,
"mean_token_accuracy": 0.6898418813943863,
"num_tokens": 8974626.0,
"step": 338
},
{
"entropy": 1.148691438138485,
"epoch": 0.6382678277241703,
"grad_norm": 0.1428363174200058,
"learning_rate": 0.00043646616541353383,
"loss": 1.1403,
"mean_token_accuracy": 0.6996031925082207,
"num_tokens": 9001421.0,
"step": 339
},
{
"entropy": 1.1665330827236176,
"epoch": 0.640150623676159,
"grad_norm": 0.1439882218837738,
"learning_rate": 0.00043627819548872185,
"loss": 1.1849,
"mean_token_accuracy": 0.6867435649037361,
"num_tokens": 9028615.0,
"step": 340
},
{
"entropy": 1.1208850890398026,
"epoch": 0.6420334196281478,
"grad_norm": 0.14697298407554626,
"learning_rate": 0.00043609022556390976,
"loss": 1.1336,
"mean_token_accuracy": 0.6952601596713066,
"num_tokens": 9056227.0,
"step": 341
},
{
"entropy": 1.1804025322198868,
"epoch": 0.6439162155801365,
"grad_norm": 0.13762733340263367,
"learning_rate": 0.00043590225563909773,
"loss": 1.1556,
"mean_token_accuracy": 0.6842042878270149,
"num_tokens": 9081334.0,
"step": 342
},
{
"entropy": 1.225020870566368,
"epoch": 0.6457990115321252,
"grad_norm": 0.15140774846076965,
"learning_rate": 0.00043571428571428575,
"loss": 1.1576,
"mean_token_accuracy": 0.6892690062522888,
"num_tokens": 9107740.0,
"step": 343
},
{
"entropy": 1.178776428103447,
"epoch": 0.6476818074841139,
"grad_norm": 0.14922155439853668,
"learning_rate": 0.0004355263157894737,
"loss": 1.119,
"mean_token_accuracy": 0.6988128572702408,
"num_tokens": 9134004.0,
"step": 344
},
{
"entropy": 1.1870884746313095,
"epoch": 0.6495646034361026,
"grad_norm": 0.13645216822624207,
"learning_rate": 0.00043533834586466163,
"loss": 1.1258,
"mean_token_accuracy": 0.7014844194054604,
"num_tokens": 9161858.0,
"step": 345
},
{
"entropy": 1.1208381354808807,
"epoch": 0.6514473993880913,
"grad_norm": 0.15188747644424438,
"learning_rate": 0.00043515037593984965,
"loss": 1.126,
"mean_token_accuracy": 0.6888753995299339,
"num_tokens": 9187924.0,
"step": 346
},
{
"entropy": 1.1246383488178253,
"epoch": 0.65333019534008,
"grad_norm": 0.18039844930171967,
"learning_rate": 0.0004349624060150376,
"loss": 1.1297,
"mean_token_accuracy": 0.6954269483685493,
"num_tokens": 9213952.0,
"step": 347
},
{
"entropy": 1.181724175810814,
"epoch": 0.6552129912920687,
"grad_norm": 0.13552230596542358,
"learning_rate": 0.0004347744360902256,
"loss": 1.185,
"mean_token_accuracy": 0.682334654033184,
"num_tokens": 9240003.0,
"step": 348
},
{
"entropy": 1.161278709769249,
"epoch": 0.6570957872440574,
"grad_norm": 0.13721586763858795,
"learning_rate": 0.00043458646616541355,
"loss": 1.1323,
"mean_token_accuracy": 0.6919213533401489,
"num_tokens": 9265180.0,
"step": 349
},
{
"entropy": 1.167539969086647,
"epoch": 0.6589785831960461,
"grad_norm": 0.145475372672081,
"learning_rate": 0.0004343984962406015,
"loss": 1.1342,
"mean_token_accuracy": 0.6932244300842285,
"num_tokens": 9291467.0,
"step": 350
},
{
"entropy": 1.2319505363702774,
"epoch": 0.6608613791480348,
"grad_norm": 0.13839372992515564,
"learning_rate": 0.0004342105263157895,
"loss": 1.2132,
"mean_token_accuracy": 0.6786127388477325,
"num_tokens": 9317382.0,
"step": 351
},
{
"entropy": 1.2023252993822098,
"epoch": 0.6627441751000235,
"grad_norm": 0.1364511102437973,
"learning_rate": 0.00043402255639097745,
"loss": 1.19,
"mean_token_accuracy": 0.6843428909778595,
"num_tokens": 9343464.0,
"step": 352
},
{
"entropy": 1.173360899090767,
"epoch": 0.6646269710520122,
"grad_norm": 0.1326543539762497,
"learning_rate": 0.0004338345864661654,
"loss": 1.1469,
"mean_token_accuracy": 0.6877379715442657,
"num_tokens": 9371170.0,
"step": 353
},
{
"entropy": 1.1177352517843246,
"epoch": 0.6665097670040009,
"grad_norm": 0.1422666758298874,
"learning_rate": 0.0004336466165413534,
"loss": 1.0994,
"mean_token_accuracy": 0.700407862663269,
"num_tokens": 9397147.0,
"step": 354
},
{
"entropy": 1.248588040471077,
"epoch": 0.6683925629559897,
"grad_norm": 0.13168664276599884,
"learning_rate": 0.0004334586466165414,
"loss": 1.2098,
"mean_token_accuracy": 0.6834209859371185,
"num_tokens": 9424363.0,
"step": 355
},
{
"entropy": 1.1617062538862228,
"epoch": 0.6702753589079784,
"grad_norm": 0.15483741462230682,
"learning_rate": 0.0004332706766917293,
"loss": 1.114,
"mean_token_accuracy": 0.7020522281527519,
"num_tokens": 9450742.0,
"step": 356
},
{
"entropy": 1.1978859603404999,
"epoch": 0.6721581548599671,
"grad_norm": 0.14632469415664673,
"learning_rate": 0.0004330827067669173,
"loss": 1.1847,
"mean_token_accuracy": 0.6837000176310539,
"num_tokens": 9475697.0,
"step": 357
},
{
"entropy": 1.1161824762821198,
"epoch": 0.6740409508119558,
"grad_norm": 0.14072488248348236,
"learning_rate": 0.0004328947368421053,
"loss": 1.1272,
"mean_token_accuracy": 0.6974566504359245,
"num_tokens": 9502237.0,
"step": 358
},
{
"entropy": 1.1397125273942947,
"epoch": 0.6759237467639445,
"grad_norm": 0.148344486951828,
"learning_rate": 0.0004327067669172932,
"loss": 1.1453,
"mean_token_accuracy": 0.6873810589313507,
"num_tokens": 9528201.0,
"step": 359
},
{
"entropy": 1.2197502925992012,
"epoch": 0.6778065427159332,
"grad_norm": 0.14831538498401642,
"learning_rate": 0.0004325187969924812,
"loss": 1.1981,
"mean_token_accuracy": 0.6797335669398308,
"num_tokens": 9553887.0,
"step": 360
},
{
"entropy": 1.2503347992897034,
"epoch": 0.6796893386679219,
"grad_norm": 0.14289598166942596,
"learning_rate": 0.0004323308270676692,
"loss": 1.1754,
"mean_token_accuracy": 0.682529591023922,
"num_tokens": 9578439.0,
"step": 361
},
{
"entropy": 1.2314954698085785,
"epoch": 0.6815721346199106,
"grad_norm": 0.14386345446109772,
"learning_rate": 0.0004321428571428572,
"loss": 1.1499,
"mean_token_accuracy": 0.6907836198806763,
"num_tokens": 9603444.0,
"step": 362
},
{
"entropy": 1.2456393241882324,
"epoch": 0.6834549305718993,
"grad_norm": 0.14364264905452728,
"learning_rate": 0.0004319548872180451,
"loss": 1.1933,
"mean_token_accuracy": 0.6874497607350349,
"num_tokens": 9629030.0,
"step": 363
},
{
"entropy": 1.1722253412008286,
"epoch": 0.685337726523888,
"grad_norm": 0.1491105556488037,
"learning_rate": 0.0004317669172932331,
"loss": 1.152,
"mean_token_accuracy": 0.6939368024468422,
"num_tokens": 9656342.0,
"step": 364
},
{
"entropy": 1.0892303064465523,
"epoch": 0.6872205224758767,
"grad_norm": 0.14881175756454468,
"learning_rate": 0.0004315789473684211,
"loss": 1.0922,
"mean_token_accuracy": 0.7064904496073723,
"num_tokens": 9680706.0,
"step": 365
},
{
"entropy": 1.090978980064392,
"epoch": 0.6891033184278654,
"grad_norm": 0.14446662366390228,
"learning_rate": 0.00043139097744360904,
"loss": 1.1148,
"mean_token_accuracy": 0.696795642375946,
"num_tokens": 9705331.0,
"step": 366
},
{
"entropy": 1.1398785412311554,
"epoch": 0.6909861143798541,
"grad_norm": 0.13684354722499847,
"learning_rate": 0.000431203007518797,
"loss": 1.1497,
"mean_token_accuracy": 0.6912109777331352,
"num_tokens": 9732400.0,
"step": 367
},
{
"entropy": 1.17644502222538,
"epoch": 0.6928689103318428,
"grad_norm": 0.14162884652614594,
"learning_rate": 0.000431015037593985,
"loss": 1.1495,
"mean_token_accuracy": 0.6945677846670151,
"num_tokens": 9758948.0,
"step": 368
},
{
"entropy": 1.1725402027368546,
"epoch": 0.6947517062838315,
"grad_norm": 0.13373105227947235,
"learning_rate": 0.00043082706766917295,
"loss": 1.1186,
"mean_token_accuracy": 0.7017792239785194,
"num_tokens": 9786609.0,
"step": 369
},
{
"entropy": 1.1570321172475815,
"epoch": 0.6966345022358202,
"grad_norm": 0.13376620411872864,
"learning_rate": 0.0004306390977443609,
"loss": 1.1169,
"mean_token_accuracy": 0.7013789564371109,
"num_tokens": 9815091.0,
"step": 370
},
{
"entropy": 1.2269478738307953,
"epoch": 0.6985172981878089,
"grad_norm": 0.15718406438827515,
"learning_rate": 0.0004304511278195489,
"loss": 1.1795,
"mean_token_accuracy": 0.6809123381972313,
"num_tokens": 9838924.0,
"step": 371
},
{
"entropy": 1.2373632341623306,
"epoch": 0.7004000941397976,
"grad_norm": 0.13601046800613403,
"learning_rate": 0.00043026315789473685,
"loss": 1.1897,
"mean_token_accuracy": 0.6842946112155914,
"num_tokens": 9865745.0,
"step": 372
},
{
"entropy": 1.2175681740045547,
"epoch": 0.7022828900917863,
"grad_norm": 0.14760908484458923,
"learning_rate": 0.00043007518796992487,
"loss": 1.2027,
"mean_token_accuracy": 0.680089496076107,
"num_tokens": 9891103.0,
"step": 373
},
{
"entropy": 1.187382310628891,
"epoch": 0.704165686043775,
"grad_norm": 0.15881404280662537,
"learning_rate": 0.0004298872180451128,
"loss": 1.183,
"mean_token_accuracy": 0.6840859726071358,
"num_tokens": 9916491.0,
"step": 374
},
{
"entropy": 1.1363441050052643,
"epoch": 0.7060484819957638,
"grad_norm": 0.14100411534309387,
"learning_rate": 0.00042969924812030075,
"loss": 1.1268,
"mean_token_accuracy": 0.6940664201974869,
"num_tokens": 9943115.0,
"step": 375
},
{
"entropy": 1.1373258829116821,
"epoch": 0.7079312779477525,
"grad_norm": 0.14058925211429596,
"learning_rate": 0.00042951127819548877,
"loss": 1.1312,
"mean_token_accuracy": 0.6918314695358276,
"num_tokens": 9971012.0,
"step": 376
},
{
"entropy": 1.1753637194633484,
"epoch": 0.7098140738997412,
"grad_norm": 0.15900634229183197,
"learning_rate": 0.00042932330827067674,
"loss": 1.1532,
"mean_token_accuracy": 0.688523419201374,
"num_tokens": 9997158.0,
"step": 377
},
{
"entropy": 1.2038870453834534,
"epoch": 0.7116968698517299,
"grad_norm": 0.15579019486904144,
"learning_rate": 0.00042913533834586465,
"loss": 1.1634,
"mean_token_accuracy": 0.6910874620079994,
"num_tokens": 10023904.0,
"step": 378
},
{
"entropy": 1.2042047381401062,
"epoch": 0.7135796658037186,
"grad_norm": 0.1458210051059723,
"learning_rate": 0.0004289473684210526,
"loss": 1.1303,
"mean_token_accuracy": 0.6955228298902512,
"num_tokens": 10050044.0,
"step": 379
},
{
"entropy": 1.199434906244278,
"epoch": 0.7154624617557073,
"grad_norm": 0.13873904943466187,
"learning_rate": 0.00042875939849624064,
"loss": 1.143,
"mean_token_accuracy": 0.6911288425326347,
"num_tokens": 10077533.0,
"step": 380
},
{
"entropy": 1.179319679737091,
"epoch": 0.717345257707696,
"grad_norm": 0.15580423176288605,
"learning_rate": 0.00042857142857142855,
"loss": 1.1516,
"mean_token_accuracy": 0.6900925859808922,
"num_tokens": 10102103.0,
"step": 381
},
{
"entropy": 1.1498710662126541,
"epoch": 0.7192280536596847,
"grad_norm": 0.1526648849248886,
"learning_rate": 0.0004283834586466165,
"loss": 1.1463,
"mean_token_accuracy": 0.6923620998859406,
"num_tokens": 10127966.0,
"step": 382
},
{
"entropy": 1.2051638066768646,
"epoch": 0.7211108496116734,
"grad_norm": 0.14739763736724854,
"learning_rate": 0.00042819548872180454,
"loss": 1.2125,
"mean_token_accuracy": 0.6824790090322495,
"num_tokens": 10153724.0,
"step": 383
},
{
"entropy": 1.148889034986496,
"epoch": 0.7229936455636621,
"grad_norm": 0.13951475918293,
"learning_rate": 0.0004280075187969925,
"loss": 1.1431,
"mean_token_accuracy": 0.6938719674944878,
"num_tokens": 10178827.0,
"step": 384
},
{
"entropy": 1.1680803298950195,
"epoch": 0.7248764415156508,
"grad_norm": 0.14505353569984436,
"learning_rate": 0.0004278195488721804,
"loss": 1.1278,
"mean_token_accuracy": 0.6925608888268471,
"num_tokens": 10204362.0,
"step": 385
},
{
"entropy": 1.1652754694223404,
"epoch": 0.7267592374676395,
"grad_norm": 0.15343666076660156,
"learning_rate": 0.00042763157894736844,
"loss": 1.1347,
"mean_token_accuracy": 0.6980648785829544,
"num_tokens": 10232975.0,
"step": 386
},
{
"entropy": 1.1660331934690475,
"epoch": 0.7286420334196282,
"grad_norm": 0.6029819250106812,
"learning_rate": 0.0004274436090225564,
"loss": 1.1252,
"mean_token_accuracy": 0.6913493424654007,
"num_tokens": 10258684.0,
"step": 387
},
{
"entropy": 1.2207347601652145,
"epoch": 0.7305248293716169,
"grad_norm": 0.1639021635055542,
"learning_rate": 0.00042725563909774437,
"loss": 1.2,
"mean_token_accuracy": 0.680275171995163,
"num_tokens": 10284896.0,
"step": 388
},
{
"entropy": 1.1547054946422577,
"epoch": 0.7324076253236056,
"grad_norm": 0.13551250100135803,
"learning_rate": 0.00042706766917293234,
"loss": 1.153,
"mean_token_accuracy": 0.6940227970480919,
"num_tokens": 10312039.0,
"step": 389
},
{
"entropy": 1.173499509692192,
"epoch": 0.7342904212755943,
"grad_norm": 0.14394164085388184,
"learning_rate": 0.0004268796992481203,
"loss": 1.1401,
"mean_token_accuracy": 0.6948181614279747,
"num_tokens": 10338001.0,
"step": 390
},
{
"entropy": 1.108071744441986,
"epoch": 0.736173217227583,
"grad_norm": 0.15528494119644165,
"learning_rate": 0.0004266917293233083,
"loss": 1.0993,
"mean_token_accuracy": 0.7045417055487633,
"num_tokens": 10364257.0,
"step": 391
},
{
"entropy": 1.1832116544246674,
"epoch": 0.7380560131795717,
"grad_norm": 0.14551259577274323,
"learning_rate": 0.00042650375939849624,
"loss": 1.1514,
"mean_token_accuracy": 0.6929153054952621,
"num_tokens": 10389671.0,
"step": 392
},
{
"entropy": 1.1930436193943024,
"epoch": 0.7399388091315604,
"grad_norm": 0.15499240159988403,
"learning_rate": 0.0004263157894736842,
"loss": 1.1429,
"mean_token_accuracy": 0.688226006925106,
"num_tokens": 10415575.0,
"step": 393
},
{
"entropy": 1.2092433124780655,
"epoch": 0.7418216050835491,
"grad_norm": 0.15129360556602478,
"learning_rate": 0.0004261278195488722,
"loss": 1.1844,
"mean_token_accuracy": 0.6808707118034363,
"num_tokens": 10442443.0,
"step": 394
},
{
"entropy": 1.293672189116478,
"epoch": 0.7437044010355378,
"grad_norm": 0.1603565663099289,
"learning_rate": 0.0004259398496240602,
"loss": 1.2682,
"mean_token_accuracy": 0.6722560822963715,
"num_tokens": 10466233.0,
"step": 395
},
{
"entropy": 1.1358380764722824,
"epoch": 0.7455871969875265,
"grad_norm": 0.1485726684331894,
"learning_rate": 0.0004257518796992481,
"loss": 1.1388,
"mean_token_accuracy": 0.6920513585209846,
"num_tokens": 10491851.0,
"step": 396
},
{
"entropy": 1.13677416741848,
"epoch": 0.7474699929395152,
"grad_norm": 0.1432713270187378,
"learning_rate": 0.0004255639097744361,
"loss": 1.1244,
"mean_token_accuracy": 0.6951583921909332,
"num_tokens": 10518737.0,
"step": 397
},
{
"entropy": 1.2034449130296707,
"epoch": 0.7493527888915039,
"grad_norm": 0.16076122224330902,
"learning_rate": 0.0004253759398496241,
"loss": 1.2062,
"mean_token_accuracy": 0.6785011366009712,
"num_tokens": 10545857.0,
"step": 398
},
{
"entropy": 1.1623305827379227,
"epoch": 0.7512355848434926,
"grad_norm": 0.15050064027309418,
"learning_rate": 0.000425187969924812,
"loss": 1.1163,
"mean_token_accuracy": 0.6948087736964226,
"num_tokens": 10571770.0,
"step": 399
},
{
"entropy": 1.1117802858352661,
"epoch": 0.7531183807954813,
"grad_norm": 0.21685755252838135,
"learning_rate": 0.000425,
"loss": 1.0837,
"mean_token_accuracy": 0.7059917375445366,
"num_tokens": 10599528.0,
"step": 400
},
{
"entropy": 1.1872282922267914,
"epoch": 0.75500117674747,
"grad_norm": 0.1475781798362732,
"learning_rate": 0.000424812030075188,
"loss": 1.1617,
"mean_token_accuracy": 0.6920499876141548,
"num_tokens": 10625575.0,
"step": 401
},
{
"entropy": 1.1875766217708588,
"epoch": 0.7568839726994587,
"grad_norm": 0.15453127026557922,
"learning_rate": 0.00042462406015037596,
"loss": 1.1608,
"mean_token_accuracy": 0.6888900995254517,
"num_tokens": 10650929.0,
"step": 402
},
{
"entropy": 1.120169810950756,
"epoch": 0.7587667686514474,
"grad_norm": 0.14685072004795074,
"learning_rate": 0.0004244360902255639,
"loss": 1.0894,
"mean_token_accuracy": 0.700760155916214,
"num_tokens": 10677930.0,
"step": 403
},
{
"entropy": 1.178112044930458,
"epoch": 0.7606495646034361,
"grad_norm": 0.15392844378948212,
"learning_rate": 0.0004242481203007519,
"loss": 1.1488,
"mean_token_accuracy": 0.6943765133619308,
"num_tokens": 10701759.0,
"step": 404
},
{
"entropy": 1.139440432190895,
"epoch": 0.7625323605554248,
"grad_norm": 0.14876064658164978,
"learning_rate": 0.00042406015037593987,
"loss": 1.1175,
"mean_token_accuracy": 0.6995274350047112,
"num_tokens": 10727920.0,
"step": 405
},
{
"entropy": 1.1383692100644112,
"epoch": 0.7644151565074135,
"grad_norm": 0.16769041121006012,
"learning_rate": 0.00042387218045112783,
"loss": 1.1056,
"mean_token_accuracy": 0.6987453699111938,
"num_tokens": 10752826.0,
"step": 406
},
{
"entropy": 1.219818040728569,
"epoch": 0.7662979524594022,
"grad_norm": 0.16228246688842773,
"learning_rate": 0.0004236842105263158,
"loss": 1.1982,
"mean_token_accuracy": 0.6772318556904793,
"num_tokens": 10777756.0,
"step": 407
},
{
"entropy": 1.1474368646740913,
"epoch": 0.768180748411391,
"grad_norm": 0.14922939240932465,
"learning_rate": 0.00042349624060150377,
"loss": 1.1385,
"mean_token_accuracy": 0.6920562386512756,
"num_tokens": 10804768.0,
"step": 408
},
{
"entropy": 1.1331078857183456,
"epoch": 0.7700635443633796,
"grad_norm": 0.1535317599773407,
"learning_rate": 0.00042330827067669173,
"loss": 1.1359,
"mean_token_accuracy": 0.6879219114780426,
"num_tokens": 10830286.0,
"step": 409
},
{
"entropy": 1.146752119064331,
"epoch": 0.7719463403153684,
"grad_norm": 0.1524975448846817,
"learning_rate": 0.0004231203007518797,
"loss": 1.1448,
"mean_token_accuracy": 0.6925338879227638,
"num_tokens": 10855720.0,
"step": 410
},
{
"entropy": 1.13744555413723,
"epoch": 0.773829136267357,
"grad_norm": 0.16938121616840363,
"learning_rate": 0.00042293233082706767,
"loss": 1.1189,
"mean_token_accuracy": 0.7019513100385666,
"num_tokens": 10881312.0,
"step": 411
},
{
"entropy": 1.1643693000078201,
"epoch": 0.7757119322193458,
"grad_norm": 0.134382426738739,
"learning_rate": 0.00042274436090225563,
"loss": 1.1205,
"mean_token_accuracy": 0.7012400701642036,
"num_tokens": 10909609.0,
"step": 412
},
{
"entropy": 1.1546955406665802,
"epoch": 0.7775947281713345,
"grad_norm": 0.15923891961574554,
"learning_rate": 0.00042255639097744366,
"loss": 1.1025,
"mean_token_accuracy": 0.7031391486525536,
"num_tokens": 10937878.0,
"step": 413
},
{
"entropy": 1.1441723331809044,
"epoch": 0.7794775241233232,
"grad_norm": 0.16663163900375366,
"learning_rate": 0.00042236842105263157,
"loss": 1.1092,
"mean_token_accuracy": 0.6957027688622475,
"num_tokens": 10963268.0,
"step": 414
},
{
"entropy": 1.168132722377777,
"epoch": 0.7813603200753119,
"grad_norm": 0.13848932087421417,
"learning_rate": 0.00042218045112781954,
"loss": 1.132,
"mean_token_accuracy": 0.6938114240765572,
"num_tokens": 10990727.0,
"step": 415
},
{
"entropy": 1.1057742238044739,
"epoch": 0.7832431160273006,
"grad_norm": 0.13826268911361694,
"learning_rate": 0.00042199248120300756,
"loss": 1.0977,
"mean_token_accuracy": 0.6982015743851662,
"num_tokens": 11017384.0,
"step": 416
},
{
"entropy": 1.1963546127080917,
"epoch": 0.7851259119792893,
"grad_norm": 0.1429852694272995,
"learning_rate": 0.0004218045112781955,
"loss": 1.1883,
"mean_token_accuracy": 0.6860344484448433,
"num_tokens": 11045688.0,
"step": 417
},
{
"entropy": 1.1521967574954033,
"epoch": 0.787008707931278,
"grad_norm": 0.16643297672271729,
"learning_rate": 0.00042161654135338344,
"loss": 1.1547,
"mean_token_accuracy": 0.6908131241798401,
"num_tokens": 11070352.0,
"step": 418
},
{
"entropy": 1.1493701189756393,
"epoch": 0.7888915038832667,
"grad_norm": 0.15780487656593323,
"learning_rate": 0.00042142857142857146,
"loss": 1.1631,
"mean_token_accuracy": 0.6898321136832237,
"num_tokens": 11097217.0,
"step": 419
},
{
"entropy": 1.2399737238883972,
"epoch": 0.7907742998352554,
"grad_norm": 0.15339267253875732,
"learning_rate": 0.0004212406015037594,
"loss": 1.206,
"mean_token_accuracy": 0.6820631548762321,
"num_tokens": 11123692.0,
"step": 420
},
{
"entropy": 1.1258632093667984,
"epoch": 0.7926570957872441,
"grad_norm": 0.1442951112985611,
"learning_rate": 0.00042105263157894734,
"loss": 1.0869,
"mean_token_accuracy": 0.7083057761192322,
"num_tokens": 11149050.0,
"step": 421
},
{
"entropy": 1.2205425053834915,
"epoch": 0.7945398917392328,
"grad_norm": 0.1388903707265854,
"learning_rate": 0.00042086466165413536,
"loss": 1.1843,
"mean_token_accuracy": 0.6856774613261223,
"num_tokens": 11175990.0,
"step": 422
},
{
"entropy": 1.1613269746303558,
"epoch": 0.7964226876912215,
"grad_norm": 0.15723979473114014,
"learning_rate": 0.0004206766917293233,
"loss": 1.1238,
"mean_token_accuracy": 0.6957441344857216,
"num_tokens": 11203684.0,
"step": 423
},
{
"entropy": 1.15619857609272,
"epoch": 0.7983054836432102,
"grad_norm": 0.16091464459896088,
"learning_rate": 0.0004204887218045113,
"loss": 1.1275,
"mean_token_accuracy": 0.6946544200181961,
"num_tokens": 11230179.0,
"step": 424
},
{
"entropy": 1.2017978131771088,
"epoch": 0.8001882795951989,
"grad_norm": 0.15011471509933472,
"learning_rate": 0.00042030075187969926,
"loss": 1.1685,
"mean_token_accuracy": 0.6920702531933784,
"num_tokens": 11256384.0,
"step": 425
},
{
"entropy": 1.2229324877262115,
"epoch": 0.8020710755471876,
"grad_norm": 0.14569929242134094,
"learning_rate": 0.0004201127819548872,
"loss": 1.2065,
"mean_token_accuracy": 0.6834921091794968,
"num_tokens": 11284359.0,
"step": 426
},
{
"entropy": 1.1204483732581139,
"epoch": 0.8039538714991763,
"grad_norm": 0.14004987478256226,
"learning_rate": 0.0004199248120300752,
"loss": 1.1147,
"mean_token_accuracy": 0.7033949047327042,
"num_tokens": 11313184.0,
"step": 427
},
{
"entropy": 1.1141091734170914,
"epoch": 0.805836667451165,
"grad_norm": 0.14807014167308807,
"learning_rate": 0.00041973684210526316,
"loss": 1.1074,
"mean_token_accuracy": 0.6922068670392036,
"num_tokens": 11340757.0,
"step": 428
},
{
"entropy": 1.2002304196357727,
"epoch": 0.8077194634031537,
"grad_norm": 0.17711348831653595,
"learning_rate": 0.00041954887218045113,
"loss": 1.1973,
"mean_token_accuracy": 0.6831801310181618,
"num_tokens": 11366871.0,
"step": 429
},
{
"entropy": 1.2234468758106232,
"epoch": 0.8096022593551424,
"grad_norm": 0.16027556359767914,
"learning_rate": 0.0004193609022556391,
"loss": 1.1958,
"mean_token_accuracy": 0.6806567907333374,
"num_tokens": 11390392.0,
"step": 430
},
{
"entropy": 1.1892322599887848,
"epoch": 0.8114850553071311,
"grad_norm": 0.14892058074474335,
"learning_rate": 0.0004191729323308271,
"loss": 1.124,
"mean_token_accuracy": 0.6932070925831795,
"num_tokens": 11415883.0,
"step": 431
},
{
"entropy": 1.1975643932819366,
"epoch": 0.8133678512591198,
"grad_norm": 0.13819143176078796,
"learning_rate": 0.00041898496240601503,
"loss": 1.1446,
"mean_token_accuracy": 0.6961016952991486,
"num_tokens": 11445261.0,
"step": 432
},
{
"entropy": 1.231493815779686,
"epoch": 0.8152506472111085,
"grad_norm": 0.14783842861652374,
"learning_rate": 0.000418796992481203,
"loss": 1.1956,
"mean_token_accuracy": 0.6879047080874443,
"num_tokens": 11471660.0,
"step": 433
},
{
"entropy": 1.1187082305550575,
"epoch": 0.8171334431630972,
"grad_norm": 0.1379650980234146,
"learning_rate": 0.000418609022556391,
"loss": 1.1226,
"mean_token_accuracy": 0.6993625611066818,
"num_tokens": 11498274.0,
"step": 434
},
{
"entropy": 1.272495910525322,
"epoch": 0.8190162391150859,
"grad_norm": 0.1640465259552002,
"learning_rate": 0.000418421052631579,
"loss": 1.2792,
"mean_token_accuracy": 0.6701348200440407,
"num_tokens": 11525102.0,
"step": 435
},
{
"entropy": 1.1658570766448975,
"epoch": 0.8208990350670746,
"grad_norm": 0.14112910628318787,
"learning_rate": 0.0004182330827067669,
"loss": 1.171,
"mean_token_accuracy": 0.6936748847365379,
"num_tokens": 11555100.0,
"step": 436
},
{
"entropy": 1.2729250341653824,
"epoch": 0.8227818310190633,
"grad_norm": 0.15435785055160522,
"learning_rate": 0.0004180451127819549,
"loss": 1.2133,
"mean_token_accuracy": 0.6812319383025169,
"num_tokens": 11580101.0,
"step": 437
},
{
"entropy": 1.13491952419281,
"epoch": 0.824664626971052,
"grad_norm": 0.1388065367937088,
"learning_rate": 0.0004178571428571429,
"loss": 1.091,
"mean_token_accuracy": 0.7023670971393585,
"num_tokens": 11607990.0,
"step": 438
},
{
"entropy": 1.1109650805592537,
"epoch": 0.8265474229230407,
"grad_norm": 0.13361488282680511,
"learning_rate": 0.0004176691729323308,
"loss": 1.0797,
"mean_token_accuracy": 0.7052409499883652,
"num_tokens": 11635249.0,
"step": 439
},
{
"entropy": 1.128780521452427,
"epoch": 0.8284302188750294,
"grad_norm": 0.14179299771785736,
"learning_rate": 0.0004174812030075188,
"loss": 1.0756,
"mean_token_accuracy": 0.6986876875162125,
"num_tokens": 11661132.0,
"step": 440
},
{
"entropy": 1.1229918599128723,
"epoch": 0.8303130148270181,
"grad_norm": 0.13364551961421967,
"learning_rate": 0.0004172932330827068,
"loss": 1.1159,
"mean_token_accuracy": 0.7024848908185959,
"num_tokens": 11688969.0,
"step": 441
},
{
"entropy": 1.1451409384608269,
"epoch": 0.8321958107790068,
"grad_norm": 0.15363940596580505,
"learning_rate": 0.00041710526315789475,
"loss": 1.1742,
"mean_token_accuracy": 0.6850685179233551,
"num_tokens": 11714108.0,
"step": 442
},
{
"entropy": 1.1217172518372536,
"epoch": 0.8340786067309955,
"grad_norm": 0.1592985838651657,
"learning_rate": 0.0004169172932330827,
"loss": 1.1189,
"mean_token_accuracy": 0.698178730905056,
"num_tokens": 11737727.0,
"step": 443
},
{
"entropy": 1.1448046416044235,
"epoch": 0.8359614026829842,
"grad_norm": 0.15717987716197968,
"learning_rate": 0.0004167293233082707,
"loss": 1.1271,
"mean_token_accuracy": 0.696114294230938,
"num_tokens": 11763503.0,
"step": 444
},
{
"entropy": 1.1910344362258911,
"epoch": 0.837844198634973,
"grad_norm": 0.1563824862241745,
"learning_rate": 0.00041654135338345865,
"loss": 1.1685,
"mean_token_accuracy": 0.6853935644030571,
"num_tokens": 11788216.0,
"step": 445
},
{
"entropy": 1.1520782858133316,
"epoch": 0.8397269945869617,
"grad_norm": 0.15299555659294128,
"learning_rate": 0.0004163533834586467,
"loss": 1.1235,
"mean_token_accuracy": 0.6957945972681046,
"num_tokens": 11813250.0,
"step": 446
},
{
"entropy": 1.157516971230507,
"epoch": 0.8416097905389504,
"grad_norm": 0.15409286320209503,
"learning_rate": 0.0004161654135338346,
"loss": 1.1292,
"mean_token_accuracy": 0.6986691579222679,
"num_tokens": 11840547.0,
"step": 447
},
{
"entropy": 1.1751955449581146,
"epoch": 0.8434925864909391,
"grad_norm": 0.1436087191104889,
"learning_rate": 0.00041597744360902255,
"loss": 1.1498,
"mean_token_accuracy": 0.692206360399723,
"num_tokens": 11868040.0,
"step": 448
},
{
"entropy": 1.1962674707174301,
"epoch": 0.8453753824429278,
"grad_norm": 0.14213787019252777,
"learning_rate": 0.0004157894736842106,
"loss": 1.1349,
"mean_token_accuracy": 0.6944708526134491,
"num_tokens": 11894177.0,
"step": 449
},
{
"entropy": 1.201774999499321,
"epoch": 0.8472581783949165,
"grad_norm": 0.15118546783924103,
"learning_rate": 0.0004156015037593985,
"loss": 1.1868,
"mean_token_accuracy": 0.6906943470239639,
"num_tokens": 11920755.0,
"step": 450
},
{
"entropy": 1.1439872980117798,
"epoch": 0.8491409743469052,
"grad_norm": 0.1536472737789154,
"learning_rate": 0.00041541353383458646,
"loss": 1.1091,
"mean_token_accuracy": 0.6987525522708893,
"num_tokens": 11946199.0,
"step": 451
},
{
"entropy": 1.1865400224924088,
"epoch": 0.8510237702988939,
"grad_norm": 0.16255781054496765,
"learning_rate": 0.0004152255639097745,
"loss": 1.1606,
"mean_token_accuracy": 0.6941612362861633,
"num_tokens": 11970559.0,
"step": 452
},
{
"entropy": 1.1555950492620468,
"epoch": 0.8529065662508826,
"grad_norm": 0.15296806395053864,
"learning_rate": 0.00041503759398496244,
"loss": 1.1647,
"mean_token_accuracy": 0.6893363445997238,
"num_tokens": 11998113.0,
"step": 453
},
{
"entropy": 1.1035746112465858,
"epoch": 0.8547893622028713,
"grad_norm": 0.13151533901691437,
"learning_rate": 0.00041484962406015036,
"loss": 1.0917,
"mean_token_accuracy": 0.7064924463629723,
"num_tokens": 12025595.0,
"step": 454
},
{
"entropy": 1.148128904402256,
"epoch": 0.85667215815486,
"grad_norm": 0.15572930872440338,
"learning_rate": 0.0004146616541353384,
"loss": 1.1516,
"mean_token_accuracy": 0.6970530971884727,
"num_tokens": 12051025.0,
"step": 455
},
{
"entropy": 1.1640497595071793,
"epoch": 0.8585549541068487,
"grad_norm": 0.14575503766536713,
"learning_rate": 0.00041447368421052634,
"loss": 1.124,
"mean_token_accuracy": 0.6972140222787857,
"num_tokens": 12080372.0,
"step": 456
},
{
"entropy": 1.1797401309013367,
"epoch": 0.8604377500588374,
"grad_norm": 0.1724129319190979,
"learning_rate": 0.0004142857142857143,
"loss": 1.1266,
"mean_token_accuracy": 0.6963677033782005,
"num_tokens": 12107881.0,
"step": 457
},
{
"entropy": 1.1369287073612213,
"epoch": 0.8623205460108261,
"grad_norm": 0.1409987360239029,
"learning_rate": 0.0004140977443609022,
"loss": 1.1021,
"mean_token_accuracy": 0.6983814239501953,
"num_tokens": 12136975.0,
"step": 458
},
{
"entropy": 1.203329399228096,
"epoch": 0.8642033419628148,
"grad_norm": 0.171426460146904,
"learning_rate": 0.00041390977443609025,
"loss": 1.1796,
"mean_token_accuracy": 0.6895611882209778,
"num_tokens": 12164452.0,
"step": 459
},
{
"entropy": 1.1388862580060959,
"epoch": 0.8660861379148035,
"grad_norm": 0.1465880423784256,
"learning_rate": 0.0004137218045112782,
"loss": 1.1449,
"mean_token_accuracy": 0.6952017247676849,
"num_tokens": 12190700.0,
"step": 460
},
{
"entropy": 1.165066435933113,
"epoch": 0.8679689338667922,
"grad_norm": 0.1510019600391388,
"learning_rate": 0.0004135338345864661,
"loss": 1.1519,
"mean_token_accuracy": 0.6902508214116096,
"num_tokens": 12216248.0,
"step": 461
},
{
"entropy": 1.1071253940463066,
"epoch": 0.8698517298187809,
"grad_norm": 0.1569354087114334,
"learning_rate": 0.00041334586466165415,
"loss": 1.1008,
"mean_token_accuracy": 0.7029130309820175,
"num_tokens": 12242702.0,
"step": 462
},
{
"entropy": 1.162157580256462,
"epoch": 0.8717345257707696,
"grad_norm": 0.15269963443279266,
"learning_rate": 0.0004131578947368421,
"loss": 1.1408,
"mean_token_accuracy": 0.6991895586252213,
"num_tokens": 12267065.0,
"step": 463
},
{
"entropy": 1.164448007941246,
"epoch": 0.8736173217227583,
"grad_norm": 0.15020480751991272,
"learning_rate": 0.0004129699248120301,
"loss": 1.1331,
"mean_token_accuracy": 0.6945090070366859,
"num_tokens": 12294273.0,
"step": 464
},
{
"entropy": 1.194659799337387,
"epoch": 0.875500117674747,
"grad_norm": 0.16067473590373993,
"learning_rate": 0.00041278195488721805,
"loss": 1.1384,
"mean_token_accuracy": 0.692974790930748,
"num_tokens": 12319075.0,
"step": 465
},
{
"entropy": 1.1572427451610565,
"epoch": 0.8773829136267357,
"grad_norm": 0.14344556629657745,
"learning_rate": 0.000412593984962406,
"loss": 1.1239,
"mean_token_accuracy": 0.6996137872338295,
"num_tokens": 12345047.0,
"step": 466
},
{
"entropy": 1.1310506239533424,
"epoch": 0.8792657095787244,
"grad_norm": 0.1469915211200714,
"learning_rate": 0.000412406015037594,
"loss": 1.1117,
"mean_token_accuracy": 0.6948174610733986,
"num_tokens": 12371084.0,
"step": 467
},
{
"entropy": 1.1873999759554863,
"epoch": 0.8811485055307131,
"grad_norm": 0.14283262193202972,
"learning_rate": 0.00041221804511278195,
"loss": 1.1725,
"mean_token_accuracy": 0.6882406696677208,
"num_tokens": 12397086.0,
"step": 468
},
{
"entropy": 1.1660784780979156,
"epoch": 0.8830313014827018,
"grad_norm": 0.1400137096643448,
"learning_rate": 0.0004120300751879699,
"loss": 1.1305,
"mean_token_accuracy": 0.6928488984704018,
"num_tokens": 12424840.0,
"step": 469
},
{
"entropy": 1.1689551174640656,
"epoch": 0.8849140974346905,
"grad_norm": 0.17401744425296783,
"learning_rate": 0.0004118421052631579,
"loss": 1.1356,
"mean_token_accuracy": 0.6973849907517433,
"num_tokens": 12453038.0,
"step": 470
},
{
"entropy": 1.16590516269207,
"epoch": 0.8867968933866792,
"grad_norm": 0.15749803185462952,
"learning_rate": 0.0004116541353383459,
"loss": 1.1388,
"mean_token_accuracy": 0.690193310379982,
"num_tokens": 12479755.0,
"step": 471
},
{
"entropy": 1.1534086763858795,
"epoch": 0.8886796893386679,
"grad_norm": 0.13575902581214905,
"learning_rate": 0.0004114661654135338,
"loss": 1.1333,
"mean_token_accuracy": 0.6930194050073624,
"num_tokens": 12507911.0,
"step": 472
},
{
"entropy": 1.166767194867134,
"epoch": 0.8905624852906566,
"grad_norm": 0.14083941280841827,
"learning_rate": 0.0004112781954887218,
"loss": 1.1433,
"mean_token_accuracy": 0.6883162334561348,
"num_tokens": 12534740.0,
"step": 473
},
{
"entropy": 1.116583712399006,
"epoch": 0.8924452812426453,
"grad_norm": 0.18177185952663422,
"learning_rate": 0.0004110902255639098,
"loss": 1.1013,
"mean_token_accuracy": 0.6984972059726715,
"num_tokens": 12560495.0,
"step": 474
},
{
"entropy": 1.0962852016091347,
"epoch": 0.894328077194634,
"grad_norm": 0.15513888001441956,
"learning_rate": 0.00041090225563909777,
"loss": 1.0659,
"mean_token_accuracy": 0.7114295363426208,
"num_tokens": 12586806.0,
"step": 475
},
{
"entropy": 1.1862118691205978,
"epoch": 0.8962108731466227,
"grad_norm": 0.1506270319223404,
"learning_rate": 0.0004107142857142857,
"loss": 1.1887,
"mean_token_accuracy": 0.6871896237134933,
"num_tokens": 12612493.0,
"step": 476
},
{
"entropy": 1.1081865057349205,
"epoch": 0.8980936690986114,
"grad_norm": 0.14710566401481628,
"learning_rate": 0.0004105263157894737,
"loss": 1.1012,
"mean_token_accuracy": 0.6983359083533287,
"num_tokens": 12639626.0,
"step": 477
},
{
"entropy": 1.128834992647171,
"epoch": 0.8999764650506001,
"grad_norm": 0.14161938428878784,
"learning_rate": 0.00041033834586466167,
"loss": 1.0982,
"mean_token_accuracy": 0.7014680877327919,
"num_tokens": 12664733.0,
"step": 478
},
{
"entropy": 1.1446367651224136,
"epoch": 0.9018592610025888,
"grad_norm": 0.14254848659038544,
"learning_rate": 0.0004101503759398496,
"loss": 1.082,
"mean_token_accuracy": 0.7081187888979912,
"num_tokens": 12690384.0,
"step": 479
},
{
"entropy": 1.2071665897965431,
"epoch": 0.9037420569545775,
"grad_norm": 0.1451028734445572,
"learning_rate": 0.0004099624060150376,
"loss": 1.1573,
"mean_token_accuracy": 0.6878824383020401,
"num_tokens": 12717190.0,
"step": 480
},
{
"entropy": 1.1643542423844337,
"epoch": 0.9056248529065662,
"grad_norm": 0.16808035969734192,
"learning_rate": 0.0004097744360902256,
"loss": 1.1289,
"mean_token_accuracy": 0.6955900862812996,
"num_tokens": 12744287.0,
"step": 481
},
{
"entropy": 1.1430502980947495,
"epoch": 0.907507648858555,
"grad_norm": 0.14388366043567657,
"learning_rate": 0.00040958646616541354,
"loss": 1.1377,
"mean_token_accuracy": 0.6985258162021637,
"num_tokens": 12769478.0,
"step": 482
},
{
"entropy": 1.171137735247612,
"epoch": 0.9093904448105437,
"grad_norm": 0.14661596715450287,
"learning_rate": 0.0004093984962406015,
"loss": 1.1764,
"mean_token_accuracy": 0.6929311379790306,
"num_tokens": 12795715.0,
"step": 483
},
{
"entropy": 1.159026637673378,
"epoch": 0.9112732407625324,
"grad_norm": 0.14750456809997559,
"learning_rate": 0.0004092105263157895,
"loss": 1.1578,
"mean_token_accuracy": 0.6937888264656067,
"num_tokens": 12821869.0,
"step": 484
},
{
"entropy": 1.1478636413812637,
"epoch": 0.9131560367145211,
"grad_norm": 0.14371232688426971,
"learning_rate": 0.00040902255639097744,
"loss": 1.1218,
"mean_token_accuracy": 0.7008863463997841,
"num_tokens": 12848215.0,
"step": 485
},
{
"entropy": 1.120044082403183,
"epoch": 0.9150388326665098,
"grad_norm": 0.1404104232788086,
"learning_rate": 0.00040883458646616546,
"loss": 1.0728,
"mean_token_accuracy": 0.7091679647564888,
"num_tokens": 12876182.0,
"step": 486
},
{
"entropy": 1.1101247519254684,
"epoch": 0.9169216286184985,
"grad_norm": 0.1421038955450058,
"learning_rate": 0.0004086466165413534,
"loss": 1.0967,
"mean_token_accuracy": 0.7037186399102211,
"num_tokens": 12902501.0,
"step": 487
},
{
"entropy": 1.1512123197317123,
"epoch": 0.9188044245704872,
"grad_norm": 0.14930035173892975,
"learning_rate": 0.00040845864661654134,
"loss": 1.1259,
"mean_token_accuracy": 0.6954185292124748,
"num_tokens": 12928275.0,
"step": 488
},
{
"entropy": 1.136143758893013,
"epoch": 0.9206872205224759,
"grad_norm": 0.1431557983160019,
"learning_rate": 0.00040827067669172936,
"loss": 1.1053,
"mean_token_accuracy": 0.7004474848508835,
"num_tokens": 12954596.0,
"step": 489
},
{
"entropy": 1.1639841794967651,
"epoch": 0.9225700164744646,
"grad_norm": 0.1477883905172348,
"learning_rate": 0.0004080827067669173,
"loss": 1.129,
"mean_token_accuracy": 0.6972065195441246,
"num_tokens": 12980318.0,
"step": 490
},
{
"entropy": 1.162917599081993,
"epoch": 0.9244528124264533,
"grad_norm": 0.14567728340625763,
"learning_rate": 0.00040789473684210524,
"loss": 1.1503,
"mean_token_accuracy": 0.6907480135560036,
"num_tokens": 13006238.0,
"step": 491
},
{
"entropy": 1.1558719277381897,
"epoch": 0.926335608378442,
"grad_norm": 0.1421021670103073,
"learning_rate": 0.00040770676691729326,
"loss": 1.1429,
"mean_token_accuracy": 0.6948621720075607,
"num_tokens": 13034071.0,
"step": 492
},
{
"entropy": 1.175887256860733,
"epoch": 0.9282184043304307,
"grad_norm": 0.14368657767772675,
"learning_rate": 0.00040751879699248123,
"loss": 1.1752,
"mean_token_accuracy": 0.6898396164178848,
"num_tokens": 13059425.0,
"step": 493
},
{
"entropy": 1.1281049996614456,
"epoch": 0.9301012002824194,
"grad_norm": 0.13681703805923462,
"learning_rate": 0.00040733082706766914,
"loss": 1.1437,
"mean_token_accuracy": 0.6920712366700172,
"num_tokens": 13087803.0,
"step": 494
},
{
"entropy": 1.1919779032468796,
"epoch": 0.9319839962344081,
"grad_norm": 0.14613422751426697,
"learning_rate": 0.00040714285714285717,
"loss": 1.1647,
"mean_token_accuracy": 0.6862485483288765,
"num_tokens": 13114083.0,
"step": 495
},
{
"entropy": 1.1703974455595016,
"epoch": 0.9338667921863968,
"grad_norm": 0.13816098868846893,
"learning_rate": 0.00040695488721804513,
"loss": 1.1191,
"mean_token_accuracy": 0.6944621205329895,
"num_tokens": 13140806.0,
"step": 496
},
{
"entropy": 1.1625728458166122,
"epoch": 0.9357495881383855,
"grad_norm": 0.1374853253364563,
"learning_rate": 0.0004067669172932331,
"loss": 1.1311,
"mean_token_accuracy": 0.693043515086174,
"num_tokens": 13167072.0,
"step": 497
},
{
"entropy": 1.1611916273832321,
"epoch": 0.9376323840903742,
"grad_norm": 0.14068859815597534,
"learning_rate": 0.00040657894736842107,
"loss": 1.0958,
"mean_token_accuracy": 0.7017333880066872,
"num_tokens": 13193952.0,
"step": 498
},
{
"entropy": 1.2519186586141586,
"epoch": 0.9395151800423629,
"grad_norm": 0.14739161729812622,
"learning_rate": 0.00040639097744360903,
"loss": 1.2033,
"mean_token_accuracy": 0.6803731620311737,
"num_tokens": 13219334.0,
"step": 499
},
{
"entropy": 1.060287207365036,
"epoch": 0.9413979759943516,
"grad_norm": 0.13330809772014618,
"learning_rate": 0.000406203007518797,
"loss": 1.0607,
"mean_token_accuracy": 0.7074964344501495,
"num_tokens": 13247762.0,
"step": 500
},
{
"entropy": 1.1315688639879227,
"epoch": 0.9432807719463403,
"grad_norm": 0.14858287572860718,
"learning_rate": 0.00040601503759398497,
"loss": 1.1534,
"mean_token_accuracy": 0.6925570517778397,
"num_tokens": 13274542.0,
"step": 501
},
{
"entropy": 1.1256567761301994,
"epoch": 0.945163567898329,
"grad_norm": 0.13854491710662842,
"learning_rate": 0.00040582706766917293,
"loss": 1.1164,
"mean_token_accuracy": 0.697671189904213,
"num_tokens": 13301954.0,
"step": 502
},
{
"entropy": 1.1095138639211655,
"epoch": 0.9470463638503177,
"grad_norm": 0.14951969683170319,
"learning_rate": 0.0004056390977443609,
"loss": 1.0913,
"mean_token_accuracy": 0.7060349136590958,
"num_tokens": 13325368.0,
"step": 503
},
{
"entropy": 1.2117299437522888,
"epoch": 0.9489291598023064,
"grad_norm": 0.14555485546588898,
"learning_rate": 0.0004054511278195489,
"loss": 1.1771,
"mean_token_accuracy": 0.6878413483500481,
"num_tokens": 13350621.0,
"step": 504
},
{
"entropy": 1.1814142614603043,
"epoch": 0.9508119557542951,
"grad_norm": 0.13946305215358734,
"learning_rate": 0.00040526315789473684,
"loss": 1.1187,
"mean_token_accuracy": 0.6975477784872055,
"num_tokens": 13378436.0,
"step": 505
},
{
"entropy": 1.1588895320892334,
"epoch": 0.9526947517062838,
"grad_norm": 0.14052411913871765,
"learning_rate": 0.0004050751879699248,
"loss": 1.1139,
"mean_token_accuracy": 0.6970377415418625,
"num_tokens": 13405779.0,
"step": 506
},
{
"entropy": 1.1744963377714157,
"epoch": 0.9545775476582725,
"grad_norm": 0.14011354744434357,
"learning_rate": 0.0004048872180451128,
"loss": 1.1443,
"mean_token_accuracy": 0.6915831044316292,
"num_tokens": 13431768.0,
"step": 507
},
{
"entropy": 1.1022943705320358,
"epoch": 0.9564603436102612,
"grad_norm": 0.16085639595985413,
"learning_rate": 0.00040469924812030074,
"loss": 1.0872,
"mean_token_accuracy": 0.7034497335553169,
"num_tokens": 13458430.0,
"step": 508
},
{
"entropy": 1.1168298870325089,
"epoch": 0.9583431395622499,
"grad_norm": 0.14645646512508392,
"learning_rate": 0.0004045112781954887,
"loss": 1.1366,
"mean_token_accuracy": 0.6974723115563393,
"num_tokens": 13483989.0,
"step": 509
},
{
"entropy": 1.1111514419317245,
"epoch": 0.9602259355142386,
"grad_norm": 0.15530261397361755,
"learning_rate": 0.0004043233082706767,
"loss": 1.1068,
"mean_token_accuracy": 0.7063265517354012,
"num_tokens": 13510734.0,
"step": 510
},
{
"entropy": 1.1187052130699158,
"epoch": 0.9621087314662273,
"grad_norm": 0.1410273313522339,
"learning_rate": 0.0004041353383458647,
"loss": 1.1007,
"mean_token_accuracy": 0.6978159174323082,
"num_tokens": 13536200.0,
"step": 511
},
{
"entropy": 1.2634307444095612,
"epoch": 0.963991527418216,
"grad_norm": 0.14832766354084015,
"learning_rate": 0.0004039473684210526,
"loss": 1.2454,
"mean_token_accuracy": 0.674240916967392,
"num_tokens": 13562180.0,
"step": 512
},
{
"entropy": 1.209633857011795,
"epoch": 0.9658743233702047,
"grad_norm": 0.14852747321128845,
"learning_rate": 0.0004037593984962406,
"loss": 1.151,
"mean_token_accuracy": 0.6942615807056427,
"num_tokens": 13587252.0,
"step": 513
},
{
"entropy": 1.1802778542041779,
"epoch": 0.9677571193221934,
"grad_norm": 0.14167462289333344,
"learning_rate": 0.0004035714285714286,
"loss": 1.1268,
"mean_token_accuracy": 0.6984767615795135,
"num_tokens": 13614161.0,
"step": 514
},
{
"entropy": 1.1260388046503067,
"epoch": 0.9696399152741821,
"grad_norm": 0.1389787793159485,
"learning_rate": 0.00040338345864661656,
"loss": 1.1044,
"mean_token_accuracy": 0.698441170156002,
"num_tokens": 13640906.0,
"step": 515
},
{
"entropy": 1.1336752623319626,
"epoch": 0.9715227112261708,
"grad_norm": 0.13808688521385193,
"learning_rate": 0.0004031954887218045,
"loss": 1.1185,
"mean_token_accuracy": 0.7005246728658676,
"num_tokens": 13666938.0,
"step": 516
},
{
"entropy": 1.1089581847190857,
"epoch": 0.9734055071781595,
"grad_norm": 0.1490076631307602,
"learning_rate": 0.0004030075187969925,
"loss": 1.1037,
"mean_token_accuracy": 0.699261337518692,
"num_tokens": 13692343.0,
"step": 517
},
{
"entropy": 1.1778569370508194,
"epoch": 0.9752883031301482,
"grad_norm": 0.1503973752260208,
"learning_rate": 0.00040281954887218046,
"loss": 1.1704,
"mean_token_accuracy": 0.6850240305066109,
"num_tokens": 13717884.0,
"step": 518
},
{
"entropy": 1.1599782705307007,
"epoch": 0.977171099082137,
"grad_norm": 0.14560772478580475,
"learning_rate": 0.00040263157894736843,
"loss": 1.1481,
"mean_token_accuracy": 0.6967450231313705,
"num_tokens": 13744454.0,
"step": 519
},
{
"entropy": 1.2482303828001022,
"epoch": 0.9790538950341257,
"grad_norm": 0.1557229459285736,
"learning_rate": 0.0004024436090225564,
"loss": 1.2016,
"mean_token_accuracy": 0.679645448923111,
"num_tokens": 13771382.0,
"step": 520
},
{
"entropy": 1.154101237654686,
"epoch": 0.9809366909861144,
"grad_norm": 0.1511804610490799,
"learning_rate": 0.00040225563909774436,
"loss": 1.1211,
"mean_token_accuracy": 0.692274309694767,
"num_tokens": 13797315.0,
"step": 521
},
{
"entropy": 1.1659268885850906,
"epoch": 0.9828194869381031,
"grad_norm": 0.14492999017238617,
"learning_rate": 0.0004020676691729324,
"loss": 1.1276,
"mean_token_accuracy": 0.6957960724830627,
"num_tokens": 13823504.0,
"step": 522
},
{
"entropy": 1.2255947291851044,
"epoch": 0.9847022828900918,
"grad_norm": 0.16592226922512054,
"learning_rate": 0.0004018796992481203,
"loss": 1.2034,
"mean_token_accuracy": 0.6800813153386116,
"num_tokens": 13849682.0,
"step": 523
},
{
"entropy": 1.181060180068016,
"epoch": 0.9865850788420805,
"grad_norm": 0.14438042044639587,
"learning_rate": 0.00040169172932330826,
"loss": 1.1422,
"mean_token_accuracy": 0.6908884271979332,
"num_tokens": 13877151.0,
"step": 524
},
{
"entropy": 1.195601612329483,
"epoch": 0.9884678747940692,
"grad_norm": 0.1490834802389145,
"learning_rate": 0.0004015037593984963,
"loss": 1.1609,
"mean_token_accuracy": 0.687875397503376,
"num_tokens": 13902812.0,
"step": 525
},
{
"entropy": 1.1874232441186905,
"epoch": 0.9903506707460579,
"grad_norm": 0.15240395069122314,
"learning_rate": 0.00040131578947368425,
"loss": 1.171,
"mean_token_accuracy": 0.6891705989837646,
"num_tokens": 13926800.0,
"step": 526
},
{
"entropy": 1.0726541802287102,
"epoch": 0.9922334666980466,
"grad_norm": 0.1472628116607666,
"learning_rate": 0.00040112781954887216,
"loss": 1.0744,
"mean_token_accuracy": 0.7090674415230751,
"num_tokens": 13952161.0,
"step": 527
},
{
"entropy": 1.1295729503035545,
"epoch": 0.9941162626500353,
"grad_norm": 0.1415957808494568,
"learning_rate": 0.0004009398496240602,
"loss": 1.1086,
"mean_token_accuracy": 0.7027467861771584,
"num_tokens": 13978937.0,
"step": 528
},
{
"entropy": 1.1615847125649452,
"epoch": 0.995999058602024,
"grad_norm": 0.14748550951480865,
"learning_rate": 0.00040075187969924815,
"loss": 1.1474,
"mean_token_accuracy": 0.6950105875730515,
"num_tokens": 14005138.0,
"step": 529
},
{
"entropy": 1.176683247089386,
"epoch": 0.9978818545540127,
"grad_norm": 0.1543041467666626,
"learning_rate": 0.00040056390977443606,
"loss": 1.1807,
"mean_token_accuracy": 0.684785395860672,
"num_tokens": 14028706.0,
"step": 530
},
{
"entropy": 1.1374549865722656,
"epoch": 0.9997646505060014,
"grad_norm": 0.13411332666873932,
"learning_rate": 0.0004003759398496241,
"loss": 1.0976,
"mean_token_accuracy": 0.7099665105342865,
"num_tokens": 14056095.0,
"step": 531
},
{
"entropy": 1.4449238777160645,
"epoch": 1.0,
"grad_norm": 0.5150332450866699,
"learning_rate": 0.00040018796992481205,
"loss": 1.4328,
"mean_token_accuracy": 0.6301905512809753,
"num_tokens": 14058143.0,
"step": 532
},
{
"epoch": 1.0,
"eval_entropy": 1.273110066141401,
"eval_loss": 1.215613603591919,
"eval_mean_token_accuracy": 0.6747710279056004,
"eval_num_tokens": 14058143.0,
"eval_runtime": 8.5294,
"eval_samples_per_second": 5.745,
"eval_steps_per_second": 0.821,
"step": 532
}
],
"logging_steps": 1.0,
"max_steps": 2660,
"num_input_tokens_seen": 0,
"num_train_epochs": 5,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 9.860994210304512e+17,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}