{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 1.0,
  "eval_steps": 500,
  "global_step": 3670,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.0002725306171115161,
      "grad_norm": 49.46063232421875,
      "learning_rate": 0.0,
      "loss": 4.2357,
      "mean_token_accuracy": 0.2580853081308305,
      "num_tokens": 180497.0,
      "step": 1
    },
    {
      "epoch": 0.0027253061711151614,
      "grad_norm": 46.16090774536133,
      "learning_rate": 2.0361990950226245e-06,
      "loss": 4.1759,
      "mean_token_accuracy": 0.2622440684483283,
      "num_tokens": 1772191.0,
      "step": 10
    },
    {
      "epoch": 0.005450612342230323,
      "grad_norm": 27.987140655517578,
      "learning_rate": 4.298642533936651e-06,
      "loss": 3.8947,
      "mean_token_accuracy": 0.29399702919181436,
      "num_tokens": 3528128.0,
      "step": 20
    },
    {
      "epoch": 0.008175918513345483,
      "grad_norm": 12.687642097473145,
      "learning_rate": 6.5610859728506795e-06,
      "loss": 3.1855,
      "mean_token_accuracy": 0.37025331929326055,
      "num_tokens": 5321820.0,
      "step": 30
    },
    {
      "epoch": 0.010901224684460645,
      "grad_norm": 4.484994411468506,
      "learning_rate": 8.823529411764707e-06,
      "loss": 2.5,
      "mean_token_accuracy": 0.46682517854496836,
      "num_tokens": 7036353.0,
      "step": 40
    },
    {
      "epoch": 0.013626530855575806,
      "grad_norm": 2.473914384841919,
      "learning_rate": 1.1085972850678733e-05,
      "loss": 2.0545,
      "mean_token_accuracy": 0.5429269138723611,
      "num_tokens": 8794502.0,
      "step": 50
    },
    {
      "epoch": 0.016351837026690966,
      "grad_norm": 1.9377564191818237,
      "learning_rate": 1.3348416289592761e-05,
      "loss": 1.7825,
      "mean_token_accuracy": 0.594660968054086,
      "num_tokens": 10519144.0,
      "step": 60
    },
    {
      "epoch": 0.019077143197806127,
      "grad_norm": 1.8231310844421387,
      "learning_rate": 1.5610859728506788e-05,
      "loss": 1.6083,
      "mean_token_accuracy": 0.6288187805563211,
      "num_tokens": 12272638.0,
      "step": 70
    },
    {
      "epoch": 0.02180244936892129,
      "grad_norm": 1.4802289009094238,
      "learning_rate": 1.7873303167420814e-05,
      "loss": 1.5204,
      "mean_token_accuracy": 0.6462318933568895,
      "num_tokens": 14037595.0,
      "step": 80
    },
    {
      "epoch": 0.02452775554003645,
      "grad_norm": 1.4305676221847534,
      "learning_rate": 2.0135746606334844e-05,
      "loss": 1.4496,
      "mean_token_accuracy": 0.6578288937918841,
      "num_tokens": 15813413.0,
      "step": 90
    },
    {
      "epoch": 0.027253061711151612,
      "grad_norm": 1.4573628902435303,
      "learning_rate": 2.239819004524887e-05,
      "loss": 1.4074,
      "mean_token_accuracy": 0.6659743607975542,
      "num_tokens": 17606667.0,
      "step": 100
    },
    {
      "epoch": 0.029978367882266772,
      "grad_norm": 1.3907411098480225,
      "learning_rate": 2.4660633484162897e-05,
      "loss": 1.3354,
      "mean_token_accuracy": 0.6814010716974735,
      "num_tokens": 19387650.0,
      "step": 110
    },
    {
      "epoch": 0.03270367405338193,
      "grad_norm": 1.2991719245910645,
      "learning_rate": 2.6923076923076923e-05,
      "loss": 1.315,
      "mean_token_accuracy": 0.6835047041997313,
      "num_tokens": 21111629.0,
      "step": 120
    },
    {
      "epoch": 0.03542898022449709,
      "grad_norm": 1.2487128973007202,
      "learning_rate": 2.9185520361990953e-05,
      "loss": 1.2761,
      "mean_token_accuracy": 0.6918578458949923,
      "num_tokens": 22855847.0,
      "step": 130
    },
    {
      "epoch": 0.038154286395612254,
      "grad_norm": 1.3445643186569214,
      "learning_rate": 3.1447963800904976e-05,
      "loss": 1.2535,
      "mean_token_accuracy": 0.6977143987081945,
      "num_tokens": 24647294.0,
      "step": 140
    },
    {
      "epoch": 0.04087959256672742,
      "grad_norm": 1.273087978363037,
      "learning_rate": 3.371040723981901e-05,
      "loss": 1.2287,
      "mean_token_accuracy": 0.7020150443539024,
      "num_tokens": 26336128.0,
      "step": 150
    },
    {
      "epoch": 0.04360489873784258,
      "grad_norm": 1.3392159938812256,
      "learning_rate": 3.5972850678733036e-05,
      "loss": 1.254,
      "mean_token_accuracy": 0.6971366205252707,
      "num_tokens": 28061528.0,
      "step": 160
    },
    {
      "epoch": 0.04633020490895774,
      "grad_norm": 1.2237101793289185,
      "learning_rate": 3.8235294117647055e-05,
      "loss": 1.2335,
      "mean_token_accuracy": 0.7016927156597376,
      "num_tokens": 29834651.0,
      "step": 170
    },
    {
      "epoch": 0.0490555110800729,
      "grad_norm": 1.1867234706878662,
      "learning_rate": 4.049773755656109e-05,
      "loss": 1.2441,
      "mean_token_accuracy": 0.6992780463770032,
      "num_tokens": 31533806.0,
      "step": 180
    },
    {
      "epoch": 0.05178081725118806,
      "grad_norm": 1.2524540424346924,
      "learning_rate": 4.2760180995475115e-05,
      "loss": 1.1962,
      "mean_token_accuracy": 0.7076324006542564,
      "num_tokens": 33328644.0,
      "step": 190
    },
    {
      "epoch": 0.054506123422303224,
      "grad_norm": 1.2938035726547241,
      "learning_rate": 4.502262443438914e-05,
      "loss": 1.1923,
      "mean_token_accuracy": 0.7092770641669631,
      "num_tokens": 35085202.0,
      "step": 200
    },
    {
      "epoch": 0.057231429593418384,
      "grad_norm": 1.2193968296051025,
      "learning_rate": 4.728506787330317e-05,
      "loss": 1.2002,
      "mean_token_accuracy": 0.7085541909560561,
      "num_tokens": 36899685.0,
      "step": 210
    },
    {
      "epoch": 0.059956735764533545,
      "grad_norm": 1.1932718753814697,
      "learning_rate": 4.95475113122172e-05,
      "loss": 1.1231,
      "mean_token_accuracy": 0.7225744256749749,
      "num_tokens": 38663867.0,
      "step": 220
    },
    {
      "epoch": 0.0626820419356487,
      "grad_norm": 1.2447905540466309,
      "learning_rate": 4.994381233319287e-05,
      "loss": 1.1412,
      "mean_token_accuracy": 0.7189388344995677,
      "num_tokens": 40368688.0,
      "step": 230
    },
    {
      "epoch": 0.06540734810676387,
      "grad_norm": 1.2377219200134277,
      "learning_rate": 4.9873577749683945e-05,
      "loss": 1.1419,
      "mean_token_accuracy": 0.7174957160837948,
      "num_tokens": 42091556.0,
      "step": 240
    },
    {
      "epoch": 0.06813265427787903,
      "grad_norm": 1.103188395500183,
      "learning_rate": 4.9803343166175026e-05,
      "loss": 1.1354,
      "mean_token_accuracy": 0.71980509320274,
      "num_tokens": 43832865.0,
      "step": 250
    },
    {
      "epoch": 0.07085796044899419,
      "grad_norm": 1.1445391178131104,
      "learning_rate": 4.9733108582666106e-05,
      "loss": 1.1425,
      "mean_token_accuracy": 0.7196099638007581,
      "num_tokens": 45556421.0,
      "step": 260
    },
    {
      "epoch": 0.07358326662010935,
      "grad_norm": 1.1179693937301636,
      "learning_rate": 4.9662873999157186e-05,
      "loss": 1.1598,
      "mean_token_accuracy": 0.7173428479582071,
      "num_tokens": 47284838.0,
      "step": 270
    },
    {
      "epoch": 0.07630857279122451,
      "grad_norm": 1.149162769317627,
      "learning_rate": 4.9592639415648266e-05,
      "loss": 1.1281,
      "mean_token_accuracy": 0.7214061733335256,
      "num_tokens": 49025592.0,
      "step": 280
    },
    {
      "epoch": 0.07903387896233968,
      "grad_norm": 1.2658289670944214,
      "learning_rate": 4.9522404832139346e-05,
      "loss": 1.1848,
      "mean_token_accuracy": 0.7107397212646902,
      "num_tokens": 50771353.0,
      "step": 290
    },
    {
      "epoch": 0.08175918513345484,
      "grad_norm": 1.1745802164077759,
      "learning_rate": 4.945217024863043e-05,
      "loss": 1.1083,
      "mean_token_accuracy": 0.7261597216129303,
      "num_tokens": 52521467.0,
      "step": 300
    },
    {
      "epoch": 0.08448449130457,
      "grad_norm": 1.2588995695114136,
      "learning_rate": 4.938193566512151e-05,
      "loss": 1.1305,
      "mean_token_accuracy": 0.7218387089669704,
      "num_tokens": 54338858.0,
      "step": 310
    },
    {
      "epoch": 0.08720979747568516,
      "grad_norm": 1.2129034996032715,
      "learning_rate": 4.931170108161259e-05,
      "loss": 1.129,
      "mean_token_accuracy": 0.7227001185528934,
      "num_tokens": 56014081.0,
      "step": 320
    },
    {
      "epoch": 0.08993510364680032,
      "grad_norm": 1.1418203115463257,
      "learning_rate": 4.924146649810367e-05,
      "loss": 1.1089,
      "mean_token_accuracy": 0.7253761961124837,
      "num_tokens": 57778337.0,
      "step": 330
    },
    {
      "epoch": 0.09266040981791548,
      "grad_norm": 1.0563759803771973,
      "learning_rate": 4.9171231914594754e-05,
      "loss": 1.0984,
      "mean_token_accuracy": 0.7282240198925137,
      "num_tokens": 59528977.0,
      "step": 340
    },
    {
      "epoch": 0.09538571598903064,
      "grad_norm": 1.0599093437194824,
      "learning_rate": 4.910099733108583e-05,
      "loss": 1.1142,
      "mean_token_accuracy": 0.7256616481579841,
      "num_tokens": 61295985.0,
      "step": 350
    },
    {
      "epoch": 0.0981110221601458,
      "grad_norm": 1.2205257415771484,
      "learning_rate": 4.903076274757691e-05,
      "loss": 1.099,
      "mean_token_accuracy": 0.7278626722283661,
      "num_tokens": 63026897.0,
      "step": 360
    },
    {
      "epoch": 0.10083632833126097,
      "grad_norm": 1.069904088973999,
      "learning_rate": 4.896052816406799e-05,
      "loss": 1.0891,
      "mean_token_accuracy": 0.7307559937238693,
      "num_tokens": 64770711.0,
      "step": 370
    },
    {
      "epoch": 0.10356163450237613,
      "grad_norm": 1.1615813970565796,
      "learning_rate": 4.889029358055907e-05,
      "loss": 1.0833,
      "mean_token_accuracy": 0.7321938696317375,
      "num_tokens": 66551008.0,
      "step": 380
    },
    {
      "epoch": 0.10628694067349129,
      "grad_norm": 1.0941394567489624,
      "learning_rate": 4.882005899705015e-05,
      "loss": 1.1027,
      "mean_token_accuracy": 0.727668415941298,
      "num_tokens": 68320690.0,
      "step": 390
    },
    {
      "epoch": 0.10901224684460645,
      "grad_norm": 1.0933481454849243,
      "learning_rate": 4.874982441354123e-05,
      "loss": 1.0649,
      "mean_token_accuracy": 0.7347688566893339,
      "num_tokens": 70095284.0,
      "step": 400
    },
    {
      "epoch": 0.11173755301572161,
      "grad_norm": 1.0768437385559082,
      "learning_rate": 4.8679589830032316e-05,
      "loss": 1.0794,
      "mean_token_accuracy": 0.7327698688954115,
      "num_tokens": 71803511.0,
      "step": 410
    },
    {
      "epoch": 0.11446285918683677,
      "grad_norm": 1.091327428817749,
      "learning_rate": 4.860935524652339e-05,
      "loss": 1.0419,
      "mean_token_accuracy": 0.7397373986430467,
      "num_tokens": 73528297.0,
      "step": 420
    },
    {
      "epoch": 0.11718816535795193,
      "grad_norm": 1.06846284866333,
      "learning_rate": 4.853912066301447e-05,
      "loss": 1.0433,
      "mean_token_accuracy": 0.7402802865020931,
      "num_tokens": 75274289.0,
      "step": 430
    },
    {
      "epoch": 0.11991347152906709,
      "grad_norm": 1.123704195022583,
      "learning_rate": 4.846888607950555e-05,
      "loss": 1.0839,
      "mean_token_accuracy": 0.7311916822567582,
      "num_tokens": 77077403.0,
      "step": 440
    },
    {
      "epoch": 0.12263877770018225,
      "grad_norm": 1.0876643657684326,
      "learning_rate": 4.8398651495996636e-05,
      "loss": 1.0821,
      "mean_token_accuracy": 0.7309617185033858,
      "num_tokens": 78859882.0,
      "step": 450
    },
    {
      "epoch": 0.1253640838712974,
      "grad_norm": 1.0852820873260498,
      "learning_rate": 4.832841691248771e-05,
      "loss": 1.0927,
      "mean_token_accuracy": 0.7292127916589379,
      "num_tokens": 80590976.0,
      "step": 460
    },
    {
      "epoch": 0.12808939004241257,
      "grad_norm": 1.0788687467575073,
      "learning_rate": 4.825818232897879e-05,
      "loss": 1.0541,
      "mean_token_accuracy": 0.7368990315124393,
      "num_tokens": 82275126.0,
      "step": 470
    },
    {
      "epoch": 0.13081469621352773,
      "grad_norm": 1.0248864889144897,
      "learning_rate": 4.818794774546987e-05,
      "loss": 1.0448,
      "mean_token_accuracy": 0.7376345920376479,
      "num_tokens": 84096910.0,
      "step": 480
    },
    {
      "epoch": 0.1335400023846429,
      "grad_norm": 1.060374140739441,
      "learning_rate": 4.811771316196095e-05,
      "loss": 1.0771,
      "mean_token_accuracy": 0.7329897940158844,
      "num_tokens": 85876326.0,
      "step": 490
    },
    {
      "epoch": 0.13626530855575805,
      "grad_norm": 1.0276813507080078,
      "learning_rate": 4.804747857845203e-05,
      "loss": 1.0774,
      "mean_token_accuracy": 0.733756088744849,
      "num_tokens": 87607478.0,
      "step": 500
    },
    {
      "epoch": 0.1389906147268732,
      "grad_norm": 0.996231734752655,
      "learning_rate": 4.797724399494311e-05,
      "loss": 1.0459,
      "mean_token_accuracy": 0.7390362743288279,
      "num_tokens": 89350066.0,
      "step": 510
    },
    {
      "epoch": 0.14171592089798837,
      "grad_norm": 1.084644079208374,
      "learning_rate": 4.79070094114342e-05,
      "loss": 1.0518,
      "mean_token_accuracy": 0.7383495075628161,
      "num_tokens": 91067910.0,
      "step": 520
    },
    {
      "epoch": 0.14444122706910353,
      "grad_norm": 1.0383051633834839,
      "learning_rate": 4.783677482792527e-05,
      "loss": 1.0475,
      "mean_token_accuracy": 0.7384993623942137,
      "num_tokens": 92797017.0,
      "step": 530
    },
    {
      "epoch": 0.1471665332402187,
      "grad_norm": 1.021580457687378,
      "learning_rate": 4.776654024441635e-05,
      "loss": 1.0633,
      "mean_token_accuracy": 0.7345522255636752,
      "num_tokens": 94606329.0,
      "step": 540
    },
    {
      "epoch": 0.14989183941133385,
      "grad_norm": 1.0029984712600708,
      "learning_rate": 4.769630566090743e-05,
      "loss": 1.0425,
      "mean_token_accuracy": 0.73898695576936,
      "num_tokens": 96331087.0,
      "step": 550
    },
    {
      "epoch": 0.15261714558244902,
      "grad_norm": 0.9963593482971191,
      "learning_rate": 4.762607107739852e-05,
      "loss": 1.064,
      "mean_token_accuracy": 0.7353490410372615,
      "num_tokens": 98138711.0,
      "step": 560
    },
    {
      "epoch": 0.15534245175356418,
      "grad_norm": 1.0283918380737305,
      "learning_rate": 4.755583649388959e-05,
      "loss": 1.0666,
      "mean_token_accuracy": 0.7352430403232575,
      "num_tokens": 99885005.0,
      "step": 570
    },
    {
      "epoch": 0.15806775792467936,
      "grad_norm": 1.0260673761367798,
      "learning_rate": 4.748560191038067e-05,
      "loss": 0.9751,
      "mean_token_accuracy": 0.7531527349725365,
      "num_tokens": 101636075.0,
      "step": 580
    },
    {
      "epoch": 0.16079306409579452,
      "grad_norm": 1.0847331285476685,
      "learning_rate": 4.741536732687175e-05,
      "loss": 1.0334,
      "mean_token_accuracy": 0.7419406285509467,
      "num_tokens": 103349118.0,
      "step": 590
    },
    {
      "epoch": 0.16351837026690969,
      "grad_norm": 1.1022111177444458,
      "learning_rate": 4.734513274336283e-05,
      "loss": 1.0475,
      "mean_token_accuracy": 0.7382194061763585,
      "num_tokens": 105033010.0,
      "step": 600
    },
    {
      "epoch": 0.16624367643802485,
      "grad_norm": 1.0745152235031128,
      "learning_rate": 4.727489815985391e-05,
      "loss": 1.0114,
      "mean_token_accuracy": 0.7453443594276905,
      "num_tokens": 106723283.0,
      "step": 610
    },
    {
      "epoch": 0.16896898260914,
      "grad_norm": 0.9670913815498352,
      "learning_rate": 4.720466357634499e-05,
      "loss": 1.0299,
      "mean_token_accuracy": 0.7417304971255362,
      "num_tokens": 108436878.0,
      "step": 620
    },
    {
      "epoch": 0.17169428878025517,
      "grad_norm": 1.0606757402420044,
      "learning_rate": 4.713442899283608e-05,
      "loss": 1.0134,
      "mean_token_accuracy": 0.7449460197240114,
      "num_tokens": 110203157.0,
      "step": 630
    },
    {
      "epoch": 0.17441959495137033,
      "grad_norm": 1.1226489543914795,
      "learning_rate": 4.706419440932715e-05,
      "loss": 1.0392,
      "mean_token_accuracy": 0.7409128420054912,
      "num_tokens": 111949130.0,
      "step": 640
    },
    {
      "epoch": 0.1771449011224855,
      "grad_norm": 1.0842260122299194,
      "learning_rate": 4.6993959825818233e-05,
      "loss": 1.0447,
      "mean_token_accuracy": 0.7380765706300736,
      "num_tokens": 113652926.0,
      "step": 650
    },
    {
      "epoch": 0.17987020729360065,
      "grad_norm": 1.010992169380188,
      "learning_rate": 4.6923725242309314e-05,
      "loss": 1.0289,
      "mean_token_accuracy": 0.7402720710262656,
      "num_tokens": 115334647.0,
      "step": 660
    },
    {
      "epoch": 0.1825955134647158,
      "grad_norm": 0.9638611674308777,
      "learning_rate": 4.68534906588004e-05,
      "loss": 0.9863,
      "mean_token_accuracy": 0.7511739198118448,
      "num_tokens": 117032381.0,
      "step": 670
    },
    {
      "epoch": 0.18532081963583097,
      "grad_norm": 0.9948492050170898,
      "learning_rate": 4.6783256075291474e-05,
      "loss": 1.0236,
      "mean_token_accuracy": 0.7424138585105539,
      "num_tokens": 118801553.0,
      "step": 680
    },
    {
      "epoch": 0.18804612580694613,
      "grad_norm": 1.0282810926437378,
      "learning_rate": 4.6713021491782554e-05,
      "loss": 1.0355,
      "mean_token_accuracy": 0.7400126025080681,
      "num_tokens": 120530271.0,
      "step": 690
    },
    {
      "epoch": 0.1907714319780613,
      "grad_norm": 0.9453698992729187,
      "learning_rate": 4.6642786908273634e-05,
      "loss": 1.0278,
      "mean_token_accuracy": 0.7430232111364603,
      "num_tokens": 122314411.0,
      "step": 700
    },
    {
      "epoch": 0.19349673814917645,
      "grad_norm": 0.9775828123092651,
      "learning_rate": 4.6572552324764715e-05,
      "loss": 1.0197,
      "mean_token_accuracy": 0.743807871080935,
      "num_tokens": 124054113.0,
      "step": 710
    },
    {
      "epoch": 0.1962220443202916,
      "grad_norm": 1.0277308225631714,
      "learning_rate": 4.6502317741255795e-05,
      "loss": 1.0356,
      "mean_token_accuracy": 0.740591025352478,
      "num_tokens": 125786705.0,
      "step": 720
    },
    {
      "epoch": 0.19894735049140677,
      "grad_norm": 1.0205105543136597,
      "learning_rate": 4.6432083157746875e-05,
      "loss": 1.0347,
      "mean_token_accuracy": 0.7396136365830899,
      "num_tokens": 127510112.0,
      "step": 730
    },
    {
      "epoch": 0.20167265666252193,
      "grad_norm": 1.0093523263931274,
      "learning_rate": 4.636184857423796e-05,
      "loss": 1.0006,
      "mean_token_accuracy": 0.7481041301041842,
      "num_tokens": 129321733.0,
      "step": 740
    },
    {
      "epoch": 0.2043979628336371,
      "grad_norm": 0.9474175572395325,
      "learning_rate": 4.6291613990729035e-05,
      "loss": 1.035,
      "mean_token_accuracy": 0.7410462098196149,
      "num_tokens": 131068939.0,
      "step": 750
    },
    {
      "epoch": 0.20712326900475225,
      "grad_norm": 1.1145373582839966,
      "learning_rate": 4.6221379407220116e-05,
      "loss": 1.0676,
      "mean_token_accuracy": 0.7343734119087457,
      "num_tokens": 132800192.0,
      "step": 760
    },
    {
      "epoch": 0.2098485751758674,
      "grad_norm": 0.999275803565979,
      "learning_rate": 4.6151144823711196e-05,
      "loss": 1.005,
      "mean_token_accuracy": 0.7478898199275136,
      "num_tokens": 134501880.0,
      "step": 770
    },
    {
      "epoch": 0.21257388134698257,
      "grad_norm": 1.0402276515960693,
      "learning_rate": 4.608091024020228e-05,
      "loss": 0.9878,
      "mean_token_accuracy": 0.7501668559387327,
      "num_tokens": 136227230.0,
      "step": 780
    },
    {
      "epoch": 0.21529918751809773,
      "grad_norm": 1.0302717685699463,
      "learning_rate": 4.6010675656693356e-05,
      "loss": 1.0,
      "mean_token_accuracy": 0.7479498274624348,
      "num_tokens": 137938433.0,
      "step": 790
    },
    {
      "epoch": 0.2180244936892129,
      "grad_norm": 0.9707064032554626,
      "learning_rate": 4.5940441073184436e-05,
      "loss": 1.0062,
      "mean_token_accuracy": 0.7476970013231039,
      "num_tokens": 139667163.0,
      "step": 800
    },
    {
      "epoch": 0.22074979986032806,
      "grad_norm": 0.9473689794540405,
      "learning_rate": 4.5870206489675517e-05,
      "loss": 0.9879,
      "mean_token_accuracy": 0.7510631861165166,
      "num_tokens": 141481099.0,
      "step": 810
    },
    {
      "epoch": 0.22347510603144322,
      "grad_norm": 0.9907692670822144,
      "learning_rate": 4.57999719061666e-05,
      "loss": 1.0453,
      "mean_token_accuracy": 0.7398228641599417,
      "num_tokens": 143204243.0,
      "step": 820
    },
    {
      "epoch": 0.22620041220255838,
      "grad_norm": 0.9675036668777466,
      "learning_rate": 4.572973732265768e-05,
      "loss": 1.0049,
      "mean_token_accuracy": 0.748301652725786,
      "num_tokens": 144995581.0,
      "step": 830
    },
    {
      "epoch": 0.22892571837367354,
      "grad_norm": 0.9796574115753174,
      "learning_rate": 4.565950273914876e-05,
      "loss": 1.0159,
      "mean_token_accuracy": 0.7437716860324144,
      "num_tokens": 146711076.0,
      "step": 840
    },
    {
      "epoch": 0.2316510245447887,
      "grad_norm": 0.9572359919548035,
      "learning_rate": 4.5589268155639844e-05,
      "loss": 1.0059,
      "mean_token_accuracy": 0.7478106670081616,
      "num_tokens": 148463902.0,
      "step": 850
    },
    {
      "epoch": 0.23437633071590386,
      "grad_norm": 1.010580062866211,
      "learning_rate": 4.551903357213092e-05,
      "loss": 1.0323,
      "mean_token_accuracy": 0.7418697223067283,
      "num_tokens": 150212801.0,
      "step": 860
    },
    {
      "epoch": 0.23710163688701902,
      "grad_norm": 0.9667695164680481,
      "learning_rate": 4.5448798988622e-05,
      "loss": 0.9818,
      "mean_token_accuracy": 0.7502022869884968,
      "num_tokens": 151950016.0,
      "step": 870
    },
    {
      "epoch": 0.23982694305813418,
      "grad_norm": 1.0137828588485718,
      "learning_rate": 4.537856440511308e-05,
      "loss": 0.9907,
      "mean_token_accuracy": 0.7495149873197079,
      "num_tokens": 153686341.0,
      "step": 880
    },
    {
      "epoch": 0.24255224922924934,
      "grad_norm": 0.9276532530784607,
      "learning_rate": 4.5308329821604165e-05,
      "loss": 1.0163,
      "mean_token_accuracy": 0.7445492129772902,
      "num_tokens": 155442220.0,
      "step": 890
    },
    {
      "epoch": 0.2452775554003645,
      "grad_norm": 1.0047796964645386,
      "learning_rate": 4.523809523809524e-05,
      "loss": 0.9883,
      "mean_token_accuracy": 0.7502540521323681,
      "num_tokens": 157222278.0,
      "step": 900
    },
    {
      "epoch": 0.24800286157147966,
      "grad_norm": 0.9885547757148743,
      "learning_rate": 4.516786065458632e-05,
      "loss": 0.9833,
      "mean_token_accuracy": 0.7508710121735931,
      "num_tokens": 158989873.0,
      "step": 910
    },
    {
      "epoch": 0.2507281677425948,
      "grad_norm": 1.0419394969940186,
      "learning_rate": 4.50976260710774e-05,
      "loss": 1.0086,
      "mean_token_accuracy": 0.7485707288607955,
      "num_tokens": 160740690.0,
      "step": 920
    },
    {
      "epoch": 0.25345347391371,
      "grad_norm": 0.9929084777832031,
      "learning_rate": 4.502739148756848e-05,
      "loss": 1.0127,
      "mean_token_accuracy": 0.7446823202073574,
      "num_tokens": 162505210.0,
      "step": 930
    },
    {
      "epoch": 0.25617878008482514,
      "grad_norm": 1.0232727527618408,
      "learning_rate": 4.495715690405956e-05,
      "loss": 1.0191,
      "mean_token_accuracy": 0.744407182559371,
      "num_tokens": 164184119.0,
      "step": 940
    },
    {
      "epoch": 0.2589040862559403,
      "grad_norm": 0.9282605648040771,
      "learning_rate": 4.488692232055064e-05,
      "loss": 0.9778,
      "mean_token_accuracy": 0.7518815349787473,
      "num_tokens": 165971146.0,
      "step": 950
    },
    {
      "epoch": 0.26162939242705546,
      "grad_norm": 1.0070769786834717,
      "learning_rate": 4.4816687737041726e-05,
      "loss": 1.0109,
      "mean_token_accuracy": 0.7478598964400589,
      "num_tokens": 167701701.0,
      "step": 960
    },
    {
      "epoch": 0.2643546985981706,
      "grad_norm": 1.0107619762420654,
      "learning_rate": 4.47464531535328e-05,
      "loss": 1.0031,
      "mean_token_accuracy": 0.7483802428469062,
      "num_tokens": 169443326.0,
      "step": 970
    },
    {
      "epoch": 0.2670800047692858,
      "grad_norm": 0.9970401525497437,
      "learning_rate": 4.467621857002388e-05,
      "loss": 1.0022,
      "mean_token_accuracy": 0.7473449762910604,
      "num_tokens": 171199385.0,
      "step": 980
    },
    {
      "epoch": 0.26980531094040094,
      "grad_norm": 1.0069605112075806,
      "learning_rate": 4.460598398651496e-05,
      "loss": 0.9716,
      "mean_token_accuracy": 0.7539871089160443,
      "num_tokens": 172929493.0,
      "step": 990
    },
    {
      "epoch": 0.2725306171115161,
      "grad_norm": 1.0105503797531128,
      "learning_rate": 4.453574940300605e-05,
      "loss": 0.9771,
      "mean_token_accuracy": 0.7521027243696153,
      "num_tokens": 174672601.0,
      "step": 1000
    },
    {
      "epoch": 0.27525592328263127,
      "grad_norm": 0.9245153069496155,
      "learning_rate": 4.446551481949712e-05,
      "loss": 0.9871,
      "mean_token_accuracy": 0.7512713268399238,
      "num_tokens": 176441962.0,
      "step": 1010
    },
    {
      "epoch": 0.2779812294537464,
      "grad_norm": 1.0189464092254639,
      "learning_rate": 4.43952802359882e-05,
      "loss": 0.9791,
      "mean_token_accuracy": 0.7525585936382413,
      "num_tokens": 178197361.0,
      "step": 1020
    },
    {
      "epoch": 0.2807065356248616,
      "grad_norm": 1.071568250656128,
      "learning_rate": 4.432504565247929e-05,
      "loss": 1.009,
      "mean_token_accuracy": 0.744661932811141,
      "num_tokens": 179887580.0,
      "step": 1030
    },
    {
      "epoch": 0.28343184179597675,
      "grad_norm": 0.9799075126647949,
      "learning_rate": 4.425481106897036e-05,
      "loss": 0.9712,
      "mean_token_accuracy": 0.7534190637990832,
      "num_tokens": 181655865.0,
      "step": 1040
    },
    {
      "epoch": 0.2861571479670919,
      "grad_norm": 1.054100513458252,
      "learning_rate": 4.418457648546144e-05,
      "loss": 0.9937,
      "mean_token_accuracy": 0.7505493542179466,
      "num_tokens": 183445880.0,
      "step": 1050
    },
    {
      "epoch": 0.28888245413820707,
      "grad_norm": 0.9577687978744507,
      "learning_rate": 4.411434190195252e-05,
      "loss": 0.9624,
      "mean_token_accuracy": 0.7555918388999998,
      "num_tokens": 185175706.0,
      "step": 1060
    },
    {
      "epoch": 0.29160776030932223,
      "grad_norm": 1.0260660648345947,
      "learning_rate": 4.404410731844361e-05,
      "loss": 0.9755,
      "mean_token_accuracy": 0.7519860923290252,
      "num_tokens": 186873396.0,
      "step": 1070
    },
    {
      "epoch": 0.2943330664804374,
      "grad_norm": 0.9616529941558838,
      "learning_rate": 4.397387273493468e-05,
      "loss": 1.0078,
      "mean_token_accuracy": 0.7465968690812588,
      "num_tokens": 188591288.0,
      "step": 1080
    },
    {
      "epoch": 0.29705837265155255,
      "grad_norm": 0.9976760149002075,
      "learning_rate": 4.390363815142576e-05,
      "loss": 1.0004,
      "mean_token_accuracy": 0.7482189310714602,
      "num_tokens": 190375182.0,
      "step": 1090
    },
    {
      "epoch": 0.2997836788226677,
      "grad_norm": 1.0091749429702759,
      "learning_rate": 4.383340356791684e-05,
      "loss": 0.9886,
      "mean_token_accuracy": 0.7496372631751,
      "num_tokens": 192104609.0,
      "step": 1100
    },
    {
      "epoch": 0.30250898499378287,
      "grad_norm": 0.9447279572486877,
      "learning_rate": 4.376316898440793e-05,
      "loss": 0.9275,
      "mean_token_accuracy": 0.7625567795708775,
      "num_tokens": 193819768.0,
      "step": 1110
    },
    {
      "epoch": 0.30523429116489803,
      "grad_norm": 0.944039523601532,
      "learning_rate": 4.3692934400899e-05,
      "loss": 0.9894,
      "mean_token_accuracy": 0.7514046527445316,
      "num_tokens": 195541705.0,
      "step": 1120
    },
    {
      "epoch": 0.3079595973360132,
      "grad_norm": 0.9289810657501221,
      "learning_rate": 4.362269981739008e-05,
      "loss": 0.9757,
      "mean_token_accuracy": 0.7527969362214207,
      "num_tokens": 197250976.0,
      "step": 1130
    },
    {
      "epoch": 0.31068490350712835,
      "grad_norm": 0.9183096885681152,
      "learning_rate": 4.355246523388117e-05,
      "loss": 0.9469,
      "mean_token_accuracy": 0.7588935429230332,
      "num_tokens": 198964087.0,
      "step": 1140
    },
    {
      "epoch": 0.31341020967824357,
      "grad_norm": 1.0104970932006836,
      "learning_rate": 4.348223065037224e-05,
      "loss": 0.9919,
      "mean_token_accuracy": 0.7494912428781391,
      "num_tokens": 200654341.0,
      "step": 1150
    },
    {
      "epoch": 0.31613551584935873,
      "grad_norm": 0.9720707535743713,
      "learning_rate": 4.3411996066863323e-05,
      "loss": 1.0166,
      "mean_token_accuracy": 0.7451702112331986,
      "num_tokens": 202410389.0,
      "step": 1160
    },
    {
      "epoch": 0.3188608220204739,
      "grad_norm": 0.9572804570198059,
      "learning_rate": 4.3341761483354404e-05,
      "loss": 0.9747,
      "mean_token_accuracy": 0.753504987526685,
      "num_tokens": 204176073.0,
      "step": 1170
    },
    {
      "epoch": 0.32158612819158905,
      "grad_norm": 0.984469473361969,
      "learning_rate": 4.327152689984549e-05,
      "loss": 0.9684,
      "mean_token_accuracy": 0.7544271955266595,
      "num_tokens": 206017214.0,
      "step": 1180
    },
    {
      "epoch": 0.3243114343627042,
      "grad_norm": 1.0402361154556274,
      "learning_rate": 4.3201292316336564e-05,
      "loss": 0.9792,
      "mean_token_accuracy": 0.7519845139235258,
      "num_tokens": 207832984.0,
      "step": 1190
    },
    {
      "epoch": 0.32703674053381937,
      "grad_norm": 1.0185561180114746,
      "learning_rate": 4.3131057732827644e-05,
      "loss": 0.986,
      "mean_token_accuracy": 0.7506332467310131,
      "num_tokens": 209533427.0,
      "step": 1200
    },
    {
      "epoch": 0.32976204670493453,
      "grad_norm": 0.9821958541870117,
      "learning_rate": 4.3060823149318724e-05,
      "loss": 0.9666,
      "mean_token_accuracy": 0.7546056086197496,
      "num_tokens": 211344332.0,
      "step": 1210
    },
    {
      "epoch": 0.3324873528760497,
      "grad_norm": 1.068172574043274,
      "learning_rate": 4.299058856580981e-05,
      "loss": 0.9949,
      "mean_token_accuracy": 0.7498155074194074,
      "num_tokens": 213088746.0,
      "step": 1220
    },
    {
      "epoch": 0.33521265904716485,
      "grad_norm": 0.9833975434303284,
      "learning_rate": 4.2920353982300885e-05,
      "loss": 0.9931,
      "mean_token_accuracy": 0.7484281599521637,
      "num_tokens": 214889528.0,
      "step": 1230
    },
    {
      "epoch": 0.33793796521828,
      "grad_norm": 1.090116262435913,
      "learning_rate": 4.2850119398791965e-05,
      "loss": 1.0017,
      "mean_token_accuracy": 0.7485339365899563,
      "num_tokens": 216603081.0,
      "step": 1240
    },
    {
      "epoch": 0.3406632713893952,
      "grad_norm": 0.9591506719589233,
      "learning_rate": 4.277988481528305e-05,
      "loss": 0.9518,
      "mean_token_accuracy": 0.7570753434672952,
      "num_tokens": 218292845.0,
      "step": 1250
    },
    {
      "epoch": 0.34338857756051033,
      "grad_norm": 1.013917326927185,
      "learning_rate": 4.2709650231774125e-05,
      "loss": 0.9728,
      "mean_token_accuracy": 0.7542673271149397,
      "num_tokens": 220029696.0,
      "step": 1260
    },
    {
      "epoch": 0.3461138837316255,
      "grad_norm": 0.9289477467536926,
      "learning_rate": 4.2639415648265206e-05,
      "loss": 1.0286,
      "mean_token_accuracy": 0.7439531436190009,
      "num_tokens": 221810553.0,
      "step": 1270
    },
    {
      "epoch": 0.34883918990274065,
      "grad_norm": 0.977281391620636,
      "learning_rate": 4.2569181064756286e-05,
      "loss": 0.9739,
      "mean_token_accuracy": 0.7557552525773644,
      "num_tokens": 223552801.0,
      "step": 1280
    },
    {
      "epoch": 0.3515644960738558,
      "grad_norm": 1.0503572225570679,
      "learning_rate": 4.249894648124737e-05,
      "loss": 0.9786,
      "mean_token_accuracy": 0.7533216239884496,
      "num_tokens": 225254929.0,
      "step": 1290
    },
    {
      "epoch": 0.354289802244971,
      "grad_norm": 0.9800918102264404,
      "learning_rate": 4.2428711897738446e-05,
      "loss": 0.9967,
      "mean_token_accuracy": 0.7491257831454277,
      "num_tokens": 227017304.0,
      "step": 1300
    },
    {
      "epoch": 0.35701510841608614,
      "grad_norm": 1.0620349645614624,
      "learning_rate": 4.2358477314229526e-05,
      "loss": 0.9496,
      "mean_token_accuracy": 0.7591037628240883,
      "num_tokens": 228785771.0,
      "step": 1310
    },
    {
      "epoch": 0.3597404145872013,
      "grad_norm": 0.986772894859314,
      "learning_rate": 4.2288242730720607e-05,
      "loss": 0.9581,
      "mean_token_accuracy": 0.756846007797867,
      "num_tokens": 230489667.0,
      "step": 1320
    },
    {
      "epoch": 0.36246572075831646,
      "grad_norm": 0.8904594779014587,
      "learning_rate": 4.2218008147211694e-05,
      "loss": 0.9781,
      "mean_token_accuracy": 0.7530512401834131,
      "num_tokens": 232269581.0,
      "step": 1330
    },
    {
      "epoch": 0.3651910269294316,
      "grad_norm": 0.9492087364196777,
      "learning_rate": 4.214777356370277e-05,
      "loss": 0.9822,
      "mean_token_accuracy": 0.7503454959951341,
      "num_tokens": 234028126.0,
      "step": 1340
    },
    {
      "epoch": 0.3679163331005468,
      "grad_norm": 1.1163588762283325,
      "learning_rate": 4.207753898019385e-05,
      "loss": 0.9944,
      "mean_token_accuracy": 0.749394488800317,
      "num_tokens": 235817129.0,
      "step": 1350
    },
    {
      "epoch": 0.37064163927166194,
      "grad_norm": 0.9092262983322144,
      "learning_rate": 4.2007304396684934e-05,
      "loss": 0.9822,
      "mean_token_accuracy": 0.7527043742127717,
      "num_tokens": 237588984.0,
      "step": 1360
    },
    {
      "epoch": 0.3733669454427771,
      "grad_norm": 1.0118396282196045,
      "learning_rate": 4.193706981317601e-05,
      "loss": 0.9445,
      "mean_token_accuracy": 0.7591466994024814,
      "num_tokens": 239305200.0,
      "step": 1370
    },
    {
      "epoch": 0.37609225161389226,
      "grad_norm": 0.9351493716239929,
      "learning_rate": 4.186683522966709e-05,
      "loss": 0.9667,
      "mean_token_accuracy": 0.7552483780309558,
      "num_tokens": 241078759.0,
      "step": 1380
    },
    {
      "epoch": 0.3788175577850074,
      "grad_norm": 0.9622650742530823,
      "learning_rate": 4.179660064615817e-05,
      "loss": 0.9837,
      "mean_token_accuracy": 0.7518350075930357,
      "num_tokens": 242876841.0,
      "step": 1390
    },
    {
      "epoch": 0.3815428639561226,
      "grad_norm": 0.9375427961349487,
      "learning_rate": 4.1726366062649255e-05,
      "loss": 0.958,
      "mean_token_accuracy": 0.7548901244997979,
      "num_tokens": 244578724.0,
      "step": 1400
    },
    {
      "epoch": 0.38426817012723774,
      "grad_norm": 0.9655621647834778,
      "learning_rate": 4.165613147914033e-05,
      "loss": 0.9814,
      "mean_token_accuracy": 0.7507870549336075,
      "num_tokens": 246363009.0,
      "step": 1410
    },
    {
      "epoch": 0.3869934762983529,
      "grad_norm": 0.9648198485374451,
      "learning_rate": 4.158589689563141e-05,
      "loss": 1.0049,
      "mean_token_accuracy": 0.7473927522078156,
      "num_tokens": 248120613.0,
      "step": 1420
    },
    {
      "epoch": 0.38971878246946806,
      "grad_norm": 0.9323533773422241,
      "learning_rate": 4.151566231212249e-05,
      "loss": 0.9795,
      "mean_token_accuracy": 0.7529980653896928,
      "num_tokens": 249941867.0,
      "step": 1430
    },
    {
      "epoch": 0.3924440886405832,
      "grad_norm": 0.9345725178718567,
      "learning_rate": 4.1445427728613576e-05,
      "loss": 0.9479,
      "mean_token_accuracy": 0.7590054305270314,
      "num_tokens": 251695734.0,
      "step": 1440
    },
    {
      "epoch": 0.3951693948116984,
      "grad_norm": 0.9144307374954224,
      "learning_rate": 4.137519314510465e-05,
      "loss": 0.9915,
      "mean_token_accuracy": 0.75021045608446,
      "num_tokens": 253503211.0,
      "step": 1450
    },
    {
      "epoch": 0.39789470098281354,
      "grad_norm": 1.0824007987976074,
      "learning_rate": 4.130495856159573e-05,
      "loss": 0.9676,
      "mean_token_accuracy": 0.7551144331693649,
      "num_tokens": 255313734.0,
      "step": 1460
    },
    {
      "epoch": 0.4006200071539287,
      "grad_norm": 0.8734183311462402,
      "learning_rate": 4.1234723978086816e-05,
      "loss": 0.9193,
      "mean_token_accuracy": 0.7647721905261278,
      "num_tokens": 257016513.0,
      "step": 1470
    },
    {
      "epoch": 0.40334531332504386,
      "grad_norm": 1.014374017715454,
      "learning_rate": 4.1164489394577896e-05,
      "loss": 0.9919,
      "mean_token_accuracy": 0.7497567610815168,
      "num_tokens": 258760362.0,
      "step": 1480
    },
    {
      "epoch": 0.406070619496159,
      "grad_norm": 0.911683976650238,
      "learning_rate": 4.109425481106897e-05,
      "loss": 0.9483,
      "mean_token_accuracy": 0.7586142903193831,
      "num_tokens": 260546971.0,
      "step": 1490
    },
    {
      "epoch": 0.4087959256672742,
      "grad_norm": 0.9247537851333618,
      "learning_rate": 4.102402022756005e-05,
      "loss": 0.9632,
      "mean_token_accuracy": 0.7552125737071037,
      "num_tokens": 262307672.0,
      "step": 1500
    },
    {
      "epoch": 0.41152123183838935,
      "grad_norm": 1.0184024572372437,
      "learning_rate": 4.095378564405114e-05,
      "loss": 0.9731,
      "mean_token_accuracy": 0.7537676138803363,
      "num_tokens": 264023889.0,
      "step": 1510
    },
    {
      "epoch": 0.4142465380095045,
      "grad_norm": 0.9960761666297913,
      "learning_rate": 4.088355106054221e-05,
      "loss": 0.9698,
      "mean_token_accuracy": 0.75361382458359,
      "num_tokens": 265798519.0,
      "step": 1520
    },
    {
      "epoch": 0.41697184418061967,
      "grad_norm": 0.9082701802253723,
      "learning_rate": 4.081331647703329e-05,
      "loss": 0.9867,
      "mean_token_accuracy": 0.751113293133676,
      "num_tokens": 267546100.0,
      "step": 1530
    },
    {
      "epoch": 0.4196971503517348,
      "grad_norm": 0.8918993473052979,
      "learning_rate": 4.074308189352437e-05,
      "loss": 0.9536,
      "mean_token_accuracy": 0.7568896351382136,
      "num_tokens": 269297641.0,
      "step": 1540
    },
    {
      "epoch": 0.42242245652285,
      "grad_norm": 0.8429189324378967,
      "learning_rate": 4.067284731001546e-05,
      "loss": 0.9896,
      "mean_token_accuracy": 0.7498064401559532,
      "num_tokens": 271053006.0,
      "step": 1550
    },
    {
      "epoch": 0.42514776269396515,
      "grad_norm": 1.0133056640625,
      "learning_rate": 4.060261272650653e-05,
      "loss": 0.971,
      "mean_token_accuracy": 0.7551591267809272,
      "num_tokens": 272829487.0,
      "step": 1560
    },
    {
      "epoch": 0.4278730688650803,
      "grad_norm": 0.9307904839515686,
      "learning_rate": 4.053237814299761e-05,
      "loss": 0.966,
      "mean_token_accuracy": 0.7548410438001156,
      "num_tokens": 274621990.0,
      "step": 1570
    },
    {
      "epoch": 0.43059837503619547,
      "grad_norm": 0.9339297413825989,
      "learning_rate": 4.04621435594887e-05,
      "loss": 0.9842,
      "mean_token_accuracy": 0.7506074154749512,
      "num_tokens": 276398989.0,
      "step": 1580
    },
    {
      "epoch": 0.43332368120731063,
      "grad_norm": 0.9794987440109253,
      "learning_rate": 4.039190897597978e-05,
      "loss": 0.9494,
      "mean_token_accuracy": 0.7569991254247725,
      "num_tokens": 278192884.0,
      "step": 1590
    },
    {
      "epoch": 0.4360489873784258,
      "grad_norm": 0.9957991242408752,
      "learning_rate": 4.032167439247085e-05,
      "loss": 0.9528,
      "mean_token_accuracy": 0.7580153970047832,
      "num_tokens": 279937931.0,
      "step": 1600
    },
    {
      "epoch": 0.43877429354954095,
      "grad_norm": 0.8942903280258179,
      "learning_rate": 4.025143980896193e-05,
      "loss": 0.9528,
      "mean_token_accuracy": 0.7584472270682454,
      "num_tokens": 281698143.0,
      "step": 1610
    },
    {
      "epoch": 0.4414995997206561,
      "grad_norm": 0.9091892242431641,
      "learning_rate": 4.018120522545302e-05,
      "loss": 0.9642,
      "mean_token_accuracy": 0.7558311942964793,
      "num_tokens": 283412565.0,
      "step": 1620
    },
    {
      "epoch": 0.44422490589177127,
      "grad_norm": 0.9882811307907104,
      "learning_rate": 4.011097064194409e-05,
      "loss": 0.9676,
      "mean_token_accuracy": 0.754830582626164,
      "num_tokens": 285184244.0,
      "step": 1630
    },
    {
      "epoch": 0.44695021206288643,
      "grad_norm": 0.8823055624961853,
      "learning_rate": 4.004073605843517e-05,
      "loss": 0.9176,
      "mean_token_accuracy": 0.7649934707209468,
      "num_tokens": 286963226.0,
      "step": 1640
    },
    {
      "epoch": 0.4496755182340016,
      "grad_norm": 0.9638675451278687,
      "learning_rate": 3.997050147492625e-05,
      "loss": 0.9374,
      "mean_token_accuracy": 0.7607969364151359,
      "num_tokens": 288742524.0,
      "step": 1650
    },
    {
      "epoch": 0.45240082440511675,
      "grad_norm": 0.9809541702270508,
      "learning_rate": 3.990026689141734e-05,
      "loss": 0.9674,
      "mean_token_accuracy": 0.7548656595870853,
      "num_tokens": 290486439.0,
      "step": 1660
    },
    {
      "epoch": 0.4551261305762319,
      "grad_norm": 0.9652701616287231,
      "learning_rate": 3.9830032307908413e-05,
      "loss": 0.9778,
      "mean_token_accuracy": 0.7537269618362188,
      "num_tokens": 292244616.0,
      "step": 1670
    },
    {
      "epoch": 0.4578514367473471,
      "grad_norm": 0.9816784858703613,
      "learning_rate": 3.9759797724399494e-05,
      "loss": 0.9732,
      "mean_token_accuracy": 0.7538707010447979,
      "num_tokens": 294028863.0,
      "step": 1680
    },
    {
      "epoch": 0.46057674291846223,
      "grad_norm": 0.9191619157791138,
      "learning_rate": 3.968956314089058e-05,
      "loss": 0.9457,
      "mean_token_accuracy": 0.7579093240201473,
      "num_tokens": 295842660.0,
      "step": 1690
    },
    {
      "epoch": 0.4633020490895774,
      "grad_norm": 0.9497706890106201,
      "learning_rate": 3.961932855738166e-05,
      "loss": 0.9482,
      "mean_token_accuracy": 0.7591779384762048,
      "num_tokens": 297606076.0,
      "step": 1700
    },
    {
      "epoch": 0.46602735526069256,
      "grad_norm": 0.952664315700531,
      "learning_rate": 3.9549093973872734e-05,
      "loss": 0.9782,
      "mean_token_accuracy": 0.7524136954918503,
      "num_tokens": 299367632.0,
      "step": 1710
    },
    {
      "epoch": 0.4687526614318077,
      "grad_norm": 0.8512211441993713,
      "learning_rate": 3.9478859390363814e-05,
      "loss": 0.9332,
      "mean_token_accuracy": 0.7630375389009714,
      "num_tokens": 301097990.0,
      "step": 1720
    },
    {
      "epoch": 0.4714779676029229,
      "grad_norm": 1.0166341066360474,
      "learning_rate": 3.94086248068549e-05,
      "loss": 0.9418,
      "mean_token_accuracy": 0.7609906679950654,
      "num_tokens": 302847308.0,
      "step": 1730
    },
    {
      "epoch": 0.47420327377403804,
      "grad_norm": 0.961094081401825,
      "learning_rate": 3.9338390223345975e-05,
      "loss": 0.9633,
      "mean_token_accuracy": 0.7552885929122567,
      "num_tokens": 304582625.0,
      "step": 1740
    },
    {
      "epoch": 0.4769285799451532,
      "grad_norm": 0.9322654604911804,
      "learning_rate": 3.9268155639837055e-05,
      "loss": 0.9452,
      "mean_token_accuracy": 0.7605904465541243,
      "num_tokens": 306358206.0,
      "step": 1750
    },
    {
      "epoch": 0.47965388611626836,
      "grad_norm": 0.964504063129425,
      "learning_rate": 3.9197921056328135e-05,
      "loss": 0.9435,
      "mean_token_accuracy": 0.76006522141397,
      "num_tokens": 308026562.0,
      "step": 1760
    },
    {
      "epoch": 0.4823791922873835,
      "grad_norm": 0.9687669277191162,
      "learning_rate": 3.912768647281922e-05,
      "loss": 0.9745,
      "mean_token_accuracy": 0.7534444922581315,
      "num_tokens": 309804543.0,
      "step": 1770
    },
    {
      "epoch": 0.4851044984584987,
      "grad_norm": 0.9425972700119019,
      "learning_rate": 3.9057451889310296e-05,
      "loss": 0.9656,
      "mean_token_accuracy": 0.7545514106750488,
      "num_tokens": 311549771.0,
      "step": 1780
    },
    {
      "epoch": 0.48782980462961384,
      "grad_norm": 1.042758822441101,
      "learning_rate": 3.8987217305801376e-05,
      "loss": 0.9812,
      "mean_token_accuracy": 0.7521832747384906,
      "num_tokens": 313326889.0,
      "step": 1790
    },
    {
      "epoch": 0.490555110800729,
      "grad_norm": 1.0554726123809814,
      "learning_rate": 3.891698272229246e-05,
      "loss": 0.9397,
      "mean_token_accuracy": 0.7617883637547493,
      "num_tokens": 315064715.0,
      "step": 1800
    },
    {
      "epoch": 0.49328041697184416,
      "grad_norm": 0.9078469276428223,
      "learning_rate": 3.884674813878354e-05,
      "loss": 0.9575,
      "mean_token_accuracy": 0.7562927783466875,
      "num_tokens": 316809509.0,
      "step": 1810
    },
    {
      "epoch": 0.4960057231429593,
      "grad_norm": 0.9499340057373047,
      "learning_rate": 3.8776513555274616e-05,
      "loss": 0.9375,
      "mean_token_accuracy": 0.7615752270445227,
      "num_tokens": 318597481.0,
      "step": 1820
    },
    {
      "epoch": 0.4987310293140745,
      "grad_norm": 0.9514725804328918,
      "learning_rate": 3.8706278971765697e-05,
      "loss": 0.9378,
      "mean_token_accuracy": 0.7608531050384044,
      "num_tokens": 320382863.0,
      "step": 1830
    },
    {
      "epoch": 0.5014563354851896,
      "grad_norm": 0.9571301341056824,
      "learning_rate": 3.8636044388256784e-05,
      "loss": 0.9493,
      "mean_token_accuracy": 0.7597140209749341,
      "num_tokens": 322136431.0,
      "step": 1840
    },
    {
      "epoch": 0.5041816416563049,
      "grad_norm": 0.9752191305160522,
      "learning_rate": 3.856580980474786e-05,
      "loss": 0.9365,
      "mean_token_accuracy": 0.7619069669395685,
      "num_tokens": 323892089.0,
      "step": 1850
    },
    {
      "epoch": 0.50690694782742,
      "grad_norm": 0.859382152557373,
      "learning_rate": 3.849557522123894e-05,
      "loss": 0.947,
      "mean_token_accuracy": 0.7584124825894832,
      "num_tokens": 325693457.0,
      "step": 1860
    },
    {
      "epoch": 0.5096322539985352,
      "grad_norm": 1.022796630859375,
      "learning_rate": 3.842534063773002e-05,
      "loss": 0.925,
      "mean_token_accuracy": 0.7624462634325028,
      "num_tokens": 327397884.0,
      "step": 1870
    },
    {
      "epoch": 0.5123575601696503,
      "grad_norm": 1.0252681970596313,
      "learning_rate": 3.8355106054221104e-05,
      "loss": 0.9675,
      "mean_token_accuracy": 0.7556776776909828,
      "num_tokens": 329077502.0,
      "step": 1880
    },
    {
      "epoch": 0.5150828663407655,
      "grad_norm": 0.9402310252189636,
      "learning_rate": 3.828487147071218e-05,
      "loss": 0.956,
      "mean_token_accuracy": 0.7570736223831773,
      "num_tokens": 330847695.0,
      "step": 1890
    },
    {
      "epoch": 0.5178081725118806,
      "grad_norm": 0.9374983310699463,
      "learning_rate": 3.821463688720326e-05,
      "loss": 0.943,
      "mean_token_accuracy": 0.7579239157959818,
      "num_tokens": 332578366.0,
      "step": 1900
    },
    {
      "epoch": 0.5205334786829958,
      "grad_norm": 0.9612072110176086,
      "learning_rate": 3.8144402303694345e-05,
      "loss": 0.9656,
      "mean_token_accuracy": 0.7552163794636726,
      "num_tokens": 334293589.0,
      "step": 1910
    },
    {
      "epoch": 0.5232587848541109,
      "grad_norm": 0.8690987229347229,
      "learning_rate": 3.8074167720185425e-05,
      "loss": 0.9551,
      "mean_token_accuracy": 0.755389365926385,
      "num_tokens": 336016972.0,
      "step": 1920
    },
    {
      "epoch": 0.5259840910252261,
      "grad_norm": 0.8889273405075073,
      "learning_rate": 3.80039331366765e-05,
      "loss": 0.9312,
      "mean_token_accuracy": 0.7616344084963202,
      "num_tokens": 337805977.0,
      "step": 1930
    },
    {
      "epoch": 0.5287093971963412,
      "grad_norm": 0.888575553894043,
      "learning_rate": 3.793369855316758e-05,
      "loss": 0.9064,
      "mean_token_accuracy": 0.7666762206703425,
      "num_tokens": 339548221.0,
      "step": 1940
    },
    {
      "epoch": 0.5314347033674565,
      "grad_norm": 0.9260870814323425,
      "learning_rate": 3.7863463969658666e-05,
      "loss": 0.9829,
      "mean_token_accuracy": 0.750894641969353,
      "num_tokens": 341268344.0,
      "step": 1950
    },
    {
      "epoch": 0.5341600095385716,
      "grad_norm": 1.0482826232910156,
      "learning_rate": 3.779322938614974e-05,
      "loss": 0.9496,
      "mean_token_accuracy": 0.7581010499969125,
      "num_tokens": 343017522.0,
      "step": 1960
    },
    {
      "epoch": 0.5368853157096868,
      "grad_norm": 0.9690923690795898,
      "learning_rate": 3.772299480264082e-05,
      "loss": 0.9461,
      "mean_token_accuracy": 0.7593427566811443,
      "num_tokens": 344699192.0,
      "step": 1970
    },
    {
      "epoch": 0.5396106218808019,
      "grad_norm": 0.9007167220115662,
      "learning_rate": 3.7652760219131906e-05,
      "loss": 0.9322,
      "mean_token_accuracy": 0.7622509736567735,
      "num_tokens": 346458527.0,
      "step": 1980
    },
    {
      "epoch": 0.5423359280519171,
      "grad_norm": 0.9483133554458618,
      "learning_rate": 3.7582525635622986e-05,
      "loss": 0.9492,
      "mean_token_accuracy": 0.7583487136289477,
      "num_tokens": 348204585.0,
      "step": 1990
    },
    {
      "epoch": 0.5450612342230322,
      "grad_norm": 0.9575846195220947,
      "learning_rate": 3.751229105211406e-05,
      "loss": 0.9753,
      "mean_token_accuracy": 0.7533793544396759,
      "num_tokens": 349928259.0,
      "step": 2000
    },
    {
      "epoch": 0.5477865403941474,
      "grad_norm": 0.9323520660400391,
      "learning_rate": 3.744205646860514e-05,
      "loss": 0.9396,
      "mean_token_accuracy": 0.7598869156092405,
      "num_tokens": 351568018.0,
      "step": 2010
    },
    {
      "epoch": 0.5505118465652625,
      "grad_norm": 0.9729679822921753,
      "learning_rate": 3.737182188509623e-05,
      "loss": 0.9254,
      "mean_token_accuracy": 0.7636484606191516,
      "num_tokens": 353325716.0,
      "step": 2020
    },
    {
      "epoch": 0.5532371527363777,
      "grad_norm": 1.0164568424224854,
      "learning_rate": 3.730158730158731e-05,
      "loss": 0.9575,
      "mean_token_accuracy": 0.7566652336157859,
      "num_tokens": 355099655.0,
      "step": 2030
    },
    {
      "epoch": 0.5559624589074929,
      "grad_norm": 0.8739563822746277,
      "learning_rate": 3.723135271807838e-05,
      "loss": 0.9358,
      "mean_token_accuracy": 0.7612218523398042,
      "num_tokens": 356892448.0,
      "step": 2040
    },
    {
      "epoch": 0.5586877650786081,
      "grad_norm": 0.9450079798698425,
      "learning_rate": 3.716111813456946e-05,
      "loss": 0.9634,
      "mean_token_accuracy": 0.7554592994041741,
      "num_tokens": 358599855.0,
      "step": 2050
    },
    {
      "epoch": 0.5614130712497232,
      "grad_norm": 0.9322670698165894,
      "learning_rate": 3.709088355106055e-05,
      "loss": 0.9664,
      "mean_token_accuracy": 0.7549236617982388,
      "num_tokens": 360366528.0,
      "step": 2060
    },
    {
      "epoch": 0.5641383774208384,
      "grad_norm": 0.9792261123657227,
      "learning_rate": 3.702064896755162e-05,
      "loss": 0.9418,
      "mean_token_accuracy": 0.7601142754778266,
      "num_tokens": 362123903.0,
      "step": 2070
    },
    {
      "epoch": 0.5668636835919535,
      "grad_norm": 0.883922815322876,
      "learning_rate": 3.69504143840427e-05,
      "loss": 0.9247,
      "mean_token_accuracy": 0.7645677644759417,
      "num_tokens": 363907270.0,
      "step": 2080
    },
    {
      "epoch": 0.5695889897630687,
      "grad_norm": 1.026827335357666,
      "learning_rate": 3.688017980053379e-05,
      "loss": 0.9139,
      "mean_token_accuracy": 0.7644737392663956,
      "num_tokens": 365602883.0,
      "step": 2090
    },
    {
      "epoch": 0.5723142959341838,
      "grad_norm": 1.0081133842468262,
      "learning_rate": 3.680994521702487e-05,
      "loss": 0.9346,
      "mean_token_accuracy": 0.7607566144317388,
      "num_tokens": 367308921.0,
      "step": 2100
    },
    {
      "epoch": 0.575039602105299,
      "grad_norm": 0.9480522871017456,
      "learning_rate": 3.673971063351594e-05,
      "loss": 0.9352,
      "mean_token_accuracy": 0.7604883845895529,
      "num_tokens": 369068618.0,
      "step": 2110
    },
    {
      "epoch": 0.5777649082764141,
      "grad_norm": 0.929105818271637,
      "learning_rate": 3.666947605000702e-05,
      "loss": 0.9531,
      "mean_token_accuracy": 0.7584813937544823,
      "num_tokens": 370792272.0,
      "step": 2120
    },
    {
      "epoch": 0.5804902144475294,
      "grad_norm": 0.9752338528633118,
      "learning_rate": 3.659924146649811e-05,
      "loss": 0.9549,
      "mean_token_accuracy": 0.7570922682061791,
      "num_tokens": 372506541.0,
      "step": 2130
    },
    {
      "epoch": 0.5832155206186445,
      "grad_norm": 0.9325385093688965,
      "learning_rate": 3.652900688298919e-05,
      "loss": 0.9075,
      "mean_token_accuracy": 0.7672267651185394,
      "num_tokens": 374308542.0,
      "step": 2140
    },
    {
      "epoch": 0.5859408267897597,
      "grad_norm": 0.9299280643463135,
      "learning_rate": 3.645877229948026e-05,
      "loss": 0.9157,
      "mean_token_accuracy": 0.7651827426627278,
      "num_tokens": 376016266.0,
      "step": 2150
    },
    {
      "epoch": 0.5886661329608748,
      "grad_norm": 0.8966337442398071,
      "learning_rate": 3.638853771597134e-05,
      "loss": 0.9567,
      "mean_token_accuracy": 0.7577152790501713,
      "num_tokens": 377732713.0,
      "step": 2160
    },
    {
      "epoch": 0.59139143913199,
      "grad_norm": 1.0337902307510376,
      "learning_rate": 3.631830313246243e-05,
      "loss": 0.9447,
      "mean_token_accuracy": 0.7596987700089812,
      "num_tokens": 379457927.0,
      "step": 2170
    },
    {
      "epoch": 0.5941167453031051,
      "grad_norm": 0.8205364346504211,
      "learning_rate": 3.6248068548953503e-05,
      "loss": 0.9786,
      "mean_token_accuracy": 0.7524961337447167,
      "num_tokens": 381311531.0,
      "step": 2180
    },
    {
      "epoch": 0.5968420514742203,
      "grad_norm": 0.9586334228515625,
      "learning_rate": 3.6177833965444584e-05,
      "loss": 0.9685,
      "mean_token_accuracy": 0.7561034603975714,
      "num_tokens": 383076036.0,
      "step": 2190
    },
    {
      "epoch": 0.5995673576453354,
      "grad_norm": 0.9479374885559082,
      "learning_rate": 3.610759938193567e-05,
      "loss": 0.9666,
      "mean_token_accuracy": 0.7556248934939503,
      "num_tokens": 384869981.0,
      "step": 2200
    },
    {
      "epoch": 0.6022926638164506,
      "grad_norm": 1.0398281812667847,
      "learning_rate": 3.603736479842675e-05,
      "loss": 0.9466,
      "mean_token_accuracy": 0.7579332664608955,
      "num_tokens": 386654361.0,
      "step": 2210
    },
    {
      "epoch": 0.6050179699875657,
      "grad_norm": 0.9523603916168213,
      "learning_rate": 3.5967130214917824e-05,
      "loss": 0.9224,
      "mean_token_accuracy": 0.7639172183349728,
      "num_tokens": 388405388.0,
      "step": 2220
    },
    {
      "epoch": 0.607743276158681,
      "grad_norm": 0.9337072372436523,
      "learning_rate": 3.5896895631408904e-05,
      "loss": 0.9531,
      "mean_token_accuracy": 0.7578973986208439,
      "num_tokens": 390160411.0,
      "step": 2230
    },
    {
      "epoch": 0.6104685823297961,
      "grad_norm": 0.9452222585678101,
      "learning_rate": 3.582666104789999e-05,
      "loss": 0.9597,
      "mean_token_accuracy": 0.75540736541152,
      "num_tokens": 391963393.0,
      "step": 2240
    },
    {
      "epoch": 0.6131938885009113,
      "grad_norm": 0.9237678050994873,
      "learning_rate": 3.575642646439107e-05,
      "loss": 0.9592,
      "mean_token_accuracy": 0.7570389699190855,
      "num_tokens": 393719494.0,
      "step": 2250
    },
    {
      "epoch": 0.6159191946720264,
      "grad_norm": 0.8745359182357788,
      "learning_rate": 3.5686191880882145e-05,
      "loss": 0.9544,
      "mean_token_accuracy": 0.7581097180023789,
      "num_tokens": 395419867.0,
      "step": 2260
    },
    {
      "epoch": 0.6186445008431416,
      "grad_norm": 0.872172474861145,
      "learning_rate": 3.5615957297373225e-05,
      "loss": 0.9348,
      "mean_token_accuracy": 0.7611732495948672,
      "num_tokens": 397164636.0,
      "step": 2270
    },
    {
      "epoch": 0.6213698070142567,
      "grad_norm": 0.9751588106155396,
      "learning_rate": 3.554572271386431e-05,
      "loss": 0.9649,
      "mean_token_accuracy": 0.7546903455629945,
      "num_tokens": 398897880.0,
      "step": 2280
    },
    {
      "epoch": 0.6240951131853719,
      "grad_norm": 1.0017331838607788,
      "learning_rate": 3.5475488130355386e-05,
      "loss": 0.9488,
      "mean_token_accuracy": 0.7583517892286181,
      "num_tokens": 400668616.0,
      "step": 2290
    },
    {
      "epoch": 0.6268204193564871,
      "grad_norm": 0.8993579745292664,
      "learning_rate": 3.5405253546846466e-05,
      "loss": 0.942,
      "mean_token_accuracy": 0.759977068938315,
      "num_tokens": 402379710.0,
      "step": 2300
    },
    {
      "epoch": 0.6295457255276022,
      "grad_norm": 0.9273610711097717,
      "learning_rate": 3.533501896333755e-05,
      "loss": 0.9388,
      "mean_token_accuracy": 0.759927311167121,
| "num_tokens": 404084345.0, | |
| "step": 2310 | |
| }, | |
| { | |
| "epoch": 0.6322710316987175, | |
| "grad_norm": 0.9769418835639954, | |
| "learning_rate": 3.526478437982863e-05, | |
| "loss": 0.9102, | |
| "mean_token_accuracy": 0.7673765732906759, | |
| "num_tokens": 405770321.0, | |
| "step": 2320 | |
| }, | |
| { | |
| "epoch": 0.6349963378698326, | |
| "grad_norm": 0.9049491882324219, | |
| "learning_rate": 3.5194549796319706e-05, | |
| "loss": 0.9333, | |
| "mean_token_accuracy": 0.7614351283758879, | |
| "num_tokens": 407492020.0, | |
| "step": 2330 | |
| }, | |
| { | |
| "epoch": 0.6377216440409478, | |
| "grad_norm": 0.9118947386741638, | |
| "learning_rate": 3.5124315212810787e-05, | |
| "loss": 0.9294, | |
| "mean_token_accuracy": 0.7637290453538299, | |
| "num_tokens": 409257115.0, | |
| "step": 2340 | |
| }, | |
| { | |
| "epoch": 0.6404469502120629, | |
| "grad_norm": 0.9527559280395508, | |
| "learning_rate": 3.5054080629301874e-05, | |
| "loss": 0.9522, | |
| "mean_token_accuracy": 0.7575726680457592, | |
| "num_tokens": 410986071.0, | |
| "step": 2350 | |
| }, | |
| { | |
| "epoch": 0.6431722563831781, | |
| "grad_norm": 0.9394711852073669, | |
| "learning_rate": 3.4983846045792954e-05, | |
| "loss": 0.9367, | |
| "mean_token_accuracy": 0.7613094063475728, | |
| "num_tokens": 412763897.0, | |
| "step": 2360 | |
| }, | |
| { | |
| "epoch": 0.6458975625542932, | |
| "grad_norm": 0.9034632444381714, | |
| "learning_rate": 3.491361146228403e-05, | |
| "loss": 0.9093, | |
| "mean_token_accuracy": 0.7680137138813734, | |
| "num_tokens": 414503168.0, | |
| "step": 2370 | |
| }, | |
| { | |
| "epoch": 0.6486228687254084, | |
| "grad_norm": 0.9068887829780579, | |
| "learning_rate": 3.484337687877511e-05, | |
| "loss": 0.9316, | |
| "mean_token_accuracy": 0.7629754545167089, | |
| "num_tokens": 416227145.0, | |
| "step": 2380 | |
| }, | |
| { | |
| "epoch": 0.6513481748965235, | |
| "grad_norm": 0.9191030859947205, | |
| "learning_rate": 3.4773142295266194e-05, | |
| "loss": 0.928, | |
| "mean_token_accuracy": 0.7619910618290305, | |
| "num_tokens": 417973040.0, | |
| "step": 2390 | |
| }, | |
| { | |
| "epoch": 0.6540734810676387, | |
| "grad_norm": 0.9926860928535461, | |
| "learning_rate": 3.470290771175727e-05, | |
| "loss": 0.9338, | |
| "mean_token_accuracy": 0.7614536901935935, | |
| "num_tokens": 419736780.0, | |
| "step": 2400 | |
| }, | |
| { | |
| "epoch": 0.6567987872387538, | |
| "grad_norm": 0.8897690176963806, | |
| "learning_rate": 3.463267312824835e-05, | |
| "loss": 0.9242, | |
| "mean_token_accuracy": 0.7639796357601881, | |
| "num_tokens": 421516023.0, | |
| "step": 2410 | |
| }, | |
| { | |
| "epoch": 0.6595240934098691, | |
| "grad_norm": 0.9441693425178528, | |
| "learning_rate": 3.4562438544739435e-05, | |
| "loss": 0.9664, | |
| "mean_token_accuracy": 0.7563766550272704, | |
| "num_tokens": 423324296.0, | |
| "step": 2420 | |
| }, | |
| { | |
| "epoch": 0.6622493995809842, | |
| "grad_norm": 0.9950588941574097, | |
| "learning_rate": 3.4492203961230515e-05, | |
| "loss": 0.9542, | |
| "mean_token_accuracy": 0.7551617925986648, | |
| "num_tokens": 425114987.0, | |
| "step": 2430 | |
| }, | |
| { | |
| "epoch": 0.6649747057520994, | |
| "grad_norm": 0.9122210741043091, | |
| "learning_rate": 3.442196937772159e-05, | |
| "loss": 0.9204, | |
| "mean_token_accuracy": 0.7638139262795448, | |
| "num_tokens": 426875751.0, | |
| "step": 2440 | |
| }, | |
| { | |
| "epoch": 0.6677000119232145, | |
| "grad_norm": 0.9361687302589417, | |
| "learning_rate": 3.435173479421267e-05, | |
| "loss": 0.9309, | |
| "mean_token_accuracy": 0.7621823664754629, | |
| "num_tokens": 428715433.0, | |
| "step": 2450 | |
| }, | |
| { | |
| "epoch": 0.6704253180943297, | |
| "grad_norm": 1.0153151750564575, | |
| "learning_rate": 3.4281500210703756e-05, | |
| "loss": 0.9677, | |
| "mean_token_accuracy": 0.754531520511955, | |
| "num_tokens": 430443301.0, | |
| "step": 2460 | |
| }, | |
| { | |
| "epoch": 0.6731506242654448, | |
| "grad_norm": 0.9298137426376343, | |
| "learning_rate": 3.4211265627194836e-05, | |
| "loss": 0.8981, | |
| "mean_token_accuracy": 0.7690068047493697, | |
| "num_tokens": 432149898.0, | |
| "step": 2470 | |
| }, | |
| { | |
| "epoch": 0.67587593043656, | |
| "grad_norm": 0.979828953742981, | |
| "learning_rate": 3.414103104368591e-05, | |
| "loss": 0.9162, | |
| "mean_token_accuracy": 0.7655546896159648, | |
| "num_tokens": 433904775.0, | |
| "step": 2480 | |
| }, | |
| { | |
| "epoch": 0.6786012366076751, | |
| "grad_norm": 0.9502202868461609, | |
| "learning_rate": 3.407079646017699e-05, | |
| "loss": 0.9179, | |
| "mean_token_accuracy": 0.7641258521005512, | |
| "num_tokens": 435648944.0, | |
| "step": 2490 | |
| }, | |
| { | |
| "epoch": 0.6813265427787903, | |
| "grad_norm": 0.9376423954963684, | |
| "learning_rate": 3.4000561876668076e-05, | |
| "loss": 0.9536, | |
| "mean_token_accuracy": 0.7582374062389136, | |
| "num_tokens": 437390122.0, | |
| "step": 2500 | |
| }, | |
| { | |
| "epoch": 0.6840518489499054, | |
| "grad_norm": 0.9456937313079834, | |
| "learning_rate": 3.393032729315915e-05, | |
| "loss": 0.9418, | |
| "mean_token_accuracy": 0.7602952811866999, | |
| "num_tokens": 439126311.0, | |
| "step": 2510 | |
| }, | |
| { | |
| "epoch": 0.6867771551210207, | |
| "grad_norm": 0.9349498748779297, | |
| "learning_rate": 3.386009270965023e-05, | |
| "loss": 0.9311, | |
| "mean_token_accuracy": 0.762849635258317, | |
| "num_tokens": 440890933.0, | |
| "step": 2520 | |
| }, | |
| { | |
| "epoch": 0.6895024612921358, | |
| "grad_norm": 1.079544186592102, | |
| "learning_rate": 3.378985812614132e-05, | |
| "loss": 0.9195, | |
| "mean_token_accuracy": 0.7645381474867463, | |
| "num_tokens": 442611770.0, | |
| "step": 2530 | |
| }, | |
| { | |
| "epoch": 0.692227767463251, | |
| "grad_norm": 0.9199313521385193, | |
| "learning_rate": 3.37196235426324e-05, | |
| "loss": 0.9249, | |
| "mean_token_accuracy": 0.7639479031786323, | |
| "num_tokens": 444358166.0, | |
| "step": 2540 | |
| }, | |
| { | |
| "epoch": 0.6949530736343661, | |
| "grad_norm": 0.9256271719932556, | |
| "learning_rate": 3.364938895912347e-05, | |
| "loss": 0.9609, | |
| "mean_token_accuracy": 0.756293723359704, | |
| "num_tokens": 446078790.0, | |
| "step": 2550 | |
| }, | |
| { | |
| "epoch": 0.6976783798054813, | |
| "grad_norm": 0.9971742630004883, | |
| "learning_rate": 3.357915437561455e-05, | |
| "loss": 0.9293, | |
| "mean_token_accuracy": 0.7617772882804275, | |
| "num_tokens": 447811186.0, | |
| "step": 2560 | |
| }, | |
| { | |
| "epoch": 0.7004036859765964, | |
| "grad_norm": 0.9106160998344421, | |
| "learning_rate": 3.350891979210564e-05, | |
| "loss": 0.94, | |
| "mean_token_accuracy": 0.7603334264829755, | |
| "num_tokens": 449537962.0, | |
| "step": 2570 | |
| }, | |
| { | |
| "epoch": 0.7031289921477116, | |
| "grad_norm": 0.9458938837051392, | |
| "learning_rate": 3.343868520859672e-05, | |
| "loss": 0.9134, | |
| "mean_token_accuracy": 0.7661536164581776, | |
| "num_tokens": 451259469.0, | |
| "step": 2580 | |
| }, | |
| { | |
| "epoch": 0.7058542983188267, | |
| "grad_norm": 0.9445596933364868, | |
| "learning_rate": 3.336845062508779e-05, | |
| "loss": 0.9209, | |
| "mean_token_accuracy": 0.7649673901498317, | |
| "num_tokens": 453041146.0, | |
| "step": 2590 | |
| }, | |
| { | |
| "epoch": 0.708579604489942, | |
| "grad_norm": 0.9680286049842834, | |
| "learning_rate": 3.329821604157887e-05, | |
| "loss": 0.9553, | |
| "mean_token_accuracy": 0.7577722139656544, | |
| "num_tokens": 454753651.0, | |
| "step": 2600 | |
| }, | |
| { | |
| "epoch": 0.711304910661057, | |
| "grad_norm": 0.9290615320205688, | |
| "learning_rate": 3.322798145806996e-05, | |
| "loss": 0.9247, | |
| "mean_token_accuracy": 0.7633149197325111, | |
| "num_tokens": 456537057.0, | |
| "step": 2610 | |
| }, | |
| { | |
| "epoch": 0.7140302168321723, | |
| "grad_norm": 0.9331244230270386, | |
| "learning_rate": 3.315774687456103e-05, | |
| "loss": 0.9675, | |
| "mean_token_accuracy": 0.7562482981011271, | |
| "num_tokens": 458271296.0, | |
| "step": 2620 | |
| }, | |
| { | |
| "epoch": 0.7167555230032874, | |
| "grad_norm": 0.885105550289154, | |
| "learning_rate": 3.308751229105211e-05, | |
| "loss": 0.9403, | |
| "mean_token_accuracy": 0.7598960697650909, | |
| "num_tokens": 460014932.0, | |
| "step": 2630 | |
| }, | |
| { | |
| "epoch": 0.7194808291744026, | |
| "grad_norm": 1.002734899520874, | |
| "learning_rate": 3.30172777075432e-05, | |
| "loss": 0.9506, | |
| "mean_token_accuracy": 0.7589968075975776, | |
| "num_tokens": 461818480.0, | |
| "step": 2640 | |
| }, | |
| { | |
| "epoch": 0.7222061353455177, | |
| "grad_norm": 0.9839244484901428, | |
| "learning_rate": 3.294704312403428e-05, | |
| "loss": 0.9718, | |
| "mean_token_accuracy": 0.753429920040071, | |
| "num_tokens": 463519993.0, | |
| "step": 2650 | |
| }, | |
| { | |
| "epoch": 0.7249314415166329, | |
| "grad_norm": 1.0279382467269897, | |
| "learning_rate": 3.287680854052535e-05, | |
| "loss": 0.9391, | |
| "mean_token_accuracy": 0.7601042149588466, | |
| "num_tokens": 465333101.0, | |
| "step": 2660 | |
| }, | |
| { | |
| "epoch": 0.727656747687748, | |
| "grad_norm": 0.9534622430801392, | |
| "learning_rate": 3.280657395701643e-05, | |
| "loss": 0.9328, | |
| "mean_token_accuracy": 0.7614490607753396, | |
| "num_tokens": 467047198.0, | |
| "step": 2670 | |
| }, | |
| { | |
| "epoch": 0.7303820538588632, | |
| "grad_norm": 0.9373722672462463, | |
| "learning_rate": 3.273633937350752e-05, | |
| "loss": 0.9465, | |
| "mean_token_accuracy": 0.7602417379617691, | |
| "num_tokens": 468806711.0, | |
| "step": 2680 | |
| }, | |
| { | |
| "epoch": 0.7331073600299783, | |
| "grad_norm": 0.8950750827789307, | |
| "learning_rate": 3.26661047899986e-05, | |
| "loss": 0.9042, | |
| "mean_token_accuracy": 0.7677352372556925, | |
| "num_tokens": 470600782.0, | |
| "step": 2690 | |
| }, | |
| { | |
| "epoch": 0.7358326662010936, | |
| "grad_norm": 0.9187238812446594, | |
| "learning_rate": 3.2595870206489674e-05, | |
| "loss": 0.9443, | |
| "mean_token_accuracy": 0.7598759381100535, | |
| "num_tokens": 472349330.0, | |
| "step": 2700 | |
| }, | |
| { | |
| "epoch": 0.7385579723722087, | |
| "grad_norm": 0.9190787076950073, | |
| "learning_rate": 3.2525635622980754e-05, | |
| "loss": 0.9309, | |
| "mean_token_accuracy": 0.7636961450800299, | |
| "num_tokens": 474106983.0, | |
| "step": 2710 | |
| }, | |
| { | |
| "epoch": 0.7412832785433239, | |
| "grad_norm": 0.9724632501602173, | |
| "learning_rate": 3.245540103947184e-05, | |
| "loss": 0.9642, | |
| "mean_token_accuracy": 0.7555989472195506, | |
| "num_tokens": 475902407.0, | |
| "step": 2720 | |
| }, | |
| { | |
| "epoch": 0.744008584714439, | |
| "grad_norm": 0.8691114187240601, | |
| "learning_rate": 3.2385166455962914e-05, | |
| "loss": 0.8866, | |
| "mean_token_accuracy": 0.7711342711001634, | |
| "num_tokens": 477654644.0, | |
| "step": 2730 | |
| }, | |
| { | |
| "epoch": 0.7467338908855542, | |
| "grad_norm": 0.9264854192733765, | |
| "learning_rate": 3.2314931872453994e-05, | |
| "loss": 0.9532, | |
| "mean_token_accuracy": 0.7577282522805036, | |
| "num_tokens": 479409078.0, | |
| "step": 2740 | |
| }, | |
| { | |
| "epoch": 0.7494591970566693, | |
| "grad_norm": 1.0483834743499756, | |
| "learning_rate": 3.224469728894508e-05, | |
| "loss": 0.9366, | |
| "mean_token_accuracy": 0.7610113574191928, | |
| "num_tokens": 481154972.0, | |
| "step": 2750 | |
| }, | |
| { | |
| "epoch": 0.7521845032277845, | |
| "grad_norm": 0.9823554158210754, | |
| "learning_rate": 3.217446270543616e-05, | |
| "loss": 0.9622, | |
| "mean_token_accuracy": 0.7564713628962636, | |
| "num_tokens": 482957554.0, | |
| "step": 2760 | |
| }, | |
| { | |
| "epoch": 0.7549098093988996, | |
| "grad_norm": 0.931236743927002, | |
| "learning_rate": 3.2104228121927235e-05, | |
| "loss": 0.9375, | |
| "mean_token_accuracy": 0.7607782265171409, | |
| "num_tokens": 484730289.0, | |
| "step": 2770 | |
| }, | |
| { | |
| "epoch": 0.7576351155700148, | |
| "grad_norm": 1.044001817703247, | |
| "learning_rate": 3.2033993538418315e-05, | |
| "loss": 0.9437, | |
| "mean_token_accuracy": 0.7582019144669175, | |
| "num_tokens": 486390658.0, | |
| "step": 2780 | |
| }, | |
| { | |
| "epoch": 0.7603604217411299, | |
| "grad_norm": 1.0096402168273926, | |
| "learning_rate": 3.19637589549094e-05, | |
| "loss": 0.9055, | |
| "mean_token_accuracy": 0.7680035123601556, | |
| "num_tokens": 488084357.0, | |
| "step": 2790 | |
| }, | |
| { | |
| "epoch": 0.7630857279122452, | |
| "grad_norm": 1.0040942430496216, | |
| "learning_rate": 3.189352437140048e-05, | |
| "loss": 0.9812, | |
| "mean_token_accuracy": 0.7524819139391183, | |
| "num_tokens": 489867504.0, | |
| "step": 2800 | |
| }, | |
| { | |
| "epoch": 0.7658110340833603, | |
| "grad_norm": 0.9055443406105042, | |
| "learning_rate": 3.1823289787891556e-05, | |
| "loss": 0.9519, | |
| "mean_token_accuracy": 0.7572958417236805, | |
| "num_tokens": 491614120.0, | |
| "step": 2810 | |
| }, | |
| { | |
| "epoch": 0.7685363402544755, | |
| "grad_norm": 0.8963329195976257, | |
| "learning_rate": 3.1753055204382636e-05, | |
| "loss": 0.9504, | |
| "mean_token_accuracy": 0.75969771258533, | |
| "num_tokens": 493360108.0, | |
| "step": 2820 | |
| }, | |
| { | |
| "epoch": 0.7712616464255906, | |
| "grad_norm": 0.9222882986068726, | |
| "learning_rate": 3.168282062087372e-05, | |
| "loss": 0.9273, | |
| "mean_token_accuracy": 0.764968883432448, | |
| "num_tokens": 495060867.0, | |
| "step": 2830 | |
| }, | |
| { | |
| "epoch": 0.7739869525967058, | |
| "grad_norm": 0.9971688985824585, | |
| "learning_rate": 3.16125860373648e-05, | |
| "loss": 0.9498, | |
| "mean_token_accuracy": 0.7571736957877875, | |
| "num_tokens": 496731129.0, | |
| "step": 2840 | |
| }, | |
| { | |
| "epoch": 0.7767122587678209, | |
| "grad_norm": 0.8665016293525696, | |
| "learning_rate": 3.1542351453855877e-05, | |
| "loss": 0.9189, | |
| "mean_token_accuracy": 0.7652535479515791, | |
| "num_tokens": 498416929.0, | |
| "step": 2850 | |
| }, | |
| { | |
| "epoch": 0.7794375649389361, | |
| "grad_norm": 0.9980199933052063, | |
| "learning_rate": 3.1472116870346964e-05, | |
| "loss": 0.9046, | |
| "mean_token_accuracy": 0.7680843161419034, | |
| "num_tokens": 500131030.0, | |
| "step": 2860 | |
| }, | |
| { | |
| "epoch": 0.7821628711100512, | |
| "grad_norm": 0.8767507076263428, | |
| "learning_rate": 3.1401882286838044e-05, | |
| "loss": 0.9157, | |
| "mean_token_accuracy": 0.7653609652072191, | |
| "num_tokens": 501860254.0, | |
| "step": 2870 | |
| }, | |
| { | |
| "epoch": 0.7848881772811664, | |
| "grad_norm": 0.9489269852638245, | |
| "learning_rate": 3.133164770332912e-05, | |
| "loss": 0.9644, | |
| "mean_token_accuracy": 0.7552736889570951, | |
| "num_tokens": 503599791.0, | |
| "step": 2880 | |
| }, | |
| { | |
| "epoch": 0.7876134834522815, | |
| "grad_norm": 0.9339836835861206, | |
| "learning_rate": 3.12614131198202e-05, | |
| "loss": 0.9844, | |
| "mean_token_accuracy": 0.752055324614048, | |
| "num_tokens": 505277304.0, | |
| "step": 2890 | |
| }, | |
| { | |
| "epoch": 0.7903387896233968, | |
| "grad_norm": 1.0076805353164673, | |
| "learning_rate": 3.1191178536311284e-05, | |
| "loss": 0.9374, | |
| "mean_token_accuracy": 0.7605025995522737, | |
| "num_tokens": 507010506.0, | |
| "step": 2900 | |
| }, | |
| { | |
| "epoch": 0.7930640957945119, | |
| "grad_norm": 0.9303880929946899, | |
| "learning_rate": 3.1120943952802364e-05, | |
| "loss": 0.9236, | |
| "mean_token_accuracy": 0.763912508264184, | |
| "num_tokens": 508717678.0, | |
| "step": 2910 | |
| }, | |
| { | |
| "epoch": 0.7957894019656271, | |
| "grad_norm": 0.9282418489456177, | |
| "learning_rate": 3.105070936929344e-05, | |
| "loss": 0.9036, | |
| "mean_token_accuracy": 0.7691353503614664, | |
| "num_tokens": 510478982.0, | |
| "step": 2920 | |
| }, | |
| { | |
| "epoch": 0.7985147081367422, | |
| "grad_norm": 0.9317098259925842, | |
| "learning_rate": 3.0980474785784525e-05, | |
| "loss": 0.9013, | |
| "mean_token_accuracy": 0.7674754545092582, | |
| "num_tokens": 512223492.0, | |
| "step": 2930 | |
| }, | |
| { | |
| "epoch": 0.8012400143078574, | |
| "grad_norm": 0.8785368800163269, | |
| "learning_rate": 3.0910240202275605e-05, | |
| "loss": 0.9245, | |
| "mean_token_accuracy": 0.7631963776424527, | |
| "num_tokens": 513950826.0, | |
| "step": 2940 | |
| }, | |
| { | |
| "epoch": 0.8039653204789725, | |
| "grad_norm": 0.9449997544288635, | |
| "learning_rate": 3.0840005618766685e-05, | |
| "loss": 0.9511, | |
| "mean_token_accuracy": 0.7589579506777226, | |
| "num_tokens": 515747512.0, | |
| "step": 2950 | |
| }, | |
| { | |
| "epoch": 0.8066906266500877, | |
| "grad_norm": 0.9044075012207031, | |
| "learning_rate": 3.076977103525776e-05, | |
| "loss": 0.9251, | |
| "mean_token_accuracy": 0.7633269606158137, | |
| "num_tokens": 517564408.0, | |
| "step": 2960 | |
| }, | |
| { | |
| "epoch": 0.8094159328212028, | |
| "grad_norm": 0.8995125889778137, | |
| "learning_rate": 3.0699536451748846e-05, | |
| "loss": 0.8688, | |
| "mean_token_accuracy": 0.7750391457229853, | |
| "num_tokens": 519335113.0, | |
| "step": 2970 | |
| }, | |
| { | |
| "epoch": 0.812141238992318, | |
| "grad_norm": 0.8972451090812683, | |
| "learning_rate": 3.0629301868239926e-05, | |
| "loss": 0.9102, | |
| "mean_token_accuracy": 0.7668128840625286, | |
| "num_tokens": 521052694.0, | |
| "step": 2980 | |
| }, | |
| { | |
| "epoch": 0.8148665451634332, | |
| "grad_norm": 0.8758600354194641, | |
| "learning_rate": 3.0559067284731e-05, | |
| "loss": 0.9454, | |
| "mean_token_accuracy": 0.7591741172596812, | |
| "num_tokens": 522838886.0, | |
| "step": 2990 | |
| }, | |
| { | |
| "epoch": 0.8175918513345484, | |
| "grad_norm": 0.871060848236084, | |
| "learning_rate": 3.048883270122208e-05, | |
| "loss": 0.9286, | |
| "mean_token_accuracy": 0.762829508818686, | |
| "num_tokens": 524659688.0, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 0.8203171575056635, | |
| "grad_norm": 0.9444119334220886, | |
| "learning_rate": 3.0418598117713166e-05, | |
| "loss": 0.9142, | |
| "mean_token_accuracy": 0.7660003494471311, | |
| "num_tokens": 526428828.0, | |
| "step": 3010 | |
| }, | |
| { | |
| "epoch": 0.8230424636767787, | |
| "grad_norm": 0.9209753274917603, | |
| "learning_rate": 3.0348363534204243e-05, | |
| "loss": 0.9376, | |
| "mean_token_accuracy": 0.7604016859084368, | |
| "num_tokens": 528178357.0, | |
| "step": 3020 | |
| }, | |
| { | |
| "epoch": 0.8257677698478938, | |
| "grad_norm": 0.898345947265625, | |
| "learning_rate": 3.0278128950695323e-05, | |
| "loss": 0.9201, | |
| "mean_token_accuracy": 0.7637051574885845, | |
| "num_tokens": 529968661.0, | |
| "step": 3030 | |
| }, | |
| { | |
| "epoch": 0.828493076019009, | |
| "grad_norm": 0.9637786746025085, | |
| "learning_rate": 3.0207894367186407e-05, | |
| "loss": 0.925, | |
| "mean_token_accuracy": 0.7651786257512867, | |
| "num_tokens": 531735846.0, | |
| "step": 3040 | |
| }, | |
| { | |
| "epoch": 0.8312183821901241, | |
| "grad_norm": 0.9480170011520386, | |
| "learning_rate": 3.0137659783677484e-05, | |
| "loss": 0.9447, | |
| "mean_token_accuracy": 0.7601438457146287, | |
| "num_tokens": 533482605.0, | |
| "step": 3050 | |
| }, | |
| { | |
| "epoch": 0.8339436883612393, | |
| "grad_norm": 0.9402963519096375, | |
| "learning_rate": 3.0067425200168564e-05, | |
| "loss": 0.9299, | |
| "mean_token_accuracy": 0.7631770128384232, | |
| "num_tokens": 535172117.0, | |
| "step": 3060 | |
| }, | |
| { | |
| "epoch": 0.8366689945323545, | |
| "grad_norm": 0.9365580677986145, | |
| "learning_rate": 2.9997190616659644e-05, | |
| "loss": 0.9213, | |
| "mean_token_accuracy": 0.7642906453460455, | |
| "num_tokens": 536922982.0, | |
| "step": 3070 | |
| }, | |
| { | |
| "epoch": 0.8393943007034697, | |
| "grad_norm": 0.9361058473587036, | |
| "learning_rate": 2.9926956033150728e-05, | |
| "loss": 0.9286, | |
| "mean_token_accuracy": 0.7624955836683511, | |
| "num_tokens": 538697561.0, | |
| "step": 3080 | |
| }, | |
| { | |
| "epoch": 0.8421196068745849, | |
| "grad_norm": 0.9975050091743469, | |
| "learning_rate": 2.9856721449641805e-05, | |
| "loss": 0.9319, | |
| "mean_token_accuracy": 0.7618069407530129, | |
| "num_tokens": 540498809.0, | |
| "step": 3090 | |
| }, | |
| { | |
| "epoch": 0.8448449130457, | |
| "grad_norm": 0.9614945650100708, | |
| "learning_rate": 2.9786486866132885e-05, | |
| "loss": 0.9078, | |
| "mean_token_accuracy": 0.7663526112213731, | |
| "num_tokens": 542220573.0, | |
| "step": 3100 | |
| }, | |
| { | |
| "epoch": 0.8475702192168152, | |
| "grad_norm": 0.9452424049377441, | |
| "learning_rate": 2.971625228262396e-05, | |
| "loss": 0.9218, | |
| "mean_token_accuracy": 0.7639887780882418, | |
| "num_tokens": 543995979.0, | |
| "step": 3110 | |
| }, | |
| { | |
| "epoch": 0.8502955253879303, | |
| "grad_norm": 0.9348092079162598, | |
| "learning_rate": 2.964601769911505e-05, | |
| "loss": 0.9413, | |
| "mean_token_accuracy": 0.7585929285734891, | |
| "num_tokens": 545762046.0, | |
| "step": 3120 | |
| }, | |
| { | |
| "epoch": 0.8530208315590455, | |
| "grad_norm": 0.857044517993927, | |
| "learning_rate": 2.9575783115606125e-05, | |
| "loss": 0.9189, | |
| "mean_token_accuracy": 0.7647921906784176, | |
| "num_tokens": 547563120.0, | |
| "step": 3130 | |
| }, | |
| { | |
| "epoch": 0.8557461377301606, | |
| "grad_norm": 0.9445975422859192, | |
| "learning_rate": 2.9505548532097206e-05, | |
| "loss": 0.9354, | |
| "mean_token_accuracy": 0.7610112639144063, | |
| "num_tokens": 549276605.0, | |
| "step": 3140 | |
| }, | |
| { | |
| "epoch": 0.8584714439012758, | |
| "grad_norm": 0.9536585211753845, | |
| "learning_rate": 2.943531394858829e-05, | |
| "loss": 0.9257, | |
| "mean_token_accuracy": 0.7634840881451964, | |
| "num_tokens": 550991479.0, | |
| "step": 3150 | |
| }, | |
| { | |
| "epoch": 0.8611967500723909, | |
| "grad_norm": 0.9297323822975159, | |
| "learning_rate": 2.9365079365079366e-05, | |
| "loss": 0.9143, | |
| "mean_token_accuracy": 0.766822918318212, | |
| "num_tokens": 552777662.0, | |
| "step": 3160 | |
| }, | |
| { | |
| "epoch": 0.8639220562435062, | |
| "grad_norm": 1.0276936292648315, | |
| "learning_rate": 2.9294844781570446e-05, | |
| "loss": 0.9056, | |
| "mean_token_accuracy": 0.7674277478829026, | |
| "num_tokens": 554539356.0, | |
| "step": 3170 | |
| }, | |
| { | |
| "epoch": 0.8666473624146213, | |
| "grad_norm": 0.9300222992897034, | |
| "learning_rate": 2.9224610198061526e-05, | |
| "loss": 0.9676, | |
| "mean_token_accuracy": 0.7549269145354629, | |
| "num_tokens": 556212548.0, | |
| "step": 3180 | |
| }, | |
| { | |
| "epoch": 0.8693726685857365, | |
| "grad_norm": 0.9600440859794617, | |
| "learning_rate": 2.915437561455261e-05, | |
| "loss": 0.9341, | |
| "mean_token_accuracy": 0.7607308451086283, | |
| "num_tokens": 557950461.0, | |
| "step": 3190 | |
| }, | |
| { | |
| "epoch": 0.8720979747568516, | |
| "grad_norm": 0.9339573383331299, | |
| "learning_rate": 2.9084141031043687e-05, | |
| "loss": 0.9145, | |
| "mean_token_accuracy": 0.7650301210582257, | |
| "num_tokens": 559715926.0, | |
| "step": 3200 | |
| }, | |
| { | |
| "epoch": 0.8748232809279668, | |
| "grad_norm": 0.9827753305435181, | |
| "learning_rate": 2.9013906447534767e-05, | |
| "loss": 0.9332, | |
| "mean_token_accuracy": 0.7629943957552314, | |
| "num_tokens": 561479394.0, | |
| "step": 3210 | |
| }, | |
| { | |
| "epoch": 0.8775485870990819, | |
| "grad_norm": 0.9245224595069885, | |
| "learning_rate": 2.8943671864025844e-05, | |
| "loss": 0.9288, | |
| "mean_token_accuracy": 0.7627130763605237, | |
| "num_tokens": 563271872.0, | |
| "step": 3220 | |
| }, | |
| { | |
| "epoch": 0.8802738932701971, | |
| "grad_norm": 0.887617826461792, | |
| "learning_rate": 2.887343728051693e-05, | |
| "loss": 0.9231, | |
| "mean_token_accuracy": 0.7647191828116775, | |
| "num_tokens": 565002210.0, | |
| "step": 3230 | |
| }, | |
| { | |
| "epoch": 0.8829991994413122, | |
| "grad_norm": 0.9698552489280701, | |
| "learning_rate": 2.8803202697008008e-05, | |
| "loss": 0.8498, | |
| "mean_token_accuracy": 0.7792680401355028, | |
| "num_tokens": 566740287.0, | |
| "step": 3240 | |
| }, | |
| { | |
| "epoch": 0.8857245056124274, | |
| "grad_norm": 0.8935603499412537, | |
| "learning_rate": 2.8732968113499088e-05, | |
| "loss": 0.9517, | |
| "mean_token_accuracy": 0.7585014823824168, | |
| "num_tokens": 568560935.0, | |
| "step": 3250 | |
| }, | |
| { | |
| "epoch": 0.8884498117835425, | |
| "grad_norm": 1.0148096084594727, | |
| "learning_rate": 2.866273352999017e-05, | |
| "loss": 0.94, | |
| "mean_token_accuracy": 0.7607061000540852, | |
| "num_tokens": 570290159.0, | |
| "step": 3260 | |
| }, | |
| { | |
| "epoch": 0.8911751179546578, | |
| "grad_norm": 0.9350414872169495, | |
| "learning_rate": 2.8592498946481248e-05, | |
| "loss": 0.9458, | |
| "mean_token_accuracy": 0.7595778482034803, | |
| "num_tokens": 572076704.0, | |
| "step": 3270 | |
| }, | |
| { | |
| "epoch": 0.8939004241257729, | |
| "grad_norm": 0.8865498304367065, | |
| "learning_rate": 2.852226436297233e-05, | |
| "loss": 0.9528, | |
| "mean_token_accuracy": 0.7583590077236295, | |
| "num_tokens": 573841975.0, | |
| "step": 3280 | |
| }, | |
| { | |
| "epoch": 0.8966257302968881, | |
| "grad_norm": 0.9123432040214539, | |
| "learning_rate": 2.845202977946341e-05, | |
| "loss": 0.9276, | |
| "mean_token_accuracy": 0.7639707328751684, | |
| "num_tokens": 575587035.0, | |
| "step": 3290 | |
| }, | |
| { | |
| "epoch": 0.8993510364680032, | |
| "grad_norm": 0.8873813152313232, | |
| "learning_rate": 2.8381795195954492e-05, | |
| "loss": 0.9297, | |
| "mean_token_accuracy": 0.76226064004004, | |
| "num_tokens": 577285852.0, | |
| "step": 3300 | |
| }, | |
| { | |
| "epoch": 0.9020763426391184, | |
| "grad_norm": 0.9241533875465393, | |
| "learning_rate": 2.831156061244557e-05, | |
| "loss": 0.9416, | |
| "mean_token_accuracy": 0.7608016451820732, | |
| "num_tokens": 579085053.0, | |
| "step": 3310 | |
| }, | |
| { | |
| "epoch": 0.9048016488102335, | |
| "grad_norm": 0.8881911635398865, | |
| "learning_rate": 2.824132602893665e-05, | |
| "loss": 0.9152, | |
| "mean_token_accuracy": 0.7653098279610276, | |
| "num_tokens": 580874041.0, | |
| "step": 3320 | |
| }, | |
| { | |
| "epoch": 0.9075269549813487, | |
| "grad_norm": 0.9386590123176575, | |
| "learning_rate": 2.8171091445427726e-05, | |
| "loss": 0.9154, | |
| "mean_token_accuracy": 0.7642969543114304, | |
| "num_tokens": 582616757.0, | |
| "step": 3330 | |
| }, | |
| { | |
| "epoch": 0.9102522611524638, | |
| "grad_norm": 0.9226840138435364, | |
| "learning_rate": 2.8100856861918813e-05, | |
| "loss": 0.9081, | |
| "mean_token_accuracy": 0.7670234115794301, | |
| "num_tokens": 584370810.0, | |
| "step": 3340 | |
| }, | |
| { | |
| "epoch": 0.912977567323579, | |
| "grad_norm": 0.9441806077957153, | |
| "learning_rate": 2.803062227840989e-05, | |
| "loss": 0.9236, | |
| "mean_token_accuracy": 0.7639048263430596, | |
| "num_tokens": 586143330.0, | |
| "step": 3350 | |
| }, | |
| { | |
| "epoch": 0.9157028734946941, | |
| "grad_norm": 0.9037384986877441, | |
| "learning_rate": 2.796038769490097e-05, | |
| "loss": 0.8882, | |
| "mean_token_accuracy": 0.7715507194399833, | |
| "num_tokens": 587851884.0, | |
| "step": 3360 | |
| }, | |
| { | |
| "epoch": 0.9184281796658094, | |
| "grad_norm": 0.9478141665458679, | |
| "learning_rate": 2.7890153111392054e-05, | |
| "loss": 0.9175, | |
| "mean_token_accuracy": 0.7643080299720169, | |
| "num_tokens": 589601267.0, | |
| "step": 3370 | |
| }, | |
| { | |
| "epoch": 0.9211534858369245, | |
| "grad_norm": 1.0288023948669434, | |
| "learning_rate": 2.7819918527883134e-05, | |
| "loss": 0.9489, | |
| "mean_token_accuracy": 0.7579874385148286, | |
| "num_tokens": 591321205.0, | |
| "step": 3380 | |
| }, | |
| { | |
| "epoch": 0.9238787920080397, | |
| "grad_norm": 0.8367487192153931, | |
| "learning_rate": 2.774968394437421e-05, | |
| "loss": 0.8963, | |
| "mean_token_accuracy": 0.7705693520605564, | |
| "num_tokens": 593110103.0, | |
| "step": 3390 | |
| }, | |
| { | |
| "epoch": 0.9266040981791548, | |
| "grad_norm": 1.0031945705413818, | |
| "learning_rate": 2.767944936086529e-05, | |
| "loss": 0.9476, | |
| "mean_token_accuracy": 0.7592636797577142, | |
| "num_tokens": 594823039.0, | |
| "step": 3400 | |
| }, | |
| { | |
| "epoch": 0.92932940435027, | |
| "grad_norm": 0.901996374130249, | |
| "learning_rate": 2.7609214777356374e-05, | |
| "loss": 0.927, | |
| "mean_token_accuracy": 0.7638763342052698, | |
| "num_tokens": 596625643.0, | |
| "step": 3410 | |
| }, | |
| { | |
| "epoch": 0.9320547105213851, | |
| "grad_norm": 0.9304973483085632, | |
| "learning_rate": 2.753898019384745e-05, | |
| "loss": 0.947, | |
| "mean_token_accuracy": 0.7571751626208425, | |
| "num_tokens": 598409404.0, | |
| "step": 3420 | |
| }, | |
| { | |
| "epoch": 0.9347800166925003, | |
| "grad_norm": 1.1966147422790527, | |
| "learning_rate": 2.746874561033853e-05, | |
| "loss": 0.9304, | |
| "mean_token_accuracy": 0.7618606876581907, | |
| "num_tokens": 600131817.0, | |
| "step": 3430 | |
| }, | |
| { | |
| "epoch": 0.9375053228636154, | |
| "grad_norm": 0.8487194776535034, | |
| "learning_rate": 2.7398511026829608e-05, | |
| "loss": 0.9325, | |
| "mean_token_accuracy": 0.7624602910131216, | |
| "num_tokens": 601867823.0, | |
| "step": 3440 | |
| }, | |
| { | |
| "epoch": 0.9402306290347306, | |
| "grad_norm": 0.9328591227531433, | |
| "learning_rate": 2.7328276443320695e-05, | |
| "loss": 0.9096, | |
| "mean_token_accuracy": 0.7685537921264768, | |
| "num_tokens": 603591263.0, | |
| "step": 3450 | |
| }, | |
| { | |
| "epoch": 0.9429559352058458, | |
| "grad_norm": 0.9486989378929138, | |
| "learning_rate": 2.7258041859811772e-05, | |
| "loss": 0.9362, | |
| "mean_token_accuracy": 0.7623121970333159, | |
| "num_tokens": 605400422.0, | |
| "step": 3460 | |
| }, | |
| { | |
| "epoch": 0.945681241376961, | |
| "grad_norm": 0.9277918934822083, | |
| "learning_rate": 2.7187807276302852e-05, | |
| "loss": 0.9289, | |
| "mean_token_accuracy": 0.7633737292140722, | |
| "num_tokens": 607180330.0, | |
| "step": 3470 | |
| }, | |
| { | |
| "epoch": 0.9484065475480761, | |
| "grad_norm": 0.8774585127830505, | |
| "learning_rate": 2.7117572692793936e-05, | |
| "loss": 0.8978, | |
| "mean_token_accuracy": 0.7696298151277006, | |
| "num_tokens": 608937194.0, | |
| "step": 3480 | |
| }, | |
| { | |
| "epoch": 0.9511318537191913, | |
| "grad_norm": 0.955440104007721, | |
| "learning_rate": 2.7047338109285016e-05, | |
| "loss": 0.9373, | |
| "mean_token_accuracy": 0.7603609010577201, | |
| "num_tokens": 610676168.0, | |
| "step": 3490 | |
| }, | |
| { | |
| "epoch": 0.9538571598903064, | |
| "grad_norm": 0.8963478207588196, | |
| "learning_rate": 2.6977103525776093e-05, | |
| "loss": 0.8916, | |
| "mean_token_accuracy": 0.7708989802747965, | |
| "num_tokens": 612491555.0, | |
| "step": 3500 | |
| }, | |
| { | |
| "epoch": 0.9565824660614216, | |
| "grad_norm": 0.9914854764938354, | |
| "learning_rate": 2.6906868942267173e-05, | |
| "loss": 0.9422, | |
| "mean_token_accuracy": 0.7594701206311584, | |
| "num_tokens": 614310820.0, | |
| "step": 3510 | |
| }, | |
| { | |
| "epoch": 0.9593077722325367, | |
| "grad_norm": 0.9009816646575928, | |
| "learning_rate": 2.6836634358758256e-05, | |
| "loss": 0.9448, | |
| "mean_token_accuracy": 0.7597188876941801, | |
| "num_tokens": 616071087.0, | |
| "step": 3520 | |
| }, | |
| { | |
| "epoch": 0.9620330784036519, | |
| "grad_norm": 0.9539654850959778, | |
| "learning_rate": 2.6766399775249333e-05, | |
| "loss": 0.896, | |
| "mean_token_accuracy": 0.7685511685907841, | |
| "num_tokens": 617769166.0, | |
| "step": 3530 | |
| }, | |
| { | |
| "epoch": 0.964758384574767, | |
| "grad_norm": 0.9612520337104797, | |
| "learning_rate": 2.6696165191740413e-05, | |
| "loss": 0.9235, | |
| "mean_token_accuracy": 0.7635913614183665, | |
| "num_tokens": 619538232.0, | |
| "step": 3540 | |
| }, | |
| { | |
| "epoch": 0.9674836907458823, | |
| "grad_norm": 0.8072157502174377, | |
| "learning_rate": 2.662593060823149e-05, | |
| "loss": 0.9138, | |
| "mean_token_accuracy": 0.7675083599984646, | |
| "num_tokens": 621303314.0, | |
| "step": 3550 | |
| }, | |
| { | |
| "epoch": 0.9702089969169974, | |
| "grad_norm": 0.8455495238304138, | |
| "learning_rate": 2.6555696024722577e-05, | |
| "loss": 0.9015, | |
| "mean_token_accuracy": 0.7690754882991314, | |
| "num_tokens": 623100776.0, | |
| "step": 3560 | |
| }, | |
| { | |
| "epoch": 0.9729343030881126, | |
| "grad_norm": 0.9044631123542786, | |
| "learning_rate": 2.6485461441213654e-05, | |
| "loss": 0.961, | |
| "mean_token_accuracy": 0.7549854224547744, | |
| "num_tokens": 624821413.0, | |
| "step": 3570 | |
| }, | |
| { | |
| "epoch": 0.9756596092592277, | |
| "grad_norm": 0.9768007397651672, | |
| "learning_rate": 2.6415226857704734e-05, | |
| "loss": 0.9319, | |
| "mean_token_accuracy": 0.7615104261785746, | |
| "num_tokens": 626628545.0, | |
| "step": 3580 | |
| }, | |
| { | |
| "epoch": 0.9783849154303429, | |
| "grad_norm": 0.8489816188812256, | |
| "learning_rate": 2.6344992274195818e-05, | |
| "loss": 0.9142, | |
| "mean_token_accuracy": 0.7664257822558284, | |
| "num_tokens": 628425036.0, | |
| "step": 3590 | |
| }, | |
| { | |
| "epoch": 0.981110221601458, | |
| "grad_norm": 0.8612228631973267, | |
| "learning_rate": 2.6274757690686898e-05, | |
| "loss": 0.8872, | |
| "mean_token_accuracy": 0.7699680911377073, | |
| "num_tokens": 630092739.0, | |
| "step": 3600 | |
| }, | |
| { | |
| "epoch": 0.9838355277725732, | |
| "grad_norm": 1.0233787298202515, | |
| "learning_rate": 2.6204523107177975e-05, | |
| "loss": 0.9362, | |
| "mean_token_accuracy": 0.7623144701123238, | |
| "num_tokens": 631791605.0, | |
| "step": 3610 | |
| }, | |
| { | |
| "epoch": 0.9865608339436883, | |
| "grad_norm": 2.674288034439087, | |
| "learning_rate": 2.6134288523669055e-05, | |
| "loss": 0.928, | |
| "mean_token_accuracy": 0.7619699375703931, | |
| "num_tokens": 633554055.0, | |
| "step": 3620 | |
| }, | |
| { | |
| "epoch": 0.9892861401148035, | |
| "grad_norm": 0.9271607398986816, | |
| "learning_rate": 2.606405394016014e-05, | |
| "loss": 0.8983, | |
| "mean_token_accuracy": 0.7686384240165353, | |
| "num_tokens": 635269789.0, | |
| "step": 3630 | |
| }, | |
| { | |
| "epoch": 0.9920114462859186, | |
| "grad_norm": 0.9083386063575745, | |
| "learning_rate": 2.5993819356651215e-05, | |
| "loss": 0.9123, | |
| "mean_token_accuracy": 0.766354302316904, | |
| "num_tokens": 637078092.0, | |
| "step": 3640 | |
| }, | |
| { | |
| "epoch": 0.9947367524570339, | |
| "grad_norm": 0.9537090063095093, | |
| "learning_rate": 2.5923584773142296e-05, | |
| "loss": 0.8932, | |
| "mean_token_accuracy": 0.7706195389851928, | |
| "num_tokens": 638813927.0, | |
| "step": 3650 | |
| }, | |
| { | |
| "epoch": 0.997462058628149, | |
| "grad_norm": 1.0616086721420288, | |
| "learning_rate": 2.5853350189633372e-05, | |
| "loss": 0.9159, | |
| "mean_token_accuracy": 0.7659485065378249, | |
| "num_tokens": 640594007.0, | |
| "step": 3660 | |
| }, | |
| { | |
| "epoch": 1.0, | |
| "grad_norm": 0.5164661407470703, | |
| "learning_rate": 2.578311560612446e-05, | |
| "loss": 0.8237, | |
| "mean_token_accuracy": 0.7708987323629776, | |
| "num_tokens": 642209396.0, | |
| "step": 3670 | |
| } | |
| ], | |
| "logging_steps": 10, | |
| "max_steps": 7340, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 2, | |
| "save_steps": 500.0, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": false | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 1.1108118326981394e+19, | |
| "train_batch_size": 8, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |
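The block above is the complete `trainer_state.json` that the Hugging Face `Trainer` writes alongside a checkpoint: `log_history` holds one record per `logging_steps` (10) interval, and the trailing fields record the run configuration (`max_steps` 7340 over `num_train_epochs` 2, a checkpoint every `save_steps` 500, `train_batch_size` 8 per device). As a quick way to inspect the curves it contains, here is a minimal sketch that loads the file and plots loss and mean token accuracy against step; the filename `trainer_state.json` and the `matplotlib` dependency are assumptions for illustration, not part of the file itself.

```python
# Minimal sketch: visualize the log_history above.
# Assumes the JSON is saved as "trainer_state.json" (hypothetical path)
# and that matplotlib is installed.
import json

import matplotlib.pyplot as plt

with open("trainer_state.json") as f:
    state = json.load(f)

# Every record in this file carries step, loss, and mean_token_accuracy.
steps = [entry["step"] for entry in state["log_history"]]
loss = [entry["loss"] for entry in state["log_history"]]
acc = [entry["mean_token_accuracy"] for entry in state["log_history"]]

fig, (ax_loss, ax_acc) = plt.subplots(1, 2, figsize=(10, 4))
ax_loss.plot(steps, loss)
ax_loss.set_xlabel("step")
ax_loss.set_ylabel("training loss")
ax_acc.plot(steps, acc)
ax_acc.set_xlabel("step")
ax_acc.set_ylabel("mean token accuracy")
fig.tight_layout()
plt.show()
```

On this run the plot would show the loss flattening near 0.9 and mean token accuracy near 0.76 by the end of epoch 1 (step 3670), roughly the halfway point of the scheduled 7340 steps.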