{ "best_metric": 0.6229148507118225, "best_model_checkpoint": "model/checkpoints/run1-csharp-codegen/checkpoint-22000", "epoch": 4.999558732680258, "eval_steps": 1000, "global_step": 28325, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0017650692789691995, "grad_norm": 5.503284931182861, "learning_rate": 4.9982347749338044e-05, "loss": 1.3127, "step": 10 }, { "epoch": 0.003530138557938399, "grad_norm": 1.4239877462387085, "learning_rate": 4.9964695498676085e-05, "loss": 1.2442, "step": 20 }, { "epoch": 0.0052952078369075985, "grad_norm": 1.141628384590149, "learning_rate": 4.9947043248014126e-05, "loss": 1.1426, "step": 30 }, { "epoch": 0.007060277115876798, "grad_norm": 4.818049907684326, "learning_rate": 4.992939099735217e-05, "loss": 1.2138, "step": 40 }, { "epoch": 0.008825346394845997, "grad_norm": 2.6751058101654053, "learning_rate": 4.991173874669021e-05, "loss": 1.2369, "step": 50 }, { "epoch": 0.010590415673815197, "grad_norm": 1.0971022844314575, "learning_rate": 4.989408649602825e-05, "loss": 1.1062, "step": 60 }, { "epoch": 0.012355484952784396, "grad_norm": 3.7238683700561523, "learning_rate": 4.987643424536629e-05, "loss": 1.1359, "step": 70 }, { "epoch": 0.014120554231753596, "grad_norm": 0.988836944103241, "learning_rate": 4.985878199470433e-05, "loss": 1.0044, "step": 80 }, { "epoch": 0.015885623510722795, "grad_norm": 3.4970245361328125, "learning_rate": 4.984112974404237e-05, "loss": 1.0781, "step": 90 }, { "epoch": 0.017650692789691995, "grad_norm": 1.4898335933685303, "learning_rate": 4.9823477493380414e-05, "loss": 1.0369, "step": 100 }, { "epoch": 0.019415762068661194, "grad_norm": 3.7124125957489014, "learning_rate": 4.980582524271845e-05, "loss": 1.0744, "step": 110 }, { "epoch": 0.021180831347630394, "grad_norm": 1.2735360860824585, "learning_rate": 4.978817299205649e-05, "loss": 1.0761, "step": 120 }, { "epoch": 0.022945900626599593, "grad_norm": 1.1731593608856201, "learning_rate": 4.977052074139453e-05, "loss": 1.0761, "step": 130 }, { "epoch": 0.024710969905568793, "grad_norm": 1.4130672216415405, "learning_rate": 4.9752868490732565e-05, "loss": 0.9775, "step": 140 }, { "epoch": 0.026476039184537992, "grad_norm": 3.3941397666931152, "learning_rate": 4.9735216240070606e-05, "loss": 1.1248, "step": 150 }, { "epoch": 0.028241108463507192, "grad_norm": 2.613243818283081, "learning_rate": 4.971756398940865e-05, "loss": 1.0177, "step": 160 }, { "epoch": 0.03000617774247639, "grad_norm": 4.421411991119385, "learning_rate": 4.969991173874669e-05, "loss": 1.039, "step": 170 }, { "epoch": 0.03177124702144559, "grad_norm": 3.040614128112793, "learning_rate": 4.968225948808473e-05, "loss": 1.0389, "step": 180 }, { "epoch": 0.033536316300414794, "grad_norm": 2.769402265548706, "learning_rate": 4.966460723742277e-05, "loss": 0.9449, "step": 190 }, { "epoch": 0.03530138557938399, "grad_norm": 1.115365743637085, "learning_rate": 4.964695498676081e-05, "loss": 0.9773, "step": 200 }, { "epoch": 0.03706645485835319, "grad_norm": 1.0562186241149902, "learning_rate": 4.962930273609885e-05, "loss": 1.0351, "step": 210 }, { "epoch": 0.03883152413732239, "grad_norm": 1.2428004741668701, "learning_rate": 4.9611650485436894e-05, "loss": 1.0004, "step": 220 }, { "epoch": 0.04059659341629159, "grad_norm": 2.263096809387207, "learning_rate": 4.9593998234774935e-05, "loss": 1.1012, "step": 230 }, { "epoch": 0.04236166269526079, "grad_norm": 3.0100183486938477, "learning_rate": 4.9576345984112976e-05, "loss": 
1.0645, "step": 240 }, { "epoch": 0.04412673197422999, "grad_norm": 1.0053493976593018, "learning_rate": 4.955869373345102e-05, "loss": 0.9729, "step": 250 }, { "epoch": 0.04589180125319919, "grad_norm": 1.2269765138626099, "learning_rate": 4.954104148278906e-05, "loss": 0.94, "step": 260 }, { "epoch": 0.04765687053216839, "grad_norm": 0.9635799527168274, "learning_rate": 4.95233892321271e-05, "loss": 0.9649, "step": 270 }, { "epoch": 0.049421939811137586, "grad_norm": 1.4033010005950928, "learning_rate": 4.950573698146514e-05, "loss": 1.0486, "step": 280 }, { "epoch": 0.05118700909010679, "grad_norm": 1.3257986307144165, "learning_rate": 4.948808473080318e-05, "loss": 0.9739, "step": 290 }, { "epoch": 0.052952078369075985, "grad_norm": 3.656306028366089, "learning_rate": 4.947043248014122e-05, "loss": 0.9864, "step": 300 }, { "epoch": 0.05471714764804519, "grad_norm": 3.2791707515716553, "learning_rate": 4.9452780229479264e-05, "loss": 0.9734, "step": 310 }, { "epoch": 0.056482216927014384, "grad_norm": 1.0598105192184448, "learning_rate": 4.9435127978817306e-05, "loss": 0.9014, "step": 320 }, { "epoch": 0.05824728620598359, "grad_norm": 2.3721909523010254, "learning_rate": 4.941747572815535e-05, "loss": 1.0495, "step": 330 }, { "epoch": 0.06001235548495278, "grad_norm": 1.3134008646011353, "learning_rate": 4.939982347749338e-05, "loss": 0.9764, "step": 340 }, { "epoch": 0.061777424763921986, "grad_norm": 2.561619758605957, "learning_rate": 4.938217122683142e-05, "loss": 0.9945, "step": 350 }, { "epoch": 0.06354249404289118, "grad_norm": 4.891090393066406, "learning_rate": 4.936451897616946e-05, "loss": 1.1421, "step": 360 }, { "epoch": 0.06530756332186038, "grad_norm": 1.0825132131576538, "learning_rate": 4.9346866725507504e-05, "loss": 0.9436, "step": 370 }, { "epoch": 0.06707263260082959, "grad_norm": 2.776716947555542, "learning_rate": 4.9329214474845546e-05, "loss": 1.0025, "step": 380 }, { "epoch": 0.06883770187979878, "grad_norm": 0.9759344458580017, "learning_rate": 4.931156222418359e-05, "loss": 0.8608, "step": 390 }, { "epoch": 0.07060277115876798, "grad_norm": 1.0649555921554565, "learning_rate": 4.929390997352162e-05, "loss": 0.9983, "step": 400 }, { "epoch": 0.07236784043773718, "grad_norm": 2.2306652069091797, "learning_rate": 4.927625772285966e-05, "loss": 0.9451, "step": 410 }, { "epoch": 0.07413290971670639, "grad_norm": 3.1723175048828125, "learning_rate": 4.9258605472197703e-05, "loss": 0.8851, "step": 420 }, { "epoch": 0.07589797899567557, "grad_norm": 1.0387526750564575, "learning_rate": 4.9240953221535745e-05, "loss": 0.8624, "step": 430 }, { "epoch": 0.07766304827464478, "grad_norm": 2.3691160678863525, "learning_rate": 4.9223300970873786e-05, "loss": 0.9315, "step": 440 }, { "epoch": 0.07942811755361398, "grad_norm": 2.692945718765259, "learning_rate": 4.920564872021183e-05, "loss": 0.8513, "step": 450 }, { "epoch": 0.08119318683258318, "grad_norm": 3.980057954788208, "learning_rate": 4.918799646954987e-05, "loss": 0.8854, "step": 460 }, { "epoch": 0.08295825611155237, "grad_norm": 2.8114423751831055, "learning_rate": 4.917034421888791e-05, "loss": 1.0698, "step": 470 }, { "epoch": 0.08472332539052158, "grad_norm": 1.0136396884918213, "learning_rate": 4.915269196822595e-05, "loss": 0.9141, "step": 480 }, { "epoch": 0.08648839466949078, "grad_norm": 0.8035016059875488, "learning_rate": 4.913503971756399e-05, "loss": 0.9708, "step": 490 }, { "epoch": 0.08825346394845998, "grad_norm": 3.6212997436523438, "learning_rate": 4.911738746690203e-05, "loss": 0.9765, "step": 
500 }, { "epoch": 0.09001853322742917, "grad_norm": 1.0750012397766113, "learning_rate": 4.9099735216240074e-05, "loss": 0.9412, "step": 510 }, { "epoch": 0.09178360250639837, "grad_norm": 0.9042845368385315, "learning_rate": 4.9082082965578115e-05, "loss": 0.863, "step": 520 }, { "epoch": 0.09354867178536758, "grad_norm": 1.1494684219360352, "learning_rate": 4.9064430714916156e-05, "loss": 0.9759, "step": 530 }, { "epoch": 0.09531374106433678, "grad_norm": 1.2344112396240234, "learning_rate": 4.90467784642542e-05, "loss": 0.9703, "step": 540 }, { "epoch": 0.09707881034330597, "grad_norm": 3.4947280883789062, "learning_rate": 4.902912621359224e-05, "loss": 0.9009, "step": 550 }, { "epoch": 0.09884387962227517, "grad_norm": 2.9842255115509033, "learning_rate": 4.901147396293028e-05, "loss": 0.955, "step": 560 }, { "epoch": 0.10060894890124437, "grad_norm": 2.9984028339385986, "learning_rate": 4.899382171226832e-05, "loss": 0.8698, "step": 570 }, { "epoch": 0.10237401818021358, "grad_norm": 1.9957551956176758, "learning_rate": 4.8976169461606355e-05, "loss": 0.9207, "step": 580 }, { "epoch": 0.10413908745918277, "grad_norm": 2.6019134521484375, "learning_rate": 4.8958517210944396e-05, "loss": 0.9672, "step": 590 }, { "epoch": 0.10590415673815197, "grad_norm": 1.1441612243652344, "learning_rate": 4.894086496028244e-05, "loss": 0.904, "step": 600 }, { "epoch": 0.10766922601712117, "grad_norm": 1.24358069896698, "learning_rate": 4.892321270962048e-05, "loss": 0.9852, "step": 610 }, { "epoch": 0.10943429529609038, "grad_norm": 1.109772801399231, "learning_rate": 4.890556045895852e-05, "loss": 0.9899, "step": 620 }, { "epoch": 0.11119936457505956, "grad_norm": 3.4650866985321045, "learning_rate": 4.888790820829656e-05, "loss": 0.8353, "step": 630 }, { "epoch": 0.11296443385402877, "grad_norm": 2.650848865509033, "learning_rate": 4.88702559576346e-05, "loss": 0.8749, "step": 640 }, { "epoch": 0.11472950313299797, "grad_norm": 0.7682311534881592, "learning_rate": 4.885260370697264e-05, "loss": 0.8335, "step": 650 }, { "epoch": 0.11649457241196717, "grad_norm": 4.478682994842529, "learning_rate": 4.8834951456310684e-05, "loss": 0.8194, "step": 660 }, { "epoch": 0.11825964169093636, "grad_norm": 1.0624170303344727, "learning_rate": 4.881729920564872e-05, "loss": 0.8129, "step": 670 }, { "epoch": 0.12002471096990557, "grad_norm": 0.9311354160308838, "learning_rate": 4.879964695498676e-05, "loss": 0.8108, "step": 680 }, { "epoch": 0.12178978024887477, "grad_norm": 0.8768683671951294, "learning_rate": 4.87819947043248e-05, "loss": 0.9004, "step": 690 }, { "epoch": 0.12355484952784397, "grad_norm": 2.0419557094573975, "learning_rate": 4.876434245366284e-05, "loss": 0.9701, "step": 700 }, { "epoch": 0.12531991880681317, "grad_norm": 2.4168951511383057, "learning_rate": 4.874669020300088e-05, "loss": 0.9429, "step": 710 }, { "epoch": 0.12708498808578236, "grad_norm": 0.8814497590065002, "learning_rate": 4.8729037952338924e-05, "loss": 0.8575, "step": 720 }, { "epoch": 0.12885005736475158, "grad_norm": 1.1425260305404663, "learning_rate": 4.8711385701676965e-05, "loss": 0.8663, "step": 730 }, { "epoch": 0.13061512664372077, "grad_norm": 0.9289985299110413, "learning_rate": 4.8693733451015007e-05, "loss": 0.9826, "step": 740 }, { "epoch": 0.13238019592268996, "grad_norm": 3.505676746368408, "learning_rate": 4.867608120035305e-05, "loss": 1.0716, "step": 750 }, { "epoch": 0.13414526520165918, "grad_norm": 1.1043481826782227, "learning_rate": 4.865842894969109e-05, "loss": 0.9505, "step": 760 }, { "epoch": 
0.13591033448062836, "grad_norm": 2.4004900455474854, "learning_rate": 4.864077669902913e-05, "loss": 0.9706, "step": 770 }, { "epoch": 0.13767540375959755, "grad_norm": 2.9325432777404785, "learning_rate": 4.862312444836717e-05, "loss": 0.8697, "step": 780 }, { "epoch": 0.13944047303856677, "grad_norm": 2.5764336585998535, "learning_rate": 4.860547219770521e-05, "loss": 0.8022, "step": 790 }, { "epoch": 0.14120554231753596, "grad_norm": 1.0109184980392456, "learning_rate": 4.8587819947043253e-05, "loss": 0.8748, "step": 800 }, { "epoch": 0.14297061159650518, "grad_norm": 1.110236406326294, "learning_rate": 4.8570167696381295e-05, "loss": 0.8522, "step": 810 }, { "epoch": 0.14473568087547437, "grad_norm": 3.5498907566070557, "learning_rate": 4.855251544571933e-05, "loss": 0.9105, "step": 820 }, { "epoch": 0.14650075015444355, "grad_norm": 3.878492832183838, "learning_rate": 4.853486319505737e-05, "loss": 0.8466, "step": 830 }, { "epoch": 0.14826581943341277, "grad_norm": 2.681072235107422, "learning_rate": 4.851721094439541e-05, "loss": 0.8786, "step": 840 }, { "epoch": 0.15003088871238196, "grad_norm": 4.230712890625, "learning_rate": 4.849955869373345e-05, "loss": 0.8864, "step": 850 }, { "epoch": 0.15179595799135115, "grad_norm": 3.1931087970733643, "learning_rate": 4.8481906443071494e-05, "loss": 0.9766, "step": 860 }, { "epoch": 0.15356102727032037, "grad_norm": 2.8019542694091797, "learning_rate": 4.8464254192409535e-05, "loss": 0.908, "step": 870 }, { "epoch": 0.15532609654928956, "grad_norm": 2.4553959369659424, "learning_rate": 4.8446601941747576e-05, "loss": 0.9001, "step": 880 }, { "epoch": 0.15709116582825877, "grad_norm": 3.005300283432007, "learning_rate": 4.842894969108562e-05, "loss": 0.9223, "step": 890 }, { "epoch": 0.15885623510722796, "grad_norm": 1.1253565549850464, "learning_rate": 4.841129744042366e-05, "loss": 0.8084, "step": 900 }, { "epoch": 0.16062130438619715, "grad_norm": 2.9257609844207764, "learning_rate": 4.83936451897617e-05, "loss": 0.8003, "step": 910 }, { "epoch": 0.16238637366516637, "grad_norm": 2.794377326965332, "learning_rate": 4.837599293909974e-05, "loss": 0.7103, "step": 920 }, { "epoch": 0.16415144294413556, "grad_norm": 0.9524262547492981, "learning_rate": 4.8358340688437775e-05, "loss": 0.8484, "step": 930 }, { "epoch": 0.16591651222310475, "grad_norm": 3.1264827251434326, "learning_rate": 4.8340688437775816e-05, "loss": 0.8437, "step": 940 }, { "epoch": 0.16768158150207396, "grad_norm": 3.1434571743011475, "learning_rate": 4.832303618711386e-05, "loss": 0.9207, "step": 950 }, { "epoch": 0.16944665078104315, "grad_norm": 3.0784549713134766, "learning_rate": 4.83053839364519e-05, "loss": 0.8078, "step": 960 }, { "epoch": 0.17121172006001237, "grad_norm": 0.7679011821746826, "learning_rate": 4.828773168578994e-05, "loss": 0.7875, "step": 970 }, { "epoch": 0.17297678933898156, "grad_norm": 2.534778356552124, "learning_rate": 4.827007943512798e-05, "loss": 0.9637, "step": 980 }, { "epoch": 0.17474185861795075, "grad_norm": 0.9244309067726135, "learning_rate": 4.825242718446602e-05, "loss": 0.9574, "step": 990 }, { "epoch": 0.17650692789691996, "grad_norm": 3.150660753250122, "learning_rate": 4.823477493380406e-05, "loss": 0.9187, "step": 1000 }, { "epoch": 0.17650692789691996, "eval_loss": 0.8387640714645386, "eval_runtime": 591.4545, "eval_samples_per_second": 47.894, "eval_steps_per_second": 2.396, "eval_token_accuracy": 0.0005117090228854299, "step": 1000 }, { "epoch": 0.17827199717588915, "grad_norm": 1.0762163400650024, "learning_rate": 
4.8217122683142104e-05, "loss": 0.9378, "step": 1010 }, { "epoch": 0.18003706645485834, "grad_norm": 2.823103904724121, "learning_rate": 4.8199470432480145e-05, "loss": 0.7404, "step": 1020 }, { "epoch": 0.18180213573382756, "grad_norm": 2.402489423751831, "learning_rate": 4.8181818181818186e-05, "loss": 0.9427, "step": 1030 }, { "epoch": 0.18356720501279675, "grad_norm": 2.4162659645080566, "learning_rate": 4.816416593115623e-05, "loss": 0.8083, "step": 1040 }, { "epoch": 0.18533227429176596, "grad_norm": 2.9245927333831787, "learning_rate": 4.814651368049427e-05, "loss": 0.9984, "step": 1050 }, { "epoch": 0.18709734357073515, "grad_norm": 0.8261721730232239, "learning_rate": 4.81288614298323e-05, "loss": 0.7679, "step": 1060 }, { "epoch": 0.18886241284970434, "grad_norm": 2.2313833236694336, "learning_rate": 4.8111209179170344e-05, "loss": 0.7864, "step": 1070 }, { "epoch": 0.19062748212867356, "grad_norm": 2.17639422416687, "learning_rate": 4.8093556928508385e-05, "loss": 0.8615, "step": 1080 }, { "epoch": 0.19239255140764275, "grad_norm": 1.0018310546875, "learning_rate": 4.8075904677846426e-05, "loss": 0.955, "step": 1090 }, { "epoch": 0.19415762068661194, "grad_norm": 0.93125319480896, "learning_rate": 4.805825242718447e-05, "loss": 0.9129, "step": 1100 }, { "epoch": 0.19592268996558115, "grad_norm": 2.5495829582214355, "learning_rate": 4.804060017652251e-05, "loss": 0.8793, "step": 1110 }, { "epoch": 0.19768775924455034, "grad_norm": 0.8696863055229187, "learning_rate": 4.802294792586055e-05, "loss": 0.9327, "step": 1120 }, { "epoch": 0.19945282852351956, "grad_norm": 0.8448121547698975, "learning_rate": 4.800529567519859e-05, "loss": 1.016, "step": 1130 }, { "epoch": 0.20121789780248875, "grad_norm": 2.590965747833252, "learning_rate": 4.798764342453663e-05, "loss": 1.0176, "step": 1140 }, { "epoch": 0.20298296708145794, "grad_norm": 2.4967727661132812, "learning_rate": 4.796999117387467e-05, "loss": 0.894, "step": 1150 }, { "epoch": 0.20474803636042715, "grad_norm": 2.591536521911621, "learning_rate": 4.7952338923212714e-05, "loss": 0.8355, "step": 1160 }, { "epoch": 0.20651310563939634, "grad_norm": 0.718768298625946, "learning_rate": 4.7934686672550755e-05, "loss": 0.8619, "step": 1170 }, { "epoch": 0.20827817491836553, "grad_norm": 2.2282583713531494, "learning_rate": 4.79170344218888e-05, "loss": 0.9338, "step": 1180 }, { "epoch": 0.21004324419733475, "grad_norm": 0.8208143711090088, "learning_rate": 4.789938217122684e-05, "loss": 0.8617, "step": 1190 }, { "epoch": 0.21180831347630394, "grad_norm": 0.9473330974578857, "learning_rate": 4.788172992056487e-05, "loss": 0.8861, "step": 1200 }, { "epoch": 0.21357338275527316, "grad_norm": 1.0840715169906616, "learning_rate": 4.786407766990291e-05, "loss": 0.966, "step": 1210 }, { "epoch": 0.21533845203424234, "grad_norm": 1.0690734386444092, "learning_rate": 4.7846425419240954e-05, "loss": 0.8333, "step": 1220 }, { "epoch": 0.21710352131321153, "grad_norm": 2.4799931049346924, "learning_rate": 4.7828773168578996e-05, "loss": 0.7794, "step": 1230 }, { "epoch": 0.21886859059218075, "grad_norm": 1.1045414209365845, "learning_rate": 4.781112091791704e-05, "loss": 0.9114, "step": 1240 }, { "epoch": 0.22063365987114994, "grad_norm": 3.1106817722320557, "learning_rate": 4.779346866725508e-05, "loss": 0.8363, "step": 1250 }, { "epoch": 0.22239872915011913, "grad_norm": 3.011002779006958, "learning_rate": 4.777581641659312e-05, "loss": 0.8383, "step": 1260 }, { "epoch": 0.22416379842908835, "grad_norm": 2.440241575241089, "learning_rate": 
4.775816416593116e-05, "loss": 0.8405, "step": 1270 }, { "epoch": 0.22592886770805753, "grad_norm": 2.4338467121124268, "learning_rate": 4.77405119152692e-05, "loss": 0.8317, "step": 1280 }, { "epoch": 0.22769393698702675, "grad_norm": 0.8372032642364502, "learning_rate": 4.7722859664607236e-05, "loss": 0.8259, "step": 1290 }, { "epoch": 0.22945900626599594, "grad_norm": 2.346604824066162, "learning_rate": 4.770520741394528e-05, "loss": 0.8471, "step": 1300 }, { "epoch": 0.23122407554496513, "grad_norm": 1.1082093715667725, "learning_rate": 4.768755516328332e-05, "loss": 0.8924, "step": 1310 }, { "epoch": 0.23298914482393435, "grad_norm": 0.9796193838119507, "learning_rate": 4.766990291262136e-05, "loss": 0.8913, "step": 1320 }, { "epoch": 0.23475421410290354, "grad_norm": 2.4977235794067383, "learning_rate": 4.76522506619594e-05, "loss": 0.9109, "step": 1330 }, { "epoch": 0.23651928338187272, "grad_norm": 3.6340599060058594, "learning_rate": 4.763459841129744e-05, "loss": 1.0204, "step": 1340 }, { "epoch": 0.23828435266084194, "grad_norm": 0.9401459097862244, "learning_rate": 4.761694616063548e-05, "loss": 0.922, "step": 1350 }, { "epoch": 0.24004942193981113, "grad_norm": 3.333784580230713, "learning_rate": 4.7599293909973524e-05, "loss": 0.7972, "step": 1360 }, { "epoch": 0.24181449121878035, "grad_norm": 2.373687505722046, "learning_rate": 4.7581641659311565e-05, "loss": 0.8351, "step": 1370 }, { "epoch": 0.24357956049774954, "grad_norm": 1.3305423259735107, "learning_rate": 4.7563989408649606e-05, "loss": 1.09, "step": 1380 }, { "epoch": 0.24534462977671873, "grad_norm": 0.9417628049850464, "learning_rate": 4.754633715798765e-05, "loss": 0.8717, "step": 1390 }, { "epoch": 0.24710969905568794, "grad_norm": 1.0593584775924683, "learning_rate": 4.752868490732569e-05, "loss": 0.7458, "step": 1400 }, { "epoch": 0.24887476833465713, "grad_norm": 0.8886284232139587, "learning_rate": 4.751103265666373e-05, "loss": 0.8889, "step": 1410 }, { "epoch": 0.25063983761362635, "grad_norm": 2.407033681869507, "learning_rate": 4.749338040600177e-05, "loss": 0.9037, "step": 1420 }, { "epoch": 0.25240490689259554, "grad_norm": 1.047439694404602, "learning_rate": 4.747572815533981e-05, "loss": 0.8975, "step": 1430 }, { "epoch": 0.2541699761715647, "grad_norm": 3.1553778648376465, "learning_rate": 4.745807590467785e-05, "loss": 0.9644, "step": 1440 }, { "epoch": 0.2559350454505339, "grad_norm": 3.748414993286133, "learning_rate": 4.7440423654015894e-05, "loss": 0.9116, "step": 1450 }, { "epoch": 0.25770011472950316, "grad_norm": 1.0617207288742065, "learning_rate": 4.742277140335393e-05, "loss": 0.9632, "step": 1460 }, { "epoch": 0.25946518400847235, "grad_norm": 1.1011401414871216, "learning_rate": 4.740511915269197e-05, "loss": 0.9103, "step": 1470 }, { "epoch": 0.26123025328744154, "grad_norm": 3.420823097229004, "learning_rate": 4.738746690203001e-05, "loss": 0.849, "step": 1480 }, { "epoch": 0.2629953225664107, "grad_norm": 3.4054012298583984, "learning_rate": 4.736981465136805e-05, "loss": 0.9037, "step": 1490 }, { "epoch": 0.2647603918453799, "grad_norm": 0.7947477698326111, "learning_rate": 4.735216240070609e-05, "loss": 0.8944, "step": 1500 }, { "epoch": 0.2665254611243491, "grad_norm": 1.9546200037002563, "learning_rate": 4.7334510150044134e-05, "loss": 0.9005, "step": 1510 }, { "epoch": 0.26829053040331835, "grad_norm": 3.5012011528015137, "learning_rate": 4.7316857899382175e-05, "loss": 0.7747, "step": 1520 }, { "epoch": 0.27005559968228754, "grad_norm": 3.4226977825164795, "learning_rate": 
4.729920564872021e-05, "loss": 0.9032, "step": 1530 }, { "epoch": 0.27182066896125673, "grad_norm": 2.116934299468994, "learning_rate": 4.728155339805825e-05, "loss": 0.8319, "step": 1540 }, { "epoch": 0.2735857382402259, "grad_norm": 0.7913022041320801, "learning_rate": 4.726390114739629e-05, "loss": 0.9007, "step": 1550 }, { "epoch": 0.2753508075191951, "grad_norm": 1.7177999019622803, "learning_rate": 4.724624889673433e-05, "loss": 0.8825, "step": 1560 }, { "epoch": 0.27711587679816435, "grad_norm": 0.7861548662185669, "learning_rate": 4.7228596646072374e-05, "loss": 0.9701, "step": 1570 }, { "epoch": 0.27888094607713354, "grad_norm": 0.988394021987915, "learning_rate": 4.7210944395410415e-05, "loss": 0.8454, "step": 1580 }, { "epoch": 0.28064601535610273, "grad_norm": 1.0350430011749268, "learning_rate": 4.7193292144748457e-05, "loss": 0.9811, "step": 1590 }, { "epoch": 0.2824110846350719, "grad_norm": 2.351750373840332, "learning_rate": 4.71756398940865e-05, "loss": 0.8091, "step": 1600 }, { "epoch": 0.2841761539140411, "grad_norm": 1.85820472240448, "learning_rate": 4.715798764342454e-05, "loss": 0.9372, "step": 1610 }, { "epoch": 0.28594122319301035, "grad_norm": 0.8532019257545471, "learning_rate": 4.714033539276258e-05, "loss": 0.9284, "step": 1620 }, { "epoch": 0.28770629247197954, "grad_norm": 1.9938181638717651, "learning_rate": 4.712268314210062e-05, "loss": 0.7879, "step": 1630 }, { "epoch": 0.28947136175094873, "grad_norm": 2.600447177886963, "learning_rate": 4.710503089143866e-05, "loss": 0.8553, "step": 1640 }, { "epoch": 0.2912364310299179, "grad_norm": 1.0016512870788574, "learning_rate": 4.7087378640776703e-05, "loss": 0.9343, "step": 1650 }, { "epoch": 0.2930015003088871, "grad_norm": 2.372873544692993, "learning_rate": 4.7069726390114745e-05, "loss": 0.9583, "step": 1660 }, { "epoch": 0.2947665695878563, "grad_norm": 0.9608777761459351, "learning_rate": 4.7052074139452786e-05, "loss": 0.7909, "step": 1670 }, { "epoch": 0.29653163886682554, "grad_norm": 3.220609664916992, "learning_rate": 4.703442188879083e-05, "loss": 0.879, "step": 1680 }, { "epoch": 0.29829670814579473, "grad_norm": 1.023384690284729, "learning_rate": 4.701676963812887e-05, "loss": 0.8636, "step": 1690 }, { "epoch": 0.3000617774247639, "grad_norm": 1.031020998954773, "learning_rate": 4.699911738746691e-05, "loss": 0.9112, "step": 1700 }, { "epoch": 0.3018268467037331, "grad_norm": 1.9837015867233276, "learning_rate": 4.698146513680495e-05, "loss": 0.8857, "step": 1710 }, { "epoch": 0.3035919159827023, "grad_norm": 1.0206444263458252, "learning_rate": 4.696381288614299e-05, "loss": 0.8305, "step": 1720 }, { "epoch": 0.30535698526167154, "grad_norm": 0.805181086063385, "learning_rate": 4.6946160635481026e-05, "loss": 0.7456, "step": 1730 }, { "epoch": 0.30712205454064073, "grad_norm": 2.465508460998535, "learning_rate": 4.692850838481907e-05, "loss": 0.8171, "step": 1740 }, { "epoch": 0.3088871238196099, "grad_norm": 1.6663717031478882, "learning_rate": 4.691085613415711e-05, "loss": 0.8766, "step": 1750 }, { "epoch": 0.3106521930985791, "grad_norm": 3.0242748260498047, "learning_rate": 4.689320388349515e-05, "loss": 0.8909, "step": 1760 }, { "epoch": 0.3124172623775483, "grad_norm": 1.4774634838104248, "learning_rate": 4.6875551632833184e-05, "loss": 0.7763, "step": 1770 }, { "epoch": 0.31418233165651754, "grad_norm": 1.7824957370758057, "learning_rate": 4.6857899382171225e-05, "loss": 0.7331, "step": 1780 }, { "epoch": 0.31594740093548673, "grad_norm": 3.031038522720337, "learning_rate": 
4.6840247131509266e-05, "loss": 0.7964, "step": 1790 }, { "epoch": 0.3177124702144559, "grad_norm": 4.109992504119873, "learning_rate": 4.682259488084731e-05, "loss": 0.8113, "step": 1800 }, { "epoch": 0.3194775394934251, "grad_norm": 2.4531877040863037, "learning_rate": 4.680494263018535e-05, "loss": 0.902, "step": 1810 }, { "epoch": 0.3212426087723943, "grad_norm": 1.009549617767334, "learning_rate": 4.678729037952339e-05, "loss": 0.9048, "step": 1820 }, { "epoch": 0.3230076780513635, "grad_norm": 3.0832602977752686, "learning_rate": 4.676963812886143e-05, "loss": 0.8321, "step": 1830 }, { "epoch": 0.32477274733033273, "grad_norm": 2.7193832397460938, "learning_rate": 4.675198587819947e-05, "loss": 0.9515, "step": 1840 }, { "epoch": 0.3265378166093019, "grad_norm": 2.153125286102295, "learning_rate": 4.673433362753751e-05, "loss": 0.8171, "step": 1850 }, { "epoch": 0.3283028858882711, "grad_norm": 2.327843427658081, "learning_rate": 4.6716681376875554e-05, "loss": 0.8299, "step": 1860 }, { "epoch": 0.3300679551672403, "grad_norm": 2.4244372844696045, "learning_rate": 4.6699029126213595e-05, "loss": 0.8345, "step": 1870 }, { "epoch": 0.3318330244462095, "grad_norm": 2.6205737590789795, "learning_rate": 4.6681376875551636e-05, "loss": 0.7703, "step": 1880 }, { "epoch": 0.33359809372517873, "grad_norm": 3.1626133918762207, "learning_rate": 4.666372462488968e-05, "loss": 0.8627, "step": 1890 }, { "epoch": 0.3353631630041479, "grad_norm": 2.5088164806365967, "learning_rate": 4.664607237422772e-05, "loss": 0.8964, "step": 1900 }, { "epoch": 0.3371282322831171, "grad_norm": 0.9049986600875854, "learning_rate": 4.662842012356576e-05, "loss": 0.8445, "step": 1910 }, { "epoch": 0.3388933015620863, "grad_norm": 1.1667076349258423, "learning_rate": 4.66107678729038e-05, "loss": 0.8793, "step": 1920 }, { "epoch": 0.3406583708410555, "grad_norm": 1.1097145080566406, "learning_rate": 4.659311562224184e-05, "loss": 0.8007, "step": 1930 }, { "epoch": 0.34242344012002474, "grad_norm": 2.5024774074554443, "learning_rate": 4.657546337157988e-05, "loss": 0.7753, "step": 1940 }, { "epoch": 0.3441885093989939, "grad_norm": 0.8330347537994385, "learning_rate": 4.6557811120917924e-05, "loss": 0.9424, "step": 1950 }, { "epoch": 0.3459535786779631, "grad_norm": 1.0398578643798828, "learning_rate": 4.6540158870255965e-05, "loss": 0.7939, "step": 1960 }, { "epoch": 0.3477186479569323, "grad_norm": 0.8610934019088745, "learning_rate": 4.6522506619594e-05, "loss": 0.7586, "step": 1970 }, { "epoch": 0.3494837172359015, "grad_norm": 1.0551927089691162, "learning_rate": 4.650485436893204e-05, "loss": 0.7738, "step": 1980 }, { "epoch": 0.3512487865148707, "grad_norm": 2.8230621814727783, "learning_rate": 4.648720211827008e-05, "loss": 0.8079, "step": 1990 }, { "epoch": 0.3530138557938399, "grad_norm": 2.029458999633789, "learning_rate": 4.646954986760812e-05, "loss": 0.9508, "step": 2000 }, { "epoch": 0.3530138557938399, "eval_loss": 0.7845870852470398, "eval_runtime": 591.5769, "eval_samples_per_second": 47.884, "eval_steps_per_second": 2.395, "eval_token_accuracy": 0.0005102881727623844, "step": 2000 }, { "epoch": 0.3547789250728091, "grad_norm": 0.8111034631729126, "learning_rate": 4.645189761694616e-05, "loss": 0.886, "step": 2010 }, { "epoch": 0.3565439943517783, "grad_norm": 1.2739574909210205, "learning_rate": 4.64342453662842e-05, "loss": 0.7933, "step": 2020 }, { "epoch": 0.3583090636307475, "grad_norm": 1.3890713453292847, "learning_rate": 4.641659311562224e-05, "loss": 0.7444, "step": 2030 }, { "epoch": 
0.3600741329097167, "grad_norm": 4.180551052093506, "learning_rate": 4.639894086496028e-05, "loss": 0.8468, "step": 2040 }, { "epoch": 0.3618392021886859, "grad_norm": 2.901968240737915, "learning_rate": 4.638128861429832e-05, "loss": 0.8847, "step": 2050 }, { "epoch": 0.3636042714676551, "grad_norm": 0.9198481440544128, "learning_rate": 4.636363636363636e-05, "loss": 0.8842, "step": 2060 }, { "epoch": 0.3653693407466243, "grad_norm": 1.019461750984192, "learning_rate": 4.6345984112974404e-05, "loss": 0.6959, "step": 2070 }, { "epoch": 0.3671344100255935, "grad_norm": 1.0316898822784424, "learning_rate": 4.6328331862312446e-05, "loss": 0.8417, "step": 2080 }, { "epoch": 0.3688994793045627, "grad_norm": 0.7056171894073486, "learning_rate": 4.631067961165049e-05, "loss": 0.8592, "step": 2090 }, { "epoch": 0.3706645485835319, "grad_norm": 2.2394204139709473, "learning_rate": 4.629302736098853e-05, "loss": 0.8624, "step": 2100 }, { "epoch": 0.3724296178625011, "grad_norm": 0.519085168838501, "learning_rate": 4.627537511032657e-05, "loss": 0.8056, "step": 2110 }, { "epoch": 0.3741946871414703, "grad_norm": 1.0584418773651123, "learning_rate": 4.625772285966461e-05, "loss": 0.8155, "step": 2120 }, { "epoch": 0.3759597564204395, "grad_norm": 3.4729137420654297, "learning_rate": 4.624007060900265e-05, "loss": 0.8597, "step": 2130 }, { "epoch": 0.3777248256994087, "grad_norm": 3.2471015453338623, "learning_rate": 4.622241835834069e-05, "loss": 0.908, "step": 2140 }, { "epoch": 0.37948989497837793, "grad_norm": 3.068171977996826, "learning_rate": 4.6204766107678734e-05, "loss": 0.7869, "step": 2150 }, { "epoch": 0.3812549642573471, "grad_norm": 0.8335369825363159, "learning_rate": 4.6187113857016775e-05, "loss": 0.8426, "step": 2160 }, { "epoch": 0.3830200335363163, "grad_norm": 0.7075872421264648, "learning_rate": 4.6169461606354816e-05, "loss": 0.7434, "step": 2170 }, { "epoch": 0.3847851028152855, "grad_norm": 0.8802381753921509, "learning_rate": 4.615180935569286e-05, "loss": 0.9235, "step": 2180 }, { "epoch": 0.3865501720942547, "grad_norm": 0.650245189666748, "learning_rate": 4.61341571050309e-05, "loss": 0.8441, "step": 2190 }, { "epoch": 0.3883152413732239, "grad_norm": 0.9095965623855591, "learning_rate": 4.611650485436894e-05, "loss": 0.9275, "step": 2200 }, { "epoch": 0.3900803106521931, "grad_norm": 2.9897871017456055, "learning_rate": 4.6098852603706974e-05, "loss": 0.8002, "step": 2210 }, { "epoch": 0.3918453799311623, "grad_norm": 2.078740119934082, "learning_rate": 4.6081200353045015e-05, "loss": 0.7567, "step": 2220 }, { "epoch": 0.3936104492101315, "grad_norm": 2.397658586502075, "learning_rate": 4.6063548102383056e-05, "loss": 0.808, "step": 2230 }, { "epoch": 0.3953755184891007, "grad_norm": 3.447932720184326, "learning_rate": 4.60458958517211e-05, "loss": 0.8827, "step": 2240 }, { "epoch": 0.3971405877680699, "grad_norm": 0.8672122359275818, "learning_rate": 4.602824360105914e-05, "loss": 0.8806, "step": 2250 }, { "epoch": 0.3989056570470391, "grad_norm": 1.9333924055099487, "learning_rate": 4.601059135039717e-05, "loss": 0.8342, "step": 2260 }, { "epoch": 0.4006707263260083, "grad_norm": 0.8829795718193054, "learning_rate": 4.5992939099735214e-05, "loss": 0.7546, "step": 2270 }, { "epoch": 0.4024357956049775, "grad_norm": 0.8878025412559509, "learning_rate": 4.5975286849073255e-05, "loss": 0.7602, "step": 2280 }, { "epoch": 0.4042008648839467, "grad_norm": 2.6549007892608643, "learning_rate": 4.5957634598411296e-05, "loss": 0.8296, "step": 2290 }, { "epoch": 0.4059659341629159, 
"grad_norm": 3.5660505294799805, "learning_rate": 4.593998234774934e-05, "loss": 0.8291, "step": 2300 }, { "epoch": 0.4077310034418851, "grad_norm": 2.9781057834625244, "learning_rate": 4.592233009708738e-05, "loss": 0.8012, "step": 2310 }, { "epoch": 0.4094960727208543, "grad_norm": 2.6867148876190186, "learning_rate": 4.590467784642542e-05, "loss": 0.8668, "step": 2320 }, { "epoch": 0.4112611419998235, "grad_norm": 2.252251386642456, "learning_rate": 4.588702559576346e-05, "loss": 0.8445, "step": 2330 }, { "epoch": 0.4130262112787927, "grad_norm": 0.8506266474723816, "learning_rate": 4.58693733451015e-05, "loss": 0.8879, "step": 2340 }, { "epoch": 0.4147912805577619, "grad_norm": 2.8754031658172607, "learning_rate": 4.585172109443954e-05, "loss": 0.996, "step": 2350 }, { "epoch": 0.41655634983673107, "grad_norm": 2.471627712249756, "learning_rate": 4.5834068843777584e-05, "loss": 0.8848, "step": 2360 }, { "epoch": 0.4183214191157003, "grad_norm": 2.1100962162017822, "learning_rate": 4.5816416593115625e-05, "loss": 0.8333, "step": 2370 }, { "epoch": 0.4200864883946695, "grad_norm": 2.6194207668304443, "learning_rate": 4.5798764342453666e-05, "loss": 0.8444, "step": 2380 }, { "epoch": 0.4218515576736387, "grad_norm": 1.025685429573059, "learning_rate": 4.578111209179171e-05, "loss": 0.8992, "step": 2390 }, { "epoch": 0.4236166269526079, "grad_norm": 2.1514077186584473, "learning_rate": 4.576345984112975e-05, "loss": 0.7833, "step": 2400 }, { "epoch": 0.42538169623157707, "grad_norm": 1.8769290447235107, "learning_rate": 4.574580759046779e-05, "loss": 0.7602, "step": 2410 }, { "epoch": 0.4271467655105463, "grad_norm": 0.990746796131134, "learning_rate": 4.572815533980583e-05, "loss": 0.7384, "step": 2420 }, { "epoch": 0.4289118347895155, "grad_norm": 2.712986946105957, "learning_rate": 4.571050308914387e-05, "loss": 0.8197, "step": 2430 }, { "epoch": 0.4306769040684847, "grad_norm": 2.931225061416626, "learning_rate": 4.5692850838481906e-05, "loss": 0.8079, "step": 2440 }, { "epoch": 0.4324419733474539, "grad_norm": 0.9967415928840637, "learning_rate": 4.567519858781995e-05, "loss": 0.7264, "step": 2450 }, { "epoch": 0.43420704262642307, "grad_norm": 1.942150354385376, "learning_rate": 4.565754633715799e-05, "loss": 0.8067, "step": 2460 }, { "epoch": 0.4359721119053923, "grad_norm": 2.0994186401367188, "learning_rate": 4.563989408649603e-05, "loss": 0.8242, "step": 2470 }, { "epoch": 0.4377371811843615, "grad_norm": 1.0104529857635498, "learning_rate": 4.562224183583407e-05, "loss": 0.7889, "step": 2480 }, { "epoch": 0.4395022504633307, "grad_norm": 1.2194324731826782, "learning_rate": 4.560458958517211e-05, "loss": 0.841, "step": 2490 }, { "epoch": 0.4412673197422999, "grad_norm": 1.023699164390564, "learning_rate": 4.558693733451015e-05, "loss": 0.8935, "step": 2500 }, { "epoch": 0.44303238902126907, "grad_norm": 1.049811601638794, "learning_rate": 4.5569285083848195e-05, "loss": 0.9363, "step": 2510 }, { "epoch": 0.44479745830023826, "grad_norm": 1.1310874223709106, "learning_rate": 4.555163283318623e-05, "loss": 0.8604, "step": 2520 }, { "epoch": 0.4465625275792075, "grad_norm": 2.1629273891448975, "learning_rate": 4.553398058252427e-05, "loss": 0.8933, "step": 2530 }, { "epoch": 0.4483275968581767, "grad_norm": 1.2044459581375122, "learning_rate": 4.551632833186231e-05, "loss": 0.9179, "step": 2540 }, { "epoch": 0.4500926661371459, "grad_norm": 0.8064510822296143, "learning_rate": 4.549867608120035e-05, "loss": 0.7586, "step": 2550 }, { "epoch": 0.45185773541611507, "grad_norm": 
3.98811936378479, "learning_rate": 4.5481023830538393e-05, "loss": 0.8881, "step": 2560 }, { "epoch": 0.45362280469508426, "grad_norm": 1.326183795928955, "learning_rate": 4.5463371579876435e-05, "loss": 0.8303, "step": 2570 }, { "epoch": 0.4553878739740535, "grad_norm": 2.2841854095458984, "learning_rate": 4.5445719329214476e-05, "loss": 0.808, "step": 2580 }, { "epoch": 0.4571529432530227, "grad_norm": 0.9683240652084351, "learning_rate": 4.542806707855252e-05, "loss": 0.9093, "step": 2590 }, { "epoch": 0.4589180125319919, "grad_norm": 0.8216995000839233, "learning_rate": 4.541041482789056e-05, "loss": 0.7946, "step": 2600 }, { "epoch": 0.46068308181096107, "grad_norm": 0.9047673940658569, "learning_rate": 4.53927625772286e-05, "loss": 0.7807, "step": 2610 }, { "epoch": 0.46244815108993026, "grad_norm": 0.7651693820953369, "learning_rate": 4.537511032656664e-05, "loss": 0.7052, "step": 2620 }, { "epoch": 0.4642132203688995, "grad_norm": 3.0374672412872314, "learning_rate": 4.535745807590468e-05, "loss": 0.8134, "step": 2630 }, { "epoch": 0.4659782896478687, "grad_norm": 4.012673854827881, "learning_rate": 4.533980582524272e-05, "loss": 0.833, "step": 2640 }, { "epoch": 0.4677433589268379, "grad_norm": 2.6433727741241455, "learning_rate": 4.5322153574580764e-05, "loss": 0.9264, "step": 2650 }, { "epoch": 0.46950842820580707, "grad_norm": 2.4709348678588867, "learning_rate": 4.5304501323918805e-05, "loss": 0.9154, "step": 2660 }, { "epoch": 0.47127349748477626, "grad_norm": 0.8282108902931213, "learning_rate": 4.5286849073256846e-05, "loss": 0.7335, "step": 2670 }, { "epoch": 0.47303856676374545, "grad_norm": 1.0846011638641357, "learning_rate": 4.526919682259488e-05, "loss": 0.9035, "step": 2680 }, { "epoch": 0.4748036360427147, "grad_norm": 2.7011947631835938, "learning_rate": 4.525154457193292e-05, "loss": 0.7915, "step": 2690 }, { "epoch": 0.4765687053216839, "grad_norm": 1.912846565246582, "learning_rate": 4.523389232127096e-05, "loss": 0.8141, "step": 2700 }, { "epoch": 0.47833377460065307, "grad_norm": 0.9846749305725098, "learning_rate": 4.5216240070609004e-05, "loss": 0.8167, "step": 2710 }, { "epoch": 0.48009884387962226, "grad_norm": 1.9632867574691772, "learning_rate": 4.5198587819947045e-05, "loss": 0.8177, "step": 2720 }, { "epoch": 0.48186391315859145, "grad_norm": 1.979436993598938, "learning_rate": 4.5180935569285086e-05, "loss": 0.753, "step": 2730 }, { "epoch": 0.4836289824375607, "grad_norm": 2.5256102085113525, "learning_rate": 4.516328331862313e-05, "loss": 0.8481, "step": 2740 }, { "epoch": 0.4853940517165299, "grad_norm": 3.493546962738037, "learning_rate": 4.514563106796117e-05, "loss": 0.8997, "step": 2750 }, { "epoch": 0.4871591209954991, "grad_norm": 1.9048209190368652, "learning_rate": 4.512797881729921e-05, "loss": 0.7698, "step": 2760 }, { "epoch": 0.48892419027446826, "grad_norm": 2.712498664855957, "learning_rate": 4.511032656663725e-05, "loss": 0.8042, "step": 2770 }, { "epoch": 0.49068925955343745, "grad_norm": 1.0537688732147217, "learning_rate": 4.5092674315975285e-05, "loss": 0.9004, "step": 2780 }, { "epoch": 0.4924543288324067, "grad_norm": 2.128462553024292, "learning_rate": 4.5075022065313326e-05, "loss": 0.9807, "step": 2790 }, { "epoch": 0.4942193981113759, "grad_norm": 0.7416483759880066, "learning_rate": 4.505736981465137e-05, "loss": 0.8408, "step": 2800 }, { "epoch": 0.4959844673903451, "grad_norm": 1.0257169008255005, "learning_rate": 4.503971756398941e-05, "loss": 0.8744, "step": 2810 }, { "epoch": 0.49774953666931426, "grad_norm": 
2.0702273845672607, "learning_rate": 4.502206531332745e-05, "loss": 0.8238, "step": 2820 }, { "epoch": 0.49951460594828345, "grad_norm": 0.9203521013259888, "learning_rate": 4.500441306266549e-05, "loss": 0.7613, "step": 2830 }, { "epoch": 0.5012796752272527, "grad_norm": 2.582195281982422, "learning_rate": 4.498676081200353e-05, "loss": 0.8405, "step": 2840 }, { "epoch": 0.5030447445062218, "grad_norm": 2.5932891368865967, "learning_rate": 4.496910856134157e-05, "loss": 0.8282, "step": 2850 }, { "epoch": 0.5048098137851911, "grad_norm": 0.9128146171569824, "learning_rate": 4.4951456310679614e-05, "loss": 0.8337, "step": 2860 }, { "epoch": 0.5065748830641603, "grad_norm": 1.1323065757751465, "learning_rate": 4.4933804060017655e-05, "loss": 0.7668, "step": 2870 }, { "epoch": 0.5083399523431295, "grad_norm": 2.5009121894836426, "learning_rate": 4.4916151809355697e-05, "loss": 0.9654, "step": 2880 }, { "epoch": 0.5101050216220987, "grad_norm": 0.845964252948761, "learning_rate": 4.489849955869374e-05, "loss": 0.8481, "step": 2890 }, { "epoch": 0.5118700909010678, "grad_norm": 3.271153450012207, "learning_rate": 4.488084730803178e-05, "loss": 0.8701, "step": 2900 }, { "epoch": 0.5136351601800371, "grad_norm": 1.1637883186340332, "learning_rate": 4.486319505736982e-05, "loss": 0.835, "step": 2910 }, { "epoch": 0.5154002294590063, "grad_norm": 0.9242041707038879, "learning_rate": 4.4845542806707854e-05, "loss": 0.7817, "step": 2920 }, { "epoch": 0.5171652987379755, "grad_norm": 3.017683267593384, "learning_rate": 4.4827890556045896e-05, "loss": 0.8759, "step": 2930 }, { "epoch": 0.5189303680169447, "grad_norm": 0.7924336194992065, "learning_rate": 4.481023830538394e-05, "loss": 0.8159, "step": 2940 }, { "epoch": 0.5206954372959138, "grad_norm": 0.8962329030036926, "learning_rate": 4.479258605472198e-05, "loss": 0.835, "step": 2950 }, { "epoch": 0.5224605065748831, "grad_norm": 1.6473637819290161, "learning_rate": 4.477493380406002e-05, "loss": 0.7439, "step": 2960 }, { "epoch": 0.5242255758538522, "grad_norm": 2.8031015396118164, "learning_rate": 4.475728155339806e-05, "loss": 0.6378, "step": 2970 }, { "epoch": 0.5259906451328215, "grad_norm": 3.2415006160736084, "learning_rate": 4.47396293027361e-05, "loss": 0.7761, "step": 2980 }, { "epoch": 0.5277557144117907, "grad_norm": 0.971165120601654, "learning_rate": 4.472197705207414e-05, "loss": 0.8751, "step": 2990 }, { "epoch": 0.5295207836907598, "grad_norm": 0.7631526589393616, "learning_rate": 4.4704324801412184e-05, "loss": 0.8916, "step": 3000 }, { "epoch": 0.5295207836907598, "eval_loss": 0.7506680488586426, "eval_runtime": 591.5692, "eval_samples_per_second": 47.885, "eval_steps_per_second": 2.395, "eval_token_accuracy": 0.0005326158175531012, "step": 3000 }, { "epoch": 0.5312858529697291, "grad_norm": 2.212510108947754, "learning_rate": 4.4686672550750225e-05, "loss": 0.837, "step": 3010 }, { "epoch": 0.5330509222486982, "grad_norm": 0.9668548703193665, "learning_rate": 4.4669020300088266e-05, "loss": 0.8005, "step": 3020 }, { "epoch": 0.5348159915276675, "grad_norm": 2.5976197719573975, "learning_rate": 4.465136804942631e-05, "loss": 0.7103, "step": 3030 }, { "epoch": 0.5365810608066367, "grad_norm": 1.0848029851913452, "learning_rate": 4.463371579876435e-05, "loss": 0.8199, "step": 3040 }, { "epoch": 0.5383461300856058, "grad_norm": 0.8735620975494385, "learning_rate": 4.461606354810238e-05, "loss": 0.8422, "step": 3050 }, { "epoch": 0.5401111993645751, "grad_norm": 0.9805198907852173, "learning_rate": 4.4598411297440424e-05, "loss": 
0.848, "step": 3060 }, { "epoch": 0.5418762686435442, "grad_norm": 1.2983773946762085, "learning_rate": 4.4580759046778465e-05, "loss": 0.84, "step": 3070 }, { "epoch": 0.5436413379225135, "grad_norm": 2.475261688232422, "learning_rate": 4.4563106796116506e-05, "loss": 0.7414, "step": 3080 }, { "epoch": 0.5454064072014827, "grad_norm": 0.8302130699157715, "learning_rate": 4.454545454545455e-05, "loss": 0.7616, "step": 3090 }, { "epoch": 0.5471714764804518, "grad_norm": 0.9707878828048706, "learning_rate": 4.452780229479259e-05, "loss": 0.8007, "step": 3100 }, { "epoch": 0.5489365457594211, "grad_norm": 0.8669869303703308, "learning_rate": 4.451015004413063e-05, "loss": 0.7382, "step": 3110 }, { "epoch": 0.5507016150383902, "grad_norm": 2.2172632217407227, "learning_rate": 4.449249779346867e-05, "loss": 0.8489, "step": 3120 }, { "epoch": 0.5524666843173595, "grad_norm": 0.8614495396614075, "learning_rate": 4.447484554280671e-05, "loss": 0.8115, "step": 3130 }, { "epoch": 0.5542317535963287, "grad_norm": 2.0771589279174805, "learning_rate": 4.445719329214475e-05, "loss": 0.7712, "step": 3140 }, { "epoch": 0.5559968228752978, "grad_norm": 0.7658421993255615, "learning_rate": 4.4439541041482794e-05, "loss": 0.737, "step": 3150 }, { "epoch": 0.5577618921542671, "grad_norm": 1.0527628660202026, "learning_rate": 4.442188879082083e-05, "loss": 0.7613, "step": 3160 }, { "epoch": 0.5595269614332362, "grad_norm": 1.8799755573272705, "learning_rate": 4.440423654015887e-05, "loss": 0.8304, "step": 3170 }, { "epoch": 0.5612920307122055, "grad_norm": 0.8906537294387817, "learning_rate": 4.438658428949691e-05, "loss": 0.8177, "step": 3180 }, { "epoch": 0.5630570999911747, "grad_norm": 3.328421115875244, "learning_rate": 4.436893203883495e-05, "loss": 0.8311, "step": 3190 }, { "epoch": 0.5648221692701438, "grad_norm": 0.9287519454956055, "learning_rate": 4.435127978817299e-05, "loss": 0.7043, "step": 3200 }, { "epoch": 0.5665872385491131, "grad_norm": 1.993821144104004, "learning_rate": 4.4333627537511034e-05, "loss": 0.8357, "step": 3210 }, { "epoch": 0.5683523078280822, "grad_norm": 1.6915552616119385, "learning_rate": 4.4315975286849075e-05, "loss": 0.766, "step": 3220 }, { "epoch": 0.5701173771070515, "grad_norm": 0.865930438041687, "learning_rate": 4.4298323036187116e-05, "loss": 0.8084, "step": 3230 }, { "epoch": 0.5718824463860207, "grad_norm": 2.868046760559082, "learning_rate": 4.428067078552516e-05, "loss": 0.8383, "step": 3240 }, { "epoch": 0.5736475156649898, "grad_norm": 0.9126543402671814, "learning_rate": 4.42630185348632e-05, "loss": 0.8596, "step": 3250 }, { "epoch": 0.5754125849439591, "grad_norm": 1.795299768447876, "learning_rate": 4.424536628420124e-05, "loss": 0.7732, "step": 3260 }, { "epoch": 0.5771776542229282, "grad_norm": 2.774261474609375, "learning_rate": 4.422771403353928e-05, "loss": 0.7921, "step": 3270 }, { "epoch": 0.5789427235018975, "grad_norm": 0.7389079928398132, "learning_rate": 4.421006178287732e-05, "loss": 0.7926, "step": 3280 }, { "epoch": 0.5807077927808666, "grad_norm": 0.9272739291191101, "learning_rate": 4.419240953221536e-05, "loss": 0.7412, "step": 3290 }, { "epoch": 0.5824728620598358, "grad_norm": 2.9061577320098877, "learning_rate": 4.4174757281553404e-05, "loss": 0.8038, "step": 3300 }, { "epoch": 0.5842379313388051, "grad_norm": 0.781683087348938, "learning_rate": 4.415710503089144e-05, "loss": 0.6972, "step": 3310 }, { "epoch": 0.5860030006177742, "grad_norm": 2.3418874740600586, "learning_rate": 4.413945278022948e-05, "loss": 0.7838, "step": 3320 }, { 
"epoch": 0.5877680698967435, "grad_norm": 1.735093355178833, "learning_rate": 4.412180052956752e-05, "loss": 0.7661, "step": 3330 }, { "epoch": 0.5895331391757126, "grad_norm": 1.6221524477005005, "learning_rate": 4.410414827890556e-05, "loss": 0.8704, "step": 3340 }, { "epoch": 0.5912982084546818, "grad_norm": 3.7049639225006104, "learning_rate": 4.40864960282436e-05, "loss": 0.7667, "step": 3350 }, { "epoch": 0.5930632777336511, "grad_norm": 0.7311994433403015, "learning_rate": 4.4068843777581644e-05, "loss": 0.7703, "step": 3360 }, { "epoch": 0.5948283470126202, "grad_norm": 0.9005844593048096, "learning_rate": 4.4051191526919686e-05, "loss": 0.7228, "step": 3370 }, { "epoch": 0.5965934162915895, "grad_norm": 0.8378689289093018, "learning_rate": 4.403353927625773e-05, "loss": 0.799, "step": 3380 }, { "epoch": 0.5983584855705586, "grad_norm": 2.0568249225616455, "learning_rate": 4.401588702559576e-05, "loss": 0.8938, "step": 3390 }, { "epoch": 0.6001235548495278, "grad_norm": 3.126661539077759, "learning_rate": 4.39982347749338e-05, "loss": 0.7579, "step": 3400 }, { "epoch": 0.6018886241284971, "grad_norm": 0.8094497323036194, "learning_rate": 4.3980582524271843e-05, "loss": 0.752, "step": 3410 }, { "epoch": 0.6036536934074662, "grad_norm": 0.8366499543190002, "learning_rate": 4.3962930273609885e-05, "loss": 0.6863, "step": 3420 }, { "epoch": 0.6054187626864355, "grad_norm": 0.9357757568359375, "learning_rate": 4.3945278022947926e-05, "loss": 0.7945, "step": 3430 }, { "epoch": 0.6071838319654046, "grad_norm": 2.3124871253967285, "learning_rate": 4.392762577228597e-05, "loss": 0.898, "step": 3440 }, { "epoch": 0.6089489012443738, "grad_norm": 2.69716215133667, "learning_rate": 4.390997352162401e-05, "loss": 0.7801, "step": 3450 }, { "epoch": 0.6107139705233431, "grad_norm": 1.620527982711792, "learning_rate": 4.389232127096205e-05, "loss": 0.7479, "step": 3460 }, { "epoch": 0.6124790398023122, "grad_norm": 2.4881954193115234, "learning_rate": 4.387466902030009e-05, "loss": 0.7216, "step": 3470 }, { "epoch": 0.6142441090812815, "grad_norm": 0.8429247736930847, "learning_rate": 4.385701676963813e-05, "loss": 0.7887, "step": 3480 }, { "epoch": 0.6160091783602506, "grad_norm": 1.0181946754455566, "learning_rate": 4.383936451897617e-05, "loss": 0.6728, "step": 3490 }, { "epoch": 0.6177742476392198, "grad_norm": 2.189239740371704, "learning_rate": 4.3821712268314214e-05, "loss": 0.761, "step": 3500 }, { "epoch": 0.6195393169181891, "grad_norm": 2.9036712646484375, "learning_rate": 4.3804060017652255e-05, "loss": 0.8686, "step": 3510 }, { "epoch": 0.6213043861971582, "grad_norm": 2.4876108169555664, "learning_rate": 4.3786407766990296e-05, "loss": 0.8295, "step": 3520 }, { "epoch": 0.6230694554761275, "grad_norm": 3.2689085006713867, "learning_rate": 4.376875551632834e-05, "loss": 0.933, "step": 3530 }, { "epoch": 0.6248345247550966, "grad_norm": 1.3380885124206543, "learning_rate": 4.375110326566638e-05, "loss": 0.7828, "step": 3540 }, { "epoch": 0.6265995940340658, "grad_norm": 3.1748392581939697, "learning_rate": 4.373345101500442e-05, "loss": 0.824, "step": 3550 }, { "epoch": 0.6283646633130351, "grad_norm": 0.660198450088501, "learning_rate": 4.371579876434246e-05, "loss": 0.7187, "step": 3560 }, { "epoch": 0.6301297325920042, "grad_norm": 1.0571256875991821, "learning_rate": 4.36981465136805e-05, "loss": 0.8839, "step": 3570 }, { "epoch": 0.6318948018709735, "grad_norm": 2.345848560333252, "learning_rate": 4.3680494263018536e-05, "loss": 0.8157, "step": 3580 }, { "epoch": 
0.6336598711499426, "grad_norm": 2.5955724716186523, "learning_rate": 4.366284201235658e-05, "loss": 0.9024, "step": 3590 }, { "epoch": 0.6354249404289118, "grad_norm": 0.6851534247398376, "learning_rate": 4.364518976169462e-05, "loss": 0.7143, "step": 3600 }, { "epoch": 0.6371900097078811, "grad_norm": 1.5733120441436768, "learning_rate": 4.362753751103266e-05, "loss": 0.8427, "step": 3610 }, { "epoch": 0.6389550789868502, "grad_norm": 1.0154445171356201, "learning_rate": 4.36098852603707e-05, "loss": 0.7195, "step": 3620 }, { "epoch": 0.6407201482658195, "grad_norm": 1.6782732009887695, "learning_rate": 4.3592233009708735e-05, "loss": 0.7455, "step": 3630 }, { "epoch": 0.6424852175447886, "grad_norm": 0.9354893565177917, "learning_rate": 4.3574580759046776e-05, "loss": 0.7633, "step": 3640 }, { "epoch": 0.6442502868237578, "grad_norm": 2.326085090637207, "learning_rate": 4.355692850838482e-05, "loss": 0.7908, "step": 3650 }, { "epoch": 0.646015356102727, "grad_norm": 0.8648369312286377, "learning_rate": 4.353927625772286e-05, "loss": 0.6926, "step": 3660 }, { "epoch": 0.6477804253816962, "grad_norm": 1.0704703330993652, "learning_rate": 4.35216240070609e-05, "loss": 0.7622, "step": 3670 }, { "epoch": 0.6495454946606655, "grad_norm": 0.9948635101318359, "learning_rate": 4.350397175639894e-05, "loss": 0.7132, "step": 3680 }, { "epoch": 0.6513105639396346, "grad_norm": 3.173682689666748, "learning_rate": 4.348631950573698e-05, "loss": 0.7653, "step": 3690 }, { "epoch": 0.6530756332186038, "grad_norm": 0.9430578351020813, "learning_rate": 4.346866725507502e-05, "loss": 0.786, "step": 3700 }, { "epoch": 0.654840702497573, "grad_norm": 2.426671266555786, "learning_rate": 4.3451015004413064e-05, "loss": 0.7977, "step": 3710 }, { "epoch": 0.6566057717765422, "grad_norm": 2.2651498317718506, "learning_rate": 4.3433362753751105e-05, "loss": 0.6994, "step": 3720 }, { "epoch": 0.6583708410555115, "grad_norm": 2.3418796062469482, "learning_rate": 4.3415710503089147e-05, "loss": 0.7567, "step": 3730 }, { "epoch": 0.6601359103344806, "grad_norm": 0.9187758564949036, "learning_rate": 4.339805825242719e-05, "loss": 0.7054, "step": 3740 }, { "epoch": 0.6619009796134498, "grad_norm": 1.027400016784668, "learning_rate": 4.338040600176523e-05, "loss": 0.8261, "step": 3750 }, { "epoch": 0.663666048892419, "grad_norm": 3.1188466548919678, "learning_rate": 4.336275375110327e-05, "loss": 0.8198, "step": 3760 }, { "epoch": 0.6654311181713882, "grad_norm": 0.9015699028968811, "learning_rate": 4.334510150044131e-05, "loss": 0.7301, "step": 3770 }, { "epoch": 0.6671961874503575, "grad_norm": 1.2164896726608276, "learning_rate": 4.332744924977935e-05, "loss": 0.7977, "step": 3780 }, { "epoch": 0.6689612567293266, "grad_norm": 0.9232011437416077, "learning_rate": 4.3309796999117393e-05, "loss": 0.7386, "step": 3790 }, { "epoch": 0.6707263260082958, "grad_norm": 0.8261239528656006, "learning_rate": 4.3292144748455435e-05, "loss": 0.7747, "step": 3800 }, { "epoch": 0.672491395287265, "grad_norm": 1.8265053033828735, "learning_rate": 4.3274492497793476e-05, "loss": 0.8045, "step": 3810 }, { "epoch": 0.6742564645662342, "grad_norm": 1.030633807182312, "learning_rate": 4.325684024713152e-05, "loss": 0.7423, "step": 3820 }, { "epoch": 0.6760215338452035, "grad_norm": 0.8075785040855408, "learning_rate": 4.323918799646955e-05, "loss": 0.7633, "step": 3830 }, { "epoch": 0.6777866031241726, "grad_norm": 1.1641757488250732, "learning_rate": 4.322153574580759e-05, "loss": 0.7748, "step": 3840 }, { "epoch": 0.6795516724031418, 
"grad_norm": 0.6394134163856506, "learning_rate": 4.3203883495145634e-05, "loss": 0.7864, "step": 3850 }, { "epoch": 0.681316741682111, "grad_norm": 0.7961482405662537, "learning_rate": 4.3186231244483675e-05, "loss": 0.7264, "step": 3860 }, { "epoch": 0.6830818109610802, "grad_norm": 1.5039088726043701, "learning_rate": 4.316857899382171e-05, "loss": 0.7005, "step": 3870 }, { "epoch": 0.6848468802400495, "grad_norm": 2.984955072402954, "learning_rate": 4.315092674315975e-05, "loss": 0.756, "step": 3880 }, { "epoch": 0.6866119495190186, "grad_norm": 0.5771501660346985, "learning_rate": 4.313327449249779e-05, "loss": 0.7099, "step": 3890 }, { "epoch": 0.6883770187979878, "grad_norm": 3.850680112838745, "learning_rate": 4.311562224183583e-05, "loss": 0.768, "step": 3900 }, { "epoch": 0.690142088076957, "grad_norm": 3.665675163269043, "learning_rate": 4.3097969991173874e-05, "loss": 0.8096, "step": 3910 }, { "epoch": 0.6919071573559262, "grad_norm": 3.480576515197754, "learning_rate": 4.3080317740511915e-05, "loss": 0.8829, "step": 3920 }, { "epoch": 0.6936722266348955, "grad_norm": 0.6120189428329468, "learning_rate": 4.3062665489849956e-05, "loss": 0.6944, "step": 3930 }, { "epoch": 0.6954372959138646, "grad_norm": 2.2129323482513428, "learning_rate": 4.3045013239188e-05, "loss": 0.7401, "step": 3940 }, { "epoch": 0.6972023651928339, "grad_norm": 1.296460747718811, "learning_rate": 4.302736098852604e-05, "loss": 0.792, "step": 3950 }, { "epoch": 0.698967434471803, "grad_norm": 2.8519787788391113, "learning_rate": 4.300970873786408e-05, "loss": 0.7726, "step": 3960 }, { "epoch": 0.7007325037507722, "grad_norm": 0.8078942894935608, "learning_rate": 4.299205648720212e-05, "loss": 0.8331, "step": 3970 }, { "epoch": 0.7024975730297414, "grad_norm": 3.126859426498413, "learning_rate": 4.297440423654016e-05, "loss": 0.845, "step": 3980 }, { "epoch": 0.7042626423087106, "grad_norm": 0.9351972937583923, "learning_rate": 4.29567519858782e-05, "loss": 0.761, "step": 3990 }, { "epoch": 0.7060277115876799, "grad_norm": 0.8094435334205627, "learning_rate": 4.2939099735216244e-05, "loss": 0.7853, "step": 4000 }, { "epoch": 0.7060277115876799, "eval_loss": 0.7302612662315369, "eval_runtime": 591.667, "eval_samples_per_second": 47.877, "eval_steps_per_second": 2.395, "eval_token_accuracy": 0.0005275413528279383, "step": 4000 }, { "epoch": 0.707792780866649, "grad_norm": 3.202349901199341, "learning_rate": 4.2921447484554285e-05, "loss": 0.8615, "step": 4010 }, { "epoch": 0.7095578501456182, "grad_norm": 1.095604658126831, "learning_rate": 4.2903795233892326e-05, "loss": 0.7358, "step": 4020 }, { "epoch": 0.7113229194245874, "grad_norm": 1.876907229423523, "learning_rate": 4.288614298323037e-05, "loss": 0.7144, "step": 4030 }, { "epoch": 0.7130879887035566, "grad_norm": 1.2887446880340576, "learning_rate": 4.286849073256841e-05, "loss": 0.6641, "step": 4040 }, { "epoch": 0.7148530579825259, "grad_norm": 2.4284539222717285, "learning_rate": 4.285083848190645e-05, "loss": 0.8055, "step": 4050 }, { "epoch": 0.716618127261495, "grad_norm": 1.0811606645584106, "learning_rate": 4.283318623124449e-05, "loss": 0.7444, "step": 4060 }, { "epoch": 0.7183831965404642, "grad_norm": 2.0655746459960938, "learning_rate": 4.2815533980582525e-05, "loss": 0.75, "step": 4070 }, { "epoch": 0.7201482658194334, "grad_norm": 1.7863682508468628, "learning_rate": 4.2797881729920566e-05, "loss": 0.7308, "step": 4080 }, { "epoch": 0.7219133350984026, "grad_norm": 0.7258486747741699, "learning_rate": 4.278022947925861e-05, "loss": 
0.7459, "step": 4090 }, { "epoch": 0.7236784043773719, "grad_norm": 2.0655391216278076, "learning_rate": 4.276257722859665e-05, "loss": 0.7109, "step": 4100 }, { "epoch": 0.725443473656341, "grad_norm": 0.7749966979026794, "learning_rate": 4.274492497793468e-05, "loss": 0.7652, "step": 4110 }, { "epoch": 0.7272085429353102, "grad_norm": 0.8433781862258911, "learning_rate": 4.2727272727272724e-05, "loss": 0.7187, "step": 4120 }, { "epoch": 0.7289736122142794, "grad_norm": 2.252690315246582, "learning_rate": 4.2709620476610765e-05, "loss": 0.7223, "step": 4130 }, { "epoch": 0.7307386814932486, "grad_norm": 0.876118540763855, "learning_rate": 4.2691968225948806e-05, "loss": 0.7532, "step": 4140 }, { "epoch": 0.7325037507722179, "grad_norm": 2.4882471561431885, "learning_rate": 4.267431597528685e-05, "loss": 0.7415, "step": 4150 }, { "epoch": 0.734268820051187, "grad_norm": 2.053722381591797, "learning_rate": 4.265666372462489e-05, "loss": 0.8302, "step": 4160 }, { "epoch": 0.7360338893301562, "grad_norm": 2.7460062503814697, "learning_rate": 4.263901147396293e-05, "loss": 0.8077, "step": 4170 }, { "epoch": 0.7377989586091254, "grad_norm": 2.6140687465667725, "learning_rate": 4.262135922330097e-05, "loss": 0.7162, "step": 4180 }, { "epoch": 0.7395640278880946, "grad_norm": 0.9406307935714722, "learning_rate": 4.260370697263901e-05, "loss": 0.7295, "step": 4190 }, { "epoch": 0.7413290971670639, "grad_norm": 1.2313580513000488, "learning_rate": 4.258605472197705e-05, "loss": 0.7561, "step": 4200 }, { "epoch": 0.743094166446033, "grad_norm": 1.6174222230911255, "learning_rate": 4.2568402471315094e-05, "loss": 0.7096, "step": 4210 }, { "epoch": 0.7448592357250022, "grad_norm": 2.9496357440948486, "learning_rate": 4.2550750220653136e-05, "loss": 0.8138, "step": 4220 }, { "epoch": 0.7466243050039714, "grad_norm": 1.0731405019760132, "learning_rate": 4.253309796999118e-05, "loss": 0.7358, "step": 4230 }, { "epoch": 0.7483893742829406, "grad_norm": 1.912284016609192, "learning_rate": 4.251544571932922e-05, "loss": 0.6613, "step": 4240 }, { "epoch": 0.7501544435619099, "grad_norm": 2.178107261657715, "learning_rate": 4.249779346866726e-05, "loss": 0.7444, "step": 4250 }, { "epoch": 0.751919512840879, "grad_norm": 3.0223817825317383, "learning_rate": 4.24801412180053e-05, "loss": 0.7841, "step": 4260 }, { "epoch": 0.7536845821198482, "grad_norm": 2.2032418251037598, "learning_rate": 4.246248896734334e-05, "loss": 0.7662, "step": 4270 }, { "epoch": 0.7554496513988174, "grad_norm": 2.5964224338531494, "learning_rate": 4.244483671668138e-05, "loss": 0.7513, "step": 4280 }, { "epoch": 0.7572147206777866, "grad_norm": 0.7769788503646851, "learning_rate": 4.2427184466019424e-05, "loss": 0.753, "step": 4290 }, { "epoch": 0.7589797899567559, "grad_norm": 2.543841600418091, "learning_rate": 4.2409532215357465e-05, "loss": 0.7381, "step": 4300 }, { "epoch": 0.760744859235725, "grad_norm": 1.1544684171676636, "learning_rate": 4.23918799646955e-05, "loss": 0.7067, "step": 4310 }, { "epoch": 0.7625099285146942, "grad_norm": 3.0461089611053467, "learning_rate": 4.237422771403354e-05, "loss": 0.7107, "step": 4320 }, { "epoch": 0.7642749977936634, "grad_norm": 2.073349952697754, "learning_rate": 4.235657546337158e-05, "loss": 0.7259, "step": 4330 }, { "epoch": 0.7660400670726326, "grad_norm": 2.9507546424865723, "learning_rate": 4.233892321270962e-05, "loss": 0.781, "step": 4340 }, { "epoch": 0.7678051363516017, "grad_norm": 0.9811722636222839, "learning_rate": 4.2321270962047664e-05, "loss": 0.7541, "step": 4350 }, { 
"epoch": 0.769570205630571, "grad_norm": 2.324629068374634, "learning_rate": 4.2303618711385705e-05, "loss": 0.8045, "step": 4360 }, { "epoch": 0.7713352749095402, "grad_norm": 2.0748445987701416, "learning_rate": 4.228596646072374e-05, "loss": 0.8305, "step": 4370 }, { "epoch": 0.7731003441885094, "grad_norm": 2.2968170642852783, "learning_rate": 4.226831421006178e-05, "loss": 0.7492, "step": 4380 }, { "epoch": 0.7748654134674786, "grad_norm": 2.235989570617676, "learning_rate": 4.225066195939982e-05, "loss": 0.9504, "step": 4390 }, { "epoch": 0.7766304827464477, "grad_norm": 1.7711191177368164, "learning_rate": 4.223300970873786e-05, "loss": 0.7801, "step": 4400 }, { "epoch": 0.778395552025417, "grad_norm": 1.8657867908477783, "learning_rate": 4.2215357458075904e-05, "loss": 0.7403, "step": 4410 }, { "epoch": 0.7801606213043862, "grad_norm": 0.9536270499229431, "learning_rate": 4.2197705207413945e-05, "loss": 0.7349, "step": 4420 }, { "epoch": 0.7819256905833554, "grad_norm": 1.7488056421279907, "learning_rate": 4.2180052956751986e-05, "loss": 0.9128, "step": 4430 }, { "epoch": 0.7836907598623246, "grad_norm": 1.809354543685913, "learning_rate": 4.216240070609003e-05, "loss": 0.8036, "step": 4440 }, { "epoch": 0.7854558291412937, "grad_norm": 1.8551990985870361, "learning_rate": 4.214474845542807e-05, "loss": 0.7554, "step": 4450 }, { "epoch": 0.787220898420263, "grad_norm": 0.8702948093414307, "learning_rate": 4.212709620476611e-05, "loss": 0.7256, "step": 4460 }, { "epoch": 0.7889859676992322, "grad_norm": 1.093543291091919, "learning_rate": 4.210944395410415e-05, "loss": 0.705, "step": 4470 }, { "epoch": 0.7907510369782014, "grad_norm": 2.765627384185791, "learning_rate": 4.209179170344219e-05, "loss": 0.6901, "step": 4480 }, { "epoch": 0.7925161062571706, "grad_norm": 1.0054060220718384, "learning_rate": 4.207413945278023e-05, "loss": 0.7003, "step": 4490 }, { "epoch": 0.7942811755361397, "grad_norm": 2.850806951522827, "learning_rate": 4.2056487202118274e-05, "loss": 0.7657, "step": 4500 }, { "epoch": 0.796046244815109, "grad_norm": 2.02577543258667, "learning_rate": 4.2038834951456315e-05, "loss": 0.7876, "step": 4510 }, { "epoch": 0.7978113140940782, "grad_norm": 2.8630881309509277, "learning_rate": 4.2021182700794356e-05, "loss": 0.8187, "step": 4520 }, { "epoch": 0.7995763833730474, "grad_norm": 1.5044877529144287, "learning_rate": 4.20035304501324e-05, "loss": 0.7504, "step": 4530 }, { "epoch": 0.8013414526520166, "grad_norm": 1.9010065793991089, "learning_rate": 4.198587819947043e-05, "loss": 0.7958, "step": 4540 }, { "epoch": 0.8031065219309858, "grad_norm": 2.720659017562866, "learning_rate": 4.196822594880847e-05, "loss": 0.7737, "step": 4550 }, { "epoch": 0.804871591209955, "grad_norm": 0.9726505875587463, "learning_rate": 4.1950573698146514e-05, "loss": 0.8296, "step": 4560 }, { "epoch": 0.8066366604889242, "grad_norm": 2.710341215133667, "learning_rate": 4.1932921447484555e-05, "loss": 0.7509, "step": 4570 }, { "epoch": 0.8084017297678934, "grad_norm": 0.9819344282150269, "learning_rate": 4.1915269196822597e-05, "loss": 0.7451, "step": 4580 }, { "epoch": 0.8101667990468626, "grad_norm": 0.9601898193359375, "learning_rate": 4.189761694616064e-05, "loss": 0.929, "step": 4590 }, { "epoch": 0.8119318683258318, "grad_norm": 2.9946250915527344, "learning_rate": 4.187996469549868e-05, "loss": 0.7867, "step": 4600 }, { "epoch": 0.813696937604801, "grad_norm": 1.9155701398849487, "learning_rate": 4.186231244483672e-05, "loss": 0.664, "step": 4610 }, { "epoch": 0.8154620068837702, 
"grad_norm": 2.1461758613586426, "learning_rate": 4.184466019417476e-05, "loss": 0.7129, "step": 4620 }, { "epoch": 0.8172270761627394, "grad_norm": 3.015730381011963, "learning_rate": 4.18270079435128e-05, "loss": 0.8257, "step": 4630 }, { "epoch": 0.8189921454417086, "grad_norm": 0.8726127743721008, "learning_rate": 4.180935569285084e-05, "loss": 0.6789, "step": 4640 }, { "epoch": 0.8207572147206778, "grad_norm": 3.004166603088379, "learning_rate": 4.179170344218888e-05, "loss": 0.7675, "step": 4650 }, { "epoch": 0.822522283999647, "grad_norm": 0.798729419708252, "learning_rate": 4.177405119152692e-05, "loss": 0.702, "step": 4660 }, { "epoch": 0.8242873532786161, "grad_norm": 0.7195820212364197, "learning_rate": 4.175639894086496e-05, "loss": 0.7232, "step": 4670 }, { "epoch": 0.8260524225575854, "grad_norm": 0.9878723621368408, "learning_rate": 4.1738746690203e-05, "loss": 0.7543, "step": 4680 }, { "epoch": 0.8278174918365546, "grad_norm": 2.5027530193328857, "learning_rate": 4.172109443954104e-05, "loss": 0.7169, "step": 4690 }, { "epoch": 0.8295825611155238, "grad_norm": 0.9524794220924377, "learning_rate": 4.1703442188879084e-05, "loss": 0.8512, "step": 4700 }, { "epoch": 0.831347630394493, "grad_norm": 0.9306320548057556, "learning_rate": 4.1685789938217125e-05, "loss": 0.8466, "step": 4710 }, { "epoch": 0.8331126996734621, "grad_norm": 0.6997801661491394, "learning_rate": 4.1668137687555166e-05, "loss": 0.6713, "step": 4720 }, { "epoch": 0.8348777689524314, "grad_norm": 1.0483647584915161, "learning_rate": 4.165048543689321e-05, "loss": 0.7824, "step": 4730 }, { "epoch": 0.8366428382314006, "grad_norm": 1.7400377988815308, "learning_rate": 4.163283318623125e-05, "loss": 0.7119, "step": 4740 }, { "epoch": 0.8384079075103698, "grad_norm": 2.9384422302246094, "learning_rate": 4.161518093556929e-05, "loss": 0.837, "step": 4750 }, { "epoch": 0.840172976789339, "grad_norm": 0.8086302280426025, "learning_rate": 4.159752868490733e-05, "loss": 0.7704, "step": 4760 }, { "epoch": 0.8419380460683081, "grad_norm": 2.740748405456543, "learning_rate": 4.157987643424537e-05, "loss": 0.8491, "step": 4770 }, { "epoch": 0.8437031153472774, "grad_norm": 2.4292073249816895, "learning_rate": 4.1562224183583406e-05, "loss": 0.7163, "step": 4780 }, { "epoch": 0.8454681846262466, "grad_norm": 0.8519812226295471, "learning_rate": 4.154457193292145e-05, "loss": 0.7669, "step": 4790 }, { "epoch": 0.8472332539052158, "grad_norm": 2.4483604431152344, "learning_rate": 4.152691968225949e-05, "loss": 0.6766, "step": 4800 }, { "epoch": 0.848998323184185, "grad_norm": 0.7039122581481934, "learning_rate": 4.150926743159753e-05, "loss": 0.7462, "step": 4810 }, { "epoch": 0.8507633924631541, "grad_norm": 1.0183981657028198, "learning_rate": 4.149161518093557e-05, "loss": 0.7203, "step": 4820 }, { "epoch": 0.8525284617421234, "grad_norm": 2.9556922912597656, "learning_rate": 4.147396293027361e-05, "loss": 0.7431, "step": 4830 }, { "epoch": 0.8542935310210926, "grad_norm": 1.9758566617965698, "learning_rate": 4.145631067961165e-05, "loss": 0.6689, "step": 4840 }, { "epoch": 0.8560586003000618, "grad_norm": 1.072682499885559, "learning_rate": 4.1438658428949694e-05, "loss": 0.7904, "step": 4850 }, { "epoch": 0.857823669579031, "grad_norm": 1.078199028968811, "learning_rate": 4.1421006178287735e-05, "loss": 0.7404, "step": 4860 }, { "epoch": 0.8595887388580001, "grad_norm": 1.5657134056091309, "learning_rate": 4.1403353927625776e-05, "loss": 0.7326, "step": 4870 }, { "epoch": 0.8613538081369694, "grad_norm": 
2.0855634212493896, "learning_rate": 4.138570167696382e-05, "loss": 0.7502, "step": 4880 }, { "epoch": 0.8631188774159386, "grad_norm": 0.7354227304458618, "learning_rate": 4.136804942630186e-05, "loss": 0.6875, "step": 4890 }, { "epoch": 0.8648839466949078, "grad_norm": 0.7951927781105042, "learning_rate": 4.135039717563989e-05, "loss": 0.7122, "step": 4900 }, { "epoch": 0.866649015973877, "grad_norm": 1.1246849298477173, "learning_rate": 4.1332744924977934e-05, "loss": 0.8425, "step": 4910 }, { "epoch": 0.8684140852528461, "grad_norm": 1.5327008962631226, "learning_rate": 4.1315092674315975e-05, "loss": 0.7479, "step": 4920 }, { "epoch": 0.8701791545318154, "grad_norm": 2.5594542026519775, "learning_rate": 4.1297440423654016e-05, "loss": 0.7138, "step": 4930 }, { "epoch": 0.8719442238107846, "grad_norm": 1.085647702217102, "learning_rate": 4.127978817299206e-05, "loss": 0.7799, "step": 4940 }, { "epoch": 0.8737092930897538, "grad_norm": 0.8347994685173035, "learning_rate": 4.12621359223301e-05, "loss": 0.6876, "step": 4950 }, { "epoch": 0.875474362368723, "grad_norm": 1.6027971506118774, "learning_rate": 4.124448367166814e-05, "loss": 0.7479, "step": 4960 }, { "epoch": 0.8772394316476921, "grad_norm": 0.7663355469703674, "learning_rate": 4.122683142100618e-05, "loss": 0.6946, "step": 4970 }, { "epoch": 0.8790045009266614, "grad_norm": 0.6748294234275818, "learning_rate": 4.120917917034422e-05, "loss": 0.623, "step": 4980 }, { "epoch": 0.8807695702056306, "grad_norm": 0.9014782905578613, "learning_rate": 4.119152691968226e-05, "loss": 0.8502, "step": 4990 }, { "epoch": 0.8825346394845998, "grad_norm": 0.7959718108177185, "learning_rate": 4.1173874669020304e-05, "loss": 0.6841, "step": 5000 }, { "epoch": 0.8825346394845998, "eval_loss": 0.7080652713775635, "eval_runtime": 591.5822, "eval_samples_per_second": 47.883, "eval_steps_per_second": 2.395, "eval_token_accuracy": 0.0005015600934351041, "step": 5000 }, { "epoch": 0.884299708763569, "grad_norm": 2.963513135910034, "learning_rate": 4.1156222418358345e-05, "loss": 0.8104, "step": 5010 }, { "epoch": 0.8860647780425381, "grad_norm": 1.7745774984359741, "learning_rate": 4.113857016769638e-05, "loss": 0.7374, "step": 5020 }, { "epoch": 0.8878298473215074, "grad_norm": 0.7581727504730225, "learning_rate": 4.112091791703442e-05, "loss": 0.6546, "step": 5030 }, { "epoch": 0.8895949166004765, "grad_norm": 0.9805993437767029, "learning_rate": 4.110326566637246e-05, "loss": 0.7799, "step": 5040 }, { "epoch": 0.8913599858794458, "grad_norm": 2.659174919128418, "learning_rate": 4.10856134157105e-05, "loss": 0.8281, "step": 5050 }, { "epoch": 0.893125055158415, "grad_norm": 1.8057835102081299, "learning_rate": 4.1067961165048544e-05, "loss": 0.7072, "step": 5060 }, { "epoch": 0.8948901244373841, "grad_norm": 0.9309649467468262, "learning_rate": 4.1050308914386586e-05, "loss": 0.8069, "step": 5070 }, { "epoch": 0.8966551937163534, "grad_norm": 2.148200273513794, "learning_rate": 4.103265666372463e-05, "loss": 0.7694, "step": 5080 }, { "epoch": 0.8984202629953225, "grad_norm": 0.7988691329956055, "learning_rate": 4.101500441306267e-05, "loss": 0.6688, "step": 5090 }, { "epoch": 0.9001853322742918, "grad_norm": 1.795934796333313, "learning_rate": 4.099735216240071e-05, "loss": 0.7281, "step": 5100 }, { "epoch": 0.901950401553261, "grad_norm": 1.2138944864273071, "learning_rate": 4.097969991173875e-05, "loss": 0.6958, "step": 5110 }, { "epoch": 0.9037154708322301, "grad_norm": 0.8563159704208374, "learning_rate": 4.096204766107679e-05, "loss": 0.7041, 
"step": 5120 }, { "epoch": 0.9054805401111994, "grad_norm": 1.0345391035079956, "learning_rate": 4.094439541041483e-05, "loss": 0.6316, "step": 5130 }, { "epoch": 0.9072456093901685, "grad_norm": 1.0438239574432373, "learning_rate": 4.0926743159752874e-05, "loss": 0.7822, "step": 5140 }, { "epoch": 0.9090106786691378, "grad_norm": 2.5692899227142334, "learning_rate": 4.0909090909090915e-05, "loss": 0.8251, "step": 5150 }, { "epoch": 0.910775747948107, "grad_norm": 0.9563241004943848, "learning_rate": 4.0891438658428956e-05, "loss": 0.8355, "step": 5160 }, { "epoch": 0.9125408172270761, "grad_norm": 2.4111404418945312, "learning_rate": 4.087378640776699e-05, "loss": 0.7656, "step": 5170 }, { "epoch": 0.9143058865060454, "grad_norm": 2.1064178943634033, "learning_rate": 4.085613415710503e-05, "loss": 0.6923, "step": 5180 }, { "epoch": 0.9160709557850145, "grad_norm": 1.022700548171997, "learning_rate": 4.083848190644307e-05, "loss": 0.7944, "step": 5190 }, { "epoch": 0.9178360250639838, "grad_norm": 1.8107175827026367, "learning_rate": 4.0820829655781114e-05, "loss": 0.7349, "step": 5200 }, { "epoch": 0.919601094342953, "grad_norm": 2.170214891433716, "learning_rate": 4.0803177405119155e-05, "loss": 0.7676, "step": 5210 }, { "epoch": 0.9213661636219221, "grad_norm": 2.0516133308410645, "learning_rate": 4.0785525154457196e-05, "loss": 0.8233, "step": 5220 }, { "epoch": 0.9231312329008914, "grad_norm": 2.494417428970337, "learning_rate": 4.076787290379524e-05, "loss": 0.7054, "step": 5230 }, { "epoch": 0.9248963021798605, "grad_norm": 2.1171908378601074, "learning_rate": 4.075022065313328e-05, "loss": 0.6854, "step": 5240 }, { "epoch": 0.9266613714588298, "grad_norm": 0.9174603223800659, "learning_rate": 4.073256840247132e-05, "loss": 0.6768, "step": 5250 }, { "epoch": 0.928426440737799, "grad_norm": 2.3948967456817627, "learning_rate": 4.0714916151809354e-05, "loss": 0.8037, "step": 5260 }, { "epoch": 0.9301915100167681, "grad_norm": 1.9202173948287964, "learning_rate": 4.0697263901147395e-05, "loss": 0.747, "step": 5270 }, { "epoch": 0.9319565792957374, "grad_norm": 2.5543434619903564, "learning_rate": 4.0679611650485436e-05, "loss": 0.7877, "step": 5280 }, { "epoch": 0.9337216485747065, "grad_norm": 2.527691602706909, "learning_rate": 4.066195939982348e-05, "loss": 0.7361, "step": 5290 }, { "epoch": 0.9354867178536758, "grad_norm": 1.1392663717269897, "learning_rate": 4.064430714916152e-05, "loss": 0.7346, "step": 5300 }, { "epoch": 0.937251787132645, "grad_norm": 2.4553403854370117, "learning_rate": 4.062665489849956e-05, "loss": 0.8383, "step": 5310 }, { "epoch": 0.9390168564116141, "grad_norm": 1.9993935823440552, "learning_rate": 4.06090026478376e-05, "loss": 0.7392, "step": 5320 }, { "epoch": 0.9407819256905834, "grad_norm": 0.6954941749572754, "learning_rate": 4.059135039717564e-05, "loss": 0.7452, "step": 5330 }, { "epoch": 0.9425469949695525, "grad_norm": 1.6930773258209229, "learning_rate": 4.057369814651368e-05, "loss": 0.7071, "step": 5340 }, { "epoch": 0.9443120642485218, "grad_norm": 0.7938792705535889, "learning_rate": 4.0556045895851724e-05, "loss": 0.6892, "step": 5350 }, { "epoch": 0.9460771335274909, "grad_norm": 0.7790454030036926, "learning_rate": 4.0538393645189765e-05, "loss": 0.6887, "step": 5360 }, { "epoch": 0.9478422028064601, "grad_norm": 1.0746580362319946, "learning_rate": 4.0520741394527806e-05, "loss": 0.7402, "step": 5370 }, { "epoch": 0.9496072720854294, "grad_norm": 2.610752582550049, "learning_rate": 4.050308914386585e-05, "loss": 0.7915, "step": 5380 }, { 
"epoch": 0.9513723413643985, "grad_norm": 2.0769665241241455, "learning_rate": 4.048543689320389e-05, "loss": 0.6858, "step": 5390 }, { "epoch": 0.9531374106433678, "grad_norm": 0.7704333662986755, "learning_rate": 4.046778464254193e-05, "loss": 0.7199, "step": 5400 }, { "epoch": 0.9549024799223369, "grad_norm": 1.770251750946045, "learning_rate": 4.045013239187997e-05, "loss": 0.6665, "step": 5410 }, { "epoch": 0.9566675492013061, "grad_norm": 3.3667657375335693, "learning_rate": 4.043248014121801e-05, "loss": 0.6992, "step": 5420 }, { "epoch": 0.9584326184802754, "grad_norm": 2.0577900409698486, "learning_rate": 4.0414827890556047e-05, "loss": 0.728, "step": 5430 }, { "epoch": 0.9601976877592445, "grad_norm": 0.8403862118721008, "learning_rate": 4.039717563989409e-05, "loss": 0.8615, "step": 5440 }, { "epoch": 0.9619627570382138, "grad_norm": 2.633230686187744, "learning_rate": 4.037952338923213e-05, "loss": 0.7057, "step": 5450 }, { "epoch": 0.9637278263171829, "grad_norm": 1.1049001216888428, "learning_rate": 4.036187113857017e-05, "loss": 0.755, "step": 5460 }, { "epoch": 0.9654928955961521, "grad_norm": 2.386627435684204, "learning_rate": 4.034421888790821e-05, "loss": 0.7846, "step": 5470 }, { "epoch": 0.9672579648751214, "grad_norm": 2.9040069580078125, "learning_rate": 4.032656663724625e-05, "loss": 0.7653, "step": 5480 }, { "epoch": 0.9690230341540905, "grad_norm": 2.9959592819213867, "learning_rate": 4.0308914386584287e-05, "loss": 0.7296, "step": 5490 }, { "epoch": 0.9707881034330598, "grad_norm": 3.807882785797119, "learning_rate": 4.029126213592233e-05, "loss": 0.7168, "step": 5500 }, { "epoch": 0.9725531727120289, "grad_norm": 1.1350913047790527, "learning_rate": 4.027360988526037e-05, "loss": 0.7343, "step": 5510 }, { "epoch": 0.9743182419909981, "grad_norm": 0.6422159671783447, "learning_rate": 4.025595763459841e-05, "loss": 0.6593, "step": 5520 }, { "epoch": 0.9760833112699674, "grad_norm": 1.7712031602859497, "learning_rate": 4.023830538393645e-05, "loss": 0.6703, "step": 5530 }, { "epoch": 0.9778483805489365, "grad_norm": 3.241684675216675, "learning_rate": 4.022065313327449e-05, "loss": 0.7294, "step": 5540 }, { "epoch": 0.9796134498279058, "grad_norm": 2.6312735080718994, "learning_rate": 4.0203000882612533e-05, "loss": 0.6784, "step": 5550 }, { "epoch": 0.9813785191068749, "grad_norm": 0.8754311800003052, "learning_rate": 4.0185348631950575e-05, "loss": 0.7842, "step": 5560 }, { "epoch": 0.9831435883858441, "grad_norm": 2.187657594680786, "learning_rate": 4.0167696381288616e-05, "loss": 0.7507, "step": 5570 }, { "epoch": 0.9849086576648134, "grad_norm": 0.8857598304748535, "learning_rate": 4.015004413062666e-05, "loss": 0.761, "step": 5580 }, { "epoch": 0.9866737269437825, "grad_norm": 1.6948868036270142, "learning_rate": 4.01323918799647e-05, "loss": 0.7893, "step": 5590 }, { "epoch": 0.9884387962227518, "grad_norm": 2.731844902038574, "learning_rate": 4.011473962930274e-05, "loss": 0.752, "step": 5600 }, { "epoch": 0.9902038655017209, "grad_norm": 2.4988324642181396, "learning_rate": 4.009708737864078e-05, "loss": 0.6898, "step": 5610 }, { "epoch": 0.9919689347806901, "grad_norm": 2.6746983528137207, "learning_rate": 4.007943512797882e-05, "loss": 0.7696, "step": 5620 }, { "epoch": 0.9937340040596594, "grad_norm": 0.6741234064102173, "learning_rate": 4.006178287731686e-05, "loss": 0.6574, "step": 5630 }, { "epoch": 0.9954990733386285, "grad_norm": 0.7766237258911133, "learning_rate": 4.0044130626654904e-05, "loss": 0.6623, "step": 5640 }, { "epoch": 
0.9972641426175978, "grad_norm": 0.9466866254806519, "learning_rate": 4.0026478375992945e-05, "loss": 0.7363, "step": 5650 }, { "epoch": 0.9990292118965669, "grad_norm": 2.129058599472046, "learning_rate": 4.0008826125330986e-05, "loss": 0.6279, "step": 5660 }, { "epoch": 1.0007942811755361, "grad_norm": 1.1645121574401855, "learning_rate": 3.999117387466903e-05, "loss": 0.7561, "step": 5670 }, { "epoch": 1.0025593504545054, "grad_norm": 0.7823759913444519, "learning_rate": 3.997352162400707e-05, "loss": 0.5656, "step": 5680 }, { "epoch": 1.0043244197334746, "grad_norm": 1.0414769649505615, "learning_rate": 3.995586937334511e-05, "loss": 0.567, "step": 5690 }, { "epoch": 1.0060894890124437, "grad_norm": 1.495110273361206, "learning_rate": 3.9938217122683144e-05, "loss": 0.6325, "step": 5700 }, { "epoch": 1.007854558291413, "grad_norm": 0.8955295085906982, "learning_rate": 3.9920564872021185e-05, "loss": 0.7344, "step": 5710 }, { "epoch": 1.0096196275703821, "grad_norm": 2.1746819019317627, "learning_rate": 3.9902912621359226e-05, "loss": 0.6629, "step": 5720 }, { "epoch": 1.0113846968493514, "grad_norm": 0.9131489396095276, "learning_rate": 3.988526037069726e-05, "loss": 0.5815, "step": 5730 }, { "epoch": 1.0131497661283206, "grad_norm": 1.0117508172988892, "learning_rate": 3.98676081200353e-05, "loss": 0.5926, "step": 5740 }, { "epoch": 1.0149148354072897, "grad_norm": 1.6003496646881104, "learning_rate": 3.984995586937334e-05, "loss": 0.5605, "step": 5750 }, { "epoch": 1.016679904686259, "grad_norm": 1.622113823890686, "learning_rate": 3.9832303618711384e-05, "loss": 0.6624, "step": 5760 }, { "epoch": 1.0184449739652282, "grad_norm": 2.4824321269989014, "learning_rate": 3.9814651368049425e-05, "loss": 0.6636, "step": 5770 }, { "epoch": 1.0202100432441974, "grad_norm": 3.0618515014648438, "learning_rate": 3.9796999117387466e-05, "loss": 0.7127, "step": 5780 }, { "epoch": 1.0219751125231666, "grad_norm": 1.2835907936096191, "learning_rate": 3.977934686672551e-05, "loss": 0.5987, "step": 5790 }, { "epoch": 1.0237401818021357, "grad_norm": 2.8061492443084717, "learning_rate": 3.976169461606355e-05, "loss": 0.7181, "step": 5800 }, { "epoch": 1.025505251081105, "grad_norm": 1.2812072038650513, "learning_rate": 3.974404236540159e-05, "loss": 0.6901, "step": 5810 }, { "epoch": 1.0272703203600742, "grad_norm": 4.024072647094727, "learning_rate": 3.972639011473963e-05, "loss": 0.5881, "step": 5820 }, { "epoch": 1.0290353896390434, "grad_norm": 2.0519351959228516, "learning_rate": 3.970873786407767e-05, "loss": 0.6031, "step": 5830 }, { "epoch": 1.0308004589180126, "grad_norm": 1.9414854049682617, "learning_rate": 3.969108561341571e-05, "loss": 0.659, "step": 5840 }, { "epoch": 1.0325655281969817, "grad_norm": 0.7736496925354004, "learning_rate": 3.9673433362753754e-05, "loss": 0.6354, "step": 5850 }, { "epoch": 1.034330597475951, "grad_norm": 2.337108850479126, "learning_rate": 3.9655781112091795e-05, "loss": 0.6873, "step": 5860 }, { "epoch": 1.0360956667549202, "grad_norm": 2.7758662700653076, "learning_rate": 3.9638128861429837e-05, "loss": 0.619, "step": 5870 }, { "epoch": 1.0378607360338894, "grad_norm": 0.7028997540473938, "learning_rate": 3.962047661076788e-05, "loss": 0.5763, "step": 5880 }, { "epoch": 1.0396258053128586, "grad_norm": 1.7901548147201538, "learning_rate": 3.960282436010592e-05, "loss": 0.5724, "step": 5890 }, { "epoch": 1.0413908745918277, "grad_norm": 2.513047218322754, "learning_rate": 3.958517210944396e-05, "loss": 0.7337, "step": 5900 }, { "epoch": 1.043155943870797, 
"grad_norm": 1.1275721788406372, "learning_rate": 3.9567519858782e-05, "loss": 0.6653, "step": 5910 }, { "epoch": 1.0449210131497662, "grad_norm": 0.7849834561347961, "learning_rate": 3.954986760812004e-05, "loss": 0.6944, "step": 5920 }, { "epoch": 1.0466860824287354, "grad_norm": 2.418022632598877, "learning_rate": 3.953221535745808e-05, "loss": 0.6294, "step": 5930 }, { "epoch": 1.0484511517077044, "grad_norm": 1.0242151021957397, "learning_rate": 3.951456310679612e-05, "loss": 0.6257, "step": 5940 }, { "epoch": 1.0502162209866737, "grad_norm": 1.2218064069747925, "learning_rate": 3.949691085613416e-05, "loss": 0.6824, "step": 5950 }, { "epoch": 1.051981290265643, "grad_norm": 2.2518460750579834, "learning_rate": 3.94792586054722e-05, "loss": 0.7105, "step": 5960 }, { "epoch": 1.0537463595446122, "grad_norm": 0.7626746892929077, "learning_rate": 3.9461606354810235e-05, "loss": 0.5903, "step": 5970 }, { "epoch": 1.0555114288235814, "grad_norm": 2.1651909351348877, "learning_rate": 3.9443954104148276e-05, "loss": 0.5997, "step": 5980 }, { "epoch": 1.0572764981025504, "grad_norm": 3.152777910232544, "learning_rate": 3.942630185348632e-05, "loss": 0.7502, "step": 5990 }, { "epoch": 1.0590415673815197, "grad_norm": 0.6665771007537842, "learning_rate": 3.940864960282436e-05, "loss": 0.6785, "step": 6000 }, { "epoch": 1.0590415673815197, "eval_loss": 0.6956175565719604, "eval_runtime": 591.8022, "eval_samples_per_second": 47.866, "eval_steps_per_second": 2.394, "eval_token_accuracy": 0.0005115060442964234, "step": 6000 }, { "epoch": 1.060806636660489, "grad_norm": 1.2140997648239136, "learning_rate": 3.93909973521624e-05, "loss": 0.6932, "step": 6010 }, { "epoch": 1.0625717059394582, "grad_norm": 1.9795242547988892, "learning_rate": 3.937511032656664e-05, "loss": 0.6777, "step": 6020 }, { "epoch": 1.0643367752184274, "grad_norm": 2.0826520919799805, "learning_rate": 3.935745807590468e-05, "loss": 0.7064, "step": 6030 }, { "epoch": 1.0661018444973964, "grad_norm": 2.58691143989563, "learning_rate": 3.933980582524272e-05, "loss": 0.7037, "step": 6040 }, { "epoch": 1.0678669137763657, "grad_norm": 2.3377749919891357, "learning_rate": 3.932215357458076e-05, "loss": 0.6285, "step": 6050 }, { "epoch": 1.069631983055335, "grad_norm": 2.8440327644348145, "learning_rate": 3.93045013239188e-05, "loss": 0.6828, "step": 6060 }, { "epoch": 1.0713970523343042, "grad_norm": 1.790805459022522, "learning_rate": 3.928684907325684e-05, "loss": 0.5888, "step": 6070 }, { "epoch": 1.0731621216132734, "grad_norm": 0.8189541697502136, "learning_rate": 3.9269196822594884e-05, "loss": 0.6213, "step": 6080 }, { "epoch": 1.0749271908922424, "grad_norm": 1.0348247289657593, "learning_rate": 3.9251544571932925e-05, "loss": 0.6106, "step": 6090 }, { "epoch": 1.0766922601712117, "grad_norm": 1.8420778512954712, "learning_rate": 3.9233892321270966e-05, "loss": 0.6458, "step": 6100 }, { "epoch": 1.078457329450181, "grad_norm": 0.9982885122299194, "learning_rate": 3.921624007060901e-05, "loss": 0.6307, "step": 6110 }, { "epoch": 1.0802223987291502, "grad_norm": 3.139690399169922, "learning_rate": 3.919858781994705e-05, "loss": 0.6454, "step": 6120 }, { "epoch": 1.0819874680081194, "grad_norm": 0.9377104640007019, "learning_rate": 3.918093556928509e-05, "loss": 0.7181, "step": 6130 }, { "epoch": 1.0837525372870884, "grad_norm": 0.8897203207015991, "learning_rate": 3.916328331862313e-05, "loss": 0.5477, "step": 6140 }, { "epoch": 1.0855176065660577, "grad_norm": 2.0230560302734375, "learning_rate": 3.914563106796117e-05, "loss": 
0.6154, "step": 6150 }, { "epoch": 1.087282675845027, "grad_norm": 1.1352148056030273, "learning_rate": 3.912797881729921e-05, "loss": 0.658, "step": 6160 }, { "epoch": 1.0890477451239962, "grad_norm": 1.4181119203567505, "learning_rate": 3.911032656663725e-05, "loss": 0.5686, "step": 6170 }, { "epoch": 1.0908128144029654, "grad_norm": 2.281613349914551, "learning_rate": 3.909267431597529e-05, "loss": 0.6839, "step": 6180 }, { "epoch": 1.0925778836819344, "grad_norm": 2.092272996902466, "learning_rate": 3.907502206531333e-05, "loss": 0.6573, "step": 6190 }, { "epoch": 1.0943429529609037, "grad_norm": 1.9302388429641724, "learning_rate": 3.905736981465137e-05, "loss": 0.5752, "step": 6200 }, { "epoch": 1.096108022239873, "grad_norm": 0.6753134727478027, "learning_rate": 3.903971756398941e-05, "loss": 0.6363, "step": 6210 }, { "epoch": 1.0978730915188422, "grad_norm": 3.038670539855957, "learning_rate": 3.9022065313327446e-05, "loss": 0.606, "step": 6220 }, { "epoch": 1.0996381607978114, "grad_norm": 0.8844441175460815, "learning_rate": 3.900441306266549e-05, "loss": 0.5935, "step": 6230 }, { "epoch": 1.1014032300767804, "grad_norm": 3.831115245819092, "learning_rate": 3.898676081200353e-05, "loss": 0.7762, "step": 6240 }, { "epoch": 1.1031682993557497, "grad_norm": 0.9423919916152954, "learning_rate": 3.896910856134157e-05, "loss": 0.6149, "step": 6250 }, { "epoch": 1.104933368634719, "grad_norm": 0.7882218956947327, "learning_rate": 3.895145631067961e-05, "loss": 0.5544, "step": 6260 }, { "epoch": 1.1066984379136882, "grad_norm": 3.2080211639404297, "learning_rate": 3.893380406001765e-05, "loss": 0.6712, "step": 6270 }, { "epoch": 1.1084635071926574, "grad_norm": 0.7996103167533875, "learning_rate": 3.891615180935569e-05, "loss": 0.5802, "step": 6280 }, { "epoch": 1.1102285764716264, "grad_norm": 1.8485013246536255, "learning_rate": 3.8898499558693734e-05, "loss": 0.6356, "step": 6290 }, { "epoch": 1.1119936457505957, "grad_norm": 1.0424914360046387, "learning_rate": 3.8880847308031775e-05, "loss": 0.6124, "step": 6300 }, { "epoch": 1.113758715029565, "grad_norm": 1.83061683177948, "learning_rate": 3.8863195057369816e-05, "loss": 0.718, "step": 6310 }, { "epoch": 1.1155237843085342, "grad_norm": 1.0995100736618042, "learning_rate": 3.884554280670786e-05, "loss": 0.6101, "step": 6320 }, { "epoch": 1.1172888535875034, "grad_norm": 1.6366883516311646, "learning_rate": 3.88278905560459e-05, "loss": 0.6742, "step": 6330 }, { "epoch": 1.1190539228664724, "grad_norm": 2.9243485927581787, "learning_rate": 3.881023830538394e-05, "loss": 0.6337, "step": 6340 }, { "epoch": 1.1208189921454417, "grad_norm": 3.8549795150756836, "learning_rate": 3.879258605472198e-05, "loss": 0.649, "step": 6350 }, { "epoch": 1.122584061424411, "grad_norm": 2.9887771606445312, "learning_rate": 3.877493380406002e-05, "loss": 0.6472, "step": 6360 }, { "epoch": 1.1243491307033802, "grad_norm": 3.433417797088623, "learning_rate": 3.875728155339806e-05, "loss": 0.7064, "step": 6370 }, { "epoch": 1.1261141999823492, "grad_norm": 1.2327500581741333, "learning_rate": 3.8739629302736105e-05, "loss": 0.8526, "step": 6380 }, { "epoch": 1.1278792692613184, "grad_norm": 1.535632848739624, "learning_rate": 3.8721977052074146e-05, "loss": 0.7019, "step": 6390 }, { "epoch": 1.1296443385402877, "grad_norm": 2.638556718826294, "learning_rate": 3.870432480141219e-05, "loss": 0.6715, "step": 6400 }, { "epoch": 1.131409407819257, "grad_norm": 2.2223639488220215, "learning_rate": 3.868667255075023e-05, "loss": 0.628, "step": 6410 }, { 
"epoch": 1.1331744770982262, "grad_norm": 0.6442582011222839, "learning_rate": 3.866902030008826e-05, "loss": 0.5226, "step": 6420 }, { "epoch": 1.1349395463771952, "grad_norm": 0.9084598422050476, "learning_rate": 3.8651368049426303e-05, "loss": 0.6445, "step": 6430 }, { "epoch": 1.1367046156561644, "grad_norm": 0.926341712474823, "learning_rate": 3.8633715798764345e-05, "loss": 0.6636, "step": 6440 }, { "epoch": 1.1384696849351337, "grad_norm": 0.653100848197937, "learning_rate": 3.8616063548102386e-05, "loss": 0.5924, "step": 6450 }, { "epoch": 1.140234754214103, "grad_norm": 1.7288234233856201, "learning_rate": 3.859841129744042e-05, "loss": 0.7612, "step": 6460 }, { "epoch": 1.1419998234930722, "grad_norm": 2.005732536315918, "learning_rate": 3.858075904677846e-05, "loss": 0.6961, "step": 6470 }, { "epoch": 1.1437648927720412, "grad_norm": 1.5119543075561523, "learning_rate": 3.85631067961165e-05, "loss": 0.5927, "step": 6480 }, { "epoch": 1.1455299620510104, "grad_norm": 0.7860826849937439, "learning_rate": 3.8545454545454544e-05, "loss": 0.6218, "step": 6490 }, { "epoch": 1.1472950313299797, "grad_norm": 0.9668664336204529, "learning_rate": 3.8527802294792585e-05, "loss": 0.6406, "step": 6500 }, { "epoch": 1.149060100608949, "grad_norm": 1.3885242938995361, "learning_rate": 3.8510150044130626e-05, "loss": 0.6488, "step": 6510 }, { "epoch": 1.1508251698879182, "grad_norm": 2.342130422592163, "learning_rate": 3.849249779346867e-05, "loss": 0.6748, "step": 6520 }, { "epoch": 1.1525902391668872, "grad_norm": 1.0215253829956055, "learning_rate": 3.847484554280671e-05, "loss": 0.6612, "step": 6530 }, { "epoch": 1.1543553084458564, "grad_norm": 1.2095513343811035, "learning_rate": 3.845719329214475e-05, "loss": 0.6519, "step": 6540 }, { "epoch": 1.1561203777248257, "grad_norm": 2.6220827102661133, "learning_rate": 3.843954104148279e-05, "loss": 0.6615, "step": 6550 }, { "epoch": 1.157885447003795, "grad_norm": 1.501598596572876, "learning_rate": 3.842188879082083e-05, "loss": 0.6628, "step": 6560 }, { "epoch": 1.1596505162827642, "grad_norm": 0.7498106360435486, "learning_rate": 3.840423654015887e-05, "loss": 0.5555, "step": 6570 }, { "epoch": 1.1614155855617332, "grad_norm": 0.6536363363265991, "learning_rate": 3.8386584289496914e-05, "loss": 0.6255, "step": 6580 }, { "epoch": 1.1631806548407024, "grad_norm": 2.851771831512451, "learning_rate": 3.8368932038834955e-05, "loss": 0.7082, "step": 6590 }, { "epoch": 1.1649457241196717, "grad_norm": 1.1639115810394287, "learning_rate": 3.8351279788172996e-05, "loss": 0.6362, "step": 6600 }, { "epoch": 1.166710793398641, "grad_norm": 2.0477452278137207, "learning_rate": 3.833362753751104e-05, "loss": 0.6903, "step": 6610 }, { "epoch": 1.1684758626776102, "grad_norm": 0.8935057520866394, "learning_rate": 3.831597528684908e-05, "loss": 0.69, "step": 6620 }, { "epoch": 1.1702409319565792, "grad_norm": 0.8473203778266907, "learning_rate": 3.829832303618712e-05, "loss": 0.5369, "step": 6630 }, { "epoch": 1.1720060012355484, "grad_norm": 0.7587894797325134, "learning_rate": 3.828067078552516e-05, "loss": 0.6086, "step": 6640 }, { "epoch": 1.1737710705145177, "grad_norm": 2.5984623432159424, "learning_rate": 3.82630185348632e-05, "loss": 0.6499, "step": 6650 }, { "epoch": 1.175536139793487, "grad_norm": 1.1337202787399292, "learning_rate": 3.8245366284201236e-05, "loss": 0.6571, "step": 6660 }, { "epoch": 1.1773012090724562, "grad_norm": 0.6955274939537048, "learning_rate": 3.822771403353928e-05, "loss": 0.6734, "step": 6670 }, { "epoch": 
1.1790662783514252, "grad_norm": 2.8740875720977783, "learning_rate": 3.821006178287732e-05, "loss": 0.7027, "step": 6680 }, { "epoch": 1.1808313476303944, "grad_norm": 2.8766727447509766, "learning_rate": 3.819240953221536e-05, "loss": 0.7541, "step": 6690 }, { "epoch": 1.1825964169093637, "grad_norm": 3.3576607704162598, "learning_rate": 3.8174757281553394e-05, "loss": 0.6901, "step": 6700 }, { "epoch": 1.184361486188333, "grad_norm": 1.2829562425613403, "learning_rate": 3.8157105030891435e-05, "loss": 0.6446, "step": 6710 }, { "epoch": 1.1861265554673022, "grad_norm": 0.8178977370262146, "learning_rate": 3.8139452780229476e-05, "loss": 0.5423, "step": 6720 }, { "epoch": 1.1878916247462712, "grad_norm": 0.8948667645454407, "learning_rate": 3.812180052956752e-05, "loss": 0.6114, "step": 6730 }, { "epoch": 1.1896566940252404, "grad_norm": 0.9846989512443542, "learning_rate": 3.810414827890556e-05, "loss": 0.6515, "step": 6740 }, { "epoch": 1.1914217633042097, "grad_norm": 0.9987642765045166, "learning_rate": 3.80864960282436e-05, "loss": 0.7103, "step": 6750 }, { "epoch": 1.193186832583179, "grad_norm": 2.2832117080688477, "learning_rate": 3.806884377758164e-05, "loss": 0.6537, "step": 6760 }, { "epoch": 1.1949519018621482, "grad_norm": 1.747611403465271, "learning_rate": 3.805119152691968e-05, "loss": 0.5134, "step": 6770 }, { "epoch": 1.1967169711411172, "grad_norm": 2.011439561843872, "learning_rate": 3.803353927625772e-05, "loss": 0.6191, "step": 6780 }, { "epoch": 1.1984820404200864, "grad_norm": 1.4673908948898315, "learning_rate": 3.8015887025595764e-05, "loss": 0.601, "step": 6790 }, { "epoch": 1.2002471096990557, "grad_norm": 0.9508843421936035, "learning_rate": 3.7998234774933806e-05, "loss": 0.5438, "step": 6800 }, { "epoch": 1.202012178978025, "grad_norm": 0.8046093583106995, "learning_rate": 3.798058252427185e-05, "loss": 0.6197, "step": 6810 }, { "epoch": 1.2037772482569942, "grad_norm": 0.741568386554718, "learning_rate": 3.796293027360989e-05, "loss": 0.6795, "step": 6820 }, { "epoch": 1.2055423175359632, "grad_norm": 0.751842200756073, "learning_rate": 3.794527802294793e-05, "loss": 0.626, "step": 6830 }, { "epoch": 1.2073073868149324, "grad_norm": 1.0445666313171387, "learning_rate": 3.792762577228597e-05, "loss": 0.7252, "step": 6840 }, { "epoch": 1.2090724560939017, "grad_norm": 0.7413420677185059, "learning_rate": 3.790997352162401e-05, "loss": 0.6268, "step": 6850 }, { "epoch": 1.210837525372871, "grad_norm": 0.6899815201759338, "learning_rate": 3.789232127096205e-05, "loss": 0.643, "step": 6860 }, { "epoch": 1.2126025946518402, "grad_norm": 2.5429368019104004, "learning_rate": 3.7874669020300094e-05, "loss": 0.6941, "step": 6870 }, { "epoch": 1.2143676639308092, "grad_norm": 0.969836950302124, "learning_rate": 3.7857016769638135e-05, "loss": 0.6907, "step": 6880 }, { "epoch": 1.2161327332097784, "grad_norm": 0.7424036860466003, "learning_rate": 3.7839364518976176e-05, "loss": 0.6324, "step": 6890 }, { "epoch": 1.2178978024887477, "grad_norm": 2.565000057220459, "learning_rate": 3.782171226831421e-05, "loss": 0.6324, "step": 6900 }, { "epoch": 1.219662871767717, "grad_norm": 1.3518567085266113, "learning_rate": 3.780406001765225e-05, "loss": 0.6398, "step": 6910 }, { "epoch": 1.2214279410466862, "grad_norm": 0.7438364624977112, "learning_rate": 3.778640776699029e-05, "loss": 0.6396, "step": 6920 }, { "epoch": 1.2231930103256552, "grad_norm": 0.72452712059021, "learning_rate": 3.7768755516328334e-05, "loss": 0.6357, "step": 6930 }, { "epoch": 1.2249580796046244, 
"grad_norm": 0.9610331058502197, "learning_rate": 3.7751103265666375e-05, "loss": 0.6202, "step": 6940 }, { "epoch": 1.2267231488835937, "grad_norm": 0.8247737288475037, "learning_rate": 3.7733451015004416e-05, "loss": 0.5992, "step": 6950 }, { "epoch": 1.228488218162563, "grad_norm": 1.0646226406097412, "learning_rate": 3.771579876434246e-05, "loss": 0.6119, "step": 6960 }, { "epoch": 1.2302532874415322, "grad_norm": 0.8103353977203369, "learning_rate": 3.769814651368049e-05, "loss": 0.6086, "step": 6970 }, { "epoch": 1.2320183567205012, "grad_norm": 0.9309709072113037, "learning_rate": 3.768049426301853e-05, "loss": 0.6288, "step": 6980 }, { "epoch": 1.2337834259994704, "grad_norm": 0.6081012487411499, "learning_rate": 3.7662842012356574e-05, "loss": 0.5723, "step": 6990 }, { "epoch": 1.2355484952784397, "grad_norm": 2.909627914428711, "learning_rate": 3.7645189761694615e-05, "loss": 0.7148, "step": 7000 }, { "epoch": 1.2355484952784397, "eval_loss": 0.6834109425544739, "eval_runtime": 591.6374, "eval_samples_per_second": 47.879, "eval_steps_per_second": 2.395, "eval_token_accuracy": 0.0004970945644769607, "step": 7000 }, { "epoch": 1.237313564557409, "grad_norm": 3.0048515796661377, "learning_rate": 3.7627537511032656e-05, "loss": 0.574, "step": 7010 }, { "epoch": 1.2390786338363782, "grad_norm": 0.748912513256073, "learning_rate": 3.76098852603707e-05, "loss": 0.6341, "step": 7020 }, { "epoch": 1.2408437031153472, "grad_norm": 0.8313178420066833, "learning_rate": 3.759223300970874e-05, "loss": 0.6569, "step": 7030 }, { "epoch": 1.2426087723943164, "grad_norm": 0.7276086211204529, "learning_rate": 3.757458075904678e-05, "loss": 0.6759, "step": 7040 }, { "epoch": 1.2443738416732857, "grad_norm": 2.707221508026123, "learning_rate": 3.755692850838482e-05, "loss": 0.6932, "step": 7050 }, { "epoch": 1.246138910952255, "grad_norm": 0.9770756959915161, "learning_rate": 3.753927625772286e-05, "loss": 0.6494, "step": 7060 }, { "epoch": 1.2479039802312242, "grad_norm": 0.7149855494499207, "learning_rate": 3.75216240070609e-05, "loss": 0.615, "step": 7070 }, { "epoch": 1.2496690495101932, "grad_norm": 0.9433127045631409, "learning_rate": 3.7503971756398944e-05, "loss": 0.6613, "step": 7080 }, { "epoch": 1.2514341187891624, "grad_norm": 0.9925957322120667, "learning_rate": 3.7486319505736985e-05, "loss": 0.7038, "step": 7090 }, { "epoch": 1.2531991880681317, "grad_norm": 2.517293691635132, "learning_rate": 3.7468667255075026e-05, "loss": 0.7463, "step": 7100 }, { "epoch": 1.254964257347101, "grad_norm": 2.6264331340789795, "learning_rate": 3.745101500441307e-05, "loss": 0.6378, "step": 7110 }, { "epoch": 1.2567293266260702, "grad_norm": 1.0302218198776245, "learning_rate": 3.743336275375111e-05, "loss": 0.6998, "step": 7120 }, { "epoch": 1.2584943959050392, "grad_norm": 0.9635699391365051, "learning_rate": 3.741571050308914e-05, "loss": 0.5448, "step": 7130 }, { "epoch": 1.2602594651840084, "grad_norm": 0.65135258436203, "learning_rate": 3.7398058252427184e-05, "loss": 0.619, "step": 7140 }, { "epoch": 1.2620245344629777, "grad_norm": 0.7986999154090881, "learning_rate": 3.7380406001765225e-05, "loss": 0.7834, "step": 7150 }, { "epoch": 1.263789603741947, "grad_norm": 0.9011120200157166, "learning_rate": 3.7362753751103266e-05, "loss": 0.6495, "step": 7160 }, { "epoch": 1.2655546730209162, "grad_norm": 2.6666321754455566, "learning_rate": 3.734510150044131e-05, "loss": 0.6368, "step": 7170 }, { "epoch": 1.2673197422998852, "grad_norm": 2.041754722595215, "learning_rate": 3.732744924977935e-05, 
"loss": 0.6377, "step": 7180 }, { "epoch": 1.2690848115788544, "grad_norm": 0.9585306644439697, "learning_rate": 3.730979699911739e-05, "loss": 0.6688, "step": 7190 }, { "epoch": 1.2708498808578237, "grad_norm": 2.1470253467559814, "learning_rate": 3.729214474845543e-05, "loss": 0.6592, "step": 7200 }, { "epoch": 1.272614950136793, "grad_norm": 3.2827067375183105, "learning_rate": 3.727449249779347e-05, "loss": 0.6111, "step": 7210 }, { "epoch": 1.2743800194157622, "grad_norm": 0.5595322251319885, "learning_rate": 3.725684024713151e-05, "loss": 0.5442, "step": 7220 }, { "epoch": 1.2761450886947312, "grad_norm": 1.8486113548278809, "learning_rate": 3.723918799646955e-05, "loss": 0.6346, "step": 7230 }, { "epoch": 1.2779101579737004, "grad_norm": 2.427776336669922, "learning_rate": 3.722153574580759e-05, "loss": 0.6365, "step": 7240 }, { "epoch": 1.2796752272526697, "grad_norm": 0.7573685646057129, "learning_rate": 3.720388349514563e-05, "loss": 0.5591, "step": 7250 }, { "epoch": 1.281440296531639, "grad_norm": 3.019432783126831, "learning_rate": 3.718623124448367e-05, "loss": 0.6491, "step": 7260 }, { "epoch": 1.2832053658106082, "grad_norm": 0.7502275109291077, "learning_rate": 3.716857899382171e-05, "loss": 0.5882, "step": 7270 }, { "epoch": 1.2849704350895772, "grad_norm": 0.9686793684959412, "learning_rate": 3.7150926743159753e-05, "loss": 0.5767, "step": 7280 }, { "epoch": 1.2867355043685464, "grad_norm": 1.0646675825119019, "learning_rate": 3.7133274492497795e-05, "loss": 0.5611, "step": 7290 }, { "epoch": 1.2885005736475157, "grad_norm": 0.9285513162612915, "learning_rate": 3.7115622241835836e-05, "loss": 0.5345, "step": 7300 }, { "epoch": 1.290265642926485, "grad_norm": 2.482447624206543, "learning_rate": 3.709796999117388e-05, "loss": 0.6679, "step": 7310 }, { "epoch": 1.2920307122054542, "grad_norm": 0.7033592462539673, "learning_rate": 3.708031774051192e-05, "loss": 0.657, "step": 7320 }, { "epoch": 1.2937957814844232, "grad_norm": 1.1254754066467285, "learning_rate": 3.706266548984996e-05, "loss": 0.6271, "step": 7330 }, { "epoch": 1.2955608507633924, "grad_norm": 2.424238920211792, "learning_rate": 3.7045013239188e-05, "loss": 0.6206, "step": 7340 }, { "epoch": 1.2973259200423617, "grad_norm": 1.2514790296554565, "learning_rate": 3.702736098852604e-05, "loss": 0.6148, "step": 7350 }, { "epoch": 1.299090989321331, "grad_norm": 2.183605909347534, "learning_rate": 3.700970873786408e-05, "loss": 0.6309, "step": 7360 }, { "epoch": 1.3008560586003002, "grad_norm": 2.991091251373291, "learning_rate": 3.699205648720212e-05, "loss": 0.6264, "step": 7370 }, { "epoch": 1.3026211278792692, "grad_norm": 2.2229976654052734, "learning_rate": 3.697440423654016e-05, "loss": 0.665, "step": 7380 }, { "epoch": 1.3043861971582384, "grad_norm": 0.7638149857521057, "learning_rate": 3.69567519858782e-05, "loss": 0.682, "step": 7390 }, { "epoch": 1.3061512664372077, "grad_norm": 2.2151546478271484, "learning_rate": 3.693909973521624e-05, "loss": 0.6459, "step": 7400 }, { "epoch": 1.307916335716177, "grad_norm": 1.0009431838989258, "learning_rate": 3.692144748455428e-05, "loss": 0.6326, "step": 7410 }, { "epoch": 1.3096814049951462, "grad_norm": 2.3861541748046875, "learning_rate": 3.690379523389232e-05, "loss": 0.6182, "step": 7420 }, { "epoch": 1.3114464742741152, "grad_norm": 0.7620774507522583, "learning_rate": 3.6886142983230364e-05, "loss": 0.6425, "step": 7430 }, { "epoch": 1.3132115435530844, "grad_norm": 1.1126807928085327, "learning_rate": 3.6868490732568405e-05, "loss": 0.6115, "step": 7440 
}, { "epoch": 1.3149766128320537, "grad_norm": 1.0040702819824219, "learning_rate": 3.6850838481906446e-05, "loss": 0.644, "step": 7450 }, { "epoch": 1.316741682111023, "grad_norm": 2.9938085079193115, "learning_rate": 3.683318623124449e-05, "loss": 0.6142, "step": 7460 }, { "epoch": 1.3185067513899922, "grad_norm": 2.7956087589263916, "learning_rate": 3.681553398058253e-05, "loss": 0.6253, "step": 7470 }, { "epoch": 1.3202718206689612, "grad_norm": 1.8580163717269897, "learning_rate": 3.679788172992057e-05, "loss": 0.607, "step": 7480 }, { "epoch": 1.3220368899479304, "grad_norm": 2.707329511642456, "learning_rate": 3.678022947925861e-05, "loss": 0.6795, "step": 7490 }, { "epoch": 1.3238019592268997, "grad_norm": 0.6467975378036499, "learning_rate": 3.6762577228596645e-05, "loss": 0.5137, "step": 7500 }, { "epoch": 1.325567028505869, "grad_norm": 0.9823447465896606, "learning_rate": 3.6744924977934686e-05, "loss": 0.74, "step": 7510 }, { "epoch": 1.3273320977848382, "grad_norm": 0.680709183216095, "learning_rate": 3.672727272727273e-05, "loss": 0.6245, "step": 7520 }, { "epoch": 1.3290971670638072, "grad_norm": 4.53438663482666, "learning_rate": 3.670962047661077e-05, "loss": 0.6091, "step": 7530 }, { "epoch": 1.3308622363427765, "grad_norm": 0.66978919506073, "learning_rate": 3.669196822594881e-05, "loss": 0.5938, "step": 7540 }, { "epoch": 1.3326273056217457, "grad_norm": 2.9042139053344727, "learning_rate": 3.667431597528685e-05, "loss": 0.6434, "step": 7550 }, { "epoch": 1.3343923749007147, "grad_norm": 0.6210140585899353, "learning_rate": 3.665666372462489e-05, "loss": 0.6021, "step": 7560 }, { "epoch": 1.3361574441796842, "grad_norm": 3.4923391342163086, "learning_rate": 3.663901147396293e-05, "loss": 0.6562, "step": 7570 }, { "epoch": 1.3379225134586532, "grad_norm": 0.7961561679840088, "learning_rate": 3.6621359223300974e-05, "loss": 0.611, "step": 7580 }, { "epoch": 1.3396875827376225, "grad_norm": 1.0654687881469727, "learning_rate": 3.6603706972639015e-05, "loss": 0.6824, "step": 7590 }, { "epoch": 1.3414526520165917, "grad_norm": 3.1214218139648438, "learning_rate": 3.6586054721977057e-05, "loss": 0.7111, "step": 7600 }, { "epoch": 1.3432177212955607, "grad_norm": 2.3251359462738037, "learning_rate": 3.656840247131509e-05, "loss": 0.5993, "step": 7610 }, { "epoch": 1.3449827905745302, "grad_norm": 0.804865837097168, "learning_rate": 3.655075022065313e-05, "loss": 0.6413, "step": 7620 }, { "epoch": 1.3467478598534992, "grad_norm": 3.0983431339263916, "learning_rate": 3.653309796999117e-05, "loss": 0.6574, "step": 7630 }, { "epoch": 1.3485129291324685, "grad_norm": 0.9018577933311462, "learning_rate": 3.6515445719329214e-05, "loss": 0.6052, "step": 7640 }, { "epoch": 1.3502779984114377, "grad_norm": 0.7807791233062744, "learning_rate": 3.6497793468667256e-05, "loss": 0.5871, "step": 7650 }, { "epoch": 1.3520430676904067, "grad_norm": 0.8855323195457458, "learning_rate": 3.64801412180053e-05, "loss": 0.6492, "step": 7660 }, { "epoch": 1.3538081369693762, "grad_norm": 2.0354232788085938, "learning_rate": 3.646248896734334e-05, "loss": 0.6495, "step": 7670 }, { "epoch": 1.3555732062483452, "grad_norm": 3.2508060932159424, "learning_rate": 3.644483671668138e-05, "loss": 0.7525, "step": 7680 }, { "epoch": 1.3573382755273145, "grad_norm": 2.6301016807556152, "learning_rate": 3.642718446601942e-05, "loss": 0.5911, "step": 7690 }, { "epoch": 1.3591033448062837, "grad_norm": 0.9165176153182983, "learning_rate": 3.640953221535746e-05, "loss": 0.6071, "step": 7700 }, { "epoch": 
1.3608684140852527, "grad_norm": 0.8968174457550049, "learning_rate": 3.63918799646955e-05, "loss": 0.6456, "step": 7710 }, { "epoch": 1.362633483364222, "grad_norm": 0.6524578332901001, "learning_rate": 3.6374227714033544e-05, "loss": 0.5536, "step": 7720 }, { "epoch": 1.3643985526431912, "grad_norm": 0.7832716107368469, "learning_rate": 3.6356575463371585e-05, "loss": 0.6569, "step": 7730 }, { "epoch": 1.3661636219221605, "grad_norm": 2.0386695861816406, "learning_rate": 3.6338923212709626e-05, "loss": 0.7066, "step": 7740 }, { "epoch": 1.3679286912011297, "grad_norm": 0.8649764657020569, "learning_rate": 3.632127096204767e-05, "loss": 0.7365, "step": 7750 }, { "epoch": 1.3696937604800987, "grad_norm": 1.356444001197815, "learning_rate": 3.63036187113857e-05, "loss": 0.6324, "step": 7760 }, { "epoch": 1.371458829759068, "grad_norm": 1.923880934715271, "learning_rate": 3.628596646072374e-05, "loss": 0.5376, "step": 7770 }, { "epoch": 1.3732238990380372, "grad_norm": 0.9914042353630066, "learning_rate": 3.6268314210061784e-05, "loss": 0.5838, "step": 7780 }, { "epoch": 1.3749889683170065, "grad_norm": 2.0924298763275146, "learning_rate": 3.6250661959399825e-05, "loss": 0.5839, "step": 7790 }, { "epoch": 1.3767540375959757, "grad_norm": 1.959044098854065, "learning_rate": 3.6233009708737866e-05, "loss": 0.6112, "step": 7800 }, { "epoch": 1.3785191068749447, "grad_norm": 2.142839193344116, "learning_rate": 3.621535745807591e-05, "loss": 0.6222, "step": 7810 }, { "epoch": 1.380284176153914, "grad_norm": 1.0231504440307617, "learning_rate": 3.619770520741395e-05, "loss": 0.6006, "step": 7820 }, { "epoch": 1.3820492454328832, "grad_norm": 4.729889869689941, "learning_rate": 3.618005295675199e-05, "loss": 0.5953, "step": 7830 }, { "epoch": 1.3838143147118525, "grad_norm": 1.9787018299102783, "learning_rate": 3.616240070609003e-05, "loss": 0.6817, "step": 7840 }, { "epoch": 1.3855793839908217, "grad_norm": 0.9792063236236572, "learning_rate": 3.6144748455428065e-05, "loss": 0.6927, "step": 7850 }, { "epoch": 1.3873444532697907, "grad_norm": 0.8229073882102966, "learning_rate": 3.6127096204766106e-05, "loss": 0.6544, "step": 7860 }, { "epoch": 1.38910952254876, "grad_norm": 0.9874571561813354, "learning_rate": 3.610944395410415e-05, "loss": 0.7415, "step": 7870 }, { "epoch": 1.3908745918277292, "grad_norm": 2.853695869445801, "learning_rate": 3.609179170344219e-05, "loss": 0.6804, "step": 7880 }, { "epoch": 1.3926396611066985, "grad_norm": 0.6411375403404236, "learning_rate": 3.607413945278023e-05, "loss": 0.6304, "step": 7890 }, { "epoch": 1.3944047303856677, "grad_norm": 0.9372557997703552, "learning_rate": 3.605648720211827e-05, "loss": 0.6058, "step": 7900 }, { "epoch": 1.3961697996646367, "grad_norm": 2.166592836380005, "learning_rate": 3.603883495145631e-05, "loss": 0.694, "step": 7910 }, { "epoch": 1.397934868943606, "grad_norm": 2.2341320514678955, "learning_rate": 3.602118270079435e-05, "loss": 0.6103, "step": 7920 }, { "epoch": 1.3996999382225752, "grad_norm": 3.3916139602661133, "learning_rate": 3.6003530450132394e-05, "loss": 0.6624, "step": 7930 }, { "epoch": 1.4014650075015445, "grad_norm": 0.7535905838012695, "learning_rate": 3.5985878199470435e-05, "loss": 0.5625, "step": 7940 }, { "epoch": 1.4032300767805137, "grad_norm": 2.7306909561157227, "learning_rate": 3.5968225948808476e-05, "loss": 0.6289, "step": 7950 }, { "epoch": 1.4049951460594827, "grad_norm": 3.3049874305725098, "learning_rate": 3.595057369814652e-05, "loss": 0.6652, "step": 7960 }, { "epoch": 1.406760215338452, 
"grad_norm": 1.124943494796753, "learning_rate": 3.593292144748456e-05, "loss": 0.6471, "step": 7970 }, { "epoch": 1.4085252846174212, "grad_norm": 1.1680623292922974, "learning_rate": 3.59152691968226e-05, "loss": 0.661, "step": 7980 }, { "epoch": 1.4102903538963905, "grad_norm": 0.8950367569923401, "learning_rate": 3.589761694616064e-05, "loss": 0.6447, "step": 7990 }, { "epoch": 1.4120554231753597, "grad_norm": 3.3598592281341553, "learning_rate": 3.587996469549868e-05, "loss": 0.6222, "step": 8000 }, { "epoch": 1.4120554231753597, "eval_loss": 0.6748311519622803, "eval_runtime": 591.9003, "eval_samples_per_second": 47.858, "eval_steps_per_second": 2.394, "eval_token_accuracy": 0.0005295711387180035, "step": 8000 }, { "epoch": 1.4138204924543287, "grad_norm": 3.1322696208953857, "learning_rate": 3.586231244483672e-05, "loss": 0.5731, "step": 8010 }, { "epoch": 1.415585561733298, "grad_norm": 0.8525201082229614, "learning_rate": 3.5846425419240954e-05, "loss": 0.5651, "step": 8020 }, { "epoch": 1.4173506310122672, "grad_norm": 1.0389692783355713, "learning_rate": 3.5828773168578995e-05, "loss": 0.674, "step": 8030 }, { "epoch": 1.4191157002912365, "grad_norm": 1.0880883932113647, "learning_rate": 3.5811120917917036e-05, "loss": 0.5929, "step": 8040 }, { "epoch": 1.4208807695702057, "grad_norm": 2.932532548904419, "learning_rate": 3.579346866725508e-05, "loss": 0.6861, "step": 8050 }, { "epoch": 1.4226458388491747, "grad_norm": 1.5305405855178833, "learning_rate": 3.577581641659312e-05, "loss": 0.5335, "step": 8060 }, { "epoch": 1.424410908128144, "grad_norm": 3.449867010116577, "learning_rate": 3.575816416593116e-05, "loss": 0.7106, "step": 8070 }, { "epoch": 1.4261759774071132, "grad_norm": 1.029525637626648, "learning_rate": 3.57405119152692e-05, "loss": 0.6468, "step": 8080 }, { "epoch": 1.4279410466860825, "grad_norm": 3.1664063930511475, "learning_rate": 3.572285966460724e-05, "loss": 0.6735, "step": 8090 }, { "epoch": 1.4297061159650517, "grad_norm": 2.8890814781188965, "learning_rate": 3.5705207413945277e-05, "loss": 0.6517, "step": 8100 }, { "epoch": 1.4314711852440207, "grad_norm": 1.0854856967926025, "learning_rate": 3.568755516328332e-05, "loss": 0.6061, "step": 8110 }, { "epoch": 1.43323625452299, "grad_norm": 0.7709710597991943, "learning_rate": 3.566990291262136e-05, "loss": 0.5545, "step": 8120 }, { "epoch": 1.4350013238019592, "grad_norm": 2.4035892486572266, "learning_rate": 3.56522506619594e-05, "loss": 0.5354, "step": 8130 }, { "epoch": 1.4367663930809285, "grad_norm": 2.355015754699707, "learning_rate": 3.563459841129744e-05, "loss": 0.5083, "step": 8140 }, { "epoch": 1.4385314623598977, "grad_norm": 2.647833824157715, "learning_rate": 3.561694616063548e-05, "loss": 0.5592, "step": 8150 }, { "epoch": 1.4402965316388667, "grad_norm": 0.4458978772163391, "learning_rate": 3.5599293909973523e-05, "loss": 0.5751, "step": 8160 }, { "epoch": 1.442061600917836, "grad_norm": 1.8528035879135132, "learning_rate": 3.5581641659311565e-05, "loss": 0.5605, "step": 8170 }, { "epoch": 1.4438266701968052, "grad_norm": 2.4769816398620605, "learning_rate": 3.5563989408649606e-05, "loss": 0.6521, "step": 8180 }, { "epoch": 1.4455917394757745, "grad_norm": 0.6114835739135742, "learning_rate": 3.554633715798765e-05, "loss": 0.6659, "step": 8190 }, { "epoch": 1.4473568087547437, "grad_norm": 0.9413579106330872, "learning_rate": 3.552868490732569e-05, "loss": 0.6604, "step": 8200 }, { "epoch": 1.4491218780337127, "grad_norm": 2.2076611518859863, "learning_rate": 3.551103265666373e-05, "loss": 
0.5319, "step": 8210 }, { "epoch": 1.450886947312682, "grad_norm": 2.3304405212402344, "learning_rate": 3.549338040600177e-05, "loss": 0.7263, "step": 8220 }, { "epoch": 1.4526520165916512, "grad_norm": 3.872177839279175, "learning_rate": 3.547572815533981e-05, "loss": 0.6536, "step": 8230 }, { "epoch": 1.4544170858706205, "grad_norm": 0.8268742561340332, "learning_rate": 3.5458075904677846e-05, "loss": 0.4964, "step": 8240 }, { "epoch": 1.4561821551495897, "grad_norm": 0.9529076218605042, "learning_rate": 3.544042365401589e-05, "loss": 0.5737, "step": 8250 }, { "epoch": 1.4579472244285587, "grad_norm": 0.7376457452774048, "learning_rate": 3.542277140335393e-05, "loss": 0.5622, "step": 8260 }, { "epoch": 1.459712293707528, "grad_norm": 0.8693493008613586, "learning_rate": 3.540511915269197e-05, "loss": 0.5919, "step": 8270 }, { "epoch": 1.4614773629864972, "grad_norm": 1.8074113130569458, "learning_rate": 3.538746690203001e-05, "loss": 0.5726, "step": 8280 }, { "epoch": 1.4632424322654665, "grad_norm": 1.9681881666183472, "learning_rate": 3.536981465136805e-05, "loss": 0.605, "step": 8290 }, { "epoch": 1.4650075015444357, "grad_norm": 2.3759384155273438, "learning_rate": 3.535216240070609e-05, "loss": 0.7572, "step": 8300 }, { "epoch": 1.4667725708234047, "grad_norm": 0.8262273669242859, "learning_rate": 3.5334510150044134e-05, "loss": 0.6218, "step": 8310 }, { "epoch": 1.468537640102374, "grad_norm": 3.165436267852783, "learning_rate": 3.5316857899382175e-05, "loss": 0.7645, "step": 8320 }, { "epoch": 1.4703027093813432, "grad_norm": 0.9559675455093384, "learning_rate": 3.5299205648720216e-05, "loss": 0.5479, "step": 8330 }, { "epoch": 1.4720677786603125, "grad_norm": 2.0745654106140137, "learning_rate": 3.528155339805825e-05, "loss": 0.6754, "step": 8340 }, { "epoch": 1.4738328479392817, "grad_norm": 0.9734734892845154, "learning_rate": 3.526390114739629e-05, "loss": 0.5967, "step": 8350 }, { "epoch": 1.4755979172182507, "grad_norm": 0.9553613066673279, "learning_rate": 3.524624889673433e-05, "loss": 0.6862, "step": 8360 }, { "epoch": 1.47736298649722, "grad_norm": 0.822285532951355, "learning_rate": 3.5228596646072374e-05, "loss": 0.6204, "step": 8370 }, { "epoch": 1.4791280557761892, "grad_norm": 0.9431090950965881, "learning_rate": 3.5210944395410415e-05, "loss": 0.6096, "step": 8380 }, { "epoch": 1.4808931250551585, "grad_norm": 4.019901275634766, "learning_rate": 3.5193292144748456e-05, "loss": 0.5686, "step": 8390 }, { "epoch": 1.4826581943341277, "grad_norm": 3.979301691055298, "learning_rate": 3.51756398940865e-05, "loss": 0.665, "step": 8400 }, { "epoch": 1.4844232636130967, "grad_norm": 0.7126337885856628, "learning_rate": 3.515798764342454e-05, "loss": 0.7266, "step": 8410 }, { "epoch": 1.486188332892066, "grad_norm": 1.1517317295074463, "learning_rate": 3.514033539276258e-05, "loss": 0.6027, "step": 8420 }, { "epoch": 1.4879534021710352, "grad_norm": 1.9708514213562012, "learning_rate": 3.512268314210062e-05, "loss": 0.707, "step": 8430 }, { "epoch": 1.4897184714500045, "grad_norm": 0.8406355381011963, "learning_rate": 3.510503089143866e-05, "loss": 0.5988, "step": 8440 }, { "epoch": 1.4914835407289737, "grad_norm": 3.593660354614258, "learning_rate": 3.50873786407767e-05, "loss": 0.6455, "step": 8450 }, { "epoch": 1.4932486100079427, "grad_norm": 3.105415105819702, "learning_rate": 3.5069726390114744e-05, "loss": 0.6632, "step": 8460 }, { "epoch": 1.495013679286912, "grad_norm": 2.8463072776794434, "learning_rate": 3.5052074139452785e-05, "loss": 0.6502, "step": 8470 }, { 
"epoch": 1.4967787485658812, "grad_norm": 1.0606666803359985, "learning_rate": 3.5034421888790827e-05, "loss": 0.5616, "step": 8480 }, { "epoch": 1.4985438178448505, "grad_norm": 5.110599517822266, "learning_rate": 3.501676963812887e-05, "loss": 0.6141, "step": 8490 }, { "epoch": 1.5003088871238197, "grad_norm": 1.8309544324874878, "learning_rate": 3.49991173874669e-05, "loss": 0.603, "step": 8500 }, { "epoch": 1.5020739564027887, "grad_norm": 3.568061113357544, "learning_rate": 3.498146513680494e-05, "loss": 0.6794, "step": 8510 }, { "epoch": 1.503839025681758, "grad_norm": 2.1609065532684326, "learning_rate": 3.4963812886142984e-05, "loss": 0.6598, "step": 8520 }, { "epoch": 1.5056040949607272, "grad_norm": 1.8619052171707153, "learning_rate": 3.4946160635481025e-05, "loss": 0.6009, "step": 8530 }, { "epoch": 1.5073691642396962, "grad_norm": 1.1925305128097534, "learning_rate": 3.492850838481907e-05, "loss": 0.7325, "step": 8540 }, { "epoch": 1.5091342335186657, "grad_norm": 0.7662707567214966, "learning_rate": 3.491085613415711e-05, "loss": 0.6295, "step": 8550 }, { "epoch": 1.5108993027976347, "grad_norm": 1.0044121742248535, "learning_rate": 3.489320388349515e-05, "loss": 0.6156, "step": 8560 }, { "epoch": 1.512664372076604, "grad_norm": 0.935528039932251, "learning_rate": 3.487555163283318e-05, "loss": 0.6138, "step": 8570 }, { "epoch": 1.5144294413555732, "grad_norm": 2.4792351722717285, "learning_rate": 3.4857899382171224e-05, "loss": 0.6002, "step": 8580 }, { "epoch": 1.5161945106345422, "grad_norm": 2.8710122108459473, "learning_rate": 3.4840247131509266e-05, "loss": 0.6942, "step": 8590 }, { "epoch": 1.5179595799135117, "grad_norm": 0.8079431653022766, "learning_rate": 3.482259488084731e-05, "loss": 0.6515, "step": 8600 }, { "epoch": 1.5197246491924807, "grad_norm": 1.9361704587936401, "learning_rate": 3.480494263018535e-05, "loss": 0.5994, "step": 8610 }, { "epoch": 1.52148971847145, "grad_norm": 1.108770728111267, "learning_rate": 3.478729037952339e-05, "loss": 0.6323, "step": 8620 }, { "epoch": 1.5232547877504192, "grad_norm": 0.8695691227912903, "learning_rate": 3.476963812886143e-05, "loss": 0.5698, "step": 8630 }, { "epoch": 1.5250198570293882, "grad_norm": 1.988014578819275, "learning_rate": 3.475198587819947e-05, "loss": 0.598, "step": 8640 }, { "epoch": 1.5267849263083577, "grad_norm": 2.2199532985687256, "learning_rate": 3.473433362753751e-05, "loss": 0.6659, "step": 8650 }, { "epoch": 1.5285499955873267, "grad_norm": 0.845320463180542, "learning_rate": 3.4716681376875554e-05, "loss": 0.593, "step": 8660 }, { "epoch": 1.530315064866296, "grad_norm": 2.645056962966919, "learning_rate": 3.4699029126213595e-05, "loss": 0.6378, "step": 8670 }, { "epoch": 1.5320801341452652, "grad_norm": 1.788921594619751, "learning_rate": 3.4681376875551636e-05, "loss": 0.6099, "step": 8680 }, { "epoch": 1.5338452034242342, "grad_norm": 0.826831579208374, "learning_rate": 3.466372462488968e-05, "loss": 0.7102, "step": 8690 }, { "epoch": 1.5356102727032037, "grad_norm": 2.5500268936157227, "learning_rate": 3.464607237422772e-05, "loss": 0.5822, "step": 8700 }, { "epoch": 1.5373753419821727, "grad_norm": 3.20348858833313, "learning_rate": 3.462842012356576e-05, "loss": 0.6996, "step": 8710 }, { "epoch": 1.539140411261142, "grad_norm": 0.9246511459350586, "learning_rate": 3.46107678729038e-05, "loss": 0.6416, "step": 8720 }, { "epoch": 1.5409054805401112, "grad_norm": 2.670889139175415, "learning_rate": 3.459311562224184e-05, "loss": 0.5498, "step": 8730 }, { "epoch": 1.5426705498190803, 
"grad_norm": 2.474168062210083, "learning_rate": 3.457546337157988e-05, "loss": 0.6324, "step": 8740 }, { "epoch": 1.5444356190980497, "grad_norm": 3.8397858142852783, "learning_rate": 3.4557811120917924e-05, "loss": 0.6154, "step": 8750 }, { "epoch": 1.5462006883770187, "grad_norm": 2.40657901763916, "learning_rate": 3.4540158870255965e-05, "loss": 0.6554, "step": 8760 }, { "epoch": 1.547965757655988, "grad_norm": 0.9760830998420715, "learning_rate": 3.4522506619594e-05, "loss": 0.5496, "step": 8770 }, { "epoch": 1.5497308269349572, "grad_norm": 0.8230709433555603, "learning_rate": 3.450485436893204e-05, "loss": 0.4997, "step": 8780 }, { "epoch": 1.5514958962139263, "grad_norm": 1.9407001733779907, "learning_rate": 3.448720211827008e-05, "loss": 0.6807, "step": 8790 }, { "epoch": 1.5532609654928957, "grad_norm": 3.2647931575775146, "learning_rate": 3.446954986760812e-05, "loss": 0.5814, "step": 8800 }, { "epoch": 1.5550260347718647, "grad_norm": 1.772679328918457, "learning_rate": 3.445189761694616e-05, "loss": 0.6516, "step": 8810 }, { "epoch": 1.556791104050834, "grad_norm": 1.2060052156448364, "learning_rate": 3.44342453662842e-05, "loss": 0.6775, "step": 8820 }, { "epoch": 1.5585561733298032, "grad_norm": 1.1109216213226318, "learning_rate": 3.441659311562224e-05, "loss": 0.6459, "step": 8830 }, { "epoch": 1.5603212426087723, "grad_norm": 2.8640549182891846, "learning_rate": 3.439894086496028e-05, "loss": 0.5936, "step": 8840 }, { "epoch": 1.5620863118877417, "grad_norm": 1.027740478515625, "learning_rate": 3.438128861429832e-05, "loss": 0.6354, "step": 8850 }, { "epoch": 1.5638513811667107, "grad_norm": 1.7239662408828735, "learning_rate": 3.436363636363636e-05, "loss": 0.6095, "step": 8860 }, { "epoch": 1.56561645044568, "grad_norm": 0.9941009283065796, "learning_rate": 3.4345984112974404e-05, "loss": 0.6207, "step": 8870 }, { "epoch": 1.5673815197246492, "grad_norm": 2.983370542526245, "learning_rate": 3.4328331862312445e-05, "loss": 0.5937, "step": 8880 }, { "epoch": 1.5691465890036183, "grad_norm": 1.0334148406982422, "learning_rate": 3.4310679611650486e-05, "loss": 0.733, "step": 8890 }, { "epoch": 1.5709116582825877, "grad_norm": 0.7821390628814697, "learning_rate": 3.429302736098853e-05, "loss": 0.6771, "step": 8900 }, { "epoch": 1.5726767275615567, "grad_norm": 3.0682766437530518, "learning_rate": 3.427537511032657e-05, "loss": 0.6754, "step": 8910 }, { "epoch": 1.574441796840526, "grad_norm": 1.0227667093276978, "learning_rate": 3.425772285966461e-05, "loss": 0.5639, "step": 8920 }, { "epoch": 1.5762068661194952, "grad_norm": 0.9798213839530945, "learning_rate": 3.424007060900265e-05, "loss": 0.6775, "step": 8930 }, { "epoch": 1.5779719353984643, "grad_norm": 0.8554840087890625, "learning_rate": 3.422241835834069e-05, "loss": 0.6324, "step": 8940 }, { "epoch": 1.5797370046774337, "grad_norm": 0.913659393787384, "learning_rate": 3.420476610767873e-05, "loss": 0.6402, "step": 8950 }, { "epoch": 1.5815020739564027, "grad_norm": 2.2020037174224854, "learning_rate": 3.4187113857016774e-05, "loss": 0.6192, "step": 8960 }, { "epoch": 1.583267143235372, "grad_norm": 1.2366520166397095, "learning_rate": 3.4169461606354816e-05, "loss": 0.6373, "step": 8970 }, { "epoch": 1.5850322125143412, "grad_norm": 0.6150069236755371, "learning_rate": 3.415180935569286e-05, "loss": 0.6285, "step": 8980 }, { "epoch": 1.5867972817933103, "grad_norm": 3.2270278930664062, "learning_rate": 3.41341571050309e-05, "loss": 0.5097, "step": 8990 }, { "epoch": 1.5885623510722797, "grad_norm": 
1.8538926839828491, "learning_rate": 3.411650485436894e-05, "loss": 0.629, "step": 9000 }, { "epoch": 1.5885623510722797, "eval_loss": 0.6662057638168335, "eval_runtime": 591.8171, "eval_samples_per_second": 47.864, "eval_steps_per_second": 2.394, "eval_token_accuracy": 0.000523887738225821, "step": 9000 }, { "epoch": 1.5903274203512487, "grad_norm": 2.432816982269287, "learning_rate": 3.409885260370697e-05, "loss": 0.5923, "step": 9010 }, { "epoch": 1.592092489630218, "grad_norm": 2.547600507736206, "learning_rate": 3.4081200353045015e-05, "loss": 0.6395, "step": 9020 }, { "epoch": 1.5938575589091872, "grad_norm": 1.0481719970703125, "learning_rate": 3.4063548102383056e-05, "loss": 0.6421, "step": 9030 }, { "epoch": 1.5956226281881563, "grad_norm": 2.210798740386963, "learning_rate": 3.40458958517211e-05, "loss": 0.6653, "step": 9040 }, { "epoch": 1.5973876974671257, "grad_norm": 1.0804709196090698, "learning_rate": 3.402824360105913e-05, "loss": 0.6169, "step": 9050 }, { "epoch": 1.5991527667460947, "grad_norm": 0.8143006563186646, "learning_rate": 3.401059135039717e-05, "loss": 0.637, "step": 9060 }, { "epoch": 1.600917836025064, "grad_norm": 1.9847668409347534, "learning_rate": 3.3992939099735213e-05, "loss": 0.5887, "step": 9070 }, { "epoch": 1.6026829053040332, "grad_norm": 1.3877208232879639, "learning_rate": 3.3975286849073255e-05, "loss": 0.626, "step": 9080 }, { "epoch": 1.6044479745830023, "grad_norm": 0.9856419563293457, "learning_rate": 3.3957634598411296e-05, "loss": 0.5996, "step": 9090 }, { "epoch": 1.6062130438619717, "grad_norm": 2.6040806770324707, "learning_rate": 3.393998234774934e-05, "loss": 0.6237, "step": 9100 }, { "epoch": 1.6079781131409407, "grad_norm": 0.9981382489204407, "learning_rate": 3.392233009708738e-05, "loss": 0.5975, "step": 9110 }, { "epoch": 1.60974318241991, "grad_norm": 0.9611056447029114, "learning_rate": 3.390467784642542e-05, "loss": 0.5722, "step": 9120 }, { "epoch": 1.6115082516988792, "grad_norm": 1.7118070125579834, "learning_rate": 3.388702559576346e-05, "loss": 0.5698, "step": 9130 }, { "epoch": 1.6132733209778483, "grad_norm": 0.9777098894119263, "learning_rate": 3.38693733451015e-05, "loss": 0.6284, "step": 9140 }, { "epoch": 1.6150383902568177, "grad_norm": 3.165189743041992, "learning_rate": 3.385172109443954e-05, "loss": 0.6531, "step": 9150 }, { "epoch": 1.6168034595357867, "grad_norm": 2.1566078662872314, "learning_rate": 3.3834068843777584e-05, "loss": 0.6539, "step": 9160 }, { "epoch": 1.618568528814756, "grad_norm": 0.7770845890045166, "learning_rate": 3.3816416593115625e-05, "loss": 0.6576, "step": 9170 }, { "epoch": 1.6203335980937252, "grad_norm": 0.9254570603370667, "learning_rate": 3.3798764342453666e-05, "loss": 0.6781, "step": 9180 }, { "epoch": 1.6220986673726943, "grad_norm": 3.901735782623291, "learning_rate": 3.378111209179171e-05, "loss": 0.6293, "step": 9190 }, { "epoch": 1.6238637366516637, "grad_norm": 3.071284770965576, "learning_rate": 3.376345984112975e-05, "loss": 0.6078, "step": 9200 }, { "epoch": 1.6256288059306327, "grad_norm": 0.9238028526306152, "learning_rate": 3.374580759046779e-05, "loss": 0.6236, "step": 9210 }, { "epoch": 1.627393875209602, "grad_norm": 0.923682451248169, "learning_rate": 3.372815533980583e-05, "loss": 0.6485, "step": 9220 }, { "epoch": 1.6291589444885712, "grad_norm": 0.8640393018722534, "learning_rate": 3.371050308914387e-05, "loss": 0.6288, "step": 9230 }, { "epoch": 1.6309240137675403, "grad_norm": 1.036797046661377, "learning_rate": 3.369285083848191e-05, "loss": 0.6533, "step": 
9240 }, { "epoch": 1.6326890830465097, "grad_norm": 1.9658279418945312, "learning_rate": 3.367519858781995e-05, "loss": 0.6105, "step": 9250 }, { "epoch": 1.6344541523254787, "grad_norm": 1.4766429662704468, "learning_rate": 3.365754633715799e-05, "loss": 0.6158, "step": 9260 }, { "epoch": 1.636219221604448, "grad_norm": 0.8438056707382202, "learning_rate": 3.363989408649603e-05, "loss": 0.6446, "step": 9270 }, { "epoch": 1.6379842908834172, "grad_norm": 0.8339599370956421, "learning_rate": 3.362224183583407e-05, "loss": 0.6512, "step": 9280 }, { "epoch": 1.6397493601623863, "grad_norm": 1.8318166732788086, "learning_rate": 3.360458958517211e-05, "loss": 0.5804, "step": 9290 }, { "epoch": 1.6415144294413557, "grad_norm": 1.179258108139038, "learning_rate": 3.3586937334510146e-05, "loss": 0.6192, "step": 9300 }, { "epoch": 1.6432794987203247, "grad_norm": 2.5670175552368164, "learning_rate": 3.356928508384819e-05, "loss": 0.6171, "step": 9310 }, { "epoch": 1.645044567999294, "grad_norm": 0.7155138254165649, "learning_rate": 3.355163283318623e-05, "loss": 0.6379, "step": 9320 }, { "epoch": 1.6468096372782632, "grad_norm": 3.0688700675964355, "learning_rate": 3.353398058252427e-05, "loss": 0.659, "step": 9330 }, { "epoch": 1.6485747065572323, "grad_norm": 0.8237749934196472, "learning_rate": 3.351632833186231e-05, "loss": 0.5595, "step": 9340 }, { "epoch": 1.6503397758362017, "grad_norm": 0.6362634301185608, "learning_rate": 3.349867608120035e-05, "loss": 0.5071, "step": 9350 }, { "epoch": 1.6521048451151708, "grad_norm": 2.7345876693725586, "learning_rate": 3.348102383053839e-05, "loss": 0.5936, "step": 9360 }, { "epoch": 1.65386991439414, "grad_norm": 0.9333705902099609, "learning_rate": 3.3463371579876434e-05, "loss": 0.6701, "step": 9370 }, { "epoch": 1.6556349836731092, "grad_norm": 2.97668194770813, "learning_rate": 3.3445719329214475e-05, "loss": 0.5406, "step": 9380 }, { "epoch": 1.6574000529520783, "grad_norm": 0.6854531168937683, "learning_rate": 3.3428067078552517e-05, "loss": 0.6807, "step": 9390 }, { "epoch": 1.6591651222310477, "grad_norm": 2.8641855716705322, "learning_rate": 3.341041482789056e-05, "loss": 0.6658, "step": 9400 }, { "epoch": 1.6609301915100168, "grad_norm": 2.481201410293579, "learning_rate": 3.33927625772286e-05, "loss": 0.6206, "step": 9410 }, { "epoch": 1.662695260788986, "grad_norm": 2.451979160308838, "learning_rate": 3.337511032656664e-05, "loss": 0.6998, "step": 9420 }, { "epoch": 1.6644603300679552, "grad_norm": 0.9651976823806763, "learning_rate": 3.335745807590468e-05, "loss": 0.7207, "step": 9430 }, { "epoch": 1.6662253993469243, "grad_norm": 1.954372525215149, "learning_rate": 3.333980582524272e-05, "loss": 0.6195, "step": 9440 }, { "epoch": 1.6679904686258937, "grad_norm": 1.883323073387146, "learning_rate": 3.3322153574580763e-05, "loss": 0.5419, "step": 9450 }, { "epoch": 1.6697555379048628, "grad_norm": 2.2481348514556885, "learning_rate": 3.3304501323918805e-05, "loss": 0.582, "step": 9460 }, { "epoch": 1.671520607183832, "grad_norm": 3.2739031314849854, "learning_rate": 3.3286849073256846e-05, "loss": 0.6003, "step": 9470 }, { "epoch": 1.6732856764628012, "grad_norm": 2.612467050552368, "learning_rate": 3.326919682259489e-05, "loss": 0.6108, "step": 9480 }, { "epoch": 1.6750507457417703, "grad_norm": 2.2844624519348145, "learning_rate": 3.325154457193292e-05, "loss": 0.6185, "step": 9490 }, { "epoch": 1.6768158150207397, "grad_norm": 0.9374117851257324, "learning_rate": 3.323389232127096e-05, "loss": 0.6983, "step": 9500 }, { "epoch": 
1.6785808842997088, "grad_norm": 2.6882810592651367, "learning_rate": 3.3216240070609004e-05, "loss": 0.5724, "step": 9510 }, { "epoch": 1.680345953578678, "grad_norm": 1.042275309562683, "learning_rate": 3.3198587819947045e-05, "loss": 0.6018, "step": 9520 }, { "epoch": 1.6821110228576472, "grad_norm": 2.711000442504883, "learning_rate": 3.3180935569285086e-05, "loss": 0.6355, "step": 9530 }, { "epoch": 1.6838760921366163, "grad_norm": 1.005644679069519, "learning_rate": 3.316328331862313e-05, "loss": 0.5572, "step": 9540 }, { "epoch": 1.6856411614155855, "grad_norm": 0.8825269937515259, "learning_rate": 3.314563106796117e-05, "loss": 0.6376, "step": 9550 }, { "epoch": 1.6874062306945548, "grad_norm": 4.650322914123535, "learning_rate": 3.31279788172992e-05, "loss": 0.5378, "step": 9560 }, { "epoch": 1.689171299973524, "grad_norm": 0.8221828937530518, "learning_rate": 3.3110326566637244e-05, "loss": 0.7414, "step": 9570 }, { "epoch": 1.6909363692524932, "grad_norm": 1.6978652477264404, "learning_rate": 3.3092674315975285e-05, "loss": 0.5823, "step": 9580 }, { "epoch": 1.6927014385314623, "grad_norm": 0.978359043598175, "learning_rate": 3.3075022065313326e-05, "loss": 0.6359, "step": 9590 }, { "epoch": 1.6944665078104315, "grad_norm": 2.7753124237060547, "learning_rate": 3.305736981465137e-05, "loss": 0.6715, "step": 9600 }, { "epoch": 1.6962315770894008, "grad_norm": 3.3159520626068115, "learning_rate": 3.303971756398941e-05, "loss": 0.5691, "step": 9610 }, { "epoch": 1.69799664636837, "grad_norm": 2.1915853023529053, "learning_rate": 3.302206531332745e-05, "loss": 0.6413, "step": 9620 }, { "epoch": 1.6997617156473392, "grad_norm": 0.6985222697257996, "learning_rate": 3.300441306266549e-05, "loss": 0.5339, "step": 9630 }, { "epoch": 1.7015267849263083, "grad_norm": 1.000430703163147, "learning_rate": 3.298676081200353e-05, "loss": 0.627, "step": 9640 }, { "epoch": 1.7032918542052775, "grad_norm": 2.2707552909851074, "learning_rate": 3.296910856134157e-05, "loss": 0.5694, "step": 9650 }, { "epoch": 1.7050569234842468, "grad_norm": 0.9370206594467163, "learning_rate": 3.2951456310679614e-05, "loss": 0.6173, "step": 9660 }, { "epoch": 1.706821992763216, "grad_norm": 0.840900719165802, "learning_rate": 3.2933804060017655e-05, "loss": 0.6081, "step": 9670 }, { "epoch": 1.7085870620421852, "grad_norm": 0.9345539212226868, "learning_rate": 3.2916151809355696e-05, "loss": 0.5849, "step": 9680 }, { "epoch": 1.7103521313211543, "grad_norm": 1.4001808166503906, "learning_rate": 3.289849955869374e-05, "loss": 0.6198, "step": 9690 }, { "epoch": 1.7121172006001235, "grad_norm": 2.205587387084961, "learning_rate": 3.288084730803178e-05, "loss": 0.6937, "step": 9700 }, { "epoch": 1.7138822698790928, "grad_norm": 2.924020290374756, "learning_rate": 3.286319505736982e-05, "loss": 0.5451, "step": 9710 }, { "epoch": 1.715647339158062, "grad_norm": 2.550149917602539, "learning_rate": 3.2845542806707854e-05, "loss": 0.5779, "step": 9720 }, { "epoch": 1.7174124084370312, "grad_norm": 0.8188669085502625, "learning_rate": 3.2827890556045895e-05, "loss": 0.6391, "step": 9730 }, { "epoch": 1.7191774777160003, "grad_norm": 2.1378254890441895, "learning_rate": 3.2810238305383936e-05, "loss": 0.5491, "step": 9740 }, { "epoch": 1.7209425469949695, "grad_norm": 1.0728440284729004, "learning_rate": 3.279258605472198e-05, "loss": 0.6991, "step": 9750 }, { "epoch": 1.7227076162739388, "grad_norm": 2.225074529647827, "learning_rate": 3.277493380406002e-05, "loss": 0.6602, "step": 9760 }, { "epoch": 1.724472685552908, 
"grad_norm": 2.452317714691162, "learning_rate": 3.275728155339806e-05, "loss": 0.6031, "step": 9770 }, { "epoch": 1.7262377548318772, "grad_norm": 1.0104994773864746, "learning_rate": 3.27396293027361e-05, "loss": 0.65, "step": 9780 }, { "epoch": 1.7280028241108463, "grad_norm": 2.9075286388397217, "learning_rate": 3.272197705207414e-05, "loss": 0.66, "step": 9790 }, { "epoch": 1.7297678933898155, "grad_norm": 3.247784376144409, "learning_rate": 3.270432480141218e-05, "loss": 0.6073, "step": 9800 }, { "epoch": 1.7315329626687848, "grad_norm": 1.802399754524231, "learning_rate": 3.2686672550750224e-05, "loss": 0.7026, "step": 9810 }, { "epoch": 1.733298031947754, "grad_norm": 1.8000247478485107, "learning_rate": 3.2669020300088266e-05, "loss": 0.6974, "step": 9820 }, { "epoch": 1.7350631012267232, "grad_norm": 0.6736454367637634, "learning_rate": 3.26513680494263e-05, "loss": 0.6313, "step": 9830 }, { "epoch": 1.7368281705056923, "grad_norm": 0.8809193968772888, "learning_rate": 3.263371579876434e-05, "loss": 0.5773, "step": 9840 }, { "epoch": 1.7385932397846615, "grad_norm": 1.3804481029510498, "learning_rate": 3.261606354810238e-05, "loss": 0.5644, "step": 9850 }, { "epoch": 1.7403583090636308, "grad_norm": 2.855910301208496, "learning_rate": 3.259841129744042e-05, "loss": 0.7065, "step": 9860 }, { "epoch": 1.7421233783425998, "grad_norm": 0.8118297457695007, "learning_rate": 3.2580759046778465e-05, "loss": 0.5712, "step": 9870 }, { "epoch": 1.7438884476215692, "grad_norm": 1.8637617826461792, "learning_rate": 3.2563106796116506e-05, "loss": 0.615, "step": 9880 }, { "epoch": 1.7456535169005383, "grad_norm": 0.7027237415313721, "learning_rate": 3.254545454545455e-05, "loss": 0.585, "step": 9890 }, { "epoch": 1.7474185861795075, "grad_norm": 1.9294718503952026, "learning_rate": 3.252780229479259e-05, "loss": 0.721, "step": 9900 }, { "epoch": 1.7491836554584768, "grad_norm": 0.7499776482582092, "learning_rate": 3.251015004413063e-05, "loss": 0.5693, "step": 9910 }, { "epoch": 1.7509487247374458, "grad_norm": 0.7779677510261536, "learning_rate": 3.249249779346867e-05, "loss": 0.6228, "step": 9920 }, { "epoch": 1.7527137940164152, "grad_norm": 0.8223459720611572, "learning_rate": 3.247484554280671e-05, "loss": 0.5272, "step": 9930 }, { "epoch": 1.7544788632953843, "grad_norm": 2.417341709136963, "learning_rate": 3.245719329214475e-05, "loss": 0.6413, "step": 9940 }, { "epoch": 1.7562439325743535, "grad_norm": 1.4994111061096191, "learning_rate": 3.2439541041482794e-05, "loss": 0.6917, "step": 9950 }, { "epoch": 1.7580090018533228, "grad_norm": 1.04421865940094, "learning_rate": 3.242188879082083e-05, "loss": 0.6192, "step": 9960 }, { "epoch": 1.7597740711322918, "grad_norm": 0.8585307598114014, "learning_rate": 3.240423654015887e-05, "loss": 0.5649, "step": 9970 }, { "epoch": 1.7615391404112613, "grad_norm": 2.860748529434204, "learning_rate": 3.238658428949691e-05, "loss": 0.6115, "step": 9980 }, { "epoch": 1.7633042096902303, "grad_norm": 1.2423299551010132, "learning_rate": 3.236893203883495e-05, "loss": 0.5609, "step": 9990 }, { "epoch": 1.7650692789691995, "grad_norm": 1.2183946371078491, "learning_rate": 3.235127978817299e-05, "loss": 0.6757, "step": 10000 }, { "epoch": 1.7650692789691995, "eval_loss": 0.6546699404716492, "eval_runtime": 592.0492, "eval_samples_per_second": 47.846, "eval_steps_per_second": 2.393, "eval_token_accuracy": 0.0004970945644769607, "step": 10000 }, { "epoch": 1.7668343482481688, "grad_norm": 2.420112133026123, "learning_rate": 3.2333627537511034e-05, "loss": 
0.6171, "step": 10010 }, { "epoch": 1.7685994175271378, "grad_norm": 1.9335789680480957, "learning_rate": 3.2315975286849075e-05, "loss": 0.5806, "step": 10020 }, { "epoch": 1.7703644868061073, "grad_norm": 0.945406973361969, "learning_rate": 3.2298323036187116e-05, "loss": 0.6006, "step": 10030 }, { "epoch": 1.7721295560850763, "grad_norm": 4.3664445877075195, "learning_rate": 3.228067078552516e-05, "loss": 0.638, "step": 10040 }, { "epoch": 1.7738946253640455, "grad_norm": 3.256833076477051, "learning_rate": 3.22630185348632e-05, "loss": 0.6134, "step": 10050 }, { "epoch": 1.7756596946430148, "grad_norm": 1.7355093955993652, "learning_rate": 3.224713150926743e-05, "loss": 0.5735, "step": 10060 }, { "epoch": 1.7774247639219838, "grad_norm": 0.8020845651626587, "learning_rate": 3.222947925860547e-05, "loss": 0.6668, "step": 10070 }, { "epoch": 1.7791898332009533, "grad_norm": 2.6556622982025146, "learning_rate": 3.221182700794351e-05, "loss": 0.5895, "step": 10080 }, { "epoch": 1.7809549024799223, "grad_norm": 0.7236629724502563, "learning_rate": 3.219417475728155e-05, "loss": 0.7201, "step": 10090 }, { "epoch": 1.7827199717588915, "grad_norm": 2.882829427719116, "learning_rate": 3.2176522506619594e-05, "loss": 0.5578, "step": 10100 }, { "epoch": 1.7844850410378608, "grad_norm": 0.6686634421348572, "learning_rate": 3.2158870255957635e-05, "loss": 0.6551, "step": 10110 }, { "epoch": 1.7862501103168298, "grad_norm": 1.0659387111663818, "learning_rate": 3.2141218005295676e-05, "loss": 0.6214, "step": 10120 }, { "epoch": 1.7880151795957993, "grad_norm": 0.9529328942298889, "learning_rate": 3.212356575463372e-05, "loss": 0.5799, "step": 10130 }, { "epoch": 1.7897802488747683, "grad_norm": 0.9451195001602173, "learning_rate": 3.210591350397176e-05, "loss": 0.6425, "step": 10140 }, { "epoch": 1.7915453181537375, "grad_norm": 0.9170915484428406, "learning_rate": 3.20882612533098e-05, "loss": 0.6067, "step": 10150 }, { "epoch": 1.7933103874327068, "grad_norm": 0.7924943566322327, "learning_rate": 3.207060900264784e-05, "loss": 0.6592, "step": 10160 }, { "epoch": 1.7950754567116758, "grad_norm": 0.8339998722076416, "learning_rate": 3.205295675198588e-05, "loss": 0.5662, "step": 10170 }, { "epoch": 1.7968405259906453, "grad_norm": 2.43949818611145, "learning_rate": 3.203530450132392e-05, "loss": 0.6113, "step": 10180 }, { "epoch": 1.7986055952696143, "grad_norm": 2.2304933071136475, "learning_rate": 3.2017652250661964e-05, "loss": 0.6392, "step": 10190 }, { "epoch": 1.8003706645485835, "grad_norm": 0.8846359252929688, "learning_rate": 3.2000000000000005e-05, "loss": 0.6161, "step": 10200 }, { "epoch": 1.8021357338275528, "grad_norm": 3.043703317642212, "learning_rate": 3.198234774933804e-05, "loss": 0.6005, "step": 10210 }, { "epoch": 1.8039008031065218, "grad_norm": 3.343468427658081, "learning_rate": 3.196469549867608e-05, "loss": 0.6702, "step": 10220 }, { "epoch": 1.8056658723854913, "grad_norm": 0.95237797498703, "learning_rate": 3.194704324801412e-05, "loss": 0.6036, "step": 10230 }, { "epoch": 1.8074309416644603, "grad_norm": 2.6117990016937256, "learning_rate": 3.192939099735216e-05, "loss": 0.6074, "step": 10240 }, { "epoch": 1.8091960109434295, "grad_norm": 1.5844181776046753, "learning_rate": 3.1911738746690204e-05, "loss": 0.613, "step": 10250 }, { "epoch": 1.8109610802223988, "grad_norm": 0.8131887316703796, "learning_rate": 3.1894086496028245e-05, "loss": 0.6517, "step": 10260 }, { "epoch": 1.8127261495013678, "grad_norm": 2.3605868816375732, "learning_rate": 3.1876434245366287e-05, 
"loss": 0.7417, "step": 10270 }, { "epoch": 1.8144912187803373, "grad_norm": 0.7757390737533569, "learning_rate": 3.185878199470433e-05, "loss": 0.6353, "step": 10280 }, { "epoch": 1.8162562880593063, "grad_norm": 0.8382796049118042, "learning_rate": 3.184112974404237e-05, "loss": 0.5868, "step": 10290 }, { "epoch": 1.8180213573382755, "grad_norm": 2.488783836364746, "learning_rate": 3.182347749338041e-05, "loss": 0.6489, "step": 10300 }, { "epoch": 1.8197864266172448, "grad_norm": 0.9477055668830872, "learning_rate": 3.1805825242718444e-05, "loss": 0.5931, "step": 10310 }, { "epoch": 1.8215514958962138, "grad_norm": 0.9396250247955322, "learning_rate": 3.1788172992056486e-05, "loss": 0.5828, "step": 10320 }, { "epoch": 1.8233165651751833, "grad_norm": 1.528105616569519, "learning_rate": 3.177052074139453e-05, "loss": 0.6485, "step": 10330 }, { "epoch": 1.8250816344541523, "grad_norm": 1.0290082693099976, "learning_rate": 3.175286849073257e-05, "loss": 0.5571, "step": 10340 }, { "epoch": 1.8268467037331215, "grad_norm": 2.224963665008545, "learning_rate": 3.173521624007061e-05, "loss": 0.6077, "step": 10350 }, { "epoch": 1.8286117730120908, "grad_norm": 0.8264523148536682, "learning_rate": 3.171756398940865e-05, "loss": 0.6299, "step": 10360 }, { "epoch": 1.8303768422910598, "grad_norm": 0.7780860066413879, "learning_rate": 3.169991173874669e-05, "loss": 0.6193, "step": 10370 }, { "epoch": 1.8321419115700293, "grad_norm": 2.4344122409820557, "learning_rate": 3.168225948808473e-05, "loss": 0.6094, "step": 10380 }, { "epoch": 1.8339069808489983, "grad_norm": 2.5928852558135986, "learning_rate": 3.1664607237422774e-05, "loss": 0.6777, "step": 10390 }, { "epoch": 1.8356720501279675, "grad_norm": 4.44277811050415, "learning_rate": 3.1646954986760815e-05, "loss": 0.623, "step": 10400 }, { "epoch": 1.8374371194069368, "grad_norm": 2.1471035480499268, "learning_rate": 3.1629302736098856e-05, "loss": 0.6382, "step": 10410 }, { "epoch": 1.8392021886859058, "grad_norm": 2.2607192993164062, "learning_rate": 3.16116504854369e-05, "loss": 0.6338, "step": 10420 }, { "epoch": 1.8409672579648753, "grad_norm": 0.7422958016395569, "learning_rate": 3.159399823477494e-05, "loss": 0.6132, "step": 10430 }, { "epoch": 1.8427323272438443, "grad_norm": 0.8959227204322815, "learning_rate": 3.157634598411298e-05, "loss": 0.6039, "step": 10440 }, { "epoch": 1.8444973965228135, "grad_norm": 1.0099540948867798, "learning_rate": 3.1558693733451014e-05, "loss": 0.5652, "step": 10450 }, { "epoch": 1.8462624658017828, "grad_norm": 0.943722128868103, "learning_rate": 3.1541041482789055e-05, "loss": 0.6487, "step": 10460 }, { "epoch": 1.8480275350807518, "grad_norm": 0.8334342241287231, "learning_rate": 3.1523389232127096e-05, "loss": 0.5692, "step": 10470 }, { "epoch": 1.8497926043597213, "grad_norm": 2.3223907947540283, "learning_rate": 3.150573698146514e-05, "loss": 0.614, "step": 10480 }, { "epoch": 1.8515576736386903, "grad_norm": 1.7104765176773071, "learning_rate": 3.148808473080318e-05, "loss": 0.591, "step": 10490 }, { "epoch": 1.8533227429176595, "grad_norm": 2.331986665725708, "learning_rate": 3.147043248014122e-05, "loss": 0.6332, "step": 10500 }, { "epoch": 1.8550878121966288, "grad_norm": 2.031975746154785, "learning_rate": 3.145278022947926e-05, "loss": 0.5918, "step": 10510 }, { "epoch": 1.8568528814755978, "grad_norm": 0.9749746918678284, "learning_rate": 3.14351279788173e-05, "loss": 0.6063, "step": 10520 }, { "epoch": 1.8586179507545673, "grad_norm": 1.1249350309371948, "learning_rate": 
3.141747572815534e-05, "loss": 0.6134, "step": 10530 }, { "epoch": 1.8603830200335363, "grad_norm": 2.1191256046295166, "learning_rate": 3.1399823477493384e-05, "loss": 0.6165, "step": 10540 }, { "epoch": 1.8621480893125055, "grad_norm": 3.8866329193115234, "learning_rate": 3.1382171226831425e-05, "loss": 0.5966, "step": 10550 }, { "epoch": 1.8639131585914748, "grad_norm": 0.8333554267883301, "learning_rate": 3.1364518976169466e-05, "loss": 0.5847, "step": 10560 }, { "epoch": 1.8656782278704438, "grad_norm": 0.8728168606758118, "learning_rate": 3.13468667255075e-05, "loss": 0.5101, "step": 10570 }, { "epoch": 1.8674432971494133, "grad_norm": 1.8896209001541138, "learning_rate": 3.132921447484554e-05, "loss": 0.5888, "step": 10580 }, { "epoch": 1.8692083664283823, "grad_norm": 2.8303306102752686, "learning_rate": 3.131156222418358e-05, "loss": 0.5488, "step": 10590 }, { "epoch": 1.8709734357073515, "grad_norm": 0.8313313722610474, "learning_rate": 3.1293909973521624e-05, "loss": 0.5155, "step": 10600 }, { "epoch": 1.8727385049863208, "grad_norm": 0.8679807186126709, "learning_rate": 3.1276257722859665e-05, "loss": 0.6382, "step": 10610 }, { "epoch": 1.8745035742652898, "grad_norm": 1.9634865522384644, "learning_rate": 3.1258605472197706e-05, "loss": 0.5434, "step": 10620 }, { "epoch": 1.8762686435442593, "grad_norm": 1.5889586210250854, "learning_rate": 3.124095322153575e-05, "loss": 0.6297, "step": 10630 }, { "epoch": 1.8780337128232283, "grad_norm": 3.3781940937042236, "learning_rate": 3.122330097087379e-05, "loss": 0.6748, "step": 10640 }, { "epoch": 1.8797987821021975, "grad_norm": 1.1254082918167114, "learning_rate": 3.120564872021183e-05, "loss": 0.5963, "step": 10650 }, { "epoch": 1.8815638513811668, "grad_norm": 0.8166277408599854, "learning_rate": 3.118799646954987e-05, "loss": 0.5659, "step": 10660 }, { "epoch": 1.8833289206601358, "grad_norm": 0.9837700128555298, "learning_rate": 3.117034421888791e-05, "loss": 0.6685, "step": 10670 }, { "epoch": 1.8850939899391053, "grad_norm": 0.8627223968505859, "learning_rate": 3.115269196822595e-05, "loss": 0.6403, "step": 10680 }, { "epoch": 1.8868590592180743, "grad_norm": 0.9987977743148804, "learning_rate": 3.113503971756399e-05, "loss": 0.6842, "step": 10690 }, { "epoch": 1.8886241284970435, "grad_norm": 3.286561965942383, "learning_rate": 3.111738746690203e-05, "loss": 0.7374, "step": 10700 }, { "epoch": 1.8903891977760128, "grad_norm": 0.7906441688537598, "learning_rate": 3.109973521624007e-05, "loss": 0.5732, "step": 10710 }, { "epoch": 1.8921542670549818, "grad_norm": 2.687250852584839, "learning_rate": 3.108208296557811e-05, "loss": 0.5192, "step": 10720 }, { "epoch": 1.8939193363339513, "grad_norm": 1.3238317966461182, "learning_rate": 3.106443071491615e-05, "loss": 0.6694, "step": 10730 }, { "epoch": 1.8956844056129203, "grad_norm": 0.8862921595573425, "learning_rate": 3.104677846425419e-05, "loss": 0.6259, "step": 10740 }, { "epoch": 1.8974494748918895, "grad_norm": 1.2485682964324951, "learning_rate": 3.1029126213592234e-05, "loss": 0.6235, "step": 10750 }, { "epoch": 1.8992145441708588, "grad_norm": 1.9461857080459595, "learning_rate": 3.1011473962930276e-05, "loss": 0.6214, "step": 10760 }, { "epoch": 1.9009796134498278, "grad_norm": 3.330052137374878, "learning_rate": 3.099382171226832e-05, "loss": 0.6129, "step": 10770 }, { "epoch": 1.9027446827287973, "grad_norm": 0.7479764819145203, "learning_rate": 3.097616946160636e-05, "loss": 0.6273, "step": 10780 }, { "epoch": 1.9045097520077663, "grad_norm": 0.8017953634262085, 
"learning_rate": 3.09585172109444e-05, "loss": 0.5989, "step": 10790 }, { "epoch": 1.9062748212867355, "grad_norm": 2.009951114654541, "learning_rate": 3.094086496028244e-05, "loss": 0.5804, "step": 10800 }, { "epoch": 1.9080398905657048, "grad_norm": 0.820184588432312, "learning_rate": 3.092321270962048e-05, "loss": 0.6231, "step": 10810 }, { "epoch": 1.9098049598446738, "grad_norm": 2.120673656463623, "learning_rate": 3.090556045895852e-05, "loss": 0.5958, "step": 10820 }, { "epoch": 1.9115700291236433, "grad_norm": 1.511352300643921, "learning_rate": 3.0887908208296564e-05, "loss": 0.5967, "step": 10830 }, { "epoch": 1.9133350984026123, "grad_norm": 2.4746806621551514, "learning_rate": 3.08702559576346e-05, "loss": 0.584, "step": 10840 }, { "epoch": 1.9151001676815815, "grad_norm": 0.655605673789978, "learning_rate": 3.085260370697264e-05, "loss": 0.606, "step": 10850 }, { "epoch": 1.9168652369605508, "grad_norm": 0.8358765244483948, "learning_rate": 3.083495145631068e-05, "loss": 0.6417, "step": 10860 }, { "epoch": 1.9186303062395198, "grad_norm": 0.9366624355316162, "learning_rate": 3.081729920564872e-05, "loss": 0.6077, "step": 10870 }, { "epoch": 1.9203953755184893, "grad_norm": 0.7896908521652222, "learning_rate": 3.079964695498676e-05, "loss": 0.5887, "step": 10880 }, { "epoch": 1.9221604447974583, "grad_norm": 1.6127266883850098, "learning_rate": 3.0781994704324804e-05, "loss": 0.6263, "step": 10890 }, { "epoch": 1.9239255140764275, "grad_norm": 2.684758424758911, "learning_rate": 3.0764342453662845e-05, "loss": 0.6099, "step": 10900 }, { "epoch": 1.9256905833553968, "grad_norm": 3.7798666954040527, "learning_rate": 3.0746690203000886e-05, "loss": 0.67, "step": 10910 }, { "epoch": 1.9274556526343658, "grad_norm": 1.0354872941970825, "learning_rate": 3.072903795233893e-05, "loss": 0.5863, "step": 10920 }, { "epoch": 1.929220721913335, "grad_norm": 0.7059375047683716, "learning_rate": 3.071138570167696e-05, "loss": 0.53, "step": 10930 }, { "epoch": 1.9309857911923043, "grad_norm": 0.7127173542976379, "learning_rate": 3.0693733451015e-05, "loss": 0.5633, "step": 10940 }, { "epoch": 1.9327508604712735, "grad_norm": 0.8398742079734802, "learning_rate": 3.0676081200353044e-05, "loss": 0.581, "step": 10950 }, { "epoch": 1.9345159297502428, "grad_norm": 2.5236151218414307, "learning_rate": 3.0658428949691085e-05, "loss": 0.6473, "step": 10960 }, { "epoch": 1.9362809990292118, "grad_norm": 0.8333614468574524, "learning_rate": 3.0640776699029126e-05, "loss": 0.6541, "step": 10970 }, { "epoch": 1.938046068308181, "grad_norm": 0.7542189359664917, "learning_rate": 3.062312444836717e-05, "loss": 0.5332, "step": 10980 }, { "epoch": 1.9398111375871503, "grad_norm": 1.0335618257522583, "learning_rate": 3.060547219770521e-05, "loss": 0.6618, "step": 10990 }, { "epoch": 1.9415762068661195, "grad_norm": 2.821669816970825, "learning_rate": 3.058781994704325e-05, "loss": 0.5796, "step": 11000 }, { "epoch": 1.9415762068661195, "eval_loss": 0.6470552682876587, "eval_runtime": 591.6835, "eval_samples_per_second": 47.875, "eval_steps_per_second": 2.395, "eval_token_accuracy": 0.0005082583868723192, "step": 11000 }, { "epoch": 1.9433412761450888, "grad_norm": 0.9215902090072632, "learning_rate": 3.057016769638129e-05, "loss": 0.6368, "step": 11010 }, { "epoch": 1.9451063454240578, "grad_norm": 0.8010556101799011, "learning_rate": 3.055251544571933e-05, "loss": 0.5164, "step": 11020 }, { "epoch": 1.946871414703027, "grad_norm": 2.074794054031372, "learning_rate": 3.053486319505737e-05, "loss": 0.5884, 
"step": 11030 }, { "epoch": 1.9486364839819963, "grad_norm": 1.1630234718322754, "learning_rate": 3.0517210944395414e-05, "loss": 0.534, "step": 11040 }, { "epoch": 1.9504015532609655, "grad_norm": 1.5974197387695312, "learning_rate": 3.0499558693733455e-05, "loss": 0.6164, "step": 11050 }, { "epoch": 1.9521666225399348, "grad_norm": 3.0076427459716797, "learning_rate": 3.0481906443071496e-05, "loss": 0.6304, "step": 11060 }, { "epoch": 1.9539316918189038, "grad_norm": 2.112062454223633, "learning_rate": 3.0464254192409538e-05, "loss": 0.5802, "step": 11070 }, { "epoch": 1.955696761097873, "grad_norm": 3.1546525955200195, "learning_rate": 3.0446601941747575e-05, "loss": 0.6581, "step": 11080 }, { "epoch": 1.9574618303768423, "grad_norm": 1.4764760732650757, "learning_rate": 3.0428949691085616e-05, "loss": 0.614, "step": 11090 }, { "epoch": 1.9592268996558115, "grad_norm": 2.507228136062622, "learning_rate": 3.0411297440423654e-05, "loss": 0.5967, "step": 11100 }, { "epoch": 1.9609919689347808, "grad_norm": 0.6856989860534668, "learning_rate": 3.0393645189761692e-05, "loss": 0.6435, "step": 11110 }, { "epoch": 1.9627570382137498, "grad_norm": 1.0282143354415894, "learning_rate": 3.0375992939099733e-05, "loss": 0.6121, "step": 11120 }, { "epoch": 1.964522107492719, "grad_norm": 2.7744762897491455, "learning_rate": 3.0358340688437774e-05, "loss": 0.6126, "step": 11130 }, { "epoch": 1.9662871767716883, "grad_norm": 1.6380807161331177, "learning_rate": 3.0340688437775815e-05, "loss": 0.5215, "step": 11140 }, { "epoch": 1.9680522460506575, "grad_norm": 1.3833621740341187, "learning_rate": 3.0323036187113857e-05, "loss": 0.5779, "step": 11150 }, { "epoch": 1.9698173153296268, "grad_norm": 1.4948760271072388, "learning_rate": 3.0305383936451898e-05, "loss": 0.6363, "step": 11160 }, { "epoch": 1.9715823846085958, "grad_norm": 1.3232519626617432, "learning_rate": 3.028773168578994e-05, "loss": 0.5343, "step": 11170 }, { "epoch": 1.973347453887565, "grad_norm": 1.552229404449463, "learning_rate": 3.027007943512798e-05, "loss": 0.5917, "step": 11180 }, { "epoch": 1.9751125231665343, "grad_norm": 0.6373162865638733, "learning_rate": 3.025242718446602e-05, "loss": 0.634, "step": 11190 }, { "epoch": 1.9768775924455035, "grad_norm": 1.2558025121688843, "learning_rate": 3.0234774933804062e-05, "loss": 0.5806, "step": 11200 }, { "epoch": 1.9786426617244728, "grad_norm": 1.1361700296401978, "learning_rate": 3.02171226831421e-05, "loss": 0.6575, "step": 11210 }, { "epoch": 1.9804077310034418, "grad_norm": 0.8260495662689209, "learning_rate": 3.019947043248014e-05, "loss": 0.6143, "step": 11220 }, { "epoch": 1.982172800282411, "grad_norm": 0.9218713045120239, "learning_rate": 3.0181818181818182e-05, "loss": 0.616, "step": 11230 }, { "epoch": 1.9839378695613803, "grad_norm": 0.8892648816108704, "learning_rate": 3.0164165931156224e-05, "loss": 0.5788, "step": 11240 }, { "epoch": 1.9857029388403493, "grad_norm": 1.1470361948013306, "learning_rate": 3.0146513680494265e-05, "loss": 0.5623, "step": 11250 }, { "epoch": 1.9874680081193188, "grad_norm": 2.0089900493621826, "learning_rate": 3.0128861429832306e-05, "loss": 0.665, "step": 11260 }, { "epoch": 1.9892330773982878, "grad_norm": 2.222125768661499, "learning_rate": 3.0111209179170347e-05, "loss": 0.6518, "step": 11270 }, { "epoch": 1.990998146677257, "grad_norm": 0.7392178177833557, "learning_rate": 3.0093556928508388e-05, "loss": 0.6416, "step": 11280 }, { "epoch": 1.9927632159562263, "grad_norm": 0.8574917912483215, "learning_rate": 3.007590467784643e-05, 
"loss": 0.5742, "step": 11290 }, { "epoch": 1.9945282852351953, "grad_norm": 3.2676074504852295, "learning_rate": 3.005825242718447e-05, "loss": 0.5671, "step": 11300 }, { "epoch": 1.9962933545141648, "grad_norm": 2.6135575771331787, "learning_rate": 3.0040600176522508e-05, "loss": 0.6729, "step": 11310 }, { "epoch": 1.9980584237931338, "grad_norm": 1.1514455080032349, "learning_rate": 3.002294792586055e-05, "loss": 0.5945, "step": 11320 }, { "epoch": 1.999823493072103, "grad_norm": 0.957177996635437, "learning_rate": 3.000529567519859e-05, "loss": 0.6641, "step": 11330 }, { "epoch": 2.0015885623510723, "grad_norm": 1.1928191184997559, "learning_rate": 2.998764342453663e-05, "loss": 0.5776, "step": 11340 }, { "epoch": 2.0033536316300413, "grad_norm": 0.9931208491325378, "learning_rate": 2.9969991173874673e-05, "loss": 0.5331, "step": 11350 }, { "epoch": 2.005118700909011, "grad_norm": 0.9708819389343262, "learning_rate": 2.9952338923212707e-05, "loss": 0.466, "step": 11360 }, { "epoch": 2.00688377018798, "grad_norm": 0.8825761675834656, "learning_rate": 2.9934686672550748e-05, "loss": 0.48, "step": 11370 }, { "epoch": 2.0086488394669493, "grad_norm": 0.8337153792381287, "learning_rate": 2.991703442188879e-05, "loss": 0.4293, "step": 11380 }, { "epoch": 2.0104139087459183, "grad_norm": 4.886162281036377, "learning_rate": 2.989938217122683e-05, "loss": 0.5525, "step": 11390 }, { "epoch": 2.0121789780248873, "grad_norm": 0.7176535129547119, "learning_rate": 2.9881729920564872e-05, "loss": 0.5155, "step": 11400 }, { "epoch": 2.013944047303857, "grad_norm": 1.8255113363265991, "learning_rate": 2.9864077669902913e-05, "loss": 0.5057, "step": 11410 }, { "epoch": 2.015709116582826, "grad_norm": 1.6702226400375366, "learning_rate": 2.9846425419240954e-05, "loss": 0.4671, "step": 11420 }, { "epoch": 2.0174741858617953, "grad_norm": 0.8029167652130127, "learning_rate": 2.9828773168578995e-05, "loss": 0.4813, "step": 11430 }, { "epoch": 2.0192392551407643, "grad_norm": 3.2598228454589844, "learning_rate": 2.9811120917917036e-05, "loss": 0.4942, "step": 11440 }, { "epoch": 2.0210043244197333, "grad_norm": 2.657938003540039, "learning_rate": 2.9793468667255074e-05, "loss": 0.4563, "step": 11450 }, { "epoch": 2.022769393698703, "grad_norm": 2.4557859897613525, "learning_rate": 2.9775816416593115e-05, "loss": 0.4351, "step": 11460 }, { "epoch": 2.024534462977672, "grad_norm": 2.4432296752929688, "learning_rate": 2.9758164165931156e-05, "loss": 0.4639, "step": 11470 }, { "epoch": 2.0262995322566413, "grad_norm": 1.0158432722091675, "learning_rate": 2.9740511915269197e-05, "loss": 0.561, "step": 11480 }, { "epoch": 2.0280646015356103, "grad_norm": 2.512036085128784, "learning_rate": 2.972285966460724e-05, "loss": 0.5906, "step": 11490 }, { "epoch": 2.0298296708145793, "grad_norm": 1.3112326860427856, "learning_rate": 2.970520741394528e-05, "loss": 0.4741, "step": 11500 }, { "epoch": 2.031594740093549, "grad_norm": 1.4458718299865723, "learning_rate": 2.968755516328332e-05, "loss": 0.522, "step": 11510 }, { "epoch": 2.033359809372518, "grad_norm": 0.7055060267448425, "learning_rate": 2.9669902912621362e-05, "loss": 0.4711, "step": 11520 }, { "epoch": 2.0351248786514873, "grad_norm": 2.6412127017974854, "learning_rate": 2.9652250661959403e-05, "loss": 0.4922, "step": 11530 }, { "epoch": 2.0368899479304563, "grad_norm": 2.383212089538574, "learning_rate": 2.9634598411297444e-05, "loss": 0.5204, "step": 11540 }, { "epoch": 2.0386550172094253, "grad_norm": 0.963070809841156, "learning_rate": 
2.9616946160635482e-05, "loss": 0.4838, "step": 11550 }, { "epoch": 2.040420086488395, "grad_norm": 1.1218928098678589, "learning_rate": 2.9599293909973523e-05, "loss": 0.504, "step": 11560 }, { "epoch": 2.042185155767364, "grad_norm": 1.0899280309677124, "learning_rate": 2.9581641659311564e-05, "loss": 0.5522, "step": 11570 }, { "epoch": 2.0439502250463333, "grad_norm": 1.1442947387695312, "learning_rate": 2.9563989408649606e-05, "loss": 0.5598, "step": 11580 }, { "epoch": 2.0457152943253023, "grad_norm": 1.0687470436096191, "learning_rate": 2.9546337157987647e-05, "loss": 0.4778, "step": 11590 }, { "epoch": 2.0474803636042713, "grad_norm": 1.081070899963379, "learning_rate": 2.9528684907325688e-05, "loss": 0.5483, "step": 11600 }, { "epoch": 2.049245432883241, "grad_norm": 0.8817394375801086, "learning_rate": 2.951103265666373e-05, "loss": 0.5219, "step": 11610 }, { "epoch": 2.05101050216221, "grad_norm": 0.8029956817626953, "learning_rate": 2.949338040600177e-05, "loss": 0.5053, "step": 11620 }, { "epoch": 2.0527755714411793, "grad_norm": 0.783224880695343, "learning_rate": 2.9475728155339804e-05, "loss": 0.5109, "step": 11630 }, { "epoch": 2.0545406407201483, "grad_norm": 0.7042371034622192, "learning_rate": 2.9458075904677846e-05, "loss": 0.5278, "step": 11640 }, { "epoch": 2.0563057099991173, "grad_norm": 4.542788028717041, "learning_rate": 2.9440423654015887e-05, "loss": 0.5746, "step": 11650 }, { "epoch": 2.058070779278087, "grad_norm": 0.9003473520278931, "learning_rate": 2.9422771403353928e-05, "loss": 0.4479, "step": 11660 }, { "epoch": 2.059835848557056, "grad_norm": 3.547114372253418, "learning_rate": 2.940511915269197e-05, "loss": 0.6115, "step": 11670 }, { "epoch": 2.0616009178360253, "grad_norm": 1.4339213371276855, "learning_rate": 2.938746690203001e-05, "loss": 0.4837, "step": 11680 }, { "epoch": 2.0633659871149943, "grad_norm": 2.337256908416748, "learning_rate": 2.9369814651368048e-05, "loss": 0.4576, "step": 11690 }, { "epoch": 2.0651310563939633, "grad_norm": 2.767146587371826, "learning_rate": 2.935216240070609e-05, "loss": 0.526, "step": 11700 }, { "epoch": 2.066896125672933, "grad_norm": 1.023708701133728, "learning_rate": 2.933451015004413e-05, "loss": 0.4683, "step": 11710 }, { "epoch": 2.068661194951902, "grad_norm": 0.7664569020271301, "learning_rate": 2.931685789938217e-05, "loss": 0.4743, "step": 11720 }, { "epoch": 2.0704262642308713, "grad_norm": 2.275400400161743, "learning_rate": 2.9299205648720213e-05, "loss": 0.4842, "step": 11730 }, { "epoch": 2.0721913335098403, "grad_norm": 0.7984259724617004, "learning_rate": 2.9281553398058254e-05, "loss": 0.4877, "step": 11740 }, { "epoch": 2.0739564027888093, "grad_norm": 0.8254725337028503, "learning_rate": 2.9263901147396295e-05, "loss": 0.5589, "step": 11750 }, { "epoch": 2.075721472067779, "grad_norm": 1.0608854293823242, "learning_rate": 2.9246248896734336e-05, "loss": 0.5527, "step": 11760 }, { "epoch": 2.077486541346748, "grad_norm": 1.1715601682662964, "learning_rate": 2.9228596646072377e-05, "loss": 0.4665, "step": 11770 }, { "epoch": 2.0792516106257173, "grad_norm": 1.4764703512191772, "learning_rate": 2.9210944395410418e-05, "loss": 0.4224, "step": 11780 }, { "epoch": 2.0810166799046863, "grad_norm": 1.0137091875076294, "learning_rate": 2.9193292144748456e-05, "loss": 0.5108, "step": 11790 }, { "epoch": 2.0827817491836553, "grad_norm": 2.340433359146118, "learning_rate": 2.9175639894086497e-05, "loss": 0.4594, "step": 11800 }, { "epoch": 2.084546818462625, "grad_norm": 2.333954334259033, 
"learning_rate": 2.915798764342454e-05, "loss": 0.6047, "step": 11810 }, { "epoch": 2.086311887741594, "grad_norm": 0.9210914969444275, "learning_rate": 2.914033539276258e-05, "loss": 0.4875, "step": 11820 }, { "epoch": 2.0880769570205633, "grad_norm": 1.9261822700500488, "learning_rate": 2.912268314210062e-05, "loss": 0.5139, "step": 11830 }, { "epoch": 2.0898420262995323, "grad_norm": 1.805763840675354, "learning_rate": 2.9105030891438662e-05, "loss": 0.5626, "step": 11840 }, { "epoch": 2.0916070955785013, "grad_norm": 2.940823554992676, "learning_rate": 2.9087378640776703e-05, "loss": 0.5426, "step": 11850 }, { "epoch": 2.093372164857471, "grad_norm": 1.9291481971740723, "learning_rate": 2.9069726390114744e-05, "loss": 0.5193, "step": 11860 }, { "epoch": 2.09513723413644, "grad_norm": 1.1445153951644897, "learning_rate": 2.9052074139452785e-05, "loss": 0.5539, "step": 11870 }, { "epoch": 2.096902303415409, "grad_norm": 2.1614785194396973, "learning_rate": 2.9034421888790826e-05, "loss": 0.5235, "step": 11880 }, { "epoch": 2.0986673726943783, "grad_norm": 1.3679187297821045, "learning_rate": 2.901676963812886e-05, "loss": 0.5553, "step": 11890 }, { "epoch": 2.1004324419733473, "grad_norm": 1.0410507917404175, "learning_rate": 2.8999117387466902e-05, "loss": 0.4773, "step": 11900 }, { "epoch": 2.102197511252317, "grad_norm": 0.9981822967529297, "learning_rate": 2.8981465136804943e-05, "loss": 0.5068, "step": 11910 }, { "epoch": 2.103962580531286, "grad_norm": 1.834923505783081, "learning_rate": 2.8963812886142984e-05, "loss": 0.5367, "step": 11920 }, { "epoch": 2.105727649810255, "grad_norm": 1.878109335899353, "learning_rate": 2.8946160635481022e-05, "loss": 0.4888, "step": 11930 }, { "epoch": 2.1074927190892243, "grad_norm": 0.9439142346382141, "learning_rate": 2.8928508384819063e-05, "loss": 0.5235, "step": 11940 }, { "epoch": 2.1092577883681933, "grad_norm": 1.915279507637024, "learning_rate": 2.8910856134157104e-05, "loss": 0.5273, "step": 11950 }, { "epoch": 2.111022857647163, "grad_norm": 2.888288974761963, "learning_rate": 2.8893203883495145e-05, "loss": 0.5289, "step": 11960 }, { "epoch": 2.112787926926132, "grad_norm": 2.066969871520996, "learning_rate": 2.8875551632833187e-05, "loss": 0.4869, "step": 11970 }, { "epoch": 2.114552996205101, "grad_norm": 2.3141443729400635, "learning_rate": 2.8857899382171228e-05, "loss": 0.5212, "step": 11980 }, { "epoch": 2.1163180654840703, "grad_norm": 2.632960081100464, "learning_rate": 2.884024713150927e-05, "loss": 0.5814, "step": 11990 }, { "epoch": 2.1180831347630393, "grad_norm": 0.9722346663475037, "learning_rate": 2.882259488084731e-05, "loss": 0.4128, "step": 12000 }, { "epoch": 2.1180831347630393, "eval_loss": 0.654511570930481, "eval_runtime": 591.8488, "eval_samples_per_second": 47.862, "eval_steps_per_second": 2.394, "eval_token_accuracy": 0.0004901932924507392, "step": 12000 }, { "epoch": 2.119848204042009, "grad_norm": 0.8732126951217651, "learning_rate": 2.880494263018535e-05, "loss": 0.4396, "step": 12010 }, { "epoch": 2.121613273320978, "grad_norm": 3.0739586353302, "learning_rate": 2.8787290379523392e-05, "loss": 0.5427, "step": 12020 }, { "epoch": 2.123378342599947, "grad_norm": 2.652486801147461, "learning_rate": 2.876963812886143e-05, "loss": 0.4744, "step": 12030 }, { "epoch": 2.1251434118789163, "grad_norm": 1.4533450603485107, "learning_rate": 2.875198587819947e-05, "loss": 0.5421, "step": 12040 }, { "epoch": 2.1269084811578853, "grad_norm": 0.821504533290863, "learning_rate": 2.8734333627537512e-05, "loss": 0.5684, 
"step": 12050 }, { "epoch": 2.128673550436855, "grad_norm": 0.6715283989906311, "learning_rate": 2.8716681376875553e-05, "loss": 0.4973, "step": 12060 }, { "epoch": 2.130438619715824, "grad_norm": 3.1312203407287598, "learning_rate": 2.8699029126213595e-05, "loss": 0.4808, "step": 12070 }, { "epoch": 2.132203688994793, "grad_norm": 0.6378737688064575, "learning_rate": 2.8681376875551636e-05, "loss": 0.4798, "step": 12080 }, { "epoch": 2.1339687582737623, "grad_norm": 1.9007421731948853, "learning_rate": 2.8665489849955873e-05, "loss": 0.5055, "step": 12090 }, { "epoch": 2.1357338275527313, "grad_norm": 0.5962501168251038, "learning_rate": 2.8647837599293915e-05, "loss": 0.4686, "step": 12100 }, { "epoch": 2.137498896831701, "grad_norm": 1.4856303930282593, "learning_rate": 2.863018534863195e-05, "loss": 0.4889, "step": 12110 }, { "epoch": 2.13926396611067, "grad_norm": 0.873531699180603, "learning_rate": 2.861253309796999e-05, "loss": 0.4911, "step": 12120 }, { "epoch": 2.141029035389639, "grad_norm": 2.513932228088379, "learning_rate": 2.859488084730803e-05, "loss": 0.5479, "step": 12130 }, { "epoch": 2.1427941046686083, "grad_norm": 2.6930124759674072, "learning_rate": 2.8577228596646072e-05, "loss": 0.5069, "step": 12140 }, { "epoch": 2.1445591739475773, "grad_norm": 1.6831334829330444, "learning_rate": 2.8559576345984114e-05, "loss": 0.4757, "step": 12150 }, { "epoch": 2.146324243226547, "grad_norm": 2.069110155105591, "learning_rate": 2.8541924095322155e-05, "loss": 0.5105, "step": 12160 }, { "epoch": 2.148089312505516, "grad_norm": 1.4201005697250366, "learning_rate": 2.8524271844660196e-05, "loss": 0.5031, "step": 12170 }, { "epoch": 2.149854381784485, "grad_norm": 3.460890293121338, "learning_rate": 2.8506619593998234e-05, "loss": 0.5576, "step": 12180 }, { "epoch": 2.1516194510634543, "grad_norm": 1.0495176315307617, "learning_rate": 2.8488967343336275e-05, "loss": 0.5344, "step": 12190 }, { "epoch": 2.1533845203424233, "grad_norm": 0.7707245349884033, "learning_rate": 2.8471315092674316e-05, "loss": 0.4722, "step": 12200 }, { "epoch": 2.155149589621393, "grad_norm": 1.064219355583191, "learning_rate": 2.8453662842012357e-05, "loss": 0.5478, "step": 12210 }, { "epoch": 2.156914658900362, "grad_norm": 2.337414264678955, "learning_rate": 2.8436010591350398e-05, "loss": 0.5004, "step": 12220 }, { "epoch": 2.158679728179331, "grad_norm": 0.9675964713096619, "learning_rate": 2.841835834068844e-05, "loss": 0.4763, "step": 12230 }, { "epoch": 2.1604447974583003, "grad_norm": 1.2413973808288574, "learning_rate": 2.840070609002648e-05, "loss": 0.495, "step": 12240 }, { "epoch": 2.1622098667372693, "grad_norm": 1.6234698295593262, "learning_rate": 2.838305383936452e-05, "loss": 0.4424, "step": 12250 }, { "epoch": 2.163974936016239, "grad_norm": 2.0990002155303955, "learning_rate": 2.8365401588702563e-05, "loss": 0.5045, "step": 12260 }, { "epoch": 2.165740005295208, "grad_norm": 0.6744937896728516, "learning_rate": 2.8347749338040604e-05, "loss": 0.4955, "step": 12270 }, { "epoch": 2.167505074574177, "grad_norm": 1.7633548974990845, "learning_rate": 2.833009708737864e-05, "loss": 0.476, "step": 12280 }, { "epoch": 2.1692701438531463, "grad_norm": 1.0914983749389648, "learning_rate": 2.8312444836716683e-05, "loss": 0.4602, "step": 12290 }, { "epoch": 2.1710352131321153, "grad_norm": 1.7474454641342163, "learning_rate": 2.8294792586054724e-05, "loss": 0.5249, "step": 12300 }, { "epoch": 2.172800282411085, "grad_norm": 1.0030485391616821, "learning_rate": 2.8277140335392765e-05, "loss": 
0.5095, "step": 12310 }, { "epoch": 2.174565351690054, "grad_norm": 1.8654377460479736, "learning_rate": 2.8259488084730806e-05, "loss": 0.4896, "step": 12320 }, { "epoch": 2.176330420969023, "grad_norm": 1.0025534629821777, "learning_rate": 2.8241835834068847e-05, "loss": 0.493, "step": 12330 }, { "epoch": 2.1780954902479923, "grad_norm": 2.6646952629089355, "learning_rate": 2.822418358340689e-05, "loss": 0.5012, "step": 12340 }, { "epoch": 2.1798605595269613, "grad_norm": 2.589738130569458, "learning_rate": 2.820653133274493e-05, "loss": 0.5174, "step": 12350 }, { "epoch": 2.181625628805931, "grad_norm": 1.0582941770553589, "learning_rate": 2.818887908208297e-05, "loss": 0.479, "step": 12360 }, { "epoch": 2.1833906980849, "grad_norm": 1.1127334833145142, "learning_rate": 2.8171226831421005e-05, "loss": 0.5474, "step": 12370 }, { "epoch": 2.185155767363869, "grad_norm": 1.8785793781280518, "learning_rate": 2.8153574580759046e-05, "loss": 0.4606, "step": 12380 }, { "epoch": 2.1869208366428383, "grad_norm": 0.9331899285316467, "learning_rate": 2.8135922330097087e-05, "loss": 0.5505, "step": 12390 }, { "epoch": 2.1886859059218073, "grad_norm": 2.787630081176758, "learning_rate": 2.811827007943513e-05, "loss": 0.5077, "step": 12400 }, { "epoch": 2.190450975200777, "grad_norm": 1.945959448814392, "learning_rate": 2.8100617828773166e-05, "loss": 0.4737, "step": 12410 }, { "epoch": 2.192216044479746, "grad_norm": 2.3678271770477295, "learning_rate": 2.8082965578111208e-05, "loss": 0.549, "step": 12420 }, { "epoch": 2.193981113758715, "grad_norm": 1.0043656826019287, "learning_rate": 2.806531332744925e-05, "loss": 0.4822, "step": 12430 }, { "epoch": 2.1957461830376843, "grad_norm": 0.9238103628158569, "learning_rate": 2.804766107678729e-05, "loss": 0.55, "step": 12440 }, { "epoch": 2.1975112523166533, "grad_norm": 1.7914990186691284, "learning_rate": 2.803000882612533e-05, "loss": 0.4653, "step": 12450 }, { "epoch": 2.199276321595623, "grad_norm": 2.3372256755828857, "learning_rate": 2.8012356575463372e-05, "loss": 0.5211, "step": 12460 }, { "epoch": 2.201041390874592, "grad_norm": 2.7075140476226807, "learning_rate": 2.7994704324801413e-05, "loss": 0.5067, "step": 12470 }, { "epoch": 2.202806460153561, "grad_norm": 1.2606568336486816, "learning_rate": 2.7977052074139454e-05, "loss": 0.5076, "step": 12480 }, { "epoch": 2.2045715294325303, "grad_norm": 2.36053466796875, "learning_rate": 2.7959399823477496e-05, "loss": 0.4988, "step": 12490 }, { "epoch": 2.2063365987114993, "grad_norm": 1.0246137380599976, "learning_rate": 2.7941747572815537e-05, "loss": 0.4638, "step": 12500 }, { "epoch": 2.208101667990469, "grad_norm": 0.9959467649459839, "learning_rate": 2.7924095322153574e-05, "loss": 0.4979, "step": 12510 }, { "epoch": 2.209866737269438, "grad_norm": 0.8300411701202393, "learning_rate": 2.7906443071491616e-05, "loss": 0.5047, "step": 12520 }, { "epoch": 2.211631806548407, "grad_norm": 1.7269713878631592, "learning_rate": 2.7888790820829657e-05, "loss": 0.524, "step": 12530 }, { "epoch": 2.2133968758273763, "grad_norm": 1.104152798652649, "learning_rate": 2.7871138570167698e-05, "loss": 0.5376, "step": 12540 }, { "epoch": 2.2151619451063453, "grad_norm": 3.106315851211548, "learning_rate": 2.785348631950574e-05, "loss": 0.5384, "step": 12550 }, { "epoch": 2.216927014385315, "grad_norm": 2.4172351360321045, "learning_rate": 2.783583406884378e-05, "loss": 0.5317, "step": 12560 }, { "epoch": 2.218692083664284, "grad_norm": 1.0057008266448975, "learning_rate": 2.781818181818182e-05, "loss": 0.5518, 
"step": 12570 }, { "epoch": 2.220457152943253, "grad_norm": 1.2210044860839844, "learning_rate": 2.7800529567519862e-05, "loss": 0.4703, "step": 12580 }, { "epoch": 2.2222222222222223, "grad_norm": 0.9134182333946228, "learning_rate": 2.7782877316857904e-05, "loss": 0.4795, "step": 12590 }, { "epoch": 2.2239872915011913, "grad_norm": 2.1350576877593994, "learning_rate": 2.7765225066195945e-05, "loss": 0.4975, "step": 12600 }, { "epoch": 2.225752360780161, "grad_norm": 0.8448552489280701, "learning_rate": 2.7747572815533986e-05, "loss": 0.5284, "step": 12610 }, { "epoch": 2.22751743005913, "grad_norm": 2.1405866146087646, "learning_rate": 2.7729920564872024e-05, "loss": 0.5373, "step": 12620 }, { "epoch": 2.229282499338099, "grad_norm": 2.148709774017334, "learning_rate": 2.7712268314210065e-05, "loss": 0.4989, "step": 12630 }, { "epoch": 2.2310475686170683, "grad_norm": 1.166394591331482, "learning_rate": 2.7694616063548103e-05, "loss": 0.5028, "step": 12640 }, { "epoch": 2.2328126378960373, "grad_norm": 0.9031407237052917, "learning_rate": 2.767696381288614e-05, "loss": 0.5754, "step": 12650 }, { "epoch": 2.234577707175007, "grad_norm": 0.9934903979301453, "learning_rate": 2.765931156222418e-05, "loss": 0.5072, "step": 12660 }, { "epoch": 2.236342776453976, "grad_norm": 2.5429446697235107, "learning_rate": 2.7641659311562223e-05, "loss": 0.515, "step": 12670 }, { "epoch": 2.238107845732945, "grad_norm": 2.769002676010132, "learning_rate": 2.7624007060900264e-05, "loss": 0.4504, "step": 12680 }, { "epoch": 2.2398729150119143, "grad_norm": 0.9032069444656372, "learning_rate": 2.7606354810238305e-05, "loss": 0.5617, "step": 12690 }, { "epoch": 2.2416379842908833, "grad_norm": 0.7437412142753601, "learning_rate": 2.7588702559576346e-05, "loss": 0.4948, "step": 12700 }, { "epoch": 2.243403053569853, "grad_norm": 3.5754473209381104, "learning_rate": 2.7571050308914387e-05, "loss": 0.4823, "step": 12710 }, { "epoch": 2.245168122848822, "grad_norm": 1.0576409101486206, "learning_rate": 2.755339805825243e-05, "loss": 0.4672, "step": 12720 }, { "epoch": 2.246933192127791, "grad_norm": 1.3094158172607422, "learning_rate": 2.753574580759047e-05, "loss": 0.5177, "step": 12730 }, { "epoch": 2.2486982614067603, "grad_norm": 0.7089628577232361, "learning_rate": 2.751809355692851e-05, "loss": 0.5172, "step": 12740 }, { "epoch": 2.2504633306857293, "grad_norm": 1.1376948356628418, "learning_rate": 2.750044130626655e-05, "loss": 0.5558, "step": 12750 }, { "epoch": 2.2522283999646984, "grad_norm": 1.1415077447891235, "learning_rate": 2.748278905560459e-05, "loss": 0.5039, "step": 12760 }, { "epoch": 2.253993469243668, "grad_norm": 1.022479772567749, "learning_rate": 2.746513680494263e-05, "loss": 0.5461, "step": 12770 }, { "epoch": 2.255758538522637, "grad_norm": 1.0342376232147217, "learning_rate": 2.7447484554280672e-05, "loss": 0.4556, "step": 12780 }, { "epoch": 2.2575236078016063, "grad_norm": 1.3401023149490356, "learning_rate": 2.7429832303618713e-05, "loss": 0.5366, "step": 12790 }, { "epoch": 2.2592886770805753, "grad_norm": 0.9429671764373779, "learning_rate": 2.7412180052956754e-05, "loss": 0.487, "step": 12800 }, { "epoch": 2.2610537463595444, "grad_norm": 1.254345178604126, "learning_rate": 2.7394527802294795e-05, "loss": 0.588, "step": 12810 }, { "epoch": 2.262818815638514, "grad_norm": 1.041446328163147, "learning_rate": 2.7376875551632836e-05, "loss": 0.5193, "step": 12820 }, { "epoch": 2.264583884917483, "grad_norm": 3.8499042987823486, "learning_rate": 2.7359223300970878e-05, "loss": 
0.5235, "step": 12830 }, { "epoch": 2.2663489541964523, "grad_norm": 3.177006721496582, "learning_rate": 2.734157105030892e-05, "loss": 0.5519, "step": 12840 }, { "epoch": 2.2681140234754213, "grad_norm": 1.080491065979004, "learning_rate": 2.7323918799646956e-05, "loss": 0.4556, "step": 12850 }, { "epoch": 2.2698790927543904, "grad_norm": 0.6767769455909729, "learning_rate": 2.7306266548984998e-05, "loss": 0.5929, "step": 12860 }, { "epoch": 2.27164416203336, "grad_norm": 3.2237303256988525, "learning_rate": 2.728861429832304e-05, "loss": 0.5824, "step": 12870 }, { "epoch": 2.273409231312329, "grad_norm": 0.7552109956741333, "learning_rate": 2.727096204766108e-05, "loss": 0.5254, "step": 12880 }, { "epoch": 2.2751743005912983, "grad_norm": 1.4033763408660889, "learning_rate": 2.725330979699912e-05, "loss": 0.5456, "step": 12890 }, { "epoch": 2.2769393698702673, "grad_norm": 0.9239394068717957, "learning_rate": 2.7235657546337155e-05, "loss": 0.5253, "step": 12900 }, { "epoch": 2.2787044391492364, "grad_norm": 2.7689387798309326, "learning_rate": 2.7218005295675197e-05, "loss": 0.4943, "step": 12910 }, { "epoch": 2.280469508428206, "grad_norm": 0.866808295249939, "learning_rate": 2.7200353045013238e-05, "loss": 0.5595, "step": 12920 }, { "epoch": 2.282234577707175, "grad_norm": 1.0920748710632324, "learning_rate": 2.718270079435128e-05, "loss": 0.5383, "step": 12930 }, { "epoch": 2.2839996469861443, "grad_norm": 0.7788814306259155, "learning_rate": 2.716504854368932e-05, "loss": 0.4792, "step": 12940 }, { "epoch": 2.2857647162651133, "grad_norm": 0.875015377998352, "learning_rate": 2.714739629302736e-05, "loss": 0.499, "step": 12950 }, { "epoch": 2.2875297855440824, "grad_norm": 0.830392599105835, "learning_rate": 2.7129744042365402e-05, "loss": 0.4868, "step": 12960 }, { "epoch": 2.289294854823052, "grad_norm": 0.8815006613731384, "learning_rate": 2.7112091791703443e-05, "loss": 0.4666, "step": 12970 }, { "epoch": 2.291059924102021, "grad_norm": 0.8470727205276489, "learning_rate": 2.7094439541041485e-05, "loss": 0.5324, "step": 12980 }, { "epoch": 2.2928249933809903, "grad_norm": 1.8830323219299316, "learning_rate": 2.7076787290379522e-05, "loss": 0.5445, "step": 12990 }, { "epoch": 2.2945900626599594, "grad_norm": 0.8701585531234741, "learning_rate": 2.7059135039717564e-05, "loss": 0.5036, "step": 13000 }, { "epoch": 2.2945900626599594, "eval_loss": 0.6412675976753235, "eval_runtime": 591.7448, "eval_samples_per_second": 47.87, "eval_steps_per_second": 2.395, "eval_token_accuracy": 0.0004934409498748434, "step": 13000 }, { "epoch": 2.2963551319389284, "grad_norm": 0.9522472023963928, "learning_rate": 2.7041482789055605e-05, "loss": 0.4838, "step": 13010 }, { "epoch": 2.298120201217898, "grad_norm": 2.1521029472351074, "learning_rate": 2.7023830538393646e-05, "loss": 0.473, "step": 13020 }, { "epoch": 2.299885270496867, "grad_norm": 1.0218665599822998, "learning_rate": 2.7006178287731687e-05, "loss": 0.4596, "step": 13030 }, { "epoch": 2.3016503397758363, "grad_norm": 0.8671874403953552, "learning_rate": 2.6988526037069728e-05, "loss": 0.5186, "step": 13040 }, { "epoch": 2.3034154090548054, "grad_norm": 0.9349090456962585, "learning_rate": 2.697087378640777e-05, "loss": 0.5075, "step": 13050 }, { "epoch": 2.3051804783337744, "grad_norm": 1.8751778602600098, "learning_rate": 2.695322153574581e-05, "loss": 0.5692, "step": 13060 }, { "epoch": 2.306945547612744, "grad_norm": 1.0619752407073975, "learning_rate": 2.693556928508385e-05, "loss": 0.5053, "step": 13070 }, { "epoch": 
2.308710616891713, "grad_norm": 1.0086804628372192, "learning_rate": 2.6917917034421893e-05, "loss": 0.5665, "step": 13080 }, { "epoch": 2.3104756861706823, "grad_norm": 1.0648972988128662, "learning_rate": 2.690026478375993e-05, "loss": 0.4463, "step": 13090 }, { "epoch": 2.3122407554496514, "grad_norm": 1.061785340309143, "learning_rate": 2.688261253309797e-05, "loss": 0.5137, "step": 13100 }, { "epoch": 2.3140058247286204, "grad_norm": 2.9344685077667236, "learning_rate": 2.6864960282436013e-05, "loss": 0.4788, "step": 13110 }, { "epoch": 2.31577089400759, "grad_norm": 1.2300915718078613, "learning_rate": 2.6847308031774054e-05, "loss": 0.5406, "step": 13120 }, { "epoch": 2.317535963286559, "grad_norm": 1.0264790058135986, "learning_rate": 2.6829655781112095e-05, "loss": 0.4655, "step": 13130 }, { "epoch": 2.3193010325655283, "grad_norm": 2.2145140171051025, "learning_rate": 2.6812003530450136e-05, "loss": 0.5669, "step": 13140 }, { "epoch": 2.3210661018444974, "grad_norm": 3.4140677452087402, "learning_rate": 2.6794351279788177e-05, "loss": 0.5037, "step": 13150 }, { "epoch": 2.3228311711234664, "grad_norm": 0.9157199859619141, "learning_rate": 2.677669902912622e-05, "loss": 0.5114, "step": 13160 }, { "epoch": 2.324596240402436, "grad_norm": 2.4897093772888184, "learning_rate": 2.6759046778464253e-05, "loss": 0.5387, "step": 13170 }, { "epoch": 2.326361309681405, "grad_norm": 0.6659016013145447, "learning_rate": 2.6741394527802294e-05, "loss": 0.5019, "step": 13180 }, { "epoch": 2.3281263789603743, "grad_norm": 1.978591799736023, "learning_rate": 2.6723742277140335e-05, "loss": 0.5258, "step": 13190 }, { "epoch": 2.3298914482393434, "grad_norm": 0.8332827687263489, "learning_rate": 2.6706090026478376e-05, "loss": 0.5617, "step": 13200 }, { "epoch": 2.3316565175183124, "grad_norm": 1.862585425376892, "learning_rate": 2.6688437775816417e-05, "loss": 0.4242, "step": 13210 }, { "epoch": 2.333421586797282, "grad_norm": 2.829312801361084, "learning_rate": 2.667078552515446e-05, "loss": 0.4638, "step": 13220 }, { "epoch": 2.335186656076251, "grad_norm": 0.7040713429450989, "learning_rate": 2.6653133274492496e-05, "loss": 0.4604, "step": 13230 }, { "epoch": 2.3369517253552203, "grad_norm": 2.7348663806915283, "learning_rate": 2.6635481023830537e-05, "loss": 0.5251, "step": 13240 }, { "epoch": 2.3387167946341894, "grad_norm": 1.9702792167663574, "learning_rate": 2.661782877316858e-05, "loss": 0.507, "step": 13250 }, { "epoch": 2.3404818639131584, "grad_norm": 0.856442928314209, "learning_rate": 2.660017652250662e-05, "loss": 0.5338, "step": 13260 }, { "epoch": 2.342246933192128, "grad_norm": 2.1334874629974365, "learning_rate": 2.658252427184466e-05, "loss": 0.5276, "step": 13270 }, { "epoch": 2.344012002471097, "grad_norm": 0.8047542572021484, "learning_rate": 2.6564872021182702e-05, "loss": 0.533, "step": 13280 }, { "epoch": 2.3457770717500663, "grad_norm": 1.1873878240585327, "learning_rate": 2.6547219770520743e-05, "loss": 0.5194, "step": 13290 }, { "epoch": 2.3475421410290354, "grad_norm": 1.130898356437683, "learning_rate": 2.6529567519858784e-05, "loss": 0.4911, "step": 13300 }, { "epoch": 2.3493072103080044, "grad_norm": 0.7969009280204773, "learning_rate": 2.6511915269196825e-05, "loss": 0.4655, "step": 13310 }, { "epoch": 2.351072279586974, "grad_norm": 0.8343009948730469, "learning_rate": 2.6494263018534867e-05, "loss": 0.4632, "step": 13320 }, { "epoch": 2.352837348865943, "grad_norm": 2.038909673690796, "learning_rate": 2.6476610767872904e-05, "loss": 0.468, "step": 13330 }, { 
"epoch": 2.3546024181449123, "grad_norm": 3.487717628479004, "learning_rate": 2.6458958517210946e-05, "loss": 0.4837, "step": 13340 }, { "epoch": 2.3563674874238814, "grad_norm": 3.3392558097839355, "learning_rate": 2.6441306266548987e-05, "loss": 0.4344, "step": 13350 }, { "epoch": 2.3581325567028504, "grad_norm": 2.897871971130371, "learning_rate": 2.6423654015887028e-05, "loss": 0.5185, "step": 13360 }, { "epoch": 2.35989762598182, "grad_norm": 2.5428342819213867, "learning_rate": 2.640600176522507e-05, "loss": 0.5117, "step": 13370 }, { "epoch": 2.361662695260789, "grad_norm": 1.0503923892974854, "learning_rate": 2.638834951456311e-05, "loss": 0.4753, "step": 13380 }, { "epoch": 2.3634277645397583, "grad_norm": 2.5742833614349365, "learning_rate": 2.637069726390115e-05, "loss": 0.5126, "step": 13390 }, { "epoch": 2.3651928338187274, "grad_norm": 3.079458475112915, "learning_rate": 2.6353045013239192e-05, "loss": 0.4613, "step": 13400 }, { "epoch": 2.3669579030976964, "grad_norm": 0.7799136638641357, "learning_rate": 2.6335392762577234e-05, "loss": 0.4927, "step": 13410 }, { "epoch": 2.368722972376666, "grad_norm": 2.216231346130371, "learning_rate": 2.6317740511915275e-05, "loss": 0.5037, "step": 13420 }, { "epoch": 2.370488041655635, "grad_norm": 1.005723476409912, "learning_rate": 2.630008826125331e-05, "loss": 0.5267, "step": 13430 }, { "epoch": 2.3722531109346043, "grad_norm": 0.884051501750946, "learning_rate": 2.628243601059135e-05, "loss": 0.4745, "step": 13440 }, { "epoch": 2.3740181802135734, "grad_norm": 1.1464614868164062, "learning_rate": 2.626478375992939e-05, "loss": 0.4789, "step": 13450 }, { "epoch": 2.3757832494925424, "grad_norm": 1.0192292928695679, "learning_rate": 2.624713150926743e-05, "loss": 0.4834, "step": 13460 }, { "epoch": 2.377548318771512, "grad_norm": 3.1154890060424805, "learning_rate": 2.622947925860547e-05, "loss": 0.5096, "step": 13470 }, { "epoch": 2.379313388050481, "grad_norm": 3.1223933696746826, "learning_rate": 2.621182700794351e-05, "loss": 0.4817, "step": 13480 }, { "epoch": 2.3810784573294503, "grad_norm": 1.5079442262649536, "learning_rate": 2.6194174757281553e-05, "loss": 0.4582, "step": 13490 }, { "epoch": 2.3828435266084194, "grad_norm": 2.5750479698181152, "learning_rate": 2.6176522506619594e-05, "loss": 0.5411, "step": 13500 }, { "epoch": 2.3846085958873884, "grad_norm": 2.674363613128662, "learning_rate": 2.6158870255957635e-05, "loss": 0.489, "step": 13510 }, { "epoch": 2.386373665166358, "grad_norm": 2.6622684001922607, "learning_rate": 2.6141218005295676e-05, "loss": 0.5215, "step": 13520 }, { "epoch": 2.388138734445327, "grad_norm": 1.0257827043533325, "learning_rate": 2.6123565754633717e-05, "loss": 0.4981, "step": 13530 }, { "epoch": 2.3899038037242963, "grad_norm": 1.8958207368850708, "learning_rate": 2.6105913503971758e-05, "loss": 0.4808, "step": 13540 }, { "epoch": 2.3916688730032654, "grad_norm": 0.9230754375457764, "learning_rate": 2.60882612533098e-05, "loss": 0.6154, "step": 13550 }, { "epoch": 2.3934339422822344, "grad_norm": 3.1750411987304688, "learning_rate": 2.6070609002647837e-05, "loss": 0.5281, "step": 13560 }, { "epoch": 2.395199011561204, "grad_norm": 0.994310200214386, "learning_rate": 2.605295675198588e-05, "loss": 0.5862, "step": 13570 }, { "epoch": 2.396964080840173, "grad_norm": 1.904697060585022, "learning_rate": 2.603530450132392e-05, "loss": 0.5261, "step": 13580 }, { "epoch": 2.3987291501191423, "grad_norm": 0.8757647275924683, "learning_rate": 2.601765225066196e-05, "loss": 0.5154, "step": 13590 }, { 
"epoch": 2.4004942193981114, "grad_norm": 2.1605241298675537, "learning_rate": 2.6000000000000002e-05, "loss": 0.4573, "step": 13600 }, { "epoch": 2.4022592886770804, "grad_norm": 1.7989857196807861, "learning_rate": 2.5982347749338043e-05, "loss": 0.4879, "step": 13610 }, { "epoch": 2.40402435795605, "grad_norm": 2.11667799949646, "learning_rate": 2.5964695498676084e-05, "loss": 0.4869, "step": 13620 }, { "epoch": 2.405789427235019, "grad_norm": 3.442117929458618, "learning_rate": 2.5947043248014125e-05, "loss": 0.5279, "step": 13630 }, { "epoch": 2.4075544965139883, "grad_norm": 1.5708115100860596, "learning_rate": 2.5929390997352166e-05, "loss": 0.4816, "step": 13640 }, { "epoch": 2.4093195657929574, "grad_norm": 1.3552768230438232, "learning_rate": 2.5911738746690208e-05, "loss": 0.564, "step": 13650 }, { "epoch": 2.4110846350719264, "grad_norm": 0.8411927223205566, "learning_rate": 2.589408649602825e-05, "loss": 0.401, "step": 13660 }, { "epoch": 2.412849704350896, "grad_norm": 3.806356906890869, "learning_rate": 2.5876434245366286e-05, "loss": 0.4354, "step": 13670 }, { "epoch": 2.414614773629865, "grad_norm": 2.0317461490631104, "learning_rate": 2.5858781994704328e-05, "loss": 0.4978, "step": 13680 }, { "epoch": 2.4163798429088343, "grad_norm": 2.2540907859802246, "learning_rate": 2.584112974404237e-05, "loss": 0.4289, "step": 13690 }, { "epoch": 2.4181449121878034, "grad_norm": 0.8400912880897522, "learning_rate": 2.5823477493380403e-05, "loss": 0.4892, "step": 13700 }, { "epoch": 2.4199099814667724, "grad_norm": 0.6846358180046082, "learning_rate": 2.5805825242718444e-05, "loss": 0.5721, "step": 13710 }, { "epoch": 2.421675050745742, "grad_norm": 0.9299115538597107, "learning_rate": 2.5788172992056485e-05, "loss": 0.5353, "step": 13720 }, { "epoch": 2.423440120024711, "grad_norm": 1.3329682350158691, "learning_rate": 2.5770520741394527e-05, "loss": 0.5371, "step": 13730 }, { "epoch": 2.4252051893036803, "grad_norm": 1.0928109884262085, "learning_rate": 2.5752868490732568e-05, "loss": 0.5186, "step": 13740 }, { "epoch": 2.4269702585826494, "grad_norm": 0.9928961396217346, "learning_rate": 2.573521624007061e-05, "loss": 0.5265, "step": 13750 }, { "epoch": 2.4287353278616184, "grad_norm": 2.193777322769165, "learning_rate": 2.571756398940865e-05, "loss": 0.572, "step": 13760 }, { "epoch": 2.430500397140588, "grad_norm": 1.8867632150650024, "learning_rate": 2.569991173874669e-05, "loss": 0.4702, "step": 13770 }, { "epoch": 2.432265466419557, "grad_norm": 3.6797149181365967, "learning_rate": 2.5682259488084732e-05, "loss": 0.5486, "step": 13780 }, { "epoch": 2.4340305356985263, "grad_norm": 0.9218922257423401, "learning_rate": 2.5664607237422773e-05, "loss": 0.5172, "step": 13790 }, { "epoch": 2.4357956049774954, "grad_norm": 0.8286840319633484, "learning_rate": 2.564695498676081e-05, "loss": 0.5309, "step": 13800 }, { "epoch": 2.4375606742564644, "grad_norm": 2.5113284587860107, "learning_rate": 2.5629302736098852e-05, "loss": 0.5122, "step": 13810 }, { "epoch": 2.439325743535434, "grad_norm": 4.121739864349365, "learning_rate": 2.5611650485436893e-05, "loss": 0.5529, "step": 13820 }, { "epoch": 2.441090812814403, "grad_norm": 0.8111094236373901, "learning_rate": 2.5593998234774935e-05, "loss": 0.5267, "step": 13830 }, { "epoch": 2.4428558820933723, "grad_norm": 3.327519178390503, "learning_rate": 2.5576345984112976e-05, "loss": 0.4871, "step": 13840 }, { "epoch": 2.4446209513723414, "grad_norm": 3.18033766746521, "learning_rate": 2.5558693733451017e-05, "loss": 0.4858, "step": 13850 
}, { "epoch": 2.4463860206513104, "grad_norm": 2.803899049758911, "learning_rate": 2.5541041482789058e-05, "loss": 0.5272, "step": 13860 }, { "epoch": 2.44815108993028, "grad_norm": 0.9279343485832214, "learning_rate": 2.55233892321271e-05, "loss": 0.4904, "step": 13870 }, { "epoch": 2.449916159209249, "grad_norm": 1.1356558799743652, "learning_rate": 2.550573698146514e-05, "loss": 0.4783, "step": 13880 }, { "epoch": 2.4516812284882183, "grad_norm": 2.152723550796509, "learning_rate": 2.548808473080318e-05, "loss": 0.5241, "step": 13890 }, { "epoch": 2.4534462977671874, "grad_norm": 0.8582736253738403, "learning_rate": 2.547043248014122e-05, "loss": 0.4888, "step": 13900 }, { "epoch": 2.4552113670461564, "grad_norm": 0.7155855894088745, "learning_rate": 2.545278022947926e-05, "loss": 0.5235, "step": 13910 }, { "epoch": 2.456976436325126, "grad_norm": 1.35896897315979, "learning_rate": 2.54351279788173e-05, "loss": 0.5206, "step": 13920 }, { "epoch": 2.458741505604095, "grad_norm": 1.0848078727722168, "learning_rate": 2.5417475728155343e-05, "loss": 0.5047, "step": 13930 }, { "epoch": 2.4605065748830643, "grad_norm": 2.7077338695526123, "learning_rate": 2.5399823477493384e-05, "loss": 0.5035, "step": 13940 }, { "epoch": 2.4622716441620334, "grad_norm": 2.3435027599334717, "learning_rate": 2.5382171226831425e-05, "loss": 0.4596, "step": 13950 }, { "epoch": 2.4640367134410024, "grad_norm": 1.2936254739761353, "learning_rate": 2.536451897616946e-05, "loss": 0.5803, "step": 13960 }, { "epoch": 2.465801782719972, "grad_norm": 0.7503966093063354, "learning_rate": 2.53468667255075e-05, "loss": 0.4078, "step": 13970 }, { "epoch": 2.467566851998941, "grad_norm": 3.8789782524108887, "learning_rate": 2.532921447484554e-05, "loss": 0.4721, "step": 13980 }, { "epoch": 2.4693319212779103, "grad_norm": 3.136807441711426, "learning_rate": 2.5311562224183583e-05, "loss": 0.5074, "step": 13990 }, { "epoch": 2.4710969905568794, "grad_norm": 1.8736839294433594, "learning_rate": 2.5293909973521624e-05, "loss": 0.5167, "step": 14000 }, { "epoch": 2.4710969905568794, "eval_loss": 0.6444569826126099, "eval_runtime": 591.67, "eval_samples_per_second": 47.876, "eval_steps_per_second": 2.395, "eval_token_accuracy": 0.00049364392846385, "step": 14000 }, { "epoch": 2.4728620598358484, "grad_norm": 2.7958600521087646, "learning_rate": 2.5276257722859665e-05, "loss": 0.4353, "step": 14010 }, { "epoch": 2.474627129114818, "grad_norm": 1.9591412544250488, "learning_rate": 2.5258605472197706e-05, "loss": 0.4061, "step": 14020 }, { "epoch": 2.476392198393787, "grad_norm": 0.845340371131897, "learning_rate": 2.5240953221535747e-05, "loss": 0.4764, "step": 14030 }, { "epoch": 2.4781572676727563, "grad_norm": 2.308152198791504, "learning_rate": 2.5223300970873785e-05, "loss": 0.555, "step": 14040 }, { "epoch": 2.4799223369517254, "grad_norm": 1.7259413003921509, "learning_rate": 2.5205648720211826e-05, "loss": 0.4994, "step": 14050 }, { "epoch": 2.4816874062306944, "grad_norm": 0.7561100721359253, "learning_rate": 2.5187996469549867e-05, "loss": 0.4588, "step": 14060 }, { "epoch": 2.483452475509664, "grad_norm": 0.8931876420974731, "learning_rate": 2.517034421888791e-05, "loss": 0.4509, "step": 14070 }, { "epoch": 2.485217544788633, "grad_norm": 0.8635971546173096, "learning_rate": 2.515269196822595e-05, "loss": 0.5047, "step": 14080 }, { "epoch": 2.4869826140676023, "grad_norm": 1.0457156896591187, "learning_rate": 2.513503971756399e-05, "loss": 0.5713, "step": 14090 }, { "epoch": 2.4887476833465714, "grad_norm": 
0.7360007762908936, "learning_rate": 2.5117387466902032e-05, "loss": 0.4962, "step": 14100 }, { "epoch": 2.4905127526255404, "grad_norm": 1.2129534482955933, "learning_rate": 2.5099735216240073e-05, "loss": 0.5531, "step": 14110 }, { "epoch": 2.49227782190451, "grad_norm": 1.0140886306762695, "learning_rate": 2.5082082965578114e-05, "loss": 0.5137, "step": 14120 }, { "epoch": 2.494042891183479, "grad_norm": 3.158255100250244, "learning_rate": 2.5064430714916155e-05, "loss": 0.524, "step": 14130 }, { "epoch": 2.4958079604624483, "grad_norm": 1.9043729305267334, "learning_rate": 2.5048543689320393e-05, "loss": 0.5126, "step": 14140 }, { "epoch": 2.4975730297414174, "grad_norm": 0.8182691335678101, "learning_rate": 2.503089143865843e-05, "loss": 0.4468, "step": 14150 }, { "epoch": 2.4993380990203864, "grad_norm": 2.1772260665893555, "learning_rate": 2.5013239187996472e-05, "loss": 0.502, "step": 14160 }, { "epoch": 2.501103168299356, "grad_norm": 0.8185103535652161, "learning_rate": 2.499558693733451e-05, "loss": 0.5546, "step": 14170 }, { "epoch": 2.502868237578325, "grad_norm": 1.6303755044937134, "learning_rate": 2.497793468667255e-05, "loss": 0.4519, "step": 14180 }, { "epoch": 2.5046333068572944, "grad_norm": 2.8604867458343506, "learning_rate": 2.4960282436010592e-05, "loss": 0.5637, "step": 14190 }, { "epoch": 2.5063983761362634, "grad_norm": 3.644176721572876, "learning_rate": 2.4942630185348633e-05, "loss": 0.5599, "step": 14200 }, { "epoch": 2.5081634454152324, "grad_norm": 0.834730863571167, "learning_rate": 2.4924977934686674e-05, "loss": 0.4952, "step": 14210 }, { "epoch": 2.509928514694202, "grad_norm": 1.0383777618408203, "learning_rate": 2.4907325684024716e-05, "loss": 0.5917, "step": 14220 }, { "epoch": 2.511693583973171, "grad_norm": 0.9282052516937256, "learning_rate": 2.4889673433362757e-05, "loss": 0.4442, "step": 14230 }, { "epoch": 2.5134586532521404, "grad_norm": 0.8705268502235413, "learning_rate": 2.4872021182700794e-05, "loss": 0.5424, "step": 14240 }, { "epoch": 2.5152237225311094, "grad_norm": 0.9563567638397217, "learning_rate": 2.4854368932038836e-05, "loss": 0.4315, "step": 14250 }, { "epoch": 2.5169887918100784, "grad_norm": 3.457864999771118, "learning_rate": 2.4836716681376877e-05, "loss": 0.4982, "step": 14260 }, { "epoch": 2.518753861089048, "grad_norm": 0.9856398105621338, "learning_rate": 2.4819064430714918e-05, "loss": 0.4919, "step": 14270 }, { "epoch": 2.520518930368017, "grad_norm": 0.7606726288795471, "learning_rate": 2.480141218005296e-05, "loss": 0.4871, "step": 14280 }, { "epoch": 2.5222839996469864, "grad_norm": 0.8468009233474731, "learning_rate": 2.4783759929390997e-05, "loss": 0.5633, "step": 14290 }, { "epoch": 2.5240490689259554, "grad_norm": 0.9710422158241272, "learning_rate": 2.4766107678729038e-05, "loss": 0.5282, "step": 14300 }, { "epoch": 2.5258141382049244, "grad_norm": 3.3125197887420654, "learning_rate": 2.474845542806708e-05, "loss": 0.5101, "step": 14310 }, { "epoch": 2.527579207483894, "grad_norm": 3.0758326053619385, "learning_rate": 2.473080317740512e-05, "loss": 0.5018, "step": 14320 }, { "epoch": 2.529344276762863, "grad_norm": 1.0053824186325073, "learning_rate": 2.471315092674316e-05, "loss": 0.5137, "step": 14330 }, { "epoch": 2.5311093460418324, "grad_norm": 4.004051685333252, "learning_rate": 2.4695498676081202e-05, "loss": 0.5151, "step": 14340 }, { "epoch": 2.5328744153208014, "grad_norm": 0.7134153246879578, "learning_rate": 2.4677846425419244e-05, "loss": 0.5042, "step": 14350 }, { "epoch": 2.5346394845997704, 
"grad_norm": 3.08558988571167, "learning_rate": 2.4660194174757285e-05, "loss": 0.5112, "step": 14360 }, { "epoch": 2.53640455387874, "grad_norm": 0.9656035304069519, "learning_rate": 2.4642541924095323e-05, "loss": 0.5184, "step": 14370 }, { "epoch": 2.538169623157709, "grad_norm": 0.7227429747581482, "learning_rate": 2.4624889673433364e-05, "loss": 0.4754, "step": 14380 }, { "epoch": 2.5399346924366784, "grad_norm": 1.0796962976455688, "learning_rate": 2.4607237422771405e-05, "loss": 0.5395, "step": 14390 }, { "epoch": 2.5416997617156474, "grad_norm": 1.065271019935608, "learning_rate": 2.4589585172109446e-05, "loss": 0.5096, "step": 14400 }, { "epoch": 2.5434648309946164, "grad_norm": 1.0752772092819214, "learning_rate": 2.4571932921447484e-05, "loss": 0.554, "step": 14410 }, { "epoch": 2.545229900273586, "grad_norm": 3.272254228591919, "learning_rate": 2.4554280670785525e-05, "loss": 0.4955, "step": 14420 }, { "epoch": 2.546994969552555, "grad_norm": 2.887012481689453, "learning_rate": 2.4536628420123566e-05, "loss": 0.4783, "step": 14430 }, { "epoch": 2.5487600388315244, "grad_norm": 1.104333758354187, "learning_rate": 2.4518976169461607e-05, "loss": 0.5279, "step": 14440 }, { "epoch": 2.5505251081104934, "grad_norm": 3.4142651557922363, "learning_rate": 2.450132391879965e-05, "loss": 0.4985, "step": 14450 }, { "epoch": 2.5522901773894624, "grad_norm": 1.1748323440551758, "learning_rate": 2.448367166813769e-05, "loss": 0.6331, "step": 14460 }, { "epoch": 2.554055246668432, "grad_norm": 0.7295975089073181, "learning_rate": 2.446601941747573e-05, "loss": 0.5106, "step": 14470 }, { "epoch": 2.555820315947401, "grad_norm": 3.932593584060669, "learning_rate": 2.4448367166813772e-05, "loss": 0.467, "step": 14480 }, { "epoch": 2.5575853852263704, "grad_norm": 1.1309473514556885, "learning_rate": 2.4430714916151813e-05, "loss": 0.5545, "step": 14490 }, { "epoch": 2.5593504545053394, "grad_norm": 2.504162073135376, "learning_rate": 2.441306266548985e-05, "loss": 0.4719, "step": 14500 }, { "epoch": 2.5611155237843084, "grad_norm": 1.8151975870132446, "learning_rate": 2.4395410414827892e-05, "loss": 0.4756, "step": 14510 }, { "epoch": 2.562880593063278, "grad_norm": 0.7189649343490601, "learning_rate": 2.4377758164165933e-05, "loss": 0.4718, "step": 14520 }, { "epoch": 2.564645662342247, "grad_norm": 2.3369531631469727, "learning_rate": 2.436010591350397e-05, "loss": 0.4687, "step": 14530 }, { "epoch": 2.5664107316212164, "grad_norm": 0.9797418117523193, "learning_rate": 2.4342453662842012e-05, "loss": 0.4821, "step": 14540 }, { "epoch": 2.5681758009001854, "grad_norm": 1.5502965450286865, "learning_rate": 2.4324801412180053e-05, "loss": 0.5433, "step": 14550 }, { "epoch": 2.5699408701791544, "grad_norm": 2.704186201095581, "learning_rate": 2.4307149161518094e-05, "loss": 0.5147, "step": 14560 }, { "epoch": 2.571705939458124, "grad_norm": 0.8617478013038635, "learning_rate": 2.4289496910856135e-05, "loss": 0.491, "step": 14570 }, { "epoch": 2.573471008737093, "grad_norm": 3.2564940452575684, "learning_rate": 2.4271844660194176e-05, "loss": 0.4789, "step": 14580 }, { "epoch": 2.5752360780160624, "grad_norm": 3.6057090759277344, "learning_rate": 2.4254192409532218e-05, "loss": 0.5254, "step": 14590 }, { "epoch": 2.5770011472950314, "grad_norm": 3.0366363525390625, "learning_rate": 2.423654015887026e-05, "loss": 0.4894, "step": 14600 }, { "epoch": 2.5787662165740004, "grad_norm": 2.611250877380371, "learning_rate": 2.42188879082083e-05, "loss": 0.5071, "step": 14610 }, { "epoch": 2.58053128585297, 
"grad_norm": 0.9398654103279114, "learning_rate": 2.420123565754634e-05, "loss": 0.4138, "step": 14620 }, { "epoch": 2.582296355131939, "grad_norm": 1.245468258857727, "learning_rate": 2.418358340688438e-05, "loss": 0.6041, "step": 14630 }, { "epoch": 2.5840614244109084, "grad_norm": 2.410236120223999, "learning_rate": 2.416593115622242e-05, "loss": 0.5736, "step": 14640 }, { "epoch": 2.5858264936898774, "grad_norm": 2.7321484088897705, "learning_rate": 2.4148278905560458e-05, "loss": 0.4715, "step": 14650 }, { "epoch": 2.5875915629688464, "grad_norm": 1.2167099714279175, "learning_rate": 2.41306266548985e-05, "loss": 0.5238, "step": 14660 }, { "epoch": 2.589356632247816, "grad_norm": 0.7984771132469177, "learning_rate": 2.411297440423654e-05, "loss": 0.4759, "step": 14670 }, { "epoch": 2.591121701526785, "grad_norm": 2.0809378623962402, "learning_rate": 2.409532215357458e-05, "loss": 0.4867, "step": 14680 }, { "epoch": 2.5928867708057544, "grad_norm": 1.0306892395019531, "learning_rate": 2.4077669902912622e-05, "loss": 0.4754, "step": 14690 }, { "epoch": 2.5946518400847234, "grad_norm": 4.413130760192871, "learning_rate": 2.4060017652250663e-05, "loss": 0.5013, "step": 14700 }, { "epoch": 2.5964169093636924, "grad_norm": 0.9244773387908936, "learning_rate": 2.4042365401588705e-05, "loss": 0.5159, "step": 14710 }, { "epoch": 2.598181978642662, "grad_norm": 2.5842370986938477, "learning_rate": 2.4024713150926746e-05, "loss": 0.6024, "step": 14720 }, { "epoch": 2.599947047921631, "grad_norm": 2.524390697479248, "learning_rate": 2.4007060900264787e-05, "loss": 0.4822, "step": 14730 }, { "epoch": 2.6017121172006004, "grad_norm": 2.7176120281219482, "learning_rate": 2.3989408649602828e-05, "loss": 0.5486, "step": 14740 }, { "epoch": 2.6034771864795694, "grad_norm": 0.8411146402359009, "learning_rate": 2.3971756398940866e-05, "loss": 0.5113, "step": 14750 }, { "epoch": 2.6052422557585384, "grad_norm": 1.2995619773864746, "learning_rate": 2.3954104148278907e-05, "loss": 0.4908, "step": 14760 }, { "epoch": 2.607007325037508, "grad_norm": 0.9292383193969727, "learning_rate": 2.3936451897616945e-05, "loss": 0.5188, "step": 14770 }, { "epoch": 2.608772394316477, "grad_norm": 0.9706587791442871, "learning_rate": 2.3918799646954986e-05, "loss": 0.4462, "step": 14780 }, { "epoch": 2.6105374635954464, "grad_norm": 0.8439898490905762, "learning_rate": 2.3901147396293027e-05, "loss": 0.4806, "step": 14790 }, { "epoch": 2.6123025328744154, "grad_norm": 2.3372583389282227, "learning_rate": 2.3883495145631068e-05, "loss": 0.4816, "step": 14800 }, { "epoch": 2.6140676021533844, "grad_norm": 1.188154935836792, "learning_rate": 2.3867608120035306e-05, "loss": 0.5135, "step": 14810 }, { "epoch": 2.615832671432354, "grad_norm": 1.5531268119812012, "learning_rate": 2.3849955869373347e-05, "loss": 0.4791, "step": 14820 }, { "epoch": 2.617597740711323, "grad_norm": 1.0564147233963013, "learning_rate": 2.3832303618711388e-05, "loss": 0.4124, "step": 14830 }, { "epoch": 2.6193628099902924, "grad_norm": 0.7218682169914246, "learning_rate": 2.381465136804943e-05, "loss": 0.4817, "step": 14840 }, { "epoch": 2.6211278792692614, "grad_norm": 2.071608304977417, "learning_rate": 2.3796999117387467e-05, "loss": 0.4893, "step": 14850 }, { "epoch": 2.6228929485482304, "grad_norm": 3.6240506172180176, "learning_rate": 2.3779346866725508e-05, "loss": 0.4753, "step": 14860 }, { "epoch": 2.6246580178272, "grad_norm": 2.2952630519866943, "learning_rate": 2.376169461606355e-05, "loss": 0.449, "step": 14870 }, { "epoch": 
2.626423087106169, "grad_norm": 0.794567346572876, "learning_rate": 2.374404236540159e-05, "loss": 0.442, "step": 14880 }, { "epoch": 2.6281881563851384, "grad_norm": 0.6543694138526917, "learning_rate": 2.372639011473963e-05, "loss": 0.4496, "step": 14890 }, { "epoch": 2.6299532256641074, "grad_norm": 3.7700748443603516, "learning_rate": 2.370873786407767e-05, "loss": 0.4946, "step": 14900 }, { "epoch": 2.6317182949430764, "grad_norm": 1.2146788835525513, "learning_rate": 2.369108561341571e-05, "loss": 0.4665, "step": 14910 }, { "epoch": 2.633483364222046, "grad_norm": 2.967947006225586, "learning_rate": 2.367343336275375e-05, "loss": 0.5184, "step": 14920 }, { "epoch": 2.635248433501015, "grad_norm": 0.9510634541511536, "learning_rate": 2.3655781112091793e-05, "loss": 0.4812, "step": 14930 }, { "epoch": 2.6370135027799844, "grad_norm": 0.8221954107284546, "learning_rate": 2.3638128861429834e-05, "loss": 0.4662, "step": 14940 }, { "epoch": 2.6387785720589534, "grad_norm": 0.9185769557952881, "learning_rate": 2.3620476610767875e-05, "loss": 0.4848, "step": 14950 }, { "epoch": 2.6405436413379224, "grad_norm": 0.8108286261558533, "learning_rate": 2.3602824360105916e-05, "loss": 0.4619, "step": 14960 }, { "epoch": 2.642308710616892, "grad_norm": 2.633342742919922, "learning_rate": 2.3585172109443957e-05, "loss": 0.5568, "step": 14970 }, { "epoch": 2.644073779895861, "grad_norm": 0.9109094142913818, "learning_rate": 2.3567519858781995e-05, "loss": 0.5136, "step": 14980 }, { "epoch": 2.6458388491748304, "grad_norm": 1.047798991203308, "learning_rate": 2.3549867608120036e-05, "loss": 0.5128, "step": 14990 }, { "epoch": 2.6476039184537994, "grad_norm": 1.1590611934661865, "learning_rate": 2.3532215357458077e-05, "loss": 0.5211, "step": 15000 }, { "epoch": 2.6476039184537994, "eval_loss": 0.62970370054245, "eval_runtime": 591.6243, "eval_samples_per_second": 47.88, "eval_steps_per_second": 2.395, "eval_token_accuracy": 0.0004849158491365697, "step": 15000 }, { "epoch": 2.6493689877327684, "grad_norm": 2.34171199798584, "learning_rate": 2.351456310679612e-05, "loss": 0.5036, "step": 15010 }, { "epoch": 2.651134057011738, "grad_norm": 1.2570987939834595, "learning_rate": 2.3496910856134156e-05, "loss": 0.545, "step": 15020 }, { "epoch": 2.652899126290707, "grad_norm": 1.4143269062042236, "learning_rate": 2.3479258605472197e-05, "loss": 0.5092, "step": 15030 }, { "epoch": 2.6546641955696764, "grad_norm": 1.1344726085662842, "learning_rate": 2.346160635481024e-05, "loss": 0.4663, "step": 15040 }, { "epoch": 2.6564292648486454, "grad_norm": 2.5450966358184814, "learning_rate": 2.344395410414828e-05, "loss": 0.4743, "step": 15050 }, { "epoch": 2.6581943341276144, "grad_norm": 4.249632835388184, "learning_rate": 2.342630185348632e-05, "loss": 0.5652, "step": 15060 }, { "epoch": 2.6599594034065834, "grad_norm": 3.0133004188537598, "learning_rate": 2.3408649602824362e-05, "loss": 0.5316, "step": 15070 }, { "epoch": 2.661724472685553, "grad_norm": 2.640355110168457, "learning_rate": 2.3390997352162403e-05, "loss": 0.4818, "step": 15080 }, { "epoch": 2.6634895419645224, "grad_norm": 2.2960193157196045, "learning_rate": 2.3373345101500444e-05, "loss": 0.5672, "step": 15090 }, { "epoch": 2.6652546112434914, "grad_norm": 0.8862185478210449, "learning_rate": 2.3355692850838485e-05, "loss": 0.4891, "step": 15100 }, { "epoch": 2.6670196805224604, "grad_norm": 1.2502179145812988, "learning_rate": 2.3338040600176523e-05, "loss": 0.4982, "step": 15110 }, { "epoch": 2.6687847498014294, "grad_norm": 2.1781082153320312, 
"learning_rate": 2.3320388349514564e-05, "loss": 0.5247, "step": 15120 }, { "epoch": 2.670549819080399, "grad_norm": 0.93157559633255, "learning_rate": 2.3302736098852606e-05, "loss": 0.5113, "step": 15130 }, { "epoch": 2.6723148883593684, "grad_norm": 2.9610769748687744, "learning_rate": 2.3285083848190643e-05, "loss": 0.5343, "step": 15140 }, { "epoch": 2.6740799576383374, "grad_norm": 1.2315391302108765, "learning_rate": 2.3267431597528684e-05, "loss": 0.4567, "step": 15150 }, { "epoch": 2.6758450269173064, "grad_norm": 2.971158266067505, "learning_rate": 2.3249779346866726e-05, "loss": 0.5709, "step": 15160 }, { "epoch": 2.6776100961962754, "grad_norm": 0.7065374851226807, "learning_rate": 2.3232127096204767e-05, "loss": 0.494, "step": 15170 }, { "epoch": 2.679375165475245, "grad_norm": 2.8088817596435547, "learning_rate": 2.3214474845542808e-05, "loss": 0.4709, "step": 15180 }, { "epoch": 2.6811402347542144, "grad_norm": 1.246795654296875, "learning_rate": 2.319682259488085e-05, "loss": 0.4718, "step": 15190 }, { "epoch": 2.6829053040331834, "grad_norm": 1.635353446006775, "learning_rate": 2.317917034421889e-05, "loss": 0.4402, "step": 15200 }, { "epoch": 2.6846703733121524, "grad_norm": 0.8145955204963684, "learning_rate": 2.316151809355693e-05, "loss": 0.4706, "step": 15210 }, { "epoch": 2.6864354425911214, "grad_norm": 1.0448297262191772, "learning_rate": 2.3143865842894972e-05, "loss": 0.4669, "step": 15220 }, { "epoch": 2.688200511870091, "grad_norm": 0.9495267271995544, "learning_rate": 2.3126213592233014e-05, "loss": 0.4408, "step": 15230 }, { "epoch": 2.6899655811490604, "grad_norm": 0.7679589986801147, "learning_rate": 2.310856134157105e-05, "loss": 0.5135, "step": 15240 }, { "epoch": 2.6917306504280294, "grad_norm": 2.6463985443115234, "learning_rate": 2.309090909090909e-05, "loss": 0.3998, "step": 15250 }, { "epoch": 2.6934957197069984, "grad_norm": 0.8487569093704224, "learning_rate": 2.307325684024713e-05, "loss": 0.563, "step": 15260 }, { "epoch": 2.6952607889859674, "grad_norm": 0.8544154763221741, "learning_rate": 2.305560458958517e-05, "loss": 0.4757, "step": 15270 }, { "epoch": 2.697025858264937, "grad_norm": 0.7770341634750366, "learning_rate": 2.3037952338923213e-05, "loss": 0.5124, "step": 15280 }, { "epoch": 2.6987909275439064, "grad_norm": 2.1505966186523438, "learning_rate": 2.3020300088261254e-05, "loss": 0.5038, "step": 15290 }, { "epoch": 2.7005559968228754, "grad_norm": 0.8879594802856445, "learning_rate": 2.3002647837599295e-05, "loss": 0.4735, "step": 15300 }, { "epoch": 2.7023210661018444, "grad_norm": 1.0117619037628174, "learning_rate": 2.2984995586937336e-05, "loss": 0.5157, "step": 15310 }, { "epoch": 2.7040861353808134, "grad_norm": 1.0806429386138916, "learning_rate": 2.2967343336275377e-05, "loss": 0.4627, "step": 15320 }, { "epoch": 2.705851204659783, "grad_norm": 2.156562328338623, "learning_rate": 2.2949691085613418e-05, "loss": 0.5131, "step": 15330 }, { "epoch": 2.7076162739387524, "grad_norm": 2.4664413928985596, "learning_rate": 2.293203883495146e-05, "loss": 0.4633, "step": 15340 }, { "epoch": 2.7093813432177214, "grad_norm": 0.7217195630073547, "learning_rate": 2.29143865842895e-05, "loss": 0.4845, "step": 15350 }, { "epoch": 2.7111464124966904, "grad_norm": 0.9697237610816956, "learning_rate": 2.289673433362754e-05, "loss": 0.4361, "step": 15360 }, { "epoch": 2.7129114817756594, "grad_norm": 2.4647676944732666, "learning_rate": 2.287908208296558e-05, "loss": 0.428, "step": 15370 }, { "epoch": 2.714676551054629, "grad_norm": 
3.4656920433044434, "learning_rate": 2.2861429832303617e-05, "loss": 0.4057, "step": 15380 }, { "epoch": 2.716441620333598, "grad_norm": 4.08831787109375, "learning_rate": 2.284377758164166e-05, "loss": 0.449, "step": 15390 }, { "epoch": 2.7182066896125674, "grad_norm": 1.6453620195388794, "learning_rate": 2.28261253309797e-05, "loss": 0.5226, "step": 15400 }, { "epoch": 2.7199717588915364, "grad_norm": 0.8757455348968506, "learning_rate": 2.280847308031774e-05, "loss": 0.4323, "step": 15410 }, { "epoch": 2.7217368281705054, "grad_norm": 0.7374882102012634, "learning_rate": 2.2790820829655782e-05, "loss": 0.5089, "step": 15420 }, { "epoch": 2.723501897449475, "grad_norm": 0.9058319926261902, "learning_rate": 2.2773168578993823e-05, "loss": 0.5055, "step": 15430 }, { "epoch": 2.725266966728444, "grad_norm": 0.81839519739151, "learning_rate": 2.2755516328331864e-05, "loss": 0.4996, "step": 15440 }, { "epoch": 2.7270320360074134, "grad_norm": 2.061976194381714, "learning_rate": 2.2737864077669905e-05, "loss": 0.4803, "step": 15450 }, { "epoch": 2.7287971052863824, "grad_norm": 0.9650241732597351, "learning_rate": 2.2720211827007946e-05, "loss": 0.4967, "step": 15460 }, { "epoch": 2.7305621745653514, "grad_norm": 3.2927639484405518, "learning_rate": 2.2702559576345984e-05, "loss": 0.4602, "step": 15470 }, { "epoch": 2.732327243844321, "grad_norm": 0.7154979109764099, "learning_rate": 2.2684907325684025e-05, "loss": 0.4768, "step": 15480 }, { "epoch": 2.73409231312329, "grad_norm": 1.0628308057785034, "learning_rate": 2.2667255075022066e-05, "loss": 0.4454, "step": 15490 }, { "epoch": 2.7358573824022594, "grad_norm": 3.443286418914795, "learning_rate": 2.2649602824360108e-05, "loss": 0.5177, "step": 15500 }, { "epoch": 2.7376224516812284, "grad_norm": 1.6210869550704956, "learning_rate": 2.2631950573698145e-05, "loss": 0.5108, "step": 15510 }, { "epoch": 2.7393875209601974, "grad_norm": 4.365480422973633, "learning_rate": 2.2614298323036187e-05, "loss": 0.5139, "step": 15520 }, { "epoch": 2.741152590239167, "grad_norm": 3.1389224529266357, "learning_rate": 2.2596646072374228e-05, "loss": 0.4778, "step": 15530 }, { "epoch": 2.742917659518136, "grad_norm": 0.8731733560562134, "learning_rate": 2.257899382171227e-05, "loss": 0.4987, "step": 15540 }, { "epoch": 2.7446827287971054, "grad_norm": 1.9244966506958008, "learning_rate": 2.256134157105031e-05, "loss": 0.5318, "step": 15550 }, { "epoch": 2.7464477980760744, "grad_norm": 2.3997249603271484, "learning_rate": 2.254368932038835e-05, "loss": 0.5424, "step": 15560 }, { "epoch": 2.7482128673550434, "grad_norm": 2.4747679233551025, "learning_rate": 2.2526037069726392e-05, "loss": 0.4674, "step": 15570 }, { "epoch": 2.749977936634013, "grad_norm": 1.0028587579727173, "learning_rate": 2.2508384819064433e-05, "loss": 0.496, "step": 15580 }, { "epoch": 2.751743005912982, "grad_norm": 0.7981588244438171, "learning_rate": 2.249073256840247e-05, "loss": 0.4122, "step": 15590 }, { "epoch": 2.7535080751919514, "grad_norm": 1.8606398105621338, "learning_rate": 2.2473080317740512e-05, "loss": 0.5136, "step": 15600 }, { "epoch": 2.7552731444709204, "grad_norm": 0.8973249793052673, "learning_rate": 2.2455428067078553e-05, "loss": 0.5144, "step": 15610 }, { "epoch": 2.7570382137498894, "grad_norm": 1.283713459968567, "learning_rate": 2.2437775816416595e-05, "loss": 0.6096, "step": 15620 }, { "epoch": 2.758803283028859, "grad_norm": 1.2897413969039917, "learning_rate": 2.2420123565754636e-05, "loss": 0.4907, "step": 15630 }, { "epoch": 2.760568352307828, 
"grad_norm": 3.3617520332336426, "learning_rate": 2.2402471315092673e-05, "loss": 0.4862, "step": 15640 }, { "epoch": 2.7623334215867974, "grad_norm": 0.9050626158714294, "learning_rate": 2.2384819064430715e-05, "loss": 0.504, "step": 15650 }, { "epoch": 2.7640984908657664, "grad_norm": 2.170416831970215, "learning_rate": 2.2367166813768756e-05, "loss": 0.4483, "step": 15660 }, { "epoch": 2.7658635601447354, "grad_norm": 1.0081535577774048, "learning_rate": 2.2349514563106797e-05, "loss": 0.6113, "step": 15670 }, { "epoch": 2.767628629423705, "grad_norm": 3.635767936706543, "learning_rate": 2.2331862312444838e-05, "loss": 0.5047, "step": 15680 }, { "epoch": 2.769393698702674, "grad_norm": 1.7012405395507812, "learning_rate": 2.231421006178288e-05, "loss": 0.4778, "step": 15690 }, { "epoch": 2.7711587679816434, "grad_norm": 1.043668270111084, "learning_rate": 2.229655781112092e-05, "loss": 0.5586, "step": 15700 }, { "epoch": 2.7729238372606124, "grad_norm": 2.9892914295196533, "learning_rate": 2.2278905560458958e-05, "loss": 0.4703, "step": 15710 }, { "epoch": 2.7746889065395814, "grad_norm": 0.8967382311820984, "learning_rate": 2.2261253309797e-05, "loss": 0.4559, "step": 15720 }, { "epoch": 2.776453975818551, "grad_norm": 0.8660327792167664, "learning_rate": 2.224360105913504e-05, "loss": 0.4824, "step": 15730 }, { "epoch": 2.77821904509752, "grad_norm": 2.1754908561706543, "learning_rate": 2.222594880847308e-05, "loss": 0.4651, "step": 15740 }, { "epoch": 2.7799841143764894, "grad_norm": 0.9087356328964233, "learning_rate": 2.2208296557811123e-05, "loss": 0.4744, "step": 15750 }, { "epoch": 2.7817491836554584, "grad_norm": 1.9040993452072144, "learning_rate": 2.2190644307149164e-05, "loss": 0.562, "step": 15760 }, { "epoch": 2.7835142529344274, "grad_norm": 0.6683452725410461, "learning_rate": 2.2172992056487205e-05, "loss": 0.4921, "step": 15770 }, { "epoch": 2.785279322213397, "grad_norm": 2.6163249015808105, "learning_rate": 2.2155339805825243e-05, "loss": 0.4716, "step": 15780 }, { "epoch": 2.787044391492366, "grad_norm": 2.3667445182800293, "learning_rate": 2.2137687555163284e-05, "loss": 0.5489, "step": 15790 }, { "epoch": 2.7888094607713354, "grad_norm": 0.7131396532058716, "learning_rate": 2.2120035304501325e-05, "loss": 0.4945, "step": 15800 }, { "epoch": 2.7905745300503044, "grad_norm": 2.674614667892456, "learning_rate": 2.2102383053839366e-05, "loss": 0.48, "step": 15810 }, { "epoch": 2.7923395993292734, "grad_norm": 1.0316333770751953, "learning_rate": 2.2084730803177407e-05, "loss": 0.5085, "step": 15820 }, { "epoch": 2.794104668608243, "grad_norm": 2.8404314517974854, "learning_rate": 2.2067078552515445e-05, "loss": 0.4339, "step": 15830 }, { "epoch": 2.795869737887212, "grad_norm": 0.8546010851860046, "learning_rate": 2.2049426301853486e-05, "loss": 0.5151, "step": 15840 }, { "epoch": 2.7976348071661814, "grad_norm": 3.1289405822753906, "learning_rate": 2.2031774051191527e-05, "loss": 0.4742, "step": 15850 }, { "epoch": 2.7993998764451504, "grad_norm": 3.0443801879882812, "learning_rate": 2.201412180052957e-05, "loss": 0.5226, "step": 15860 }, { "epoch": 2.8011649457241194, "grad_norm": 2.93648099899292, "learning_rate": 2.199646954986761e-05, "loss": 0.551, "step": 15870 }, { "epoch": 2.802930015003089, "grad_norm": 0.9104002714157104, "learning_rate": 2.197881729920565e-05, "loss": 0.499, "step": 15880 }, { "epoch": 2.804695084282058, "grad_norm": 1.0089343786239624, "learning_rate": 2.1961165048543692e-05, "loss": 0.5425, "step": 15890 }, { "epoch": 2.8064601535610274, 
"grad_norm": 0.7696927785873413, "learning_rate": 2.1943512797881733e-05, "loss": 0.4238, "step": 15900 }, { "epoch": 2.8082252228399964, "grad_norm": 3.6322391033172607, "learning_rate": 2.192586054721977e-05, "loss": 0.5536, "step": 15910 }, { "epoch": 2.8099902921189654, "grad_norm": 0.7961804270744324, "learning_rate": 2.1908208296557812e-05, "loss": 0.4964, "step": 15920 }, { "epoch": 2.811755361397935, "grad_norm": 1.4182825088500977, "learning_rate": 2.1890556045895853e-05, "loss": 0.5449, "step": 15930 }, { "epoch": 2.813520430676904, "grad_norm": 0.6621285676956177, "learning_rate": 2.1872903795233894e-05, "loss": 0.5348, "step": 15940 }, { "epoch": 2.8152854999558734, "grad_norm": 1.5774197578430176, "learning_rate": 2.1855251544571932e-05, "loss": 0.4791, "step": 15950 }, { "epoch": 2.8170505692348424, "grad_norm": 0.7247095704078674, "learning_rate": 2.1837599293909973e-05, "loss": 0.4904, "step": 15960 }, { "epoch": 2.8188156385138115, "grad_norm": 2.010996103286743, "learning_rate": 2.1819947043248014e-05, "loss": 0.4235, "step": 15970 }, { "epoch": 2.820580707792781, "grad_norm": 1.0321121215820312, "learning_rate": 2.1802294792586056e-05, "loss": 0.4999, "step": 15980 }, { "epoch": 2.82234577707175, "grad_norm": 1.1265519857406616, "learning_rate": 2.1784642541924097e-05, "loss": 0.4972, "step": 15990 }, { "epoch": 2.8241108463507194, "grad_norm": 1.2448257207870483, "learning_rate": 2.1766990291262138e-05, "loss": 0.4863, "step": 16000 }, { "epoch": 2.8241108463507194, "eval_loss": 0.6306177973747253, "eval_runtime": 591.8945, "eval_samples_per_second": 47.858, "eval_steps_per_second": 2.394, "eval_token_accuracy": 0.0005015600934351041, "step": 16000 }, { "epoch": 2.8258759156296884, "grad_norm": 1.085065484046936, "learning_rate": 2.174933804060018e-05, "loss": 0.5037, "step": 16010 }, { "epoch": 2.8276409849086575, "grad_norm": 1.0297991037368774, "learning_rate": 2.173168578993822e-05, "loss": 0.4709, "step": 16020 }, { "epoch": 2.829406054187627, "grad_norm": 1.2882356643676758, "learning_rate": 2.171403353927626e-05, "loss": 0.4344, "step": 16030 }, { "epoch": 2.831171123466596, "grad_norm": 3.9636170864105225, "learning_rate": 2.16963812886143e-05, "loss": 0.4831, "step": 16040 }, { "epoch": 2.8329361927455654, "grad_norm": 1.1417694091796875, "learning_rate": 2.167872903795234e-05, "loss": 0.5312, "step": 16050 }, { "epoch": 2.8347012620245344, "grad_norm": 1.081305742263794, "learning_rate": 2.166107678729038e-05, "loss": 0.5413, "step": 16060 }, { "epoch": 2.8364663313035035, "grad_norm": 1.6925846338272095, "learning_rate": 2.164342453662842e-05, "loss": 0.4676, "step": 16070 }, { "epoch": 2.838231400582473, "grad_norm": 0.9859732389450073, "learning_rate": 2.162577228596646e-05, "loss": 0.4722, "step": 16080 }, { "epoch": 2.839996469861442, "grad_norm": 3.5599405765533447, "learning_rate": 2.16081200353045e-05, "loss": 0.4447, "step": 16090 }, { "epoch": 2.8417615391404114, "grad_norm": 2.7686328887939453, "learning_rate": 2.1590467784642542e-05, "loss": 0.4336, "step": 16100 }, { "epoch": 2.8435266084193804, "grad_norm": 1.155197262763977, "learning_rate": 2.1572815533980584e-05, "loss": 0.4605, "step": 16110 }, { "epoch": 2.8452916776983495, "grad_norm": 1.153023362159729, "learning_rate": 2.1555163283318625e-05, "loss": 0.4993, "step": 16120 }, { "epoch": 2.847056746977319, "grad_norm": 0.9941584467887878, "learning_rate": 2.1537511032656666e-05, "loss": 0.5547, "step": 16130 }, { "epoch": 2.848821816256288, "grad_norm": 1.042536973953247, "learning_rate": 
2.1519858781994707e-05, "loss": 0.4827, "step": 16140 }, { "epoch": 2.8505868855352574, "grad_norm": 0.9316404461860657, "learning_rate": 2.1502206531332748e-05, "loss": 0.4378, "step": 16150 }, { "epoch": 2.8523519548142264, "grad_norm": 0.9529580473899841, "learning_rate": 2.148455428067079e-05, "loss": 0.5597, "step": 16160 }, { "epoch": 2.8541170240931955, "grad_norm": 3.498310089111328, "learning_rate": 2.1466902030008827e-05, "loss": 0.4683, "step": 16170 }, { "epoch": 2.855882093372165, "grad_norm": 2.061889410018921, "learning_rate": 2.1449249779346868e-05, "loss": 0.5144, "step": 16180 }, { "epoch": 2.857647162651134, "grad_norm": 3.0003912448883057, "learning_rate": 2.1431597528684906e-05, "loss": 0.5241, "step": 16190 }, { "epoch": 2.8594122319301034, "grad_norm": 4.18428897857666, "learning_rate": 2.1413945278022947e-05, "loss": 0.4634, "step": 16200 }, { "epoch": 2.8611773012090724, "grad_norm": 2.9743313789367676, "learning_rate": 2.139629302736099e-05, "loss": 0.5063, "step": 16210 }, { "epoch": 2.8629423704880415, "grad_norm": 1.0419633388519287, "learning_rate": 2.137864077669903e-05, "loss": 0.4746, "step": 16220 }, { "epoch": 2.864707439767011, "grad_norm": 1.1511107683181763, "learning_rate": 2.136098852603707e-05, "loss": 0.4511, "step": 16230 }, { "epoch": 2.86647250904598, "grad_norm": 3.314408302307129, "learning_rate": 2.1343336275375112e-05, "loss": 0.5168, "step": 16240 }, { "epoch": 2.8682375783249494, "grad_norm": 1.9849070310592651, "learning_rate": 2.1325684024713153e-05, "loss": 0.4397, "step": 16250 }, { "epoch": 2.8700026476039184, "grad_norm": 4.134389877319336, "learning_rate": 2.1308031774051194e-05, "loss": 0.4684, "step": 16260 }, { "epoch": 2.8717677168828875, "grad_norm": 1.888159990310669, "learning_rate": 2.1290379523389235e-05, "loss": 0.4666, "step": 16270 }, { "epoch": 2.873532786161857, "grad_norm": 0.756315290927887, "learning_rate": 2.1272727272727276e-05, "loss": 0.5064, "step": 16280 }, { "epoch": 2.875297855440826, "grad_norm": 1.1455638408660889, "learning_rate": 2.1255075022065314e-05, "loss": 0.4677, "step": 16290 }, { "epoch": 2.8770629247197954, "grad_norm": 2.3531157970428467, "learning_rate": 2.1237422771403355e-05, "loss": 0.5371, "step": 16300 }, { "epoch": 2.8788279939987644, "grad_norm": 2.039882183074951, "learning_rate": 2.1219770520741393e-05, "loss": 0.5056, "step": 16310 }, { "epoch": 2.8805930632777335, "grad_norm": 0.7574838995933533, "learning_rate": 2.1202118270079434e-05, "loss": 0.4737, "step": 16320 }, { "epoch": 2.882358132556703, "grad_norm": 2.321002960205078, "learning_rate": 2.1184466019417475e-05, "loss": 0.4875, "step": 16330 }, { "epoch": 2.884123201835672, "grad_norm": 1.2612684965133667, "learning_rate": 2.1166813768755516e-05, "loss": 0.5249, "step": 16340 }, { "epoch": 2.8858882711146414, "grad_norm": 0.8896175622940063, "learning_rate": 2.1149161518093558e-05, "loss": 0.4589, "step": 16350 }, { "epoch": 2.8876533403936104, "grad_norm": 0.7815883755683899, "learning_rate": 2.11315092674316e-05, "loss": 0.5376, "step": 16360 }, { "epoch": 2.8894184096725795, "grad_norm": 1.2477344274520874, "learning_rate": 2.111385701676964e-05, "loss": 0.495, "step": 16370 }, { "epoch": 2.891183478951549, "grad_norm": 1.113911747932434, "learning_rate": 2.109620476610768e-05, "loss": 0.6071, "step": 16380 }, { "epoch": 2.892948548230518, "grad_norm": 0.9196587204933167, "learning_rate": 2.1078552515445722e-05, "loss": 0.5024, "step": 16390 }, { "epoch": 2.8947136175094874, "grad_norm": 0.9714921116828918, 
"learning_rate": 2.1060900264783763e-05, "loss": 0.447, "step": 16400 }, { "epoch": 2.8964786867884564, "grad_norm": 1.1055673360824585, "learning_rate": 2.10432480141218e-05, "loss": 0.4452, "step": 16410 }, { "epoch": 2.8982437560674255, "grad_norm": 1.2155486345291138, "learning_rate": 2.1025595763459842e-05, "loss": 0.565, "step": 16420 }, { "epoch": 2.900008825346395, "grad_norm": 1.8765925168991089, "learning_rate": 2.1007943512797883e-05, "loss": 0.4882, "step": 16430 }, { "epoch": 2.901773894625364, "grad_norm": 0.997562050819397, "learning_rate": 2.099029126213592e-05, "loss": 0.6414, "step": 16440 }, { "epoch": 2.9035389639043334, "grad_norm": 1.0786616802215576, "learning_rate": 2.0972639011473962e-05, "loss": 0.5226, "step": 16450 }, { "epoch": 2.9053040331833024, "grad_norm": 4.614233493804932, "learning_rate": 2.0954986760812003e-05, "loss": 0.5057, "step": 16460 }, { "epoch": 2.9070691024622715, "grad_norm": 2.1534485816955566, "learning_rate": 2.0937334510150045e-05, "loss": 0.4424, "step": 16470 }, { "epoch": 2.908834171741241, "grad_norm": 0.6817697286605835, "learning_rate": 2.0919682259488086e-05, "loss": 0.5091, "step": 16480 }, { "epoch": 2.91059924102021, "grad_norm": 3.658924102783203, "learning_rate": 2.0902030008826127e-05, "loss": 0.4715, "step": 16490 }, { "epoch": 2.9123643102991794, "grad_norm": 0.9275711178779602, "learning_rate": 2.0884377758164168e-05, "loss": 0.4714, "step": 16500 }, { "epoch": 2.9141293795781484, "grad_norm": 3.1437737941741943, "learning_rate": 2.086672550750221e-05, "loss": 0.4552, "step": 16510 }, { "epoch": 2.9158944488571175, "grad_norm": 1.1514102220535278, "learning_rate": 2.0849073256840247e-05, "loss": 0.5323, "step": 16520 }, { "epoch": 2.917659518136087, "grad_norm": 2.7238919734954834, "learning_rate": 2.0831421006178288e-05, "loss": 0.4578, "step": 16530 }, { "epoch": 2.919424587415056, "grad_norm": 1.4829517602920532, "learning_rate": 2.081376875551633e-05, "loss": 0.5226, "step": 16540 }, { "epoch": 2.9211896566940254, "grad_norm": 2.4364633560180664, "learning_rate": 2.079611650485437e-05, "loss": 0.5122, "step": 16550 }, { "epoch": 2.9229547259729944, "grad_norm": 1.0211104154586792, "learning_rate": 2.077846425419241e-05, "loss": 0.5789, "step": 16560 }, { "epoch": 2.9247197952519635, "grad_norm": 2.342478036880493, "learning_rate": 2.076081200353045e-05, "loss": 0.5076, "step": 16570 }, { "epoch": 2.926484864530933, "grad_norm": 1.1480425596237183, "learning_rate": 2.074315975286849e-05, "loss": 0.5127, "step": 16580 }, { "epoch": 2.928249933809902, "grad_norm": 2.9418277740478516, "learning_rate": 2.072550750220653e-05, "loss": 0.4792, "step": 16590 }, { "epoch": 2.9300150030888714, "grad_norm": 1.6210964918136597, "learning_rate": 2.0707855251544573e-05, "loss": 0.5162, "step": 16600 }, { "epoch": 2.9317800723678404, "grad_norm": 1.0910576581954956, "learning_rate": 2.0690203000882614e-05, "loss": 0.4656, "step": 16610 }, { "epoch": 2.9335451416468095, "grad_norm": 2.7877092361450195, "learning_rate": 2.0672550750220655e-05, "loss": 0.44, "step": 16620 }, { "epoch": 2.935310210925779, "grad_norm": 1.0605442523956299, "learning_rate": 2.0654898499558696e-05, "loss": 0.5368, "step": 16630 }, { "epoch": 2.937075280204748, "grad_norm": 1.2327733039855957, "learning_rate": 2.0637246248896734e-05, "loss": 0.469, "step": 16640 }, { "epoch": 2.9388403494837174, "grad_norm": 0.9705036282539368, "learning_rate": 2.0619593998234775e-05, "loss": 0.4702, "step": 16650 }, { "epoch": 2.9406054187626864, "grad_norm": 
0.7878549695014954, "learning_rate": 2.0601941747572816e-05, "loss": 0.4524, "step": 16660 }, { "epoch": 2.9423704880416555, "grad_norm": 1.2322081327438354, "learning_rate": 2.0584289496910857e-05, "loss": 0.4912, "step": 16670 }, { "epoch": 2.944135557320625, "grad_norm": 0.9457627534866333, "learning_rate": 2.05666372462489e-05, "loss": 0.5209, "step": 16680 }, { "epoch": 2.945900626599594, "grad_norm": 1.9645498991012573, "learning_rate": 2.054898499558694e-05, "loss": 0.4715, "step": 16690 }, { "epoch": 2.9476656958785634, "grad_norm": 3.2327866554260254, "learning_rate": 2.0531332744924977e-05, "loss": 0.4796, "step": 16700 }, { "epoch": 2.9494307651575324, "grad_norm": 1.7311758995056152, "learning_rate": 2.051368049426302e-05, "loss": 0.5579, "step": 16710 }, { "epoch": 2.9511958344365015, "grad_norm": 0.8861629962921143, "learning_rate": 2.049602824360106e-05, "loss": 0.4753, "step": 16720 }, { "epoch": 2.952960903715471, "grad_norm": 0.8525285124778748, "learning_rate": 2.04783759929391e-05, "loss": 0.4858, "step": 16730 }, { "epoch": 2.95472597299444, "grad_norm": 3.608468532562256, "learning_rate": 2.0460723742277142e-05, "loss": 0.4523, "step": 16740 }, { "epoch": 2.9564910422734094, "grad_norm": 0.978895902633667, "learning_rate": 2.0443071491615183e-05, "loss": 0.4952, "step": 16750 }, { "epoch": 2.9582561115523784, "grad_norm": 2.2159929275512695, "learning_rate": 2.042541924095322e-05, "loss": 0.5031, "step": 16760 }, { "epoch": 2.9600211808313475, "grad_norm": 2.59801983833313, "learning_rate": 2.0407766990291262e-05, "loss": 0.4754, "step": 16770 }, { "epoch": 2.961786250110317, "grad_norm": 1.045888066291809, "learning_rate": 2.0390114739629303e-05, "loss": 0.5499, "step": 16780 }, { "epoch": 2.963551319389286, "grad_norm": 0.7944245338439941, "learning_rate": 2.0372462488967344e-05, "loss": 0.4866, "step": 16790 }, { "epoch": 2.9653163886682554, "grad_norm": 3.4561140537261963, "learning_rate": 2.0354810238305385e-05, "loss": 0.5503, "step": 16800 }, { "epoch": 2.9670814579472244, "grad_norm": 1.817888855934143, "learning_rate": 2.0337157987643427e-05, "loss": 0.541, "step": 16810 }, { "epoch": 2.9688465272261935, "grad_norm": 0.9524929523468018, "learning_rate": 2.0319505736981468e-05, "loss": 0.4855, "step": 16820 }, { "epoch": 2.970611596505163, "grad_norm": 2.639288902282715, "learning_rate": 2.0301853486319505e-05, "loss": 0.4909, "step": 16830 }, { "epoch": 2.972376665784132, "grad_norm": 0.8207223415374756, "learning_rate": 2.0284201235657547e-05, "loss": 0.5087, "step": 16840 }, { "epoch": 2.9741417350631014, "grad_norm": 0.8275809288024902, "learning_rate": 2.0266548984995588e-05, "loss": 0.4533, "step": 16850 }, { "epoch": 2.9759068043420704, "grad_norm": 0.7429968118667603, "learning_rate": 2.024889673433363e-05, "loss": 0.5182, "step": 16860 }, { "epoch": 2.9776718736210395, "grad_norm": 1.4792590141296387, "learning_rate": 2.023124448367167e-05, "loss": 0.4303, "step": 16870 }, { "epoch": 2.979436942900009, "grad_norm": 2.502073287963867, "learning_rate": 2.0213592233009708e-05, "loss": 0.5349, "step": 16880 }, { "epoch": 2.981202012178978, "grad_norm": 2.577254295349121, "learning_rate": 2.019593998234775e-05, "loss": 0.5173, "step": 16890 }, { "epoch": 2.9829670814579474, "grad_norm": 3.045180559158325, "learning_rate": 2.017828773168579e-05, "loss": 0.4923, "step": 16900 }, { "epoch": 2.9847321507369164, "grad_norm": 0.8430206775665283, "learning_rate": 2.016063548102383e-05, "loss": 0.498, "step": 16910 }, { "epoch": 2.9864972200158855, "grad_norm": 
2.0831096172332764, "learning_rate": 2.0142983230361872e-05, "loss": 0.5765, "step": 16920 }, { "epoch": 2.988262289294855, "grad_norm": 0.9307297468185425, "learning_rate": 2.0125330979699914e-05, "loss": 0.441, "step": 16930 }, { "epoch": 2.990027358573824, "grad_norm": 1.0840940475463867, "learning_rate": 2.0107678729037955e-05, "loss": 0.5035, "step": 16940 }, { "epoch": 2.9917924278527934, "grad_norm": 1.0643527507781982, "learning_rate": 2.0090026478375996e-05, "loss": 0.4537, "step": 16950 }, { "epoch": 2.9935574971317624, "grad_norm": 0.859700620174408, "learning_rate": 2.0072374227714037e-05, "loss": 0.5502, "step": 16960 }, { "epoch": 2.9953225664107315, "grad_norm": 1.9049547910690308, "learning_rate": 2.0054721977052075e-05, "loss": 0.4597, "step": 16970 }, { "epoch": 2.997087635689701, "grad_norm": 1.4590740203857422, "learning_rate": 2.0037069726390116e-05, "loss": 0.4553, "step": 16980 }, { "epoch": 2.99885270496867, "grad_norm": 2.080549955368042, "learning_rate": 2.0019417475728157e-05, "loss": 0.4728, "step": 16990 }, { "epoch": 3.0006177742476394, "grad_norm": 1.6013654470443726, "learning_rate": 2.0001765225066195e-05, "loss": 0.4843, "step": 17000 }, { "epoch": 3.0006177742476394, "eval_loss": 0.6180456280708313, "eval_runtime": 591.8623, "eval_samples_per_second": 47.861, "eval_steps_per_second": 2.394, "eval_token_accuracy": 0.0004966886072989477, "step": 17000 }, { "epoch": 3.0023828435266084, "grad_norm": 1.9730802774429321, "learning_rate": 1.9984112974404236e-05, "loss": 0.3307, "step": 17010 }, { "epoch": 3.0041479128055775, "grad_norm": 3.203862428665161, "learning_rate": 1.9966460723742277e-05, "loss": 0.4227, "step": 17020 }, { "epoch": 3.005912982084547, "grad_norm": 0.9686912894248962, "learning_rate": 1.9948808473080318e-05, "loss": 0.3816, "step": 17030 }, { "epoch": 3.007678051363516, "grad_norm": 0.8262260556221008, "learning_rate": 1.993115622241836e-05, "loss": 0.3952, "step": 17040 }, { "epoch": 3.0094431206424854, "grad_norm": 1.0676703453063965, "learning_rate": 1.99135039717564e-05, "loss": 0.3561, "step": 17050 }, { "epoch": 3.0112081899214544, "grad_norm": 1.036767840385437, "learning_rate": 1.989585172109444e-05, "loss": 0.3818, "step": 17060 }, { "epoch": 3.0129732592004235, "grad_norm": 0.9834463000297546, "learning_rate": 1.9878199470432483e-05, "loss": 0.378, "step": 17070 }, { "epoch": 3.014738328479393, "grad_norm": 0.8777109980583191, "learning_rate": 1.9860547219770524e-05, "loss": 0.4404, "step": 17080 }, { "epoch": 3.016503397758362, "grad_norm": 0.7740920186042786, "learning_rate": 1.9842894969108565e-05, "loss": 0.4248, "step": 17090 }, { "epoch": 3.0182684670373314, "grad_norm": 1.1949310302734375, "learning_rate": 1.9825242718446603e-05, "loss": 0.4148, "step": 17100 }, { "epoch": 3.0200335363163004, "grad_norm": 0.8576988577842712, "learning_rate": 1.9807590467784644e-05, "loss": 0.384, "step": 17110 }, { "epoch": 3.0217986055952695, "grad_norm": 1.0700007677078247, "learning_rate": 1.9789938217122682e-05, "loss": 0.3596, "step": 17120 }, { "epoch": 3.023563674874239, "grad_norm": 3.1668031215667725, "learning_rate": 1.9772285966460723e-05, "loss": 0.3766, "step": 17130 }, { "epoch": 3.025328744153208, "grad_norm": 0.8742389678955078, "learning_rate": 1.9754633715798764e-05, "loss": 0.3683, "step": 17140 }, { "epoch": 3.0270938134321774, "grad_norm": 2.866408109664917, "learning_rate": 1.9736981465136805e-05, "loss": 0.3652, "step": 17150 }, { "epoch": 3.0288588827111464, "grad_norm": 3.1418302059173584, "learning_rate": 
1.9719329214474846e-05, "loss": 0.3758, "step": 17160 }, { "epoch": 3.0306239519901155, "grad_norm": 3.4927444458007812, "learning_rate": 1.9701676963812888e-05, "loss": 0.4894, "step": 17170 }, { "epoch": 3.032389021269085, "grad_norm": 1.428471565246582, "learning_rate": 1.968402471315093e-05, "loss": 0.3906, "step": 17180 }, { "epoch": 3.034154090548054, "grad_norm": 2.707277297973633, "learning_rate": 1.966637246248897e-05, "loss": 0.4501, "step": 17190 }, { "epoch": 3.0359191598270234, "grad_norm": 0.8563360571861267, "learning_rate": 1.964872021182701e-05, "loss": 0.3319, "step": 17200 }, { "epoch": 3.0376842291059925, "grad_norm": 0.9764096140861511, "learning_rate": 1.9631067961165052e-05, "loss": 0.38, "step": 17210 }, { "epoch": 3.0394492983849615, "grad_norm": 2.875133991241455, "learning_rate": 1.961341571050309e-05, "loss": 0.3718, "step": 17220 }, { "epoch": 3.041214367663931, "grad_norm": 0.8888715505599976, "learning_rate": 1.959576345984113e-05, "loss": 0.3622, "step": 17230 }, { "epoch": 3.0429794369429, "grad_norm": 3.4094061851501465, "learning_rate": 1.957811120917917e-05, "loss": 0.3764, "step": 17240 }, { "epoch": 3.0447445062218694, "grad_norm": 1.1295435428619385, "learning_rate": 1.956045895851721e-05, "loss": 0.3731, "step": 17250 }, { "epoch": 3.0465095755008385, "grad_norm": 2.553759813308716, "learning_rate": 1.954280670785525e-05, "loss": 0.3827, "step": 17260 }, { "epoch": 3.0482746447798075, "grad_norm": 2.6989941596984863, "learning_rate": 1.9525154457193292e-05, "loss": 0.3205, "step": 17270 }, { "epoch": 3.050039714058777, "grad_norm": 3.229684352874756, "learning_rate": 1.9507502206531333e-05, "loss": 0.4119, "step": 17280 }, { "epoch": 3.051804783337746, "grad_norm": 2.388998508453369, "learning_rate": 1.9489849955869374e-05, "loss": 0.3727, "step": 17290 }, { "epoch": 3.0535698526167154, "grad_norm": 0.7352622151374817, "learning_rate": 1.9472197705207416e-05, "loss": 0.4236, "step": 17300 }, { "epoch": 3.0553349218956845, "grad_norm": 1.1700186729431152, "learning_rate": 1.9454545454545457e-05, "loss": 0.3871, "step": 17310 }, { "epoch": 3.0570999911746535, "grad_norm": 0.6964054703712463, "learning_rate": 1.9436893203883498e-05, "loss": 0.3644, "step": 17320 }, { "epoch": 3.058865060453623, "grad_norm": 2.9264075756073, "learning_rate": 1.941924095322154e-05, "loss": 0.3235, "step": 17330 }, { "epoch": 3.060630129732592, "grad_norm": 2.7784578800201416, "learning_rate": 1.9401588702559577e-05, "loss": 0.4357, "step": 17340 }, { "epoch": 3.062395199011561, "grad_norm": 3.4185690879821777, "learning_rate": 1.9383936451897618e-05, "loss": 0.4222, "step": 17350 }, { "epoch": 3.0641602682905305, "grad_norm": 2.2247936725616455, "learning_rate": 1.9366284201235656e-05, "loss": 0.4036, "step": 17360 }, { "epoch": 3.0659253375694995, "grad_norm": 3.4868199825286865, "learning_rate": 1.9348631950573697e-05, "loss": 0.3873, "step": 17370 }, { "epoch": 3.067690406848469, "grad_norm": 3.106703758239746, "learning_rate": 1.9330979699911738e-05, "loss": 0.384, "step": 17380 }, { "epoch": 3.069455476127438, "grad_norm": 0.9752678871154785, "learning_rate": 1.931332744924978e-05, "loss": 0.3587, "step": 17390 }, { "epoch": 3.071220545406407, "grad_norm": 1.7790659666061401, "learning_rate": 1.929567519858782e-05, "loss": 0.3493, "step": 17400 }, { "epoch": 3.0729856146853765, "grad_norm": 0.9488353133201599, "learning_rate": 1.927802294792586e-05, "loss": 0.3924, "step": 17410 }, { "epoch": 3.0747506839643455, "grad_norm": 3.756638288497925, "learning_rate": 
1.9260370697263903e-05, "loss": 0.4504, "step": 17420 }, { "epoch": 3.076515753243315, "grad_norm": 2.0129315853118896, "learning_rate": 1.9242718446601944e-05, "loss": 0.4081, "step": 17430 }, { "epoch": 3.078280822522284, "grad_norm": 0.9279769062995911, "learning_rate": 1.9225066195939985e-05, "loss": 0.3911, "step": 17440 }, { "epoch": 3.080045891801253, "grad_norm": 1.094159483909607, "learning_rate": 1.9207413945278026e-05, "loss": 0.3615, "step": 17450 }, { "epoch": 3.0818109610802225, "grad_norm": 1.5764673948287964, "learning_rate": 1.9189761694616064e-05, "loss": 0.427, "step": 17460 }, { "epoch": 3.0835760303591915, "grad_norm": 2.997194528579712, "learning_rate": 1.9172109443954105e-05, "loss": 0.3908, "step": 17470 }, { "epoch": 3.085341099638161, "grad_norm": 0.5910589694976807, "learning_rate": 1.9154457193292146e-05, "loss": 0.3617, "step": 17480 }, { "epoch": 3.08710616891713, "grad_norm": 2.0380403995513916, "learning_rate": 1.9136804942630187e-05, "loss": 0.3828, "step": 17490 }, { "epoch": 3.088871238196099, "grad_norm": 2.3677608966827393, "learning_rate": 1.9119152691968225e-05, "loss": 0.3954, "step": 17500 }, { "epoch": 3.0906363074750685, "grad_norm": 1.9572163820266724, "learning_rate": 1.9101500441306266e-05, "loss": 0.3373, "step": 17510 }, { "epoch": 3.0924013767540375, "grad_norm": 3.817796230316162, "learning_rate": 1.9083848190644307e-05, "loss": 0.3983, "step": 17520 }, { "epoch": 3.094166446033007, "grad_norm": 0.9311608672142029, "learning_rate": 1.906619593998235e-05, "loss": 0.4515, "step": 17530 }, { "epoch": 3.095931515311976, "grad_norm": 2.0781655311584473, "learning_rate": 1.904854368932039e-05, "loss": 0.3921, "step": 17540 }, { "epoch": 3.097696584590945, "grad_norm": 1.2067509889602661, "learning_rate": 1.903089143865843e-05, "loss": 0.4393, "step": 17550 }, { "epoch": 3.0994616538699145, "grad_norm": 0.8493309020996094, "learning_rate": 1.9013239187996472e-05, "loss": 0.4344, "step": 17560 }, { "epoch": 3.1012267231488835, "grad_norm": 4.165531158447266, "learning_rate": 1.8995586937334513e-05, "loss": 0.3652, "step": 17570 }, { "epoch": 3.102991792427853, "grad_norm": 0.9132296442985535, "learning_rate": 1.897793468667255e-05, "loss": 0.3535, "step": 17580 }, { "epoch": 3.104756861706822, "grad_norm": 4.703742027282715, "learning_rate": 1.8960282436010592e-05, "loss": 0.412, "step": 17590 }, { "epoch": 3.106521930985791, "grad_norm": 2.0259885787963867, "learning_rate": 1.8942630185348633e-05, "loss": 0.3206, "step": 17600 }, { "epoch": 3.1082870002647605, "grad_norm": 0.9822014570236206, "learning_rate": 1.8924977934686674e-05, "loss": 0.3595, "step": 17610 }, { "epoch": 3.1100520695437295, "grad_norm": 0.8171116709709167, "learning_rate": 1.8907325684024715e-05, "loss": 0.3018, "step": 17620 }, { "epoch": 3.111817138822699, "grad_norm": 1.1940850019454956, "learning_rate": 1.8889673433362753e-05, "loss": 0.3922, "step": 17630 }, { "epoch": 3.113582208101668, "grad_norm": 0.7778500914573669, "learning_rate": 1.8872021182700794e-05, "loss": 0.3292, "step": 17640 }, { "epoch": 3.115347277380637, "grad_norm": 1.2759803533554077, "learning_rate": 1.8854368932038835e-05, "loss": 0.3997, "step": 17650 }, { "epoch": 3.1171123466596065, "grad_norm": 1.983506441116333, "learning_rate": 1.8836716681376877e-05, "loss": 0.3327, "step": 17660 }, { "epoch": 3.1188774159385755, "grad_norm": 1.1584614515304565, "learning_rate": 1.8819064430714918e-05, "loss": 0.345, "step": 17670 }, { "epoch": 3.120642485217545, "grad_norm": 0.903225839138031, 
"learning_rate": 1.880141218005296e-05, "loss": 0.3868, "step": 17680 }, { "epoch": 3.122407554496514, "grad_norm": 1.5125501155853271, "learning_rate": 1.8783759929390997e-05, "loss": 0.369, "step": 17690 }, { "epoch": 3.124172623775483, "grad_norm": 0.9562115669250488, "learning_rate": 1.8766107678729038e-05, "loss": 0.3886, "step": 17700 }, { "epoch": 3.1259376930544525, "grad_norm": 1.0078356266021729, "learning_rate": 1.874845542806708e-05, "loss": 0.4391, "step": 17710 }, { "epoch": 3.1277027623334215, "grad_norm": 1.0851801633834839, "learning_rate": 1.873080317740512e-05, "loss": 0.3967, "step": 17720 }, { "epoch": 3.129467831612391, "grad_norm": 2.5538651943206787, "learning_rate": 1.871315092674316e-05, "loss": 0.3671, "step": 17730 }, { "epoch": 3.13123290089136, "grad_norm": 4.896754264831543, "learning_rate": 1.8695498676081202e-05, "loss": 0.3569, "step": 17740 }, { "epoch": 3.132997970170329, "grad_norm": 1.1560003757476807, "learning_rate": 1.8677846425419243e-05, "loss": 0.3262, "step": 17750 }, { "epoch": 3.1347630394492985, "grad_norm": 3.046627998352051, "learning_rate": 1.866019417475728e-05, "loss": 0.4524, "step": 17760 }, { "epoch": 3.1365281087282675, "grad_norm": 3.14935040473938, "learning_rate": 1.8642541924095322e-05, "loss": 0.3734, "step": 17770 }, { "epoch": 3.138293178007237, "grad_norm": 1.015893578529358, "learning_rate": 1.8624889673433364e-05, "loss": 0.522, "step": 17780 }, { "epoch": 3.140058247286206, "grad_norm": 2.6474082469940186, "learning_rate": 1.8607237422771405e-05, "loss": 0.3783, "step": 17790 }, { "epoch": 3.141823316565175, "grad_norm": 0.9868051409721375, "learning_rate": 1.8589585172109446e-05, "loss": 0.4548, "step": 17800 }, { "epoch": 3.1435883858441445, "grad_norm": 2.009326696395874, "learning_rate": 1.8571932921447484e-05, "loss": 0.3878, "step": 17810 }, { "epoch": 3.1453534551231135, "grad_norm": 2.1260197162628174, "learning_rate": 1.8554280670785525e-05, "loss": 0.365, "step": 17820 }, { "epoch": 3.147118524402083, "grad_norm": 1.069753885269165, "learning_rate": 1.8536628420123566e-05, "loss": 0.4153, "step": 17830 }, { "epoch": 3.148883593681052, "grad_norm": 1.5456312894821167, "learning_rate": 1.8518976169461607e-05, "loss": 0.4015, "step": 17840 }, { "epoch": 3.150648662960021, "grad_norm": 1.6291440725326538, "learning_rate": 1.8501323918799648e-05, "loss": 0.3667, "step": 17850 }, { "epoch": 3.1524137322389905, "grad_norm": 1.152271032333374, "learning_rate": 1.848367166813769e-05, "loss": 0.3785, "step": 17860 }, { "epoch": 3.1541788015179595, "grad_norm": 1.0361758470535278, "learning_rate": 1.846601941747573e-05, "loss": 0.3797, "step": 17870 }, { "epoch": 3.155943870796929, "grad_norm": 0.9247083067893982, "learning_rate": 1.844836716681377e-05, "loss": 0.4611, "step": 17880 }, { "epoch": 3.157708940075898, "grad_norm": 1.0259307622909546, "learning_rate": 1.843071491615181e-05, "loss": 0.3774, "step": 17890 }, { "epoch": 3.159474009354867, "grad_norm": 0.8783037662506104, "learning_rate": 1.841306266548985e-05, "loss": 0.3992, "step": 17900 }, { "epoch": 3.1612390786338365, "grad_norm": 1.3424186706542969, "learning_rate": 1.839541041482789e-05, "loss": 0.4161, "step": 17910 }, { "epoch": 3.1630041479128055, "grad_norm": 1.8806232213974, "learning_rate": 1.8377758164165933e-05, "loss": 0.4088, "step": 17920 }, { "epoch": 3.164769217191775, "grad_norm": 4.50385046005249, "learning_rate": 1.836010591350397e-05, "loss": 0.4092, "step": 17930 }, { "epoch": 3.166534286470744, "grad_norm": 1.1971051692962646, 
"learning_rate": 1.8342453662842012e-05, "loss": 0.4802, "step": 17940 }, { "epoch": 3.168299355749713, "grad_norm": 2.5064802169799805, "learning_rate": 1.8324801412180053e-05, "loss": 0.4109, "step": 17950 }, { "epoch": 3.1700644250286825, "grad_norm": 1.0116615295410156, "learning_rate": 1.8307149161518094e-05, "loss": 0.3885, "step": 17960 }, { "epoch": 3.1718294943076515, "grad_norm": 2.087759256362915, "learning_rate": 1.8289496910856135e-05, "loss": 0.4146, "step": 17970 }, { "epoch": 3.173594563586621, "grad_norm": 1.0581564903259277, "learning_rate": 1.8271844660194176e-05, "loss": 0.4173, "step": 17980 }, { "epoch": 3.17535963286559, "grad_norm": 3.7093119621276855, "learning_rate": 1.825595763459841e-05, "loss": 0.4077, "step": 17990 }, { "epoch": 3.177124702144559, "grad_norm": 0.9037616848945618, "learning_rate": 1.8238305383936452e-05, "loss": 0.4099, "step": 18000 }, { "epoch": 3.177124702144559, "eval_loss": 0.635881781578064, "eval_runtime": 591.7121, "eval_samples_per_second": 47.873, "eval_steps_per_second": 2.395, "eval_token_accuracy": 0.0004861337206706088, "step": 18000 }, { "epoch": 3.1788897714235285, "grad_norm": 1.0461044311523438, "learning_rate": 1.8220653133274493e-05, "loss": 0.3745, "step": 18010 }, { "epoch": 3.1806548407024975, "grad_norm": 3.2152843475341797, "learning_rate": 1.8203000882612534e-05, "loss": 0.4088, "step": 18020 }, { "epoch": 3.182419909981467, "grad_norm": 0.8267636895179749, "learning_rate": 1.8185348631950575e-05, "loss": 0.4058, "step": 18030 }, { "epoch": 3.184184979260436, "grad_norm": 1.0266914367675781, "learning_rate": 1.8167696381288616e-05, "loss": 0.4244, "step": 18040 }, { "epoch": 3.185950048539405, "grad_norm": 3.4762895107269287, "learning_rate": 1.8150044130626657e-05, "loss": 0.4025, "step": 18050 }, { "epoch": 3.1877151178183745, "grad_norm": 2.3744864463806152, "learning_rate": 1.8132391879964695e-05, "loss": 0.3202, "step": 18060 }, { "epoch": 3.1894801870973435, "grad_norm": 4.3991379737854, "learning_rate": 1.8114739629302736e-05, "loss": 0.4466, "step": 18070 }, { "epoch": 3.191245256376313, "grad_norm": 1.149390697479248, "learning_rate": 1.8097087378640778e-05, "loss": 0.3886, "step": 18080 }, { "epoch": 3.193010325655282, "grad_norm": 2.5938711166381836, "learning_rate": 1.807943512797882e-05, "loss": 0.3872, "step": 18090 }, { "epoch": 3.194775394934251, "grad_norm": 2.244466781616211, "learning_rate": 1.806178287731686e-05, "loss": 0.424, "step": 18100 }, { "epoch": 3.1965404642132205, "grad_norm": 1.0753962993621826, "learning_rate": 1.8044130626654898e-05, "loss": 0.4268, "step": 18110 }, { "epoch": 3.1983055334921895, "grad_norm": 2.415478229522705, "learning_rate": 1.802647837599294e-05, "loss": 0.4399, "step": 18120 }, { "epoch": 3.200070602771159, "grad_norm": 1.0524026155471802, "learning_rate": 1.800882612533098e-05, "loss": 0.4179, "step": 18130 }, { "epoch": 3.201835672050128, "grad_norm": 0.7873696684837341, "learning_rate": 1.799117387466902e-05, "loss": 0.3989, "step": 18140 }, { "epoch": 3.203600741329097, "grad_norm": 1.1974061727523804, "learning_rate": 1.7973521624007062e-05, "loss": 0.3389, "step": 18150 }, { "epoch": 3.2053658106080665, "grad_norm": 2.7853527069091797, "learning_rate": 1.7955869373345103e-05, "loss": 0.4219, "step": 18160 }, { "epoch": 3.2071308798870355, "grad_norm": 1.1441482305526733, "learning_rate": 1.7938217122683144e-05, "loss": 0.3902, "step": 18170 }, { "epoch": 3.208895949166005, "grad_norm": 2.0141642093658447, "learning_rate": 1.7920564872021182e-05, "loss": 
0.3761, "step": 18180 }, { "epoch": 3.210661018444974, "grad_norm": 1.0025014877319336, "learning_rate": 1.7902912621359223e-05, "loss": 0.4484, "step": 18190 }, { "epoch": 3.212426087723943, "grad_norm": 3.727806568145752, "learning_rate": 1.7885260370697265e-05, "loss": 0.3911, "step": 18200 }, { "epoch": 3.2141911570029125, "grad_norm": 0.956070065498352, "learning_rate": 1.7867608120035306e-05, "loss": 0.4688, "step": 18210 }, { "epoch": 3.2159562262818815, "grad_norm": 0.9003653526306152, "learning_rate": 1.7849955869373347e-05, "loss": 0.4221, "step": 18220 }, { "epoch": 3.217721295560851, "grad_norm": 0.8405077457427979, "learning_rate": 1.7832303618711388e-05, "loss": 0.3842, "step": 18230 }, { "epoch": 3.21948636483982, "grad_norm": 3.997713088989258, "learning_rate": 1.7814651368049426e-05, "loss": 0.3925, "step": 18240 }, { "epoch": 3.221251434118789, "grad_norm": 2.2268900871276855, "learning_rate": 1.7796999117387467e-05, "loss": 0.4403, "step": 18250 }, { "epoch": 3.2230165033977585, "grad_norm": 2.1332101821899414, "learning_rate": 1.7779346866725508e-05, "loss": 0.3982, "step": 18260 }, { "epoch": 3.2247815726767275, "grad_norm": 0.9260847568511963, "learning_rate": 1.776169461606355e-05, "loss": 0.3877, "step": 18270 }, { "epoch": 3.226546641955697, "grad_norm": 1.0425130128860474, "learning_rate": 1.774404236540159e-05, "loss": 0.3911, "step": 18280 }, { "epoch": 3.228311711234666, "grad_norm": 0.8160964250564575, "learning_rate": 1.772639011473963e-05, "loss": 0.383, "step": 18290 }, { "epoch": 3.230076780513635, "grad_norm": 0.8338566422462463, "learning_rate": 1.770873786407767e-05, "loss": 0.4303, "step": 18300 }, { "epoch": 3.2318418497926045, "grad_norm": 1.5822185277938843, "learning_rate": 1.769108561341571e-05, "loss": 0.3649, "step": 18310 }, { "epoch": 3.2336069190715735, "grad_norm": 2.6177778244018555, "learning_rate": 1.767343336275375e-05, "loss": 0.4294, "step": 18320 }, { "epoch": 3.235371988350543, "grad_norm": 1.1790587902069092, "learning_rate": 1.7655781112091793e-05, "loss": 0.4309, "step": 18330 }, { "epoch": 3.237137057629512, "grad_norm": 1.1360477209091187, "learning_rate": 1.7638128861429834e-05, "loss": 0.4458, "step": 18340 }, { "epoch": 3.238902126908481, "grad_norm": 1.3596748113632202, "learning_rate": 1.7620476610767875e-05, "loss": 0.4219, "step": 18350 }, { "epoch": 3.2406671961874505, "grad_norm": 3.5767550468444824, "learning_rate": 1.7602824360105916e-05, "loss": 0.387, "step": 18360 }, { "epoch": 3.2424322654664195, "grad_norm": 1.0752480030059814, "learning_rate": 1.7585172109443954e-05, "loss": 0.4437, "step": 18370 }, { "epoch": 3.244197334745389, "grad_norm": 0.7791188955307007, "learning_rate": 1.7567519858781995e-05, "loss": 0.3724, "step": 18380 }, { "epoch": 3.245962404024358, "grad_norm": 2.880629062652588, "learning_rate": 1.7549867608120036e-05, "loss": 0.3876, "step": 18390 }, { "epoch": 3.247727473303327, "grad_norm": 1.916669487953186, "learning_rate": 1.7532215357458077e-05, "loss": 0.3946, "step": 18400 }, { "epoch": 3.2494925425822965, "grad_norm": 1.1796541213989258, "learning_rate": 1.751456310679612e-05, "loss": 0.3918, "step": 18410 }, { "epoch": 3.2512576118612655, "grad_norm": 2.1787917613983154, "learning_rate": 1.7496910856134156e-05, "loss": 0.3771, "step": 18420 }, { "epoch": 3.2530226811402345, "grad_norm": 2.3476030826568604, "learning_rate": 1.7479258605472197e-05, "loss": 0.4083, "step": 18430 }, { "epoch": 3.254787750419204, "grad_norm": 2.371819496154785, "learning_rate": 1.746160635481024e-05, 
"loss": 0.3711, "step": 18440 }, { "epoch": 3.256552819698173, "grad_norm": 0.7971315383911133, "learning_rate": 1.744395410414828e-05, "loss": 0.3566, "step": 18450 }, { "epoch": 3.2583178889771425, "grad_norm": 1.1119778156280518, "learning_rate": 1.742630185348632e-05, "loss": 0.39, "step": 18460 }, { "epoch": 3.2600829582561115, "grad_norm": 3.2914371490478516, "learning_rate": 1.7408649602824362e-05, "loss": 0.429, "step": 18470 }, { "epoch": 3.2618480275350805, "grad_norm": 1.6263724565505981, "learning_rate": 1.7390997352162403e-05, "loss": 0.3881, "step": 18480 }, { "epoch": 3.26361309681405, "grad_norm": 3.777236223220825, "learning_rate": 1.7373345101500444e-05, "loss": 0.3768, "step": 18490 }, { "epoch": 3.265378166093019, "grad_norm": 0.7307336330413818, "learning_rate": 1.7355692850838482e-05, "loss": 0.3739, "step": 18500 }, { "epoch": 3.2671432353719885, "grad_norm": 0.869637668132782, "learning_rate": 1.7338040600176523e-05, "loss": 0.5611, "step": 18510 }, { "epoch": 3.2689083046509575, "grad_norm": 1.3609660863876343, "learning_rate": 1.7320388349514564e-05, "loss": 0.3561, "step": 18520 }, { "epoch": 3.2706733739299265, "grad_norm": 1.3725719451904297, "learning_rate": 1.7302736098852605e-05, "loss": 0.4136, "step": 18530 }, { "epoch": 3.272438443208896, "grad_norm": 1.636234998703003, "learning_rate": 1.7285083848190643e-05, "loss": 0.3679, "step": 18540 }, { "epoch": 3.274203512487865, "grad_norm": 0.9934202432632446, "learning_rate": 1.7267431597528684e-05, "loss": 0.4268, "step": 18550 }, { "epoch": 3.2759685817668345, "grad_norm": 2.107452869415283, "learning_rate": 1.7249779346866725e-05, "loss": 0.4559, "step": 18560 }, { "epoch": 3.2777336510458035, "grad_norm": 1.0061404705047607, "learning_rate": 1.7232127096204767e-05, "loss": 0.3688, "step": 18570 }, { "epoch": 3.2794987203247725, "grad_norm": 1.0886799097061157, "learning_rate": 1.7214474845542808e-05, "loss": 0.3709, "step": 18580 }, { "epoch": 3.281263789603742, "grad_norm": 0.9672642946243286, "learning_rate": 1.719682259488085e-05, "loss": 0.4803, "step": 18590 }, { "epoch": 3.283028858882711, "grad_norm": 2.108147382736206, "learning_rate": 1.717917034421889e-05, "loss": 0.3735, "step": 18600 }, { "epoch": 3.2847939281616805, "grad_norm": 1.3874099254608154, "learning_rate": 1.716151809355693e-05, "loss": 0.3592, "step": 18610 }, { "epoch": 3.2865589974406495, "grad_norm": 1.8656635284423828, "learning_rate": 1.7143865842894972e-05, "loss": 0.3573, "step": 18620 }, { "epoch": 3.2883240667196185, "grad_norm": 2.979400873184204, "learning_rate": 1.7126213592233013e-05, "loss": 0.4112, "step": 18630 }, { "epoch": 3.290089135998588, "grad_norm": 1.2417200803756714, "learning_rate": 1.710856134157105e-05, "loss": 0.3632, "step": 18640 }, { "epoch": 3.291854205277557, "grad_norm": 1.0724737644195557, "learning_rate": 1.7090909090909092e-05, "loss": 0.3741, "step": 18650 }, { "epoch": 3.2936192745565265, "grad_norm": 1.0055967569351196, "learning_rate": 1.707325684024713e-05, "loss": 0.4599, "step": 18660 }, { "epoch": 3.2953843438354955, "grad_norm": 1.7651218175888062, "learning_rate": 1.705560458958517e-05, "loss": 0.3824, "step": 18670 }, { "epoch": 3.2971494131144645, "grad_norm": 0.8168540000915527, "learning_rate": 1.7037952338923212e-05, "loss": 0.4056, "step": 18680 }, { "epoch": 3.298914482393434, "grad_norm": 1.0433987379074097, "learning_rate": 1.7020300088261254e-05, "loss": 0.4021, "step": 18690 }, { "epoch": 3.300679551672403, "grad_norm": 1.1858054399490356, "learning_rate": 
1.7002647837599295e-05, "loss": 0.4621, "step": 18700 }, { "epoch": 3.3024446209513725, "grad_norm": 2.612271785736084, "learning_rate": 1.6984995586937336e-05, "loss": 0.4165, "step": 18710 }, { "epoch": 3.3042096902303415, "grad_norm": 2.5530431270599365, "learning_rate": 1.6967343336275377e-05, "loss": 0.3873, "step": 18720 }, { "epoch": 3.3059747595093105, "grad_norm": 1.0394740104675293, "learning_rate": 1.6949691085613418e-05, "loss": 0.368, "step": 18730 }, { "epoch": 3.30773982878828, "grad_norm": 2.9890246391296387, "learning_rate": 1.693203883495146e-05, "loss": 0.3394, "step": 18740 }, { "epoch": 3.309504898067249, "grad_norm": 0.9640330672264099, "learning_rate": 1.69143865842895e-05, "loss": 0.365, "step": 18750 }, { "epoch": 3.3112699673462185, "grad_norm": 2.4221484661102295, "learning_rate": 1.6896734333627538e-05, "loss": 0.4086, "step": 18760 }, { "epoch": 3.3130350366251875, "grad_norm": 0.8883141875267029, "learning_rate": 1.687908208296558e-05, "loss": 0.5143, "step": 18770 }, { "epoch": 3.3148001059041565, "grad_norm": 2.6699986457824707, "learning_rate": 1.6861429832303617e-05, "loss": 0.4004, "step": 18780 }, { "epoch": 3.316565175183126, "grad_norm": 2.7353172302246094, "learning_rate": 1.6843777581641658e-05, "loss": 0.4226, "step": 18790 }, { "epoch": 3.318330244462095, "grad_norm": 2.341336727142334, "learning_rate": 1.68261253309797e-05, "loss": 0.4535, "step": 18800 }, { "epoch": 3.3200953137410645, "grad_norm": 0.9151778817176819, "learning_rate": 1.680847308031774e-05, "loss": 0.3625, "step": 18810 }, { "epoch": 3.3218603830200335, "grad_norm": 0.7896936535835266, "learning_rate": 1.679082082965578e-05, "loss": 0.3674, "step": 18820 }, { "epoch": 3.3236254522990025, "grad_norm": 1.1445233821868896, "learning_rate": 1.6773168578993823e-05, "loss": 0.3445, "step": 18830 }, { "epoch": 3.325390521577972, "grad_norm": 1.9561645984649658, "learning_rate": 1.6755516328331864e-05, "loss": 0.4055, "step": 18840 }, { "epoch": 3.327155590856941, "grad_norm": 3.36482834815979, "learning_rate": 1.6737864077669905e-05, "loss": 0.436, "step": 18850 }, { "epoch": 3.3289206601359105, "grad_norm": 1.7744373083114624, "learning_rate": 1.6720211827007946e-05, "loss": 0.4467, "step": 18860 }, { "epoch": 3.3306857294148795, "grad_norm": 1.379241704940796, "learning_rate": 1.6702559576345987e-05, "loss": 0.4124, "step": 18870 }, { "epoch": 3.3324507986938485, "grad_norm": 0.9114649891853333, "learning_rate": 1.6684907325684025e-05, "loss": 0.3661, "step": 18880 }, { "epoch": 3.334215867972818, "grad_norm": 2.910946846008301, "learning_rate": 1.6667255075022066e-05, "loss": 0.3866, "step": 18890 }, { "epoch": 3.335980937251787, "grad_norm": 1.5581128597259521, "learning_rate": 1.6649602824360104e-05, "loss": 0.4196, "step": 18900 }, { "epoch": 3.3377460065307565, "grad_norm": 1.8757463693618774, "learning_rate": 1.6631950573698145e-05, "loss": 0.3783, "step": 18910 }, { "epoch": 3.3395110758097255, "grad_norm": 3.579040765762329, "learning_rate": 1.6614298323036186e-05, "loss": 0.3791, "step": 18920 }, { "epoch": 3.3412761450886945, "grad_norm": 3.096893072128296, "learning_rate": 1.6596646072374228e-05, "loss": 0.4378, "step": 18930 }, { "epoch": 3.343041214367664, "grad_norm": 3.1453256607055664, "learning_rate": 1.657899382171227e-05, "loss": 0.3997, "step": 18940 }, { "epoch": 3.344806283646633, "grad_norm": 1.0607496500015259, "learning_rate": 1.656134157105031e-05, "loss": 0.4079, "step": 18950 }, { "epoch": 3.3465713529256025, "grad_norm": 1.0377461910247803, 
"learning_rate": 1.654368932038835e-05, "loss": 0.4123, "step": 18960 }, { "epoch": 3.3483364222045715, "grad_norm": 1.3114358186721802, "learning_rate": 1.6526037069726392e-05, "loss": 0.4246, "step": 18970 }, { "epoch": 3.3501014914835405, "grad_norm": 4.857133388519287, "learning_rate": 1.6508384819064433e-05, "loss": 0.3943, "step": 18980 }, { "epoch": 3.35186656076251, "grad_norm": 2.0322043895721436, "learning_rate": 1.6490732568402474e-05, "loss": 0.399, "step": 18990 }, { "epoch": 3.353631630041479, "grad_norm": 0.7585875391960144, "learning_rate": 1.6473080317740512e-05, "loss": 0.3883, "step": 19000 }, { "epoch": 3.353631630041479, "eval_loss": 0.6337409615516663, "eval_runtime": 591.7269, "eval_samples_per_second": 47.872, "eval_steps_per_second": 2.395, "eval_token_accuracy": 0.0004964856287099412, "step": 19000 }, { "epoch": 3.3553966993204485, "grad_norm": 3.319875717163086, "learning_rate": 1.6455428067078553e-05, "loss": 0.4074, "step": 19010 }, { "epoch": 3.3571617685994175, "grad_norm": 1.039914846420288, "learning_rate": 1.6437775816416594e-05, "loss": 0.3939, "step": 19020 }, { "epoch": 3.3589268378783865, "grad_norm": 2.315622329711914, "learning_rate": 1.6420123565754632e-05, "loss": 0.4632, "step": 19030 }, { "epoch": 3.360691907157356, "grad_norm": 1.0073977708816528, "learning_rate": 1.6402471315092673e-05, "loss": 0.3984, "step": 19040 }, { "epoch": 3.362456976436325, "grad_norm": 2.9430811405181885, "learning_rate": 1.6384819064430714e-05, "loss": 0.3565, "step": 19050 }, { "epoch": 3.3642220457152945, "grad_norm": 2.2811427116394043, "learning_rate": 1.6367166813768756e-05, "loss": 0.3919, "step": 19060 }, { "epoch": 3.3659871149942635, "grad_norm": 0.8443194031715393, "learning_rate": 1.6349514563106797e-05, "loss": 0.413, "step": 19070 }, { "epoch": 3.3677521842732325, "grad_norm": 1.146048665046692, "learning_rate": 1.6331862312444838e-05, "loss": 0.4198, "step": 19080 }, { "epoch": 3.369517253552202, "grad_norm": 0.9706215262413025, "learning_rate": 1.631421006178288e-05, "loss": 0.3997, "step": 19090 }, { "epoch": 3.371282322831171, "grad_norm": 0.9332062005996704, "learning_rate": 1.629655781112092e-05, "loss": 0.4265, "step": 19100 }, { "epoch": 3.3730473921101405, "grad_norm": 2.7127761840820312, "learning_rate": 1.6278905560458958e-05, "loss": 0.4015, "step": 19110 }, { "epoch": 3.3748124613891095, "grad_norm": 3.304807424545288, "learning_rate": 1.6261253309797e-05, "loss": 0.4094, "step": 19120 }, { "epoch": 3.3765775306680785, "grad_norm": 2.3264896869659424, "learning_rate": 1.624360105913504e-05, "loss": 0.4057, "step": 19130 }, { "epoch": 3.378342599947048, "grad_norm": 2.1947805881500244, "learning_rate": 1.622594880847308e-05, "loss": 0.4145, "step": 19140 }, { "epoch": 3.380107669226017, "grad_norm": 1.283929705619812, "learning_rate": 1.6208296557811123e-05, "loss": 0.3286, "step": 19150 }, { "epoch": 3.3818727385049865, "grad_norm": 2.9055802822113037, "learning_rate": 1.619064430714916e-05, "loss": 0.3332, "step": 19160 }, { "epoch": 3.3836378077839555, "grad_norm": 2.830183267593384, "learning_rate": 1.61729920564872e-05, "loss": 0.399, "step": 19170 }, { "epoch": 3.3854028770629245, "grad_norm": 1.0093673467636108, "learning_rate": 1.6155339805825243e-05, "loss": 0.3873, "step": 19180 }, { "epoch": 3.387167946341894, "grad_norm": 2.602121591567993, "learning_rate": 1.6137687555163284e-05, "loss": 0.3895, "step": 19190 }, { "epoch": 3.388933015620863, "grad_norm": 3.5917704105377197, "learning_rate": 1.6120035304501325e-05, "loss": 0.372, 
"step": 19200 }, { "epoch": 3.3906980848998325, "grad_norm": 0.8379570245742798, "learning_rate": 1.6102383053839366e-05, "loss": 0.3456, "step": 19210 }, { "epoch": 3.3924631541788015, "grad_norm": 1.011858582496643, "learning_rate": 1.6084730803177407e-05, "loss": 0.4192, "step": 19220 }, { "epoch": 3.3942282234577705, "grad_norm": 2.373765707015991, "learning_rate": 1.6067078552515445e-05, "loss": 0.4006, "step": 19230 }, { "epoch": 3.39599329273674, "grad_norm": 3.50282621383667, "learning_rate": 1.6049426301853486e-05, "loss": 0.49, "step": 19240 }, { "epoch": 3.397758362015709, "grad_norm": 2.8352739810943604, "learning_rate": 1.6031774051191527e-05, "loss": 0.4188, "step": 19250 }, { "epoch": 3.3995234312946785, "grad_norm": 2.508870840072632, "learning_rate": 1.601412180052957e-05, "loss": 0.389, "step": 19260 }, { "epoch": 3.4012885005736475, "grad_norm": 1.0266497135162354, "learning_rate": 1.599646954986761e-05, "loss": 0.3711, "step": 19270 }, { "epoch": 3.4030535698526165, "grad_norm": 1.2052311897277832, "learning_rate": 1.597881729920565e-05, "loss": 0.4537, "step": 19280 }, { "epoch": 3.404818639131586, "grad_norm": 2.9178948402404785, "learning_rate": 1.5961165048543692e-05, "loss": 0.423, "step": 19290 }, { "epoch": 3.406583708410555, "grad_norm": 0.8659635186195374, "learning_rate": 1.594351279788173e-05, "loss": 0.4, "step": 19300 }, { "epoch": 3.4083487776895245, "grad_norm": 0.8659955859184265, "learning_rate": 1.592586054721977e-05, "loss": 0.3794, "step": 19310 }, { "epoch": 3.4101138469684935, "grad_norm": 0.8710178732872009, "learning_rate": 1.5908208296557812e-05, "loss": 0.4134, "step": 19320 }, { "epoch": 3.4118789162474625, "grad_norm": 0.758415162563324, "learning_rate": 1.5890556045895853e-05, "loss": 0.3794, "step": 19330 }, { "epoch": 3.413643985526432, "grad_norm": 1.963364839553833, "learning_rate": 1.5872903795233894e-05, "loss": 0.3979, "step": 19340 }, { "epoch": 3.415409054805401, "grad_norm": 1.1023908853530884, "learning_rate": 1.5855251544571932e-05, "loss": 0.394, "step": 19350 }, { "epoch": 3.4171741240843705, "grad_norm": 2.183608055114746, "learning_rate": 1.5837599293909973e-05, "loss": 0.4181, "step": 19360 }, { "epoch": 3.4189391933633395, "grad_norm": 1.6452324390411377, "learning_rate": 1.5819947043248014e-05, "loss": 0.378, "step": 19370 }, { "epoch": 3.4207042626423085, "grad_norm": 1.1097979545593262, "learning_rate": 1.5802294792586055e-05, "loss": 0.3809, "step": 19380 }, { "epoch": 3.422469331921278, "grad_norm": 0.9524782299995422, "learning_rate": 1.5784642541924097e-05, "loss": 0.4181, "step": 19390 }, { "epoch": 3.424234401200247, "grad_norm": 2.443341016769409, "learning_rate": 1.5766990291262138e-05, "loss": 0.4372, "step": 19400 }, { "epoch": 3.4259994704792165, "grad_norm": 1.8552148342132568, "learning_rate": 1.574933804060018e-05, "loss": 0.4068, "step": 19410 }, { "epoch": 3.4277645397581855, "grad_norm": 2.9345669746398926, "learning_rate": 1.573168578993822e-05, "loss": 0.3257, "step": 19420 }, { "epoch": 3.4295296090371545, "grad_norm": 2.683713674545288, "learning_rate": 1.5714033539276258e-05, "loss": 0.3966, "step": 19430 }, { "epoch": 3.431294678316124, "grad_norm": 2.2262651920318604, "learning_rate": 1.56963812886143e-05, "loss": 0.3859, "step": 19440 }, { "epoch": 3.433059747595093, "grad_norm": 3.83772611618042, "learning_rate": 1.567872903795234e-05, "loss": 0.3707, "step": 19450 }, { "epoch": 3.4348248168740625, "grad_norm": 1.1714105606079102, "learning_rate": 1.566107678729038e-05, "loss": 0.4113, "step": 
19460 }, { "epoch": 3.4365898861530315, "grad_norm": 2.8277933597564697, "learning_rate": 1.564342453662842e-05, "loss": 0.388, "step": 19470 }, { "epoch": 3.4383549554320005, "grad_norm": 2.253077268600464, "learning_rate": 1.562577228596646e-05, "loss": 0.3965, "step": 19480 }, { "epoch": 3.44012002471097, "grad_norm": 2.4589033126831055, "learning_rate": 1.56081200353045e-05, "loss": 0.4136, "step": 19490 }, { "epoch": 3.441885093989939, "grad_norm": 1.1025439500808716, "learning_rate": 1.5590467784642542e-05, "loss": 0.4406, "step": 19500 }, { "epoch": 3.4436501632689085, "grad_norm": 4.201873302459717, "learning_rate": 1.5572815533980583e-05, "loss": 0.4258, "step": 19510 }, { "epoch": 3.4454152325478775, "grad_norm": 2.1414599418640137, "learning_rate": 1.5555163283318625e-05, "loss": 0.3728, "step": 19520 }, { "epoch": 3.4471803018268465, "grad_norm": 3.3690412044525146, "learning_rate": 1.5537511032656666e-05, "loss": 0.4301, "step": 19530 }, { "epoch": 3.448945371105816, "grad_norm": 0.9869192838668823, "learning_rate": 1.5519858781994707e-05, "loss": 0.406, "step": 19540 }, { "epoch": 3.450710440384785, "grad_norm": 0.9513691663742065, "learning_rate": 1.5502206531332748e-05, "loss": 0.3683, "step": 19550 }, { "epoch": 3.4524755096637545, "grad_norm": 3.8297016620635986, "learning_rate": 1.5484554280670786e-05, "loss": 0.4542, "step": 19560 }, { "epoch": 3.4542405789427235, "grad_norm": 0.9088470935821533, "learning_rate": 1.5466902030008827e-05, "loss": 0.3416, "step": 19570 }, { "epoch": 3.4560056482216925, "grad_norm": 3.7486371994018555, "learning_rate": 1.5449249779346868e-05, "loss": 0.3973, "step": 19580 }, { "epoch": 3.457770717500662, "grad_norm": 1.234134554862976, "learning_rate": 1.5431597528684906e-05, "loss": 0.4224, "step": 19590 }, { "epoch": 3.459535786779631, "grad_norm": 2.473367691040039, "learning_rate": 1.5413945278022947e-05, "loss": 0.4131, "step": 19600 }, { "epoch": 3.4613008560586005, "grad_norm": 0.9623596668243408, "learning_rate": 1.5396293027360988e-05, "loss": 0.4584, "step": 19610 }, { "epoch": 3.4630659253375695, "grad_norm": 1.3496516942977905, "learning_rate": 1.537864077669903e-05, "loss": 0.3863, "step": 19620 }, { "epoch": 3.4648309946165385, "grad_norm": 3.4756240844726562, "learning_rate": 1.536098852603707e-05, "loss": 0.4121, "step": 19630 }, { "epoch": 3.466596063895508, "grad_norm": 0.9052891731262207, "learning_rate": 1.534333627537511e-05, "loss": 0.4241, "step": 19640 }, { "epoch": 3.468361133174477, "grad_norm": 1.1238759756088257, "learning_rate": 1.5325684024713153e-05, "loss": 0.3724, "step": 19650 }, { "epoch": 3.4701262024534465, "grad_norm": 0.7360913753509521, "learning_rate": 1.5308031774051194e-05, "loss": 0.374, "step": 19660 }, { "epoch": 3.4718912717324155, "grad_norm": 2.606687068939209, "learning_rate": 1.5290379523389235e-05, "loss": 0.5032, "step": 19670 }, { "epoch": 3.4736563410113845, "grad_norm": 3.076735496520996, "learning_rate": 1.5272727272727276e-05, "loss": 0.3582, "step": 19680 }, { "epoch": 3.475421410290354, "grad_norm": 3.728522300720215, "learning_rate": 1.5255075022065312e-05, "loss": 0.4011, "step": 19690 }, { "epoch": 3.477186479569323, "grad_norm": 1.052911639213562, "learning_rate": 1.5237422771403353e-05, "loss": 0.4207, "step": 19700 }, { "epoch": 3.4789515488482925, "grad_norm": 1.7988530397415161, "learning_rate": 1.5219770520741395e-05, "loss": 0.4107, "step": 19710 }, { "epoch": 3.4807166181272615, "grad_norm": 1.0843051671981812, "learning_rate": 1.5202118270079436e-05, "loss": 0.3656, 
"step": 19720 }, { "epoch": 3.4824816874062305, "grad_norm": 2.6198229789733887, "learning_rate": 1.5184466019417475e-05, "loss": 0.37, "step": 19730 }, { "epoch": 3.4842467566852, "grad_norm": 2.750885009765625, "learning_rate": 1.5166813768755516e-05, "loss": 0.3745, "step": 19740 }, { "epoch": 3.486011825964169, "grad_norm": 3.1571855545043945, "learning_rate": 1.5149161518093557e-05, "loss": 0.3779, "step": 19750 }, { "epoch": 3.4877768952431385, "grad_norm": 1.179776668548584, "learning_rate": 1.5131509267431599e-05, "loss": 0.3756, "step": 19760 }, { "epoch": 3.4895419645221075, "grad_norm": 0.8757907748222351, "learning_rate": 1.511385701676964e-05, "loss": 0.4455, "step": 19770 }, { "epoch": 3.4913070338010765, "grad_norm": 1.0547289848327637, "learning_rate": 1.509620476610768e-05, "loss": 0.3914, "step": 19780 }, { "epoch": 3.493072103080046, "grad_norm": 0.7314989566802979, "learning_rate": 1.507855251544572e-05, "loss": 0.445, "step": 19790 }, { "epoch": 3.494837172359015, "grad_norm": 3.2221102714538574, "learning_rate": 1.5060900264783761e-05, "loss": 0.374, "step": 19800 }, { "epoch": 3.4966022416379845, "grad_norm": 0.7653711438179016, "learning_rate": 1.5043248014121803e-05, "loss": 0.3547, "step": 19810 }, { "epoch": 3.4983673109169535, "grad_norm": 1.3949837684631348, "learning_rate": 1.5025595763459844e-05, "loss": 0.3801, "step": 19820 }, { "epoch": 3.5001323801959225, "grad_norm": 2.3859100341796875, "learning_rate": 1.5007943512797882e-05, "loss": 0.4, "step": 19830 }, { "epoch": 3.501897449474892, "grad_norm": 2.28014874458313, "learning_rate": 1.4990291262135923e-05, "loss": 0.4424, "step": 19840 }, { "epoch": 3.503662518753861, "grad_norm": 0.9215405583381653, "learning_rate": 1.4972639011473962e-05, "loss": 0.3775, "step": 19850 }, { "epoch": 3.5054275880328305, "grad_norm": 3.190624237060547, "learning_rate": 1.4954986760812003e-05, "loss": 0.3817, "step": 19860 }, { "epoch": 3.5071926573117995, "grad_norm": 1.0653491020202637, "learning_rate": 1.4937334510150044e-05, "loss": 0.3981, "step": 19870 }, { "epoch": 3.5089577265907685, "grad_norm": 0.8434626460075378, "learning_rate": 1.4919682259488086e-05, "loss": 0.3985, "step": 19880 }, { "epoch": 3.510722795869738, "grad_norm": 1.124345064163208, "learning_rate": 1.4902030008826127e-05, "loss": 0.4017, "step": 19890 }, { "epoch": 3.512487865148707, "grad_norm": 2.5151355266571045, "learning_rate": 1.4884377758164166e-05, "loss": 0.4992, "step": 19900 }, { "epoch": 3.5142529344276765, "grad_norm": 1.2481873035430908, "learning_rate": 1.4866725507502207e-05, "loss": 0.3878, "step": 19910 }, { "epoch": 3.5160180037066455, "grad_norm": 1.1398800611495972, "learning_rate": 1.4849073256840248e-05, "loss": 0.3707, "step": 19920 }, { "epoch": 3.5177830729856145, "grad_norm": 2.3349595069885254, "learning_rate": 1.483142100617829e-05, "loss": 0.3675, "step": 19930 }, { "epoch": 3.519548142264584, "grad_norm": 1.104368805885315, "learning_rate": 1.481376875551633e-05, "loss": 0.3351, "step": 19940 }, { "epoch": 3.521313211543553, "grad_norm": 2.1135847568511963, "learning_rate": 1.479611650485437e-05, "loss": 0.326, "step": 19950 }, { "epoch": 3.5230782808225225, "grad_norm": 0.7931279540061951, "learning_rate": 1.477846425419241e-05, "loss": 0.3917, "step": 19960 }, { "epoch": 3.5248433501014915, "grad_norm": 3.3754491806030273, "learning_rate": 1.4760812003530449e-05, "loss": 0.3659, "step": 19970 }, { "epoch": 3.5266084193804605, "grad_norm": 3.644896984100342, "learning_rate": 1.474315975286849e-05, "loss": 0.3987, 
"step": 19980 }, { "epoch": 3.52837348865943, "grad_norm": 1.0437495708465576, "learning_rate": 1.4725507502206531e-05, "loss": 0.3862, "step": 19990 }, { "epoch": 3.530138557938399, "grad_norm": 0.6430332660675049, "learning_rate": 1.4707855251544573e-05, "loss": 0.361, "step": 20000 }, { "epoch": 3.530138557938399, "eval_loss": 0.627667248249054, "eval_runtime": 591.668, "eval_samples_per_second": 47.877, "eval_steps_per_second": 2.395, "eval_token_accuracy": 0.0004995303075450389, "step": 20000 }, { "epoch": 3.5319036272173685, "grad_norm": 0.9894623756408691, "learning_rate": 1.4690203000882614e-05, "loss": 0.397, "step": 20010 }, { "epoch": 3.5336686964963375, "grad_norm": 0.8043822646141052, "learning_rate": 1.4672550750220653e-05, "loss": 0.2927, "step": 20020 }, { "epoch": 3.5354337657753065, "grad_norm": 1.4241021871566772, "learning_rate": 1.4654898499558694e-05, "loss": 0.3976, "step": 20030 }, { "epoch": 3.537198835054276, "grad_norm": 2.9625587463378906, "learning_rate": 1.4637246248896735e-05, "loss": 0.4065, "step": 20040 }, { "epoch": 3.538963904333245, "grad_norm": 1.040947675704956, "learning_rate": 1.4619593998234777e-05, "loss": 0.4393, "step": 20050 }, { "epoch": 3.5407289736122145, "grad_norm": 2.4093751907348633, "learning_rate": 1.4601941747572818e-05, "loss": 0.4131, "step": 20060 }, { "epoch": 3.5424940428911835, "grad_norm": 0.9673560857772827, "learning_rate": 1.4584289496910857e-05, "loss": 0.3504, "step": 20070 }, { "epoch": 3.5442591121701525, "grad_norm": 1.1420180797576904, "learning_rate": 1.4566637246248898e-05, "loss": 0.3997, "step": 20080 }, { "epoch": 3.546024181449122, "grad_norm": 2.3320648670196533, "learning_rate": 1.4548984995586936e-05, "loss": 0.4331, "step": 20090 }, { "epoch": 3.547789250728091, "grad_norm": 2.1832408905029297, "learning_rate": 1.4531332744924977e-05, "loss": 0.4219, "step": 20100 }, { "epoch": 3.5495543200070605, "grad_norm": 0.7137057781219482, "learning_rate": 1.4513680494263018e-05, "loss": 0.3839, "step": 20110 }, { "epoch": 3.5513193892860295, "grad_norm": 2.7392122745513916, "learning_rate": 1.449602824360106e-05, "loss": 0.3617, "step": 20120 }, { "epoch": 3.5530844585649985, "grad_norm": 2.4449989795684814, "learning_rate": 1.44783759929391e-05, "loss": 0.3957, "step": 20130 }, { "epoch": 3.554849527843968, "grad_norm": 1.5275472402572632, "learning_rate": 1.446072374227714e-05, "loss": 0.3073, "step": 20140 }, { "epoch": 3.556614597122937, "grad_norm": 0.8892549276351929, "learning_rate": 1.4443071491615181e-05, "loss": 0.4079, "step": 20150 }, { "epoch": 3.5583796664019065, "grad_norm": 2.4779117107391357, "learning_rate": 1.4425419240953222e-05, "loss": 0.3696, "step": 20160 }, { "epoch": 3.5601447356808755, "grad_norm": 2.004605770111084, "learning_rate": 1.4407766990291264e-05, "loss": 0.3899, "step": 20170 }, { "epoch": 3.5619098049598445, "grad_norm": 1.1132246255874634, "learning_rate": 1.4390114739629305e-05, "loss": 0.3858, "step": 20180 }, { "epoch": 3.563674874238814, "grad_norm": 3.4782907962799072, "learning_rate": 1.4372462488967344e-05, "loss": 0.3474, "step": 20190 }, { "epoch": 3.565439943517783, "grad_norm": 0.9343898296356201, "learning_rate": 1.4354810238305385e-05, "loss": 0.3925, "step": 20200 }, { "epoch": 3.5672050127967525, "grad_norm": 1.3837854862213135, "learning_rate": 1.4337157987643426e-05, "loss": 0.4391, "step": 20210 }, { "epoch": 3.5689700820757215, "grad_norm": 4.418737888336182, "learning_rate": 1.4319505736981464e-05, "loss": 0.3682, "step": 20220 }, { "epoch": 
3.5707351513546906, "grad_norm": 2.9316399097442627, "learning_rate": 1.4301853486319505e-05, "loss": 0.3837, "step": 20230 }, { "epoch": 3.57250022063366, "grad_norm": 1.3634068965911865, "learning_rate": 1.4284201235657546e-05, "loss": 0.4566, "step": 20240 }, { "epoch": 3.574265289912629, "grad_norm": 1.034454107284546, "learning_rate": 1.4266548984995588e-05, "loss": 0.4131, "step": 20250 }, { "epoch": 3.5760303591915985, "grad_norm": 0.7224887013435364, "learning_rate": 1.4248896734333627e-05, "loss": 0.3576, "step": 20260 }, { "epoch": 3.5777954284705675, "grad_norm": 1.0184725522994995, "learning_rate": 1.4231244483671668e-05, "loss": 0.4344, "step": 20270 }, { "epoch": 3.5795604977495366, "grad_norm": 0.8519681096076965, "learning_rate": 1.421359223300971e-05, "loss": 0.3649, "step": 20280 }, { "epoch": 3.581325567028506, "grad_norm": 0.8168842196464539, "learning_rate": 1.419593998234775e-05, "loss": 0.3725, "step": 20290 }, { "epoch": 3.583090636307475, "grad_norm": 4.341702938079834, "learning_rate": 1.4178287731685792e-05, "loss": 0.4082, "step": 20300 }, { "epoch": 3.5848557055864445, "grad_norm": 0.7056460380554199, "learning_rate": 1.4160635481023831e-05, "loss": 0.379, "step": 20310 }, { "epoch": 3.5866207748654135, "grad_norm": 3.9937493801116943, "learning_rate": 1.4142983230361872e-05, "loss": 0.3684, "step": 20320 }, { "epoch": 3.5883858441443826, "grad_norm": 2.532386064529419, "learning_rate": 1.4125330979699913e-05, "loss": 0.3415, "step": 20330 }, { "epoch": 3.590150913423352, "grad_norm": 2.4106085300445557, "learning_rate": 1.4107678729037955e-05, "loss": 0.3569, "step": 20340 }, { "epoch": 3.591915982702321, "grad_norm": 1.1405956745147705, "learning_rate": 1.4090026478375996e-05, "loss": 0.4453, "step": 20350 }, { "epoch": 3.5936810519812905, "grad_norm": 1.306481957435608, "learning_rate": 1.4072374227714033e-05, "loss": 0.3698, "step": 20360 }, { "epoch": 3.5954461212602595, "grad_norm": 0.9987174272537231, "learning_rate": 1.4054721977052075e-05, "loss": 0.3728, "step": 20370 }, { "epoch": 3.5972111905392286, "grad_norm": 4.221369743347168, "learning_rate": 1.4037069726390114e-05, "loss": 0.41, "step": 20380 }, { "epoch": 3.598976259818198, "grad_norm": 1.0658732652664185, "learning_rate": 1.4019417475728155e-05, "loss": 0.3043, "step": 20390 }, { "epoch": 3.600741329097167, "grad_norm": 1.355228066444397, "learning_rate": 1.4001765225066196e-05, "loss": 0.377, "step": 20400 }, { "epoch": 3.6025063983761365, "grad_norm": 3.138543128967285, "learning_rate": 1.3984112974404237e-05, "loss": 0.3937, "step": 20410 }, { "epoch": 3.6042714676551055, "grad_norm": 0.8033159375190735, "learning_rate": 1.3966460723742279e-05, "loss": 0.3792, "step": 20420 }, { "epoch": 3.6060365369340746, "grad_norm": 0.810020923614502, "learning_rate": 1.3948808473080318e-05, "loss": 0.4466, "step": 20430 }, { "epoch": 3.607801606213044, "grad_norm": 1.978060007095337, "learning_rate": 1.393115622241836e-05, "loss": 0.4336, "step": 20440 }, { "epoch": 3.609566675492013, "grad_norm": 2.3692550659179688, "learning_rate": 1.39135039717564e-05, "loss": 0.3689, "step": 20450 }, { "epoch": 3.6113317447709825, "grad_norm": 1.4536701440811157, "learning_rate": 1.3895851721094442e-05, "loss": 0.377, "step": 20460 }, { "epoch": 3.6130968140499515, "grad_norm": 1.230667233467102, "learning_rate": 1.3878199470432483e-05, "loss": 0.3791, "step": 20470 }, { "epoch": 3.6148618833289206, "grad_norm": 0.8049829006195068, "learning_rate": 1.3860547219770522e-05, "loss": 0.3594, "step": 20480 }, { 
"epoch": 3.61662695260789, "grad_norm": 0.8730904459953308, "learning_rate": 1.3842894969108562e-05, "loss": 0.3676, "step": 20490 }, { "epoch": 3.618392021886859, "grad_norm": 4.918626308441162, "learning_rate": 1.3825242718446601e-05, "loss": 0.4362, "step": 20500 }, { "epoch": 3.6201570911658285, "grad_norm": 1.2398933172225952, "learning_rate": 1.3807590467784642e-05, "loss": 0.3963, "step": 20510 }, { "epoch": 3.6219221604447975, "grad_norm": 1.5302728414535522, "learning_rate": 1.3789938217122683e-05, "loss": 0.3749, "step": 20520 }, { "epoch": 3.6236872297237666, "grad_norm": 3.465019702911377, "learning_rate": 1.3772285966460724e-05, "loss": 0.4055, "step": 20530 }, { "epoch": 3.625452299002736, "grad_norm": 2.143458604812622, "learning_rate": 1.3754633715798766e-05, "loss": 0.3957, "step": 20540 }, { "epoch": 3.627217368281705, "grad_norm": 2.2639827728271484, "learning_rate": 1.3736981465136805e-05, "loss": 0.3577, "step": 20550 }, { "epoch": 3.6289824375606745, "grad_norm": 1.8318665027618408, "learning_rate": 1.3719329214474846e-05, "loss": 0.3756, "step": 20560 }, { "epoch": 3.6307475068396435, "grad_norm": 0.9574584364891052, "learning_rate": 1.3701676963812887e-05, "loss": 0.3594, "step": 20570 }, { "epoch": 3.6325125761186126, "grad_norm": 1.0685011148452759, "learning_rate": 1.3684024713150929e-05, "loss": 0.3595, "step": 20580 }, { "epoch": 3.6342776453975816, "grad_norm": 1.5938297510147095, "learning_rate": 1.366637246248897e-05, "loss": 0.3697, "step": 20590 }, { "epoch": 3.636042714676551, "grad_norm": 1.924798846244812, "learning_rate": 1.3648720211827009e-05, "loss": 0.4691, "step": 20600 }, { "epoch": 3.6378077839555205, "grad_norm": 1.6222426891326904, "learning_rate": 1.363106796116505e-05, "loss": 0.36, "step": 20610 }, { "epoch": 3.6395728532344895, "grad_norm": 1.0212628841400146, "learning_rate": 1.3613415710503088e-05, "loss": 0.3561, "step": 20620 }, { "epoch": 3.6413379225134586, "grad_norm": 1.080259084701538, "learning_rate": 1.359576345984113e-05, "loss": 0.3642, "step": 20630 }, { "epoch": 3.6431029917924276, "grad_norm": 1.1045883893966675, "learning_rate": 1.357811120917917e-05, "loss": 0.4131, "step": 20640 }, { "epoch": 3.644868061071397, "grad_norm": 2.390874147415161, "learning_rate": 1.3560458958517211e-05, "loss": 0.3447, "step": 20650 }, { "epoch": 3.6466331303503665, "grad_norm": 1.1138941049575806, "learning_rate": 1.3542806707855251e-05, "loss": 0.3586, "step": 20660 }, { "epoch": 3.6483981996293355, "grad_norm": 1.5280704498291016, "learning_rate": 1.3525154457193292e-05, "loss": 0.3944, "step": 20670 }, { "epoch": 3.6501632689083046, "grad_norm": 1.3573005199432373, "learning_rate": 1.3507502206531333e-05, "loss": 0.3321, "step": 20680 }, { "epoch": 3.6519283381872736, "grad_norm": 2.458770751953125, "learning_rate": 1.3489849955869374e-05, "loss": 0.4535, "step": 20690 }, { "epoch": 3.653693407466243, "grad_norm": 1.4218165874481201, "learning_rate": 1.3472197705207415e-05, "loss": 0.3851, "step": 20700 }, { "epoch": 3.6554584767452125, "grad_norm": 3.5979576110839844, "learning_rate": 1.3454545454545457e-05, "loss": 0.4107, "step": 20710 }, { "epoch": 3.6572235460241815, "grad_norm": 0.8865223526954651, "learning_rate": 1.3436893203883496e-05, "loss": 0.3959, "step": 20720 }, { "epoch": 3.6589886153031506, "grad_norm": 1.084224820137024, "learning_rate": 1.3419240953221537e-05, "loss": 0.3803, "step": 20730 }, { "epoch": 3.6607536845821196, "grad_norm": 1.8160624504089355, "learning_rate": 1.3401588702559578e-05, "loss": 0.4098, "step": 
20740 }, { "epoch": 3.662518753861089, "grad_norm": 1.106541633605957, "learning_rate": 1.3383936451897616e-05, "loss": 0.4183, "step": 20750 }, { "epoch": 3.6642838231400585, "grad_norm": 0.7795540690422058, "learning_rate": 1.3366284201235657e-05, "loss": 0.288, "step": 20760 }, { "epoch": 3.6660488924190275, "grad_norm": 1.5033721923828125, "learning_rate": 1.3348631950573698e-05, "loss": 0.4001, "step": 20770 }, { "epoch": 3.6678139616979966, "grad_norm": 0.7211555242538452, "learning_rate": 1.3330979699911738e-05, "loss": 0.297, "step": 20780 }, { "epoch": 3.6695790309769656, "grad_norm": 4.685546875, "learning_rate": 1.3313327449249779e-05, "loss": 0.3903, "step": 20790 }, { "epoch": 3.671344100255935, "grad_norm": 0.9873565435409546, "learning_rate": 1.329567519858782e-05, "loss": 0.3887, "step": 20800 }, { "epoch": 3.6731091695349045, "grad_norm": 3.9295639991760254, "learning_rate": 1.3278022947925861e-05, "loss": 0.3855, "step": 20810 }, { "epoch": 3.6748742388138735, "grad_norm": Infinity, "learning_rate": 1.3262135922330099e-05, "loss": 0.4774, "step": 20820 }, { "epoch": 3.6766393080928426, "grad_norm": 1.827843427658081, "learning_rate": 1.3244483671668137e-05, "loss": 0.3703, "step": 20830 }, { "epoch": 3.6784043773718116, "grad_norm": 3.078402042388916, "learning_rate": 1.3226831421006178e-05, "loss": 0.3907, "step": 20840 }, { "epoch": 3.680169446650781, "grad_norm": 1.2314592599868774, "learning_rate": 1.3209179170344219e-05, "loss": 0.3843, "step": 20850 }, { "epoch": 3.6819345159297505, "grad_norm": 1.0087709426879883, "learning_rate": 1.319152691968226e-05, "loss": 0.4249, "step": 20860 }, { "epoch": 3.6836995852087195, "grad_norm": 0.9953848719596863, "learning_rate": 1.31738746690203e-05, "loss": 0.3404, "step": 20870 }, { "epoch": 3.6854646544876886, "grad_norm": 2.3661205768585205, "learning_rate": 1.315622241835834e-05, "loss": 0.3602, "step": 20880 }, { "epoch": 3.6872297237666576, "grad_norm": 2.3409297466278076, "learning_rate": 1.3138570167696382e-05, "loss": 0.3858, "step": 20890 }, { "epoch": 3.688994793045627, "grad_norm": 1.0345180034637451, "learning_rate": 1.3120917917034423e-05, "loss": 0.4095, "step": 20900 }, { "epoch": 3.690759862324596, "grad_norm": 0.862300455570221, "learning_rate": 1.3103265666372464e-05, "loss": 0.3509, "step": 20910 }, { "epoch": 3.6925249316035655, "grad_norm": 4.365932941436768, "learning_rate": 1.3085613415710504e-05, "loss": 0.3304, "step": 20920 }, { "epoch": 3.6942900008825346, "grad_norm": 1.525173544883728, "learning_rate": 1.3067961165048545e-05, "loss": 0.3922, "step": 20930 }, { "epoch": 3.6960550701615036, "grad_norm": 1.042880654335022, "learning_rate": 1.3050308914386586e-05, "loss": 0.379, "step": 20940 }, { "epoch": 3.697820139440473, "grad_norm": 5.789563179016113, "learning_rate": 1.3032656663724627e-05, "loss": 0.4081, "step": 20950 }, { "epoch": 3.699585208719442, "grad_norm": 1.1197853088378906, "learning_rate": 1.3015004413062668e-05, "loss": 0.3498, "step": 20960 }, { "epoch": 3.7013502779984115, "grad_norm": 3.8499810695648193, "learning_rate": 1.2997352162400706e-05, "loss": 0.4205, "step": 20970 }, { "epoch": 3.7031153472773806, "grad_norm": 1.4235578775405884, "learning_rate": 1.2979699911738746e-05, "loss": 0.3928, "step": 20980 }, { "epoch": 3.7048804165563496, "grad_norm": 1.1226133108139038, "learning_rate": 1.2962047661076787e-05, "loss": 0.4256, "step": 20990 }, { "epoch": 3.706645485835319, "grad_norm": 1.0519105195999146, "learning_rate": 1.2944395410414828e-05, "loss": 0.3961, "step": 21000 
}, { "epoch": 3.706645485835319, "eval_loss": 0.624920129776001, "eval_runtime": 591.415, "eval_samples_per_second": 47.897, "eval_steps_per_second": 2.396, "eval_token_accuracy": 0.0005052137080372214, "step": 21000 }, { "epoch": 3.708410555114288, "grad_norm": 1.1179771423339844, "learning_rate": 1.2926743159752869e-05, "loss": 0.4488, "step": 21010 }, { "epoch": 3.7101756243932575, "grad_norm": 1.0017509460449219, "learning_rate": 1.290909090909091e-05, "loss": 0.3567, "step": 21020 }, { "epoch": 3.7119406936722266, "grad_norm": 3.0131490230560303, "learning_rate": 1.289143865842895e-05, "loss": 0.3817, "step": 21030 }, { "epoch": 3.7137057629511956, "grad_norm": 2.2893624305725098, "learning_rate": 1.287378640776699e-05, "loss": 0.3994, "step": 21040 }, { "epoch": 3.715470832230165, "grad_norm": 3.590670347213745, "learning_rate": 1.2856134157105032e-05, "loss": 0.3815, "step": 21050 }, { "epoch": 3.717235901509134, "grad_norm": 0.7297062873840332, "learning_rate": 1.2838481906443073e-05, "loss": 0.4081, "step": 21060 }, { "epoch": 3.7190009707881035, "grad_norm": 2.541386127471924, "learning_rate": 1.2820829655781114e-05, "loss": 0.3312, "step": 21070 }, { "epoch": 3.7207660400670726, "grad_norm": 2.1308650970458984, "learning_rate": 1.2803177405119155e-05, "loss": 0.4065, "step": 21080 }, { "epoch": 3.7225311093460416, "grad_norm": 1.2172911167144775, "learning_rate": 1.2785525154457195e-05, "loss": 0.3209, "step": 21090 }, { "epoch": 3.724296178625011, "grad_norm": 3.1587109565734863, "learning_rate": 1.2767872903795232e-05, "loss": 0.4147, "step": 21100 }, { "epoch": 3.72606124790398, "grad_norm": 2.742392063140869, "learning_rate": 1.2750220653133274e-05, "loss": 0.3323, "step": 21110 }, { "epoch": 3.7278263171829495, "grad_norm": 1.6285436153411865, "learning_rate": 1.2732568402471315e-05, "loss": 0.3819, "step": 21120 }, { "epoch": 3.7295913864619186, "grad_norm": 0.7405969500541687, "learning_rate": 1.2714916151809356e-05, "loss": 0.4268, "step": 21130 }, { "epoch": 3.7313564557408876, "grad_norm": 0.9725006818771362, "learning_rate": 1.2697263901147397e-05, "loss": 0.3536, "step": 21140 }, { "epoch": 3.733121525019857, "grad_norm": 0.6396371126174927, "learning_rate": 1.2679611650485437e-05, "loss": 0.3708, "step": 21150 }, { "epoch": 3.734886594298826, "grad_norm": 0.7963176369667053, "learning_rate": 1.2661959399823478e-05, "loss": 0.3832, "step": 21160 }, { "epoch": 3.7366516635777955, "grad_norm": 1.2382067441940308, "learning_rate": 1.2644307149161519e-05, "loss": 0.3598, "step": 21170 }, { "epoch": 3.7384167328567646, "grad_norm": 2.2310237884521484, "learning_rate": 1.262665489849956e-05, "loss": 0.4122, "step": 21180 }, { "epoch": 3.7401818021357336, "grad_norm": 2.335681915283203, "learning_rate": 1.2609002647837601e-05, "loss": 0.4517, "step": 21190 }, { "epoch": 3.741946871414703, "grad_norm": 1.0254428386688232, "learning_rate": 1.259135039717564e-05, "loss": 0.404, "step": 21200 }, { "epoch": 3.743711940693672, "grad_norm": 2.330514907836914, "learning_rate": 1.2573698146513682e-05, "loss": 0.4372, "step": 21210 }, { "epoch": 3.7454770099726415, "grad_norm": 1.1486183404922485, "learning_rate": 1.2556045895851723e-05, "loss": 0.4251, "step": 21220 }, { "epoch": 3.7472420792516106, "grad_norm": 1.931495189666748, "learning_rate": 1.253839364518976e-05, "loss": 0.3881, "step": 21230 }, { "epoch": 3.7490071485305796, "grad_norm": 1.995705246925354, "learning_rate": 1.2520741394527802e-05, "loss": 0.38, "step": 21240 }, { "epoch": 3.750772217809549, "grad_norm": 
3.3702809810638428, "learning_rate": 1.2503089143865843e-05, "loss": 0.4274, "step": 21250 }, { "epoch": 3.752537287088518, "grad_norm": 0.8253573179244995, "learning_rate": 1.2485436893203884e-05, "loss": 0.4227, "step": 21260 }, { "epoch": 3.7543023563674875, "grad_norm": 2.0807230472564697, "learning_rate": 1.2467784642541923e-05, "loss": 0.412, "step": 21270 }, { "epoch": 3.7560674256464566, "grad_norm": 2.1444883346557617, "learning_rate": 1.2450132391879965e-05, "loss": 0.3729, "step": 21280 }, { "epoch": 3.7578324949254256, "grad_norm": 0.9703782200813293, "learning_rate": 1.2432480141218006e-05, "loss": 0.4516, "step": 21290 }, { "epoch": 3.759597564204395, "grad_norm": 1.991798758506775, "learning_rate": 1.2414827890556047e-05, "loss": 0.4525, "step": 21300 }, { "epoch": 3.761362633483364, "grad_norm": 1.26823091506958, "learning_rate": 1.2397175639894088e-05, "loss": 0.3618, "step": 21310 }, { "epoch": 3.7631277027623335, "grad_norm": 0.8779565691947937, "learning_rate": 1.2379523389232128e-05, "loss": 0.3955, "step": 21320 }, { "epoch": 3.7648927720413026, "grad_norm": 1.0764127969741821, "learning_rate": 1.2361871138570167e-05, "loss": 0.3765, "step": 21330 }, { "epoch": 3.7666578413202716, "grad_norm": 1.1968106031417847, "learning_rate": 1.2344218887908208e-05, "loss": 0.4095, "step": 21340 }, { "epoch": 3.768422910599241, "grad_norm": 3.0292351245880127, "learning_rate": 1.232656663724625e-05, "loss": 0.4801, "step": 21350 }, { "epoch": 3.77018797987821, "grad_norm": 1.730216145515442, "learning_rate": 1.230891438658429e-05, "loss": 0.3684, "step": 21360 }, { "epoch": 3.7719530491571795, "grad_norm": 4.9076080322265625, "learning_rate": 1.2291262135922332e-05, "loss": 0.3868, "step": 21370 }, { "epoch": 3.7737181184361486, "grad_norm": 1.4453113079071045, "learning_rate": 1.2273609885260371e-05, "loss": 0.3908, "step": 21380 }, { "epoch": 3.7754831877151176, "grad_norm": 2.025132417678833, "learning_rate": 1.2255957634598412e-05, "loss": 0.3748, "step": 21390 }, { "epoch": 3.777248256994087, "grad_norm": 0.9815580248832703, "learning_rate": 1.2238305383936452e-05, "loss": 0.3808, "step": 21400 }, { "epoch": 3.779013326273056, "grad_norm": 4.732287883758545, "learning_rate": 1.2220653133274493e-05, "loss": 0.3854, "step": 21410 }, { "epoch": 3.7807783955520256, "grad_norm": 2.854905605316162, "learning_rate": 1.2203000882612534e-05, "loss": 0.432, "step": 21420 }, { "epoch": 3.7825434648309946, "grad_norm": 2.033327341079712, "learning_rate": 1.2185348631950575e-05, "loss": 0.3847, "step": 21430 }, { "epoch": 3.7843085341099636, "grad_norm": 1.2547252178192139, "learning_rate": 1.2167696381288614e-05, "loss": 0.3925, "step": 21440 }, { "epoch": 3.786073603388933, "grad_norm": 0.7951124310493469, "learning_rate": 1.2150044130626656e-05, "loss": 0.3882, "step": 21450 }, { "epoch": 3.787838672667902, "grad_norm": 1.1173574924468994, "learning_rate": 1.2132391879964695e-05, "loss": 0.3262, "step": 21460 }, { "epoch": 3.7896037419468716, "grad_norm": 1.071058988571167, "learning_rate": 1.2114739629302736e-05, "loss": 0.4359, "step": 21470 }, { "epoch": 3.7913688112258406, "grad_norm": 1.3308414220809937, "learning_rate": 1.2097087378640777e-05, "loss": 0.4015, "step": 21480 }, { "epoch": 3.7931338805048096, "grad_norm": 1.0162357091903687, "learning_rate": 1.2079435127978819e-05, "loss": 0.4003, "step": 21490 }, { "epoch": 3.794898949783779, "grad_norm": 0.7248963713645935, "learning_rate": 1.2061782877316858e-05, "loss": 0.3947, "step": 21500 }, { "epoch": 3.796664019062748, 
"grad_norm": 2.628209114074707, "learning_rate": 1.2044130626654899e-05, "loss": 0.3891, "step": 21510 }, { "epoch": 3.7984290883417176, "grad_norm": 1.0009437799453735, "learning_rate": 1.202647837599294e-05, "loss": 0.4172, "step": 21520 }, { "epoch": 3.8001941576206866, "grad_norm": 1.0610207319259644, "learning_rate": 1.200882612533098e-05, "loss": 0.3344, "step": 21530 }, { "epoch": 3.8019592268996556, "grad_norm": 2.8646228313446045, "learning_rate": 1.1991173874669021e-05, "loss": 0.4041, "step": 21540 }, { "epoch": 3.803724296178625, "grad_norm": 1.103757619857788, "learning_rate": 1.1973521624007062e-05, "loss": 0.4352, "step": 21550 }, { "epoch": 3.805489365457594, "grad_norm": 1.9466677904129028, "learning_rate": 1.1955869373345101e-05, "loss": 0.3928, "step": 21560 }, { "epoch": 3.8072544347365636, "grad_norm": 0.7169159650802612, "learning_rate": 1.1938217122683143e-05, "loss": 0.419, "step": 21570 }, { "epoch": 3.8090195040155326, "grad_norm": 2.1274571418762207, "learning_rate": 1.1920564872021184e-05, "loss": 0.3842, "step": 21580 }, { "epoch": 3.8107845732945016, "grad_norm": 2.4608614444732666, "learning_rate": 1.1902912621359223e-05, "loss": 0.4074, "step": 21590 }, { "epoch": 3.812549642573471, "grad_norm": 0.9707876443862915, "learning_rate": 1.1885260370697264e-05, "loss": 0.3886, "step": 21600 }, { "epoch": 3.81431471185244, "grad_norm": 1.1373486518859863, "learning_rate": 1.1867608120035306e-05, "loss": 0.3951, "step": 21610 }, { "epoch": 3.8160797811314096, "grad_norm": 1.4261958599090576, "learning_rate": 1.1849955869373345e-05, "loss": 0.4412, "step": 21620 }, { "epoch": 3.8178448504103786, "grad_norm": 1.2144572734832764, "learning_rate": 1.1832303618711386e-05, "loss": 0.4004, "step": 21630 }, { "epoch": 3.8196099196893476, "grad_norm": 1.5304597616195679, "learning_rate": 1.1814651368049427e-05, "loss": 0.3798, "step": 21640 }, { "epoch": 3.821374988968317, "grad_norm": 2.4164507389068604, "learning_rate": 1.1796999117387468e-05, "loss": 0.4313, "step": 21650 }, { "epoch": 3.823140058247286, "grad_norm": 0.7153837084770203, "learning_rate": 1.1779346866725508e-05, "loss": 0.3471, "step": 21660 }, { "epoch": 3.8249051275262556, "grad_norm": 0.9994945526123047, "learning_rate": 1.1761694616063549e-05, "loss": 0.4341, "step": 21670 }, { "epoch": 3.8266701968052246, "grad_norm": 2.4096806049346924, "learning_rate": 1.1744042365401588e-05, "loss": 0.432, "step": 21680 }, { "epoch": 3.8284352660841936, "grad_norm": 4.5101141929626465, "learning_rate": 1.172639011473963e-05, "loss": 0.4459, "step": 21690 }, { "epoch": 3.830200335363163, "grad_norm": 2.3713507652282715, "learning_rate": 1.170873786407767e-05, "loss": 0.4236, "step": 21700 }, { "epoch": 3.831965404642132, "grad_norm": 1.1655126810073853, "learning_rate": 1.1691085613415712e-05, "loss": 0.4113, "step": 21710 }, { "epoch": 3.8337304739211016, "grad_norm": 1.9403069019317627, "learning_rate": 1.1673433362753753e-05, "loss": 0.3659, "step": 21720 }, { "epoch": 3.8354955432000706, "grad_norm": 2.1725008487701416, "learning_rate": 1.1655781112091792e-05, "loss": 0.3584, "step": 21730 }, { "epoch": 3.8372606124790396, "grad_norm": 1.0423593521118164, "learning_rate": 1.1638128861429832e-05, "loss": 0.3916, "step": 21740 }, { "epoch": 3.839025681758009, "grad_norm": 0.7833170294761658, "learning_rate": 1.1620476610767873e-05, "loss": 0.3754, "step": 21750 }, { "epoch": 3.840790751036978, "grad_norm": 2.643669843673706, "learning_rate": 1.1602824360105914e-05, "loss": 0.3785, "step": 21760 }, { "epoch": 
3.8425558203159476, "grad_norm": 1.3187824487686157, "learning_rate": 1.1585172109443955e-05, "loss": 0.3943, "step": 21770 }, { "epoch": 3.8443208895949166, "grad_norm": 1.010647177696228, "learning_rate": 1.1567519858781997e-05, "loss": 0.4056, "step": 21780 }, { "epoch": 3.8460859588738856, "grad_norm": 1.4921151399612427, "learning_rate": 1.1549867608120036e-05, "loss": 0.4014, "step": 21790 }, { "epoch": 3.847851028152855, "grad_norm": 1.1991304159164429, "learning_rate": 1.1532215357458075e-05, "loss": 0.3655, "step": 21800 }, { "epoch": 3.849616097431824, "grad_norm": 3.473905324935913, "learning_rate": 1.1514563106796117e-05, "loss": 0.3779, "step": 21810 }, { "epoch": 3.8513811667107936, "grad_norm": 1.5528671741485596, "learning_rate": 1.1496910856134158e-05, "loss": 0.3933, "step": 21820 }, { "epoch": 3.8531462359897626, "grad_norm": 2.279825210571289, "learning_rate": 1.1479258605472199e-05, "loss": 0.4505, "step": 21830 }, { "epoch": 3.8549113052687316, "grad_norm": 1.464953899383545, "learning_rate": 1.146160635481024e-05, "loss": 0.4017, "step": 21840 }, { "epoch": 3.856676374547701, "grad_norm": 3.4239885807037354, "learning_rate": 1.144395410414828e-05, "loss": 0.4011, "step": 21850 }, { "epoch": 3.85844144382667, "grad_norm": 3.552250385284424, "learning_rate": 1.1426301853486319e-05, "loss": 0.3232, "step": 21860 }, { "epoch": 3.8602065131056396, "grad_norm": 3.231083393096924, "learning_rate": 1.140864960282436e-05, "loss": 0.3967, "step": 21870 }, { "epoch": 3.8619715823846086, "grad_norm": 1.8909341096878052, "learning_rate": 1.1390997352162401e-05, "loss": 0.3818, "step": 21880 }, { "epoch": 3.8637366516635776, "grad_norm": 1.1951463222503662, "learning_rate": 1.1373345101500442e-05, "loss": 0.4373, "step": 21890 }, { "epoch": 3.865501720942547, "grad_norm": 2.14921498298645, "learning_rate": 1.1355692850838483e-05, "loss": 0.4407, "step": 21900 }, { "epoch": 3.867266790221516, "grad_norm": 1.4103434085845947, "learning_rate": 1.1338040600176523e-05, "loss": 0.3667, "step": 21910 }, { "epoch": 3.8690318595004856, "grad_norm": 0.9304023385047913, "learning_rate": 1.1320388349514564e-05, "loss": 0.3761, "step": 21920 }, { "epoch": 3.8707969287794546, "grad_norm": 0.8020132184028625, "learning_rate": 1.1302736098852604e-05, "loss": 0.3869, "step": 21930 }, { "epoch": 3.8725619980584236, "grad_norm": 1.112574815750122, "learning_rate": 1.1285083848190645e-05, "loss": 0.3541, "step": 21940 }, { "epoch": 3.874327067337393, "grad_norm": 1.3293650150299072, "learning_rate": 1.1267431597528686e-05, "loss": 0.5064, "step": 21950 }, { "epoch": 3.876092136616362, "grad_norm": 1.3969178199768066, "learning_rate": 1.1249779346866727e-05, "loss": 0.3996, "step": 21960 }, { "epoch": 3.8778572058953316, "grad_norm": 1.1569809913635254, "learning_rate": 1.1232127096204766e-05, "loss": 0.3674, "step": 21970 }, { "epoch": 3.8796222751743006, "grad_norm": 1.0100200176239014, "learning_rate": 1.1214474845542808e-05, "loss": 0.3997, "step": 21980 }, { "epoch": 3.8813873444532696, "grad_norm": 0.8106080293655396, "learning_rate": 1.1196822594880847e-05, "loss": 0.4123, "step": 21990 }, { "epoch": 3.883152413732239, "grad_norm": 1.5351225137710571, "learning_rate": 1.1179170344218888e-05, "loss": 0.3907, "step": 22000 }, { "epoch": 3.883152413732239, "eval_loss": 0.6229148507118225, "eval_runtime": 591.6, "eval_samples_per_second": 47.882, "eval_steps_per_second": 2.395, "eval_token_accuracy": 0.0005044017936811953, "step": 22000 }, { "epoch": 3.884917483011208, "grad_norm": 
1.0947341918945312, "learning_rate": 1.116151809355693e-05, "loss": 0.3824, "step": 22010 }, { "epoch": 3.8866825522901776, "grad_norm": 2.8309361934661865, "learning_rate": 1.114386584289497e-05, "loss": 0.357, "step": 22020 }, { "epoch": 3.8884476215691466, "grad_norm": 3.102479934692383, "learning_rate": 1.112621359223301e-05, "loss": 0.3478, "step": 22030 }, { "epoch": 3.8902126908481156, "grad_norm": 2.9752273559570312, "learning_rate": 1.1108561341571051e-05, "loss": 0.4227, "step": 22040 }, { "epoch": 3.891977760127085, "grad_norm": 1.2526441812515259, "learning_rate": 1.1090909090909092e-05, "loss": 0.4003, "step": 22050 }, { "epoch": 3.893742829406054, "grad_norm": 1.1294667720794678, "learning_rate": 1.1073256840247132e-05, "loss": 0.3236, "step": 22060 }, { "epoch": 3.8955078986850236, "grad_norm": 0.9164405465126038, "learning_rate": 1.1055604589585173e-05, "loss": 0.3619, "step": 22070 }, { "epoch": 3.8972729679639926, "grad_norm": 1.0946732759475708, "learning_rate": 1.1037952338923214e-05, "loss": 0.4133, "step": 22080 }, { "epoch": 3.8990380372429616, "grad_norm": 1.1232048273086548, "learning_rate": 1.1020300088261253e-05, "loss": 0.3772, "step": 22090 }, { "epoch": 3.900803106521931, "grad_norm": 3.4187843799591064, "learning_rate": 1.1002647837599295e-05, "loss": 0.4244, "step": 22100 }, { "epoch": 3.9025681758009, "grad_norm": 2.3030498027801514, "learning_rate": 1.0984995586937336e-05, "loss": 0.3796, "step": 22110 }, { "epoch": 3.9043332450798696, "grad_norm": 1.2743421792984009, "learning_rate": 1.0967343336275375e-05, "loss": 0.3715, "step": 22120 }, { "epoch": 3.9060983143588386, "grad_norm": 1.7408547401428223, "learning_rate": 1.0949691085613416e-05, "loss": 0.3988, "step": 22130 }, { "epoch": 3.9078633836378076, "grad_norm": 3.8301382064819336, "learning_rate": 1.0932038834951456e-05, "loss": 0.3595, "step": 22140 }, { "epoch": 3.909628452916777, "grad_norm": 1.3640156984329224, "learning_rate": 1.0914386584289497e-05, "loss": 0.3977, "step": 22150 }, { "epoch": 3.911393522195746, "grad_norm": 2.915240526199341, "learning_rate": 1.0896734333627538e-05, "loss": 0.3883, "step": 22160 }, { "epoch": 3.9131585914747156, "grad_norm": 1.0516003370285034, "learning_rate": 1.087908208296558e-05, "loss": 0.4486, "step": 22170 }, { "epoch": 3.9149236607536846, "grad_norm": 1.0945969820022583, "learning_rate": 1.086142983230362e-05, "loss": 0.4029, "step": 22180 }, { "epoch": 3.9166887300326536, "grad_norm": 3.0902202129364014, "learning_rate": 1.084377758164166e-05, "loss": 0.448, "step": 22190 }, { "epoch": 3.918453799311623, "grad_norm": 0.8262447118759155, "learning_rate": 1.08261253309797e-05, "loss": 0.3915, "step": 22200 }, { "epoch": 3.920218868590592, "grad_norm": 0.8729456663131714, "learning_rate": 1.080847308031774e-05, "loss": 0.4089, "step": 22210 }, { "epoch": 3.9219839378695616, "grad_norm": 1.9946736097335815, "learning_rate": 1.0790820829655782e-05, "loss": 0.3995, "step": 22220 }, { "epoch": 3.9237490071485306, "grad_norm": 0.9540746808052063, "learning_rate": 1.0773168578993823e-05, "loss": 0.4238, "step": 22230 }, { "epoch": 3.9255140764274996, "grad_norm": 0.8667685985565186, "learning_rate": 1.0755516328331864e-05, "loss": 0.3823, "step": 22240 }, { "epoch": 3.927279145706469, "grad_norm": 1.721529483795166, "learning_rate": 1.0737864077669903e-05, "loss": 0.3752, "step": 22250 }, { "epoch": 3.929044214985438, "grad_norm": 2.2760324478149414, "learning_rate": 1.0720211827007943e-05, "loss": 0.3983, "step": 22260 }, { "epoch": 3.9308092842644076, 
"grad_norm": 3.106400489807129, "learning_rate": 1.0702559576345984e-05, "loss": 0.4276, "step": 22270 }, { "epoch": 3.9325743535433766, "grad_norm": 1.0799741744995117, "learning_rate": 1.0684907325684025e-05, "loss": 0.3941, "step": 22280 }, { "epoch": 3.9343394228223456, "grad_norm": 0.885012149810791, "learning_rate": 1.0667255075022066e-05, "loss": 0.437, "step": 22290 }, { "epoch": 3.936104492101315, "grad_norm": 0.9948944449424744, "learning_rate": 1.0649602824360107e-05, "loss": 0.3712, "step": 22300 }, { "epoch": 3.937869561380284, "grad_norm": 1.2208306789398193, "learning_rate": 1.0631950573698147e-05, "loss": 0.3858, "step": 22310 }, { "epoch": 3.9396346306592536, "grad_norm": 1.3812928199768066, "learning_rate": 1.0614298323036186e-05, "loss": 0.3662, "step": 22320 }, { "epoch": 3.9413996999382226, "grad_norm": 2.223764657974243, "learning_rate": 1.0596646072374227e-05, "loss": 0.3875, "step": 22330 }, { "epoch": 3.9431647692171916, "grad_norm": 1.549903392791748, "learning_rate": 1.0578993821712269e-05, "loss": 0.3764, "step": 22340 }, { "epoch": 3.944929838496161, "grad_norm": 1.3353488445281982, "learning_rate": 1.056134157105031e-05, "loss": 0.4389, "step": 22350 }, { "epoch": 3.94669490777513, "grad_norm": 0.9973052144050598, "learning_rate": 1.054368932038835e-05, "loss": 0.4134, "step": 22360 }, { "epoch": 3.9484599770540996, "grad_norm": 0.7656703591346741, "learning_rate": 1.052603706972639e-05, "loss": 0.3623, "step": 22370 }, { "epoch": 3.9502250463330686, "grad_norm": 1.0713194608688354, "learning_rate": 1.0508384819064431e-05, "loss": 0.3699, "step": 22380 }, { "epoch": 3.9519901156120376, "grad_norm": 0.9375460147857666, "learning_rate": 1.0490732568402471e-05, "loss": 0.3788, "step": 22390 }, { "epoch": 3.953755184891007, "grad_norm": 2.809772253036499, "learning_rate": 1.0473080317740512e-05, "loss": 0.3766, "step": 22400 }, { "epoch": 3.955520254169976, "grad_norm": 1.0046566724777222, "learning_rate": 1.0455428067078553e-05, "loss": 0.3173, "step": 22410 }, { "epoch": 3.9572853234489456, "grad_norm": 3.1235032081604004, "learning_rate": 1.0437775816416594e-05, "loss": 0.3317, "step": 22420 }, { "epoch": 3.9590503927279146, "grad_norm": 2.9782588481903076, "learning_rate": 1.0420123565754634e-05, "loss": 0.368, "step": 22430 }, { "epoch": 3.9608154620068836, "grad_norm": 1.7899539470672607, "learning_rate": 1.0402471315092675e-05, "loss": 0.3335, "step": 22440 }, { "epoch": 3.962580531285853, "grad_norm": 1.4096568822860718, "learning_rate": 1.0384819064430714e-05, "loss": 0.387, "step": 22450 }, { "epoch": 3.964345600564822, "grad_norm": 2.6816868782043457, "learning_rate": 1.0367166813768755e-05, "loss": 0.4641, "step": 22460 }, { "epoch": 3.9661106698437916, "grad_norm": 0.9058939814567566, "learning_rate": 1.0349514563106797e-05, "loss": 0.3344, "step": 22470 }, { "epoch": 3.9678757391227606, "grad_norm": 1.312925934791565, "learning_rate": 1.0331862312444838e-05, "loss": 0.403, "step": 22480 }, { "epoch": 3.9696408084017296, "grad_norm": 2.5122547149658203, "learning_rate": 1.0314210061782877e-05, "loss": 0.3507, "step": 22490 }, { "epoch": 3.971405877680699, "grad_norm": 1.3816343545913696, "learning_rate": 1.0296557811120918e-05, "loss": 0.4329, "step": 22500 }, { "epoch": 3.973170946959668, "grad_norm": 3.181731939315796, "learning_rate": 1.027890556045896e-05, "loss": 0.4151, "step": 22510 }, { "epoch": 3.9749360162386376, "grad_norm": 2.3720247745513916, "learning_rate": 1.0261253309796999e-05, "loss": 0.354, "step": 22520 }, { "epoch": 
3.9767010855176066, "grad_norm": 2.9242501258850098, "learning_rate": 1.024360105913504e-05, "loss": 0.3514, "step": 22530 }, { "epoch": 3.9784661547965756, "grad_norm": 2.076550245285034, "learning_rate": 1.0225948808473081e-05, "loss": 0.3575, "step": 22540 }, { "epoch": 3.980231224075545, "grad_norm": 3.645087957382202, "learning_rate": 1.020829655781112e-05, "loss": 0.4174, "step": 22550 }, { "epoch": 3.981996293354514, "grad_norm": 1.596049427986145, "learning_rate": 1.0190644307149162e-05, "loss": 0.393, "step": 22560 }, { "epoch": 3.9837613626334836, "grad_norm": 1.1321353912353516, "learning_rate": 1.0172992056487203e-05, "loss": 0.4111, "step": 22570 }, { "epoch": 3.9855264319124526, "grad_norm": 1.0473411083221436, "learning_rate": 1.0155339805825244e-05, "loss": 0.3882, "step": 22580 }, { "epoch": 3.9872915011914216, "grad_norm": 0.7440062761306763, "learning_rate": 1.0137687555163284e-05, "loss": 0.3498, "step": 22590 }, { "epoch": 3.989056570470391, "grad_norm": 2.2967731952667236, "learning_rate": 1.0120035304501325e-05, "loss": 0.3788, "step": 22600 }, { "epoch": 3.99082163974936, "grad_norm": 2.151320219039917, "learning_rate": 1.0102383053839364e-05, "loss": 0.3777, "step": 22610 }, { "epoch": 3.9925867090283296, "grad_norm": 1.2454546689987183, "learning_rate": 1.0084730803177405e-05, "loss": 0.332, "step": 22620 }, { "epoch": 3.9943517783072986, "grad_norm": 1.2453608512878418, "learning_rate": 1.0067078552515446e-05, "loss": 0.4272, "step": 22630 }, { "epoch": 3.9961168475862676, "grad_norm": 2.866697072982788, "learning_rate": 1.0049426301853488e-05, "loss": 0.4149, "step": 22640 }, { "epoch": 3.997881916865237, "grad_norm": 0.9703179597854614, "learning_rate": 1.0031774051191527e-05, "loss": 0.4228, "step": 22650 }, { "epoch": 3.999646986144206, "grad_norm": 2.9622578620910645, "learning_rate": 1.0014121800529568e-05, "loss": 0.4309, "step": 22660 }, { "epoch": 4.001412055423176, "grad_norm": 0.811493992805481, "learning_rate": 9.996469549867608e-06, "loss": 0.3302, "step": 22670 }, { "epoch": 4.003177124702145, "grad_norm": 2.814358949661255, "learning_rate": 9.978817299205649e-06, "loss": 0.3583, "step": 22680 }, { "epoch": 4.004942193981114, "grad_norm": 1.2781128883361816, "learning_rate": 9.96116504854369e-06, "loss": 0.2715, "step": 22690 }, { "epoch": 4.006707263260083, "grad_norm": 0.9884144067764282, "learning_rate": 9.943512797881731e-06, "loss": 0.3188, "step": 22700 }, { "epoch": 4.0084723325390526, "grad_norm": 4.0260539054870605, "learning_rate": 9.925860547219772e-06, "loss": 0.3265, "step": 22710 }, { "epoch": 4.010237401818022, "grad_norm": 2.8330905437469482, "learning_rate": 9.908208296557812e-06, "loss": 0.3375, "step": 22720 }, { "epoch": 4.012002471096991, "grad_norm": 1.2538163661956787, "learning_rate": 9.890556045895851e-06, "loss": 0.2851, "step": 22730 }, { "epoch": 4.01376754037596, "grad_norm": 1.3961235284805298, "learning_rate": 9.872903795233892e-06, "loss": 0.2863, "step": 22740 }, { "epoch": 4.015532609654929, "grad_norm": 0.8649784922599792, "learning_rate": 9.855251544571933e-06, "loss": 0.3217, "step": 22750 }, { "epoch": 4.0172976789338986, "grad_norm": 1.030613660812378, "learning_rate": 9.837599293909975e-06, "loss": 0.3168, "step": 22760 }, { "epoch": 4.019062748212868, "grad_norm": 1.1077706813812256, "learning_rate": 9.819947043248016e-06, "loss": 0.3424, "step": 22770 }, { "epoch": 4.020827817491837, "grad_norm": 1.1141341924667358, "learning_rate": 9.802294792586055e-06, "loss": 0.2827, "step": 22780 }, { "epoch": 
4.022592886770806, "grad_norm": 3.150998830795288, "learning_rate": 9.784642541924095e-06, "loss": 0.3143, "step": 22790 }, { "epoch": 4.024357956049775, "grad_norm": 2.8929078578948975, "learning_rate": 9.766990291262136e-06, "loss": 0.3366, "step": 22800 }, { "epoch": 4.0261230253287446, "grad_norm": 0.665348470211029, "learning_rate": 9.749338040600177e-06, "loss": 0.3051, "step": 22810 }, { "epoch": 4.027888094607714, "grad_norm": 1.1569024324417114, "learning_rate": 9.731685789938218e-06, "loss": 0.3493, "step": 22820 }, { "epoch": 4.029653163886683, "grad_norm": 1.1093579530715942, "learning_rate": 9.71403353927626e-06, "loss": 0.287, "step": 22830 }, { "epoch": 4.031418233165652, "grad_norm": 2.9330356121063232, "learning_rate": 9.696381288614299e-06, "loss": 0.3206, "step": 22840 }, { "epoch": 4.033183302444621, "grad_norm": 2.4523935317993164, "learning_rate": 9.678729037952338e-06, "loss": 0.3066, "step": 22850 }, { "epoch": 4.0349483717235906, "grad_norm": 1.1754714250564575, "learning_rate": 9.66107678729038e-06, "loss": 0.2948, "step": 22860 }, { "epoch": 4.03671344100256, "grad_norm": 1.2477717399597168, "learning_rate": 9.64342453662842e-06, "loss": 0.2733, "step": 22870 }, { "epoch": 4.038478510281529, "grad_norm": 2.5531203746795654, "learning_rate": 9.625772285966462e-06, "loss": 0.2801, "step": 22880 }, { "epoch": 4.040243579560498, "grad_norm": 1.2535523176193237, "learning_rate": 9.608120035304503e-06, "loss": 0.3116, "step": 22890 }, { "epoch": 4.042008648839467, "grad_norm": 1.3747566938400269, "learning_rate": 9.590467784642542e-06, "loss": 0.3896, "step": 22900 }, { "epoch": 4.0437737181184366, "grad_norm": 1.3517296314239502, "learning_rate": 9.572815533980583e-06, "loss": 0.3097, "step": 22910 }, { "epoch": 4.045538787397406, "grad_norm": 1.2062978744506836, "learning_rate": 9.555163283318623e-06, "loss": 0.3538, "step": 22920 }, { "epoch": 4.047303856676375, "grad_norm": 1.1064728498458862, "learning_rate": 9.537511032656664e-06, "loss": 0.2828, "step": 22930 }, { "epoch": 4.049068925955344, "grad_norm": 3.050992488861084, "learning_rate": 9.519858781994705e-06, "loss": 0.2996, "step": 22940 }, { "epoch": 4.050833995234313, "grad_norm": 3.6021342277526855, "learning_rate": 9.503971756398941e-06, "loss": 0.3003, "step": 22950 }, { "epoch": 4.052599064513283, "grad_norm": 1.802708625793457, "learning_rate": 9.486319505736982e-06, "loss": 0.2647, "step": 22960 }, { "epoch": 4.054364133792252, "grad_norm": 0.673669159412384, "learning_rate": 9.468667255075023e-06, "loss": 0.3215, "step": 22970 }, { "epoch": 4.056129203071221, "grad_norm": 1.2035069465637207, "learning_rate": 9.451015004413063e-06, "loss": 0.2952, "step": 22980 }, { "epoch": 4.05789427235019, "grad_norm": 2.897989273071289, "learning_rate": 9.433362753751104e-06, "loss": 0.3877, "step": 22990 }, { "epoch": 4.059659341629159, "grad_norm": 1.7467148303985596, "learning_rate": 9.415710503089143e-06, "loss": 0.2597, "step": 23000 }, { "epoch": 4.059659341629159, "eval_loss": 0.648861289024353, "eval_runtime": 591.5301, "eval_samples_per_second": 47.888, "eval_steps_per_second": 2.395, "eval_token_accuracy": 0.0004983124360109999, "step": 23000 }, { "epoch": 4.061424410908129, "grad_norm": 2.3363940715789795, "learning_rate": 9.398058252427185e-06, "loss": 0.3482, "step": 23010 }, { "epoch": 4.063189480187098, "grad_norm": 2.2884440422058105, "learning_rate": 9.380406001765226e-06, "loss": 0.3512, "step": 23020 }, { "epoch": 4.064954549466067, "grad_norm": 1.645193338394165, "learning_rate": 
9.362753751103267e-06, "loss": 0.2675, "step": 23030 }, { "epoch": 4.066719618745036, "grad_norm": 1.3125427961349487, "learning_rate": 9.345101500441306e-06, "loss": 0.2646, "step": 23040 }, { "epoch": 4.068484688024005, "grad_norm": 1.4610532522201538, "learning_rate": 9.327449249779347e-06, "loss": 0.2981, "step": 23050 }, { "epoch": 4.070249757302975, "grad_norm": 1.4041917324066162, "learning_rate": 9.309796999117387e-06, "loss": 0.2779, "step": 23060 }, { "epoch": 4.072014826581944, "grad_norm": 1.26522958278656, "learning_rate": 9.292144748455428e-06, "loss": 0.2745, "step": 23070 }, { "epoch": 4.073779895860913, "grad_norm": 0.81779944896698, "learning_rate": 9.27449249779347e-06, "loss": 0.3133, "step": 23080 }, { "epoch": 4.075544965139882, "grad_norm": 1.1725612878799438, "learning_rate": 9.25684024713151e-06, "loss": 0.3377, "step": 23090 }, { "epoch": 4.077310034418851, "grad_norm": 1.5018218755722046, "learning_rate": 9.23918799646955e-06, "loss": 0.2722, "step": 23100 }, { "epoch": 4.079075103697821, "grad_norm": 2.033559799194336, "learning_rate": 9.221535745807591e-06, "loss": 0.2613, "step": 23110 }, { "epoch": 4.08084017297679, "grad_norm": 1.1398875713348389, "learning_rate": 9.203883495145632e-06, "loss": 0.3586, "step": 23120 }, { "epoch": 4.082605242255759, "grad_norm": 1.7472472190856934, "learning_rate": 9.186231244483672e-06, "loss": 0.3397, "step": 23130 }, { "epoch": 4.084370311534728, "grad_norm": 1.9572392702102661, "learning_rate": 9.168578993821713e-06, "loss": 0.3449, "step": 23140 }, { "epoch": 4.086135380813697, "grad_norm": 2.6116466522216797, "learning_rate": 9.150926743159754e-06, "loss": 0.3446, "step": 23150 }, { "epoch": 4.087900450092667, "grad_norm": 4.654027938842773, "learning_rate": 9.133274492497793e-06, "loss": 0.3033, "step": 23160 }, { "epoch": 4.089665519371636, "grad_norm": 5.693676471710205, "learning_rate": 9.115622241835834e-06, "loss": 0.3421, "step": 23170 }, { "epoch": 4.091430588650605, "grad_norm": 1.7942348718643188, "learning_rate": 9.097969991173876e-06, "loss": 0.319, "step": 23180 }, { "epoch": 4.093195657929574, "grad_norm": 2.1697189807891846, "learning_rate": 9.080317740511917e-06, "loss": 0.3362, "step": 23190 }, { "epoch": 4.094960727208543, "grad_norm": 0.8903745412826538, "learning_rate": 9.062665489849956e-06, "loss": 0.331, "step": 23200 }, { "epoch": 4.096725796487513, "grad_norm": 3.2681972980499268, "learning_rate": 9.045013239187997e-06, "loss": 0.2898, "step": 23210 }, { "epoch": 4.098490865766482, "grad_norm": 1.1167973279953003, "learning_rate": 9.027360988526037e-06, "loss": 0.2927, "step": 23220 }, { "epoch": 4.100255935045451, "grad_norm": 2.5101799964904785, "learning_rate": 9.009708737864078e-06, "loss": 0.3068, "step": 23230 }, { "epoch": 4.10202100432442, "grad_norm": 1.2922667264938354, "learning_rate": 8.992056487202119e-06, "loss": 0.3001, "step": 23240 }, { "epoch": 4.103786073603389, "grad_norm": 5.074741840362549, "learning_rate": 8.97440423654016e-06, "loss": 0.2934, "step": 23250 }, { "epoch": 4.105551142882359, "grad_norm": 2.8039944171905518, "learning_rate": 8.9567519858782e-06, "loss": 0.317, "step": 23260 }, { "epoch": 4.107316212161328, "grad_norm": 1.5906391143798828, "learning_rate": 8.93909973521624e-06, "loss": 0.3094, "step": 23270 }, { "epoch": 4.109081281440297, "grad_norm": 0.6800219416618347, "learning_rate": 8.92144748455428e-06, "loss": 0.2973, "step": 23280 }, { "epoch": 4.110846350719266, "grad_norm": 1.584240198135376, "learning_rate": 8.903795233892321e-06, "loss": 0.3333, 
"step": 23290 }, { "epoch": 4.112611419998235, "grad_norm": 1.1392163038253784, "learning_rate": 8.886142983230363e-06, "loss": 0.3363, "step": 23300 }, { "epoch": 4.114376489277205, "grad_norm": 2.1709890365600586, "learning_rate": 8.868490732568404e-06, "loss": 0.3035, "step": 23310 }, { "epoch": 4.116141558556174, "grad_norm": 2.6064887046813965, "learning_rate": 8.850838481906445e-06, "loss": 0.3309, "step": 23320 }, { "epoch": 4.117906627835143, "grad_norm": 4.0854997634887695, "learning_rate": 8.833186231244484e-06, "loss": 0.2956, "step": 23330 }, { "epoch": 4.119671697114112, "grad_norm": 2.8295369148254395, "learning_rate": 8.815533980582524e-06, "loss": 0.3026, "step": 23340 }, { "epoch": 4.121436766393081, "grad_norm": 1.6397031545639038, "learning_rate": 8.797881729920565e-06, "loss": 0.3333, "step": 23350 }, { "epoch": 4.123201835672051, "grad_norm": 2.4158713817596436, "learning_rate": 8.780229479258606e-06, "loss": 0.3445, "step": 23360 }, { "epoch": 4.12496690495102, "grad_norm": 3.3127028942108154, "learning_rate": 8.762577228596647e-06, "loss": 0.2818, "step": 23370 }, { "epoch": 4.126731974229989, "grad_norm": 1.6492713689804077, "learning_rate": 8.744924977934688e-06, "loss": 0.3062, "step": 23380 }, { "epoch": 4.128497043508958, "grad_norm": 1.1045948266983032, "learning_rate": 8.727272727272728e-06, "loss": 0.3042, "step": 23390 }, { "epoch": 4.130262112787927, "grad_norm": 0.809982419013977, "learning_rate": 8.709620476610767e-06, "loss": 0.304, "step": 23400 }, { "epoch": 4.132027182066896, "grad_norm": 1.7027791738510132, "learning_rate": 8.691968225948808e-06, "loss": 0.3169, "step": 23410 }, { "epoch": 4.133792251345866, "grad_norm": 0.8467365503311157, "learning_rate": 8.67431597528685e-06, "loss": 0.2853, "step": 23420 }, { "epoch": 4.135557320624835, "grad_norm": 1.5310596227645874, "learning_rate": 8.65666372462489e-06, "loss": 0.3149, "step": 23430 }, { "epoch": 4.137322389903804, "grad_norm": 0.9673229455947876, "learning_rate": 8.639011473962932e-06, "loss": 0.3051, "step": 23440 }, { "epoch": 4.139087459182773, "grad_norm": 1.3377794027328491, "learning_rate": 8.621359223300971e-06, "loss": 0.3517, "step": 23450 }, { "epoch": 4.140852528461743, "grad_norm": 1.3316737413406372, "learning_rate": 8.60370697263901e-06, "loss": 0.3095, "step": 23460 }, { "epoch": 4.142617597740712, "grad_norm": 1.3464562892913818, "learning_rate": 8.586054721977052e-06, "loss": 0.3085, "step": 23470 }, { "epoch": 4.144382667019681, "grad_norm": 3.453214645385742, "learning_rate": 8.568402471315093e-06, "loss": 0.3038, "step": 23480 }, { "epoch": 4.14614773629865, "grad_norm": 1.2823230028152466, "learning_rate": 8.550750220653134e-06, "loss": 0.3958, "step": 23490 }, { "epoch": 4.147912805577619, "grad_norm": 1.4263204336166382, "learning_rate": 8.533097969991175e-06, "loss": 0.2977, "step": 23500 }, { "epoch": 4.149677874856588, "grad_norm": 0.9644317626953125, "learning_rate": 8.515445719329215e-06, "loss": 0.3711, "step": 23510 }, { "epoch": 4.151442944135558, "grad_norm": 1.897396206855774, "learning_rate": 8.497793468667256e-06, "loss": 0.2964, "step": 23520 }, { "epoch": 4.153208013414527, "grad_norm": 2.8374288082122803, "learning_rate": 8.480141218005295e-06, "loss": 0.3009, "step": 23530 }, { "epoch": 4.154973082693496, "grad_norm": 0.8667998313903809, "learning_rate": 8.462488967343337e-06, "loss": 0.3245, "step": 23540 }, { "epoch": 4.156738151972465, "grad_norm": 3.246046543121338, "learning_rate": 8.444836716681378e-06, "loss": 0.3408, "step": 23550 }, { "epoch": 
4.158503221251435, "grad_norm": 1.8256837129592896, "learning_rate": 8.427184466019419e-06, "loss": 0.3052, "step": 23560 }, { "epoch": 4.160268290530404, "grad_norm": 0.8874765634536743, "learning_rate": 8.409532215357458e-06, "loss": 0.2911, "step": 23570 }, { "epoch": 4.162033359809373, "grad_norm": 1.2588152885437012, "learning_rate": 8.3918799646955e-06, "loss": 0.2877, "step": 23580 }, { "epoch": 4.163798429088342, "grad_norm": 2.0045018196105957, "learning_rate": 8.374227714033539e-06, "loss": 0.398, "step": 23590 }, { "epoch": 4.165563498367311, "grad_norm": 3.729039430618286, "learning_rate": 8.35657546337158e-06, "loss": 0.305, "step": 23600 }, { "epoch": 4.16732856764628, "grad_norm": 2.881028652191162, "learning_rate": 8.338923212709621e-06, "loss": 0.2958, "step": 23610 }, { "epoch": 4.16909363692525, "grad_norm": 1.8063958883285522, "learning_rate": 8.32127096204766e-06, "loss": 0.3089, "step": 23620 }, { "epoch": 4.170858706204219, "grad_norm": 1.1365619897842407, "learning_rate": 8.303618711385702e-06, "loss": 0.3338, "step": 23630 }, { "epoch": 4.172623775483188, "grad_norm": 1.2051860094070435, "learning_rate": 8.285966460723743e-06, "loss": 0.2829, "step": 23640 }, { "epoch": 4.174388844762157, "grad_norm": 1.248076319694519, "learning_rate": 8.268314210061784e-06, "loss": 0.3573, "step": 23650 }, { "epoch": 4.176153914041127, "grad_norm": 2.788290500640869, "learning_rate": 8.250661959399823e-06, "loss": 0.2695, "step": 23660 }, { "epoch": 4.177918983320096, "grad_norm": 1.5468093156814575, "learning_rate": 8.233009708737865e-06, "loss": 0.3258, "step": 23670 }, { "epoch": 4.179684052599065, "grad_norm": 3.705085039138794, "learning_rate": 8.215357458075904e-06, "loss": 0.2781, "step": 23680 }, { "epoch": 4.181449121878034, "grad_norm": 1.7882062196731567, "learning_rate": 8.197705207413945e-06, "loss": 0.2967, "step": 23690 }, { "epoch": 4.183214191157003, "grad_norm": 3.2124860286712646, "learning_rate": 8.180052956751986e-06, "loss": 0.2557, "step": 23700 }, { "epoch": 4.184979260435972, "grad_norm": 1.0433921813964844, "learning_rate": 8.162400706090028e-06, "loss": 0.3152, "step": 23710 }, { "epoch": 4.186744329714942, "grad_norm": 1.109761357307434, "learning_rate": 8.144748455428069e-06, "loss": 0.3309, "step": 23720 }, { "epoch": 4.188509398993911, "grad_norm": 1.2098308801651, "learning_rate": 8.127096204766108e-06, "loss": 0.2816, "step": 23730 }, { "epoch": 4.19027446827288, "grad_norm": 1.9796377420425415, "learning_rate": 8.109443954104148e-06, "loss": 0.3377, "step": 23740 }, { "epoch": 4.192039537551849, "grad_norm": 2.3625099658966064, "learning_rate": 8.091791703442189e-06, "loss": 0.3028, "step": 23750 }, { "epoch": 4.193804606830818, "grad_norm": 0.8961766958236694, "learning_rate": 8.07413945278023e-06, "loss": 0.2704, "step": 23760 }, { "epoch": 4.195569676109788, "grad_norm": 1.137475609779358, "learning_rate": 8.056487202118271e-06, "loss": 0.3296, "step": 23770 }, { "epoch": 4.197334745388757, "grad_norm": 0.7209318280220032, "learning_rate": 8.038834951456312e-06, "loss": 0.2598, "step": 23780 }, { "epoch": 4.199099814667726, "grad_norm": 1.4462792873382568, "learning_rate": 8.021182700794352e-06, "loss": 0.3049, "step": 23790 }, { "epoch": 4.200864883946695, "grad_norm": 1.377173900604248, "learning_rate": 8.003530450132391e-06, "loss": 0.3041, "step": 23800 }, { "epoch": 4.202629953225664, "grad_norm": 1.8368486166000366, "learning_rate": 7.985878199470432e-06, "loss": 0.2636, "step": 23810 }, { "epoch": 4.204395022504634, "grad_norm": 
1.310995101928711, "learning_rate": 7.968225948808473e-06, "loss": 0.2845, "step": 23820 }, { "epoch": 4.206160091783603, "grad_norm": 1.4313207864761353, "learning_rate": 7.950573698146515e-06, "loss": 0.3365, "step": 23830 }, { "epoch": 4.207925161062572, "grad_norm": 2.2771763801574707, "learning_rate": 7.932921447484556e-06, "loss": 0.3221, "step": 23840 }, { "epoch": 4.209690230341541, "grad_norm": 0.9911360740661621, "learning_rate": 7.915269196822595e-06, "loss": 0.3255, "step": 23850 }, { "epoch": 4.21145529962051, "grad_norm": 1.0318197011947632, "learning_rate": 7.897616946160635e-06, "loss": 0.2863, "step": 23860 }, { "epoch": 4.21322036889948, "grad_norm": 1.4511429071426392, "learning_rate": 7.879964695498676e-06, "loss": 0.2939, "step": 23870 }, { "epoch": 4.214985438178449, "grad_norm": 0.9572875499725342, "learning_rate": 7.862312444836717e-06, "loss": 0.3005, "step": 23880 }, { "epoch": 4.216750507457418, "grad_norm": 2.9129538536071777, "learning_rate": 7.844660194174758e-06, "loss": 0.2933, "step": 23890 }, { "epoch": 4.218515576736387, "grad_norm": 1.1331290006637573, "learning_rate": 7.827007943512799e-06, "loss": 0.2633, "step": 23900 }, { "epoch": 4.220280646015356, "grad_norm": 0.7028851509094238, "learning_rate": 7.809355692850839e-06, "loss": 0.3477, "step": 23910 }, { "epoch": 4.222045715294326, "grad_norm": 1.0875290632247925, "learning_rate": 7.791703442188878e-06, "loss": 0.3177, "step": 23920 }, { "epoch": 4.223810784573295, "grad_norm": 2.365562677383423, "learning_rate": 7.77405119152692e-06, "loss": 0.3045, "step": 23930 }, { "epoch": 4.225575853852264, "grad_norm": 1.9424879550933838, "learning_rate": 7.75639894086496e-06, "loss": 0.2623, "step": 23940 }, { "epoch": 4.227340923131233, "grad_norm": 2.014070510864258, "learning_rate": 7.738746690203001e-06, "loss": 0.2916, "step": 23950 }, { "epoch": 4.229105992410202, "grad_norm": 2.267733573913574, "learning_rate": 7.721094439541043e-06, "loss": 0.2807, "step": 23960 }, { "epoch": 4.230871061689172, "grad_norm": 2.6851611137390137, "learning_rate": 7.703442188879082e-06, "loss": 0.3615, "step": 23970 }, { "epoch": 4.232636130968141, "grad_norm": 0.8334643244743347, "learning_rate": 7.685789938217123e-06, "loss": 0.2835, "step": 23980 }, { "epoch": 4.23440120024711, "grad_norm": 1.605870008468628, "learning_rate": 7.668137687555163e-06, "loss": 0.3236, "step": 23990 }, { "epoch": 4.236166269526079, "grad_norm": 1.1965278387069702, "learning_rate": 7.650485436893204e-06, "loss": 0.3069, "step": 24000 }, { "epoch": 4.236166269526079, "eval_loss": 0.6535650491714478, "eval_runtime": 591.7187, "eval_samples_per_second": 47.872, "eval_steps_per_second": 2.395, "eval_token_accuracy": 0.000500342221901065, "step": 24000 }, { "epoch": 4.237931338805048, "grad_norm": 2.919116973876953, "learning_rate": 7.632833186231245e-06, "loss": 0.3474, "step": 24010 }, { "epoch": 4.239696408084018, "grad_norm": 2.139883041381836, "learning_rate": 7.615180935569285e-06, "loss": 0.2733, "step": 24020 }, { "epoch": 4.241461477362987, "grad_norm": 0.7100754380226135, "learning_rate": 7.597528684907326e-06, "loss": 0.2755, "step": 24030 }, { "epoch": 4.243226546641956, "grad_norm": 1.0110312700271606, "learning_rate": 7.579876434245367e-06, "loss": 0.2658, "step": 24040 }, { "epoch": 4.244991615920925, "grad_norm": 1.016863465309143, "learning_rate": 7.562224183583408e-06, "loss": 0.2966, "step": 24050 }, { "epoch": 4.246756685199894, "grad_norm": 2.464367389678955, "learning_rate": 7.544571932921447e-06, "loss": 0.334, "step": 
24060 }, { "epoch": 4.248521754478864, "grad_norm": 4.494397163391113, "learning_rate": 7.5269196822594885e-06, "loss": 0.2397, "step": 24070 }, { "epoch": 4.250286823757833, "grad_norm": 2.648230791091919, "learning_rate": 7.509267431597529e-06, "loss": 0.3144, "step": 24080 }, { "epoch": 4.252051893036802, "grad_norm": 3.175685405731201, "learning_rate": 7.49161518093557e-06, "loss": 0.2716, "step": 24090 }, { "epoch": 4.253816962315771, "grad_norm": 2.834336757659912, "learning_rate": 7.47396293027361e-06, "loss": 0.2755, "step": 24100 }, { "epoch": 4.25558203159474, "grad_norm": 2.6625614166259766, "learning_rate": 7.456310679611651e-06, "loss": 0.367, "step": 24110 }, { "epoch": 4.25734710087371, "grad_norm": 0.9785534143447876, "learning_rate": 7.438658428949691e-06, "loss": 0.292, "step": 24120 }, { "epoch": 4.259112170152679, "grad_norm": 1.2775884866714478, "learning_rate": 7.421006178287732e-06, "loss": 0.3023, "step": 24130 }, { "epoch": 4.260877239431648, "grad_norm": 2.735036611557007, "learning_rate": 7.403353927625772e-06, "loss": 0.3281, "step": 24140 }, { "epoch": 4.262642308710617, "grad_norm": 1.1893835067749023, "learning_rate": 7.385701676963813e-06, "loss": 0.2853, "step": 24150 }, { "epoch": 4.264407377989586, "grad_norm": 1.292029857635498, "learning_rate": 7.368049426301854e-06, "loss": 0.2648, "step": 24160 }, { "epoch": 4.266172447268556, "grad_norm": 2.2875688076019287, "learning_rate": 7.350397175639895e-06, "loss": 0.3576, "step": 24170 }, { "epoch": 4.267937516547525, "grad_norm": 0.7808769345283508, "learning_rate": 7.332744924977936e-06, "loss": 0.2908, "step": 24180 }, { "epoch": 4.269702585826494, "grad_norm": 2.999920606613159, "learning_rate": 7.3150926743159754e-06, "loss": 0.327, "step": 24190 }, { "epoch": 4.271467655105463, "grad_norm": 2.7963919639587402, "learning_rate": 7.297440423654016e-06, "loss": 0.3061, "step": 24200 }, { "epoch": 4.273232724384432, "grad_norm": 1.0172919034957886, "learning_rate": 7.279788172992057e-06, "loss": 0.2894, "step": 24210 }, { "epoch": 4.274997793663402, "grad_norm": 1.3990018367767334, "learning_rate": 7.262135922330097e-06, "loss": 0.2977, "step": 24220 }, { "epoch": 4.276762862942371, "grad_norm": 1.0484473705291748, "learning_rate": 7.244483671668138e-06, "loss": 0.2686, "step": 24230 }, { "epoch": 4.27852793222134, "grad_norm": 3.1869072914123535, "learning_rate": 7.2268314210061795e-06, "loss": 0.3518, "step": 24240 }, { "epoch": 4.280293001500309, "grad_norm": 3.1976029872894287, "learning_rate": 7.20917917034422e-06, "loss": 0.346, "step": 24250 }, { "epoch": 4.282058070779278, "grad_norm": 1.2420626878738403, "learning_rate": 7.191526919682259e-06, "loss": 0.3339, "step": 24260 }, { "epoch": 4.283823140058248, "grad_norm": 1.1138620376586914, "learning_rate": 7.1738746690203e-06, "loss": 0.3505, "step": 24270 }, { "epoch": 4.285588209337217, "grad_norm": 0.9700965285301208, "learning_rate": 7.156222418358341e-06, "loss": 0.3217, "step": 24280 }, { "epoch": 4.287353278616186, "grad_norm": 0.8993445634841919, "learning_rate": 7.138570167696382e-06, "loss": 0.2542, "step": 24290 }, { "epoch": 4.289118347895155, "grad_norm": 3.1148288249969482, "learning_rate": 7.120917917034423e-06, "loss": 0.3403, "step": 24300 }, { "epoch": 4.290883417174124, "grad_norm": 1.1724573373794556, "learning_rate": 7.103265666372463e-06, "loss": 0.3562, "step": 24310 }, { "epoch": 4.292648486453094, "grad_norm": 3.517512083053589, "learning_rate": 7.085613415710503e-06, "loss": 0.3412, "step": 24320 }, { "epoch": 
4.294413555732063, "grad_norm": 1.830450177192688, "learning_rate": 7.067961165048544e-06, "loss": 0.2899, "step": 24330 }, { "epoch": 4.296178625011032, "grad_norm": 2.5125741958618164, "learning_rate": 7.050308914386584e-06, "loss": 0.3349, "step": 24340 }, { "epoch": 4.297943694290001, "grad_norm": 3.087559223175049, "learning_rate": 7.032656663724625e-06, "loss": 0.3021, "step": 24350 }, { "epoch": 4.29970876356897, "grad_norm": 1.0679352283477783, "learning_rate": 7.0150044130626664e-06, "loss": 0.3215, "step": 24360 }, { "epoch": 4.30147383284794, "grad_norm": 0.7437068223953247, "learning_rate": 6.997352162400707e-06, "loss": 0.322, "step": 24370 }, { "epoch": 4.303238902126909, "grad_norm": 1.2267742156982422, "learning_rate": 6.979699911738748e-06, "loss": 0.3966, "step": 24380 }, { "epoch": 4.305003971405878, "grad_norm": 0.9550343155860901, "learning_rate": 6.962047661076787e-06, "loss": 0.3122, "step": 24390 }, { "epoch": 4.306769040684847, "grad_norm": 1.301531195640564, "learning_rate": 6.944395410414828e-06, "loss": 0.2963, "step": 24400 }, { "epoch": 4.308534109963816, "grad_norm": 1.4933806657791138, "learning_rate": 6.926743159752869e-06, "loss": 0.2739, "step": 24410 }, { "epoch": 4.310299179242786, "grad_norm": 0.6646918654441833, "learning_rate": 6.909090909090909e-06, "loss": 0.2877, "step": 24420 }, { "epoch": 4.312064248521755, "grad_norm": 1.2793468236923218, "learning_rate": 6.89143865842895e-06, "loss": 0.2802, "step": 24430 }, { "epoch": 4.313829317800724, "grad_norm": 2.5267844200134277, "learning_rate": 6.873786407766991e-06, "loss": 0.2813, "step": 24440 }, { "epoch": 4.315594387079693, "grad_norm": 0.9937548637390137, "learning_rate": 6.856134157105031e-06, "loss": 0.2761, "step": 24450 }, { "epoch": 4.317359456358662, "grad_norm": 0.7776408791542053, "learning_rate": 6.838481906443071e-06, "loss": 0.3156, "step": 24460 }, { "epoch": 4.319124525637632, "grad_norm": 3.936626672744751, "learning_rate": 6.820829655781112e-06, "loss": 0.29, "step": 24470 }, { "epoch": 4.320889594916601, "grad_norm": 3.12445330619812, "learning_rate": 6.8031774051191526e-06, "loss": 0.3339, "step": 24480 }, { "epoch": 4.32265466419557, "grad_norm": 2.77976131439209, "learning_rate": 6.785525154457194e-06, "loss": 0.2989, "step": 24490 }, { "epoch": 4.324419733474539, "grad_norm": 1.1880944967269897, "learning_rate": 6.767872903795235e-06, "loss": 0.2955, "step": 24500 }, { "epoch": 4.326184802753508, "grad_norm": 1.6699707508087158, "learning_rate": 6.750220653133275e-06, "loss": 0.2068, "step": 24510 }, { "epoch": 4.327949872032478, "grad_norm": 4.329336166381836, "learning_rate": 6.732568402471315e-06, "loss": 0.3533, "step": 24520 }, { "epoch": 4.329714941311447, "grad_norm": 0.8252127766609192, "learning_rate": 6.714916151809356e-06, "loss": 0.292, "step": 24530 }, { "epoch": 4.331480010590416, "grad_norm": 4.663092613220215, "learning_rate": 6.697263901147396e-06, "loss": 0.3377, "step": 24540 }, { "epoch": 4.333245079869385, "grad_norm": 1.191158413887024, "learning_rate": 6.679611650485437e-06, "loss": 0.2992, "step": 24550 }, { "epoch": 4.335010149148354, "grad_norm": 0.9868506193161011, "learning_rate": 6.661959399823478e-06, "loss": 0.3129, "step": 24560 }, { "epoch": 4.336775218427324, "grad_norm": 1.1885898113250732, "learning_rate": 6.644307149161519e-06, "loss": 0.3082, "step": 24570 }, { "epoch": 4.338540287706293, "grad_norm": 1.1443339586257935, "learning_rate": 6.62665489849956e-06, "loss": 0.2772, "step": 24580 }, { "epoch": 4.340305356985262, "grad_norm": 
1.6731210947036743, "learning_rate": 6.609002647837599e-06, "loss": 0.2865, "step": 24590 }, { "epoch": 4.342070426264231, "grad_norm": 2.7773597240448, "learning_rate": 6.5913503971756395e-06, "loss": 0.3296, "step": 24600 }, { "epoch": 4.3438354955432, "grad_norm": 1.090955138206482, "learning_rate": 6.573698146513681e-06, "loss": 0.2642, "step": 24610 }, { "epoch": 4.34560056482217, "grad_norm": 2.6028826236724854, "learning_rate": 6.556045895851722e-06, "loss": 0.2895, "step": 24620 }, { "epoch": 4.347365634101139, "grad_norm": 0.9110934734344482, "learning_rate": 6.538393645189762e-06, "loss": 0.3021, "step": 24630 }, { "epoch": 4.349130703380108, "grad_norm": 3.2843897342681885, "learning_rate": 6.520741394527803e-06, "loss": 0.2826, "step": 24640 }, { "epoch": 4.350895772659077, "grad_norm": 3.9276788234710693, "learning_rate": 6.503089143865843e-06, "loss": 0.2596, "step": 24650 }, { "epoch": 4.352660841938046, "grad_norm": 1.0411996841430664, "learning_rate": 6.485436893203883e-06, "loss": 0.2316, "step": 24660 }, { "epoch": 4.354425911217016, "grad_norm": 1.1377794742584229, "learning_rate": 6.467784642541924e-06, "loss": 0.308, "step": 24670 }, { "epoch": 4.356190980495985, "grad_norm": 1.1812376976013184, "learning_rate": 6.450132391879965e-06, "loss": 0.3091, "step": 24680 }, { "epoch": 4.357956049774954, "grad_norm": 1.2442140579223633, "learning_rate": 6.432480141218006e-06, "loss": 0.3041, "step": 24690 }, { "epoch": 4.359721119053923, "grad_norm": 1.8300071954727173, "learning_rate": 6.414827890556047e-06, "loss": 0.3012, "step": 24700 }, { "epoch": 4.361486188332892, "grad_norm": 2.2776644229888916, "learning_rate": 6.397175639894087e-06, "loss": 0.3097, "step": 24710 }, { "epoch": 4.363251257611862, "grad_norm": 1.6646955013275146, "learning_rate": 6.3795233892321265e-06, "loss": 0.2554, "step": 24720 }, { "epoch": 4.365016326890831, "grad_norm": 0.6946931481361389, "learning_rate": 6.361871138570168e-06, "loss": 0.2638, "step": 24730 }, { "epoch": 4.3667813961698, "grad_norm": 3.5868146419525146, "learning_rate": 6.344218887908209e-06, "loss": 0.2959, "step": 24740 }, { "epoch": 4.368546465448769, "grad_norm": 0.9338198304176331, "learning_rate": 6.326566637246249e-06, "loss": 0.2671, "step": 24750 }, { "epoch": 4.370311534727738, "grad_norm": 3.109726667404175, "learning_rate": 6.30891438658429e-06, "loss": 0.2825, "step": 24760 }, { "epoch": 4.372076604006708, "grad_norm": 2.315507411956787, "learning_rate": 6.2912621359223306e-06, "loss": 0.3094, "step": 24770 }, { "epoch": 4.373841673285677, "grad_norm": 3.1632182598114014, "learning_rate": 6.273609885260372e-06, "loss": 0.2725, "step": 24780 }, { "epoch": 4.375606742564646, "grad_norm": 1.8601646423339844, "learning_rate": 6.255957634598411e-06, "loss": 0.289, "step": 24790 }, { "epoch": 4.377371811843615, "grad_norm": 2.712733507156372, "learning_rate": 6.238305383936452e-06, "loss": 0.3093, "step": 24800 }, { "epoch": 4.379136881122584, "grad_norm": 3.8398969173431396, "learning_rate": 6.220653133274493e-06, "loss": 0.3063, "step": 24810 }, { "epoch": 4.380901950401554, "grad_norm": 0.9455320835113525, "learning_rate": 6.203000882612534e-06, "loss": 0.3086, "step": 24820 }, { "epoch": 4.382667019680523, "grad_norm": 2.375248908996582, "learning_rate": 6.185348631950574e-06, "loss": 0.2885, "step": 24830 }, { "epoch": 4.384432088959492, "grad_norm": 0.8944133520126343, "learning_rate": 6.167696381288614e-06, "loss": 0.2758, "step": 24840 }, { "epoch": 4.386197158238461, "grad_norm": 1.490126609802246, 
"learning_rate": 6.1500441306266555e-06, "loss": 0.2911, "step": 24850 }, { "epoch": 4.38796222751743, "grad_norm": 1.6396925449371338, "learning_rate": 6.132391879964696e-06, "loss": 0.2867, "step": 24860 }, { "epoch": 4.3897272967964, "grad_norm": 2.53312087059021, "learning_rate": 6.114739629302736e-06, "loss": 0.2978, "step": 24870 }, { "epoch": 4.391492366075369, "grad_norm": 2.5027709007263184, "learning_rate": 6.097087378640777e-06, "loss": 0.3359, "step": 24880 }, { "epoch": 4.393257435354338, "grad_norm": 1.1508357524871826, "learning_rate": 6.0794351279788175e-06, "loss": 0.2817, "step": 24890 }, { "epoch": 4.395022504633307, "grad_norm": 1.068000316619873, "learning_rate": 6.061782877316858e-06, "loss": 0.3096, "step": 24900 }, { "epoch": 4.396787573912276, "grad_norm": 0.8342865109443665, "learning_rate": 6.044130626654899e-06, "loss": 0.2927, "step": 24910 }, { "epoch": 4.398552643191246, "grad_norm": 1.2853881120681763, "learning_rate": 6.026478375992939e-06, "loss": 0.2963, "step": 24920 }, { "epoch": 4.400317712470215, "grad_norm": 1.2073771953582764, "learning_rate": 6.00882612533098e-06, "loss": 0.2856, "step": 24930 }, { "epoch": 4.402082781749184, "grad_norm": 3.3324711322784424, "learning_rate": 5.991173874669021e-06, "loss": 0.3403, "step": 24940 }, { "epoch": 4.403847851028153, "grad_norm": 3.556697130203247, "learning_rate": 5.973521624007061e-06, "loss": 0.2825, "step": 24950 }, { "epoch": 4.405612920307122, "grad_norm": 2.4177629947662354, "learning_rate": 5.955869373345102e-06, "loss": 0.3349, "step": 24960 }, { "epoch": 4.407377989586092, "grad_norm": 2.2014477252960205, "learning_rate": 5.9382171226831425e-06, "loss": 0.2541, "step": 24970 }, { "epoch": 4.409143058865061, "grad_norm": 1.251031517982483, "learning_rate": 5.920564872021183e-06, "loss": 0.3314, "step": 24980 }, { "epoch": 4.41090812814403, "grad_norm": 2.201280117034912, "learning_rate": 5.902912621359224e-06, "loss": 0.2466, "step": 24990 }, { "epoch": 4.412673197422999, "grad_norm": 1.493913173675537, "learning_rate": 5.885260370697264e-06, "loss": 0.3005, "step": 25000 }, { "epoch": 4.412673197422999, "eval_loss": 0.655875027179718, "eval_runtime": 591.548, "eval_samples_per_second": 47.886, "eval_steps_per_second": 2.395, "eval_token_accuracy": 0.0004981094574219933, "step": 25000 }, { "epoch": 4.414438266701968, "grad_norm": 1.3809815645217896, "learning_rate": 5.8676081200353045e-06, "loss": 0.2812, "step": 25010 }, { "epoch": 4.416203335980938, "grad_norm": 2.053154468536377, "learning_rate": 5.849955869373346e-06, "loss": 0.3088, "step": 25020 }, { "epoch": 4.417968405259907, "grad_norm": 1.3587795495986938, "learning_rate": 5.832303618711386e-06, "loss": 0.2717, "step": 25030 }, { "epoch": 4.419733474538876, "grad_norm": 2.7400026321411133, "learning_rate": 5.814651368049426e-06, "loss": 0.3474, "step": 25040 }, { "epoch": 4.421498543817845, "grad_norm": 0.9677148461341858, "learning_rate": 5.796999117387467e-06, "loss": 0.2745, "step": 25050 }, { "epoch": 4.423263613096814, "grad_norm": 2.704630136489868, "learning_rate": 5.779346866725508e-06, "loss": 0.3524, "step": 25060 }, { "epoch": 4.425028682375784, "grad_norm": 2.6748154163360596, "learning_rate": 5.761694616063548e-06, "loss": 0.2928, "step": 25070 }, { "epoch": 4.426793751654753, "grad_norm": 1.1494990587234497, "learning_rate": 5.744042365401589e-06, "loss": 0.2947, "step": 25080 }, { "epoch": 4.428558820933722, "grad_norm": 1.6972780227661133, "learning_rate": 5.7263901147396294e-06, "loss": 0.3203, "step": 25090 }, { 
"epoch": 4.430323890212691, "grad_norm": 0.7590330243110657, "learning_rate": 5.70873786407767e-06, "loss": 0.3069, "step": 25100 }, { "epoch": 4.43208895949166, "grad_norm": 2.139500617980957, "learning_rate": 5.691085613415711e-06, "loss": 0.3059, "step": 25110 }, { "epoch": 4.43385402877063, "grad_norm": 2.8314902782440186, "learning_rate": 5.673433362753751e-06, "loss": 0.3206, "step": 25120 }, { "epoch": 4.435619098049599, "grad_norm": 2.3796584606170654, "learning_rate": 5.6557811120917915e-06, "loss": 0.2859, "step": 25130 }, { "epoch": 4.437384167328568, "grad_norm": 1.3357577323913574, "learning_rate": 5.638128861429833e-06, "loss": 0.291, "step": 25140 }, { "epoch": 4.439149236607537, "grad_norm": 3.7719616889953613, "learning_rate": 5.620476610767873e-06, "loss": 0.328, "step": 25150 }, { "epoch": 4.440914305886506, "grad_norm": 3.9590110778808594, "learning_rate": 5.602824360105914e-06, "loss": 0.3063, "step": 25160 }, { "epoch": 4.442679375165476, "grad_norm": 0.9310747385025024, "learning_rate": 5.585172109443954e-06, "loss": 0.2763, "step": 25170 }, { "epoch": 4.444444444444445, "grad_norm": 1.384122610092163, "learning_rate": 5.567519858781995e-06, "loss": 0.3029, "step": 25180 }, { "epoch": 4.446209513723414, "grad_norm": 1.3160362243652344, "learning_rate": 5.549867608120036e-06, "loss": 0.3112, "step": 25190 }, { "epoch": 4.447974583002383, "grad_norm": 1.24907386302948, "learning_rate": 5.532215357458076e-06, "loss": 0.2652, "step": 25200 }, { "epoch": 4.449739652281352, "grad_norm": 2.1747610569000244, "learning_rate": 5.514563106796116e-06, "loss": 0.3258, "step": 25210 }, { "epoch": 4.451504721560322, "grad_norm": 0.5644562244415283, "learning_rate": 5.4969108561341576e-06, "loss": 0.2719, "step": 25220 }, { "epoch": 4.453269790839291, "grad_norm": 0.9033040404319763, "learning_rate": 5.479258605472198e-06, "loss": 0.3326, "step": 25230 }, { "epoch": 4.45503486011826, "grad_norm": 0.9956077933311462, "learning_rate": 5.461606354810238e-06, "loss": 0.3305, "step": 25240 }, { "epoch": 4.456799929397229, "grad_norm": 1.1431922912597656, "learning_rate": 5.443954104148279e-06, "loss": 0.3509, "step": 25250 }, { "epoch": 4.458564998676198, "grad_norm": 1.020296573638916, "learning_rate": 5.42630185348632e-06, "loss": 0.2813, "step": 25260 }, { "epoch": 4.460330067955168, "grad_norm": 1.0820090770721436, "learning_rate": 5.40864960282436e-06, "loss": 0.2997, "step": 25270 }, { "epoch": 4.462095137234137, "grad_norm": 1.5949653387069702, "learning_rate": 5.390997352162401e-06, "loss": 0.2982, "step": 25280 }, { "epoch": 4.463860206513106, "grad_norm": 1.358872890472412, "learning_rate": 5.373345101500442e-06, "loss": 0.3245, "step": 25290 }, { "epoch": 4.465625275792075, "grad_norm": 1.1218355894088745, "learning_rate": 5.355692850838482e-06, "loss": 0.3468, "step": 25300 }, { "epoch": 4.467390345071044, "grad_norm": 3.1862800121307373, "learning_rate": 5.338040600176523e-06, "loss": 0.3242, "step": 25310 }, { "epoch": 4.469155414350014, "grad_norm": 0.8951514363288879, "learning_rate": 5.320388349514564e-06, "loss": 0.2978, "step": 25320 }, { "epoch": 4.470920483628983, "grad_norm": 2.006993293762207, "learning_rate": 5.302736098852603e-06, "loss": 0.2933, "step": 25330 }, { "epoch": 4.472685552907952, "grad_norm": 4.0217719078063965, "learning_rate": 5.2850838481906445e-06, "loss": 0.3422, "step": 25340 }, { "epoch": 4.474450622186921, "grad_norm": 1.2529191970825195, "learning_rate": 5.267431597528686e-06, "loss": 0.3059, "step": 25350 }, { "epoch": 4.47621569146589, 
"grad_norm": 3.027233362197876, "learning_rate": 5.249779346866725e-06, "loss": 0.3498, "step": 25360 }, { "epoch": 4.47798076074486, "grad_norm": 1.0020170211791992, "learning_rate": 5.232127096204766e-06, "loss": 0.273, "step": 25370 }, { "epoch": 4.479745830023829, "grad_norm": 1.1415488719940186, "learning_rate": 5.2144748455428074e-06, "loss": 0.3093, "step": 25380 }, { "epoch": 4.481510899302798, "grad_norm": 0.8501665592193604, "learning_rate": 5.196822594880848e-06, "loss": 0.2857, "step": 25390 }, { "epoch": 4.483275968581767, "grad_norm": 0.9540072083473206, "learning_rate": 5.179170344218888e-06, "loss": 0.2998, "step": 25400 }, { "epoch": 4.485041037860736, "grad_norm": 4.062297344207764, "learning_rate": 5.161518093556929e-06, "loss": 0.3466, "step": 25410 }, { "epoch": 4.486806107139706, "grad_norm": 2.4073777198791504, "learning_rate": 5.1438658428949695e-06, "loss": 0.3052, "step": 25420 }, { "epoch": 4.488571176418675, "grad_norm": 2.450510025024414, "learning_rate": 5.12621359223301e-06, "loss": 0.3023, "step": 25430 }, { "epoch": 4.490336245697644, "grad_norm": 5.3954315185546875, "learning_rate": 5.108561341571051e-06, "loss": 0.3043, "step": 25440 }, { "epoch": 4.492101314976613, "grad_norm": 1.8781121969223022, "learning_rate": 5.090909090909091e-06, "loss": 0.2563, "step": 25450 }, { "epoch": 4.493866384255582, "grad_norm": 2.038400173187256, "learning_rate": 5.0732568402471315e-06, "loss": 0.3205, "step": 25460 }, { "epoch": 4.495631453534552, "grad_norm": 2.6978981494903564, "learning_rate": 5.055604589585173e-06, "loss": 0.2934, "step": 25470 }, { "epoch": 4.497396522813521, "grad_norm": 1.041235089302063, "learning_rate": 5.037952338923213e-06, "loss": 0.3096, "step": 25480 }, { "epoch": 4.49916159209249, "grad_norm": 1.190179467201233, "learning_rate": 5.020300088261254e-06, "loss": 0.2895, "step": 25490 }, { "epoch": 4.500926661371459, "grad_norm": 0.9067394733428955, "learning_rate": 5.0026478375992936e-06, "loss": 0.3435, "step": 25500 }, { "epoch": 4.502691730650428, "grad_norm": 2.3193790912628174, "learning_rate": 4.984995586937335e-06, "loss": 0.3417, "step": 25510 }, { "epoch": 4.504456799929397, "grad_norm": 0.9801955819129944, "learning_rate": 4.967343336275376e-06, "loss": 0.3055, "step": 25520 }, { "epoch": 4.506221869208367, "grad_norm": 5.301249027252197, "learning_rate": 4.949691085613415e-06, "loss": 0.2871, "step": 25530 }, { "epoch": 4.507986938487336, "grad_norm": 2.5554471015930176, "learning_rate": 4.9320388349514564e-06, "loss": 0.3287, "step": 25540 }, { "epoch": 4.509752007766305, "grad_norm": 0.8739365935325623, "learning_rate": 4.914386584289498e-06, "loss": 0.2924, "step": 25550 }, { "epoch": 4.511517077045274, "grad_norm": 2.3037023544311523, "learning_rate": 4.896734333627537e-06, "loss": 0.2868, "step": 25560 }, { "epoch": 4.513282146324244, "grad_norm": 2.186363697052002, "learning_rate": 4.879082082965578e-06, "loss": 0.2893, "step": 25570 }, { "epoch": 4.515047215603213, "grad_norm": 2.8541760444641113, "learning_rate": 4.861429832303619e-06, "loss": 0.3106, "step": 25580 }, { "epoch": 4.516812284882182, "grad_norm": 4.5733160972595215, "learning_rate": 4.84377758164166e-06, "loss": 0.3346, "step": 25590 }, { "epoch": 4.518577354161151, "grad_norm": 0.9147126078605652, "learning_rate": 4.8261253309797e-06, "loss": 0.2274, "step": 25600 }, { "epoch": 4.52034242344012, "grad_norm": 1.2009501457214355, "learning_rate": 4.808473080317741e-06, "loss": 0.317, "step": 25610 }, { "epoch": 4.522107492719089, "grad_norm": 
1.2615430355072021, "learning_rate": 4.790820829655781e-06, "loss": 0.2832, "step": 25620 }, { "epoch": 4.523872561998059, "grad_norm": 2.852447032928467, "learning_rate": 4.773168578993822e-06, "loss": 0.3147, "step": 25630 }, { "epoch": 4.525637631277028, "grad_norm": 0.9824477434158325, "learning_rate": 4.755516328331863e-06, "loss": 0.2997, "step": 25640 }, { "epoch": 4.527402700555997, "grad_norm": 0.7785859107971191, "learning_rate": 4.737864077669903e-06, "loss": 0.305, "step": 25650 }, { "epoch": 4.529167769834966, "grad_norm": 0.7817701697349548, "learning_rate": 4.7202118270079434e-06, "loss": 0.3145, "step": 25660 }, { "epoch": 4.530932839113936, "grad_norm": 1.9367749691009521, "learning_rate": 4.7025595763459846e-06, "loss": 0.2832, "step": 25670 }, { "epoch": 4.532697908392905, "grad_norm": 3.135561943054199, "learning_rate": 4.684907325684025e-06, "loss": 0.2559, "step": 25680 }, { "epoch": 4.534462977671874, "grad_norm": 0.9335846304893494, "learning_rate": 4.667255075022066e-06, "loss": 0.3028, "step": 25690 }, { "epoch": 4.536228046950843, "grad_norm": 1.048282504081726, "learning_rate": 4.649602824360106e-06, "loss": 0.3062, "step": 25700 }, { "epoch": 4.537993116229812, "grad_norm": 1.0533701181411743, "learning_rate": 4.631950573698147e-06, "loss": 0.3095, "step": 25710 }, { "epoch": 4.539758185508781, "grad_norm": 1.0367389917373657, "learning_rate": 4.614298323036188e-06, "loss": 0.2964, "step": 25720 }, { "epoch": 4.541523254787751, "grad_norm": 4.537725925445557, "learning_rate": 4.596646072374228e-06, "loss": 0.3129, "step": 25730 }, { "epoch": 4.54328832406672, "grad_norm": 1.0677293539047241, "learning_rate": 4.578993821712268e-06, "loss": 0.2888, "step": 25740 }, { "epoch": 4.545053393345689, "grad_norm": 2.7014944553375244, "learning_rate": 4.5613415710503095e-06, "loss": 0.3432, "step": 25750 }, { "epoch": 4.546818462624658, "grad_norm": 3.0516560077667236, "learning_rate": 4.54368932038835e-06, "loss": 0.2645, "step": 25760 }, { "epoch": 4.548583531903628, "grad_norm": 1.2660284042358398, "learning_rate": 4.52603706972639e-06, "loss": 0.3056, "step": 25770 }, { "epoch": 4.550348601182597, "grad_norm": 1.121484398841858, "learning_rate": 4.508384819064431e-06, "loss": 0.2779, "step": 25780 }, { "epoch": 4.552113670461566, "grad_norm": 0.7952722907066345, "learning_rate": 4.4907325684024715e-06, "loss": 0.2805, "step": 25790 }, { "epoch": 4.553878739740535, "grad_norm": 0.9755123257637024, "learning_rate": 4.473080317740512e-06, "loss": 0.3174, "step": 25800 }, { "epoch": 4.555643809019504, "grad_norm": 3.9789111614227295, "learning_rate": 4.455428067078553e-06, "loss": 0.3924, "step": 25810 }, { "epoch": 4.557408878298473, "grad_norm": 0.7337201237678528, "learning_rate": 4.437775816416593e-06, "loss": 0.2729, "step": 25820 }, { "epoch": 4.559173947577443, "grad_norm": 1.2232738733291626, "learning_rate": 4.420123565754634e-06, "loss": 0.275, "step": 25830 }, { "epoch": 4.560939016856412, "grad_norm": 0.9414122700691223, "learning_rate": 4.402471315092675e-06, "loss": 0.3133, "step": 25840 }, { "epoch": 4.562704086135381, "grad_norm": 2.035956621170044, "learning_rate": 4.384819064430715e-06, "loss": 0.306, "step": 25850 }, { "epoch": 4.56446915541435, "grad_norm": 1.1455971002578735, "learning_rate": 4.367166813768755e-06, "loss": 0.2878, "step": 25860 }, { "epoch": 4.56623422469332, "grad_norm": 1.037244439125061, "learning_rate": 4.3495145631067965e-06, "loss": 0.3086, "step": 25870 }, { "epoch": 4.567999293972289, "grad_norm": 1.216088891029358, 
"learning_rate": 4.331862312444837e-06, "loss": 0.3132, "step": 25880 }, { "epoch": 4.569764363251258, "grad_norm": 1.1905804872512817, "learning_rate": 4.314210061782877e-06, "loss": 0.3662, "step": 25890 }, { "epoch": 4.571529432530227, "grad_norm": 1.2199269533157349, "learning_rate": 4.296557811120918e-06, "loss": 0.2483, "step": 25900 }, { "epoch": 4.573294501809196, "grad_norm": 1.5603306293487549, "learning_rate": 4.2789055604589585e-06, "loss": 0.2351, "step": 25910 }, { "epoch": 4.575059571088165, "grad_norm": 1.185232400894165, "learning_rate": 4.261253309797e-06, "loss": 0.3162, "step": 25920 }, { "epoch": 4.576824640367135, "grad_norm": 1.825040340423584, "learning_rate": 4.24360105913504e-06, "loss": 0.2868, "step": 25930 }, { "epoch": 4.578589709646104, "grad_norm": 2.968623399734497, "learning_rate": 4.22594880847308e-06, "loss": 0.3104, "step": 25940 }, { "epoch": 4.580354778925073, "grad_norm": 1.5956579446792603, "learning_rate": 4.208296557811121e-06, "loss": 0.311, "step": 25950 }, { "epoch": 4.582119848204042, "grad_norm": 1.1295750141143799, "learning_rate": 4.190644307149162e-06, "loss": 0.3386, "step": 25960 }, { "epoch": 4.583884917483012, "grad_norm": 2.4485180377960205, "learning_rate": 4.172992056487202e-06, "loss": 0.2868, "step": 25970 }, { "epoch": 4.585649986761981, "grad_norm": 1.623205542564392, "learning_rate": 4.155339805825243e-06, "loss": 0.3648, "step": 25980 }, { "epoch": 4.58741505604095, "grad_norm": 1.0542960166931152, "learning_rate": 4.1376875551632835e-06, "loss": 0.3063, "step": 25990 }, { "epoch": 4.589180125319919, "grad_norm": 2.7634682655334473, "learning_rate": 4.120035304501324e-06, "loss": 0.2696, "step": 26000 }, { "epoch": 4.589180125319919, "eval_loss": 0.6520219445228577, "eval_runtime": 591.3466, "eval_samples_per_second": 47.903, "eval_steps_per_second": 2.396, "eval_token_accuracy": 0.0004987183931890128, "step": 26000 }, { "epoch": 4.590945194598888, "grad_norm": 0.9608787298202515, "learning_rate": 4.102383053839365e-06, "loss": 0.3891, "step": 26010 }, { "epoch": 4.592710263877857, "grad_norm": 1.4607149362564087, "learning_rate": 4.084730803177406e-06, "loss": 0.3429, "step": 26020 }, { "epoch": 4.594475333156827, "grad_norm": 3.5650391578674316, "learning_rate": 4.0670785525154455e-06, "loss": 0.3296, "step": 26030 }, { "epoch": 4.596240402435796, "grad_norm": 2.4345691204071045, "learning_rate": 4.051191526919682e-06, "loss": 0.2763, "step": 26040 }, { "epoch": 4.598005471714765, "grad_norm": 3.1073009967803955, "learning_rate": 4.033539276257723e-06, "loss": 0.295, "step": 26050 }, { "epoch": 4.599770540993734, "grad_norm": 1.295482873916626, "learning_rate": 4.015887025595764e-06, "loss": 0.2796, "step": 26060 }, { "epoch": 4.601535610272704, "grad_norm": 2.9781415462493896, "learning_rate": 3.998234774933804e-06, "loss": 0.2426, "step": 26070 }, { "epoch": 4.603300679551673, "grad_norm": 2.263942003250122, "learning_rate": 3.980582524271844e-06, "loss": 0.3208, "step": 26080 }, { "epoch": 4.605065748830642, "grad_norm": 1.9064913988113403, "learning_rate": 3.9629302736098855e-06, "loss": 0.3221, "step": 26090 }, { "epoch": 4.606830818109611, "grad_norm": 1.2600352764129639, "learning_rate": 3.945278022947927e-06, "loss": 0.2916, "step": 26100 }, { "epoch": 4.60859588738858, "grad_norm": 1.0407512187957764, "learning_rate": 3.927625772285966e-06, "loss": 0.3659, "step": 26110 }, { "epoch": 4.610360956667549, "grad_norm": 0.909622073173523, "learning_rate": 3.909973521624007e-06, "loss": 0.3047, "step": 26120 }, { "epoch": 
4.612126025946519, "grad_norm": 1.2238434553146362, "learning_rate": 3.8923212709620484e-06, "loss": 0.2643, "step": 26130 }, { "epoch": 4.613891095225488, "grad_norm": 1.068109154701233, "learning_rate": 3.874669020300088e-06, "loss": 0.2787, "step": 26140 }, { "epoch": 4.615656164504457, "grad_norm": 1.3901350498199463, "learning_rate": 3.857016769638129e-06, "loss": 0.3241, "step": 26150 }, { "epoch": 4.617421233783426, "grad_norm": 1.4353913068771362, "learning_rate": 3.83936451897617e-06, "loss": 0.2735, "step": 26160 }, { "epoch": 4.619186303062396, "grad_norm": 2.3656747341156006, "learning_rate": 3.82171226831421e-06, "loss": 0.3031, "step": 26170 }, { "epoch": 4.620951372341365, "grad_norm": 5.758845806121826, "learning_rate": 3.8040600176522508e-06, "loss": 0.3019, "step": 26180 }, { "epoch": 4.622716441620334, "grad_norm": 1.460636854171753, "learning_rate": 3.7864077669902915e-06, "loss": 0.3448, "step": 26190 }, { "epoch": 4.624481510899303, "grad_norm": 0.8635392189025879, "learning_rate": 3.7687555163283322e-06, "loss": 0.3295, "step": 26200 }, { "epoch": 4.626246580178272, "grad_norm": 1.209629774093628, "learning_rate": 3.7511032656663725e-06, "loss": 0.3744, "step": 26210 }, { "epoch": 4.628011649457241, "grad_norm": 0.9914260506629944, "learning_rate": 3.7334510150044132e-06, "loss": 0.3099, "step": 26220 }, { "epoch": 4.629776718736211, "grad_norm": 3.453619956970215, "learning_rate": 3.715798764342454e-06, "loss": 0.3271, "step": 26230 }, { "epoch": 4.63154178801518, "grad_norm": 0.9711024165153503, "learning_rate": 3.6981465136804943e-06, "loss": 0.3152, "step": 26240 }, { "epoch": 4.633306857294149, "grad_norm": 1.7442153692245483, "learning_rate": 3.680494263018535e-06, "loss": 0.2736, "step": 26250 }, { "epoch": 4.635071926573118, "grad_norm": 1.5687910318374634, "learning_rate": 3.6628420123565757e-06, "loss": 0.2782, "step": 26260 }, { "epoch": 4.636836995852088, "grad_norm": 0.8251959085464478, "learning_rate": 3.645189761694616e-06, "loss": 0.3215, "step": 26270 }, { "epoch": 4.638602065131057, "grad_norm": 1.3255902528762817, "learning_rate": 3.6275375110326567e-06, "loss": 0.2467, "step": 26280 }, { "epoch": 4.640367134410026, "grad_norm": 1.0360016822814941, "learning_rate": 3.6098852603706975e-06, "loss": 0.3133, "step": 26290 }, { "epoch": 4.642132203688995, "grad_norm": 0.9186058640480042, "learning_rate": 3.5922330097087378e-06, "loss": 0.2761, "step": 26300 }, { "epoch": 4.643897272967964, "grad_norm": 1.3283425569534302, "learning_rate": 3.5745807590467785e-06, "loss": 0.3016, "step": 26310 }, { "epoch": 4.645662342246933, "grad_norm": 2.8542580604553223, "learning_rate": 3.556928508384819e-06, "loss": 0.3201, "step": 26320 }, { "epoch": 4.647427411525903, "grad_norm": 0.8498136401176453, "learning_rate": 3.53927625772286e-06, "loss": 0.307, "step": 26330 }, { "epoch": 4.649192480804872, "grad_norm": 1.2035598754882812, "learning_rate": 3.5216240070609002e-06, "loss": 0.2858, "step": 26340 }, { "epoch": 4.650957550083841, "grad_norm": 1.4415249824523926, "learning_rate": 3.503971756398941e-06, "loss": 0.2998, "step": 26350 }, { "epoch": 4.65272261936281, "grad_norm": 3.0700762271881104, "learning_rate": 3.4863195057369817e-06, "loss": 0.3217, "step": 26360 }, { "epoch": 4.65448768864178, "grad_norm": 0.8596258163452148, "learning_rate": 3.468667255075022e-06, "loss": 0.2927, "step": 26370 }, { "epoch": 4.656252757920749, "grad_norm": 2.478158950805664, "learning_rate": 3.4510150044130627e-06, "loss": 0.3098, "step": 26380 }, { "epoch": 
4.658017827199718, "grad_norm": 3.169443368911743, "learning_rate": 3.4333627537511034e-06, "loss": 0.3009, "step": 26390 }, { "epoch": 4.659782896478687, "grad_norm": 1.1622791290283203, "learning_rate": 3.4157105030891437e-06, "loss": 0.2945, "step": 26400 }, { "epoch": 4.661547965757656, "grad_norm": 0.9778208136558533, "learning_rate": 3.3980582524271844e-06, "loss": 0.3669, "step": 26410 }, { "epoch": 4.663313035036625, "grad_norm": 1.3064228296279907, "learning_rate": 3.380406001765225e-06, "loss": 0.2849, "step": 26420 }, { "epoch": 4.665078104315595, "grad_norm": 2.3085849285125732, "learning_rate": 3.3627537511032663e-06, "loss": 0.3482, "step": 26430 }, { "epoch": 4.666843173594564, "grad_norm": 4.210567951202393, "learning_rate": 3.345101500441306e-06, "loss": 0.2852, "step": 26440 }, { "epoch": 4.668608242873533, "grad_norm": 1.2309918403625488, "learning_rate": 3.327449249779347e-06, "loss": 0.2712, "step": 26450 }, { "epoch": 4.670373312152502, "grad_norm": 0.7104286551475525, "learning_rate": 3.309796999117388e-06, "loss": 0.2868, "step": 26460 }, { "epoch": 4.672138381431472, "grad_norm": 1.697925090789795, "learning_rate": 3.292144748455428e-06, "loss": 0.3129, "step": 26470 }, { "epoch": 4.673903450710441, "grad_norm": 1.1698533296585083, "learning_rate": 3.2744924977934686e-06, "loss": 0.2597, "step": 26480 }, { "epoch": 4.67566851998941, "grad_norm": 1.2262734174728394, "learning_rate": 3.2568402471315098e-06, "loss": 0.3117, "step": 26490 }, { "epoch": 4.677433589268379, "grad_norm": 2.3895866870880127, "learning_rate": 3.2391879964695497e-06, "loss": 0.2886, "step": 26500 }, { "epoch": 4.679198658547348, "grad_norm": 1.1458373069763184, "learning_rate": 3.2215357458075904e-06, "loss": 0.2731, "step": 26510 }, { "epoch": 4.680963727826317, "grad_norm": 1.3387194871902466, "learning_rate": 3.2038834951456315e-06, "loss": 0.3467, "step": 26520 }, { "epoch": 4.682728797105287, "grad_norm": 0.7710012197494507, "learning_rate": 3.1862312444836723e-06, "loss": 0.3574, "step": 26530 }, { "epoch": 4.684493866384256, "grad_norm": 0.7596765160560608, "learning_rate": 3.168578993821712e-06, "loss": 0.2881, "step": 26540 }, { "epoch": 4.686258935663225, "grad_norm": 2.8583171367645264, "learning_rate": 3.150926743159753e-06, "loss": 0.3057, "step": 26550 }, { "epoch": 4.688024004942194, "grad_norm": 4.765782833099365, "learning_rate": 3.133274492497794e-06, "loss": 0.2745, "step": 26560 }, { "epoch": 4.689789074221164, "grad_norm": 1.0168406963348389, "learning_rate": 3.1156222418358343e-06, "loss": 0.31, "step": 26570 }, { "epoch": 4.691554143500133, "grad_norm": 1.3798327445983887, "learning_rate": 3.0979699911738746e-06, "loss": 0.3772, "step": 26580 }, { "epoch": 4.693319212779102, "grad_norm": 1.2798494100570679, "learning_rate": 3.0803177405119153e-06, "loss": 0.3575, "step": 26590 }, { "epoch": 4.695084282058071, "grad_norm": 0.6293618083000183, "learning_rate": 3.062665489849956e-06, "loss": 0.2763, "step": 26600 }, { "epoch": 4.69684935133704, "grad_norm": 0.893796980381012, "learning_rate": 3.0450132391879963e-06, "loss": 0.2968, "step": 26610 }, { "epoch": 4.698614420616009, "grad_norm": 2.7735981941223145, "learning_rate": 3.0273609885260375e-06, "loss": 0.3324, "step": 26620 }, { "epoch": 4.700379489894979, "grad_norm": 3.296457290649414, "learning_rate": 3.0097087378640778e-06, "loss": 0.3665, "step": 26630 }, { "epoch": 4.702144559173948, "grad_norm": 2.9078474044799805, "learning_rate": 2.992056487202118e-06, "loss": 0.3013, "step": 26640 }, { "epoch": 
4.703909628452917, "grad_norm": 0.9801350235939026, "learning_rate": 2.9744042365401592e-06, "loss": 0.3093, "step": 26650 }, { "epoch": 4.705674697731886, "grad_norm": 3.157639980316162, "learning_rate": 2.9567519858781995e-06, "loss": 0.3188, "step": 26660 }, { "epoch": 4.707439767010856, "grad_norm": 3.6760361194610596, "learning_rate": 2.9390997352162403e-06, "loss": 0.3403, "step": 26670 }, { "epoch": 4.709204836289825, "grad_norm": 3.899162530899048, "learning_rate": 2.921447484554281e-06, "loss": 0.3169, "step": 26680 }, { "epoch": 4.710969905568794, "grad_norm": 0.8276218175888062, "learning_rate": 2.9037952338923213e-06, "loss": 0.3051, "step": 26690 }, { "epoch": 4.712734974847763, "grad_norm": 2.197704315185547, "learning_rate": 2.886142983230362e-06, "loss": 0.3056, "step": 26700 }, { "epoch": 4.714500044126732, "grad_norm": 1.597664475440979, "learning_rate": 2.8684907325684027e-06, "loss": 0.2863, "step": 26710 }, { "epoch": 4.716265113405701, "grad_norm": 2.530172348022461, "learning_rate": 2.8508384819064434e-06, "loss": 0.306, "step": 26720 }, { "epoch": 4.718030182684671, "grad_norm": 3.7757041454315186, "learning_rate": 2.8331862312444837e-06, "loss": 0.3334, "step": 26730 }, { "epoch": 4.71979525196364, "grad_norm": 1.145796775817871, "learning_rate": 2.8155339805825245e-06, "loss": 0.312, "step": 26740 }, { "epoch": 4.721560321242609, "grad_norm": 2.3895106315612793, "learning_rate": 2.797881729920565e-06, "loss": 0.3311, "step": 26750 }, { "epoch": 4.723325390521578, "grad_norm": 1.0439083576202393, "learning_rate": 2.7802294792586055e-06, "loss": 0.2596, "step": 26760 }, { "epoch": 4.725090459800548, "grad_norm": 2.6261661052703857, "learning_rate": 2.762577228596646e-06, "loss": 0.3281, "step": 26770 }, { "epoch": 4.726855529079517, "grad_norm": 4.33652925491333, "learning_rate": 2.744924977934687e-06, "loss": 0.3354, "step": 26780 }, { "epoch": 4.728620598358486, "grad_norm": 1.4785486459732056, "learning_rate": 2.7272727272727272e-06, "loss": 0.2876, "step": 26790 }, { "epoch": 4.730385667637455, "grad_norm": 0.853244960308075, "learning_rate": 2.709620476610768e-06, "loss": 0.3128, "step": 26800 }, { "epoch": 4.732150736916424, "grad_norm": 4.548651218414307, "learning_rate": 2.6919682259488087e-06, "loss": 0.3242, "step": 26810 }, { "epoch": 4.733915806195393, "grad_norm": 1.5922703742980957, "learning_rate": 2.6743159752868494e-06, "loss": 0.3453, "step": 26820 }, { "epoch": 4.735680875474363, "grad_norm": 1.2137644290924072, "learning_rate": 2.6566637246248897e-06, "loss": 0.3563, "step": 26830 }, { "epoch": 4.737445944753332, "grad_norm": 3.0799238681793213, "learning_rate": 2.6390114739629304e-06, "loss": 0.2741, "step": 26840 }, { "epoch": 4.739211014032301, "grad_norm": 1.2434720993041992, "learning_rate": 2.621359223300971e-06, "loss": 0.3753, "step": 26850 }, { "epoch": 4.74097608331127, "grad_norm": 1.0864171981811523, "learning_rate": 2.6037069726390114e-06, "loss": 0.2686, "step": 26860 }, { "epoch": 4.74274115259024, "grad_norm": 1.02934992313385, "learning_rate": 2.586054721977052e-06, "loss": 0.3207, "step": 26870 }, { "epoch": 4.744506221869209, "grad_norm": 2.575124740600586, "learning_rate": 2.568402471315093e-06, "loss": 0.276, "step": 26880 }, { "epoch": 4.746271291148178, "grad_norm": 2.9327895641326904, "learning_rate": 2.550750220653133e-06, "loss": 0.2728, "step": 26890 }, { "epoch": 4.748036360427147, "grad_norm": 1.2971775531768799, "learning_rate": 2.533097969991174e-06, "loss": 0.3054, "step": 26900 }, { "epoch": 4.749801429706116, 
"grad_norm": 2.2022430896759033, "learning_rate": 2.5154457193292146e-06, "loss": 0.2774, "step": 26910 }, { "epoch": 4.751566498985085, "grad_norm": 1.1809123754501343, "learning_rate": 2.4977934686672553e-06, "loss": 0.3174, "step": 26920 }, { "epoch": 4.753331568264055, "grad_norm": 1.0642763376235962, "learning_rate": 2.4801412180052956e-06, "loss": 0.2903, "step": 26930 }, { "epoch": 4.755096637543024, "grad_norm": 1.3782179355621338, "learning_rate": 2.4624889673433364e-06, "loss": 0.3092, "step": 26940 }, { "epoch": 4.756861706821993, "grad_norm": 0.7173749208450317, "learning_rate": 2.444836716681377e-06, "loss": 0.3167, "step": 26950 }, { "epoch": 4.758626776100962, "grad_norm": 3.7624456882476807, "learning_rate": 2.4271844660194174e-06, "loss": 0.3088, "step": 26960 }, { "epoch": 4.760391845379932, "grad_norm": 2.7406694889068604, "learning_rate": 2.4095322153574585e-06, "loss": 0.3334, "step": 26970 }, { "epoch": 4.762156914658901, "grad_norm": 2.3339078426361084, "learning_rate": 2.391879964695499e-06, "loss": 0.3057, "step": 26980 }, { "epoch": 4.76392198393787, "grad_norm": 2.0263259410858154, "learning_rate": 2.374227714033539e-06, "loss": 0.2526, "step": 26990 }, { "epoch": 4.765687053216839, "grad_norm": 3.8648524284362793, "learning_rate": 2.3565754633715803e-06, "loss": 0.2975, "step": 27000 }, { "epoch": 4.765687053216839, "eval_loss": 0.6528812646865845, "eval_runtime": 592.735, "eval_samples_per_second": 47.79, "eval_steps_per_second": 2.391, "eval_token_accuracy": 0.0004985154146000063, "step": 27000 }, { "epoch": 4.767452122495808, "grad_norm": 0.8504769802093506, "learning_rate": 2.3389232127096206e-06, "loss": 0.2618, "step": 27010 }, { "epoch": 4.769217191774777, "grad_norm": 4.1417436599731445, "learning_rate": 2.321270962047661e-06, "loss": 0.2816, "step": 27020 }, { "epoch": 4.770982261053747, "grad_norm": 1.3575209379196167, "learning_rate": 2.303618711385702e-06, "loss": 0.2941, "step": 27030 }, { "epoch": 4.772747330332716, "grad_norm": 3.2614238262176514, "learning_rate": 2.2859664607237423e-06, "loss": 0.339, "step": 27040 }, { "epoch": 4.774512399611685, "grad_norm": 2.057924270629883, "learning_rate": 2.268314210061783e-06, "loss": 0.3645, "step": 27050 }, { "epoch": 4.776277468890654, "grad_norm": 2.174006462097168, "learning_rate": 2.2506619593998238e-06, "loss": 0.3252, "step": 27060 }, { "epoch": 4.778042538169624, "grad_norm": 1.5190887451171875, "learning_rate": 2.233009708737864e-06, "loss": 0.2854, "step": 27070 }, { "epoch": 4.779807607448593, "grad_norm": 1.7354347705841064, "learning_rate": 2.215357458075905e-06, "loss": 0.2674, "step": 27080 }, { "epoch": 4.781572676727562, "grad_norm": 0.9061219692230225, "learning_rate": 2.197705207413945e-06, "loss": 0.303, "step": 27090 }, { "epoch": 4.783337746006531, "grad_norm": 0.9815694093704224, "learning_rate": 2.1800529567519862e-06, "loss": 0.3104, "step": 27100 }, { "epoch": 4.7851028152855, "grad_norm": 1.0961108207702637, "learning_rate": 2.1624007060900265e-06, "loss": 0.2876, "step": 27110 }, { "epoch": 4.786867884564469, "grad_norm": 0.8509754538536072, "learning_rate": 2.144748455428067e-06, "loss": 0.3311, "step": 27120 }, { "epoch": 4.788632953843439, "grad_norm": 2.176682233810425, "learning_rate": 2.127096204766108e-06, "loss": 0.3585, "step": 27130 }, { "epoch": 4.790398023122408, "grad_norm": 0.9792783260345459, "learning_rate": 2.1094439541041483e-06, "loss": 0.3132, "step": 27140 }, { "epoch": 4.792163092401377, "grad_norm": 0.9529822468757629, "learning_rate": 
2.091791703442189e-06, "loss": 0.3025, "step": 27150 }, { "epoch": 4.793928161680346, "grad_norm": 1.79104745388031, "learning_rate": 2.0741394527802297e-06, "loss": 0.3481, "step": 27160 }, { "epoch": 4.795693230959315, "grad_norm": 1.240500807762146, "learning_rate": 2.05648720211827e-06, "loss": 0.2879, "step": 27170 }, { "epoch": 4.797458300238285, "grad_norm": 2.7122185230255127, "learning_rate": 2.0388349514563107e-06, "loss": 0.2797, "step": 27180 }, { "epoch": 4.799223369517254, "grad_norm": 1.1486530303955078, "learning_rate": 2.0211827007943515e-06, "loss": 0.2731, "step": 27190 }, { "epoch": 4.800988438796223, "grad_norm": 2.8081750869750977, "learning_rate": 2.003530450132392e-06, "loss": 0.2995, "step": 27200 }, { "epoch": 4.802753508075192, "grad_norm": 1.1571046113967896, "learning_rate": 1.9858781994704325e-06, "loss": 0.2822, "step": 27210 }, { "epoch": 4.804518577354161, "grad_norm": 1.4669501781463623, "learning_rate": 1.968225948808473e-06, "loss": 0.3253, "step": 27220 }, { "epoch": 4.806283646633131, "grad_norm": 0.9071950316429138, "learning_rate": 1.950573698146514e-06, "loss": 0.3247, "step": 27230 }, { "epoch": 4.8080487159121, "grad_norm": 1.4200551509857178, "learning_rate": 1.9329214474845542e-06, "loss": 0.3058, "step": 27240 }, { "epoch": 4.809813785191069, "grad_norm": 1.8995357751846313, "learning_rate": 1.915269196822595e-06, "loss": 0.2876, "step": 27250 }, { "epoch": 4.811578854470038, "grad_norm": 1.1847381591796875, "learning_rate": 1.8976169461606357e-06, "loss": 0.3068, "step": 27260 }, { "epoch": 4.813343923749007, "grad_norm": 1.081455111503601, "learning_rate": 1.879964695498676e-06, "loss": 0.3231, "step": 27270 }, { "epoch": 4.815108993027977, "grad_norm": 1.0959587097167969, "learning_rate": 1.862312444836717e-06, "loss": 0.3252, "step": 27280 }, { "epoch": 4.816874062306946, "grad_norm": 1.3235975503921509, "learning_rate": 1.8446601941747572e-06, "loss": 0.3169, "step": 27290 }, { "epoch": 4.818639131585915, "grad_norm": 2.8046417236328125, "learning_rate": 1.8270079435127981e-06, "loss": 0.3005, "step": 27300 }, { "epoch": 4.820404200864884, "grad_norm": 3.0383732318878174, "learning_rate": 1.8093556928508387e-06, "loss": 0.3221, "step": 27310 }, { "epoch": 4.822169270143853, "grad_norm": 2.7719833850860596, "learning_rate": 1.791703442188879e-06, "loss": 0.2759, "step": 27320 }, { "epoch": 4.823934339422823, "grad_norm": 1.3720282316207886, "learning_rate": 1.7740511915269199e-06, "loss": 0.3227, "step": 27330 }, { "epoch": 4.825699408701792, "grad_norm": 1.1153744459152222, "learning_rate": 1.7563989408649604e-06, "loss": 0.2728, "step": 27340 }, { "epoch": 4.827464477980761, "grad_norm": 3.1630284786224365, "learning_rate": 1.7387466902030011e-06, "loss": 0.3009, "step": 27350 }, { "epoch": 4.82922954725973, "grad_norm": 1.8349213600158691, "learning_rate": 1.7210944395410416e-06, "loss": 0.3477, "step": 27360 }, { "epoch": 4.830994616538699, "grad_norm": 0.9961835741996765, "learning_rate": 1.7034421888790821e-06, "loss": 0.2948, "step": 27370 }, { "epoch": 4.832759685817669, "grad_norm": 2.6590805053710938, "learning_rate": 1.6857899382171229e-06, "loss": 0.212, "step": 27380 }, { "epoch": 4.834524755096638, "grad_norm": 1.2290103435516357, "learning_rate": 1.6681376875551634e-06, "loss": 0.2845, "step": 27390 }, { "epoch": 4.836289824375607, "grad_norm": 3.4003355503082275, "learning_rate": 1.650485436893204e-06, "loss": 0.3422, "step": 27400 }, { "epoch": 4.838054893654576, "grad_norm": 1.2254893779754639, "learning_rate": 
1.6328331862312446e-06, "loss": 0.3088, "step": 27410 }, { "epoch": 4.839819962933545, "grad_norm": 2.233959197998047, "learning_rate": 1.6151809355692851e-06, "loss": 0.291, "step": 27420 }, { "epoch": 4.841585032212515, "grad_norm": 0.9931178092956543, "learning_rate": 1.5975286849073258e-06, "loss": 0.2904, "step": 27430 }, { "epoch": 4.843350101491484, "grad_norm": 2.9077725410461426, "learning_rate": 1.5798764342453664e-06, "loss": 0.3081, "step": 27440 }, { "epoch": 4.845115170770453, "grad_norm": 4.963522911071777, "learning_rate": 1.5622241835834069e-06, "loss": 0.2661, "step": 27450 }, { "epoch": 4.846880240049422, "grad_norm": 2.2293930053710938, "learning_rate": 1.5445719329214476e-06, "loss": 0.3111, "step": 27460 }, { "epoch": 4.848645309328391, "grad_norm": 1.4293302297592163, "learning_rate": 1.5269196822594883e-06, "loss": 0.311, "step": 27470 }, { "epoch": 4.850410378607361, "grad_norm": 1.2167930603027344, "learning_rate": 1.5092674315975286e-06, "loss": 0.3095, "step": 27480 }, { "epoch": 4.85217544788633, "grad_norm": 0.9582422375679016, "learning_rate": 1.4916151809355693e-06, "loss": 0.3293, "step": 27490 }, { "epoch": 4.853940517165299, "grad_norm": 1.0664198398590088, "learning_rate": 1.47396293027361e-06, "loss": 0.2998, "step": 27500 }, { "epoch": 4.855705586444268, "grad_norm": 2.815174102783203, "learning_rate": 1.4563106796116506e-06, "loss": 0.3487, "step": 27510 }, { "epoch": 4.857470655723237, "grad_norm": 2.406156301498413, "learning_rate": 1.4386584289496913e-06, "loss": 0.2857, "step": 27520 }, { "epoch": 4.859235725002207, "grad_norm": 4.202737808227539, "learning_rate": 1.4210061782877318e-06, "loss": 0.3014, "step": 27530 }, { "epoch": 4.861000794281176, "grad_norm": 2.53226375579834, "learning_rate": 1.4033539276257723e-06, "loss": 0.2891, "step": 27540 }, { "epoch": 4.862765863560145, "grad_norm": 1.1201049089431763, "learning_rate": 1.385701676963813e-06, "loss": 0.2834, "step": 27550 }, { "epoch": 4.864530932839114, "grad_norm": 0.8204585909843445, "learning_rate": 1.3680494263018535e-06, "loss": 0.3099, "step": 27560 }, { "epoch": 4.866296002118083, "grad_norm": 0.8954245448112488, "learning_rate": 1.350397175639894e-06, "loss": 0.2838, "step": 27570 }, { "epoch": 4.868061071397053, "grad_norm": 1.3307112455368042, "learning_rate": 1.3327449249779348e-06, "loss": 0.2736, "step": 27580 }, { "epoch": 4.869826140676022, "grad_norm": 2.3387129306793213, "learning_rate": 1.3150926743159753e-06, "loss": 0.2762, "step": 27590 }, { "epoch": 4.871591209954991, "grad_norm": 0.8025361895561218, "learning_rate": 1.297440423654016e-06, "loss": 0.3007, "step": 27600 }, { "epoch": 4.87335627923396, "grad_norm": 2.6926400661468506, "learning_rate": 1.2797881729920565e-06, "loss": 0.2714, "step": 27610 }, { "epoch": 4.875121348512929, "grad_norm": 0.843543291091919, "learning_rate": 1.262135922330097e-06, "loss": 0.2508, "step": 27620 }, { "epoch": 4.876886417791899, "grad_norm": 2.3910794258117676, "learning_rate": 1.2444836716681377e-06, "loss": 0.221, "step": 27630 }, { "epoch": 4.878651487070868, "grad_norm": 1.1623315811157227, "learning_rate": 1.2268314210061783e-06, "loss": 0.3141, "step": 27640 }, { "epoch": 4.880416556349837, "grad_norm": 1.3146125078201294, "learning_rate": 1.209179170344219e-06, "loss": 0.3237, "step": 27650 }, { "epoch": 4.882181625628806, "grad_norm": 1.2800966501235962, "learning_rate": 1.1915269196822597e-06, "loss": 0.2762, "step": 27660 }, { "epoch": 4.883946694907775, "grad_norm": 1.5484886169433594, "learning_rate": 
1.1738746690203e-06, "loss": 0.3123, "step": 27670 }, { "epoch": 4.885711764186745, "grad_norm": 3.3829500675201416, "learning_rate": 1.1562224183583407e-06, "loss": 0.2624, "step": 27680 }, { "epoch": 4.887476833465714, "grad_norm": 1.0227965116500854, "learning_rate": 1.1385701676963814e-06, "loss": 0.3367, "step": 27690 }, { "epoch": 4.889241902744683, "grad_norm": 0.977554440498352, "learning_rate": 1.120917917034422e-06, "loss": 0.3134, "step": 27700 }, { "epoch": 4.891006972023652, "grad_norm": 1.9226443767547607, "learning_rate": 1.1032656663724627e-06, "loss": 0.2859, "step": 27710 }, { "epoch": 4.892772041302621, "grad_norm": 0.9863432049751282, "learning_rate": 1.085613415710503e-06, "loss": 0.2962, "step": 27720 }, { "epoch": 4.894537110581591, "grad_norm": 2.392817974090576, "learning_rate": 1.0679611650485437e-06, "loss": 0.3292, "step": 27730 }, { "epoch": 4.89630217986056, "grad_norm": 2.1724038124084473, "learning_rate": 1.0503089143865844e-06, "loss": 0.2853, "step": 27740 }, { "epoch": 4.898067249139529, "grad_norm": 1.4158661365509033, "learning_rate": 1.032656663724625e-06, "loss": 0.2829, "step": 27750 }, { "epoch": 4.899832318418498, "grad_norm": 1.2781078815460205, "learning_rate": 1.0150044130626657e-06, "loss": 0.2682, "step": 27760 }, { "epoch": 4.901597387697467, "grad_norm": 0.9909201860427856, "learning_rate": 9.973521624007062e-07, "loss": 0.2903, "step": 27770 }, { "epoch": 4.903362456976437, "grad_norm": 1.9638125896453857, "learning_rate": 9.796999117387467e-07, "loss": 0.2802, "step": 27780 }, { "epoch": 4.905127526255406, "grad_norm": 1.5546956062316895, "learning_rate": 9.620476610767874e-07, "loss": 0.3387, "step": 27790 }, { "epoch": 4.906892595534375, "grad_norm": 2.3524317741394043, "learning_rate": 9.44395410414828e-07, "loss": 0.2755, "step": 27800 }, { "epoch": 4.908657664813344, "grad_norm": 2.4190759658813477, "learning_rate": 9.267431597528684e-07, "loss": 0.3054, "step": 27810 }, { "epoch": 4.910422734092313, "grad_norm": 0.7786517143249512, "learning_rate": 9.09090909090909e-07, "loss": 0.2976, "step": 27820 }, { "epoch": 4.912187803371283, "grad_norm": 3.312124490737915, "learning_rate": 8.914386584289498e-07, "loss": 0.2602, "step": 27830 }, { "epoch": 4.913952872650252, "grad_norm": 1.6175801753997803, "learning_rate": 8.737864077669904e-07, "loss": 0.3014, "step": 27840 }, { "epoch": 4.915717941929221, "grad_norm": 1.6575732231140137, "learning_rate": 8.56134157105031e-07, "loss": 0.3052, "step": 27850 }, { "epoch": 4.91748301120819, "grad_norm": 1.234856367111206, "learning_rate": 8.384819064430715e-07, "loss": 0.3095, "step": 27860 }, { "epoch": 4.919248080487159, "grad_norm": 3.541182041168213, "learning_rate": 8.208296557811121e-07, "loss": 0.2807, "step": 27870 }, { "epoch": 4.921013149766129, "grad_norm": 1.1563400030136108, "learning_rate": 8.031774051191527e-07, "loss": 0.3132, "step": 27880 }, { "epoch": 4.922778219045098, "grad_norm": 0.9835913777351379, "learning_rate": 7.855251544571934e-07, "loss": 0.2582, "step": 27890 }, { "epoch": 4.924543288324067, "grad_norm": 1.1049256324768066, "learning_rate": 7.678729037952339e-07, "loss": 0.2754, "step": 27900 }, { "epoch": 4.926308357603036, "grad_norm": 1.1144856214523315, "learning_rate": 7.502206531332746e-07, "loss": 0.3033, "step": 27910 }, { "epoch": 4.928073426882005, "grad_norm": 1.1903610229492188, "learning_rate": 7.325684024713151e-07, "loss": 0.3207, "step": 27920 }, { "epoch": 4.929838496160975, "grad_norm": 1.6650744676589966, "learning_rate": 7.149161518093557e-07, 
"loss": 0.3029, "step": 27930 }, { "epoch": 4.931603565439944, "grad_norm": 3.3469290733337402, "learning_rate": 6.972639011473963e-07, "loss": 0.2502, "step": 27940 }, { "epoch": 4.933368634718913, "grad_norm": 0.8579782247543335, "learning_rate": 6.79611650485437e-07, "loss": 0.2735, "step": 27950 }, { "epoch": 4.935133703997882, "grad_norm": 0.7689581513404846, "learning_rate": 6.619593998234776e-07, "loss": 0.3048, "step": 27960 }, { "epoch": 4.936898773276851, "grad_norm": 1.6697453260421753, "learning_rate": 6.443071491615181e-07, "loss": 0.3127, "step": 27970 }, { "epoch": 4.938663842555821, "grad_norm": 2.41105055809021, "learning_rate": 6.266548984995587e-07, "loss": 0.2993, "step": 27980 }, { "epoch": 4.94042891183479, "grad_norm": 1.329819679260254, "learning_rate": 6.090026478375993e-07, "loss": 0.3397, "step": 27990 }, { "epoch": 4.942193981113759, "grad_norm": 0.8519781231880188, "learning_rate": 5.913503971756399e-07, "loss": 0.3279, "step": 28000 }, { "epoch": 4.942193981113759, "eval_loss": 0.6499401926994324, "eval_runtime": 592.4285, "eval_samples_per_second": 47.815, "eval_steps_per_second": 2.392, "eval_token_accuracy": 0.0004987183931890128, "step": 28000 }, { "epoch": 4.943959050392728, "grad_norm": 1.1171196699142456, "learning_rate": 5.736981465136805e-07, "loss": 0.3191, "step": 28010 }, { "epoch": 4.945724119671697, "grad_norm": 3.382251262664795, "learning_rate": 5.560458958517212e-07, "loss": 0.341, "step": 28020 }, { "epoch": 4.947489188950667, "grad_norm": 1.3143904209136963, "learning_rate": 5.383936451897618e-07, "loss": 0.268, "step": 28030 }, { "epoch": 4.949254258229636, "grad_norm": 1.3122084140777588, "learning_rate": 5.207413945278023e-07, "loss": 0.3608, "step": 28040 }, { "epoch": 4.951019327508605, "grad_norm": 3.165703296661377, "learning_rate": 5.030891438658429e-07, "loss": 0.313, "step": 28050 }, { "epoch": 4.952784396787574, "grad_norm": 1.2517273426055908, "learning_rate": 4.854368932038835e-07, "loss": 0.2962, "step": 28060 }, { "epoch": 4.954549466066543, "grad_norm": 0.9983559250831604, "learning_rate": 4.6778464254192414e-07, "loss": 0.2787, "step": 28070 }, { "epoch": 4.956314535345513, "grad_norm": 1.5988553762435913, "learning_rate": 4.5013239187996475e-07, "loss": 0.3001, "step": 28080 }, { "epoch": 4.958079604624482, "grad_norm": 2.1683664321899414, "learning_rate": 4.324801412180053e-07, "loss": 0.2809, "step": 28090 }, { "epoch": 4.959844673903451, "grad_norm": 1.0645288228988647, "learning_rate": 4.1482789055604593e-07, "loss": 0.3469, "step": 28100 }, { "epoch": 4.96160974318242, "grad_norm": 1.2240803241729736, "learning_rate": 3.971756398940865e-07, "loss": 0.3191, "step": 28110 }, { "epoch": 4.963374812461389, "grad_norm": 1.3843520879745483, "learning_rate": 3.795233892321271e-07, "loss": 0.3317, "step": 28120 }, { "epoch": 4.965139881740359, "grad_norm": 1.5506443977355957, "learning_rate": 3.6187113857016773e-07, "loss": 0.3127, "step": 28130 }, { "epoch": 4.966904951019328, "grad_norm": 0.9059777855873108, "learning_rate": 3.442188879082083e-07, "loss": 0.3205, "step": 28140 }, { "epoch": 4.968670020298297, "grad_norm": 3.5530388355255127, "learning_rate": 3.265666372462489e-07, "loss": 0.291, "step": 28150 }, { "epoch": 4.970435089577266, "grad_norm": 4.22071647644043, "learning_rate": 3.0891438658428953e-07, "loss": 0.2899, "step": 28160 }, { "epoch": 4.972200158856235, "grad_norm": 2.5634772777557373, "learning_rate": 2.9126213592233014e-07, "loss": 0.28, "step": 28170 }, { "epoch": 4.973965228135205, "grad_norm": 
0.8161858916282654, "learning_rate": 2.736098852603707e-07, "loss": 0.253, "step": 28180 }, { "epoch": 4.975730297414174, "grad_norm": 2.5096681118011475, "learning_rate": 2.559576345984113e-07, "loss": 0.2672, "step": 28190 }, { "epoch": 4.977495366693143, "grad_norm": 1.2688097953796387, "learning_rate": 2.383053839364519e-07, "loss": 0.2874, "step": 28200 }, { "epoch": 4.979260435972112, "grad_norm": 1.17268705368042, "learning_rate": 2.2065313327449248e-07, "loss": 0.3518, "step": 28210 }, { "epoch": 4.981025505251081, "grad_norm": 1.4544918537139893, "learning_rate": 2.0300088261253312e-07, "loss": 0.422, "step": 28220 }, { "epoch": 4.98279057453005, "grad_norm": 1.8409720659255981, "learning_rate": 1.853486319505737e-07, "loss": 0.2917, "step": 28230 }, { "epoch": 4.98455564380902, "grad_norm": 1.1786491870880127, "learning_rate": 1.676963812886143e-07, "loss": 0.2828, "step": 28240 }, { "epoch": 4.986320713087989, "grad_norm": 2.36482834815979, "learning_rate": 1.500441306266549e-07, "loss": 0.2697, "step": 28250 }, { "epoch": 4.988085782366958, "grad_norm": 2.1647708415985107, "learning_rate": 1.323918799646955e-07, "loss": 0.2416, "step": 28260 }, { "epoch": 4.989850851645927, "grad_norm": 2.631108522415161, "learning_rate": 1.1473962930273611e-07, "loss": 0.3057, "step": 28270 }, { "epoch": 4.991615920924897, "grad_norm": 2.9568164348602295, "learning_rate": 9.70873786407767e-08, "loss": 0.3125, "step": 28280 }, { "epoch": 4.993380990203866, "grad_norm": 1.0665050745010376, "learning_rate": 7.94351279788173e-08, "loss": 0.2802, "step": 28290 }, { "epoch": 4.995146059482835, "grad_norm": 2.899059534072876, "learning_rate": 6.178287731685791e-08, "loss": 0.316, "step": 28300 }, { "epoch": 4.996911128761804, "grad_norm": 1.2794692516326904, "learning_rate": 4.4130626654898505e-08, "loss": 0.3383, "step": 28310 }, { "epoch": 4.998676198040773, "grad_norm": 1.1049646139144897, "learning_rate": 2.64783759929391e-08, "loss": 0.3078, "step": 28320 } ], "logging_steps": 10, "max_steps": 28325, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 2000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2.0162875069185393e+18, "train_batch_size": 10, "trial_name": null, "trial_params": null }
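
The JSON above is a Hugging Face Trainer state log: training entries roughly every 10 steps (loss, learning rate, gradient norm) and evaluation entries every 1000 steps (eval_loss, runtime, token accuracy), followed by the trainer's closing metadata. Below is a minimal parsing sketch, assuming the log has been saved locally as `trainer_state.json` (the filename is an assumption, not part of the log); it uses only the standard `json` module and relies on the standard Trainer layout in which these records sit under the `log_history` key.

```python
import json

# Minimal sketch (assumption: the trainer state shown above has been saved to
# "trainer_state.json" in the current directory; the path is illustrative).
with open("trainer_state.json") as f:
    state = json.load(f)

# In a Hugging Face Trainer state file, per-step records live under "log_history":
# training entries carry a "loss" key, evaluation entries an "eval_loss" key.
train_entries = [e for e in state["log_history"] if "loss" in e]
eval_entries = [e for e in state["log_history"] if "eval_loss" in e]

last = train_entries[-1]
print(f"final training loss {last['loss']} at step {last['step']}")

best = min(eval_entries, key=lambda e: e["eval_loss"])
print(f"lowest eval_loss {best['eval_loss']} at step {best['step']}")
```

Keeping the sketch dependency-free (stdlib only) makes it easy to drop next to any checkpoint directory; the same two lists could instead be fed to a plotting library to visualize the training and evaluation loss curves.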